SinaTools 0.1.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
  2. SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
  3. SinaTools-0.1.1.dist-info/LICENSE +22 -0
  4. SinaTools-0.1.1.dist-info/METADATA +72 -0
  5. SinaTools-0.1.1.dist-info/RECORD +122 -0
  6. SinaTools-0.1.1.dist-info/WHEEL +6 -0
  7. SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
  8. SinaTools-0.1.1.dist-info/top_level.txt +1 -0
  9. nlptools/CLI/DataDownload/download_files.py +71 -0
  10. nlptools/CLI/arabiner/bin/infer.py +117 -0
  11. nlptools/CLI/arabiner/bin/infer2.py +81 -0
  12. nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
  13. nlptools/CLI/morphology/morph_analyzer.py +91 -0
  14. nlptools/CLI/salma/salma_tools.py +68 -0
  15. nlptools/CLI/utils/__init__.py +0 -0
  16. nlptools/CLI/utils/arStrip.py +99 -0
  17. nlptools/CLI/utils/corpus_tokenizer.py +74 -0
  18. nlptools/CLI/utils/implication.py +92 -0
  19. nlptools/CLI/utils/jaccard.py +96 -0
  20. nlptools/CLI/utils/latin_remove.py +51 -0
  21. nlptools/CLI/utils/remove_Punc.py +53 -0
  22. nlptools/CLI/utils/sentence_tokenizer.py +90 -0
  23. nlptools/CLI/utils/text_transliteration.py +77 -0
  24. nlptools/DataDownload/__init__.py +0 -0
  25. nlptools/DataDownload/downloader.py +185 -0
  26. nlptools/VERSION +1 -0
  27. nlptools/__init__.py +5 -0
  28. nlptools/arabert/__init__.py +1 -0
  29. nlptools/arabert/arabert/__init__.py +14 -0
  30. nlptools/arabert/arabert/create_classification_data.py +260 -0
  31. nlptools/arabert/arabert/create_pretraining_data.py +534 -0
  32. nlptools/arabert/arabert/extract_features.py +444 -0
  33. nlptools/arabert/arabert/lamb_optimizer.py +158 -0
  34. nlptools/arabert/arabert/modeling.py +1027 -0
  35. nlptools/arabert/arabert/optimization.py +202 -0
  36. nlptools/arabert/arabert/run_classifier.py +1078 -0
  37. nlptools/arabert/arabert/run_pretraining.py +593 -0
  38. nlptools/arabert/arabert/run_squad.py +1440 -0
  39. nlptools/arabert/arabert/tokenization.py +414 -0
  40. nlptools/arabert/araelectra/__init__.py +1 -0
  41. nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
  42. nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
  43. nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
  44. nlptools/arabert/araelectra/configure_finetuning.py +172 -0
  45. nlptools/arabert/araelectra/configure_pretraining.py +143 -0
  46. nlptools/arabert/araelectra/finetune/__init__.py +14 -0
  47. nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
  48. nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
  49. nlptools/arabert/araelectra/finetune/scorer.py +54 -0
  50. nlptools/arabert/araelectra/finetune/task.py +74 -0
  51. nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
  52. nlptools/arabert/araelectra/flops_computation.py +215 -0
  53. nlptools/arabert/araelectra/model/__init__.py +14 -0
  54. nlptools/arabert/araelectra/model/modeling.py +1029 -0
  55. nlptools/arabert/araelectra/model/optimization.py +193 -0
  56. nlptools/arabert/araelectra/model/tokenization.py +355 -0
  57. nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
  58. nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
  59. nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
  60. nlptools/arabert/araelectra/run_finetuning.py +323 -0
  61. nlptools/arabert/araelectra/run_pretraining.py +469 -0
  62. nlptools/arabert/araelectra/util/__init__.py +14 -0
  63. nlptools/arabert/araelectra/util/training_utils.py +112 -0
  64. nlptools/arabert/araelectra/util/utils.py +109 -0
  65. nlptools/arabert/aragpt2/__init__.py +2 -0
  66. nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
  67. nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
  68. nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
  69. nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
  70. nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
  71. nlptools/arabert/aragpt2/grover/__init__.py +0 -0
  72. nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
  73. nlptools/arabert/aragpt2/grover/modeling.py +803 -0
  74. nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
  75. nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
  76. nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
  77. nlptools/arabert/aragpt2/grover/utils.py +234 -0
  78. nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
  79. nlptools/arabert/preprocess.py +818 -0
  80. nlptools/arabiner/__init__.py +0 -0
  81. nlptools/arabiner/bin/__init__.py +14 -0
  82. nlptools/arabiner/bin/eval.py +87 -0
  83. nlptools/arabiner/bin/infer.py +91 -0
  84. nlptools/arabiner/bin/process.py +140 -0
  85. nlptools/arabiner/bin/train.py +221 -0
  86. nlptools/arabiner/data/__init__.py +1 -0
  87. nlptools/arabiner/data/datasets.py +146 -0
  88. nlptools/arabiner/data/transforms.py +118 -0
  89. nlptools/arabiner/nn/BaseModel.py +22 -0
  90. nlptools/arabiner/nn/BertNestedTagger.py +34 -0
  91. nlptools/arabiner/nn/BertSeqTagger.py +17 -0
  92. nlptools/arabiner/nn/__init__.py +3 -0
  93. nlptools/arabiner/trainers/BaseTrainer.py +117 -0
  94. nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
  95. nlptools/arabiner/trainers/BertTrainer.py +163 -0
  96. nlptools/arabiner/trainers/__init__.py +3 -0
  97. nlptools/arabiner/utils/__init__.py +0 -0
  98. nlptools/arabiner/utils/data.py +124 -0
  99. nlptools/arabiner/utils/helpers.py +151 -0
  100. nlptools/arabiner/utils/metrics.py +69 -0
  101. nlptools/environment.yml +227 -0
  102. nlptools/install_env.py +13 -0
  103. nlptools/morphology/ALMA_multi_word.py +34 -0
  104. nlptools/morphology/__init__.py +52 -0
  105. nlptools/morphology/charsets.py +60 -0
  106. nlptools/morphology/morph_analyzer.py +170 -0
  107. nlptools/morphology/settings.py +8 -0
  108. nlptools/morphology/tokenizers_words.py +19 -0
  109. nlptools/nlptools.py +1 -0
  110. nlptools/salma/__init__.py +12 -0
  111. nlptools/salma/settings.py +31 -0
  112. nlptools/salma/views.py +459 -0
  113. nlptools/salma/wsd.py +126 -0
  114. nlptools/utils/__init__.py +0 -0
  115. nlptools/utils/corpus_tokenizer.py +73 -0
  116. nlptools/utils/implication.py +662 -0
  117. nlptools/utils/jaccard.py +247 -0
  118. nlptools/utils/parser.py +147 -0
  119. nlptools/utils/readfile.py +3 -0
  120. nlptools/utils/sentence_tokenizer.py +53 -0
  121. nlptools/utils/text_transliteration.py +232 -0
  122. nlptools/utils/utils.py +2 -0
@@ -0,0 +1,227 @@
+ name: arabicner
+ channels:
+ - anaconda
+ - pytorch
+ - nvidia
+ - conda-forge
+ - defaults
+ dependencies:
+ - _libgcc_mutex=0.1=main
+ - _openmp_mutex=5.1=1_gnu
+ - abseil-cpp=20211102.0=h27087fc_1
+ - absl-py=1.3.0=pyhd8ed1ab_0
+ - aiohttp=3.8.1=py310h5764c6d_1
+ - aiosignal=1.2.0=pyhd8ed1ab_0
+ - arrow-cpp=8.0.0=py310h3098874_0
+ - async-timeout=4.0.2=py310h06a4308_0
+ - attrs=22.1.0=pyh71513ae_1
+ - aws-c-common=0.4.57=he6710b0_1
+ - aws-c-event-stream=0.1.6=h2531618_5
+ - aws-checksums=0.1.9=he6710b0_0
+ - aws-sdk-cpp=1.8.185=hce553d0_0
+ - blas=1.0=mkl
+ - blinker=1.5=pyhd8ed1ab_0
+ - boost-cpp=1.78.0=he72f1d9_0
+ - bottleneck=1.3.5=py310ha9d4c09_0
+ - brotli=1.0.9=h166bdaf_7
+ - brotli-bin=1.0.9=h166bdaf_7
+ - brotlipy=0.7.0=py310h7f8727e_1002
+ - bzip2=1.0.8=h7b6447c_0
+ - c-ares=1.18.1=h7f98852_0
+ - ca-certificates=2022.9.24=ha878542_0
+ - cachetools=5.2.0=pyhd8ed1ab_0
+ - certifi=2022.9.24=pyhd8ed1ab_0
+ - cffi=1.15.1=py310h74dc2b5_0
+ - charset-normalizer=2.0.4=pyhd3eb1b0_0
+ - click=8.1.3=unix_pyhd8ed1ab_2
+ - cryptography=38.0.1=py310h9ce1e76_0
+ - cuda=11.7.1=0
+ - cuda-cccl=11.7.91=0
+ - cuda-command-line-tools=11.7.1=0
+ - cuda-compiler=11.7.1=0
+ - cuda-cudart=11.7.99=0
+ - cuda-cudart-dev=11.7.99=0
+ - cuda-cuobjdump=11.7.91=0
+ - cuda-cupti=11.7.101=0
+ - cuda-cuxxfilt=11.7.91=0
+ - cuda-demo-suite=11.8.86=0
+ - cuda-documentation=11.8.86=0
+ - cuda-driver-dev=11.7.99=0
+ - cuda-gdb=11.8.86=0
+ - cuda-libraries=11.7.1=0
+ - cuda-libraries-dev=11.7.1=0
+ - cuda-memcheck=11.8.86=0
+ - cuda-nsight=11.8.86=0
+ - cuda-nsight-compute=11.8.0=0
+ - cuda-nvcc=11.7.99=0
+ - cuda-nvdisasm=11.8.86=0
+ - cuda-nvml-dev=11.7.91=0
+ - cuda-nvprof=11.8.87=0
+ - cuda-nvprune=11.7.91=0
+ - cuda-nvrtc=11.7.99=0
+ - cuda-nvrtc-dev=11.7.99=0
+ - cuda-nvtx=11.7.91=0
+ - cuda-nvvp=11.8.87=0
+ - cuda-runtime=11.7.1=0
+ - cuda-sanitizer-api=11.8.86=0
+ - cuda-toolkit=11.7.1=0
+ - cuda-tools=11.7.1=0
+ - cuda-visual-tools=11.7.1=0
+ - dataclasses=0.8=pyhc8e2a94_3
+ - datasets=2.6.1=pyhd8ed1ab_0
+ - dill=0.3.5.1=pyhd8ed1ab_0
+ - ffmpeg=4.3=hf484d3e_0
+ - fftw=3.3.10=nompi_h77c792f_102
+ - filelock=3.8.0=pyhd8ed1ab_0
+ - freetype=2.12.1=h4a9f257_0
+ - frozenlist=1.2.0=py310h7f8727e_1
+ - fsspec=2022.10.0=pyhd8ed1ab_0
+ - gds-tools=1.4.0.31=0
+ - gflags=2.2.2=he1b5a44_1004
+ - giflib=5.2.1=h7b6447c_0
+ - glog=0.6.0=h6f12383_0
+ - gmp=6.2.1=h295c915_3
+ - gnutls=3.6.15=he1e5248_0
+ #- google-auth=2.14.0=pyh1a96a4e_0
+ #- google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
+ - grpc-cpp=1.46.1=h33aed49_0
+ - grpcio=1.42.0=py310hce63b2e_0
+ - huggingface_hub=0.10.1=pyhd8ed1ab_0
+ - icu=70.1=h27087fc_0
+ - idna=3.4=py310h06a4308_0
+ - importlib-metadata=5.0.0=pyha770c72_1
+ - importlib_metadata=5.0.0=hd8ed1ab_1
+ - intel-openmp=2021.4.0=h06a4308_3561
+ - joblib=1.2.0=pyhd8ed1ab_0
+ - jpeg=9e=h7f8727e_0
+ - keyutils=1.6.1=h166bdaf_0
+ - krb5=1.19.3=h3790be6_0
+ - lame=3.100=h7b6447c_0
+ - lcms2=2.12=h3be6417_0
+ - ld_impl_linux-64=2.38=h1181459_1
+ - lerc=3.0=h295c915_0
+ - libbrotlicommon=1.0.9=h166bdaf_7
+ - libbrotlidec=1.0.9=h166bdaf_7
+ - libbrotlienc=1.0.9=h166bdaf_7
+ - libcublas=11.11.3.6=0
+ - libcublas-dev=11.11.3.6=0
+ - libcufft=10.9.0.58=0
+ - libcufft-dev=10.9.0.58=0
+ - libcufile=1.4.0.31=0
+ - libcufile-dev=1.4.0.31=0
+ - libcurand=10.3.0.86=0
+ - libcurl=7.85.0=h91b91d3_0
+ - libcusolver=11.4.1.48=0
+ - libcusolver-dev=11.4.1.48=0
+ - libcusparse=11.7.5.86=0
+ - libcusparse-dev=11.7.5.86=0
+ - libdeflate=1.8=h7f8727e_5
+ - libedit=3.1.20191231=he28a2e2_2
+ - libev=4.33=h516909a_1
+ - libevent=2.1.10=h9b69904_4
+ - libffi=3.3=he6710b0_2
+ - libgcc-ng=11.2.0=h1234567_1
+ - libgfortran-ng=12.2.0=h69a702a_19
+ - libgfortran5=12.2.0=h337968e_19
+ - libgomp=11.2.0=h1234567_1
+ - libiconv=1.16=h7f8727e_2
+ - libidn2=2.3.2=h7f8727e_0
+ - libnghttp2=1.46.0=hce63b2e_0
+ - libnpp=11.8.0.86=0
+ - libnpp-dev=11.8.0.86=0
+ - libnvjpeg=11.9.0.86=0
+ - libnvjpeg-dev=11.9.0.86=0
+ - libpng=1.6.37=hbc83047_0
+ - libprotobuf=3.20.1=h4ff587b_0
+ - libssh2=1.10.0=ha56f1ee_2
+ - libstdcxx-ng=11.2.0=h1234567_1
+ - libtasn1=4.16.0=h27cfd23_0
+ - libthrift=0.15.0=he6d91bd_0
+ - libtiff=4.4.0=hecacb30_0
+ - libunistring=0.9.10=h27cfd23_0
+ - libuuid=1.0.3=h7f8727e_2
+ - libwebp=1.2.4=h11a3e52_0
+ - libwebp-base=1.2.4=h5eee18b_0
+ - lz4-c=1.9.3=h295c915_1
+ - markdown=3.4.1=pyhd8ed1ab_0
+ - markupsafe=2.1.1=py310h5764c6d_1
+ - mkl=2021.4.0=h06a4308_640
+ - mkl-service=2.4.0=py310h7f8727e_0
+ - mkl_fft=1.3.1=py310hd6ae3a3_0
+ - mkl_random=1.2.2=py310h00e6091_0
+ - multidict=6.0.2=py310h5764c6d_1
+ - multiprocess=0.70.12.2=py310h5764c6d_2
+ - natsort=7.1.1=pyhd3eb1b0_0
+ - ncurses=6.3=h5eee18b_3
+ - nettle=3.7.3=hbbd107a_1
+ - nsight-compute=2022.3.0.22=0
+ - numexpr=2.8.3=py310hcea2de6_0
+ - numpy=1.23.3=py310hd5efca6_0
+ #- numpy-base=1.23.3=py310h8e6c178_0
+ - oauthlib=3.2.2=pyhd8ed1ab_0
+ - openh264=2.1.1=h4ff587b_0
+ - openssl=1.1.1s=h7f8727e_0
+ - orc=1.7.4=h07ed6aa_0
+ - packaging=21.3=pyhd8ed1ab_0
+ - pandas=1.4.4=py310h6a678d5_0
+ - pillow=9.2.0=py310hace64e9_1
+ - pip=22.2.2=py310h06a4308_0
+ - protobuf=3.20.1=py310hd8f1fbe_0
+ - pyarrow=8.0.0=py310h468efa6_0
+ - pyasn1=0.4.8=py_0
+ - pyasn1-modules=0.2.7=py_0
+ - pycparser=2.21=pyhd3eb1b0_0
+ - pyjwt=2.6.0=pyhd8ed1ab_0
+ - pyopenssl=22.0.0=pyhd3eb1b0_0
+ - pyparsing=3.0.9=pyhd8ed1ab_0
+ - pysocks=1.7.1=py310h06a4308_0
+ - python=3.10.6=haa1d7c7_1
+ - python-dateutil=2.8.2=pyhd8ed1ab_0
+ - python-xxhash=3.0.0=py310h5764c6d_1
+ - python_abi=3.10=2_cp310
+ - pytorch=1.13.0=py3.10_cuda11.7_cudnn8.5.0_0
+ - pytorch-cuda=11.7=h67b0de4_0
+ - pytorch-mutex=1.0=cuda
+ - pytz=2022.6=pyhd8ed1ab_0
+ - pyu2f=0.1.5=pyhd8ed1ab_0
+ - pyyaml=6.0=py310h5764c6d_4
+ - re2=2022.04.01=h27087fc_0
+ - readline=8.2=h5eee18b_0
+ - regex=2022.7.9=py310h5eee18b_0
+ - requests=2.28.1=py310h06a4308_0
+ - requests-oauthlib=1.3.1=pyhd8ed1ab_0
+ - responses=0.18.0=pyhd8ed1ab_0
+ - rsa=4.9=pyhd8ed1ab_0
+ - sacremoses=0.0.53=pyhd8ed1ab_0
+ - scikit-learn=1.1.3=py310h6a678d5_0
+ - scipy=1.9.3=py310hd5efca6_0
+ - seqeval=1.2.2=pyhd3deb0d_0
+ - setuptools=65.4.0=py310h06a4308_0
+ - six=1.16.0=pyhd3eb1b0_1
+ - snappy=1.1.9=hbd366e4_1
+ - sqlite=3.39.3=h5082296_0
+ - tensorboard=2.10.1=pyhd8ed1ab_0
+ - tensorboard-data-server=0.6.0=py310h597c629_2
+ - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
+ - threadpoolctl=3.1.0=pyh8a188c0_0
+ - tk=8.6.12=h1ccaba5_0
+ - tokenizers=0.11.4=py310h3dcd8bd_1
+ - torchaudio=0.13.0=py310_cu117
+ - torchtext=0.14.0=py310
+ - torchvision=0.14.0=py310_cu117
+ - tqdm=4.64.1=py310h06a4308_0
+ - transformers=4.24.0=pyhd8ed1ab_0
+ - typing-extensions=4.3.0=py310h06a4308_0
+ - typing_extensions=4.3.0=py310h06a4308_0
+ - tzdata=2022e=h04d1e81_0
+ - urllib3=1.26.12=py310h06a4308_0
+ - utf8proc=2.6.1=h27cfd23_0
+ - werkzeug=2.2.2=pyhd8ed1ab_0
+ - wheel=0.37.1=pyhd3eb1b0_0
+ - xxhash=0.8.0=h7f98852_3
+ - xz=5.2.6=h5eee18b_0
+ - yaml=0.2.5=h7f98852_2
+ - yarl=1.7.2=py310h5764c6d_2
+ - zipp=3.10.0=pyhd8ed1ab_0
+ - zlib=1.2.13=h5eee18b_0
+ - zstd=1.5.2=ha4553b6_0
nlptools/install_env.py ADDED
@@ -0,0 +1,13 @@
+ import os
+ import subprocess
+
+ def main():
+     # Determine the path to the 'environment.yml' file within the package
+     package_dir = os.path.dirname(__file__)
+     env_file = os.path.join(package_dir, 'environment.yml')
+
+     # Create the conda environment using the 'environment.yml' file
+     subprocess.call(["conda", "env", "create", "-f", env_file])
+
+ if __name__ == "__main__":
+     main()
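For reference, a minimal usage sketch (not part of the package) of the installer above: calling main() simply runs conda env create -f on the environment.yml bundled next to the module, so conda is assumed to already be installed and on PATH.

from nlptools.install_env import main

# Creates the "arabicner" conda environment declared in the bundled environment.yml;
# assumes conda is available on PATH.
main()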
nlptools/morphology/ALMA_multi_word.py ADDED
@@ -0,0 +1,34 @@
+ from nlptools.morphology import settings
+ from nlptools.utils.parser import arStrip
+ import json
+
+
+ def ALMA_multi_word(multi_word):
+     undiac_multi_word = arStrip(multi_word, True, True, True, False, True, False)  # diacs, smallDiacs, shaddah, digit, alif, specialChars
+     result_word = []
+     if undiac_multi_word in settings.div_dic.keys():
+         result_word = settings.div_dic[undiac_multi_word]
+
+     my_json = {}
+     glosses_list = []
+     output_list = []
+     concept_count = 0
+     my_json['multi_word_lemma'] = multi_word
+     my_json['undiac_multi_word_lemma'] = multi_word
+     ids = []
+     if result_word != []:
+         #my_json['concept_count'] = result_word[0][1] #concept_count
+         #my_json['POS'] = result_word[0][2] #POS
+         my_json['POS'] = result_word[0][1]  # POS
+
+         for result in result_word:
+             ids.append(result[3])
+             #if lemma_id in settings.glosses_dic.keys():
+             #    value = settings.glosses_dic[lemma_id]
+             #    glosses_list.append(json.loads(value[1]))
+             #    concept_count = concept_count + value[0]
+     my_json['ids'] = ids
+     #my_json['concept_count'] = concept_count
+     #my_json['glosses'] = glosses_list
+     output_list.append(my_json)
+     return output_list
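A hedged usage sketch of the lookup above (not taken from the package docs): it assumes the lemma dictionary pickle (lemmas_dic.pickle) has already been downloaded so that settings.div_dic is populated when nlptools.morphology is imported, and the multi-word expression is only a hypothetical input.

from nlptools.morphology.ALMA_multi_word import ALMA_multi_word

entry = ALMA_multi_word('عبد الله')[0]      # hypothetical multi-word expression
print(entry['multi_word_lemma'])            # the input expression
print(entry.get('POS'), entry.get('ids'))   # POS tag and lemma ids, when a dictionary match exists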
nlptools/morphology/__init__.py ADDED
@@ -0,0 +1,52 @@
+ from nlptools.morphology import settings
+ import pickle
+ from nlptools.DataDownload import downloader
+ import os
+
+ #filename = 'ALMA27012000.pickle'
+ #path = downloader.get_appdatadir()
+ #file_path = os.path.join(path, filename)
+ #with open(file_path, 'rb') as f:
+ #    # Load the serialized data from the file
+ #    settings.div_dic = pickle.load(f)
+
+
+ filename = 'lemmas_dic.pickle'
+ path = downloader.get_appdatadir()
+ file_path = os.path.join(path, filename)
+ with open(file_path, 'rb') as f:
+     # Load the serialized data from the file
+     settings.div_dic = pickle.load(f)
+
+
+ #filename_five = 'five_grams.pickle'
+ #path = downloader.get_appdatadir()
+ #file_path = os.path.join(path, filename_five)
+ #with open(file_path, 'rb') as f:
+ #    # Load the serialized data from the file
+ #    settings.five_grams_dict = pickle.load(f, encoding='utf-8')
+ #
+ #
+ #filename_four = 'four_grams.pickle'
+ #path = downloader.get_appdatadir()
+ #file_path = os.path.join(path, filename_four)
+ #with open(file_path, 'rb') as f:
+ #    # Load the serialized data from the file
+ #    settings.four_grams_dict = pickle.load(f, encoding='utf-8')
+ #
+ #
+ #filename_three = 'three_grams.pickle'
+ #path = downloader.get_appdatadir()
+ #file_path = os.path.join(path, filename_three)
+ #with open(file_path, 'rb') as f:
+ #    # Load the serialized data from the file
+ #    settings.three_grams_dict = pickle.load(f, encoding='utf-8')
+ #
+ #
+ #filename_two = 'two_grams.pickle'
+ #path = downloader.get_appdatadir()
+ #file_path = os.path.join(path, filename_two)
+ #with open(file_path, 'rb') as f:
+ #    # Load the serialized data from the file
+ #    settings.two_grams_dict = pickle.load(f, encoding='utf-8')
+ #
nlptools/morphology/charsets.py ADDED
@@ -0,0 +1,60 @@
+ # -*- coding: utf-8 -*-
+ # We acknowledge that this file charsets.py is imported from Camel Tools. citation. url
+ #
+
+ import unicodedata
+
+ from six import unichr
+
+
+ UNICODE_PUNCT_CHARSET = frozenset(
+     [unichr(x) for x in range(65536) if unicodedata.category(
+         unichr(x))[0] == 'P'])
+ UNICODE_SYMBOL_CHARSET = frozenset(
+     [unichr(x) for x in range(65536) if unicodedata.category(
+         unichr(x))[0] == 'S'])
+ UNICODE_PUNCT_SYMBOL_CHARSET = UNICODE_PUNCT_CHARSET | UNICODE_SYMBOL_CHARSET
+
+ UNICODE_LETTER_CHARSET = frozenset(
+     [unichr(x) for x in range(65536) if unicodedata.category(
+         unichr(x))[0] == 'L'])
+ UNICODE_MARK_CHARSET = frozenset(
+     [unichr(x) for x in range(65536) if unicodedata.category(
+         unichr(x))[0] == 'M'])
+ UNICODE_NUMBER_CHARSET = frozenset(
+     [unichr(x) for x in range(65536) if unicodedata.category(
+         unichr(x))[0] == 'N'])
+ UNICODE_LETTER_MARK_NUMBER_CHARSET = (UNICODE_LETTER_CHARSET |
+                                       UNICODE_MARK_CHARSET |
+                                       UNICODE_NUMBER_CHARSET)
+
+ AR_LETTERS_CHARSET = frozenset(u'\u0621\u0622\u0623\u0624\u0625\u0626\u0627'
+                                u'\u0628\u0629\u062a\u062b\u062c\u062d\u062e'
+                                u'\u062f\u0630\u0631\u0632\u0633\u0634\u0635'
+                                u'\u0636\u0637\u0638\u0639\u063a\u0640\u0641'
+                                u'\u0642\u0643\u0644\u0645\u0646\u0647\u0648'
+                                u'\u0649\u064a\u0671\u067e\u0686\u06a4\u06af')
+ AR_DIAC_CHARSET = frozenset(u'\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652'
+                             u'\u0670\u0640')
+ AR_CHARSET = AR_LETTERS_CHARSET | AR_DIAC_CHARSET
+
+ BW_LETTERS_CHARSET = frozenset(u'$&\'*<>ADEGHJPSTVYZ_bdfghjklmnpqrstvwxyz{|}')
+ BW_DIAC_CHARSET = frozenset(u'FKN`aiou~_')
+ BW_CHARSET = BW_LETTERS_CHARSET | BW_DIAC_CHARSET
+
+ SAFEBW_LETTERS_CHARSET = frozenset(u'ABCDEGHIJLMOPQSTVWYZ_bcdefghjklmnpqrstvwx'
+                                    u'yz')
+ SAFEBW_DIAC_CHARSET = frozenset(u'FKNaeiou~_')
+ SAFEBW_CHARSET = SAFEBW_LETTERS_CHARSET | SAFEBW_DIAC_CHARSET
+
+ XMLBW_LETTERS_CHARSET = frozenset(u'$\'*ABDEGHIJOPSTWYZ_bdfghjklmnpqrstvwxyz{|'
+                                   u'}')
+ XMLBW_DIAC_CHARSET = frozenset(u'FKN`aiou~_')
+ XMLBW_CHARSET = XMLBW_LETTERS_CHARSET | XMLBW_DIAC_CHARSET
+
+ HSB_LETTERS_CHARSET = frozenset(u'\'ADHST_bcdfghjklmnpqrstvwxyz'
+                                 u'\u00c2\u00c4\u00e1\u00f0\u00fd\u0100\u0102'
+                                 u'\u010e\u0127\u0161\u0175\u0177\u03b3\u03b8'
+                                 u'\u03c2')
+ HSB_DIAC_CHARSET = frozenset(u'.aiu~\u00c4\u00e1\u00e3\u0129\u0169_')
+ HSB_CHARSET = HSB_LETTERS_CHARSET | HSB_DIAC_CHARSET
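As an illustration (a minimal sketch, not part of the package), AR_CHARSET above can be used to test whether a token consists only of Arabic letters and diacritics; this is the same pattern that morph_analyzer.py builds in the next file.

import re
from nlptools.morphology.charsets import AR_CHARSET

# Match tokens made up entirely of characters from AR_CHARSET.
_IS_AR_RE = re.compile(u'^[' + re.escape(u''.join(AR_CHARSET)) + u']+$')

def is_arabic_token(token):
    return _IS_AR_RE.match(token) is not None

print(is_arabic_token('مدرسة'))   # True
print(is_arabic_token('school'))  # False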
nlptools/morphology/morph_analyzer.py ADDED
@@ -0,0 +1,170 @@
+
+ from nlptools.morphology import settings
+ import re
+ from nlptools.morphology.tokenizers_words import simple_word_tokenize
+ from nlptools.utils.parser import arStrip
+ from nlptools.morphology.charsets import AR_CHARSET, AR_DIAC_CHARSET
+
+ _IS_AR_RE = re.compile(u'^[' + re.escape(u''.join(AR_CHARSET)) + u']+$')
+
+ def find_solution(token, language, task):
+     """
+     Given a token, this method finds the morphological solution (lemma and/or pos) based on a specific language and task.
+
+     Args:
+         token (:obj:`str`): The Arabic token to be morphologically analyzed.
+         language (:obj:`str`): In the current version, only `MSA` is supported.
+         task (:obj:`str`): The task to filter the results by [lemmatizer, pos, full]. The default task if not specified is `full`.
+
+     Returns:
+         list (:obj:`list`): A list of [token, lemma, pos], where:
+             token: the original input token
+             lemma: the lemma of the token
+             pos: the part-of-speech of the token
+     Note:
+         If no solution is found for this token, an empty list is returned.
+     """
+
+     if token in settings.div_dic.keys():
+         resulted_solutions = []
+         solutions = settings.div_dic[token]
+         for solution in solutions:
+             resulted_solutions.append([token, solution[0], solution[1], solution[3]])
+         return resulted_solutions
+     else:
+         return []
+
+ def analyze(text, language='MSA', task='full'):
+     """
+     This method takes a text as input and returns a morphological solution for each token in this text, based on the input language and task, such that:
+         if the task is lemmatizer, then the morphological solution is only the lemma.
+         if the task is pos, then the morphological solution is only the pos.
+         if the task is full, then the morphological solution is both the lemma and the pos.
+
+     Args:
+         text (:obj:`str`): The Arabic text to be morphologically analyzed.
+         language (:obj:`str`): In the current version, only `MSA` is supported.
+         task (:obj:`str`): The task to filter the results by [lemmatizer, pos, full]. The default task if not specified is `full`.
+
+     Returns:
+         list (:obj:`list`): A list of [token, lemma, pos], based on the specified task, where:
+             token: the original input token
+             lemma: the lemma of the token
+             pos: the part-of-speech of the token
+
+     **Example:**
+
+     .. highlight:: python
+     .. code-block:: python
+
+         from nlptools.morphology import morph_analyzer
+
+         # Return the morphological solution for each token in this text
+         # Example: task = full
+         morph_analyzer.analyze('ذهب الولد الى المدرسة')
+
+         [['ذهب', 'ذَهَبَ۪ 1', 'فعل'],
+          ['الولد', 'وَلَد 1', 'اسم'],
+          ['الى', 'إِلَى 1', 'كلمة وظيفية'],
+          ['المدرسة', 'مَدْرَسَة 1', 'اسم']]
+
+         # Example: task = pos
+         morph_analyzer.analyze('ذهب الولد الى المدرسة', task='pos')
+         # the output
+         [['ذهب', 'فعل'], ['الولد', 'اسم'], ['الى', 'كلمة وظيفية'], ['المدرسة', 'اسم']]
+
+         # Example: task = lemmatizer
+         morph_analyzer.analyze('طار العصور فوق الشجرة', task='lemmatizer')
+         # the output
+         [['طار', 'طارِ۪ 1'],
+          ['العصور', 'عَصْر 1'],
+          ['فوق', 'فَوْق 1'],
+          ['الشجرة', 'شَجَرَة 1']]
+     """
+
+     # @check if the init does not load data correctly, call load_alma inside
+     output_list = []
+
+     tokens = simple_word_tokenize(text)
+
+     for token in tokens:
+         result_token = []
+         token = arStrip(token, False, True, False, False, False, False)
+         token = re.sub('[ٱ]', 'ﺍ', token)
+         solution = [token, token + "_0", "", 0]
+
+         if token.isdigit():
+             solution[2] = "digit"  # pos
+
+         elif not _is_ar(token):
+             solution[2] = "Foreign"  # pos
+
+         # elif re.match("^[a-zA-Z]*$", token):
+         #     solution[2] = "Foreign"  # pos
+
+         else:
+             result_token = find_solution(token, language, task)
+
+             if result_token == []:
+                 token_without_al = re.sub(r'^[ﻝ]', '', re.sub(r'^[ﺍ]', '', token))
+                 if len(token_without_al) > 5:
+                     result_token = find_solution(token_without_al, language, task)
+
+             if result_token == []:
+                 # try with ﻩ replaced by ﺓ
+                 result_token = find_solution(re.sub(r'[ﻩ]$', 'ﺓ', token), language, task)
+
+
+             if result_token == []:
+                 # try with unified Alef
+                 word_with_unify_alef = arStrip(token, False, False, False, False, True, False)  # Unify Alef
+                 result_token = find_solution(word_with_unify_alef, language, task)
+
+             if result_token == []:
+                 # try with diacritics removed
+                 word_undiac = arStrip(token, True, False, True, True, False, False)  # remove diacs, shaddah, digit
+                 result_token = find_solution(word_undiac, language, task)
+
+             if result_token == []:
+                 # try with diacritics removed and Alef unified
+                 word_undiac = arStrip(token, True, True, True, False, True, False)  # diacs, smallDiacs, shaddah, alif
+                 result_token = find_solution(word_undiac, language, task)
+
+         if result_token != []:
+             output_list.append(result_token)
+         else:
+             # if no solution is found
+             output_list.append([solution])
+
+     return filter_results(task, output_list)
+
+
+ def filter_results(task, lst):
+     if task == 'lemmatizer':
+         return remove_items_by_index(lst, [2])
+     elif task == 'pos':
+         return remove_items_by_index(lst, [1])
+     else:
+         return lst
+
+
+ def remove_items_by_index(lst, index_list):
+     for inner_list in lst:
+         for index in sorted(index_list, reverse=True):
+             if len(inner_list) > index:
+                 inner_list.pop(index)
+     return lst
+
+
+ def _is_ar(word):
+     return _IS_AR_RE.match(word) is not None
nlptools/morphology/settings.py ADDED
@@ -0,0 +1,8 @@
+ flag = True
+ div_dic = {}
+ lemma_source = "DIC"
+
+ #two_grams_dict = {}
+ #three_grams_dict = {}
+ #four_grams_dict = {}
+ #five_grams_dict = {}
nlptools/morphology/tokenizers_words.py ADDED
@@ -0,0 +1,19 @@
+ # This code was taken from Camel Tools without any change
+
+ # -*- coding: utf-8 -*-
+
+
+ import re
+ from nlptools.morphology.charsets import UNICODE_PUNCT_SYMBOL_CHARSET
+ from nlptools.morphology.charsets import UNICODE_LETTER_MARK_NUMBER_CHARSET
+
+
+ _ALL_PUNCT = u''.join(UNICODE_PUNCT_SYMBOL_CHARSET)
+ _ALL_LETTER_MARK_NUMBER = u''.join(UNICODE_LETTER_MARK_NUMBER_CHARSET)
+ _TOKENIZE_RE = re.compile(r'[' + re.escape(_ALL_PUNCT) + r']|[' +
+                           re.escape(_ALL_LETTER_MARK_NUMBER) + r']+')
+
+
+ def simple_word_tokenize(sentence):
+
+     return _TOKENIZE_RE.findall(sentence)
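A brief usage sketch (not from the package docs) of the tokenizer above: the regex keeps runs of letters, marks, and digits together and emits each punctuation or symbol character as its own token.

from nlptools.morphology.tokenizers_words import simple_word_tokenize

print(simple_word_tokenize('ذهب الولد، الى المدرسة!'))
# expected: ['ذهب', 'الولد', '،', 'الى', 'المدرسة', '!']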
nlptools/nlptools.py ADDED
@@ -0,0 +1 @@
+ """Main module."""
nlptools/salma/__init__.py ADDED
@@ -0,0 +1,12 @@
+ from nlptools.salma import settings
+ import pickle
+ from nlptools.DataDownload import downloader
+ import os
+
+ #filename = 'glosses_dic.pickle'
+ #path = downloader.get_appdatadir()
+ #file_path = os.path.join(path, filename)
+ #with open(file_path, 'rb') as f:
+ #    # Load the serialized data from the file
+ #    settings.glosses_dic = pickle.load(f)
+ settings.glosses_dic = {}
nlptools/salma/settings.py ADDED
@@ -0,0 +1,31 @@
+ from transformers import BertTokenizer, BertForSequenceClassification
+ import warnings
+ warnings.filterwarnings("ignore")
+ import pandas as pd
+
+
+
+
+ from nlptools.DataDownload import downloader
+ import os
+
+ glosses_dic = {}
+
+ model_file_name = "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"
+ path = downloader.get_appdatadir()
+ model_file_path = os.path.join(path, model_file_name)
+
+ tokenizer_file_name = "bert-base-arabertv02"
+ path = downloader.get_appdatadir()
+ tokenizer_file_path = os.path.join(path, tokenizer_file_name)
+
+ dftrue = pd.DataFrame()
+
+ # model = BertForSequenceClassification.from_pretrained('{}'.format("bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"),
+ #                                                       output_hidden_states = True,
+ #                                                       num_labels=2
+ #                                                       )
+
+ model = BertForSequenceClassification.from_pretrained(model_file_path, output_hidden_states=True, num_labels=2)
+
+ tokenizer = BertTokenizer.from_pretrained('{}'.format(tokenizer_file_path))