SinaTools-0.1.1-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
  2. SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
  3. SinaTools-0.1.1.dist-info/LICENSE +22 -0
  4. SinaTools-0.1.1.dist-info/METADATA +72 -0
  5. SinaTools-0.1.1.dist-info/RECORD +122 -0
  6. SinaTools-0.1.1.dist-info/WHEEL +6 -0
  7. SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
  8. SinaTools-0.1.1.dist-info/top_level.txt +1 -0
  9. nlptools/CLI/DataDownload/download_files.py +71 -0
  10. nlptools/CLI/arabiner/bin/infer.py +117 -0
  11. nlptools/CLI/arabiner/bin/infer2.py +81 -0
  12. nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
  13. nlptools/CLI/morphology/morph_analyzer.py +91 -0
  14. nlptools/CLI/salma/salma_tools.py +68 -0
  15. nlptools/CLI/utils/__init__.py +0 -0
  16. nlptools/CLI/utils/arStrip.py +99 -0
  17. nlptools/CLI/utils/corpus_tokenizer.py +74 -0
  18. nlptools/CLI/utils/implication.py +92 -0
  19. nlptools/CLI/utils/jaccard.py +96 -0
  20. nlptools/CLI/utils/latin_remove.py +51 -0
  21. nlptools/CLI/utils/remove_Punc.py +53 -0
  22. nlptools/CLI/utils/sentence_tokenizer.py +90 -0
  23. nlptools/CLI/utils/text_transliteration.py +77 -0
  24. nlptools/DataDownload/__init__.py +0 -0
  25. nlptools/DataDownload/downloader.py +185 -0
  26. nlptools/VERSION +1 -0
  27. nlptools/__init__.py +5 -0
  28. nlptools/arabert/__init__.py +1 -0
  29. nlptools/arabert/arabert/__init__.py +14 -0
  30. nlptools/arabert/arabert/create_classification_data.py +260 -0
  31. nlptools/arabert/arabert/create_pretraining_data.py +534 -0
  32. nlptools/arabert/arabert/extract_features.py +444 -0
  33. nlptools/arabert/arabert/lamb_optimizer.py +158 -0
  34. nlptools/arabert/arabert/modeling.py +1027 -0
  35. nlptools/arabert/arabert/optimization.py +202 -0
  36. nlptools/arabert/arabert/run_classifier.py +1078 -0
  37. nlptools/arabert/arabert/run_pretraining.py +593 -0
  38. nlptools/arabert/arabert/run_squad.py +1440 -0
  39. nlptools/arabert/arabert/tokenization.py +414 -0
  40. nlptools/arabert/araelectra/__init__.py +1 -0
  41. nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
  42. nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
  43. nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
  44. nlptools/arabert/araelectra/configure_finetuning.py +172 -0
  45. nlptools/arabert/araelectra/configure_pretraining.py +143 -0
  46. nlptools/arabert/araelectra/finetune/__init__.py +14 -0
  47. nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
  48. nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
  49. nlptools/arabert/araelectra/finetune/scorer.py +54 -0
  50. nlptools/arabert/araelectra/finetune/task.py +74 -0
  51. nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
  52. nlptools/arabert/araelectra/flops_computation.py +215 -0
  53. nlptools/arabert/araelectra/model/__init__.py +14 -0
  54. nlptools/arabert/araelectra/model/modeling.py +1029 -0
  55. nlptools/arabert/araelectra/model/optimization.py +193 -0
  56. nlptools/arabert/araelectra/model/tokenization.py +355 -0
  57. nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
  58. nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
  59. nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
  60. nlptools/arabert/araelectra/run_finetuning.py +323 -0
  61. nlptools/arabert/araelectra/run_pretraining.py +469 -0
  62. nlptools/arabert/araelectra/util/__init__.py +14 -0
  63. nlptools/arabert/araelectra/util/training_utils.py +112 -0
  64. nlptools/arabert/araelectra/util/utils.py +109 -0
  65. nlptools/arabert/aragpt2/__init__.py +2 -0
  66. nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
  67. nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
  68. nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
  69. nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
  70. nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
  71. nlptools/arabert/aragpt2/grover/__init__.py +0 -0
  72. nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
  73. nlptools/arabert/aragpt2/grover/modeling.py +803 -0
  74. nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
  75. nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
  76. nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
  77. nlptools/arabert/aragpt2/grover/utils.py +234 -0
  78. nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
  79. nlptools/arabert/preprocess.py +818 -0
  80. nlptools/arabiner/__init__.py +0 -0
  81. nlptools/arabiner/bin/__init__.py +14 -0
  82. nlptools/arabiner/bin/eval.py +87 -0
  83. nlptools/arabiner/bin/infer.py +91 -0
  84. nlptools/arabiner/bin/process.py +140 -0
  85. nlptools/arabiner/bin/train.py +221 -0
  86. nlptools/arabiner/data/__init__.py +1 -0
  87. nlptools/arabiner/data/datasets.py +146 -0
  88. nlptools/arabiner/data/transforms.py +118 -0
  89. nlptools/arabiner/nn/BaseModel.py +22 -0
  90. nlptools/arabiner/nn/BertNestedTagger.py +34 -0
  91. nlptools/arabiner/nn/BertSeqTagger.py +17 -0
  92. nlptools/arabiner/nn/__init__.py +3 -0
  93. nlptools/arabiner/trainers/BaseTrainer.py +117 -0
  94. nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
  95. nlptools/arabiner/trainers/BertTrainer.py +163 -0
  96. nlptools/arabiner/trainers/__init__.py +3 -0
  97. nlptools/arabiner/utils/__init__.py +0 -0
  98. nlptools/arabiner/utils/data.py +124 -0
  99. nlptools/arabiner/utils/helpers.py +151 -0
  100. nlptools/arabiner/utils/metrics.py +69 -0
  101. nlptools/environment.yml +227 -0
  102. nlptools/install_env.py +13 -0
  103. nlptools/morphology/ALMA_multi_word.py +34 -0
  104. nlptools/morphology/__init__.py +52 -0
  105. nlptools/morphology/charsets.py +60 -0
  106. nlptools/morphology/morph_analyzer.py +170 -0
  107. nlptools/morphology/settings.py +8 -0
  108. nlptools/morphology/tokenizers_words.py +19 -0
  109. nlptools/nlptools.py +1 -0
  110. nlptools/salma/__init__.py +12 -0
  111. nlptools/salma/settings.py +31 -0
  112. nlptools/salma/views.py +459 -0
  113. nlptools/salma/wsd.py +126 -0
  114. nlptools/utils/__init__.py +0 -0
  115. nlptools/utils/corpus_tokenizer.py +73 -0
  116. nlptools/utils/implication.py +662 -0
  117. nlptools/utils/jaccard.py +247 -0
  118. nlptools/utils/parser.py +147 -0
  119. nlptools/utils/readfile.py +3 -0
  120. nlptools/utils/sentence_tokenizer.py +53 -0
  121. nlptools/utils/text_transliteration.py +232 -0
  122. nlptools/utils/utils.py +2 -0
+++ SinaTools-0.1.1.data/data/nlptools/environment.yml
@@ -0,0 +1,227 @@
+ name: arabicner
+ channels:
+ - anaconda
+ - pytorch
+ - nvidia
+ - conda-forge
+ - defaults
+ dependencies:
+ - _libgcc_mutex=0.1=main
+ - _openmp_mutex=5.1=1_gnu
+ - abseil-cpp=20211102.0=h27087fc_1
+ - absl-py=1.3.0=pyhd8ed1ab_0
+ - aiohttp=3.8.1=py310h5764c6d_1
+ - aiosignal=1.2.0=pyhd8ed1ab_0
+ - arrow-cpp=8.0.0=py310h3098874_0
+ - async-timeout=4.0.2=py310h06a4308_0
+ - attrs=22.1.0=pyh71513ae_1
+ - aws-c-common=0.4.57=he6710b0_1
+ - aws-c-event-stream=0.1.6=h2531618_5
+ - aws-checksums=0.1.9=he6710b0_0
+ - aws-sdk-cpp=1.8.185=hce553d0_0
+ - blas=1.0=mkl
+ - blinker=1.5=pyhd8ed1ab_0
+ - boost-cpp=1.78.0=he72f1d9_0
+ - bottleneck=1.3.5=py310ha9d4c09_0
+ - brotli=1.0.9=h166bdaf_7
+ - brotli-bin=1.0.9=h166bdaf_7
+ - brotlipy=0.7.0=py310h7f8727e_1002
+ - bzip2=1.0.8=h7b6447c_0
+ - c-ares=1.18.1=h7f98852_0
+ - ca-certificates=2022.9.24=ha878542_0
+ - cachetools=5.2.0=pyhd8ed1ab_0
+ - certifi=2022.9.24=pyhd8ed1ab_0
+ - cffi=1.15.1=py310h74dc2b5_0
+ - charset-normalizer=2.0.4=pyhd3eb1b0_0
+ - click=8.1.3=unix_pyhd8ed1ab_2
+ - cryptography=38.0.1=py310h9ce1e76_0
+ - cuda=11.7.1=0
+ - cuda-cccl=11.7.91=0
+ - cuda-command-line-tools=11.7.1=0
+ - cuda-compiler=11.7.1=0
+ - cuda-cudart=11.7.99=0
+ - cuda-cudart-dev=11.7.99=0
+ - cuda-cuobjdump=11.7.91=0
+ - cuda-cupti=11.7.101=0
+ - cuda-cuxxfilt=11.7.91=0
+ - cuda-demo-suite=11.8.86=0
+ - cuda-documentation=11.8.86=0
+ - cuda-driver-dev=11.7.99=0
+ - cuda-gdb=11.8.86=0
+ - cuda-libraries=11.7.1=0
+ - cuda-libraries-dev=11.7.1=0
+ - cuda-memcheck=11.8.86=0
+ - cuda-nsight=11.8.86=0
+ - cuda-nsight-compute=11.8.0=0
+ - cuda-nvcc=11.7.99=0
+ - cuda-nvdisasm=11.8.86=0
+ - cuda-nvml-dev=11.7.91=0
+ - cuda-nvprof=11.8.87=0
+ - cuda-nvprune=11.7.91=0
+ - cuda-nvrtc=11.7.99=0
+ - cuda-nvrtc-dev=11.7.99=0
+ - cuda-nvtx=11.7.91=0
+ - cuda-nvvp=11.8.87=0
+ - cuda-runtime=11.7.1=0
+ - cuda-sanitizer-api=11.8.86=0
+ - cuda-toolkit=11.7.1=0
+ - cuda-tools=11.7.1=0
+ - cuda-visual-tools=11.7.1=0
+ - dataclasses=0.8=pyhc8e2a94_3
+ - datasets=2.6.1=pyhd8ed1ab_0
+ - dill=0.3.5.1=pyhd8ed1ab_0
+ - ffmpeg=4.3=hf484d3e_0
+ - fftw=3.3.10=nompi_h77c792f_102
+ - filelock=3.8.0=pyhd8ed1ab_0
+ - freetype=2.12.1=h4a9f257_0
+ - frozenlist=1.2.0=py310h7f8727e_1
+ - fsspec=2022.10.0=pyhd8ed1ab_0
+ - gds-tools=1.4.0.31=0
+ - gflags=2.2.2=he1b5a44_1004
+ - giflib=5.2.1=h7b6447c_0
+ - glog=0.6.0=h6f12383_0
+ - gmp=6.2.1=h295c915_3
+ - gnutls=3.6.15=he1e5248_0
+ #- google-auth=2.14.0=pyh1a96a4e_0
+ #- google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
+ - grpc-cpp=1.46.1=h33aed49_0
+ - grpcio=1.42.0=py310hce63b2e_0
+ - huggingface_hub=0.10.1=pyhd8ed1ab_0
+ - icu=70.1=h27087fc_0
+ - idna=3.4=py310h06a4308_0
+ - importlib-metadata=5.0.0=pyha770c72_1
+ - importlib_metadata=5.0.0=hd8ed1ab_1
+ - intel-openmp=2021.4.0=h06a4308_3561
+ - joblib=1.2.0=pyhd8ed1ab_0
+ - jpeg=9e=h7f8727e_0
+ - keyutils=1.6.1=h166bdaf_0
+ - krb5=1.19.3=h3790be6_0
+ - lame=3.100=h7b6447c_0
+ - lcms2=2.12=h3be6417_0
+ - ld_impl_linux-64=2.38=h1181459_1
+ - lerc=3.0=h295c915_0
+ - libbrotlicommon=1.0.9=h166bdaf_7
+ - libbrotlidec=1.0.9=h166bdaf_7
+ - libbrotlienc=1.0.9=h166bdaf_7
+ - libcublas=11.11.3.6=0
+ - libcublas-dev=11.11.3.6=0
+ - libcufft=10.9.0.58=0
+ - libcufft-dev=10.9.0.58=0
+ - libcufile=1.4.0.31=0
+ - libcufile-dev=1.4.0.31=0
+ - libcurand=10.3.0.86=0
+ - libcurl=7.85.0=h91b91d3_0
+ - libcusolver=11.4.1.48=0
+ - libcusolver-dev=11.4.1.48=0
+ - libcusparse=11.7.5.86=0
+ - libcusparse-dev=11.7.5.86=0
+ - libdeflate=1.8=h7f8727e_5
+ - libedit=3.1.20191231=he28a2e2_2
+ - libev=4.33=h516909a_1
+ - libevent=2.1.10=h9b69904_4
+ - libffi=3.3=he6710b0_2
+ - libgcc-ng=11.2.0=h1234567_1
+ - libgfortran-ng=12.2.0=h69a702a_19
+ - libgfortran5=12.2.0=h337968e_19
+ - libgomp=11.2.0=h1234567_1
+ - libiconv=1.16=h7f8727e_2
+ - libidn2=2.3.2=h7f8727e_0
+ - libnghttp2=1.46.0=hce63b2e_0
+ - libnpp=11.8.0.86=0
+ - libnpp-dev=11.8.0.86=0
+ - libnvjpeg=11.9.0.86=0
+ - libnvjpeg-dev=11.9.0.86=0
+ - libpng=1.6.37=hbc83047_0
+ - libprotobuf=3.20.1=h4ff587b_0
+ - libssh2=1.10.0=ha56f1ee_2
+ - libstdcxx-ng=11.2.0=h1234567_1
+ - libtasn1=4.16.0=h27cfd23_0
+ - libthrift=0.15.0=he6d91bd_0
+ - libtiff=4.4.0=hecacb30_0
+ - libunistring=0.9.10=h27cfd23_0
+ - libuuid=1.0.3=h7f8727e_2
+ - libwebp=1.2.4=h11a3e52_0
+ - libwebp-base=1.2.4=h5eee18b_0
+ - lz4-c=1.9.3=h295c915_1
+ - markdown=3.4.1=pyhd8ed1ab_0
+ - markupsafe=2.1.1=py310h5764c6d_1
+ - mkl=2021.4.0=h06a4308_640
+ - mkl-service=2.4.0=py310h7f8727e_0
+ - mkl_fft=1.3.1=py310hd6ae3a3_0
+ - mkl_random=1.2.2=py310h00e6091_0
+ - multidict=6.0.2=py310h5764c6d_1
+ - multiprocess=0.70.12.2=py310h5764c6d_2
+ - natsort=7.1.1=pyhd3eb1b0_0
+ - ncurses=6.3=h5eee18b_3
+ - nettle=3.7.3=hbbd107a_1
+ - nsight-compute=2022.3.0.22=0
+ - numexpr=2.8.3=py310hcea2de6_0
+ - numpy=1.23.3=py310hd5efca6_0
+ #- numpy-base=1.23.3=py310h8e6c178_0
+ - oauthlib=3.2.2=pyhd8ed1ab_0
+ - openh264=2.1.1=h4ff587b_0
+ - openssl=1.1.1s=h7f8727e_0
+ - orc=1.7.4=h07ed6aa_0
+ - packaging=21.3=pyhd8ed1ab_0
+ - pandas=1.4.4=py310h6a678d5_0
+ - pillow=9.2.0=py310hace64e9_1
+ - pip=22.2.2=py310h06a4308_0
+ - protobuf=3.20.1=py310hd8f1fbe_0
+ - pyarrow=8.0.0=py310h468efa6_0
+ - pyasn1=0.4.8=py_0
+ - pyasn1-modules=0.2.7=py_0
+ - pycparser=2.21=pyhd3eb1b0_0
+ - pyjwt=2.6.0=pyhd8ed1ab_0
+ - pyopenssl=22.0.0=pyhd3eb1b0_0
+ - pyparsing=3.0.9=pyhd8ed1ab_0
+ - pysocks=1.7.1=py310h06a4308_0
+ - python=3.10.6=haa1d7c7_1
+ - python-dateutil=2.8.2=pyhd8ed1ab_0
+ - python-xxhash=3.0.0=py310h5764c6d_1
+ - python_abi=3.10=2_cp310
+ - pytorch=1.13.0=py3.10_cuda11.7_cudnn8.5.0_0
+ - pytorch-cuda=11.7=h67b0de4_0
+ - pytorch-mutex=1.0=cuda
+ - pytz=2022.6=pyhd8ed1ab_0
+ - pyu2f=0.1.5=pyhd8ed1ab_0
+ - pyyaml=6.0=py310h5764c6d_4
+ - re2=2022.04.01=h27087fc_0
+ - readline=8.2=h5eee18b_0
+ - regex=2022.7.9=py310h5eee18b_0
+ - requests=2.28.1=py310h06a4308_0
+ - requests-oauthlib=1.3.1=pyhd8ed1ab_0
+ - responses=0.18.0=pyhd8ed1ab_0
+ - rsa=4.9=pyhd8ed1ab_0
+ - sacremoses=0.0.53=pyhd8ed1ab_0
+ - scikit-learn=1.1.3=py310h6a678d5_0
+ - scipy=1.9.3=py310hd5efca6_0
+ - seqeval=1.2.2=pyhd3deb0d_0
+ - setuptools=65.4.0=py310h06a4308_0
+ - six=1.16.0=pyhd3eb1b0_1
+ - snappy=1.1.9=hbd366e4_1
+ - sqlite=3.39.3=h5082296_0
+ - tensorboard=2.10.1=pyhd8ed1ab_0
+ - tensorboard-data-server=0.6.0=py310h597c629_2
+ - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
+ - threadpoolctl=3.1.0=pyh8a188c0_0
+ - tk=8.6.12=h1ccaba5_0
+ - tokenizers=0.11.4=py310h3dcd8bd_1
+ - torchaudio=0.13.0=py310_cu117
+ - torchtext=0.14.0=py310
+ - torchvision=0.14.0=py310_cu117
+ - tqdm=4.64.1=py310h06a4308_0
+ - transformers=4.24.0=pyhd8ed1ab_0
+ - typing-extensions=4.3.0=py310h06a4308_0
+ - typing_extensions=4.3.0=py310h06a4308_0
+ - tzdata=2022e=h04d1e81_0
+ - urllib3=1.26.12=py310h06a4308_0
+ - utf8proc=2.6.1=h27cfd23_0
+ - werkzeug=2.2.2=pyhd8ed1ab_0
+ - wheel=0.37.1=pyhd3eb1b0_0
+ - xxhash=0.8.0=h7f98852_3
+ - xz=5.2.6=h5eee18b_0
+ - yaml=0.2.5=h7f98852_2
+ - yarl=1.7.2=py310h5764c6d_2
+ - zipp=3.10.0=pyhd8ed1ab_0
+ - zlib=1.2.13=h5eee18b_0
+ - zstd=1.5.2=ha4553b6_0
+++ SinaTools-0.1.1.dist-info/AUTHORS.rst
@@ -0,0 +1,13 @@
+ =======
+ Credits
+ =======
+
+ Development Lead
+ ----------------
+
+ * SinaLab <sina.institute.bzu@gmail.com>
+
+ Contributors
+ ------------
+
+ None yet. Why not be the first?
+++ SinaTools-0.1.1.dist-info/LICENSE
@@ -0,0 +1,22 @@
+ MIT License
+
+ Copyright (c) 2023, SinaLab
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+
+++ SinaTools-0.1.1.dist-info/METADATA
@@ -0,0 +1,72 @@
+ Metadata-Version: 2.1
+ Name: SinaTools
+ Version: 0.1.1
+ Summary: UNKNOWN
+ Home-page: https://github.com/SinaLab/nlptools
+ Author: UNKNOWN
+ Author-email: UNKNOWN
+ License: MIT license
+ Keywords: nlptools
+ Platform: UNKNOWN
+ Requires-Dist: six
+ Requires-Dist: farasapy
+ Requires-Dist: tqdm
+ Requires-Dist: requests
+ Requires-Dist: regex
+ Requires-Dist: pathlib
+ Requires-Dist: torch (==1.13.0)
+ Requires-Dist: transformers (==4.24.0)
+ Requires-Dist: torchtext (==0.14.0)
+ Requires-Dist: torchvision (==0.14.0)
+ Requires-Dist: seqeval (==1.2.2)
+ Requires-Dist: natsort (==7.1.1)
+
+ ========
+ nlptools
+ ========
+
+
+ .. image:: https://img.shields.io/pypi/v/nlptools.svg
+ :target: https://pypi.python.org/pypi/SinaTools
+
+ .. image:: https://img.shields.io/travis/sina_institute/nlptools.svg
+ :target: https://travis-ci.com/sina_institute/SinaTools
+
+ .. image:: https://readthedocs.org/projects/nlptools/badge/?version=latest
+ :target: https://SinaTools.readthedocs.io/en/latest/?version=latest
+ :alt: Documentation Status
+
+
+
+
+ Python Boilerplate contains all the boilerplate you need to create a Python package.
+
+
+ * Free software: MIT license
+ * Documentation: https://SinaTools.readthedocs.io.
+
+
+ Features
+ --------
+
+ * TODO
+
+ Credits
+ -------
+
+ This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template.
+
+ .. _Cookiecutter: https://github.com/audreyr/cookiecutter
+ .. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage
+
+
+ =======
+ History
+ =======
+
+ 0.1.0 (2023-04-15)
+ ------------------
+
+ * First release on PyPI.
+
+
+++ SinaTools-0.1.1.dist-info/RECORD
@@ -0,0 +1,122 @@
+ SinaTools-0.1.1.data/data/nlptools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+ nlptools/VERSION,sha256=Ee4juPwvxhnW6rYnettaUnJhBnrAHM_D4RhX9Vvxi80,5
+ nlptools/__init__.py,sha256=OoA_p_y2jPjMytcUrG1ED5uJlJemVhSRr9L9Wsym-rQ,134
+ nlptools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+ nlptools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
+ nlptools/nlptools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
+ nlptools/CLI/DataDownload/download_files.py,sha256=PMDEPXxZQbrFo-7iyhvrCpzx2RG5T5kPk6NJAwh8RSI,2322
+ nlptools/CLI/arabiner/bin/infer.py,sha256=YrNCVro8B3UxpsHjIo_01qiBQURpDNTK7pKTkw1L21Y,4921
+ nlptools/CLI/arabiner/bin/infer2.py,sha256=CtR9rwe20ks_qq-l_fQU-ThLqft_1o3Ztmd1my1kHMg,3905
+ nlptools/CLI/morphology/ALMA_multi_word.py,sha256=NINts8BtT8BGQPBvs4BJ_y2PsR7czsGPOVAwngaT85A,2644
+ nlptools/CLI/morphology/morph_analyzer.py,sha256=39vrFx6ppu7yEITcz8lAJhk3xHweaPWEqL-CcqBM37Q,3565
+ nlptools/CLI/salma/salma_tools.py,sha256=7awpCb68QUc3kx-EuwRHxDmItZlX2aSdpukwKF1G3Fo,1999
+ nlptools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ nlptools/CLI/utils/arStrip.py,sha256=dzy16wZfSznkvGHHBn5P21EvyusKB55dqrZ4zbaa41w,3621
+ nlptools/CLI/utils/corpus_tokenizer.py,sha256=S0YG8FRS29K1C8eJVEYuWSV1ABS7PKymlNS7KxvYqxI,2817
+ nlptools/CLI/utils/implication.py,sha256=hjYTN0oiLf0bz0bRO_GD4rphZkaB3cH770clFFhuevE,3172
+ nlptools/CLI/utils/jaccard.py,sha256=a6oc28yMgm7UewO6Lz25A4Yv8QEzVa85XF-QV9uhMwI,4639
+ nlptools/CLI/utils/latin_remove.py,sha256=Xw6PB4GtMLLiYK3zTEwdLhBbivMyy1msD5Ab_QdJoQA,1303
+ nlptools/CLI/utils/remove_Punc.py,sha256=dvSiSs9UulhGCogBgtpD8fU860BFuMBTnwa8Ek9aPKQ,1393
+ nlptools/CLI/utils/sentence_tokenizer.py,sha256=AcJa_yRdlQqKMwVWWKSv1vRO1Yk-NK75-NpalkHqewc,3469
+ nlptools/CLI/utils/text_transliteration.py,sha256=blIGB8FeF10iFeXADM-z01XJ4qeB1qgj6S2Xnk9w5fI,2266
+ nlptools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ nlptools/DataDownload/downloader.py,sha256=yONVa99OtPXD5Lewy4Fm3eUiJMpBt492G1JOPh5sXAU,6523
+ nlptools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
+ nlptools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
+ nlptools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
+ nlptools/arabert/arabert/create_classification_data.py,sha256=BhemGNRbYz_Pun0Q5WerN2-9n-ILmU3tm4J-OlHw5-A,7678
+ nlptools/arabert/arabert/create_pretraining_data.py,sha256=2M-cF3CLHbQ0cdWrzFT6Frg1vVP4Y-CFoq8iEPyxgsE,18924
+ nlptools/arabert/arabert/extract_features.py,sha256=C1IzASrlX7u4_M2xdr_PjzWfTRZgklhUXA2WHKgQt-I,15585
+ nlptools/arabert/arabert/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
+ nlptools/arabert/arabert/modeling.py,sha256=KliecCmA1pP3owg0mYge6On3IRHunMF5kMLuEwc0VLw,40896
+ nlptools/arabert/arabert/optimization.py,sha256=Wx0Js6Zsfc3iVw-_7Q1SCnxfP_qqbdTAyFD-vZSpOyk,8153
+ nlptools/arabert/arabert/run_classifier.py,sha256=AdVGyvidlmbEp12b-PauiBo6EmFLEO7tqeJKuLhK2DA,38777
+ nlptools/arabert/arabert/run_pretraining.py,sha256=yO16nKkHDfcYA2Zx7vv8KN4te6_1qFOzyVeDzFT-DQw,21894
+ nlptools/arabert/arabert/run_squad.py,sha256=PORxgiByP8L6vZqAFkqgHPJ_ZjAlqlg64gtkdLmDNns,53456
+ nlptools/arabert/arabert/tokenization.py,sha256=R6xkyCb8_vgeksXiLeqDvV5vOnLb1cPNsvfDij6YVFk,14132
+ nlptools/arabert/araelectra/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
+ nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py,sha256=pIo6VFT3XXOYroZaab3msZAP6XjCKu0KcrIZQA0Pj8U,3881
+ nlptools/arabert/araelectra/build_pretraining_dataset.py,sha256=Z8ZmKznaE_2SPDRoPYR1SDhjTN_NTpNCFFuhUkykwl8,9041
+ nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py,sha256=W7HFr1XoO6bCDR7X7w-bOuwULFtTSjeKbJ2LHzzHf9k,3224
+ nlptools/arabert/araelectra/configure_finetuning.py,sha256=YfGLMdgN6Qqm357Mzy5UMjkuLPPWtBs7f4dA-DKE6JM,7768
+ nlptools/arabert/araelectra/configure_pretraining.py,sha256=oafQgu4WmVdxBcU5mSfXhPlvCk43CJwAWXC10Q58BlI,5801
+ nlptools/arabert/araelectra/flops_computation.py,sha256=krHTeuPH9xQu5ldprBOPJNlJRvC7fmmvXXqUjfWrzPE,9499
+ nlptools/arabert/araelectra/run_finetuning.py,sha256=JecbrSmGikBNyid4JKRZ49Rm5xFpt02WfgIIcs3TpcU,12976
+ nlptools/arabert/araelectra/run_pretraining.py,sha256=1K2aAFTY0p3iaLY0xkhTlm6v0B-Zun8SwEzz-K6RXM4,20665
+ nlptools/arabert/araelectra/finetune/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
+ nlptools/arabert/araelectra/finetune/feature_spec.py,sha256=cqNlBa2KK_G1-vkKm1EJUv6BoS3gesCUAHwVagZB6wM,1888
+ nlptools/arabert/araelectra/finetune/preprocessing.py,sha256=1mf7-IxknCRsobQZ-VV1zs4Cwt-mfOtoVxysDJa9LZ0,6657
+ nlptools/arabert/araelectra/finetune/scorer.py,sha256=PjRg0P5ANCtul2ute7ccq3mRCCoIAoCb-lVLlwd4rVY,1571
+ nlptools/arabert/araelectra/finetune/task.py,sha256=zM8M4PGSIrY2u6ytpmkQEXxG-jjoeN9wouEyVR23qeQ,1991
+ nlptools/arabert/araelectra/finetune/task_builder.py,sha256=Zsoiuw5M3Ca8QhaZVLVLZyWw09K5R75UeMuPmazMlHI,2768
+ nlptools/arabert/araelectra/model/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
+ nlptools/arabert/araelectra/model/modeling.py,sha256=5XLIutnmr-SFQOV_XntJ-U5evSCY-J2e9NjvlwVXKkk,40877
+ nlptools/arabert/araelectra/model/optimization.py,sha256=BCMb_C5hgBw7wC9ZR8AQ4lwoPopqLIcSiqcCrIjx9XU,7254
+ nlptools/arabert/araelectra/model/tokenization.py,sha256=9CkyPzs3L6OEPzN-7EWQDNQmW2mIJoZD4o1rn6xLdL4,11082
+ nlptools/arabert/araelectra/pretrain/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
+ nlptools/arabert/araelectra/pretrain/pretrain_data.py,sha256=NLgIcLAq1-MgtBNXYu_isDxnOY5k67SyADYy-8nzBok,5413
+ nlptools/arabert/araelectra/pretrain/pretrain_helpers.py,sha256=nFl7LEdxAU5kKwiodqJHzi-ty9jMFsCCNYOF__A69j8,9255
+ nlptools/arabert/araelectra/util/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
+ nlptools/arabert/araelectra/util/training_utils.py,sha256=7h_J1ljUWM0ynBcofEtjZWL_oAfZtTxEemQLkixgn-0,4142
+ nlptools/arabert/araelectra/util/utils.py,sha256=G0UAETUCZMlU9R9ASD9AXrWZeodWI1aZJEE9F-goaH4,2591
+ nlptools/arabert/aragpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
+ nlptools/arabert/aragpt2/create_pretraining_data.py,sha256=fFa2_DAyTwc8L2IqQbshsh_Ia26nj1qtVLzC6DooSac,3105
+ nlptools/arabert/aragpt2/train_bpe_tokenizer.py,sha256=b-8zHQ02fLmZV4GfjnrPptwjpX259F41SlnWzBrflMA,1888
+ nlptools/arabert/aragpt2/gpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
+ nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
+ nlptools/arabert/aragpt2/gpt2/optimization.py,sha256=iqh23cypRSRUt53wt2G5SbNNpJMwERM7gZAOKVh5l4U,8411
+ nlptools/arabert/aragpt2/gpt2/run_pretraining.py,sha256=4jjkUbvTO1DHoKJ89yKtlkkofcND_fyAunQ-mlnJhTM,13298
+ nlptools/arabert/aragpt2/grover/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ nlptools/arabert/aragpt2/grover/dataloader.py,sha256=-FWPTjtsvweEE1WaWRHBXfOSbsGiUmnXT3qK7KJP8cM,6853
+ nlptools/arabert/aragpt2/grover/modeling.py,sha256=XcUvFwqRaxAwWiJstrH2FPBvDJe03pTWIyipdMfWj9g,38280
+ nlptools/arabert/aragpt2/grover/modeling_gpt2.py,sha256=WFpCWn1792yATFzt8rZ0rpWvExfbLzV2BqiEs7llFUw,51602
+ nlptools/arabert/aragpt2/grover/optimization_adafactor.py,sha256=1geOsCWuv5xxtSnKDz9a8aY5SVwZ1MGq-xVQDBg4Gpg,9765
+ nlptools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKweY1aPz1XPDw6odld0,7102
+ nlptools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
+ nlptools/arabiner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ nlptools/arabiner/bin/__init__.py,sha256=d1ToN2uheCCVby3TjiSuD1dqo_pvNIuTgz4COFr2Khs,438
+ nlptools/arabiner/bin/eval.py,sha256=ihtjJinY1jXpZXW5bQJzTC5MF6_V3GQ5zHzsc691_HQ,2591
+ nlptools/arabiner/bin/infer.py,sha256=EZKeq4zucIE-ooHYnegODNxsRiIY_gY5GvDPChH5WRQ,3237
+ nlptools/arabiner/bin/process.py,sha256=4QCZjsmYV5lep6waQE37fs7Fe59_1G5seIJLDkArg4s,4698
+ nlptools/arabiner/bin/train.py,sha256=hf6ZRhqMZ7bFealMSusBjtWrbzHGHc5HB2Lh4rp2uQA,6390
+ nlptools/arabiner/data/__init__.py,sha256=XPic1bPijmZda_LPFL5J6TOps_IHUTiBDJvMx-iJqKo,61
+ nlptools/arabiner/data/datasets.py,sha256=p52Uc8Q2D3EzN1OmoHQcWVsJ2oB3TqgTzAcy1B9fJ68,5068
+ nlptools/arabiner/data/transforms.py,sha256=KPCDdjZOEvhMC38eiFwJuiQC84cfDrvC0XM4Ye0o3do,4878
+ nlptools/arabiner/nn/BaseModel.py,sha256=3GmujQasTZZunOBuFXpY2p1W8W256iI_Uu4hxhOY2Z0,608
+ nlptools/arabiner/nn/BertNestedTagger.py,sha256=7vU2tmDSoqSHn6GvMJmyN0hEMLvCkbr_r-AaiAaYdw8,1223
+ nlptools/arabiner/nn/BertSeqTagger.py,sha256=dFcBBiMw2QCWsyy7aQDe_PS3aRuNn4DOxKIHgTblFvc,504
+ nlptools/arabiner/nn/__init__.py,sha256=ZN7Psm83pysUhGI3ZSaJra2aCYBZb9DZ0UX4CiKGc0A,182
+ nlptools/arabiner/trainers/BaseTrainer.py,sha256=oZgFJW-CawfCKT5gtaBHA7Q7XjNfiyqM62KnFsgVzPU,3919
+ nlptools/arabiner/trainers/BertNestedTrainer.py,sha256=hVqPRdmaHf2iwftseNpgsAfwGkl6eMHJx1rKunQS_vM,8443
+ nlptools/arabiner/trainers/BertTrainer.py,sha256=KkcgZXu6kqsrrnfFtiAQ8ucLsrQtDxLRqdbTiTnRWqI,6447
+ nlptools/arabiner/trainers/__init__.py,sha256=kt8WqsaOjX0h1JMa-v7Y9ywT5mfwQIsZTyVWnIAWsEQ,200
+ nlptools/arabiner/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ nlptools/arabiner/utils/data.py,sha256=uuPiu-7v0gccNygZjdTKomJGE7X0H9FC24Y9nHZpf4c,4376
+ nlptools/arabiner/utils/helpers.py,sha256=PyOOlx5uabvZVmU3SZtZ3ZLA3pliinJ3JXsvos9SUWU,5032
+ nlptools/arabiner/utils/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
+ nlptools/morphology/ALMA_multi_word.py,sha256=hlzZCk-uUdZ-GbiPsFxDTvoWoIuVof2Sm7NdaxaFipM,1313
+ nlptools/morphology/__init__.py,sha256=z6_RGhiyfNHXNKMmhNSI6ObTLmdjQyP58vsFottI8GA,1706
+ nlptools/morphology/charsets.py,sha256=7w9OrbnZTnLU3A9q-SUi9GhUN97qNtbYR5T0Pm72cF8,2784
+ nlptools/morphology/morph_analyzer.py,sha256=OmCxm4-fM2qfYzKk8yOd6D_T3RsfzZCcd7Oz2V4Advg,6507
+ nlptools/morphology/settings.py,sha256=sEZdnA7MiYXHdxrfHWXop1RcKClVzpOYzZwzHC1PxJ8,144
+ nlptools/morphology/tokenizers_words.py,sha256=Smtt_KXifl2wRI464Qn07PtUvOsyGBJjZ7E20gd8zVM,602
+ nlptools/salma/__init__.py,sha256=pOauGjD-xrGHw05sNx3EiSFc_wpM3bD1vJxQHoDDXOA,376
+ nlptools/salma/settings.py,sha256=fqAQg2b22gorzT9Pf_AEJD9p8AlVUaVyKD3FH8g2yUs,1110
+ nlptools/salma/views.py,sha256=EH1vc6P88CeAIzQKt7EU_HTI0uJipv4JdXiAX5NjrJY,18416
+ nlptools/salma/wsd.py,sha256=kmP5ZvvVMkxApgk91TAGSBkMJZbPPbS0qoNk8OE37og,4434
+ nlptools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ nlptools/utils/corpus_tokenizer.py,sha256=IDWh87XJaFa7V2P4kIxY4QVywPKhz0fIErc_c0gJGUU,4581
+ nlptools/utils/implication.py,sha256=Ro1Vw62oOBzELkX-zpHyieq4v2OsoyFrFeTU7BiK7qc,27794
+ nlptools/utils/jaccard.py,sha256=TTC5KTVv6kONw5vZtzxEQvv7QM79BCsD0xcJAY0T5tU,10111
+ nlptools/utils/parser.py,sha256=0Yd40CZf4wXso2q-d9LULUNAVUAMdiYMImfcVb6i9qQ,6175
+ nlptools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
+ nlptools/utils/sentence_tokenizer.py,sha256=3C0Wx1ns8ZHiGwKlUkcti-8zA3fB4ju0fIEtGACM7oU,2162
+ nlptools/utils/text_transliteration.py,sha256=zhB3sFXSMtkkdqImRMVg415AAB80DOm9lMFKb2IBynw,8765
+ nlptools/utils/utils.py,sha256=vKkFOkYclMu8nXS_VZb6Kobx8QGKW9onXkkLCeiRb6g,32
+ SinaTools-0.1.1.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+ SinaTools-0.1.1.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+ SinaTools-0.1.1.dist-info/METADATA,sha256=G-49Kky9vazLGwQcFV-lbFQ_tb2PzwidvGdTN3wTG_c,1577
+ SinaTools-0.1.1.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+ SinaTools-0.1.1.dist-info/entry_points.txt,sha256=9-PNkvWGCid8SVN03S2NkJFuxAzvcB22tGpHe-et2q8,951
+ SinaTools-0.1.1.dist-info/top_level.txt,sha256=sREDI6iHe4D0BZQmZbZ-LxYIn2cBWUayk9CZwAR9jaE,9
+ SinaTools-0.1.1.dist-info/RECORD,,
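Each RECORD row has the form `path,sha256=<digest>,size`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with the trailing `=` padding stripped, per the wheel spec. A minimal sketch for recomputing one of these digests from an installed file (the path below is illustrative):

.. code-block:: python

    # Minimal sketch: recompute a RECORD-style digest for an installed file.
    # The digest format (urlsafe base64 of the raw SHA-256, '=' padding
    # stripped) follows the wheel spec.
    import base64
    import hashlib

    def record_digest(path):
        with open(path, 'rb') as f:
            raw = hashlib.sha256(f.read()).digest()
        return 'sha256=' + base64.urlsafe_b64encode(raw).rstrip(b'=').decode('ascii')

    # For nlptools/VERSION this should print
    # sha256=Ee4juPwvxhnW6rYnettaUnJhBnrAHM_D4RhX9Vvxi80
    print(record_digest('nlptools/VERSION'))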
+++ SinaTools-0.1.1.dist-info/WHEEL
@@ -0,0 +1,6 @@
+ Wheel-Version: 1.0
+ Generator: bdist_wheel (0.34.2)
+ Root-Is-Purelib: true
+ Tag: py2-none-any
+ Tag: py3-none-any
+
+++ SinaTools-0.1.1.dist-info/entry_points.txt
@@ -0,0 +1,18 @@
+ [console_scripts]
+ arabi_ner = nlptools.CLI.arabiner.bin.infer:main
+ arabi_ner2 = nlptools.CLI.arabiner.bin.infer2:main
+ install_env = nlptools.install_env:main
+ sina_alma_multi_word = nlptools.CLI.morphology.ALMA_multi_word:main
+ sina_appdatadir = nlptools.CLI.DataDownload.get_appdatadir:main
+ sina_arStrip = nlptools.CLI.utils.arStrip:main
+ sina_corpus_tokenizer = nlptools.CLI.utils.corpus_tokenizer:main
+ sina_download_files = nlptools.CLI.DataDownload.download_files:main
+ sina_implication = nlptools.CLI.utils.implication:main
+ sina_jaccard_similarity = nlptools.CLI.utils.jaccard:main
+ sina_morph_analyze = nlptools.CLI.morphology.morph_analyzer:main
+ sina_remove_latin = nlptools.CLI.utils.latin_remove:main
+ sina_remove_punctuation = nlptools.CLI.utils.remove_Punc:main
+ sina_salma = nlptools.CLI.salma.salma_tools:main
+ sina_sentence_tokenize = nlptools.CLI.utils.sentence_tokenizer:main
+ sina_transliterate = nlptools.CLI.utils.text_transliteration:main
+
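Each line above maps a console command to a `module:function` pair; on installation, pip generates one wrapper executable per entry. A minimal sketch for listing these commands from an environment where SinaTools is installed (the `group=` filter on `entry_points` requires Python 3.10+):

.. code-block:: python

    # Minimal sketch: list the SinaTools console scripts at runtime.
    # Assumes SinaTools is installed; entry_points(group=...) needs Python 3.10+.
    from importlib.metadata import entry_points

    for ep in entry_points(group='console_scripts'):
        if ep.value.startswith('nlptools.'):
            print(f'{ep.name} -> {ep.value}')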
+++ SinaTools-0.1.1.dist-info/top_level.txt
@@ -0,0 +1 @@
+ nlptools
+++ nlptools/CLI/DataDownload/download_files.py
@@ -0,0 +1,71 @@
+ """
+ About:
+ ------
+
+ The sina_download_files tool is a command-line interface for downloading various NLP resources from pre-specified URLs. It is part of the nlptools package and provides an option to choose which files to download; the download directory is resolved automatically based on the operating system. The tool automatically handles file extraction for zip and tar.gz files.
+
+ Usage:
+ ------
+
+ Below is the usage information that can be generated by running sina_download_files --help.
+
+ .. code-block:: none
+
+     Usage:
+       sina_download_files [OPTIONS]
+
+ .. code-block:: none
+
+     Options:
+       -f, --files FILES
+           Names of the files to download. Available files are: ner, morph, salma_model, salma_tokenizer, glosses_dic, lemma_dic, five_grams, four_grams, three_grams, two_grams.
+           If no file is specified, all files will be downloaded.
+
+ Examples:
+ ---------
+
+ .. code-block:: none
+
+     sina_download_files -f morph ner
+     This command will download only the `morph` and `ner` files to the default directory.
+
+ Note:
+ -----
+
+ .. code-block:: none
+
+     - The script automatically handles the extraction of zip and tar.gz files after downloading.
+     - Ensure you have the necessary permissions to write to the specified directory.
+     - The default download directory is based on the operating system and can be obtained using the `get_appdatadir` function.
+
+
+ """
+
+ import argparse
+ from nlptools.DataDownload.downloader import download_file
+ from nlptools.DataDownload.downloader import download_files
+ from nlptools.DataDownload.downloader import get_appdatadir
+ from nlptools.DataDownload.downloader import urls
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Download files from specified URLs.")
+     parser.add_argument('-f', '--files', nargs="*", choices=urls.keys(),
+                         help="Names of the files to download. Available files are: "
+                         f"{', '.join(urls.keys())}. If no file is specified, all files will be downloaded.")
+
+     get_appdatadir()
+
+     args = parser.parse_args()
+
+     if args.files:
+         for file in args.files:
+             url = urls[file]
+             download_file(url)
+     else:
+         download_files()
+
+ if __name__ == '__main__':
+     main()
+
+ # sina_download_files -f morph ner
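The CLI above is a thin wrapper around `nlptools.DataDownload.downloader`, so the same resources can be fetched programmatically. A minimal sketch, assuming the imported functions behave the way `main()` above uses them:

.. code-block:: python

    # Minimal sketch: fetch resources without the CLI, mirroring main() above.
    from nlptools.DataDownload.downloader import download_file, get_appdatadir, urls

    get_appdatadir()                # resolve the per-OS data directory
    for name in ('morph', 'ner'):   # same files as `sina_download_files -f morph ner`
        download_file(urls[name])   # downloads and extracts zip/tar.gz archives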
+++ nlptools/CLI/arabiner/bin/infer.py
@@ -0,0 +1,107 @@
+ """
+ About:
+ ------
+ The ArabiNER tool carries out Named Entity Recognition (NER) using the ArabiNER utility from the SinaTools suite. It identifies named entities and returns a comprehensive analysis in JSON format if the input is text, or in a CSV file if the input is a directory of files.
+
+ Usage:
+ ------
+ Below is the usage information that can be generated by running arabi_ner --help.
+
+ .. code-block:: none
+
+     arabi_ner --text=INPUT_TEXT
+     arabi_ner --dir=INPUT_DIR --output_csv=OUTPUT_FILE_NAME
+
+ Options:
+ --------
+
+ .. code-block:: none
+
+     --text INPUT_TEXT
+         The text to be analyzed for Named Entity Recognition.
+     --dir INPUT_DIR
+         Directory containing the text files to be analyzed for Named Entity Recognition.
+     --output_csv OUTPUT_FILE_NAME
+         Output CSV file containing the tokenized text and its Named Entity tags.
+
+ Examples:
+ ---------
+
+ .. code-block:: none
+
+     arabi_ner --text "Your text here"
+     arabi_ner --dir "/path/to/your/directory" --output_csv "output.csv"
+
+ Note:
+ -----
+
+ .. code-block:: none
+
+     - Ensure that the text input is encoded in UTF-8 or a compatible format.
+     - The tool returns results in JSON format with indentation for readability.
+     - The quality and accuracy of the analysis depend on the underlying ArabiNER utility.
+
+ """
+
+ import argparse
+ import json
+ import pandas as pd
+ from nlptools.arabiner.bin.infer import ner
+ from nlptools.utils.corpus_tokenizer import corpus_tokenizer
+ from nlptools.morphology.tokenizers_words import simple_word_tokenize
+
+
+ def infer(sentence):
+     # ner() returns one (token, tag) pair per word; keep only the tags
+     output = ner(sentence)
+     return [word[1] for word in output]
+
+
+ def main():
+     parser = argparse.ArgumentParser(description='NER Analysis using ArabiNER')
+
+     parser.add_argument('--text', type=str, help='Text to be analyzed for Named Entity Recognition')
+     parser.add_argument('--dir', type=str, help='Directory containing the text files to be analyzed for Named Entity Recognition')
+     parser.add_argument('--output_csv', type=str, help='Output CSV file to write the results')
+
+     args = parser.parse_args()
+
+     if args.text is not None:
+         results = ner(args.text)
+         # Print the results in JSON format
+         print(json.dumps(results, ensure_ascii=False, indent=4))
+     elif args.dir is not None:
+         corpus_tokenizer(args.dir, args.output_csv)
+         df = pd.read_csv(args.output_csv)
+         df['NER tags'] = None
+         i = 0  # index of the next untagged token row in df
+
+         # The CSV has one row per token; deduplicate to recover unique sentences
+         result = df.drop_duplicates(subset=['Global Sentence ID', 'Sentence'])
+         unique_sentences = result['Sentence'].to_numpy()
+
+         for sentence in unique_sentences:  # iterate over unique sentences
+             ner_tags = infer(sentence)  # all NER tags for this sentence
+             if len(simple_word_tokenize(sentence)) > 300:
+                 print("Length of this sentence is more than 300 words:", sentence)
+                 return
+             # Assign the sentence's tags to its block of token rows (.loc is inclusive)
+             df.loc[i:i+len(ner_tags)-1, 'NER tags'] = ner_tags
+             i = i + len(ner_tags)
+
+         df.to_csv(args.output_csv, index=False)
+     else:
+         print("Error: Either --text or --dir argument must be provided.")
+         return
+
+
+ if __name__ == '__main__':
+     main()
+
+ # arabi_ner --text "Your text here."
+ # arabi_ner --dir /path/to/your/directory --output_csv output.csv
+
+ # Each unique sentence in the CSV is processed once by infer() to get the NER tags
+ # for all of its words. The counter `i` tracks the position in the token-level
+ # dataframe, ensuring each word row receives the correct NER tag. The final CSV
+ # contains an additional column, `NER tags`, with one tag per word of `Sentence`.
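The same tagger is importable directly. Judging from `infer()` above, `ner()` returns one indexable `(token, tag)` pair per word; a minimal sketch of programmatic use under that assumption (the sample sentence is illustrative):

.. code-block:: python

    # Minimal sketch: call the tagger from Python instead of the CLI.
    # Based on infer() above, ner() appears to yield (token, tag) pairs.
    from nlptools.arabiner.bin.infer import ner

    results = ner('ولد أحمد في القدس')
    tags = [word[1] for word in results]   # same access pattern as infer()
    print(tags)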