SinaTools 0.1.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
- SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
- SinaTools-0.1.1.dist-info/LICENSE +22 -0
- SinaTools-0.1.1.dist-info/METADATA +72 -0
- SinaTools-0.1.1.dist-info/RECORD +122 -0
- SinaTools-0.1.1.dist-info/WHEEL +6 -0
- SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.1.dist-info/top_level.txt +1 -0
- nlptools/CLI/DataDownload/download_files.py +71 -0
- nlptools/CLI/arabiner/bin/infer.py +117 -0
- nlptools/CLI/arabiner/bin/infer2.py +81 -0
- nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
- nlptools/CLI/morphology/morph_analyzer.py +91 -0
- nlptools/CLI/salma/salma_tools.py +68 -0
- nlptools/CLI/utils/__init__.py +0 -0
- nlptools/CLI/utils/arStrip.py +99 -0
- nlptools/CLI/utils/corpus_tokenizer.py +74 -0
- nlptools/CLI/utils/implication.py +92 -0
- nlptools/CLI/utils/jaccard.py +96 -0
- nlptools/CLI/utils/latin_remove.py +51 -0
- nlptools/CLI/utils/remove_Punc.py +53 -0
- nlptools/CLI/utils/sentence_tokenizer.py +90 -0
- nlptools/CLI/utils/text_transliteration.py +77 -0
- nlptools/DataDownload/__init__.py +0 -0
- nlptools/DataDownload/downloader.py +185 -0
- nlptools/VERSION +1 -0
- nlptools/__init__.py +5 -0
- nlptools/arabert/__init__.py +1 -0
- nlptools/arabert/arabert/__init__.py +14 -0
- nlptools/arabert/arabert/create_classification_data.py +260 -0
- nlptools/arabert/arabert/create_pretraining_data.py +534 -0
- nlptools/arabert/arabert/extract_features.py +444 -0
- nlptools/arabert/arabert/lamb_optimizer.py +158 -0
- nlptools/arabert/arabert/modeling.py +1027 -0
- nlptools/arabert/arabert/optimization.py +202 -0
- nlptools/arabert/arabert/run_classifier.py +1078 -0
- nlptools/arabert/arabert/run_pretraining.py +593 -0
- nlptools/arabert/arabert/run_squad.py +1440 -0
- nlptools/arabert/arabert/tokenization.py +414 -0
- nlptools/arabert/araelectra/__init__.py +1 -0
- nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
- nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
- nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
- nlptools/arabert/araelectra/configure_finetuning.py +172 -0
- nlptools/arabert/araelectra/configure_pretraining.py +143 -0
- nlptools/arabert/araelectra/finetune/__init__.py +14 -0
- nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
- nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
- nlptools/arabert/araelectra/finetune/scorer.py +54 -0
- nlptools/arabert/araelectra/finetune/task.py +74 -0
- nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
- nlptools/arabert/araelectra/flops_computation.py +215 -0
- nlptools/arabert/araelectra/model/__init__.py +14 -0
- nlptools/arabert/araelectra/model/modeling.py +1029 -0
- nlptools/arabert/araelectra/model/optimization.py +193 -0
- nlptools/arabert/araelectra/model/tokenization.py +355 -0
- nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
- nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
- nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
- nlptools/arabert/araelectra/run_finetuning.py +323 -0
- nlptools/arabert/araelectra/run_pretraining.py +469 -0
- nlptools/arabert/araelectra/util/__init__.py +14 -0
- nlptools/arabert/araelectra/util/training_utils.py +112 -0
- nlptools/arabert/araelectra/util/utils.py +109 -0
- nlptools/arabert/aragpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
- nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
- nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
- nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
- nlptools/arabert/aragpt2/grover/__init__.py +0 -0
- nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
- nlptools/arabert/aragpt2/grover/modeling.py +803 -0
- nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
- nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
- nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
- nlptools/arabert/aragpt2/grover/utils.py +234 -0
- nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
- nlptools/arabert/preprocess.py +818 -0
- nlptools/arabiner/__init__.py +0 -0
- nlptools/arabiner/bin/__init__.py +14 -0
- nlptools/arabiner/bin/eval.py +87 -0
- nlptools/arabiner/bin/infer.py +91 -0
- nlptools/arabiner/bin/process.py +140 -0
- nlptools/arabiner/bin/train.py +221 -0
- nlptools/arabiner/data/__init__.py +1 -0
- nlptools/arabiner/data/datasets.py +146 -0
- nlptools/arabiner/data/transforms.py +118 -0
- nlptools/arabiner/nn/BaseModel.py +22 -0
- nlptools/arabiner/nn/BertNestedTagger.py +34 -0
- nlptools/arabiner/nn/BertSeqTagger.py +17 -0
- nlptools/arabiner/nn/__init__.py +3 -0
- nlptools/arabiner/trainers/BaseTrainer.py +117 -0
- nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
- nlptools/arabiner/trainers/BertTrainer.py +163 -0
- nlptools/arabiner/trainers/__init__.py +3 -0
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +124 -0
- nlptools/arabiner/utils/helpers.py +151 -0
- nlptools/arabiner/utils/metrics.py +69 -0
- nlptools/environment.yml +227 -0
- nlptools/install_env.py +13 -0
- nlptools/morphology/ALMA_multi_word.py +34 -0
- nlptools/morphology/__init__.py +52 -0
- nlptools/morphology/charsets.py +60 -0
- nlptools/morphology/morph_analyzer.py +170 -0
- nlptools/morphology/settings.py +8 -0
- nlptools/morphology/tokenizers_words.py +19 -0
- nlptools/nlptools.py +1 -0
- nlptools/salma/__init__.py +12 -0
- nlptools/salma/settings.py +31 -0
- nlptools/salma/views.py +459 -0
- nlptools/salma/wsd.py +126 -0
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/corpus_tokenizer.py +73 -0
- nlptools/utils/implication.py +662 -0
- nlptools/utils/jaccard.py +247 -0
- nlptools/utils/parser.py +147 -0
- nlptools/utils/readfile.py +3 -0
- nlptools/utils/sentence_tokenizer.py +53 -0
- nlptools/utils/text_transliteration.py +232 -0
- nlptools/utils/utils.py +2 -0
nlptools/environment.yml
ADDED
@@ -0,0 +1,227 @@
+name: arabicner
+channels:
+- anaconda
+- pytorch
+- nvidia
+- conda-forge
+- defaults
+dependencies:
+- _libgcc_mutex=0.1=main
+- _openmp_mutex=5.1=1_gnu
+- abseil-cpp=20211102.0=h27087fc_1
+- absl-py=1.3.0=pyhd8ed1ab_0
+- aiohttp=3.8.1=py310h5764c6d_1
+- aiosignal=1.2.0=pyhd8ed1ab_0
+- arrow-cpp=8.0.0=py310h3098874_0
+- async-timeout=4.0.2=py310h06a4308_0
+- attrs=22.1.0=pyh71513ae_1
+- aws-c-common=0.4.57=he6710b0_1
+- aws-c-event-stream=0.1.6=h2531618_5
+- aws-checksums=0.1.9=he6710b0_0
+- aws-sdk-cpp=1.8.185=hce553d0_0
+- blas=1.0=mkl
+- blinker=1.5=pyhd8ed1ab_0
+- boost-cpp=1.78.0=he72f1d9_0
+- bottleneck=1.3.5=py310ha9d4c09_0
+- brotli=1.0.9=h166bdaf_7
+- brotli-bin=1.0.9=h166bdaf_7
+- brotlipy=0.7.0=py310h7f8727e_1002
+- bzip2=1.0.8=h7b6447c_0
+- c-ares=1.18.1=h7f98852_0
+- ca-certificates=2022.9.24=ha878542_0
+- cachetools=5.2.0=pyhd8ed1ab_0
+- certifi=2022.9.24=pyhd8ed1ab_0
+- cffi=1.15.1=py310h74dc2b5_0
+- charset-normalizer=2.0.4=pyhd3eb1b0_0
+- click=8.1.3=unix_pyhd8ed1ab_2
+- cryptography=38.0.1=py310h9ce1e76_0
+- cuda=11.7.1=0
+- cuda-cccl=11.7.91=0
+- cuda-command-line-tools=11.7.1=0
+- cuda-compiler=11.7.1=0
+- cuda-cudart=11.7.99=0
+- cuda-cudart-dev=11.7.99=0
+- cuda-cuobjdump=11.7.91=0
+- cuda-cupti=11.7.101=0
+- cuda-cuxxfilt=11.7.91=0
+- cuda-demo-suite=11.8.86=0
+- cuda-documentation=11.8.86=0
+- cuda-driver-dev=11.7.99=0
+- cuda-gdb=11.8.86=0
+- cuda-libraries=11.7.1=0
+- cuda-libraries-dev=11.7.1=0
+- cuda-memcheck=11.8.86=0
+- cuda-nsight=11.8.86=0
+- cuda-nsight-compute=11.8.0=0
+- cuda-nvcc=11.7.99=0
+- cuda-nvdisasm=11.8.86=0
+- cuda-nvml-dev=11.7.91=0
+- cuda-nvprof=11.8.87=0
+- cuda-nvprune=11.7.91=0
+- cuda-nvrtc=11.7.99=0
+- cuda-nvrtc-dev=11.7.99=0
+- cuda-nvtx=11.7.91=0
+- cuda-nvvp=11.8.87=0
+- cuda-runtime=11.7.1=0
+- cuda-sanitizer-api=11.8.86=0
+- cuda-toolkit=11.7.1=0
+- cuda-tools=11.7.1=0
+- cuda-visual-tools=11.7.1=0
+- dataclasses=0.8=pyhc8e2a94_3
+- datasets=2.6.1=pyhd8ed1ab_0
+- dill=0.3.5.1=pyhd8ed1ab_0
+- ffmpeg=4.3=hf484d3e_0
+- fftw=3.3.10=nompi_h77c792f_102
+- filelock=3.8.0=pyhd8ed1ab_0
+- freetype=2.12.1=h4a9f257_0
+- frozenlist=1.2.0=py310h7f8727e_1
+- fsspec=2022.10.0=pyhd8ed1ab_0
+- gds-tools=1.4.0.31=0
+- gflags=2.2.2=he1b5a44_1004
+- giflib=5.2.1=h7b6447c_0
+- glog=0.6.0=h6f12383_0
+- gmp=6.2.1=h295c915_3
+- gnutls=3.6.15=he1e5248_0
+#- google-auth=2.14.0=pyh1a96a4e_0
+#- google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
+- grpc-cpp=1.46.1=h33aed49_0
+- grpcio=1.42.0=py310hce63b2e_0
+- huggingface_hub=0.10.1=pyhd8ed1ab_0
+- icu=70.1=h27087fc_0
+- idna=3.4=py310h06a4308_0
+- importlib-metadata=5.0.0=pyha770c72_1
+- importlib_metadata=5.0.0=hd8ed1ab_1
+- intel-openmp=2021.4.0=h06a4308_3561
+- joblib=1.2.0=pyhd8ed1ab_0
+- jpeg=9e=h7f8727e_0
+- keyutils=1.6.1=h166bdaf_0
+- krb5=1.19.3=h3790be6_0
+- lame=3.100=h7b6447c_0
+- lcms2=2.12=h3be6417_0
+- ld_impl_linux-64=2.38=h1181459_1
+- lerc=3.0=h295c915_0
+- libbrotlicommon=1.0.9=h166bdaf_7
+- libbrotlidec=1.0.9=h166bdaf_7
+- libbrotlienc=1.0.9=h166bdaf_7
+- libcublas=11.11.3.6=0
+- libcublas-dev=11.11.3.6=0
+- libcufft=10.9.0.58=0
+- libcufft-dev=10.9.0.58=0
+- libcufile=1.4.0.31=0
+- libcufile-dev=1.4.0.31=0
+- libcurand=10.3.0.86=0
+- libcurl=7.85.0=h91b91d3_0
+- libcusolver=11.4.1.48=0
+- libcusolver-dev=11.4.1.48=0
+- libcusparse=11.7.5.86=0
+- libcusparse-dev=11.7.5.86=0
+- libdeflate=1.8=h7f8727e_5
+- libedit=3.1.20191231=he28a2e2_2
+- libev=4.33=h516909a_1
+- libevent=2.1.10=h9b69904_4
+- libffi=3.3=he6710b0_2
+- libgcc-ng=11.2.0=h1234567_1
+- libgfortran-ng=12.2.0=h69a702a_19
+- libgfortran5=12.2.0=h337968e_19
+- libgomp=11.2.0=h1234567_1
+- libiconv=1.16=h7f8727e_2
+- libidn2=2.3.2=h7f8727e_0
+- libnghttp2=1.46.0=hce63b2e_0
+- libnpp=11.8.0.86=0
+- libnpp-dev=11.8.0.86=0
+- libnvjpeg=11.9.0.86=0
+- libnvjpeg-dev=11.9.0.86=0
+- libpng=1.6.37=hbc83047_0
+- libprotobuf=3.20.1=h4ff587b_0
+- libssh2=1.10.0=ha56f1ee_2
+- libstdcxx-ng=11.2.0=h1234567_1
+- libtasn1=4.16.0=h27cfd23_0
+- libthrift=0.15.0=he6d91bd_0
+- libtiff=4.4.0=hecacb30_0
+- libunistring=0.9.10=h27cfd23_0
+- libuuid=1.0.3=h7f8727e_2
+- libwebp=1.2.4=h11a3e52_0
+- libwebp-base=1.2.4=h5eee18b_0
+- lz4-c=1.9.3=h295c915_1
+- markdown=3.4.1=pyhd8ed1ab_0
+- markupsafe=2.1.1=py310h5764c6d_1
+- mkl=2021.4.0=h06a4308_640
+- mkl-service=2.4.0=py310h7f8727e_0
+- mkl_fft=1.3.1=py310hd6ae3a3_0
+- mkl_random=1.2.2=py310h00e6091_0
+- multidict=6.0.2=py310h5764c6d_1
+- multiprocess=0.70.12.2=py310h5764c6d_2
+- natsort=7.1.1=pyhd3eb1b0_0
+- ncurses=6.3=h5eee18b_3
+- nettle=3.7.3=hbbd107a_1
+- nsight-compute=2022.3.0.22=0
+- numexpr=2.8.3=py310hcea2de6_0
+- numpy=1.23.3=py310hd5efca6_0
+#- numpy-base=1.23.3=py310h8e6c178_0
+- oauthlib=3.2.2=pyhd8ed1ab_0
+- openh264=2.1.1=h4ff587b_0
+- openssl=1.1.1s=h7f8727e_0
+- orc=1.7.4=h07ed6aa_0
+- packaging=21.3=pyhd8ed1ab_0
+- pandas=1.4.4=py310h6a678d5_0
+- pillow=9.2.0=py310hace64e9_1
+- pip=22.2.2=py310h06a4308_0
+- protobuf=3.20.1=py310hd8f1fbe_0
+- pyarrow=8.0.0=py310h468efa6_0
+- pyasn1=0.4.8=py_0
+- pyasn1-modules=0.2.7=py_0
+- pycparser=2.21=pyhd3eb1b0_0
+- pyjwt=2.6.0=pyhd8ed1ab_0
+- pyopenssl=22.0.0=pyhd3eb1b0_0
+- pyparsing=3.0.9=pyhd8ed1ab_0
+- pysocks=1.7.1=py310h06a4308_0
+- python=3.10.6=haa1d7c7_1
+- python-dateutil=2.8.2=pyhd8ed1ab_0
+- python-xxhash=3.0.0=py310h5764c6d_1
+- python_abi=3.10=2_cp310
+- pytorch=1.13.0=py3.10_cuda11.7_cudnn8.5.0_0
+- pytorch-cuda=11.7=h67b0de4_0
+- pytorch-mutex=1.0=cuda
+- pytz=2022.6=pyhd8ed1ab_0
+- pyu2f=0.1.5=pyhd8ed1ab_0
+- pyyaml=6.0=py310h5764c6d_4
+- re2=2022.04.01=h27087fc_0
+- readline=8.2=h5eee18b_0
+- regex=2022.7.9=py310h5eee18b_0
+- requests=2.28.1=py310h06a4308_0
+- requests-oauthlib=1.3.1=pyhd8ed1ab_0
+- responses=0.18.0=pyhd8ed1ab_0
+- rsa=4.9=pyhd8ed1ab_0
+- sacremoses=0.0.53=pyhd8ed1ab_0
+- scikit-learn=1.1.3=py310h6a678d5_0
+- scipy=1.9.3=py310hd5efca6_0
+- seqeval=1.2.2=pyhd3deb0d_0
+- setuptools=65.4.0=py310h06a4308_0
+- six=1.16.0=pyhd3eb1b0_1
+- snappy=1.1.9=hbd366e4_1
+- sqlite=3.39.3=h5082296_0
+- tensorboard=2.10.1=pyhd8ed1ab_0
+- tensorboard-data-server=0.6.0=py310h597c629_2
+- tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
+- threadpoolctl=3.1.0=pyh8a188c0_0
+- tk=8.6.12=h1ccaba5_0
+- tokenizers=0.11.4=py310h3dcd8bd_1
+- torchaudio=0.13.0=py310_cu117
+- torchtext=0.14.0=py310
+- torchvision=0.14.0=py310_cu117
+- tqdm=4.64.1=py310h06a4308_0
+- transformers=4.24.0=pyhd8ed1ab_0
+- typing-extensions=4.3.0=py310h06a4308_0
+- typing_extensions=4.3.0=py310h06a4308_0
+- tzdata=2022e=h04d1e81_0
+- urllib3=1.26.12=py310h06a4308_0
+- utf8proc=2.6.1=h27cfd23_0
+- werkzeug=2.2.2=pyhd8ed1ab_0
+- wheel=0.37.1=pyhd3eb1b0_0
+- xxhash=0.8.0=h7f98852_3
+- xz=5.2.6=h5eee18b_0
+- yaml=0.2.5=h7f98852_2
+- yarl=1.7.2=py310h5764c6d_2
+- zipp=3.10.0=pyhd8ed1ab_0
+- zlib=1.2.13=h5eee18b_0
+- zstd=1.5.2=ha4553b6_0
nlptools/install_env.py
ADDED
@@ -0,0 +1,13 @@
+import os
+import subprocess
+
+def main():
+    # Determine the path to the 'environment.yml' file within the package
+    package_dir = os.path.dirname(__file__)
+    env_file = os.path.join(package_dir, 'environment.yml')
+
+    # Create the conda environment using the 'environment.yml' file
+    subprocess.call(["conda", "env", "create", "-f", env_file])
+
+if __name__ == "__main__":
+    main()
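A minimal usage sketch for the installer shipped above (illustrative only, not part of the package diff; it assumes the wheel is installed and that conda is available on PATH):

    # hypothetical invocation; this shells out to `conda env create -f <package_dir>/environment.yml`
    from nlptools.install_env import main

    main()

Because the module uses subprocess.call, a failing conda command is reported through conda's own exit code and output rather than as a Python exception.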
nlptools/morphology/ALMA_multi_word.py
ADDED
@@ -0,0 +1,34 @@
+from nlptools.morphology import settings
+from nlptools.utils.parser import arStrip
+import json
+
+
+def ALMA_multi_word(multi_word):
+    undiac_multi_word = arStrip(multi_word, True, True, True, False, True, False) # diacs , smallDiacs , shaddah , digit , alif , specialChars
+    result_word = []
+    if undiac_multi_word in settings.div_dic.keys():
+        result_word = settings.div_dic[undiac_multi_word]
+
+    my_json = {}
+    glosses_list = []
+    output_list = []
+    concept_count = 0
+    my_json['multi_word_lemma'] = multi_word
+    my_json['undiac_multi_word_lemma'] = multi_word
+    ids = []
+    if result_word != []:
+        #my_json['concept_count'] = result_word[0][1] #concept_count
+        #my_json['POS'] = result_word[0][2] #POS
+        my_json['POS'] = result_word[0][1] #POS
+
+        for result in result_word:
+            ids.append(result[3])
+            #if lemma_id in settings.glosses_dic.keys():
+            #    value = settings.glosses_dic[lemma_id]
+            #    glosses_list.append(json.loads(value[1]))
+            #    concept_count = concept_count + value[0]
+    my_json['ids'] = ids
+    #my_json['concept_count'] = concept_count
+    #my_json['glosses'] = glosses_list
+    output_list.append(my_json)
+    return output_list
nlptools/morphology/__init__.py
ADDED
@@ -0,0 +1,52 @@
+from nlptools.morphology import settings
+import pickle
+from nlptools.DataDownload import downloader
+import os
+
+#filename = 'ALMA27012000.pickle'
+#path =downloader.get_appdatadir()
+#file_path = os.path.join(path, filename)
+#with open(file_path, 'rb') as f:
+#    #Load the serialized data from the file
+#    settings.div_dic = pickle.load(f)
+
+
+filename = 'lemmas_dic.pickle'
+path =downloader.get_appdatadir()
+file_path = os.path.join(path, filename)
+with open(file_path, 'rb') as f:
+    #Load the serialized data from the file
+    settings.div_dic = pickle.load(f)
+
+
+#filename_five = 'five_grams.pickle'
+#path =downloader.get_appdatadir()
+#file_path = os.path.join(path, filename_five)
+#with open(file_path, 'rb') as f:
+#    #Load the serialized data from the file
+#    settings.five_grams_dict = pickle.load(f, encoding='utf-8')
+#
+#
+#filename_four = 'four_grams.pickle'
+#path =downloader.get_appdatadir()
+#file_path = os.path.join(path, filename_four)
+#with open(file_path, 'rb') as f:
+#    #Load the serialized data from the file
+#    settings.four_grams_dict = pickle.load(f, encoding='utf-8')
+#
+#
+#filename_three = 'three_grams.pickle'
+#path =downloader.get_appdatadir()
+#file_path = os.path.join(path, filename_three)
+#with open(file_path, 'rb') as f:
+#    #Load the serialized data from the file
+#    settings.three_grams_dict = pickle.load(f, encoding='utf-8')
+#
+#
+#filename_two = 'two_grams.pickle'
+#path =downloader.get_appdatadir()
+#file_path = os.path.join(path, filename_two)
+#with open(file_path, 'rb') as f:
+#    #Load the serialized data from the file
+#    settings.two_grams_dict = pickle.load(f, encoding='utf-8')
+#
nlptools/morphology/charsets.py
ADDED
@@ -0,0 +1,60 @@
+# -*- coding: utf-8 -*-
+# We acknowledge that this file charsets.py is imported from Camel Tools. citation. url
+#
+
+import unicodedata
+
+from six import unichr
+
+
+UNICODE_PUNCT_CHARSET = frozenset(
+    [unichr(x) for x in range(65536) if unicodedata.category(
+        unichr(x))[0] == 'P'])
+UNICODE_SYMBOL_CHARSET = frozenset(
+    [unichr(x) for x in range(65536) if unicodedata.category(
+        unichr(x))[0] == 'S'])
+UNICODE_PUNCT_SYMBOL_CHARSET = UNICODE_PUNCT_CHARSET | UNICODE_SYMBOL_CHARSET
+
+UNICODE_LETTER_CHARSET = frozenset(
+    [unichr(x) for x in range(65536) if unicodedata.category(
+        unichr(x))[0] == 'L'])
+UNICODE_MARK_CHARSET = frozenset(
+    [unichr(x) for x in range(65536) if unicodedata.category(
+        unichr(x))[0] == 'M'])
+UNICODE_NUMBER_CHARSET = frozenset(
+    [unichr(x) for x in range(65536) if unicodedata.category(
+        unichr(x))[0] == 'N'])
+UNICODE_LETTER_MARK_NUMBER_CHARSET = (UNICODE_LETTER_CHARSET |
+                                      UNICODE_MARK_CHARSET |
+                                      UNICODE_NUMBER_CHARSET)
+
+AR_LETTERS_CHARSET = frozenset(u'\u0621\u0622\u0623\u0624\u0625\u0626\u0627'
+                               u'\u0628\u0629\u062a\u062b\u062c\u062d\u062e'
+                               u'\u062f\u0630\u0631\u0632\u0633\u0634\u0635'
+                               u'\u0636\u0637\u0638\u0639\u063a\u0640\u0641'
+                               u'\u0642\u0643\u0644\u0645\u0646\u0647\u0648'
+                               u'\u0649\u064a\u0671\u067e\u0686\u06a4\u06af')
+AR_DIAC_CHARSET = frozenset(u'\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652'
+                            u'\u0670\u0640')
+AR_CHARSET = AR_LETTERS_CHARSET | AR_DIAC_CHARSET
+
+BW_LETTERS_CHARSET = frozenset(u'$&\'*<>ADEGHJPSTVYZ_bdfghjklmnpqrstvwxyz{|}')
+BW_DIAC_CHARSET = frozenset(u'FKN`aiou~_')
+BW_CHARSET = BW_LETTERS_CHARSET | BW_DIAC_CHARSET
+
+SAFEBW_LETTERS_CHARSET = frozenset(u'ABCDEGHIJLMOPQSTVWYZ_bcdefghjklmnpqrstvwx'
+                                   u'yz')
+SAFEBW_DIAC_CHARSET = frozenset(u'FKNaeiou~_')
+SAFEBW_CHARSET = SAFEBW_LETTERS_CHARSET | SAFEBW_DIAC_CHARSET
+
+XMLBW_LETTERS_CHARSET = frozenset(u'$\'*ABDEGHIJOPSTWYZ_bdfghjklmnpqrstvwxyz{|'
+                                  u'}')
+XMLBW_DIAC_CHARSET = frozenset(u'FKN`aiou~_')
+XMLBW_CHARSET = XMLBW_LETTERS_CHARSET | XMLBW_DIAC_CHARSET
+
+HSB_LETTERS_CHARSET = frozenset(u'\'ADHST_bcdfghjklmnpqrstvwxyz'
+                                u'\u00c2\u00c4\u00e1\u00f0\u00fd\u0100\u0102'
+                                u'\u010e\u0127\u0161\u0175\u0177\u03b3\u03b8'
+                                u'\u03c2')
+HSB_DIAC_CHARSET = frozenset(u'.aiu~\u00c4\u00e1\u00e3\u0129\u0169_')
+HSB_CHARSET = HSB_LETTERS_CHARSET | HSB_DIAC_CHARSET
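For illustration only (the helper is_arabic_word below is hypothetical and not part of the package), AR_CHARSET can be used to test whether a string consists entirely of Arabic letters and diacritics:

    from nlptools.morphology.charsets import AR_CHARSET

    def is_arabic_word(word):
        # hypothetical helper: True when every character is an Arabic letter or diacritic
        return bool(word) and all(c in AR_CHARSET for c in word)

    print(is_arabic_word('كتاب'))  # True
    print(is_arabic_word('book'))  # False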
nlptools/morphology/morph_analyzer.py
ADDED
@@ -0,0 +1,170 @@
+
+from nlptools.morphology import settings
+import re
+from nlptools.morphology.tokenizers_words import simple_word_tokenize
+from nlptools.utils.parser import arStrip
+from nlptools.morphology.charsets import AR_CHARSET, AR_DIAC_CHARSET
+
+_IS_AR_RE = re.compile(u'^[' + re.escape(u''.join(AR_CHARSET)) + u']+$')
+def find_solution(token, language, task):
+    """
+    Given a token, this method finds the morphological solution (lemma and/or pos) based on a specific language and task.
+
+    Args:
+        token (:obj:`str`): The Arabic token to be morphologically analyzed.
+        language (:obj:`str`): In the current version, `MSA` is only supported.
+        task (:obj:`str`): The task to filter the results by [lemmatizer, pos, full]. The default task if not specified is `full`.
+
+    Returns:
+        list (:obj:`list`): A list of [token, lemma, pos], where:
+            token: the original input token
+            lemma: the lemma of the token
+            pos: the part-of-speech of the token
+    Note:
+        If no solution is found for this token, an empty list is returned.
+    """
+
+    if token in settings.div_dic.keys():
+        resulted_solutions = []
+        solutions = settings.div_dic[token]
+        for solution in solutions:
+            resulted_solutions.append([token, solution[0], solution[1], solution[3]])
+        return resulted_solutions
+    else:
+        return []
+
+def analyze(text, language ='MSA', task ='full'):
+    """
+    This method takes a text as input and returns a morphological solution for each token in this text, based on the input language and task, such that:
+    if:
+        the task is lemmatizer, then the morphological solution is only the lemma.
+        the task is pos, then the morphological solution is only the pos.
+        the task is full, then the morphological solution is both the lemma and the pos.
+
+    Args:
+        text (:obj:`str`): The Arabic text to be morphologically analyzed.
+        language (:obj:`str`): In the current version, `MSA` is only supported.
+        task (:obj:`str`): The task to filter the results by [lemmatizer, pos, full]. The default task if not specified is `full`.
+
+    Returns:
+        list (:obj:`list`): A list of [token, lemma, pos], based on the specified task, where:
+            token: the original input token
+            lemma: the lemma of the token
+            pos: the part-of-speech of the token
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from nlptools.morphology import morph_analyzer
+
+        #Return the morphological solution for each token in this text
+        #Example: task = full
+        morph_analyzer.analyze('ذهب الولد الى المدرسة')
+
+        [['ذهب', 'ذَهَبَ۪ 1', 'فعل'],
+        ['الولد', 'وَلَد 1', 'اسم'],
+        ['الى', 'إِلَى 1', 'كلمة وظيفية'],
+        ['المدرسة', 'مَدْرَسَة 1', 'اسم']]
+
+        #Example: task = pos
+        morph_analyzer.analyze('ذهب الولد الى المدرسة',task='pos')
+        #the output
+        [['ذهب', 'فعل'], ['الولد', 'اسم'], ['الى', 'كلمة وظيفية'], ['المدرسة', 'اسم']]
+
+        #Example: task = lemmatizer
+        morph_analyzer.analyze('طار العصور فوق الشجرة', task='lemmatizer')
+        #the output
+        [['طار', 'طارِ۪ 1'],
+        ['العصور', 'عَصْر 1'],
+        ['فوق', 'فَوْق 1'],
+        ['الشجرة', 'شَجَرَة 1']]
+    """
+
+    #@check if the init does not load data correctly, call load_alma inside
+    output_list = []
+
+    tokens = simple_word_tokenize(text)
+
+    for token in tokens:
+        result_token =[]
+        token = arStrip(token , False , True , False , False , False , False)
+        token = re.sub('[ٱ]','ﺍ',token)
+        solution=[token, token+"_0","",0]
+
+        if token.isdigit():
+            solution[2] = "digit" #pos
+
+        elif not _is_ar(token):
+            solution[2] = "Foreign" #pos
+
+        # elif re.match("^[a-zA-Z]*$", token):
+        #     solution[2] = "Foreign" #pos
+
+        else:
+            result_token = find_solution(token,language, task)
+
+            if result_token == []:
+                token_without_al = re.sub(r'^[ﻝ]','',re.sub(r'^[ﺍ]','',token))
+                if len(token_without_al) > 5 :
+                    result_token = find_solution(token_without_al, language, task)
+
+            if result_token == []:
+                # try with replace ﻩ with ﺓ
+                result_token = find_solution(re.sub(r'[ﻩ]$','ﺓ',token), language, task)
+
+
+            if result_token == []:
+                # try with unify Alef
+                word_with_unify_alef = arStrip(token , False , False , False , False , True , False) # Unify Alef
+                result_token = find_solution(word_with_unify_alef, language, task)
+
+            if result_token == []:
+                # try with remove diac
+                word_undiac = arStrip(token , True , False , True , True , False , False) # remove diacs, shaddah , digit
+                result_token = find_solution(word_undiac, language, task)
+
+            if result_token == []:
+                # try with remove diac and unify alef
+                word_undiac = arStrip(token , True , True , True , False, True , False) # diacs , smallDiacs , shaddah , alif
+                result_token = find_solution(word_undiac, language, task)
+
+        if result_token != []:
+            output_list.append(result_token)
+        else:
+            # if no solution is found
+            output_list.append([solution])
+
+    return filter_results(task, output_list)
+
+
+def filter_results(task, lst):
+    if task == 'lemmatizer':
+        return remove_items_by_index(lst, [2])
+    elif task == 'pos':
+        return remove_items_by_index(lst, [1])
+    else:
+        return lst
+
+
+def remove_items_by_index(lst, index_list):
+    for inner_list in lst:
+        for index in sorted(index_list, reverse=True):
+            if len(inner_list) > index:
+                inner_list.pop(index)
+    return lst
+
+
+def _is_ar(word):
+    return _IS_AR_RE.match(word) is not None
+
+
+
+
+
+
+
+
+
+
nlptools/morphology/tokenizers_words.py
ADDED
@@ -0,0 +1,19 @@
+# This code was taken from Camel Tools without any change
+
+# -*- coding: utf-8 -*-
+
+
+import re
+from nlptools.morphology.charsets import UNICODE_PUNCT_SYMBOL_CHARSET
+from nlptools.morphology.charsets import UNICODE_LETTER_MARK_NUMBER_CHARSET
+
+
+_ALL_PUNCT = u''.join(UNICODE_PUNCT_SYMBOL_CHARSET)
+_ALL_LETTER_MARK_NUMBER = u''.join(UNICODE_LETTER_MARK_NUMBER_CHARSET)
+_TOKENIZE_RE = re.compile(r'[' + re.escape(_ALL_PUNCT) + r']|[' +
+                          re.escape(_ALL_LETTER_MARK_NUMBER) + r']+')
+
+
+def simple_word_tokenize(sentence):
+
+    return _TOKENIZE_RE.findall(sentence)
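A short usage sketch for the tokenizer above (the sample sentence and expected split are illustrative assumptions): the regex emits each punctuation or symbol character as its own token and keeps runs of letters, marks, and digits together, discarding whitespace.

    from nlptools.morphology.tokenizers_words import simple_word_tokenize

    # punctuation is split off as a separate token; words are kept whole
    print(simple_word_tokenize('ذهب الولد إلى المدرسة.'))
    # expected: ['ذهب', 'الولد', 'إلى', 'المدرسة', '.']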
nlptools/nlptools.py
ADDED
@@ -0,0 +1 @@
+"""Main module."""
nlptools/salma/__init__.py
ADDED
@@ -0,0 +1,12 @@
+from nlptools.salma import settings
+import pickle
+from nlptools.DataDownload import downloader
+import os
+
+#filename = 'glosses_dic.pickle'
+#path =downloader.get_appdatadir()
+#file_path = os.path.join(path, filename)
+#with open(file_path, 'rb') as f:
+#    #Load the serialized data from the file
+#    settings.glosses_dic = pickle.load(f)
+settings.glosses_dic = {}
nlptools/salma/settings.py
ADDED
@@ -0,0 +1,31 @@
+from transformers import BertTokenizer,BertForSequenceClassification
+import warnings
+warnings.filterwarnings("ignore")
+import pandas as pd
+
+
+
+
+from nlptools.DataDownload import downloader
+import os
+
+glosses_dic = {}
+
+model_file_name = "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"
+path =downloader.get_appdatadir()
+model_file_path = os.path.join(path, model_file_name)
+
+tokenizer_file_name = "bert-base-arabertv02"
+path =downloader.get_appdatadir()
+tokenizer_file_path = os.path.join(path, tokenizer_file_name)
+
+dftrue = pd.DataFrame()
+
+# model = BertForSequenceClassification.from_pretrained('{}'.format("bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"),
+#     output_hidden_states = True,
+#     num_labels=2
+# )
+
+model = BertForSequenceClassification.from_pretrained(model_file_path, output_hidden_states=True, num_labels=2)
+
+tokenizer = BertTokenizer.from_pretrained('{}'.format(tokenizer_file_path))