SinaTools 0.1.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- SinaTools-0.1.1.data/data/nlptools/environment.yml +227 -0
- SinaTools-0.1.1.dist-info/AUTHORS.rst +13 -0
- SinaTools-0.1.1.dist-info/LICENSE +22 -0
- SinaTools-0.1.1.dist-info/METADATA +72 -0
- SinaTools-0.1.1.dist-info/RECORD +122 -0
- SinaTools-0.1.1.dist-info/WHEEL +6 -0
- SinaTools-0.1.1.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.1.dist-info/top_level.txt +1 -0
- nlptools/CLI/DataDownload/download_files.py +71 -0
- nlptools/CLI/arabiner/bin/infer.py +117 -0
- nlptools/CLI/arabiner/bin/infer2.py +81 -0
- nlptools/CLI/morphology/ALMA_multi_word.py +75 -0
- nlptools/CLI/morphology/morph_analyzer.py +91 -0
- nlptools/CLI/salma/salma_tools.py +68 -0
- nlptools/CLI/utils/__init__.py +0 -0
- nlptools/CLI/utils/arStrip.py +99 -0
- nlptools/CLI/utils/corpus_tokenizer.py +74 -0
- nlptools/CLI/utils/implication.py +92 -0
- nlptools/CLI/utils/jaccard.py +96 -0
- nlptools/CLI/utils/latin_remove.py +51 -0
- nlptools/CLI/utils/remove_Punc.py +53 -0
- nlptools/CLI/utils/sentence_tokenizer.py +90 -0
- nlptools/CLI/utils/text_transliteration.py +77 -0
- nlptools/DataDownload/__init__.py +0 -0
- nlptools/DataDownload/downloader.py +185 -0
- nlptools/VERSION +1 -0
- nlptools/__init__.py +5 -0
- nlptools/arabert/__init__.py +1 -0
- nlptools/arabert/arabert/__init__.py +14 -0
- nlptools/arabert/arabert/create_classification_data.py +260 -0
- nlptools/arabert/arabert/create_pretraining_data.py +534 -0
- nlptools/arabert/arabert/extract_features.py +444 -0
- nlptools/arabert/arabert/lamb_optimizer.py +158 -0
- nlptools/arabert/arabert/modeling.py +1027 -0
- nlptools/arabert/arabert/optimization.py +202 -0
- nlptools/arabert/arabert/run_classifier.py +1078 -0
- nlptools/arabert/arabert/run_pretraining.py +593 -0
- nlptools/arabert/arabert/run_squad.py +1440 -0
- nlptools/arabert/arabert/tokenization.py +414 -0
- nlptools/arabert/araelectra/__init__.py +1 -0
- nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +103 -0
- nlptools/arabert/araelectra/build_pretraining_dataset.py +230 -0
- nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py +90 -0
- nlptools/arabert/araelectra/configure_finetuning.py +172 -0
- nlptools/arabert/araelectra/configure_pretraining.py +143 -0
- nlptools/arabert/araelectra/finetune/__init__.py +14 -0
- nlptools/arabert/araelectra/finetune/feature_spec.py +56 -0
- nlptools/arabert/araelectra/finetune/preprocessing.py +173 -0
- nlptools/arabert/araelectra/finetune/scorer.py +54 -0
- nlptools/arabert/araelectra/finetune/task.py +74 -0
- nlptools/arabert/araelectra/finetune/task_builder.py +70 -0
- nlptools/arabert/araelectra/flops_computation.py +215 -0
- nlptools/arabert/araelectra/model/__init__.py +14 -0
- nlptools/arabert/araelectra/model/modeling.py +1029 -0
- nlptools/arabert/araelectra/model/optimization.py +193 -0
- nlptools/arabert/araelectra/model/tokenization.py +355 -0
- nlptools/arabert/araelectra/pretrain/__init__.py +14 -0
- nlptools/arabert/araelectra/pretrain/pretrain_data.py +160 -0
- nlptools/arabert/araelectra/pretrain/pretrain_helpers.py +229 -0
- nlptools/arabert/araelectra/run_finetuning.py +323 -0
- nlptools/arabert/araelectra/run_pretraining.py +469 -0
- nlptools/arabert/araelectra/util/__init__.py +14 -0
- nlptools/arabert/araelectra/util/training_utils.py +112 -0
- nlptools/arabert/araelectra/util/utils.py +109 -0
- nlptools/arabert/aragpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/create_pretraining_data.py +95 -0
- nlptools/arabert/aragpt2/gpt2/__init__.py +2 -0
- nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py +158 -0
- nlptools/arabert/aragpt2/gpt2/optimization.py +225 -0
- nlptools/arabert/aragpt2/gpt2/run_pretraining.py +397 -0
- nlptools/arabert/aragpt2/grover/__init__.py +0 -0
- nlptools/arabert/aragpt2/grover/dataloader.py +161 -0
- nlptools/arabert/aragpt2/grover/modeling.py +803 -0
- nlptools/arabert/aragpt2/grover/modeling_gpt2.py +1196 -0
- nlptools/arabert/aragpt2/grover/optimization_adafactor.py +234 -0
- nlptools/arabert/aragpt2/grover/train_tpu.py +187 -0
- nlptools/arabert/aragpt2/grover/utils.py +234 -0
- nlptools/arabert/aragpt2/train_bpe_tokenizer.py +59 -0
- nlptools/arabert/preprocess.py +818 -0
- nlptools/arabiner/__init__.py +0 -0
- nlptools/arabiner/bin/__init__.py +14 -0
- nlptools/arabiner/bin/eval.py +87 -0
- nlptools/arabiner/bin/infer.py +91 -0
- nlptools/arabiner/bin/process.py +140 -0
- nlptools/arabiner/bin/train.py +221 -0
- nlptools/arabiner/data/__init__.py +1 -0
- nlptools/arabiner/data/datasets.py +146 -0
- nlptools/arabiner/data/transforms.py +118 -0
- nlptools/arabiner/nn/BaseModel.py +22 -0
- nlptools/arabiner/nn/BertNestedTagger.py +34 -0
- nlptools/arabiner/nn/BertSeqTagger.py +17 -0
- nlptools/arabiner/nn/__init__.py +3 -0
- nlptools/arabiner/trainers/BaseTrainer.py +117 -0
- nlptools/arabiner/trainers/BertNestedTrainer.py +203 -0
- nlptools/arabiner/trainers/BertTrainer.py +163 -0
- nlptools/arabiner/trainers/__init__.py +3 -0
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +124 -0
- nlptools/arabiner/utils/helpers.py +151 -0
- nlptools/arabiner/utils/metrics.py +69 -0
- nlptools/environment.yml +227 -0
- nlptools/install_env.py +13 -0
- nlptools/morphology/ALMA_multi_word.py +34 -0
- nlptools/morphology/__init__.py +52 -0
- nlptools/morphology/charsets.py +60 -0
- nlptools/morphology/morph_analyzer.py +170 -0
- nlptools/morphology/settings.py +8 -0
- nlptools/morphology/tokenizers_words.py +19 -0
- nlptools/nlptools.py +1 -0
- nlptools/salma/__init__.py +12 -0
- nlptools/salma/settings.py +31 -0
- nlptools/salma/views.py +459 -0
- nlptools/salma/wsd.py +126 -0
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/corpus_tokenizer.py +73 -0
- nlptools/utils/implication.py +662 -0
- nlptools/utils/jaccard.py +247 -0
- nlptools/utils/parser.py +147 -0
- nlptools/utils/readfile.py +3 -0
- nlptools/utils/sentence_tokenizer.py +53 -0
- nlptools/utils/text_transliteration.py +232 -0
- nlptools/utils/utils.py +2 -0
--- /dev/null
+++ SinaTools-0.1.1.data/data/nlptools/environment.yml
@@ -0,0 +1,227 @@
+name: arabicner
+channels:
+  - anaconda
+  - pytorch
+  - nvidia
+  - conda-forge
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - abseil-cpp=20211102.0=h27087fc_1
+  - absl-py=1.3.0=pyhd8ed1ab_0
+  - aiohttp=3.8.1=py310h5764c6d_1
+  - aiosignal=1.2.0=pyhd8ed1ab_0
+  - arrow-cpp=8.0.0=py310h3098874_0
+  - async-timeout=4.0.2=py310h06a4308_0
+  - attrs=22.1.0=pyh71513ae_1
+  - aws-c-common=0.4.57=he6710b0_1
+  - aws-c-event-stream=0.1.6=h2531618_5
+  - aws-checksums=0.1.9=he6710b0_0
+  - aws-sdk-cpp=1.8.185=hce553d0_0
+  - blas=1.0=mkl
+  - blinker=1.5=pyhd8ed1ab_0
+  - boost-cpp=1.78.0=he72f1d9_0
+  - bottleneck=1.3.5=py310ha9d4c09_0
+  - brotli=1.0.9=h166bdaf_7
+  - brotli-bin=1.0.9=h166bdaf_7
+  - brotlipy=0.7.0=py310h7f8727e_1002
+  - bzip2=1.0.8=h7b6447c_0
+  - c-ares=1.18.1=h7f98852_0
+  - ca-certificates=2022.9.24=ha878542_0
+  - cachetools=5.2.0=pyhd8ed1ab_0
+  - certifi=2022.9.24=pyhd8ed1ab_0
+  - cffi=1.15.1=py310h74dc2b5_0
+  - charset-normalizer=2.0.4=pyhd3eb1b0_0
+  - click=8.1.3=unix_pyhd8ed1ab_2
+  - cryptography=38.0.1=py310h9ce1e76_0
+  - cuda=11.7.1=0
+  - cuda-cccl=11.7.91=0
+  - cuda-command-line-tools=11.7.1=0
+  - cuda-compiler=11.7.1=0
+  - cuda-cudart=11.7.99=0
+  - cuda-cudart-dev=11.7.99=0
+  - cuda-cuobjdump=11.7.91=0
+  - cuda-cupti=11.7.101=0
+  - cuda-cuxxfilt=11.7.91=0
+  - cuda-demo-suite=11.8.86=0
+  - cuda-documentation=11.8.86=0
+  - cuda-driver-dev=11.7.99=0
+  - cuda-gdb=11.8.86=0
+  - cuda-libraries=11.7.1=0
+  - cuda-libraries-dev=11.7.1=0
+  - cuda-memcheck=11.8.86=0
+  - cuda-nsight=11.8.86=0
+  - cuda-nsight-compute=11.8.0=0
+  - cuda-nvcc=11.7.99=0
+  - cuda-nvdisasm=11.8.86=0
+  - cuda-nvml-dev=11.7.91=0
+  - cuda-nvprof=11.8.87=0
+  - cuda-nvprune=11.7.91=0
+  - cuda-nvrtc=11.7.99=0
+  - cuda-nvrtc-dev=11.7.99=0
+  - cuda-nvtx=11.7.91=0
+  - cuda-nvvp=11.8.87=0
+  - cuda-runtime=11.7.1=0
+  - cuda-sanitizer-api=11.8.86=0
+  - cuda-toolkit=11.7.1=0
+  - cuda-tools=11.7.1=0
+  - cuda-visual-tools=11.7.1=0
+  - dataclasses=0.8=pyhc8e2a94_3
+  - datasets=2.6.1=pyhd8ed1ab_0
+  - dill=0.3.5.1=pyhd8ed1ab_0
+  - ffmpeg=4.3=hf484d3e_0
+  - fftw=3.3.10=nompi_h77c792f_102
+  - filelock=3.8.0=pyhd8ed1ab_0
+  - freetype=2.12.1=h4a9f257_0
+  - frozenlist=1.2.0=py310h7f8727e_1
+  - fsspec=2022.10.0=pyhd8ed1ab_0
+  - gds-tools=1.4.0.31=0
+  - gflags=2.2.2=he1b5a44_1004
+  - giflib=5.2.1=h7b6447c_0
+  - glog=0.6.0=h6f12383_0
+  - gmp=6.2.1=h295c915_3
+  - gnutls=3.6.15=he1e5248_0
+  #- google-auth=2.14.0=pyh1a96a4e_0
+  #- google-auth-oauthlib=0.4.6=pyhd8ed1ab_0
+  - grpc-cpp=1.46.1=h33aed49_0
+  - grpcio=1.42.0=py310hce63b2e_0
+  - huggingface_hub=0.10.1=pyhd8ed1ab_0
+  - icu=70.1=h27087fc_0
+  - idna=3.4=py310h06a4308_0
+  - importlib-metadata=5.0.0=pyha770c72_1
+  - importlib_metadata=5.0.0=hd8ed1ab_1
+  - intel-openmp=2021.4.0=h06a4308_3561
+  - joblib=1.2.0=pyhd8ed1ab_0
+  - jpeg=9e=h7f8727e_0
+  - keyutils=1.6.1=h166bdaf_0
+  - krb5=1.19.3=h3790be6_0
+  - lame=3.100=h7b6447c_0
+  - lcms2=2.12=h3be6417_0
+  - ld_impl_linux-64=2.38=h1181459_1
+  - lerc=3.0=h295c915_0
+  - libbrotlicommon=1.0.9=h166bdaf_7
+  - libbrotlidec=1.0.9=h166bdaf_7
+  - libbrotlienc=1.0.9=h166bdaf_7
+  - libcublas=11.11.3.6=0
+  - libcublas-dev=11.11.3.6=0
+  - libcufft=10.9.0.58=0
+  - libcufft-dev=10.9.0.58=0
+  - libcufile=1.4.0.31=0
+  - libcufile-dev=1.4.0.31=0
+  - libcurand=10.3.0.86=0
+  - libcurl=7.85.0=h91b91d3_0
+  - libcusolver=11.4.1.48=0
+  - libcusolver-dev=11.4.1.48=0
+  - libcusparse=11.7.5.86=0
+  - libcusparse-dev=11.7.5.86=0
+  - libdeflate=1.8=h7f8727e_5
+  - libedit=3.1.20191231=he28a2e2_2
+  - libev=4.33=h516909a_1
+  - libevent=2.1.10=h9b69904_4
+  - libffi=3.3=he6710b0_2
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgfortran-ng=12.2.0=h69a702a_19
+  - libgfortran5=12.2.0=h337968e_19
+  - libgomp=11.2.0=h1234567_1
+  - libiconv=1.16=h7f8727e_2
+  - libidn2=2.3.2=h7f8727e_0
+  - libnghttp2=1.46.0=hce63b2e_0
+  - libnpp=11.8.0.86=0
+  - libnpp-dev=11.8.0.86=0
+  - libnvjpeg=11.9.0.86=0
+  - libnvjpeg-dev=11.9.0.86=0
+  - libpng=1.6.37=hbc83047_0
+  - libprotobuf=3.20.1=h4ff587b_0
+  - libssh2=1.10.0=ha56f1ee_2
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libtasn1=4.16.0=h27cfd23_0
+  - libthrift=0.15.0=he6d91bd_0
+  - libtiff=4.4.0=hecacb30_0
+  - libunistring=0.9.10=h27cfd23_0
+  - libuuid=1.0.3=h7f8727e_2
+  - libwebp=1.2.4=h11a3e52_0
+  - libwebp-base=1.2.4=h5eee18b_0
+  - lz4-c=1.9.3=h295c915_1
+  - markdown=3.4.1=pyhd8ed1ab_0
+  - markupsafe=2.1.1=py310h5764c6d_1
+  - mkl=2021.4.0=h06a4308_640
+  - mkl-service=2.4.0=py310h7f8727e_0
+  - mkl_fft=1.3.1=py310hd6ae3a3_0
+  - mkl_random=1.2.2=py310h00e6091_0
+  - multidict=6.0.2=py310h5764c6d_1
+  - multiprocess=0.70.12.2=py310h5764c6d_2
+  - natsort=7.1.1=pyhd3eb1b0_0
+  - ncurses=6.3=h5eee18b_3
+  - nettle=3.7.3=hbbd107a_1
+  - nsight-compute=2022.3.0.22=0
+  - numexpr=2.8.3=py310hcea2de6_0
+  - numpy=1.23.3=py310hd5efca6_0
+  #- numpy-base=1.23.3=py310h8e6c178_0
+  - oauthlib=3.2.2=pyhd8ed1ab_0
+  - openh264=2.1.1=h4ff587b_0
+  - openssl=1.1.1s=h7f8727e_0
+  - orc=1.7.4=h07ed6aa_0
+  - packaging=21.3=pyhd8ed1ab_0
+  - pandas=1.4.4=py310h6a678d5_0
+  - pillow=9.2.0=py310hace64e9_1
+  - pip=22.2.2=py310h06a4308_0
+  - protobuf=3.20.1=py310hd8f1fbe_0
+  - pyarrow=8.0.0=py310h468efa6_0
+  - pyasn1=0.4.8=py_0
+  - pyasn1-modules=0.2.7=py_0
+  - pycparser=2.21=pyhd3eb1b0_0
+  - pyjwt=2.6.0=pyhd8ed1ab_0
+  - pyopenssl=22.0.0=pyhd3eb1b0_0
+  - pyparsing=3.0.9=pyhd8ed1ab_0
+  - pysocks=1.7.1=py310h06a4308_0
+  - python=3.10.6=haa1d7c7_1
+  - python-dateutil=2.8.2=pyhd8ed1ab_0
+  - python-xxhash=3.0.0=py310h5764c6d_1
+  - python_abi=3.10=2_cp310
+  - pytorch=1.13.0=py3.10_cuda11.7_cudnn8.5.0_0
+  - pytorch-cuda=11.7=h67b0de4_0
+  - pytorch-mutex=1.0=cuda
+  - pytz=2022.6=pyhd8ed1ab_0
+  - pyu2f=0.1.5=pyhd8ed1ab_0
+  - pyyaml=6.0=py310h5764c6d_4
+  - re2=2022.04.01=h27087fc_0
+  - readline=8.2=h5eee18b_0
+  - regex=2022.7.9=py310h5eee18b_0
+  - requests=2.28.1=py310h06a4308_0
+  - requests-oauthlib=1.3.1=pyhd8ed1ab_0
+  - responses=0.18.0=pyhd8ed1ab_0
+  - rsa=4.9=pyhd8ed1ab_0
+  - sacremoses=0.0.53=pyhd8ed1ab_0
+  - scikit-learn=1.1.3=py310h6a678d5_0
+  - scipy=1.9.3=py310hd5efca6_0
+  - seqeval=1.2.2=pyhd3deb0d_0
+  - setuptools=65.4.0=py310h06a4308_0
+  - six=1.16.0=pyhd3eb1b0_1
+  - snappy=1.1.9=hbd366e4_1
+  - sqlite=3.39.3=h5082296_0
+  - tensorboard=2.10.1=pyhd8ed1ab_0
+  - tensorboard-data-server=0.6.0=py310h597c629_2
+  - tensorboard-plugin-wit=1.8.1=pyhd8ed1ab_0
+  - threadpoolctl=3.1.0=pyh8a188c0_0
+  - tk=8.6.12=h1ccaba5_0
+  - tokenizers=0.11.4=py310h3dcd8bd_1
+  - torchaudio=0.13.0=py310_cu117
+  - torchtext=0.14.0=py310
+  - torchvision=0.14.0=py310_cu117
+  - tqdm=4.64.1=py310h06a4308_0
+  - transformers=4.24.0=pyhd8ed1ab_0
+  - typing-extensions=4.3.0=py310h06a4308_0
+  - typing_extensions=4.3.0=py310h06a4308_0
+  - tzdata=2022e=h04d1e81_0
+  - urllib3=1.26.12=py310h06a4308_0
+  - utf8proc=2.6.1=h27cfd23_0
+  - werkzeug=2.2.2=pyhd8ed1ab_0
+  - wheel=0.37.1=pyhd3eb1b0_0
+  - xxhash=0.8.0=h7f98852_3
+  - xz=5.2.6=h5eee18b_0
+  - yaml=0.2.5=h7f98852_2
+  - yarl=1.7.2=py310h5764c6d_2
+  - zipp=3.10.0=pyhd8ed1ab_0
+  - zlib=1.2.13=h5eee18b_0
+  - zstd=1.5.2=ha4553b6_0
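The environment above pins a CUDA 11.7 / PyTorch 1.13 toolchain on Python 3.10. As a point of reference only, a conda spec like this is consumed with `conda env create`; a minimal sketch, assuming `conda` is on PATH (this is a hypothetical helper, not necessarily what the package's `nlptools/install_env.py` does):

    import subprocess

    def create_env(yml_path: str = "environment.yml") -> None:
        # Equivalent to running: conda env create -f environment.yml
        # This builds the "arabicner" environment declared in the YAML above.
        subprocess.run(["conda", "env", "create", "-f", yml_path], check=True)

    if __name__ == "__main__":
        create_env()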
--- /dev/null
+++ SinaTools-0.1.1.dist-info/LICENSE
@@ -0,0 +1,22 @@
+MIT License
+
+Copyright (c) 2023, SinaLab
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
--- /dev/null
+++ SinaTools-0.1.1.dist-info/METADATA
@@ -0,0 +1,72 @@
+Metadata-Version: 2.1
+Name: SinaTools
+Version: 0.1.1
+Summary: UNKNOWN
+Home-page: https://github.com/SinaLab/nlptools
+Author: UNKNOWN
+Author-email: UNKNOWN
+License: MIT license
+Keywords: nlptools
+Platform: UNKNOWN
+Requires-Dist: six
+Requires-Dist: farasapy
+Requires-Dist: tqdm
+Requires-Dist: requests
+Requires-Dist: regex
+Requires-Dist: pathlib
+Requires-Dist: torch (==1.13.0)
+Requires-Dist: transformers (==4.24.0)
+Requires-Dist: torchtext (==0.14.0)
+Requires-Dist: torchvision (==0.14.0)
+Requires-Dist: seqeval (==1.2.2)
+Requires-Dist: natsort (==7.1.1)
+
+========
+nlptools
+========
+
+
+.. image:: https://img.shields.io/pypi/v/nlptools.svg
+        :target: https://pypi.python.org/pypi/SinaTools
+
+.. image:: https://img.shields.io/travis/sina_institute/nlptools.svg
+        :target: https://travis-ci.com/sina_institute/SinaTools
+
+.. image:: https://readthedocs.org/projects/nlptools/badge/?version=latest
+        :target: https://SinaTools.readthedocs.io/en/latest/?version=latest
+        :alt: Documentation Status
+
+
+
+
+Python Boilerplate contains all the boilerplate you need to create a Python package.
+
+
+* Free software: MIT license
+* Documentation: https://SinaTools.readthedocs.io.
+
+
+Features
+--------
+
+* TODO
+
+Credits
+-------
+
+This package was created with Cookiecutter_ and the `audreyr/cookiecutter-pypackage`_ project template.
+
+.. _Cookiecutter: https://github.com/audreyr/cookiecutter
+.. _`audreyr/cookiecutter-pypackage`: https://github.com/audreyr/cookiecutter-pypackage
+
+
+=======
+History
+=======
+
+0.1.0 (2023-04-15)
+------------------
+
+* First release on PyPI.
+
+
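Note the hard pins in the Requires-Dist fields above: installing this wheel (e.g. `pip install SinaTools==0.1.1`) will also pull `torch` 1.13.0, `transformers` 4.24.0, `torchtext` 0.14.0, `torchvision` 0.14.0, `seqeval` 1.2.2 and `natsort` 7.1.1 into the target environment, since pip enforces exact-version requirements.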
--- /dev/null
+++ SinaTools-0.1.1.dist-info/RECORD
@@ -0,0 +1,122 @@
+SinaTools-0.1.1.data/data/nlptools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+nlptools/VERSION,sha256=Ee4juPwvxhnW6rYnettaUnJhBnrAHM_D4RhX9Vvxi80,5
+nlptools/__init__.py,sha256=OoA_p_y2jPjMytcUrG1ED5uJlJemVhSRr9L9Wsym-rQ,134
+nlptools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+nlptools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
+nlptools/nlptools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
+nlptools/CLI/DataDownload/download_files.py,sha256=PMDEPXxZQbrFo-7iyhvrCpzx2RG5T5kPk6NJAwh8RSI,2322
+nlptools/CLI/arabiner/bin/infer.py,sha256=YrNCVro8B3UxpsHjIo_01qiBQURpDNTK7pKTkw1L21Y,4921
+nlptools/CLI/arabiner/bin/infer2.py,sha256=CtR9rwe20ks_qq-l_fQU-ThLqft_1o3Ztmd1my1kHMg,3905
+nlptools/CLI/morphology/ALMA_multi_word.py,sha256=NINts8BtT8BGQPBvs4BJ_y2PsR7czsGPOVAwngaT85A,2644
+nlptools/CLI/morphology/morph_analyzer.py,sha256=39vrFx6ppu7yEITcz8lAJhk3xHweaPWEqL-CcqBM37Q,3565
+nlptools/CLI/salma/salma_tools.py,sha256=7awpCb68QUc3kx-EuwRHxDmItZlX2aSdpukwKF1G3Fo,1999
+nlptools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nlptools/CLI/utils/arStrip.py,sha256=dzy16wZfSznkvGHHBn5P21EvyusKB55dqrZ4zbaa41w,3621
+nlptools/CLI/utils/corpus_tokenizer.py,sha256=S0YG8FRS29K1C8eJVEYuWSV1ABS7PKymlNS7KxvYqxI,2817
+nlptools/CLI/utils/implication.py,sha256=hjYTN0oiLf0bz0bRO_GD4rphZkaB3cH770clFFhuevE,3172
+nlptools/CLI/utils/jaccard.py,sha256=a6oc28yMgm7UewO6Lz25A4Yv8QEzVa85XF-QV9uhMwI,4639
+nlptools/CLI/utils/latin_remove.py,sha256=Xw6PB4GtMLLiYK3zTEwdLhBbivMyy1msD5Ab_QdJoQA,1303
+nlptools/CLI/utils/remove_Punc.py,sha256=dvSiSs9UulhGCogBgtpD8fU860BFuMBTnwa8Ek9aPKQ,1393
+nlptools/CLI/utils/sentence_tokenizer.py,sha256=AcJa_yRdlQqKMwVWWKSv1vRO1Yk-NK75-NpalkHqewc,3469
+nlptools/CLI/utils/text_transliteration.py,sha256=blIGB8FeF10iFeXADM-z01XJ4qeB1qgj6S2Xnk9w5fI,2266
+nlptools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nlptools/DataDownload/downloader.py,sha256=yONVa99OtPXD5Lewy4Fm3eUiJMpBt492G1JOPh5sXAU,6523
+nlptools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
+nlptools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
+nlptools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
+nlptools/arabert/arabert/create_classification_data.py,sha256=BhemGNRbYz_Pun0Q5WerN2-9n-ILmU3tm4J-OlHw5-A,7678
+nlptools/arabert/arabert/create_pretraining_data.py,sha256=2M-cF3CLHbQ0cdWrzFT6Frg1vVP4Y-CFoq8iEPyxgsE,18924
+nlptools/arabert/arabert/extract_features.py,sha256=C1IzASrlX7u4_M2xdr_PjzWfTRZgklhUXA2WHKgQt-I,15585
+nlptools/arabert/arabert/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
+nlptools/arabert/arabert/modeling.py,sha256=KliecCmA1pP3owg0mYge6On3IRHunMF5kMLuEwc0VLw,40896
+nlptools/arabert/arabert/optimization.py,sha256=Wx0Js6Zsfc3iVw-_7Q1SCnxfP_qqbdTAyFD-vZSpOyk,8153
+nlptools/arabert/arabert/run_classifier.py,sha256=AdVGyvidlmbEp12b-PauiBo6EmFLEO7tqeJKuLhK2DA,38777
+nlptools/arabert/arabert/run_pretraining.py,sha256=yO16nKkHDfcYA2Zx7vv8KN4te6_1qFOzyVeDzFT-DQw,21894
+nlptools/arabert/arabert/run_squad.py,sha256=PORxgiByP8L6vZqAFkqgHPJ_ZjAlqlg64gtkdLmDNns,53456
+nlptools/arabert/arabert/tokenization.py,sha256=R6xkyCb8_vgeksXiLeqDvV5vOnLb1cPNsvfDij6YVFk,14132
+nlptools/arabert/araelectra/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
+nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py,sha256=pIo6VFT3XXOYroZaab3msZAP6XjCKu0KcrIZQA0Pj8U,3881
+nlptools/arabert/araelectra/build_pretraining_dataset.py,sha256=Z8ZmKznaE_2SPDRoPYR1SDhjTN_NTpNCFFuhUkykwl8,9041
+nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py,sha256=W7HFr1XoO6bCDR7X7w-bOuwULFtTSjeKbJ2LHzzHf9k,3224
+nlptools/arabert/araelectra/configure_finetuning.py,sha256=YfGLMdgN6Qqm357Mzy5UMjkuLPPWtBs7f4dA-DKE6JM,7768
+nlptools/arabert/araelectra/configure_pretraining.py,sha256=oafQgu4WmVdxBcU5mSfXhPlvCk43CJwAWXC10Q58BlI,5801
+nlptools/arabert/araelectra/flops_computation.py,sha256=krHTeuPH9xQu5ldprBOPJNlJRvC7fmmvXXqUjfWrzPE,9499
+nlptools/arabert/araelectra/run_finetuning.py,sha256=JecbrSmGikBNyid4JKRZ49Rm5xFpt02WfgIIcs3TpcU,12976
+nlptools/arabert/araelectra/run_pretraining.py,sha256=1K2aAFTY0p3iaLY0xkhTlm6v0B-Zun8SwEzz-K6RXM4,20665
+nlptools/arabert/araelectra/finetune/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
+nlptools/arabert/araelectra/finetune/feature_spec.py,sha256=cqNlBa2KK_G1-vkKm1EJUv6BoS3gesCUAHwVagZB6wM,1888
+nlptools/arabert/araelectra/finetune/preprocessing.py,sha256=1mf7-IxknCRsobQZ-VV1zs4Cwt-mfOtoVxysDJa9LZ0,6657
+nlptools/arabert/araelectra/finetune/scorer.py,sha256=PjRg0P5ANCtul2ute7ccq3mRCCoIAoCb-lVLlwd4rVY,1571
+nlptools/arabert/araelectra/finetune/task.py,sha256=zM8M4PGSIrY2u6ytpmkQEXxG-jjoeN9wouEyVR23qeQ,1991
+nlptools/arabert/araelectra/finetune/task_builder.py,sha256=Zsoiuw5M3Ca8QhaZVLVLZyWw09K5R75UeMuPmazMlHI,2768
+nlptools/arabert/araelectra/model/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
+nlptools/arabert/araelectra/model/modeling.py,sha256=5XLIutnmr-SFQOV_XntJ-U5evSCY-J2e9NjvlwVXKkk,40877
+nlptools/arabert/araelectra/model/optimization.py,sha256=BCMb_C5hgBw7wC9ZR8AQ4lwoPopqLIcSiqcCrIjx9XU,7254
+nlptools/arabert/araelectra/model/tokenization.py,sha256=9CkyPzs3L6OEPzN-7EWQDNQmW2mIJoZD4o1rn6xLdL4,11082
+nlptools/arabert/araelectra/pretrain/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
+nlptools/arabert/araelectra/pretrain/pretrain_data.py,sha256=NLgIcLAq1-MgtBNXYu_isDxnOY5k67SyADYy-8nzBok,5413
+nlptools/arabert/araelectra/pretrain/pretrain_helpers.py,sha256=nFl7LEdxAU5kKwiodqJHzi-ty9jMFsCCNYOF__A69j8,9255
+nlptools/arabert/araelectra/util/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
+nlptools/arabert/araelectra/util/training_utils.py,sha256=7h_J1ljUWM0ynBcofEtjZWL_oAfZtTxEemQLkixgn-0,4142
+nlptools/arabert/araelectra/util/utils.py,sha256=G0UAETUCZMlU9R9ASD9AXrWZeodWI1aZJEE9F-goaH4,2591
+nlptools/arabert/aragpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
+nlptools/arabert/aragpt2/create_pretraining_data.py,sha256=fFa2_DAyTwc8L2IqQbshsh_Ia26nj1qtVLzC6DooSac,3105
+nlptools/arabert/aragpt2/train_bpe_tokenizer.py,sha256=b-8zHQ02fLmZV4GfjnrPptwjpX259F41SlnWzBrflMA,1888
+nlptools/arabert/aragpt2/gpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
+nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
+nlptools/arabert/aragpt2/gpt2/optimization.py,sha256=iqh23cypRSRUt53wt2G5SbNNpJMwERM7gZAOKVh5l4U,8411
+nlptools/arabert/aragpt2/gpt2/run_pretraining.py,sha256=4jjkUbvTO1DHoKJ89yKtlkkofcND_fyAunQ-mlnJhTM,13298
+nlptools/arabert/aragpt2/grover/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nlptools/arabert/aragpt2/grover/dataloader.py,sha256=-FWPTjtsvweEE1WaWRHBXfOSbsGiUmnXT3qK7KJP8cM,6853
+nlptools/arabert/aragpt2/grover/modeling.py,sha256=XcUvFwqRaxAwWiJstrH2FPBvDJe03pTWIyipdMfWj9g,38280
+nlptools/arabert/aragpt2/grover/modeling_gpt2.py,sha256=WFpCWn1792yATFzt8rZ0rpWvExfbLzV2BqiEs7llFUw,51602
+nlptools/arabert/aragpt2/grover/optimization_adafactor.py,sha256=1geOsCWuv5xxtSnKDz9a8aY5SVwZ1MGq-xVQDBg4Gpg,9765
+nlptools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKweY1aPz1XPDw6odld0,7102
+nlptools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
+nlptools/arabiner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nlptools/arabiner/bin/__init__.py,sha256=d1ToN2uheCCVby3TjiSuD1dqo_pvNIuTgz4COFr2Khs,438
+nlptools/arabiner/bin/eval.py,sha256=ihtjJinY1jXpZXW5bQJzTC5MF6_V3GQ5zHzsc691_HQ,2591
+nlptools/arabiner/bin/infer.py,sha256=EZKeq4zucIE-ooHYnegODNxsRiIY_gY5GvDPChH5WRQ,3237
+nlptools/arabiner/bin/process.py,sha256=4QCZjsmYV5lep6waQE37fs7Fe59_1G5seIJLDkArg4s,4698
+nlptools/arabiner/bin/train.py,sha256=hf6ZRhqMZ7bFealMSusBjtWrbzHGHc5HB2Lh4rp2uQA,6390
+nlptools/arabiner/data/__init__.py,sha256=XPic1bPijmZda_LPFL5J6TOps_IHUTiBDJvMx-iJqKo,61
+nlptools/arabiner/data/datasets.py,sha256=p52Uc8Q2D3EzN1OmoHQcWVsJ2oB3TqgTzAcy1B9fJ68,5068
+nlptools/arabiner/data/transforms.py,sha256=KPCDdjZOEvhMC38eiFwJuiQC84cfDrvC0XM4Ye0o3do,4878
+nlptools/arabiner/nn/BaseModel.py,sha256=3GmujQasTZZunOBuFXpY2p1W8W256iI_Uu4hxhOY2Z0,608
+nlptools/arabiner/nn/BertNestedTagger.py,sha256=7vU2tmDSoqSHn6GvMJmyN0hEMLvCkbr_r-AaiAaYdw8,1223
+nlptools/arabiner/nn/BertSeqTagger.py,sha256=dFcBBiMw2QCWsyy7aQDe_PS3aRuNn4DOxKIHgTblFvc,504
+nlptools/arabiner/nn/__init__.py,sha256=ZN7Psm83pysUhGI3ZSaJra2aCYBZb9DZ0UX4CiKGc0A,182
+nlptools/arabiner/trainers/BaseTrainer.py,sha256=oZgFJW-CawfCKT5gtaBHA7Q7XjNfiyqM62KnFsgVzPU,3919
+nlptools/arabiner/trainers/BertNestedTrainer.py,sha256=hVqPRdmaHf2iwftseNpgsAfwGkl6eMHJx1rKunQS_vM,8443
+nlptools/arabiner/trainers/BertTrainer.py,sha256=KkcgZXu6kqsrrnfFtiAQ8ucLsrQtDxLRqdbTiTnRWqI,6447
+nlptools/arabiner/trainers/__init__.py,sha256=kt8WqsaOjX0h1JMa-v7Y9ywT5mfwQIsZTyVWnIAWsEQ,200
+nlptools/arabiner/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nlptools/arabiner/utils/data.py,sha256=uuPiu-7v0gccNygZjdTKomJGE7X0H9FC24Y9nHZpf4c,4376
+nlptools/arabiner/utils/helpers.py,sha256=PyOOlx5uabvZVmU3SZtZ3ZLA3pliinJ3JXsvos9SUWU,5032
+nlptools/arabiner/utils/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
+nlptools/morphology/ALMA_multi_word.py,sha256=hlzZCk-uUdZ-GbiPsFxDTvoWoIuVof2Sm7NdaxaFipM,1313
+nlptools/morphology/__init__.py,sha256=z6_RGhiyfNHXNKMmhNSI6ObTLmdjQyP58vsFottI8GA,1706
+nlptools/morphology/charsets.py,sha256=7w9OrbnZTnLU3A9q-SUi9GhUN97qNtbYR5T0Pm72cF8,2784
+nlptools/morphology/morph_analyzer.py,sha256=OmCxm4-fM2qfYzKk8yOd6D_T3RsfzZCcd7Oz2V4Advg,6507
+nlptools/morphology/settings.py,sha256=sEZdnA7MiYXHdxrfHWXop1RcKClVzpOYzZwzHC1PxJ8,144
+nlptools/morphology/tokenizers_words.py,sha256=Smtt_KXifl2wRI464Qn07PtUvOsyGBJjZ7E20gd8zVM,602
+nlptools/salma/__init__.py,sha256=pOauGjD-xrGHw05sNx3EiSFc_wpM3bD1vJxQHoDDXOA,376
+nlptools/salma/settings.py,sha256=fqAQg2b22gorzT9Pf_AEJD9p8AlVUaVyKD3FH8g2yUs,1110
+nlptools/salma/views.py,sha256=EH1vc6P88CeAIzQKt7EU_HTI0uJipv4JdXiAX5NjrJY,18416
+nlptools/salma/wsd.py,sha256=kmP5ZvvVMkxApgk91TAGSBkMJZbPPbS0qoNk8OE37og,4434
+nlptools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+nlptools/utils/corpus_tokenizer.py,sha256=IDWh87XJaFa7V2P4kIxY4QVywPKhz0fIErc_c0gJGUU,4581
+nlptools/utils/implication.py,sha256=Ro1Vw62oOBzELkX-zpHyieq4v2OsoyFrFeTU7BiK7qc,27794
+nlptools/utils/jaccard.py,sha256=TTC5KTVv6kONw5vZtzxEQvv7QM79BCsD0xcJAY0T5tU,10111
+nlptools/utils/parser.py,sha256=0Yd40CZf4wXso2q-d9LULUNAVUAMdiYMImfcVb6i9qQ,6175
+nlptools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
+nlptools/utils/sentence_tokenizer.py,sha256=3C0Wx1ns8ZHiGwKlUkcti-8zA3fB4ju0fIEtGACM7oU,2162
+nlptools/utils/text_transliteration.py,sha256=zhB3sFXSMtkkdqImRMVg415AAB80DOm9lMFKb2IBynw,8765
+nlptools/utils/utils.py,sha256=vKkFOkYclMu8nXS_VZb6Kobx8QGKW9onXkkLCeiRb6g,32
+SinaTools-0.1.1.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+SinaTools-0.1.1.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+SinaTools-0.1.1.dist-info/METADATA,sha256=G-49Kky9vazLGwQcFV-lbFQ_tb2PzwidvGdTN3wTG_c,1577
+SinaTools-0.1.1.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+SinaTools-0.1.1.dist-info/entry_points.txt,sha256=9-PNkvWGCid8SVN03S2NkJFuxAzvcB22tGpHe-et2q8,951
+SinaTools-0.1.1.dist-info/top_level.txt,sha256=sREDI6iHe4D0BZQmZbZ-LxYIn2cBWUayk9CZwAR9jaE,9
+SinaTools-0.1.1.dist-info/RECORD,,
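Each RECORD row has the form `path,sha256=<digest>,<size>`, where the digest is the urlsafe-base64-encoded SHA-256 of the file with trailing `=` padding stripped (per the wheel spec). A minimal sketch of checking one entry, assuming the file sits at the recorded path:

    import base64
    import hashlib

    def record_digest(path: str) -> str:
        # Hash the file, then encode the digest the way wheel RECORD files do:
        # urlsafe base64 with the "=" padding removed.
        with open(path, "rb") as f:
            digest = hashlib.sha256(f.read()).digest()
        return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()

    # For nlptools/VERSION (5 bytes), RECORD above expects
    # sha256=Ee4juPwvxhnW6rYnettaUnJhBnrAHM_D4RhX9Vvxi80
    print(record_digest("nlptools/VERSION"))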
--- /dev/null
+++ SinaTools-0.1.1.dist-info/entry_points.txt
@@ -0,0 +1,18 @@
+[console_scripts]
+arabi_ner = nlptools.CLI.arabiner.bin.infer:main
+arabi_ner2 = nlptools.CLI.arabiner.bin.infer2:main
+install_env = nlptools.install_env:main
+sina_alma_multi_word = nlptools.CLI.morphology.ALMA_multi_word:main
+sina_appdatadir = nlptools.CLI.DataDownload.get_appdatadir:main
+sina_arStrip = nlptools.CLI.utils.arStrip:main
+sina_corpus_tokenizer = nlptools.CLI.utils.corpus_tokenizer:main
+sina_download_files = nlptools.CLI.DataDownload.download_files:main
+sina_implication = nlptools.CLI.utils.implication:main
+sina_jaccard_similarity = nlptools.CLI.utils.jaccard:main
+sina_morph_analyze = nlptools.CLI.morphology.morph_analyzer:main
+sina_remove_latin = nlptools.CLI.utils.latin_remove:main
+sina_remove_punctuation = nlptools.CLI.utils.remove_Punc:main
+sina_salma = nlptools.CLI.salma.salma_tools:main
+sina_sentence_tokenize = nlptools.CLI.utils.sentence_tokenizer:main
+sina_transliterate = nlptools.CLI.utils.text_transliteration:main
+
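Each `console_scripts` entry maps a shell command to a `module:function` target; pip generates a small launcher that imports the module and calls the function. For illustration, the `sina_morph_analyze` command is essentially equivalent to:

    # What the generated sina_morph_analyze launcher does, in essence:
    from nlptools.CLI.morphology.morph_analyzer import main

    if __name__ == "__main__":
        main()  # main() reads its arguments from sys.argv via argparse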
--- /dev/null
+++ SinaTools-0.1.1.dist-info/top_level.txt
@@ -0,0 +1 @@
+nlptools
--- /dev/null
+++ nlptools/CLI/DataDownload/download_files.py
@@ -0,0 +1,71 @@
+"""
+About:
+------
+
+The sina_download_files tool is a command-line interface for downloading various NLP resources from pre-specified URLs. It is a part of the nlptools package and provides options to choose which files to download and to specify a download directory. The tool automatically handles file extraction for zip and tar.gz files.
+
+Usage:
+------
+
+Below is the usage information that can be generated by running sina_download_files --help.
+
+.. code-block:: none
+
+    Usage:
+        sina_download_files [OPTIONS]
+
+.. code-block:: none
+
+    Options:
+      -f, --files FILES
+          Names of the files to download. Available files are: ner, morph, salma_model, salma_tokenizer, glosses_dic, lemma_dic, five_grams, four_grams, three_grams, two_grams.
+          If no file is specified, all files will be downloaded.
+
+Examples:
+---------
+
+.. code-block:: none
+
+    sina_download_files -f morph ner
+    This command will download only the `morph` and `ner` files to the default directory.
+
+Note:
+-----
+
+.. code-block:: none
+
+    - The script automatically handles the extraction of zip and tar.gz files after downloading.
+    - Ensure you have the necessary permissions to write to the specified directory.
+    - The default download directory is based on the operating system and can be obtained using the `get_appdatadir` function.
+
+
+"""
+
+import argparse
+from nlptools.DataDownload.downloader import download_file
+from nlptools.DataDownload.downloader import download_files
+from nlptools.DataDownload.downloader import get_appdatadir
+from nlptools.DataDownload.downloader import urls
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Download files from specified URLs.")
+    parser.add_argument('-f', '--files', nargs="*", choices=urls.keys(),
+                        help="Names of the files to download. Available files are: "
+                             f"{', '.join(urls.keys())}. If no file is specified, all files will be downloaded.")
+
+    get_appdatadir()
+
+    args = parser.parse_args()
+
+    if args.files:
+        for file in args.files:
+            url = urls[file]
+            download_file(url)
+    else:
+        download_files()
+
+if __name__ == '__main__':
+    main()
+
+#sina_download_files -f morph ner
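The CLI above is a thin wrapper over `nlptools.DataDownload.downloader`. A minimal programmatic sketch built on the same imports the CLI uses (the `"morph"` key is assumed from the file list in the help text):

    from nlptools.DataDownload.downloader import download_file, get_appdatadir, urls

    get_appdatadir()              # resolve/create the per-OS download directory
    download_file(urls["morph"])  # fetch one resource; zip/tar.gz are auto-extracted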
--- /dev/null
+++ nlptools/CLI/arabiner/bin/infer.py
@@ -0,0 +1,117 @@
+"""
+About:
+------
+The ArabiNER tool carries out Named Entity Recognition (NER) utilizing the ArabiNER utility from the SinaTools suite. It identifies the named entities and provides a comprehensive analysis in JSON format if the input consists of text, or in a CSV file if the input is a directory of files.
+
+Usage:
+------
+Below is the usage information that can be generated by running arabi_ner --help.
+
+.. code-block:: none
+
+    arabi_ner --text=INPUT_TEXT
+    arabi_ner --dir=INPUT_FILE --output_csv=OUTPUT_FILE_NAME
+
+Options:
+--------
+
+.. code-block:: none
+
+    --text INPUT_TEXT
+        The text that needs to be analyzed for Named Entity Recognition.
+    --file INPUT_FILE
+        File containing the text to be analyzed for Named Entity Recognition.
+    --output_csv OUTPUT_FILE_NAME
+        A file containing the tokenized text and its Named Entity tags.
+Examples:
+---------
+
+.. code-block:: none
+
+    arabi_ner --text "Your text here"
+    arabi_ner --dir "/path/to/your/directory" --output_csv "output.csv"
+
+Note:
+-----
+
+.. code-block:: none
+
+    - Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
+    - The tool returns results in JSON format with proper indentation for better readability.
+    - The quality and accuracy of the analysis depend on the underlying capabilities of the ArabiNER utility.
+
+"""
+
+import argparse
+import json
+import pandas as pd
+from nlptools.arabiner.bin.infer import ner
+from nlptools.utils.corpus_tokenizer import corpus_tokenizer
+from nlptools.morphology.tokenizers_words import simple_word_tokenize
+
+
+def infer(sentence):
+    # Now infer returns all NER tags for a sentence
+    output = ner(sentence)
+    ##print("ner output : ", output)
+    return [word[1] for word in output]
+
+
+def main():
+    parser = argparse.ArgumentParser(description='NER Analysis using ArabiNER')
+
+    parser.add_argument('--text', type=str, help='Text to be analyzed for Named Entity Recognition')
+    parser.add_argument('--dir', type=str, help='dir containing the text files to be analyzed for Named Entity Recognition')
+    parser.add_argument('--output_csv', type=str, help='Output CSV file to write the results')
+
+    args = parser.parse_args()
+
+    if args.text is not None:
+        results = ner(args.text)
+        # Print the results in JSON format
+        print(json.dumps(results, ensure_ascii=False, indent=4))
+    elif args.dir is not None:
+        corpus_tokenizer(args.dir, args.output_csv)
+        df = pd.read_csv(args.output_csv)
+        df['NER tags'] = None
+        i = 0
+
+        # Use drop_duplicates to get unique values based on Row_ID and Sentence
+        result = df.drop_duplicates(subset=['Global Sentence ID', 'Sentence'])
+
+        # Get the "Sentence" column as an array
+        unique_sentences = result['Sentence'].to_numpy()
+
+        # Print the result
+        #print(unique_sentences, len(result['Sentence']))
+        #print("#############")
+
+        for sentence in unique_sentences:  # iterating over unique sentences
+            #print(" Sentence : ", simple_word_tokenize(sentence), len(simple_word_tokenize(sentence)))
+            ner_tags = infer(sentence)  # getting all NER tags for the sentence
+            #if len(ner_tags) != len(df[i:i+len(ner_tags)]):
+            #    print("Not Equal...", len(ner_tags) , len(df[i:i+len(ner_tags)]))
+            #    return
+            if len(simple_word_tokenize(sentence)) > 300:
+                print(" Length of this sentence is more than 300 word: ", sentence)
+                return
+            #df['NER tags'].iloc[i:i+len(ner_tags)] = ner_tags
+            df.loc[i:i+len(ner_tags)-1, 'NER tags'] = ner_tags  # Use .loc to assign values
+            #print("Exit with ner tags = ", ner_tags, " and length : ", len(ner_tags), type(len(ner_tags)), " and df is " , df[i:i+len(ner_tags)], " with length : ", len(df[i:i+len(ner_tags)]), type(len(df[i:i+len(ner_tags)])), " i:i+len(ner_tags) : ", i," , ", i+len(ner_tags))
+            i = i + len(ner_tags)
+
+        df.to_csv(args.output_csv, index=False)
+    else:
+        print("Error: Either --text or --file argument must be provided.")
+        return
+
+
+if __name__ == '__main__':
+    main()
+
+#arabi_ner --text "Your text here."
+#arabi_ner --dir /path/to/your/directory --output_csv output.csv
+
+#Each unique sentence in the CSV file is processed once by the infer function to get the NER tags for all the words in the sentence.
+#The current_word_position variable is used to keep track of the position within the list of NER tags returned by infer, ensuring that each word in the CSV file is assigned the correct NER tag.
+#The final CSV file will contain an additional column, NER tags, which contains the NER tag for each word in the Sentence column of the CSV file.