SinaTools 0.1.3__py2.py3-none-any.whl → 0.1.7__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/METADATA +14 -20
- SinaTools-0.1.7.dist-info/RECORD +101 -0
- SinaTools-0.1.7.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.7.dist-info/top_level.txt +1 -0
- {nlptools → sinatools}/CLI/DataDownload/download_files.py +9 -9
- {nlptools → sinatools}/CLI/morphology/ALMA_multi_word.py +10 -20
- sinatools/CLI/morphology/morph_analyzer.py +80 -0
- nlptools/CLI/arabiner/bin/infer2.py → sinatools/CLI/ner/corpus_entity_extractor.py +5 -9
- nlptools/CLI/arabiner/bin/infer.py → sinatools/CLI/ner/entity_extractor.py +4 -8
- {nlptools → sinatools}/CLI/salma/salma_tools.py +8 -8
- {nlptools → sinatools}/CLI/utils/arStrip.py +10 -21
- sinatools/CLI/utils/corpus_tokenizer.py +50 -0
- {nlptools → sinatools}/CLI/utils/implication.py +9 -9
- {nlptools → sinatools}/CLI/utils/jaccard.py +10 -10
- sinatools/CLI/utils/remove_latin.py +34 -0
- sinatools/CLI/utils/remove_punctuation.py +42 -0
- {nlptools → sinatools}/CLI/utils/sentence_tokenizer.py +9 -22
- {nlptools → sinatools}/CLI/utils/text_transliteration.py +10 -17
- {nlptools → sinatools}/DataDownload/downloader.py +10 -10
- sinatools/VERSION +1 -0
- {nlptools → sinatools}/__init__.py +1 -1
- {nlptools → sinatools}/morphology/ALMA_multi_word.py +4 -5
- {nlptools → sinatools}/morphology/__init__.py +4 -14
- sinatools/morphology/morph_analyzer.py +172 -0
- sinatools/ner/__init__.py +12 -0
- nlptools/arabiner/bin/infer.py → sinatools/ner/entity_extractor.py +9 -8
- {nlptools → sinatools}/salma/__init__.py +2 -2
- {nlptools → sinatools}/salma/settings.py +1 -1
- {nlptools → sinatools}/salma/views.py +12 -12
- {nlptools → sinatools}/salma/wsd.py +2 -2
- {nlptools/morphology → sinatools/utils}/charsets.py +1 -3
- {nlptools → sinatools}/utils/implication.py +10 -10
- {nlptools → sinatools}/utils/jaccard.py +2 -2
- {nlptools → sinatools}/utils/parser.py +18 -21
- {nlptools → sinatools}/utils/text_transliteration.py +1 -1
- nlptools/utils/corpus_tokenizer.py → sinatools/utils/tokenizer.py +58 -5
- {nlptools/morphology → sinatools/utils}/tokenizers_words.py +3 -6
- SinaTools-0.1.3.dist-info/RECORD +0 -122
- SinaTools-0.1.3.dist-info/entry_points.txt +0 -18
- SinaTools-0.1.3.dist-info/top_level.txt +0 -1
- nlptools/CLI/morphology/morph_analyzer.py +0 -91
- nlptools/CLI/utils/corpus_tokenizer.py +0 -74
- nlptools/CLI/utils/latin_remove.py +0 -51
- nlptools/CLI/utils/remove_Punc.py +0 -53
- nlptools/VERSION +0 -1
- nlptools/arabiner/bin/__init__.py +0 -14
- nlptools/arabiner/bin/eval.py +0 -87
- nlptools/arabiner/bin/process.py +0 -140
- nlptools/arabiner/bin/train.py +0 -221
- nlptools/arabiner/data/__init__.py +0 -1
- nlptools/arabiner/data/datasets.py +0 -146
- nlptools/arabiner/data/transforms.py +0 -118
- nlptools/arabiner/nn/BaseModel.py +0 -22
- nlptools/arabiner/nn/BertNestedTagger.py +0 -34
- nlptools/arabiner/nn/BertSeqTagger.py +0 -17
- nlptools/arabiner/nn/__init__.py +0 -3
- nlptools/arabiner/trainers/BaseTrainer.py +0 -117
- nlptools/arabiner/trainers/BertNestedTrainer.py +0 -203
- nlptools/arabiner/trainers/BertTrainer.py +0 -163
- nlptools/arabiner/trainers/__init__.py +0 -3
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +0 -124
- nlptools/arabiner/utils/helpers.py +0 -151
- nlptools/arabiner/utils/metrics.py +0 -69
- nlptools/morphology/morph_analyzer.py +0 -170
- nlptools/morphology/settings.py +0 -8
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/sentence_tokenizer.py +0 -53
- {SinaTools-0.1.3.data/data/nlptools → SinaTools-0.1.7.data/data/sinatools}/environment.yml +0 -0
- {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/WHEEL +0 -0
- {nlptools → sinatools}/CLI/utils/__init__.py +0 -0
- {nlptools → sinatools}/DataDownload/__init__.py +0 -0
- {nlptools → sinatools}/arabert/__init__.py +0 -0
- {nlptools → sinatools}/arabert/arabert/__init__.py +0 -0
- {nlptools → sinatools}/arabert/arabert/create_classification_data.py +0 -0
- {nlptools → sinatools}/arabert/arabert/create_pretraining_data.py +0 -0
- {nlptools → sinatools}/arabert/arabert/extract_features.py +0 -0
- {nlptools → sinatools}/arabert/arabert/lamb_optimizer.py +0 -0
- {nlptools → sinatools}/arabert/arabert/modeling.py +0 -0
- {nlptools → sinatools}/arabert/arabert/optimization.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_classifier.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_squad.py +0 -0
- {nlptools → sinatools}/arabert/arabert/tokenization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/configure_finetuning.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/configure_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/feature_spec.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/preprocessing.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/scorer.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/task.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/task_builder.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/flops_computation.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/modeling.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/optimization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/tokenization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_data.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/run_finetuning.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/training_utils.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/utils.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/create_pretraining_data.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/optimization.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/dataloader.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/modeling.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/train_tpu.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/utils.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
- {nlptools → sinatools}/arabert/preprocess.py +0 -0
- {nlptools → sinatools}/environment.yml +0 -0
- {nlptools → sinatools}/install_env.py +0 -0
- /nlptools/nlptools.py → /sinatools/sinatools.py +0 -0
- {nlptools/arabiner → sinatools/utils}/__init__.py +0 -0
- {nlptools → sinatools}/utils/readfile.py +0 -0
- {nlptools → sinatools}/utils/utils.py +0 -0
@@ -1,13 +1,12 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: SinaTools
|
3
|
-
Version: 0.1.
|
4
|
-
Summary:
|
5
|
-
Home-page: https://github.com/SinaLab/
|
6
|
-
Author: UNKNOWN
|
7
|
-
Author-email: UNKNOWN
|
3
|
+
Version: 0.1.7
|
4
|
+
Summary: A short description of your project
|
5
|
+
Home-page: https://github.com/SinaLab/sinatools
|
8
6
|
License: MIT license
|
9
|
-
Keywords:
|
7
|
+
Keywords: sinatools
|
10
8
|
Platform: UNKNOWN
|
9
|
+
Description-Content-Type: text/markdown
|
11
10
|
Requires-Dist: six
|
12
11
|
Requires-Dist: farasapy
|
13
12
|
Requires-Dist: tqdm
|
@@ -20,32 +19,27 @@ Requires-Dist: torchtext (==0.14.0)
|
|
20
19
|
Requires-Dist: torchvision (==0.14.0)
|
21
20
|
Requires-Dist: seqeval (==1.2.2)
|
22
21
|
Requires-Dist: natsort (==7.1.1)
|
22
|
+
Requires-Dist: pandas (==1.2.4)
|
23
23
|
|
24
24
|
========
|
25
|
-
|
25
|
+
sinatools
|
26
26
|
========
|
27
27
|
|
28
|
+
.. image:: https://img.shields.io/pypi/v/sinatools.svg
|
29
|
+
:target: https://pypi.python.org/pypi/SinaTools
|
28
30
|
|
29
|
-
.. image:: https://img.shields.io/
|
30
|
-
|
31
|
-
|
32
|
-
.. image:: https://img.shields.io/travis/sina_institute/nlptools.svg
|
33
|
-
:target: https://travis-ci.com/sina_institute/SinaTools
|
34
|
-
|
35
|
-
.. image:: https://readthedocs.org/projects/nlptools/badge/?version=latest
|
36
|
-
:target: https://SinaTools.readthedocs.io/en/latest/?version=latest
|
37
|
-
:alt: Documentation Status
|
38
|
-
|
39
|
-
|
31
|
+
.. image:: https://img.shields.io/travis/sina_institute/sinatools.svg
|
32
|
+
:target: https://travis-ci.com/sina_institute/SinaTools
|
40
33
|
|
34
|
+
.. image:: https://readthedocs.org/projects/sinatools/badge/?version=latest
|
35
|
+
:target: https://SinaTools.readthedocs.io/en/latest/?version=latest
|
36
|
+
:alt: Documentation Status
|
41
37
|
|
42
38
|
Python Boilerplate contains all the boilerplate you need to create a Python package.
|
43
39
|
|
44
|
-
|
45
40
|
* Free software: MIT license
|
46
41
|
* Documentation: https://sina.birzeit.edu/sinatools/
|
47
42
|
|
48
|
-
|
49
43
|
Credits
|
50
44
|
-------
|
51
45
|
|
@@ -0,0 +1,101 @@
|
|
1
|
+
SinaTools-0.1.7.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
|
2
|
+
sinatools/VERSION,sha256=Gmytzwl0rsvqV5jsEYdTXHSbWrOb2vARjvgA3N9TGwY,5
|
3
|
+
sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
|
4
|
+
sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
|
5
|
+
sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
|
6
|
+
sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
|
7
|
+
sinatools/CLI/DataDownload/download_files.py,sha256=17CtswdAT66KO7hw3o87RaWbM-BxdUpsheE6bysP3-c,2302
|
8
|
+
sinatools/CLI/morphology/ALMA_multi_word.py,sha256=ZImJ1vtcpSHydI1BjJmK3KcMJbGBZX16kO4L6rxvBvA,2086
|
9
|
+
sinatools/CLI/morphology/morph_analyzer.py,sha256=ieIM47QK9Nct3MtCS9uq3h2rZN5r4qNhsLmlVeE6wiE,3503
|
10
|
+
sinatools/CLI/ner/corpus_entity_extractor.py,sha256=jsxTQsR4i8ZwsWrX1XxkYUbLGygYKV7-pWDiubfaANE,3751
|
11
|
+
sinatools/CLI/ner/entity_extractor.py,sha256=BHAs2nGKL9npHUXj-6FDHQCuR2jidvFJX8yUkgQKxhc,4436
|
12
|
+
sinatools/CLI/salma/salma_tools.py,sha256=8IDMSXjpM2u8jXc6c5JcI_l2CmiwdCxsUBJVN1Rrfk0,1971
|
13
|
+
sinatools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
+
sinatools/CLI/utils/arStrip.py,sha256=pOMh9e4O-vBixbv2HM5qjlA8-qJH3Zf7DeJVekPrgjo,3252
|
15
|
+
sinatools/CLI/utils/corpus_tokenizer.py,sha256=nH0T4h6urr_0Qy6-wN3PquOtnwybj0REde5Ts_OE4U8,1650
|
16
|
+
sinatools/CLI/utils/implication.py,sha256=3vw526ZL0WR8LiIKbjYibTQWE_UeYvHThc1W9-BlbHg,3133
|
17
|
+
sinatools/CLI/utils/jaccard.py,sha256=1zSkEQevB-1D5xcT__qmrgB1s8CISU70wDMBteCKCSo,4601
|
18
|
+
sinatools/CLI/utils/remove_latin.py,sha256=dzRzRapmM4mJwS-rhNy9PYQKS-ONMsRBmN1ZcPfEBfE,848
|
19
|
+
sinatools/CLI/utils/remove_punctuation.py,sha256=vJAZlEn7WGftZAFVFYnddkRrxdJ_rMmKB9vFZkY-jN4,1097
|
20
|
+
sinatools/CLI/utils/sentence_tokenizer.py,sha256=Wli8eiDbWSd_Z8UKpu_JkaS8jImowa1vnRL0oYCSfqw,2823
|
21
|
+
sinatools/CLI/utils/text_transliteration.py,sha256=vz-3kxWf8pNYVCqNAtBAiA6u_efrS5NtWT-ofN1NX6I,2014
|
22
|
+
sinatools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
23
|
+
sinatools/DataDownload/downloader.py,sha256=MbTPqqlg5vOTErxeVvdMn5k0TsYaG6kef2zHkeBLXlk,6480
|
24
|
+
sinatools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
|
25
|
+
sinatools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
|
26
|
+
sinatools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
|
27
|
+
sinatools/arabert/arabert/create_classification_data.py,sha256=BhemGNRbYz_Pun0Q5WerN2-9n-ILmU3tm4J-OlHw5-A,7678
|
28
|
+
sinatools/arabert/arabert/create_pretraining_data.py,sha256=2M-cF3CLHbQ0cdWrzFT6Frg1vVP4Y-CFoq8iEPyxgsE,18924
|
29
|
+
sinatools/arabert/arabert/extract_features.py,sha256=C1IzASrlX7u4_M2xdr_PjzWfTRZgklhUXA2WHKgQt-I,15585
|
30
|
+
sinatools/arabert/arabert/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
|
31
|
+
sinatools/arabert/arabert/modeling.py,sha256=KliecCmA1pP3owg0mYge6On3IRHunMF5kMLuEwc0VLw,40896
|
32
|
+
sinatools/arabert/arabert/optimization.py,sha256=Wx0Js6Zsfc3iVw-_7Q1SCnxfP_qqbdTAyFD-vZSpOyk,8153
|
33
|
+
sinatools/arabert/arabert/run_classifier.py,sha256=AdVGyvidlmbEp12b-PauiBo6EmFLEO7tqeJKuLhK2DA,38777
|
34
|
+
sinatools/arabert/arabert/run_pretraining.py,sha256=yO16nKkHDfcYA2Zx7vv8KN4te6_1qFOzyVeDzFT-DQw,21894
|
35
|
+
sinatools/arabert/arabert/run_squad.py,sha256=PORxgiByP8L6vZqAFkqgHPJ_ZjAlqlg64gtkdLmDNns,53456
|
36
|
+
sinatools/arabert/arabert/tokenization.py,sha256=R6xkyCb8_vgeksXiLeqDvV5vOnLb1cPNsvfDij6YVFk,14132
|
37
|
+
sinatools/arabert/araelectra/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
|
38
|
+
sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py,sha256=pIo6VFT3XXOYroZaab3msZAP6XjCKu0KcrIZQA0Pj8U,3881
|
39
|
+
sinatools/arabert/araelectra/build_pretraining_dataset.py,sha256=Z8ZmKznaE_2SPDRoPYR1SDhjTN_NTpNCFFuhUkykwl8,9041
|
40
|
+
sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py,sha256=W7HFr1XoO6bCDR7X7w-bOuwULFtTSjeKbJ2LHzzHf9k,3224
|
41
|
+
sinatools/arabert/araelectra/configure_finetuning.py,sha256=YfGLMdgN6Qqm357Mzy5UMjkuLPPWtBs7f4dA-DKE6JM,7768
|
42
|
+
sinatools/arabert/araelectra/configure_pretraining.py,sha256=oafQgu4WmVdxBcU5mSfXhPlvCk43CJwAWXC10Q58BlI,5801
|
43
|
+
sinatools/arabert/araelectra/flops_computation.py,sha256=krHTeuPH9xQu5ldprBOPJNlJRvC7fmmvXXqUjfWrzPE,9499
|
44
|
+
sinatools/arabert/araelectra/run_finetuning.py,sha256=JecbrSmGikBNyid4JKRZ49Rm5xFpt02WfgIIcs3TpcU,12976
|
45
|
+
sinatools/arabert/araelectra/run_pretraining.py,sha256=1K2aAFTY0p3iaLY0xkhTlm6v0B-Zun8SwEzz-K6RXM4,20665
|
46
|
+
sinatools/arabert/araelectra/finetune/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
|
47
|
+
sinatools/arabert/araelectra/finetune/feature_spec.py,sha256=cqNlBa2KK_G1-vkKm1EJUv6BoS3gesCUAHwVagZB6wM,1888
|
48
|
+
sinatools/arabert/araelectra/finetune/preprocessing.py,sha256=1mf7-IxknCRsobQZ-VV1zs4Cwt-mfOtoVxysDJa9LZ0,6657
|
49
|
+
sinatools/arabert/araelectra/finetune/scorer.py,sha256=PjRg0P5ANCtul2ute7ccq3mRCCoIAoCb-lVLlwd4rVY,1571
|
50
|
+
sinatools/arabert/araelectra/finetune/task.py,sha256=zM8M4PGSIrY2u6ytpmkQEXxG-jjoeN9wouEyVR23qeQ,1991
|
51
|
+
sinatools/arabert/araelectra/finetune/task_builder.py,sha256=Zsoiuw5M3Ca8QhaZVLVLZyWw09K5R75UeMuPmazMlHI,2768
|
52
|
+
sinatools/arabert/araelectra/model/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
|
53
|
+
sinatools/arabert/araelectra/model/modeling.py,sha256=5XLIutnmr-SFQOV_XntJ-U5evSCY-J2e9NjvlwVXKkk,40877
|
54
|
+
sinatools/arabert/araelectra/model/optimization.py,sha256=BCMb_C5hgBw7wC9ZR8AQ4lwoPopqLIcSiqcCrIjx9XU,7254
|
55
|
+
sinatools/arabert/araelectra/model/tokenization.py,sha256=9CkyPzs3L6OEPzN-7EWQDNQmW2mIJoZD4o1rn6xLdL4,11082
|
56
|
+
sinatools/arabert/araelectra/pretrain/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
|
57
|
+
sinatools/arabert/araelectra/pretrain/pretrain_data.py,sha256=NLgIcLAq1-MgtBNXYu_isDxnOY5k67SyADYy-8nzBok,5413
|
58
|
+
sinatools/arabert/araelectra/pretrain/pretrain_helpers.py,sha256=nFl7LEdxAU5kKwiodqJHzi-ty9jMFsCCNYOF__A69j8,9255
|
59
|
+
sinatools/arabert/araelectra/util/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
|
60
|
+
sinatools/arabert/araelectra/util/training_utils.py,sha256=7h_J1ljUWM0ynBcofEtjZWL_oAfZtTxEemQLkixgn-0,4142
|
61
|
+
sinatools/arabert/araelectra/util/utils.py,sha256=G0UAETUCZMlU9R9ASD9AXrWZeodWI1aZJEE9F-goaH4,2591
|
62
|
+
sinatools/arabert/aragpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
|
63
|
+
sinatools/arabert/aragpt2/create_pretraining_data.py,sha256=fFa2_DAyTwc8L2IqQbshsh_Ia26nj1qtVLzC6DooSac,3105
|
64
|
+
sinatools/arabert/aragpt2/train_bpe_tokenizer.py,sha256=b-8zHQ02fLmZV4GfjnrPptwjpX259F41SlnWzBrflMA,1888
|
65
|
+
sinatools/arabert/aragpt2/gpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
|
66
|
+
sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
|
67
|
+
sinatools/arabert/aragpt2/gpt2/optimization.py,sha256=iqh23cypRSRUt53wt2G5SbNNpJMwERM7gZAOKVh5l4U,8411
|
68
|
+
sinatools/arabert/aragpt2/gpt2/run_pretraining.py,sha256=4jjkUbvTO1DHoKJ89yKtlkkofcND_fyAunQ-mlnJhTM,13298
|
69
|
+
sinatools/arabert/aragpt2/grover/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
70
|
+
sinatools/arabert/aragpt2/grover/dataloader.py,sha256=-FWPTjtsvweEE1WaWRHBXfOSbsGiUmnXT3qK7KJP8cM,6853
|
71
|
+
sinatools/arabert/aragpt2/grover/modeling.py,sha256=XcUvFwqRaxAwWiJstrH2FPBvDJe03pTWIyipdMfWj9g,38280
|
72
|
+
sinatools/arabert/aragpt2/grover/modeling_gpt2.py,sha256=WFpCWn1792yATFzt8rZ0rpWvExfbLzV2BqiEs7llFUw,51602
|
73
|
+
sinatools/arabert/aragpt2/grover/optimization_adafactor.py,sha256=1geOsCWuv5xxtSnKDz9a8aY5SVwZ1MGq-xVQDBg4Gpg,9765
|
74
|
+
sinatools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKweY1aPz1XPDw6odld0,7102
|
75
|
+
sinatools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
|
76
|
+
sinatools/morphology/ALMA_multi_word.py,sha256=GPM2-N7_5JIZwNdi1we6gBG0rh59AlGM0XWYxmEE7jY,1283
|
77
|
+
sinatools/morphology/__init__.py,sha256=avTxtexZELp1Fya6cBNFLyeYPB31OcmQOlT2L-uAQnI,1386
|
78
|
+
sinatools/morphology/morph_analyzer.py,sha256=tA78gWg6iaE_G1c2xqxZoXZWNbvHBJLrTSxPyir5Xn8,6941
|
79
|
+
sinatools/ner/__init__.py,sha256=8R8epTEyvpbreLYTrC5M5lctlzZrNr7T7B4KmENnB3I,341
|
80
|
+
sinatools/ner/entity_extractor.py,sha256=amVU6tXoAAL9NcadfJlx1qyEPlxBY8wRo5Tn-ZLOVIw,3236
|
81
|
+
sinatools/salma/__init__.py,sha256=_by3PsXetNjkxSyg24nF592T-21JEWhPXzMAPzwDOhQ,378
|
82
|
+
sinatools/salma/settings.py,sha256=b_AqTxVWALuGXnsMd9KhnnwIo9-JEoWOTekB-7_xJCU,1111
|
83
|
+
sinatools/salma/views.py,sha256=G5W5BSr770NapWz5j6hcuwInrR40JKG-LkzP1OpcYeA,18416
|
84
|
+
sinatools/salma/wsd.py,sha256=vCiiR5h3bjAOHi3yxxkh_7GUgBWKQf297aHbO4Z8CBk,4436
|
85
|
+
sinatools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
86
|
+
sinatools/utils/charsets.py,sha256=rs82oZJqRqosZdTKXfFAJfJ5t4PxjMM_oAPsiWSWuwU,2817
|
87
|
+
sinatools/utils/implication.py,sha256=MsbI6S1LNY-fCxGMxFTuaV639r3QijkkdcfH48rvY7A,27804
|
88
|
+
sinatools/utils/jaccard.py,sha256=S7OgvaMqkN5HFgTZkKhMCNAuAnQ0LhRyXPN79jAzmKM,10113
|
89
|
+
sinatools/utils/parser.py,sha256=CPPtCrsbxUqsjhY5C9wTOgkAs6iw0k_WvMUxLEPM1IU,6168
|
90
|
+
sinatools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
|
91
|
+
sinatools/utils/text_transliteration.py,sha256=NQoXrxI-h0UXnvVtDA3skNJduxIy0IW26r46N4tDxGk,8766
|
92
|
+
sinatools/utils/tokenizer.py,sha256=QHyrVqJA_On4rKxexiWR2ovq4pI1-u6iZkdhRbK9tew,6676
|
93
|
+
sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
|
94
|
+
sinatools/utils/utils.py,sha256=vKkFOkYclMu8nXS_VZb6Kobx8QGKW9onXkkLCeiRb6g,32
|
95
|
+
SinaTools-0.1.7.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
|
96
|
+
SinaTools-0.1.7.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
|
97
|
+
SinaTools-0.1.7.dist-info/METADATA,sha256=TWtbd8m_tSIStY0O0mLGnf5y5zR0Yk7PVFAkBOwqrTo,1569
|
98
|
+
SinaTools-0.1.7.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
|
99
|
+
SinaTools-0.1.7.dist-info/entry_points.txt,sha256=9uGvOGRicf-CsHMaFyQjq1odtr3RMeOvEfiZwpDQ9VU,926
|
100
|
+
SinaTools-0.1.7.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
|
101
|
+
SinaTools-0.1.7.dist-info/RECORD,,
|
@@ -0,0 +1,18 @@
|
|
1
|
+
[console_scripts]
|
2
|
+
alma_multi_word = sinatools.CLI.morphology.ALMA_multi_word:main
|
3
|
+
appdatadir = sinatools.CLI.DataDownload.get_appdatadir:main
|
4
|
+
arStrip = sinatools.CLI.utils.arStrip:main
|
5
|
+
arabi_ner = sinatools.CLI.ner.entity_extractor:main
|
6
|
+
arabi_ner2 = sinatools.CLI.ner.corpus_entity_extractor:main
|
7
|
+
corpus_tokenizer = sinatools.CLI.utils.corpus_tokenizer:main
|
8
|
+
download_files = sinatools.CLI.DataDownload.download_files:main
|
9
|
+
implication = sinatools.CLI.utils.implication:main
|
10
|
+
install_env = sinatools.install_env:main
|
11
|
+
jaccard_similarity = sinatools.CLI.utils.jaccard:main
|
12
|
+
morphology_analyzer = sinatools.CLI.morphology.morph_analyzer:main
|
13
|
+
remove_latin = sinatools.CLI.utils.remove_latin:main
|
14
|
+
remove_punctuation = sinatools.CLI.utils.remove_punctuation:main
|
15
|
+
salma = sinatools.CLI.salma.salma_tools:main
|
16
|
+
sentence_tokenizer = sinatools.CLI.utils.sentence_tokenizer:main
|
17
|
+
transliterate = sinatools.CLI.utils.text_transliteration:main
|
18
|
+
|
@@ -0,0 +1 @@
|
|
1
|
+
sinatools
|
@@ -2,17 +2,17 @@
|
|
2
2
|
About:
|
3
3
|
------
|
4
4
|
|
5
|
-
The
|
5
|
+
The download_files tool is a command-line interface for downloading various NLP resources from pre-specified URLs. It is a part of the sinatools package and provides options to choose which files to download and to specify a download directory. The tool automatically handles file extraction for zip and tar.gz files.
|
6
6
|
|
7
7
|
Usage:
|
8
8
|
------
|
9
9
|
|
10
|
-
Below is the usage information that can be generated by running
|
10
|
+
Below is the usage information that can be generated by running download_files --help.
|
11
11
|
|
12
12
|
.. code-block:: none
|
13
13
|
|
14
14
|
Usage:
|
15
|
-
|
15
|
+
download_files [OPTIONS]
|
16
16
|
|
17
17
|
.. code-block:: none
|
18
18
|
|
@@ -26,7 +26,7 @@ Examples:
|
|
26
26
|
|
27
27
|
.. code-block:: none
|
28
28
|
|
29
|
-
|
29
|
+
download_files -f morph ner
|
30
30
|
This command will download only the `morph` and `ner` files to the default directory.
|
31
31
|
|
32
32
|
Note:
|
@@ -42,10 +42,10 @@ Note:
|
|
42
42
|
"""
|
43
43
|
|
44
44
|
import argparse
|
45
|
-
from
|
46
|
-
from
|
47
|
-
from
|
48
|
-
from
|
45
|
+
from sinatools.DataDownload.downloader import download_file
|
46
|
+
from sinatools.DataDownload.downloader import download_files
|
47
|
+
from sinatools.DataDownload.downloader import get_appdatadir
|
48
|
+
from sinatools.DataDownload.downloader import urls
|
49
49
|
|
50
50
|
|
51
51
|
def main():
|
@@ -68,4 +68,4 @@ def main():
|
|
68
68
|
if __name__ == '__main__':
|
69
69
|
main()
|
70
70
|
|
71
|
-
#
|
71
|
+
#download_files -f morph ner
|
@@ -1,16 +1,16 @@
|
|
1
1
|
"""
|
2
2
|
About:
|
3
3
|
------
|
4
|
-
The
|
4
|
+
The alma_multi_word tool performs multi-word morphological analysis using SinaTools' `ALMA_multi_word` utility. Given a multi-word Arabic text input, it returns a detailed analysis in JSON format.
|
5
5
|
|
6
6
|
Usage:
|
7
7
|
------
|
8
|
-
Below is the usage information that can be generated by running
|
8
|
+
Below is the usage information that can be generated by running alma_multi_word --help.
|
9
9
|
|
10
10
|
.. code-block:: none
|
11
11
|
|
12
|
-
|
13
|
-
|
12
|
+
alma_multi_word --multi_word=MULTI_WORD_TEXT
|
13
|
+
alma_multi_word --file
|
14
14
|
|
15
15
|
Options:
|
16
16
|
--------
|
@@ -27,25 +27,15 @@ Examples:
|
|
27
27
|
|
28
28
|
.. code-block:: none
|
29
29
|
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
Note:
|
34
|
-
-----
|
35
|
-
|
36
|
-
.. code-block:: none
|
37
|
-
|
38
|
-
- Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
|
39
|
-
- The tool returns results in JSON format with proper indentation for better readability.
|
40
|
-
- The quality and accuracy of the analysis depend on the underlying capabilities of the SinaTools' `ALMA_multi_word` utility.
|
41
|
-
- The tool is specifically designed for multi-word input. For single-word morphological analysis, other specific utilities/tools might be more appropriate.
|
30
|
+
alma_multi_word --multi_word "Your multi-word text here"
|
31
|
+
alma_multi_word --file "path/to/your/file.txt"
|
42
32
|
|
43
33
|
"""
|
44
34
|
|
45
35
|
import argparse
|
46
|
-
from
|
36
|
+
from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
|
47
37
|
import json
|
48
|
-
from
|
38
|
+
from sinatools.utils.readfile import read_file
|
49
39
|
|
50
40
|
def main():
|
51
41
|
parser = argparse.ArgumentParser(description='Multi-Word Analysis using SinaTools')
|
@@ -71,5 +61,5 @@ def main():
|
|
71
61
|
|
72
62
|
if __name__ == '__main__':
|
73
63
|
main()
|
74
|
-
#
|
75
|
-
#
|
64
|
+
#alma_multi_word --multi_word "Your multi-word text here"
|
65
|
+
#alma_multi_word --file "path/to/your/file.txt"
|
@@ -0,0 +1,80 @@
|
|
1
|
+
"""
|
2
|
+
About:
|
3
|
+
------
|
4
|
+
The morphology_analyzer command provides morphological analysis for Arabic text using the SinaTools morph_analyzer component. Users can specify the language, the desired analysis task (lemmatization, part-of-speech tagging, root extraction, or full morphological analysis), and a flag that selects whether only the most frequent solution or all solutions are returned.
|
5
|
+
|
6
|
+
Usage:
|
7
|
+
------
|
8
|
+
Below is the usage information that can be generated by running morphology_analyzer --help.
|
9
|
+
|
10
|
+
.. code-block:: none
|
11
|
+
|
12
|
+
morphology_analyzer --text=TEXT [OPTIONS]
|
13
|
+
morphology_analyzer --file=FILE [OPTIONS]
|
14
|
+
|
15
|
+
Options:
|
16
|
+
--------
|
17
|
+
|
18
|
+
.. code-block:: none
|
19
|
+
|
20
|
+
--text TEXT
|
21
|
+
The text that needs to be morphologically analyzed.
|
22
|
+
|
23
|
+
--file FILE
|
24
|
+
File containing the text to be morphologically analyzed
|
25
|
+
|
26
|
+
--language LANGUAGE [default=MSA]
|
27
|
+
Specifies the language for the analysis. In the current version, only MSA is supported.
|
28
|
+
|
29
|
+
--task TASK [default=full]
|
30
|
+
Determines the specific type of morphological analysis to be performed. Available options are:
|
31
|
+
- lemmatization: the morphological solution includes only the lemma_id, lemma, token, and token frequency.
|
32
|
+
- pos: the morphological solution includes only the part-of-speech, token, and token frequency.
|
33
|
+
- root: the morphological solution includes only the root, token, and token frequency.
|
34
|
+
- full: the morphological solution includes the lemma_id, lemma, part-of-speech, root, token, and token frequency.
|
35
|
+
The default is full.
|
36
|
+
|
37
|
+
--flag FLAG [default=1]
|
38
|
+
The flag to filter the returned results. If the flag is `1`, only the solution with the highest frequency is returned. If the flag is `*`, all solutions are returned in descending order of frequency, with the highest-frequency solution first. If not specified, the flag defaults to `1`.
|
39
|
+
|
40
|
+
Examples:
|
41
|
+
---------
|
42
|
+
|
43
|
+
.. code-block:: none
|
44
|
+
|
45
|
+
morphology_analyzer --text "Your Arabic text here" --language MSA --task full --flag 1
|
46
|
+
morphology_analyzer --file "path/to/your/file.txt" --language MSA --task full --flag 1
|
47
|
+
|
48
|
+
"""
|
49
|
+
|
50
|
+
import argparse
|
51
|
+
from sinatools.morphology.morph_analyzer import analyze
|
52
|
+
from sinatools.utils.readfile import read_file
|
53
|
+
|
54
|
+
def main():
|
55
|
+
parser = argparse.ArgumentParser(description='Morphological Analysis using SinaTools')
|
56
|
+
|
57
|
+
parser.add_argument('--text', type=str, help='Text to be morphologically analyzed')
|
58
|
+
parser.add_argument('--file', type=str, help='File containing the text to be morphologically analyzed')
|
59
|
+
parser.add_argument('--language', type=str, default='MSA', help='Language for analysis (default: MSA)')
|
60
|
+
parser.add_argument('--task', type=str, default='full', choices=['lemmatization', 'pos', 'root', 'full'], help='Task for the result filter [lemmatization, pos, root, full] (default: full)')
|
61
|
+
parser.add_argument('--flag', type=str, default='1', choices=['1','*'], help='The flag to filter the returned results')
|
62
|
+
|
63
|
+
args = parser.parse_args()
|
64
|
+
|
65
|
+
if args.text is None and args.file is None:
|
66
|
+
print("Error: Either --text or --file argument must be provided.")
|
67
|
+
return
|
68
|
+
|
69
|
+
# Get the input either from the --text argument or from the file specified in the --file argument
|
70
|
+
input_text = args.text if args.text else " ".join(read_file(args.file))
|
71
|
+
|
72
|
+
# Perform morphological analysis
|
73
|
+
results = analyze(input_text, args.language, args.task, args.flag)
|
74
|
+
|
75
|
+
# Print the results
|
76
|
+
for result in results:
|
77
|
+
print(result)
|
78
|
+
|
79
|
+
if __name__ == '__main__':
|
80
|
+
main()
|
@@ -1,8 +1,10 @@
|
|
1
1
|
import os
|
2
2
|
import csv
|
3
|
-
from
|
4
|
-
from
|
3
|
+
from sinatools.utils.tokenizer import sentence_tokenizer
|
4
|
+
from sinatools.utils.tokenizers_words import simple_word_tokenize
|
5
5
|
import pandas as pd
|
6
|
+
import argparse
|
7
|
+
from sinatools.ner.entity_extractor import ner
|
6
8
|
|
7
9
|
"""
|
8
10
|
CSV NER Tagging Tool
|
@@ -14,12 +16,6 @@ Run the script with the following command:
|
|
14
16
|
arabi_ner2 input.csv --text-columns "TextColumn1,TextColumn2" --additional-columns "Column3,Column4" --output-csv output.csv
|
15
17
|
"""
|
16
18
|
|
17
|
-
import argparse
|
18
|
-
import pandas as pd
|
19
|
-
from nlptools.utils.sentence_tokenizer import sent_tokenize
|
20
|
-
from nlptools.morphology.tokenizers_words import simple_word_tokenize
|
21
|
-
from nlptools.arabiner.bin.infer import ner
|
22
|
-
|
23
19
|
def infer(sentence):
|
24
20
|
output = ner(sentence)
|
25
21
|
return [word[1] for word in output]
|
@@ -39,7 +35,7 @@ def corpus_tokenizer(input_csv, output_csv, text_column, additional_columns, row
|
|
39
35
|
|
40
36
|
df = pd.read_csv(input_csv)
|
41
37
|
for index, row in df.iterrows():
|
42
|
-
sentences =
|
38
|
+
sentences = sentence_tokenizer(row[text_column], dot=True, new_line=True, question_mark=False, exclamation_mark=False)
|
43
39
|
for sentence_id, sentence in enumerate(sentences, start=1):
|
44
40
|
words = simple_word_tokenize(sentence)
|
45
41
|
global_sentence_id += 1
|
@@ -45,9 +45,9 @@ Note:
|
|
45
45
|
import argparse
|
46
46
|
import json
|
47
47
|
import pandas as pd
|
48
|
-
from
|
49
|
-
from
|
50
|
-
from
|
48
|
+
from sinatools.ner.entity_extractor import ner
|
49
|
+
from sinatools.utils.tokenizer import corpus_tokenizer
|
50
|
+
from sinatools.utils.tokenizers_words import simple_word_tokenize
|
51
51
|
|
52
52
|
|
53
53
|
def infer(sentence):
|
@@ -110,8 +110,4 @@ if __name__ == '__main__':
|
|
110
110
|
main()
|
111
111
|
|
112
112
|
#arabi_ner --text "Your text here."
|
113
|
-
#arabi_ner --dir /path/to/your/directory --output_csv output.csv
|
114
|
-
|
115
|
-
#Each unique sentence in the CSV file is processed once by the infer function to get the NER tags for all the words in the sentence.
|
116
|
-
#The current_word_position variable is used to keep track of the position within the list of NER tags returned by infer, ensuring that each word in the CSV file is assigned the correct NER tag.
|
117
|
-
#The final CSV file will contain an additional column, NER tags, which contains the NER tag for each word in the Sentence column of the CSV file.
|
113
|
+
#arabi_ner --dir /path/to/your/directory --output_csv output.csv
|
@@ -11,8 +11,8 @@ Below is the usage information that can be generated by running the command with
|
|
11
11
|
|
12
12
|
.. code-block:: none
|
13
13
|
|
14
|
-
|
15
|
-
|
14
|
+
salma --text=TEXT
|
15
|
+
salma --file=INPUT_FILE
|
16
16
|
|
17
17
|
Options:
|
18
18
|
--------
|
@@ -27,8 +27,8 @@ Examples:
|
|
27
27
|
---------
|
28
28
|
.. code-block:: none
|
29
29
|
|
30
|
-
|
31
|
-
|
30
|
+
salma --text "your Arabic sentence here"
|
31
|
+
salma --file "path/to/your/file.txt"
|
32
32
|
|
33
33
|
Note:
|
34
34
|
-----
|
@@ -42,8 +42,8 @@ Note:
|
|
42
42
|
|
43
43
|
import argparse
|
44
44
|
import json
|
45
|
-
from
|
46
|
-
from
|
45
|
+
from sinatools.salma.views import SALMA
|
46
|
+
from sinatools.utils.readfile import read_file
|
47
47
|
|
48
48
|
def main():
|
49
49
|
parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools')
|
@@ -64,5 +64,5 @@ def main():
|
|
64
64
|
if __name__ == "__main__":
|
65
65
|
main()
|
66
66
|
|
67
|
-
#
|
68
|
-
#
|
67
|
+
#salma --text "your Arabic sentence here"
|
68
|
+
#salma --file "path/to/your/file.txt"
|
@@ -2,17 +2,17 @@
|
|
2
2
|
|
3
3
|
About:
|
4
4
|
------
|
5
|
-
The
|
5
|
+
The arStrip command strips various elements from Arabic text using SinaTools' `arStrip` utility. It can selectively strip diacritics, small diacritics, shaddah, digits, alif, and special characters.
|
6
6
|
|
7
7
|
Usage:
|
8
8
|
------
|
9
|
-
Below is the usage information that can be generated by running
|
9
|
+
Below is the usage information that can be generated by running arStrip --help.
|
10
10
|
|
11
11
|
.. code-block:: none
|
12
12
|
|
13
13
|
Usage:
|
14
|
-
|
15
|
-
|
14
|
+
arStrip --text=TEXT [OPTIONS]
|
15
|
+
arStrip --file "path/to/your/file.txt" [OPTIONS]
|
16
16
|
|
17
17
|
.. code-block:: none
|
18
18
|
|
@@ -43,27 +43,16 @@ Below is the usage information that can be generated by running sina_arStrip --h
|
|
43
43
|
|
44
44
|
Examples:
|
45
45
|
---------
|
46
|
-
|
47
|
-
.. code-block:: none
|
48
|
-
|
49
|
-
sina_arStrip --text "مُختَبَر سينا لحوسبة اللغة!" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
|
50
|
-
|
51
|
-
sina_arStrip --file "path/to/your/file.txt" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
|
52
|
-
|
53
|
-
Note:
|
54
|
-
-----
|
55
|
-
|
56
46
|
.. code-block:: none
|
57
47
|
|
58
|
-
|
59
|
-
|
60
|
-
- Stripping certain elements might change the meaning or readability of the text. Use it judiciously.
|
48
|
+
arStrip --text "مُختَبَر سينا لحوسبة اللغة!" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
|
49
|
+
arStrip --file "path/to/your/file.txt" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
|
61
50
|
|
62
51
|
"""
|
63
52
|
|
64
53
|
import argparse
|
65
|
-
from
|
66
|
-
from
|
54
|
+
from sinatools.utils.parser import arStrip
|
55
|
+
from sinatools.utils.readfile import read_file
|
67
56
|
|
68
57
|
def main():
|
69
58
|
parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools')
|
@@ -95,5 +84,5 @@ def main():
|
|
95
84
|
if __name__ == '__main__':
|
96
85
|
main()
|
97
86
|
|
98
|
-
#
|
99
|
-
#
|
87
|
+
#arStrip --text "example text" --diacs=True
|
88
|
+
#arStrip --file "path/to/your/file.txt" --diacs=True
|
@@ -0,0 +1,50 @@
|
|
1
|
+
"""
|
2
|
+
|
3
|
+
About:
|
4
|
+
------
|
5
|
+
The corpus_tokenizer command offers functionality to tokenize a corpus and write the results to a CSV file. It recursively searches through a specified directory for text files, tokenizes the content, and outputs the results, including various metadata, to a specified CSV file.
|
6
|
+
|
7
|
+
Usage:
|
8
|
+
-------
|
9
|
+
Below is the usage information that can be generated by running corpus_tokenizer --help.
|
10
|
+
|
11
|
+
.. code-block:: none
|
12
|
+
|
13
|
+
Usage:
|
14
|
+
corpus_tokenizer --dir_path=DIR_PATH --output_csv=OUTPUT_CSV
|
15
|
+
|
16
|
+
.. code-block:: none
|
17
|
+
--dir_path
|
18
|
+
The path to the directory containing the text files.
|
19
|
+
|
20
|
+
--output_csv
|
21
|
+
The path to the output CSV file.
|
22
|
+
|
23
|
+
Examples:
|
24
|
+
---------
|
25
|
+
.. code-block:: none
|
26
|
+
corpus_tokenizer --dir_path "/path/to/text/directory/of/files" --output_csv "outputFile.csv"
|
27
|
+
"""
|
28
|
+
|
29
|
+
import argparse
|
30
|
+
from sinatools.utils.tokenizer import corpus_tokenizer
|
31
|
+
|
32
|
+
# Define the main function that will parse the arguments
|
33
|
+
def main():
|
34
|
+
# Create an ArgumentParser object
|
35
|
+
parser = argparse.ArgumentParser(description='Tokenize the corpus and write the results to a CSV file.')
|
36
|
+
|
37
|
+
# Add arguments to the parser
|
38
|
+
parser.add_argument('--dir_path', type=str, help='The path to the directory containing the text files.')
|
39
|
+
parser.add_argument('--output_csv', type=str, help='The path to the output CSV file.')
|
40
|
+
|
41
|
+
# Parse the command-line arguments
|
42
|
+
args = parser.parse_args()
|
43
|
+
|
44
|
+
# Call the corpus_tokenizer function with the parsed arguments
|
45
|
+
corpus_tokenizer(args.dir_path, args.output_csv)
|
46
|
+
|
47
|
+
# Call the main function when the script is executed
|
48
|
+
if __name__ == '__main__':
|
49
|
+
main()
|
50
|
+
|
@@ -1,18 +1,18 @@
|
|
1
1
|
"""
|
2
2
|
About:
|
3
3
|
------
|
4
|
-
The
|
4
|
+
The implication tool evaluates the implication between two words using the functionalities provided by the `Implication` class of SinaTools. This tool can be utilized to determine the relationship between two words and understand if one implies the other.
|
5
5
|
|
6
6
|
Usage:
|
7
7
|
------
|
8
|
-
Below is the usage information that can be generated by running
|
8
|
+
Below is the usage information that can be generated by running implication --help.
|
9
9
|
|
10
10
|
.. code-block:: none
|
11
11
|
|
12
12
|
Usage:
|
13
|
-
|
13
|
+
implication --inputWord1=WORD1 --inputWord2=WORD2
|
14
14
|
|
15
|
-
|
15
|
+
implication --file1=File1 --file2=File2
|
16
16
|
|
17
17
|
.. code-block:: none
|
18
18
|
|
@@ -33,9 +33,9 @@ Examples:
|
|
33
33
|
|
34
34
|
.. code-block:: none
|
35
35
|
|
36
|
-
|
36
|
+
implication --inputWord1 "word1" --inputWord2 "word2"
|
37
37
|
|
38
|
-
|
38
|
+
implication --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt"
|
39
39
|
|
40
40
|
Note:
|
41
41
|
-----
|
@@ -47,7 +47,7 @@ Note:
|
|
47
47
|
|
48
48
|
"""
|
49
49
|
import argparse
|
50
|
-
from
|
50
|
+
from sinatools.utils.implication import Implication
|
51
51
|
|
52
52
|
def read_file(file_path):
|
53
53
|
with open(file_path, 'r', encoding='utf-8') as file:
|
@@ -86,7 +86,7 @@ def main():
|
|
86
86
|
|
87
87
|
if __name__ == '__main__':
|
88
88
|
main()
|
89
|
-
#
|
90
|
-
#
|
89
|
+
# implication --inputWord1 "word1" --inputWord2 "word2"
|
90
|
+
# implication --file1 "path/to/your/firstfile.txt" --file2 "path/to/your/secondfile.txt"
|
91
91
|
|
92
92
|
|