SinaTools 0.1.4__py2.py3-none-any.whl → 0.1.7__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. {SinaTools-0.1.4.dist-info → SinaTools-0.1.7.dist-info}/METADATA +14 -20
  2. SinaTools-0.1.7.dist-info/RECORD +101 -0
  3. SinaTools-0.1.7.dist-info/entry_points.txt +18 -0
  4. SinaTools-0.1.7.dist-info/top_level.txt +1 -0
  5. {nlptools → sinatools}/CLI/DataDownload/download_files.py +9 -9
  6. {nlptools → sinatools}/CLI/morphology/ALMA_multi_word.py +10 -20
  7. sinatools/CLI/morphology/morph_analyzer.py +80 -0
  8. nlptools/CLI/arabiner/bin/infer2.py → sinatools/CLI/ner/corpus_entity_extractor.py +5 -9
  9. nlptools/CLI/arabiner/bin/infer.py → sinatools/CLI/ner/entity_extractor.py +4 -8
  10. {nlptools → sinatools}/CLI/salma/salma_tools.py +8 -8
  11. {nlptools → sinatools}/CLI/utils/arStrip.py +10 -21
  12. sinatools/CLI/utils/corpus_tokenizer.py +50 -0
  13. {nlptools → sinatools}/CLI/utils/implication.py +9 -9
  14. {nlptools → sinatools}/CLI/utils/jaccard.py +10 -10
  15. sinatools/CLI/utils/remove_latin.py +34 -0
  16. sinatools/CLI/utils/remove_punctuation.py +42 -0
  17. {nlptools → sinatools}/CLI/utils/sentence_tokenizer.py +9 -22
  18. {nlptools → sinatools}/CLI/utils/text_transliteration.py +10 -17
  19. {nlptools → sinatools}/DataDownload/downloader.py +9 -9
  20. sinatools/VERSION +1 -0
  21. {nlptools → sinatools}/__init__.py +1 -1
  22. {nlptools → sinatools}/morphology/ALMA_multi_word.py +4 -5
  23. {nlptools → sinatools}/morphology/__init__.py +4 -14
  24. sinatools/morphology/morph_analyzer.py +172 -0
  25. sinatools/ner/__init__.py +12 -0
  26. nlptools/arabiner/bin/infer.py → sinatools/ner/entity_extractor.py +9 -8
  27. {nlptools → sinatools}/salma/__init__.py +2 -2
  28. {nlptools → sinatools}/salma/settings.py +1 -1
  29. {nlptools → sinatools}/salma/views.py +9 -9
  30. {nlptools → sinatools}/salma/wsd.py +2 -2
  31. {nlptools/morphology → sinatools/utils}/charsets.py +1 -3
  32. {nlptools → sinatools}/utils/implication.py +10 -10
  33. {nlptools → sinatools}/utils/jaccard.py +2 -2
  34. {nlptools → sinatools}/utils/parser.py +18 -21
  35. {nlptools → sinatools}/utils/text_transliteration.py +1 -1
  36. nlptools/utils/corpus_tokenizer.py → sinatools/utils/tokenizer.py +58 -5
  37. {nlptools/morphology → sinatools/utils}/tokenizers_words.py +3 -6
  38. SinaTools-0.1.4.dist-info/RECORD +0 -122
  39. SinaTools-0.1.4.dist-info/entry_points.txt +0 -18
  40. SinaTools-0.1.4.dist-info/top_level.txt +0 -1
  41. nlptools/CLI/morphology/morph_analyzer.py +0 -91
  42. nlptools/CLI/utils/corpus_tokenizer.py +0 -74
  43. nlptools/CLI/utils/latin_remove.py +0 -51
  44. nlptools/CLI/utils/remove_Punc.py +0 -53
  45. nlptools/VERSION +0 -1
  46. nlptools/arabiner/bin/__init__.py +0 -14
  47. nlptools/arabiner/bin/eval.py +0 -87
  48. nlptools/arabiner/bin/process.py +0 -140
  49. nlptools/arabiner/bin/train.py +0 -221
  50. nlptools/arabiner/data/__init__.py +0 -1
  51. nlptools/arabiner/data/datasets.py +0 -146
  52. nlptools/arabiner/data/transforms.py +0 -118
  53. nlptools/arabiner/nn/BaseModel.py +0 -22
  54. nlptools/arabiner/nn/BertNestedTagger.py +0 -34
  55. nlptools/arabiner/nn/BertSeqTagger.py +0 -17
  56. nlptools/arabiner/nn/__init__.py +0 -3
  57. nlptools/arabiner/trainers/BaseTrainer.py +0 -117
  58. nlptools/arabiner/trainers/BertNestedTrainer.py +0 -203
  59. nlptools/arabiner/trainers/BertTrainer.py +0 -163
  60. nlptools/arabiner/trainers/__init__.py +0 -3
  61. nlptools/arabiner/utils/__init__.py +0 -0
  62. nlptools/arabiner/utils/data.py +0 -124
  63. nlptools/arabiner/utils/helpers.py +0 -151
  64. nlptools/arabiner/utils/metrics.py +0 -69
  65. nlptools/morphology/morph_analyzer.py +0 -171
  66. nlptools/morphology/settings.py +0 -8
  67. nlptools/utils/__init__.py +0 -0
  68. nlptools/utils/sentence_tokenizer.py +0 -53
  69. {SinaTools-0.1.4.data/data/nlptools → SinaTools-0.1.7.data/data/sinatools}/environment.yml +0 -0
  70. {SinaTools-0.1.4.dist-info → SinaTools-0.1.7.dist-info}/AUTHORS.rst +0 -0
  71. {SinaTools-0.1.4.dist-info → SinaTools-0.1.7.dist-info}/LICENSE +0 -0
  72. {SinaTools-0.1.4.dist-info → SinaTools-0.1.7.dist-info}/WHEEL +0 -0
  73. {nlptools → sinatools}/CLI/utils/__init__.py +0 -0
  74. {nlptools → sinatools}/DataDownload/__init__.py +0 -0
  75. {nlptools → sinatools}/arabert/__init__.py +0 -0
  76. {nlptools → sinatools}/arabert/arabert/__init__.py +0 -0
  77. {nlptools → sinatools}/arabert/arabert/create_classification_data.py +0 -0
  78. {nlptools → sinatools}/arabert/arabert/create_pretraining_data.py +0 -0
  79. {nlptools → sinatools}/arabert/arabert/extract_features.py +0 -0
  80. {nlptools → sinatools}/arabert/arabert/lamb_optimizer.py +0 -0
  81. {nlptools → sinatools}/arabert/arabert/modeling.py +0 -0
  82. {nlptools → sinatools}/arabert/arabert/optimization.py +0 -0
  83. {nlptools → sinatools}/arabert/arabert/run_classifier.py +0 -0
  84. {nlptools → sinatools}/arabert/arabert/run_pretraining.py +0 -0
  85. {nlptools → sinatools}/arabert/arabert/run_squad.py +0 -0
  86. {nlptools → sinatools}/arabert/arabert/tokenization.py +0 -0
  87. {nlptools → sinatools}/arabert/araelectra/__init__.py +0 -0
  88. {nlptools → sinatools}/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
  89. {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset.py +0 -0
  90. {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
  91. {nlptools → sinatools}/arabert/araelectra/configure_finetuning.py +0 -0
  92. {nlptools → sinatools}/arabert/araelectra/configure_pretraining.py +0 -0
  93. {nlptools → sinatools}/arabert/araelectra/finetune/__init__.py +0 -0
  94. {nlptools → sinatools}/arabert/araelectra/finetune/feature_spec.py +0 -0
  95. {nlptools → sinatools}/arabert/araelectra/finetune/preprocessing.py +0 -0
  96. {nlptools → sinatools}/arabert/araelectra/finetune/scorer.py +0 -0
  97. {nlptools → sinatools}/arabert/araelectra/finetune/task.py +0 -0
  98. {nlptools → sinatools}/arabert/araelectra/finetune/task_builder.py +0 -0
  99. {nlptools → sinatools}/arabert/araelectra/flops_computation.py +0 -0
  100. {nlptools → sinatools}/arabert/araelectra/model/__init__.py +0 -0
  101. {nlptools → sinatools}/arabert/araelectra/model/modeling.py +0 -0
  102. {nlptools → sinatools}/arabert/araelectra/model/optimization.py +0 -0
  103. {nlptools → sinatools}/arabert/araelectra/model/tokenization.py +0 -0
  104. {nlptools → sinatools}/arabert/araelectra/pretrain/__init__.py +0 -0
  105. {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_data.py +0 -0
  106. {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
  107. {nlptools → sinatools}/arabert/araelectra/run_finetuning.py +0 -0
  108. {nlptools → sinatools}/arabert/araelectra/run_pretraining.py +0 -0
  109. {nlptools → sinatools}/arabert/araelectra/util/__init__.py +0 -0
  110. {nlptools → sinatools}/arabert/araelectra/util/training_utils.py +0 -0
  111. {nlptools → sinatools}/arabert/araelectra/util/utils.py +0 -0
  112. {nlptools → sinatools}/arabert/aragpt2/__init__.py +0 -0
  113. {nlptools → sinatools}/arabert/aragpt2/create_pretraining_data.py +0 -0
  114. {nlptools → sinatools}/arabert/aragpt2/gpt2/__init__.py +0 -0
  115. {nlptools → sinatools}/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
  116. {nlptools → sinatools}/arabert/aragpt2/gpt2/optimization.py +0 -0
  117. {nlptools → sinatools}/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
  118. {nlptools → sinatools}/arabert/aragpt2/grover/__init__.py +0 -0
  119. {nlptools → sinatools}/arabert/aragpt2/grover/dataloader.py +0 -0
  120. {nlptools → sinatools}/arabert/aragpt2/grover/modeling.py +0 -0
  121. {nlptools → sinatools}/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
  122. {nlptools → sinatools}/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
  123. {nlptools → sinatools}/arabert/aragpt2/grover/train_tpu.py +0 -0
  124. {nlptools → sinatools}/arabert/aragpt2/grover/utils.py +0 -0
  125. {nlptools → sinatools}/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
  126. {nlptools → sinatools}/arabert/preprocess.py +0 -0
  127. {nlptools → sinatools}/environment.yml +0 -0
  128. {nlptools → sinatools}/install_env.py +0 -0
  129. /nlptools/nlptools.py → /sinatools/sinatools.py +0 -0
  130. {nlptools/arabiner → sinatools/utils}/__init__.py +0 -0
  131. {nlptools → sinatools}/utils/readfile.py +0 -0
  132. {nlptools → sinatools}/utils/utils.py +0 -0
@@ -1,13 +1,12 @@
  Metadata-Version: 2.1
  Name: SinaTools
- Version: 0.1.4
- Summary: UNKNOWN
- Home-page: https://github.com/SinaLab/nlptools
- Author: UNKNOWN
- Author-email: UNKNOWN
+ Version: 0.1.7
+ Summary: A short description of your project
+ Home-page: https://github.com/SinaLab/sinatools
  License: MIT license
- Keywords: nlptools
+ Keywords: sinatools
  Platform: UNKNOWN
+ Description-Content-Type: text/markdown
  Requires-Dist: six
  Requires-Dist: farasapy
  Requires-Dist: tqdm
@@ -20,32 +19,27 @@ Requires-Dist: torchtext (==0.14.0)
  Requires-Dist: torchvision (==0.14.0)
  Requires-Dist: seqeval (==1.2.2)
  Requires-Dist: natsort (==7.1.1)
+ Requires-Dist: pandas (==1.2.4)

  ========
- nlptools
+ sinatools
  ========

+ .. image:: https://img.shields.io/pypi/v/sinatools.svg
+ :target: https://pypi.python.org/pypi/SinaTools

- .. image:: https://img.shields.io/pypi/v/nlptools.svg
- :target: https://pypi.python.org/pypi/SinaTools
-
- .. image:: https://img.shields.io/travis/sina_institute/nlptools.svg
- :target: https://travis-ci.com/sina_institute/SinaTools
-
- .. image:: https://readthedocs.org/projects/nlptools/badge/?version=latest
- :target: https://SinaTools.readthedocs.io/en/latest/?version=latest
- :alt: Documentation Status
-
-
+ .. image:: https://img.shields.io/travis/sina_institute/sinatools.svg
+ :target: https://travis-ci.com/sina_institute/SinaTools

+ .. image:: https://readthedocs.org/projects/sinatools/badge/?version=latest
+ :target: https://SinaTools.readthedocs.io/en/latest/?version=latest
+ :alt: Documentation Status

  Python Boilerplate contains all the boilerplate you need to create a Python package.

-
  * Free software: MIT license
  * Documentation: https://sina.birzeit.edu/sinatools/

-
  Credits
  -------

@@ -0,0 +1,101 @@
+ SinaTools-0.1.7.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+ sinatools/VERSION,sha256=Gmytzwl0rsvqV5jsEYdTXHSbWrOb2vARjvgA3N9TGwY,5
+ sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
+ sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+ sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
+ sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
+ sinatools/CLI/DataDownload/download_files.py,sha256=17CtswdAT66KO7hw3o87RaWbM-BxdUpsheE6bysP3-c,2302
+ sinatools/CLI/morphology/ALMA_multi_word.py,sha256=ZImJ1vtcpSHydI1BjJmK3KcMJbGBZX16kO4L6rxvBvA,2086
+ sinatools/CLI/morphology/morph_analyzer.py,sha256=ieIM47QK9Nct3MtCS9uq3h2rZN5r4qNhsLmlVeE6wiE,3503
+ sinatools/CLI/ner/corpus_entity_extractor.py,sha256=jsxTQsR4i8ZwsWrX1XxkYUbLGygYKV7-pWDiubfaANE,3751
+ sinatools/CLI/ner/entity_extractor.py,sha256=BHAs2nGKL9npHUXj-6FDHQCuR2jidvFJX8yUkgQKxhc,4436
+ sinatools/CLI/salma/salma_tools.py,sha256=8IDMSXjpM2u8jXc6c5JcI_l2CmiwdCxsUBJVN1Rrfk0,1971
+ sinatools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sinatools/CLI/utils/arStrip.py,sha256=pOMh9e4O-vBixbv2HM5qjlA8-qJH3Zf7DeJVekPrgjo,3252
+ sinatools/CLI/utils/corpus_tokenizer.py,sha256=nH0T4h6urr_0Qy6-wN3PquOtnwybj0REde5Ts_OE4U8,1650
+ sinatools/CLI/utils/implication.py,sha256=3vw526ZL0WR8LiIKbjYibTQWE_UeYvHThc1W9-BlbHg,3133
+ sinatools/CLI/utils/jaccard.py,sha256=1zSkEQevB-1D5xcT__qmrgB1s8CISU70wDMBteCKCSo,4601
+ sinatools/CLI/utils/remove_latin.py,sha256=dzRzRapmM4mJwS-rhNy9PYQKS-ONMsRBmN1ZcPfEBfE,848
+ sinatools/CLI/utils/remove_punctuation.py,sha256=vJAZlEn7WGftZAFVFYnddkRrxdJ_rMmKB9vFZkY-jN4,1097
+ sinatools/CLI/utils/sentence_tokenizer.py,sha256=Wli8eiDbWSd_Z8UKpu_JkaS8jImowa1vnRL0oYCSfqw,2823
+ sinatools/CLI/utils/text_transliteration.py,sha256=vz-3kxWf8pNYVCqNAtBAiA6u_efrS5NtWT-ofN1NX6I,2014
+ sinatools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sinatools/DataDownload/downloader.py,sha256=MbTPqqlg5vOTErxeVvdMn5k0TsYaG6kef2zHkeBLXlk,6480
+ sinatools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
+ sinatools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
+ sinatools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
+ sinatools/arabert/arabert/create_classification_data.py,sha256=BhemGNRbYz_Pun0Q5WerN2-9n-ILmU3tm4J-OlHw5-A,7678
+ sinatools/arabert/arabert/create_pretraining_data.py,sha256=2M-cF3CLHbQ0cdWrzFT6Frg1vVP4Y-CFoq8iEPyxgsE,18924
+ sinatools/arabert/arabert/extract_features.py,sha256=C1IzASrlX7u4_M2xdr_PjzWfTRZgklhUXA2WHKgQt-I,15585
+ sinatools/arabert/arabert/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
+ sinatools/arabert/arabert/modeling.py,sha256=KliecCmA1pP3owg0mYge6On3IRHunMF5kMLuEwc0VLw,40896
+ sinatools/arabert/arabert/optimization.py,sha256=Wx0Js6Zsfc3iVw-_7Q1SCnxfP_qqbdTAyFD-vZSpOyk,8153
+ sinatools/arabert/arabert/run_classifier.py,sha256=AdVGyvidlmbEp12b-PauiBo6EmFLEO7tqeJKuLhK2DA,38777
+ sinatools/arabert/arabert/run_pretraining.py,sha256=yO16nKkHDfcYA2Zx7vv8KN4te6_1qFOzyVeDzFT-DQw,21894
+ sinatools/arabert/arabert/run_squad.py,sha256=PORxgiByP8L6vZqAFkqgHPJ_ZjAlqlg64gtkdLmDNns,53456
+ sinatools/arabert/arabert/tokenization.py,sha256=R6xkyCb8_vgeksXiLeqDvV5vOnLb1cPNsvfDij6YVFk,14132
+ sinatools/arabert/araelectra/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
+ sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py,sha256=pIo6VFT3XXOYroZaab3msZAP6XjCKu0KcrIZQA0Pj8U,3881
+ sinatools/arabert/araelectra/build_pretraining_dataset.py,sha256=Z8ZmKznaE_2SPDRoPYR1SDhjTN_NTpNCFFuhUkykwl8,9041
+ sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py,sha256=W7HFr1XoO6bCDR7X7w-bOuwULFtTSjeKbJ2LHzzHf9k,3224
+ sinatools/arabert/araelectra/configure_finetuning.py,sha256=YfGLMdgN6Qqm357Mzy5UMjkuLPPWtBs7f4dA-DKE6JM,7768
+ sinatools/arabert/araelectra/configure_pretraining.py,sha256=oafQgu4WmVdxBcU5mSfXhPlvCk43CJwAWXC10Q58BlI,5801
+ sinatools/arabert/araelectra/flops_computation.py,sha256=krHTeuPH9xQu5ldprBOPJNlJRvC7fmmvXXqUjfWrzPE,9499
+ sinatools/arabert/araelectra/run_finetuning.py,sha256=JecbrSmGikBNyid4JKRZ49Rm5xFpt02WfgIIcs3TpcU,12976
+ sinatools/arabert/araelectra/run_pretraining.py,sha256=1K2aAFTY0p3iaLY0xkhTlm6v0B-Zun8SwEzz-K6RXM4,20665
+ sinatools/arabert/araelectra/finetune/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
+ sinatools/arabert/araelectra/finetune/feature_spec.py,sha256=cqNlBa2KK_G1-vkKm1EJUv6BoS3gesCUAHwVagZB6wM,1888
+ sinatools/arabert/araelectra/finetune/preprocessing.py,sha256=1mf7-IxknCRsobQZ-VV1zs4Cwt-mfOtoVxysDJa9LZ0,6657
+ sinatools/arabert/araelectra/finetune/scorer.py,sha256=PjRg0P5ANCtul2ute7ccq3mRCCoIAoCb-lVLlwd4rVY,1571
+ sinatools/arabert/araelectra/finetune/task.py,sha256=zM8M4PGSIrY2u6ytpmkQEXxG-jjoeN9wouEyVR23qeQ,1991
+ sinatools/arabert/araelectra/finetune/task_builder.py,sha256=Zsoiuw5M3Ca8QhaZVLVLZyWw09K5R75UeMuPmazMlHI,2768
+ sinatools/arabert/araelectra/model/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
+ sinatools/arabert/araelectra/model/modeling.py,sha256=5XLIutnmr-SFQOV_XntJ-U5evSCY-J2e9NjvlwVXKkk,40877
+ sinatools/arabert/araelectra/model/optimization.py,sha256=BCMb_C5hgBw7wC9ZR8AQ4lwoPopqLIcSiqcCrIjx9XU,7254
+ sinatools/arabert/araelectra/model/tokenization.py,sha256=9CkyPzs3L6OEPzN-7EWQDNQmW2mIJoZD4o1rn6xLdL4,11082
+ sinatools/arabert/araelectra/pretrain/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
+ sinatools/arabert/araelectra/pretrain/pretrain_data.py,sha256=NLgIcLAq1-MgtBNXYu_isDxnOY5k67SyADYy-8nzBok,5413
+ sinatools/arabert/araelectra/pretrain/pretrain_helpers.py,sha256=nFl7LEdxAU5kKwiodqJHzi-ty9jMFsCCNYOF__A69j8,9255
+ sinatools/arabert/araelectra/util/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
+ sinatools/arabert/araelectra/util/training_utils.py,sha256=7h_J1ljUWM0ynBcofEtjZWL_oAfZtTxEemQLkixgn-0,4142
+ sinatools/arabert/araelectra/util/utils.py,sha256=G0UAETUCZMlU9R9ASD9AXrWZeodWI1aZJEE9F-goaH4,2591
+ sinatools/arabert/aragpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
+ sinatools/arabert/aragpt2/create_pretraining_data.py,sha256=fFa2_DAyTwc8L2IqQbshsh_Ia26nj1qtVLzC6DooSac,3105
+ sinatools/arabert/aragpt2/train_bpe_tokenizer.py,sha256=b-8zHQ02fLmZV4GfjnrPptwjpX259F41SlnWzBrflMA,1888
+ sinatools/arabert/aragpt2/gpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
+ sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
+ sinatools/arabert/aragpt2/gpt2/optimization.py,sha256=iqh23cypRSRUt53wt2G5SbNNpJMwERM7gZAOKVh5l4U,8411
+ sinatools/arabert/aragpt2/gpt2/run_pretraining.py,sha256=4jjkUbvTO1DHoKJ89yKtlkkofcND_fyAunQ-mlnJhTM,13298
+ sinatools/arabert/aragpt2/grover/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sinatools/arabert/aragpt2/grover/dataloader.py,sha256=-FWPTjtsvweEE1WaWRHBXfOSbsGiUmnXT3qK7KJP8cM,6853
+ sinatools/arabert/aragpt2/grover/modeling.py,sha256=XcUvFwqRaxAwWiJstrH2FPBvDJe03pTWIyipdMfWj9g,38280
+ sinatools/arabert/aragpt2/grover/modeling_gpt2.py,sha256=WFpCWn1792yATFzt8rZ0rpWvExfbLzV2BqiEs7llFUw,51602
+ sinatools/arabert/aragpt2/grover/optimization_adafactor.py,sha256=1geOsCWuv5xxtSnKDz9a8aY5SVwZ1MGq-xVQDBg4Gpg,9765
+ sinatools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKweY1aPz1XPDw6odld0,7102
+ sinatools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
+ sinatools/morphology/ALMA_multi_word.py,sha256=GPM2-N7_5JIZwNdi1we6gBG0rh59AlGM0XWYxmEE7jY,1283
+ sinatools/morphology/__init__.py,sha256=avTxtexZELp1Fya6cBNFLyeYPB31OcmQOlT2L-uAQnI,1386
+ sinatools/morphology/morph_analyzer.py,sha256=tA78gWg6iaE_G1c2xqxZoXZWNbvHBJLrTSxPyir5Xn8,6941
+ sinatools/ner/__init__.py,sha256=8R8epTEyvpbreLYTrC5M5lctlzZrNr7T7B4KmENnB3I,341
+ sinatools/ner/entity_extractor.py,sha256=amVU6tXoAAL9NcadfJlx1qyEPlxBY8wRo5Tn-ZLOVIw,3236
+ sinatools/salma/__init__.py,sha256=_by3PsXetNjkxSyg24nF592T-21JEWhPXzMAPzwDOhQ,378
+ sinatools/salma/settings.py,sha256=b_AqTxVWALuGXnsMd9KhnnwIo9-JEoWOTekB-7_xJCU,1111
+ sinatools/salma/views.py,sha256=G5W5BSr770NapWz5j6hcuwInrR40JKG-LkzP1OpcYeA,18416
+ sinatools/salma/wsd.py,sha256=vCiiR5h3bjAOHi3yxxkh_7GUgBWKQf297aHbO4Z8CBk,4436
+ sinatools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ sinatools/utils/charsets.py,sha256=rs82oZJqRqosZdTKXfFAJfJ5t4PxjMM_oAPsiWSWuwU,2817
+ sinatools/utils/implication.py,sha256=MsbI6S1LNY-fCxGMxFTuaV639r3QijkkdcfH48rvY7A,27804
+ sinatools/utils/jaccard.py,sha256=S7OgvaMqkN5HFgTZkKhMCNAuAnQ0LhRyXPN79jAzmKM,10113
+ sinatools/utils/parser.py,sha256=CPPtCrsbxUqsjhY5C9wTOgkAs6iw0k_WvMUxLEPM1IU,6168
+ sinatools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
+ sinatools/utils/text_transliteration.py,sha256=NQoXrxI-h0UXnvVtDA3skNJduxIy0IW26r46N4tDxGk,8766
+ sinatools/utils/tokenizer.py,sha256=QHyrVqJA_On4rKxexiWR2ovq4pI1-u6iZkdhRbK9tew,6676
+ sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
+ sinatools/utils/utils.py,sha256=vKkFOkYclMu8nXS_VZb6Kobx8QGKW9onXkkLCeiRb6g,32
+ SinaTools-0.1.7.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+ SinaTools-0.1.7.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+ SinaTools-0.1.7.dist-info/METADATA,sha256=TWtbd8m_tSIStY0O0mLGnf5y5zR0Yk7PVFAkBOwqrTo,1569
+ SinaTools-0.1.7.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+ SinaTools-0.1.7.dist-info/entry_points.txt,sha256=9uGvOGRicf-CsHMaFyQjq1odtr3RMeOvEfiZwpDQ9VU,926
+ SinaTools-0.1.7.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
+ SinaTools-0.1.7.dist-info/RECORD,,
@@ -0,0 +1,18 @@
+ [console_scripts]
+ alma_multi_word = sinatools.CLI.morphology.ALMA_multi_word:main
+ appdatadir = sinatools.CLI.DataDownload.get_appdatadir:main
+ arStrip = sinatools.CLI.utils.arStrip:main
+ arabi_ner = sinatools.CLI.ner.entity_extractor:main
+ arabi_ner2 = sinatools.CLI.ner.corpus_entity_extractor:main
+ corpus_tokenizer = sinatools.CLI.utils.corpus_tokenizer:main
+ download_files = sinatools.CLI.DataDownload.download_files:main
+ implication = sinatools.CLI.utils.implication:main
+ install_env = sinatools.install_env:main
+ jaccard_similarity = sinatools.CLI.utils.jaccard:main
+ morphology_analyzer = sinatools.CLI.morphology.morph_analyzer:main
+ remove_latin = sinatools.CLI.utils.remove_latin:main
+ remove_punctuation = sinatools.CLI.utils.remove_punctuation:main
+ salma = sinatools.CLI.salma.salma_tools:main
+ sentence_tokenizer = sinatools.CLI.utils.sentence_tokenizer:main
+ transliterate = sinatools.CLI.utils.text_transliteration:main
+
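Each line above maps a console command to a module-level main() function in the renamed sinatools package. As a quick post-install check, the registered scripts can be enumerated with the standard library; this is a minimal sketch that assumes a Python 3.8+ interpreter with the 0.1.7 wheel installed.

.. code-block:: python

    # List the console_scripts registered by the installed wheel (Python 3.8+).
    from importlib.metadata import entry_points

    eps = entry_points()
    # entry_points() returns a dict-like mapping before Python 3.10 and a selectable view afterwards.
    scripts = eps.select(group="console_scripts") if hasattr(eps, "select") else eps.get("console_scripts", [])
    for ep in scripts:
        if ep.value.startswith("sinatools."):
            print(ep.name, "->", ep.value)  # e.g. morphology_analyzer -> sinatools.CLI.morphology.morph_analyzer:main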
@@ -0,0 +1 @@
+ sinatools
@@ -2,17 +2,17 @@
  About:
  ------

- The sina_download_files tool is a command-line interface for downloading various NLP resources from pre-specified URLs. It is a part of the nlptools package and provides options to choose which files to download and to specify a download directory. The tool automatically handles file extraction for zip and tar.gz files.
+ The download_files tool is a command-line interface for downloading various NLP resources from pre-specified URLs. It is a part of the sinatools package and provides options to choose which files to download and to specify a download directory. The tool automatically handles file extraction for zip and tar.gz files.

  Usage:
  ------

- Below is the usage information that can be generated by running sina_download_files --help.
+ Below is the usage information that can be generated by running download_files --help.

  .. code-block:: none

  Usage:
- sina_download_files [OPTIONS]
+ download_files [OPTIONS]

  .. code-block:: none

@@ -26,7 +26,7 @@ Examples:

  .. code-block:: none

- sina_download_files -f morph ner
+ download_files -f morph ner
  This command will download only the `morph` and `ner` files to the default directory.

  Note:
@@ -42,10 +42,10 @@ Note:
  """

  import argparse
- from nlptools.DataDownload.downloader import download_file
- from nlptools.DataDownload.downloader import download_files
- from nlptools.DataDownload.downloader import get_appdatadir
- from nlptools.DataDownload.downloader import urls
+ from sinatools.DataDownload.downloader import download_file
+ from sinatools.DataDownload.downloader import download_files
+ from sinatools.DataDownload.downloader import get_appdatadir
+ from sinatools.DataDownload.downloader import urls


  def main():
@@ -68,4 +68,4 @@ def main():
  if __name__ == '__main__':
  main()

- #sina_download_files -f morph ner
+ #download_files -f morph ner
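The renamed CLI keeps the same four downloader imports, so the programmatic equivalent looks roughly like the sketch below. Only the import path is confirmed by this diff; the call signatures of the downloader functions are not shown here, so the argument-free calls are assumptions.

.. code-block:: python

    # Hedged sketch: the import path comes from the diff; the call signatures are assumed.
    from sinatools.DataDownload.downloader import get_appdatadir, urls

    print(get_appdatadir())  # assumed to take no arguments and return the default data directory
    print(list(urls))        # assumed: maps resource names such as 'morph' and 'ner' to download URLs
    # download_file / download_files handle the resource selection that the CLI's -f flag forwards,
    # but their exact signatures are not visible in this diff.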
@@ -1,16 +1,16 @@
  """
  About:
  ------
- The sina_alma_multi_word tool performs multi-word morphological analysis using SinaTools' `ALMA_multi_word` utility. Given a multi-word Arabic text input, it returns a detailed analysis in JSON format.
+ The alma_multi_word tool performs multi-word morphological analysis using SinaTools' `ALMA_multi_word` utility. Given a multi-word Arabic text input, it returns a detailed analysis in JSON format.

  Usage:
  ------
- Below is the usage information that can be generated by running sina_alma_multi_word --help.
+ Below is the usage information that can be generated by running alma_multi_word --help.

  .. code-block:: none

- sina_alma_multi_word --multi_word=MULTI_WORD_TEXT
- sina_alma_multi_word --file
+ alma_multi_word --multi_word=MULTI_WORD_TEXT
+ alma_multi_word --file

  Options:
  --------
@@ -27,25 +27,15 @@ Examples:

  .. code-block:: none

- sina_alma_multi_word --multi_word "Your multi-word text here"
- sina_alma_multi_word --file "path/to/your/file.txt"
-
- Note:
- -----
-
- .. code-block:: none
-
- - Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
- - The tool returns results in JSON format with proper indentation for better readability.
- - The quality and accuracy of the analysis depend on the underlying capabilities of the SinaTools' `ALMA_multi_word` utility.
- - The tool is specifically designed for multi-word input. For single-word morphological analysis, other specific utilities/tools might be more appropriate.
+ alma_multi_word --multi_word "Your multi-word text here"
+ alma_multi_word --file "path/to/your/file.txt"

  """

  import argparse
- from nlptools.morphology.ALMA_multi_word import ALMA_multi_word
+ from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
  import json
- from nlptools.utils.readfile import read_file
+ from sinatools.utils.readfile import read_file

  def main():
  parser = argparse.ArgumentParser(description='Multi-Word Analysis using SinaTools')
@@ -71,5 +61,5 @@ def main():

  if __name__ == '__main__':
  main()
- #sina_alma_multi_word --multi_word "Your multi-word text here"
- #sina_alma_multi_word --file "path/to/your/file.txt"
+ #alma_multi_word --multi_word "Your multi-word text here"
+ #alma_multi_word --file "path/to/your/file.txt"
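For reference, the library call this CLI wraps can be used directly. The import path and the JSON-style output are confirmed by the hunks above; the exact call signature of ALMA_multi_word is not shown in this diff, so passing the multi-word expression as a single string is an assumption.

.. code-block:: python

    # Hedged sketch: direct use of the analyzer behind the alma_multi_word command.
    import json
    from sinatools.morphology.ALMA_multi_word import ALMA_multi_word

    result = ALMA_multi_word("عبد الله")  # assumed: takes the multi-word expression as one string
    print(json.dumps(result, ensure_ascii=False, indent=2))  # the CLI prints the analysis as indented JSON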
@@ -0,0 +1,80 @@
+ """
+ About:
+ ------
+ The morphology_analyzer command is designed to provide morphological analysis for Arabic text using the SinaTools morph_analyzer component. Users can specify the language and desired analysis task (lemmatization, part-of-speech tagging, or full morphological analysis), and flag.
+
+ Usage:
+ ------
+ Below is the usage information that can be generated by running morphology_analyzer --help.
+
+ .. code-block:: none
+
+ morphology_analyzer --text=TEXT [OPTIONS]
+ morphology_analyzer --file=FILE [OPTIONS]
+
+ Options:
+ --------
+
+ .. code-block:: none
+
+ --text TEXT
+ The text that needs to be morphologically analyzed.
+
+ --file FILE
+ File containing the text to be morphologically analyzed
+
+ --language LANGUAGE [default=MSA]
+ Specifies the language for the analysis. In the current version, MSA is only supported.
+
+ --task TASK [default=full]
+ Determines the specific type of morphological analysis to be performed. Available options are:
+ - lemmatization: the morphological solution includes only the lemma_id, lemma, token, and token frequency.
+ - pos: the morphological solution includes only the part-of-speech, token, and token frequency.
+ - root: the morphological solution includes only the root, token, and token frequency.
+ - full: the morphological solution includes the lemma_id, lemma, part-of-speech, root, token, and token frequency.
+ The default is full.
+
+ --flag FLAG [default=1]
+ The flag to filter the returned results. If the flag is `1`, the solution with the highest frequency will be returned. If the flag is `*`, all solutions will be returned, ordered descendingly, with the highest frequency solution first. The default flag if not specified is `1`.
+
+ Examples:
+ ---------
+
+ .. code-block:: none
+
+ morphology_analyzer --text "Your Arabic text here" --language MSA --task full --flag 1
+ morphology_analyzer --file "path/to/your/file.txt" --language MSA --task full --flag 1
+
+ """
+
+ import argparse
+ from sinatools.morphology.morph_analyzer import analyze
+ from sinatools.utils.readfile import read_file
+
+ def main():
+ parser = argparse.ArgumentParser(description='Morphological Analysis using SinaTools')
+
+ parser.add_argument('--text', type=str, help='Text to be morphologically analyzed')
+ parser.add_argument('--file', type=str, help='File containing the text to be morphologically analyzed')
+ parser.add_argument('--language', type=str, default='MSA', help='Language for analysis (default: MSA)')
+ parser.add_argument('--task', type=str, default='full', choices=['lemmatization', 'pos', 'root', 'full'], help='Task for the result filter [lemmatization, pos, root, full] (default: full)')
+ parser.add_argument('--flag', type=str, default='1', choices=['1','*'], help='The flag to filter the returned results')
+
+ args = parser.parse_args()
+
+ if args.text is None and args.file is None:
+ print("Error: Either --text or --file argument must be provided.")
+ return
+
+ # Get the input either from the --text argument or from the file specified in the --file argument
+ input_text = args.text if args.text else " ".join(read_file(args.file))
+
+ # Perform morphological analysis
+ results = analyze(input_text, args.language, args.task, args.flag)
+
+ # Print the results
+ for result in results:
+ print(result)
+
+ if __name__ == '__main__':
+ main()
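The new CLI already exposes the underlying library call, analyze(text, language, task, flag) from sinatools.morphology.morph_analyzer. A minimal programmatic sketch follows; the argument order and values mirror what the CLI passes, while the exact structure of each returned solution is not shown in this diff and is left unspecified.

.. code-block:: python

    # Minimal sketch of calling the analyzer directly, mirroring the CLI defaults.
    from sinatools.morphology.morph_analyzer import analyze

    results = analyze("ذهب الولد إلى المدرسة", "MSA", "full", "1")  # language, task, flag as in the CLI
    for solution in results:  # one morphological solution per item; its exact shape is not shown in this diff
        print(solution)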
@@ -1,8 +1,10 @@
  import os
  import csv
- from nlptools.utils.sentence_tokenizer import sent_tokenize
- from nlptools.morphology.tokenizers_words import simple_word_tokenize
+ from sinatools.utils.tokenizer import sentence_tokenizer
+ from sinatools.utils.tokenizers_words import simple_word_tokenize
  import pandas as pd
+ import argparse
+ from sinatools.ner.entity_extractor import ner

  """
  CSV NER Tagging Tool
@@ -14,12 +16,6 @@ Run the script with the following command:
  arabi_ner2 input.csv --text-columns "TextColumn1,TextColumn2" --additional-columns "Column3,Column4" --output-csv output.csv
  """

- import argparse
- import pandas as pd
- from nlptools.utils.sentence_tokenizer import sent_tokenize
- from nlptools.morphology.tokenizers_words import simple_word_tokenize
- from nlptools.arabiner.bin.infer import ner
-
  def infer(sentence):
  output = ner(sentence)
  return [word[1] for word in output]
@@ -39,7 +35,7 @@ def corpus_tokenizer(input_csv, output_csv, text_column, additional_columns, row
 
  df = pd.read_csv(input_csv)
  for index, row in df.iterrows():
- sentences = sent_tokenize(row[text_column], dot=True, new_line=True, question_mark=False, exclamation_mark=False)
+ sentences = sentence_tokenizer(row[text_column], dot=True, new_line=True, question_mark=False, exclamation_mark=False)
  for sentence_id, sentence in enumerate(sentences, start=1):
  words = simple_word_tokenize(sentence)
  global_sentence_id += 1
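The hunk above pins down the tokenization calls the renamed CSV tool relies on: sentence_tokenizer with explicit punctuation switches, followed by simple_word_tokenize per sentence. A small sketch of just that step, using the keyword arguments visible in the diff (the sample text is a placeholder):

.. code-block:: python

    # Sketch of the tokenization step used by corpus_entity_extractor, with the kwargs shown in the diff.
    from sinatools.utils.tokenizer import sentence_tokenizer
    from sinatools.utils.tokenizers_words import simple_word_tokenize

    text = "ذهب الولد إلى المدرسة. ثم عاد إلى البيت"
    sentences = sentence_tokenizer(text, dot=True, new_line=True, question_mark=False, exclamation_mark=False)
    for sentence_id, sentence in enumerate(sentences, start=1):
        words = simple_word_tokenize(sentence)
        print(sentence_id, words)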
@@ -45,9 +45,9 @@ Note:
  import argparse
  import json
  import pandas as pd
- from nlptools.arabiner.bin.infer import ner
- from nlptools.utils.corpus_tokenizer import corpus_tokenizer
- from nlptools.morphology.tokenizers_words import simple_word_tokenize
+ from sinatools.ner.entity_extractor import ner
+ from sinatools.utils.tokenizer import corpus_tokenizer
+ from sinatools.utils.tokenizers_words import simple_word_tokenize


  def infer(sentence):
@@ -110,8 +110,4 @@ if __name__ == '__main__':
  main()

  #arabi_ner --text "Your text here."
- #arabi_ner --dir /path/to/your/directory --output_csv output.csv
-
- #Each unique sentence in the CSV file is processed once by the infer function to get the NER tags for all the words in the sentence.
- #The current_word_position variable is used to keep track of the position within the list of NER tags returned by infer, ensuring that each word in the CSV file is assigned the correct NER tag.
- #The final CSV file will contain an additional column, NER tags, which contains the NER tag for each word in the Sentence column of the CSV file.
+ #arabi_ner --dir /path/to/your/directory --output_csv output.csv
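Both NER CLIs now wrap the same ner function from sinatools.ner.entity_extractor, and the infer helpers in these files show that each item of its output carries the tag at index 1. A minimal sketch follows; that index 0 holds the surface token is an assumption, since only index 1 is used in this diff.

.. code-block:: python

    # Hedged sketch of the tagging call that both NER CLIs build on.
    from sinatools.ner.entity_extractor import ner

    output = ner("ولد أحمد في القدس")   # one item per word in the sentence
    tags = [item[1] for item in output]  # confirmed by the diff: index 1 holds the NER tag
    print(tags)                          # index 0 is presumably the token itself (assumption)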
@@ -11,8 +11,8 @@ Below is the usage information that can be generated by running the command with

  .. code-block:: none

- sina_salma --text=TEXT
- sina_salma --file=INPUT_FILE
+ salma --text=TEXT
+ salma --file=INPUT_FILE

  Options:
  --------
@@ -27,8 +27,8 @@ Examples:
  ---------
  .. code-block:: none

- sina_salma --text "your Arabic sentence here"
- sina_salma --file "path/to/your/file.txt"
+ salma --text "your Arabic sentence here"
+ salma --file "path/to/your/file.txt"

  Note:
  -----
@@ -42,8 +42,8 @@ Note:

  import argparse
  import json
- from nlptools.salma.views import SALMA
- from nlptools.utils.readfile import read_file
+ from sinatools.salma.views import SALMA
+ from sinatools.utils.readfile import read_file

  def main():
  parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools')
@@ -64,5 +64,5 @@ def main():
  if __name__ == "__main__":
  main()

- #sina_salma --text "your Arabic sentence here"
- #sina_salma --file "path/to/your/file.txt"
+ #salma --text "your Arabic sentence here"
+ #salma --file "path/to/your/file.txt"
@@ -2,17 +2,17 @@

  About:
  ------
- The sina_arStrip tool offers functionality to strip various elements from Arabic text using the SinaTools' `arStrip` utility. It provides flexibility to selectively strip diacritics, small diacritics, shaddah, digits, alif, and special characters.
+ The arStrip command offers functionality to strip various elements from Arabic text using the SinaTools' `arStrip` utility. It provides flexibility to selectively strip diacritics, small diacritics, shaddah, digits, alif, and special characters.

  Usage:
  ------
- Below is the usage information that can be generated by running sina_arStrip --help.
+ Below is the usage information that can be generated by running arStrip --help.

  .. code-block:: none

  Usage:
- sina_arStrip --text=TEXT [OPTIONS]
- sina_arStrip --file "path/to/your/file.txt" [OPTIONS]
+ arStrip --text=TEXT [OPTIONS]
+ arStrip --file "path/to/your/file.txt" [OPTIONS]

  .. code-block:: none

@@ -43,27 +43,16 @@ Below is the usage information that can be generated by running sina_arStrip --h

  Examples:
  ---------
-
- .. code-block:: none
-
- sina_arStrip --text "مُختَبَر سينا لحوسبة اللغة!" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
-
- sina_arStrip --file "path/to/your/file.txt" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
-
- Note:
- -----
-
  .. code-block:: none

- - This tool is specific to Arabic text, as it focuses on Arabic linguistic elements.
- - Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
- - Stripping certain elements might change the meaning or readability of the text. Use it judiciously.
+ arStrip --text "مُختَبَر سينا لحوسبة اللغة!" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
+ arStrip --file "path/to/your/file.txt" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False

  """

  import argparse
- from nlptools.utils.parser import arStrip
- from nlptools.utils.readfile import read_file
+ from sinatools.utils.parser import arStrip
+ from sinatools.utils.readfile import read_file

  def main():
  parser = argparse.ArgumentParser(description='Arabic text stripping tool using SinaTools')
@@ -95,5 +84,5 @@ def main():
  if __name__ == '__main__':
  main()

- #sina_arStrip --text "example text" --diacs=True
- #sina_arStrip --file "path/to/your/file.txt" --diacs=True
+ #arStrip --text "example text" --diacs=True
+ #arStrip --file "path/to/your/file.txt" --diacs=True
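For reference, the stripping function itself lives at sinatools.utils.parser.arStrip, as the import hunk confirms. The sketch below assumes that its keyword arguments mirror the CLI flags (diacs, smallDiacs, shaddah, digit, alif, specialChars); the actual signature is not shown in this diff.

.. code-block:: python

    # Hedged sketch of the library call behind the arStrip command.
    # Import path confirmed by the diff; keyword names assumed to mirror the CLI flags.
    from sinatools.utils.parser import arStrip

    stripped = arStrip("مُختَبَر سينا لحوسبة اللغة!",
                       diacs=True, smallDiacs=False, shaddah=True,
                       digit=False, alif=False, specialChars=False)
    print(stripped)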
@@ -0,0 +1,50 @@
+ """
+
+ About:
+ ------
+ The corpus_tokenizer command offers functionality to tokenize a corpus and write the results to a CSV file. It recursively searches through a specified directory for text files, tokenizes the content, and outputs the results, including various metadata, to a specified CSV file.
+
+ Usage:
+ -------
+ Below is the usage information that can be generated by running corpus_tokenizer --help.
+
+ .. code-block:: none
+
+ Usage:
+ corpus_tokenizer dir_path output_csv
+
+ .. code-block:: none
+ dir_path
+ The path to the directory containing the text files.
+
+ output_csv
+ The path to the output CSV file.
+
+ Examples:
+ ---------
+ .. code-block:: none
+ corpus_tokenizer --dir_path "/path/to/text/directory/of/files" --output_csv "outputFile.csv"
+ """
+
+ import argparse
+ from sinatools.utils.tokenizer import corpus_tokenizer
+
+ # Define the main function that will parse the arguments
+ def main():
+ # Create an ArgumentParser object
+ parser = argparse.ArgumentParser(description='Tokenize the corpus and write the results to a CSV file.')
+
+ # Add arguments to the parser
+ parser.add_argument('--dir_path', type=str, help='The path to the directory containing the text files.')
+ parser.add_argument('--output_csv', type=str, help='The path to the output CSV file.')
+
+ # Parse the command-line arguments
+ args = parser.parse_args()
+
+ # Call the corpus_tokenizer function with the parsed arguments
+ corpus_tokenizer(args.dir_path, args.output_csv)
+
+ # Call the main function when the script is executed
+ if __name__ == '__main__':
+ main()
+
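This CLI simply forwards two paths to corpus_tokenizer from sinatools.utils.tokenizer, so the programmatic call is a one-liner. The paths below are placeholders, and the diff does not show exactly which metadata columns end up in the CSV.

.. code-block:: python

    # Direct use of the function the corpus_tokenizer command wraps (paths are placeholders).
    from sinatools.utils.tokenizer import corpus_tokenizer

    corpus_tokenizer("/path/to/text/directory/of/files", "outputFile.csv")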
@@ -1,18 +1,18 @@
  """
  About:
  ------
- The sina_implication tool evaluates the implication between two words using the functionalities provided by the `Implication` class of SinaTools. This tool can be utilized to determine the relationship between two words and understand if one implies the other.
+ The implication tool evaluates the implication between two words using the functionalities provided by the `Implication` class of SinaTools. This tool can be utilized to determine the relationship between two words and understand if one implies the other.

  Usage:
  ------
- Below is the usage information that can be generated by running sina_implication --help.
+ Below is the usage information that can be generated by running implication --help.

  .. code-block:: none

  Usage:
- sina_implication --inputWord1=WORD1 --inputWord2=WORD2
+ implication --inputWord1=WORD1 --inputWord2=WORD2

- sina_implication --inputFile1=File1 --inputFile2=File2
+ implication --inputFile1=File1 --inputFile2=File2

  .. code-block:: none

@@ -33,9 +33,9 @@ Examples:

  .. code-block:: none

- sina_implication --inputWord1 "word1" --inputWord2 "word2"
+ implication --inputWord1 "word1" --inputWord2 "word2"

- sina_implication --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt"
+ implication --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt"

  Note:
  -----
@@ -47,7 +47,7 @@ Note:

  """
  import argparse
- from nlptools.utils.implication import Implication
+ from sinatools.utils.implication import Implication

  def read_file(file_path):
  with open(file_path, 'r', encoding='utf-8') as file:
@@ -86,7 +86,7 @@ def main():

  if __name__ == '__main__':
  main()
- # sina_implication --inputWord1 "word1" --inputWord2 "word2"
- # sina_implication --file1 "path/to/your/firstfile.txt" --file2 "path/to/your/secondfile.txt"
+ # implication --inputWord1 "word1" --inputWord2 "word2"
+ # implication --file1 "path/to/your/firstfile.txt" --file2 "path/to/your/secondfile.txt"

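Only the import path of the Implication class is confirmed by this hunk; how the two words are passed in and how the verdict is read back is not shown. The sketch below is therefore hypothetical beyond the import line.

.. code-block:: python

    # Heavily hedged sketch: everything after the import is an assumption for illustration only.
    from sinatools.utils.implication import Implication

    imp = Implication("ولد", "وَلَدٌ")  # hypothetical: assumes the constructor takes the two words to compare
    print(imp)                          # the method that returns the implication verdict is not shown in this diff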