SinaTools 0.1.37__tar.gz → 0.1.38__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (191) hide show
  1. {sinatools-0.1.37 → sinatools-0.1.38}/PKG-INFO +1 -2
  2. {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/PKG-INFO +1 -2
  3. {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/SOURCES.txt +0 -50
  4. {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/requires.txt +0 -1
  5. {sinatools-0.1.37 → sinatools-0.1.38}/setup.py +1 -1
  6. sinatools-0.1.38/sinatools/VERSION +1 -0
  7. sinatools-0.1.37/sinatools/VERSION +0 -1
  8. sinatools-0.1.37/sinatools/arabert/arabert/__init__.py +0 -14
  9. sinatools-0.1.37/sinatools/arabert/arabert/create_classification_data.py +0 -260
  10. sinatools-0.1.37/sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  11. sinatools-0.1.37/sinatools/arabert/arabert/extract_features.py +0 -444
  12. sinatools-0.1.37/sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  13. sinatools-0.1.37/sinatools/arabert/arabert/modeling.py +0 -1027
  14. sinatools-0.1.37/sinatools/arabert/arabert/optimization.py +0 -202
  15. sinatools-0.1.37/sinatools/arabert/arabert/run_classifier.py +0 -1078
  16. sinatools-0.1.37/sinatools/arabert/arabert/run_pretraining.py +0 -593
  17. sinatools-0.1.37/sinatools/arabert/arabert/run_squad.py +0 -1440
  18. sinatools-0.1.37/sinatools/arabert/arabert/tokenization.py +0 -414
  19. sinatools-0.1.37/sinatools/arabert/araelectra/__init__.py +0 -1
  20. sinatools-0.1.37/sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  21. sinatools-0.1.37/sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  22. sinatools-0.1.37/sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  23. sinatools-0.1.37/sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  24. sinatools-0.1.37/sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  25. sinatools-0.1.37/sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  26. sinatools-0.1.37/sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  27. sinatools-0.1.37/sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  28. sinatools-0.1.37/sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  29. sinatools-0.1.37/sinatools/arabert/araelectra/finetune/task.py +0 -74
  30. sinatools-0.1.37/sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  31. sinatools-0.1.37/sinatools/arabert/araelectra/flops_computation.py +0 -215
  32. sinatools-0.1.37/sinatools/arabert/araelectra/model/__init__.py +0 -14
  33. sinatools-0.1.37/sinatools/arabert/araelectra/model/modeling.py +0 -1029
  34. sinatools-0.1.37/sinatools/arabert/araelectra/model/optimization.py +0 -193
  35. sinatools-0.1.37/sinatools/arabert/araelectra/model/tokenization.py +0 -355
  36. sinatools-0.1.37/sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  37. sinatools-0.1.37/sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  38. sinatools-0.1.37/sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  39. sinatools-0.1.37/sinatools/arabert/araelectra/run_finetuning.py +0 -323
  40. sinatools-0.1.37/sinatools/arabert/araelectra/run_pretraining.py +0 -469
  41. sinatools-0.1.37/sinatools/arabert/araelectra/util/__init__.py +0 -14
  42. sinatools-0.1.37/sinatools/arabert/araelectra/util/training_utils.py +0 -112
  43. sinatools-0.1.37/sinatools/arabert/araelectra/util/utils.py +0 -109
  44. sinatools-0.1.37/sinatools/arabert/aragpt2/__init__.py +0 -2
  45. sinatools-0.1.37/sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  46. sinatools-0.1.37/sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  47. sinatools-0.1.37/sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  48. sinatools-0.1.37/sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  49. sinatools-0.1.37/sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  50. sinatools-0.1.37/sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  51. sinatools-0.1.37/sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  52. sinatools-0.1.37/sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  53. sinatools-0.1.37/sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  54. sinatools-0.1.37/sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  55. sinatools-0.1.37/sinatools/arabert/aragpt2/grover/utils.py +0 -234
  56. sinatools-0.1.37/sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  57. sinatools-0.1.37/sinatools/utils/__init__.py +0 -0
  58. {sinatools-0.1.37 → sinatools-0.1.38}/AUTHORS.rst +0 -0
  59. {sinatools-0.1.37 → sinatools-0.1.38}/CONTRIBUTING.rst +0 -0
  60. {sinatools-0.1.37 → sinatools-0.1.38}/LICENSE +0 -0
  61. {sinatools-0.1.37 → sinatools-0.1.38}/MANIFEST.in +0 -0
  62. {sinatools-0.1.37 → sinatools-0.1.38}/README.rst +0 -0
  63. {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/dependency_links.txt +0 -0
  64. {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/entry_points.txt +0 -0
  65. {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/not-zip-safe +0 -0
  66. {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/top_level.txt +0 -0
  67. {sinatools-0.1.37 → sinatools-0.1.38}/docs/Makefile +0 -0
  68. {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/_images/download.png +0 -0
  69. {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/_static/download.png +0 -0
  70. {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/_static/file.png +0 -0
  71. {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/_static/minus.png +0 -0
  72. {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/_static/plus.png +0 -0
  73. {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_images/SinaLogo.jpg +0 -0
  74. {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_images/download.png +0 -0
  75. {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_static/SinaLogo.jpg +0 -0
  76. {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_static/download.png +0 -0
  77. {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_static/file.png +0 -0
  78. {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_static/minus.png +0 -0
  79. {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_static/plus.png +0 -0
  80. {sinatools-0.1.37 → sinatools-0.1.38}/docs/make.bat +0 -0
  81. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/License.rst +0 -0
  82. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/Overview.rst +0 -0
  83. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/_static/SinaLogo.jpg +0 -0
  84. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/_static/download.png +0 -0
  85. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/about.rst +0 -0
  86. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/DataDownload/downloader.rst +0 -0
  87. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/DataDownload.rst +0 -0
  88. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/arabiner/bin/infer.rst +0 -0
  89. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/arabiner.rst +0 -0
  90. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/morphology/morph_analyzer.rst +0 -0
  91. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/morphology.rst +0 -0
  92. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/salma/views.rst +0 -0
  93. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/salma.rst +0 -0
  94. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils/corpus_tokenizer.rst +0 -0
  95. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils/implication.rst +0 -0
  96. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils/jaccard.rst +0 -0
  97. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils/parser.rst +0 -0
  98. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils/sentence_tokenizer.rst +0 -0
  99. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils/text_transliteration.rst +0 -0
  100. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils.rst +0 -0
  101. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api.rst +0 -0
  102. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/authors.rst +0 -0
  103. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/DataDownload/download_files.rst +0 -0
  104. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/DataDownload/get_appdatadir.rst +0 -0
  105. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/DataDownload.rst +0 -0
  106. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/arabiner/infer.rst +0 -0
  107. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/arabiner.rst +0 -0
  108. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/morphology/ALMA_multi_word.rst +0 -0
  109. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/morphology/morph_analyzer.rst +0 -0
  110. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/morphology.rst +0 -0
  111. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/salma/salma_tools.rst +0 -0
  112. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/salma.rst +0 -0
  113. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/arStrip.rst +0 -0
  114. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/corpus_tokenizer.rst +0 -0
  115. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/implication.rst +0 -0
  116. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/jaccard.rst +0 -0
  117. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/latin_remove.rst +0 -0
  118. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/remove_punc.rst +0 -0
  119. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/sentence_tokenizer.rst +0 -0
  120. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/text_transliteration.rst +0 -0
  121. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils.rst +0 -0
  122. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools.rst +0 -0
  123. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/conf.py +0 -0
  124. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/index.rst +0 -0
  125. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/installation.rst +0 -0
  126. {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/readme.rst +0 -0
  127. {sinatools-0.1.37 → sinatools-0.1.38}/setup.cfg +0 -0
  128. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/DataDownload/download_files.py +0 -0
  129. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/morphology/ALMA_multi_word.py +0 -0
  130. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/morphology/morph_analyzer.py +0 -0
  131. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/ner/corpus_entity_extractor.py +0 -0
  132. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/ner/entity_extractor.py +0 -0
  133. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/__init__.py +0 -0
  134. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/arStrip.py +0 -0
  135. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/corpus_tokenizer.py +0 -0
  136. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/implication.py +0 -0
  137. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/jaccard.py +0 -0
  138. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/remove_latin.py +0 -0
  139. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/remove_punctuation.py +0 -0
  140. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/sentence_tokenizer.py +0 -0
  141. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/text_dublication_detector.py +0 -0
  142. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/text_transliteration.py +0 -0
  143. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/DataDownload/__init__.py +0 -0
  144. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/DataDownload/downloader.py +0 -0
  145. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/__init__.py +0 -0
  146. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/arabert/__init__.py +0 -0
  147. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/arabert/preprocess.py +0 -0
  148. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/environment.yml +0 -0
  149. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/install_env.py +0 -0
  150. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/morphology/ALMA_multi_word.py +0 -0
  151. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/morphology/__init__.py +0 -0
  152. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/morphology/morph_analyzer.py +0 -0
  153. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/__init__.py +0 -0
  154. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/data/__init__.py +0 -0
  155. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/data/datasets.py +0 -0
  156. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/data/transforms.py +0 -0
  157. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/data_format.py +0 -0
  158. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/datasets.py +0 -0
  159. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/entity_extractor.py +0 -0
  160. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/helpers.py +0 -0
  161. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/metrics.py +0 -0
  162. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/nn/BaseModel.py +0 -0
  163. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/nn/BertNestedTagger.py +0 -0
  164. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/nn/BertSeqTagger.py +0 -0
  165. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/nn/__init__.py +0 -0
  166. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/trainers/BaseTrainer.py +0 -0
  167. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/trainers/BertNestedTrainer.py +0 -0
  168. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/trainers/BertTrainer.py +0 -0
  169. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/trainers/__init__.py +0 -0
  170. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/transforms.py +0 -0
  171. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/relations/__init__.py +0 -0
  172. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/relations/relation_extractor.py +0 -0
  173. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/semantic_relatedness/__init__.py +0 -0
  174. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/semantic_relatedness/compute_relatedness.py +0 -0
  175. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/sinatools.py +0 -0
  176. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/synonyms/__init__.py +0 -0
  177. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/synonyms/synonyms_generator.py +0 -0
  178. {sinatools-0.1.37/sinatools/arabert/aragpt2/grover → sinatools-0.1.38/sinatools/utils}/__init__.py +0 -0
  179. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/charsets.py +0 -0
  180. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/parser.py +0 -0
  181. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/readfile.py +0 -0
  182. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/similarity.py +0 -0
  183. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/text_dublication_detector.py +0 -0
  184. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/text_transliteration.py +0 -0
  185. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/tokenizer.py +0 -0
  186. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/tokenizers_words.py +0 -0
  187. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/word_compare.py +0 -0
  188. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/wsd/__init__.py +0 -0
  189. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/wsd/disambiguator.py +0 -0
  190. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/wsd/settings.py +0 -0
  191. {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/wsd/wsd.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: SinaTools
3
- Version: 0.1.37
3
+ Version: 0.1.38
4
4
  Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
5
5
  Home-page: https://github.com/SinaLab/sinatools
6
6
  License: MIT license
@@ -12,7 +12,6 @@ Requires-Dist: six
12
12
  Requires-Dist: farasapy
13
13
  Requires-Dist: tqdm
14
14
  Requires-Dist: requests
15
- Requires-Dist: regex
16
15
  Requires-Dist: pathlib
17
16
  Requires-Dist: torch==1.13.0
18
17
  Requires-Dist: transformers==4.24.0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: SinaTools
3
- Version: 0.1.37
3
+ Version: 0.1.38
4
4
  Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
5
5
  Home-page: https://github.com/SinaLab/sinatools
6
6
  License: MIT license
@@ -12,7 +12,6 @@ Requires-Dist: six
12
12
  Requires-Dist: farasapy
13
13
  Requires-Dist: tqdm
14
14
  Requires-Dist: requests
15
- Requires-Dist: regex
16
15
  Requires-Dist: pathlib
17
16
  Requires-Dist: torch==1.13.0
18
17
  Requires-Dist: transformers==4.24.0
@@ -96,56 +96,6 @@ sinatools/DataDownload/__init__.py
96
96
  sinatools/DataDownload/downloader.py
97
97
  sinatools/arabert/__init__.py
98
98
  sinatools/arabert/preprocess.py
99
- sinatools/arabert/arabert/__init__.py
100
- sinatools/arabert/arabert/create_classification_data.py
101
- sinatools/arabert/arabert/create_pretraining_data.py
102
- sinatools/arabert/arabert/extract_features.py
103
- sinatools/arabert/arabert/lamb_optimizer.py
104
- sinatools/arabert/arabert/modeling.py
105
- sinatools/arabert/arabert/optimization.py
106
- sinatools/arabert/arabert/run_classifier.py
107
- sinatools/arabert/arabert/run_pretraining.py
108
- sinatools/arabert/arabert/run_squad.py
109
- sinatools/arabert/arabert/tokenization.py
110
- sinatools/arabert/araelectra/__init__.py
111
- sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py
112
- sinatools/arabert/araelectra/build_pretraining_dataset.py
113
- sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py
114
- sinatools/arabert/araelectra/configure_finetuning.py
115
- sinatools/arabert/araelectra/configure_pretraining.py
116
- sinatools/arabert/araelectra/flops_computation.py
117
- sinatools/arabert/araelectra/run_finetuning.py
118
- sinatools/arabert/araelectra/run_pretraining.py
119
- sinatools/arabert/araelectra/finetune/__init__.py
120
- sinatools/arabert/araelectra/finetune/feature_spec.py
121
- sinatools/arabert/araelectra/finetune/preprocessing.py
122
- sinatools/arabert/araelectra/finetune/scorer.py
123
- sinatools/arabert/araelectra/finetune/task.py
124
- sinatools/arabert/araelectra/finetune/task_builder.py
125
- sinatools/arabert/araelectra/model/__init__.py
126
- sinatools/arabert/araelectra/model/modeling.py
127
- sinatools/arabert/araelectra/model/optimization.py
128
- sinatools/arabert/araelectra/model/tokenization.py
129
- sinatools/arabert/araelectra/pretrain/__init__.py
130
- sinatools/arabert/araelectra/pretrain/pretrain_data.py
131
- sinatools/arabert/araelectra/pretrain/pretrain_helpers.py
132
- sinatools/arabert/araelectra/util/__init__.py
133
- sinatools/arabert/araelectra/util/training_utils.py
134
- sinatools/arabert/araelectra/util/utils.py
135
- sinatools/arabert/aragpt2/__init__.py
136
- sinatools/arabert/aragpt2/create_pretraining_data.py
137
- sinatools/arabert/aragpt2/train_bpe_tokenizer.py
138
- sinatools/arabert/aragpt2/gpt2/__init__.py
139
- sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py
140
- sinatools/arabert/aragpt2/gpt2/optimization.py
141
- sinatools/arabert/aragpt2/gpt2/run_pretraining.py
142
- sinatools/arabert/aragpt2/grover/__init__.py
143
- sinatools/arabert/aragpt2/grover/dataloader.py
144
- sinatools/arabert/aragpt2/grover/modeling.py
145
- sinatools/arabert/aragpt2/grover/modeling_gpt2.py
146
- sinatools/arabert/aragpt2/grover/optimization_adafactor.py
147
- sinatools/arabert/aragpt2/grover/train_tpu.py
148
- sinatools/arabert/aragpt2/grover/utils.py
149
99
  sinatools/morphology/ALMA_multi_word.py
150
100
  sinatools/morphology/__init__.py
151
101
  sinatools/morphology/morph_analyzer.py
@@ -2,7 +2,6 @@ six
2
2
  farasapy
3
3
  tqdm
4
4
  requests
5
- regex
6
5
  pathlib
7
6
  torch==1.13.0
8
7
  transformers==4.24.0
@@ -16,7 +16,7 @@ requirements = [
16
16
  'farasapy',
17
17
  'tqdm',
18
18
  'requests',
19
- 'regex',
19
+ # 'regex',
20
20
  'pathlib',
21
21
  'torch==1.13.0',
22
22
  'transformers==4.24.0',
@@ -0,0 +1 @@
1
+ 0.1.38
@@ -1 +0,0 @@
1
- 0.1.37
@@ -1,14 +0,0 @@
1
- # coding=utf-8
2
- # Copyright 2018 The Google AI Language Team Authors.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
@@ -1,260 +0,0 @@
1
- # Scripts used to pre_process and create the data for classifier evaluation
2
- #%%
3
- import pandas as pd
4
- from sklearn.model_selection import train_test_split
5
-
6
- import sys
7
- sys.path.append("..")
8
-
9
- from arabert.preprocess import ArabertPreprocessor
10
-
11
-
12
- from tqdm import tqdm
13
-
14
- tqdm.pandas()
15
-
16
- from tokenization import FullTokenizer
17
- from run_classifier import input_fn_builder, model_fn_builder
18
-
19
-
20
- model_name = "bert-base-arabert"
21
- arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=False)
22
-
23
-
24
- class Dataset:
25
- def __init__(
26
- self,
27
- name,
28
- train,
29
- test,
30
- label_list,
31
- train_InputExamples=None,
32
- test_InputExamples=None,
33
- train_features=None,
34
- test_features=None,
35
- ):
36
- self.name = name
37
- self.train = train
38
- self.test = test
39
- self.label_list = label_list
40
- self.train_InputExamples = train_InputExamples
41
- self.test_InputExamples = test_InputExamples
42
- self.train_features = train_features
43
- self.test_features = test_features
44
-
45
-
46
- all_datasets = []
47
- #%%
48
- # *************HARD************
49
- df_HARD = pd.read_csv("Datasets\\HARD\\balanced-reviews-utf8.tsv", sep="\t", header=0)
50
-
51
- df_HARD = df_HARD[["rating", "review"]] # we are interested in rating and review only
52
- # code rating as +ve if > 3, -ve if less, no 3s in dataset
53
- df_HARD["rating"] = df_HARD["rating"].apply(lambda x: 0 if x < 3 else 1)
54
- # rename columns to fit default constructor in fastai
55
- df_HARD.columns = ["label", "text"]
56
- df_HARD["text"] = df_HARD["text"].progress_apply(
57
- lambda x: arabert_prep.preprocess(
58
- x
59
- )
60
- )
61
- train_HARD, test_HARD = train_test_split(df_HARD, test_size=0.2, random_state=42)
62
- label_list_HARD = [0, 1]
63
-
64
- data_Hard = Dataset("HARD", train_HARD, test_HARD, label_list_HARD)
65
- all_datasets.append(data_Hard)
66
-
67
- #%%
68
- # *************ASTD-Unbalanced************
69
- df_ASTD_UN = pd.read_csv(
70
- "Datasets\\ASTD-master\\data\\Tweets.txt", sep="\t", header=None
71
- )
72
-
73
- DATA_COLUMN = "text"
74
- LABEL_COLUMN = "label"
75
- df_ASTD_UN.columns = [DATA_COLUMN, LABEL_COLUMN]
76
-
77
- df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
78
- lambda x: 0 if (x == "NEG") else x
79
- )
80
- df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
81
- lambda x: 1 if (x == "POS") else x
82
- )
83
- df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
84
- lambda x: 2 if (x == "NEUTRAL") else x
85
- )
86
- df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
87
- lambda x: 3 if (x == "OBJ") else x
88
- )
89
- df_ASTD_UN["text"] = df_ASTD_UN["text"].progress_apply(
90
- lambda x: arabert_prep.preprocess(
91
- x
92
- )
93
- )
94
- train_ASTD_UN, test_ASTD_UN = train_test_split(
95
- df_ASTD_UN, test_size=0.2, random_state=42
96
- )
97
- label_list_ASTD_UN = [0, 1, 2, 3]
98
-
99
- data_ASTD_UN = Dataset(
100
- "ASTD-Unbalanced", train_ASTD_UN, test_ASTD_UN, label_list_ASTD_UN
101
- )
102
- all_datasets.append(data_ASTD_UN)
103
- #%%
104
- # *************ASTD-Dahou-Balanced************
105
-
106
- df_ASTD_B = pd.read_csv(
107
- "Datasets\\Dahou\\data_csv_balanced\\ASTD-balanced-not-linked.csv",
108
- sep=",",
109
- header=0,
110
- )
111
-
112
- df_ASTD_B.columns = [DATA_COLUMN, LABEL_COLUMN]
113
-
114
- df_ASTD_B[LABEL_COLUMN] = df_ASTD_B[LABEL_COLUMN].apply(lambda x: 0 if (x == -1) else x)
115
- df_ASTD_B["text"] = df_ASTD_B["text"].progress_apply(
116
- lambda x: arabert_prep.preprocess(
117
- x
118
- )
119
- )
120
- train_ASTD_B, test_ASTD_B = train_test_split(df_ASTD_B, test_size=0.2, random_state=42)
121
- label_list_ASTD_B = [0, 1]
122
-
123
- data_ASTD_B = Dataset(
124
- "ASTD-Dahou-Balanced", train_ASTD_B, test_ASTD_B, label_list_ASTD_B
125
- )
126
- all_datasets.append(data_ASTD_B)
127
-
128
- #%%
129
- # *************ArSenTD-LEV************
130
- df_ArSenTD = pd.read_csv(
131
- "Datasets\\ArSenTD-LEV\\ArSenTD-LEV-processed-no-emojis2.csv", sep=",", header=0
132
- )
133
-
134
- df_ArSenTD.columns = [DATA_COLUMN, LABEL_COLUMN]
135
-
136
- df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
137
- lambda x: 0 if (x == "very_negative") else x
138
- )
139
- df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
140
- lambda x: 1 if (x == "negative") else x
141
- )
142
- df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
143
- lambda x: 2 if (x == "neutral") else x
144
- )
145
- df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
146
- lambda x: 3 if (x == "positive") else x
147
- )
148
- df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
149
- lambda x: 4 if (x == "very_positive") else x
150
- )
151
- df_ArSenTD["text"] = df_ArSenTD["text"].progress_apply(
152
- lambda x: arabert_prep.preprocess(
153
- x
154
- )
155
- )
156
- label_list_ArSenTD = [0, 1, 2, 3, 4]
157
-
158
- train_ArSenTD, test_ArSenTD = train_test_split(
159
- df_ArSenTD, test_size=0.2, random_state=42
160
- )
161
-
162
- data_ArSenTD = Dataset("ArSenTD-LEV", train_ArSenTD, test_ArSenTD, label_list_ArSenTD)
163
- all_datasets.append(data_ArSenTD)
164
-
165
- #%%
166
- # *************AJGT************
167
- df_AJGT = pd.read_excel("Datasets\\Ajgt\\AJGT.xlsx", header=0)
168
-
169
- df_AJGT = df_AJGT[["Feed", "Sentiment"]]
170
- df_AJGT.columns = [DATA_COLUMN, LABEL_COLUMN]
171
-
172
- df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
173
- lambda x: 0 if (x == "Negative") else x
174
- )
175
- df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
176
- lambda x: 1 if (x == "Positive") else x
177
- )
178
- df_AJGT["text"] = df_AJGT["text"].progress_apply(
179
- lambda x: arabert_prep.preprocess(
180
- x
181
- )
182
- )
183
- train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.2, random_state=42)
184
- label_list_AJGT = [0, 1]
185
-
186
- data_AJGT = Dataset("AJGT", train_AJGT, test_AJGT, label_list_AJGT)
187
- all_datasets.append(data_AJGT)
188
- #%%
189
- # *************LABR-UN-Binary************
190
- from labr import LABR
191
-
192
- labr_helper = LABR()
193
-
194
- (d_train, y_train, d_test, y_test) = labr_helper.get_train_test(
195
- klass="2", balanced="unbalanced"
196
- )
197
-
198
- train_LABR_B_U = pd.DataFrame({"text": d_train, "label": y_train})
199
- test_LABR_B_U = pd.DataFrame({"text": d_test, "label": y_test})
200
-
201
- train_LABR_B_U["text"] = train_LABR_B_U["text"].progress_apply(
202
- lambda x: arabert_prep.preprocess(
203
- x
204
- )
205
- )
206
- test_LABR_B_U["text"] = test_LABR_B_U["text"].progress_apply(
207
- lambda x: arabert_prep.preprocess(
208
- x
209
- )
210
- )
211
- label_list_LABR_B_U = [0, 1]
212
-
213
- data_LABR_B_U = Dataset(
214
- "LABR-UN-Binary", train_LABR_B_U, test_LABR_B_U, label_list_LABR_B_U
215
- )
216
- # all_datasets.append(data_LABR_B_U)
217
-
218
- #%%
219
- for data in tqdm(all_datasets):
220
- # Use the InputExample class from BERT's run_classifier code to create examples from the data
221
- data.train_InputExamples = data.train.apply(
222
- lambda x: run_classifier.InputExample(
223
- guid=None, # Globally unique ID for bookkeeping, unused in this example
224
- text_a=x[DATA_COLUMN],
225
- text_b=None,
226
- label=x[LABEL_COLUMN],
227
- ),
228
- axis=1,
229
- )
230
-
231
- data.test_InputExamples = data.test.apply(
232
- lambda x: run_classifier.InputExample(
233
- guid=None, text_a=x[DATA_COLUMN], text_b=None, label=x[LABEL_COLUMN]
234
- ),
235
- axis=1,
236
- )
237
- #%%
238
- # We'll set sequences to be at most 128 tokens long.
239
- MAX_SEQ_LENGTH = 256
240
-
241
- VOC_FNAME = "./64000_vocab_sp_70m.txt"
242
- tokenizer = FullTokenizer(VOC_FNAME)
243
-
244
- for data in tqdm(all_datasets):
245
- # Convert our train and test features to InputFeatures that BERT understands.
246
- data.train_features = run_classifier.convert_examples_to_features(
247
- data.train_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
248
- )
249
- data.test_features = run_classifier.convert_examples_to_features(
250
- data.test_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
251
- )
252
-
253
- # %%
254
- import pickle
255
-
256
- with open("all_datasets_64k_farasa_256.pickle", "wb") as fp: # Pickling
257
- pickle.dump(all_datasets, fp)
258
-
259
-
260
- # %%