SinaTools 0.1.24__tar.gz → 0.1.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. {SinaTools-0.1.24 → SinaTools-0.1.26}/PKG-INFO +1 -1
  2. {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/PKG-INFO +1 -1
  3. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/DataDownload/download_files.py +18 -3
  4. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/ner/corpus_entity_extractor.py +1 -1
  5. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/ner/entity_extractor.py +1 -1
  6. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/DataDownload/downloader.py +35 -31
  7. SinaTools-0.1.26/sinatools/VERSION +1 -0
  8. SinaTools-0.1.26/sinatools/ner/entity_extractor.py +63 -0
  9. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/trainers/BaseTrainer.py +117 -117
  10. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/wsd/disambiguator.py +19 -22
  11. SinaTools-0.1.24/sinatools/VERSION +0 -1
  12. SinaTools-0.1.24/sinatools/ner/entity_extractor.py +0 -72
  13. {SinaTools-0.1.24 → SinaTools-0.1.26}/AUTHORS.rst +0 -0
  14. {SinaTools-0.1.24 → SinaTools-0.1.26}/CONTRIBUTING.rst +0 -0
  15. {SinaTools-0.1.24 → SinaTools-0.1.26}/LICENSE +0 -0
  16. {SinaTools-0.1.24 → SinaTools-0.1.26}/MANIFEST.in +0 -0
  17. {SinaTools-0.1.24 → SinaTools-0.1.26}/README.rst +0 -0
  18. {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/SOURCES.txt +0 -0
  19. {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/dependency_links.txt +0 -0
  20. {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/entry_points.txt +0 -0
  21. {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/not-zip-safe +0 -0
  22. {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/requires.txt +0 -0
  23. {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/top_level.txt +0 -0
  24. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/Makefile +0 -0
  25. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/_images/download.png +0 -0
  26. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/_static/download.png +0 -0
  27. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/_static/file.png +0 -0
  28. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/_static/minus.png +0 -0
  29. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/_static/plus.png +0 -0
  30. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_images/SinaLogo.jpg +0 -0
  31. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_images/download.png +0 -0
  32. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_static/SinaLogo.jpg +0 -0
  33. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_static/download.png +0 -0
  34. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_static/file.png +0 -0
  35. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_static/minus.png +0 -0
  36. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_static/plus.png +0 -0
  37. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/make.bat +0 -0
  38. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/License.rst +0 -0
  39. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/Overview.rst +0 -0
  40. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/_static/SinaLogo.jpg +0 -0
  41. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/_static/download.png +0 -0
  42. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/about.rst +0 -0
  43. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/DataDownload/downloader.rst +0 -0
  44. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/DataDownload.rst +0 -0
  45. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/arabiner/bin/infer.rst +0 -0
  46. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/arabiner.rst +0 -0
  47. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/morphology/morph_analyzer.rst +0 -0
  48. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/morphology.rst +0 -0
  49. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/salma/views.rst +0 -0
  50. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/salma.rst +0 -0
  51. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils/corpus_tokenizer.rst +0 -0
  52. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils/implication.rst +0 -0
  53. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils/jaccard.rst +0 -0
  54. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils/parser.rst +0 -0
  55. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils/sentence_tokenizer.rst +0 -0
  56. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils/text_transliteration.rst +0 -0
  57. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils.rst +0 -0
  58. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api.rst +0 -0
  59. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/authors.rst +0 -0
  60. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/DataDownload/download_files.rst +0 -0
  61. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/DataDownload/get_appdatadir.rst +0 -0
  62. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/DataDownload.rst +0 -0
  63. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/arabiner/infer.rst +0 -0
  64. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/arabiner.rst +0 -0
  65. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/morphology/ALMA_multi_word.rst +0 -0
  66. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/morphology/morph_analyzer.rst +0 -0
  67. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/morphology.rst +0 -0
  68. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/salma/salma_tools.rst +0 -0
  69. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/salma.rst +0 -0
  70. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/arStrip.rst +0 -0
  71. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/corpus_tokenizer.rst +0 -0
  72. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/implication.rst +0 -0
  73. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/jaccard.rst +0 -0
  74. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/latin_remove.rst +0 -0
  75. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/remove_punc.rst +0 -0
  76. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/sentence_tokenizer.rst +0 -0
  77. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/text_transliteration.rst +0 -0
  78. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils.rst +0 -0
  79. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools.rst +0 -0
  80. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/conf.py +0 -0
  81. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/index.rst +0 -0
  82. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/installation.rst +0 -0
  83. {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/readme.rst +0 -0
  84. {SinaTools-0.1.24 → SinaTools-0.1.26}/setup.cfg +0 -0
  85. {SinaTools-0.1.24 → SinaTools-0.1.26}/setup.py +0 -0
  86. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/morphology/ALMA_multi_word.py +0 -0
  87. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/morphology/morph_analyzer.py +0 -0
  88. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/__init__.py +0 -0
  89. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/arStrip.py +0 -0
  90. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/corpus_tokenizer.py +0 -0
  91. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/implication.py +0 -0
  92. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/jaccard.py +0 -0
  93. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/remove_latin.py +0 -0
  94. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/remove_punctuation.py +0 -0
  95. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/sentence_tokenizer.py +0 -0
  96. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/text_dublication_detector.py +0 -0
  97. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/text_transliteration.py +0 -0
  98. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/DataDownload/__init__.py +0 -0
  99. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/__init__.py +0 -0
  100. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/__init__.py +0 -0
  101. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/__init__.py +0 -0
  102. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/create_classification_data.py +0 -0
  103. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/create_pretraining_data.py +0 -0
  104. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/extract_features.py +0 -0
  105. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/lamb_optimizer.py +0 -0
  106. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/modeling.py +0 -0
  107. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/optimization.py +0 -0
  108. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/run_classifier.py +0 -0
  109. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/run_pretraining.py +0 -0
  110. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/run_squad.py +0 -0
  111. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/tokenization.py +0 -0
  112. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/__init__.py +0 -0
  113. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
  114. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -0
  115. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
  116. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/configure_finetuning.py +0 -0
  117. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/configure_pretraining.py +0 -0
  118. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/finetune/__init__.py +0 -0
  119. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/finetune/feature_spec.py +0 -0
  120. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/finetune/preprocessing.py +0 -0
  121. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/finetune/scorer.py +0 -0
  122. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/finetune/task.py +0 -0
  123. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/finetune/task_builder.py +0 -0
  124. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/flops_computation.py +0 -0
  125. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/model/__init__.py +0 -0
  126. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/model/modeling.py +0 -0
  127. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/model/optimization.py +0 -0
  128. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/model/tokenization.py +0 -0
  129. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/pretrain/__init__.py +0 -0
  130. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -0
  131. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
  132. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/run_finetuning.py +0 -0
  133. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/run_pretraining.py +0 -0
  134. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/util/__init__.py +0 -0
  135. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/util/training_utils.py +0 -0
  136. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/util/utils.py +0 -0
  137. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/__init__.py +0 -0
  138. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/create_pretraining_data.py +0 -0
  139. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/gpt2/__init__.py +0 -0
  140. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
  141. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/gpt2/optimization.py +0 -0
  142. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
  143. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  144. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/dataloader.py +0 -0
  145. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/modeling.py +0 -0
  146. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
  147. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
  148. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/train_tpu.py +0 -0
  149. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/utils.py +0 -0
  150. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
  151. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/preprocess.py +0 -0
  152. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/environment.yml +0 -0
  153. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/install_env.py +0 -0
  154. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/morphology/ALMA_multi_word.py +0 -0
  155. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/morphology/__init__.py +0 -0
  156. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/morphology/morph_analyzer.py +0 -0
  157. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/__init__.py +0 -0
  158. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/data/__init__.py +0 -0
  159. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/data/datasets.py +0 -0
  160. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/data/transforms.py +0 -0
  161. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/data_format.py +0 -0
  162. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/datasets.py +0 -0
  163. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/helpers.py +0 -0
  164. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/metrics.py +0 -0
  165. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/nn/BaseModel.py +0 -0
  166. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/nn/BertNestedTagger.py +0 -0
  167. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/nn/BertSeqTagger.py +0 -0
  168. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/nn/__init__.py +0 -0
  169. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/trainers/BertNestedTrainer.py +0 -0
  170. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/trainers/BertTrainer.py +0 -0
  171. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/trainers/__init__.py +0 -0
  172. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/transforms.py +0 -0
  173. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/semantic_relatedness/__init__.py +0 -0
  174. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/semantic_relatedness/compute_relatedness.py +0 -0
  175. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/sinatools.py +0 -0
  176. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/synonyms/__init__.py +0 -0
  177. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/synonyms/synonyms_generator.py +0 -0
  178. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/__init__.py +0 -0
  179. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/charsets.py +0 -0
  180. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/implication.py +0 -0
  181. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/jaccard.py +0 -0
  182. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/parser.py +0 -0
  183. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/readfile.py +0 -0
  184. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/text_dublication_detector.py +0 -0
  185. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/text_transliteration.py +0 -0
  186. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/tokenizer.py +0 -0
  187. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/tokenizers_words.py +0 -0
  188. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/wsd/__init__.py +0 -0
  189. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/wsd/settings.py +0 -0
  190. {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/wsd/wsd.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: SinaTools
3
- Version: 0.1.24
3
+ Version: 0.1.26
4
4
  Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
5
5
  Home-page: https://github.com/SinaLab/sinatools
6
6
  License: MIT license
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: SinaTools
3
- Version: 0.1.24
3
+ Version: 0.1.26
4
4
  Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
5
5
  Home-page: https://github.com/SinaLab/sinatools
6
6
  License: MIT license
@@ -40,7 +40,7 @@ from sinatools.DataDownload.downloader import urls
40
40
 
41
41
  def main():
42
42
  parser = argparse.ArgumentParser(description="Download files from specified URLs.")
43
- parser.add_argument('-f', '--files', nargs="*", choices=urls.keys(),
43
+ parser.add_argument('-f', '--files', nargs="*",
44
44
  help="Names of the files to download. Available files are: "
45
45
  f"{', '.join(urls.keys())}. If no file is specified, all files will be downloaded.")
46
46
 
@@ -50,8 +50,23 @@ def main():
50
50
 
51
51
  if args.files:
52
52
  for file in args.files:
53
- url = urls[file]
54
- download_file(url)
53
+ print("file: ", file)
54
+ if file == "wsd":
55
+ download_file(urls["morph"])
56
+ download_file(urls["ner"])
57
+ download_file(urls["wsd_model"])
58
+ download_file(urls["wsd_tokenizer"])
59
+ download_file(urls["glosses_dic"])
60
+ download_file(urls["five_grams"])
61
+ download_file(urls["four_grams"])
62
+ download_file(urls["three_grams"])
63
+ download_file(urls["two_grams"])
64
+ elif file == "synonyms":
65
+ download_file(urls["synonyms_level2"])
66
+ download_file(urls["synonyms_level3"])
67
+ else:
68
+ url = urls[file]
69
+ download_file(url)
55
70
  else:
56
71
  download_files()
57
72
 
@@ -20,7 +20,7 @@ def jsons_to_list_of_lists(json_list):
20
20
  return [[d['token'], d['tags']] for d in json_list]
21
21
 
22
22
  def combine_tags(sentence):
23
- output = jsons_to_list_of_lists(extract(sentence))
23
+ output = jsons_to_list_of_lists(extract(sentence, "nested"))
24
24
  return [word[1] for word in output]
25
25
 
26
26
 
@@ -46,7 +46,7 @@ def jsons_to_list_of_lists(json_list):
46
46
  return [[d['token'], d['tags']] for d in json_list]
47
47
 
48
48
  def combine_tags(sentence):
49
- output = jsons_to_list_of_lists(extract(sentence))
49
+ output = jsons_to_list_of_lists(extract(sentence, "nested"))
50
50
  return [word[1] for word in output]
51
51
 
52
52
 
@@ -95,37 +95,41 @@ def download_file(url, dest_path=get_appdatadir()):
95
95
  print(filename)
96
96
  headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
97
97
 
98
- # try:
99
- with requests.get(url, headers=headers, stream=True) as r:
100
- r.raise_for_status()
101
- with open(file_path, 'wb') as f:
102
- total_size = int(r.headers.get('content-length', 0))
103
- block_size = 8192
104
- progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True)
105
- for chunk in r.iter_content(chunk_size=block_size):
106
- if chunk:
107
- f.write(chunk)
108
- progress_bar.update(len(chunk))
109
- progress_bar.close()
110
- # Check the file type and extract accordingly
111
- file_extension = os.path.splitext(file_path)[1]
112
- extracted_folder_name = os.path.splitext(file_path)[0]
113
-
114
- if file_extension == '.zip':
115
- extract_zip(file_path, extracted_folder_name)
116
- elif file_extension == '.gz':
117
- extract_tar(file_path, extracted_folder_name)
118
- elif file_extension =='.pickle':
119
- print(f'Done: {file_extension}')
120
- else:
121
- print(f'Unsupported file type for extraction: {file_extension}')
122
- return file_path
123
-
124
- # except requests.exceptions.HTTPError as e:
125
- # if e.response.status_code == 403:
126
- # print(f'Error 403: Forbidden. The requested file URL {url} could not be downloaded due to insufficient permissions. Please check the URL and try again.')
127
- # else:
128
- # print('An error occurred while downloading the file:', e)
98
+ try:
99
+ with requests.get(url, headers=headers, stream=True) as r:
100
+ r.raise_for_status()
101
+ with open(file_path, 'wb') as f:
102
+ total_size = int(r.headers.get('content-length', 0))
103
+ block_size = 8192
104
+ progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True)
105
+ for chunk in r.iter_content(chunk_size=block_size):
106
+ if chunk:
107
+ f.write(chunk)
108
+ progress_bar.update(len(chunk))
109
+ progress_bar.close()
110
+
111
+ # Check the file type and extract accordingly
112
+ file_extension = os.path.splitext(file_path)[1]
113
+ extracted_folder_name = os.path.splitext(file_path)[0]
114
+
115
+ if file_extension == '.zip':
116
+ extract_zip(file_path, extracted_folder_name)
117
+ elif file_extension == '.gz':
118
+
119
+ extract_tar(file_path, extracted_folder_name)
120
+ elif file_extension =='.pickle':
121
+ print(f'Done: {file_extension}')
122
+
123
+ else:
124
+ print(f'Unsupported file type for extraction: {file_extension}')
125
+
126
+ return file_path
127
+
128
+ except requests.exceptions.HTTPError as e:
129
+ if e.response.status_code == 403:
130
+ print(f'Error 403: Forbidden. The requested file URL {url} could not be downloaded due to insufficient permissions. Please check the URL and try again.')
131
+ else:
132
+ print('An error occurred while downloading the file:', e)
129
133
 
130
134
  def extract_zip(file_path, extracted_folder_name):
131
135
  """
@@ -0,0 +1 @@
1
+ 0.1.26
@@ -0,0 +1,63 @@
1
+ import os
2
+ from collections import namedtuple
3
+ from sinatools.ner.data_format import get_dataloaders, text2segments
4
+ from . import tagger, tag_vocab, train_config
5
+
6
+
7
+ def convert_nested_to_flat(nested_tags):
8
+ flat_tags = []
9
+
10
+ for entry in nested_tags:
11
+ word = entry['token']
12
+ tags = entry['tags'].split()
13
+
14
+ # Initialize with the first tag in the sequence
15
+ flat_tag = tags[0]
16
+
17
+ for tag in tags[1:]:
18
+ # Check if the tag is an "I-" tag, indicating continuation of an entity
19
+ if tag.startswith('I-'):
20
+ flat_tag = tag
21
+ break
22
+
23
+ flat_tags.append({
24
+ 'token': word,
25
+ 'tags': flat_tag
26
+ })
27
+
28
+ return flat_tags
29
+
30
+ def extract(text, ner_method):
31
+
32
+ dataset, token_vocab = text2segments(text)
33
+
34
+ vocabs = namedtuple("Vocab", ["tags", "tokens"])
35
+ vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
36
+
37
+ dataloader = get_dataloaders(
38
+ (dataset,),
39
+ vocab,
40
+ train_config.data_config,
41
+ batch_size=32,
42
+ shuffle=(False,),
43
+ )[0]
44
+
45
+
46
+ segments = tagger.infer(dataloader)
47
+ segments_lists = []
48
+
49
+ for segment in segments:
50
+ for token in segment:
51
+ segments_list = {}
52
+ segments_list["token"] = token.text
53
+ list_of_tags = [t['tag'] for t in token.pred_tag]
54
+ list_of_tags = [i for i in list_of_tags if i not in('O',' ','')]
55
+ if list_of_tags == []:
56
+ segments_list["tags"] = ' '.join(['O'])
57
+ else:
58
+ segments_list["tags"] = ' '.join(list_of_tags)
59
+ segments_lists.append(segments_list)
60
+
61
+ if ner_method == "flat":
62
+ segments_lists = convert_nested_to_flat(segments_lists)
63
+ return segments_lists
@@ -1,117 +1,117 @@
1
- import os
2
- import torch
3
- import logging
4
- import natsort
5
- import glob
6
-
7
- logger = logging.getLogger(__name__)
8
-
9
-
10
- class BaseTrainer:
11
- def __init__(
12
- self,
13
- model=None,
14
- max_epochs=50,
15
- optimizer=None,
16
- scheduler=None,
17
- loss=None,
18
- train_dataloader=None,
19
- val_dataloader=None,
20
- test_dataloader=None,
21
- log_interval=10,
22
- summary_writer=None,
23
- output_path=None,
24
- clip=5,
25
- patience=5
26
- ):
27
- self.model = model
28
- self.max_epochs = max_epochs
29
- self.train_dataloader = train_dataloader
30
- self.val_dataloader = val_dataloader
31
- self.test_dataloader = test_dataloader
32
- self.optimizer = optimizer
33
- self.scheduler = scheduler
34
- self.loss = loss
35
- self.log_interval = log_interval
36
- self.summary_writer = summary_writer
37
- self.output_path = output_path
38
- self.current_timestep = 0
39
- self.current_epoch = 0
40
- self.clip = clip
41
- self.patience = patience
42
-
43
- def tag(self, dataloader, is_train=True):
44
- """
45
- Given a dataloader containing segments, predict the tags
46
- :param dataloader: torch.utils.data.DataLoader
47
- :param is_train: boolean - True for training model, False for evaluation
48
- :return: Iterator
49
- subwords (B x T x NUM_LABELS)- torch.Tensor - BERT subword ID
50
- gold_tags (B x T x NUM_LABELS) - torch.Tensor - ground truth tags IDs
51
- tokens - List[arabiner.data.dataset.Token] - list of tokens
52
- valid_len (B x 1) - int - valiud length of each sequence
53
- logits (B x T x NUM_LABELS) - logits for each token and each tag
54
- """
55
- for subwords, gold_tags, tokens, valid_len in dataloader:
56
- self.model.train(is_train)
57
-
58
- if torch.cuda.is_available():
59
- subwords = subwords.cuda()
60
- gold_tags = gold_tags.cuda()
61
-
62
- if is_train:
63
- self.optimizer.zero_grad()
64
- logits = self.model(subwords)
65
- else:
66
- with torch.no_grad():
67
- logits = self.model(subwords)
68
-
69
- yield subwords, gold_tags, tokens, valid_len, logits
70
-
71
- def segments_to_file(self, segments, filename):
72
- """
73
- Write segments to file
74
- :param segments: [List[arabiner.data.dataset.Token]] - list of list of tokens
75
- :param filename: str - output filename
76
- :return: None
77
- """
78
- with open(filename, "w") as fh:
79
- results = "\n\n".join(["\n".join([t.__str__() for t in segment]) for segment in segments])
80
- fh.write("Token\tGold Tag\tPredicted Tag\n")
81
- fh.write(results)
82
- logging.info("Predictions written to %s", filename)
83
-
84
- def save(self):
85
- """
86
- Save model checkpoint
87
- :return:
88
- """
89
- filename = os.path.join(
90
- self.output_path,
91
- "checkpoints",
92
- "checkpoint_{}.pt".format(self.current_epoch),
93
- )
94
-
95
- checkpoint = {
96
- "model": self.model.state_dict(),
97
- "optimizer": self.optimizer.state_dict(),
98
- "epoch": self.current_epoch
99
- }
100
-
101
- logger.info("Saving checkpoint to %s", filename)
102
- torch.save(checkpoint, filename)
103
-
104
- def load(self, checkpoint_path):
105
- """
106
- Load model checkpoint
107
- :param checkpoint_path: str - path/to/checkpoints
108
- :return: None
109
- """
110
- checkpoint_path = natsort.natsorted(glob.glob(f"{checkpoint_path}/checkpoint_*.pt"))
111
- checkpoint_path = checkpoint_path[-1]
112
-
113
- logger.info("Loading checkpoint %s", checkpoint_path)
114
-
115
- device = None if torch.cuda.is_available() else torch.device('cpu')
116
- checkpoint = torch.load(checkpoint_path, map_location=device)
117
- self.model.load_state_dict(checkpoint["model"])
1
+ import os
2
+ import torch
3
+ import logging
4
+ import natsort
5
+ import glob
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ class BaseTrainer:
11
+ def __init__(
12
+ self,
13
+ model=None,
14
+ max_epochs=50,
15
+ optimizer=None,
16
+ scheduler=None,
17
+ loss=None,
18
+ train_dataloader=None,
19
+ val_dataloader=None,
20
+ test_dataloader=None,
21
+ log_interval=10,
22
+ summary_writer=None,
23
+ output_path=None,
24
+ clip=5,
25
+ patience=5
26
+ ):
27
+ self.model = model
28
+ self.max_epochs = max_epochs
29
+ self.train_dataloader = train_dataloader
30
+ self.val_dataloader = val_dataloader
31
+ self.test_dataloader = test_dataloader
32
+ self.optimizer = optimizer
33
+ self.scheduler = scheduler
34
+ self.loss = loss
35
+ self.log_interval = log_interval
36
+ self.summary_writer = summary_writer
37
+ self.output_path = output_path
38
+ self.current_timestep = 0
39
+ self.current_epoch = 0
40
+ self.clip = clip
41
+ self.patience = patience
42
+
43
+ def tag(self, dataloader, is_train=True):
44
+ """
45
+ Given a dataloader containing segments, predict the tags
46
+ :param dataloader: torch.utils.data.DataLoader
47
+ :param is_train: boolean - True for training model, False for evaluation
48
+ :return: Iterator
49
+ subwords (B x T x NUM_LABELS)- torch.Tensor - BERT subword ID
50
+ gold_tags (B x T x NUM_LABELS) - torch.Tensor - ground truth tags IDs
51
+ tokens - List[arabiner.data.dataset.Token] - list of tokens
52
+ valid_len (B x 1) - int - valiud length of each sequence
53
+ logits (B x T x NUM_LABELS) - logits for each token and each tag
54
+ """
55
+ for subwords, gold_tags, tokens, valid_len in dataloader:
56
+ self.model.train(is_train)
57
+
58
+ if torch.cuda.is_available():
59
+ subwords = subwords.cuda()
60
+ gold_tags = gold_tags.cuda()
61
+
62
+ if is_train:
63
+ self.optimizer.zero_grad()
64
+ logits = self.model(subwords)
65
+ else:
66
+ with torch.no_grad():
67
+ logits = self.model(subwords)
68
+
69
+ yield subwords, gold_tags, tokens, valid_len, logits
70
+
71
+ def segments_to_file(self, segments, filename):
72
+ """
73
+ Write segments to file
74
+ :param segments: [List[arabiner.data.dataset.Token]] - list of list of tokens
75
+ :param filename: str - output filename
76
+ :return: None
77
+ """
78
+ with open(filename, "w") as fh:
79
+ results = "\n\n".join(["\n".join([t.__str__() for t in segment]) for segment in segments])
80
+ fh.write("Token\tGold Tag\tPredicted Tag\n")
81
+ fh.write(results)
82
+ logging.info("Predictions written to %s", filename)
83
+
84
+ def save(self):
85
+ """
86
+ Save model checkpoint
87
+ :return:
88
+ """
89
+ filename = os.path.join(
90
+ self.output_path,
91
+ "checkpoints",
92
+ "checkpoint_{}.pt".format(self.current_epoch),
93
+ )
94
+
95
+ checkpoint = {
96
+ "model": self.model.state_dict(),
97
+ "optimizer": self.optimizer.state_dict(),
98
+ "epoch": self.current_epoch
99
+ }
100
+
101
+ logger.info("Saving checkpoint to %s", filename)
102
+ torch.save(checkpoint, filename)
103
+
104
+ def load(self, checkpoint_path):
105
+ """
106
+ Load model checkpoint
107
+ :param checkpoint_path: str - path/to/checkpoints
108
+ :return: None
109
+ """
110
+ checkpoint_path = natsort.natsorted(glob.glob(f"{checkpoint_path}/checkpoint_*.pt"))
111
+ checkpoint_path = checkpoint_path[-1]
112
+
113
+ logger.info("Loading checkpoint %s", checkpoint_path)
114
+
115
+ device = None if torch.cuda.is_available() else torch.device('cpu')
116
+ checkpoint = torch.load(checkpoint_path, map_location=device)
117
+ self.model.load_state_dict(checkpoint["model"], strict=False)
@@ -217,7 +217,7 @@ def jsons_to_list_of_lists(json_list):
217
217
  def find_named_entities(string):
218
218
  found_entities = []
219
219
 
220
- ner_entites = extract(string)
220
+ ner_entites = extract(string, "nested")
221
221
  list_of_entites = jsons_to_list_of_lists(ner_entites)
222
222
  entites = distill_entities(list_of_entites)
223
223
 
@@ -288,17 +288,17 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
288
288
  concept_id, gloss = GlossPredictor(Diac_lemma, Undiac_lemma,word,sentence,glosses_dictionary)
289
289
 
290
290
  my_json = {}
291
- my_json['Concept_id'] = concept_id
291
+ my_json['concept_id'] = concept_id
292
292
  # my_json['Gloss'] = gloss
293
293
  my_json['word'] = word
294
- my_json['Undiac_lemma'] = Undiac_lemma
295
- my_json['Diac_lemma'] = Diac_lemma
294
+ #my_json['Undiac_lemma'] = Undiac_lemma
295
+ my_json['lemma'] = Diac_lemma
296
296
  return my_json
297
297
  else:
298
298
  my_json = {}
299
299
  my_json['word'] = word
300
- my_json['Undiac_lemma'] = Undiac_lemma
301
- my_json['Diac_lemma'] = Diac_lemma
300
+ #my_json['Undiac_lemma'] = Undiac_lemma
301
+ my_json['lemma'] = Diac_lemma
302
302
  return my_json
303
303
 
304
304
 
@@ -405,26 +405,26 @@ def disambiguate_glosses_main(word, sentence):
405
405
  if concept_count == 0:
406
406
  my_json = {}
407
407
  my_json['word'] = word['word']
408
- my_json['Diac_lemma'] = word['Diac_lemma']
409
- my_json['Undiac_lemma'] = word['Undiac_lemma']
408
+ my_json['lemma'] = word['Diac_lemma']
409
+ #my_json['Undiac_lemma'] = word['Undiac_lemma']
410
410
  return my_json
411
411
  elif concept_count == 1:
412
412
  my_json = {}
413
413
  my_json['word'] = word['word']
414
414
  glosses = word['glosses'][0]
415
415
  # my_json['Gloss'] = glosses['gloss']
416
- my_json['Concept_id'] = glosses['concept_id']
417
- my_json['Diac_lemma'] = word['Diac_lemma']
418
- my_json['Undiac_lemma'] = word['Undiac_lemma']
416
+ my_json['concept_id'] = glosses['concept_id']
417
+ my_json['lemma'] = word['Diac_lemma']
418
+ #my_json['Undiac_lemma'] = word['Undiac_lemma']
419
419
  return my_json
420
420
  elif concept_count == '*':
421
421
  my_json = {}
422
422
  my_json['word'] = word['word']
423
423
  glosses = word['glosses'][0]
424
424
  my_json['Gloss'] = glosses['gloss']
425
- my_json['Concept_id'] = glosses['concept_id']
426
- my_json['Diac_lemma'] = word['Diac_lemma']
427
- my_json['Undiac_lemma'] = word['Undiac_lemma']
425
+ my_json['concept_id'] = glosses['concept_id']
426
+ my_json['lemma'] = word['Diac_lemma']
427
+ #my_json['Undiac_lemma'] = word['Undiac_lemma']
428
428
  return my_json
429
429
  else:
430
430
  input_word = word['word']
@@ -477,21 +477,18 @@ def disambiguate(sentence):
477
477
  #output
478
478
  [
479
479
  {
480
- "Concept_id": "303019218",
480
+ "concept_id": "303019218",
481
481
  "word": "ذهبت",
482
- "Undiac_lemma": "ذهب",
483
- "Diac_lemma": "ذَهَبَ۪ 1"
482
+ "lemma": "ذَهَبَ۪ 1"
484
483
  },
485
484
  {
486
485
  "word": "إلى",
487
- "Diac_lemma": إِلَى 1,
488
- "Undiac_lemma": "الى"
486
+ "lemma": "إِلَى 1"
489
487
  },
490
488
  {
491
489
  "word": "جامعة بيرزيت",
492
- "Concept_id": "334000099",
493
- "Diac_lemma": جامِعَة بيرزَيت,
494
- "Undiac_lemma": "جامعة بيرزيت"
490
+ "concept_id": "334000099",
491
+ "lemma": "جامِعَة بيرزَيت"
495
492
  }
496
493
  ]
497
494
  """
@@ -1 +0,0 @@
1
- 0.1.24
@@ -1,72 +0,0 @@
1
- import os
2
- from collections import namedtuple
3
- from sinatools.ner.data_format import get_dataloaders, text2segments
4
- from . import tagger, tag_vocab, train_config
5
-
6
- def extract(text, batch_size=32):
7
- """
8
- This method processes an input text and returns named entites for each token within the text, based on the specified batch size. As follows:
9
-
10
- Args:
11
- text (:obj:`str`): The Arabic text to be tagged.
12
- batch_size (int, optional): Batch size for inference. Default is 32.
13
-
14
- Returns:
15
- list (:obj:`list`): A list of JSON objects, where each JSON could be contains:
16
- token: The token from the original text.
17
- NER tag: The label pairs for each segment.
18
-
19
- **Example:**
20
-
21
- .. highlight:: python
22
- .. code-block:: python
23
-
24
- from sinatools.ner.entity_extractor import extract
25
- extract('ذهب محمد إلى جامعة بيرزيت')
26
- [{
27
- "word":"ذهب",
28
- "tags":"O"
29
- },{
30
- "word":"محمد",
31
- "tags":"B-PERS"
32
- },{
33
- "word":"إلى",
34
- "tags":"O"
35
- },{
36
- "word":"جامعة",
37
- "tags":"B-ORG"
38
- },{
39
- "word":"بيرزيت",
40
- "tags":"B-GPE I-ORG"
41
- }]
42
- """
43
-
44
- dataset, token_vocab = text2segments(text)
45
-
46
- vocabs = namedtuple("Vocab", ["tags", "tokens"])
47
- vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
48
-
49
- dataloader = get_dataloaders(
50
- (dataset,),
51
- vocab,
52
- train_config.data_config,
53
- batch_size=batch_size,
54
- shuffle=(False,),
55
- )[0]
56
-
57
-
58
- segments = tagger.infer(dataloader)
59
- segments_lists = []
60
-
61
- for segment in segments:
62
- for token in segment:
63
- segments_list = {}
64
- segments_list["token"] = token.text
65
- list_of_tags = [t['tag'] for t in token.pred_tag]
66
- list_of_tags = [i for i in list_of_tags if i not in('O',' ','')]
67
- if list_of_tags == []:
68
- segments_list["tags"] = ' '.join(['O'])
69
- else:
70
- segments_list["tags"] = ' '.join(list_of_tags)
71
- segments_lists.append(segments_list)
72
- return segments_lists
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes