SinaTools 0.1.26__tar.gz → 0.1.28__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (195) hide show
  1. SinaTools-0.1.28/PKG-INFO +50 -0
  2. SinaTools-0.1.28/README.rst +39 -0
  3. SinaTools-0.1.28/SinaTools.egg-info/PKG-INFO +50 -0
  4. {SinaTools-0.1.26 → SinaTools-0.1.28}/SinaTools.egg-info/SOURCES.txt +3 -2
  5. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/DataDownload/download_files.py +5 -8
  6. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/morphology/ALMA_multi_word.py +0 -34
  7. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/morphology/morph_analyzer.py +1 -1
  8. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/ner/corpus_entity_extractor.py +17 -4
  9. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/ner/entity_extractor.py +8 -8
  10. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/utils/implication.py +3 -3
  11. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/utils/jaccard.py +2 -2
  12. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/DataDownload/downloader.py +3 -3
  13. SinaTools-0.1.28/sinatools/VERSION +1 -0
  14. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/morphology/morph_analyzer.py +44 -45
  15. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/__init__.py +6 -1
  16. SinaTools-0.1.28/sinatools/ner/entity_extractor.py +104 -0
  17. SinaTools-0.1.28/sinatools/ner/relation_extractor.py +201 -0
  18. SinaTools-0.1.28/sinatools/semantic_relatedness/compute_relatedness.py +53 -0
  19. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/synonyms/__init__.py +2 -2
  20. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/synonyms/synonyms_generator.py +45 -1
  21. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/utils/parser.py +12 -15
  22. SinaTools-0.1.26/sinatools/utils/jaccard.py → SinaTools-0.1.28/sinatools/utils/similarity.py +81 -88
  23. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/utils/text_dublication_detector.py +22 -0
  24. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/utils/text_transliteration.py +1 -1
  25. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/utils/tokenizer.py +1 -1
  26. SinaTools-0.1.26/sinatools/utils/implication.py → SinaTools-0.1.28/sinatools/utils/word_compare.py +15 -10
  27. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/wsd/__init__.py +1 -1
  28. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/wsd/disambiguator.py +20 -19
  29. SinaTools-0.1.26/PKG-INFO +0 -20
  30. SinaTools-0.1.26/README.rst +0 -9
  31. SinaTools-0.1.26/SinaTools.egg-info/PKG-INFO +0 -20
  32. SinaTools-0.1.26/sinatools/VERSION +0 -1
  33. SinaTools-0.1.26/sinatools/ner/entity_extractor.py +0 -63
  34. SinaTools-0.1.26/sinatools/semantic_relatedness/compute_relatedness.py +0 -31
  35. {SinaTools-0.1.26 → SinaTools-0.1.28}/AUTHORS.rst +0 -0
  36. {SinaTools-0.1.26 → SinaTools-0.1.28}/CONTRIBUTING.rst +0 -0
  37. {SinaTools-0.1.26 → SinaTools-0.1.28}/LICENSE +0 -0
  38. {SinaTools-0.1.26 → SinaTools-0.1.28}/MANIFEST.in +0 -0
  39. {SinaTools-0.1.26 → SinaTools-0.1.28}/SinaTools.egg-info/dependency_links.txt +0 -0
  40. {SinaTools-0.1.26 → SinaTools-0.1.28}/SinaTools.egg-info/entry_points.txt +0 -0
  41. {SinaTools-0.1.26 → SinaTools-0.1.28}/SinaTools.egg-info/not-zip-safe +0 -0
  42. {SinaTools-0.1.26 → SinaTools-0.1.28}/SinaTools.egg-info/requires.txt +0 -0
  43. {SinaTools-0.1.26 → SinaTools-0.1.28}/SinaTools.egg-info/top_level.txt +0 -0
  44. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/Makefile +0 -0
  45. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/build/_images/download.png +0 -0
  46. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/build/_static/download.png +0 -0
  47. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/build/_static/file.png +0 -0
  48. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/build/_static/minus.png +0 -0
  49. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/build/_static/plus.png +0 -0
  50. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/build/html/_images/SinaLogo.jpg +0 -0
  51. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/build/html/_images/download.png +0 -0
  52. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/build/html/_static/SinaLogo.jpg +0 -0
  53. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/build/html/_static/download.png +0 -0
  54. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/build/html/_static/file.png +0 -0
  55. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/build/html/_static/minus.png +0 -0
  56. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/build/html/_static/plus.png +0 -0
  57. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/make.bat +0 -0
  58. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/License.rst +0 -0
  59. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/Overview.rst +0 -0
  60. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/_static/SinaLogo.jpg +0 -0
  61. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/_static/download.png +0 -0
  62. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/about.rst +0 -0
  63. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/DataDownload/downloader.rst +0 -0
  64. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/DataDownload.rst +0 -0
  65. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/arabiner/bin/infer.rst +0 -0
  66. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/arabiner.rst +0 -0
  67. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/morphology/morph_analyzer.rst +0 -0
  68. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/morphology.rst +0 -0
  69. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/salma/views.rst +0 -0
  70. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/salma.rst +0 -0
  71. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/utils/corpus_tokenizer.rst +0 -0
  72. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/utils/implication.rst +0 -0
  73. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/utils/jaccard.rst +0 -0
  74. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/utils/parser.rst +0 -0
  75. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/utils/sentence_tokenizer.rst +0 -0
  76. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/utils/text_transliteration.rst +0 -0
  77. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api/utils.rst +0 -0
  78. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/api.rst +0 -0
  79. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/authors.rst +0 -0
  80. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/DataDownload/download_files.rst +0 -0
  81. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/DataDownload/get_appdatadir.rst +0 -0
  82. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/DataDownload.rst +0 -0
  83. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/arabiner/infer.rst +0 -0
  84. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/arabiner.rst +0 -0
  85. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/morphology/ALMA_multi_word.rst +0 -0
  86. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/morphology/morph_analyzer.rst +0 -0
  87. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/morphology.rst +0 -0
  88. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/salma/salma_tools.rst +0 -0
  89. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/salma.rst +0 -0
  90. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/utils/arStrip.rst +0 -0
  91. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/utils/corpus_tokenizer.rst +0 -0
  92. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/utils/implication.rst +0 -0
  93. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/utils/jaccard.rst +0 -0
  94. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/utils/latin_remove.rst +0 -0
  95. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/utils/remove_punc.rst +0 -0
  96. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/utils/sentence_tokenizer.rst +0 -0
  97. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/utils/text_transliteration.rst +0 -0
  98. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools/utils.rst +0 -0
  99. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/cli_tools.rst +0 -0
  100. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/conf.py +0 -0
  101. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/index.rst +0 -0
  102. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/installation.rst +0 -0
  103. {SinaTools-0.1.26 → SinaTools-0.1.28}/docs/source/readme.rst +0 -0
  104. {SinaTools-0.1.26 → SinaTools-0.1.28}/setup.cfg +0 -0
  105. {SinaTools-0.1.26 → SinaTools-0.1.28}/setup.py +0 -0
  106. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/utils/__init__.py +0 -0
  107. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/utils/arStrip.py +0 -0
  108. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/utils/corpus_tokenizer.py +0 -0
  109. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/utils/remove_latin.py +0 -0
  110. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/utils/remove_punctuation.py +0 -0
  111. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/utils/sentence_tokenizer.py +0 -0
  112. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/utils/text_dublication_detector.py +0 -0
  113. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/CLI/utils/text_transliteration.py +0 -0
  114. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/DataDownload/__init__.py +0 -0
  115. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/__init__.py +0 -0
  116. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/__init__.py +0 -0
  117. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/arabert/__init__.py +0 -0
  118. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/arabert/create_classification_data.py +0 -0
  119. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/arabert/create_pretraining_data.py +0 -0
  120. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/arabert/extract_features.py +0 -0
  121. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/arabert/lamb_optimizer.py +0 -0
  122. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/arabert/modeling.py +0 -0
  123. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/arabert/optimization.py +0 -0
  124. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/arabert/run_classifier.py +0 -0
  125. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/arabert/run_pretraining.py +0 -0
  126. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/arabert/run_squad.py +0 -0
  127. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/arabert/tokenization.py +0 -0
  128. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/__init__.py +0 -0
  129. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
  130. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -0
  131. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
  132. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/configure_finetuning.py +0 -0
  133. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/configure_pretraining.py +0 -0
  134. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/finetune/__init__.py +0 -0
  135. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/finetune/feature_spec.py +0 -0
  136. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/finetune/preprocessing.py +0 -0
  137. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/finetune/scorer.py +0 -0
  138. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/finetune/task.py +0 -0
  139. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/finetune/task_builder.py +0 -0
  140. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/flops_computation.py +0 -0
  141. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/model/__init__.py +0 -0
  142. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/model/modeling.py +0 -0
  143. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/model/optimization.py +0 -0
  144. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/model/tokenization.py +0 -0
  145. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/pretrain/__init__.py +0 -0
  146. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -0
  147. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
  148. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/run_finetuning.py +0 -0
  149. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/run_pretraining.py +0 -0
  150. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/util/__init__.py +0 -0
  151. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/util/training_utils.py +0 -0
  152. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/araelectra/util/utils.py +0 -0
  153. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/__init__.py +0 -0
  154. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/create_pretraining_data.py +0 -0
  155. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/gpt2/__init__.py +0 -0
  156. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
  157. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/gpt2/optimization.py +0 -0
  158. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
  159. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/grover/__init__.py +0 -0
  160. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/grover/dataloader.py +0 -0
  161. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/grover/modeling.py +0 -0
  162. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
  163. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
  164. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/grover/train_tpu.py +0 -0
  165. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/grover/utils.py +0 -0
  166. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
  167. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/arabert/preprocess.py +0 -0
  168. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/environment.yml +0 -0
  169. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/install_env.py +0 -0
  170. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/morphology/ALMA_multi_word.py +0 -0
  171. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/morphology/__init__.py +0 -0
  172. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/data/__init__.py +0 -0
  173. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/data/datasets.py +0 -0
  174. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/data/transforms.py +0 -0
  175. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/data_format.py +0 -0
  176. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/datasets.py +0 -0
  177. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/helpers.py +0 -0
  178. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/metrics.py +0 -0
  179. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/nn/BaseModel.py +0 -0
  180. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/nn/BertNestedTagger.py +0 -0
  181. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/nn/BertSeqTagger.py +0 -0
  182. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/nn/__init__.py +0 -0
  183. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/trainers/BaseTrainer.py +0 -0
  184. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/trainers/BertNestedTrainer.py +0 -0
  185. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/trainers/BertTrainer.py +0 -0
  186. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/trainers/__init__.py +0 -0
  187. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/ner/transforms.py +0 -0
  188. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/semantic_relatedness/__init__.py +0 -0
  189. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/sinatools.py +0 -0
  190. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/utils/__init__.py +0 -0
  191. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/utils/charsets.py +0 -0
  192. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/utils/readfile.py +0 -0
  193. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/utils/tokenizers_words.py +0 -0
  194. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/wsd/settings.py +0 -0
  195. {SinaTools-0.1.26 → SinaTools-0.1.28}/sinatools/wsd/wsd.py +0 -0
@@ -0,0 +1,50 @@
1
+ Metadata-Version: 2.1
2
+ Name: SinaTools
3
+ Version: 0.1.28
4
+ Summary: Open-source Python toolkit for Arabic Natural Language Understanding, allowing people to integrate it in their system workflow.
5
+ Home-page: https://github.com/SinaLab/sinatools
6
+ License: MIT license
7
+ Description: SinaTools
8
+ ======================
9
+ Open Source Toolkit for Arabic NLP and NLU developed by [SinaLab](http://sina.birzeit.edu/) at Birzeit University. SinaTools is available through Python APIs, command lines, colabs, and online demos.
10
+
11
+ See the full list of [Available Packages](https://sina.birzeit.edu/sinatools/), which include: (1) [Morphology Tagging](https://sina.birzeit.edu/sinatools/index.html#morph), (2) [Named Entity Recognition (NER)](https://sina.birzeit.edu/sinatools/index.html#ner), (3) [Word Sense Disambiguation (WSD)](https://sina.birzeit.edu/sinatools/index.html#wsd), (4) [Semantic Relatedness](https://sina.birzeit.edu/sinatools/index.html#sr), (5) [Synonymy Extraction and Evaluation](https://sina.birzeit.edu/sinatools/index.html#se), (6) [Relation Extraction](https://sina.birzeit.edu/sinatools/index.html#re), (7) [Utilities](https://sina.birzeit.edu/sinatools/index.html#u) (diacritic-based word matching, Jaccard similarity, parser, tokenizers, corpora processing, transliteration, etc.).
12
+
13
+ See [Demo Pages](https://sina.birzeit.edu/sinatools/).
14
+
15
+ See the [benchmarking](https://www.jarrar.info/publications/HJK24.pdf), which shows that SinaTools outperformed all related toolkits.
16
+
17
+ Installation
18
+ --------
19
+ To install SinaTools, ensure you are using Python version 3.10.8, then clone the [GitHub](git://github.com/SinaLab/SinaTools) repository.
20
+
21
+ Alternatively, you can execute the following command:
22
+
23
+ ```bash
24
+ pip install sinatools
25
+ ```
26
+
27
+ Installing Models and Data Files
28
+ --------
29
+ Some modules in SinaTools require some data files and fine-tuned models to be downloaded. To download these models, please consult the [DataDownload](https://sina.birzeit.edu/sinatools/documentation/cli_tools/DataDownload/DataDownload.html).
30
+
31
+ Documentation
32
+ --------
33
+ For information, please refer to the [main page](https://sina.birzeit.edu/sinatools) or the [online documentation](https://sina.birzeit.edu/sinatools/documentation).
34
+
35
+ Citation
36
+ -------
37
+ Tymaa Hammouda, Mustafa Jarrar, Mohammed Khalilia: [SinaTools: Open Source Toolkit for Arabic Natural Language Understanding](http://www.jarrar.info/publications/HJK24.pdf). In Proceedings of the 2024 AI in Computational Linguistics (ACLing 2024), Procedia Computer Science, Dubai. ELSEVIER.
38
+
39
+ License
40
+ --------
41
+ SinaTools is available under the MIT License. See the [LICENSE](https://github.com/SinaLab/sinatools/blob/main/LICENSE) file for more information.
42
+
43
+ Reporting Issues
44
+ --------
45
+ To report any issues or bugs, please contact us at "sina.institute.bzu@gmail.com" or visit [SinaTools Issues](https://github.com/SinaLab/sinatools/issues).
46
+
47
+
48
+ Keywords: sinatools
49
+ Platform: UNKNOWN
50
+ Description-Content-Type: text/markdown
@@ -0,0 +1,39 @@
1
+ SinaTools
2
+ ======================
3
+ Open Source Toolkit for Arabic NLP and NLU developed by [SinaLab](http://sina.birzeit.edu/) at Birzeit University. SinaTools is available through Python APIs, command lines, colabs, and online demos.
4
+
5
+ See the full list of [Available Packages](https://sina.birzeit.edu/sinatools/), which include: (1) [Morphology Tagging](https://sina.birzeit.edu/sinatools/index.html#morph), (2) [Named Entity Recognition (NER)](https://sina.birzeit.edu/sinatools/index.html#ner), (3) [Word Sense Disambiguation (WSD)](https://sina.birzeit.edu/sinatools/index.html#wsd), (4) [Semantic Relatedness](https://sina.birzeit.edu/sinatools/index.html#sr), (5) [Synonymy Extraction and Evaluation](https://sina.birzeit.edu/sinatools/index.html#se), (6) [Relation Extraction](https://sina.birzeit.edu/sinatools/index.html#re), (7) [Utilities](https://sina.birzeit.edu/sinatools/index.html#u) (diacritic-based word matching, Jaccard similarity, parser, tokenizers, corpora processing, transliteration, etc.).
6
+
7
+ See [Demo Pages](https://sina.birzeit.edu/sinatools/).
8
+
9
+ See the [benchmarking](https://www.jarrar.info/publications/HJK24.pdf), which shows that SinaTools outperformed all related toolkits.
10
+
11
+ Installation
12
+ --------
13
+ To install SinaTools, ensure you are using Python version 3.10.8, then clone the [GitHub](git://github.com/SinaLab/SinaTools) repository.
14
+
15
+ Alternatively, you can execute the following command:
16
+
17
+ ```bash
18
+ pip install sinatools
19
+ ```
20
+
21
+ Installing Models and Data Files
22
+ --------
23
+ Some modules in SinaTools require some data files and fine-tuned models to be downloaded. To download these models, please consult the [DataDownload](https://sina.birzeit.edu/sinatools/documentation/cli_tools/DataDownload/DataDownload.html).
24
+
25
+ Documentation
26
+ --------
27
+ For information, please refer to the [main page](https://sina.birzeit.edu/sinatools) or the [online documentation](https://sina.birzeit.edu/sinatools/documentation).
28
+
29
+ Citation
30
+ -------
31
+ Tymaa Hammouda, Mustafa Jarrar, Mohammed Khalilia: [SinaTools: Open Source Toolkit for Arabic Natural Language Understanding](http://www.jarrar.info/publications/HJK24.pdf). In Proceedings of the 2024 AI in Computational Linguistics (ACLing 2024), Procedia Computer Science, Dubai. ELSEVIER.
32
+
33
+ License
34
+ --------
35
+ SinaTools is available under the MIT License. See the [LICENSE](https://github.com/SinaLab/sinatools/blob/main/LICENSE) file for more information.
36
+
37
+ Reporting Issues
38
+ --------
39
+ To report any issues or bugs, please contact us at "sina.institute.bzu@gmail.com" or visit [SinaTools Issues](https://github.com/SinaLab/sinatools/issues).
@@ -0,0 +1,50 @@
1
+ Metadata-Version: 2.1
2
+ Name: SinaTools
3
+ Version: 0.1.28
4
+ Summary: Open-source Python toolkit for Arabic Natural Language Understanding, allowing people to integrate it in their system workflow.
5
+ Home-page: https://github.com/SinaLab/sinatools
6
+ License: MIT license
7
+ Description: SinaTools
8
+ ======================
9
+ Open Source Toolkit for Arabic NLP and NLU developed by [SinaLab](http://sina.birzeit.edu/) at Birzeit University. SinaTools is available through Python APIs, command lines, colabs, and online demos.
10
+
11
+ See the full list of [Available Packages](https://sina.birzeit.edu/sinatools/), which include: (1) [Morphology Tagging](https://sina.birzeit.edu/sinatools/index.html#morph), (2) [Named Entity Recognition (NER)](https://sina.birzeit.edu/sinatools/index.html#ner), (3) [Word Sense Disambiguation (WSD)](https://sina.birzeit.edu/sinatools/index.html#wsd), (4) [Semantic Relatedness](https://sina.birzeit.edu/sinatools/index.html#sr), (5) [Synonymy Extraction and Evaluation](https://sina.birzeit.edu/sinatools/index.html#se), (6) [Relation Extraction](https://sina.birzeit.edu/sinatools/index.html#re), (7) [Utilities](https://sina.birzeit.edu/sinatools/index.html#u) (diacritic-based word matching, Jaccard similarity, parser, tokenizers, corpora processing, transliteration, etc.).
12
+
13
+ See [Demo Pages](https://sina.birzeit.edu/sinatools/).
14
+
15
+ See the [benchmarking](https://www.jarrar.info/publications/HJK24.pdf), which shows that SinaTools outperformed all related toolkits.
16
+
17
+ Installation
18
+ --------
19
+ To install SinaTools, ensure you are using Python version 3.10.8, then clone the [GitHub](git://github.com/SinaLab/SinaTools) repository.
20
+
21
+ Alternatively, you can execute the following command:
22
+
23
+ ```bash
24
+ pip install sinatools
25
+ ```
26
+
27
+ Installing Models and Data Files
28
+ --------
29
+ Some modules in SinaTools require some data files and fine-tuned models to be downloaded. To download these models, please consult the [DataDownload](https://sina.birzeit.edu/sinatools/documentation/cli_tools/DataDownload/DataDownload.html).
30
+
31
+ Documentation
32
+ --------
33
+ For information, please refer to the [main page](https://sina.birzeit.edu/sinatools) or the [online documentation](https://sina.birzeit.edu/sinatools/documentation).
34
+
35
+ Citation
36
+ -------
37
+ Tymaa Hammouda, Mustafa Jarrar, Mohammed Khalilia: [SinaTools: Open Source Toolkit for Arabic Natural Language Understanding](http://www.jarrar.info/publications/HJK24.pdf). In Proceedings of the 2024 AI in Computational Linguistics (ACLing 2024), Procedia Computer Science, Dubai. ELSEVIER.
38
+
39
+ License
40
+ --------
41
+ SinaTools is available under the MIT License. See the [LICENSE](https://github.com/SinaLab/sinatools/blob/main/LICENSE) file for more information.
42
+
43
+ Reporting Issues
44
+ --------
45
+ To report any issues or bugs, please contact us at "sina.institute.bzu@gmail.com" or visit [SinaTools Issues](https://github.com/SinaLab/sinatools/issues).
46
+
47
+
48
+ Keywords: sinatools
49
+ Platform: UNKNOWN
50
+ Description-Content-Type: text/markdown
@@ -155,6 +155,7 @@ sinatools/ner/datasets.py
155
155
  sinatools/ner/entity_extractor.py
156
156
  sinatools/ner/helpers.py
157
157
  sinatools/ner/metrics.py
158
+ sinatools/ner/relation_extractor.py
158
159
  sinatools/ner/transforms.py
159
160
  sinatools/ner/data/__init__.py
160
161
  sinatools/ner/data/datasets.py
@@ -173,14 +174,14 @@ sinatools/synonyms/__init__.py
173
174
  sinatools/synonyms/synonyms_generator.py
174
175
  sinatools/utils/__init__.py
175
176
  sinatools/utils/charsets.py
176
- sinatools/utils/implication.py
177
- sinatools/utils/jaccard.py
178
177
  sinatools/utils/parser.py
179
178
  sinatools/utils/readfile.py
179
+ sinatools/utils/similarity.py
180
180
  sinatools/utils/text_dublication_detector.py
181
181
  sinatools/utils/text_transliteration.py
182
182
  sinatools/utils/tokenizer.py
183
183
  sinatools/utils/tokenizers_words.py
184
+ sinatools/utils/word_compare.py
184
185
  sinatools/wsd/__init__.py
185
186
  sinatools/wsd/disambiguator.py
186
187
  sinatools/wsd/settings.py
@@ -2,7 +2,7 @@
2
2
  About:
3
3
  ------
4
4
 
5
- The download_files is a command-line interface for downloading various NLP resources from pre-specified URLs. It is a part of the sinatools package and provides options to choose which files to download and to specify a download directory. The tool automatically handles file extraction for zip and tar.gz files.
5
+ The download_files command allows users to select specific files and models to download and use within SinaTools modules. Additionally, it automatically manages the extraction of compressed files, including zip and tar.gz formats.
6
6
 
7
7
  Usage:
8
8
  ------
@@ -18,7 +18,7 @@ Below is the usage information that can be generated by running download_files -
18
18
 
19
19
  Options:
20
20
  -f, --files FILES
21
- Names of the files to download. Available files are: ner, morph, wsd_model, wsd_tokenizer, glosses_dic, five_grams, four_grams, three_grams, two_grams, synonyms_level2, synonyms_level3.
21
+ Names of the files to download. Available files are: ner, morph, wsd, synonyms.
22
22
  If no file is specified, all files will be downloaded.
23
23
 
24
24
  Examples:
@@ -28,7 +28,6 @@ Examples:
28
28
 
29
29
  download_files -f morph ner
30
30
  This command will download only the `morph` and `ner` files to the default directory.
31
-
32
31
  """
33
32
 
34
33
  import argparse
@@ -56,14 +55,14 @@ def main():
56
55
  download_file(urls["ner"])
57
56
  download_file(urls["wsd_model"])
58
57
  download_file(urls["wsd_tokenizer"])
59
- download_file(urls["glosses_dic"])
58
+ download_file(urls["one_gram"])
60
59
  download_file(urls["five_grams"])
61
60
  download_file(urls["four_grams"])
62
61
  download_file(urls["three_grams"])
63
62
  download_file(urls["two_grams"])
64
63
  elif file == "synonyms":
65
- download_file(urls["synonyms_level2"])
66
- download_file(urls["synonyms_level3"])
64
+ download_file(urls["graph_l2"])
65
+ download_file(urls["graph_l3"])
67
66
  else:
68
67
  url = urls[file]
69
68
  download_file(url)
@@ -72,5 +71,3 @@ def main():
72
71
 
73
72
  if __name__ == '__main__':
74
73
  main()
75
-
76
- #download_files -f morph ner
@@ -1,37 +1,3 @@
1
- """
2
- About:
3
- ------
4
- The alma_multi_word tool performs multi-word morphological analysis using SinaTools' `ALMA_multi_word` utility. Given a multi-word Arabic text input, it returns a detailed analysis in JSON format.
5
-
6
- Usage:
7
- ------
8
- Below is the usage information that can be generated by running alma_multi_word --help.
9
-
10
- .. code-block:: none
11
-
12
- alma_multi_word --multi_word=MULTI_WORD_TEXT
13
- alma_multi_word --file
14
-
15
- Options:
16
- --------
17
-
18
- .. code-block:: none
19
-
20
- --multi_word MULTI_WORD_TEXT
21
- The multi-word Arabic text that needs to be analyzed.
22
- --file
23
- File containing the multi-word text to be analyzed
24
-
25
- Examples:
26
- ---------
27
-
28
- .. code-block:: none
29
-
30
- alma_multi_word --multi_word "Your multi-word text here"
31
- alma_multi_word --file "path/to/your/file.txt"
32
-
33
- """
34
-
35
1
  import argparse
36
2
  from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
37
3
  import json
@@ -1,7 +1,7 @@
1
1
  """
2
2
  About:
3
3
  ------
4
- The morphology_analyzer command is designed to provide morphological analysis for Arabic text using the SinaTools morph_analyzer component. Users can specify the language and desired analysis task (lemmatization, part-of-speech tagging, or full morphological analysis), and flag.
4
+ The morphology_analyzer command is designed to provide morphological analysis for Arabic text using the SinaTools morph_analyzer API. Users can specify the language and desired analysis task (lemmatization, part-of-speech tagging, or full morphological analysis), and flag.
5
5
 
6
6
  Usage:
7
7
  ------
@@ -7,13 +7,26 @@ import argparse
7
7
  from sinatools.ner.entity_extractor import extract
8
8
 
9
9
  """
10
- This tool processes a csv file and returns named entites for each token within the text, based on the specified batch size. As follows:
10
+ The following command takes a CSV file as input. It splits a specific column into tokens and tags them using named entity recognition (NER). It retains all other columns as they are, and it also adds sentences and tokens. Additionally, it assigns an auto-incrementing ID, a sentence ID, and a global sentence ID to each token. As follows:
11
11
 
12
12
  Usage:
13
13
  ------
14
- Run the script with the following command:
15
-
16
- corpus_entity_extractor input.csv --text-columns "TextColumn1,TextColumn2" --additional-columns "Column3,Column4" --output-csv output.csv
14
+ Below is the usage information that can be generated by running corpus_entity_extractor --help.
15
+
16
+ corpus_entity_extractor --input_csv path/to/csv/file --text-columns "name of the column to be tokenized" --additional-columns "Column3,Column4" --output-csv path/to/csv/file
17
+
18
+ Options:
19
+ -------
20
+ --input_csv CSV_FILE_PATH
21
+ Path of csv file
22
+ --text-columns STR
23
+ Name of the text column that needs to be tagged
24
+ --additional-columns STR
25
+ Comma-separated names of the columns that are returned as they are
26
+ --output-csv CSV_FILE_PATH
27
+ Path of the output csv file
28
+
29
+ corpus_entity_extractor --input_csv "input.csv" --text-columns "TextColumn1" --additional-columns "Column3,Column4" --output-csv "output.csv"
17
30
  """
18
31
 
19
32
  def jsons_to_list_of_lists(json_list):
@@ -1,7 +1,7 @@
1
1
  """
2
2
  About:
3
3
  ------
4
- This tool processes an input text and returns named entites for each token within the text, based on the specified batch size. As follows:
4
+ This command processes an input text and returns named entities for each token within the text. As follows:
5
5
 
6
6
  Usage:
7
7
  ------
@@ -10,7 +10,7 @@ Below is the usage information that can be generated by running entity_extractor
10
10
  .. code-block:: none
11
11
 
12
12
  entity_extractor --text=INPUT_TEXT
13
- entity_extractor --dir=INPUT_FILE --output_csv=OUTPUT_FILE_NAME
13
+ entity_extractor --dir=DIRECTORY_PATH --output_csv "path/to/csv/file"
14
14
 
15
15
  Options:
16
16
  --------
@@ -18,11 +18,11 @@ Options:
18
18
  .. code-block:: none
19
19
 
20
20
  --text INPUT_TEXT
21
- The text that needs to be analyzed for Named Entity Recognition.
22
- --file INPUT_FILE
23
- File containing the text to be analyzed for Named Entity Recognition.
24
- --output_csv OUTPUT_FILE_NAME
25
- A file containing the tokenized text and its Named Entity tags.
21
+ The text that needs to be analyzed for Named Entity Recognition.
22
+ --dir DIRECTORY_PATH
23
+ Directory containing the text files to be analyzed for Named Entity Recognition
24
+ --output_csv CSV_FILE
25
+ The path for output csv file
26
26
 
27
27
 
28
28
  Examples:
@@ -31,7 +31,7 @@ Examples:
31
31
  .. code-block:: none
32
32
 
33
33
  entity_extractor --text "Your text here"
34
- entity_extractor --dir "/path/to/your/directory" --output_csv "output.csv"
34
+ entity_extractor --dir "path/to/your/dir" --output_csv "path/to/your/file"
35
35
 
36
36
  """
37
37
 
@@ -39,7 +39,7 @@ Examples:
39
39
 
40
40
  """
41
41
  import argparse
42
- from sinatools.utils.implication import Implication
42
+ from sinatools.utils.word_compare import Implication
43
43
 
44
44
  def read_file(file_path):
45
45
  with open(file_path, 'r', encoding='utf-8') as file:
@@ -72,8 +72,8 @@ def main():
72
72
  # Instantiate the Implication class
73
73
  implication_obj = Implication(word1, word2)
74
74
 
75
- # For this example, assuming there is a method `get_result()` in the Implication class.
76
- result = implication_obj.get_result()
75
+ # For this example, assuming there is a method `get_verdict()` in the Implication class.
76
+ result = implication_obj.get_verdict()
77
77
  print(result)
78
78
 
79
79
  if __name__ == '__main__':
@@ -46,7 +46,7 @@ Examples:
46
46
  """
47
47
 
48
48
  import argparse
49
- from sinatools.utils.jaccard import jaccard
49
+ from sinatools.utils.similarity import get_jaccard
50
50
  from sinatools.utils.readfile import read_file
51
51
 
52
52
 
@@ -76,7 +76,7 @@ def main():
76
76
  print("Either --file1 and --file2 arguments or both --set1 and --set2 arguments must be provided.")
77
77
  return
78
78
 
79
- similarity = jaccard(args.delimiter, set1, set2, args.selection, args.ignoreAllDiacriticsButNotShadda, args.ignoreShaddaDiacritic)
79
+ similarity = get_jaccard(args.delimiter, set1, set2, args.selection, args.ignoreAllDiacriticsButNotShadda, args.ignoreShaddaDiacritic)
80
80
 
81
81
  print("Jaccard Result:", similarity)
82
82
 
@@ -10,13 +10,13 @@ urls = {
10
10
  'ner': 'https://sina.birzeit.edu/Wj27012000.tar.gz',
11
11
  'wsd_model': 'https://sina.birzeit.edu/bert-base-arabertv02_22_May_2021_00h_allglosses_unused01.zip',
12
12
  'wsd_tokenizer': 'https://sina.birzeit.edu/bert-base-arabertv02.zip',
13
- 'glosses_dic': 'https://sina.birzeit.edu/glosses_dic.pickle',
13
+ 'one_gram': 'https://sina.birzeit.edu/one_gram.pickle',
14
14
  'five_grams': 'https://sina.birzeit.edu/five_grams.pickle',
15
15
  'four_grams':'https://sina.birzeit.edu/four_grams.pickle',
16
16
  'three_grams':'https://sina.birzeit.edu/three_grams.pickle',
17
17
  'two_grams':'https://sina.birzeit.edu/two_grams.pickle',
18
- 'synonyms_level2':'https://sina.birzeit.edu/synonyms_level2.pkl',
19
- 'synonyms_level3':'https://sina.birzeit.edu/synonyms_level3.pkl'
18
+ 'graph_l2':'https://sina.birzeit.edu/graph_l2.pkl',
19
+ 'graph_l3':'https://sina.birzeit.edu/graph_l3.pkl'
20
20
  }
21
21
 
22
22
  def get_appdatadir():
@@ -0,0 +1 @@
1
+ 0.1.28
@@ -24,27 +24,27 @@ def find_solution(token, language, flag):
24
24
 
25
25
  def analyze(text, language ='MSA', task ='full', flag="1"):
26
26
  """
27
- This method processes an input text and returns morphological analysis for each token within the text, based on the specified language, task, and flag. As follows:
28
- If:
29
- The task is lemmatization, the morphological solution includes only the lemma_id, lemma, token, and token frequency.
30
- The task is pos, the morphological solution includes only the part-of-speech, token, and token frequency.
31
- The task is root, the morphological solution includes only the root, token, and token frequency.
32
- The task is full, the morphological solution includes the lemma_id, lemma, part-of-speech, root, token, and token frequency.
27
+ This method processes an input text and returns morphological analysis for each token within the text, based on the specified language, task, and flag. You can try the demo online. See the article for more details.
28
+
29
+ * If the task is lemmatization, the morphological solution includes only the lemma_id, lemma, token, and token frequency.
30
+ * If the task is pos, the morphological solution includes only the part-of-speech, token, and token frequency.
31
+ * If the task is root, the morphological solution includes only the root, token, and token frequency.
32
+ * If the task is full, the morphological solution includes the lemma_id, lemma, part-of-speech, root, token, and token frequency.
33
33
 
34
- Args:
34
+ Parameters:
35
35
  text (:obj:`str`): The Arabic text to be morphologically analyzed.
36
- language (:obj:`str`): The type of the input text. Currently, only Modern Standard Arabic (MSA) is supported.
36
+ language (:obj:`str`): Currently, only Modern Standard Arabic (MSA) is supported.
37
37
  task (:obj:`str`): The task to filter the results by. Options are [lemmatization, pos, root, full]. The default task if not specified is `full`.
38
- flag (:obj:`str`): The flag to filter the returned results. If the flag is `1`, the solution with the highest frequency will be returned. If the flag is `*`, all solutions will be returned, ordered descendingly, with the highest frequency solution first. The default flag if not specified is `1`.
38
+ flag (:obj:`str`): The flag to filter the returned results. If the flag is `1`, the solution with the highest frequency will be returned. If the flag is `*`, all solutions will be returned, ordered descendingly, with the highest frequency solution first. The default flag if not specified is `1`.
39
39
 
40
40
  Returns:
41
41
  list (:obj:`list`): A list of JSON objects, where each JSON object may contain:
42
42
  token: The token from the original text.
43
- lemma: The lemma of the token.
44
- lemma_id: The id of the lemma.
45
- pos: The part-of-speech of the token.
46
- root: The root of the token.
47
- frequency: The frequency of the token.
43
+ lemma: The lemma of the token (Lemmas from the Qabas lexicon).
44
+ lemma_id: The id of the lemma (qabas lemma ids).
45
+ pos: The part-of-speech of the token (see Qabas POS tags).
46
+ root: The root of the token (qabas roots).
47
+ frequency: The frequency of the token (see section 3 in article).
48
48
 
49
49
  **Example:**
50
50
 
@@ -57,37 +57,36 @@ def analyze(text, language ='MSA', task ='full', flag="1"):
57
57
  #Example: task = full
58
58
  analyze('ذهب الولد الى المدرسة')
59
59
 
60
- [
61
- {
62
- "token": "ذهب",
63
- "lemma": "ذَهَبَ",
64
- "lemma_id": "202001617",
65
- "root": "ذ ه ب",
66
- "pos": "فعل ماضي",
67
- "frequency": "82202"
68
- },{
69
- "token": "الولد",
70
- "lemma": "وَلَدٌ",
71
- "lemma_id": "202003092",
72
- "root": "و ل د",
73
- "pos": "اسم",
74
- "frequency": "19066"
75
- },{
76
- "token": "إلى",
77
- "lemma": "إِلَى",
78
- "lemma_id": "202000856",
79
- "root": "إ ل ى",
80
- "pos": "حرف جر",
81
- "frequency": "7367507"
82
- },{
83
- "token": "المدرسة",
84
- "lemma": "مَدْرَسَةٌ",
85
- "lemma_id": "202002620",
86
- "root": "د ر س",
87
- "pos": "اسم",
88
- "frequency": "145285"
89
- }
90
- ]
60
+ [{
61
+ "token": "ذهب",
62
+ "lemma": "ذَهَبَ",
63
+ "lemma_id": "202001617",
64
+ "root": "ذ ه ب",
65
+ "pos": "فعل ماضي",
66
+ "frequency": "82202"
67
+ },{
68
+ "token": "الولد",
69
+ "lemma": "وَلَدٌ",
70
+ "lemma_id": "202003092",
71
+ "root": "و ل د",
72
+ "pos": "اسم",
73
+ "frequency": "19066"
74
+ },{
75
+ "token": "إلى",
76
+ "lemma": "إِلَى",
77
+ "lemma_id": "202000856",
78
+ "root": "إ ل ى",
79
+ "pos": "حرف جر",
80
+ "frequency": "7367507"
81
+ },{
82
+ "token": "المدرسة",
83
+ "lemma": "مَدْرَسَةٌ",
84
+ "lemma_id": "202002620",
85
+ "root": "د ر س",
86
+ "pos": "اسم",
87
+ "frequency": "145285"
88
+ }]
89
+
91
90
  """
92
91
 
93
92
  output_list = []
@@ -7,6 +7,8 @@ import torch
7
7
  import pickle
8
8
  import json
9
9
  from argparse import Namespace
10
+ from transformers import pipeline
11
+ #from transformers import AutoModelForSequenceClassification
10
12
 
11
13
  tagger = None
12
14
  tag_vocab = None
@@ -35,4 +37,7 @@ if torch.cuda.is_available():
35
37
 
36
38
  train_config.trainer_config["kwargs"]["model"] = model
37
39
  tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
38
- tagger.load(os.path.join(model_path,"checkpoints"))
40
+ tagger.load(os.path.join(model_path,"checkpoints"))
41
+
42
+ pipe = pipeline("sentiment-analysis", model= os.path.join(path, "best_model"), return_all_scores =True, max_length=128, truncation=True)
43
+ #pipe = AutoModelForSequenceClassification.from_pretrained(os.path.join(path, "best_model"))
@@ -0,0 +1,104 @@
1
+ import os
2
+ from collections import namedtuple
3
+ from sinatools.ner.data_format import get_dataloaders, text2segments
4
+ from . import tagger, tag_vocab, train_config
5
+
6
+
7
+ def convert_nested_to_flat(nested_tags):
8
+ flat_tags = []
9
+
10
+ for entry in nested_tags:
11
+ word = entry['token']
12
+ tags = entry['tags'].split()
13
+
14
+ # Initialize with the first tag in the sequence
15
+ flat_tag = tags[0]
16
+
17
+ for tag in tags[1:]:
18
+ # Check if the tag is an "I-" tag, indicating continuation of an entity
19
+ if tag.startswith('I-'):
20
+ flat_tag = tag
21
+ break
22
+
23
+ flat_tags.append({
24
+ 'token': word,
25
+ 'tags': flat_tag
26
+ })
27
+
28
+ return flat_tags
29
+
30
+ def extract(text, ner_method="nested"):
31
+ """
32
+ This method processes an input text and returns named entities for each token within the text. It supports 21 classes of entities. The method also supports flat and nested NER. You can try the demo online. See the article for details.
33
+
34
+ Args:
35
+ * text (:obj:`str`) – The Arabic text to be tagged.
36
+ * ner_method (:obj:`str`) – The NER method can produce either flat or nested output formats. The default method is nested.
37
+ nested method: If the method is nested, the output will include nested tags.
38
+ flat method: If the method is flat, the output will consist of only flat tags.
39
+ The choice between flat and nested methods determines the structure and detail of the named entity recognition output.
40
+
41
+ Returns:
42
+ A list of JSON objects, where each object contains:
43
+ token: The token from the original text.
44
+ tags: The NER tag(s) predicted for the token; nested tags are separated by a space, and "O" is used when no entity tag applies.
45
+
46
+ **Example:**
47
+
48
+ .. highlight:: python
49
+ .. code-block:: python
50
+
51
+ from sinatools.ner.entity_extractor import extract
52
+ #Example of nested ner. Notice that the last word in this sentence contains nested tags.
53
+ extract('ذهب محمد الى جامعة بيرزيت')
54
+ #the output
55
+ [{
56
+ "token":"ذهب",
57
+ "tags":"O"
58
+ },{
59
+ "token":"محمد",
60
+ "tags":"B-PERS"
61
+ },{
62
+ "token":"إلى",
63
+ "tags":"O"
64
+ },{
65
+ "token":"جامعة",
66
+ "tags":"B-ORG"
67
+ },{
68
+ "token":"بيرزيت",
69
+ "tags":"B-GPE I-ORG"
70
+ }]
71
+ """
72
+
73
+ dataset, token_vocab = text2segments(text)
74
+
75
+ vocabs = namedtuple("Vocab", ["tags", "tokens"])
76
+ vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
77
+
78
+ dataloader = get_dataloaders(
79
+ (dataset,),
80
+ vocab,
81
+ train_config.data_config,
82
+ batch_size=32,
83
+ shuffle=(False,),
84
+ )[0]
85
+
86
+
87
+ segments = tagger.infer(dataloader)
88
+ segments_lists = []
89
+
90
+ for segment in segments:
91
+ for token in segment:
92
+ segments_list = {}
93
+ segments_list["token"] = token.text
94
+ list_of_tags = [t['tag'] for t in token.pred_tag]
95
+ list_of_tags = [i for i in list_of_tags if i not in('O',' ','')]
96
+ if list_of_tags == []:
97
+ segments_list["tags"] = ' '.join(['O'])
98
+ else:
99
+ segments_list["tags"] = ' '.join(list_of_tags)
100
+ segments_lists.append(segments_list)
101
+
102
+ if ner_method == "flat":
103
+ segments_lists = convert_nested_to_flat(segments_lists)
104
+ return segments_lists