SinaTools 0.1.36.tar.gz → 0.1.38.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. sinatools-0.1.38/PKG-INFO +62 -0
  2. sinatools-0.1.38/SinaTools.egg-info/PKG-INFO +62 -0
  3. {SinaTools-0.1.36 → sinatools-0.1.38}/SinaTools.egg-info/SOURCES.txt +0 -50
  4. {SinaTools-0.1.36 → sinatools-0.1.38}/SinaTools.egg-info/entry_points.txt +0 -1
  5. {SinaTools-0.1.36 → sinatools-0.1.38}/SinaTools.egg-info/requires.txt +0 -1
  6. {SinaTools-0.1.36 → sinatools-0.1.38}/setup.py +1 -1
  7. sinatools-0.1.38/sinatools/VERSION +1 -0
  8. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  9. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/trainers/BertTrainer.py +163 -163
  10. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/trainers/__init__.py +2 -2
  11. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/utils/similarity.py +62 -27
  12. SinaTools-0.1.36/PKG-INFO +0 -50
  13. SinaTools-0.1.36/SinaTools.egg-info/PKG-INFO +0 -50
  14. SinaTools-0.1.36/sinatools/VERSION +0 -1
  15. SinaTools-0.1.36/sinatools/arabert/arabert/__init__.py +0 -14
  16. SinaTools-0.1.36/sinatools/arabert/arabert/create_classification_data.py +0 -260
  17. SinaTools-0.1.36/sinatools/arabert/arabert/create_pretraining_data.py +0 -534
  18. SinaTools-0.1.36/sinatools/arabert/arabert/extract_features.py +0 -444
  19. SinaTools-0.1.36/sinatools/arabert/arabert/lamb_optimizer.py +0 -158
  20. SinaTools-0.1.36/sinatools/arabert/arabert/modeling.py +0 -1027
  21. SinaTools-0.1.36/sinatools/arabert/arabert/optimization.py +0 -202
  22. SinaTools-0.1.36/sinatools/arabert/arabert/run_classifier.py +0 -1078
  23. SinaTools-0.1.36/sinatools/arabert/arabert/run_pretraining.py +0 -593
  24. SinaTools-0.1.36/sinatools/arabert/arabert/run_squad.py +0 -1440
  25. SinaTools-0.1.36/sinatools/arabert/arabert/tokenization.py +0 -414
  26. SinaTools-0.1.36/sinatools/arabert/araelectra/__init__.py +0 -1
  27. SinaTools-0.1.36/sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
  28. SinaTools-0.1.36/sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
  29. SinaTools-0.1.36/sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
  30. SinaTools-0.1.36/sinatools/arabert/araelectra/configure_finetuning.py +0 -172
  31. SinaTools-0.1.36/sinatools/arabert/araelectra/configure_pretraining.py +0 -143
  32. SinaTools-0.1.36/sinatools/arabert/araelectra/finetune/__init__.py +0 -14
  33. SinaTools-0.1.36/sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
  34. SinaTools-0.1.36/sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
  35. SinaTools-0.1.36/sinatools/arabert/araelectra/finetune/scorer.py +0 -54
  36. SinaTools-0.1.36/sinatools/arabert/araelectra/finetune/task.py +0 -74
  37. SinaTools-0.1.36/sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
  38. SinaTools-0.1.36/sinatools/arabert/araelectra/flops_computation.py +0 -215
  39. SinaTools-0.1.36/sinatools/arabert/araelectra/model/__init__.py +0 -14
  40. SinaTools-0.1.36/sinatools/arabert/araelectra/model/modeling.py +0 -1029
  41. SinaTools-0.1.36/sinatools/arabert/araelectra/model/optimization.py +0 -193
  42. SinaTools-0.1.36/sinatools/arabert/araelectra/model/tokenization.py +0 -355
  43. SinaTools-0.1.36/sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
  44. SinaTools-0.1.36/sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
  45. SinaTools-0.1.36/sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
  46. SinaTools-0.1.36/sinatools/arabert/araelectra/run_finetuning.py +0 -323
  47. SinaTools-0.1.36/sinatools/arabert/araelectra/run_pretraining.py +0 -469
  48. SinaTools-0.1.36/sinatools/arabert/araelectra/util/__init__.py +0 -14
  49. SinaTools-0.1.36/sinatools/arabert/araelectra/util/training_utils.py +0 -112
  50. SinaTools-0.1.36/sinatools/arabert/araelectra/util/utils.py +0 -109
  51. SinaTools-0.1.36/sinatools/arabert/aragpt2/__init__.py +0 -2
  52. SinaTools-0.1.36/sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
  53. SinaTools-0.1.36/sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
  54. SinaTools-0.1.36/sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
  55. SinaTools-0.1.36/sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
  56. SinaTools-0.1.36/sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
  57. SinaTools-0.1.36/sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
  58. SinaTools-0.1.36/sinatools/arabert/aragpt2/grover/modeling.py +0 -803
  59. SinaTools-0.1.36/sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
  60. SinaTools-0.1.36/sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
  61. SinaTools-0.1.36/sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
  62. SinaTools-0.1.36/sinatools/arabert/aragpt2/grover/utils.py +0 -234
  63. SinaTools-0.1.36/sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
  64. SinaTools-0.1.36/sinatools/utils/__init__.py +0 -0
  65. {SinaTools-0.1.36 → sinatools-0.1.38}/AUTHORS.rst +0 -0
  66. {SinaTools-0.1.36 → sinatools-0.1.38}/CONTRIBUTING.rst +0 -0
  67. {SinaTools-0.1.36 → sinatools-0.1.38}/LICENSE +0 -0
  68. {SinaTools-0.1.36 → sinatools-0.1.38}/MANIFEST.in +0 -0
  69. {SinaTools-0.1.36 → sinatools-0.1.38}/README.rst +0 -0
  70. {SinaTools-0.1.36 → sinatools-0.1.38}/SinaTools.egg-info/dependency_links.txt +0 -0
  71. {SinaTools-0.1.36 → sinatools-0.1.38}/SinaTools.egg-info/not-zip-safe +0 -0
  72. {SinaTools-0.1.36 → sinatools-0.1.38}/SinaTools.egg-info/top_level.txt +0 -0
  73. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/Makefile +0 -0
  74. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/build/_images/download.png +0 -0
  75. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/build/_static/download.png +0 -0
  76. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/build/_static/file.png +0 -0
  77. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/build/_static/minus.png +0 -0
  78. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/build/_static/plus.png +0 -0
  79. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/build/html/_images/SinaLogo.jpg +0 -0
  80. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/build/html/_images/download.png +0 -0
  81. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/build/html/_static/SinaLogo.jpg +0 -0
  82. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/build/html/_static/download.png +0 -0
  83. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/build/html/_static/file.png +0 -0
  84. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/build/html/_static/minus.png +0 -0
  85. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/build/html/_static/plus.png +0 -0
  86. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/make.bat +0 -0
  87. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/License.rst +0 -0
  88. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/Overview.rst +0 -0
  89. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/_static/SinaLogo.jpg +0 -0
  90. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/_static/download.png +0 -0
  91. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/about.rst +0 -0
  92. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/DataDownload/downloader.rst +0 -0
  93. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/DataDownload.rst +0 -0
  94. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/arabiner/bin/infer.rst +0 -0
  95. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/arabiner.rst +0 -0
  96. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/morphology/morph_analyzer.rst +0 -0
  97. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/morphology.rst +0 -0
  98. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/salma/views.rst +0 -0
  99. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/salma.rst +0 -0
  100. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/utils/corpus_tokenizer.rst +0 -0
  101. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/utils/implication.rst +0 -0
  102. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/utils/jaccard.rst +0 -0
  103. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/utils/parser.rst +0 -0
  104. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/utils/sentence_tokenizer.rst +0 -0
  105. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/utils/text_transliteration.rst +0 -0
  106. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api/utils.rst +0 -0
  107. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/api.rst +0 -0
  108. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/authors.rst +0 -0
  109. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/DataDownload/download_files.rst +0 -0
  110. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/DataDownload/get_appdatadir.rst +0 -0
  111. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/DataDownload.rst +0 -0
  112. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/arabiner/infer.rst +0 -0
  113. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/arabiner.rst +0 -0
  114. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/morphology/ALMA_multi_word.rst +0 -0
  115. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/morphology/morph_analyzer.rst +0 -0
  116. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/morphology.rst +0 -0
  117. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/salma/salma_tools.rst +0 -0
  118. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/salma.rst +0 -0
  119. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/utils/arStrip.rst +0 -0
  120. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/utils/corpus_tokenizer.rst +0 -0
  121. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/utils/implication.rst +0 -0
  122. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/utils/jaccard.rst +0 -0
  123. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/utils/latin_remove.rst +0 -0
  124. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/utils/remove_punc.rst +0 -0
  125. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/utils/sentence_tokenizer.rst +0 -0
  126. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/utils/text_transliteration.rst +0 -0
  127. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools/utils.rst +0 -0
  128. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/cli_tools.rst +0 -0
  129. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/conf.py +0 -0
  130. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/index.rst +0 -0
  131. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/installation.rst +0 -0
  132. {SinaTools-0.1.36 → sinatools-0.1.38}/docs/source/readme.rst +0 -0
  133. {SinaTools-0.1.36 → sinatools-0.1.38}/setup.cfg +0 -0
  134. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/DataDownload/download_files.py +0 -0
  135. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/morphology/ALMA_multi_word.py +0 -0
  136. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/morphology/morph_analyzer.py +0 -0
  137. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/ner/corpus_entity_extractor.py +0 -0
  138. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/ner/entity_extractor.py +0 -0
  139. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/utils/__init__.py +0 -0
  140. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/utils/arStrip.py +0 -0
  141. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/utils/corpus_tokenizer.py +0 -0
  142. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/utils/implication.py +0 -0
  143. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/utils/jaccard.py +0 -0
  144. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/utils/remove_latin.py +0 -0
  145. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/utils/remove_punctuation.py +0 -0
  146. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/utils/sentence_tokenizer.py +0 -0
  147. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/utils/text_dublication_detector.py +0 -0
  148. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/CLI/utils/text_transliteration.py +0 -0
  149. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/DataDownload/__init__.py +0 -0
  150. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/DataDownload/downloader.py +0 -0
  151. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/__init__.py +0 -0
  152. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/arabert/__init__.py +0 -0
  153. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/arabert/preprocess.py +0 -0
  154. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/environment.yml +0 -0
  155. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/install_env.py +0 -0
  156. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/morphology/ALMA_multi_word.py +0 -0
  157. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/morphology/__init__.py +0 -0
  158. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/morphology/morph_analyzer.py +0 -0
  159. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/__init__.py +0 -0
  160. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/data/__init__.py +0 -0
  161. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/data/datasets.py +0 -0
  162. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/data/transforms.py +0 -0
  163. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/data_format.py +0 -0
  164. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/datasets.py +0 -0
  165. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/entity_extractor.py +0 -0
  166. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/helpers.py +0 -0
  167. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/metrics.py +0 -0
  168. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/nn/BaseModel.py +0 -0
  169. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/nn/BertNestedTagger.py +0 -0
  170. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/nn/BertSeqTagger.py +0 -0
  171. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/nn/__init__.py +0 -0
  172. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/trainers/BaseTrainer.py +0 -0
  173. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/transforms.py +0 -0
  174. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/relations/__init__.py +0 -0
  175. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/relations/relation_extractor.py +0 -0
  176. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/semantic_relatedness/__init__.py +0 -0
  177. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/semantic_relatedness/compute_relatedness.py +0 -0
  178. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/sinatools.py +0 -0
  179. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/synonyms/__init__.py +0 -0
  180. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/synonyms/synonyms_generator.py +0 -0
  181. {SinaTools-0.1.36/sinatools/arabert/aragpt2/grover → sinatools-0.1.38/sinatools/utils}/__init__.py +0 -0
  182. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/utils/charsets.py +0 -0
  183. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/utils/parser.py +0 -0
  184. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/utils/readfile.py +0 -0
  185. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/utils/text_dublication_detector.py +0 -0
  186. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/utils/text_transliteration.py +0 -0
  187. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/utils/tokenizer.py +0 -0
  188. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/utils/tokenizers_words.py +0 -0
  189. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/utils/word_compare.py +0 -0
  190. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/wsd/__init__.py +0 -0
  191. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/wsd/disambiguator.py +0 -0
  192. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/wsd/settings.py +0 -0
  193. {SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/wsd/wsd.py +0 -0
sinatools-0.1.38/PKG-INFO
@@ -0,0 +1,62 @@
+ Metadata-Version: 2.1
+ Name: SinaTools
+ Version: 0.1.38
+ Summary: Open-source Python toolkit for Arabic Natural Language Understanding, allowing people to integrate it into their system workflows.
+ Home-page: https://github.com/SinaLab/sinatools
+ License: MIT license
+ Keywords: sinatools
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ License-File: AUTHORS.rst
+ Requires-Dist: six
+ Requires-Dist: farasapy
+ Requires-Dist: tqdm
+ Requires-Dist: requests
+ Requires-Dist: pathlib
+ Requires-Dist: torch==1.13.0
+ Requires-Dist: transformers==4.24.0
+ Requires-Dist: torchtext==0.14.0
+ Requires-Dist: torchvision==0.14.0
+ Requires-Dist: seqeval==1.2.2
+ Requires-Dist: natsort==7.1.1
+
+ SinaTools
+ ======================
+ An open-source toolkit for Arabic NLP and NLU developed by [SinaLab](http://sina.birzeit.edu/) at Birzeit University. SinaTools is available through Python APIs, command-line tools, Colab notebooks, and online demos.
+
+ See the full list of [Available Packages](https://sina.birzeit.edu/sinatools/), which include: (1) [Morphology Tagging](https://sina.birzeit.edu/sinatools/index.html#morph), (2) [Named Entity Recognition (NER)](https://sina.birzeit.edu/sinatools/index.html#ner), (3) [Word Sense Disambiguation (WSD)](https://sina.birzeit.edu/sinatools/index.html#wsd), (4) [Semantic Relatedness](https://sina.birzeit.edu/sinatools/index.html#sr), (5) [Synonymy Extraction and Evaluation](https://sina.birzeit.edu/sinatools/index.html#se), (6) [Relation Extraction](https://sina.birzeit.edu/sinatools/index.html#re), (7) [Utilities](https://sina.birzeit.edu/sinatools/index.html#u) (diacritic-based word matching, Jaccard similarity, parser, tokenizers, corpora processing, transliteration, etc.).
+
+ See [Demo Pages](https://sina.birzeit.edu/sinatools/).
+
+ See the [benchmarking](https://www.jarrar.info/publications/HJK24.pdf), which shows that SinaTools outperformed all related toolkits.
+
+ Installation
+ --------
+ To install SinaTools, ensure you are using Python 3.10.8, then clone the [GitHub](git://github.com/SinaLab/SinaTools) repository.
+
+ Alternatively, you can execute the following command:
+
+ ```bash
+ pip install sinatools
+ ```
+
+ Installing Models and Data Files
+ --------
+ Some SinaTools modules require data files and fine-tuned models to be downloaded first. To download these models, please consult the [DataDownload](https://sina.birzeit.edu/sinatools/documentation/cli_tools/DataDownload/DataDownload.html) documentation.
+
+ Documentation
+ --------
+ For more information, please refer to the [main page](https://sina.birzeit.edu/sinatools) or the [online documentation](https://sina.birzeit.edu/sinatools/documentation).
+
+ Citation
+ -------
+ Tymaa Hammouda, Mustafa Jarrar, Mohammed Khalilia: [SinaTools: Open Source Toolkit for Arabic Natural Language Understanding](http://www.jarrar.info/publications/HJK24.pdf). In Proceedings of the 2024 AI in Computational Linguistics (ACLing 2024), Procedia Computer Science, Dubai. Elsevier.
+
+ License
+ --------
+ SinaTools is available under the MIT License. See the [LICENSE](https://github.com/SinaLab/sinatools/blob/main/LICENSE) file for more information.
+
+ Reporting Issues
+ --------
+ To report any issues or bugs, please contact us at sina.institute.bzu@gmail.com or visit [SinaTools Issues](https://github.com/SinaLab/sinatools/issues).
+
sinatools-0.1.38/SinaTools.egg-info/PKG-INFO
@@ -0,0 +1,62 @@ (identical to the PKG-INFO hunk above)
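Since the metadata above pins exact dependency versions and the README targets Python 3.10.8, a quick post-install sanity check can catch mismatches early. A minimal sketch using only the standard library (the pins are taken from the Requires-Dist lines above; everything else is illustrative):

```python
import sys
from importlib.metadata import version

# Pins declared in the SinaTools 0.1.38 metadata shown above.
expected = {
    "torch": "1.13.0",
    "transformers": "4.24.0",
    "torchtext": "0.14.0",
    "torchvision": "0.14.0",
    "seqeval": "1.2.2",
    "natsort": "7.1.1",
}

print("Python", sys.version.split()[0])  # README recommends 3.10.8
for pkg, want in expected.items():
    got = version(pkg)
    status = "OK" if got == want else "expected " + want
    print(pkg, got, status)
```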
{SinaTools-0.1.36 → sinatools-0.1.38}/SinaTools.egg-info/SOURCES.txt
@@ -96,56 +96,6 @@ sinatools/DataDownload/__init__.py
  sinatools/DataDownload/downloader.py
  sinatools/arabert/__init__.py
  sinatools/arabert/preprocess.py
- sinatools/arabert/arabert/__init__.py
- sinatools/arabert/arabert/create_classification_data.py
- sinatools/arabert/arabert/create_pretraining_data.py
- sinatools/arabert/arabert/extract_features.py
- sinatools/arabert/arabert/lamb_optimizer.py
- sinatools/arabert/arabert/modeling.py
- sinatools/arabert/arabert/optimization.py
- sinatools/arabert/arabert/run_classifier.py
- sinatools/arabert/arabert/run_pretraining.py
- sinatools/arabert/arabert/run_squad.py
- sinatools/arabert/arabert/tokenization.py
- sinatools/arabert/araelectra/__init__.py
- sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py
- sinatools/arabert/araelectra/build_pretraining_dataset.py
- sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py
- sinatools/arabert/araelectra/configure_finetuning.py
- sinatools/arabert/araelectra/configure_pretraining.py
- sinatools/arabert/araelectra/flops_computation.py
- sinatools/arabert/araelectra/run_finetuning.py
- sinatools/arabert/araelectra/run_pretraining.py
- sinatools/arabert/araelectra/finetune/__init__.py
- sinatools/arabert/araelectra/finetune/feature_spec.py
- sinatools/arabert/araelectra/finetune/preprocessing.py
- sinatools/arabert/araelectra/finetune/scorer.py
- sinatools/arabert/araelectra/finetune/task.py
- sinatools/arabert/araelectra/finetune/task_builder.py
- sinatools/arabert/araelectra/model/__init__.py
- sinatools/arabert/araelectra/model/modeling.py
- sinatools/arabert/araelectra/model/optimization.py
- sinatools/arabert/araelectra/model/tokenization.py
- sinatools/arabert/araelectra/pretrain/__init__.py
- sinatools/arabert/araelectra/pretrain/pretrain_data.py
- sinatools/arabert/araelectra/pretrain/pretrain_helpers.py
- sinatools/arabert/araelectra/util/__init__.py
- sinatools/arabert/araelectra/util/training_utils.py
- sinatools/arabert/araelectra/util/utils.py
- sinatools/arabert/aragpt2/__init__.py
- sinatools/arabert/aragpt2/create_pretraining_data.py
- sinatools/arabert/aragpt2/train_bpe_tokenizer.py
- sinatools/arabert/aragpt2/gpt2/__init__.py
- sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py
- sinatools/arabert/aragpt2/gpt2/optimization.py
- sinatools/arabert/aragpt2/gpt2/run_pretraining.py
- sinatools/arabert/aragpt2/grover/__init__.py
- sinatools/arabert/aragpt2/grover/dataloader.py
- sinatools/arabert/aragpt2/grover/modeling.py
- sinatools/arabert/aragpt2/grover/modeling_gpt2.py
- sinatools/arabert/aragpt2/grover/optimization_adafactor.py
- sinatools/arabert/aragpt2/grover/train_tpu.py
- sinatools/arabert/aragpt2/grover/utils.py
  sinatools/morphology/ALMA_multi_word.py
  sinatools/morphology/__init__.py
  sinatools/morphology/morph_analyzer.py
{SinaTools-0.1.36 → sinatools-0.1.38}/SinaTools.egg-info/entry_points.txt
@@ -20,4 +20,3 @@ sentence_tokenizer = sinatools.CLI.utils.sentence_tokenizer:main
  text_dublication_detector = sinatools.CLI.utils.text_dublication_detector:main
  transliterate = sinatools.CLI.utils.text_transliteration:main
  wsd = sinatools.CLI.wsd.disambiguator:main
-
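Each line in entry_points.txt maps a console command to a Python callable, so the `wsd` command above is roughly equivalent to the following sketch (the command-line arguments that `main` accepts are not shown in this diff):

```python
# Conceptual equivalent of the "wsd" console script generated by setuptools:
# import the target module and invoke its main() entry point.
from sinatools.CLI.wsd import disambiguator

if __name__ == "__main__":
    disambiguator.main()
```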
{SinaTools-0.1.36 → sinatools-0.1.38}/SinaTools.egg-info/requires.txt
@@ -2,7 +2,6 @@ six
  farasapy
  tqdm
  requests
- regex
  pathlib
  torch==1.13.0
  transformers==4.24.0
{SinaTools-0.1.36 → sinatools-0.1.38}/setup.py
@@ -16,7 +16,7 @@ requirements = [
      'farasapy',
      'tqdm',
      'requests',
-     'regex',
+     # 'regex',
      'pathlib',
      'torch==1.13.0',
      'transformers==4.24.0',
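The only functional change in setup.py is commenting out the `regex` requirement, mirroring its removal from requires.txt above. After upgrading, the declared dependency list can be inspected to confirm this; a small sketch using the standard library:

```python
from importlib.metadata import requires

# Dependencies declared by the installed distribution; for 0.1.38 this
# list should no longer contain "regex".
deps = requires("SinaTools") or []
print("\n".join(deps))
assert all(not d.startswith("regex") for d in deps)
```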
sinatools-0.1.38/sinatools/VERSION
@@ -0,0 +1 @@
+ 0.1.38
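The new single-line VERSION file is the kind of data file a package typically reads at import time to expose `__version__`. A hypothetical sketch of that pattern (SinaTools' actual `__init__.py` is not shown in this diff, so this is illustrative only):

```python
from pathlib import Path

# Hypothetical: read the single-line VERSION file shipped alongside the module.
_version_file = Path(__file__).parent / "VERSION"
__version__ = _version_file.read_text(encoding="utf-8").strip()  # "0.1.38"
```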
{SinaTools-0.1.36 → sinatools-0.1.38}/sinatools/ner/trainers/BertNestedTrainer.py
@@ -1,203 +1,203 @@
(Every line of this file was removed and re-added with identical content, which usually indicates a line-ending or whitespace-normalization rewrite; the file is shown once below.)

import os
import logging
import torch
import numpy as np
from sinatools.ner.trainers import BaseTrainer
from sinatools.ner.metrics import compute_nested_metrics

logger = logging.getLogger(__name__)


class BertNestedTrainer(BaseTrainer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def train(self):
        best_val_loss, test_loss = np.inf, np.inf
        num_train_batch = len(self.train_dataloader)
        num_labels = [len(v) for v in self.train_dataloader.dataset.vocab.tags[1:]]
        patience = self.patience

        for epoch_index in range(self.max_epochs):
            self.current_epoch = epoch_index
            train_loss = 0

            for batch_index, (subwords, gold_tags, tokens, valid_len, logits) in enumerate(self.tag(
                self.train_dataloader, is_train=True
            ), 1):
                self.current_timestep += 1

                # Compute losses for each output
                # logits = B x T x L x C
                losses = [self.loss(logits[:, :, i, 0:l].view(-1, logits[:, :, i, 0:l].shape[-1]),
                                    torch.reshape(gold_tags[:, i, :], (-1,)).long())
                          for i, l in enumerate(num_labels)]

                torch.autograd.backward(losses)

                # Avoid exploding gradients by doing gradient clipping
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)

                self.optimizer.step()
                self.scheduler.step()
                batch_loss = sum(l.item() for l in losses)
                train_loss += batch_loss

                if self.current_timestep % self.log_interval == 0:
                    logger.info(
                        "Epoch %d | Batch %d/%d | Timestep %d | LR %.10f | Loss %f",
                        epoch_index,
                        batch_index,
                        num_train_batch,
                        self.current_timestep,
                        self.optimizer.param_groups[0]['lr'],
                        batch_loss
                    )

            train_loss /= num_train_batch

            logger.info("** Evaluating on validation dataset **")
            val_preds, segments, valid_len, val_loss = self.eval(self.val_dataloader)
            val_metrics = compute_nested_metrics(segments, self.val_dataloader.dataset.transform.vocab.tags[1:])

            epoch_summary_loss = {
                "train_loss": train_loss,
                "val_loss": val_loss
            }
            epoch_summary_metrics = {
                "val_micro_f1": val_metrics.micro_f1,
                "val_precision": val_metrics.precision,
                "val_recall": val_metrics.recall
            }

            logger.info(
                "Epoch %d | Timestep %d | Train Loss %f | Val Loss %f | F1 %f",
                epoch_index,
                self.current_timestep,
                train_loss,
                val_loss,
                val_metrics.micro_f1
            )

            if val_loss < best_val_loss:
                patience = self.patience
                best_val_loss = val_loss
                logger.info("** Validation improved, evaluating test data **")
                test_preds, segments, valid_len, test_loss = self.eval(self.test_dataloader)
                self.segments_to_file(segments, os.path.join(self.output_path, "predictions.txt"))
                test_metrics = compute_nested_metrics(segments, self.test_dataloader.dataset.transform.vocab.tags[1:])

                epoch_summary_loss["test_loss"] = test_loss
                epoch_summary_metrics["test_micro_f1"] = test_metrics.micro_f1
                epoch_summary_metrics["test_precision"] = test_metrics.precision
                epoch_summary_metrics["test_recall"] = test_metrics.recall

                logger.info(
                    "Epoch %d | Timestep %d | Test Loss %f | F1 %f",
                    epoch_index,
                    self.current_timestep,
                    test_loss,
                    test_metrics.micro_f1
                )

                self.save()
            else:
                patience -= 1

                # No improvement, terminate early
                if patience == 0:
                    logger.info("Early termination triggered")
                    break

            self.summary_writer.add_scalars("Loss", epoch_summary_loss, global_step=self.current_timestep)
            self.summary_writer.add_scalars("Metrics", epoch_summary_metrics, global_step=self.current_timestep)

    def tag(self, dataloader, is_train=True):
        """
        Given a dataloader containing segments, predict the tags
        :param dataloader: torch.utils.data.DataLoader
        :param is_train: boolean - True for training mode, False for evaluation
        :return: Iterator
                 subwords (B x T x NUM_LABELS) - torch.Tensor - BERT subword IDs
                 gold_tags (B x T x NUM_LABELS) - torch.Tensor - ground truth tag IDs
                 tokens - List[arabiner.data.dataset.Token] - list of tokens
                 valid_len (B x 1) - int - valid length of each sequence
                 logits (B x T x NUM_LABELS) - logits for each token and each tag
        """
        for subwords, gold_tags, tokens, mask, valid_len in dataloader:
            self.model.train(is_train)

            if torch.cuda.is_available():
                subwords = subwords.cuda()
                gold_tags = gold_tags.cuda()

            if is_train:
                self.optimizer.zero_grad()
                logits = self.model(subwords)
            else:
                with torch.no_grad():
                    logits = self.model(subwords)

            yield subwords, gold_tags, tokens, valid_len, logits

    def eval(self, dataloader):
        golds, preds, segments, valid_lens = list(), list(), list(), list()
        num_labels = [len(v) for v in dataloader.dataset.vocab.tags[1:]]
        loss = 0

        for _, gold_tags, tokens, valid_len, logits in self.tag(
            dataloader, is_train=False
        ):
            losses = [self.loss(logits[:, :, i, 0:l].view(-1, logits[:, :, i, 0:l].shape[-1]),
                                torch.reshape(gold_tags[:, i, :], (-1,)).long())
                      for i, l in enumerate(num_labels)]
            loss += sum(losses)
            preds += torch.argmax(logits, dim=3)
            segments += tokens
            valid_lens += list(valid_len)

        loss /= len(dataloader)

        # Update segments, attach predicted tags to each token
        segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)

        return preds, segments, valid_lens, loss

    def infer(self, dataloader):
        golds, preds, segments, valid_lens = list(), list(), list(), list()

        for _, gold_tags, tokens, valid_len, logits in self.tag(
            dataloader, is_train=False
        ):
            preds += torch.argmax(logits, dim=3)
            segments += tokens
            valid_lens += list(valid_len)

        segments = self.to_segments(segments, preds, valid_lens, dataloader.dataset.vocab)
        return segments

    def to_segments(self, segments, preds, valid_lens, vocab):
        if vocab is None:
            vocab = self.vocab

        tagged_segments = list()
        tokens_stoi = vocab.tokens.get_stoi()
        unk_id = tokens_stoi["UNK"]

        for segment, pred, valid_len in zip(segments, preds, valid_lens):
            # Skip the [CLS] token at index 0 and the [SEP] token at the end,
            # then combine the tokens with their corresponding predictions
            segment_pred = zip(segment[1:valid_len-1], pred[1:valid_len-1])

            # Ignore the sub-tokens/subwords, which are identified by their text being UNK
            segment_pred = list(filter(lambda t: tokens_stoi[t[0].text] != unk_id, segment_pred))

            # Attach the predicted tags to each token (one tag per nesting layer)
            list(map(lambda t: setattr(t[0], 'pred_tag', [{"tag": tag_vocab.get_itos()[tag_id]}
                     for tag_id, tag_vocab in zip(t[1].int().tolist(), vocab.tags[1:])]), segment_pred))

            # We are only interested in the tagged tokens; we no longer need the raw model predictions
            tagged_segment = [t for t, _ in segment_pred]
            tagged_segments.append(tagged_segment)

        return tagged_segments
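The heart of the training step above is computing one cross-entropy loss per tag layer from a B x T x L x C logits tensor and backpropagating their sum. A self-contained sketch of that pattern with made-up shapes (illustrative only, not SinaTools' actual data pipeline):

```python
import torch
import torch.nn as nn

B, T, L = 4, 16, 2                 # batch size, sequence length, tag layers
num_labels = [9, 5]                # classes per layer (illustrative)
C = max(num_labels)                # logits padded to the widest layer

logits = torch.randn(B, T, L, C, requires_grad=True)
gold = torch.stack(                # B x L x T gold tag ids, one row per layer
    [torch.randint(0, n, (B, T)) for n in num_labels], dim=1
)

loss_fn = nn.CrossEntropyLoss()
losses = [
    loss_fn(
        logits[:, :, i, :n].reshape(-1, n),  # flatten to (B*T) x n
        gold[:, i, :].reshape(-1),
    )
    for i, n in enumerate(num_labels)
]
torch.autograd.backward(losses)    # equivalent to backward on sum(losses)
print([round(l.item(), 4) for l in losses])
```

Note the sketch uses reshape rather than view, since slicing one layer out of the logits tensor can produce a non-contiguous view.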