SinaTools 0.1.38__tar.gz → 0.1.39__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144)
  1. {sinatools-0.1.38 → sinatools-0.1.39}/PKG-INFO +11 -6
  2. {sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/PKG-INFO +11 -6
  3. sinatools-0.1.39/SinaTools.egg-info/requires.txt +9 -0
  4. {sinatools-0.1.38 → sinatools-0.1.39}/setup.py +4 -4
  5. sinatools-0.1.39/sinatools/VERSION +1 -0
  6. sinatools-0.1.39/sinatools/environment.yml +182 -0
  7. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/data/datasets.py +7 -3
  8. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/data_format.py +24 -12
  9. sinatools-0.1.39/sinatools/ner/helpers.py +117 -0
  10. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/trainers/BaseTrainer.py +2 -2
  11. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/trainers/BertNestedTrainer.py +203 -203
  12. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/trainers/BertTrainer.py +163 -163
  13. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/trainers/__init__.py +2 -2
  14. sinatools-0.1.38/SinaTools.egg-info/requires.txt +0 -11
  15. sinatools-0.1.38/sinatools/VERSION +0 -1
  16. sinatools-0.1.38/sinatools/environment.yml +0 -227
  17. sinatools-0.1.38/sinatools/ner/helpers.py +0 -86
  18. {sinatools-0.1.38 → sinatools-0.1.39}/AUTHORS.rst +0 -0
  19. {sinatools-0.1.38 → sinatools-0.1.39}/CONTRIBUTING.rst +0 -0
  20. {sinatools-0.1.38 → sinatools-0.1.39}/LICENSE +0 -0
  21. {sinatools-0.1.38 → sinatools-0.1.39}/MANIFEST.in +0 -0
  22. {sinatools-0.1.38 → sinatools-0.1.39}/README.rst +0 -0
  23. {sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/SOURCES.txt +0 -0
  24. {sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/dependency_links.txt +0 -0
  25. {sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/entry_points.txt +0 -0
  26. {sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/not-zip-safe +0 -0
  27. {sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/top_level.txt +0 -0
  28. {sinatools-0.1.38 → sinatools-0.1.39}/docs/Makefile +0 -0
  29. {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/_images/download.png +0 -0
  30. {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/_static/download.png +0 -0
  31. {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/_static/file.png +0 -0
  32. {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/_static/minus.png +0 -0
  33. {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/_static/plus.png +0 -0
  34. {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_images/SinaLogo.jpg +0 -0
  35. {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_images/download.png +0 -0
  36. {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_static/SinaLogo.jpg +0 -0
  37. {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_static/download.png +0 -0
  38. {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_static/file.png +0 -0
  39. {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_static/minus.png +0 -0
  40. {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_static/plus.png +0 -0
  41. {sinatools-0.1.38 → sinatools-0.1.39}/docs/make.bat +0 -0
  42. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/License.rst +0 -0
  43. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/Overview.rst +0 -0
  44. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/_static/SinaLogo.jpg +0 -0
  45. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/_static/download.png +0 -0
  46. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/about.rst +0 -0
  47. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/DataDownload/downloader.rst +0 -0
  48. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/DataDownload.rst +0 -0
  49. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/arabiner/bin/infer.rst +0 -0
  50. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/arabiner.rst +0 -0
  51. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/morphology/morph_analyzer.rst +0 -0
  52. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/morphology.rst +0 -0
  53. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/salma/views.rst +0 -0
  54. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/salma.rst +0 -0
  55. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils/corpus_tokenizer.rst +0 -0
  56. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils/implication.rst +0 -0
  57. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils/jaccard.rst +0 -0
  58. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils/parser.rst +0 -0
  59. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils/sentence_tokenizer.rst +0 -0
  60. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils/text_transliteration.rst +0 -0
  61. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils.rst +0 -0
  62. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api.rst +0 -0
  63. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/authors.rst +0 -0
  64. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/DataDownload/download_files.rst +0 -0
  65. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/DataDownload/get_appdatadir.rst +0 -0
  66. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/DataDownload.rst +0 -0
  67. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/arabiner/infer.rst +0 -0
  68. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/arabiner.rst +0 -0
  69. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/morphology/ALMA_multi_word.rst +0 -0
  70. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/morphology/morph_analyzer.rst +0 -0
  71. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/morphology.rst +0 -0
  72. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/salma/salma_tools.rst +0 -0
  73. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/salma.rst +0 -0
  74. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/arStrip.rst +0 -0
  75. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/corpus_tokenizer.rst +0 -0
  76. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/implication.rst +0 -0
  77. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/jaccard.rst +0 -0
  78. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/latin_remove.rst +0 -0
  79. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/remove_punc.rst +0 -0
  80. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/sentence_tokenizer.rst +0 -0
  81. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/text_transliteration.rst +0 -0
  82. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils.rst +0 -0
  83. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools.rst +0 -0
  84. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/conf.py +0 -0
  85. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/index.rst +0 -0
  86. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/installation.rst +0 -0
  87. {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/readme.rst +0 -0
  88. {sinatools-0.1.38 → sinatools-0.1.39}/setup.cfg +0 -0
  89. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/DataDownload/download_files.py +0 -0
  90. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/morphology/ALMA_multi_word.py +0 -0
  91. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/morphology/morph_analyzer.py +0 -0
  92. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/ner/corpus_entity_extractor.py +0 -0
  93. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/ner/entity_extractor.py +0 -0
  94. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/__init__.py +0 -0
  95. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/arStrip.py +0 -0
  96. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/corpus_tokenizer.py +0 -0
  97. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/implication.py +0 -0
  98. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/jaccard.py +0 -0
  99. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/remove_latin.py +0 -0
  100. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/remove_punctuation.py +0 -0
  101. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/sentence_tokenizer.py +0 -0
  102. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/text_dublication_detector.py +0 -0
  103. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/text_transliteration.py +0 -0
  104. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/DataDownload/__init__.py +0 -0
  105. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/DataDownload/downloader.py +0 -0
  106. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/__init__.py +0 -0
  107. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/arabert/__init__.py +0 -0
  108. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/arabert/preprocess.py +0 -0
  109. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/install_env.py +0 -0
  110. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/morphology/ALMA_multi_word.py +0 -0
  111. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/morphology/__init__.py +0 -0
  112. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/morphology/morph_analyzer.py +0 -0
  113. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/__init__.py +0 -0
  114. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/data/__init__.py +0 -0
  115. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/data/transforms.py +0 -0
  116. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/datasets.py +0 -0
  117. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/entity_extractor.py +0 -0
  118. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/metrics.py +0 -0
  119. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/nn/BaseModel.py +0 -0
  120. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/nn/BertNestedTagger.py +0 -0
  121. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/nn/BertSeqTagger.py +0 -0
  122. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/nn/__init__.py +0 -0
  123. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/transforms.py +0 -0
  124. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/relations/__init__.py +0 -0
  125. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/relations/relation_extractor.py +0 -0
  126. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/semantic_relatedness/__init__.py +0 -0
  127. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/semantic_relatedness/compute_relatedness.py +0 -0
  128. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/sinatools.py +0 -0
  129. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/synonyms/__init__.py +0 -0
  130. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/synonyms/synonyms_generator.py +0 -0
  131. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/__init__.py +0 -0
  132. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/charsets.py +0 -0
  133. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/parser.py +0 -0
  134. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/readfile.py +0 -0
  135. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/similarity.py +0 -0
  136. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/text_dublication_detector.py +0 -0
  137. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/text_transliteration.py +0 -0
  138. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/tokenizer.py +0 -0
  139. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/tokenizers_words.py +0 -0
  140. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/word_compare.py +0 -0
  141. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/wsd/__init__.py +0 -0
  142. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/wsd/disambiguator.py +0 -0
  143. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/wsd/settings.py +0 -0
  144. {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/wsd/wsd.py +0 -0
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: SinaTools
3
- Version: 0.1.38
3
+ Version: 0.1.39
4
4
  Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
5
5
  Home-page: https://github.com/SinaLab/sinatools
6
6
  License: MIT license
@@ -13,12 +13,17 @@ Requires-Dist: farasapy
13
13
  Requires-Dist: tqdm
14
14
  Requires-Dist: requests
15
15
  Requires-Dist: pathlib
16
- Requires-Dist: torch==1.13.0
17
- Requires-Dist: transformers==4.24.0
18
- Requires-Dist: torchtext==0.14.0
19
- Requires-Dist: torchvision==0.14.0
16
+ Requires-Dist: transformers==4.47.1
17
+ Requires-Dist: torchvision==0.20.1
20
18
  Requires-Dist: seqeval==1.2.2
21
19
  Requires-Dist: natsort==7.1.1
20
+ Dynamic: description
21
+ Dynamic: description-content-type
22
+ Dynamic: home-page
23
+ Dynamic: keywords
24
+ Dynamic: license
25
+ Dynamic: requires-dist
26
+ Dynamic: summary
22
27
 
23
28
  SinaTools
24
29
  ======================
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.1
1
+ Metadata-Version: 2.2
2
2
  Name: SinaTools
3
- Version: 0.1.38
3
+ Version: 0.1.39
4
4
  Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
5
5
  Home-page: https://github.com/SinaLab/sinatools
6
6
  License: MIT license
@@ -13,12 +13,17 @@ Requires-Dist: farasapy
13
13
  Requires-Dist: tqdm
14
14
  Requires-Dist: requests
15
15
  Requires-Dist: pathlib
16
- Requires-Dist: torch==1.13.0
17
- Requires-Dist: transformers==4.24.0
18
- Requires-Dist: torchtext==0.14.0
19
- Requires-Dist: torchvision==0.14.0
16
+ Requires-Dist: transformers==4.47.1
17
+ Requires-Dist: torchvision==0.20.1
20
18
  Requires-Dist: seqeval==1.2.2
21
19
  Requires-Dist: natsort==7.1.1
20
+ Dynamic: description
21
+ Dynamic: description-content-type
22
+ Dynamic: home-page
23
+ Dynamic: keywords
24
+ Dynamic: license
25
+ Dynamic: requires-dist
26
+ Dynamic: summary
22
27
 
23
28
  SinaTools
24
29
  ======================
@@ -0,0 +1,9 @@
1
+ six
2
+ farasapy
3
+ tqdm
4
+ requests
5
+ pathlib
6
+ transformers==4.47.1
7
+ torchvision==0.20.1
8
+ seqeval==1.2.2
9
+ natsort==7.1.1
@@ -18,10 +18,10 @@ requirements = [
18
18
  'requests',
19
19
  # 'regex',
20
20
  'pathlib',
21
- 'torch==1.13.0',
22
- 'transformers==4.24.0',
23
- 'torchtext==0.14.0',
24
- 'torchvision==0.14.0',
21
+ # 'torch==2.5.1',
22
+ 'transformers==4.47.1',
23
+ # 'torchtext==0.14.0',
24
+ 'torchvision==0.20.1',
25
25
  'seqeval==1.2.2',
26
26
  'natsort==7.1.1'
27
27
  ]
@@ -0,0 +1 @@
1
+ 0.1.39
@@ -0,0 +1,182 @@
1
+ name: dev
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - defaults
6
+ - https://repo.anaconda.com/pkgs/main
7
+ - https://repo.anaconda.com/pkgs/r
8
+ dependencies:
9
+ - _libgcc_mutex=0.1=main
10
+ - _openmp_mutex=5.1=1_gnu
11
+ - _sysroot_linux-64_curr_repodata_hack=3=haa98f57_10
12
+ - binutils_impl_linux-64=2.40=h5293946_0
13
+ - binutils_linux-64=2.40.0=hc2dff05_1
14
+ - blas=1.0=mkl
15
+ - brotli-python=1.0.9=py311h6a678d5_8
16
+ - bzip2=1.0.8=h5eee18b_6
17
+ - ca-certificates=2024.11.26=h06a4308_0
18
+ - certifi=2024.12.14=py311h06a4308_0
19
+ - charset-normalizer=3.3.2=pyhd3eb1b0_0
20
+ - cuda-cudart=12.4.127=0
21
+ - cuda-cupti=12.4.127=0
22
+ - cuda-libraries=12.4.1=0
23
+ - cuda-nvrtc=12.4.127=0
24
+ - cuda-nvtx=12.4.127=0
25
+ - cuda-opencl=12.4.127=0
26
+ - cuda-runtime=12.4.1=0
27
+ - cuda-version=11.7=h6a555f7_3
28
+ - cudatoolkit=11.7.0=hd8887f6_10
29
+ - ffmpeg=4.3=hf484d3e_0
30
+ - filelock=3.13.1=py311h06a4308_0
31
+ - freetype=2.12.1=h4a9f257_0
32
+ - fsspec=2024.6.1=py311h06a4308_0
33
+ - gcc_impl_linux-64=11.2.0=h1234567_1
34
+ - gcc_linux-64=11.2.0=h5c386dc_1
35
+ - giflib=5.2.2=h5eee18b_0
36
+ - gmp=6.2.1=h295c915_3
37
+ - gmpy2=2.1.2=py311hc9b5ff0_0
38
+ - gnutls=3.6.15=he1e5248_0
39
+ - gxx_impl_linux-64=11.2.0=h1234567_1
40
+ - gxx_linux-64=11.2.0=hc2dff05_1
41
+ - idna=3.7=py311h06a4308_0
42
+ - intel-openmp=2023.1.0=hdb19cb5_46306
43
+ - jinja2=3.1.4=py311h06a4308_1
44
+ - jpeg=9e=h5eee18b_3
45
+ - kernel-headers_linux-64=3.10.0=h57e8cba_10
46
+ - lame=3.100=h7b6447c_0
47
+ - lcms2=2.16=hb9589c4_0
48
+ - ld_impl_linux-64=2.40=h12ee557_0
49
+ - lerc=4.0.0=h6a678d5_0
50
+ - libabseil=20240116.2=cxx17_h6a678d5_0
51
+ - libcublas=12.4.5.8=0
52
+ - libcufft=11.2.1.3=0
53
+ - libcufile=1.9.1.3=0
54
+ - libcurand=10.3.5.147=0
55
+ - libcusolver=11.6.1.9=0
56
+ - libcusparse=12.3.1.170=0
57
+ - libdeflate=1.22=h5eee18b_0
58
+ - libffi=3.4.4=h6a678d5_1
59
+ - libgcc-devel_linux-64=11.2.0=h1234567_1
60
+ - libgcc-ng=11.2.0=h1234567_1
61
+ - libgomp=11.2.0=h1234567_1
62
+ - libiconv=1.16=h5eee18b_3
63
+ - libidn2=2.3.4=h5eee18b_0
64
+ - libjpeg-turbo=2.0.0=h9bf148f_0
65
+ - libnpp=12.2.5.30=0
66
+ - libnvfatbin=12.4.127=0
67
+ - libnvjitlink=12.4.127=0
68
+ - libnvjpeg=12.3.1.117=0
69
+ - libpng=1.6.39=h5eee18b_0
70
+ - libprotobuf=4.25.3=he621ea3_0
71
+ - libstdcxx-devel_linux-64=11.2.0=h1234567_1
72
+ - libstdcxx-ng=11.2.0=h1234567_1
73
+ - libtasn1=4.19.0=h5eee18b_0
74
+ - libtiff=4.5.1=hffd6297_1
75
+ - libunistring=0.9.10=h27cfd23_0
76
+ - libuuid=1.41.5=h5eee18b_0
77
+ - libwebp=1.3.2=h11a3e52_0
78
+ - libwebp-base=1.3.2=h5eee18b_1
79
+ - llvm-openmp=14.0.6=h9e868ea_0
80
+ - lz4-c=1.9.4=h6a678d5_1
81
+ - markupsafe=2.1.3=py311h5eee18b_0
82
+ - mkl=2023.1.0=h213fc3f_46344
83
+ - mkl-service=2.4.0=py311h5eee18b_1
84
+ - mkl_fft=1.3.11=py311h5eee18b_0
85
+ - mkl_random=1.2.8=py311ha02d727_0
86
+ - mpc=1.1.0=h10f8cd9_1
87
+ - mpfr=4.0.2=hb69a4c5_1
88
+ - mpmath=1.3.0=py311h06a4308_0
89
+ - ncurses=6.4=h6a678d5_0
90
+ - nettle=3.7.3=hbbd107a_1
91
+ - networkx=3.2.1=py311h06a4308_0
92
+ - numpy=2.0.1=py311h08b1b3b_1
93
+ - numpy-base=2.0.1=py311hf175353_1
94
+ - openh264=2.1.1=h4ff587b_0
95
+ - openjpeg=2.5.2=he7f1fd0_0
96
+ - openssl=3.0.15=h5eee18b_0
97
+ - pillow=11.0.0=py311hcea889d_1
98
+ - pip=24.2=py311h06a4308_0
99
+ - pysocks=1.7.1=py311h06a4308_0
100
+ - python=3.11.11=he870216_0
101
+ - pytorch=2.5.1=py3.11_cuda12.4_cudnn9.1.0_0
102
+ - pytorch-cuda=12.4=hc786d27_7
103
+ - pytorch-mutex=1.0=cuda
104
+ - pyyaml=6.0.2=py311h5eee18b_0
105
+ - readline=8.2=h5eee18b_0
106
+ - requests=2.32.3=py311h06a4308_1
107
+ - setuptools=75.1.0=py311h06a4308_0
108
+ - sqlite=3.45.3=h5eee18b_0
109
+ - sysroot_linux-64=2.17=h57e8cba_10
110
+ - tbb=2021.8.0=hdb19cb5_0
111
+ - tk=8.6.14=h39e8969_0
112
+ - torchaudio=2.5.1=py311_cu124
113
+ - torchtriton=3.1.0=py311
114
+ - torchvision=0.20.1=py311_cu124
115
+ - typing_extensions=4.12.2=py311h06a4308_0
116
+ - urllib3=2.2.3=py311h06a4308_0
117
+ - wheel=0.44.0=py311h06a4308_0
118
+ - xz=5.4.6=h5eee18b_1
119
+ - yaml=0.2.5=h7b6447c_0
120
+ - zlib=1.2.13=h5eee18b_1
121
+ - zstd=1.5.6=hc292b87_0
122
+ - pip:
123
+ - absl-py==2.1.0
124
+ - accelerate==1.2.1
125
+ - aiohappyeyeballs==2.4.4
126
+ - aiohttp==3.11.11
127
+ - aiosignal==1.3.2
128
+ - annotated-types==0.7.0
129
+ - attrs==24.3.0
130
+ - datasets==3.2.0
131
+ - deepspeed==0.16.2
132
+ - dill==0.3.8
133
+ - einops==0.8.0
134
+ - flash-attn==2.7.2.post1
135
+ - frozenlist==1.5.0
136
+ - grpcio==1.70.0
137
+ - hjson==3.1.0
138
+ - huggingface-hub==0.27.0
139
+ - joblib==1.4.2
140
+ - markdown==3.7
141
+ - markdown-it-py==3.0.0
142
+ - mdurl==0.1.2
143
+ - mpi4py==4.0.1
144
+ - msgpack==1.1.0
145
+ - multidict==6.1.0
146
+ - multiprocess==0.70.16
147
+ - natsort==8.4.0
148
+ - ninja==1.11.1.3
149
+ - nvidia-ml-py==12.560.30
150
+ - packaging==24.2
151
+ - pandas==2.2.3
152
+ - peft==0.14.0
153
+ - propcache==0.2.1
154
+ - protobuf==6.30.0
155
+ - psutil==6.1.1
156
+ - py-cpuinfo==9.0.0
157
+ - pyarrow==18.1.0
158
+ - pydantic==2.10.4
159
+ - pydantic-core==2.27.2
160
+ - pygments==2.18.0
161
+ - python-dateutil==2.9.0.post0
162
+ - pytz==2024.2
163
+ - regex==2024.11.6
164
+ - rich==13.9.4
165
+ - safetensors==0.4.5
166
+ - scikit-learn==1.6.1
167
+ - scipy==1.15.2
168
+ - seqeval==1.2.2
169
+ - six==1.17.0
170
+ - sympy==1.13.1
171
+ - tensorboard==2.19.0
172
+ - tensorboard-data-server==0.7.2
173
+ - threadpoolctl==3.5.0
174
+ - tokenizers==0.21.0
175
+ - tqdm==4.67.1
176
+ - transformers==4.47.1
177
+ - trl==0.12.0
178
+ - tzdata==2024.2
179
+ - werkzeug==3.1.3
180
+ - xxhash==3.5.0
181
+ - yarl==1.18.3
182
+
@@ -37,7 +37,11 @@ class Token:
37
37
  :return: str
38
38
  """
39
39
  gold_tags = "|".join(self.gold_tag)
40
- pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
40
+
41
+ if self.pred_tag:
42
+ pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
43
+ else:
44
+ pred_tags = ""
41
45
 
42
46
  if self.gold_tag:
43
47
  r = f"{self.text}\t{gold_tags}\t{pred_tags}"
@@ -139,8 +143,8 @@ class NestedTagsDataset(Dataset):
139
143
  masks = torch.cat(masks)
140
144
 
141
145
  # Pad the tags, do the padding for each tag type
142
- tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["<pad>"])(tag)
146
+ tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["O"])(tag)
143
147
  for tag, vocab in zip(tags, self.vocab.tags[1:])]
144
148
  tags = torch.cat(tags)
145
149
 
146
- return subwords, tags, tokens, masks, valid_len
150
+ return subwords, tags, tokens, masks, valid_len
@@ -1,16 +1,30 @@
1
1
  from torch.utils.data import DataLoader
2
- from torchtext.vocab import vocab
3
2
  from collections import Counter, namedtuple
4
3
  import logging
5
4
  import re
6
5
  import itertools
7
6
  from sinatools.ner.helpers import load_object
8
- from sinatools.ner.datasets import Token
9
- from sinatools.utils.tokenizers_words import simple_word_tokenize
7
+ from sinatools.ner.data.datasets import Token
10
8
 
11
9
  logger = logging.getLogger(__name__)
12
10
 
13
11
 
12
+ class Vocab:
13
+ def __init__(self, counter, specials=[]) -> None:
14
+ self.itos = list(counter.keys()) + specials
15
+ self.stoi = {s: i for i, s in enumerate(self.itos)}
16
+ self.word_count = counter
17
+
18
+ def get_itos(self) -> list[str]:
19
+ return self.itos
20
+
21
+ def get_stoi(self) -> dict[str, int]:
22
+ return self.stoi
23
+
24
+ def __len__(self):
25
+ return len(self.itos)
26
+
27
+
14
28
  def conll_to_segments(filename):
15
29
  """
16
30
  Convert CoNLL files to segments. This return list of segments and each segment is
@@ -60,8 +74,8 @@ def parse_conll_files(data_paths):
60
74
 
61
75
  # Generate vocabs for tags and tokens
62
76
  tag_vocabs = tag_vocab_by_type(tags)
63
- tag_vocabs.insert(0, vocab(Counter(tags)))
64
- vocabs = vocabs(tokens=vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
77
+ tag_vocabs.insert(0, Vocab(Counter(tags)))
78
+ vocabs = vocabs(tokens=Vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
65
79
  return tuple(datasets), vocabs
66
80
 
67
81
 
@@ -72,9 +86,9 @@ def tag_vocab_by_type(tags):
72
86
  tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))
73
87
 
74
88
  for tag_type in tag_types:
75
- r = re.compile(".*-" + tag_type)
89
+ r = re.compile(".*-" + tag_type + "$")
76
90
  t = list(filter(r.match, tags)) + ["O"]
77
- vocabs.append(vocab(Counter(t), specials=["<pad>"]))
91
+ vocabs.append(Vocab(Counter(t)))
78
92
 
79
93
  return vocabs
80
94
 
@@ -83,13 +97,11 @@ def text2segments(text):
83
97
  """
84
98
  Convert text to a datasets and index the tokens
85
99
  """
86
- #dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
87
- list_of_tokens = simple_word_tokenize(text)
88
- dataset = [[Token(text=token, gold_tag=["O"]) for token in list_of_tokens]]
100
+ dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
89
101
  tokens = [token.text for segment in dataset for token in segment]
90
102
 
91
103
  # Generate vocabs for the tokens
92
- segment_vocab = vocab(Counter(tokens), specials=["UNK"])
104
+ segment_vocab = Vocab(Counter(tokens), specials=["UNK"])
93
105
  return dataset, segment_vocab
94
106
 
95
107
 
@@ -121,4 +133,4 @@ def get_dataloaders(
121
133
  logger.info("%s batches found", len(dataloader))
122
134
  dataloaders.append(dataloader)
123
135
 
124
- return dataloaders
136
+ return dataloaders
@@ -0,0 +1,117 @@
1
+ import os
2
+ import sys
3
+ import logging
4
+ import importlib
5
+ import shutil
6
+ import torch
7
+ import pickle
8
+ import json
9
+ import random
10
+ import numpy as np
11
+ from argparse import Namespace
12
+
13
+
14
+ def logging_config(log_file=None):
15
+ """
16
+ Initialize custom logger
17
+ :param log_file: str - path to log file, full path
18
+ :return: None
19
+ """
20
+ handlers = [logging.StreamHandler(sys.stdout)]
21
+
22
+ if log_file:
23
+ handlers.append(logging.FileHandler(log_file, "w", "utf-8"))
24
+ print("Logging to {}".format(log_file))
25
+
26
+ logging.basicConfig(
27
+ level=logging.INFO,
28
+ handlers=handlers,
29
+ format="%(levelname)s\t%(name)s\t%(asctime)s\t%(message)s",
30
+ datefmt="%a, %d %b %Y %H:%M:%S",
31
+ force=True
32
+ )
33
+
34
+
35
+ def load_object(name, kwargs):
36
+ """
37
+ Load objects dynamically given the object name and its arguments
38
+ :param name: str - object name, class name or function name
39
+ :param kwargs: dict - keyword arguments
40
+ :return: object
41
+ """
42
+ object_module, object_name = name.rsplit(".", 1)
43
+ object_module = importlib.import_module(object_module)
44
+ fn = getattr(object_module, object_name)(**kwargs)
45
+ return fn
46
+
47
+
48
+ def make_output_dirs(path, subdirs=[], overwrite=True):
49
+ """
50
+ Create root directory and any other sub-directories
51
+ :param path: str - root directory
52
+ :param subdirs: List[str] - list of sub-directories
53
+ :param overwrite: boolean - to overwrite the directory or not
54
+ :return: None
55
+ """
56
+ if overwrite:
57
+ shutil.rmtree(path, ignore_errors=True)
58
+
59
+ os.makedirs(path)
60
+
61
+ for subdir in subdirs:
62
+ os.makedirs(os.path.join(path, subdir))
63
+
64
+
65
+ def load_checkpoint(model_path):
66
+ """
67
+ Load model given the model path
68
+ :param model_path: str - path to model
69
+ :return: tagger - arabiner.trainers.BaseTrainer - the tagger model
70
+ vocab - arabicner.utils.data.Vocab - indexed tags
71
+ train_config - argparse.Namespace - training configurations
72
+ """
73
+ with open(os.path.join(model_path, "tag_vocab.pkl"), "rb") as fh:
74
+ tag_vocab = pickle.load(fh)
75
+
76
+ # Load train configurations from checkpoint
77
+ train_config = Namespace()
78
+ with open(os.path.join(model_path, "args.json"), "r") as fh:
79
+ train_config.__dict__ = json.load(fh)
80
+
81
+ # Initialize the loss function, not used for inference, but evaluation
82
+ loss = load_object(train_config.loss["fn"], train_config.loss["kwargs"])
83
+
84
+ # Load BERT tagger
85
+ model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
86
+ model = torch.nn.DataParallel(model)
87
+
88
+ if torch.cuda.is_available():
89
+ model = model.cuda()
90
+
91
+ # Update arguments for the tagger
92
+ # Attach the model, loss (used for evaluations cases)
93
+ train_config.trainer_config["kwargs"]["model"] = model
94
+ train_config.trainer_config["kwargs"]["loss"] = loss
95
+
96
+ tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
97
+ tagger.load(os.path.join(model_path, "checkpoints"))
98
+ return tagger, tag_vocab, train_config
99
+
100
+
101
+ def set_seed(seed):
102
+ """
103
+ Set the seed for random initialization and set
104
+ cuDNN parameters to ensure deterministic results across
105
+ multiple runs with the same seed
106
+
107
+ :param seed: int
108
+ """
109
+ np.random.seed(seed)
110
+ random.seed(seed)
111
+ torch.manual_seed(seed)
112
+ torch.cuda.manual_seed(seed)
113
+ torch.cuda.manual_seed_all(seed)
114
+
115
+ torch.backends.cudnn.deterministic = True
116
+ torch.backends.cudnn.benchmark = False
117
+ torch.backends.cudnn.enabled = False
@@ -113,5 +113,5 @@ class BaseTrainer:
113
113
  logger.info("Loading checkpoint %s", checkpoint_path)
114
114
 
115
115
  device = None if torch.cuda.is_available() else torch.device('cpu')
116
- checkpoint = torch.load(checkpoint_path, map_location=device)
117
- self.model.load_state_dict(checkpoint["model"], strict=False)
116
+ checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
117
+ self.model.load_state_dict(checkpoint["model"], strict=False)