SinaTools 0.1.38__tar.gz → 0.1.39__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sinatools-0.1.38 → sinatools-0.1.39}/PKG-INFO +11 -6
- {sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/PKG-INFO +11 -6
- sinatools-0.1.39/SinaTools.egg-info/requires.txt +9 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/setup.py +4 -4
- sinatools-0.1.39/sinatools/VERSION +1 -0
- sinatools-0.1.39/sinatools/environment.yml +182 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/data/datasets.py +7 -3
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/data_format.py +24 -12
- sinatools-0.1.39/sinatools/ner/helpers.py +117 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/trainers/BaseTrainer.py +2 -2
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/trainers/BertNestedTrainer.py +203 -203
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/trainers/BertTrainer.py +163 -163
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/trainers/__init__.py +2 -2
- sinatools-0.1.38/SinaTools.egg-info/requires.txt +0 -11
- sinatools-0.1.38/sinatools/VERSION +0 -1
- sinatools-0.1.38/sinatools/environment.yml +0 -227
- sinatools-0.1.38/sinatools/ner/helpers.py +0 -86
- {sinatools-0.1.38 → sinatools-0.1.39}/AUTHORS.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/CONTRIBUTING.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/LICENSE +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/MANIFEST.in +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/README.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/SOURCES.txt +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/dependency_links.txt +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/entry_points.txt +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/not-zip-safe +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/SinaTools.egg-info/top_level.txt +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/Makefile +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/_images/download.png +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/_static/download.png +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/_static/file.png +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/_static/minus.png +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/_static/plus.png +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_images/SinaLogo.jpg +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_images/download.png +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_static/SinaLogo.jpg +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_static/download.png +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_static/file.png +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_static/minus.png +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/build/html/_static/plus.png +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/make.bat +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/License.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/Overview.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/_static/SinaLogo.jpg +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/_static/download.png +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/about.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/DataDownload/downloader.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/DataDownload.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/arabiner/bin/infer.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/arabiner.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/morphology/morph_analyzer.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/morphology.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/salma/views.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/salma.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils/corpus_tokenizer.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils/implication.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils/jaccard.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils/parser.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils/sentence_tokenizer.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils/text_transliteration.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api/utils.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/api.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/authors.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/DataDownload/download_files.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/DataDownload/get_appdatadir.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/DataDownload.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/arabiner/infer.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/arabiner.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/morphology/ALMA_multi_word.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/morphology/morph_analyzer.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/morphology.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/salma/salma_tools.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/salma.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/arStrip.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/corpus_tokenizer.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/implication.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/jaccard.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/latin_remove.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/remove_punc.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/sentence_tokenizer.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils/text_transliteration.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools/utils.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/cli_tools.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/conf.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/index.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/installation.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/docs/source/readme.rst +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/setup.cfg +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/DataDownload/download_files.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/morphology/ALMA_multi_word.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/morphology/morph_analyzer.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/ner/corpus_entity_extractor.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/ner/entity_extractor.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/arStrip.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/corpus_tokenizer.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/implication.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/jaccard.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/remove_latin.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/remove_punctuation.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/sentence_tokenizer.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/text_dublication_detector.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/CLI/utils/text_transliteration.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/DataDownload/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/DataDownload/downloader.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/arabert/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/arabert/preprocess.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/install_env.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/morphology/ALMA_multi_word.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/morphology/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/morphology/morph_analyzer.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/data/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/data/transforms.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/datasets.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/entity_extractor.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/metrics.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/nn/BaseModel.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/nn/BertNestedTagger.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/nn/BertSeqTagger.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/nn/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/ner/transforms.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/relations/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/relations/relation_extractor.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/semantic_relatedness/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/semantic_relatedness/compute_relatedness.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/sinatools.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/synonyms/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/synonyms/synonyms_generator.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/charsets.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/parser.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/readfile.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/similarity.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/text_dublication_detector.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/text_transliteration.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/tokenizer.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/tokenizers_words.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/utils/word_compare.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/wsd/__init__.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/wsd/disambiguator.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/wsd/settings.py +0 -0
- {sinatools-0.1.38 → sinatools-0.1.39}/sinatools/wsd/wsd.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: SinaTools
|
3
|
-
Version: 0.1.38
|
3
|
+
Version: 0.1.39
|
4
4
|
Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
|
5
5
|
Home-page: https://github.com/SinaLab/sinatools
|
6
6
|
License: MIT license
|
@@ -13,12 +13,17 @@ Requires-Dist: farasapy
|
|
13
13
|
Requires-Dist: tqdm
|
14
14
|
Requires-Dist: requests
|
15
15
|
Requires-Dist: pathlib
|
16
|
-
Requires-Dist:
|
17
|
-
Requires-Dist:
|
18
|
-
Requires-Dist: torchtext==0.14.0
|
19
|
-
Requires-Dist: torchvision==0.14.0
|
16
|
+
Requires-Dist: transformers==4.47.1
|
17
|
+
Requires-Dist: torchvision==0.20.1
|
20
18
|
Requires-Dist: seqeval==1.2.2
|
21
19
|
Requires-Dist: natsort==7.1.1
|
20
|
+
Dynamic: description
|
21
|
+
Dynamic: description-content-type
|
22
|
+
Dynamic: home-page
|
23
|
+
Dynamic: keywords
|
24
|
+
Dynamic: license
|
25
|
+
Dynamic: requires-dist
|
26
|
+
Dynamic: summary
|
22
27
|
|
23
28
|
SinaTools
|
24
29
|
======================
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: SinaTools
|
3
|
-
Version: 0.1.38
|
3
|
+
Version: 0.1.39
|
4
4
|
Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
|
5
5
|
Home-page: https://github.com/SinaLab/sinatools
|
6
6
|
License: MIT license
|
@@ -13,12 +13,17 @@ Requires-Dist: farasapy
|
|
13
13
|
Requires-Dist: tqdm
|
14
14
|
Requires-Dist: requests
|
15
15
|
Requires-Dist: pathlib
|
16
|
-
Requires-Dist:
|
17
|
-
Requires-Dist:
|
18
|
-
Requires-Dist: torchtext==0.14.0
|
19
|
-
Requires-Dist: torchvision==0.14.0
|
16
|
+
Requires-Dist: transformers==4.47.1
|
17
|
+
Requires-Dist: torchvision==0.20.1
|
20
18
|
Requires-Dist: seqeval==1.2.2
|
21
19
|
Requires-Dist: natsort==7.1.1
|
20
|
+
Dynamic: description
|
21
|
+
Dynamic: description-content-type
|
22
|
+
Dynamic: home-page
|
23
|
+
Dynamic: keywords
|
24
|
+
Dynamic: license
|
25
|
+
Dynamic: requires-dist
|
26
|
+
Dynamic: summary
|
22
27
|
|
23
28
|
SinaTools
|
24
29
|
======================
|
@@ -18,10 +18,10 @@ requirements = [
|
|
18
18
|
'requests',
|
19
19
|
# 'regex',
|
20
20
|
'pathlib',
|
21
|
-
'torch==
|
22
|
-
'transformers==4.
|
23
|
-
'torchtext==0.14.0',
|
24
|
-
'torchvision==0.
|
21
|
+
# 'torch==2.5.1',
|
22
|
+
'transformers==4.47.1',
|
23
|
+
# 'torchtext==0.14.0',
|
24
|
+
'torchvision==0.20.1',
|
25
25
|
'seqeval==1.2.2',
|
26
26
|
'natsort==7.1.1'
|
27
27
|
]
|
@@ -0,0 +1 @@
|
|
1
|
+
0.1.39
|
@@ -0,0 +1,182 @@
|
|
1
|
+
name: dev
|
2
|
+
channels:
|
3
|
+
- pytorch
|
4
|
+
- nvidia
|
5
|
+
- defaults
|
6
|
+
- https://repo.anaconda.com/pkgs/main
|
7
|
+
- https://repo.anaconda.com/pkgs/r
|
8
|
+
dependencies:
|
9
|
+
- _libgcc_mutex=0.1=main
|
10
|
+
- _openmp_mutex=5.1=1_gnu
|
11
|
+
- _sysroot_linux-64_curr_repodata_hack=3=haa98f57_10
|
12
|
+
- binutils_impl_linux-64=2.40=h5293946_0
|
13
|
+
- binutils_linux-64=2.40.0=hc2dff05_1
|
14
|
+
- blas=1.0=mkl
|
15
|
+
- brotli-python=1.0.9=py311h6a678d5_8
|
16
|
+
- bzip2=1.0.8=h5eee18b_6
|
17
|
+
- ca-certificates=2024.11.26=h06a4308_0
|
18
|
+
- certifi=2024.12.14=py311h06a4308_0
|
19
|
+
- charset-normalizer=3.3.2=pyhd3eb1b0_0
|
20
|
+
- cuda-cudart=12.4.127=0
|
21
|
+
- cuda-cupti=12.4.127=0
|
22
|
+
- cuda-libraries=12.4.1=0
|
23
|
+
- cuda-nvrtc=12.4.127=0
|
24
|
+
- cuda-nvtx=12.4.127=0
|
25
|
+
- cuda-opencl=12.4.127=0
|
26
|
+
- cuda-runtime=12.4.1=0
|
27
|
+
- cuda-version=11.7=h6a555f7_3
|
28
|
+
- cudatoolkit=11.7.0=hd8887f6_10
|
29
|
+
- ffmpeg=4.3=hf484d3e_0
|
30
|
+
- filelock=3.13.1=py311h06a4308_0
|
31
|
+
- freetype=2.12.1=h4a9f257_0
|
32
|
+
- fsspec=2024.6.1=py311h06a4308_0
|
33
|
+
- gcc_impl_linux-64=11.2.0=h1234567_1
|
34
|
+
- gcc_linux-64=11.2.0=h5c386dc_1
|
35
|
+
- giflib=5.2.2=h5eee18b_0
|
36
|
+
- gmp=6.2.1=h295c915_3
|
37
|
+
- gmpy2=2.1.2=py311hc9b5ff0_0
|
38
|
+
- gnutls=3.6.15=he1e5248_0
|
39
|
+
- gxx_impl_linux-64=11.2.0=h1234567_1
|
40
|
+
- gxx_linux-64=11.2.0=hc2dff05_1
|
41
|
+
- idna=3.7=py311h06a4308_0
|
42
|
+
- intel-openmp=2023.1.0=hdb19cb5_46306
|
43
|
+
- jinja2=3.1.4=py311h06a4308_1
|
44
|
+
- jpeg=9e=h5eee18b_3
|
45
|
+
- kernel-headers_linux-64=3.10.0=h57e8cba_10
|
46
|
+
- lame=3.100=h7b6447c_0
|
47
|
+
- lcms2=2.16=hb9589c4_0
|
48
|
+
- ld_impl_linux-64=2.40=h12ee557_0
|
49
|
+
- lerc=4.0.0=h6a678d5_0
|
50
|
+
- libabseil=20240116.2=cxx17_h6a678d5_0
|
51
|
+
- libcublas=12.4.5.8=0
|
52
|
+
- libcufft=11.2.1.3=0
|
53
|
+
- libcufile=1.9.1.3=0
|
54
|
+
- libcurand=10.3.5.147=0
|
55
|
+
- libcusolver=11.6.1.9=0
|
56
|
+
- libcusparse=12.3.1.170=0
|
57
|
+
- libdeflate=1.22=h5eee18b_0
|
58
|
+
- libffi=3.4.4=h6a678d5_1
|
59
|
+
- libgcc-devel_linux-64=11.2.0=h1234567_1
|
60
|
+
- libgcc-ng=11.2.0=h1234567_1
|
61
|
+
- libgomp=11.2.0=h1234567_1
|
62
|
+
- libiconv=1.16=h5eee18b_3
|
63
|
+
- libidn2=2.3.4=h5eee18b_0
|
64
|
+
- libjpeg-turbo=2.0.0=h9bf148f_0
|
65
|
+
- libnpp=12.2.5.30=0
|
66
|
+
- libnvfatbin=12.4.127=0
|
67
|
+
- libnvjitlink=12.4.127=0
|
68
|
+
- libnvjpeg=12.3.1.117=0
|
69
|
+
- libpng=1.6.39=h5eee18b_0
|
70
|
+
- libprotobuf=4.25.3=he621ea3_0
|
71
|
+
- libstdcxx-devel_linux-64=11.2.0=h1234567_1
|
72
|
+
- libstdcxx-ng=11.2.0=h1234567_1
|
73
|
+
- libtasn1=4.19.0=h5eee18b_0
|
74
|
+
- libtiff=4.5.1=hffd6297_1
|
75
|
+
- libunistring=0.9.10=h27cfd23_0
|
76
|
+
- libuuid=1.41.5=h5eee18b_0
|
77
|
+
- libwebp=1.3.2=h11a3e52_0
|
78
|
+
- libwebp-base=1.3.2=h5eee18b_1
|
79
|
+
- llvm-openmp=14.0.6=h9e868ea_0
|
80
|
+
- lz4-c=1.9.4=h6a678d5_1
|
81
|
+
- markupsafe=2.1.3=py311h5eee18b_0
|
82
|
+
- mkl=2023.1.0=h213fc3f_46344
|
83
|
+
- mkl-service=2.4.0=py311h5eee18b_1
|
84
|
+
- mkl_fft=1.3.11=py311h5eee18b_0
|
85
|
+
- mkl_random=1.2.8=py311ha02d727_0
|
86
|
+
- mpc=1.1.0=h10f8cd9_1
|
87
|
+
- mpfr=4.0.2=hb69a4c5_1
|
88
|
+
- mpmath=1.3.0=py311h06a4308_0
|
89
|
+
- ncurses=6.4=h6a678d5_0
|
90
|
+
- nettle=3.7.3=hbbd107a_1
|
91
|
+
- networkx=3.2.1=py311h06a4308_0
|
92
|
+
- numpy=2.0.1=py311h08b1b3b_1
|
93
|
+
- numpy-base=2.0.1=py311hf175353_1
|
94
|
+
- openh264=2.1.1=h4ff587b_0
|
95
|
+
- openjpeg=2.5.2=he7f1fd0_0
|
96
|
+
- openssl=3.0.15=h5eee18b_0
|
97
|
+
- pillow=11.0.0=py311hcea889d_1
|
98
|
+
- pip=24.2=py311h06a4308_0
|
99
|
+
- pysocks=1.7.1=py311h06a4308_0
|
100
|
+
- python=3.11.11=he870216_0
|
101
|
+
- pytorch=2.5.1=py3.11_cuda12.4_cudnn9.1.0_0
|
102
|
+
- pytorch-cuda=12.4=hc786d27_7
|
103
|
+
- pytorch-mutex=1.0=cuda
|
104
|
+
- pyyaml=6.0.2=py311h5eee18b_0
|
105
|
+
- readline=8.2=h5eee18b_0
|
106
|
+
- requests=2.32.3=py311h06a4308_1
|
107
|
+
- setuptools=75.1.0=py311h06a4308_0
|
108
|
+
- sqlite=3.45.3=h5eee18b_0
|
109
|
+
- sysroot_linux-64=2.17=h57e8cba_10
|
110
|
+
- tbb=2021.8.0=hdb19cb5_0
|
111
|
+
- tk=8.6.14=h39e8969_0
|
112
|
+
- torchaudio=2.5.1=py311_cu124
|
113
|
+
- torchtriton=3.1.0=py311
|
114
|
+
- torchvision=0.20.1=py311_cu124
|
115
|
+
- typing_extensions=4.12.2=py311h06a4308_0
|
116
|
+
- urllib3=2.2.3=py311h06a4308_0
|
117
|
+
- wheel=0.44.0=py311h06a4308_0
|
118
|
+
- xz=5.4.6=h5eee18b_1
|
119
|
+
- yaml=0.2.5=h7b6447c_0
|
120
|
+
- zlib=1.2.13=h5eee18b_1
|
121
|
+
- zstd=1.5.6=hc292b87_0
|
122
|
+
- pip:
|
123
|
+
- absl-py==2.1.0
|
124
|
+
- accelerate==1.2.1
|
125
|
+
- aiohappyeyeballs==2.4.4
|
126
|
+
- aiohttp==3.11.11
|
127
|
+
- aiosignal==1.3.2
|
128
|
+
- annotated-types==0.7.0
|
129
|
+
- attrs==24.3.0
|
130
|
+
- datasets==3.2.0
|
131
|
+
- deepspeed==0.16.2
|
132
|
+
- dill==0.3.8
|
133
|
+
- einops==0.8.0
|
134
|
+
- flash-attn==2.7.2.post1
|
135
|
+
- frozenlist==1.5.0
|
136
|
+
- grpcio==1.70.0
|
137
|
+
- hjson==3.1.0
|
138
|
+
- huggingface-hub==0.27.0
|
139
|
+
- joblib==1.4.2
|
140
|
+
- markdown==3.7
|
141
|
+
- markdown-it-py==3.0.0
|
142
|
+
- mdurl==0.1.2
|
143
|
+
- mpi4py==4.0.1
|
144
|
+
- msgpack==1.1.0
|
145
|
+
- multidict==6.1.0
|
146
|
+
- multiprocess==0.70.16
|
147
|
+
- natsort==8.4.0
|
148
|
+
- ninja==1.11.1.3
|
149
|
+
- nvidia-ml-py==12.560.30
|
150
|
+
- packaging==24.2
|
151
|
+
- pandas==2.2.3
|
152
|
+
- peft==0.14.0
|
153
|
+
- propcache==0.2.1
|
154
|
+
- protobuf==6.30.0
|
155
|
+
- psutil==6.1.1
|
156
|
+
- py-cpuinfo==9.0.0
|
157
|
+
- pyarrow==18.1.0
|
158
|
+
- pydantic==2.10.4
|
159
|
+
- pydantic-core==2.27.2
|
160
|
+
- pygments==2.18.0
|
161
|
+
- python-dateutil==2.9.0.post0
|
162
|
+
- pytz==2024.2
|
163
|
+
- regex==2024.11.6
|
164
|
+
- rich==13.9.4
|
165
|
+
- safetensors==0.4.5
|
166
|
+
- scikit-learn==1.6.1
|
167
|
+
- scipy==1.15.2
|
168
|
+
- seqeval==1.2.2
|
169
|
+
- six==1.17.0
|
170
|
+
- sympy==1.13.1
|
171
|
+
- tensorboard==2.19.0
|
172
|
+
- tensorboard-data-server==0.7.2
|
173
|
+
- threadpoolctl==3.5.0
|
174
|
+
- tokenizers==0.21.0
|
175
|
+
- tqdm==4.67.1
|
176
|
+
- transformers==4.47.1
|
177
|
+
- trl==0.12.0
|
178
|
+
- tzdata==2024.2
|
179
|
+
- werkzeug==3.1.3
|
180
|
+
- xxhash==3.5.0
|
181
|
+
- yarl==1.18.3
|
182
|
+
|
@@ -37,7 +37,11 @@ class Token:
|
|
37
37
|
:return: str
|
38
38
|
"""
|
39
39
|
gold_tags = "|".join(self.gold_tag)
|
40
|
-
|
40
|
+
|
41
|
+
if self.pred_tag:
|
42
|
+
pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
|
43
|
+
else:
|
44
|
+
pred_tags = ""
|
41
45
|
|
42
46
|
if self.gold_tag:
|
43
47
|
r = f"{self.text}\t{gold_tags}\t{pred_tags}"
|
@@ -139,8 +143,8 @@ class NestedTagsDataset(Dataset):
|
|
139
143
|
masks = torch.cat(masks)
|
140
144
|
|
141
145
|
# Pad the tags, do the padding for each tag type
|
142
|
-
tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["
|
146
|
+
tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["O"])(tag)
|
143
147
|
for tag, vocab in zip(tags, self.vocab.tags[1:])]
|
144
148
|
tags = torch.cat(tags)
|
145
149
|
|
146
|
-
return subwords, tags, tokens, masks, valid_len
|
150
|
+
return subwords, tags, tokens, masks, valid_len
|
@@ -1,16 +1,30 @@
|
|
1
1
|
from torch.utils.data import DataLoader
|
2
|
-
from torchtext.vocab import vocab
|
3
2
|
from collections import Counter, namedtuple
|
4
3
|
import logging
|
5
4
|
import re
|
6
5
|
import itertools
|
7
6
|
from sinatools.ner.helpers import load_object
|
8
|
-
from sinatools.ner.datasets import Token
|
9
|
-
from sinatools.utils.tokenizers_words import simple_word_tokenize
|
7
|
+
from sinatools.ner.data.datasets import Token
|
10
8
|
|
11
9
|
logger = logging.getLogger(__name__)
|
12
10
|
|
13
11
|
|
12
|
+
class Vocab:
|
13
|
+
def __init__(self, counter, specials=[]) -> None:
|
14
|
+
self.itos = list(counter.keys()) + specials
|
15
|
+
self.stoi = {s: i for i, s in enumerate(self.itos)}
|
16
|
+
self.word_count = counter
|
17
|
+
|
18
|
+
def get_itos(self) -> list[str]:
|
19
|
+
return self.itos
|
20
|
+
|
21
|
+
def get_stoi(self) -> dict[str, int]:
|
22
|
+
return self.stoi
|
23
|
+
|
24
|
+
def __len__(self):
|
25
|
+
return len(self.itos)
|
26
|
+
|
27
|
+
|
14
28
|
def conll_to_segments(filename):
|
15
29
|
"""
|
16
30
|
Convert CoNLL files to segments. This returns a list of segments, and each segment is
|
@@ -60,8 +74,8 @@ def parse_conll_files(data_paths):
|
|
60
74
|
|
61
75
|
# Generate vocabs for tags and tokens
|
62
76
|
tag_vocabs = tag_vocab_by_type(tags)
|
63
|
-
tag_vocabs.insert(0,
|
64
|
-
vocabs = vocabs(tokens=
|
77
|
+
tag_vocabs.insert(0, Vocab(Counter(tags)))
|
78
|
+
vocabs = vocabs(tokens=Vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
|
65
79
|
return tuple(datasets), vocabs
|
66
80
|
|
67
81
|
|
@@ -72,9 +86,9 @@ def tag_vocab_by_type(tags):
|
|
72
86
|
tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))
|
73
87
|
|
74
88
|
for tag_type in tag_types:
|
75
|
-
r = re.compile(".*-" + tag_type)
|
89
|
+
r = re.compile(".*-" + tag_type + "$")
|
76
90
|
t = list(filter(r.match, tags)) + ["O"]
|
77
|
-
vocabs.append(
|
91
|
+
vocabs.append(Vocab(Counter(t)))
|
78
92
|
|
79
93
|
return vocabs
|
80
94
|
|
@@ -83,13 +97,11 @@ def text2segments(text):
|
|
83
97
|
"""
|
84
98
|
Convert text to a dataset and index the tokens
|
85
99
|
"""
|
86
|
-
|
87
|
-
list_of_tokens = simple_word_tokenize(text)
|
88
|
-
dataset = [[Token(text=token, gold_tag=["O"]) for token in list_of_tokens]]
|
100
|
+
dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
|
89
101
|
tokens = [token.text for segment in dataset for token in segment]
|
90
102
|
|
91
103
|
# Generate vocabs for the tokens
|
92
|
-
segment_vocab =
|
104
|
+
segment_vocab = Vocab(Counter(tokens), specials=["UNK"])
|
93
105
|
return dataset, segment_vocab
|
94
106
|
|
95
107
|
|
@@ -121,4 +133,4 @@ def get_dataloaders(
|
|
121
133
|
logger.info("%s batches found", len(dataloader))
|
122
134
|
dataloaders.append(dataloader)
|
123
135
|
|
124
|
-
return dataloaders
|
136
|
+
return dataloaders
|
@@ -0,0 +1,117 @@
|
|
1
|
+
import os
|
2
|
+
import sys
|
3
|
+
import logging
|
4
|
+
import importlib
|
5
|
+
import shutil
|
6
|
+
import torch
|
7
|
+
import pickle
|
8
|
+
import json
|
9
|
+
import random
|
10
|
+
import numpy as np
|
11
|
+
from argparse import Namespace
|
12
|
+
|
13
|
+
|
14
|
+
def logging_config(log_file=None):
|
15
|
+
"""
|
16
|
+
Initialize custom logger
|
17
|
+
:param log_file: str - path to log file, full path
|
18
|
+
:return: None
|
19
|
+
"""
|
20
|
+
handlers = [logging.StreamHandler(sys.stdout)]
|
21
|
+
|
22
|
+
if log_file:
|
23
|
+
handlers.append(logging.FileHandler(log_file, "w", "utf-8"))
|
24
|
+
print("Logging to {}".format(log_file))
|
25
|
+
|
26
|
+
logging.basicConfig(
|
27
|
+
level=logging.INFO,
|
28
|
+
handlers=handlers,
|
29
|
+
format="%(levelname)s\t%(name)s\t%(asctime)s\t%(message)s",
|
30
|
+
datefmt="%a, %d %b %Y %H:%M:%S",
|
31
|
+
force=True
|
32
|
+
)
|
33
|
+
|
34
|
+
|
35
|
+
def load_object(name, kwargs):
|
36
|
+
"""
|
37
|
+
Load objects dynamically given the object name and its arguments
|
38
|
+
:param name: str - object name, class name or function name
|
39
|
+
:param kwargs: dict - keyword arguments
|
40
|
+
:return: object
|
41
|
+
"""
|
42
|
+
object_module, object_name = name.rsplit(".", 1)
|
43
|
+
object_module = importlib.import_module(object_module)
|
44
|
+
fn = getattr(object_module, object_name)(**kwargs)
|
45
|
+
return fn
|
46
|
+
|
47
|
+
|
48
|
+
def make_output_dirs(path, subdirs=[], overwrite=True):
|
49
|
+
"""
|
50
|
+
Create root directory and any other sub-directories
|
51
|
+
:param path: str - root directory
|
52
|
+
:param subdirs: List[str] - list of sub-directories
|
53
|
+
:param overwrite: boolean - to overwrite the directory or not
|
54
|
+
:return: None
|
55
|
+
"""
|
56
|
+
if overwrite:
|
57
|
+
shutil.rmtree(path, ignore_errors=True)
|
58
|
+
|
59
|
+
os.makedirs(path)
|
60
|
+
|
61
|
+
for subdir in subdirs:
|
62
|
+
os.makedirs(os.path.join(path, subdir))
|
63
|
+
|
64
|
+
|
65
|
+
def load_checkpoint(model_path):
|
66
|
+
"""
|
67
|
+
Load model given the model path
|
68
|
+
:param model_path: str - path to model
|
69
|
+
:return: tagger - arabiner.trainers.BaseTrainer - the tagger model
|
70
|
+
vocab - arabicner.utils.data.Vocab - indexed tags
|
71
|
+
train_config - argparse.Namespace - training configurations
|
72
|
+
"""
|
73
|
+
with open(os.path.join(model_path, "tag_vocab.pkl"), "rb") as fh:
|
74
|
+
tag_vocab = pickle.load(fh)
|
75
|
+
|
76
|
+
# Load train configurations from checkpoint
|
77
|
+
train_config = Namespace()
|
78
|
+
with open(os.path.join(model_path, "args.json"), "r") as fh:
|
79
|
+
train_config.__dict__ = json.load(fh)
|
80
|
+
|
81
|
+
# Initialize the loss function, not used for inference, but evaluation
|
82
|
+
loss = load_object(train_config.loss["fn"], train_config.loss["kwargs"])
|
83
|
+
|
84
|
+
# Load BERT tagger
|
85
|
+
model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
|
86
|
+
model = torch.nn.DataParallel(model)
|
87
|
+
|
88
|
+
if torch.cuda.is_available():
|
89
|
+
model = model.cuda()
|
90
|
+
|
91
|
+
# Update arguments for the tagger
|
92
|
+
# Attach the model, loss (used for evaluations cases)
|
93
|
+
train_config.trainer_config["kwargs"]["model"] = model
|
94
|
+
train_config.trainer_config["kwargs"]["loss"] = loss
|
95
|
+
|
96
|
+
tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
|
97
|
+
tagger.load(os.path.join(model_path, "checkpoints"))
|
98
|
+
return tagger, tag_vocab, train_config
|
99
|
+
|
100
|
+
|
101
|
+
def set_seed(seed):
|
102
|
+
"""
|
103
|
+
Set the seed for random initialization and set
|
104
|
+
cuDNN parameters to ensure deterministic results across
|
105
|
+
multiple runs with the same seed
|
106
|
+
|
107
|
+
:param seed: int
|
108
|
+
"""
|
109
|
+
np.random.seed(seed)
|
110
|
+
random.seed(seed)
|
111
|
+
torch.manual_seed(seed)
|
112
|
+
torch.cuda.manual_seed(seed)
|
113
|
+
torch.cuda.manual_seed_all(seed)
|
114
|
+
|
115
|
+
torch.backends.cudnn.deterministic = True
|
116
|
+
torch.backends.cudnn.benchmark = False
|
117
|
+
torch.backends.cudnn.enabled = False
|
@@ -113,5 +113,5 @@ class BaseTrainer:
|
|
113
113
|
logger.info("Loading checkpoint %s", checkpoint_path)
|
114
114
|
|
115
115
|
device = None if torch.cuda.is_available() else torch.device('cpu')
|
116
|
-
checkpoint = torch.load(checkpoint_path, map_location=device)
|
117
|
-
self.model.load_state_dict(checkpoint["model"], strict=False)
|
116
|
+
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
|
117
|
+
self.model.load_state_dict(checkpoint["model"], strict=False)
|