SinaTools 0.1.37__tar.gz → 0.1.39__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sinatools-0.1.37 → sinatools-0.1.39}/PKG-INFO +11 -7
- {sinatools-0.1.37 → sinatools-0.1.39}/SinaTools.egg-info/PKG-INFO +11 -7
- {sinatools-0.1.37 → sinatools-0.1.39}/SinaTools.egg-info/SOURCES.txt +0 -50
- sinatools-0.1.39/SinaTools.egg-info/requires.txt +9 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/setup.py +5 -5
- sinatools-0.1.39/sinatools/VERSION +1 -0
- sinatools-0.1.39/sinatools/environment.yml +182 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/data/datasets.py +7 -3
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/data_format.py +24 -12
- sinatools-0.1.39/sinatools/ner/helpers.py +117 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/trainers/BaseTrainer.py +2 -2
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/trainers/BertNestedTrainer.py +203 -203
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/trainers/BertTrainer.py +163 -163
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/trainers/__init__.py +2 -2
- sinatools-0.1.37/SinaTools.egg-info/requires.txt +0 -12
- sinatools-0.1.37/sinatools/VERSION +0 -1
- sinatools-0.1.37/sinatools/arabert/arabert/__init__.py +0 -14
- sinatools-0.1.37/sinatools/arabert/arabert/create_classification_data.py +0 -260
- sinatools-0.1.37/sinatools/arabert/arabert/create_pretraining_data.py +0 -534
- sinatools-0.1.37/sinatools/arabert/arabert/extract_features.py +0 -444
- sinatools-0.1.37/sinatools/arabert/arabert/lamb_optimizer.py +0 -158
- sinatools-0.1.37/sinatools/arabert/arabert/modeling.py +0 -1027
- sinatools-0.1.37/sinatools/arabert/arabert/optimization.py +0 -202
- sinatools-0.1.37/sinatools/arabert/arabert/run_classifier.py +0 -1078
- sinatools-0.1.37/sinatools/arabert/arabert/run_pretraining.py +0 -593
- sinatools-0.1.37/sinatools/arabert/arabert/run_squad.py +0 -1440
- sinatools-0.1.37/sinatools/arabert/arabert/tokenization.py +0 -414
- sinatools-0.1.37/sinatools/arabert/araelectra/__init__.py +0 -1
- sinatools-0.1.37/sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
- sinatools-0.1.37/sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
- sinatools-0.1.37/sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
- sinatools-0.1.37/sinatools/arabert/araelectra/configure_finetuning.py +0 -172
- sinatools-0.1.37/sinatools/arabert/araelectra/configure_pretraining.py +0 -143
- sinatools-0.1.37/sinatools/arabert/araelectra/finetune/__init__.py +0 -14
- sinatools-0.1.37/sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
- sinatools-0.1.37/sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
- sinatools-0.1.37/sinatools/arabert/araelectra/finetune/scorer.py +0 -54
- sinatools-0.1.37/sinatools/arabert/araelectra/finetune/task.py +0 -74
- sinatools-0.1.37/sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
- sinatools-0.1.37/sinatools/arabert/araelectra/flops_computation.py +0 -215
- sinatools-0.1.37/sinatools/arabert/araelectra/model/__init__.py +0 -14
- sinatools-0.1.37/sinatools/arabert/araelectra/model/modeling.py +0 -1029
- sinatools-0.1.37/sinatools/arabert/araelectra/model/optimization.py +0 -193
- sinatools-0.1.37/sinatools/arabert/araelectra/model/tokenization.py +0 -355
- sinatools-0.1.37/sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
- sinatools-0.1.37/sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
- sinatools-0.1.37/sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
- sinatools-0.1.37/sinatools/arabert/araelectra/run_finetuning.py +0 -323
- sinatools-0.1.37/sinatools/arabert/araelectra/run_pretraining.py +0 -469
- sinatools-0.1.37/sinatools/arabert/araelectra/util/__init__.py +0 -14
- sinatools-0.1.37/sinatools/arabert/araelectra/util/training_utils.py +0 -112
- sinatools-0.1.37/sinatools/arabert/araelectra/util/utils.py +0 -109
- sinatools-0.1.37/sinatools/arabert/aragpt2/__init__.py +0 -2
- sinatools-0.1.37/sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
- sinatools-0.1.37/sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
- sinatools-0.1.37/sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
- sinatools-0.1.37/sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
- sinatools-0.1.37/sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
- sinatools-0.1.37/sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
- sinatools-0.1.37/sinatools/arabert/aragpt2/grover/modeling.py +0 -803
- sinatools-0.1.37/sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
- sinatools-0.1.37/sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
- sinatools-0.1.37/sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
- sinatools-0.1.37/sinatools/arabert/aragpt2/grover/utils.py +0 -234
- sinatools-0.1.37/sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
- sinatools-0.1.37/sinatools/environment.yml +0 -227
- sinatools-0.1.37/sinatools/ner/helpers.py +0 -86
- sinatools-0.1.37/sinatools/utils/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/AUTHORS.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/CONTRIBUTING.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/LICENSE +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/MANIFEST.in +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/README.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/SinaTools.egg-info/dependency_links.txt +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/SinaTools.egg-info/entry_points.txt +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/SinaTools.egg-info/not-zip-safe +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/SinaTools.egg-info/top_level.txt +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/Makefile +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/build/_images/download.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/build/_static/download.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/build/_static/file.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/build/_static/minus.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/build/_static/plus.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/build/html/_images/SinaLogo.jpg +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/build/html/_images/download.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/build/html/_static/SinaLogo.jpg +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/build/html/_static/download.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/build/html/_static/file.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/build/html/_static/minus.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/build/html/_static/plus.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/make.bat +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/License.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/Overview.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/_static/SinaLogo.jpg +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/_static/download.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/about.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/DataDownload/downloader.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/DataDownload.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/arabiner/bin/infer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/arabiner.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/morphology/morph_analyzer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/morphology.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/salma/views.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/salma.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/utils/corpus_tokenizer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/utils/implication.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/utils/jaccard.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/utils/parser.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/utils/sentence_tokenizer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/utils/text_transliteration.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api/utils.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/api.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/authors.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/DataDownload/download_files.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/DataDownload/get_appdatadir.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/DataDownload.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/arabiner/infer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/arabiner.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/morphology/ALMA_multi_word.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/morphology/morph_analyzer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/morphology.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/salma/salma_tools.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/salma.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/utils/arStrip.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/utils/corpus_tokenizer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/utils/implication.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/utils/jaccard.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/utils/latin_remove.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/utils/remove_punc.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/utils/sentence_tokenizer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/utils/text_transliteration.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools/utils.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/cli_tools.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/conf.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/index.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/installation.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/docs/source/readme.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/setup.cfg +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/DataDownload/download_files.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/morphology/ALMA_multi_word.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/morphology/morph_analyzer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/ner/corpus_entity_extractor.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/ner/entity_extractor.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/utils/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/utils/arStrip.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/utils/corpus_tokenizer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/utils/implication.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/utils/jaccard.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/utils/remove_latin.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/utils/remove_punctuation.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/utils/sentence_tokenizer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/utils/text_dublication_detector.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/CLI/utils/text_transliteration.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/DataDownload/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/DataDownload/downloader.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/arabert/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/arabert/preprocess.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/install_env.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/morphology/ALMA_multi_word.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/morphology/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/morphology/morph_analyzer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/data/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/data/transforms.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/datasets.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/entity_extractor.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/metrics.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/nn/BaseModel.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/nn/BertNestedTagger.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/nn/BertSeqTagger.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/nn/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/ner/transforms.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/relations/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/relations/relation_extractor.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/semantic_relatedness/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/semantic_relatedness/compute_relatedness.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/sinatools.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/synonyms/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/synonyms/synonyms_generator.py +0 -0
- {sinatools-0.1.37/sinatools/arabert/aragpt2/grover → sinatools-0.1.39/sinatools/utils}/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/utils/charsets.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/utils/parser.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/utils/readfile.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/utils/similarity.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/utils/text_dublication_detector.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/utils/text_transliteration.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/utils/tokenizer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/utils/tokenizers_words.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/utils/word_compare.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/wsd/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/wsd/disambiguator.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/wsd/settings.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.39}/sinatools/wsd/wsd.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: SinaTools
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.39
|
4
4
|
Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
|
5
5
|
Home-page: https://github.com/SinaLab/sinatools
|
6
6
|
License: MIT license
|
@@ -12,14 +12,18 @@ Requires-Dist: six
|
|
12
12
|
Requires-Dist: farasapy
|
13
13
|
Requires-Dist: tqdm
|
14
14
|
Requires-Dist: requests
|
15
|
-
Requires-Dist: regex
|
16
15
|
Requires-Dist: pathlib
|
17
|
-
Requires-Dist:
|
18
|
-
Requires-Dist:
|
19
|
-
Requires-Dist: torchtext==0.14.0
|
20
|
-
Requires-Dist: torchvision==0.14.0
|
16
|
+
Requires-Dist: transformers==4.47.1
|
17
|
+
Requires-Dist: torchvision==0.20.1
|
21
18
|
Requires-Dist: seqeval==1.2.2
|
22
19
|
Requires-Dist: natsort==7.1.1
|
20
|
+
Dynamic: description
|
21
|
+
Dynamic: description-content-type
|
22
|
+
Dynamic: home-page
|
23
|
+
Dynamic: keywords
|
24
|
+
Dynamic: license
|
25
|
+
Dynamic: requires-dist
|
26
|
+
Dynamic: summary
|
23
27
|
|
24
28
|
SinaTools
|
25
29
|
======================
|
@@ -1,6 +1,6 @@
|
|
1
|
-
Metadata-Version: 2.
|
1
|
+
Metadata-Version: 2.2
|
2
2
|
Name: SinaTools
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.39
|
4
4
|
Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
|
5
5
|
Home-page: https://github.com/SinaLab/sinatools
|
6
6
|
License: MIT license
|
@@ -12,14 +12,18 @@ Requires-Dist: six
|
|
12
12
|
Requires-Dist: farasapy
|
13
13
|
Requires-Dist: tqdm
|
14
14
|
Requires-Dist: requests
|
15
|
-
Requires-Dist: regex
|
16
15
|
Requires-Dist: pathlib
|
17
|
-
Requires-Dist:
|
18
|
-
Requires-Dist:
|
19
|
-
Requires-Dist: torchtext==0.14.0
|
20
|
-
Requires-Dist: torchvision==0.14.0
|
16
|
+
Requires-Dist: transformers==4.47.1
|
17
|
+
Requires-Dist: torchvision==0.20.1
|
21
18
|
Requires-Dist: seqeval==1.2.2
|
22
19
|
Requires-Dist: natsort==7.1.1
|
20
|
+
Dynamic: description
|
21
|
+
Dynamic: description-content-type
|
22
|
+
Dynamic: home-page
|
23
|
+
Dynamic: keywords
|
24
|
+
Dynamic: license
|
25
|
+
Dynamic: requires-dist
|
26
|
+
Dynamic: summary
|
23
27
|
|
24
28
|
SinaTools
|
25
29
|
======================
|
@@ -96,56 +96,6 @@ sinatools/DataDownload/__init__.py
|
|
96
96
|
sinatools/DataDownload/downloader.py
|
97
97
|
sinatools/arabert/__init__.py
|
98
98
|
sinatools/arabert/preprocess.py
|
99
|
-
sinatools/arabert/arabert/__init__.py
|
100
|
-
sinatools/arabert/arabert/create_classification_data.py
|
101
|
-
sinatools/arabert/arabert/create_pretraining_data.py
|
102
|
-
sinatools/arabert/arabert/extract_features.py
|
103
|
-
sinatools/arabert/arabert/lamb_optimizer.py
|
104
|
-
sinatools/arabert/arabert/modeling.py
|
105
|
-
sinatools/arabert/arabert/optimization.py
|
106
|
-
sinatools/arabert/arabert/run_classifier.py
|
107
|
-
sinatools/arabert/arabert/run_pretraining.py
|
108
|
-
sinatools/arabert/arabert/run_squad.py
|
109
|
-
sinatools/arabert/arabert/tokenization.py
|
110
|
-
sinatools/arabert/araelectra/__init__.py
|
111
|
-
sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py
|
112
|
-
sinatools/arabert/araelectra/build_pretraining_dataset.py
|
113
|
-
sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py
|
114
|
-
sinatools/arabert/araelectra/configure_finetuning.py
|
115
|
-
sinatools/arabert/araelectra/configure_pretraining.py
|
116
|
-
sinatools/arabert/araelectra/flops_computation.py
|
117
|
-
sinatools/arabert/araelectra/run_finetuning.py
|
118
|
-
sinatools/arabert/araelectra/run_pretraining.py
|
119
|
-
sinatools/arabert/araelectra/finetune/__init__.py
|
120
|
-
sinatools/arabert/araelectra/finetune/feature_spec.py
|
121
|
-
sinatools/arabert/araelectra/finetune/preprocessing.py
|
122
|
-
sinatools/arabert/araelectra/finetune/scorer.py
|
123
|
-
sinatools/arabert/araelectra/finetune/task.py
|
124
|
-
sinatools/arabert/araelectra/finetune/task_builder.py
|
125
|
-
sinatools/arabert/araelectra/model/__init__.py
|
126
|
-
sinatools/arabert/araelectra/model/modeling.py
|
127
|
-
sinatools/arabert/araelectra/model/optimization.py
|
128
|
-
sinatools/arabert/araelectra/model/tokenization.py
|
129
|
-
sinatools/arabert/araelectra/pretrain/__init__.py
|
130
|
-
sinatools/arabert/araelectra/pretrain/pretrain_data.py
|
131
|
-
sinatools/arabert/araelectra/pretrain/pretrain_helpers.py
|
132
|
-
sinatools/arabert/araelectra/util/__init__.py
|
133
|
-
sinatools/arabert/araelectra/util/training_utils.py
|
134
|
-
sinatools/arabert/araelectra/util/utils.py
|
135
|
-
sinatools/arabert/aragpt2/__init__.py
|
136
|
-
sinatools/arabert/aragpt2/create_pretraining_data.py
|
137
|
-
sinatools/arabert/aragpt2/train_bpe_tokenizer.py
|
138
|
-
sinatools/arabert/aragpt2/gpt2/__init__.py
|
139
|
-
sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py
|
140
|
-
sinatools/arabert/aragpt2/gpt2/optimization.py
|
141
|
-
sinatools/arabert/aragpt2/gpt2/run_pretraining.py
|
142
|
-
sinatools/arabert/aragpt2/grover/__init__.py
|
143
|
-
sinatools/arabert/aragpt2/grover/dataloader.py
|
144
|
-
sinatools/arabert/aragpt2/grover/modeling.py
|
145
|
-
sinatools/arabert/aragpt2/grover/modeling_gpt2.py
|
146
|
-
sinatools/arabert/aragpt2/grover/optimization_adafactor.py
|
147
|
-
sinatools/arabert/aragpt2/grover/train_tpu.py
|
148
|
-
sinatools/arabert/aragpt2/grover/utils.py
|
149
99
|
sinatools/morphology/ALMA_multi_word.py
|
150
100
|
sinatools/morphology/__init__.py
|
151
101
|
sinatools/morphology/morph_analyzer.py
|
@@ -16,12 +16,12 @@ requirements = [
|
|
16
16
|
'farasapy',
|
17
17
|
'tqdm',
|
18
18
|
'requests',
|
19
|
-
'regex',
|
19
|
+
# 'regex',
|
20
20
|
'pathlib',
|
21
|
-
'torch==
|
22
|
-
'transformers==4.
|
23
|
-
'torchtext==0.14.0',
|
24
|
-
'torchvision==0.
|
21
|
+
# 'torch==2.5.1',
|
22
|
+
'transformers==4.47.1',
|
23
|
+
# 'torchtext==0.14.0',
|
24
|
+
'torchvision==0.20.1',
|
25
25
|
'seqeval==1.2.2',
|
26
26
|
'natsort==7.1.1'
|
27
27
|
]
|
@@ -0,0 +1 @@
|
|
1
|
+
0.1.39
|
@@ -0,0 +1,182 @@
|
|
1
|
+
name: dev
|
2
|
+
channels:
|
3
|
+
- pytorch
|
4
|
+
- nvidia
|
5
|
+
- defaults
|
6
|
+
- https://repo.anaconda.com/pkgs/main
|
7
|
+
- https://repo.anaconda.com/pkgs/r
|
8
|
+
dependencies:
|
9
|
+
- _libgcc_mutex=0.1=main
|
10
|
+
- _openmp_mutex=5.1=1_gnu
|
11
|
+
- _sysroot_linux-64_curr_repodata_hack=3=haa98f57_10
|
12
|
+
- binutils_impl_linux-64=2.40=h5293946_0
|
13
|
+
- binutils_linux-64=2.40.0=hc2dff05_1
|
14
|
+
- blas=1.0=mkl
|
15
|
+
- brotli-python=1.0.9=py311h6a678d5_8
|
16
|
+
- bzip2=1.0.8=h5eee18b_6
|
17
|
+
- ca-certificates=2024.11.26=h06a4308_0
|
18
|
+
- certifi=2024.12.14=py311h06a4308_0
|
19
|
+
- charset-normalizer=3.3.2=pyhd3eb1b0_0
|
20
|
+
- cuda-cudart=12.4.127=0
|
21
|
+
- cuda-cupti=12.4.127=0
|
22
|
+
- cuda-libraries=12.4.1=0
|
23
|
+
- cuda-nvrtc=12.4.127=0
|
24
|
+
- cuda-nvtx=12.4.127=0
|
25
|
+
- cuda-opencl=12.4.127=0
|
26
|
+
- cuda-runtime=12.4.1=0
|
27
|
+
- cuda-version=11.7=h6a555f7_3
|
28
|
+
- cudatoolkit=11.7.0=hd8887f6_10
|
29
|
+
- ffmpeg=4.3=hf484d3e_0
|
30
|
+
- filelock=3.13.1=py311h06a4308_0
|
31
|
+
- freetype=2.12.1=h4a9f257_0
|
32
|
+
- fsspec=2024.6.1=py311h06a4308_0
|
33
|
+
- gcc_impl_linux-64=11.2.0=h1234567_1
|
34
|
+
- gcc_linux-64=11.2.0=h5c386dc_1
|
35
|
+
- giflib=5.2.2=h5eee18b_0
|
36
|
+
- gmp=6.2.1=h295c915_3
|
37
|
+
- gmpy2=2.1.2=py311hc9b5ff0_0
|
38
|
+
- gnutls=3.6.15=he1e5248_0
|
39
|
+
- gxx_impl_linux-64=11.2.0=h1234567_1
|
40
|
+
- gxx_linux-64=11.2.0=hc2dff05_1
|
41
|
+
- idna=3.7=py311h06a4308_0
|
42
|
+
- intel-openmp=2023.1.0=hdb19cb5_46306
|
43
|
+
- jinja2=3.1.4=py311h06a4308_1
|
44
|
+
- jpeg=9e=h5eee18b_3
|
45
|
+
- kernel-headers_linux-64=3.10.0=h57e8cba_10
|
46
|
+
- lame=3.100=h7b6447c_0
|
47
|
+
- lcms2=2.16=hb9589c4_0
|
48
|
+
- ld_impl_linux-64=2.40=h12ee557_0
|
49
|
+
- lerc=4.0.0=h6a678d5_0
|
50
|
+
- libabseil=20240116.2=cxx17_h6a678d5_0
|
51
|
+
- libcublas=12.4.5.8=0
|
52
|
+
- libcufft=11.2.1.3=0
|
53
|
+
- libcufile=1.9.1.3=0
|
54
|
+
- libcurand=10.3.5.147=0
|
55
|
+
- libcusolver=11.6.1.9=0
|
56
|
+
- libcusparse=12.3.1.170=0
|
57
|
+
- libdeflate=1.22=h5eee18b_0
|
58
|
+
- libffi=3.4.4=h6a678d5_1
|
59
|
+
- libgcc-devel_linux-64=11.2.0=h1234567_1
|
60
|
+
- libgcc-ng=11.2.0=h1234567_1
|
61
|
+
- libgomp=11.2.0=h1234567_1
|
62
|
+
- libiconv=1.16=h5eee18b_3
|
63
|
+
- libidn2=2.3.4=h5eee18b_0
|
64
|
+
- libjpeg-turbo=2.0.0=h9bf148f_0
|
65
|
+
- libnpp=12.2.5.30=0
|
66
|
+
- libnvfatbin=12.4.127=0
|
67
|
+
- libnvjitlink=12.4.127=0
|
68
|
+
- libnvjpeg=12.3.1.117=0
|
69
|
+
- libpng=1.6.39=h5eee18b_0
|
70
|
+
- libprotobuf=4.25.3=he621ea3_0
|
71
|
+
- libstdcxx-devel_linux-64=11.2.0=h1234567_1
|
72
|
+
- libstdcxx-ng=11.2.0=h1234567_1
|
73
|
+
- libtasn1=4.19.0=h5eee18b_0
|
74
|
+
- libtiff=4.5.1=hffd6297_1
|
75
|
+
- libunistring=0.9.10=h27cfd23_0
|
76
|
+
- libuuid=1.41.5=h5eee18b_0
|
77
|
+
- libwebp=1.3.2=h11a3e52_0
|
78
|
+
- libwebp-base=1.3.2=h5eee18b_1
|
79
|
+
- llvm-openmp=14.0.6=h9e868ea_0
|
80
|
+
- lz4-c=1.9.4=h6a678d5_1
|
81
|
+
- markupsafe=2.1.3=py311h5eee18b_0
|
82
|
+
- mkl=2023.1.0=h213fc3f_46344
|
83
|
+
- mkl-service=2.4.0=py311h5eee18b_1
|
84
|
+
- mkl_fft=1.3.11=py311h5eee18b_0
|
85
|
+
- mkl_random=1.2.8=py311ha02d727_0
|
86
|
+
- mpc=1.1.0=h10f8cd9_1
|
87
|
+
- mpfr=4.0.2=hb69a4c5_1
|
88
|
+
- mpmath=1.3.0=py311h06a4308_0
|
89
|
+
- ncurses=6.4=h6a678d5_0
|
90
|
+
- nettle=3.7.3=hbbd107a_1
|
91
|
+
- networkx=3.2.1=py311h06a4308_0
|
92
|
+
- numpy=2.0.1=py311h08b1b3b_1
|
93
|
+
- numpy-base=2.0.1=py311hf175353_1
|
94
|
+
- openh264=2.1.1=h4ff587b_0
|
95
|
+
- openjpeg=2.5.2=he7f1fd0_0
|
96
|
+
- openssl=3.0.15=h5eee18b_0
|
97
|
+
- pillow=11.0.0=py311hcea889d_1
|
98
|
+
- pip=24.2=py311h06a4308_0
|
99
|
+
- pysocks=1.7.1=py311h06a4308_0
|
100
|
+
- python=3.11.11=he870216_0
|
101
|
+
- pytorch=2.5.1=py3.11_cuda12.4_cudnn9.1.0_0
|
102
|
+
- pytorch-cuda=12.4=hc786d27_7
|
103
|
+
- pytorch-mutex=1.0=cuda
|
104
|
+
- pyyaml=6.0.2=py311h5eee18b_0
|
105
|
+
- readline=8.2=h5eee18b_0
|
106
|
+
- requests=2.32.3=py311h06a4308_1
|
107
|
+
- setuptools=75.1.0=py311h06a4308_0
|
108
|
+
- sqlite=3.45.3=h5eee18b_0
|
109
|
+
- sysroot_linux-64=2.17=h57e8cba_10
|
110
|
+
- tbb=2021.8.0=hdb19cb5_0
|
111
|
+
- tk=8.6.14=h39e8969_0
|
112
|
+
- torchaudio=2.5.1=py311_cu124
|
113
|
+
- torchtriton=3.1.0=py311
|
114
|
+
- torchvision=0.20.1=py311_cu124
|
115
|
+
- typing_extensions=4.12.2=py311h06a4308_0
|
116
|
+
- urllib3=2.2.3=py311h06a4308_0
|
117
|
+
- wheel=0.44.0=py311h06a4308_0
|
118
|
+
- xz=5.4.6=h5eee18b_1
|
119
|
+
- yaml=0.2.5=h7b6447c_0
|
120
|
+
- zlib=1.2.13=h5eee18b_1
|
121
|
+
- zstd=1.5.6=hc292b87_0
|
122
|
+
- pip:
|
123
|
+
- absl-py==2.1.0
|
124
|
+
- accelerate==1.2.1
|
125
|
+
- aiohappyeyeballs==2.4.4
|
126
|
+
- aiohttp==3.11.11
|
127
|
+
- aiosignal==1.3.2
|
128
|
+
- annotated-types==0.7.0
|
129
|
+
- attrs==24.3.0
|
130
|
+
- datasets==3.2.0
|
131
|
+
- deepspeed==0.16.2
|
132
|
+
- dill==0.3.8
|
133
|
+
- einops==0.8.0
|
134
|
+
- flash-attn==2.7.2.post1
|
135
|
+
- frozenlist==1.5.0
|
136
|
+
- grpcio==1.70.0
|
137
|
+
- hjson==3.1.0
|
138
|
+
- huggingface-hub==0.27.0
|
139
|
+
- joblib==1.4.2
|
140
|
+
- markdown==3.7
|
141
|
+
- markdown-it-py==3.0.0
|
142
|
+
- mdurl==0.1.2
|
143
|
+
- mpi4py==4.0.1
|
144
|
+
- msgpack==1.1.0
|
145
|
+
- multidict==6.1.0
|
146
|
+
- multiprocess==0.70.16
|
147
|
+
- natsort==8.4.0
|
148
|
+
- ninja==1.11.1.3
|
149
|
+
- nvidia-ml-py==12.560.30
|
150
|
+
- packaging==24.2
|
151
|
+
- pandas==2.2.3
|
152
|
+
- peft==0.14.0
|
153
|
+
- propcache==0.2.1
|
154
|
+
- protobuf==6.30.0
|
155
|
+
- psutil==6.1.1
|
156
|
+
- py-cpuinfo==9.0.0
|
157
|
+
- pyarrow==18.1.0
|
158
|
+
- pydantic==2.10.4
|
159
|
+
- pydantic-core==2.27.2
|
160
|
+
- pygments==2.18.0
|
161
|
+
- python-dateutil==2.9.0.post0
|
162
|
+
- pytz==2024.2
|
163
|
+
- regex==2024.11.6
|
164
|
+
- rich==13.9.4
|
165
|
+
- safetensors==0.4.5
|
166
|
+
- scikit-learn==1.6.1
|
167
|
+
- scipy==1.15.2
|
168
|
+
- seqeval==1.2.2
|
169
|
+
- six==1.17.0
|
170
|
+
- sympy==1.13.1
|
171
|
+
- tensorboard==2.19.0
|
172
|
+
- tensorboard-data-server==0.7.2
|
173
|
+
- threadpoolctl==3.5.0
|
174
|
+
- tokenizers==0.21.0
|
175
|
+
- tqdm==4.67.1
|
176
|
+
- transformers==4.47.1
|
177
|
+
- trl==0.12.0
|
178
|
+
- tzdata==2024.2
|
179
|
+
- werkzeug==3.1.3
|
180
|
+
- xxhash==3.5.0
|
181
|
+
- yarl==1.18.3
|
182
|
+
|
@@ -37,7 +37,11 @@ class Token:
|
|
37
37
|
:return: str
|
38
38
|
"""
|
39
39
|
gold_tags = "|".join(self.gold_tag)
|
40
|
-
|
40
|
+
|
41
|
+
if self.pred_tag:
|
42
|
+
pred_tags = "|".join([pred_tag["tag"] for pred_tag in self.pred_tag])
|
43
|
+
else:
|
44
|
+
pred_tags = ""
|
41
45
|
|
42
46
|
if self.gold_tag:
|
43
47
|
r = f"{self.text}\t{gold_tags}\t{pred_tags}"
|
@@ -139,8 +143,8 @@ class NestedTagsDataset(Dataset):
|
|
139
143
|
masks = torch.cat(masks)
|
140
144
|
|
141
145
|
# Pad the tags, do the padding for each tag type
|
142
|
-
tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["
|
146
|
+
tags = [torch.nn.ConstantPad1d((0, subwords.shape[-1] - tag.shape[-1]), vocab.get_stoi()["O"])(tag)
|
143
147
|
for tag, vocab in zip(tags, self.vocab.tags[1:])]
|
144
148
|
tags = torch.cat(tags)
|
145
149
|
|
146
|
-
return subwords, tags, tokens, masks, valid_len
|
150
|
+
return subwords, tags, tokens, masks, valid_len
|
@@ -1,16 +1,30 @@
|
|
1
1
|
from torch.utils.data import DataLoader
|
2
|
-
from torchtext.vocab import vocab
|
3
2
|
from collections import Counter, namedtuple
|
4
3
|
import logging
|
5
4
|
import re
|
6
5
|
import itertools
|
7
6
|
from sinatools.ner.helpers import load_object
|
8
|
-
from sinatools.ner.datasets import Token
|
9
|
-
from sinatools.utils.tokenizers_words import simple_word_tokenize
|
7
|
+
from sinatools.ner.data.datasets import Token
|
10
8
|
|
11
9
|
logger = logging.getLogger(__name__)
|
12
10
|
|
13
11
|
|
12
|
+
class Vocab:
|
13
|
+
def __init__(self, counter, specials=[]) -> None:
|
14
|
+
self.itos = list(counter.keys()) + specials
|
15
|
+
self.stoi = {s: i for i, s in enumerate(self.itos)}
|
16
|
+
self.word_count = counter
|
17
|
+
|
18
|
+
def get_itos(self) -> list[str]:
|
19
|
+
return self.itos
|
20
|
+
|
21
|
+
def get_stoi(self) -> dict[str, int]:
|
22
|
+
return self.stoi
|
23
|
+
|
24
|
+
def __len__(self):
|
25
|
+
return len(self.itos)
|
26
|
+
|
27
|
+
|
14
28
|
def conll_to_segments(filename):
|
15
29
|
"""
|
16
30
|
Convert CoNLL files to segments. This return list of segments and each segment is
|
@@ -60,8 +74,8 @@ def parse_conll_files(data_paths):
|
|
60
74
|
|
61
75
|
# Generate vocabs for tags and tokens
|
62
76
|
tag_vocabs = tag_vocab_by_type(tags)
|
63
|
-
tag_vocabs.insert(0,
|
64
|
-
vocabs = vocabs(tokens=
|
77
|
+
tag_vocabs.insert(0, Vocab(Counter(tags)))
|
78
|
+
vocabs = vocabs(tokens=Vocab(Counter(tokens), specials=["UNK"]), tags=tag_vocabs)
|
65
79
|
return tuple(datasets), vocabs
|
66
80
|
|
67
81
|
|
@@ -72,9 +86,9 @@ def tag_vocab_by_type(tags):
|
|
72
86
|
tag_types = sorted(list(set([tag.split("-", 1)[1] for tag in tag_names if "-" in tag])))
|
73
87
|
|
74
88
|
for tag_type in tag_types:
|
75
|
-
r = re.compile(".*-" + tag_type)
|
89
|
+
r = re.compile(".*-" + tag_type + "$")
|
76
90
|
t = list(filter(r.match, tags)) + ["O"]
|
77
|
-
vocabs.append(
|
91
|
+
vocabs.append(Vocab(Counter(t)))
|
78
92
|
|
79
93
|
return vocabs
|
80
94
|
|
@@ -83,13 +97,11 @@ def text2segments(text):
|
|
83
97
|
"""
|
84
98
|
Convert text to a datasets and index the tokens
|
85
99
|
"""
|
86
|
-
|
87
|
-
list_of_tokens = simple_word_tokenize(text)
|
88
|
-
dataset = [[Token(text=token, gold_tag=["O"]) for token in list_of_tokens]]
|
100
|
+
dataset = [[Token(text=token, gold_tag=["O"]) for token in text.split()]]
|
89
101
|
tokens = [token.text for segment in dataset for token in segment]
|
90
102
|
|
91
103
|
# Generate vocabs for the tokens
|
92
|
-
segment_vocab =
|
104
|
+
segment_vocab = Vocab(Counter(tokens), specials=["UNK"])
|
93
105
|
return dataset, segment_vocab
|
94
106
|
|
95
107
|
|
@@ -121,4 +133,4 @@ def get_dataloaders(
|
|
121
133
|
logger.info("%s batches found", len(dataloader))
|
122
134
|
dataloaders.append(dataloader)
|
123
135
|
|
124
|
-
return dataloaders
|
136
|
+
return dataloaders
|
@@ -0,0 +1,117 @@
|
|
1
|
+
import os
|
2
|
+
import sys
|
3
|
+
import logging
|
4
|
+
import importlib
|
5
|
+
import shutil
|
6
|
+
import torch
|
7
|
+
import pickle
|
8
|
+
import json
|
9
|
+
import random
|
10
|
+
import numpy as np
|
11
|
+
from argparse import Namespace
|
12
|
+
|
13
|
+
|
14
|
+
def logging_config(log_file=None):
|
15
|
+
"""
|
16
|
+
Initialize custom logger
|
17
|
+
:param log_file: str - path to log file, full path
|
18
|
+
:return: None
|
19
|
+
"""
|
20
|
+
handlers = [logging.StreamHandler(sys.stdout)]
|
21
|
+
|
22
|
+
if log_file:
|
23
|
+
handlers.append(logging.FileHandler(log_file, "w", "utf-8"))
|
24
|
+
print("Logging to {}".format(log_file))
|
25
|
+
|
26
|
+
logging.basicConfig(
|
27
|
+
level=logging.INFO,
|
28
|
+
handlers=handlers,
|
29
|
+
format="%(levelname)s\t%(name)s\t%(asctime)s\t%(message)s",
|
30
|
+
datefmt="%a, %d %b %Y %H:%M:%S",
|
31
|
+
force=True
|
32
|
+
)
|
33
|
+
|
34
|
+
|
35
|
+
def load_object(name, kwargs):
|
36
|
+
"""
|
37
|
+
Load objects dynamically given the object name and its arguments
|
38
|
+
:param name: str - object name, class name or function name
|
39
|
+
:param kwargs: dict - keyword arguments
|
40
|
+
:return: object
|
41
|
+
"""
|
42
|
+
object_module, object_name = name.rsplit(".", 1)
|
43
|
+
object_module = importlib.import_module(object_module)
|
44
|
+
fn = getattr(object_module, object_name)(**kwargs)
|
45
|
+
return fn
|
46
|
+
|
47
|
+
|
48
|
+
def make_output_dirs(path, subdirs=[], overwrite=True):
|
49
|
+
"""
|
50
|
+
Create root directory and any other sub-directories
|
51
|
+
:param path: str - root directory
|
52
|
+
:param subdirs: List[str] - list of sub-directories
|
53
|
+
:param overwrite: boolean - to overwrite the directory or not
|
54
|
+
:return: None
|
55
|
+
"""
|
56
|
+
if overwrite:
|
57
|
+
shutil.rmtree(path, ignore_errors=True)
|
58
|
+
|
59
|
+
os.makedirs(path)
|
60
|
+
|
61
|
+
for subdir in subdirs:
|
62
|
+
os.makedirs(os.path.join(path, subdir))
|
63
|
+
|
64
|
+
|
65
|
+
def load_checkpoint(model_path):
|
66
|
+
"""
|
67
|
+
Load model given the model path
|
68
|
+
:param model_path: str - path to model
|
69
|
+
:return: tagger - arabiner.trainers.BaseTrainer - the tagger model
|
70
|
+
vocab - arabicner.utils.data.Vocab - indexed tags
|
71
|
+
train_config - argparse.Namespace - training configurations
|
72
|
+
"""
|
73
|
+
with open(os.path.join(model_path, "tag_vocab.pkl"), "rb") as fh:
|
74
|
+
tag_vocab = pickle.load(fh)
|
75
|
+
|
76
|
+
# Load train configurations from checkpoint
|
77
|
+
train_config = Namespace()
|
78
|
+
with open(os.path.join(model_path, "args.json"), "r") as fh:
|
79
|
+
train_config.__dict__ = json.load(fh)
|
80
|
+
|
81
|
+
# Initialize the loss function, not used for inference, but evaluation
|
82
|
+
loss = load_object(train_config.loss["fn"], train_config.loss["kwargs"])
|
83
|
+
|
84
|
+
# Load BERT tagger
|
85
|
+
model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
|
86
|
+
model = torch.nn.DataParallel(model)
|
87
|
+
|
88
|
+
if torch.cuda.is_available():
|
89
|
+
model = model.cuda()
|
90
|
+
|
91
|
+
# Update arguments for the tagger
|
92
|
+
# Attach the model, loss (used for evaluation cases)
|
93
|
+
train_config.trainer_config["kwargs"]["model"] = model
|
94
|
+
train_config.trainer_config["kwargs"]["loss"] = loss
|
95
|
+
|
96
|
+
tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
|
97
|
+
tagger.load(os.path.join(model_path, "checkpoints"))
|
98
|
+
return tagger, tag_vocab, train_config
|
99
|
+
|
100
|
+
|
101
|
+
def set_seed(seed):
|
102
|
+
"""
|
103
|
+
Set the seed for random initialization and set
|
104
|
+
cuDNN parameters to ensure deterministic results across
|
105
|
+
multiple runs with the same seed
|
106
|
+
|
107
|
+
:param seed: int
|
108
|
+
"""
|
109
|
+
np.random.seed(seed)
|
110
|
+
random.seed(seed)
|
111
|
+
torch.manual_seed(seed)
|
112
|
+
torch.cuda.manual_seed(seed)
|
113
|
+
torch.cuda.manual_seed_all(seed)
|
114
|
+
|
115
|
+
torch.backends.cudnn.deterministic = True
|
116
|
+
torch.backends.cudnn.benchmark = False
|
117
|
+
torch.backends.cudnn.enabled = False
|
@@ -113,5 +113,5 @@ class BaseTrainer:
|
|
113
113
|
logger.info("Loading checkpoint %s", checkpoint_path)
|
114
114
|
|
115
115
|
device = None if torch.cuda.is_available() else torch.device('cpu')
|
116
|
-
checkpoint = torch.load(checkpoint_path, map_location=device)
|
117
|
-
self.model.load_state_dict(checkpoint["model"], strict=False)
|
116
|
+
checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
|
117
|
+
self.model.load_state_dict(checkpoint["model"], strict=False)
|