SinaTools 0.1.37__tar.gz → 0.1.38__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sinatools-0.1.37 → sinatools-0.1.38}/PKG-INFO +1 -2
- {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/PKG-INFO +1 -2
- {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/SOURCES.txt +0 -50
- {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/requires.txt +0 -1
- {sinatools-0.1.37 → sinatools-0.1.38}/setup.py +1 -1
- sinatools-0.1.38/sinatools/VERSION +1 -0
- sinatools-0.1.37/sinatools/VERSION +0 -1
- sinatools-0.1.37/sinatools/arabert/arabert/__init__.py +0 -14
- sinatools-0.1.37/sinatools/arabert/arabert/create_classification_data.py +0 -260
- sinatools-0.1.37/sinatools/arabert/arabert/create_pretraining_data.py +0 -534
- sinatools-0.1.37/sinatools/arabert/arabert/extract_features.py +0 -444
- sinatools-0.1.37/sinatools/arabert/arabert/lamb_optimizer.py +0 -158
- sinatools-0.1.37/sinatools/arabert/arabert/modeling.py +0 -1027
- sinatools-0.1.37/sinatools/arabert/arabert/optimization.py +0 -202
- sinatools-0.1.37/sinatools/arabert/arabert/run_classifier.py +0 -1078
- sinatools-0.1.37/sinatools/arabert/arabert/run_pretraining.py +0 -593
- sinatools-0.1.37/sinatools/arabert/arabert/run_squad.py +0 -1440
- sinatools-0.1.37/sinatools/arabert/arabert/tokenization.py +0 -414
- sinatools-0.1.37/sinatools/arabert/araelectra/__init__.py +0 -1
- sinatools-0.1.37/sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -103
- sinatools-0.1.37/sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -230
- sinatools-0.1.37/sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -90
- sinatools-0.1.37/sinatools/arabert/araelectra/configure_finetuning.py +0 -172
- sinatools-0.1.37/sinatools/arabert/araelectra/configure_pretraining.py +0 -143
- sinatools-0.1.37/sinatools/arabert/araelectra/finetune/__init__.py +0 -14
- sinatools-0.1.37/sinatools/arabert/araelectra/finetune/feature_spec.py +0 -56
- sinatools-0.1.37/sinatools/arabert/araelectra/finetune/preprocessing.py +0 -173
- sinatools-0.1.37/sinatools/arabert/araelectra/finetune/scorer.py +0 -54
- sinatools-0.1.37/sinatools/arabert/araelectra/finetune/task.py +0 -74
- sinatools-0.1.37/sinatools/arabert/araelectra/finetune/task_builder.py +0 -70
- sinatools-0.1.37/sinatools/arabert/araelectra/flops_computation.py +0 -215
- sinatools-0.1.37/sinatools/arabert/araelectra/model/__init__.py +0 -14
- sinatools-0.1.37/sinatools/arabert/araelectra/model/modeling.py +0 -1029
- sinatools-0.1.37/sinatools/arabert/araelectra/model/optimization.py +0 -193
- sinatools-0.1.37/sinatools/arabert/araelectra/model/tokenization.py +0 -355
- sinatools-0.1.37/sinatools/arabert/araelectra/pretrain/__init__.py +0 -14
- sinatools-0.1.37/sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -160
- sinatools-0.1.37/sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -229
- sinatools-0.1.37/sinatools/arabert/araelectra/run_finetuning.py +0 -323
- sinatools-0.1.37/sinatools/arabert/araelectra/run_pretraining.py +0 -469
- sinatools-0.1.37/sinatools/arabert/araelectra/util/__init__.py +0 -14
- sinatools-0.1.37/sinatools/arabert/araelectra/util/training_utils.py +0 -112
- sinatools-0.1.37/sinatools/arabert/araelectra/util/utils.py +0 -109
- sinatools-0.1.37/sinatools/arabert/aragpt2/__init__.py +0 -2
- sinatools-0.1.37/sinatools/arabert/aragpt2/create_pretraining_data.py +0 -95
- sinatools-0.1.37/sinatools/arabert/aragpt2/gpt2/__init__.py +0 -2
- sinatools-0.1.37/sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -158
- sinatools-0.1.37/sinatools/arabert/aragpt2/gpt2/optimization.py +0 -225
- sinatools-0.1.37/sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -397
- sinatools-0.1.37/sinatools/arabert/aragpt2/grover/dataloader.py +0 -161
- sinatools-0.1.37/sinatools/arabert/aragpt2/grover/modeling.py +0 -803
- sinatools-0.1.37/sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -1196
- sinatools-0.1.37/sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -234
- sinatools-0.1.37/sinatools/arabert/aragpt2/grover/train_tpu.py +0 -187
- sinatools-0.1.37/sinatools/arabert/aragpt2/grover/utils.py +0 -234
- sinatools-0.1.37/sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -59
- sinatools-0.1.37/sinatools/utils/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/AUTHORS.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/CONTRIBUTING.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/LICENSE +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/MANIFEST.in +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/README.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/dependency_links.txt +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/entry_points.txt +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/not-zip-safe +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/SinaTools.egg-info/top_level.txt +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/Makefile +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/_images/download.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/_static/download.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/_static/file.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/_static/minus.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/_static/plus.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_images/SinaLogo.jpg +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_images/download.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_static/SinaLogo.jpg +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_static/download.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_static/file.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_static/minus.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/build/html/_static/plus.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/make.bat +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/License.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/Overview.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/_static/SinaLogo.jpg +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/_static/download.png +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/about.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/DataDownload/downloader.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/DataDownload.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/arabiner/bin/infer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/arabiner.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/morphology/morph_analyzer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/morphology.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/salma/views.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/salma.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils/corpus_tokenizer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils/implication.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils/jaccard.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils/parser.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils/sentence_tokenizer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils/text_transliteration.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api/utils.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/api.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/authors.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/DataDownload/download_files.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/DataDownload/get_appdatadir.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/DataDownload.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/arabiner/infer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/arabiner.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/morphology/ALMA_multi_word.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/morphology/morph_analyzer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/morphology.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/salma/salma_tools.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/salma.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/arStrip.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/corpus_tokenizer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/implication.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/jaccard.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/latin_remove.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/remove_punc.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/sentence_tokenizer.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils/text_transliteration.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools/utils.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/cli_tools.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/conf.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/index.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/installation.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/docs/source/readme.rst +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/setup.cfg +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/DataDownload/download_files.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/morphology/ALMA_multi_word.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/morphology/morph_analyzer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/ner/corpus_entity_extractor.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/ner/entity_extractor.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/arStrip.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/corpus_tokenizer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/implication.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/jaccard.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/remove_latin.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/remove_punctuation.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/sentence_tokenizer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/text_dublication_detector.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/CLI/utils/text_transliteration.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/DataDownload/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/DataDownload/downloader.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/arabert/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/arabert/preprocess.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/environment.yml +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/install_env.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/morphology/ALMA_multi_word.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/morphology/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/morphology/morph_analyzer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/data/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/data/datasets.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/data/transforms.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/data_format.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/datasets.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/entity_extractor.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/helpers.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/metrics.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/nn/BaseModel.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/nn/BertNestedTagger.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/nn/BertSeqTagger.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/nn/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/trainers/BaseTrainer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/trainers/BertNestedTrainer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/trainers/BertTrainer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/trainers/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/ner/transforms.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/relations/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/relations/relation_extractor.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/semantic_relatedness/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/semantic_relatedness/compute_relatedness.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/sinatools.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/synonyms/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/synonyms/synonyms_generator.py +0 -0
- {sinatools-0.1.37/sinatools/arabert/aragpt2/grover → sinatools-0.1.38/sinatools/utils}/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/charsets.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/parser.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/readfile.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/similarity.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/text_dublication_detector.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/text_transliteration.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/tokenizer.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/tokenizers_words.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/utils/word_compare.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/wsd/__init__.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/wsd/disambiguator.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/wsd/settings.py +0 -0
- {sinatools-0.1.37 → sinatools-0.1.38}/sinatools/wsd/wsd.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: SinaTools
|
3
|
-
Version: 0.1.37
|
3
|
+
Version: 0.1.38
|
4
4
|
Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
|
5
5
|
Home-page: https://github.com/SinaLab/sinatools
|
6
6
|
License: MIT license
|
@@ -12,7 +12,6 @@ Requires-Dist: six
|
|
12
12
|
Requires-Dist: farasapy
|
13
13
|
Requires-Dist: tqdm
|
14
14
|
Requires-Dist: requests
|
15
|
-
Requires-Dist: regex
|
16
15
|
Requires-Dist: pathlib
|
17
16
|
Requires-Dist: torch==1.13.0
|
18
17
|
Requires-Dist: transformers==4.24.0
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: SinaTools
|
3
|
-
Version: 0.1.37
|
3
|
+
Version: 0.1.38
|
4
4
|
Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
|
5
5
|
Home-page: https://github.com/SinaLab/sinatools
|
6
6
|
License: MIT license
|
@@ -12,7 +12,6 @@ Requires-Dist: six
|
|
12
12
|
Requires-Dist: farasapy
|
13
13
|
Requires-Dist: tqdm
|
14
14
|
Requires-Dist: requests
|
15
|
-
Requires-Dist: regex
|
16
15
|
Requires-Dist: pathlib
|
17
16
|
Requires-Dist: torch==1.13.0
|
18
17
|
Requires-Dist: transformers==4.24.0
|
@@ -96,56 +96,6 @@ sinatools/DataDownload/__init__.py
|
|
96
96
|
sinatools/DataDownload/downloader.py
|
97
97
|
sinatools/arabert/__init__.py
|
98
98
|
sinatools/arabert/preprocess.py
|
99
|
-
sinatools/arabert/arabert/__init__.py
|
100
|
-
sinatools/arabert/arabert/create_classification_data.py
|
101
|
-
sinatools/arabert/arabert/create_pretraining_data.py
|
102
|
-
sinatools/arabert/arabert/extract_features.py
|
103
|
-
sinatools/arabert/arabert/lamb_optimizer.py
|
104
|
-
sinatools/arabert/arabert/modeling.py
|
105
|
-
sinatools/arabert/arabert/optimization.py
|
106
|
-
sinatools/arabert/arabert/run_classifier.py
|
107
|
-
sinatools/arabert/arabert/run_pretraining.py
|
108
|
-
sinatools/arabert/arabert/run_squad.py
|
109
|
-
sinatools/arabert/arabert/tokenization.py
|
110
|
-
sinatools/arabert/araelectra/__init__.py
|
111
|
-
sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py
|
112
|
-
sinatools/arabert/araelectra/build_pretraining_dataset.py
|
113
|
-
sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py
|
114
|
-
sinatools/arabert/araelectra/configure_finetuning.py
|
115
|
-
sinatools/arabert/araelectra/configure_pretraining.py
|
116
|
-
sinatools/arabert/araelectra/flops_computation.py
|
117
|
-
sinatools/arabert/araelectra/run_finetuning.py
|
118
|
-
sinatools/arabert/araelectra/run_pretraining.py
|
119
|
-
sinatools/arabert/araelectra/finetune/__init__.py
|
120
|
-
sinatools/arabert/araelectra/finetune/feature_spec.py
|
121
|
-
sinatools/arabert/araelectra/finetune/preprocessing.py
|
122
|
-
sinatools/arabert/araelectra/finetune/scorer.py
|
123
|
-
sinatools/arabert/araelectra/finetune/task.py
|
124
|
-
sinatools/arabert/araelectra/finetune/task_builder.py
|
125
|
-
sinatools/arabert/araelectra/model/__init__.py
|
126
|
-
sinatools/arabert/araelectra/model/modeling.py
|
127
|
-
sinatools/arabert/araelectra/model/optimization.py
|
128
|
-
sinatools/arabert/araelectra/model/tokenization.py
|
129
|
-
sinatools/arabert/araelectra/pretrain/__init__.py
|
130
|
-
sinatools/arabert/araelectra/pretrain/pretrain_data.py
|
131
|
-
sinatools/arabert/araelectra/pretrain/pretrain_helpers.py
|
132
|
-
sinatools/arabert/araelectra/util/__init__.py
|
133
|
-
sinatools/arabert/araelectra/util/training_utils.py
|
134
|
-
sinatools/arabert/araelectra/util/utils.py
|
135
|
-
sinatools/arabert/aragpt2/__init__.py
|
136
|
-
sinatools/arabert/aragpt2/create_pretraining_data.py
|
137
|
-
sinatools/arabert/aragpt2/train_bpe_tokenizer.py
|
138
|
-
sinatools/arabert/aragpt2/gpt2/__init__.py
|
139
|
-
sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py
|
140
|
-
sinatools/arabert/aragpt2/gpt2/optimization.py
|
141
|
-
sinatools/arabert/aragpt2/gpt2/run_pretraining.py
|
142
|
-
sinatools/arabert/aragpt2/grover/__init__.py
|
143
|
-
sinatools/arabert/aragpt2/grover/dataloader.py
|
144
|
-
sinatools/arabert/aragpt2/grover/modeling.py
|
145
|
-
sinatools/arabert/aragpt2/grover/modeling_gpt2.py
|
146
|
-
sinatools/arabert/aragpt2/grover/optimization_adafactor.py
|
147
|
-
sinatools/arabert/aragpt2/grover/train_tpu.py
|
148
|
-
sinatools/arabert/aragpt2/grover/utils.py
|
149
99
|
sinatools/morphology/ALMA_multi_word.py
|
150
100
|
sinatools/morphology/__init__.py
|
151
101
|
sinatools/morphology/morph_analyzer.py
|
@@ -0,0 +1 @@
|
|
1
|
+
0.1.38
|
@@ -1 +0,0 @@
|
|
1
|
-
0.1.37
|
@@ -1,14 +0,0 @@
|
|
1
|
-
# coding=utf-8
|
2
|
-
# Copyright 2018 The Google AI Language Team Authors.
|
3
|
-
#
|
4
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
5
|
-
# you may not use this file except in compliance with the License.
|
6
|
-
# You may obtain a copy of the License at
|
7
|
-
#
|
8
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
9
|
-
#
|
10
|
-
# Unless required by applicable law or agreed to in writing, software
|
11
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
12
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
13
|
-
# See the License for the specific language governing permissions and
|
14
|
-
# limitations under the License.
|
@@ -1,260 +0,0 @@
|
|
1
|
-
# Scripts used to pre_process and create the data for classifier evaluation
|
2
|
-
#%%
|
3
|
-
import pandas as pd
|
4
|
-
from sklearn.model_selection import train_test_split
|
5
|
-
|
6
|
-
import sys
|
7
|
-
sys.path.append("..")
|
8
|
-
|
9
|
-
from arabert.preprocess import ArabertPreprocessor
|
10
|
-
|
11
|
-
|
12
|
-
from tqdm import tqdm
|
13
|
-
|
14
|
-
tqdm.pandas()
|
15
|
-
|
16
|
-
from tokenization import FullTokenizer
|
17
|
-
from run_classifier import input_fn_builder, model_fn_builder
|
18
|
-
|
19
|
-
|
20
|
-
model_name = "bert-base-arabert"
|
21
|
-
arabert_prep = ArabertPreprocessor(model_name=model_name, keep_emojis=False)
|
22
|
-
|
23
|
-
|
24
|
-
class Dataset:
|
25
|
-
def __init__(
|
26
|
-
self,
|
27
|
-
name,
|
28
|
-
train,
|
29
|
-
test,
|
30
|
-
label_list,
|
31
|
-
train_InputExamples=None,
|
32
|
-
test_InputExamples=None,
|
33
|
-
train_features=None,
|
34
|
-
test_features=None,
|
35
|
-
):
|
36
|
-
self.name = name
|
37
|
-
self.train = train
|
38
|
-
self.test = test
|
39
|
-
self.label_list = label_list
|
40
|
-
self.train_InputExamples = train_InputExamples
|
41
|
-
self.test_InputExamples = test_InputExamples
|
42
|
-
self.train_features = train_features
|
43
|
-
self.test_features = test_features
|
44
|
-
|
45
|
-
|
46
|
-
all_datasets = []
|
47
|
-
#%%
|
48
|
-
# *************HARD************
|
49
|
-
df_HARD = pd.read_csv("Datasets\\HARD\\balanced-reviews-utf8.tsv", sep="\t", header=0)
|
50
|
-
|
51
|
-
df_HARD = df_HARD[["rating", "review"]] # we are interested in rating and review only
|
52
|
-
# code rating as +ve if > 3, -ve if less, no 3s in dataset
|
53
|
-
df_HARD["rating"] = df_HARD["rating"].apply(lambda x: 0 if x < 3 else 1)
|
54
|
-
# rename columns to fit default constructor in fastai
|
55
|
-
df_HARD.columns = ["label", "text"]
|
56
|
-
df_HARD["text"] = df_HARD["text"].progress_apply(
|
57
|
-
lambda x: arabert_prep.preprocess(
|
58
|
-
x
|
59
|
-
)
|
60
|
-
)
|
61
|
-
train_HARD, test_HARD = train_test_split(df_HARD, test_size=0.2, random_state=42)
|
62
|
-
label_list_HARD = [0, 1]
|
63
|
-
|
64
|
-
data_Hard = Dataset("HARD", train_HARD, test_HARD, label_list_HARD)
|
65
|
-
all_datasets.append(data_Hard)
|
66
|
-
|
67
|
-
#%%
|
68
|
-
# *************ASTD-Unbalanced************
|
69
|
-
df_ASTD_UN = pd.read_csv(
|
70
|
-
"Datasets\\ASTD-master\\data\\Tweets.txt", sep="\t", header=None
|
71
|
-
)
|
72
|
-
|
73
|
-
DATA_COLUMN = "text"
|
74
|
-
LABEL_COLUMN = "label"
|
75
|
-
df_ASTD_UN.columns = [DATA_COLUMN, LABEL_COLUMN]
|
76
|
-
|
77
|
-
df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
|
78
|
-
lambda x: 0 if (x == "NEG") else x
|
79
|
-
)
|
80
|
-
df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
|
81
|
-
lambda x: 1 if (x == "POS") else x
|
82
|
-
)
|
83
|
-
df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
|
84
|
-
lambda x: 2 if (x == "NEUTRAL") else x
|
85
|
-
)
|
86
|
-
df_ASTD_UN[LABEL_COLUMN] = df_ASTD_UN[LABEL_COLUMN].apply(
|
87
|
-
lambda x: 3 if (x == "OBJ") else x
|
88
|
-
)
|
89
|
-
df_ASTD_UN["text"] = df_ASTD_UN["text"].progress_apply(
|
90
|
-
lambda x: arabert_prep.preprocess(
|
91
|
-
x
|
92
|
-
)
|
93
|
-
)
|
94
|
-
train_ASTD_UN, test_ASTD_UN = train_test_split(
|
95
|
-
df_ASTD_UN, test_size=0.2, random_state=42
|
96
|
-
)
|
97
|
-
label_list_ASTD_UN = [0, 1, 2, 3]
|
98
|
-
|
99
|
-
data_ASTD_UN = Dataset(
|
100
|
-
"ASTD-Unbalanced", train_ASTD_UN, test_ASTD_UN, label_list_ASTD_UN
|
101
|
-
)
|
102
|
-
all_datasets.append(data_ASTD_UN)
|
103
|
-
#%%
|
104
|
-
# *************ASTD-Dahou-Balanced************
|
105
|
-
|
106
|
-
df_ASTD_B = pd.read_csv(
|
107
|
-
"Datasets\\Dahou\\data_csv_balanced\\ASTD-balanced-not-linked.csv",
|
108
|
-
sep=",",
|
109
|
-
header=0,
|
110
|
-
)
|
111
|
-
|
112
|
-
df_ASTD_B.columns = [DATA_COLUMN, LABEL_COLUMN]
|
113
|
-
|
114
|
-
df_ASTD_B[LABEL_COLUMN] = df_ASTD_B[LABEL_COLUMN].apply(lambda x: 0 if (x == -1) else x)
|
115
|
-
df_ASTD_B["text"] = df_ASTD_B["text"].progress_apply(
|
116
|
-
lambda x: arabert_prep.preprocess(
|
117
|
-
x
|
118
|
-
)
|
119
|
-
)
|
120
|
-
train_ASTD_B, test_ASTD_B = train_test_split(df_ASTD_B, test_size=0.2, random_state=42)
|
121
|
-
label_list_ASTD_B = [0, 1]
|
122
|
-
|
123
|
-
data_ASTD_B = Dataset(
|
124
|
-
"ASTD-Dahou-Balanced", train_ASTD_B, test_ASTD_B, label_list_ASTD_B
|
125
|
-
)
|
126
|
-
all_datasets.append(data_ASTD_B)
|
127
|
-
|
128
|
-
#%%
|
129
|
-
# *************ArSenTD-LEV************
|
130
|
-
df_ArSenTD = pd.read_csv(
|
131
|
-
"Datasets\\ArSenTD-LEV\\ArSenTD-LEV-processed-no-emojis2.csv", sep=",", header=0
|
132
|
-
)
|
133
|
-
|
134
|
-
df_ArSenTD.columns = [DATA_COLUMN, LABEL_COLUMN]
|
135
|
-
|
136
|
-
df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
|
137
|
-
lambda x: 0 if (x == "very_negative") else x
|
138
|
-
)
|
139
|
-
df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
|
140
|
-
lambda x: 1 if (x == "negative") else x
|
141
|
-
)
|
142
|
-
df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
|
143
|
-
lambda x: 2 if (x == "neutral") else x
|
144
|
-
)
|
145
|
-
df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
|
146
|
-
lambda x: 3 if (x == "positive") else x
|
147
|
-
)
|
148
|
-
df_ArSenTD[LABEL_COLUMN] = df_ArSenTD[LABEL_COLUMN].apply(
|
149
|
-
lambda x: 4 if (x == "very_positive") else x
|
150
|
-
)
|
151
|
-
df_ArSenTD["text"] = df_ArSenTD["text"].progress_apply(
|
152
|
-
lambda x: arabert_prep.preprocess(
|
153
|
-
x
|
154
|
-
)
|
155
|
-
)
|
156
|
-
label_list_ArSenTD = [0, 1, 2, 3, 4]
|
157
|
-
|
158
|
-
train_ArSenTD, test_ArSenTD = train_test_split(
|
159
|
-
df_ArSenTD, test_size=0.2, random_state=42
|
160
|
-
)
|
161
|
-
|
162
|
-
data_ArSenTD = Dataset("ArSenTD-LEV", train_ArSenTD, test_ArSenTD, label_list_ArSenTD)
|
163
|
-
all_datasets.append(data_ArSenTD)
|
164
|
-
|
165
|
-
#%%
|
166
|
-
# *************AJGT************
|
167
|
-
df_AJGT = pd.read_excel("Datasets\\Ajgt\\AJGT.xlsx", header=0)
|
168
|
-
|
169
|
-
df_AJGT = df_AJGT[["Feed", "Sentiment"]]
|
170
|
-
df_AJGT.columns = [DATA_COLUMN, LABEL_COLUMN]
|
171
|
-
|
172
|
-
df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
|
173
|
-
lambda x: 0 if (x == "Negative") else x
|
174
|
-
)
|
175
|
-
df_AJGT[LABEL_COLUMN] = df_AJGT[LABEL_COLUMN].apply(
|
176
|
-
lambda x: 1 if (x == "Positive") else x
|
177
|
-
)
|
178
|
-
df_AJGT["text"] = df_AJGT["text"].progress_apply(
|
179
|
-
lambda x: arabert_prep.preprocess(
|
180
|
-
x
|
181
|
-
)
|
182
|
-
)
|
183
|
-
train_AJGT, test_AJGT = train_test_split(df_AJGT, test_size=0.2, random_state=42)
|
184
|
-
label_list_AJGT = [0, 1]
|
185
|
-
|
186
|
-
data_AJGT = Dataset("AJGT", train_AJGT, test_AJGT, label_list_AJGT)
|
187
|
-
all_datasets.append(data_AJGT)
|
188
|
-
#%%
|
189
|
-
# *************LABR-UN-Binary************
|
190
|
-
from labr import LABR
|
191
|
-
|
192
|
-
labr_helper = LABR()
|
193
|
-
|
194
|
-
(d_train, y_train, d_test, y_test) = labr_helper.get_train_test(
|
195
|
-
klass="2", balanced="unbalanced"
|
196
|
-
)
|
197
|
-
|
198
|
-
train_LABR_B_U = pd.DataFrame({"text": d_train, "label": y_train})
|
199
|
-
test_LABR_B_U = pd.DataFrame({"text": d_test, "label": y_test})
|
200
|
-
|
201
|
-
train_LABR_B_U["text"] = train_LABR_B_U["text"].progress_apply(
|
202
|
-
lambda x: arabert_prep.preprocess(
|
203
|
-
x
|
204
|
-
)
|
205
|
-
)
|
206
|
-
test_LABR_B_U["text"] = test_LABR_B_U["text"].progress_apply(
|
207
|
-
lambda x: arabert_prep.preprocess(
|
208
|
-
x
|
209
|
-
)
|
210
|
-
)
|
211
|
-
label_list_LABR_B_U = [0, 1]
|
212
|
-
|
213
|
-
data_LABR_B_U = Dataset(
|
214
|
-
"LABR-UN-Binary", train_LABR_B_U, test_LABR_B_U, label_list_LABR_B_U
|
215
|
-
)
|
216
|
-
# all_datasets.append(data_LABR_B_U)
|
217
|
-
|
218
|
-
#%%
|
219
|
-
for data in tqdm(all_datasets):
|
220
|
-
# Use the InputExample class from BERT's run_classifier code to create examples from the data
|
221
|
-
data.train_InputExamples = data.train.apply(
|
222
|
-
lambda x: run_classifier.InputExample(
|
223
|
-
guid=None, # Globally unique ID for bookkeeping, unused in this example
|
224
|
-
text_a=x[DATA_COLUMN],
|
225
|
-
text_b=None,
|
226
|
-
label=x[LABEL_COLUMN],
|
227
|
-
),
|
228
|
-
axis=1,
|
229
|
-
)
|
230
|
-
|
231
|
-
data.test_InputExamples = data.test.apply(
|
232
|
-
lambda x: run_classifier.InputExample(
|
233
|
-
guid=None, text_a=x[DATA_COLUMN], text_b=None, label=x[LABEL_COLUMN]
|
234
|
-
),
|
235
|
-
axis=1,
|
236
|
-
)
|
237
|
-
#%%
|
238
|
-
# We'll set sequences to be at most 128 tokens long.
|
239
|
-
MAX_SEQ_LENGTH = 256
|
240
|
-
|
241
|
-
VOC_FNAME = "./64000_vocab_sp_70m.txt"
|
242
|
-
tokenizer = FullTokenizer(VOC_FNAME)
|
243
|
-
|
244
|
-
for data in tqdm(all_datasets):
|
245
|
-
# Convert our train and test features to InputFeatures that BERT understands.
|
246
|
-
data.train_features = run_classifier.convert_examples_to_features(
|
247
|
-
data.train_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
|
248
|
-
)
|
249
|
-
data.test_features = run_classifier.convert_examples_to_features(
|
250
|
-
data.test_InputExamples, data.label_list, MAX_SEQ_LENGTH, tokenizer
|
251
|
-
)
|
252
|
-
|
253
|
-
# %%
|
254
|
-
import pickle
|
255
|
-
|
256
|
-
with open("all_datasets_64k_farasa_256.pickle", "wb") as fp: # Pickling
|
257
|
-
pickle.dump(all_datasets, fp)
|
258
|
-
|
259
|
-
|
260
|
-
# %%
|