SinaTools 0.1.20.tar.gz → 0.1.22.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.20 → SinaTools-0.1.22}/PKG-INFO +1 -1
- {SinaTools-0.1.20 → SinaTools-0.1.22}/SinaTools.egg-info/PKG-INFO +1 -1
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/DataDownload/download_files.py +2 -2
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/DataDownload/downloader.py +37 -40
- SinaTools-0.1.22/sinatools/VERSION +1 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/wsd/__init__.py +2 -2
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/wsd/disambiguator.py +36 -33
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/wsd/settings.py +0 -6
- SinaTools-0.1.20/sinatools/VERSION +0 -1
- {SinaTools-0.1.20 → SinaTools-0.1.22}/AUTHORS.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/CONTRIBUTING.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/LICENSE +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/MANIFEST.in +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/README.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/SinaTools.egg-info/SOURCES.txt +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/SinaTools.egg-info/dependency_links.txt +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/SinaTools.egg-info/entry_points.txt +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/SinaTools.egg-info/not-zip-safe +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/SinaTools.egg-info/requires.txt +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/SinaTools.egg-info/top_level.txt +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/Makefile +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/build/_images/download.png +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/build/_static/download.png +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/build/_static/file.png +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/build/_static/minus.png +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/build/_static/plus.png +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/build/html/_images/SinaLogo.jpg +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/build/html/_images/download.png +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/build/html/_static/SinaLogo.jpg +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/build/html/_static/download.png +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/build/html/_static/file.png +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/build/html/_static/minus.png +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/build/html/_static/plus.png +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/make.bat +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/License.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/Overview.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/_static/SinaLogo.jpg +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/_static/download.png +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/about.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/DataDownload/downloader.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/DataDownload.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/arabiner/bin/infer.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/arabiner.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/morphology/morph_analyzer.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/morphology.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/salma/views.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/salma.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/utils/corpus_tokenizer.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/utils/implication.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/utils/jaccard.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/utils/parser.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/utils/sentence_tokenizer.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/utils/text_transliteration.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api/utils.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/api.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/authors.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/DataDownload/download_files.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/DataDownload/get_appdatadir.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/DataDownload.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/arabiner/infer.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/arabiner.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/morphology/ALMA_multi_word.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/morphology/morph_analyzer.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/morphology.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/salma/salma_tools.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/salma.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/utils/arStrip.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/utils/corpus_tokenizer.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/utils/implication.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/utils/jaccard.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/utils/latin_remove.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/utils/remove_punc.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/utils/sentence_tokenizer.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/utils/text_transliteration.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools/utils.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/cli_tools.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/conf.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/index.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/installation.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/docs/source/readme.rst +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/setup.cfg +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/setup.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/morphology/ALMA_multi_word.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/morphology/morph_analyzer.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/ner/corpus_entity_extractor.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/ner/entity_extractor.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/utils/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/utils/arStrip.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/utils/corpus_tokenizer.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/utils/implication.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/utils/jaccard.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/utils/remove_latin.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/utils/remove_punctuation.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/utils/sentence_tokenizer.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/utils/text_dublication_detector.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/CLI/utils/text_transliteration.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/DataDownload/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/arabert/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/arabert/create_classification_data.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/arabert/create_pretraining_data.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/arabert/extract_features.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/arabert/lamb_optimizer.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/arabert/modeling.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/arabert/optimization.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/arabert/run_classifier.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/arabert/run_pretraining.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/arabert/run_squad.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/arabert/tokenization.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/configure_finetuning.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/configure_pretraining.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/finetune/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/finetune/feature_spec.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/finetune/preprocessing.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/finetune/scorer.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/finetune/task.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/finetune/task_builder.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/flops_computation.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/model/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/model/modeling.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/model/optimization.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/model/tokenization.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/pretrain/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/run_finetuning.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/run_pretraining.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/util/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/util/training_utils.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/araelectra/util/utils.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/create_pretraining_data.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/gpt2/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/gpt2/optimization.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/grover/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/grover/dataloader.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/grover/modeling.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/grover/train_tpu.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/grover/utils.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/arabert/preprocess.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/environment.yml +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/install_env.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/morphology/ALMA_multi_word.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/morphology/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/morphology/morph_analyzer.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/data/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/data/datasets.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/data/transforms.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/data_format.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/datasets.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/entity_extractor.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/helpers.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/metrics.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/nn/BaseModel.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/nn/BertNestedTagger.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/nn/BertSeqTagger.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/nn/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/trainers/BaseTrainer.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/trainers/BertNestedTrainer.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/trainers/BertTrainer.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/trainers/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/ner/transforms.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/semantic_relatedness/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/semantic_relatedness/compute_relatedness.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/sinatools.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/synonyms/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/synonyms/synonyms_generator.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/utils/__init__.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/utils/charsets.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/utils/implication.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/utils/jaccard.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/utils/parser.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/utils/readfile.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/utils/text_dublication_detector.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/utils/text_transliteration.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/utils/tokenizer.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/utils/tokenizers_words.py +0 -0
- {SinaTools-0.1.20 → SinaTools-0.1.22}/sinatools/wsd/wsd.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SinaTools
-Version: 0.1.20
+Version: 0.1.22
 Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
 Home-page: https://github.com/SinaLab/sinatools
 License: MIT license
SinaTools.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SinaTools
-Version: 0.1.20
+Version: 0.1.22
 Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
 Home-page: https://github.com/SinaLab/sinatools
 License: MIT license
sinatools/CLI/DataDownload/download_files.py
@@ -2,7 +2,7 @@
 About:
 ------
 
-The download_files
+The download_files is a command-line interface for downloading various NLP resources from pre-specified URLs. It is a part of the sinatools package and provides options to choose which files to download and to specify a download directory. The tool automatically handles file extraction for zip and tar.gz files.
 
 Usage:
 ------
@@ -18,7 +18,7 @@ Below is the usage information that can be generated by running download_files -
 
 Options:
   -f, --files FILES
-       Names of the files to download. Available files are: ner, morph,
+       Names of the files to download. Available files are: ner, morph, wsd_model, wsd_tokenizer, glosses_dic, five_grams, four_grams, three_grams, two_grams, synonyms_level2, synonyms_level3.
        If no file is specified, all files will be downloaded.
 
 Examples:
sinatools/DataDownload/downloader.py
@@ -6,16 +6,17 @@ import zipfile
 from tqdm import tqdm
 import tarfile
 urls = {
-    'morph': 'https://portal.sina.birzeit.edu/
+    'morph': 'https://portal.sina.birzeit.edu/lemmas_dic.pickle',
     'ner': 'https://portal.sina.birzeit.edu/Wj27012000.tar.gz',
-    '
-    '
+    'wsd_model': 'https://portal.sina.birzeit.edu/bert-base-arabertv02_22_May_2021_00h_allglosses_unused01.zip',
+    'wsd_tokenizer': 'https://portal.sina.birzeit.edu/bert-base-arabertv02.zip',
     'glosses_dic': 'https://portal.sina.birzeit.edu/glosses_dic.pickle',
-    'lemma_dic': 'https://portal.sina.birzeit.edu/lemmas_dic.pickle',
     'five_grams': 'https://portal.sina.birzeit.edu/five_grams.pickle',
     'four_grams':'https://portal.sina.birzeit.edu/four_grams.pickle',
     'three_grams':'https://portal.sina.birzeit.edu/three_grams.pickle',
-    'two_grams':'https://portal.sina.birzeit.edu/two_grams.pickle'
+    'two_grams':'https://portal.sina.birzeit.edu/two_grams.pickle',
+    'synonyms_level2':'https://portal.sina.birzeit.edu/synonyms_level2.pkl',
+    'synonyms_level3':'https://portal.sina.birzeit.edu/synonyms_level3.pkl'
 }
 
 def get_appdatadir():
@@ -94,41 +95,37 @@ def download_file(url, dest_path=get_appdatadir()):
     print(filename)
     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
 
-    try:
-        (removed lines 98-127 are not rendered in this diff view)
-        if e.response.status_code == 403:
-            print(f'Error 403: Forbidden. The requested file URL {url} could not be downloaded due to insufficient permissions. Please check the URL and try again.')
-        else:
-            print('An error occurred while downloading the file:', e)
+    # try:
+    with requests.get(url, headers=headers, stream=True) as r:
+        r.raise_for_status()
+        with open(file_path, 'wb') as f:
+            total_size = int(r.headers.get('content-length', 0))
+            block_size = 8192
+            progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True)
+            for chunk in r.iter_content(chunk_size=block_size):
+                if chunk:
+                    f.write(chunk)
+                    progress_bar.update(len(chunk))
+            progress_bar.close()
+        # Check the file type and extract accordingly
+        file_extension = os.path.splitext(file_path)[1]
+        extracted_folder_name = os.path.splitext(file_path)[0]
+
+        if file_extension == '.zip':
+            extract_zip(file_path, extracted_folder_name)
+        elif file_extension == '.gz':
+            extract_tar(file_path, extracted_folder_name)
+        elif file_extension == '.pickle':
+            print(f'Done: {file_extension}')
+        else:
+            print(f'Unsupported file type for extraction: {file_extension}')
+        return file_path
+
+    # except requests.exceptions.HTTPError as e:
+    #     if e.response.status_code == 403:
+    #         print(f'Error 403: Forbidden. The requested file URL {url} could not be downloaded due to insufficient permissions. Please check the URL and try again.')
+    #     else:
+    #         print('An error occurred while downloading the file:', e)
 
 def extract_zip(file_path, extracted_folder_name):
     """
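For context, a minimal sketch of how the updated downloader could be driven from Python rather than the CLI. The urls table, get_appdatadir(), and the download_file() signature are taken from the diff above; the choice of the 'glosses_dic' resource is only an illustrative example, not a required step.

# Hedged usage sketch based on the 0.1.22 downloader shown above:
# download_file() streams the file into the app data directory and, for
# .zip / .gz archives, extracts it next to the download.
from sinatools.DataDownload import downloader

dest = downloader.get_appdatadir()                      # default destination
path = downloader.download_file(downloader.urls['glosses_dic'], dest)
print('saved to:', path)                                # returned by the new code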
SinaTools-0.1.22/sinatools/VERSION (new file)
@@ -0,0 +1 @@
+0.1.22
sinatools/wsd/__init__.py
@@ -3,9 +3,9 @@ import pickle
 from sinatools.DataDownload import downloader
 import os
 
-
+glosses_dic = {}
 filename = 'glosses_dic.pickle'
 path =downloader.get_appdatadir()
 file_path = os.path.join(path, filename)
 with open(file_path, 'rb') as f:
-
+    glosses_dic = pickle.load(f)
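To illustrate what this change exposes: glosses_dic is now a plain module-level dictionary loaded from the downloaded pickle, and, judging from how find_glosses_using_ALMA uses it later in this diff, each entry appears to be a (concept_count, glosses_json) pair keyed by lemma_id. A hedged sketch, assuming the data files have already been downloaded; the entry layout is an inference, not documented behavior.

# Hedged sketch; the entry layout is inferred from find_glosses_using_ALMA below.
import json
from sinatools.wsd import glosses_dic    # triggers the pickle load shown above

lemma_id = next(iter(glosses_dic))       # any key, purely for illustration
value = glosses_dic[lemma_id]
concept_count = value[0]                 # used as the concept count in the diff
glosses = json.loads(value[1])           # JSON-encoded gloss entries
print(lemma_id, concept_count, type(glosses))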
sinatools/wsd/disambiguator.py
@@ -7,6 +7,7 @@ from sinatools.utils.tokenizers_words import simple_word_tokenize
 from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
 from sinatools.morphology.morph_analyzer import analyze
 from sinatools.ner.entity_extractor import extract
+from . import glosses_dic
 
 
 def distill_entities(entities):
@@ -135,12 +136,9 @@ def find_two_word_lemma(input_sentence):
             glosses_list = []
             concept_count = 0
             ids = data[0]["ids"]
-            for
-
-
-                glosses_list.append(json.loads(value[1]))
-                concept_count = concept_count + value[0]
-
+            for concepts in ids:
+                glosses_list.append(json.loads(concepts))
+                concept_count = concept_count + data[0]["POS"]
             found_2Word_lemma = [two_grams, glosses_list, i, i + 1, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             output.append(found_2Word_lemma)
             i = i + 1
@@ -160,12 +158,9 @@ def find_three_word_lemma(input_sentence):
             glosses_list = []
             concept_count = 0
             ids = data[0]["ids"]
-            for
-
-
-                glosses_list.append(json.loads(value[1]))
-                concept_count = concept_count + value[0]
-
+            for concepts in ids:
+                glosses_list.append(json.loads(concepts))
+                concept_count = concept_count + data[0]["POS"]
             found_3Word_lemma = [three_grams, glosses_list, i, i + 2, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             output.append(found_3Word_lemma)
             i = i + 1
@@ -184,11 +179,9 @@ def find_four_word_lemma(input_sentence):
             glosses_list = []
             concept_count = 0
             ids = data[0]["ids"]
-            for
-
-
-                glosses_list.append(json.loads(value[1]))
-                concept_count = concept_count + value[0]
+            for concepts in ids:
+                glosses_list.append(json.loads(concepts))
+                concept_count = concept_count + data[0]["POS"]
             found_4Word_lemma = [four_grams, glosses_list, i, i + 3, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             output.append(found_4Word_lemma)
             i = i + 1
@@ -208,11 +201,9 @@ def find_five_word_lemma(input_sentence):
             glosses_list = []
             concept_count = 0
             ids = data[0]["ids"]
-            for
-
-
-                glosses_list.append(json.loads(value[1]))
-                concept_count = concept_count + value[0]
+            for concepts in ids:
+                glosses_list.append(json.loads(concepts))
+                concept_count = concept_count + data[0]["POS"]
             found_5Word_lemma = [five_grams, glosses_list, i, i + 4, concept_count, data[0]['undiac_multi_word_lemma'], data[0]['multi_word_lemma']]
             output.append(found_5Word_lemma)
             i = i + 1
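The same three-line rewrite is applied to all four n-gram lookups above (two- through five-word lemmas): instead of unpacking (count, glosses) pairs, the loop now treats each entry in data[0]["ids"] as a JSON-encoded gloss record and accumulates data[0]["POS"] as the count. A hedged consolidation of that shared pattern, with the shape of data assumed from the diff:

import json

def collect_multiword_glosses(data):
    """Sketch of the aggregation now shared by find_two/three/four/five_word_lemma."""
    glosses_list = []
    concept_count = 0
    for concepts in data[0]["ids"]:                      # each item: JSON-encoded glosses
        glosses_list.append(json.loads(concepts))
        concept_count = concept_count + data[0]["POS"]   # per-entry count, as in the new code
    return glosses_list, concept_count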
@@ -276,16 +267,18 @@ def find_glosses_using_ALMA(word):
     pos = data[0]["pos"]
     Undiac_lemma = arStrip(Diac_lemma, True, True, True, True, True, False) # Remove diacs , smallDiacs , shaddah , digit , alif , specialChars
     ids = []
-
+    # glosses_list = []
     concept_count = 0
     lemma_id = data[0]["lemma_id"]
-    if lemma_id in settings.glosses_dic.keys():
-        value = settings.glosses_dic[lemma_id]
-        glosses_list.append(json.loads(value[1]))
-        concept_count = concept_count + value[0]
 
-
+    if lemma_id in glosses_dic.keys():
+        value = glosses_dic[lemma_id]
+        glosses= json.loads(value[1])
+        # glosses_list.append(json.loads(value[1]))
+        concept_count = concept_count + value[0]
 
+    return word, Undiac_lemma, Diac_lemma, pos , concept_count, glosses
+
 def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, sentence):
     word = normalizearabert(word)
     glosses_dictionary = {}
@@ -309,7 +302,7 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
     return my_json
 
 
-def find_glosses(input_sentence,
+def find_glosses(input_sentence, two_word_lemma, three_word_lemma,four_word_lemma, five_word_lemma, ner):
     output_list = []
     position = 0
     while position < len(input_sentence):
@@ -376,7 +369,7 @@ def find_glosses(input_sentence, three_word_lemma, two_word_lemma, four_word_lem
             position = position + 1
 
 
-
+
         output_from_ner = delete_form_list(position, ner)
         ner = output_from_ner[0]
         if output_from_ner[1] != []:
@@ -385,11 +378,13 @@ def find_glosses(input_sentence, three_word_lemma, two_word_lemma, four_word_lem
             my_json = {}
             word = output_from_ner[1][0][0]
             my_json['word'] = word
-            my_json['concept_count'] = output_from_ner[1][0][2]
+            # my_json['concept_count'] = output_from_ner[1][0][2]
+            my_json['concept_count'] = '*'
             my_json['glosses'] = output_from_ner[1][0][1]
             my_json['Diac_lemma'] = output_from_ner[1][0][4]
             my_json['Undiac_lemma'] = output_from_ner[1][0][3]
             output_list.append(my_json)
+            # print("output list: ", output_list)
             position = position + 1
 
         if flag == "False": # Not found in ner or in multi_word_dictionary, ASK ALMA
@@ -417,6 +412,15 @@ def disambiguate_glosses_main(word, sentence):
         my_json = {}
         my_json['word'] = word['word']
         glosses = word['glosses'][0]
+        # my_json['Gloss'] = glosses['gloss']
+        my_json['Concept_id'] = glosses['concept_id']
+        my_json['Diac_lemma'] = word['Diac_lemma']
+        my_json['Undiac_lemma'] = word['Undiac_lemma']
+        return my_json
+    elif concept_count == '*':
+        my_json = {}
+        my_json['word'] = word['word']
+        glosses = word['glosses'][0]
         my_json['Gloss'] = glosses['gloss']
         my_json['Concept_id'] = glosses['concept_id']
         my_json['Diac_lemma'] = word['Diac_lemma']
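Reading the hunk above together with the '*' sentinel introduced in find_glosses, the routing in disambiguate_glosses_main now looks roughly like the sketch below. The guard on the first branch sits outside the hunk, so it is assumed here to be the single-concept case; treat this as an interpretation of the diff, not the exact source.

def route(word_entry):
    """Hedged sketch of the branch structure after this change."""
    count = word_entry['concept_count']
    gloss0 = word_entry['glosses'][0]
    base = {'word': word_entry['word'],
            'Diac_lemma': word_entry['Diac_lemma'],
            'Undiac_lemma': word_entry['Undiac_lemma']}
    if count == 1:                   # assumed guard: exactly one candidate concept
        base['Concept_id'] = gloss0['concept_id']     # 'Gloss' is commented out in this branch
        return base
    if count == '*':                 # named entity marked by find_glosses
        base.update({'Gloss': gloss0['gloss'], 'Concept_id': gloss0['concept_id']})
        return base
    # otherwise the candidates go through disambiguate_glosses_using_SALMA(...)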
@@ -444,8 +448,7 @@ def WSD(sentence):
 
     ner = find_named_entities(" ".join(input_sentence))
 
-    output_list = find_glosses(input_sentence,
-
+    output_list = find_glosses(input_sentence, two_word_lemma, three_word_lemma, four_word_lemma, five_word_lemma, ner)
     results = []
    for word in output_list:
         results.append(disambiguate_glosses_main(word, sentence))
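Putting the pieces together, the public entry point keeps the name WSD(sentence). A hedged usage sketch: the import path follows this file's location, and the per-word output keys are inferred from disambiguate_glosses_main above, so the exact return shape should be treated as an assumption.

from sinatools.wsd.disambiguator import WSD

# Any Arabic sentence; the result is expected to be a list of per-word dicts
# with keys such as 'word', 'Concept_id', 'Diac_lemma' and 'Undiac_lemma'.
results = WSD("ذهب الطالب إلى جامعة بيرزيت")
for item in results:
    print(item.get('word'), item.get('Concept_id'))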
sinatools/wsd/settings.py
@@ -9,7 +9,6 @@ import pandas as pd
 from sinatools.DataDownload import downloader
 import os
 
-glosses_dic = {}
 
 model_file_name = "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"
 path =downloader.get_appdatadir()
@@ -21,11 +20,6 @@ tokenizer_file_path = os.path.join(path, tokenizer_file_name)
 
 dftrue = pd.DataFrame()
 
-# model = BertForSequenceClassification.from_pretrained('{}'.format("bert-base-arabertv02_22_May_2021_00h_allglosses_unused01"),
-#                                                        output_hidden_states = True,
-#                                                        num_labels=2
-#                                                        )
-
 model = BertForSequenceClassification.from_pretrained(model_file_path, output_hidden_states=True, num_labels=2)
 
 tokenizer = BertTokenizer.from_pretrained('{}'.format(tokenizer_file_path))
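For orientation, a hedged sketch of how settings.py resolves the WSD model after this cleanup. The model directory name and the from_pretrained arguments appear in the diff; the tokenizer directory name is an assumption based on the wsd_tokenizer archive added to the downloader, and the BertForSequenceClassification/BertTokenizer classes are assumed to come from the transformers package.

import os
from transformers import BertForSequenceClassification, BertTokenizer  # assumed import source
from sinatools.DataDownload import downloader

path = downloader.get_appdatadir()
model_file_path = os.path.join(path, "bert-base-arabertv02_22_May_2021_00h_allglosses_unused01")
tokenizer_file_path = os.path.join(path, "bert-base-arabertv02")  # hypothetical, from the wsd_tokenizer URL

model = BertForSequenceClassification.from_pretrained(model_file_path,
                                                      output_hidden_states=True,
                                                      num_labels=2)
tokenizer = BertTokenizer.from_pretrained(tokenizer_file_path)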
SinaTools-0.1.20/sinatools/VERSION (removed)
@@ -1 +0,0 @@
-0.1.20