SinaTools 0.1.24__tar.gz → 0.1.26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.24 → SinaTools-0.1.26}/PKG-INFO +1 -1
- {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/PKG-INFO +1 -1
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/DataDownload/download_files.py +18 -3
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/ner/corpus_entity_extractor.py +1 -1
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/ner/entity_extractor.py +1 -1
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/DataDownload/downloader.py +35 -31
- SinaTools-0.1.26/sinatools/VERSION +1 -0
- SinaTools-0.1.26/sinatools/ner/entity_extractor.py +63 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/trainers/BaseTrainer.py +117 -117
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/wsd/disambiguator.py +19 -22
- SinaTools-0.1.24/sinatools/VERSION +0 -1
- SinaTools-0.1.24/sinatools/ner/entity_extractor.py +0 -72
- {SinaTools-0.1.24 → SinaTools-0.1.26}/AUTHORS.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/CONTRIBUTING.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/LICENSE +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/MANIFEST.in +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/README.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/SOURCES.txt +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/dependency_links.txt +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/entry_points.txt +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/not-zip-safe +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/requires.txt +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/top_level.txt +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/Makefile +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/_images/download.png +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/_static/download.png +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/_static/file.png +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/_static/minus.png +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/_static/plus.png +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_images/SinaLogo.jpg +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_images/download.png +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_static/SinaLogo.jpg +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_static/download.png +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_static/file.png +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_static/minus.png +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/build/html/_static/plus.png +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/make.bat +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/License.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/Overview.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/_static/SinaLogo.jpg +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/_static/download.png +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/about.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/DataDownload/downloader.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/DataDownload.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/arabiner/bin/infer.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/arabiner.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/morphology/morph_analyzer.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/morphology.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/salma/views.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/salma.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils/corpus_tokenizer.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils/implication.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils/jaccard.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils/parser.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils/sentence_tokenizer.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils/text_transliteration.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api/utils.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/api.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/authors.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/DataDownload/download_files.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/DataDownload/get_appdatadir.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/DataDownload.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/arabiner/infer.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/arabiner.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/morphology/ALMA_multi_word.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/morphology/morph_analyzer.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/morphology.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/salma/salma_tools.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/salma.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/arStrip.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/corpus_tokenizer.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/implication.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/jaccard.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/latin_remove.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/remove_punc.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/sentence_tokenizer.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils/text_transliteration.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools/utils.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/cli_tools.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/conf.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/index.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/installation.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/docs/source/readme.rst +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/setup.cfg +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/setup.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/morphology/ALMA_multi_word.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/morphology/morph_analyzer.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/arStrip.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/corpus_tokenizer.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/implication.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/jaccard.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/remove_latin.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/remove_punctuation.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/sentence_tokenizer.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/text_dublication_detector.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/utils/text_transliteration.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/DataDownload/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/create_classification_data.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/create_pretraining_data.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/extract_features.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/lamb_optimizer.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/modeling.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/optimization.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/run_classifier.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/run_pretraining.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/run_squad.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/arabert/tokenization.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/configure_finetuning.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/configure_pretraining.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/finetune/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/finetune/feature_spec.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/finetune/preprocessing.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/finetune/scorer.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/finetune/task.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/finetune/task_builder.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/flops_computation.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/model/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/model/modeling.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/model/optimization.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/model/tokenization.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/pretrain/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/run_finetuning.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/run_pretraining.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/util/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/util/training_utils.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/araelectra/util/utils.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/create_pretraining_data.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/gpt2/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/gpt2/optimization.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/dataloader.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/modeling.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/train_tpu.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/grover/utils.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/arabert/preprocess.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/environment.yml +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/install_env.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/morphology/ALMA_multi_word.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/morphology/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/morphology/morph_analyzer.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/data/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/data/datasets.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/data/transforms.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/data_format.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/datasets.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/helpers.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/metrics.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/nn/BaseModel.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/nn/BertNestedTagger.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/nn/BertSeqTagger.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/nn/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/trainers/BertNestedTrainer.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/trainers/BertTrainer.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/trainers/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/transforms.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/semantic_relatedness/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/semantic_relatedness/compute_relatedness.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/sinatools.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/synonyms/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/synonyms/synonyms_generator.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/charsets.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/implication.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/jaccard.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/parser.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/readfile.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/text_dublication_detector.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/text_transliteration.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/tokenizer.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/utils/tokenizers_words.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/wsd/__init__.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/wsd/settings.py +0 -0
- {SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/wsd/wsd.py +0 -0
{SinaTools-0.1.24 → SinaTools-0.1.26}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SinaTools
-Version: 0.1.24
+Version: 0.1.26
 Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
 Home-page: https://github.com/SinaLab/sinatools
 License: MIT license
{SinaTools-0.1.24 → SinaTools-0.1.26}/SinaTools.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SinaTools
-Version: 0.1.24
+Version: 0.1.26
 Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
 Home-page: https://github.com/SinaLab/sinatools
 License: MIT license
{SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/DataDownload/download_files.py
@@ -40,7 +40,7 @@ from sinatools.DataDownload.downloader import urls

 def main():
     parser = argparse.ArgumentParser(description="Download files from specified URLs.")
-    parser.add_argument('-f', '--files', nargs="*",
+    parser.add_argument('-f', '--files', nargs="*",
                         help="Names of the files to download. Available files are: "
                         f"{', '.join(urls.keys())}. If no file is specified, all files will be downloaded.")

@@ -50,8 +50,23 @@ def main():

     if args.files:
         for file in args.files:
-[2 removed lines not preserved in this rendering]
+            print("file: ", file)
+            if file == "wsd":
+                download_file(urls["morph"])
+                download_file(urls["ner"])
+                download_file(urls["wsd_model"])
+                download_file(urls["wsd_tokenizer"])
+                download_file(urls["glosses_dic"])
+                download_file(urls["five_grams"])
+                download_file(urls["four_grams"])
+                download_file(urls["three_grams"])
+                download_file(urls["two_grams"])
+            elif file == "synonyms":
+                download_file(urls["synonyms_level2"])
+                download_file(urls["synonyms_level3"])
+            else:
+                url = urls[file]
+                download_file(url)
     else:
         download_files()

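The point of this hunk is that a single CLI name now fans out to every resource a tool depends on, instead of mapping one name to one archive. A minimal sketch of the equivalent programmatic calls, using the `urls` keys shown in the hunk above (the console command name `download_files` is an assumption based on the script's filename):

    from sinatools.DataDownload.downloader import download_file, urls

    # Roughly what `download_files -f wsd` does after this change: the "wsd"
    # name pulls every resource the disambiguator depends on.
    for key in ("morph", "ner", "wsd_model", "wsd_tokenizer", "glosses_dic",
                "five_grams", "four_grams", "three_grams", "two_grams"):
        download_file(urls[key])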
{SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/CLI/ner/corpus_entity_extractor.py
@@ -20,7 +20,7 @@ def jsons_to_list_of_lists(json_list):
     return [[d['token'], d['tags']] for d in json_list]

 def combine_tags(sentence):
-    output = jsons_to_list_of_lists(extract(sentence))
+    output = jsons_to_list_of_lists(extract(sentence, "nested"))
     return [word[1] for word in output]


|
@@ -46,7 +46,7 @@ def jsons_to_list_of_lists(json_list):
|
|
46
46
|
return [[d['token'], d['tags']] for d in json_list]
|
47
47
|
|
48
48
|
def combine_tags(sentence):
|
49
|
-
output = jsons_to_list_of_lists(extract(sentence))
|
49
|
+
output = jsons_to_list_of_lists(extract(sentence, "nested"))
|
50
50
|
return [word[1] for word in output]
|
51
51
|
|
52
52
|
|
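Both CLI wrappers pass "nested" explicitly because the reworked sinatools.ner.entity_extractor.extract (the new file further below) takes ner_method as a required second positional argument. A sketch of what this means for callers:

    from sinatools.ner.entity_extractor import extract

    extract("ذهب محمد إلى جامعة بيرزيت")            # worked on 0.1.24, raises TypeError on 0.1.26
    extract("ذهب محمد إلى جامعة بيرزيت", "nested")  # 0.1.26: multi-label tags per token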
{SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/DataDownload/downloader.py
@@ -95,37 +95,41 @@ def download_file(url, dest_path=get_appdatadir()):
     print(filename)
     headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

-[31 removed lines not preserved in this rendering]
+    try:
+        with requests.get(url, headers=headers, stream=True) as r:
+            r.raise_for_status()
+            with open(file_path, 'wb') as f:
+                total_size = int(r.headers.get('content-length', 0))
+                block_size = 8192
+                progress_bar = tqdm(total=total_size, unit='iB', unit_scale=True)
+                for chunk in r.iter_content(chunk_size=block_size):
+                    if chunk:
+                        f.write(chunk)
+                        progress_bar.update(len(chunk))
+                progress_bar.close()
+
+        # Check the file type and extract accordingly
+        file_extension = os.path.splitext(file_path)[1]
+        extracted_folder_name = os.path.splitext(file_path)[0]
+
+        if file_extension == '.zip':
+            extract_zip(file_path, extracted_folder_name)
+        elif file_extension == '.gz':
+
+            extract_tar(file_path, extracted_folder_name)
+        elif file_extension == '.pickle':
+            print(f'Done: {file_extension}')
+
+        else:
+            print(f'Unsupported file type for extraction: {file_extension}')
+
+        return file_path
+
+    except requests.exceptions.HTTPError as e:
+        if e.response.status_code == 403:
+            print(f'Error 403: Forbidden. The requested file URL {url} could not be downloaded due to insufficient permissions. Please check the URL and try again.')
+        else:
+            print('An error occurred while downloading the file:', e)

 def extract_zip(file_path, extracted_folder_name):
     """
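The rewritten body streams the download and unpacks it in one pass. A usage sketch under the assumptions visible in the hunk (requests, tqdm, and the urls mapping are already available in the module):

    from sinatools.DataDownload.downloader import download_file, urls

    # Streams the archive in 8 KB chunks behind a tqdm progress bar, then
    # unpacks .zip/.gz downloads next to the file; on a 403 the function now
    # prints a permissions hint and returns None instead of raising.
    path = download_file(urls["ner"])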
SinaTools-0.1.26/sinatools/VERSION
@@ -0,0 +1 @@
+0.1.26
SinaTools-0.1.26/sinatools/ner/entity_extractor.py
@@ -0,0 +1,63 @@
+import os
+from collections import namedtuple
+from sinatools.ner.data_format import get_dataloaders, text2segments
+from . import tagger, tag_vocab, train_config
+
+
+def convert_nested_to_flat(nested_tags):
+    flat_tags = []
+
+    for entry in nested_tags:
+        word = entry['token']
+        tags = entry['tags'].split()
+
+        # Initialize with the first tag in the sequence
+        flat_tag = tags[0]
+
+        for tag in tags[1:]:
+            # Check if the tag is an "I-" tag, indicating continuation of an entity
+            if tag.startswith('I-'):
+                flat_tag = tag
+                break
+
+        flat_tags.append({
+            'token': word,
+            'tags': flat_tag
+        })
+
+    return flat_tags
+
+def extract(text, ner_method):
+
+    dataset, token_vocab = text2segments(text)
+
+    vocabs = namedtuple("Vocab", ["tags", "tokens"])
+    vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
+
+    dataloader = get_dataloaders(
+        (dataset,),
+        vocab,
+        train_config.data_config,
+        batch_size=32,
+        shuffle=(False,),
+    )[0]
+
+
+    segments = tagger.infer(dataloader)
+    segments_lists = []
+
+    for segment in segments:
+        for token in segment:
+            segments_list = {}
+            segments_list["token"] = token.text
+            list_of_tags = [t['tag'] for t in token.pred_tag]
+            list_of_tags = [i for i in list_of_tags if i not in ('O', ' ', '')]
+            if list_of_tags == []:
+                segments_list["tags"] = ' '.join(['O'])
+            else:
+                segments_list["tags"] = ' '.join(list_of_tags)
+            segments_lists.append(segments_list)
+
+    if ner_method == "flat":
+        segments_lists = convert_nested_to_flat(segments_lists)
+    return segments_lists
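The new module exposes the nested/flat choice at call time. A minimal sketch, reusing the example sentence and tags from the 0.1.24 docstring removed further below:

    from sinatools.ner.entity_extractor import extract

    sentence = "ذهب محمد إلى جامعة بيرزيت"
    nested = extract(sentence, "nested")  # e.g. {"token": "بيرزيت", "tags": "B-GPE I-ORG"}
    flat = extract(sentence, "flat")      # convert_nested_to_flat keeps one tag per token

Note that batch_size is now fixed at 32 inside extract; the 0.1.24 version accepted it as a keyword argument.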
{SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/ner/trainers/BaseTrainer.py
@@ -1,117 +1,117 @@
 import os
 import torch
 import logging
 import natsort
 import glob

 logger = logging.getLogger(__name__)


 class BaseTrainer:
     def __init__(
         self,
         model=None,
         max_epochs=50,
         optimizer=None,
         scheduler=None,
         loss=None,
         train_dataloader=None,
         val_dataloader=None,
         test_dataloader=None,
         log_interval=10,
         summary_writer=None,
         output_path=None,
         clip=5,
         patience=5
     ):
         self.model = model
         self.max_epochs = max_epochs
         self.train_dataloader = train_dataloader
         self.val_dataloader = val_dataloader
         self.test_dataloader = test_dataloader
         self.optimizer = optimizer
         self.scheduler = scheduler
         self.loss = loss
         self.log_interval = log_interval
         self.summary_writer = summary_writer
         self.output_path = output_path
         self.current_timestep = 0
         self.current_epoch = 0
         self.clip = clip
         self.patience = patience

     def tag(self, dataloader, is_train=True):
         """
         Given a dataloader containing segments, predict the tags
         :param dataloader: torch.utils.data.DataLoader
         :param is_train: boolean - True for training model, False for evaluation
         :return: Iterator
                  subwords (B x T x NUM_LABELS) - torch.Tensor - BERT subword ID
                  gold_tags (B x T x NUM_LABELS) - torch.Tensor - ground truth tags IDs
                  tokens - List[arabiner.data.dataset.Token] - list of tokens
                  valid_len (B x 1) - int - valid length of each sequence
                  logits (B x T x NUM_LABELS) - logits for each token and each tag
         """
         for subwords, gold_tags, tokens, valid_len in dataloader:
             self.model.train(is_train)

             if torch.cuda.is_available():
                 subwords = subwords.cuda()
                 gold_tags = gold_tags.cuda()

             if is_train:
                 self.optimizer.zero_grad()
                 logits = self.model(subwords)
             else:
                 with torch.no_grad():
                     logits = self.model(subwords)

             yield subwords, gold_tags, tokens, valid_len, logits

     def segments_to_file(self, segments, filename):
         """
         Write segments to file
         :param segments: [List[arabiner.data.dataset.Token]] - list of list of tokens
         :param filename: str - output filename
         :return: None
         """
         with open(filename, "w") as fh:
             results = "\n\n".join(["\n".join([t.__str__() for t in segment]) for segment in segments])
             fh.write("Token\tGold Tag\tPredicted Tag\n")
             fh.write(results)
             logging.info("Predictions written to %s", filename)

     def save(self):
         """
         Save model checkpoint
         :return:
         """
         filename = os.path.join(
             self.output_path,
             "checkpoints",
             "checkpoint_{}.pt".format(self.current_epoch),
         )

         checkpoint = {
             "model": self.model.state_dict(),
             "optimizer": self.optimizer.state_dict(),
             "epoch": self.current_epoch
         }

         logger.info("Saving checkpoint to %s", filename)
         torch.save(checkpoint, filename)

     def load(self, checkpoint_path):
         """
         Load model checkpoint
         :param checkpoint_path: str - path/to/checkpoints
         :return: None
         """
         checkpoint_path = natsort.natsorted(glob.glob(f"{checkpoint_path}/checkpoint_*.pt"))
         checkpoint_path = checkpoint_path[-1]

         logger.info("Loading checkpoint %s", checkpoint_path)

         device = None if torch.cuda.is_available() else torch.device('cpu')
         checkpoint = torch.load(checkpoint_path, map_location=device)
-        self.model.load_state_dict(checkpoint["model"])
+        self.model.load_state_dict(checkpoint["model"], strict=False)
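The only substantive change in this file is strict=False on the final line. In PyTorch, load_state_dict(..., strict=False) tolerates checkpoints whose keys do not exactly match the current model, instead of raising a RuntimeError; the mismatches are returned for inspection. A small sketch (MyTagger is a hypothetical stand-in for the trainer's model):

    import torch

    model = MyTagger()  # hypothetical model class
    checkpoint = torch.load("checkpoint_10.pt", map_location=torch.device("cpu"))
    result = model.load_state_dict(checkpoint["model"], strict=False)
    print(result.missing_keys, result.unexpected_keys)  # parameters that were skipped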
{SinaTools-0.1.24 → SinaTools-0.1.26}/sinatools/wsd/disambiguator.py
@@ -217,7 +217,7 @@ def jsons_to_list_of_lists(json_list):
 def find_named_entities(string):
     found_entities = []

-    ner_entites = extract(string)
+    ner_entites = extract(string, "nested")
     list_of_entites = jsons_to_list_of_lists(ner_entites)
     entites = distill_entities(list_of_entites)

@@ -288,17 +288,17 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
         concept_id, gloss = GlossPredictor(Diac_lemma, Undiac_lemma,word,sentence,glosses_dictionary)

         my_json = {}
-        my_json['
+        my_json['concept_id'] = concept_id
         # my_json['Gloss'] = gloss
         my_json['word'] = word
-        my_json['Undiac_lemma'] = Undiac_lemma
-        my_json['
+        #my_json['Undiac_lemma'] = Undiac_lemma
+        my_json['lemma'] = Diac_lemma
         return my_json
     else:
         my_json = {}
         my_json['word'] = word
-        my_json['Undiac_lemma'] = Undiac_lemma
-        my_json['
+        #my_json['Undiac_lemma'] = Undiac_lemma
+        my_json['lemma'] = Diac_lemma
         return my_json


@@ -405,26 +405,26 @@ def disambiguate_glosses_main(word, sentence):
     if concept_count == 0:
         my_json = {}
         my_json['word'] = word['word']
-        my_json['
-        my_json['Undiac_lemma'] = word['Undiac_lemma']
+        my_json['lemma'] = word['Diac_lemma']
+        #my_json['Undiac_lemma'] = word['Undiac_lemma']
         return my_json
     elif concept_count == 1:
         my_json = {}
         my_json['word'] = word['word']
         glosses = word['glosses'][0]
         # my_json['Gloss'] = glosses['gloss']
-        my_json['
-        my_json['
-        my_json['Undiac_lemma'] = word['Undiac_lemma']
+        my_json['concept_id'] = glosses['concept_id']
+        my_json['lemma'] = word['Diac_lemma']
+        #my_json['Undiac_lemma'] = word['Undiac_lemma']
         return my_json
     elif concept_count == '*':
         my_json = {}
         my_json['word'] = word['word']
         glosses = word['glosses'][0]
         my_json['Gloss'] = glosses['gloss']
-        my_json['
-        my_json['
-        my_json['Undiac_lemma'] = word['Undiac_lemma']
+        my_json['concept_id'] = glosses['concept_id']
+        my_json['lemma'] = word['Diac_lemma']
+        #my_json['Undiac_lemma'] = word['Undiac_lemma']
         return my_json
     else:
         input_word = word['word']
@@ -477,21 +477,18 @@ def disambiguate(sentence):
     #output
     [
     {
-    "
+    "concept_id": "303019218",
     "word": "ذهبت",
-    "
-    "Diac_lemma": "ذَهَبَ۪ 1"
+    "lemma": "ذَهَبَ۪ 1"
     },
     {
     "word": "إلى",
-    "
-    "Undiac_lemma": "الى"
+    "lemma": "إِلَى 1"
     },
     {
     "word": "جامعة بيرزيت",
-    "
-    "
-    "Undiac_lemma": "جامعة بيرزيت"
+    "concept_id": "334000099",
+    "lemma": "جامِعَة بيرزَيت"
     }
     ]
     """
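Taken together, the disambiguator hunks rename the output keys: a lowercase concept_id key, a single lemma key carrying the diacritized lemma (previously Diac_lemma), and Undiac_lemma commented out of the payload. A minimal sketch of reading the new schema (the input sentence is an assumption based on the docstring example; concept_id is absent for words with no matching concept, so it is fetched defensively):

    from sinatools.wsd.disambiguator import disambiguate

    for item in disambiguate("ذهبت إلى جامعة بيرزيت"):
        print(item.get("concept_id"), item["word"], item["lemma"])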
SinaTools-0.1.24/sinatools/VERSION
@@ -1 +0,0 @@
-0.1.24
SinaTools-0.1.24/sinatools/ner/entity_extractor.py
@@ -1,72 +0,0 @@
-import os
-from collections import namedtuple
-from sinatools.ner.data_format import get_dataloaders, text2segments
-from . import tagger, tag_vocab, train_config
-
-def extract(text, batch_size=32):
-    """
-    This method processes an input text and returns named entites for each token within the text, based on the specified batch size. As follows:
-
-    Args:
-        text (:obj:`str`): The Arabic text to be tagged.
-        batch_size (int, optional): Batch size for inference. Default is 32.
-
-    Returns:
-        list (:obj:`list`): A list of JSON objects, where each JSON could be contains:
-        token: The token from the original text.
-        NER tag: The label pairs for each segment.
-
-    **Example:**
-
-    .. highlight:: python
-    .. code-block:: python
-
-        from sinatools.ner.entity_extractor import extract
-        extract('ذهب محمد إلى جامعة بيرزيت')
-        [{
-            "word":"ذهب",
-            "tags":"O"
-        },{
-            "word":"محمد",
-            "tags":"B-PERS"
-        },{
-            "word":"إلى",
-            "tags":"O"
-        },{
-            "word":"جامعة",
-            "tags":"B-ORG"
-        },{
-            "word":"بيرزيت",
-            "tags":"B-GPE I-ORG"
-        }]
-    """
-
-    dataset, token_vocab = text2segments(text)
-
-    vocabs = namedtuple("Vocab", ["tags", "tokens"])
-    vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
-
-    dataloader = get_dataloaders(
-        (dataset,),
-        vocab,
-        train_config.data_config,
-        batch_size=batch_size,
-        shuffle=(False,),
-    )[0]
-
-
-    segments = tagger.infer(dataloader)
-    segments_lists = []
-
-    for segment in segments:
-        for token in segment:
-            segments_list = {}
-            segments_list["token"] = token.text
-            list_of_tags = [t['tag'] for t in token.pred_tag]
-            list_of_tags = [i for i in list_of_tags if i not in ('O', ' ', '')]
-            if list_of_tags == []:
-                segments_list["tags"] = ' '.join(['O'])
-            else:
-                segments_list["tags"] = ' '.join(list_of_tags)
-            segments_lists.append(segments_list)
-    return segments_lists