SinaTools 0.1.11__tar.gz → 0.1.12__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.11 → SinaTools-0.1.12}/PKG-INFO +2 -2
- {SinaTools-0.1.11 → SinaTools-0.1.12}/README.rst +1 -1
- {SinaTools-0.1.11 → SinaTools-0.1.12}/SinaTools.egg-info/PKG-INFO +2 -2
- {SinaTools-0.1.11 → SinaTools-0.1.12}/SinaTools.egg-info/SOURCES.txt +26 -6
- {SinaTools-0.1.11 → SinaTools-0.1.12}/SinaTools.egg-info/entry_points.txt +7 -3
- {SinaTools-0.1.11 → SinaTools-0.1.12}/SinaTools.egg-info/requires.txt +0 -1
- {SinaTools-0.1.11 → SinaTools-0.1.12}/setup.py +14 -7
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/DataDownload/download_files.py +0 -10
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/ner/corpus_entity_extractor.py +6 -6
- SinaTools-0.1.12/sinatools/CLI/ner/entity_extractor.py +89 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/utils/arStrip.py +8 -8
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/utils/implication.py +0 -8
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/utils/jaccard.py +5 -14
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/utils/remove_latin.py +2 -2
- SinaTools-0.1.12/sinatools/CLI/utils/text_dublication_detector.py +25 -0
- SinaTools-0.1.12/sinatools/VERSION +1 -0
- SinaTools-0.1.12/sinatools/morphology/ALMA_multi_word.py +31 -0
- SinaTools-0.1.12/sinatools/morphology/__init__.py +43 -0
- SinaTools-0.1.12/sinatools/ner/__init__.py +38 -0
- SinaTools-0.1.12/sinatools/ner/data/__init__.py +1 -0
- SinaTools-0.1.12/sinatools/ner/data/datasets.py +146 -0
- SinaTools-0.1.12/sinatools/ner/data/transforms.py +118 -0
- SinaTools-0.1.12/sinatools/ner/data_format.py +124 -0
- SinaTools-0.1.12/sinatools/ner/datasets.py +146 -0
- SinaTools-0.1.12/sinatools/ner/entity_extractor.py +72 -0
- SinaTools-0.1.12/sinatools/ner/helpers.py +86 -0
- SinaTools-0.1.12/sinatools/ner/metrics.py +69 -0
- SinaTools-0.1.12/sinatools/ner/nn/BaseModel.py +22 -0
- SinaTools-0.1.12/sinatools/ner/nn/BertNestedTagger.py +34 -0
- SinaTools-0.1.12/sinatools/ner/nn/BertSeqTagger.py +17 -0
- SinaTools-0.1.12/sinatools/ner/nn/__init__.py +3 -0
- SinaTools-0.1.12/sinatools/ner/trainers/BaseTrainer.py +117 -0
- SinaTools-0.1.12/sinatools/ner/trainers/BertNestedTrainer.py +203 -0
- SinaTools-0.1.12/sinatools/ner/trainers/BertTrainer.py +163 -0
- SinaTools-0.1.12/sinatools/ner/trainers/__init__.py +3 -0
- SinaTools-0.1.12/sinatools/ner/transforms.py +119 -0
- SinaTools-0.1.12/sinatools/semantic_relatedness/__init__.py +20 -0
- SinaTools-0.1.12/sinatools/semantic_relatedness/compute_relatedness.py +31 -0
- SinaTools-0.1.12/sinatools/synonyms/__init__.py +18 -0
- SinaTools-0.1.12/sinatools/synonyms/synonyms_generator.py +192 -0
- SinaTools-0.1.12/sinatools/utils/text_dublication_detector.py +110 -0
- SinaTools-0.1.12/sinatools/wsd/__init__.py +11 -0
- SinaTools-0.1.11/sinatools/salma/views.py → SinaTools-0.1.12/sinatools/wsd/disambiguator.py +135 -94
- {SinaTools-0.1.11/sinatools/salma → SinaTools-0.1.12/sinatools/wsd}/wsd.py +1 -1
- SinaTools-0.1.11/sinatools/CLI/ner/entity_extractor.py +0 -113
- SinaTools-0.1.11/sinatools/CLI/salma/salma_tools.py +0 -68
- SinaTools-0.1.11/sinatools/VERSION +0 -1
- SinaTools-0.1.11/sinatools/morphology/ALMA_multi_word.py +0 -33
- SinaTools-0.1.11/sinatools/morphology/__init__.py +0 -42
- SinaTools-0.1.11/sinatools/ner/__init__.py +0 -12
- SinaTools-0.1.11/sinatools/ner/entity_extractor.py +0 -92
- SinaTools-0.1.11/sinatools/salma/__init__.py +0 -12
- SinaTools-0.1.11/sinatools/utils/utils.py +0 -2
- {SinaTools-0.1.11 → SinaTools-0.1.12}/AUTHORS.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/CONTRIBUTING.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/LICENSE +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/MANIFEST.in +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/SinaTools.egg-info/dependency_links.txt +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/SinaTools.egg-info/not-zip-safe +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/SinaTools.egg-info/top_level.txt +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/Makefile +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/build/_images/download.png +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/build/_static/download.png +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/build/_static/file.png +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/build/_static/minus.png +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/build/_static/plus.png +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/build/html/_images/SinaLogo.jpg +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/build/html/_images/download.png +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/build/html/_static/SinaLogo.jpg +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/build/html/_static/download.png +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/build/html/_static/file.png +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/build/html/_static/minus.png +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/build/html/_static/plus.png +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/make.bat +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/License.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/Overview.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/_static/SinaLogo.jpg +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/_static/download.png +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/about.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/DataDownload/downloader.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/DataDownload.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/arabiner/bin/infer.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/arabiner.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/morphology/morph_analyzer.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/morphology.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/salma/views.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/salma.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/utils/corpus_tokenizer.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/utils/implication.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/utils/jaccard.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/utils/parser.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/utils/sentence_tokenizer.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/utils/text_transliteration.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api/utils.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/api.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/authors.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/DataDownload/download_files.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/DataDownload/get_appdatadir.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/DataDownload.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/arabiner/infer.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/arabiner.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/morphology/ALMA_multi_word.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/morphology/morph_analyzer.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/morphology.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/salma/salma_tools.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/salma.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/utils/arStrip.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/utils/corpus_tokenizer.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/utils/implication.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/utils/jaccard.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/utils/latin_remove.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/utils/remove_punc.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/utils/sentence_tokenizer.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/utils/text_transliteration.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools/utils.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/cli_tools.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/conf.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/index.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/installation.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/docs/source/readme.rst +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/setup.cfg +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/morphology/ALMA_multi_word.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/morphology/morph_analyzer.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/utils/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/utils/corpus_tokenizer.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/utils/remove_punctuation.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/utils/sentence_tokenizer.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/CLI/utils/text_transliteration.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/DataDownload/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/DataDownload/downloader.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/arabert/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/arabert/create_classification_data.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/arabert/create_pretraining_data.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/arabert/extract_features.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/arabert/lamb_optimizer.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/arabert/modeling.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/arabert/optimization.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/arabert/run_classifier.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/arabert/run_pretraining.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/arabert/run_squad.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/arabert/tokenization.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/build_pretraining_dataset.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/configure_finetuning.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/configure_pretraining.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/finetune/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/finetune/feature_spec.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/finetune/preprocessing.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/finetune/scorer.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/finetune/task.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/finetune/task_builder.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/flops_computation.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/model/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/model/modeling.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/model/optimization.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/model/tokenization.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/pretrain/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/pretrain/pretrain_data.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/run_finetuning.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/run_pretraining.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/util/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/util/training_utils.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/araelectra/util/utils.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/create_pretraining_data.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/gpt2/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/gpt2/optimization.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/grover/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/grover/dataloader.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/grover/modeling.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/grover/train_tpu.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/grover/utils.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/arabert/preprocess.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/environment.yml +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/install_env.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/morphology/morph_analyzer.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/sinatools.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/utils/__init__.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/utils/charsets.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/utils/implication.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/utils/jaccard.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/utils/parser.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/utils/readfile.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/utils/text_transliteration.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/utils/tokenizer.py +0 -0
- {SinaTools-0.1.11 → SinaTools-0.1.12}/sinatools/utils/tokenizers_words.py +0 -0
- {SinaTools-0.1.11/sinatools/salma → SinaTools-0.1.12/sinatools/wsd}/settings.py +0 -0
@@ -1,10 +1,10 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: SinaTools
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.12
|
4
4
|
Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
|
5
5
|
Home-page: https://github.com/SinaLab/sinatools
|
6
6
|
License: MIT license
|
7
|
-
Description:
|
7
|
+
Description: SinaTools
|
8
8
|
---------
|
9
9
|
|
10
10
|
Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
|
@@ -1,10 +1,10 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: SinaTools
|
3
|
-
Version: 0.1.
|
3
|
+
Version: 0.1.12
|
4
4
|
Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
|
5
5
|
Home-page: https://github.com/SinaLab/sinatools
|
6
6
|
License: MIT license
|
7
|
-
Description:
|
7
|
+
Description: SinaTools
|
8
8
|
---------
|
9
9
|
|
10
10
|
Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
|
@@ -82,7 +82,6 @@ sinatools/CLI/morphology/ALMA_multi_word.py
|
|
82
82
|
sinatools/CLI/morphology/morph_analyzer.py
|
83
83
|
sinatools/CLI/ner/corpus_entity_extractor.py
|
84
84
|
sinatools/CLI/ner/entity_extractor.py
|
85
|
-
sinatools/CLI/salma/salma_tools.py
|
86
85
|
sinatools/CLI/utils/__init__.py
|
87
86
|
sinatools/CLI/utils/arStrip.py
|
88
87
|
sinatools/CLI/utils/corpus_tokenizer.py
|
@@ -91,6 +90,7 @@ sinatools/CLI/utils/jaccard.py
|
|
91
90
|
sinatools/CLI/utils/remove_latin.py
|
92
91
|
sinatools/CLI/utils/remove_punctuation.py
|
93
92
|
sinatools/CLI/utils/sentence_tokenizer.py
|
93
|
+
sinatools/CLI/utils/text_dublication_detector.py
|
94
94
|
sinatools/CLI/utils/text_transliteration.py
|
95
95
|
sinatools/DataDownload/__init__.py
|
96
96
|
sinatools/DataDownload/downloader.py
|
@@ -150,18 +150,38 @@ sinatools/morphology/ALMA_multi_word.py
|
|
150
150
|
sinatools/morphology/__init__.py
|
151
151
|
sinatools/morphology/morph_analyzer.py
|
152
152
|
sinatools/ner/__init__.py
|
153
|
+
sinatools/ner/data_format.py
|
154
|
+
sinatools/ner/datasets.py
|
153
155
|
sinatools/ner/entity_extractor.py
|
154
|
-
sinatools/
|
155
|
-
sinatools/
|
156
|
-
sinatools/
|
157
|
-
sinatools/
|
156
|
+
sinatools/ner/helpers.py
|
157
|
+
sinatools/ner/metrics.py
|
158
|
+
sinatools/ner/transforms.py
|
159
|
+
sinatools/ner/data/__init__.py
|
160
|
+
sinatools/ner/data/datasets.py
|
161
|
+
sinatools/ner/data/transforms.py
|
162
|
+
sinatools/ner/nn/BaseModel.py
|
163
|
+
sinatools/ner/nn/BertNestedTagger.py
|
164
|
+
sinatools/ner/nn/BertSeqTagger.py
|
165
|
+
sinatools/ner/nn/__init__.py
|
166
|
+
sinatools/ner/trainers/BaseTrainer.py
|
167
|
+
sinatools/ner/trainers/BertNestedTrainer.py
|
168
|
+
sinatools/ner/trainers/BertTrainer.py
|
169
|
+
sinatools/ner/trainers/__init__.py
|
170
|
+
sinatools/semantic_relatedness/__init__.py
|
171
|
+
sinatools/semantic_relatedness/compute_relatedness.py
|
172
|
+
sinatools/synonyms/__init__.py
|
173
|
+
sinatools/synonyms/synonyms_generator.py
|
158
174
|
sinatools/utils/__init__.py
|
159
175
|
sinatools/utils/charsets.py
|
160
176
|
sinatools/utils/implication.py
|
161
177
|
sinatools/utils/jaccard.py
|
162
178
|
sinatools/utils/parser.py
|
163
179
|
sinatools/utils/readfile.py
|
180
|
+
sinatools/utils/text_dublication_detector.py
|
164
181
|
sinatools/utils/text_transliteration.py
|
165
182
|
sinatools/utils/tokenizer.py
|
166
183
|
sinatools/utils/tokenizers_words.py
|
167
|
-
sinatools/
|
184
|
+
sinatools/wsd/__init__.py
|
185
|
+
sinatools/wsd/disambiguator.py
|
186
|
+
sinatools/wsd/settings.py
|
187
|
+
sinatools/wsd/wsd.py
|
@@ -2,17 +2,21 @@
|
|
2
2
|
alma_multi_word = sinatools.CLI.morphology.ALMA_multi_word:main
|
3
3
|
appdatadir = sinatools.CLI.DataDownload.get_appdatadir:main
|
4
4
|
arStrip = sinatools.CLI.utils.arStrip:main
|
5
|
-
|
6
|
-
arabi_ner2 = sinatools.CLI.ner.corpus_entity_extractor:main
|
5
|
+
corpus_entity_extractor = sinatools.CLI.ner.corpus_entity_extractor:main
|
7
6
|
corpus_tokenizer = sinatools.CLI.utils.corpus_tokenizer:main
|
8
7
|
download_files = sinatools.CLI.DataDownload.download_files:main
|
8
|
+
entity_extractor = sinatools.CLI.ner.entity_extractor:main
|
9
|
+
evaluate_synonyms = sinatools.CLI.synonyms.evaluate_synonyms:main
|
10
|
+
extend_synonyms = sinatools.CLI.synonyms.extend_synonyms:main
|
9
11
|
implication = sinatools.CLI.utils.implication:main
|
10
12
|
install_env = sinatools.install_env:main
|
11
13
|
jaccard_similarity = sinatools.CLI.utils.jaccard:main
|
12
14
|
morphology_analyzer = sinatools.CLI.morphology.morph_analyzer:main
|
13
15
|
remove_latin = sinatools.CLI.utils.remove_latin:main
|
14
16
|
remove_punctuation = sinatools.CLI.utils.remove_punctuation:main
|
15
|
-
|
17
|
+
semantic_relatedness = sinatools.CLI.semantic_relatedness.compute_relatedness:main
|
16
18
|
sentence_tokenizer = sinatools.CLI.utils.sentence_tokenizer:main
|
19
|
+
text_dublication_detector = sinatools.CLI.utils.text_dublication_detector:main
|
17
20
|
transliterate = sinatools.CLI.utils.text_transliteration:main
|
21
|
+
wsd = sinatools.CLI.wsd.disambiguator:main
|
18
22
|
|
@@ -23,8 +23,7 @@ requirements = [
|
|
23
23
|
'torchtext==0.14.0',
|
24
24
|
'torchvision==0.14.0',
|
25
25
|
'seqeval==1.2.2',
|
26
|
-
'natsort==7.1.1'
|
27
|
-
'pandas==1.2.4'
|
26
|
+
'natsort==7.1.1'
|
28
27
|
]
|
29
28
|
|
30
29
|
|
@@ -56,22 +55,30 @@ setup(
|
|
56
55
|
'sinatools.CLI.morphology.morph_analyzer:main'),
|
57
56
|
('alma_multi_word='
|
58
57
|
'sinatools.CLI.morphology.ALMA_multi_word:main'),
|
59
|
-
('
|
58
|
+
('entity_extractor='
|
60
59
|
'sinatools.CLI.ner.entity_extractor:main'),
|
61
60
|
('remove_punctuation='
|
62
61
|
'sinatools.CLI.utils.remove_punctuation:main'),
|
63
62
|
('remove_latin='
|
64
63
|
'sinatools.CLI.utils.remove_latin:main'),
|
65
|
-
('
|
66
|
-
'sinatools.CLI.
|
64
|
+
('wsd='
|
65
|
+
'sinatools.CLI.wsd.disambiguator:main'),
|
67
66
|
('corpus_tokenizer='
|
68
67
|
'sinatools.CLI.utils.corpus_tokenizer:main'),
|
69
68
|
('appdatadir='
|
70
69
|
'sinatools.CLI.DataDownload.get_appdatadir:main'),
|
71
70
|
('download_files='
|
72
71
|
'sinatools.CLI.DataDownload.download_files:main'),
|
73
|
-
('
|
74
|
-
'sinatools.CLI.ner.corpus_entity_extractor:main')
|
72
|
+
('corpus_entity_extractor='
|
73
|
+
'sinatools.CLI.ner.corpus_entity_extractor:main'),
|
74
|
+
('text_dublication_detector='
|
75
|
+
'sinatools.CLI.utils.text_dublication_detector:main'),
|
76
|
+
('evaluate_synonyms='
|
77
|
+
'sinatools.CLI.synonyms.evaluate_synonyms:main'),
|
78
|
+
('extend_synonyms='
|
79
|
+
'sinatools.CLI.synonyms.extend_synonyms:main'),
|
80
|
+
('semantic_relatedness='
|
81
|
+
'sinatools.CLI.semantic_relatedness.compute_relatedness:main'),
|
75
82
|
],
|
76
83
|
},
|
77
84
|
data_files=[('sinatools', ['sinatools/environment.yml'])],
|
@@ -29,16 +29,6 @@ Examples:
|
|
29
29
|
download_files -f morph ner
|
30
30
|
This command will download only the `morph` and `ner` files to the default directory.
|
31
31
|
|
32
|
-
Note:
|
33
|
-
-----
|
34
|
-
|
35
|
-
.. code-block:: none
|
36
|
-
|
37
|
-
- The script automatically handles the extraction of zip and tar.gz files after downloading.
|
38
|
-
- Ensure you have the necessary permissions to write to the specified directory.
|
39
|
-
- The default download directory is based on the operating system and can be obtained using the `get_appdatadir` function.
|
40
|
-
|
41
|
-
|
42
32
|
"""
|
43
33
|
|
44
34
|
import argparse
|
@@ -4,20 +4,20 @@ from sinatools.utils.tokenizer import sentence_tokenizer
|
|
4
4
|
from sinatools.utils.tokenizers_words import simple_word_tokenize
|
5
5
|
import pandas as pd
|
6
6
|
import argparse
|
7
|
-
from sinatools.ner.entity_extractor import
|
7
|
+
from sinatools.ner.entity_extractor import extract
|
8
8
|
|
9
9
|
"""
|
10
|
-
|
10
|
+
This tool processes a csv file and returns named entites for each token within the text, based on the specified batch size. As follows:
|
11
11
|
|
12
12
|
Usage:
|
13
13
|
------
|
14
14
|
Run the script with the following command:
|
15
15
|
|
16
|
-
|
16
|
+
corpus_entity_extractor input.csv --text-columns "TextColumn1,TextColumn2" --additional-columns "Column3,Column4" --output-csv output.csv
|
17
17
|
"""
|
18
18
|
|
19
|
-
def
|
20
|
-
output =
|
19
|
+
def combine_tags(sentence):
|
20
|
+
output = extract(sentence)
|
21
21
|
return [word[1] for word in output]
|
22
22
|
|
23
23
|
|
@@ -40,7 +40,7 @@ def corpus_tokenizer(input_csv, output_csv, text_column, additional_columns, row
|
|
40
40
|
words = simple_word_tokenize(sentence)
|
41
41
|
global_sentence_id += 1
|
42
42
|
|
43
|
-
tags =
|
43
|
+
tags = combine_tags(sentence)
|
44
44
|
for word_position, word in enumerate(words, start=1):
|
45
45
|
row_id += 1
|
46
46
|
doc_sentence_filename = input_csv.split(".csv")[0]
|
@@ -0,0 +1,89 @@
|
|
1
|
+
"""
|
2
|
+
About:
|
3
|
+
------
|
4
|
+
This tool processes an input text and returns named entites for each token within the text, based on the specified batch size. As follows:
|
5
|
+
|
6
|
+
Usage:
|
7
|
+
------
|
8
|
+
Below is the usage information that can be generated by running entity_extractor --help.
|
9
|
+
|
10
|
+
.. code-block:: none
|
11
|
+
|
12
|
+
entity_extractor --text=INPUT_TEXT
|
13
|
+
entity_extractor --dir=INPUT_FILE --output_csv=OUTPUT_FILE_NAME
|
14
|
+
|
15
|
+
Options:
|
16
|
+
--------
|
17
|
+
|
18
|
+
.. code-block:: none
|
19
|
+
|
20
|
+
--text INPUT_TEXT
|
21
|
+
The text that needs to be analyzed for Named Entity Recognition.
|
22
|
+
--file INPUT_FILE
|
23
|
+
File containing the text to be analyzed for Named Entity Recognition.
|
24
|
+
--output_csv OUTPUT_FILE_NAME
|
25
|
+
A file containing the tokenized text and its Named Entity tags.
|
26
|
+
|
27
|
+
|
28
|
+
Examples:
|
29
|
+
---------
|
30
|
+
|
31
|
+
.. code-block:: none
|
32
|
+
|
33
|
+
entity_extractor --text "Your text here"
|
34
|
+
entity_extractor --dir "/path/to/your/directory" --output_csv "output.csv"
|
35
|
+
|
36
|
+
"""
|
37
|
+
|
38
|
+
import argparse
|
39
|
+
import json
|
40
|
+
import pandas as pd
|
41
|
+
from sinatools.ner.entity_extractor import extract
|
42
|
+
from sinatools.utils.tokenizer import corpus_tokenizer
|
43
|
+
from sinatools.utils.tokenizers_words import simple_word_tokenize
|
44
|
+
|
45
|
+
|
46
|
+
def combine_tags(sentence):
|
47
|
+
output = extract(sentence)
|
48
|
+
return [word[1] for word in output]
|
49
|
+
|
50
|
+
|
51
|
+
def main():
|
52
|
+
parser = argparse.ArgumentParser(description='NER Analysis using ArabiNER')
|
53
|
+
|
54
|
+
parser.add_argument('--text', type=str, help='Text to be analyzed for Named Entity Recognition')
|
55
|
+
parser.add_argument('--dir', type=str, help='dir containing the text files to be analyzed for Named Entity Recognition')
|
56
|
+
parser.add_argument('--output_csv', type=str, help='Output CSV file to write the results')
|
57
|
+
|
58
|
+
args = parser.parse_args()
|
59
|
+
|
60
|
+
if args.text is not None:
|
61
|
+
results = extract(args.text)
|
62
|
+
# Print the results in JSON format
|
63
|
+
print(json.dumps(results, ensure_ascii=False, indent=4))
|
64
|
+
elif args.dir is not None:
|
65
|
+
corpus_tokenizer(args.dir, args.output_csv)
|
66
|
+
df = pd.read_csv(args.output_csv)
|
67
|
+
df['NER tags'] = None
|
68
|
+
i = 0
|
69
|
+
|
70
|
+
result = df.drop_duplicates(subset=['Global Sentence ID', 'Sentence'])
|
71
|
+
unique_sentences = result['Sentence'].to_numpy()
|
72
|
+
|
73
|
+
for sentence in unique_sentences:
|
74
|
+
ner_tags = combine_tags(sentence)
|
75
|
+
if len(simple_word_tokenize(sentence)) > 300:
|
76
|
+
print(" Length of this sentence is more than 300 word: ", sentence)
|
77
|
+
return
|
78
|
+
|
79
|
+
df.loc[i:i+len(ner_tags)-1, 'NER tags'] = ner_tags
|
80
|
+
i = i + len(ner_tags)
|
81
|
+
|
82
|
+
df.to_csv(args.output_csv, index=False)
|
83
|
+
else:
|
84
|
+
print("Error: Either --text or --file argument must be provided.")
|
85
|
+
return
|
86
|
+
|
87
|
+
|
88
|
+
if __name__ == '__main__':
|
89
|
+
main()
|
@@ -26,7 +26,7 @@ Below is the usage information that can be generated by running arStrip --help.
|
|
26
26
|
--diacs BOOL [default=True]
|
27
27
|
Indicates whether to strip diacritics.
|
28
28
|
|
29
|
-
--
|
29
|
+
--small_diacs BOOL [default=True]
|
30
30
|
Indicates whether to strip small diacritics.
|
31
31
|
|
32
32
|
--shaddah BOOL [default=True]
|
@@ -38,15 +38,15 @@ Below is the usage information that can be generated by running arStrip --help.
|
|
38
38
|
--alif BOOL [default=True]
|
39
39
|
Indicates whether to strip alif.
|
40
40
|
|
41
|
-
--
|
41
|
+
--special_chars BOOL [default=True]
|
42
42
|
Indicates whether to strip special characters.
|
43
43
|
|
44
44
|
Examples:
|
45
45
|
---------
|
46
46
|
.. code-block:: none
|
47
47
|
|
48
|
-
arStrip --text "مُختَبَر سينا لحوسبة اللغة!" --diacs=True --
|
49
|
-
arStrip --file "path/to/your/file.txt" --diacs=True --
|
48
|
+
arStrip --text "مُختَبَر سينا لحوسبة اللغة!" --diacs=True --small_diacs=False --shaddah=True --digit=False --alif=False --special_chars=False
|
49
|
+
arStrip --file "path/to/your/file.txt" --diacs=True --small_diacs=False --shaddah=True --digit=False --alif=False --special_chars=False
|
50
50
|
|
51
51
|
"""
|
52
52
|
|
@@ -60,11 +60,11 @@ def main():
|
|
60
60
|
parser.add_argument('--text', type=str, help='Text to be stripped')
|
61
61
|
parser.add_argument('--file', type=str, help='File containing text to be stripped')
|
62
62
|
parser.add_argument('--diacs', type=bool, default=True, help='Whether to strip diacritics')
|
63
|
-
parser.add_argument('--
|
63
|
+
parser.add_argument('--small_diacs', type=bool, default=True, help='Whether to strip small diacritics')
|
64
64
|
parser.add_argument('--shaddah', type=bool, default=True, help='Whether to strip shaddah')
|
65
65
|
parser.add_argument('--digit', type=bool, default=True, help='Whether to strip digits')
|
66
66
|
parser.add_argument('--alif', type=bool, default=True, help='Whether to strip alif')
|
67
|
-
parser.add_argument('--
|
67
|
+
parser.add_argument('--special_chars', type=bool, default=True, help='Whether to strip special characters')
|
68
68
|
|
69
69
|
args = parser.parse_args()
|
70
70
|
|
@@ -76,8 +76,8 @@ def main():
|
|
76
76
|
print("Either --text or --file argument must be provided.")
|
77
77
|
return
|
78
78
|
|
79
|
-
stripped_text = arStrip(text_content, diacs=args.diacs,
|
80
|
-
shaddah=args.shaddah, digit=args.digit, alif=args.alif,
|
79
|
+
stripped_text = arStrip(text_content, diacs=args.diacs, small_diacs=args.small_diacs,
|
80
|
+
shaddah=args.shaddah, digit=args.digit, alif=args.alif, special_chars=args.special_chars)
|
81
81
|
|
82
82
|
print(stripped_text)
|
83
83
|
|
@@ -37,14 +37,6 @@ Examples:
|
|
37
37
|
|
38
38
|
implication --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt"
|
39
39
|
|
40
|
-
Note:
|
41
|
-
-----
|
42
|
-
|
43
|
-
.. code-block:: none
|
44
|
-
|
45
|
-
- The results are based on the underlying logic and data sets present in the `Implication` class of SinaTools.
|
46
|
-
- The tool compares the implication between two words, and the relationship might vary based on linguistic nuances.
|
47
|
-
|
48
40
|
"""
|
49
41
|
import argparse
|
50
42
|
from sinatools.utils.implication import Implication
|
@@ -5,14 +5,14 @@ The jaccard tool computes the Jaccard similarity between two sets of strings. Th
|
|
5
5
|
|
6
6
|
Usage:
|
7
7
|
------
|
8
|
-
Below is the usage information that can be generated by running
|
8
|
+
Below is the usage information that can be generated by running jaccard_similarity --help.
|
9
9
|
|
10
10
|
.. code-block:: none
|
11
11
|
|
12
12
|
Usage:
|
13
|
-
|
13
|
+
jaccard_similarity --list1="WORD1, WORD2" --list2="WORD1,WORD2" --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
|
14
14
|
|
15
|
-
|
15
|
+
jaccard_similarity --file1=File1 --file2=File2 --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
|
16
16
|
|
17
17
|
.. code-block:: none
|
18
18
|
|
@@ -39,18 +39,9 @@ Examples:
|
|
39
39
|
|
40
40
|
.. code-block:: none
|
41
41
|
|
42
|
-
|
42
|
+
jaccard_similarity --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
|
43
43
|
|
44
|
-
|
45
|
-
|
46
|
-
Note:
|
47
|
-
-----
|
48
|
-
|
49
|
-
.. code-block:: none
|
50
|
-
|
51
|
-
- The Jaccard similarity ranges from 0 to 1. A value of 1 indicates that the sets are identical, while a value of 0 indicates no similarity between the sets.
|
52
|
-
- Diacritics refer to the Arabic Diacritics (like fatha, damma, kasra, etc.) and shadda.
|
53
|
-
- The two normalization options can be used individually or together. However, the combination will result in both rules being applied, and thus,
|
44
|
+
jaccard_similarity --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
|
54
45
|
|
55
46
|
"""
|
56
47
|
|
@@ -14,8 +14,8 @@ Below is the usage information that can be generated by running remove_latin --h
|
|
14
14
|
Examples:
|
15
15
|
---------
|
16
16
|
.. code-block:: none
|
17
|
-
|
18
|
-
|
17
|
+
remove_latin --text "123test"
|
18
|
+
remove_latin --file "path/to/your/file.txt"
|
19
19
|
"""
|
20
20
|
|
21
21
|
import argparse
|
@@ -0,0 +1,25 @@
|
|
1
|
+
import argparse
from sinatools.utils.text_dublication_detector import removal


def main():
    """CLI entry point: deduplicate sentences in one column of a CSV file.

    Rows whose text in --column_name is a near-duplicate (cosine similarity
    above --similarity_threshold) of an earlier row are removed; the kept
    rows and the removed rows are written to two separate output files.
    """
    parser = argparse.ArgumentParser(description='Processes a CSV file of sentences to identify and remove duplicate sentences based on a specified threshold and cosine similarity. It saves the filtered results and the identified duplicates to separate files.')

    parser.add_argument('--csv_file', type=str, help='The path to the input CSV file that will be processed.')
    parser.add_argument('--column_name', type=str, help='The name of the column from which duplicates will be removed.')
    parser.add_argument('--final_file_name', type=str, help='The name of the output file that will contain the deduplicated results.')
    parser.add_argument('--deleted_file_name', type=str, help='The name of the output file that will contain the records that were identified as duplicates and removed.')
    parser.add_argument('--similarity_threshold', type=float, default=0.8, help='The similarity threshold for determining duplicates. Records with a similarity score above this value will be considered duplicates (default is 0.8).')

    args = parser.parse_args()

    # Bug fix: removal() needs BOTH the file and the column. The original
    # check used `and`, so supplying only one argument slipped past the
    # guard and crashed inside removal() with a None path/column.
    if args.csv_file is None or args.column_name is None:
        print("Both --csv_file and --column_name arguments must be provided.")
        return

    removal(args.csv_file, args.column_name, args.final_file_name, args.deleted_file_name, args.similarity_threshold)


if __name__ == '__main__':
    main()

# Example:
# text_dublication_detector --csv_file "text.csv" --column_name "A" --final_file_name "Final.csv" --deleted_file_name "deleted.csv" --similarity_threshold 0.8
|
@@ -0,0 +1 @@
|
|
1
|
+
0.1.12
|
@@ -0,0 +1,31 @@
|
|
1
|
+
from sinatools.utils.parser import arStrip
from . import five_grams_dict, four_grams_dict, three_grams_dict, two_grams_dict


def ALMA_multi_word(multi_word, n):
    """Look up a multi-word expression in the n-gram lemma dictionaries.

    Args:
        multi_word: the (possibly diacritized) multi-word expression.
        n: number of words; 2, 3 and 4 select the matching n-gram
           dictionary, any other value falls back to the five-gram one
           (mirroring the original if/elif/else chain).

    Returns:
        A one-element list containing a dict with the original expression,
        its undiacritized form, the POS of the first match (only when a
        match exists) and the list of matched lemma ids.
    """
    # Argument order of arStrip: diacs, smallDiacs, shaddah, digit, alif, specialChars
    undiac = arStrip(multi_word, True, True, True, False, True, False)

    # Dispatch on n instead of the original if/elif ladder; five-grams is
    # the catch-all for any other n, exactly as the original else branch.
    grams_by_n = {2: two_grams_dict, 3: three_grams_dict, 4: four_grams_dict}
    entries = grams_by_n.get(n, five_grams_dict).get(undiac, [])

    result = {
        'multi_word_lemma': multi_word,
        # Bug fix: the original stored the diacritized `multi_word` here,
        # although the key advertises the undiacritized lemma.
        'undiac_multi_word_lemma': undiac,
    }
    ids = []
    if entries:
        result['POS'] = entries[0][1]  # POS tag of the first matching entry
        ids = [entry[3] for entry in entries]  # lemma id sits at index 3
    result['ids'] = ids
    return [result]
|
@@ -0,0 +1,43 @@
|
|
1
|
+
import os
import pickle

from sinatools.DataDownload import downloader

# Module-level lookup tables, populated at import time from pickled files
# in the SinaTools application data directory. An import of this package
# therefore performs disk I/O and fails if the data files were not
# downloaded first.
dictionary = {}
five_grams_dict = {}
four_grams_dict = {}
three_grams_dict = {}
two_grams_dict = {}


def _load_pickle(filename, encoding="ASCII"):
    """Load one pickled lookup table from the app data directory.

    `encoding` is forwarded to pickle.load; it only affects pickles written
    by Python 2, and "ASCII" is pickle's own default. NOTE: these files are
    trusted local package data — pickle must never be fed untrusted input.
    """
    file_path = os.path.join(downloader.get_appdatadir(), filename)
    with open(file_path, 'rb') as fh:
        return pickle.load(fh, encoding=encoding)


dictionary = _load_pickle('lemmas_dic.pickle')
five_grams_dict = _load_pickle('five_grams.pickle', encoding='utf-8')
four_grams_dict = _load_pickle('four_grams.pickle', encoding='utf-8')
three_grams_dict = _load_pickle('three_grams.pickle', encoding='utf-8')
two_grams_dict = _load_pickle('two_grams.pickle', encoding='utf-8')
|
43
|
+
|
@@ -0,0 +1,38 @@
|
|
1
|
+
import json
import os
import pickle
from argparse import Namespace

import torch

from sinatools.DataDownload import downloader
from sinatools.ner.helpers import load_object

# Module-level singletons shared by the NER entry points. They are
# populated below at import time, so importing this package loads the
# pretrained model from disk. (The original file imported `os` and
# `pickle` twice; the duplicates are removed here.)
tagger = None
tag_vocab = None
train_config = None

# The pretrained model bundle lives in the app data directory.
model_path = os.path.join(downloader.get_appdatadir(), 'Wj27012000.tar')

# Tag vocabulary (label set) saved alongside the checkpoints.
# NOTE: trusted local package data — pickle must never read untrusted input.
with open(os.path.join(model_path, "tag_vocab.pkl"), "rb") as fh:
    tag_vocab = pickle.load(fh)

# Training-time configuration, restored into a Namespace so the recorded
# settings are reachable as attributes (train_config.network_config, ...).
train_config = Namespace()
with open(os.path.join(model_path, "args.json"), "r") as fh:
    train_config.__dict__ = json.load(fh)

# Rebuild the network from its recorded factory + kwargs, wrap it in
# DataParallel (as it was trained), and move it to GPU when one exists.
model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
model = torch.nn.DataParallel(model)

if torch.cuda.is_available():
    model = model.cuda()

# The trainer object doubles as the tagger; hand it the model and restore
# its weights from the saved checkpoints directory.
train_config.trainer_config["kwargs"]["model"] = model
tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
tagger.load(os.path.join(model_path, "checkpoints"))
|
@@ -0,0 +1 @@
|
|
1
|
+
from sinatools.ner.data.datasets import NestedTagsDataset
|