SinaTools-0.1.4-py2.py3-none-any.whl → SinaTools-0.1.8-py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.4.dist-info → SinaTools-0.1.8.dist-info}/METADATA +10 -10
- SinaTools-0.1.8.dist-info/RECORD +101 -0
- SinaTools-0.1.8.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.8.dist-info/top_level.txt +1 -0
- {nlptools → sinatools}/CLI/DataDownload/download_files.py +9 -9
- {nlptools → sinatools}/CLI/morphology/ALMA_multi_word.py +10 -20
- sinatools/CLI/morphology/morph_analyzer.py +80 -0
- nlptools/CLI/arabiner/bin/infer2.py → sinatools/CLI/ner/corpus_entity_extractor.py +5 -9
- nlptools/CLI/arabiner/bin/infer.py → sinatools/CLI/ner/entity_extractor.py +4 -8
- {nlptools → sinatools}/CLI/salma/salma_tools.py +8 -8
- {nlptools → sinatools}/CLI/utils/arStrip.py +10 -21
- sinatools/CLI/utils/corpus_tokenizer.py +50 -0
- {nlptools → sinatools}/CLI/utils/implication.py +9 -9
- {nlptools → sinatools}/CLI/utils/jaccard.py +10 -10
- sinatools/CLI/utils/remove_latin.py +34 -0
- sinatools/CLI/utils/remove_punctuation.py +42 -0
- {nlptools → sinatools}/CLI/utils/sentence_tokenizer.py +9 -22
- {nlptools → sinatools}/CLI/utils/text_transliteration.py +10 -17
- {nlptools → sinatools}/DataDownload/downloader.py +9 -9
- sinatools/VERSION +1 -0
- {nlptools → sinatools}/__init__.py +1 -1
- {nlptools → sinatools}/morphology/ALMA_multi_word.py +4 -5
- {nlptools → sinatools}/morphology/__init__.py +4 -14
- sinatools/morphology/morph_analyzer.py +172 -0
- sinatools/ner/__init__.py +12 -0
- nlptools/arabiner/bin/infer.py → sinatools/ner/entity_extractor.py +9 -8
- {nlptools → sinatools}/salma/__init__.py +2 -2
- {nlptools → sinatools}/salma/settings.py +1 -1
- {nlptools → sinatools}/salma/views.py +9 -9
- {nlptools → sinatools}/salma/wsd.py +2 -2
- {nlptools/morphology → sinatools/utils}/charsets.py +1 -3
- {nlptools → sinatools}/utils/implication.py +10 -10
- {nlptools → sinatools}/utils/jaccard.py +2 -2
- {nlptools → sinatools}/utils/parser.py +18 -21
- {nlptools → sinatools}/utils/text_transliteration.py +1 -1
- nlptools/utils/corpus_tokenizer.py → sinatools/utils/tokenizer.py +58 -5
- {nlptools/morphology → sinatools/utils}/tokenizers_words.py +3 -6
- SinaTools-0.1.4.dist-info/RECORD +0 -122
- SinaTools-0.1.4.dist-info/entry_points.txt +0 -18
- SinaTools-0.1.4.dist-info/top_level.txt +0 -1
- nlptools/CLI/morphology/morph_analyzer.py +0 -91
- nlptools/CLI/utils/corpus_tokenizer.py +0 -74
- nlptools/CLI/utils/latin_remove.py +0 -51
- nlptools/CLI/utils/remove_Punc.py +0 -53
- nlptools/VERSION +0 -1
- nlptools/arabiner/bin/__init__.py +0 -14
- nlptools/arabiner/bin/eval.py +0 -87
- nlptools/arabiner/bin/process.py +0 -140
- nlptools/arabiner/bin/train.py +0 -221
- nlptools/arabiner/data/__init__.py +0 -1
- nlptools/arabiner/data/datasets.py +0 -146
- nlptools/arabiner/data/transforms.py +0 -118
- nlptools/arabiner/nn/BaseModel.py +0 -22
- nlptools/arabiner/nn/BertNestedTagger.py +0 -34
- nlptools/arabiner/nn/BertSeqTagger.py +0 -17
- nlptools/arabiner/nn/__init__.py +0 -3
- nlptools/arabiner/trainers/BaseTrainer.py +0 -117
- nlptools/arabiner/trainers/BertNestedTrainer.py +0 -203
- nlptools/arabiner/trainers/BertTrainer.py +0 -163
- nlptools/arabiner/trainers/__init__.py +0 -3
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +0 -124
- nlptools/arabiner/utils/helpers.py +0 -151
- nlptools/arabiner/utils/metrics.py +0 -69
- nlptools/morphology/morph_analyzer.py +0 -171
- nlptools/morphology/settings.py +0 -8
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/sentence_tokenizer.py +0 -53
- {SinaTools-0.1.4.data/data/nlptools → SinaTools-0.1.8.data/data/sinatools}/environment.yml +0 -0
- {SinaTools-0.1.4.dist-info → SinaTools-0.1.8.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.4.dist-info → SinaTools-0.1.8.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.4.dist-info → SinaTools-0.1.8.dist-info}/WHEEL +0 -0
- {nlptools → sinatools}/CLI/utils/__init__.py +0 -0
- {nlptools → sinatools}/DataDownload/__init__.py +0 -0
- {nlptools → sinatools}/arabert/__init__.py +0 -0
- {nlptools → sinatools}/arabert/arabert/__init__.py +0 -0
- {nlptools → sinatools}/arabert/arabert/create_classification_data.py +0 -0
- {nlptools → sinatools}/arabert/arabert/create_pretraining_data.py +0 -0
- {nlptools → sinatools}/arabert/arabert/extract_features.py +0 -0
- {nlptools → sinatools}/arabert/arabert/lamb_optimizer.py +0 -0
- {nlptools → sinatools}/arabert/arabert/modeling.py +0 -0
- {nlptools → sinatools}/arabert/arabert/optimization.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_classifier.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_squad.py +0 -0
- {nlptools → sinatools}/arabert/arabert/tokenization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/configure_finetuning.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/configure_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/feature_spec.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/preprocessing.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/scorer.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/task.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/task_builder.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/flops_computation.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/modeling.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/optimization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/tokenization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_data.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/run_finetuning.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/training_utils.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/utils.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/create_pretraining_data.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/optimization.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/dataloader.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/modeling.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/train_tpu.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/utils.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
- {nlptools → sinatools}/arabert/preprocess.py +0 -0
- {nlptools → sinatools}/environment.yml +0 -0
- {nlptools → sinatools}/install_env.py +0 -0
- /nlptools/nlptools.py → /sinatools/sinatools.py +0 -0
- {nlptools/arabiner → sinatools/utils}/__init__.py +0 -0
- {nlptools → sinatools}/utils/readfile.py +0 -0
- {nlptools → sinatools}/utils/utils.py +0 -0
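As the rename map above shows, release 0.1.8 ships the same modules under the sinatools package instead of nlptools, with a few utilities consolidated (for example, nlptools/utils/corpus_tokenizer.py now lives in sinatools/utils/tokenizer.py). A minimal migration sketch follows; the old import path is taken from the 0.1.4 docstrings, while the new path is only inferred from the renames listed above and should be checked against the 0.1.8 documentation, since function-level APIs are not visible in this listing.

    # SinaTools 0.1.4: utilities were importable from the nlptools package.
    from nlptools.utils import sentence_tokenizer
    sentences = sentence_tokenizer.sent_tokenize("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.")

    # SinaTools 0.1.8: the package is named sinatools; tokenization helpers
    # appear to be consolidated in sinatools/utils/tokenizer.py (assumption
    # based on the rename above -- verify the exported function names there).
    from sinatools.utils import tokenizer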
nlptools/utils/__init__.py
DELETED
File without changes
nlptools/utils/sentence_tokenizer.py
DELETED
@@ -1,53 +0,0 @@
-def remove_empty_values(sentences):
-    return [value for value in sentences if value != '']
-
-
-def sent_tokenize(text, dot=True, new_line=True, question_mark=True, exclamation_mark=True):
-    """
-    This method tokenizes a text into a set of sentences based on the selected separators, including the dot, new line, question mark, and exclamation mark.
-
-    Args:
-        text (:obj:`str`): Arabic text to be tokenized.
-        dot (:obj:`str`): flag to split text based on Dot (default is True).
-        new_line (:obj:`str`): flag to split text based on new_line (default is True).
-        question_mark (:obj:`str`): flag to split text based on question_mark (default is True).
-        exclamation_mark (:obj:`str`): flag to split text based on exclamation_mark (default is True).
-
-    Returns:
-        :obj:`list`: list of sentences.
-
-    **Example:**
-
-    .. highlight:: python
-    .. code-block:: python
-
-        from nlptools.utils import sentence_tokenizer
-        sentences = sentence_tokenizer.sent_tokenize("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.", dot=True, new_line=True, question_mark=True, exclamation_mark=True)
-        print(sentences)
-
-        #output
-        ['مختبر سينا لحوسبة اللغة والذكاء الإصطناعي.', 'في جامعة بيرزيت.']
-    """
-    separators = []
-    split_text = [text]
-    if new_line==True:
-        separators.append('\n')
-    if dot==True:
-        separators.append('.')
-    if question_mark==True:
-        separators.append('?')
-        separators.append('؟')
-    if exclamation_mark==True:
-        separators.append('!')
-
-    for sep in separators:
-        new_split_text = []
-        for part in split_text:
-            tokens = part.split(sep)
-            tokens_with_separator = [token + sep for token in tokens[:-1]]
-            tokens_with_separator.append(tokens[-1].strip())
-            new_split_text.extend(tokens_with_separator)
-        split_text = new_split_text
-
-    split_text = remove_empty_values(split_text)
-    return split_text
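The removed helper splits on each selected separator while keeping that separator attached to the end of the preceding sentence. For readers who only need that behaviour, here is a compact regex-based sketch; it is illustrative only, not part of either package, and the function name and signature are invented for this example:

    import re

    def split_keep_separators(text, seps=(".", "\n", "?", "؟", "!")):
        # Split on any separator, keeping it attached to the preceding chunk,
        # then strip whitespace and drop empty chunks -- roughly the behaviour
        # of the deleted sent_tokenize (which strips only the final token).
        pattern = "([" + re.escape("".join(seps)) + "])"
        parts = re.split(pattern, text)
        chunks = parts[0::2]           # text pieces
        tails = parts[1::2] + [""]     # separator following each piece
        sentences = [(c + s).strip() for c, s in zip(chunks, tails)]
        return [s for s in sentences if s]

    split_keep_separators("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.")
    # ['مختبر سينا لحوسبة اللغة والذكاء الإصطناعي.', 'في جامعة بيرزيت.']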