SinaTools 0.1.3__py2.py3-none-any.whl → 0.1.7__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/METADATA +14 -20
- SinaTools-0.1.7.dist-info/RECORD +101 -0
- SinaTools-0.1.7.dist-info/entry_points.txt +18 -0
- SinaTools-0.1.7.dist-info/top_level.txt +1 -0
- {nlptools → sinatools}/CLI/DataDownload/download_files.py +9 -9
- {nlptools → sinatools}/CLI/morphology/ALMA_multi_word.py +10 -20
- sinatools/CLI/morphology/morph_analyzer.py +80 -0
- nlptools/CLI/arabiner/bin/infer2.py → sinatools/CLI/ner/corpus_entity_extractor.py +5 -9
- nlptools/CLI/arabiner/bin/infer.py → sinatools/CLI/ner/entity_extractor.py +4 -8
- {nlptools → sinatools}/CLI/salma/salma_tools.py +8 -8
- {nlptools → sinatools}/CLI/utils/arStrip.py +10 -21
- sinatools/CLI/utils/corpus_tokenizer.py +50 -0
- {nlptools → sinatools}/CLI/utils/implication.py +9 -9
- {nlptools → sinatools}/CLI/utils/jaccard.py +10 -10
- sinatools/CLI/utils/remove_latin.py +34 -0
- sinatools/CLI/utils/remove_punctuation.py +42 -0
- {nlptools → sinatools}/CLI/utils/sentence_tokenizer.py +9 -22
- {nlptools → sinatools}/CLI/utils/text_transliteration.py +10 -17
- {nlptools → sinatools}/DataDownload/downloader.py +10 -10
- sinatools/VERSION +1 -0
- {nlptools → sinatools}/__init__.py +1 -1
- {nlptools → sinatools}/morphology/ALMA_multi_word.py +4 -5
- {nlptools → sinatools}/morphology/__init__.py +4 -14
- sinatools/morphology/morph_analyzer.py +172 -0
- sinatools/ner/__init__.py +12 -0
- nlptools/arabiner/bin/infer.py → sinatools/ner/entity_extractor.py +9 -8
- {nlptools → sinatools}/salma/__init__.py +2 -2
- {nlptools → sinatools}/salma/settings.py +1 -1
- {nlptools → sinatools}/salma/views.py +12 -12
- {nlptools → sinatools}/salma/wsd.py +2 -2
- {nlptools/morphology → sinatools/utils}/charsets.py +1 -3
- {nlptools → sinatools}/utils/implication.py +10 -10
- {nlptools → sinatools}/utils/jaccard.py +2 -2
- {nlptools → sinatools}/utils/parser.py +18 -21
- {nlptools → sinatools}/utils/text_transliteration.py +1 -1
- nlptools/utils/corpus_tokenizer.py → sinatools/utils/tokenizer.py +58 -5
- {nlptools/morphology → sinatools/utils}/tokenizers_words.py +3 -6
- SinaTools-0.1.3.dist-info/RECORD +0 -122
- SinaTools-0.1.3.dist-info/entry_points.txt +0 -18
- SinaTools-0.1.3.dist-info/top_level.txt +0 -1
- nlptools/CLI/morphology/morph_analyzer.py +0 -91
- nlptools/CLI/utils/corpus_tokenizer.py +0 -74
- nlptools/CLI/utils/latin_remove.py +0 -51
- nlptools/CLI/utils/remove_Punc.py +0 -53
- nlptools/VERSION +0 -1
- nlptools/arabiner/bin/__init__.py +0 -14
- nlptools/arabiner/bin/eval.py +0 -87
- nlptools/arabiner/bin/process.py +0 -140
- nlptools/arabiner/bin/train.py +0 -221
- nlptools/arabiner/data/__init__.py +0 -1
- nlptools/arabiner/data/datasets.py +0 -146
- nlptools/arabiner/data/transforms.py +0 -118
- nlptools/arabiner/nn/BaseModel.py +0 -22
- nlptools/arabiner/nn/BertNestedTagger.py +0 -34
- nlptools/arabiner/nn/BertSeqTagger.py +0 -17
- nlptools/arabiner/nn/__init__.py +0 -3
- nlptools/arabiner/trainers/BaseTrainer.py +0 -117
- nlptools/arabiner/trainers/BertNestedTrainer.py +0 -203
- nlptools/arabiner/trainers/BertTrainer.py +0 -163
- nlptools/arabiner/trainers/__init__.py +0 -3
- nlptools/arabiner/utils/__init__.py +0 -0
- nlptools/arabiner/utils/data.py +0 -124
- nlptools/arabiner/utils/helpers.py +0 -151
- nlptools/arabiner/utils/metrics.py +0 -69
- nlptools/morphology/morph_analyzer.py +0 -170
- nlptools/morphology/settings.py +0 -8
- nlptools/utils/__init__.py +0 -0
- nlptools/utils/sentence_tokenizer.py +0 -53
- {SinaTools-0.1.3.data/data/nlptools → SinaTools-0.1.7.data/data/sinatools}/environment.yml +0 -0
- {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/WHEEL +0 -0
- {nlptools → sinatools}/CLI/utils/__init__.py +0 -0
- {nlptools → sinatools}/DataDownload/__init__.py +0 -0
- {nlptools → sinatools}/arabert/__init__.py +0 -0
- {nlptools → sinatools}/arabert/arabert/__init__.py +0 -0
- {nlptools → sinatools}/arabert/arabert/create_classification_data.py +0 -0
- {nlptools → sinatools}/arabert/arabert/create_pretraining_data.py +0 -0
- {nlptools → sinatools}/arabert/arabert/extract_features.py +0 -0
- {nlptools → sinatools}/arabert/arabert/lamb_optimizer.py +0 -0
- {nlptools → sinatools}/arabert/arabert/modeling.py +0 -0
- {nlptools → sinatools}/arabert/arabert/optimization.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_classifier.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/arabert/run_squad.py +0 -0
- {nlptools → sinatools}/arabert/arabert/tokenization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/configure_finetuning.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/configure_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/feature_spec.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/preprocessing.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/scorer.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/task.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/finetune/task_builder.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/flops_computation.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/modeling.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/optimization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/model/tokenization.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_data.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/run_finetuning.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/__init__.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/training_utils.py +0 -0
- {nlptools → sinatools}/arabert/araelectra/util/utils.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/create_pretraining_data.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/optimization.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/__init__.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/dataloader.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/modeling.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/train_tpu.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/grover/utils.py +0 -0
- {nlptools → sinatools}/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
- {nlptools → sinatools}/arabert/preprocess.py +0 -0
- {nlptools → sinatools}/environment.yml +0 -0
- {nlptools → sinatools}/install_env.py +0 -0
- /nlptools/nlptools.py → /sinatools/sinatools.py +0 -0
- {nlptools/arabiner → sinatools/utils}/__init__.py +0 -0
- {nlptools → sinatools}/utils/readfile.py +0 -0
- {nlptools → sinatools}/utils/utils.py +0 -0
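
The dominant change in this release is the rename of the top-level package from `nlptools` to `sinatools`, with the `arabiner` NER code consolidated under `sinatools/ner` and several modules relocated (e.g. `charsets.py` and `tokenizers_words.py` moving from `morphology` to `utils`). As a rough illustration of what downstream code has to change, here is a hypothetical before/after migration sketch derived only from the renames listed above; module paths are taken from this diff, not from separate documentation:

```python
# Hypothetical migration sketch based on the file renames in this diff.

# SinaTools 0.1.3 (old import paths):
# from nlptools.utils.parser import arStrip
# from nlptools.utils.implication import Implication
# from nlptools.morphology.tokenizers_words import simple_word_tokenize

# SinaTools 0.1.7 (new import paths):
from sinatools.utils.parser import arStrip
from sinatools.utils.implication import Implication
from sinatools.utils.tokenizers_words import simple_word_tokenize
```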
{nlptools → sinatools}/salma/wsd.py
@@ -1,11 +1,11 @@
-from nlptools.salma import settings
+from sinatools.salma import settings
 import re
 import warnings
 warnings.filterwarnings("ignore")
 import torch
 import numpy as np
 import pandas as pd
-from nlptools.arabert.preprocess import ArabertPreprocessor
+from sinatools.arabert.preprocess import ArabertPreprocessor
 
 def normalizearabert(s):
     model_name = 'aubmindlab/bert-base-arabertv02'
{nlptools/morphology → sinatools/utils}/charsets.py
@@ -1,6 +1,4 @@
-#
-# We acknoledge that this file charsets.py is imported from Camel tools citation. url
-#
+# We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/utils/charsets.html].
 
 import unicodedata
 
{nlptools → sinatools}/utils/implication.py
@@ -3,7 +3,7 @@
 # The matching between two words is defined as a tuple:
 # <w1, w2, implication direction, distance, conflicts, verdict, preferredWord> .
 
-from nlptools.utils.parser import arStrip
+from sinatools.utils.parser import arStrip
 class Implication:
     """
     The implication class computes whether the two Arabic words are the same or not, regardless of how they are diacritized. The output also contains implication direction, distance, number of conflicts, and other outputs.
@@ -215,7 +215,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
 
             word = Implication.normalize_alef("ًى") # Returns "ىً"
             word = Implication.normalize_alef("ًا") # Returns "اً"
@@ -249,7 +249,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
 
             diacritics = ["َ", "ُ", "ِ", "ّ"]
             has_error = Implication.diacritics_syntax_error_in(diacritics) # Returns False
@@ -288,7 +288,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
 
             diacritic = 0
             is_wrong_end = Implication.wrong_end_diacritic(diacritic) # Returns False
@@ -324,7 +324,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
 
             implication = Implication(word1, word2)
             implication.calculate_words_implication()
@@ -377,7 +377,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
 
             implication = Implication(word1, word2)
             result = implication.equal_words()
@@ -428,7 +428,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
 
             implication = Implication(word1, word2)
             result = implication.calculate_letters_implication()
@@ -508,7 +508,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
             word = "مُرَحَّبًا"
             diacritics = Implication.calculate_direction(word)
             print(diacritics)
@@ -600,7 +600,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
             word = "مرحبا"
             letters = get_letters_array(word)
             print(letters)
@@ -644,7 +644,7 @@ class Implication:
         .. highlight:: python
         .. code-block:: python
 
-            from nlptools.utils.implication import Implication
+            from sinatools.utils.implication import Implication
             w1 = "hello"
             w2 = "hell"
             implication = Implication(w1, w2)
{nlptools → sinatools}/utils/jaccard.py
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
-from nlptools.utils.parser import arStrip
-from nlptools.utils.implication import Implication
+from sinatools.utils.parser import arStrip
+from sinatools.utils.implication import Implication
 import argparse
 
 def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
{nlptools → sinatools}/utils/parser.py
@@ -1,20 +1,19 @@
 import re
 import argparse
 
-def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, alif=True , specialChars=True ):
+def arStrip(text , diacs=True , small_diacs=True , shaddah=True , digit=True, alif=True , special_chars=True ):
 
     """
-    This method removes Arabic diacritics, small diacritcs, shaddah, Latin and Arabic digits, and
-    And remove special characters, spaces, underscore and Arabic tatwelah from the input text.
+    This method removes Arabic diacritics, small diacritcs, shaddah, Latin and Arabic digits, unify alif, remove special characters, extra spaces, underscore and Arabic tatwelah from the input text.
 
     Args:
         text (:obj:`str`): Arabic text to be processed.
         diacs (:obj:`bool`): flag to remove Arabic diacretics [ ًٌٍَُِْ] (default is True).
-        smallDiacs (:obj:`bool`): flag to remove small diacretics (default is True).
+        small_diacs (:obj:`bool`): flag to remove small diacretics (default is True).
         shaddah (:obj:`bool`): flag to remove shaddah (default is True).
         digit (:obj:`bool`): flag to remove Latin and Arabic digits (default is True).
         alif (:obj:`bool`): flag to unify alif (default is True).
-        specialChars (:obj:`bool`): flag to remove special characters (default is True).
+        special_chars (:obj:`bool`): flag to remove special characters (default is True).
 
     Returns:
         :obj:`str`: stripped text.
@@ -24,19 +23,17 @@ def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, ali
     .. highlight:: python
     .. code-block:: python
 
-        from nlptools.utils import parser
-
-        print(parser.arStrip('2023الجو جميلُ'))
+        from sinatools.utils import parser
+        output = parser.arStrip('2023الجو جميلُ')
+        print(output)
 
-        #
+        # output
         الجو جميل
 
-
-        print(parser.arStrip('أَلَمۡ یَأۡنِ لِلَّذِینَ ءَامَنُوۤا۟ أَن تَخۡشَعَ قُلُوبُهُمۡ لِذِكۡرِ ٱللَّهِ وَمَا نَزَلَ مِنَ ٱلۡحَقِّ وَلَا یَكُونُوا۟ كَٱلَّذِینَ أُوتُوا۟ ٱلۡكِتَـٰبَ مِن قَبۡلُ فَطَالَ عَلَیۡهِمُ ٱلۡأَمَدُ فَقَسَتۡ قُلُوبُهُمۡۖ وَكَثِیر مِّنۡهُمۡ فَـسِقُونَ' , True , True , True , True , True , True ))
-        #
+        output = parser.arStrip('أَلَمۡ یَأۡنِ لِلَّذِینَ ءَامَنُوۤا۟ أَن تَخۡشَعَ قُلُوبُهُمۡ لِذِكۡرِ ٱللَّهِ وَمَا نَزَلَ مِنَ ٱلۡحَقِّ وَلَا یَكُونُوا۟ كَٱلَّذِینَ أُوتُوا۟ ٱلۡكِتَـٰبَ مِن قَبۡلُ فَطَالَ عَلَیۡهِمُ ٱلۡأَمَدُ فَقَسَتۡ قُلُوبُهُمۡۖ وَكَثِیر مِّنۡهُمۡ فَـسِقُونَ' , True , True , True , True , True , True )
+        print(output)
+        #output
         الم یان للذین ءامنوا ان تخشع قلوبهم لذكر الله وما نزل من الحق ولا یكونوا كالذین اوتوا الكتٰب من قبل فطال علیهم الامد فقست قلوبهم وكثیر منهم فسقون
-
-
     """
     try:
         if text: # if the input string is not empty do the following
@@ -46,7 +43,7 @@ def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, ali
                 text = re.sub(r'[\u0652]+', '',text) # Remove SUKUN
             if shaddah == True:
                 text = re.sub(r'[\u0651]+', '',text) # Remove shddah
-            if smallDiacs == True:
+            if small_diacs == True:
                 text = re.sub(r'[\u06D6-\u06ED]+', '',text) # Remove all small Quranic annotation signs
             if digit == True:
                 text = re.sub('[0-9]+', ' ',text) # Remove English digits
@@ -57,7 +54,7 @@ def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, ali
                 text = re.sub('أ', 'ا',text);
                 text = re.sub('إ', 'ا',text);
                 text = re.sub('آ', 'ا',text);
-            if specialChars == True:
+            if special_chars == True:
                 text = re.sub('[?؟!@#$%-]+' , '' , text) # Remove some of special chars
 
             text = re.sub('[\\s]+'," ",text) # Remove all spaces
@@ -83,7 +80,7 @@ def remove_punctuation(text):
     .. highlight:: python
     .. code-block:: python
 
-        from nlptools.utils import parser
+        from sinatools.utils import parser
         return parser.remove_punctuation("te!@#،$%%؟st")
 
     #output
@@ -103,12 +100,12 @@ def remove_punctuation(text):
                     r'[\u061B]+', r'[\u061E]+', r'[\u061F]+', r'[\u0640]+',
                     r'[\u0653]+', r'[\u065C]+', r'[\u066C]+', r'[\u066A]+',
                     r'["}"]+', r'["{"]+']
-        outputString = text
+        output_string = text
         for punctuation in punctuation_marks:
-            outputString = re.sub(punctuation, '', outputString)
+            output_string = re.sub(punctuation, '', output_string)
     except:
         return text
-    return outputString
+    return output_string
 
 def remove_latin(text):
     """
@@ -126,7 +123,7 @@ def remove_latin(text):
     .. highlight:: python
     .. code-block:: python
 
-        from nlptools.utils import parser
+        from sinatools.utils import parser
         return parser.remove_latin("miojkdujhvaj1546545spkdpoqfoiehwv nWEQFGWERHERTJETAWIKUYFC")
 
     #output
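
The net effect of the `arStrip` changes is a camelCase-to-snake_case rename of the keyword arguments (`smallDiacs` → `small_diacs` and the special-characters flag → `special_chars`) plus a consolidated docstring; the stripping logic itself is unchanged. A sketch of selective stripping with the new names, reusing the docstring's own example input:

```python
# Selective stripping with the 0.1.7 keyword names; the first input and its
# expected output are taken from the arStrip docstring above.
from sinatools.utils import parser

# All flags default to True: digits and diacritics are removed.
print(parser.arStrip('2023الجو جميلُ'))            # الجو جميل

# Keep digits but still strip diacritics and special characters.
print(parser.arStrip('2023الجو جميلُ', digit=False))
```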
{nlptools → sinatools}/utils/text_transliteration.py
@@ -181,7 +181,7 @@ def perform_transliteration(text , schema ):
     .. highlight:: python
     .. code-block:: python
 
-        from nlptools.utils import text_transliteration
+        from sinatools.utils import text_transliteration
 
         print(text_transliteration.perform_transliteration("مُحَمَدٌ نَـشِيْطٌـ1" , "ar2bw"))
         print(text_transliteration.perform_transliteration("muHamadN" , "bw2ar"))
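
`perform_transliteration` keeps its two-argument form; only the docstring import changed. The schemas exercised in the docstring are `ar2bw` and `bw2ar`, which by common Buckwalter convention denote Arabic-to-Buckwalter and the reverse; the calls below are copied from the docstring:

```python
# Calls copied from the perform_transliteration docstring above; "ar2bw" and
# "bw2ar" are the two schema values it exercises.
from sinatools.utils import text_transliteration

print(text_transliteration.perform_transliteration("مُحَمَدٌ نَـشِيْطٌـ1", "ar2bw"))
print(text_transliteration.perform_transliteration("muHamadN", "bw2ar"))
```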
nlptools/utils/corpus_tokenizer.py → sinatools/utils/tokenizer.py
@@ -1,7 +1,60 @@
 import os
 import csv
-from nlptools.morphology.tokenizers_words import simple_word_tokenize
-
+from sinatools.utils.tokenizers_words import simple_word_tokenize
+
+def remove_empty_values(sentences):
+    return [value for value in sentences if value != '']
+
+
+def sentence_tokenizer(text, dot=True, new_line=True, question_mark=True, exclamation_mark=True):
+    """
+    This method tokenizes a text into a set of sentences based on the selected separators, including the dot, new line, question mark, and exclamation mark.
+
+    Args:
+        text (:obj:`str`): Arabic text to be tokenized.
+        dot (:obj:`str`): flag to split text based on Dot (default is True).
+        new_line (:obj:`str`): flag to split text based on new_line (default is True).
+        question_mark (:obj:`str`): flag to split text based on question_mark (default is True).
+        exclamation_mark (:obj:`str`): flag to split text based on exclamation_mark (default is True).
+
+    Returns:
+        :obj:`list`: list of sentences.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from sinatools.utils import tokenizer
+        sentences = tokenizer.sentence_tokenizer("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.", dot=True, new_line=True, question_mark=True, exclamation_mark=True)
+        print(sentences)
+
+        #output
+        ['مختبر سينا لحوسبة اللغة والذكاء الإصطناعي.', 'في جامعة بيرزيت.']
+    """
+    separators = []
+    split_text = [text]
+    if new_line==True:
+        separators.append('\n')
+    if dot==True:
+        separators.append('.')
+    if question_mark==True:
+        separators.append('?')
+        separators.append('؟')
+    if exclamation_mark==True:
+        separators.append('!')
+
+    for sep in separators:
+        new_split_text = []
+        for part in split_text:
+            tokens = part.split(sep)
+            tokens_with_separator = [token + sep for token in tokens[:-1]]
+            tokens_with_separator.append(tokens[-1].strip())
+            new_split_text.extend(tokens_with_separator)
+        split_text = new_split_text
+
+    split_text = remove_empty_values(split_text)
+    return split_text
 
 def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
     """
@@ -28,8 +81,8 @@ def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
     .. highlight:: python
     .. code-block:: python
 
-        from nlptools.utils.corpus_tokenizer import corpus_tokenizer
-        corpus_tokenizer(dir_path="History", output_csv="ouputFile.csv", row_id = 1, global_sentence_id = 1)
+        from sinatools.utils import tokenizer
+        output = tokenizer.corpus_tokenizer(dir_path="History", output_csv="ouputFile.csv", row_id = 1, global_sentence_id = 1)
 
     #output
     # csv file called: ouputFile.csv
@@ -55,7 +108,7 @@ def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
             file_path = os.path.join(root, file)
             with open(file_path, 'r', encoding="utf-8") as f:
                 content = f.read()
-                sentences = …
+                sentences = sentence_tokenizer(content, dot=True, new_line=True, question_mark=False, exclamation_mark=False)
             for sentence_id, sentence in enumerate(sentences, start=1):
                 words = simple_word_tokenize(sentence)
                 global_sentence_id += 1
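
The new `sentence_tokenizer` repeatedly splits on each enabled separator while re-attaching the separator to the sentence it terminates, then drops empty fragments. Its docstring example, restated as a runnable sketch (the expected output is the one the docstring gives):

```python
# Runnable restatement of the sentence_tokenizer docstring example above.
from sinatools.utils import tokenizer

sentences = tokenizer.sentence_tokenizer(
    "مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.",
    dot=True, new_line=True, question_mark=True, exclamation_mark=True)
print(sentences)
# Expected, per the docstring:
# ['مختبر سينا لحوسبة اللغة والذكاء الإصطناعي.', 'في جامعة بيرزيت.']
```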
{nlptools/morphology → sinatools/utils}/tokenizers_words.py
@@ -1,11 +1,8 @@
-#
-
-# -*- coding: utf-8 -*-
-
+# We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/tokenizers/word.html].
 
 import re
-from nlptools.morphology.charsets import UNICODE_PUNCT_SYMBOL_CHARSET
-from nlptools.morphology.charsets import UNICODE_LETTER_MARK_NUMBER_CHARSET
+from sinatools.utils.charsets import UNICODE_PUNCT_SYMBOL_CHARSET
+from sinatools.utils.charsets import UNICODE_LETTER_MARK_NUMBER_CHARSET
 
 
 _ALL_PUNCT = u''.join(UNICODE_PUNCT_SYMBOL_CHARSET)
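
The relocated `tokenizers_words.py` builds its punctuation and letter tables from the shared `charsets` module (now under `sinatools.utils`), in the style of the Camel Tools word tokenizer it acknowledges. A hedged sketch of the exported `simple_word_tokenize`, whose body is not shown in this diff; assuming Camel-Tools-like behavior, punctuation is separated into its own tokens:

```python
# Hedged sketch: simple_word_tokenize's implementation is not shown in this
# diff. Assuming Camel-Tools-like behavior, punctuation becomes its own token.
from sinatools.utils.tokenizers_words import simple_word_tokenize

print(simple_word_tokenize("مرحبا، كيف حالك؟"))
# Expected shape (assumption): ['مرحبا', '،', 'كيف', 'حالك', '؟']
```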
SinaTools-0.1.3.dist-info/RECORD
DELETED
@@ -1,122 +0,0 @@
-SinaTools-0.1.3.data/data/nlptools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
-nlptools/VERSION,sha256=2_CXjsK1h6XWGH_cxBzOn_LA647vrboOtR84QKtu60Y,5
-nlptools/__init__.py,sha256=OoA_p_y2jPjMytcUrG1ED5uJlJemVhSRr9L9Wsym-rQ,134
-nlptools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
-nlptools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
-nlptools/nlptools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
-nlptools/CLI/DataDownload/download_files.py,sha256=PMDEPXxZQbrFo-7iyhvrCpzx2RG5T5kPk6NJAwh8RSI,2322
-nlptools/CLI/arabiner/bin/infer.py,sha256=YrNCVro8B3UxpsHjIo_01qiBQURpDNTK7pKTkw1L21Y,4921
-nlptools/CLI/arabiner/bin/infer2.py,sha256=CtR9rwe20ks_qq-l_fQU-ThLqft_1o3Ztmd1my1kHMg,3905
-nlptools/CLI/morphology/ALMA_multi_word.py,sha256=NINts8BtT8BGQPBvs4BJ_y2PsR7czsGPOVAwngaT85A,2644
-nlptools/CLI/morphology/morph_analyzer.py,sha256=39vrFx6ppu7yEITcz8lAJhk3xHweaPWEqL-CcqBM37Q,3565
-nlptools/CLI/salma/salma_tools.py,sha256=7awpCb68QUc3kx-EuwRHxDmItZlX2aSdpukwKF1G3Fo,1999
-nlptools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nlptools/CLI/utils/arStrip.py,sha256=dzy16wZfSznkvGHHBn5P21EvyusKB55dqrZ4zbaa41w,3621
-nlptools/CLI/utils/corpus_tokenizer.py,sha256=S0YG8FRS29K1C8eJVEYuWSV1ABS7PKymlNS7KxvYqxI,2817
-nlptools/CLI/utils/implication.py,sha256=hjYTN0oiLf0bz0bRO_GD4rphZkaB3cH770clFFhuevE,3172
-nlptools/CLI/utils/jaccard.py,sha256=a6oc28yMgm7UewO6Lz25A4Yv8QEzVa85XF-QV9uhMwI,4639
-nlptools/CLI/utils/latin_remove.py,sha256=Xw6PB4GtMLLiYK3zTEwdLhBbivMyy1msD5Ab_QdJoQA,1303
-nlptools/CLI/utils/remove_Punc.py,sha256=dvSiSs9UulhGCogBgtpD8fU860BFuMBTnwa8Ek9aPKQ,1393
-nlptools/CLI/utils/sentence_tokenizer.py,sha256=AcJa_yRdlQqKMwVWWKSv1vRO1Yk-NK75-NpalkHqewc,3469
-nlptools/CLI/utils/text_transliteration.py,sha256=blIGB8FeF10iFeXADM-z01XJ4qeB1qgj6S2Xnk9w5fI,2266
-nlptools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nlptools/DataDownload/downloader.py,sha256=yONVa99OtPXD5Lewy4Fm3eUiJMpBt492G1JOPh5sXAU,6523
-nlptools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
-nlptools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
-nlptools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
-nlptools/arabert/arabert/create_classification_data.py,sha256=BhemGNRbYz_Pun0Q5WerN2-9n-ILmU3tm4J-OlHw5-A,7678
-nlptools/arabert/arabert/create_pretraining_data.py,sha256=2M-cF3CLHbQ0cdWrzFT6Frg1vVP4Y-CFoq8iEPyxgsE,18924
-nlptools/arabert/arabert/extract_features.py,sha256=C1IzASrlX7u4_M2xdr_PjzWfTRZgklhUXA2WHKgQt-I,15585
-nlptools/arabert/arabert/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
-nlptools/arabert/arabert/modeling.py,sha256=KliecCmA1pP3owg0mYge6On3IRHunMF5kMLuEwc0VLw,40896
-nlptools/arabert/arabert/optimization.py,sha256=Wx0Js6Zsfc3iVw-_7Q1SCnxfP_qqbdTAyFD-vZSpOyk,8153
-nlptools/arabert/arabert/run_classifier.py,sha256=AdVGyvidlmbEp12b-PauiBo6EmFLEO7tqeJKuLhK2DA,38777
-nlptools/arabert/arabert/run_pretraining.py,sha256=yO16nKkHDfcYA2Zx7vv8KN4te6_1qFOzyVeDzFT-DQw,21894
-nlptools/arabert/arabert/run_squad.py,sha256=PORxgiByP8L6vZqAFkqgHPJ_ZjAlqlg64gtkdLmDNns,53456
-nlptools/arabert/arabert/tokenization.py,sha256=R6xkyCb8_vgeksXiLeqDvV5vOnLb1cPNsvfDij6YVFk,14132
-nlptools/arabert/araelectra/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
-nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py,sha256=pIo6VFT3XXOYroZaab3msZAP6XjCKu0KcrIZQA0Pj8U,3881
-nlptools/arabert/araelectra/build_pretraining_dataset.py,sha256=Z8ZmKznaE_2SPDRoPYR1SDhjTN_NTpNCFFuhUkykwl8,9041
-nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py,sha256=W7HFr1XoO6bCDR7X7w-bOuwULFtTSjeKbJ2LHzzHf9k,3224
-nlptools/arabert/araelectra/configure_finetuning.py,sha256=YfGLMdgN6Qqm357Mzy5UMjkuLPPWtBs7f4dA-DKE6JM,7768
-nlptools/arabert/araelectra/configure_pretraining.py,sha256=oafQgu4WmVdxBcU5mSfXhPlvCk43CJwAWXC10Q58BlI,5801
-nlptools/arabert/araelectra/flops_computation.py,sha256=krHTeuPH9xQu5ldprBOPJNlJRvC7fmmvXXqUjfWrzPE,9499
-nlptools/arabert/araelectra/run_finetuning.py,sha256=JecbrSmGikBNyid4JKRZ49Rm5xFpt02WfgIIcs3TpcU,12976
-nlptools/arabert/araelectra/run_pretraining.py,sha256=1K2aAFTY0p3iaLY0xkhTlm6v0B-Zun8SwEzz-K6RXM4,20665
-nlptools/arabert/araelectra/finetune/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
-nlptools/arabert/araelectra/finetune/feature_spec.py,sha256=cqNlBa2KK_G1-vkKm1EJUv6BoS3gesCUAHwVagZB6wM,1888
-nlptools/arabert/araelectra/finetune/preprocessing.py,sha256=1mf7-IxknCRsobQZ-VV1zs4Cwt-mfOtoVxysDJa9LZ0,6657
-nlptools/arabert/araelectra/finetune/scorer.py,sha256=PjRg0P5ANCtul2ute7ccq3mRCCoIAoCb-lVLlwd4rVY,1571
-nlptools/arabert/araelectra/finetune/task.py,sha256=zM8M4PGSIrY2u6ytpmkQEXxG-jjoeN9wouEyVR23qeQ,1991
-nlptools/arabert/araelectra/finetune/task_builder.py,sha256=Zsoiuw5M3Ca8QhaZVLVLZyWw09K5R75UeMuPmazMlHI,2768
-nlptools/arabert/araelectra/model/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
-nlptools/arabert/araelectra/model/modeling.py,sha256=5XLIutnmr-SFQOV_XntJ-U5evSCY-J2e9NjvlwVXKkk,40877
-nlptools/arabert/araelectra/model/optimization.py,sha256=BCMb_C5hgBw7wC9ZR8AQ4lwoPopqLIcSiqcCrIjx9XU,7254
-nlptools/arabert/araelectra/model/tokenization.py,sha256=9CkyPzs3L6OEPzN-7EWQDNQmW2mIJoZD4o1rn6xLdL4,11082
-nlptools/arabert/araelectra/pretrain/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
-nlptools/arabert/araelectra/pretrain/pretrain_data.py,sha256=NLgIcLAq1-MgtBNXYu_isDxnOY5k67SyADYy-8nzBok,5413
-nlptools/arabert/araelectra/pretrain/pretrain_helpers.py,sha256=nFl7LEdxAU5kKwiodqJHzi-ty9jMFsCCNYOF__A69j8,9255
-nlptools/arabert/araelectra/util/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
-nlptools/arabert/araelectra/util/training_utils.py,sha256=7h_J1ljUWM0ynBcofEtjZWL_oAfZtTxEemQLkixgn-0,4142
-nlptools/arabert/araelectra/util/utils.py,sha256=G0UAETUCZMlU9R9ASD9AXrWZeodWI1aZJEE9F-goaH4,2591
-nlptools/arabert/aragpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
-nlptools/arabert/aragpt2/create_pretraining_data.py,sha256=fFa2_DAyTwc8L2IqQbshsh_Ia26nj1qtVLzC6DooSac,3105
-nlptools/arabert/aragpt2/train_bpe_tokenizer.py,sha256=b-8zHQ02fLmZV4GfjnrPptwjpX259F41SlnWzBrflMA,1888
-nlptools/arabert/aragpt2/gpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
-nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
-nlptools/arabert/aragpt2/gpt2/optimization.py,sha256=iqh23cypRSRUt53wt2G5SbNNpJMwERM7gZAOKVh5l4U,8411
-nlptools/arabert/aragpt2/gpt2/run_pretraining.py,sha256=4jjkUbvTO1DHoKJ89yKtlkkofcND_fyAunQ-mlnJhTM,13298
-nlptools/arabert/aragpt2/grover/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nlptools/arabert/aragpt2/grover/dataloader.py,sha256=-FWPTjtsvweEE1WaWRHBXfOSbsGiUmnXT3qK7KJP8cM,6853
-nlptools/arabert/aragpt2/grover/modeling.py,sha256=XcUvFwqRaxAwWiJstrH2FPBvDJe03pTWIyipdMfWj9g,38280
-nlptools/arabert/aragpt2/grover/modeling_gpt2.py,sha256=WFpCWn1792yATFzt8rZ0rpWvExfbLzV2BqiEs7llFUw,51602
-nlptools/arabert/aragpt2/grover/optimization_adafactor.py,sha256=1geOsCWuv5xxtSnKDz9a8aY5SVwZ1MGq-xVQDBg4Gpg,9765
-nlptools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKweY1aPz1XPDw6odld0,7102
-nlptools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
-nlptools/arabiner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nlptools/arabiner/bin/__init__.py,sha256=d1ToN2uheCCVby3TjiSuD1dqo_pvNIuTgz4COFr2Khs,438
-nlptools/arabiner/bin/eval.py,sha256=ihtjJinY1jXpZXW5bQJzTC5MF6_V3GQ5zHzsc691_HQ,2591
-nlptools/arabiner/bin/infer.py,sha256=EZKeq4zucIE-ooHYnegODNxsRiIY_gY5GvDPChH5WRQ,3237
-nlptools/arabiner/bin/process.py,sha256=4QCZjsmYV5lep6waQE37fs7Fe59_1G5seIJLDkArg4s,4698
-nlptools/arabiner/bin/train.py,sha256=hf6ZRhqMZ7bFealMSusBjtWrbzHGHc5HB2Lh4rp2uQA,6390
-nlptools/arabiner/data/__init__.py,sha256=XPic1bPijmZda_LPFL5J6TOps_IHUTiBDJvMx-iJqKo,61
-nlptools/arabiner/data/datasets.py,sha256=p52Uc8Q2D3EzN1OmoHQcWVsJ2oB3TqgTzAcy1B9fJ68,5068
-nlptools/arabiner/data/transforms.py,sha256=KPCDdjZOEvhMC38eiFwJuiQC84cfDrvC0XM4Ye0o3do,4878
-nlptools/arabiner/nn/BaseModel.py,sha256=3GmujQasTZZunOBuFXpY2p1W8W256iI_Uu4hxhOY2Z0,608
-nlptools/arabiner/nn/BertNestedTagger.py,sha256=7vU2tmDSoqSHn6GvMJmyN0hEMLvCkbr_r-AaiAaYdw8,1223
-nlptools/arabiner/nn/BertSeqTagger.py,sha256=dFcBBiMw2QCWsyy7aQDe_PS3aRuNn4DOxKIHgTblFvc,504
-nlptools/arabiner/nn/__init__.py,sha256=ZN7Psm83pysUhGI3ZSaJra2aCYBZb9DZ0UX4CiKGc0A,182
-nlptools/arabiner/trainers/BaseTrainer.py,sha256=oZgFJW-CawfCKT5gtaBHA7Q7XjNfiyqM62KnFsgVzPU,3919
-nlptools/arabiner/trainers/BertNestedTrainer.py,sha256=hVqPRdmaHf2iwftseNpgsAfwGkl6eMHJx1rKunQS_vM,8443
-nlptools/arabiner/trainers/BertTrainer.py,sha256=KkcgZXu6kqsrrnfFtiAQ8ucLsrQtDxLRqdbTiTnRWqI,6447
-nlptools/arabiner/trainers/__init__.py,sha256=kt8WqsaOjX0h1JMa-v7Y9ywT5mfwQIsZTyVWnIAWsEQ,200
-nlptools/arabiner/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nlptools/arabiner/utils/data.py,sha256=uuPiu-7v0gccNygZjdTKomJGE7X0H9FC24Y9nHZpf4c,4376
-nlptools/arabiner/utils/helpers.py,sha256=PyOOlx5uabvZVmU3SZtZ3ZLA3pliinJ3JXsvos9SUWU,5032
-nlptools/arabiner/utils/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
-nlptools/morphology/ALMA_multi_word.py,sha256=hlzZCk-uUdZ-GbiPsFxDTvoWoIuVof2Sm7NdaxaFipM,1313
-nlptools/morphology/__init__.py,sha256=z6_RGhiyfNHXNKMmhNSI6ObTLmdjQyP58vsFottI8GA,1706
-nlptools/morphology/charsets.py,sha256=7w9OrbnZTnLU3A9q-SUi9GhUN97qNtbYR5T0Pm72cF8,2784
-nlptools/morphology/morph_analyzer.py,sha256=OmCxm4-fM2qfYzKk8yOd6D_T3RsfzZCcd7Oz2V4Advg,6507
-nlptools/morphology/settings.py,sha256=sEZdnA7MiYXHdxrfHWXop1RcKClVzpOYzZwzHC1PxJ8,144
-nlptools/morphology/tokenizers_words.py,sha256=Smtt_KXifl2wRI464Qn07PtUvOsyGBJjZ7E20gd8zVM,602
-nlptools/salma/__init__.py,sha256=pOauGjD-xrGHw05sNx3EiSFc_wpM3bD1vJxQHoDDXOA,376
-nlptools/salma/settings.py,sha256=fqAQg2b22gorzT9Pf_AEJD9p8AlVUaVyKD3FH8g2yUs,1110
-nlptools/salma/views.py,sha256=EH1vc6P88CeAIzQKt7EU_HTI0uJipv4JdXiAX5NjrJY,18416
-nlptools/salma/wsd.py,sha256=kmP5ZvvVMkxApgk91TAGSBkMJZbPPbS0qoNk8OE37og,4434
-nlptools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-nlptools/utils/corpus_tokenizer.py,sha256=IDWh87XJaFa7V2P4kIxY4QVywPKhz0fIErc_c0gJGUU,4581
-nlptools/utils/implication.py,sha256=Ro1Vw62oOBzELkX-zpHyieq4v2OsoyFrFeTU7BiK7qc,27794
-nlptools/utils/jaccard.py,sha256=TTC5KTVv6kONw5vZtzxEQvv7QM79BCsD0xcJAY0T5tU,10111
-nlptools/utils/parser.py,sha256=0Yd40CZf4wXso2q-d9LULUNAVUAMdiYMImfcVb6i9qQ,6175
-nlptools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
-nlptools/utils/sentence_tokenizer.py,sha256=3C0Wx1ns8ZHiGwKlUkcti-8zA3fB4ju0fIEtGACM7oU,2162
-nlptools/utils/text_transliteration.py,sha256=zhB3sFXSMtkkdqImRMVg415AAB80DOm9lMFKb2IBynw,8765
-nlptools/utils/utils.py,sha256=vKkFOkYclMu8nXS_VZb6Kobx8QGKW9onXkkLCeiRb6g,32
-SinaTools-0.1.3.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
-SinaTools-0.1.3.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
-SinaTools-0.1.3.dist-info/METADATA,sha256=zxuxnKe_i5AAHNC_uPGxpmAzB2T2y01iL-kHIRV5H-o,1527
-SinaTools-0.1.3.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
-SinaTools-0.1.3.dist-info/entry_points.txt,sha256=9-PNkvWGCid8SVN03S2NkJFuxAzvcB22tGpHe-et2q8,951
-SinaTools-0.1.3.dist-info/top_level.txt,sha256=sREDI6iHe4D0BZQmZbZ-LxYIn2cBWUayk9CZwAR9jaE,9
-SinaTools-0.1.3.dist-info/RECORD,,
@@ -1,18 +0,0 @@
|
|
1
|
-
[console_scripts]
|
2
|
-
arabi_ner = nlptools.CLI.arabiner.bin.infer:main
|
3
|
-
arabi_ner2 = nlptools.CLI.arabiner.bin.infer2:main
|
4
|
-
install_env = nlptools.install_env:main
|
5
|
-
sina_alma_multi_word = nlptools.CLI.morphology.ALMA_multi_word:main
|
6
|
-
sina_appdatadir = nlptools.CLI.DataDownload.get_appdatadir:main
|
7
|
-
sina_arStrip = nlptools.CLI.utils.arStrip:main
|
8
|
-
sina_corpus_tokenizer = nlptools.CLI.utils.corpus_tokenizer:main
|
9
|
-
sina_download_files = nlptools.CLI.DataDownload.download_files:main
|
10
|
-
sina_implication = nlptools.CLI.utils.implication:main
|
11
|
-
sina_jaccard_similarity = nlptools.CLI.utils.jaccard:main
|
12
|
-
sina_morph_analyze = nlptools.CLI.morphology.morph_analyzer:main
|
13
|
-
sina_remove_latin = nlptools.CLI.utils.latin_remove:main
|
14
|
-
sina_remove_punctuation = nlptools.CLI.utils.remove_Punc:main
|
15
|
-
sina_salma = nlptools.CLI.salma.salma_tools:main
|
16
|
-
sina_sentence_tokenize = nlptools.CLI.utils.sentence_tokenizer:main
|
17
|
-
sina_transliterate = nlptools.CLI.utils.text_transliteration:main
|
18
|
-
|
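
Each line of the deleted `entry_points.txt` maps a console command to a module-level `main()` function; the 0.1.7 wheel ships a replacement file (18 lines added, per the file list) whose contents are not shown in this diff. For reference, an illustrative setuptools snippet showing how such an entry is declared at build time, using one of the 0.1.3 entries listed above (this is not taken from SinaTools' actual setup files):

```python
# Illustrative setuptools declaration that would produce one of the 0.1.3
# console_scripts entries above; not copied from SinaTools' real setup files.
from setuptools import setup

setup(
    name="SinaTools",
    entry_points={
        "console_scripts": [
            "sina_morph_analyze = nlptools.CLI.morphology.morph_analyzer:main",
        ],
    },
)
```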
SinaTools-0.1.3.dist-info/top_level.txt
DELETED
@@ -1 +0,0 @@
-nlptools
nlptools/CLI/morphology/morph_analyzer.py
DELETED
@@ -1,91 +0,0 @@
-"""
-About:
-------
-The sina_morph_analyze tool is designed to provide morphological analysis for Arabic text using the SinaTools' `analyze` utility. Users can specify the language and desired analysis task (e.g., lemmatization, part-of-speech tagging, or a full morphological analysis).
-
-Usage:
-------
-Below is the usage information that can be generated by running sina_morph_analyze --help.
-
-.. code-block:: none
-
-    sina_morph_analyze --text=TEXT [OPTIONS]
-    sina_morph_analyze --file=FILE [OPTIONS]
-
-Options:
---------
-
-.. code-block:: none
-
-    --text TEXT
-        The text that needs to be morphologically analyzed.
-
-    --file FILE
-        File containing the text to be morphologically analyzed
-
-    --language LANGUAGE [default=MSA]
-        Specifies the language for the analysis. The default is MSA (Modern Standard Arabic).
-        Use other codes as appropriate for your requirements.
-
-    --task TASK [default=full]
-        Determines the specific type of morphological analysis to be performed. Available options are:
-        - lemmatizer: Provides lemmatization results.
-        - pos: Provides part-of-speech tagging.
-        - full: Provides a comprehensive morphological analysis.
-        The default is a full morphological analysis.
-
-Examples:
----------
-
-.. code-block:: none
-
-    sina_morph_analyze --text "Your Arabic text here" --language MSA --task full
-    sina_morph_analyze --text "Your Arabic text here" --task lemmatizer
-    sina_morph_analyze --file "path/to/your/file.txt" --language MSA --task full
-    sina_morph_analyze --file "path/to/your/file.txt" --task lemmatizer
-
-Note:
------
-
-.. code-block:: none
-
-    - Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
-    - The quality and accuracy of the analysis depend on the underlying capabilities of the SinaTools' `analyze` utility.
-    - The analysis can be influenced by the choice of language. Ensure you are using the correct language setting.
-
-"""
-
-import argparse
-from nlptools.morphology.morph_analyzer import analyze
-from nlptools.utils.readfile import read_file
-
-def main():
-    parser = argparse.ArgumentParser(description='Morphological Analysis using SinaTools')
-
-    # Adding arguments for the text, file, language, and task
-    parser.add_argument('--text', type=str, help='Text to be morphologically analyzed')
-    parser.add_argument('--file', type=str, help='File containing the text to be morphologically analyzed')
-    parser.add_argument('--language', type=str, default='MSA', help='Language for analysis (default: MSA)')
-    parser.add_argument('--task', type=str, default='full', choices=['lemmatizer', 'pos', 'full'],
-                        help='Task for the result filter [lemmatizer, pos, full] (default: full)')
-
-    args = parser.parse_args()
-
-    if args.text is None and args.file is None:
-        print("Error: Either --text or --file argument must be provided.")
-        return
-
-    # Get the input either from the --text argument or from the file specified in the --file argument
-    input_text = args.text if args.text else " ".join(read_file(args.file))
-
-    # Perform morphological analysis
-    results = analyze(input_text, args.language, args.task)
-
-    # Print the results
-    for result in results:
-        print(result)
-
-if __name__ == '__main__':
-    main()
-#sina_morph_analyze --text "Your Arabic text here" --language MSA --task full
-#sina_morph_analyze --file "path/to/your/file.txt" --language MSA --task full
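
The file list shows this CLI was replaced by `sinatools/CLI/morphology/morph_analyzer.py` (+80 lines, not shown here). The library call it wrapped remains available under the renamed package; a sketch of invoking it directly, assuming `analyze()` keeps the `(text, language, task)` signature used by the deleted script:

```python
# Sketch of calling the analyzer directly in 0.1.7, assuming analyze() keeps
# the (text, language, task) signature used by the deleted CLI above.
from sinatools.morphology.morph_analyzer import analyze

results = analyze("اللغة العربية جميلة", "MSA", "full")  # illustrative input
for result in results:
    print(result)
```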
nlptools/CLI/utils/corpus_tokenizer.py
DELETED
@@ -1,74 +0,0 @@
-"""
-
-About:
-------
-
-The sina_corpus_tokenizer tool offers functionality to tokenize a corpus and write the results to a CSV file. It recursively searches through a specified directory for text files, tokenizes the content, and outputs the results, including various metadata, to a specified CSV file.
-
-Usage:
-------
-
-Below is the usage information that can be generated by running sina_corpus_tokenizer --help.
-
-.. code-block:: none
-
-    Usage:
-        sina_corpus_tokenizer dir_path output_csv
-
-.. code-block:: none
-
-    Positional Arguments:
-        dir_path
-            The path to the directory containing the text files.
-
-        output_csv
-            The path to the output CSV file.
-
-Examples:
----------
-
-.. code-block:: none
-
-    sina_corpus_tokenizer --dir_path "/path/to/text/directory/of/files" --output_csv "outputFile.csv"
-
-Note:
------
-
-.. code-block:: none
-
-    - The tool only processes text files (with a .txt extension).
-    - The output CSV will contain the following columns:
-        - 'Row_ID' (a unique identifier for each records in outputfile)
-        - 'Docs_Sentence_Word_ID' (a concatenated identifier comprising directory name, file name, global sentence id, sentence id, and word position).
-        - 'GlobalSentenceID' (Integer, a unique identifier for each sentence in the entire file)
-        - 'SentenceID' (Integer, a unique identifier for each file within the CSV file)
-        - 'Sentence' (Generated text that forms a sentence)
-        - 'Word Position' (Integer, the position of each word within the sentence)
-        - 'Word' (Each row contains a word from the generated sentence).
-    - Ensure that the text files are appropriately encoded in UTF-8 or compatible formats.
-    - The tool uses the `nltk` library for sentence and word tokenization. Make sure to have the library installed in your environment.
-"""
-
-import argparse
-from nlptools.utils.corpus_tokenizer import corpus_tokenizer
-
-# Define the main function that will parse the arguments
-def main():
-    # Create an ArgumentParser object
-    parser = argparse.ArgumentParser(description='Tokenize the corpus and write the results to a CSV file.')
-
-    # Add arguments to the parser
-    parser.add_argument('--dir_path', type=str, help='The path to the directory containing the text files.')
-    parser.add_argument('--output_csv', type=str, help='The path to the output CSV file.')
-
-    # Parse the command-line arguments
-    args = parser.parse_args()
-
-    # Call the corpus_tokenizer function with the parsed arguments
-    corpus_tokenizer(args.dir_path, args.output_csv)
-
-# Call the main function when the script is executed
-if __name__ == '__main__':
-    main()
-
-#sina_corpus_tokenizer /path/to/text/files output.csv