SinaTools 0.1.3__py2.py3-none-any.whl → 0.1.7__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/METADATA +14 -20
  2. SinaTools-0.1.7.dist-info/RECORD +101 -0
  3. SinaTools-0.1.7.dist-info/entry_points.txt +18 -0
  4. SinaTools-0.1.7.dist-info/top_level.txt +1 -0
  5. {nlptools → sinatools}/CLI/DataDownload/download_files.py +9 -9
  6. {nlptools → sinatools}/CLI/morphology/ALMA_multi_word.py +10 -20
  7. sinatools/CLI/morphology/morph_analyzer.py +80 -0
  8. nlptools/CLI/arabiner/bin/infer2.py → sinatools/CLI/ner/corpus_entity_extractor.py +5 -9
  9. nlptools/CLI/arabiner/bin/infer.py → sinatools/CLI/ner/entity_extractor.py +4 -8
  10. {nlptools → sinatools}/CLI/salma/salma_tools.py +8 -8
  11. {nlptools → sinatools}/CLI/utils/arStrip.py +10 -21
  12. sinatools/CLI/utils/corpus_tokenizer.py +50 -0
  13. {nlptools → sinatools}/CLI/utils/implication.py +9 -9
  14. {nlptools → sinatools}/CLI/utils/jaccard.py +10 -10
  15. sinatools/CLI/utils/remove_latin.py +34 -0
  16. sinatools/CLI/utils/remove_punctuation.py +42 -0
  17. {nlptools → sinatools}/CLI/utils/sentence_tokenizer.py +9 -22
  18. {nlptools → sinatools}/CLI/utils/text_transliteration.py +10 -17
  19. {nlptools → sinatools}/DataDownload/downloader.py +10 -10
  20. sinatools/VERSION +1 -0
  21. {nlptools → sinatools}/__init__.py +1 -1
  22. {nlptools → sinatools}/morphology/ALMA_multi_word.py +4 -5
  23. {nlptools → sinatools}/morphology/__init__.py +4 -14
  24. sinatools/morphology/morph_analyzer.py +172 -0
  25. sinatools/ner/__init__.py +12 -0
  26. nlptools/arabiner/bin/infer.py → sinatools/ner/entity_extractor.py +9 -8
  27. {nlptools → sinatools}/salma/__init__.py +2 -2
  28. {nlptools → sinatools}/salma/settings.py +1 -1
  29. {nlptools → sinatools}/salma/views.py +12 -12
  30. {nlptools → sinatools}/salma/wsd.py +2 -2
  31. {nlptools/morphology → sinatools/utils}/charsets.py +1 -3
  32. {nlptools → sinatools}/utils/implication.py +10 -10
  33. {nlptools → sinatools}/utils/jaccard.py +2 -2
  34. {nlptools → sinatools}/utils/parser.py +18 -21
  35. {nlptools → sinatools}/utils/text_transliteration.py +1 -1
  36. nlptools/utils/corpus_tokenizer.py → sinatools/utils/tokenizer.py +58 -5
  37. {nlptools/morphology → sinatools/utils}/tokenizers_words.py +3 -6
  38. SinaTools-0.1.3.dist-info/RECORD +0 -122
  39. SinaTools-0.1.3.dist-info/entry_points.txt +0 -18
  40. SinaTools-0.1.3.dist-info/top_level.txt +0 -1
  41. nlptools/CLI/morphology/morph_analyzer.py +0 -91
  42. nlptools/CLI/utils/corpus_tokenizer.py +0 -74
  43. nlptools/CLI/utils/latin_remove.py +0 -51
  44. nlptools/CLI/utils/remove_Punc.py +0 -53
  45. nlptools/VERSION +0 -1
  46. nlptools/arabiner/bin/__init__.py +0 -14
  47. nlptools/arabiner/bin/eval.py +0 -87
  48. nlptools/arabiner/bin/process.py +0 -140
  49. nlptools/arabiner/bin/train.py +0 -221
  50. nlptools/arabiner/data/__init__.py +0 -1
  51. nlptools/arabiner/data/datasets.py +0 -146
  52. nlptools/arabiner/data/transforms.py +0 -118
  53. nlptools/arabiner/nn/BaseModel.py +0 -22
  54. nlptools/arabiner/nn/BertNestedTagger.py +0 -34
  55. nlptools/arabiner/nn/BertSeqTagger.py +0 -17
  56. nlptools/arabiner/nn/__init__.py +0 -3
  57. nlptools/arabiner/trainers/BaseTrainer.py +0 -117
  58. nlptools/arabiner/trainers/BertNestedTrainer.py +0 -203
  59. nlptools/arabiner/trainers/BertTrainer.py +0 -163
  60. nlptools/arabiner/trainers/__init__.py +0 -3
  61. nlptools/arabiner/utils/__init__.py +0 -0
  62. nlptools/arabiner/utils/data.py +0 -124
  63. nlptools/arabiner/utils/helpers.py +0 -151
  64. nlptools/arabiner/utils/metrics.py +0 -69
  65. nlptools/morphology/morph_analyzer.py +0 -170
  66. nlptools/morphology/settings.py +0 -8
  67. nlptools/utils/__init__.py +0 -0
  68. nlptools/utils/sentence_tokenizer.py +0 -53
  69. {SinaTools-0.1.3.data/data/nlptools → SinaTools-0.1.7.data/data/sinatools}/environment.yml +0 -0
  70. {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/AUTHORS.rst +0 -0
  71. {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/LICENSE +0 -0
  72. {SinaTools-0.1.3.dist-info → SinaTools-0.1.7.dist-info}/WHEEL +0 -0
  73. {nlptools → sinatools}/CLI/utils/__init__.py +0 -0
  74. {nlptools → sinatools}/DataDownload/__init__.py +0 -0
  75. {nlptools → sinatools}/arabert/__init__.py +0 -0
  76. {nlptools → sinatools}/arabert/arabert/__init__.py +0 -0
  77. {nlptools → sinatools}/arabert/arabert/create_classification_data.py +0 -0
  78. {nlptools → sinatools}/arabert/arabert/create_pretraining_data.py +0 -0
  79. {nlptools → sinatools}/arabert/arabert/extract_features.py +0 -0
  80. {nlptools → sinatools}/arabert/arabert/lamb_optimizer.py +0 -0
  81. {nlptools → sinatools}/arabert/arabert/modeling.py +0 -0
  82. {nlptools → sinatools}/arabert/arabert/optimization.py +0 -0
  83. {nlptools → sinatools}/arabert/arabert/run_classifier.py +0 -0
  84. {nlptools → sinatools}/arabert/arabert/run_pretraining.py +0 -0
  85. {nlptools → sinatools}/arabert/arabert/run_squad.py +0 -0
  86. {nlptools → sinatools}/arabert/arabert/tokenization.py +0 -0
  87. {nlptools → sinatools}/arabert/araelectra/__init__.py +0 -0
  88. {nlptools → sinatools}/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
  89. {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset.py +0 -0
  90. {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
  91. {nlptools → sinatools}/arabert/araelectra/configure_finetuning.py +0 -0
  92. {nlptools → sinatools}/arabert/araelectra/configure_pretraining.py +0 -0
  93. {nlptools → sinatools}/arabert/araelectra/finetune/__init__.py +0 -0
  94. {nlptools → sinatools}/arabert/araelectra/finetune/feature_spec.py +0 -0
  95. {nlptools → sinatools}/arabert/araelectra/finetune/preprocessing.py +0 -0
  96. {nlptools → sinatools}/arabert/araelectra/finetune/scorer.py +0 -0
  97. {nlptools → sinatools}/arabert/araelectra/finetune/task.py +0 -0
  98. {nlptools → sinatools}/arabert/araelectra/finetune/task_builder.py +0 -0
  99. {nlptools → sinatools}/arabert/araelectra/flops_computation.py +0 -0
  100. {nlptools → sinatools}/arabert/araelectra/model/__init__.py +0 -0
  101. {nlptools → sinatools}/arabert/araelectra/model/modeling.py +0 -0
  102. {nlptools → sinatools}/arabert/araelectra/model/optimization.py +0 -0
  103. {nlptools → sinatools}/arabert/araelectra/model/tokenization.py +0 -0
  104. {nlptools → sinatools}/arabert/araelectra/pretrain/__init__.py +0 -0
  105. {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_data.py +0 -0
  106. {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
  107. {nlptools → sinatools}/arabert/araelectra/run_finetuning.py +0 -0
  108. {nlptools → sinatools}/arabert/araelectra/run_pretraining.py +0 -0
  109. {nlptools → sinatools}/arabert/araelectra/util/__init__.py +0 -0
  110. {nlptools → sinatools}/arabert/araelectra/util/training_utils.py +0 -0
  111. {nlptools → sinatools}/arabert/araelectra/util/utils.py +0 -0
  112. {nlptools → sinatools}/arabert/aragpt2/__init__.py +0 -0
  113. {nlptools → sinatools}/arabert/aragpt2/create_pretraining_data.py +0 -0
  114. {nlptools → sinatools}/arabert/aragpt2/gpt2/__init__.py +0 -0
  115. {nlptools → sinatools}/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
  116. {nlptools → sinatools}/arabert/aragpt2/gpt2/optimization.py +0 -0
  117. {nlptools → sinatools}/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
  118. {nlptools → sinatools}/arabert/aragpt2/grover/__init__.py +0 -0
  119. {nlptools → sinatools}/arabert/aragpt2/grover/dataloader.py +0 -0
  120. {nlptools → sinatools}/arabert/aragpt2/grover/modeling.py +0 -0
  121. {nlptools → sinatools}/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
  122. {nlptools → sinatools}/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
  123. {nlptools → sinatools}/arabert/aragpt2/grover/train_tpu.py +0 -0
  124. {nlptools → sinatools}/arabert/aragpt2/grover/utils.py +0 -0
  125. {nlptools → sinatools}/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
  126. {nlptools → sinatools}/arabert/preprocess.py +0 -0
  127. {nlptools → sinatools}/environment.yml +0 -0
  128. {nlptools → sinatools}/install_env.py +0 -0
  129. /nlptools/nlptools.py → /sinatools/sinatools.py +0 -0
  130. {nlptools/arabiner → sinatools/utils}/__init__.py +0 -0
  131. {nlptools → sinatools}/utils/readfile.py +0 -0
  132. {nlptools → sinatools}/utils/utils.py +0 -0
@@ -1,11 +1,11 @@
1
- from nlptools.salma import settings
1
+ from sinatools.salma import settings
2
2
  import re
3
3
  import warnings
4
4
  warnings.filterwarnings("ignore")
5
5
  import torch
6
6
  import numpy as np
7
7
  import pandas as pd
8
- from nlptools.arabert.preprocess import ArabertPreprocessor
8
+ from sinatools.arabert.preprocess import ArabertPreprocessor
9
9
 
10
10
  def normalizearabert(s):
11
11
  model_name = 'aubmindlab/bert-base-arabertv02'
@@ -1,6 +1,4 @@
1
- # -*- coding: utf-8 -*-
2
- # We acknoledge that this file charsets.py is imported from Camel tools citation. url
3
- #
1
+ # We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/utils/charsets.html].
4
2
 
5
3
  import unicodedata
6
4
 
@@ -3,7 +3,7 @@
3
3
  # The matching between two words is defined as a tuple:
4
4
  # <w1, w2, implication direction, distance, conflicts, verdict, preferredWord> .
5
5
 
6
- from nlptools.utils.parser import arStrip
6
+ from sinatools.utils.parser import arStrip
7
7
  class Implication:
8
8
  """
9
9
  The implication class computes whether the two Arabic words are the same or not, regardless of how they are diacritized. The output also contains implication direction, distance, number of conflicts, and other outputs.
@@ -215,7 +215,7 @@ class Implication:
215
215
  .. highlight:: python
216
216
  .. code-block:: python
217
217
 
218
- from nlptools.utils.implication import Implication
218
+ from sinatools.utils.implication import Implication
219
219
 
220
220
  word = Implication.normalize_alef("ًى") # Returns "ىً"
221
221
  word = Implication.normalize_alef("ًا") # Returns "اً"
@@ -249,7 +249,7 @@ class Implication:
249
249
  .. highlight:: python
250
250
  .. code-block:: python
251
251
 
252
- from nlptools.utils.implication import Implication
252
+ from sinatools.utils.implication import Implication
253
253
 
254
254
  diacritics = ["َ", "ُ", "ِ", "ّ"]
255
255
  has_error = Implication.diacritics_syntax_error_in(diacritics) # Returns False
@@ -288,7 +288,7 @@ class Implication:
288
288
  .. highlight:: python
289
289
  .. code-block:: python
290
290
 
291
- from nlptools.utils.implication import Implication
291
+ from sinatools.utils.implication import Implication
292
292
 
293
293
  diacritic = 0
294
294
  is_wrong_end = Implication.wrong_end_diacritic(diacritic) # Returns False
@@ -324,7 +324,7 @@ class Implication:
324
324
  .. highlight:: python
325
325
  .. code-block:: python
326
326
 
327
- from nlptools.utils.implication import Implication
327
+ from sinatools.utils.implication import Implication
328
328
 
329
329
  implication = Implication(word1, word2)
330
330
  implication.calculate_words_implication()
@@ -377,7 +377,7 @@ class Implication:
377
377
  .. highlight:: python
378
378
  .. code-block:: python
379
379
 
380
- from nlptools.utils.implication Implication
380
+ from sinatools.utils.implication import Implication
381
381
 
382
382
  implication = Implication(word1, word2)
383
383
  result = implication.equal_words()
@@ -428,7 +428,7 @@ class Implication:
428
428
  .. highlight:: python
429
429
  .. code-block:: python
430
430
 
431
- from nlptools.utils.implication import Implication
431
+ from sinatools.utils.implication import Implication
432
432
 
433
433
  implication = Implication(word1, word2)
434
434
  result = implication.calculate_letters_implication()
@@ -508,7 +508,7 @@ class Implication:
508
508
  .. highlight:: python
509
509
  .. code-block:: python
510
510
 
511
- from nlptools.utils.implication import Implication
511
+ from sinatools.utils.implication import Implication
512
512
  word = "مُرَحَّبًا"
513
513
  diacritics = Implication.calculate_direction(word)
514
514
  print(diacritics)
@@ -600,7 +600,7 @@ class Implication:
600
600
  .. highlight:: python
601
601
  .. code-block:: python
602
602
 
603
- from nlptools.utils.implication import Implication
603
+ from sinatools.utils.implication import Implication
604
604
  word = "مرحبا"
605
605
  letters = get_letters_array(word)
606
606
  print(letters)
@@ -644,7 +644,7 @@ class Implication:
644
644
  .. highlight:: python
645
645
  .. code-block:: python
646
646
 
647
- from nlptools.utils.implication import Implication
647
+ from sinatools.utils.implication import Implication
648
648
  w1 = "hello"
649
649
  w2 = "hell"
650
650
  implication = Implication(w1, w2)
@@ -1,7 +1,7 @@
1
1
  # -*- coding: utf-8 -*-
2
2
 
3
- from nlptools.utils.parser import arStrip
4
- from nlptools.utils.implication import Implication
3
+ from sinatools.utils.parser import arStrip
4
+ from sinatools.utils.implication import Implication
5
5
  import argparse
6
6
 
7
7
  def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
@@ -1,20 +1,19 @@
1
1
  import re
2
2
  import argparse
3
3
 
4
- def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, alif=True , specialChars=True ):
4
+ def arStrip(text , diacs=True , small_diacs=True , shaddah=True , digit=True, alif=True , special_chars=True ):
5
5
 
6
6
  """
7
- This method removes Arabic diacritics, small diacritcs, shaddah, Latin and Arabic digits, and unify alif.
8
- And remove special characters, spaces, underscore and Arabic tatwelah from the input text.
7
+ This method removes Arabic diacritics, small diacritics, shaddah, and Latin and Arabic digits; unifies alif; and removes special characters, extra spaces, underscores, and Arabic tatwelah from the input text.
9
8
 
10
9
  Args:
11
10
  text (:obj:`str`): Arabic text to be processed.
12
11
  diacs (:obj:`bool`): flag to remove Arabic diacretics [ ًٌٍَُِْ] (default is True).
13
- smallDiacs (:obj:`bool`): flag to remove small diacretics (default is True).
12
+ small_diacs (:obj:`bool`): flag to remove small diacritics (default is True).
14
13
  shaddah (:obj:`bool`): flag to remove shaddah (default is True).
15
14
  digit (:obj:`bool`): flag to remove Latin and Arabic digits (default is True).
16
15
  alif (:obj:`bool`): flag to unify alif (default is True).
17
- specialChars (:obj:`bool`): flag to remove special characters (default is True).
16
+ special_chars (:obj:`bool`): flag to remove special characters (default is True).
18
17
 
19
18
  Returns:
20
19
  :obj:`str`: stripped text.
@@ -24,19 +23,17 @@ def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, ali
24
23
  .. highlight:: python
25
24
  .. code-block:: python
26
25
 
27
- from nlptools.utils import parser
28
- processed_text =parser.arStrip('2023الجو جميلُ')
29
- print(processed_text)
26
+ from sinatools.utils import parser
27
+ output = parser.arStrip('2023الجو جميلُ')
28
+ print(output)
30
29
 
31
- #putput
30
+ # output
32
31
  الجو جميل
33
32
 
34
- name =parser.arStrip('أَلَمۡ یَأۡنِ لِلَّذِینَ ءَامَنُوۤا۟ أَن تَخۡشَعَ قُلُوبُهُمۡ لِذِكۡرِ ٱللَّهِ وَمَا نَزَلَ مِنَ ٱلۡحَقِّ وَلَا یَكُونُوا۟ كَٱلَّذِینَ أُوتُوا۟ ٱلۡكِتَـٰبَ مِن قَبۡلُ فَطَالَ عَلَیۡهِمُ ٱلۡأَمَدُ فَقَسَتۡ قُلُوبُهُمۡۖ وَكَثِیر مِّنۡهُمۡ فَـسِقُونَ' , True , True , True , True , True , True )
35
- print(name)
36
- #putput
33
+ output = parser.arStrip('أَلَمۡ یَأۡنِ لِلَّذِینَ ءَامَنُوۤا۟ أَن تَخۡشَعَ قُلُوبُهُمۡ لِذِكۡرِ ٱللَّهِ وَمَا نَزَلَ مِنَ ٱلۡحَقِّ وَلَا یَكُونُوا۟ كَٱلَّذِینَ أُوتُوا۟ ٱلۡكِتَـٰبَ مِن قَبۡلُ فَطَالَ عَلَیۡهِمُ ٱلۡأَمَدُ فَقَسَتۡ قُلُوبُهُمۡۖ وَكَثِیر مِّنۡهُمۡ فَـسِقُونَ' , True , True , True , True , True , True )
34
+ print(output)
35
+ #output
37
36
  الم یان للذین ءامنوا ان تخشع قلوبهم لذكر الله وما نزل من الحق ولا یكونوا كالذین اوتوا الكتٰب من قبل فطال علیهم الامد فقست قلوبهم وكثیر منهم فسقون
38
-
39
-
40
37
  """
41
38
  try:
42
39
  if text: # if the input string is not empty do the following
@@ -46,7 +43,7 @@ def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, ali
46
43
  text = re.sub(r'[\u0652]+', '',text) # Remove SUKUN
47
44
  if shaddah == True:
48
45
  text = re.sub(r'[\u0651]+', '',text) # Remove shddah
49
- if smallDiacs == True:
46
+ if small_diacs == True:
50
47
  text = re.sub(r'[\u06D6-\u06ED]+', '',text) # Remove all small Quranic annotation signs
51
48
  if digit == True:
52
49
  text = re.sub('[0-9]+', ' ',text) # Remove English digits
@@ -57,7 +54,7 @@ def arStrip(text , diacs=True , smallDiacs=True , shaddah=True , digit=True, ali
57
54
  text = re.sub('أ', 'ا',text);
58
55
  text = re.sub('إ', 'ا',text);
59
56
  text = re.sub('آ', 'ا',text);
60
- if specialChars == True:
57
+ if special_chars == True:
61
58
  text = re.sub('[?؟!@#$%-]+' , '' , text) # Remove some of special chars
62
59
 
63
60
  text = re.sub('[\\s]+'," ",text) # Remove all spaces
@@ -83,7 +80,7 @@ def remove_punctuation(text):
83
80
  .. highlight:: python
84
81
  .. code-block:: python
85
82
 
86
- from nlptools.utils import parser
83
+ from sinatools.utils import parser
87
84
  return parser.remove_punctuation("te!@#،$%%؟st")
88
85
 
89
86
  #output
@@ -103,12 +100,12 @@ def remove_punctuation(text):
103
100
  r'[\u061B]+', r'[\u061E]+', r'[\u061F]+', r'[\u0640]+',
104
101
  r'[\u0653]+', r'[\u065C]+', r'[\u066C]+', r'[\u066A]+',
105
102
  r'["}"]+', r'["{"]+']
106
- outputString = text
103
+ output_string = text
107
104
  for punctuation in punctuation_marks:
108
- outputString = re.sub(punctuation, '', outputString)
105
+ output_string = re.sub(punctuation, '', output_string)
109
106
  except:
110
107
  return text
111
- return outputString
108
+ return output_string
112
109
 
113
110
  def remove_latin(text):
114
111
  """
@@ -126,7 +123,7 @@ def remove_latin(text):
126
123
  .. highlight:: python
127
124
  .. code-block:: python
128
125
 
129
- from nlptools.utils import parser
126
+ from sinatools.utils import parser
130
127
  return parser.remove_latin("miojkdujhvaj1546545spkdpoqfoiehwv nWEQFGWERHERTJETAWIKUYFC")
131
128
 
132
129
  #output
@@ -181,7 +181,7 @@ def perform_transliteration(text , schema ):
181
181
  .. highlight:: python
182
182
  .. code-block:: python
183
183
 
184
- from nlptools.utils import text_transliteration
184
+ from sinatools.utils import text_transliteration
185
185
 
186
186
  print(text_transliteration.perform_transliteration("مُحَمَدٌ نَـشِيْطٌـ1" , "ar2bw"))
187
187
  print(text_transliteration.perform_transliteration("muHamadN" , "bw2ar"))
@@ -1,7 +1,60 @@
1
1
  import os
2
2
  import csv
3
- from nlptools.utils.sentence_tokenizer import sent_tokenize
4
- from nlptools.morphology.tokenizers_words import simple_word_tokenize
3
+ from sinatools.utils.tokenizers_words import simple_word_tokenize
4
+
5
+ def remove_empty_values(sentences):
6
+ return [value for value in sentences if value != '']
7
+
8
+
9
+ def sentence_tokenizer(text, dot=True, new_line=True, question_mark=True, exclamation_mark=True):
10
+ """
11
+ This method tokenizes a text into a set of sentences based on the selected separators, including the dot, new line, question mark, and exclamation mark.
12
+
13
+ Args:
14
+ text (:obj:`str`): Arabic text to be tokenized.
15
+ dot (:obj:`bool`): flag to split text based on Dot (default is True).
16
+ new_line (:obj:`bool`): flag to split text based on new_line (default is True).
17
+ question_mark (:obj:`bool`): flag to split text based on question_mark (default is True).
18
+ exclamation_mark (:obj:`bool`): flag to split text based on exclamation_mark (default is True).
19
+
20
+ Returns:
21
+ :obj:`list`: list of sentences.
22
+
23
+ **Example:**
24
+
25
+ .. highlight:: python
26
+ .. code-block:: python
27
+
28
+ from sinatools.utils import tokenizer
29
+ sentences = tokenizer.sentence_tokenizer("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.", dot=True, new_line=True, question_mark=True, exclamation_mark=True)
30
+ print(sentences)
31
+
32
+ #output
33
+ ['مختبر سينا لحوسبة اللغة والذكاء الإصطناعي.', 'في جامعة بيرزيت.']
34
+ """
35
+ separators = []
36
+ split_text = [text]
37
+ if new_line==True:
38
+ separators.append('\n')
39
+ if dot==True:
40
+ separators.append('.')
41
+ if question_mark==True:
42
+ separators.append('?')
43
+ separators.append('؟')
44
+ if exclamation_mark==True:
45
+ separators.append('!')
46
+
47
+ for sep in separators:
48
+ new_split_text = []
49
+ for part in split_text:
50
+ tokens = part.split(sep)
51
+ tokens_with_separator = [token + sep for token in tokens[:-1]]
52
+ tokens_with_separator.append(tokens[-1].strip())
53
+ new_split_text.extend(tokens_with_separator)
54
+ split_text = new_split_text
55
+
56
+ split_text = remove_empty_values(split_text)
57
+ return split_text
5
58
 
6
59
  def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
7
60
  """
@@ -28,8 +81,8 @@ def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
28
81
  .. highlight:: python
29
82
  .. code-block:: python
30
83
 
31
- from nlptools.utils.corpus_tokenizer import corpus_tokenizer
32
- corpus_tokenizer(dir_path="History", output_csv="ouputFile.csv", row_id = 1, global_sentence_id = 1)
84
+ from sinatools.utils import tokenizer
85
+ output = tokenizer.corpus_tokenizer(dir_path="History", output_csv="ouputFile.csv", row_id = 1, global_sentence_id = 1)
33
86
 
34
87
  #output
35
88
  # csv file called: ouputFile.csv
@@ -55,7 +108,7 @@ def corpus_tokenizer(dir_path, output_csv, row_id = 1, global_sentence_id = 1):
55
108
  file_path = os.path.join(root, file)
56
109
  with open(file_path, 'r', encoding="utf-8") as f:
57
110
  content = f.read()
58
- sentences = sent_tokenize(content, dot=True, new_line=True, question_mark=False, exclamation_mark=False)
111
+ sentences = sentence_tokenizer(content, dot=True, new_line=True, question_mark=False, exclamation_mark=False)
59
112
  for sentence_id, sentence in enumerate(sentences, start=1):
60
113
  words = simple_word_tokenize(sentence)
61
114
  global_sentence_id += 1
@@ -1,11 +1,8 @@
1
- # This code was taken from Camel tools without any change
2
-
3
- # -*- coding: utf-8 -*-
4
-
1
+ # We acknowledge that this file, tokenizers_words.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/tokenizers/word.html].
5
2
 
6
3
  import re
7
- from nlptools.morphology.charsets import UNICODE_PUNCT_SYMBOL_CHARSET
8
- from nlptools.morphology.charsets import UNICODE_LETTER_MARK_NUMBER_CHARSET
4
+ from sinatools.utils.charsets import UNICODE_PUNCT_SYMBOL_CHARSET
5
+ from sinatools.utils.charsets import UNICODE_LETTER_MARK_NUMBER_CHARSET
9
6
 
10
7
 
11
8
  _ALL_PUNCT = u''.join(UNICODE_PUNCT_SYMBOL_CHARSET)
@@ -1,122 +0,0 @@
1
- SinaTools-0.1.3.data/data/nlptools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
2
- nlptools/VERSION,sha256=2_CXjsK1h6XWGH_cxBzOn_LA647vrboOtR84QKtu60Y,5
3
- nlptools/__init__.py,sha256=OoA_p_y2jPjMytcUrG1ED5uJlJemVhSRr9L9Wsym-rQ,134
4
- nlptools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
5
- nlptools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
6
- nlptools/nlptools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
7
- nlptools/CLI/DataDownload/download_files.py,sha256=PMDEPXxZQbrFo-7iyhvrCpzx2RG5T5kPk6NJAwh8RSI,2322
8
- nlptools/CLI/arabiner/bin/infer.py,sha256=YrNCVro8B3UxpsHjIo_01qiBQURpDNTK7pKTkw1L21Y,4921
9
- nlptools/CLI/arabiner/bin/infer2.py,sha256=CtR9rwe20ks_qq-l_fQU-ThLqft_1o3Ztmd1my1kHMg,3905
10
- nlptools/CLI/morphology/ALMA_multi_word.py,sha256=NINts8BtT8BGQPBvs4BJ_y2PsR7czsGPOVAwngaT85A,2644
11
- nlptools/CLI/morphology/morph_analyzer.py,sha256=39vrFx6ppu7yEITcz8lAJhk3xHweaPWEqL-CcqBM37Q,3565
12
- nlptools/CLI/salma/salma_tools.py,sha256=7awpCb68QUc3kx-EuwRHxDmItZlX2aSdpukwKF1G3Fo,1999
13
- nlptools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- nlptools/CLI/utils/arStrip.py,sha256=dzy16wZfSznkvGHHBn5P21EvyusKB55dqrZ4zbaa41w,3621
15
- nlptools/CLI/utils/corpus_tokenizer.py,sha256=S0YG8FRS29K1C8eJVEYuWSV1ABS7PKymlNS7KxvYqxI,2817
16
- nlptools/CLI/utils/implication.py,sha256=hjYTN0oiLf0bz0bRO_GD4rphZkaB3cH770clFFhuevE,3172
17
- nlptools/CLI/utils/jaccard.py,sha256=a6oc28yMgm7UewO6Lz25A4Yv8QEzVa85XF-QV9uhMwI,4639
18
- nlptools/CLI/utils/latin_remove.py,sha256=Xw6PB4GtMLLiYK3zTEwdLhBbivMyy1msD5Ab_QdJoQA,1303
19
- nlptools/CLI/utils/remove_Punc.py,sha256=dvSiSs9UulhGCogBgtpD8fU860BFuMBTnwa8Ek9aPKQ,1393
20
- nlptools/CLI/utils/sentence_tokenizer.py,sha256=AcJa_yRdlQqKMwVWWKSv1vRO1Yk-NK75-NpalkHqewc,3469
21
- nlptools/CLI/utils/text_transliteration.py,sha256=blIGB8FeF10iFeXADM-z01XJ4qeB1qgj6S2Xnk9w5fI,2266
22
- nlptools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- nlptools/DataDownload/downloader.py,sha256=yONVa99OtPXD5Lewy4Fm3eUiJMpBt492G1JOPh5sXAU,6523
24
- nlptools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
25
- nlptools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
26
- nlptools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
27
- nlptools/arabert/arabert/create_classification_data.py,sha256=BhemGNRbYz_Pun0Q5WerN2-9n-ILmU3tm4J-OlHw5-A,7678
28
- nlptools/arabert/arabert/create_pretraining_data.py,sha256=2M-cF3CLHbQ0cdWrzFT6Frg1vVP4Y-CFoq8iEPyxgsE,18924
29
- nlptools/arabert/arabert/extract_features.py,sha256=C1IzASrlX7u4_M2xdr_PjzWfTRZgklhUXA2WHKgQt-I,15585
30
- nlptools/arabert/arabert/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
31
- nlptools/arabert/arabert/modeling.py,sha256=KliecCmA1pP3owg0mYge6On3IRHunMF5kMLuEwc0VLw,40896
32
- nlptools/arabert/arabert/optimization.py,sha256=Wx0Js6Zsfc3iVw-_7Q1SCnxfP_qqbdTAyFD-vZSpOyk,8153
33
- nlptools/arabert/arabert/run_classifier.py,sha256=AdVGyvidlmbEp12b-PauiBo6EmFLEO7tqeJKuLhK2DA,38777
34
- nlptools/arabert/arabert/run_pretraining.py,sha256=yO16nKkHDfcYA2Zx7vv8KN4te6_1qFOzyVeDzFT-DQw,21894
35
- nlptools/arabert/arabert/run_squad.py,sha256=PORxgiByP8L6vZqAFkqgHPJ_ZjAlqlg64gtkdLmDNns,53456
36
- nlptools/arabert/arabert/tokenization.py,sha256=R6xkyCb8_vgeksXiLeqDvV5vOnLb1cPNsvfDij6YVFk,14132
37
- nlptools/arabert/araelectra/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
38
- nlptools/arabert/araelectra/build_openwebtext_pretraining_dataset.py,sha256=pIo6VFT3XXOYroZaab3msZAP6XjCKu0KcrIZQA0Pj8U,3881
39
- nlptools/arabert/araelectra/build_pretraining_dataset.py,sha256=Z8ZmKznaE_2SPDRoPYR1SDhjTN_NTpNCFFuhUkykwl8,9041
40
- nlptools/arabert/araelectra/build_pretraining_dataset_single_file.py,sha256=W7HFr1XoO6bCDR7X7w-bOuwULFtTSjeKbJ2LHzzHf9k,3224
41
- nlptools/arabert/araelectra/configure_finetuning.py,sha256=YfGLMdgN6Qqm357Mzy5UMjkuLPPWtBs7f4dA-DKE6JM,7768
42
- nlptools/arabert/araelectra/configure_pretraining.py,sha256=oafQgu4WmVdxBcU5mSfXhPlvCk43CJwAWXC10Q58BlI,5801
43
- nlptools/arabert/araelectra/flops_computation.py,sha256=krHTeuPH9xQu5ldprBOPJNlJRvC7fmmvXXqUjfWrzPE,9499
44
- nlptools/arabert/araelectra/run_finetuning.py,sha256=JecbrSmGikBNyid4JKRZ49Rm5xFpt02WfgIIcs3TpcU,12976
45
- nlptools/arabert/araelectra/run_pretraining.py,sha256=1K2aAFTY0p3iaLY0xkhTlm6v0B-Zun8SwEzz-K6RXM4,20665
46
- nlptools/arabert/araelectra/finetune/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
47
- nlptools/arabert/araelectra/finetune/feature_spec.py,sha256=cqNlBa2KK_G1-vkKm1EJUv6BoS3gesCUAHwVagZB6wM,1888
48
- nlptools/arabert/araelectra/finetune/preprocessing.py,sha256=1mf7-IxknCRsobQZ-VV1zs4Cwt-mfOtoVxysDJa9LZ0,6657
49
- nlptools/arabert/araelectra/finetune/scorer.py,sha256=PjRg0P5ANCtul2ute7ccq3mRCCoIAoCb-lVLlwd4rVY,1571
50
- nlptools/arabert/araelectra/finetune/task.py,sha256=zM8M4PGSIrY2u6ytpmkQEXxG-jjoeN9wouEyVR23qeQ,1991
51
- nlptools/arabert/araelectra/finetune/task_builder.py,sha256=Zsoiuw5M3Ca8QhaZVLVLZyWw09K5R75UeMuPmazMlHI,2768
52
- nlptools/arabert/araelectra/model/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
53
- nlptools/arabert/araelectra/model/modeling.py,sha256=5XLIutnmr-SFQOV_XntJ-U5evSCY-J2e9NjvlwVXKkk,40877
54
- nlptools/arabert/araelectra/model/optimization.py,sha256=BCMb_C5hgBw7wC9ZR8AQ4lwoPopqLIcSiqcCrIjx9XU,7254
55
- nlptools/arabert/araelectra/model/tokenization.py,sha256=9CkyPzs3L6OEPzN-7EWQDNQmW2mIJoZD4o1rn6xLdL4,11082
56
- nlptools/arabert/araelectra/pretrain/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
57
- nlptools/arabert/araelectra/pretrain/pretrain_data.py,sha256=NLgIcLAq1-MgtBNXYu_isDxnOY5k67SyADYy-8nzBok,5413
58
- nlptools/arabert/araelectra/pretrain/pretrain_helpers.py,sha256=nFl7LEdxAU5kKwiodqJHzi-ty9jMFsCCNYOF__A69j8,9255
59
- nlptools/arabert/araelectra/util/__init__.py,sha256=d55FZ9ZE-_t_WWMnIiRGozkTw50vBZ-s9BMy7l_I-ao,619
60
- nlptools/arabert/araelectra/util/training_utils.py,sha256=7h_J1ljUWM0ynBcofEtjZWL_oAfZtTxEemQLkixgn-0,4142
61
- nlptools/arabert/araelectra/util/utils.py,sha256=G0UAETUCZMlU9R9ASD9AXrWZeodWI1aZJEE9F-goaH4,2591
62
- nlptools/arabert/aragpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
63
- nlptools/arabert/aragpt2/create_pretraining_data.py,sha256=fFa2_DAyTwc8L2IqQbshsh_Ia26nj1qtVLzC6DooSac,3105
64
- nlptools/arabert/aragpt2/train_bpe_tokenizer.py,sha256=b-8zHQ02fLmZV4GfjnrPptwjpX259F41SlnWzBrflMA,1888
65
- nlptools/arabert/aragpt2/gpt2/__init__.py,sha256=aQkKhQwWaS61wYEeOdx682upeMWFPUjLxXSs7JM1sOA,18
66
- nlptools/arabert/aragpt2/gpt2/lamb_optimizer.py,sha256=uN3Dcx-6n2_OwepyymRrGrB4EcSkR8b2ZczZrOr7bpY,6263
67
- nlptools/arabert/aragpt2/gpt2/optimization.py,sha256=iqh23cypRSRUt53wt2G5SbNNpJMwERM7gZAOKVh5l4U,8411
68
- nlptools/arabert/aragpt2/gpt2/run_pretraining.py,sha256=4jjkUbvTO1DHoKJ89yKtlkkofcND_fyAunQ-mlnJhTM,13298
69
- nlptools/arabert/aragpt2/grover/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
70
- nlptools/arabert/aragpt2/grover/dataloader.py,sha256=-FWPTjtsvweEE1WaWRHBXfOSbsGiUmnXT3qK7KJP8cM,6853
71
- nlptools/arabert/aragpt2/grover/modeling.py,sha256=XcUvFwqRaxAwWiJstrH2FPBvDJe03pTWIyipdMfWj9g,38280
72
- nlptools/arabert/aragpt2/grover/modeling_gpt2.py,sha256=WFpCWn1792yATFzt8rZ0rpWvExfbLzV2BqiEs7llFUw,51602
73
- nlptools/arabert/aragpt2/grover/optimization_adafactor.py,sha256=1geOsCWuv5xxtSnKDz9a8aY5SVwZ1MGq-xVQDBg4Gpg,9765
74
- nlptools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKweY1aPz1XPDw6odld0,7102
75
- nlptools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
76
- nlptools/arabiner/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
77
- nlptools/arabiner/bin/__init__.py,sha256=d1ToN2uheCCVby3TjiSuD1dqo_pvNIuTgz4COFr2Khs,438
78
- nlptools/arabiner/bin/eval.py,sha256=ihtjJinY1jXpZXW5bQJzTC5MF6_V3GQ5zHzsc691_HQ,2591
79
- nlptools/arabiner/bin/infer.py,sha256=EZKeq4zucIE-ooHYnegODNxsRiIY_gY5GvDPChH5WRQ,3237
80
- nlptools/arabiner/bin/process.py,sha256=4QCZjsmYV5lep6waQE37fs7Fe59_1G5seIJLDkArg4s,4698
81
- nlptools/arabiner/bin/train.py,sha256=hf6ZRhqMZ7bFealMSusBjtWrbzHGHc5HB2Lh4rp2uQA,6390
82
- nlptools/arabiner/data/__init__.py,sha256=XPic1bPijmZda_LPFL5J6TOps_IHUTiBDJvMx-iJqKo,61
83
- nlptools/arabiner/data/datasets.py,sha256=p52Uc8Q2D3EzN1OmoHQcWVsJ2oB3TqgTzAcy1B9fJ68,5068
84
- nlptools/arabiner/data/transforms.py,sha256=KPCDdjZOEvhMC38eiFwJuiQC84cfDrvC0XM4Ye0o3do,4878
85
- nlptools/arabiner/nn/BaseModel.py,sha256=3GmujQasTZZunOBuFXpY2p1W8W256iI_Uu4hxhOY2Z0,608
86
- nlptools/arabiner/nn/BertNestedTagger.py,sha256=7vU2tmDSoqSHn6GvMJmyN0hEMLvCkbr_r-AaiAaYdw8,1223
87
- nlptools/arabiner/nn/BertSeqTagger.py,sha256=dFcBBiMw2QCWsyy7aQDe_PS3aRuNn4DOxKIHgTblFvc,504
88
- nlptools/arabiner/nn/__init__.py,sha256=ZN7Psm83pysUhGI3ZSaJra2aCYBZb9DZ0UX4CiKGc0A,182
89
- nlptools/arabiner/trainers/BaseTrainer.py,sha256=oZgFJW-CawfCKT5gtaBHA7Q7XjNfiyqM62KnFsgVzPU,3919
90
- nlptools/arabiner/trainers/BertNestedTrainer.py,sha256=hVqPRdmaHf2iwftseNpgsAfwGkl6eMHJx1rKunQS_vM,8443
91
- nlptools/arabiner/trainers/BertTrainer.py,sha256=KkcgZXu6kqsrrnfFtiAQ8ucLsrQtDxLRqdbTiTnRWqI,6447
92
- nlptools/arabiner/trainers/__init__.py,sha256=kt8WqsaOjX0h1JMa-v7Y9ywT5mfwQIsZTyVWnIAWsEQ,200
93
- nlptools/arabiner/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
94
- nlptools/arabiner/utils/data.py,sha256=uuPiu-7v0gccNygZjdTKomJGE7X0H9FC24Y9nHZpf4c,4376
95
- nlptools/arabiner/utils/helpers.py,sha256=PyOOlx5uabvZVmU3SZtZ3ZLA3pliinJ3JXsvos9SUWU,5032
96
- nlptools/arabiner/utils/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
97
- nlptools/morphology/ALMA_multi_word.py,sha256=hlzZCk-uUdZ-GbiPsFxDTvoWoIuVof2Sm7NdaxaFipM,1313
98
- nlptools/morphology/__init__.py,sha256=z6_RGhiyfNHXNKMmhNSI6ObTLmdjQyP58vsFottI8GA,1706
99
- nlptools/morphology/charsets.py,sha256=7w9OrbnZTnLU3A9q-SUi9GhUN97qNtbYR5T0Pm72cF8,2784
100
- nlptools/morphology/morph_analyzer.py,sha256=OmCxm4-fM2qfYzKk8yOd6D_T3RsfzZCcd7Oz2V4Advg,6507
101
- nlptools/morphology/settings.py,sha256=sEZdnA7MiYXHdxrfHWXop1RcKClVzpOYzZwzHC1PxJ8,144
102
- nlptools/morphology/tokenizers_words.py,sha256=Smtt_KXifl2wRI464Qn07PtUvOsyGBJjZ7E20gd8zVM,602
103
- nlptools/salma/__init__.py,sha256=pOauGjD-xrGHw05sNx3EiSFc_wpM3bD1vJxQHoDDXOA,376
104
- nlptools/salma/settings.py,sha256=fqAQg2b22gorzT9Pf_AEJD9p8AlVUaVyKD3FH8g2yUs,1110
105
- nlptools/salma/views.py,sha256=EH1vc6P88CeAIzQKt7EU_HTI0uJipv4JdXiAX5NjrJY,18416
106
- nlptools/salma/wsd.py,sha256=kmP5ZvvVMkxApgk91TAGSBkMJZbPPbS0qoNk8OE37og,4434
107
- nlptools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
108
- nlptools/utils/corpus_tokenizer.py,sha256=IDWh87XJaFa7V2P4kIxY4QVywPKhz0fIErc_c0gJGUU,4581
109
- nlptools/utils/implication.py,sha256=Ro1Vw62oOBzELkX-zpHyieq4v2OsoyFrFeTU7BiK7qc,27794
110
- nlptools/utils/jaccard.py,sha256=TTC5KTVv6kONw5vZtzxEQvv7QM79BCsD0xcJAY0T5tU,10111
111
- nlptools/utils/parser.py,sha256=0Yd40CZf4wXso2q-d9LULUNAVUAMdiYMImfcVb6i9qQ,6175
112
- nlptools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
113
- nlptools/utils/sentence_tokenizer.py,sha256=3C0Wx1ns8ZHiGwKlUkcti-8zA3fB4ju0fIEtGACM7oU,2162
114
- nlptools/utils/text_transliteration.py,sha256=zhB3sFXSMtkkdqImRMVg415AAB80DOm9lMFKb2IBynw,8765
115
- nlptools/utils/utils.py,sha256=vKkFOkYclMu8nXS_VZb6Kobx8QGKW9onXkkLCeiRb6g,32
116
- SinaTools-0.1.3.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
117
- SinaTools-0.1.3.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
118
- SinaTools-0.1.3.dist-info/METADATA,sha256=zxuxnKe_i5AAHNC_uPGxpmAzB2T2y01iL-kHIRV5H-o,1527
119
- SinaTools-0.1.3.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
120
- SinaTools-0.1.3.dist-info/entry_points.txt,sha256=9-PNkvWGCid8SVN03S2NkJFuxAzvcB22tGpHe-et2q8,951
121
- SinaTools-0.1.3.dist-info/top_level.txt,sha256=sREDI6iHe4D0BZQmZbZ-LxYIn2cBWUayk9CZwAR9jaE,9
122
- SinaTools-0.1.3.dist-info/RECORD,,
@@ -1,18 +0,0 @@
1
- [console_scripts]
2
- arabi_ner = nlptools.CLI.arabiner.bin.infer:main
3
- arabi_ner2 = nlptools.CLI.arabiner.bin.infer2:main
4
- install_env = nlptools.install_env:main
5
- sina_alma_multi_word = nlptools.CLI.morphology.ALMA_multi_word:main
6
- sina_appdatadir = nlptools.CLI.DataDownload.get_appdatadir:main
7
- sina_arStrip = nlptools.CLI.utils.arStrip:main
8
- sina_corpus_tokenizer = nlptools.CLI.utils.corpus_tokenizer:main
9
- sina_download_files = nlptools.CLI.DataDownload.download_files:main
10
- sina_implication = nlptools.CLI.utils.implication:main
11
- sina_jaccard_similarity = nlptools.CLI.utils.jaccard:main
12
- sina_morph_analyze = nlptools.CLI.morphology.morph_analyzer:main
13
- sina_remove_latin = nlptools.CLI.utils.latin_remove:main
14
- sina_remove_punctuation = nlptools.CLI.utils.remove_Punc:main
15
- sina_salma = nlptools.CLI.salma.salma_tools:main
16
- sina_sentence_tokenize = nlptools.CLI.utils.sentence_tokenizer:main
17
- sina_transliterate = nlptools.CLI.utils.text_transliteration:main
18
-
@@ -1 +0,0 @@
1
- nlptools
@@ -1,91 +0,0 @@
1
- """
2
- About:
3
- ------
4
- The sina_morph_analyze tool is designed to provide morphological analysis for Arabic text using the SinaTools' `analyze` utility. Users can specify the language and desired analysis task (e.g., lemmatization, part-of-speech tagging, or a full morphological analysis).
5
-
6
- Usage:
7
- ------
8
- Below is the usage information that can be generated by running sina_morph_analyze --help.
9
-
10
- .. code-block:: none
11
-
12
- sina_morph_analyze --text=TEXT [OPTIONS]
13
- sina_morph_analyze --file=FILE [OPTIONS]
14
-
15
- Options:
16
- --------
17
-
18
- .. code-block:: none
19
-
20
- --text TEXT
21
- The text that needs to be morphologically analyzed.
22
-
23
- --file FILE
24
- File containing the text to be morphologically analyzed. Used only when --text is not given (or is empty).
25
-
26
- --language LANGUAGE [default=MSA]
27
- Specifies the language for the analysis. The default is MSA (Modern Standard Arabic).
28
- Use other codes as appropriate for your requirements.
29
-
30
- --task TASK [default=full]
31
- Determines the specific type of morphological analysis to be performed. Available options are:
32
- - lemmatizer: Provides lemmatization results.
33
- - pos: Provides part-of-speech tagging.
34
- - full: Provides a comprehensive morphological analysis.
35
- The default is a full morphological analysis.
36
-
37
- Examples:
38
- ---------
39
-
40
- .. code-block:: none
41
-
42
- sina_morph_analyze --text "Your Arabic text here" --language MSA --task full
43
- sina_morph_analyze --text "Your Arabic text here" --task lemmatizer
44
- sina_morph_analyze --file "path/to/your/file.txt" --language MSA --task full
45
- sina_morph_analyze --file "path/to/your/file.txt" --task lemmatizer
46
-
47
- Note:
48
- -----
49
-
50
- .. code-block:: none
51
-
52
- - Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
53
- - The quality and accuracy of the analysis depend on the underlying capabilities of the SinaTools' `analyze` utility.
54
- - The analysis can be influenced by the choice of language. Ensure you are using the correct language setting.
55
-
56
- """
57
-
58
- import argparse
59
- from nlptools.morphology.morph_analyzer import analyze
60
- from nlptools.utils.readfile import read_file
61
-
62
def main():
    """Run the ``sina_morph_analyze`` command-line tool.

    Reads ``--text``, ``--file``, ``--language`` and ``--task`` from the
    command line, passes the input text to ``analyze`` together with the
    chosen language and task, and prints every returned result on its own
    line.

    ``--text`` takes precedence over ``--file``; the file is only read
    when no (non-empty) text was supplied.

    Raises:
        SystemExit: with status 2 (through ``parser.error``) when neither
            a non-empty ``--text`` nor a ``--file`` is supplied, so shell
            callers can detect the failure instead of seeing exit code 0.
    """
    parser = argparse.ArgumentParser(description='Morphological Analysis using SinaTools')

    # Input source (one of the two is required) plus analysis options.
    parser.add_argument('--text', type=str, help='Text to be morphologically analyzed')
    parser.add_argument('--file', type=str, help='File containing the text to be morphologically analyzed')
    parser.add_argument('--language', type=str, default='MSA', help='Language for analysis (default: MSA)')
    parser.add_argument('--task', type=str, default='full', choices=['lemmatizer', 'pos', 'full'],
                        help='Task for the result filter [lemmatizer, pos, full] (default: full)')

    args = parser.parse_args()

    # Truthiness (not ``is None``) so that an empty --text without --file is
    # rejected here rather than falling through to read_file(None).
    if not args.text and not args.file:
        parser.error("Either --text or --file argument must be provided.")

    # Prefer the inline text; otherwise join the pieces returned by read_file.
    if args.text:
        input_text = args.text
    else:
        input_text = " ".join(read_file(args.file))

    # Perform the analysis and print one result per line.
    for result in analyze(input_text, args.language, args.task):
        print(result)


if __name__ == '__main__':
    main()
90
- #sina_morph_analyze --text "Your Arabic text here" --language MSA --task full
91
- #sina_morph_analyze --file "path/to/your/file.txt" --language MSA --task full
@@ -1,74 +0,0 @@
1
- """
2
-
3
- About:
4
- ------
5
-
6
- The sina_corpus_tokenizer tool offers functionality to tokenize a corpus and write the results to a CSV file. It recursively searches through a specified directory for text files, tokenizes the content, and outputs the results, including various metadata, to a specified CSV file.
7
-
8
- Usage:
9
- -------
10
-
11
- Below is the usage information that can be generated by running sina_corpus_tokenizer --help.
12
-
13
- .. code-block:: none
14
-
15
- Usage:
16
- sina_corpus_tokenizer --dir_path DIR_PATH --output_csv OUTPUT_CSV
17
-
18
- .. code-block:: none
19
-
20
- Arguments:
21
- --dir_path DIR_PATH
22
- The path to the directory containing the text files.
23
-
24
- --output_csv OUTPUT_CSV
25
- The path to the output CSV file.
26
-
27
- Examples:
28
- ---------
29
-
30
- .. code-block:: none
31
-
32
- sina_corpus_tokenizer --dir_path "/path/to/text/directory/of/files" --output_csv "outputFile.csv"
33
-
34
- Note:
35
- -----
36
-
37
- .. code-block:: none
38
-
39
- - The tool only processes text files (with a .txt extension).
40
- - The output CSV will contain the following columns:
41
- - 'Row_ID' (a unique identifier for each record in the output file)
42
- - 'Docs_Sentence_Word_ID' (a concatenated identifier comprising directory name, file name, global sentence id, sentence id, and word position).
43
- - 'GlobalSentenceID' (Integer, a unique identifier for each sentence in the entire file)
44
- - 'SentenceID' (Integer, a unique identifier for each file within the CSV file)
45
- - 'Sentence' (Generated text that forms a sentence)
46
- - 'Word Position' (Integer, the position of each word within the sentence)
47
- - 'Word' (Each row contains a word from the generated sentence).
48
- - Ensure that the text files are appropriately encoded in UTF-8 or compatible formats.
49
- - The tool uses the `nltk` library for sentence and word tokenization. Make sure to have the library installed in your environment.
50
- """
51
-
52
- import argparse
53
- from nlptools.utils.corpus_tokenizer import corpus_tokenizer
54
-
55
- # Define the main function that will parse the arguments
56
def main():
    """Run the ``sina_corpus_tokenizer`` command-line tool.

    Parses the ``--dir_path`` and ``--output_csv`` options and forwards
    them, in that order, to ``corpus_tokenizer``.

    Both options are required: omitting either makes argparse print a
    usage message and exit with status 2, instead of handing ``None`` to
    ``corpus_tokenizer``.

    Raises:
        SystemExit: with status 2 when a required option is missing.
    """
    parser = argparse.ArgumentParser(description='Tokenize the corpus and write the results to a CSV file.')

    # Both paths are mandatory; there is no meaningful default for either.
    parser.add_argument('--dir_path', type=str, required=True,
                        help='The path to the directory containing the text files.')
    parser.add_argument('--output_csv', type=str, required=True,
                        help='The path to the output CSV file.')

    args = parser.parse_args()

    # Delegate the actual work to the library function.
    corpus_tokenizer(args.dir_path, args.output_csv)


if __name__ == '__main__':
    main()
73
-
74
- #sina_corpus_tokenizer --dir_path /path/to/text/files --output_csv output.csv