SinaTools 0.1.4__py2.py3-none-any.whl → 0.1.8__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132)
  1. {SinaTools-0.1.4.dist-info → SinaTools-0.1.8.dist-info}/METADATA +10 -10
  2. SinaTools-0.1.8.dist-info/RECORD +101 -0
  3. SinaTools-0.1.8.dist-info/entry_points.txt +18 -0
  4. SinaTools-0.1.8.dist-info/top_level.txt +1 -0
  5. {nlptools → sinatools}/CLI/DataDownload/download_files.py +9 -9
  6. {nlptools → sinatools}/CLI/morphology/ALMA_multi_word.py +10 -20
  7. sinatools/CLI/morphology/morph_analyzer.py +80 -0
  8. nlptools/CLI/arabiner/bin/infer2.py → sinatools/CLI/ner/corpus_entity_extractor.py +5 -9
  9. nlptools/CLI/arabiner/bin/infer.py → sinatools/CLI/ner/entity_extractor.py +4 -8
  10. {nlptools → sinatools}/CLI/salma/salma_tools.py +8 -8
  11. {nlptools → sinatools}/CLI/utils/arStrip.py +10 -21
  12. sinatools/CLI/utils/corpus_tokenizer.py +50 -0
  13. {nlptools → sinatools}/CLI/utils/implication.py +9 -9
  14. {nlptools → sinatools}/CLI/utils/jaccard.py +10 -10
  15. sinatools/CLI/utils/remove_latin.py +34 -0
  16. sinatools/CLI/utils/remove_punctuation.py +42 -0
  17. {nlptools → sinatools}/CLI/utils/sentence_tokenizer.py +9 -22
  18. {nlptools → sinatools}/CLI/utils/text_transliteration.py +10 -17
  19. {nlptools → sinatools}/DataDownload/downloader.py +9 -9
  20. sinatools/VERSION +1 -0
  21. {nlptools → sinatools}/__init__.py +1 -1
  22. {nlptools → sinatools}/morphology/ALMA_multi_word.py +4 -5
  23. {nlptools → sinatools}/morphology/__init__.py +4 -14
  24. sinatools/morphology/morph_analyzer.py +172 -0
  25. sinatools/ner/__init__.py +12 -0
  26. nlptools/arabiner/bin/infer.py → sinatools/ner/entity_extractor.py +9 -8
  27. {nlptools → sinatools}/salma/__init__.py +2 -2
  28. {nlptools → sinatools}/salma/settings.py +1 -1
  29. {nlptools → sinatools}/salma/views.py +9 -9
  30. {nlptools → sinatools}/salma/wsd.py +2 -2
  31. {nlptools/morphology → sinatools/utils}/charsets.py +1 -3
  32. {nlptools → sinatools}/utils/implication.py +10 -10
  33. {nlptools → sinatools}/utils/jaccard.py +2 -2
  34. {nlptools → sinatools}/utils/parser.py +18 -21
  35. {nlptools → sinatools}/utils/text_transliteration.py +1 -1
  36. nlptools/utils/corpus_tokenizer.py → sinatools/utils/tokenizer.py +58 -5
  37. {nlptools/morphology → sinatools/utils}/tokenizers_words.py +3 -6
  38. SinaTools-0.1.4.dist-info/RECORD +0 -122
  39. SinaTools-0.1.4.dist-info/entry_points.txt +0 -18
  40. SinaTools-0.1.4.dist-info/top_level.txt +0 -1
  41. nlptools/CLI/morphology/morph_analyzer.py +0 -91
  42. nlptools/CLI/utils/corpus_tokenizer.py +0 -74
  43. nlptools/CLI/utils/latin_remove.py +0 -51
  44. nlptools/CLI/utils/remove_Punc.py +0 -53
  45. nlptools/VERSION +0 -1
  46. nlptools/arabiner/bin/__init__.py +0 -14
  47. nlptools/arabiner/bin/eval.py +0 -87
  48. nlptools/arabiner/bin/process.py +0 -140
  49. nlptools/arabiner/bin/train.py +0 -221
  50. nlptools/arabiner/data/__init__.py +0 -1
  51. nlptools/arabiner/data/datasets.py +0 -146
  52. nlptools/arabiner/data/transforms.py +0 -118
  53. nlptools/arabiner/nn/BaseModel.py +0 -22
  54. nlptools/arabiner/nn/BertNestedTagger.py +0 -34
  55. nlptools/arabiner/nn/BertSeqTagger.py +0 -17
  56. nlptools/arabiner/nn/__init__.py +0 -3
  57. nlptools/arabiner/trainers/BaseTrainer.py +0 -117
  58. nlptools/arabiner/trainers/BertNestedTrainer.py +0 -203
  59. nlptools/arabiner/trainers/BertTrainer.py +0 -163
  60. nlptools/arabiner/trainers/__init__.py +0 -3
  61. nlptools/arabiner/utils/__init__.py +0 -0
  62. nlptools/arabiner/utils/data.py +0 -124
  63. nlptools/arabiner/utils/helpers.py +0 -151
  64. nlptools/arabiner/utils/metrics.py +0 -69
  65. nlptools/morphology/morph_analyzer.py +0 -171
  66. nlptools/morphology/settings.py +0 -8
  67. nlptools/utils/__init__.py +0 -0
  68. nlptools/utils/sentence_tokenizer.py +0 -53
  69. {SinaTools-0.1.4.data/data/nlptools → SinaTools-0.1.8.data/data/sinatools}/environment.yml +0 -0
  70. {SinaTools-0.1.4.dist-info → SinaTools-0.1.8.dist-info}/AUTHORS.rst +0 -0
  71. {SinaTools-0.1.4.dist-info → SinaTools-0.1.8.dist-info}/LICENSE +0 -0
  72. {SinaTools-0.1.4.dist-info → SinaTools-0.1.8.dist-info}/WHEEL +0 -0
  73. {nlptools → sinatools}/CLI/utils/__init__.py +0 -0
  74. {nlptools → sinatools}/DataDownload/__init__.py +0 -0
  75. {nlptools → sinatools}/arabert/__init__.py +0 -0
  76. {nlptools → sinatools}/arabert/arabert/__init__.py +0 -0
  77. {nlptools → sinatools}/arabert/arabert/create_classification_data.py +0 -0
  78. {nlptools → sinatools}/arabert/arabert/create_pretraining_data.py +0 -0
  79. {nlptools → sinatools}/arabert/arabert/extract_features.py +0 -0
  80. {nlptools → sinatools}/arabert/arabert/lamb_optimizer.py +0 -0
  81. {nlptools → sinatools}/arabert/arabert/modeling.py +0 -0
  82. {nlptools → sinatools}/arabert/arabert/optimization.py +0 -0
  83. {nlptools → sinatools}/arabert/arabert/run_classifier.py +0 -0
  84. {nlptools → sinatools}/arabert/arabert/run_pretraining.py +0 -0
  85. {nlptools → sinatools}/arabert/arabert/run_squad.py +0 -0
  86. {nlptools → sinatools}/arabert/arabert/tokenization.py +0 -0
  87. {nlptools → sinatools}/arabert/araelectra/__init__.py +0 -0
  88. {nlptools → sinatools}/arabert/araelectra/build_openwebtext_pretraining_dataset.py +0 -0
  89. {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset.py +0 -0
  90. {nlptools → sinatools}/arabert/araelectra/build_pretraining_dataset_single_file.py +0 -0
  91. {nlptools → sinatools}/arabert/araelectra/configure_finetuning.py +0 -0
  92. {nlptools → sinatools}/arabert/araelectra/configure_pretraining.py +0 -0
  93. {nlptools → sinatools}/arabert/araelectra/finetune/__init__.py +0 -0
  94. {nlptools → sinatools}/arabert/araelectra/finetune/feature_spec.py +0 -0
  95. {nlptools → sinatools}/arabert/araelectra/finetune/preprocessing.py +0 -0
  96. {nlptools → sinatools}/arabert/araelectra/finetune/scorer.py +0 -0
  97. {nlptools → sinatools}/arabert/araelectra/finetune/task.py +0 -0
  98. {nlptools → sinatools}/arabert/araelectra/finetune/task_builder.py +0 -0
  99. {nlptools → sinatools}/arabert/araelectra/flops_computation.py +0 -0
  100. {nlptools → sinatools}/arabert/araelectra/model/__init__.py +0 -0
  101. {nlptools → sinatools}/arabert/araelectra/model/modeling.py +0 -0
  102. {nlptools → sinatools}/arabert/araelectra/model/optimization.py +0 -0
  103. {nlptools → sinatools}/arabert/araelectra/model/tokenization.py +0 -0
  104. {nlptools → sinatools}/arabert/araelectra/pretrain/__init__.py +0 -0
  105. {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_data.py +0 -0
  106. {nlptools → sinatools}/arabert/araelectra/pretrain/pretrain_helpers.py +0 -0
  107. {nlptools → sinatools}/arabert/araelectra/run_finetuning.py +0 -0
  108. {nlptools → sinatools}/arabert/araelectra/run_pretraining.py +0 -0
  109. {nlptools → sinatools}/arabert/araelectra/util/__init__.py +0 -0
  110. {nlptools → sinatools}/arabert/araelectra/util/training_utils.py +0 -0
  111. {nlptools → sinatools}/arabert/araelectra/util/utils.py +0 -0
  112. {nlptools → sinatools}/arabert/aragpt2/__init__.py +0 -0
  113. {nlptools → sinatools}/arabert/aragpt2/create_pretraining_data.py +0 -0
  114. {nlptools → sinatools}/arabert/aragpt2/gpt2/__init__.py +0 -0
  115. {nlptools → sinatools}/arabert/aragpt2/gpt2/lamb_optimizer.py +0 -0
  116. {nlptools → sinatools}/arabert/aragpt2/gpt2/optimization.py +0 -0
  117. {nlptools → sinatools}/arabert/aragpt2/gpt2/run_pretraining.py +0 -0
  118. {nlptools → sinatools}/arabert/aragpt2/grover/__init__.py +0 -0
  119. {nlptools → sinatools}/arabert/aragpt2/grover/dataloader.py +0 -0
  120. {nlptools → sinatools}/arabert/aragpt2/grover/modeling.py +0 -0
  121. {nlptools → sinatools}/arabert/aragpt2/grover/modeling_gpt2.py +0 -0
  122. {nlptools → sinatools}/arabert/aragpt2/grover/optimization_adafactor.py +0 -0
  123. {nlptools → sinatools}/arabert/aragpt2/grover/train_tpu.py +0 -0
  124. {nlptools → sinatools}/arabert/aragpt2/grover/utils.py +0 -0
  125. {nlptools → sinatools}/arabert/aragpt2/train_bpe_tokenizer.py +0 -0
  126. {nlptools → sinatools}/arabert/preprocess.py +0 -0
  127. {nlptools → sinatools}/environment.yml +0 -0
  128. {nlptools → sinatools}/install_env.py +0 -0
  129. /nlptools/nlptools.py → /sinatools/sinatools.py +0 -0
  130. {nlptools/arabiner → sinatools/utils}/__init__.py +0 -0
  131. {nlptools → sinatools}/utils/readfile.py +0 -0
  132. {nlptools → sinatools}/utils/utils.py +0 -0
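The headline change in this release is the rename of the top-level package from nlptools to sinatools (the arabiner NER code moves under sinatools/ner, and the sina_* console scripts are shortened, e.g. sina_jaccard becomes jaccard). A minimal sketch of the import-level impact, using only paths that appear in the rename list above and assuming SinaTools 0.1.8 is installed:

    # Sketch only: paths are taken from the rename list above, not a full API survey.

    # 0.1.4 (old layout)
    # from nlptools.utils.jaccard import jaccard
    # from nlptools.DataDownload import downloader

    # 0.1.8 (new layout)
    from sinatools.utils.jaccard import jaccard
    from sinatools.DataDownload import downloader

    # Per the downloader.py diff below, this now resolves to a '.../sinatools' data directory.
    print(downloader.get_appdatadir())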

{nlptools → sinatools}/CLI/utils/jaccard.py RENAMED
@@ -1,18 +1,18 @@
 """
 About:
 ------
-The sina_jaccard tool computes the Jaccard similarity between two sets of strings. The Jaccard similarity is the size of the intersection divided by the size of the union of the sample sets. It provides a measure of similarity between two sets.
+The jaccard tool computes the Jaccard similarity between two sets of strings. The Jaccard similarity is the size of the intersection divided by the size of the union of the sample sets. It provides a measure of similarity between two sets.
 
 Usage:
 ------
-Below is the usage information that can be generated by running sina_jaccard --help.
+Below is the usage information that can be generated by running jaccard --help.
 
 .. code-block:: none
 
    Usage:
-   sina_jaccard --list1="WORD1, WORD2" --list2="WORD1,WORD2" --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
+   jaccard --list1="WORD1, WORD2" --list2="WORD1,WORD2" --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
 
-   sina_jaccard --file1=File1 --file2=File2 --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
+   jaccard --file1=File1 --file2=File2 --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
 
 .. code-block:: none
 
@@ -39,9 +39,9 @@ Examples:
 
 .. code-block:: none
 
-   sina_jaccard --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
+   jaccard --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
 
-   sina_jaccard --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
+   jaccard --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
 
 Note:
 -----
@@ -55,8 +55,8 @@ Note:
 """
 
 import argparse
-from nlptools.utils.jaccard import jaccard
-from nlptools.utils.readfile import read_file
+from sinatools.utils.jaccard import jaccard
+from sinatools.utils.readfile import read_file
 
 
 def main():
@@ -92,5 +92,5 @@ def main():
 if __name__ == '__main__':
     main()
 
-# sina_jaccard_similarity --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
-# sina_jaccard_similarity --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
+# jaccard_similarity --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
+# jaccard_similarity --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic

sinatools/CLI/utils/remove_latin.py ADDED
@@ -0,0 +1,34 @@
+"""
+About:
+------
+The remove_latin command performs delete latin characters from the input text.
+
+Usage:
+------
+Below is the usage information that can be generated by running remove_latin --help.
+
+.. code-block:: none
+   remove_latin --text=TEXT
+   remove_latin --file "path/to/your/file.txt"
+
+Examples:
+---------
+.. code-block:: none
+   latin_remove --text "123test"
+   latin_remove --file "path/to/your/file.txt"
+"""
+
+import argparse
+from sinatools.utils.parser import remove_latin
+
+
+def main():
+    parser = argparse.ArgumentParser(description='remove latin characters from the text')
+
+    parser.add_argument('--text', type=str, required=True, help='The input text')
+    args = parser.parse_args()
+    result = remove_latin(args.text)
+
+    print(result)
+if __name__ == '__main__':
+    main()

sinatools/CLI/utils/remove_punctuation.py ADDED
@@ -0,0 +1,42 @@
+"""
+About:
+------
+The remove_punctuation command performs delete punctuation marks from the input text.
+
+Usage:
+------
+Below is the usage information that can be generated by running remove_punctuation --help.
+
+.. code-block:: none
+
+   Usage:
+   remove_punctuation --text=TEXT
+   remove_punctuation --file "path/to/your/file.txt"
+
+Examples:
+---------
+.. code-block:: none
+
+   remove_punctuation --text "te%s@t...!!?"
+   remove_punctuation --file "path/to/your/file.txt"
+"""
+
+import argparse
+from sinatools.utils.parser import remove_punctuation
+#from sinatools.utils.parser import read_file
+#from sinatools.utils.parser import write_file
+
+
+def main():
+    parser = argparse.ArgumentParser(description='remove punctuation marks from the text')
+
+    parser.add_argument('--text',required=True,help="input text")
+    # parser.add_argument('myFile', type=argparse.FileType('r'),help='Input file csv')
+    args = parser.parse_args()
+    result = remove_punctuation(args.text)
+
+    print(result)
+if __name__ == '__main__':
+    main()
+
+
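Both new commands above are thin argparse wrappers around functions in sinatools.utils.parser. A minimal sketch of calling those functions directly, assuming only the single-text-argument form the two CLIs use (any additional parameters are not shown in this diff):

    from sinatools.utils.parser import remove_latin, remove_punctuation

    text = "te%s@t...!!? 123test"
    print(remove_punctuation(text))  # drops punctuation marks, per the CLI description
    print(remove_latin(text))        # drops Latin characters, per the CLI description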

{nlptools → sinatools}/CLI/utils/sentence_tokenizer.py RENAMED
@@ -2,20 +2,19 @@
 About:
 ------
 
-The sina_sentence_tokenize tool allows you to tokenize text into sentences using the SinaTools utility. It provides
+The sentence_tokenizer command allows you to tokenize text into sentences using the SinaTools utility. It provides
 flexibility in tokenizing at different punctuation marks, including dots, question marks, and exclamation marks. It also
 allows tokenization at new lines.
 
 Usage:
 ------
-Below is the usage information that can be generated by running sina_sentence_tokenize --help.
+Below is the usage information that can be generated by running sentence_tokenizer --help.
 
 .. code-block:: none
 
    Usage:
-   sina_sentence_tokenize --text=TEXT [options]
-
-   sina_sentence_tokenize --file=FILE [options]
+   sentence_tokenizer --text=TEXT [options]
+   sentence_tokenizer --file=FILE [options]
 
 .. code-block:: none
 
@@ -38,23 +37,13 @@ Examples:
 
 .. code-block:: none
 
-   sina_sentence_tokenize --text "Your text here. Does it work? Yes! Try with new lines." --dot --question_mark --exclamation_mark
-
-   sina_sentence_tokenize --file "path/to/your/file.txt" --dot --question_mark --exclamation_mark
-
-Note:
------
-
-.. code-block:: none
-
-   - The tokenization options allow for a customized experience. You can choose any combination of the options, or even none
-   - of them, to achieve the desired sentence tokenization behavior. If no tokenization options are provided, the tool will
-   - use default settings as implemented in the underlying `sent_tokenize` function of SinaTools.
+   sentence_tokenizer --text "Your text here. Does it work? Yes! Try with new lines." --dot --question_mark --exclamation_mark
+   sentence_tokenizer --file "path/to/your/file.txt" --dot --question_mark --exclamation_mark
 
 """
 import argparse
-from nlptools.utils.sentence_tokenizer import sent_tokenize
-from nlptools.utils.readfile import read_file
+from sinatools.utils.tokenizer import sentence_tokenizer
+from sinatools.utils.readfile import read_file
 
 def main():
     parser = argparse.ArgumentParser(description='Sentence Tokenization using SinaTools')
@@ -77,7 +66,7 @@ def main():
     text_content = args.text if args.text else read_file(args.file)
 
     # Perform sentence tokenization
-    sentences = sent_tokenize(" ".join(text_content), dot=args.dot, new_line=args.new_line,
+    sentences = sentence_tokenizer(" ".join(text_content), dot=args.dot, new_line=args.new_line,
                               question_mark=args.question_mark, exclamation_mark=args.exclamation_mark)
 
     # Print each sentence in a new line
@@ -86,5 +75,3 @@ def main():
 
 if __name__ == '__main__':
     main()
-#sina_sentence_tokenize --text "Your text here. Does it work? Yes! Try with new lines." --dot --question_mark --exclamation_mark
-#sina_sentence_tokenize --file "path/to/your/file.txt" --dot --question_mark --exclamation_mark
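The sentence tokenizer also moves from nlptools.utils.sentence_tokenizer.sent_tokenize to sinatools.utils.tokenizer.sentence_tokenizer. A short library-level sketch mirroring the CLI call above (the keyword flags correspond to the CLI options; nothing beyond what this diff shows is assumed):

    from sinatools.utils.tokenizer import sentence_tokenizer

    sentences = sentence_tokenizer("Your text here. Does it work? Yes! Try with new lines.",
                                   dot=True, new_line=True,
                                   question_mark=True, exclamation_mark=True)
    for sentence in sentences:
        print(sentence)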

{nlptools → sinatools}/CLI/utils/text_transliteration.py RENAMED
@@ -2,21 +2,21 @@
 About:
 ------
 
-The sina_transliterate tool allows you to transliterate text using the SinaTools' utility. This command-line utility
+The transliterate tool allows you to transliterate text using the SinaTools' utility. This command-line utility
 takes in a text and a desired schema, and outputs the transliterated text.
 
 Usage:
 ------
-Below is the usage information that can be generated by running sina_transliterate --help.
+Below is the usage information that can be generated by running transliterate --help.
 
 Usage:
 ------
 
 .. code-block:: none
 
-   sina_transliterate --text=TEXT --schema=SCHEMA
+   transliterate --text=TEXT --schema=SCHEMA
 
-   sina_transliterate --file=FILE --schema=SCHEMA
+   transliterate --file=FILE --schema=SCHEMA
 
 Options:
 --------
@@ -33,21 +33,14 @@ Examples:
 
 .. code-block:: none
 
-   sina_transliterate --text "klmp" --schema "bw2ar"
-   sina_transliterate --file "path/to/your/file.txt" --schema "ar2bw"
+   transliterate --text "klmp" --schema "bw2ar"
+   transliterate --file "path/to/your/file.txt" --schema "ar2bw"
 
-Note:
------
-
-.. code-block:: none
-
-   For available transliteration schemas and more details, please refer to the SinaTools' documentation or the source code
-   of the function `perform_transliteration`.
 
 """
 import argparse
-from nlptools.utils.text_transliteration import perform_transliteration
-from nlptools.utils.readfile import read_file
+from sinatools.utils.text_transliteration import perform_transliteration
+from sinatools.utils.readfile import read_file
 
 def main():
     parser = argparse.ArgumentParser(description='Perform text transliteration using SinaTools')
@@ -73,5 +66,5 @@ def main():
 if __name__ == '__main__':
     main()
 
-#sina_transliterate --text "example text" --schema "bw2ar"
-#sina_transliterate --file "path/to/your/file.txt" --schema "bw2ar"
+#transliterate --text "example text" --schema "bw2ar"
+#transliterate --file "path/to/your/file.txt" --schema "bw2ar"
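For the transliteration CLI, only the entry-point name changes; the underlying function is still perform_transliteration. Its exact signature is not visible in this hunk, so the sketch below assumes a (text, schema) form matching the CLI's --text/--schema options:

    from sinatools.utils.text_transliteration import perform_transliteration

    # "bw2ar" and "ar2bw" are the Buckwalter<->Arabic schemas named in the examples above.
    print(perform_transliteration("klmp", "bw2ar"))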

{nlptools → sinatools}/DataDownload/downloader.py RENAMED
@@ -35,25 +35,25 @@ def get_appdatadir():
     .. highlight:: python
     .. code-block:: python
 
-       from nlptools.DataDownload import downloader
+       from sinatools.DataDownload import downloader
 
        path = downloader.get_appdatadir()
 
-       Windows: 'C:/Users/<Username>/AppData/Roaming/nlptools'
-       MacOS: '/Users/<Username>/Library/Application Support/nlptools'
-       Linux: '/home/<Username>/.nlptools'
-       Google Colab: '/content/nlptools'
+       Windows: 'C:/Users/<Username>/AppData/Roaming/sinatools'
+       MacOS: '/Users/<Username>/Library/Application Support/sinatools'
+       Linux: '/home/<Username>/.sinatools'
+       Google Colab: '/content/sinatools'
 
     """
     home = str(Path.home())
     if 'google.colab' in sys.modules:
-        path = Path('/content/nlptools')
+        path = Path('/content/sinatools')
     elif sys.platform == 'win32':
-        path = Path(home, 'AppData/Roaming/nlptools')
+        path = Path(home, 'AppData/Roaming/sinatools')
     elif sys.platform == 'darwin':
-        path = Path(home, 'Library/Application Support/nlptools')
+        path = Path(home, 'Library/Application Support/sinatools')
     else:
-        path = Path(home, '.nlptools')
+        path = Path(home, '.sinatools')
 
     if not os.path.exists(path):
         os.makedirs(path)

sinatools/VERSION ADDED
@@ -0,0 +1 @@
+0.1.8

{nlptools → sinatools}/__init__.py RENAMED
@@ -1,4 +1,4 @@
-"""Top-level package for nlptools."""
+"""Top-level package for sinatools."""
 
 __author__ = """SinaLab"""
 __email__ = 'sina.institute.bzu@gmail.com'

{nlptools → sinatools}/morphology/ALMA_multi_word.py RENAMED
@@ -1,13 +1,12 @@
-from nlptools.morphology import settings
-from nlptools.utils.parser import arStrip
+from sinatools.utils.parser import arStrip
 import json
-
+from . import dictionary
 
 def ALMA_multi_word(multi_word):
     undiac_multi_word = arStrip(multi_word, True, True, True, False, True, False) # diacs , smallDiacs , shaddah , digit , alif , specialChars
     result_word = []
-    if undiac_multi_word in settings.div_dic.keys():
-        result_word = settings.div_dic[undiac_multi_word]
+    if undiac_multi_word in dictionary.keys():
+        result_word = dictionary[undiac_multi_word]
 
     my_json = {}
     glosses_list = []

{nlptools → sinatools}/morphology/__init__.py RENAMED
@@ -1,23 +1,13 @@
-from nlptools.morphology import settings
 import pickle
-from nlptools.DataDownload import downloader
+from sinatools.DataDownload import downloader
 import os
 
-#filename = 'ALMA27012000.pickle'
-#path =downloader.get_appdatadir()
-#file_path = os.path.join(path, filename)
-#with open(file_path, 'rb') as f:
-#    #Load the serialized data from the file
-#    settings.div_dic = pickle.load(f)
-
-
+dictionary = {}
 filename = 'lemmas_dic.pickle'
-path =downloader.get_appdatadir()
+path = downloader.get_appdatadir()
 file_path = os.path.join(path, filename)
 with open(file_path, 'rb') as f:
-    #Load the serialized data from the file
-    settings.div_dic = pickle.load(f)
-
+    dictionary = pickle.load(f)
 
 #filename_five = 'five_grams.pickle'
 #path =downloader.get_appdatadir()

sinatools/morphology/morph_analyzer.py ADDED
@@ -0,0 +1,172 @@
+import re
+from sinatools.utils.tokenizers_words import simple_word_tokenize
+from sinatools.utils.parser import arStrip
+from sinatools.utils.charsets import AR_CHARSET, AR_DIAC_CHARSET
+from sinatools.DataDownload.downloader import get_appdatadir
+from . import dictionary
+
+_IS_AR_RE = re.compile(u'^[' + re.escape(u''.join(AR_CHARSET)) + u']+$')
+
+def find_solution(token, language, flag):
+    if token in dictionary.keys():
+        resulted_solutions = []
+        solutions = dictionary[token]
+        if flag == '1':
+            solutions = [solutions[0]]
+        for solution in solutions:
+            # token, freq, lemma, lemma_id, root, pos
+            resulted_solutions.append([token, solution[0], solution[1], solution[2], solution[3], solution[4]])
+        return resulted_solutions
+    else:
+        return []
+
+
+
+def analyze(text, language ='MSA', task ='full', flag="1"):
+    """
+    This method processes an input text and returns morphological analysis for each token within the text, based on the specified language, task, and flag. As follows:
+    If:
+        The task is lemmatization, the morphological solution includes only the lemma_id, lemma, token, and token frequency.
+        The task is pos, the morphological solution includes only the part-of-speech, token, and token frequency.
+        The task is root, the morphological solution includes only the root, token, and token frequency.
+        The task is full, the morphological solution includes the lemma_id, lemma, part-of-speech, root, token, and token frequency.
+
+    Args:
+        text (:obj:`str`): The Arabic text to be morphologically analyzed.
+        language (:obj:`str`): The type of the input text. Currently, only Modern Standard Arabic (MSA) is supported.
+        task (:obj:`str`): The task to filter the results by. Options are [lemmatization, pos, root, full]. The default task if not specified is `full`.
+        flag (:obj:`str`): The flag to filter the returned results. If the flag is `1`, the solution with the highest frequency will be returned. If the flag is `*`, all solutions will be returned, ordered descendingly, with the highest frequency solution first. The default flag if not specified is `1`.
+
+    Returns:
+        list (:obj:`list`): A list of JSON objects, where each JSON could be contains:
+            token: The token from the original text.
+            lemma: The lemma of the token.
+            lemma_id: The id of the lemma.
+            pos: The part-of-speech of the token.
+            root: The root of the token.
+            frequency: The frequency of the token.
+
+    **Example:**
+
+    .. highlight:: python
+    .. code-block:: python
+
+        from sinatools.morphology.morph_analyzer import analyze
+
+        #Return the morpological solution for each token in this text
+        #Example: task = full
+        analyze('ذهب الولد الى المدرسة')
+
+        [
+          {
+            "token": "ذهب",
+            "lemma": "ذَهَبَ",
+            "lemma_id": "202001617",
+            "root": "ذ ه ب",
+            "pos": "فعل ماضي",
+            "frequency": "82202"
+          },{
+            "token": "الولد",
+            "lemma": "وَلَدٌ",
+            "lemma_id": "202003092",
+            "root": "و ل د",
+            "pos": "اسم",
+            "frequency": "19066"
+          },{
+            "token": "إلى",
+            "lemma": "إِلَى",
+            "lemma_id": "202000856",
+            "root": "إ ل ى",
+            "pos": "حرف جر",
+            "frequency": "7367507"
+          },{
+            "token": "المدرسة",
+            "lemma": "مَدْرَسَةٌ",
+            "lemma_id": "202002620",
+            "root": "د ر س",
+            "pos": "اسم",
+            "frequency": "145285"
+          }
+        ]
+    """
+
+    output_list = []
+
+    tokens = simple_word_tokenize(text)
+
+    for token in tokens:
+        result_token = []
+        token = arStrip(token , False , True , False , False , False , False)
+        token = re.sub('[ٱ]','ا',token)
+        # token, freq, lemma, lemma_id, root, pos
+        solution = [token, 0, token+"_0", 0, token, ""]
+
+        if token.isdigit():
+            solution[5] = "digit" #pos
+
+        elif not _is_ar(token):
+            solution[5] = "Foreign" #pos
+
+        else:
+            result_token = find_solution(token,language,flag)
+
+            if result_token == []:
+                token_without_al = re.sub(r'^[ل]','',re.sub(r'^[ا]','',token))
+                if len(token_without_al) > 5 :
+                    result_token = find_solution(token_without_al, language, flag)
+
+            if result_token == []:
+                # try with replace ه with ة
+                result_token = find_solution(re.sub(r'[ه]$','ة',token), language, flag)
+
+
+            if result_token == []:
+                # try with unify Alef
+                word_with_unify_alef = arStrip(token , False , False , False , False , True , False) # Unify Alef
+                result_token = find_solution(word_with_unify_alef, language, flag)
+
+            if result_token == []:
+                # try with remove diac
+                word_undiac = arStrip(token , True , False , True , True , False , False) # remove diacs, shaddah , digit
+                result_token = find_solution(word_undiac, language, flag)
+
+            if result_token == []:
+                # try with remove diac and unify alef
+                word_undiac = arStrip(token , True , True , True , False, True , False) # diacs , smallDiacs , shaddah , alif
+                result_token = find_solution(word_undiac, language, flag)
+
+        if result_token != []:
+            output_list += result_token
+        else:
+            output_list += [solution]
+
+    return filter_results(output_list, task)
+
+
+def filter_results(data, task):
+    filtered_data = []
+    # token, freq, lemma, lemma_id, root, pos
+    if task == 'lemmatization':
+        filtered_data = [{'token': item[0], 'lemma': item[2], 'lemma_id': item[3], 'frequency': item[1]} for item in data]
+    elif task == 'pos':
+        filtered_data = [{'token': item[0], 'pos': item[5], 'frequency': item[1]} for item in data]
+    elif task == 'root':
+        filtered_data = [{'token': item[0], 'root': item[4], 'frequency': item[1]} for item in data]
+    else:
+        filtered_data = [{'token': item[0], 'lemma': item[2], 'lemma_id': item[3], 'root': item[4], 'pos':item[5], 'frequency': item[1]} for item in data]
+
+    return filtered_data
+
+
+def _is_ar(word):
+    return _IS_AR_RE.match(word) is not None
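The analyze() function added above is the new public morphology entry point (replacing nlptools/morphology/morph_analyzer.py). A usage sketch based on its docstring; it assumes the lemmas_dic.pickle data file has already been downloaded into the application data directory, since sinatools/morphology/__init__.py loads it at import time:

    from sinatools.morphology.morph_analyzer import analyze

    # Defaults: task='full', flag='1' (best solution per token)
    print(analyze('ذهب الولد الى المدرسة'))

    # Lemmas only, all candidate solutions ordered by descending frequency
    for item in analyze('ذهب الولد الى المدرسة', task='lemmatization', flag='*'):
        print(item['token'], item['lemma'], item['frequency'])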

sinatools/ner/__init__.py ADDED
@@ -0,0 +1,12 @@
+from sinatools.DataDownload import downloader
+import os
+from sinatools.ner.utils.helpers import load_checkpoint
+
+tagger = None
+tag_vocab = None
+train_config = None
+
+filename = 'Wj27012000.tar'
+path =downloader.get_appdatadir()
+model_path = os.path.join(path, filename)
+tagger, tag_vocab, train_config = load_checkpoint(model_path)

nlptools/arabiner/bin/infer.py → sinatools/ner/entity_extractor.py RENAMED
@@ -1,9 +1,10 @@
 import os
 from collections import namedtuple
-from nlptools.arabiner.utils.helpers import load_checkpoint
-from nlptools.arabiner.utils.data import get_dataloaders, text2segments
-from nlptools.DataDownload import downloader
-import nlptools
+from sinatools.ner.utils.helpers import load_checkpoint
+from sinatools.ner.utils.data import get_dataloaders, text2segments
+from sinatools.DataDownload import downloader
+from . import tag_vocab, train_config, tagger
+
 def ner(text, batch_size=32):
     """
     This method takes a text as input, and a batch size, then performs named entity recognition (NER) on the input text and returns a list of tagged mentions.
@@ -20,7 +21,7 @@ def ner(text, batch_size=32):
     .. highlight:: python
     .. code-block:: python
 
-        from nlptools.arabiner.bin import infer
+        from sinatools.arabiner.bin import infer
         infer.ner('ذهب محمد الى جامعة بيرزيت')
 
         #the output
@@ -42,19 +43,19 @@ def ner(text, batch_size=32):
     dataset, token_vocab = text2segments(text)
 
     vocabs = namedtuple("Vocab", ["tags", "tokens"])
-    vocab = vocabs(tokens=token_vocab, tags=nlptools.tag_vocab)
+    vocab = vocabs(tokens=token_vocab, tags=tag_vocab)
 
     # From the datasets generate the dataloaders
     dataloader = get_dataloaders(
         (dataset,),
         vocab,
-        nlptools.train_config.data_config,
+        train_config.data_config,
         batch_size=batch_size,
         shuffle=(False,),
     )[0]
 
     # Perform inference on the text and get back the tagged segments
-    segments = nlptools.tagger.infer(dataloader)
+    segments = tagger.infer(dataloader)
     segments_lists = []
     # Print results
     for segment in segments:
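Note that the docstring example inside this file still imports the old nlptools.arabiner.bin path. A sketch of calling the renamed module directly; it assumes the Wj27012000.tar checkpoint is present in the application data directory, because sinatools/ner/__init__.py loads it at import time:

    from sinatools.ner.entity_extractor import ner

    # batch_size is optional and defaults to 32
    mentions = ner('ذهب محمد الى جامعة بيرزيت')
    print(mentions)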

{nlptools → sinatools}/salma/__init__.py RENAMED
@@ -1,6 +1,6 @@
-from nlptools.salma import settings
+from sinatools.salma import settings
 import pickle
-from nlptools.DataDownload import downloader
+from sinatools.DataDownload import downloader
 import os
 
 #filename = 'glosses_dic.pickle'

{nlptools → sinatools}/salma/settings.py RENAMED
@@ -6,7 +6,7 @@ import pandas as pd
 
 
 
-from nlptools.DataDownload import downloader
+from sinatools.DataDownload import downloader
 import os
 
 glosses_dic = {}

{nlptools → sinatools}/salma/views.py RENAMED
@@ -1,12 +1,12 @@
 import json
-from nlptools.salma import settings
-from nlptools.salma.wsd import normalizearabert
-from nlptools.salma.wsd import GlossPredictor
-from nlptools.utils.parser import arStrip
-from nlptools.morphology.tokenizers_words import simple_word_tokenize
-from nlptools.morphology.ALMA_multi_word import ALMA_multi_word
-from nlptools.morphology.morph_analyzer import analyze
-#from nlptools.arabiner.bin.infer import ner
+from sinatools.salma import settings
+from sinatools.salma.wsd import normalizearabert
+from sinatools.salma.wsd import GlossPredictor
+from sinatools.utils.parser import arStrip
+from sinatools.utils.tokenizers_words import simple_word_tokenize
+from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
+from sinatools.morphology.morph_analyzer import analyze
+#from sinatools.ner.entity_extractor import ner
 
 def delete_form_list(position, word_lemma):
     #"""
@@ -424,7 +424,7 @@ def SALMA(sentence):
     .. highlight:: python
     .. code-block:: python
 
-        from nlptools.salma.views import SALMA
+        from sinatools.salma.views import SALMA
         JSON = SALMA("مختبر سينا لحوسبة اللغة والذكاء الإصطناعي. في جامعة بيرزيت.")
         print(JSON["resp"])
 

{nlptools → sinatools}/salma/wsd.py RENAMED
@@ -1,11 +1,11 @@
-from nlptools.salma import settings
+from sinatools.salma import settings
 import re
 import warnings
 warnings.filterwarnings("ignore")
 import torch
 import numpy as np
 import pandas as pd
-from nlptools.arabert.preprocess import ArabertPreprocessor
+from sinatools.arabert.preprocess import ArabertPreprocessor
 
 def normalizearabert(s):
     model_name = 'aubmindlab/bert-base-arabertv02'

{nlptools/morphology → sinatools/utils}/charsets.py RENAMED
@@ -1,6 +1,4 @@
-# -*- coding: utf-8 -*-
-# We acknoledge that this file charsets.py is imported from Camel tools citation. url
-#
+# We acknowledge that this file, charsets.py, is imported from Camel Tools. [https://camel-tools.readthedocs.io/en/latest/api/utils/charsets.html].
 
 import unicodedata
 