SinaTools 0.1.27__py2.py3-none-any.whl → 0.1.28__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.28.dist-info}/METADATA +2 -2
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.28.dist-info}/RECORD +27 -26
- sinatools/CLI/DataDownload/download_files.py +2 -5
- sinatools/CLI/morphology/ALMA_multi_word.py +0 -34
- sinatools/CLI/morphology/morph_analyzer.py +1 -1
- sinatools/CLI/ner/corpus_entity_extractor.py +17 -4
- sinatools/CLI/ner/entity_extractor.py +8 -8
- sinatools/CLI/utils/implication.py +3 -3
- sinatools/VERSION +1 -1
- sinatools/morphology/morph_analyzer.py +44 -45
- sinatools/ner/entity_extractor.py +41 -0
- sinatools/semantic_relatedness/compute_relatedness.py +22 -0
- sinatools/synonyms/synonyms_generator.py +45 -1
- sinatools/utils/jaccard.py +1 -1
- sinatools/utils/parser.py +12 -15
- sinatools/utils/similarity.py +95 -4
- sinatools/utils/text_dublication_detector.py +22 -0
- sinatools/utils/text_transliteration.py +1 -1
- sinatools/utils/tokenizer.py +1 -1
- sinatools/utils/word_compare.py +667 -0
- sinatools/wsd/disambiguator.py +20 -19
- {SinaTools-0.1.27.data → SinaTools-0.1.28.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.28.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.28.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.28.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.28.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.27.dist-info → SinaTools-0.1.28.dist-info}/top_level.txt +0 -0
{SinaTools-0.1.27.dist-info → SinaTools-0.1.28.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SinaTools
-Version: 0.1.27
+Version: 0.1.28
 Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
 Home-page: https://github.com/SinaLab/sinatools
 License: MIT license
@@ -24,7 +24,7 @@ SinaTools
 ======================
 Open Source Toolkit for Arabic NLP and NLU developed by [SinaLab](http://sina.birzeit.edu/) at Birzeit University. SinaTools is available through Python APIs, command lines, colabs, and online demos.
 
-See the full list of [Available Packages](https://sina.birzeit.edu/sinatools/), which include: (1) [Morphology Tagging](https://sina.birzeit.edu/sinatools/index.html#morph), (2) [Named Entity Recognition (NER)](https://sina.birzeit.edu/sinatools/index.html#ner), (3) [Word Sense Disambiguation (WSD)](https://sina.birzeit.edu/sinatools/index.html#wsd), (4) [Semantic Relatedness](https://sina.birzeit.edu/sinatools/index.html#sr), (5) [Synonymy Extraction and Evaluation](https://sina.birzeit.edu/sinatools/index.html#se), (6) [Relation Extraction](https://sina.birzeit.edu/sinatools/index.html), (7) [Utilities](https://sina.birzeit.edu/sinatools/index.html#u) (diacritic-based word matching, Jaccard similarly, parser, tokenizers, corpora processing, transliteration, etc).
+See the full list of [Available Packages](https://sina.birzeit.edu/sinatools/), which include: (1) [Morphology Tagging](https://sina.birzeit.edu/sinatools/index.html#morph), (2) [Named Entity Recognition (NER)](https://sina.birzeit.edu/sinatools/index.html#ner), (3) [Word Sense Disambiguation (WSD)](https://sina.birzeit.edu/sinatools/index.html#wsd), (4) [Semantic Relatedness](https://sina.birzeit.edu/sinatools/index.html#sr), (5) [Synonymy Extraction and Evaluation](https://sina.birzeit.edu/sinatools/index.html#se), (6) [Relation Extraction](https://sina.birzeit.edu/sinatools/index.html#re), (7) [Utilities](https://sina.birzeit.edu/sinatools/index.html#u) (diacritic-based word matching, Jaccard similarly, parser, tokenizers, corpora processing, transliteration, etc).
 
 See [Demo Pages](https://sina.birzeit.edu/sinatools/).
 
{SinaTools-0.1.27.dist-info → SinaTools-0.1.28.dist-info}/RECORD CHANGED
@@ -1,18 +1,18 @@
-SinaTools-0.1.
-sinatools/VERSION,sha256=
+SinaTools-0.1.28.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+sinatools/VERSION,sha256=NhKxpb_MVtfi01FRu6rOIYrldV__GIvBYcyyn5UnDBM,6
 sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
 sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
 sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
 sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
-sinatools/CLI/DataDownload/download_files.py,sha256=
-sinatools/CLI/morphology/ALMA_multi_word.py,sha256=
-sinatools/CLI/morphology/morph_analyzer.py,sha256=
-sinatools/CLI/ner/corpus_entity_extractor.py,sha256=
-sinatools/CLI/ner/entity_extractor.py,sha256=
+sinatools/CLI/DataDownload/download_files.py,sha256=TzS0XjYDhusRBb2CRX1EjKjORa0wI6me_XoZ09dY4R8,2397
+sinatools/CLI/morphology/ALMA_multi_word.py,sha256=rmpa72twwIJHme_kpQ1lu3_7y_Jorj70QTvOnQMJRuI,1274
+sinatools/CLI/morphology/morph_analyzer.py,sha256=HPamEKos_JRYCJv_2q6c12N--da58_JXTno9haww5Ao,3497
+sinatools/CLI/ner/corpus_entity_extractor.py,sha256=DdvigsDQzko5nJBjzUXlIDqoBMBTVzktjSo7JfEXTIA,4778
+sinatools/CLI/ner/entity_extractor.py,sha256=G9j-t0WKm2CRORhqARJM-pI-KArQ2IXIvnBK_NHxlHs,2885
 sinatools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinatools/CLI/utils/arStrip.py,sha256=NLyp8vOu2xv80tL9jiKRvyptmbkRZVg-wcAr-9YyvNY,3264
 sinatools/CLI/utils/corpus_tokenizer.py,sha256=nH0T4h6urr_0Qy6-wN3PquOtnwybj0REde5Ts_OE4U8,1650
-sinatools/CLI/utils/implication.py,sha256=
+sinatools/CLI/utils/implication.py,sha256=AojpkCwUQJiQjxhyEUWKRHmBnIt1tVqr485cAF7Thq0,2857
 sinatools/CLI/utils/jaccard.py,sha256=w56N_cNEFJ0A7WtunmY_xtms4srFagKBzrW_0YhH2DE,4216
 sinatools/CLI/utils/remove_latin.py,sha256=NOaTm2RHxt5IQrV98ySTmD8rTXTmcqSmfbPAwTyaXqU,848
 sinatools/CLI/utils/remove_punctuation.py,sha256=vJAZlEn7WGftZAFVFYnddkRrxdJ_rMmKB9vFZkY-jN4,1097
@@ -75,12 +75,12 @@ sinatools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKwe
 sinatools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
 sinatools/morphology/ALMA_multi_word.py,sha256=hj_-8ojrYYHnfCGk8WKtJdUR8mauzQdma4WUm-okDps,1346
 sinatools/morphology/__init__.py,sha256=I4wVBh8BhyNl-CySVdiI_nUSn6gj1j-gmLKP300RpE0,1216
-sinatools/morphology/morph_analyzer.py,sha256=
+sinatools/morphology/morph_analyzer.py,sha256=3B-ewxFg_If83oYlk1bDdVS1clb-mgyAF4WgAMqcAVI,7009
 sinatools/ner/__init__.py,sha256=CLPaqUcvPGAA4lU-6hjAqjNfKJ5WtwRfsma6QkYZHEk,1379
 sinatools/ner/data.py,sha256=lvOW86dXse8SC75Q0supQaE0rrRffoxNjIA0Qbv5WZY,4354
 sinatools/ner/data_format.py,sha256=7Yt0aOicOn9_YuuyCkM_IYi_rgjGYxR9bCuUaNGM73o,4341
 sinatools/ner/datasets.py,sha256=mG1iwqSm3lXCFHLqE-b4wNi176cpuzNBz8tKaBU6z6M,5059
-sinatools/ner/entity_extractor.py,sha256=
+sinatools/ner/entity_extractor.py,sha256=O2epRwRFUUcQs3SnFIYHVBI4zVhr8hRcj0XJYeby4ts,3588
 sinatools/ner/helpers.py,sha256=dnOoDY5JMyOLTUWVIZLMt8mBn2IbWlVaqHhQyjs1voo,2343
 sinatools/ner/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
 sinatools/ner/relation_extractor.py,sha256=a85xGX6V72fDpJk0GKmmtlWf8S8ezY-2pm5oGc9_ESY,9750
@@ -97,28 +97,29 @@ sinatools/ner/trainers/BertNestedTrainer.py,sha256=Pb4O2WeBmTvV3hHMT6DXjxrTzgtuh
 sinatools/ner/trainers/BertTrainer.py,sha256=B_uVtUwfv_eFwMMPsKQvZgW_ZNLy6XEsX5ePR0s8d-k,6433
 sinatools/ner/trainers/__init__.py,sha256=UDok8pDDpYOpwRBBKVLKaOgSUlmqqb-zHZI1p0xPxzI,188
 sinatools/semantic_relatedness/__init__.py,sha256=S0xrmqtl72L02N56nbNMudPoebnYQgsaIyyX-587DsU,830
-sinatools/semantic_relatedness/compute_relatedness.py,sha256=
+sinatools/semantic_relatedness/compute_relatedness.py,sha256=_9HFPs3nQBLklHFfkc9o3gEjEI6Bd34Ha4E1Kvv1RIg,2256
 sinatools/synonyms/__init__.py,sha256=yMuphNZrm5XLOR2T0weOHcUysJm-JKHUmVLoLQO8390,548
-sinatools/synonyms/synonyms_generator.py,sha256=
+sinatools/synonyms/synonyms_generator.py,sha256=jRd0D3_kn-jYBaZzqY-7oOy0SFjSJ-mjM7JhsySzX58,9037
 sinatools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinatools/utils/charsets.py,sha256=rs82oZJqRqosZdTKXfFAJfJ5t4PxjMM_oAPsiWSWuwU,2817
 sinatools/utils/implication.py,sha256=MsbI6S1LNY-fCxGMxFTuaV639r3QijkkdcfH48rvY7A,27804
-sinatools/utils/jaccard.py,sha256=
-sinatools/utils/parser.py,sha256=
+sinatools/utils/jaccard.py,sha256=kLIptPNB2VIqnemVve9auyOL1kXHIsCkKCEwxFM8yP4,10114
+sinatools/utils/parser.py,sha256=qvHdln5R5CAv_0UOJWe0mcp8JCsGqgazoeIIkoALH88,6259
 sinatools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
-sinatools/utils/similarity.py,sha256=
-sinatools/utils/text_dublication_detector.py,sha256=
-sinatools/utils/text_transliteration.py,sha256=
-sinatools/utils/tokenizer.py,sha256=
+sinatools/utils/similarity.py,sha256=CgKOJpRAU5UaSjOg-sdZcACCNl9tuKDRwdFAKATCL_w,10762
+sinatools/utils/text_dublication_detector.py,sha256=FeSkbfWGMQluz23H4CBHXION-walZPgjueX6AL8u_Q0,5660
+sinatools/utils/text_transliteration.py,sha256=F3smhr2AEJtySE6wGQsiXXOslTvSDzLivTYu0btgc10,8769
+sinatools/utils/tokenizer.py,sha256=nyk6lh5-p38wrU62hvh4wg7ni9ammkdqqIgcjbbBxxo,6965
 sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
+sinatools/utils/word_compare.py,sha256=rS2Z74sf7R-7MTXyrFj5miRi2TnSG9OdTDp_qQYuo2Y,28200
 sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
-sinatools/wsd/disambiguator.py,sha256=
+sinatools/wsd/disambiguator.py,sha256=h-3idc5rPPbMDSE_QVJAsEVkDHwzYY3L2SEPNXIdOcc,20104
 sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
 sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
+SinaTools-0.1.28.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+SinaTools-0.1.28.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+SinaTools-0.1.28.dist-info/METADATA,sha256=oJ0szwQ8a_ykAsYn2uqU-pmhF4N4Sh0oIsv1JCYeX78,3267
+SinaTools-0.1.28.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+SinaTools-0.1.28.dist-info/entry_points.txt,sha256=ZwZLolnWog2fjdDrfaHNHob8SE_YtMbD6ayzsOzItxs,1234
+SinaTools-0.1.28.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
+SinaTools-0.1.28.dist-info/RECORD,,
sinatools/CLI/DataDownload/download_files.py CHANGED
@@ -2,7 +2,7 @@
 About:
 ------
 
-The download_files
+The download_files command, allows users to select specific files and models to download and use it within SinaTools modules. Additionally, it automatically manages the extraction of compressed files, including zip and tar.gz formats.
 
 Usage:
 ------
@@ -18,7 +18,7 @@ Below is the usage information that can be generated by running download_files -
 
 Options:
 -f, --files FILES
-Names of the files to download. Available files are: ner, morph,
+Names of the files to download. Available files are: ner, morph, wsd, synonyms.
 If no file is specified, all files will be downloaded.
 
 Examples:
@@ -28,7 +28,6 @@ Examples:
 
 download_files -f morph ner
 This command will download only the `morph` and `ner` files to the default directory.
-
 """
 
 import argparse
@@ -72,5 +71,3 @@ def main():
 
 if __name__ == '__main__':
 main()
-
-#download_files -f morph ner
sinatools/CLI/morphology/ALMA_multi_word.py CHANGED
@@ -1,37 +1,3 @@
-"""
-About:
-------
-The alma_multi_word tool performs multi-word morphological analysis using SinaTools' `ALMA_multi_word` utility. Given a multi-word Arabic text input, it returns a detailed analysis in JSON format.
-
-Usage:
-------
-Below is the usage information that can be generated by running alma_multi_word --help.
-
-.. code-block:: none
-
-alma_multi_word --multi_word=MULTI_WORD_TEXT
-alma_multi_word --file
-
-Options:
---------
-
-.. code-block:: none
-
---multi_word MULTI_WORD_TEXT
-The multi-word Arabic text that needs to be analyzed.
---file
-File containing the multi-word text to be analyzed
-
-Examples:
----------
-
-.. code-block:: none
-
-alma_multi_word --multi_word "Your multi-word text here"
-alma_multi_word --file "path/to/your/file.txt"
-
-"""
-
 import argparse
 from sinatools.morphology.ALMA_multi_word import ALMA_multi_word
 import json
sinatools/CLI/morphology/morph_analyzer.py CHANGED
@@ -1,7 +1,7 @@
 """
 About:
 ------
-The morphology_analyzer command is designed to provide morphological analysis for Arabic text using the SinaTools morph_analyzer
+The morphology_analyzer command is designed to provide morphological analysis for Arabic text using the SinaTools morph_analyzer API. Users can specify the language and desired analysis task (lemmatization, part-of-speech tagging, or full morphological analysis), and flag.
 
 Usage:
 ------
sinatools/CLI/ner/corpus_entity_extractor.py CHANGED
@@ -7,13 +7,26 @@ import argparse
 from sinatools.ner.entity_extractor import extract
 
 """
-
+The following command takes a CSV file as input. It splits a specific column into tokens and tags them using named entity recognition (NER). It retains all other columns as they are, and it also adds sentences and tokens. Additionally, it assigns an auto-incrementing ID, a sentence ID, and a global sentence ID to each token. As follows:
 
 Usage:
 ------
-
-
-corpus_entity_extractor
+Below is the usage information that can be generated by running corpus_entity_extractor --help.
+
+corpus_entity_extractor --input_csv path/to/csv/file --text-columns "name of the column to be tokenized" --additional-columns "Column3,Column4" --output-csv path/to/csv/file
+
+Options:
+-------
+--input_csv CSV_FILE_PATH
+Path of csv file
+--text-columns STR
+Name of the text column that need to be tagged
+-- additional-columns
+name of columns that returned as they are
+-- output-csv
+path to csv file
+
+corpus_entity_extractor --input_csv "input.csv" --text-columns "TextColumn1" --additional-columns "Column3,Column4" --output-csv "output.csv"
 """
 
 def jsons_to_list_of_lists(json_list):
sinatools/CLI/ner/entity_extractor.py CHANGED
@@ -1,7 +1,7 @@
 """
 About:
 ------
-This
+This command processes an input text and returns named entites for each token within the text. As follows:
 
 Usage:
 ------
@@ -10,7 +10,7 @@ Below is the usage information that can be generated by running entity_extractor
 .. code-block:: none
 
 entity_extractor --text=INPUT_TEXT
-entity_extractor --dir=
+entity_extractor --dir=DIRECTORY_PATH --output_csv "path/to/csv/file"
 
 Options:
 --------
@@ -18,11 +18,11 @@ Options:
 .. code-block:: none
 
 --text INPUT_TEXT
-
---
-
---output_csv
-
+The text that needs to be analyzed for Named Entity Recognition.
+--dir DIRECTORY_PATH
+Directory containing the text files to be analyzed for Named Entity Recognition
+--output_csv CSV_FILE
+The path for output csv file
 
 
 Examples:
@@ -31,7 +31,7 @@ Examples:
 .. code-block:: none
 
 entity_extractor --text "Your text here"
-entity_extractor --dir "
+entity_extractor --dir "path/to/your/dir" --output_csv "path/to/your/file"
 
 """
 
sinatools/CLI/utils/implication.py CHANGED
@@ -39,7 +39,7 @@ Examples:
 
 """
 import argparse
-from sinatools.utils.
+from sinatools.utils.word_compare import Implication
 
 def read_file(file_path):
 with open(file_path, 'r', encoding='utf-8') as file:
@@ -72,8 +72,8 @@ def main():
 # Instantiate the Implication class
 implication_obj = Implication(word1, word2)
 
-# For this example, assuming there is a method `
-result = implication_obj.
+# For this example, assuming there is a method `get_verdict()` in the Implication class.
+result = implication_obj.get_verdict()
 print(result)
 
 if __name__ == '__main__':
sinatools/VERSION CHANGED
@@ -1 +1 @@
-0.1.27
+0.1.28
sinatools/morphology/morph_analyzer.py CHANGED
@@ -24,27 +24,27 @@ def find_solution(token, language, flag):
 
 def analyze(text, language ='MSA', task ='full', flag="1"):
 """
-This method processes an input text and returns morphological analysis for each token within the text, based on the specified language, task, and flag.
-
-
-
-
-
+This method processes an input text and returns morphological analysis for each token within the text, based on the specified language, task, and flag. You can try the demo online. See article for more details
+
+* If the task is lemmatization, the morphological solution includes only the lemma_id, lemma, token, and token frequency.
+* If the task is pos, the morphological solution includes only the part-of-speech, token, and token frequency.
+* If the task is root, the morphological solution includes only the root, token, and token frequency.
+* If the task is full, the morphological solution includes the lemma_id, lemma, part-of-speech, root, token, and token frequency.
 
-
+Parameters:
 text (:obj:`str`): The Arabic text to be morphologically analyzed.
-language (:obj:`str`):
+language (:obj:`str`): Currently, only Modern Standard Arabic (MSA) is supported.
 task (:obj:`str`): The task to filter the results by. Options are [lemmatization, pos, root, full]. The default task if not specified is `full`.
-flag (:obj:`str`):
+flag (:obj:`str`): The flag to filter the returned results. If the flag is `1`, the solution with the highest frequency will be returned. If the flag is `*`, all solutions will be returned, ordered descendingly, with the highest frequency solution first. The default flag if not specified is `1`.
 
 Returns:
 list (:obj:`list`): A list of JSON objects, where each JSON could be contains:
 token: The token from the original text.
-lemma: The lemma of the token.
-lemma_id: The id of the lemma.
-pos: The part-of-speech of the token.
-root: The root of the token.
-frequency: The frequency of the token.
+lemma: The lemma of the token (Lemmas from the Qabas lexicon).
+lemma_id: The id of the lemma (qabas lemma ids).
+pos: The part-of-speech of the token (see Qabas POS tags).
+root: The root of the token (qabas roots).
+frequency: The frequency of the token (see section 3 in article).
 
 **Example:**
 
@@ -57,37 +57,36 @@ def analyze(text, language ='MSA', task ='full', flag="1"):
 #Example: task = full
 analyze('ذهب الولد الى المدرسة')
 
-[
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-]
+[{
+"token": "ذهب",
+"lemma": "ذَهَبَ",
+"lemma_id": "202001617",
+"root": "ذ ه ب",
+"pos": "فعل ماضي",
+"frequency": "82202"
+},{
+"token": "الولد",
+"lemma": "وَلَدٌ",
+"lemma_id": "202003092",
+"root": "و ل د",
+"pos": "اسم",
+"frequency": "19066"
+},{
+"token": "إلى",
+"lemma": "إِلَى",
+"lemma_id": "202000856",
+"root": "إ ل ى",
+"pos": "حرف جر",
+"frequency": "7367507"
+},{
+"token": "المدرسة",
+"lemma": "مَدْرَسَةٌ",
+"lemma_id": "202002620",
+"root": "د ر س",
+"pos": "اسم",
+"frequency": "145285"
+}]
+
 """
 
 output_list = []
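For quick reference, the `task`/`flag` knobs documented above combine as in the following minimal sketch. The import path `sinatools.morphology.morph_analyzer` is assumed from the file being diffed; the calls are illustrative and were not re-run here:

```python
# Sketch of the task/flag options described in the docstring above.
# Import path assumed from the diffed file sinatools/morphology/morph_analyzer.py.
from sinatools.morphology.morph_analyzer import analyze

# Lemmatization only: each solution carries lemma_id, lemma, token, and frequency.
lemmas = analyze('ذهب الولد الى المدرسة', language='MSA', task='lemmatization', flag='1')

# flag='*' returns all candidate solutions, ordered by descending frequency.
all_solutions = analyze('ذهب الولد الى المدرسة', task='full', flag='*')
```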
sinatools/ner/entity_extractor.py CHANGED
@@ -28,7 +28,48 @@ def convert_nested_to_flat(nested_tags):
 return flat_tags
 
 def extract(text, ner_method="nested"):
+"""
+This method processes an input text and returns named entites for each token within the text. It support 21 class of entites. The method also support flat and nested NER. You can try the demo online. See article for details.
 
+Args:
+* text (:obj:`str`) – The Arabic text to be tagged.
+* ner_method (:obj:`str`) – The NER method can produce either flat or nested output formats. The default method is nested.
+nested method: If the method is nested, the output will include nested tags.
+flat method: If the method is flat, the output will consist of only flat tags.
+The choice between flat and nested methods determines the structure and detail of the named entity recognition output.
+
+Returns:
+A list of JSON objects, where each object could be contains:
+token: The token from the original text.
+NER tag: The label pairs for each segment.
+
+**Example:**
+
+.. highlight:: python
+.. code-block:: python
+
+from sinatools.ner.entity_extractor import extract
+#Example of nested ner. Notice that the last word in this sentense contains nested tags.
+extract('ذهب محمد الى جامعة بيرزيت')
+#the output
+[{
+"token":"ذهب",
+"tags":"O"
+},{
+"token":"محمد",
+"tags":"B-PERS"
+},{
+"token":"إلى",
+"tags":"O"
+},{
+"token":"جامعة",
+"tags":"B-ORG"
+},{
+"token":"بيرزيت",
+"tags":"B-GPE I-ORG"
+}]
+"""
+
 dataset, token_vocab = text2segments(text)
 
 vocabs = namedtuple("Vocab", ["tags", "tokens"])
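The docstring's example shows the default nested method; the flat variant differs only in the `ner_method` argument. A minimal sketch, with the `token`/`tags` keys taken from the docstring example above:

```python
# Sketch of the flat NER method described in the docstring above.
from sinatools.ner.entity_extractor import extract

# ner_method="flat" yields a single tag per token instead of nested tag pairs.
for item in extract('ذهب محمد الى جامعة بيرزيت', ner_method='flat'):
    print(item['token'], item['tags'])
```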
sinatools/semantic_relatedness/compute_relatedness.py CHANGED
@@ -4,6 +4,28 @@ from . import model
 
 #cosine using average embedding
 def get_similarity_score(sentence1, sentence2):
+"""
+Computes the degree of association between two sentences across various dimensions, meaning, underlying concepts, domain-specificity, topic overlap, viewpoint alignment. This method is described and implemented on this article.
+
+Args:
+sentence1 (:obj:`str`) – The Arabic sentence to find the semantic relatedness between it and the second sentence.
+sentence2 (:obj:`int`) – The Arabic sentence to find the semantic relatedness between it and the first sentence.
+
+Returns:
+:obj:`float`: An float number that represents the degree of relatedness between two provided sentences.
+
+**Example:**
+
+.. highlight:: python
+.. code-block:: python
+
+from sinatools.semantic_relatedness.compute_relatedness import get_similarity_score
+
+sentence1 = "تبلغ سرعة دوران الأرض حول الشمس حوالي 110 كيلومتر في الساعة."
+sentence2 = "تدور الأرض حول محورها بسرعة تصل تقريبا 1670 كيلومتر في الساعة."
+get_similarity_score(sentence1, sentence2)
+Score = 0.90
+"""
 
 # Tokenize and encode sentences
 inputs1 = tokenizer(sentence1, return_tensors="pt")
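The `#cosine using average embedding` comment above summarizes the whole computation: mean-pool each sentence's token embeddings, then take the cosine of the two vectors. A self-contained sketch of that idea, with a placeholder Hugging Face model standing in for the tokenizer/model pair that SinaTools loads internally:

```python
# Standalone sketch of "cosine using average embedding".
# The model name is a placeholder, not the one bundled with SinaTools.
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("aubmindlab/bert-base-arabertv02")
model = AutoModel.from_pretrained("aubmindlab/bert-base-arabertv02")

def similarity(sentence1: str, sentence2: str) -> float:
    inputs1 = tokenizer(sentence1, return_tensors="pt")
    inputs2 = tokenizer(sentence2, return_tensors="pt")
    with torch.no_grad():
        # Mean-pool the last hidden states into one vector per sentence.
        emb1 = model(**inputs1).last_hidden_state.mean(dim=1)
        emb2 = model(**inputs2).last_hidden_state.mean(dim=1)
    return torch.nn.functional.cosine_similarity(emb1, emb2).item()
```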
sinatools/synonyms/synonyms_generator.py CHANGED
@@ -76,7 +76,28 @@ def find_fuzzy_value_for_candidates(level, list_of_unique_synonyms, number_of_cy
 
 
 def extend_synonyms(synset, level):
-
+"""
+This method receives a set of one or more synonyms and a level number, then extends this set with additional synonyms. The more synonyms in the input, the more accurate in the results. Each synonym in the output is assigned a fuzzy value to indicate how much it is likely to be a synonymy. You can try the demo online. Read the article for more details.
+
+Args:
+synset (:obj:`str`) – A set of initial synonyms to be extended (string of synonyms seperated by |).
+level (:obj:`int`) – The level number indicates the depth of the synonym graph that the method should explore. The level could be 2 or 3. The 3rd level is richer, but the 2nd is faster.
+
+Returns:
+:obj:`list`: A list of lists, where each list could be contains:
+synonym: Synonym related to the given synset (set of synonyms).
+fuzzy_value: The synonyms strength as a percentage out of 100.
+
+**Example:**
+
+.. highlight:: python
+.. code-block:: python
+
+from sinatools.synonyms.synonyms_generator import extend_synonyms
+extend_synonyms('ممر | طريق', 2)
+[["مَسْلَك","61%"],["سبيل","61%"],["وَجْه","30%"],["نَهْج", "30%"],["نَمَطٌ","30%"],["مِنْهَج","30%"],["مِنهاج", "30%"],["مَوْر","30%"],["مَسَار","30%"],["مَرصَد", "30%"],["مَذْهَبٌ","30%"],["مَدْرَج","30%"],["مَجَاز","30%"]]
+
+"""
 used_graph = {}
 if level == 2:
 used_graph = synonyms_level2_dict
@@ -119,6 +140,29 @@ def extend_synonyms(synset, level):
 
 def evaluate_synonyms(synset, level):
 
+"""
+This method receives a set of synonyms and a level number, then evaluates how much each of these input synonyms is really a synonym (i.e., how much it belongs to the set). You can try the demo online.
+
+Args:
+synset (:obj:`str`) – A set of initial synonyms to be evaluated (string of synonyms seperated by |).
+level (:obj:`int`) – The level number indicating the depth of synonym graph that the method will explore, which could be 2 or 3.
+
+Returns:
+:obj:`list`: A list of lists, where each list could be contains:
+synonym: Synonym related to the given synset (set of synonyms).
+fuzzy_value: The synonyms strength as a percentage out of 100.
+
+**Example:**
+
+.. highlight:: python
+.. code-block:: python
+
+from sinatools.synonyms.synonyms_generator import evaluate_synonyms
+
+evaluate_synonyms('ممر | طريق | مَسْلَك | سبيل')
+[["مَسْلَك","61%"],["سبيل","60%"],["طريق","40%"],["ممر", "40%"]]
+"""
+
 used_graph = {}
 if level == 2:
 used_graph = synonyms_level2_dict
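Since both methods return `[synonym, fuzzy_value]` pairs with the value formatted as a percentage string (per the docstring examples above), a common pattern is thresholding on the fuzzy value. A small sketch under that assumption:

```python
# Keep only extension candidates whose fuzzy value clears a threshold.
# Output shape ([synonym, "NN%"] pairs) is taken from the docstring examples.
from sinatools.synonyms.synonyms_generator import extend_synonyms

candidates = extend_synonyms('ممر | طريق', 2)  # level 2: faster, shallower graph
strong = [syn for syn, fuzzy in candidates if int(fuzzy.rstrip('%')) >= 50]
print(strong)
```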
sinatools/utils/jaccard.py CHANGED
@@ -1,7 +1,7 @@
 # -*- coding: utf-8 -*-
 
 from sinatools.utils.parser import arStrip
-from sinatools.utils.
+from sinatools.utils.word_compare import Implication
 import argparse
 
 def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
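The `normalize_word` signature visible in the context line above suggests usage like the following sketch; the behavior of the two flags is inferred from their names and was not re-run:

```python
# Illustrative call based on the normalize_word signature shown in the diff context.
from sinatools.utils.jaccard import normalize_word

# Drop all diacritics except shadda, then drop the shadda as well.
print(normalize_word("مُعَلِّم",
                     ignore_all_diacritics_but_not_shadda=True,
                     ignore_shadda_diacritic=True))
```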
sinatools/utils/parser.py CHANGED
@@ -4,16 +4,16 @@ import argparse
 def arStrip(text , diacs=True , small_diacs=True , shaddah=True , digit=True, alif=True , special_chars=True ):
 
 """
-This method
+This method allows one to optionally remove (Arabic diacritics, small diacritics, shaddah, Latin and Arabic digits, unify alif, remove special characters, extra spaces, underscore and Arabic tatwelah) from the input text.
 
 Args:
 text (:obj:`str`): Arabic text to be processed.
-diacs (:obj:`bool`): flag to remove Arabic diacretics [
-small_diacs (:obj:`bool`): flag to remove small
+diacs (:obj:`bool`): flag to remove these 7 Arabic diacretics [ ٍ ِ ْ ٌ ُ َ ً] (default is True).
+small_diacs (:obj:`bool`): flag to remove all Quranic annotation signs from this range [06D6-06ED] in addition to small alif. (default is True).
 shaddah (:obj:`bool`): flag to remove shaddah (default is True).
 digit (:obj:`bool`): flag to remove Latin and Arabic digits (default is True).
-alif (:obj:`bool`): flag to unify alif (default is True).
-special_chars (:obj:`bool`): flag to remove special characters (default is True).
+alif (:obj:`bool`): flag to unify alif. Replace [ٱ أ إ آ] into [ا] (default is True).
+special_chars (:obj:`bool`): flag to remove these special characters [?؟!@#$%] (default is True).
 
 Returns:
 :obj:`str`: stripped text.
@@ -30,10 +30,10 @@ def arStrip(text , diacs=True , small_diacs=True , shaddah=True , digit=True, al
 # output
 الجو جميل
 
-output =
+output =parser.arStrip('أَلَمۡ یَأۡنِ لِلَّذِینَ ءَامَنُوۤا۟ أَن تَخۡشَعَ قُلُوبُهُمۡ لِذِكۡرِ ٱللَّهِ وَمَا نَزَلَ مِنَ ٱلۡحَقِّ وَلَا یَكُونُوا۟ كَٱلَّذِینَ أُوتُوا۟ ٱلۡكِتَـٰبَ مِن قَبۡلُ فَطَالَ عَلَیۡهِمُ ٱلۡأَمَدُ فَقَسَتۡ قُلُوبُهُمۡۖ وَكَثِیر مِّنۡهُمۡ فَـسِقُونَ', True, True, True, True, False, False )
 print(output)
 #output
-
+ألم یأن للذین ءامنوا أن تخشع قلوبهم لذكر ٱلله وما نزل من ٱلحق ولا یكونوا كٱلذین أوتوا ٱلكتب من قبل فطال علیهم ٱلأمد فقست قلوبهم وكثیر منهم فسقون
 """
 try:
 if text: # if the input string is not empty do the following
@@ -67,13 +67,13 @@ def arStrip(text , diacs=True , small_diacs=True , shaddah=True , digit=True, al
 
 def remove_punctuation(text):
 """
-Removes punctuation marks from the text.
+Removes these arabic and english punctuation marks from the text [! " # $ % & ' ( ) * + , - . / : ; > = < ? @ [ \ ] ^ _ ` { | } ~ ، ؛ ؞ ؟ ـ ٓ ٬ ٪ ٫ ٭ ۔].
 
 Args:
 text (:obj:`str`): The input text.
 
 Returns:
-
+:obj:`str`
 
 **Example:**
 
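A quick illustration of the behavior documented above; the expected result assumes the punctuation set listed in the updated docstring:

```python
# Illustrative use of remove_punctuation from sinatools/utils/parser.py.
from sinatools.utils.parser import remove_punctuation

print(remove_punctuation("مرحبا! كيف الحال؟"))  # expected: مرحبا كيف الحال
```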
@@ -109,15 +109,12 @@ def remove_punctuation(text):
 
 def remove_latin(text):
 """
-This method removes all Latin
+This method removes all Latin letters from the input text.
 
-
+Parameters:
 text (:obj:`str`): The input text.
-
 Returns:
-
-Note:
-If an error occurs during processing, the original text is returned.
+:obj:`str`
 **Example:**
 
 .. highlight:: python