SinaTools 0.1.25__py2.py3-none-any.whl → 0.1.27__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,64 @@
1
+ Metadata-Version: 2.1
2
+ Name: SinaTools
3
+ Version: 0.1.27
4
+ Summary: Open-source Python toolkit for Arabic Natural Language Understanding, designed to be integrated into existing system workflows.
5
+ Home-page: https://github.com/SinaLab/sinatools
6
+ License: MIT license
7
+ Keywords: sinatools
8
+ Platform: UNKNOWN
9
+ Description-Content-Type: text/markdown
10
+ Requires-Dist: six
11
+ Requires-Dist: farasapy
12
+ Requires-Dist: tqdm
13
+ Requires-Dist: requests
14
+ Requires-Dist: regex
15
+ Requires-Dist: pathlib
16
+ Requires-Dist: torch (==1.13.0)
17
+ Requires-Dist: transformers (==4.24.0)
18
+ Requires-Dist: torchtext (==0.14.0)
19
+ Requires-Dist: torchvision (==0.14.0)
20
+ Requires-Dist: seqeval (==1.2.2)
21
+ Requires-Dist: natsort (==7.1.1)
22
+
23
+ SinaTools
24
+ ======================
25
+ Open-source toolkit for Arabic NLP and NLU developed by [SinaLab](http://sina.birzeit.edu/) at Birzeit University. SinaTools is available through Python APIs, command-line tools, Colab notebooks, and online demos.
26
+
27
+ See the full list of [Available Packages](https://sina.birzeit.edu/sinatools/), which includes: (1) [Morphology Tagging](https://sina.birzeit.edu/sinatools/index.html#morph), (2) [Named Entity Recognition (NER)](https://sina.birzeit.edu/sinatools/index.html#ner), (3) [Word Sense Disambiguation (WSD)](https://sina.birzeit.edu/sinatools/index.html#wsd), (4) [Semantic Relatedness](https://sina.birzeit.edu/sinatools/index.html#sr), (5) [Synonymy Extraction and Evaluation](https://sina.birzeit.edu/sinatools/index.html#se), (6) [Relation Extraction](https://sina.birzeit.edu/sinatools/index.html), (7) [Utilities](https://sina.birzeit.edu/sinatools/index.html#u) (diacritic-based word matching, Jaccard similarity, parser, tokenizers, corpora processing, transliteration, etc.).
28
+
29
+ See [Demo Pages](https://sina.birzeit.edu/sinatools/).
30
+
31
+ See the [benchmarking paper](https://www.jarrar.info/publications/HJK24.pdf), which reports that SinaTools outperforms comparable Arabic NLP toolkits on the evaluated tasks.
32
+
33
+ Installation
34
+ --------
35
+ To install SinaTools, ensure you are using Python version 3.10.8, then clone the [GitHub](git://github.com/SinaLab/SinaTools) repository.
36
+
37
+ Alternatively, you can execute the following command:
38
+
39
+ ```bash
40
+ pip install sinatools
41
+ ```
42
+
43
+ Installing Models and Data Files
44
+ --------
45
+ Some SinaTools modules require additional data files and fine-tuned models. To download them, please consult the [DataDownload](https://sina.birzeit.edu/sinatools/documentation/cli_tools/DataDownload/DataDownload.html) documentation.
46
+
47
+ Documentation
48
+ --------
49
+ For more information, please refer to the [main page](https://sina.birzeit.edu/sinatools) or the [online documentation](https://sina.birzeit.edu/sinatools/documentation).
50
+
51
+ Citation
52
+ -------
53
+ Tymaa Hammouda, Mustafa Jarrar, Mohammed Khalilia: [SinaTools: Open Source Toolkit for Arabic Natural Language Understanding](http://www.jarrar.info/publications/HJK24.pdf). In Proceedings of the 2024 AI in Computational Linguistics (ACLing 2024), Procedia Computer Science, Dubai. Elsevier.
54
+
55
+ License
56
+ --------
57
+ SinaTools is available under the MIT License. See the [LICENSE](https://github.com/SinaLab/sinatools/blob/main/LICENSE) file for more information.
58
+
59
+ Reporting Issues
60
+ --------
61
+ To report any issues or bugs, please contact us at "sina.institute.bzu@gmail.com" or visit [SinaTools Issues](https://github.com/SinaLab/sinatools/issues).
62
+
63
+
64
+
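The "Installing Models and Data Files" note above is brief. Based on the downloader code visible later in this diff (`urls`, `download_file`, and `get_appdatadir` in `sinatools.DataDownload.downloader`, as used by the CLI hunks), a minimal programmatic sketch might look like this; the key names are copied from the downloader diff and may vary between releases:

```python
# Sketch only: fetch the data files a SinaTools module needs.
# The url keys ("morph", "ner", ...) are taken from the downloader code below.
from sinatools.DataDownload.downloader import urls, download_file, get_appdatadir

print("Data directory:", get_appdatadir())  # where downloaded files are stored
download_file(urls["morph"])                # morphology resources
download_file(urls["ner"])                  # NER model
```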
@@ -1,26 +1,26 @@
1
- SinaTools-0.1.25.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
2
- sinatools/VERSION,sha256=ZC9GuKFeLfVc4onMU4koq0Nk_oc7Tx3zDdE0H4bcUKU,6
1
+ SinaTools-0.1.27.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
2
+ sinatools/VERSION,sha256=W9ZpgLhiag6deRMB5DzdHO6EDLvNbZPVIRIUv6nHT3U,6
3
3
  sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
4
4
  sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
5
5
  sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
6
6
  sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
7
- sinatools/CLI/DataDownload/download_files.py,sha256=KG9W-Y5kJG_9yLUyo-cA33B5uO3avdZ5sSYUeW3wM6s,1960
7
+ sinatools/CLI/DataDownload/download_files.py,sha256=2jLohYd5Vjtz7gVd8zePhK6A_r0svpoV3ykbO5rwR8s,2613
8
8
  sinatools/CLI/morphology/ALMA_multi_word.py,sha256=ZImJ1vtcpSHydI1BjJmK3KcMJbGBZX16kO4L6rxvBvA,2086
9
9
  sinatools/CLI/morphology/morph_analyzer.py,sha256=ieIM47QK9Nct3MtCS9uq3h2rZN5r4qNhsLmlVeE6wiE,3503
10
- sinatools/CLI/ner/corpus_entity_extractor.py,sha256=_o0frMSgpsFVXPoztS3mQTK7LjHsgzUv9gfs6iJL424,4024
11
- sinatools/CLI/ner/entity_extractor.py,sha256=zn0Jd37BEDE1wHE5HOAK0_N2tURAznFNj7WDd6WGLIw,2932
10
+ sinatools/CLI/ner/corpus_entity_extractor.py,sha256=Da-DHFrqT6if7w6WnodB4TBE5ze3DJYjb2Mmju_Qd7g,4034
11
+ sinatools/CLI/ner/entity_extractor.py,sha256=IiTioe0px0aJ1E58FrDVa2yNgM8Ie4uS2LZKK_z2Qn4,2942
12
12
  sinatools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
13
13
  sinatools/CLI/utils/arStrip.py,sha256=NLyp8vOu2xv80tL9jiKRvyptmbkRZVg-wcAr-9YyvNY,3264
14
14
  sinatools/CLI/utils/corpus_tokenizer.py,sha256=nH0T4h6urr_0Qy6-wN3PquOtnwybj0REde5Ts_OE4U8,1650
15
15
  sinatools/CLI/utils/implication.py,sha256=nvoiI5UHHaJdd6MICql0pB_-h3L0icYwP1WgJi2h7p0,2854
16
- sinatools/CLI/utils/jaccard.py,sha256=NoKbWAq6dHDtQ56mAc1kdAnROm8NXEjZ1ecVZ7EYm6Y,4205
16
+ sinatools/CLI/utils/jaccard.py,sha256=w56N_cNEFJ0A7WtunmY_xtms4srFagKBzrW_0YhH2DE,4216
17
17
  sinatools/CLI/utils/remove_latin.py,sha256=NOaTm2RHxt5IQrV98ySTmD8rTXTmcqSmfbPAwTyaXqU,848
18
18
  sinatools/CLI/utils/remove_punctuation.py,sha256=vJAZlEn7WGftZAFVFYnddkRrxdJ_rMmKB9vFZkY-jN4,1097
19
19
  sinatools/CLI/utils/sentence_tokenizer.py,sha256=Wli8eiDbWSd_Z8UKpu_JkaS8jImowa1vnRL0oYCSfqw,2823
20
20
  sinatools/CLI/utils/text_dublication_detector.py,sha256=dW70O5O20GxeUDDF6zVYn52wWLmJF-HBZgvqIeVL2rQ,1661
21
21
  sinatools/CLI/utils/text_transliteration.py,sha256=vz-3kxWf8pNYVCqNAtBAiA6u_efrS5NtWT-ofN1NX6I,2014
22
22
  sinatools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
- sinatools/DataDownload/downloader.py,sha256=F-SV-0mbYMYFSNCx8FoAYXhn0X1j0dF37PTLU0nUBVg,6482
23
+ sinatools/DataDownload/downloader.py,sha256=6xH55WlDhgtImPRFQ0AaeDFJjL8OMNU29x61PL8mZ2w,6468
24
24
  sinatools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
25
25
  sinatools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
26
26
  sinatools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
@@ -76,13 +76,14 @@ sinatools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB
76
76
  sinatools/morphology/ALMA_multi_word.py,sha256=hj_-8ojrYYHnfCGk8WKtJdUR8mauzQdma4WUm-okDps,1346
77
77
  sinatools/morphology/__init__.py,sha256=I4wVBh8BhyNl-CySVdiI_nUSn6gj1j-gmLKP300RpE0,1216
78
78
  sinatools/morphology/morph_analyzer.py,sha256=tA78gWg6iaE_G1c2xqxZoXZWNbvHBJLrTSxPyir5Xn8,6941
79
- sinatools/ner/__init__.py,sha256=gSs0x6veWJ8j3_iOs79tynBd_hJP0t44CGpJ0xzoiW4,1048
79
+ sinatools/ner/__init__.py,sha256=CLPaqUcvPGAA4lU-6hjAqjNfKJ5WtwRfsma6QkYZHEk,1379
80
80
  sinatools/ner/data.py,sha256=lvOW86dXse8SC75Q0supQaE0rrRffoxNjIA0Qbv5WZY,4354
81
81
  sinatools/ner/data_format.py,sha256=7Yt0aOicOn9_YuuyCkM_IYi_rgjGYxR9bCuUaNGM73o,4341
82
82
  sinatools/ner/datasets.py,sha256=mG1iwqSm3lXCFHLqE-b4wNi176cpuzNBz8tKaBU6z6M,5059
83
- sinatools/ner/entity_extractor.py,sha256=k0Yvvg_aknINkFSdqOgG1KulS0UIo-W0qycv9J2MtNo,2273
83
+ sinatools/ner/entity_extractor.py,sha256=bDInJYLC1z_kze5jtQFgQVLZ76tRzUK_FJxqETfVw70,1885
84
84
  sinatools/ner/helpers.py,sha256=dnOoDY5JMyOLTUWVIZLMt8mBn2IbWlVaqHhQyjs1voo,2343
85
85
  sinatools/ner/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
86
+ sinatools/ner/relation_extractor.py,sha256=a85xGX6V72fDpJk0GKmmtlWf8S8ezY-2pm5oGc9_ESY,9750
86
87
  sinatools/ner/transforms.py,sha256=vti3mDdi-IRP8i0aTQ37QqpPlP9hdMmJ6_bAMa0uL-s,4871
87
88
  sinatools/ner/data/__init__.py,sha256=W0C1ge_XxTfmdEGz0hkclz57aLI5VFS5t6BjByCfkFk,57
88
89
  sinatools/ner/data/datasets.py,sha256=lcdDDenFMEKIGYQmfww2dk_9WKWrJO9HtKptaAEsRmY,5064
@@ -91,13 +92,13 @@ sinatools/ner/nn/BaseModel.py,sha256=3GmujQasTZZunOBuFXpY2p1W8W256iI_Uu4hxhOY2Z0
91
92
  sinatools/ner/nn/BertNestedTagger.py,sha256=_fwAn1kiKmXe6m5y16Ipty3kvXIEFEmiUq74Ad1818U,1219
92
93
  sinatools/ner/nn/BertSeqTagger.py,sha256=dFcBBiMw2QCWsyy7aQDe_PS3aRuNn4DOxKIHgTblFvc,504
93
94
  sinatools/ner/nn/__init__.py,sha256=UgQD_XLNzQGBNSYc_Bw1aRJZjq4PJsnMT1iZwnJemqE,170
94
- sinatools/ner/trainers/BaseTrainer.py,sha256=oZgFJW-CawfCKT5gtaBHA7Q7XjNfiyqM62KnFsgVzPU,3919
95
+ sinatools/ner/trainers/BaseTrainer.py,sha256=Ifz4SeTxJwVn1_uWZ3I9KbcSo2hLPN3ojsIYuoKE9wE,4050
95
96
  sinatools/ner/trainers/BertNestedTrainer.py,sha256=Pb4O2WeBmTvV3hHMT6DXjxrTzgtuh3OrKQZnogYy8RQ,8429
96
97
  sinatools/ner/trainers/BertTrainer.py,sha256=B_uVtUwfv_eFwMMPsKQvZgW_ZNLy6XEsX5ePR0s8d-k,6433
97
98
  sinatools/ner/trainers/__init__.py,sha256=UDok8pDDpYOpwRBBKVLKaOgSUlmqqb-zHZI1p0xPxzI,188
98
99
  sinatools/semantic_relatedness/__init__.py,sha256=S0xrmqtl72L02N56nbNMudPoebnYQgsaIyyX-587DsU,830
99
100
  sinatools/semantic_relatedness/compute_relatedness.py,sha256=JvI0cXgukKtuMpmAygMnlocCsPeAJ98LD1jZCP_6SyQ,1110
100
- sinatools/synonyms/__init__.py,sha256=BN1f99w4yqnuT9PrOBsbeOMepPJPi-Fh1hEMvxqfMdM,562
101
+ sinatools/synonyms/__init__.py,sha256=yMuphNZrm5XLOR2T0weOHcUysJm-JKHUmVLoLQO8390,548
101
102
  sinatools/synonyms/synonyms_generator.py,sha256=FgAiuduSFyM6vJobWZKHg4KNWIQz8T6MGBPVIuVuw-8,6506
102
103
  sinatools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
103
104
  sinatools/utils/charsets.py,sha256=rs82oZJqRqosZdTKXfFAJfJ5t4PxjMM_oAPsiWSWuwU,2817
@@ -105,18 +106,19 @@ sinatools/utils/implication.py,sha256=MsbI6S1LNY-fCxGMxFTuaV639r3QijkkdcfH48rvY7
105
106
  sinatools/utils/jaccard.py,sha256=S7OgvaMqkN5HFgTZkKhMCNAuAnQ0LhRyXPN79jAzmKM,10113
106
107
  sinatools/utils/parser.py,sha256=CPPtCrsbxUqsjhY5C9wTOgkAs6iw0k_WvMUxLEPM1IU,6168
107
108
  sinatools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
109
+ sinatools/utils/similarity.py,sha256=oEdpcn3hm8v3qvNvr1USQ7eQlK0zU-7a7W090rCIFyw,5688
108
110
  sinatools/utils/text_dublication_detector.py,sha256=6yAOUtdw4TKiJkUPDDi3oK7CEoIuBDbliJ4PU7kapfo,4249
109
111
  sinatools/utils/text_transliteration.py,sha256=NQoXrxI-h0UXnvVtDA3skNJduxIy0IW26r46N4tDxGk,8766
110
112
  sinatools/utils/tokenizer.py,sha256=QHyrVqJA_On4rKxexiWR2ovq4pI1-u6iZkdhRbK9tew,6676
111
113
  sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
112
- sinatools/wsd/__init__.py,sha256=yV-SQSCzSrjbNkciMbDCqzGZ_EESchL7rlJk56uibVI,309
113
- sinatools/wsd/disambiguator.py,sha256=8HrVAGpEQyrzwiuEreLX9X82WSL-U2Aeca0ttrtIw2Y,19998
114
+ sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
115
+ sinatools/wsd/disambiguator.py,sha256=43Iq7NTZsiYWGFg-NUDrQuJKO1NT9QOnfBPB10IOJNs,19828
114
116
  sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
115
117
  sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
116
- SinaTools-0.1.25.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
117
- SinaTools-0.1.25.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
118
- SinaTools-0.1.25.dist-info/METADATA,sha256=0tDl5B000Z0gvhNCSzdlOzfyC0YQrP2uSZSm4OEP_EQ,953
119
- SinaTools-0.1.25.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
120
- SinaTools-0.1.25.dist-info/entry_points.txt,sha256=ZwZLolnWog2fjdDrfaHNHob8SE_YtMbD6ayzsOzItxs,1234
121
- SinaTools-0.1.25.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
122
- SinaTools-0.1.25.dist-info/RECORD,,
118
+ SinaTools-0.1.27.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
119
+ SinaTools-0.1.27.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
120
+ SinaTools-0.1.27.dist-info/METADATA,sha256=jcBDhXCwjYOsgNkcEjQLG7l_kdWSHkEUpSsoMt7uI1c,3264
121
+ SinaTools-0.1.27.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
122
+ SinaTools-0.1.27.dist-info/entry_points.txt,sha256=ZwZLolnWog2fjdDrfaHNHob8SE_YtMbD6ayzsOzItxs,1234
123
+ SinaTools-0.1.27.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
124
+ SinaTools-0.1.27.dist-info/RECORD,,
@@ -40,7 +40,7 @@ from sinatools.DataDownload.downloader import urls
40
40
 
41
41
  def main():
42
42
  parser = argparse.ArgumentParser(description="Download files from specified URLs.")
43
- parser.add_argument('-f', '--files', nargs="*", choices=urls.keys(),
43
+ parser.add_argument('-f', '--files', nargs="*",
44
44
  help="Names of the files to download. Available files are: "
45
45
  f"{', '.join(urls.keys())}. If no file is specified, all files will be downloaded.")
46
46
 
@@ -50,8 +50,23 @@ def main():
50
50
 
51
51
  if args.files:
52
52
  for file in args.files:
53
- url = urls[file]
54
- download_file(url)
53
+ print("file: ", file)
54
+ if file == "wsd":
55
+ download_file(urls["morph"])
56
+ download_file(urls["ner"])
57
+ download_file(urls["wsd_model"])
58
+ download_file(urls["wsd_tokenizer"])
59
+ download_file(urls["one_gram"])
60
+ download_file(urls["five_grams"])
61
+ download_file(urls["four_grams"])
62
+ download_file(urls["three_grams"])
63
+ download_file(urls["two_grams"])
64
+ elif file == "synonyms":
65
+ download_file(urls["graph_l2"])
66
+ download_file(urls["graph_l3"])
67
+ else:
68
+ url = urls[file]
69
+ download_file(url)
55
70
  else:
56
71
  download_files()
57
72
 
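The new branch above turns the convenience names `wsd` and `synonyms` into bundles of downloads rather than single files. A hedged sketch of the equivalent programmatic calls, using the same url keys as the code above:

```python
# Sketch: what "-f wsd" and "-f synonyms" now resolve to in the CLI above.
from sinatools.DataDownload.downloader import urls, download_file

bundles = {
    "wsd": ["morph", "ner", "wsd_model", "wsd_tokenizer",
            "one_gram", "two_grams", "three_grams", "four_grams", "five_grams"],
    "synonyms": ["graph_l2", "graph_l3"],
}

for key in bundles["wsd"]:
    download_file(urls[key])
```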
@@ -20,7 +20,7 @@ def jsons_to_list_of_lists(json_list):
20
20
  return [[d['token'], d['tags']] for d in json_list]
21
21
 
22
22
  def combine_tags(sentence):
23
- output = jsons_to_list_of_lists(extract(sentence))
23
+ output = jsons_to_list_of_lists(extract(sentence, "nested"))
24
24
  return [word[1] for word in output]
25
25
 
26
26
 
@@ -46,7 +46,7 @@ def jsons_to_list_of_lists(json_list):
46
46
  return [[d['token'], d['tags']] for d in json_list]
47
47
 
48
48
  def combine_tags(sentence):
49
- output = jsons_to_list_of_lists(extract(sentence))
49
+ output = jsons_to_list_of_lists(extract(sentence, "nested"))
50
50
  return [word[1] for word in output]
51
51
 
52
52
 
@@ -46,7 +46,7 @@ Examples:
46
46
  """
47
47
 
48
48
  import argparse
49
- from sinatools.utils.jaccard import jaccard
49
+ from sinatools.utils.similarity import get_jaccard
50
50
  from sinatools.utils.readfile import read_file
51
51
 
52
52
 
@@ -76,7 +76,7 @@ def main():
76
76
  print("Either --file1 and --file2 arguments or both --set1 and --set2 arguments must be provided.")
77
77
  return
78
78
 
79
- similarity = jaccard(args.delimiter, set1, set2, args.selection, args.ignoreAllDiacriticsButNotShadda, args.ignoreShaddaDiacritic)
79
+ similarity = get_jaccard(args.delimiter, set1, set2, args.selection, args.ignoreAllDiacriticsButNotShadda, args.ignoreShaddaDiacritic)
80
80
 
81
81
  print("Jaccard Result:", similarity)
82
82
 
@@ -15,8 +15,8 @@ urls = {
15
15
  'four_grams':'https://sina.birzeit.edu/four_grams.pickle',
16
16
  'three_grams':'https://sina.birzeit.edu/three_grams.pickle',
17
17
  'two_grams':'https://sina.birzeit.edu/two_grams.pickle',
18
- 'synonyms_level2':'https://sina.birzeit.edu/synonyms_level2.pkl',
19
- 'synonyms_level3':'https://sina.birzeit.edu/synonyms_level3.pkl'
18
+ 'synonyms_level2':'https://sina.birzeit.edu/graph_l2.pkl',
19
+ 'synonyms_level3':'https://sina.birzeit.edu/graph_l3.pkl'
20
20
  }
21
21
 
22
22
  def get_appdatadir():
sinatools/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.25
1
+ 0.1.27
sinatools/ner/__init__.py CHANGED
@@ -7,6 +7,8 @@ import torch
7
7
  import pickle
8
8
  import json
9
9
  from argparse import Namespace
10
+ from transformers import pipeline
11
+ #from transformers import AutoModelForSequenceClassification
10
12
 
11
13
  tagger = None
12
14
  tag_vocab = None
@@ -35,4 +37,7 @@ if torch.cuda.is_available():
35
37
 
36
38
  train_config.trainer_config["kwargs"]["model"] = model
37
39
  tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
38
- tagger.load(os.path.join(model_path,"checkpoints"))
40
+ tagger.load(os.path.join(model_path,"checkpoints"))
41
+
42
+ pipe = pipeline("sentiment-analysis", model= os.path.join(path, "best_model"), return_all_scores =True, max_length=128, truncation=True)
43
+ #pipe = AutoModelForSequenceClassification.from_pretrained(os.path.join(path, "best_model"))
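For context, the module-level `pipe` created above is a standard transformers text-classification pipeline; with `return_all_scores=True` it returns, per input, a list of `{"label", "score"}` dicts, which is how the new relation extractor reads `predicted_relation[0][0]['score']`. A rough sketch, assuming the package is installed and the required model files have been downloaded:

```python
# Sketch: shape of the output of the module-level `pipe` defined above
# (a transformers pipeline created with return_all_scores=True).
from sinatools.ner import pipe

predicted = pipe("[CLS] sentence [SEP] event template argument")
scores = predicted[0]                       # list of {"label": ..., "score": ...} dicts
best = max(scores, key=lambda s: s["score"])
print(best["label"], best["score"])
```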
@@ -3,43 +3,31 @@ from collections import namedtuple
3
3
  from sinatools.ner.data_format import get_dataloaders, text2segments
4
4
  from . import tagger, tag_vocab, train_config
5
5
 
6
- def extract(text, batch_size=32):
7
- """
8
- This method processes an input text and returns named entites for each token within the text, based on the specified batch size. As follows:
9
6
 
10
- Args:
11
- text (:obj:`str`): The Arabic text to be tagged.
12
- batch_size (int, optional): Batch size for inference. Default is 32.
13
-
14
- Returns:
15
- list (:obj:`list`): A list of JSON objects, where each JSON could be contains:
16
- token: The token from the original text.
17
- NER tag: The label pairs for each segment.
18
-
19
- **Example:**
20
-
21
- .. highlight:: python
22
- .. code-block:: python
7
+ def convert_nested_to_flat(nested_tags):
8
+ flat_tags = []
9
+
10
+ for entry in nested_tags:
11
+ word = entry['token']
12
+ tags = entry['tags'].split()
13
+
14
+ # Initialize with the first tag in the sequence
15
+ flat_tag = tags[0]
16
+
17
+ for tag in tags[1:]:
18
+ # Check if the tag is an "I-" tag, indicating continuation of an entity
19
+ if tag.startswith('I-'):
20
+ flat_tag = tag
21
+ break
22
+
23
+ flat_tags.append({
24
+ 'token': word,
25
+ 'tags': flat_tag
26
+ })
27
+
28
+ return flat_tags
23
29
 
24
- from sinatools.ner.entity_extractor import extract
25
- extract('ذهب محمد إلى جامعة بيرزيت')
26
- [{
27
- "word":"ذهب",
28
- "tags":"O"
29
- },{
30
- "word":"محمد",
31
- "tags":"B-PERS"
32
- },{
33
- "word":"إلى",
34
- "tags":"O"
35
- },{
36
- "word":"جامعة",
37
- "tags":"B-ORG"
38
- },{
39
- "word":"بيرزيت",
40
- "tags":"B-GPE I-ORG"
41
- }]
42
- """
30
+ def extract(text, ner_method="nested"):
43
31
 
44
32
  dataset, token_vocab = text2segments(text)
45
33
 
@@ -50,7 +38,7 @@ def extract(text, batch_size=32):
50
38
  (dataset,),
51
39
  vocab,
52
40
  train_config.data_config,
53
- batch_size=batch_size,
41
+ batch_size=32,
54
42
  shuffle=(False,),
55
43
  )[0]
56
44
 
@@ -69,4 +57,7 @@ def extract(text, batch_size=32):
69
57
  else:
70
58
  segments_list["tags"] = ' '.join(list_of_tags)
71
59
  segments_lists.append(segments_list)
60
+
61
+ if ner_method == "flat":
62
+ segments_lists = convert_nested_to_flat(segments_lists)
72
63
  return segments_lists
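In short, `extract` loses its `batch_size` parameter (now fixed at 32) and gains a `ner_method` argument. A usage sketch based on the code above; the output keys `token` and `tags` are taken from the surrounding code:

```python
# Sketch: nested vs. flat tagging with the updated extract() signature.
from sinatools.ner.entity_extractor import extract

nested = extract("ذهب محمد إلى جامعة بيرزيت")          # default ner_method="nested"
flat = extract("ذهب محمد إلى جامعة بيرزيت", "flat")     # one tag per token

for item in flat:
    # Nested mode may return several space-separated tags per token;
    # flat mode keeps a single tag, preferring a continuation I- tag when present.
    print(item["token"], item["tags"])
```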
@@ -0,0 +1,201 @@
1
+ import torch
2
+ import json
3
+ from urllib.request import Request, urlopen
4
+ from sinatools.ner.entity_extractor import extract
5
+ from . import pipe
6
+
7
+
8
+ # ============================ Extract entities and their types ========================
9
+ def jsons_to_list_of_lists(json_list):
10
+ return [[d['token'], d['tags']] for d in json_list]
11
+
12
+ def entities_and_types(sentence):
13
+ output_list = jsons_to_list_of_lists(extract(sentence))
14
+ json_short = distill_entities(output_list)
15
+
16
+ entities = {}
17
+ for entity in json_short:
18
+ name = entity[0]
19
+ entity_type = entity[1]
20
+ entities[name] = entity_type
21
+
22
+ return entities
23
+
24
+ def distill_entities(entities):
25
+ # Output list that will hold the distilled entities
26
+ list_output = list()
27
+
28
+ # Sort the tags of each entity and keep the result in temp_entities
29
+ temp_entities = sortTags(entities)
30
+
31
+ # Working list used to build the output
32
+ temp_list = list()
33
+
34
+ # Initialize temp_list
35
+ temp_list.append(["", "", 0, 0])
36
+ word_position = 0
37
+
38
+ # For each entity, convert its IOB tags into distilled entity spans
39
+ for entity in temp_entities:
40
+ # Counter over the tags of this entity
41
+ counter_tag = 0
42
+ # For each tag
43
+ for tag in str(entity[1]).split():
44
+ # If the tag counter reaches the length of temp_list, append an empty slot
45
+ if counter_tag >= len(temp_list):
46
+ temp_list.append(["", "", 0, 0])
47
+
48
+ # If the tag is O and this is not the first word position, flush every
49
+ # non-empty slot of temp_list into the output list
50
+ if "O" == tag and word_position != 0:
51
+ for j in range(0, len(temp_list)):
52
+ if temp_list[j][1] != "":
53
+ list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
54
+ temp_list[j][0] = ""
55
+ temp_list[j][1] = ""
56
+ temp_list[j][2] = word_position
57
+ temp_list[j][3] = word_position
58
+ # If the tag is not O, splits on '-' into two parts, and the first part
59
+ # is 'B' (the beginning of a new entity)
60
+ elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
61
+ # If the current slot of temp_list is not empty, flush it to the output list,
62
+ # then re-initialize the slot with this word and tag
63
+ if temp_list[counter_tag][1] != "":
64
+ list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
65
+ temp_list[counter_tag][0] = str(entity[0]) + " "
66
+ temp_list[counter_tag][1] = str(tag).split("-")[1]
67
+ temp_list[counter_tag][2] = word_position
68
+ temp_list[counter_tag][3] = word_position
69
+
70
+ # If the tag is not O, splits on '-' into two parts, and the first part
71
+ # is 'I' (the continuation of an entity)
72
+ elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
73
+ # Scan temp_list from the current slot: if a slot holds the same tag type, extend it;
74
+ # otherwise flush that slot to the output list and check the next one
75
+ for j in range(counter_tag,len(temp_list)):
76
+ if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
77
+ temp_list[j][0] += str(entity[0]) + " "
78
+ temp_list[j][3] += 1
79
+ break
80
+ else:
81
+ if temp_list[j][1] != "":
82
+ list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
83
+ temp_list[j][0] = ""
84
+ temp_list[j][1] = ""
85
+ temp_list[j][2] = word_position
86
+ temp_list[j][3] = word_position
87
+ counter_tag += 1
88
+ word_position += 1
89
+ # After the main loop, temp_list may still hold unflushed entities;
90
+ # append them to the output list
91
+ for j in range(0, len(temp_list)):
92
+ if temp_list[j][1] != "":
93
+ list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
94
+ return sorted(list_output, key=lambda x: (x[2]))
95
+
96
+ def sortTags(entities):
97
+ temp_entities = entities
98
+ temp_counter = 0
99
+ # For each entity, sort its tags: first check whether the previous entity
100
+ # carries more tags of the same type, then sort the tags and align their order
101
+ for entity in temp_entities:
102
+ tags = entity[1].split()
103
+ for tag in tags:
104
+ # Only compare against a previous entity if one exists
105
+ if temp_counter != 0:
106
+ # If the tag starts with I-, count how many tags of this type appear in the
107
+ # current tag list and in the previous entity's tag list
108
+ if "I-" == tag[0:2]:
109
+ counter_of_this_tag = 0
110
+ counter_of_previous_tag = 0
111
+ for word in tags:
112
+ if tag.split("-")[1] in word:
113
+ counter_of_this_tag+=1
114
+ for word in temp_entities[temp_counter-1][1].split():
115
+ if tag.split("-")[1] in word:
116
+ counter_of_previous_tag+=1
117
+ # If the previous entity has more tags of this type than the current one,
118
+ # append another I- tag to the current tag list
119
+ if counter_of_previous_tag > counter_of_this_tag:
120
+ tags.append("I-"+tag.split("-")[1])
121
+ # Sort the tags
122
+ tags.sort()
123
+ # Reverse the sorted tags so the sequence begins with I- tags
124
+ tags.reverse()
125
+ # Again, only reorder if there is a previous entity to compare against
126
+ if temp_counter != 0:
127
+ this_tags = tags
128
+ previous_tags = temp_entities[temp_counter - 1][1].split()
129
+ sorted_tags = list()
130
+
131
+ # Only reorder when neither the current nor the previous tag list contains O;
132
+ # otherwise leave this tag list unchanged
133
+ if "O" not in this_tags and "O" not in previous_tags:
134
+ index = 0
135
+ # Align I- tags with the order of the previous entity's tags; B- tags are left in place
136
+ for i in previous_tags:
137
+ j = 0
138
+ while this_tags and j < len(this_tags):
139
+ if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
140
+ sorted_tags.insert(index, this_tags.pop(j))
141
+ break
142
+ elif this_tags[j][0:2] == "B-":
143
+ break
144
+ j += 1
145
+ index += 1
146
+ sorted_tags += this_tags
147
+ tags = sorted_tags
148
+ str_tag = " "
149
+ str_tag = str_tag.join(tags)
150
+ str_tag = str_tag.strip()
151
+ temp_entities[temp_counter][1] = str_tag
152
+ temp_counter += 1
153
+ return temp_entities
154
+
155
+ # ============= Prepare Templates and Catergorize Extracted Entities ================
156
+ temp03={'location':'مكان حدوث','agent':'أحد المتأثرين في','happened at':'تاريخ حدوث'}
157
+ categories = {
158
+ 'agent': ['PERS', 'NORP', 'OCC', 'ORG'],
159
+ 'location': ['LOC', 'FAC', 'GPE'],
160
+ 'happened at': ['DATE', 'TIME']
161
+ }
162
+
163
+ def get_entity_category(entity_type, categories):
164
+ for category, types in categories.items():
165
+ if entity_type in types:
166
+ return category
167
+ return None
168
+
169
+
170
+ # ============ Extract entities, their types and categorize them ===============
171
+ def relation_extraction(sentence):
172
+ #test_sentence="صورة إعتقال طفل فلسطيني خلال انتفاضة الأقصى ."
173
+ entities=entities_and_types(sentence)
174
+
175
+ event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type == 'EVENT']
176
+ arg_event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type != 'EVENT']
177
+
178
+ output_list=[]
179
+
180
+ for i in event_indices:
181
+ event_entity=list(entities.keys())[i]
182
+ for j in arg_event_indices:
183
+ arg_name= list(entities.keys())[j]
184
+ arg_type=entities[arg_name]
185
+ category = get_entity_category(arg_type, categories)
186
+
187
+ if category in temp03:
188
+ relation_sentence=f"[CLS] {sentence} [SEP] {event_entity} {temp03[category]} {arg_name}"
189
+ predicted_relation=pipe(relation_sentence)
190
+ score = predicted_relation[0][0]['score']
191
+ if score > 0.50:
192
+ #print(f"Event:{event_entity} Relation:{category} Argument:{arg_name}\n")
193
+ #output_list.append([{event_entity} ,{category}, {arg_name}])
194
+ output_list.append(f"Event:{event_entity}, Relation:{category}, Argument:{arg_name}")
195
+
196
+ else:
197
+ #print(f"Event:{event_entity} Relation:No relation Argument:{arg_name}\n")
198
+ #output_list.append([{event_entity} ,'No relation', {arg_name}])
199
+ output_list.append(f"Event:{event_entity}, Relation:No relation, Argument:{arg_name}")
200
+
201
+ return output_list
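A usage sketch for the new module (module path taken from the RECORD entry above; the NER model and the relation classification model must already be downloaded):

```python
# Sketch: extract event-argument relations from one sentence.
from sinatools.ner.relation_extractor import relation_extraction

sentence = "صورة إعتقال طفل فلسطيني خلال انتفاضة الأقصى ."
for triple in relation_extraction(sentence):
    print(triple)   # e.g. "Event:..., Relation:location, Argument:..."
```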
@@ -1,117 +1,117 @@
1
- import os
2
- import torch
3
- import logging
4
- import natsort
5
- import glob
6
-
7
- logger = logging.getLogger(__name__)
8
-
9
-
10
- class BaseTrainer:
11
- def __init__(
12
- self,
13
- model=None,
14
- max_epochs=50,
15
- optimizer=None,
16
- scheduler=None,
17
- loss=None,
18
- train_dataloader=None,
19
- val_dataloader=None,
20
- test_dataloader=None,
21
- log_interval=10,
22
- summary_writer=None,
23
- output_path=None,
24
- clip=5,
25
- patience=5
26
- ):
27
- self.model = model
28
- self.max_epochs = max_epochs
29
- self.train_dataloader = train_dataloader
30
- self.val_dataloader = val_dataloader
31
- self.test_dataloader = test_dataloader
32
- self.optimizer = optimizer
33
- self.scheduler = scheduler
34
- self.loss = loss
35
- self.log_interval = log_interval
36
- self.summary_writer = summary_writer
37
- self.output_path = output_path
38
- self.current_timestep = 0
39
- self.current_epoch = 0
40
- self.clip = clip
41
- self.patience = patience
42
-
43
- def tag(self, dataloader, is_train=True):
44
- """
45
- Given a dataloader containing segments, predict the tags
46
- :param dataloader: torch.utils.data.DataLoader
47
- :param is_train: boolean - True for training model, False for evaluation
48
- :return: Iterator
49
- subwords (B x T x NUM_LABELS)- torch.Tensor - BERT subword ID
50
- gold_tags (B x T x NUM_LABELS) - torch.Tensor - ground truth tags IDs
51
- tokens - List[arabiner.data.dataset.Token] - list of tokens
52
- valid_len (B x 1) - int - valiud length of each sequence
53
- logits (B x T x NUM_LABELS) - logits for each token and each tag
54
- """
55
- for subwords, gold_tags, tokens, valid_len in dataloader:
56
- self.model.train(is_train)
57
-
58
- if torch.cuda.is_available():
59
- subwords = subwords.cuda()
60
- gold_tags = gold_tags.cuda()
61
-
62
- if is_train:
63
- self.optimizer.zero_grad()
64
- logits = self.model(subwords)
65
- else:
66
- with torch.no_grad():
67
- logits = self.model(subwords)
68
-
69
- yield subwords, gold_tags, tokens, valid_len, logits
70
-
71
- def segments_to_file(self, segments, filename):
72
- """
73
- Write segments to file
74
- :param segments: [List[arabiner.data.dataset.Token]] - list of list of tokens
75
- :param filename: str - output filename
76
- :return: None
77
- """
78
- with open(filename, "w") as fh:
79
- results = "\n\n".join(["\n".join([t.__str__() for t in segment]) for segment in segments])
80
- fh.write("Token\tGold Tag\tPredicted Tag\n")
81
- fh.write(results)
82
- logging.info("Predictions written to %s", filename)
83
-
84
- def save(self):
85
- """
86
- Save model checkpoint
87
- :return:
88
- """
89
- filename = os.path.join(
90
- self.output_path,
91
- "checkpoints",
92
- "checkpoint_{}.pt".format(self.current_epoch),
93
- )
94
-
95
- checkpoint = {
96
- "model": self.model.state_dict(),
97
- "optimizer": self.optimizer.state_dict(),
98
- "epoch": self.current_epoch
99
- }
100
-
101
- logger.info("Saving checkpoint to %s", filename)
102
- torch.save(checkpoint, filename)
103
-
104
- def load(self, checkpoint_path):
105
- """
106
- Load model checkpoint
107
- :param checkpoint_path: str - path/to/checkpoints
108
- :return: None
109
- """
110
- checkpoint_path = natsort.natsorted(glob.glob(f"{checkpoint_path}/checkpoint_*.pt"))
111
- checkpoint_path = checkpoint_path[-1]
112
-
113
- logger.info("Loading checkpoint %s", checkpoint_path)
114
-
115
- device = None if torch.cuda.is_available() else torch.device('cpu')
116
- checkpoint = torch.load(checkpoint_path, map_location=device)
117
- self.model.load_state_dict(checkpoint["model"])
1
+ import os
2
+ import torch
3
+ import logging
4
+ import natsort
5
+ import glob
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+
10
+ class BaseTrainer:
11
+ def __init__(
12
+ self,
13
+ model=None,
14
+ max_epochs=50,
15
+ optimizer=None,
16
+ scheduler=None,
17
+ loss=None,
18
+ train_dataloader=None,
19
+ val_dataloader=None,
20
+ test_dataloader=None,
21
+ log_interval=10,
22
+ summary_writer=None,
23
+ output_path=None,
24
+ clip=5,
25
+ patience=5
26
+ ):
27
+ self.model = model
28
+ self.max_epochs = max_epochs
29
+ self.train_dataloader = train_dataloader
30
+ self.val_dataloader = val_dataloader
31
+ self.test_dataloader = test_dataloader
32
+ self.optimizer = optimizer
33
+ self.scheduler = scheduler
34
+ self.loss = loss
35
+ self.log_interval = log_interval
36
+ self.summary_writer = summary_writer
37
+ self.output_path = output_path
38
+ self.current_timestep = 0
39
+ self.current_epoch = 0
40
+ self.clip = clip
41
+ self.patience = patience
42
+
43
+ def tag(self, dataloader, is_train=True):
44
+ """
45
+ Given a dataloader containing segments, predict the tags
46
+ :param dataloader: torch.utils.data.DataLoader
47
+ :param is_train: boolean - True for training model, False for evaluation
48
+ :return: Iterator
49
+ subwords (B x T x NUM_LABELS)- torch.Tensor - BERT subword ID
50
+ gold_tags (B x T x NUM_LABELS) - torch.Tensor - ground truth tags IDs
51
+ tokens - List[arabiner.data.dataset.Token] - list of tokens
52
+ valid_len (B x 1) - int - valid length of each sequence
53
+ logits (B x T x NUM_LABELS) - logits for each token and each tag
54
+ """
55
+ for subwords, gold_tags, tokens, valid_len in dataloader:
56
+ self.model.train(is_train)
57
+
58
+ if torch.cuda.is_available():
59
+ subwords = subwords.cuda()
60
+ gold_tags = gold_tags.cuda()
61
+
62
+ if is_train:
63
+ self.optimizer.zero_grad()
64
+ logits = self.model(subwords)
65
+ else:
66
+ with torch.no_grad():
67
+ logits = self.model(subwords)
68
+
69
+ yield subwords, gold_tags, tokens, valid_len, logits
70
+
71
+ def segments_to_file(self, segments, filename):
72
+ """
73
+ Write segments to file
74
+ :param segments: [List[arabiner.data.dataset.Token]] - list of list of tokens
75
+ :param filename: str - output filename
76
+ :return: None
77
+ """
78
+ with open(filename, "w") as fh:
79
+ results = "\n\n".join(["\n".join([t.__str__() for t in segment]) for segment in segments])
80
+ fh.write("Token\tGold Tag\tPredicted Tag\n")
81
+ fh.write(results)
82
+ logging.info("Predictions written to %s", filename)
83
+
84
+ def save(self):
85
+ """
86
+ Save model checkpoint
87
+ :return:
88
+ """
89
+ filename = os.path.join(
90
+ self.output_path,
91
+ "checkpoints",
92
+ "checkpoint_{}.pt".format(self.current_epoch),
93
+ )
94
+
95
+ checkpoint = {
96
+ "model": self.model.state_dict(),
97
+ "optimizer": self.optimizer.state_dict(),
98
+ "epoch": self.current_epoch
99
+ }
100
+
101
+ logger.info("Saving checkpoint to %s", filename)
102
+ torch.save(checkpoint, filename)
103
+
104
+ def load(self, checkpoint_path):
105
+ """
106
+ Load model checkpoint
107
+ :param checkpoint_path: str - path/to/checkpoints
108
+ :return: None
109
+ """
110
+ checkpoint_path = natsort.natsorted(glob.glob(f"{checkpoint_path}/checkpoint_*.pt"))
111
+ checkpoint_path = checkpoint_path[-1]
112
+
113
+ logger.info("Loading checkpoint %s", checkpoint_path)
114
+
115
+ device = None if torch.cuda.is_available() else torch.device('cpu')
116
+ checkpoint = torch.load(checkpoint_path, map_location=device)
117
+ self.model.load_state_dict(checkpoint["model"], strict=False)
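The only functional change in this file is the final line: the checkpoint is now loaded with `strict=False`, so state dicts with missing or unexpected keys no longer raise. A small, generic PyTorch sketch of that behaviour:

```python
# Sketch: strict=False reports key mismatches instead of raising.
import torch

model = torch.nn.Linear(4, 2)
partial_state = {"weight": torch.zeros(2, 4)}   # "bias" deliberately missing

result = model.load_state_dict(partial_state, strict=False)
print(result.missing_keys)                       # ['bias']
print(result.unexpected_keys)                    # []
```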
@@ -3,7 +3,7 @@ from sinatools.DataDownload import downloader
3
3
  import os
4
4
 
5
5
  synonyms_level2_dict = {}
6
- level2_dict = 'synonyms_level2.pkl'
6
+ level2_dict = 'graph_l2.pkl'
7
7
  path = downloader.get_appdatadir()
8
8
  file_path = os.path.join(path, level2_dict)
9
9
  with open(file_path, 'rb') as f:
@@ -11,7 +11,7 @@ with open(file_path, 'rb') as f:
11
11
 
12
12
 
13
13
  synonyms_level3_dict = {}
14
- level3_dict = 'synonyms_level3.pkl'
14
+ level3_dict = 'graph_l3.pkl'
15
15
  path = downloader.get_appdatadir()
16
16
  file_path = os.path.join(path, level3_dict)
17
17
  with open(file_path, 'rb') as f:
@@ -0,0 +1,149 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from sinatools.utils.parser import arStrip
4
+ from sinatools.utils.implication import Implication
5
+ import argparse
6
+
7
+ def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
8
+ if ignore_all_diacritics_but_not_shadda:
9
+ word = arStrip(word, True, True, False, False, False, False)
10
+
11
+ if ignore_shadda_diacritic:
12
+ word = arStrip(word, False, False, True, False, False, False)
13
+
14
+ return word
15
+
16
+
17
+ def get_preferred_word(word1, word2):
18
+ implication = Implication(word1, word2)
19
+
20
+ direction = implication.get_direction()
21
+
22
+ if direction in (0, 2):
23
+ return word1
24
+
25
+ elif direction == 1:
26
+ return word2
27
+
28
+ elif direction == 3:
29
+ if not word1.endswith("َ") and not word1.endswith("ُ"):
30
+ return word2
31
+ return word1
32
+
33
+
34
+ def get_non_preferred_word(word1, word2):
35
+
36
+ implication = Implication(word1, word2)
37
+ if implication.get_distance() < 15:
38
+ direction = implication.get_direction()
39
+ if direction == 0 or direction == 1:
40
+ return word1
41
+ elif direction == 2:
42
+ return word2
43
+ elif direction == 3:
44
+ if not word1.endswith("َ") and not word1.endswith("ُ"):
45
+ return word1
46
+ return word2
47
+ return "#"
48
+
49
+ def get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda=False, ignore_shadda_diacritic=False):
50
+
51
+ list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
52
+ list1 = [str(i.strip()) for i in list1]
53
+
54
+ list2 = [str(i) for i in list2 if i not in (None, ' ', '')]
55
+ list2 = [str(i.strip()) for i in list2]
56
+
57
+ interection_list = []
58
+
59
+ for list1_word in list1:
60
+ for list2_word in list2:
61
+ word1 = normalize_word(list1_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
62
+ word2 = normalize_word(list2_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
63
+
64
+ implication = Implication(word1, word2)
65
+ if implication.get_direction() >= 0 and implication.get_distance() < 15:
66
+ interection_list.append(get_preferred_word(word1, word2))
67
+
68
+ i = 0
69
+ while i < len(interection_list):
70
+ j = i + 1
71
+ while j < len(interection_list):
72
+ non_preferred_word = get_non_preferred_word(interection_list[i], interection_list[j])
73
+ if non_preferred_word != "#":
74
+ interection_list.remove(non_preferred_word)
75
+ j += 1
76
+ i += 1
77
+
78
+ return interection_list
79
+
80
+
81
+
82
+ def get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic):
83
+
84
+ list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
85
+
86
+ list2 = [str(i) for i in list2 if i not in (None, ' ', '')]
87
+
88
+ union_list = []
89
+
90
+ for list1_word in list1:
91
+ word1 = normalize_word(list1_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
92
+ union_list.append(word1)
93
+
94
+ for list2_word in list2:
95
+ word2 = normalize_word(list2_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
96
+ union_list.append(word2)
97
+
98
+ i = 0
99
+ while i < len(union_list):
100
+ j = i + 1
101
+ while j < len(union_list):
102
+ non_preferred_word = get_non_preferred_word(union_list[i], union_list[j])
103
+ if (non_preferred_word != "#"):
104
+ union_list.remove(non_preferred_word)
105
+ j = j + 1
106
+ i = i + 1
107
+
108
+ return union_list
109
+
110
+
111
+
112
+ def get_jaccard_similarity(list1: list, list2: list, ignore_all_diacritics_but_not_shadda: bool, ignore_shadda_diacritic: bool) -> float:
113
+
114
+ intersection_list = get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
115
+
116
+ union_list = get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
117
+
118
+ return float(len(intersection_list)) / float(len(union_list))
119
+
120
+ def get_jaccard(delimiter, str1, str2, selection, ignoreAllDiacriticsButNotShadda=True, ignoreShaddaDiacritic=True):
121
+
122
+ try:
123
+ list1 = str1.split(delimiter)
124
+ list2 = str2.split(delimiter)
125
+
126
+ if selection == "intersection":
127
+ intersection = get_intersection(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
128
+ return intersection
129
+ elif selection == "union":
130
+ union = get_union(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
131
+ return union
132
+ elif selection == "jaccardSimilarity":
133
+ similarity = get_jaccard_similarity(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
134
+ return similarity
135
+ elif selection == "jaccardAll":
136
+ intersection = get_intersection(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
137
+ union = get_union(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
138
+ similarity = get_jaccard_similarity(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
139
+ output_list = ["intersection:", intersection, "union:", union, "similarity:", similarity]
140
+ return output_list
141
+ else:
142
+ return 'Invalid selection option'
143
+
144
+ except AttributeError as ae:
145
+ print(f"Attribute error occurred: {str(ae)}")
146
+ return 'Invalid input type'
147
+ except Exception as e:
148
+ print(f"Error occurred: {str(e)}")
149
+ return 'An error has occurred'
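A usage sketch for the new `get_jaccard` entry point defined above (the same function the jaccard CLI now imports); `selection` accepts "intersection", "union", "jaccardSimilarity", or "jaccardAll":

```python
# Sketch: compare two delimiter-separated word lists.
from sinatools.utils.similarity import get_jaccard

str1 = "ذهب|جامعة"
str2 = "ذَهَبَ|مدرسة"

print(get_jaccard("|", str1, str2, "jaccardSimilarity"))   # float in [0, 1]
print(get_jaccard("|", str1, str2, "jaccardAll"))          # intersection, union, similarity
```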
sinatools/wsd/__init__.py CHANGED
@@ -4,7 +4,7 @@ from sinatools.DataDownload import downloader
4
4
  import os
5
5
 
6
6
  glosses_dic = {}
7
- filename = 'glosses_dic.pickle'
7
+ filename = 'one_gram.pickle'
8
8
  path =downloader.get_appdatadir()
9
9
  file_path = os.path.join(path, filename)
10
10
  with open(file_path, 'rb') as f:
@@ -217,7 +217,7 @@ def jsons_to_list_of_lists(json_list):
217
217
  def find_named_entities(string):
218
218
  found_entities = []
219
219
 
220
- ner_entites = extract(string)
220
+ ner_entites = extract(string, "nested")
221
221
  list_of_entites = jsons_to_list_of_lists(ner_entites)
222
222
  entites = distill_entities(list_of_entites)
223
223
 
@@ -288,17 +288,17 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
288
288
  concept_id, gloss = GlossPredictor(Diac_lemma, Undiac_lemma,word,sentence,glosses_dictionary)
289
289
 
290
290
  my_json = {}
291
- my_json['Concept_id'] = concept_id
291
+ my_json['concept_id'] = concept_id
292
292
  # my_json['Gloss'] = gloss
293
293
  my_json['word'] = word
294
- my_json['Undiac_lemma'] = Undiac_lemma
295
- my_json['Diac_lemma'] = Diac_lemma
294
+ #my_json['Undiac_lemma'] = Undiac_lemma
295
+ my_json['lemma'] = Diac_lemma
296
296
  return my_json
297
297
  else:
298
298
  my_json = {}
299
299
  my_json['word'] = word
300
- my_json['Undiac_lemma'] = Undiac_lemma
301
- my_json['Diac_lemma'] = Diac_lemma
300
+ #my_json['Undiac_lemma'] = Undiac_lemma
301
+ my_json['lemma'] = Diac_lemma
302
302
  return my_json
303
303
 
304
304
 
@@ -405,26 +405,26 @@ def disambiguate_glosses_main(word, sentence):
405
405
  if concept_count == 0:
406
406
  my_json = {}
407
407
  my_json['word'] = word['word']
408
- my_json['Diac_lemma'] = word['Diac_lemma']
409
- my_json['Undiac_lemma'] = word['Undiac_lemma']
408
+ my_json['lemma'] = word['Diac_lemma']
409
+ #my_json['Undiac_lemma'] = word['Undiac_lemma']
410
410
  return my_json
411
411
  elif concept_count == 1:
412
412
  my_json = {}
413
413
  my_json['word'] = word['word']
414
414
  glosses = word['glosses'][0]
415
415
  # my_json['Gloss'] = glosses['gloss']
416
- my_json['Concept_id'] = glosses['concept_id']
417
- my_json['Diac_lemma'] = word['Diac_lemma']
418
- my_json['Undiac_lemma'] = word['Undiac_lemma']
416
+ my_json['concept_id'] = glosses['concept_id']
417
+ my_json['lemma'] = word['Diac_lemma']
418
+ #my_json['Undiac_lemma'] = word['Undiac_lemma']
419
419
  return my_json
420
420
  elif concept_count == '*':
421
421
  my_json = {}
422
422
  my_json['word'] = word['word']
423
423
  glosses = word['glosses'][0]
424
424
  my_json['Gloss'] = glosses['gloss']
425
- my_json['Concept_id'] = glosses['concept_id']
426
- my_json['Diac_lemma'] = word['Diac_lemma']
427
- my_json['Undiac_lemma'] = word['Undiac_lemma']
425
+ my_json['concept_id'] = glosses['concept_id']
426
+ my_json['lemma'] = word['Diac_lemma']
427
+ #my_json['Undiac_lemma'] = word['Undiac_lemma']
428
428
  return my_json
429
429
  else:
430
430
  input_word = word['word']
@@ -477,21 +477,18 @@ def disambiguate(sentence):
477
477
  #output
478
478
  [
479
479
  {
480
- "Concept_id": "303019218",
480
+ "concept_id": "303019218",
481
481
  "word": "ذهبت",
482
- "Undiac_lemma": "ذهب",
483
- "Diac_lemma": "ذَهَبَ۪ 1"
482
+ "lemma": "ذَهَبَ۪ 1"
484
483
  },
485
484
  {
486
485
  "word": "إلى",
487
- "Diac_lemma": إِلَى 1,
488
- "Undiac_lemma": "الى"
486
+ "lemma": "إِلَى 1"
489
487
  },
490
488
  {
491
489
  "word": "جامعة بيرزيت",
492
- "Concept_id": "334000099",
493
- "Diac_lemma": جامِعَة بيرزَيت,
494
- "Undiac_lemma": "جامعة بيرزيت"
490
+ "concept_id": "334000099",
491
+ "lemma": "جامِعَة بيرزَيت"
495
492
  }
496
493
  ]
497
494
  """
@@ -1,34 +0,0 @@
1
- Metadata-Version: 2.1
2
- Name: SinaTools
3
- Version: 0.1.25
4
- Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
5
- Home-page: https://github.com/SinaLab/sinatools
6
- License: MIT license
7
- Keywords: sinatools
8
- Platform: UNKNOWN
9
- Description-Content-Type: text/markdown
10
- Requires-Dist: six
11
- Requires-Dist: farasapy
12
- Requires-Dist: tqdm
13
- Requires-Dist: requests
14
- Requires-Dist: regex
15
- Requires-Dist: pathlib
16
- Requires-Dist: torch (==1.13.0)
17
- Requires-Dist: transformers (==4.24.0)
18
- Requires-Dist: torchtext (==0.14.0)
19
- Requires-Dist: torchvision (==0.14.0)
20
- Requires-Dist: seqeval (==1.2.2)
21
- Requires-Dist: natsort (==7.1.1)
22
-
23
- SinaTools
24
- ---------
25
-
26
- Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
27
-
28
- Python APIs, command lines, colabs, and online demos.
29
-
30
- * Free software: MIT license
31
- * Documentation: https://sina.birzeit.edu/sinatools/
32
-
33
-
34
-