SinaTools 0.1.25__py2.py3-none-any.whl → 0.1.27__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- SinaTools-0.1.27.dist-info/METADATA +64 -0
- {SinaTools-0.1.25.dist-info → SinaTools-0.1.27.dist-info}/RECORD +22 -20
- sinatools/CLI/DataDownload/download_files.py +18 -3
- sinatools/CLI/ner/corpus_entity_extractor.py +1 -1
- sinatools/CLI/ner/entity_extractor.py +1 -1
- sinatools/CLI/utils/jaccard.py +2 -2
- sinatools/DataDownload/downloader.py +2 -2
- sinatools/VERSION +1 -1
- sinatools/ner/__init__.py +6 -1
- sinatools/ner/entity_extractor.py +27 -36
- sinatools/ner/relation_extractor.py +201 -0
- sinatools/ner/trainers/BaseTrainer.py +117 -117
- sinatools/synonyms/__init__.py +2 -2
- sinatools/utils/similarity.py +149 -0
- sinatools/wsd/__init__.py +1 -1
- sinatools/wsd/disambiguator.py +19 -22
- SinaTools-0.1.25.dist-info/METADATA +0 -34
- {SinaTools-0.1.25.data → SinaTools-0.1.27.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.25.dist-info → SinaTools-0.1.27.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.25.dist-info → SinaTools-0.1.27.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.25.dist-info → SinaTools-0.1.27.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.25.dist-info → SinaTools-0.1.27.dist-info}/entry_points.txt +0 -0
- {SinaTools-0.1.25.dist-info → SinaTools-0.1.27.dist-info}/top_level.txt +0 -0
SinaTools-0.1.27.dist-info/METADATA
ADDED
@@ -0,0 +1,64 @@
+Metadata-Version: 2.1
+Name: SinaTools
+Version: 0.1.27
+Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
+Home-page: https://github.com/SinaLab/sinatools
+License: MIT license
+Keywords: sinatools
+Platform: UNKNOWN
+Description-Content-Type: text/markdown
+Requires-Dist: six
+Requires-Dist: farasapy
+Requires-Dist: tqdm
+Requires-Dist: requests
+Requires-Dist: regex
+Requires-Dist: pathlib
+Requires-Dist: torch (==1.13.0)
+Requires-Dist: transformers (==4.24.0)
+Requires-Dist: torchtext (==0.14.0)
+Requires-Dist: torchvision (==0.14.0)
+Requires-Dist: seqeval (==1.2.2)
+Requires-Dist: natsort (==7.1.1)
+
+SinaTools
+======================
+Open Source Toolkit for Arabic NLP and NLU developed by [SinaLab](http://sina.birzeit.edu/) at Birzeit University. SinaTools is available through Python APIs, command lines, colabs, and online demos.
+
+See the full list of [Available Packages](https://sina.birzeit.edu/sinatools/), which include: (1) [Morphology Tagging](https://sina.birzeit.edu/sinatools/index.html#morph), (2) [Named Entity Recognition (NER)](https://sina.birzeit.edu/sinatools/index.html#ner), (3) [Word Sense Disambiguation (WSD)](https://sina.birzeit.edu/sinatools/index.html#wsd), (4) [Semantic Relatedness](https://sina.birzeit.edu/sinatools/index.html#sr), (5) [Synonymy Extraction and Evaluation](https://sina.birzeit.edu/sinatools/index.html#se), (6) [Relation Extraction](https://sina.birzeit.edu/sinatools/index.html), (7) [Utilities](https://sina.birzeit.edu/sinatools/index.html#u) (diacritic-based word matching, Jaccard similarly, parser, tokenizers, corpora processing, transliteration, etc).
+
+See [Demo Pages](https://sina.birzeit.edu/sinatools/).
+
+See the [benchmarking](https://www.jarrar.info/publications/HJK24.pdf), which shows that SinaTools outperformed all related toolkits.
+
+Installation
+--------
+To install SinaTools, ensure you are using Python version 3.10.8, then clone the [GitHub](git://github.com/SinaLab/SinaTools) repository.
+
+Alternatively, you can execute the following command:
+
+```bash
+pip install sinatools
+```
+
+Installing Models and Data Files
+--------
+Some modules in SinaTools require some data files and fine-tuned models to be downloaded. To download these models, please consult the [DataDownload](https://sina.birzeit.edu/sinatools/documentation/cli_tools/DataDownload/DataDownload.html).
+
+Documentation
+--------
+For information, please refer to the [main page](https://sina.birzeit.edu/sinatools) or the [online domuementation](https://sina.birzeit.edu/sinatools/documentation).
+
+Citation
+-------
+Tymaa Hammouda, Mustafa Jarrar, Mohammed Khalilia: [SinaTools: Open Source Toolkit for Arabic Natural Language Understanding](http://www.jarrar.info/publications/HJK24.pdf). In Proceedings of the 2024 AI in Computational Linguistics (ACLing 2024), Procedia Computer Science, Dubai. ELSEVIER.
+
+License
+--------
+SinaTools is available under the MIT License. See the [LICENSE](https://github.com/SinaLab/sinatools/blob/main/LICENSE) file for more information.
+
+Reporting Issues
+--------
+To report any issues or bugs, please contact us at "sina.institute.bzu@gmail.com" or visit [SinaTools Issues](https://github.com/SinaLab/sinatools/issues).
+
+
+
{SinaTools-0.1.25.dist-info → SinaTools-0.1.27.dist-info}/RECORD
CHANGED
@@ -1,26 +1,26 @@
-SinaTools-0.1.
-sinatools/VERSION,sha256=
+SinaTools-0.1.27.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+sinatools/VERSION,sha256=W9ZpgLhiag6deRMB5DzdHO6EDLvNbZPVIRIUv6nHT3U,6
 sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
 sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
 sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
 sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
-sinatools/CLI/DataDownload/download_files.py,sha256=
+sinatools/CLI/DataDownload/download_files.py,sha256=2jLohYd5Vjtz7gVd8zePhK6A_r0svpoV3ykbO5rwR8s,2613
 sinatools/CLI/morphology/ALMA_multi_word.py,sha256=ZImJ1vtcpSHydI1BjJmK3KcMJbGBZX16kO4L6rxvBvA,2086
 sinatools/CLI/morphology/morph_analyzer.py,sha256=ieIM47QK9Nct3MtCS9uq3h2rZN5r4qNhsLmlVeE6wiE,3503
-sinatools/CLI/ner/corpus_entity_extractor.py,sha256=
-sinatools/CLI/ner/entity_extractor.py,sha256=
+sinatools/CLI/ner/corpus_entity_extractor.py,sha256=Da-DHFrqT6if7w6WnodB4TBE5ze3DJYjb2Mmju_Qd7g,4034
+sinatools/CLI/ner/entity_extractor.py,sha256=IiTioe0px0aJ1E58FrDVa2yNgM8Ie4uS2LZKK_z2Qn4,2942
 sinatools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinatools/CLI/utils/arStrip.py,sha256=NLyp8vOu2xv80tL9jiKRvyptmbkRZVg-wcAr-9YyvNY,3264
 sinatools/CLI/utils/corpus_tokenizer.py,sha256=nH0T4h6urr_0Qy6-wN3PquOtnwybj0REde5Ts_OE4U8,1650
 sinatools/CLI/utils/implication.py,sha256=nvoiI5UHHaJdd6MICql0pB_-h3L0icYwP1WgJi2h7p0,2854
-sinatools/CLI/utils/jaccard.py,sha256=
+sinatools/CLI/utils/jaccard.py,sha256=w56N_cNEFJ0A7WtunmY_xtms4srFagKBzrW_0YhH2DE,4216
 sinatools/CLI/utils/remove_latin.py,sha256=NOaTm2RHxt5IQrV98ySTmD8rTXTmcqSmfbPAwTyaXqU,848
 sinatools/CLI/utils/remove_punctuation.py,sha256=vJAZlEn7WGftZAFVFYnddkRrxdJ_rMmKB9vFZkY-jN4,1097
 sinatools/CLI/utils/sentence_tokenizer.py,sha256=Wli8eiDbWSd_Z8UKpu_JkaS8jImowa1vnRL0oYCSfqw,2823
 sinatools/CLI/utils/text_dublication_detector.py,sha256=dW70O5O20GxeUDDF6zVYn52wWLmJF-HBZgvqIeVL2rQ,1661
 sinatools/CLI/utils/text_transliteration.py,sha256=vz-3kxWf8pNYVCqNAtBAiA6u_efrS5NtWT-ofN1NX6I,2014
 sinatools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sinatools/DataDownload/downloader.py,sha256=
+sinatools/DataDownload/downloader.py,sha256=6xH55WlDhgtImPRFQ0AaeDFJjL8OMNU29x61PL8mZ2w,6468
 sinatools/arabert/__init__.py,sha256=ely2PttjgSv7vKdzskuD1rtK_l_UOpmxJSz8isrveD0,16
 sinatools/arabert/preprocess.py,sha256=qI0FsuMTOzdRlYGCtLrjpXgikNElUZPv9bnjaKDZKJ4,33024
 sinatools/arabert/arabert/__init__.py,sha256=KbSAH-XqbRygn0y59m5-ZYOLXgpT1gSgE3F-qd4rKEc,627
@@ -76,13 +76,14 @@ sinatools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB
 sinatools/morphology/ALMA_multi_word.py,sha256=hj_-8ojrYYHnfCGk8WKtJdUR8mauzQdma4WUm-okDps,1346
 sinatools/morphology/__init__.py,sha256=I4wVBh8BhyNl-CySVdiI_nUSn6gj1j-gmLKP300RpE0,1216
 sinatools/morphology/morph_analyzer.py,sha256=tA78gWg6iaE_G1c2xqxZoXZWNbvHBJLrTSxPyir5Xn8,6941
-sinatools/ner/__init__.py,sha256=
+sinatools/ner/__init__.py,sha256=CLPaqUcvPGAA4lU-6hjAqjNfKJ5WtwRfsma6QkYZHEk,1379
 sinatools/ner/data.py,sha256=lvOW86dXse8SC75Q0supQaE0rrRffoxNjIA0Qbv5WZY,4354
 sinatools/ner/data_format.py,sha256=7Yt0aOicOn9_YuuyCkM_IYi_rgjGYxR9bCuUaNGM73o,4341
 sinatools/ner/datasets.py,sha256=mG1iwqSm3lXCFHLqE-b4wNi176cpuzNBz8tKaBU6z6M,5059
-sinatools/ner/entity_extractor.py,sha256=
+sinatools/ner/entity_extractor.py,sha256=bDInJYLC1z_kze5jtQFgQVLZ76tRzUK_FJxqETfVw70,1885
 sinatools/ner/helpers.py,sha256=dnOoDY5JMyOLTUWVIZLMt8mBn2IbWlVaqHhQyjs1voo,2343
 sinatools/ner/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
+sinatools/ner/relation_extractor.py,sha256=a85xGX6V72fDpJk0GKmmtlWf8S8ezY-2pm5oGc9_ESY,9750
 sinatools/ner/transforms.py,sha256=vti3mDdi-IRP8i0aTQ37QqpPlP9hdMmJ6_bAMa0uL-s,4871
 sinatools/ner/data/__init__.py,sha256=W0C1ge_XxTfmdEGz0hkclz57aLI5VFS5t6BjByCfkFk,57
 sinatools/ner/data/datasets.py,sha256=lcdDDenFMEKIGYQmfww2dk_9WKWrJO9HtKptaAEsRmY,5064
@@ -91,13 +92,13 @@ sinatools/ner/nn/BaseModel.py,sha256=3GmujQasTZZunOBuFXpY2p1W8W256iI_Uu4hxhOY2Z0
 sinatools/ner/nn/BertNestedTagger.py,sha256=_fwAn1kiKmXe6m5y16Ipty3kvXIEFEmiUq74Ad1818U,1219
 sinatools/ner/nn/BertSeqTagger.py,sha256=dFcBBiMw2QCWsyy7aQDe_PS3aRuNn4DOxKIHgTblFvc,504
 sinatools/ner/nn/__init__.py,sha256=UgQD_XLNzQGBNSYc_Bw1aRJZjq4PJsnMT1iZwnJemqE,170
-sinatools/ner/trainers/BaseTrainer.py,sha256=
+sinatools/ner/trainers/BaseTrainer.py,sha256=Ifz4SeTxJwVn1_uWZ3I9KbcSo2hLPN3ojsIYuoKE9wE,4050
 sinatools/ner/trainers/BertNestedTrainer.py,sha256=Pb4O2WeBmTvV3hHMT6DXjxrTzgtuh3OrKQZnogYy8RQ,8429
 sinatools/ner/trainers/BertTrainer.py,sha256=B_uVtUwfv_eFwMMPsKQvZgW_ZNLy6XEsX5ePR0s8d-k,6433
 sinatools/ner/trainers/__init__.py,sha256=UDok8pDDpYOpwRBBKVLKaOgSUlmqqb-zHZI1p0xPxzI,188
 sinatools/semantic_relatedness/__init__.py,sha256=S0xrmqtl72L02N56nbNMudPoebnYQgsaIyyX-587DsU,830
 sinatools/semantic_relatedness/compute_relatedness.py,sha256=JvI0cXgukKtuMpmAygMnlocCsPeAJ98LD1jZCP_6SyQ,1110
-sinatools/synonyms/__init__.py,sha256=
+sinatools/synonyms/__init__.py,sha256=yMuphNZrm5XLOR2T0weOHcUysJm-JKHUmVLoLQO8390,548
 sinatools/synonyms/synonyms_generator.py,sha256=FgAiuduSFyM6vJobWZKHg4KNWIQz8T6MGBPVIuVuw-8,6506
 sinatools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinatools/utils/charsets.py,sha256=rs82oZJqRqosZdTKXfFAJfJ5t4PxjMM_oAPsiWSWuwU,2817
@@ -105,18 +106,19 @@ sinatools/utils/implication.py,sha256=MsbI6S1LNY-fCxGMxFTuaV639r3QijkkdcfH48rvY7
 sinatools/utils/jaccard.py,sha256=S7OgvaMqkN5HFgTZkKhMCNAuAnQ0LhRyXPN79jAzmKM,10113
 sinatools/utils/parser.py,sha256=CPPtCrsbxUqsjhY5C9wTOgkAs6iw0k_WvMUxLEPM1IU,6168
 sinatools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
+sinatools/utils/similarity.py,sha256=oEdpcn3hm8v3qvNvr1USQ7eQlK0zU-7a7W090rCIFyw,5688
 sinatools/utils/text_dublication_detector.py,sha256=6yAOUtdw4TKiJkUPDDi3oK7CEoIuBDbliJ4PU7kapfo,4249
 sinatools/utils/text_transliteration.py,sha256=NQoXrxI-h0UXnvVtDA3skNJduxIy0IW26r46N4tDxGk,8766
 sinatools/utils/tokenizer.py,sha256=QHyrVqJA_On4rKxexiWR2ovq4pI1-u6iZkdhRbK9tew,6676
 sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
-sinatools/wsd/__init__.py,sha256=
-sinatools/wsd/disambiguator.py,sha256=
+sinatools/wsd/__init__.py,sha256=mwmCUurOV42rsNRpIUP3luG0oEzeTfEx3oeDl93Oif8,306
+sinatools/wsd/disambiguator.py,sha256=43Iq7NTZsiYWGFg-NUDrQuJKO1NT9QOnfBPB10IOJNs,19828
 sinatools/wsd/settings.py,sha256=6XflVTFKD8SVySX9Wj7zYQtV26WDTcQ2-uW8-gDNHKE,747
 sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
+SinaTools-0.1.27.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+SinaTools-0.1.27.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+SinaTools-0.1.27.dist-info/METADATA,sha256=jcBDhXCwjYOsgNkcEjQLG7l_kdWSHkEUpSsoMt7uI1c,3264
+SinaTools-0.1.27.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+SinaTools-0.1.27.dist-info/entry_points.txt,sha256=ZwZLolnWog2fjdDrfaHNHob8SE_YtMbD6ayzsOzItxs,1234
+SinaTools-0.1.27.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
+SinaTools-0.1.27.dist-info/RECORD,,
sinatools/CLI/DataDownload/download_files.py
CHANGED
@@ -40,7 +40,7 @@ from sinatools.DataDownload.downloader import urls

 def main():
     parser = argparse.ArgumentParser(description="Download files from specified URLs.")
-    parser.add_argument('-f', '--files', nargs="*",
+    parser.add_argument('-f', '--files', nargs="*",
                         help="Names of the files to download. Available files are: "
                         f"{', '.join(urls.keys())}. If no file is specified, all files will be downloaded.")

@@ -50,8 +50,23 @@ def main():

     if args.files:
         for file in args.files:
-
-
+            print("file: ", file)
+            if file == "wsd":
+                download_file(urls["morph"])
+                download_file(urls["ner"])
+                download_file(urls["wsd_model"])
+                download_file(urls["wsd_tokenizer"])
+                download_file(urls["one_gram"])
+                download_file(urls["five_grams"])
+                download_file(urls["four_grams"])
+                download_file(urls["three_grams"])
+                download_file(urls["two_grams"])
+            elif file == "synonyms":
+                download_file(urls["graph_l2"])
+                download_file(urls["graph_l3"])
+            else:
+                url = urls[file]
+                download_file(url)
     else:
         download_files()

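The new branches mean a single requested name can now fan out to several downloads: "wsd" pulls the morphology and NER models, the WSD model and tokenizer, and the n-gram pickles, while "synonyms" pulls the two synonym-graph pickles. A minimal sketch of that mapping, for orientation only (the `bundles` dict and `resolve` helper below are illustrative, not part of SinaTools; note also that downloader.py, later in this diff, still registers the synonym graphs under the keys 'synonyms_level2' and 'synonyms_level3', while the CLI looks up urls["graph_l2"] and urls["graph_l3"]):

```python
# Illustrative only: mirrors the branches added to download_files.py above.
bundles = {
    "wsd": ["morph", "ner", "wsd_model", "wsd_tokenizer",
            "one_gram", "five_grams", "four_grams", "three_grams", "two_grams"],
    "synonyms": ["graph_l2", "graph_l3"],
}

def resolve(requested):
    # Any other name falls through to a single key in the urls dict.
    return bundles.get(requested, [requested])

print(resolve("wsd"))       # nine model/data files
print(resolve("synonyms"))  # ['graph_l2', 'graph_l3']
print(resolve("morph"))     # ['morph']
```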
@@ -20,7 +20,7 @@ def jsons_to_list_of_lists(json_list):
     return [[d['token'], d['tags']] for d in json_list]

 def combine_tags(sentence):
-    output = jsons_to_list_of_lists(extract(sentence))
+    output = jsons_to_list_of_lists(extract(sentence, "nested"))
     return [word[1] for word in output]


@@ -46,7 +46,7 @@ def jsons_to_list_of_lists(json_list):
     return [[d['token'], d['tags']] for d in json_list]

 def combine_tags(sentence):
-    output = jsons_to_list_of_lists(extract(sentence))
+    output = jsons_to_list_of_lists(extract(sentence, "nested"))
     return [word[1] for word in output]


sinatools/CLI/utils/jaccard.py
CHANGED
@@ -46,7 +46,7 @@ Examples:
 """

 import argparse
-from sinatools.utils.
+from sinatools.utils.similarity import get_jaccard
 from sinatools.utils.readfile import read_file


@@ -76,7 +76,7 @@ def main():
         print("Either --file1 and --file2 arguments or both --set1 and --set2 arguments must be provided.")
         return

-    similarity =
+    similarity = get_jaccard(args.delimiter, set1, set2, args.selection, args.ignoreAllDiacriticsButNotShadda, args.ignoreShaddaDiacritic)

     print("Jaccard Result:", similarity)

sinatools/DataDownload/downloader.py
CHANGED
@@ -15,8 +15,8 @@ urls = {
     'four_grams':'https://sina.birzeit.edu/four_grams.pickle',
     'three_grams':'https://sina.birzeit.edu/three_grams.pickle',
     'two_grams':'https://sina.birzeit.edu/two_grams.pickle',
-    'synonyms_level2':'https://sina.birzeit.edu/
-    'synonyms_level3':'https://sina.birzeit.edu/
+    'synonyms_level2':'https://sina.birzeit.edu/graph_l2.pkl',
+    'synonyms_level3':'https://sina.birzeit.edu/graph_l3.pkl'
 }

 def get_appdatadir():
sinatools/VERSION
CHANGED
@@ -1 +1 @@
-0.1.
+0.1.27
sinatools/ner/__init__.py
CHANGED
@@ -7,6 +7,8 @@ import torch
 import pickle
 import json
 from argparse import Namespace
+from transformers import pipeline
+#from transformers import AutoModelForSequenceClassification

 tagger = None
 tag_vocab = None
@@ -35,4 +37,7 @@ if torch.cuda.is_available():

 train_config.trainer_config["kwargs"]["model"] = model
 tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
-tagger.load(os.path.join(model_path,"checkpoints"))
+tagger.load(os.path.join(model_path,"checkpoints"))
+
+pipe = pipeline("sentiment-analysis", model= os.path.join(path, "best_model"), return_all_scores =True, max_length=128, truncation=True)
+#pipe = AutoModelForSequenceClassification.from_pretrained(os.path.join(path, "best_model"))
sinatools/ner/entity_extractor.py
CHANGED
@@ -3,43 +3,31 @@ from collections import namedtuple
 from sinatools.ner.data_format import get_dataloaders, text2segments
 from . import tagger, tag_vocab, train_config

-def extract(text, batch_size=32):
-    """
-    This method processes an input text and returns named entites for each token within the text, based on the specified batch size. As follows:

-
-
-
-
-
-
-
-
-
-
-
-
-
+def convert_nested_to_flat(nested_tags):
+    flat_tags = []
+
+    for entry in nested_tags:
+        word = entry['token']
+        tags = entry['tags'].split()
+
+        # Initialize with the first tag in the sequence
+        flat_tag = tags[0]
+
+        for tag in tags[1:]:
+            # Check if the tag is an "I-" tag, indicating continuation of an entity
+            if tag.startswith('I-'):
+                flat_tag = tag
+                break
+
+        flat_tags.append({
+            'token': word,
+            'tags': flat_tag
+        })
+
+    return flat_tags

-
-    extract('ذهب محمد إلى جامعة بيرزيت')
-    [{
-    "word":"ذهب",
-    "tags":"O"
-    },{
-    "word":"محمد",
-    "tags":"B-PERS"
-    },{
-    "word":"إلى",
-    "tags":"O"
-    },{
-    "word":"جامعة",
-    "tags":"B-ORG"
-    },{
-    "word":"بيرزيت",
-    "tags":"B-GPE I-ORG"
-    }]
-    """
+def extract(text, ner_method="nested"):

     dataset, token_vocab = text2segments(text)

@@ -50,7 +38,7 @@ def extract(text, batch_size=32):
         (dataset,),
         vocab,
         train_config.data_config,
-        batch_size=
+        batch_size=32,
         shuffle=(False,),
     )[0]

@@ -69,4 +57,7 @@ def extract(text, batch_size=32):
         else:
             segments_list["tags"] = ' '.join(list_of_tags)
         segments_lists.append(segments_list)
+
+    if ner_method == "flat":
+        segments_lists = convert_nested_to_flat(segments_lists)
     return segments_lists
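The public entry point changes from extract(text, batch_size=32) to extract(text, ner_method="nested"); the batch size is now fixed at 32 internally, and ner_method="flat" post-processes the nested output through convert_nested_to_flat(). A hedged usage sketch (the tag values in the comment are illustrative, patterned on the docstring removed above):

```python
from sinatools.ner.entity_extractor import extract

sentence = 'ذهب محمد إلى جامعة بيرزيت'

nested = extract(sentence)                   # default, ner_method="nested"
flat = extract(sentence, ner_method="flat")  # one tag per token

# Each item is a dict such as {"token": "بيرزيت", "tags": "B-GPE I-ORG"} in
# nested mode, collapsed to a single tag in flat mode (values illustrative).
for item in flat:
    print(item["token"], item["tags"])
```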
sinatools/ner/relation_extractor.py
ADDED
@@ -0,0 +1,201 @@
+import torch
+import json
+from urllib.request import Request, urlopen
+from sinatools.ner.entity_extractor import extract
+from . import pipe
+
+
+# ============================ Extract entities and their types ========================
+def jsons_to_list_of_lists(json_list):
+    return [[d['token'], d['tags']] for d in json_list]
+
+def entities_and_types(sentence):
+    output_list = jsons_to_list_of_lists(extract(sentence))
+    json_short = distill_entities(output_list)
+
+    entities = {}
+    for entity in json_short:
+        name = entity[0]
+        entity_type = entity[1]
+        entities[name] = entity_type
+
+    return entities
+
+def distill_entities(entities):
+    # This is list that we put the output what we need
+    list_output = list()
+
+    # This line go to sort function and save the output to temp_entities
+    temp_entities = sortTags(entities)
+
+    # This list help us to make the output,
+    temp_list = list()
+
+    # initlize the temp_list
+    temp_list.append(["", "", 0, 0])
+    word_position = 0
+
+    # For each entity, convert ibo to distllir list.
+    for entity in temp_entities:
+        # This is counter tag of this entity
+        counter_tag = 0
+        # For each tag
+        for tag in str(entity[1]).split():
+            # If the counter tag greater than or equal to lenght of templist, if yes then we will append the empty value in templist
+            if counter_tag >= len(temp_list):
+                temp_list.append(["", "", 0, 0])
+
+            # If tag equal O and word postion of this tag is not equal zero then it will add all
+            # not empty eliment of temp list in output list
+            if "O" == tag and word_position != 0:
+                for j in range(0, len(temp_list)):
+                    if temp_list[j][1] != "":
+                        list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+                    temp_list[j][0] = ""
+                    temp_list[j][1] = ""
+                    temp_list[j][2] = word_position
+                    temp_list[j][3] = word_position
+            # if this tag not equal O, and split by '-' the tag and check the lenght equals two and if the first eliment
+            # of the split its B
+            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "B":
+                # if the temp_list of counter is not empty then it will append in output list and hten it will
+                # initilize by new string and tag in templist of counter
+                if temp_list[counter_tag][1] != "":
+                    list_output.append([temp_list[counter_tag][0].strip(), temp_list[counter_tag][1], temp_list[counter_tag][2], temp_list[counter_tag][3]])
+                temp_list[counter_tag][0] = str(entity[0]) + " "
+                temp_list[counter_tag][1] = str(tag).split("-")[1]
+                temp_list[counter_tag][2] = word_position
+                temp_list[counter_tag][3] = word_position
+
+            # if this tag not equal O, and split by '-' the tag and check the lenght equals two and if the first eliment
+            # of the split its O
+            elif "O" != tag and len(tag.split("-")) == 2 and tag.split("-")[0] == "I" and word_position != 0:
+                # For each of temp_list, check if in this counter tag of templist is same tag with this.tag
+                # then will complete if not it will save in output list and cheak another
+                for j in range(counter_tag,len(temp_list)):
+                    if temp_list[j][1] == tag[2:] and temp_list[j][3] != word_position:
+                        temp_list[j][0] += str(entity[0]) + " "
+                        temp_list[j][3] += 1
+                        break
+                    else:
+                        if temp_list[j][1] != "":
+                            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+                        temp_list[j][0] = ""
+                        temp_list[j][1] = ""
+                        temp_list[j][2] = word_position
+                        temp_list[j][3] = word_position
+            counter_tag += 1
+        word_position += 1
+    # For each temp_list, at the end of the previous loop, there will be some
+    # values in this list, we should save it to the output list
+    for j in range(0, len(temp_list)):
+        if temp_list[j][1] != "":
+            list_output.append([temp_list[j][0].strip(), temp_list[j][1], temp_list[j][2], temp_list[j][3]])
+    return sorted(list_output, key=lambda x: (x[2]))
+
+def sortTags(entities):
+    temp_entities = entities
+    temp_counter = 0
+    # For each entity, this loop will sort each tag of entitiy, first it will check if the
+    # previous tags has same count of this tag, second will sort the tags and check if this tags is correct
+    for entity in temp_entities:
+        tags = entity[1].split()
+        for tag in tags:
+            # if the counter is not 0 then, will complete
+            if temp_counter != 0:
+                # Check if this tag is equal I-, if yes then it will count how many tag in this tags and
+                # count how many tag in previous tags
+                if "I-" == tag[0:2]:
+                    counter_of_this_tag = 0
+                    counter_of_previous_tag = 0
+                    for word in tags:
+                        if tag.split("-")[1] in word:
+                            counter_of_this_tag+=1
+                    for word in temp_entities[temp_counter-1][1].split():
+                        if tag.split("-")[1] in word:
+                            counter_of_previous_tag+=1
+                    # if the counter of previous tag is bigger than counter of this tag, then we
+                    # need to add I-tag in this tags
+                    if counter_of_previous_tag > counter_of_this_tag:
+                        tags.append("I-"+tag.split("-")[1])
+        # Sort the tags
+        tags.sort()
+        # Need to revers the tags because it should begins with I
+        tags.reverse()
+        # If the counter is not 0 then we can complete
+        if temp_counter != 0:
+            this_tags = tags
+            previous_tags = temp_entities[temp_counter - 1][1].split()
+            sorted_tags = list()
+
+            # Check if the this tag is not O and previous tags is not O, then will complete,
+            # if not then it will ignor this tag
+            if "O" not in this_tags and "O" not in previous_tags:
+                index = 0
+                #For each previous tags, need sort this tag by previous tags if its I, B we can ignor
+                for i in previous_tags:
+                    j = 0
+                    while this_tags and j < len(this_tags):
+                        if this_tags[j][0:2] == "I-" and this_tags[j][2:] == i[2:]:
+                            sorted_tags.insert(index, this_tags.pop(j))
+                            break
+                        elif this_tags[j][0:2] == "B-":
+                            break
+                        j += 1
+                    index += 1
+            sorted_tags += this_tags
+            tags = sorted_tags
+        str_tag = " "
+        str_tag = str_tag.join(tags)
+        str_tag = str_tag.strip()
+        temp_entities[temp_counter][1] = str_tag
+        temp_counter += 1
+    return temp_entities
+
+# ============= Prepare Templates and Catergorize Extracted Entities ================
+temp03={'location':'مكان حدوث','agent':'أحد المتأثرين في','happened at':'تاريخ حدوث'}
+categories = {
+    'agent': ['PERS', 'NORP', 'OCC', 'ORG'],
+    'location': ['LOC', 'FAC', 'GPE'],
+    'happened at': ['DATE', 'TIME']
+}
+
+def get_entity_category(entity_type, categories):
+    for category, types in categories.items():
+        if entity_type in types:
+            return category
+    return None
+
+
+# ============ Extract entities, their types and categorize them ===============
+def relation_extraction(sentence):
+    #test_sentence="صورة إعتقال طفل فلسطيني خلال انتفاضة الأقصى ."
+    entities=entities_and_types(sentence)
+
+    event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type == 'EVENT']
+    arg_event_indices = [i for i, (_, entity_type) in enumerate(entities.items()) if entity_type != 'EVENT']
+
+    output_list=[]
+
+    for i in event_indices:
+        event_entity=list(entities.keys())[i]
+        for j in arg_event_indices:
+            arg_name= list(entities.keys())[j]
+            arg_type=entities[arg_name]
+            category = get_entity_category(arg_type, categories)
+
+            if category in temp03:
+                relation_sentence=f"[CLS] {sentence} [SEP] {event_entity} {temp03[category]} {arg_name}"
+                predicted_relation=pipe(relation_sentence)
+                score = predicted_relation[0][0]['score']
+                if score > 0.50:
+                    #print(f"Event:{event_entity} Relation:{category} Argument:{arg_name}\n")
+                    #output_list.append([{event_entity} ,{category}, {arg_name}])
+                    output_list.append(f"Event:{event_entity}, Relation:{category}, Argument:{arg_name}")
+
+                else:
+                    #print(f"Event:{event_entity} Relation:No relation Argument:{arg_name}\n")
+                    #output_list.append([{event_entity} ,'No relation', {arg_name}])
+                    output_list.append(f"Event:{event_entity}, Relation:No relation, Argument:{arg_name}")
+
+    return output_list
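relation_extractor.py is new in 0.1.27: it runs the entity extractor, categorizes the non-event entities as agent, location, or happened at, and scores a templated Arabic sentence for each event/argument pair with the pipeline loaded in sinatools/ner/__init__.py, keeping pairs whose score exceeds 0.5. A hedged usage sketch (the example sentence is the one in the module's own comment; the printed relations are illustrative):

```python
from sinatools.ner.relation_extractor import relation_extraction

sentence = "صورة إعتقال طفل فلسطيني خلال انتفاضة الأقصى ."
for relation in relation_extraction(sentence):
    # Each entry is a string of the form
    # "Event:..., Relation:agent, Argument:..." (illustrative).
    print(relation)
```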
sinatools/ner/trainers/BaseTrainer.py
CHANGED
@@ -1,117 +1,117 @@
-import os
-import torch
-import logging
-import natsort
-import glob
-
-logger = logging.getLogger(__name__)
-
-
-class BaseTrainer:
-    def __init__(
-        self,
-        model=None,
-        max_epochs=50,
-        optimizer=None,
-        scheduler=None,
-        loss=None,
-        train_dataloader=None,
-        val_dataloader=None,
-        test_dataloader=None,
-        log_interval=10,
-        summary_writer=None,
-        output_path=None,
-        clip=5,
-        patience=5
-    ):
-        self.model = model
-        self.max_epochs = max_epochs
-        self.train_dataloader = train_dataloader
-        self.val_dataloader = val_dataloader
-        self.test_dataloader = test_dataloader
-        self.optimizer = optimizer
-        self.scheduler = scheduler
-        self.loss = loss
-        self.log_interval = log_interval
-        self.summary_writer = summary_writer
-        self.output_path = output_path
-        self.current_timestep = 0
-        self.current_epoch = 0
-        self.clip = clip
-        self.patience = patience
-
-    def tag(self, dataloader, is_train=True):
-        """
-        Given a dataloader containing segments, predict the tags
-        :param dataloader: torch.utils.data.DataLoader
-        :param is_train: boolean - True for training model, False for evaluation
-        :return: Iterator
-            subwords (B x T x NUM_LABELS)- torch.Tensor - BERT subword ID
-            gold_tags (B x T x NUM_LABELS) - torch.Tensor - ground truth tags IDs
-            tokens - List[arabiner.data.dataset.Token] - list of tokens
-            valid_len (B x 1) - int - valiud length of each sequence
-            logits (B x T x NUM_LABELS) - logits for each token and each tag
-        """
-        for subwords, gold_tags, tokens, valid_len in dataloader:
-            self.model.train(is_train)
-
-            if torch.cuda.is_available():
-                subwords = subwords.cuda()
-                gold_tags = gold_tags.cuda()
-
-            if is_train:
-                self.optimizer.zero_grad()
-                logits = self.model(subwords)
-            else:
-                with torch.no_grad():
-                    logits = self.model(subwords)
-
-            yield subwords, gold_tags, tokens, valid_len, logits
-
-    def segments_to_file(self, segments, filename):
-        """
-        Write segments to file
-        :param segments: [List[arabiner.data.dataset.Token]] - list of list of tokens
-        :param filename: str - output filename
-        :return: None
-        """
-        with open(filename, "w") as fh:
-            results = "\n\n".join(["\n".join([t.__str__() for t in segment]) for segment in segments])
-            fh.write("Token\tGold Tag\tPredicted Tag\n")
-            fh.write(results)
-            logging.info("Predictions written to %s", filename)
-
-    def save(self):
-        """
-        Save model checkpoint
-        :return:
-        """
-        filename = os.path.join(
-            self.output_path,
-            "checkpoints",
-            "checkpoint_{}.pt".format(self.current_epoch),
-        )
-
-        checkpoint = {
-            "model": self.model.state_dict(),
-            "optimizer": self.optimizer.state_dict(),
-            "epoch": self.current_epoch
-        }
-
-        logger.info("Saving checkpoint to %s", filename)
-        torch.save(checkpoint, filename)
-
-    def load(self, checkpoint_path):
-        """
-        Load model checkpoint
-        :param checkpoint_path: str - path/to/checkpoints
-        :return: None
-        """
-        checkpoint_path = natsort.natsorted(glob.glob(f"{checkpoint_path}/checkpoint_*.pt"))
-        checkpoint_path = checkpoint_path[-1]
-
-        logger.info("Loading checkpoint %s", checkpoint_path)
-
-        device = None if torch.cuda.is_available() else torch.device('cpu')
-        checkpoint = torch.load(checkpoint_path, map_location=device)
-        self.model.load_state_dict(checkpoint["model"])
+import os
+import torch
+import logging
+import natsort
+import glob
+
+logger = logging.getLogger(__name__)
+
+
+class BaseTrainer:
+    def __init__(
+        self,
+        model=None,
+        max_epochs=50,
+        optimizer=None,
+        scheduler=None,
+        loss=None,
+        train_dataloader=None,
+        val_dataloader=None,
+        test_dataloader=None,
+        log_interval=10,
+        summary_writer=None,
+        output_path=None,
+        clip=5,
+        patience=5
+    ):
+        self.model = model
+        self.max_epochs = max_epochs
+        self.train_dataloader = train_dataloader
+        self.val_dataloader = val_dataloader
+        self.test_dataloader = test_dataloader
+        self.optimizer = optimizer
+        self.scheduler = scheduler
+        self.loss = loss
+        self.log_interval = log_interval
+        self.summary_writer = summary_writer
+        self.output_path = output_path
+        self.current_timestep = 0
+        self.current_epoch = 0
+        self.clip = clip
+        self.patience = patience
+
+    def tag(self, dataloader, is_train=True):
+        """
+        Given a dataloader containing segments, predict the tags
+        :param dataloader: torch.utils.data.DataLoader
+        :param is_train: boolean - True for training model, False for evaluation
+        :return: Iterator
+            subwords (B x T x NUM_LABELS)- torch.Tensor - BERT subword ID
+            gold_tags (B x T x NUM_LABELS) - torch.Tensor - ground truth tags IDs
+            tokens - List[arabiner.data.dataset.Token] - list of tokens
+            valid_len (B x 1) - int - valiud length of each sequence
+            logits (B x T x NUM_LABELS) - logits for each token and each tag
+        """
+        for subwords, gold_tags, tokens, valid_len in dataloader:
+            self.model.train(is_train)
+
+            if torch.cuda.is_available():
+                subwords = subwords.cuda()
+                gold_tags = gold_tags.cuda()
+
+            if is_train:
+                self.optimizer.zero_grad()
+                logits = self.model(subwords)
+            else:
+                with torch.no_grad():
+                    logits = self.model(subwords)
+
+            yield subwords, gold_tags, tokens, valid_len, logits
+
+    def segments_to_file(self, segments, filename):
+        """
+        Write segments to file
+        :param segments: [List[arabiner.data.dataset.Token]] - list of list of tokens
+        :param filename: str - output filename
+        :return: None
+        """
+        with open(filename, "w") as fh:
+            results = "\n\n".join(["\n".join([t.__str__() for t in segment]) for segment in segments])
+            fh.write("Token\tGold Tag\tPredicted Tag\n")
+            fh.write(results)
+            logging.info("Predictions written to %s", filename)
+
+    def save(self):
+        """
+        Save model checkpoint
+        :return:
+        """
+        filename = os.path.join(
+            self.output_path,
+            "checkpoints",
+            "checkpoint_{}.pt".format(self.current_epoch),
+        )
+
+        checkpoint = {
+            "model": self.model.state_dict(),
+            "optimizer": self.optimizer.state_dict(),
+            "epoch": self.current_epoch
+        }
+
+        logger.info("Saving checkpoint to %s", filename)
+        torch.save(checkpoint, filename)
+
+    def load(self, checkpoint_path):
+        """
+        Load model checkpoint
+        :param checkpoint_path: str - path/to/checkpoints
+        :return: None
+        """
+        checkpoint_path = natsort.natsorted(glob.glob(f"{checkpoint_path}/checkpoint_*.pt"))
+        checkpoint_path = checkpoint_path[-1]
+
+        logger.info("Loading checkpoint %s", checkpoint_path)
+
+        device = None if torch.cuda.is_available() else torch.device('cpu')
+        checkpoint = torch.load(checkpoint_path, map_location=device)
+        self.model.load_state_dict(checkpoint["model"], strict=False)
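The only functional change in BaseTrainer.py is the strict=False flag on load_state_dict, which lets a checkpoint load even when its keys do not exactly match the current model, instead of raising. A minimal, hypothetical illustration in plain PyTorch (the Linear model and zero tensor below are not SinaTools code):

```python
import torch

# Hypothetical model and checkpoint: the checkpoint is missing the "bias" key.
model = torch.nn.Linear(4, 2)
state = {"weight": torch.zeros(2, 4)}

# strict=True would raise RuntimeError here; strict=False reports the mismatch.
result = model.load_state_dict(state, strict=False)
print(result.missing_keys)  # ['bias']
```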
sinatools/synonyms/__init__.py
CHANGED
@@ -3,7 +3,7 @@ from sinatools.DataDownload import downloader
 import os

 synonyms_level2_dict = {}
-level2_dict = '
+level2_dict = 'graph_l2.pkl'
 path = downloader.get_appdatadir()
 file_path = os.path.join(path, level2_dict)
 with open(file_path, 'rb') as f:
@@ -11,7 +11,7 @@ with open(file_path, 'rb') as f:


 synonyms_level3_dict = {}
-level3_dict = '
+level3_dict = 'graph_l3.pkl'
 path = downloader.get_appdatadir()
 file_path = os.path.join(path, level3_dict)
 with open(file_path, 'rb') as f:
sinatools/utils/similarity.py
ADDED
@@ -0,0 +1,149 @@
+# -*- coding: utf-8 -*-
+
+from sinatools.utils.parser import arStrip
+from sinatools.utils.implication import Implication
+import argparse
+
+def normalize_word(word: str, ignore_all_diacritics_but_not_shadda: bool=True, ignore_shadda_diacritic: bool=True) -> str:
+    if ignore_all_diacritics_but_not_shadda:
+        word = arStrip(word, True, True, False, False, False, False)
+
+    if ignore_shadda_diacritic:
+        word = arStrip(word, False, False, True, False, False, False)
+
+    return word
+
+
+def get_preferred_word(word1, word2):
+    implication = Implication(word1, word2)
+
+    direction = implication.get_direction()
+
+    if direction in (0, 2):
+        return word1
+
+    elif direction == 1:
+        return word2
+
+    elif direction == 3:
+        if not word1.endswith("َ") and not word1.endswith("ُ"):
+            return word2
+        return word1
+
+
+def get_non_preferred_word(word1, word2):
+
+    implication = Implication(word1, word2)
+    if implication.get_distance() < 15:
+        direction = implication.get_direction()
+        if direction == 0 or direction == 1:
+            return word1
+        elif direction == 2:
+            return word2
+        elif direction == 3:
+            if not word1.endswith("َ") and not word1.endswith("ُ"):
+                return word1
+            return word2
+    return "#"
+
+def get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda=False, ignore_shadda_diacritic=False):
+
+    list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
+    list1 = [str(i.strip()) for i in list1]
+
+    list2 = [str(i) for i in list2 if i not in (None, ' ', '')]
+    list2 = [str(i.strip()) for i in list2]
+
+    interection_list = []
+
+    for list1_word in list1:
+        for list2_word in list2:
+            word1 = normalize_word(list1_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
+            word2 = normalize_word(list2_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
+
+            implication = Implication(word1, word2)
+            if implication.get_direction() >= 0 and implication.get_distance() < 15:
+                interection_list.append(get_preferred_word(word1, word2))
+
+    i = 0
+    while i < len(interection_list):
+        j = i + 1
+        while j < len(interection_list):
+            non_preferred_word = get_non_preferred_word(interection_list[i], interection_list[j])
+            if non_preferred_word != "#":
+                interection_list.remove(non_preferred_word)
+            j += 1
+        i += 1
+
+    return interection_list
+
+
+
+def get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic):
+
+    list1 = [str(i) for i in list1 if i not in (None, ' ', '')]
+
+    list2 = [str(i) for i in list2 if i not in (None, ' ', '')]
+
+    union_list = []
+
+    for list1_word in list1:
+        word1 = normalize_word(list1_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
+        union_list.append(word1)
+
+    for list2_word in list2:
+        word2 = normalize_word(list2_word, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
+        union_list.append(word2)
+
+    i = 0
+    while i < len(union_list):
+        j = i + 1
+        while j < len(union_list):
+            non_preferred_word = get_non_preferred_word(union_list[i], union_list[j])
+            if (non_preferred_word != "#"):
+                union_list.remove(non_preferred_word)
+            j = j + 1
+        i = i + 1
+
+    return union_list
+
+
+
+def get_jaccard_similarity(list1: list, list2: list, ignore_all_diacritics_but_not_shadda: bool, ignore_shadda_diacritic: bool) -> float:
+
+    intersection_list = get_intersection(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
+
+    union_list = get_union(list1, list2, ignore_all_diacritics_but_not_shadda, ignore_shadda_diacritic)
+
+    return float(len(intersection_list)) / float(len(union_list))
+
+def get_jaccard(delimiter, str1, str2, selection, ignoreAllDiacriticsButNotShadda=True, ignoreShaddaDiacritic=True):
+
+    try:
+        list1 = str1.split(delimiter)
+        list2 = str2.split(delimiter)
+
+        if selection == "intersection":
+            intersection = get_intersection(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
+            return intersection
+        elif selection == "union":
+            union = get_union(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
+            return union
+        elif selection == "jaccardSimilarity":
+            similarity = get_jaccard_similarity(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
+            return similarity
+        elif selection == "jaccardAll":
+            intersection = get_intersection(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
+            union = get_union(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
+            similarity = get_jaccard_similarity(list1, list2, ignoreAllDiacriticsButNotShadda, ignoreShaddaDiacritic)
+            output_list = ["intersection:", intersection, "union:", union, "similarity:", similarity]
+            return output_list
+        else:
+            return 'Invalid selection option'
+
+    except AttributeError as ae:
+        print(f"Attribute error occurred: {str(ae)}")
+        return 'Invalid input type'
+    except Exception as e:
+        print(f"Error occurred: {str(e)}")
+        return 'An error has occurred'
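utils/similarity.py is also new; it hosts the helpers that the jaccard CLI above now imports (get_intersection, get_union, get_jaccard_similarity, get_jaccard). A hedged usage sketch with made-up word lists; what the calls actually return depends on the Implication rules in sinatools.utils.implication:

```python
from sinatools.utils.similarity import get_jaccard, get_jaccard_similarity

str1 = "ذَهَبٌ|ذَهَبَ"
str2 = "ذهب|ذَهَبَ"

# Full report: intersection, union and the similarity ratio in one list.
print(get_jaccard("|", str1, str2, "jaccardAll"))

# Or just the ratio |intersection| / |union| on pre-split lists.
print(get_jaccard_similarity(str1.split("|"), str2.split("|"), True, True))
```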
sinatools/wsd/__init__.py
CHANGED
sinatools/wsd/disambiguator.py
CHANGED
@@ -217,7 +217,7 @@ def jsons_to_list_of_lists(json_list):
 def find_named_entities(string):
     found_entities = []

-    ner_entites = extract(string)
+    ner_entites = extract(string, "nested")
     list_of_entites = jsons_to_list_of_lists(ner_entites)
     entites = distill_entities(list_of_entites)

@@ -288,17 +288,17 @@ def disambiguate_glosses_using_SALMA(glosses, Diac_lemma, Undiac_lemma, word, se
         concept_id, gloss = GlossPredictor(Diac_lemma, Undiac_lemma,word,sentence,glosses_dictionary)

         my_json = {}
-        my_json['
+        my_json['concept_id'] = concept_id
         # my_json['Gloss'] = gloss
         my_json['word'] = word
-        my_json['Undiac_lemma'] = Undiac_lemma
-        my_json['
+        #my_json['Undiac_lemma'] = Undiac_lemma
+        my_json['lemma'] = Diac_lemma
         return my_json
     else:
         my_json = {}
         my_json['word'] = word
-        my_json['Undiac_lemma'] = Undiac_lemma
-        my_json['
+        #my_json['Undiac_lemma'] = Undiac_lemma
+        my_json['lemma'] = Diac_lemma
         return my_json


@@ -405,26 +405,26 @@ def disambiguate_glosses_main(word, sentence):
     if concept_count == 0:
         my_json = {}
         my_json['word'] = word['word']
-        my_json['
-        my_json['Undiac_lemma'] = word['Undiac_lemma']
+        my_json['lemma'] = word['Diac_lemma']
+        #my_json['Undiac_lemma'] = word['Undiac_lemma']
         return my_json
     elif concept_count == 1:
         my_json = {}
         my_json['word'] = word['word']
         glosses = word['glosses'][0]
         # my_json['Gloss'] = glosses['gloss']
-        my_json['
-        my_json['
-        my_json['Undiac_lemma'] = word['Undiac_lemma']
+        my_json['concept_id'] = glosses['concept_id']
+        my_json['lemma'] = word['Diac_lemma']
+        #my_json['Undiac_lemma'] = word['Undiac_lemma']
         return my_json
     elif concept_count == '*':
         my_json = {}
         my_json['word'] = word['word']
         glosses = word['glosses'][0]
         my_json['Gloss'] = glosses['gloss']
-        my_json['
-        my_json['
-        my_json['Undiac_lemma'] = word['Undiac_lemma']
+        my_json['concept_id'] = glosses['concept_id']
+        my_json['lemma'] = word['Diac_lemma']
+        #my_json['Undiac_lemma'] = word['Undiac_lemma']
         return my_json
     else:
         input_word = word['word']
@@ -477,21 +477,18 @@ def disambiguate(sentence):
     #output
     [
     {
-    "
+    "concept_id": "303019218",
     "word": "ذهبت",
-    "
-    "Diac_lemma": "ذَهَبَ۪ 1"
+    "lemma": "ذَهَبَ۪ 1"
     },
     {
     "word": "إلى",
-    "
-    "Undiac_lemma": "الى"
+    "lemma": "إِلَى 1"
     },
     {
     "word": "جامعة بيرزيت",
-    "
-    "
-    "Undiac_lemma": "جامعة بيرزيت"
+    "concept_id": "334000099",
+    "lemma": "جامِعَة بيرزَيت"
     }
     ]
     """
SinaTools-0.1.25.dist-info/METADATA
DELETED
@@ -1,34 +0,0 @@
-Metadata-Version: 2.1
-Name: SinaTools
-Version: 0.1.25
-Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
-Home-page: https://github.com/SinaLab/sinatools
-License: MIT license
-Keywords: sinatools
-Platform: UNKNOWN
-Description-Content-Type: text/markdown
-Requires-Dist: six
-Requires-Dist: farasapy
-Requires-Dist: tqdm
-Requires-Dist: requests
-Requires-Dist: regex
-Requires-Dist: pathlib
-Requires-Dist: torch (==1.13.0)
-Requires-Dist: transformers (==4.24.0)
-Requires-Dist: torchtext (==0.14.0)
-Requires-Dist: torchvision (==0.14.0)
-Requires-Dist: seqeval (==1.2.2)
-Requires-Dist: natsort (==7.1.1)
-
-SinaTools
----------
-
-Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
-
-Python APIs, command lines, colabs, and online demos.
-
-* Free software: MIT license
-* Documentation: https://sina.birzeit.edu/sinatools/
-
-
-
{SinaTools-0.1.25.data → SinaTools-0.1.27.data}/data/sinatools/environment.yml: file without changes
{SinaTools-0.1.25.dist-info → SinaTools-0.1.27.dist-info}/AUTHORS.rst: file without changes
{SinaTools-0.1.25.dist-info → SinaTools-0.1.27.dist-info}/LICENSE: file without changes
{SinaTools-0.1.25.dist-info → SinaTools-0.1.27.dist-info}/WHEEL: file without changes
{SinaTools-0.1.25.dist-info → SinaTools-0.1.27.dist-info}/entry_points.txt: file without changes
{SinaTools-0.1.25.dist-info → SinaTools-0.1.27.dist-info}/top_level.txt: file without changes