SinaTools 0.1.11__py2.py3-none-any.whl → 0.1.13__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these package versions as they appear in their public registry.
Files changed (50)
  1. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/METADATA +2 -3
  2. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/RECORD +47 -26
  3. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/entry_points.txt +7 -3
  4. sinatools/CLI/DataDownload/download_files.py +0 -10
  5. sinatools/CLI/ner/corpus_entity_extractor.py +9 -6
  6. sinatools/CLI/ner/entity_extractor.py +18 -42
  7. sinatools/CLI/utils/arStrip.py +8 -8
  8. sinatools/CLI/utils/implication.py +0 -8
  9. sinatools/CLI/utils/jaccard.py +5 -14
  10. sinatools/CLI/utils/remove_latin.py +2 -2
  11. sinatools/CLI/utils/text_dublication_detector.py +25 -0
  12. sinatools/VERSION +1 -1
  13. sinatools/morphology/ALMA_multi_word.py +14 -16
  14. sinatools/morphology/__init__.py +32 -31
  15. sinatools/ner/__init__.py +28 -2
  16. sinatools/ner/data/__init__.py +1 -0
  17. sinatools/ner/data/datasets.py +146 -0
  18. sinatools/ner/data/transforms.py +118 -0
  19. sinatools/ner/data.py +124 -0
  20. sinatools/ner/data_format.py +124 -0
  21. sinatools/ner/datasets.py +146 -0
  22. sinatools/ner/entity_extractor.py +34 -54
  23. sinatools/ner/helpers.py +86 -0
  24. sinatools/ner/metrics.py +69 -0
  25. sinatools/ner/nn/BaseModel.py +22 -0
  26. sinatools/ner/nn/BertNestedTagger.py +34 -0
  27. sinatools/ner/nn/BertSeqTagger.py +17 -0
  28. sinatools/ner/nn/__init__.py +3 -0
  29. sinatools/ner/trainers/BaseTrainer.py +117 -0
  30. sinatools/ner/trainers/BertNestedTrainer.py +203 -0
  31. sinatools/ner/trainers/BertTrainer.py +163 -0
  32. sinatools/ner/trainers/__init__.py +3 -0
  33. sinatools/ner/transforms.py +119 -0
  34. sinatools/semantic_relatedness/__init__.py +20 -0
  35. sinatools/semantic_relatedness/compute_relatedness.py +31 -0
  36. sinatools/synonyms/__init__.py +18 -0
  37. sinatools/synonyms/synonyms_generator.py +192 -0
  38. sinatools/utils/text_dublication_detector.py +110 -0
  39. sinatools/wsd/__init__.py +11 -0
  40. sinatools/{salma/views.py → wsd/disambiguator.py} +135 -94
  41. sinatools/{salma → wsd}/wsd.py +1 -1
  42. sinatools/CLI/salma/salma_tools.py +0 -68
  43. sinatools/salma/__init__.py +0 -12
  44. sinatools/utils/utils.py +0 -2
  45. {SinaTools-0.1.11.data → SinaTools-0.1.13.data}/data/sinatools/environment.yml +0 -0
  46. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/AUTHORS.rst +0 -0
  47. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/LICENSE +0 -0
  48. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/WHEEL +0 -0
  49. {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/top_level.txt +0 -0
  50. /sinatools/{salma → wsd}/settings.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: SinaTools
3
- Version: 0.1.11
3
+ Version: 0.1.13
4
4
  Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
5
5
  Home-page: https://github.com/SinaLab/sinatools
6
6
  License: MIT license
@@ -19,9 +19,8 @@ Requires-Dist: torchtext (==0.14.0)
19
19
  Requires-Dist: torchvision (==0.14.0)
20
20
  Requires-Dist: seqeval (==1.2.2)
21
21
  Requires-Dist: natsort (==7.1.1)
22
- Requires-Dist: pandas (==1.2.4)
23
22
 
24
- sinatools
23
+ SinaTools
25
24
  ---------
26
25
 
27
26
  Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
@@ -1,23 +1,23 @@
1
- SinaTools-0.1.11.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
2
- sinatools/VERSION,sha256=r-jqHzm6w_RJgmQoGTVXOd9UOJB6qtVWF_tPmBSRxi8,6
1
+ SinaTools-0.1.13.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
2
+ sinatools/VERSION,sha256=7lGv2l4eJuZteaVLIUnlbwoi4W41EwZ01RPRCjudlCI,6
3
3
  sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
4
4
  sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
5
5
  sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
6
6
  sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
7
- sinatools/CLI/DataDownload/download_files.py,sha256=17CtswdAT66KO7hw3o87RaWbM-BxdUpsheE6bysP3-c,2302
7
+ sinatools/CLI/DataDownload/download_files.py,sha256=tkH293ZUSvlvyZClkJmxfNk1x-C3XnlQCTVGdLYCKu0,1946
8
8
  sinatools/CLI/morphology/ALMA_multi_word.py,sha256=ZImJ1vtcpSHydI1BjJmK3KcMJbGBZX16kO4L6rxvBvA,2086
9
9
  sinatools/CLI/morphology/morph_analyzer.py,sha256=ieIM47QK9Nct3MtCS9uq3h2rZN5r4qNhsLmlVeE6wiE,3503
10
- sinatools/CLI/ner/corpus_entity_extractor.py,sha256=jsxTQsR4i8ZwsWrX1XxkYUbLGygYKV7-pWDiubfaANE,3751
11
- sinatools/CLI/ner/entity_extractor.py,sha256=BHAs2nGKL9npHUXj-6FDHQCuR2jidvFJX8yUkgQKxhc,4436
12
- sinatools/CLI/salma/salma_tools.py,sha256=8IDMSXjpM2u8jXc6c5JcI_l2CmiwdCxsUBJVN1Rrfk0,1971
10
+ sinatools/CLI/ner/corpus_entity_extractor.py,sha256=_o0frMSgpsFVXPoztS3mQTK7LjHsgzUv9gfs6iJL424,4024
11
+ sinatools/CLI/ner/entity_extractor.py,sha256=QFGkavZz8ZZGetMTXiTH_OeoN9B2Iyx60EKCYdFtoDY,2811
13
12
  sinatools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
14
- sinatools/CLI/utils/arStrip.py,sha256=pOMh9e4O-vBixbv2HM5qjlA8-qJH3Zf7DeJVekPrgjo,3252
13
+ sinatools/CLI/utils/arStrip.py,sha256=NLyp8vOu2xv80tL9jiKRvyptmbkRZVg-wcAr-9YyvNY,3264
15
14
  sinatools/CLI/utils/corpus_tokenizer.py,sha256=nH0T4h6urr_0Qy6-wN3PquOtnwybj0REde5Ts_OE4U8,1650
16
- sinatools/CLI/utils/implication.py,sha256=3vw526ZL0WR8LiIKbjYibTQWE_UeYvHThc1W9-BlbHg,3133
17
- sinatools/CLI/utils/jaccard.py,sha256=1zSkEQevB-1D5xcT__qmrgB1s8CISU70wDMBteCKCSo,4601
18
- sinatools/CLI/utils/remove_latin.py,sha256=dzRzRapmM4mJwS-rhNy9PYQKS-ONMsRBmN1ZcPfEBfE,848
15
+ sinatools/CLI/utils/implication.py,sha256=nvoiI5UHHaJdd6MICql0pB_-h3L0icYwP1WgJi2h7p0,2854
16
+ sinatools/CLI/utils/jaccard.py,sha256=NoKbWAq6dHDtQ56mAc1kdAnROm8NXEjZ1ecVZ7EYm6Y,4205
17
+ sinatools/CLI/utils/remove_latin.py,sha256=NOaTm2RHxt5IQrV98ySTmD8rTXTmcqSmfbPAwTyaXqU,848
19
18
  sinatools/CLI/utils/remove_punctuation.py,sha256=vJAZlEn7WGftZAFVFYnddkRrxdJ_rMmKB9vFZkY-jN4,1097
20
19
  sinatools/CLI/utils/sentence_tokenizer.py,sha256=Wli8eiDbWSd_Z8UKpu_JkaS8jImowa1vnRL0oYCSfqw,2823
20
+ sinatools/CLI/utils/text_dublication_detector.py,sha256=dW70O5O20GxeUDDF6zVYn52wWLmJF-HBZgvqIeVL2rQ,1661
21
21
  sinatools/CLI/utils/text_transliteration.py,sha256=vz-3kxWf8pNYVCqNAtBAiA6u_efrS5NtWT-ofN1NX6I,2014
22
22
  sinatools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
23
23
  sinatools/DataDownload/downloader.py,sha256=MbTPqqlg5vOTErxeVvdMn5k0TsYaG6kef2zHkeBLXlk,6480
@@ -73,29 +73,50 @@ sinatools/arabert/aragpt2/grover/modeling_gpt2.py,sha256=WFpCWn1792yATFzt8rZ0rpW
73
73
  sinatools/arabert/aragpt2/grover/optimization_adafactor.py,sha256=1geOsCWuv5xxtSnKDz9a8aY5SVwZ1MGq-xVQDBg4Gpg,9765
74
74
  sinatools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKweY1aPz1XPDw6odld0,7102
75
75
  sinatools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
76
- sinatools/morphology/ALMA_multi_word.py,sha256=GPM2-N7_5JIZwNdi1we6gBG0rh59AlGM0XWYxmEE7jY,1283
77
- sinatools/morphology/__init__.py,sha256=avTxtexZELp1Fya6cBNFLyeYPB31OcmQOlT2L-uAQnI,1386
76
+ sinatools/morphology/ALMA_multi_word.py,sha256=hj_-8ojrYYHnfCGk8WKtJdUR8mauzQdma4WUm-okDps,1346
77
+ sinatools/morphology/__init__.py,sha256=I4wVBh8BhyNl-CySVdiI_nUSn6gj1j-gmLKP300RpE0,1216
78
78
  sinatools/morphology/morph_analyzer.py,sha256=tA78gWg6iaE_G1c2xqxZoXZWNbvHBJLrTSxPyir5Xn8,6941
79
- sinatools/ner/__init__.py,sha256=8R8epTEyvpbreLYTrC5M5lctlzZrNr7T7B4KmENnB3I,341
80
- sinatools/ner/entity_extractor.py,sha256=amVU6tXoAAL9NcadfJlx1qyEPlxBY8wRo5Tn-ZLOVIw,3236
81
- sinatools/salma/__init__.py,sha256=_by3PsXetNjkxSyg24nF592T-21JEWhPXzMAPzwDOhQ,378
82
- sinatools/salma/settings.py,sha256=b_AqTxVWALuGXnsMd9KhnnwIo9-JEoWOTekB-7_xJCU,1111
83
- sinatools/salma/views.py,sha256=G5W5BSr770NapWz5j6hcuwInrR40JKG-LkzP1OpcYeA,18416
84
- sinatools/salma/wsd.py,sha256=vCiiR5h3bjAOHi3yxxkh_7GUgBWKQf297aHbO4Z8CBk,4436
79
+ sinatools/ner/__init__.py,sha256=gSs0x6veWJ8j3_iOs79tynBd_hJP0t44CGpJ0xzoiW4,1048
80
+ sinatools/ner/data.py,sha256=lvOW86dXse8SC75Q0supQaE0rrRffoxNjIA0Qbv5WZY,4354
81
+ sinatools/ner/data_format.py,sha256=7Yt0aOicOn9_YuuyCkM_IYi_rgjGYxR9bCuUaNGM73o,4341
82
+ sinatools/ner/datasets.py,sha256=mG1iwqSm3lXCFHLqE-b4wNi176cpuzNBz8tKaBU6z6M,5059
83
+ sinatools/ner/entity_extractor.py,sha256=fwaPmbg3RaohQu9uu9rMXlvamCnv3am1EYjQMG6tuyY,2270
84
+ sinatools/ner/helpers.py,sha256=dnOoDY5JMyOLTUWVIZLMt8mBn2IbWlVaqHhQyjs1voo,2343
85
+ sinatools/ner/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
86
+ sinatools/ner/transforms.py,sha256=vti3mDdi-IRP8i0aTQ37QqpPlP9hdMmJ6_bAMa0uL-s,4871
87
+ sinatools/ner/data/__init__.py,sha256=W0C1ge_XxTfmdEGz0hkclz57aLI5VFS5t6BjByCfkFk,57
88
+ sinatools/ner/data/datasets.py,sha256=lcdDDenFMEKIGYQmfww2dk_9WKWrJO9HtKptaAEsRmY,5064
89
+ sinatools/ner/data/transforms.py,sha256=URMz1dHzkHjgUGAkDOenCWvQThO1ha8XeQVjoLL9RXM,4874
90
+ sinatools/ner/nn/BaseModel.py,sha256=3GmujQasTZZunOBuFXpY2p1W8W256iI_Uu4hxhOY2Z0,608
91
+ sinatools/ner/nn/BertNestedTagger.py,sha256=_fwAn1kiKmXe6m5y16Ipty3kvXIEFEmiUq74Ad1818U,1219
92
+ sinatools/ner/nn/BertSeqTagger.py,sha256=dFcBBiMw2QCWsyy7aQDe_PS3aRuNn4DOxKIHgTblFvc,504
93
+ sinatools/ner/nn/__init__.py,sha256=UgQD_XLNzQGBNSYc_Bw1aRJZjq4PJsnMT1iZwnJemqE,170
94
+ sinatools/ner/trainers/BaseTrainer.py,sha256=oZgFJW-CawfCKT5gtaBHA7Q7XjNfiyqM62KnFsgVzPU,3919
95
+ sinatools/ner/trainers/BertNestedTrainer.py,sha256=Pb4O2WeBmTvV3hHMT6DXjxrTzgtuh3OrKQZnogYy8RQ,8429
96
+ sinatools/ner/trainers/BertTrainer.py,sha256=B_uVtUwfv_eFwMMPsKQvZgW_ZNLy6XEsX5ePR0s8d-k,6433
97
+ sinatools/ner/trainers/__init__.py,sha256=UDok8pDDpYOpwRBBKVLKaOgSUlmqqb-zHZI1p0xPxzI,188
98
+ sinatools/semantic_relatedness/__init__.py,sha256=S0xrmqtl72L02N56nbNMudPoebnYQgsaIyyX-587DsU,830
99
+ sinatools/semantic_relatedness/compute_relatedness.py,sha256=JvI0cXgukKtuMpmAygMnlocCsPeAJ98LD1jZCP_6SyQ,1110
100
+ sinatools/synonyms/__init__.py,sha256=d4Xq8iFpuXojJ5HL1OpMQvYigNdU601oxwjt9iighOU,568
101
+ sinatools/synonyms/synonyms_generator.py,sha256=FgAiuduSFyM6vJobWZKHg4KNWIQz8T6MGBPVIuVuw-8,6506
85
102
  sinatools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
86
103
  sinatools/utils/charsets.py,sha256=rs82oZJqRqosZdTKXfFAJfJ5t4PxjMM_oAPsiWSWuwU,2817
87
104
  sinatools/utils/implication.py,sha256=MsbI6S1LNY-fCxGMxFTuaV639r3QijkkdcfH48rvY7A,27804
88
105
  sinatools/utils/jaccard.py,sha256=S7OgvaMqkN5HFgTZkKhMCNAuAnQ0LhRyXPN79jAzmKM,10113
89
106
  sinatools/utils/parser.py,sha256=CPPtCrsbxUqsjhY5C9wTOgkAs6iw0k_WvMUxLEPM1IU,6168
90
107
  sinatools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
108
+ sinatools/utils/text_dublication_detector.py,sha256=6yAOUtdw4TKiJkUPDDi3oK7CEoIuBDbliJ4PU7kapfo,4249
91
109
  sinatools/utils/text_transliteration.py,sha256=NQoXrxI-h0UXnvVtDA3skNJduxIy0IW26r46N4tDxGk,8766
92
110
  sinatools/utils/tokenizer.py,sha256=QHyrVqJA_On4rKxexiWR2ovq4pI1-u6iZkdhRbK9tew,6676
93
111
  sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
94
- sinatools/utils/utils.py,sha256=vKkFOkYclMu8nXS_VZb6Kobx8QGKW9onXkkLCeiRb6g,32
95
- SinaTools-0.1.11.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
96
- SinaTools-0.1.11.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
97
- SinaTools-0.1.11.dist-info/METADATA,sha256=DVITBkslVbLkq84QWFz8Cg4LfFc_2lSA3l9dFyVElq8,985
98
- SinaTools-0.1.11.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
99
- SinaTools-0.1.11.dist-info/entry_points.txt,sha256=9uGvOGRicf-CsHMaFyQjq1odtr3RMeOvEfiZwpDQ9VU,926
100
- SinaTools-0.1.11.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
101
- SinaTools-0.1.11.dist-info/RECORD,,
112
+ sinatools/wsd/__init__.py,sha256=5Ondsp-Xe9YxVjRlTc4nLrxu6xiyML7B3bQ3EZ44uEM,327
113
+ sinatools/wsd/disambiguator.py,sha256=BUiIXLd8b9tdZqThBiwacfSZtTkRx9LNnqegibmlbFA,20008
114
+ sinatools/wsd/settings.py,sha256=b_AqTxVWALuGXnsMd9KhnnwIo9-JEoWOTekB-7_xJCU,1111
115
+ sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
116
+ SinaTools-0.1.13.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
117
+ SinaTools-0.1.13.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
118
+ SinaTools-0.1.13.dist-info/METADATA,sha256=OAlD6n0C6DAbu4Kf9Foys8qSgyPzcwk-jwkklU6QkzA,953
119
+ SinaTools-0.1.13.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
120
+ SinaTools-0.1.13.dist-info/entry_points.txt,sha256=ZwZLolnWog2fjdDrfaHNHob8SE_YtMbD6ayzsOzItxs,1234
121
+ SinaTools-0.1.13.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
122
+ SinaTools-0.1.13.dist-info/RECORD,,
@@ -2,17 +2,21 @@
2
2
  alma_multi_word = sinatools.CLI.morphology.ALMA_multi_word:main
3
3
  appdatadir = sinatools.CLI.DataDownload.get_appdatadir:main
4
4
  arStrip = sinatools.CLI.utils.arStrip:main
5
- arabi_ner = sinatools.CLI.ner.entity_extractor:main
6
- arabi_ner2 = sinatools.CLI.ner.corpus_entity_extractor:main
5
+ corpus_entity_extractor = sinatools.CLI.ner.corpus_entity_extractor:main
7
6
  corpus_tokenizer = sinatools.CLI.utils.corpus_tokenizer:main
8
7
  download_files = sinatools.CLI.DataDownload.download_files:main
8
+ entity_extractor = sinatools.CLI.ner.entity_extractor:main
9
+ evaluate_synonyms = sinatools.CLI.synonyms.evaluate_synonyms:main
10
+ extend_synonyms = sinatools.CLI.synonyms.extend_synonyms:main
9
11
  implication = sinatools.CLI.utils.implication:main
10
12
  install_env = sinatools.install_env:main
11
13
  jaccard_similarity = sinatools.CLI.utils.jaccard:main
12
14
  morphology_analyzer = sinatools.CLI.morphology.morph_analyzer:main
13
15
  remove_latin = sinatools.CLI.utils.remove_latin:main
14
16
  remove_punctuation = sinatools.CLI.utils.remove_punctuation:main
15
- salma = sinatools.CLI.salma.salma_tools:main
17
+ semantic_relatedness = sinatools.CLI.semantic_relatedness.compute_relatedness:main
16
18
  sentence_tokenizer = sinatools.CLI.utils.sentence_tokenizer:main
19
+ text_dublication_detector = sinatools.CLI.utils.text_dublication_detector:main
17
20
  transliterate = sinatools.CLI.utils.text_transliteration:main
21
+ wsd = sinatools.CLI.wsd.disambiguator:main
18
22
 
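As a quick cross-check of the renamed console scripts above, the installed entry points can be listed with the standard library. This is a minimal sketch (not part of SinaTools), using the Python 3.10+ importlib.metadata selection API:

    # Minimal sketch (not shipped with SinaTools): list the console scripts
    # registered by the installed wheel to confirm the renames above.
    from importlib.metadata import entry_points

    for ep in entry_points(group="console_scripts"):  # Python 3.10+ API
        if ep.value.startswith("sinatools."):
            print(ep.name, "->", ep.value)  # e.g. entity_extractor -> sinatools.CLI.ner.entity_extractor:main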
@@ -29,16 +29,6 @@ Examples:
29
29
  download_files -f morph ner
30
30
  This command will download only the `morph` and `ner` files to the default directory.
31
31
 
32
- Note:
33
- -----
34
-
35
- .. code-block:: none
36
-
37
- - The script automatically handles the extraction of zip and tar.gz files after downloading.
38
- - Ensure you have the necessary permissions to write to the specified directory.
39
- - The default download directory is based on the operating system and can be obtained using the `get_appdatadir` function.
40
-
41
-
42
32
  """
43
33
 
44
34
  import argparse
@@ -4,20 +4,23 @@ from sinatools.utils.tokenizer import sentence_tokenizer
4
4
  from sinatools.utils.tokenizers_words import simple_word_tokenize
5
5
  import pandas as pd
6
6
  import argparse
7
- from sinatools.ner.entity_extractor import ner
7
+ from sinatools.ner.entity_extractor import extract
8
8
 
9
9
  """
10
- CSV NER Tagging Tool
10
+ This tool processes a CSV file and returns the named entities for each token in the text, based on the specified batch size, as follows:
11
11
 
12
12
  Usage:
13
13
  ------
14
14
  Run the script with the following command:
15
15
 
16
- arabi_ner2 input.csv --text-columns "TextColumn1,TextColumn2" --additional-columns "Column3,Column4" --output-csv output.csv
16
+ corpus_entity_extractor input.csv --text-columns "TextColumn1,TextColumn2" --additional-columns "Column3,Column4" --output-csv output.csv
17
17
  """
18
18
 
19
- def infer(sentence):
20
- output = ner(sentence)
19
+ def jsons_to_list_of_lists(json_list):
20
+ return [[d['token'], d['tags']] for d in json_list]
21
+
22
+ def combine_tags(sentence):
23
+ output = jsons_to_list_of_lists(extract(sentence))
21
24
  return [word[1] for word in output]
22
25
 
23
26
 
@@ -40,7 +43,7 @@ def corpus_tokenizer(input_csv, output_csv, text_column, additional_columns, row
40
43
  words = simple_word_tokenize(sentence)
41
44
  global_sentence_id += 1
42
45
 
43
- tags = infer(sentence)
46
+ tags = combine_tags(sentence)
44
47
  for word_position, word in enumerate(words, start=1):
45
48
  row_id += 1
46
49
  doc_sentence_filename = input_csv.split(".csv")[0]
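The helpers above assume extract() returns a list of JSON-like dicts with 'token' and 'tags' keys, and keep only the tag for each token. A minimal sketch of that flow with an illustrative input (the exact tag strings depend on the loaded model):

    # Sketch: reduce extract() output to one tag per token, as combine_tags does.
    from sinatools.ner.entity_extractor import extract

    output = extract("some sentence")                    # e.g. [{'token': ..., 'tags': ...}, ...]
    pairs = [[d['token'], d['tags']] for d in output]    # jsons_to_list_of_lists
    tags = [word[1] for word in pairs]                   # one tag string per token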
@@ -1,16 +1,16 @@
1
1
  """
2
2
  About:
3
3
  ------
4
- The ArabiNER tool carries out Named Entity Recognition (NER) utilizing the ArabiNER utility from the SinaTools suite. It identifies the named entities and provides a comprehensive analysis in JSON format if the input consists of text, or in a CSV file if the input is a directory of files.
4
+ This tool processes an input text and returns the named entities for each token in the text, based on the specified batch size, as follows:
5
5
 
6
6
  Usage:
7
7
  ------
8
- Below is the usage information that can be generated by running arabi_ner --help.
8
+ Below is the usage information that can be generated by running entity_extractor --help.
9
9
 
10
10
  .. code-block:: none
11
11
 
12
- arabi_ner --text=INPUT_TEXT
13
- arabi_ner --dir=INPUT_FILE --output_csv=OUTPUT_FILE_NAME
12
+ entity_extractor --text=INPUT_TEXT
13
+ entity_extractor --dir=INPUT_FILE --output_csv=OUTPUT_FILE_NAME
14
14
 
15
15
  Options:
16
16
  --------
@@ -23,37 +23,28 @@ Options:
23
23
  File containing the text to be analyzed for Named Entity Recognition.
24
24
  --output_csv OUTPUT_FILE_NAME
25
25
  A file containing the tokenized text and its Named Entity tags.
26
- Examples:
27
- ---------
28
26
 
29
- .. code-block:: none
30
27
 
31
- arabi_ner --text "Your text here"
32
- arabi_ner --dir "/path/to/your/directory" --output_csv "output.csv"
33
-
34
- Note:
35
- -----
28
+ Examples:
29
+ ---------
36
30
 
37
31
  .. code-block:: none
38
32
 
39
- - Ensure that the text input is appropriately encoded in UTF-8 or compatible formats.
40
- - The tool returns results in JSON format with proper indentation for better readability.
41
- - The quality and accuracy of the analysis depend on the underlying capabilities of the ArabiNER utility.
33
+ entity_extractor --text "Your text here"
34
+ entity_extractor --dir "/path/to/your/directory" --output_csv "output.csv"
42
35
 
43
36
  """
44
37
 
45
38
  import argparse
46
39
  import json
47
40
  import pandas as pd
48
- from sinatools.ner.entity_extractor import ner
41
+ from sinatools.ner.entity_extractor import extract
49
42
  from sinatools.utils.tokenizer import corpus_tokenizer
50
43
  from sinatools.utils.tokenizers_words import simple_word_tokenize
51
44
 
52
45
 
53
- def infer(sentence):
54
- # Now infer returns all NER tags for a sentence
55
- output = ner(sentence)
56
- ##print("ner output : ", output)
46
+ def combine_tags(sentence):
47
+ output = extract(sentence)
57
48
  return [word[1] for word in output]
58
49
 
59
50
 
@@ -67,7 +58,7 @@ def main():
67
58
  args = parser.parse_args()
68
59
 
69
60
  if args.text is not None:
70
- results = ner(args.text)
61
+ results = extract(args.text)
71
62
  # Print the results in JSON format
72
63
  print(json.dumps(results, ensure_ascii=False, indent=4))
73
64
  elif args.dir is not None:
@@ -76,28 +67,16 @@ def main():
76
67
  df['NER tags'] = None
77
68
  i = 0
78
69
 
79
- # Use drop_duplicates to get unique values based on Row_ID and Sentence
80
70
  result = df.drop_duplicates(subset=['Global Sentence ID', 'Sentence'])
81
-
82
- # Get the "Sentence" column as an array
83
71
  unique_sentences = result['Sentence'].to_numpy()
84
-
85
- # Print the result
86
- #print(unique_sentences, len(result['Sentence']))
87
- #print("#############")
88
-
89
- for sentence in unique_sentences: # iterating over unique sentences
90
- #print(" Sentence : ", simple_word_tokenize(sentence), len(simple_word_tokenize(sentence)))
91
- ner_tags = infer(sentence) # getting all NER tags for the sentence
92
- #if len(ner_tags) != len(df[i:i+len(ner_tags)]):
93
- # print("Not Equal...", len(ner_tags) , len(df[i:i+len(ner_tags)]))
94
- # return
72
+
73
+ for sentence in unique_sentences:
74
+ ner_tags = combine_tags(sentence)
95
75
  if len(simple_word_tokenize(sentence)) > 300:
96
76
  print(" Length of this sentence is more than 300 word: ", sentence)
97
77
  return
98
- #df['NER tags'].iloc[i:i+len(ner_tags)] = ner_tags
99
- df.loc[i:i+len(ner_tags)-1, 'NER tags'] = ner_tags # Use .loc to assign values
100
- #print("Exit with ner tags = ", ner_tags, " and length : ", len(ner_tags), type(len(ner_tags)), " and df is " , df[i:i+len(ner_tags)], " with length : ", len(df[i:i+len(ner_tags)]), type(len(df[i:i+len(ner_tags)])), " i:i+len(ner_tags) : ", i," , ", i+len(ner_tags))
78
+
79
+ df.loc[i:i+len(ner_tags)-1, 'NER tags'] = ner_tags
101
80
  i = i + len(ner_tags)
102
81
 
103
82
  df.to_csv(args.output_csv, index=False)
@@ -107,7 +86,4 @@ def main():
107
86
 
108
87
 
109
88
  if __name__ == '__main__':
110
- main()
111
-
112
- #arabi_ner --text "Your text here."
113
- #arabi_ner --dir /path/to/your/directory --output_csv output.csv
89
+ main()
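For completeness, the programmatic equivalent of the --text code path above, using the renamed extract() function (this mirrors the CLI code; it is not additional API):

    # Mirrors the --text branch of main(): extract() replaces the old ner().
    import json
    from sinatools.ner.entity_extractor import extract

    results = extract("Your text here")
    print(json.dumps(results, ensure_ascii=False, indent=4))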
@@ -26,7 +26,7 @@ Below is the usage information that can be generated by running arStrip --help.
26
26
  --diacs BOOL [default=True]
27
27
  Indicates whether to strip diacritics.
28
28
 
29
- --smallDiacs BOOL [default=True]
29
+ --small_diacs BOOL [default=True]
30
30
  Indicates whether to strip small diacritics.
31
31
 
32
32
  --shaddah BOOL [default=True]
@@ -38,15 +38,15 @@ Below is the usage information that can be generated by running arStrip --help.
38
38
  --alif BOOL [default=True]
39
39
  Indicates whether to strip alif.
40
40
 
41
- --specialChars BOOL [default=True]
41
+ --special_chars BOOL [default=True]
42
42
  Indicates whether to strip special characters.
43
43
 
44
44
  Examples:
45
45
  ---------
46
46
  .. code-block:: none
47
47
 
48
- arStrip --text "مُختَبَر سينا لحوسبة اللغة!" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
49
- arStrip --file "path/to/your/file.txt" --diacs=True --smallDiacs=False --shaddah=True --digit=False --alif=False --specialChars=False
48
+ arStrip --text "مُختَبَر سينا لحوسبة اللغة!" --diacs=True --small_diacs=False --shaddah=True --digit=False --alif=False --special_chars=False
49
+ arStrip --file "path/to/your/file.txt" --diacs=True --small_diacs=False --shaddah=True --digit=False --alif=False --special_chars=False
50
50
 
51
51
  """
52
52
 
@@ -60,11 +60,11 @@ def main():
60
60
  parser.add_argument('--text', type=str, help='Text to be stripped')
61
61
  parser.add_argument('--file', type=str, help='File containing text to be stripped')
62
62
  parser.add_argument('--diacs', type=bool, default=True, help='Whether to strip diacritics')
63
- parser.add_argument('--smallDiacs', type=bool, default=True, help='Whether to strip small diacritics')
63
+ parser.add_argument('--small_diacs', type=bool, default=True, help='Whether to strip small diacritics')
64
64
  parser.add_argument('--shaddah', type=bool, default=True, help='Whether to strip shaddah')
65
65
  parser.add_argument('--digit', type=bool, default=True, help='Whether to strip digits')
66
66
  parser.add_argument('--alif', type=bool, default=True, help='Whether to strip alif')
67
- parser.add_argument('--specialChars', type=bool, default=True, help='Whether to strip special characters')
67
+ parser.add_argument('--special_chars', type=bool, default=True, help='Whether to strip special characters')
68
68
 
69
69
  args = parser.parse_args()
70
70
 
@@ -76,8 +76,8 @@ def main():
76
76
  print("Either --text or --file argument must be provided.")
77
77
  return
78
78
 
79
- stripped_text = arStrip(text_content, diacs=args.diacs, smallDiacs=args.smallDiacs,
80
- shaddah=args.shaddah, digit=args.digit, alif=args.alif, specialChars=args.specialChars)
79
+ stripped_text = arStrip(text_content, diacs=args.diacs, small_diacs=args.small_diacs,
80
+ shaddah=args.shaddah, digit=args.digit, alif=args.alif, special_chars=args.special_chars)
81
81
 
82
82
  print(stripped_text)
83
83
 
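A short sketch of the renamed keyword arguments (smallDiacs -> small_diacs, specialChars -> special_chars), following the keyword names used by the updated CLI; the flag values are illustrative only:

    # Illustrative call with the 0.1.13 keyword names used by the CLI above.
    from sinatools.utils.parser import arStrip

    stripped = arStrip("مُختَبَر سينا لحوسبة اللغة!",
                       diacs=True, small_diacs=False, shaddah=True,
                       digit=False, alif=False, special_chars=False)
    print(stripped)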
@@ -37,14 +37,6 @@ Examples:
37
37
 
38
38
  implication --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt"
39
39
 
40
- Note:
41
- -----
42
-
43
- .. code-block:: none
44
-
45
- - The results are based on the underlying logic and data sets present in the `Implication` class of SinaTools.
46
- - The tool compares the implication between two words, and the relationship might vary based on linguistic nuances.
47
-
48
40
  """
49
41
  import argparse
50
42
  from sinatools.utils.implication import Implication
@@ -5,14 +5,14 @@ The jaccard tool computes the Jaccard similarity between two sets of strings. Th
5
5
 
6
6
  Usage:
7
7
  ------
8
- Below is the usage information that can be generated by running jaccard --help.
8
+ Below is the usage information that can be generated by running jaccard_similarity --help.
9
9
 
10
10
  .. code-block:: none
11
11
 
12
12
  Usage:
13
- jaccard --list1="WORD1, WORD2" --list2="WORD1,WORD2" --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
13
+ jaccard_similarity --list1="WORD1, WORD2" --list2="WORD1,WORD2" --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
14
14
 
15
- jaccard --file1=File1 --file2=File2 --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
15
+ jaccard_similarity --file1=File1 --file2=File2 --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
16
16
 
17
17
  .. code-block:: none
18
18
 
@@ -39,18 +39,9 @@ Examples:
39
39
 
40
40
  .. code-block:: none
41
41
 
42
- jaccard --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
42
+ jaccard_similarity --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
43
43
 
44
- jaccard --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
45
-
46
- Note:
47
- -----
48
-
49
- .. code-block:: none
50
-
51
- - The Jaccard similarity ranges from 0 to 1. A value of 1 indicates that the sets are identical, while a value of 0 indicates no similarity between the sets.
52
- - Diacritics refer to the Arabic Diacritics (like fatha, damma, kasra, etc.) and shadda.
53
- - The two normalization options can be used individually or together. However, the combination will result in both rules being applied, and thus,
44
+ jaccard_similarity --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
54
45
 
55
46
  """
56
47
 
@@ -14,8 +14,8 @@ Below is the usage information that can be generated by running remove_latin --h
14
14
  Examples:
15
15
  ---------
16
16
  .. code-block:: none
17
- latin_remove --text "123test"
18
- latin_remove --file "path/to/your/file.txt"
17
+ remove_latin --text "123test"
18
+ remove_latin --file "path/to/your/file.txt"
19
19
  """
20
20
 
21
21
  import argparse
@@ -0,0 +1,25 @@
1
+ import argparse
2
+ from sinatools.utils.text_dublication_detector import removal
3
+
4
+ def main():
5
+ parser = argparse.ArgumentParser(description='Processes a CSV file of sentences to identify and remove duplicate sentences based on a specified threshold and cosine similarity. It saves the filtered results and the identified duplicates to separate files.')
6
+
7
+ parser.add_argument('--csv_file', type=str, help='The path to the input CSV file that will be processed.')
8
+ parser.add_argument('--column_name', type=str, help='The name of the column from which duplicates will be removed.')
9
+ parser.add_argument('--final_file_name', type=str, help='The name of the output file that will contain the deduplicated results.')
10
+ parser.add_argument('--deleted_file_name', type=str, help='The name of the output file that will contain the records that were identified as duplicates and removed.')
11
+ parser.add_argument('--similarity_threshold', type=float, default=0.8, help='The similarity threshold for determining duplicates. Records with a similarity score above this value will be considered duplicates (default is 0.8).')
12
+
13
+ args = parser.parse_args()
14
+
15
+ if args.csv_file is None and args.column_name is None:
16
+ print("Either --csv_file or --column_name argument must be provided.")
17
+ return
18
+
19
+ removal(args.csv_file, args.column_name, args.final_file_name, args.deleted_file_name, args.similarity_threshold)
20
+
21
+
22
+ if __name__ == '__main__':
23
+ main()
24
+
25
+ # text_dublication_detector --csv_file "text.csv" --column_name "A" --final_file_name "Final.csv" --deleted_file_name "deleted.csv" --similarity_threshold 0.8
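The commented CLI call above has a direct programmatic equivalent; the argument order follows the removal() call in main():

    # Same operation as the CLI example above, called as a library function.
    from sinatools.utils.text_dublication_detector import removal

    removal("text.csv", "A", "Final.csv", "deleted.csv", 0.8)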
sinatools/VERSION CHANGED
@@ -1 +1 @@
1
- 0.1.11
1
+ 0.1.13
@@ -1,33 +1,31 @@
1
1
  from sinatools.utils.parser import arStrip
2
- import json
3
- from . import dictionary
2
+ from . import five_grams_dict, four_grams_dict , three_grams_dict , two_grams_dict
4
3
 
5
- def ALMA_multi_word(multi_word):
4
+ def ALMA_multi_word(multi_word, n):
6
5
  undiac_multi_word = arStrip(multi_word, True, True, True, False, True, False) # diacs , smallDiacs , shaddah , digit , alif , specialChars
7
6
  result_word = []
8
- if undiac_multi_word in dictionary.keys():
9
- result_word = dictionary[undiac_multi_word]
7
+ if n == 2:
8
+ if undiac_multi_word in two_grams_dict.keys():
9
+ result_word = two_grams_dict[undiac_multi_word]
10
+ elif n == 3:
11
+ if undiac_multi_word in three_grams_dict.keys():
12
+ result_word = three_grams_dict[undiac_multi_word]
13
+ elif n == 4:
14
+ if undiac_multi_word in four_grams_dict.keys():
15
+ result_word = four_grams_dict[undiac_multi_word]
16
+ else:
17
+ if undiac_multi_word in five_grams_dict.keys():
18
+ result_word = five_grams_dict[undiac_multi_word]
10
19
 
11
20
  my_json = {}
12
- glosses_list = []
13
21
  output_list = []
14
- concept_count = 0
15
22
  my_json['multi_word_lemma'] = multi_word
16
23
  my_json['undiac_multi_word_lemma'] = multi_word
17
24
  ids = []
18
25
  if result_word != []:
19
- #my_json['concept_count'] = result_word[0][1] #concept_count
20
- #my_json['POS'] = result_word[0][2] #POS
21
26
  my_json['POS'] = result_word[0][1] #POS
22
-
23
27
  for result in result_word:
24
28
  ids.append(result[3])
25
- #if lemma_id in settings.glosses_dic.keys():
26
- # value = settings.glosses_dic[lemma_id]
27
- # glosses_list.append(json.loads(value[1]))
28
- # concept_count = concept_count + value[0]
29
29
  my_json['ids'] = ids
30
- #my_json['concept_count'] = concept_count
31
- #my_json['glosses'] = glosses_list
32
30
  output_list.append(my_json)
33
31
  return output_list
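ALMA_multi_word() now takes the n-gram length as a second argument and consults the matching dictionary (2, 3, or 4; any other value falls through to the five-gram dictionary). A minimal usage sketch with an illustrative two-word expression:

    # Sketch of the new two-argument signature; the input is illustrative.
    from sinatools.morphology.ALMA_multi_word import ALMA_multi_word

    result = ALMA_multi_word("عبد الله", 2)   # n=2 -> looked up in two_grams_dict
    # Returns a one-element list; the dict always carries the lemma fields and,
    # when the expression is found, its POS and lemma ids as well.
    print(result)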
@@ -3,40 +3,41 @@ from sinatools.DataDownload import downloader
3
3
  import os
4
4
 
5
5
  dictionary = {}
6
+ five_grams_dict = {}
7
+ four_grams_dict = {}
8
+ three_grams_dict = {}
9
+ two_grams_dict = {}
10
+
6
11
  filename = 'lemmas_dic.pickle'
7
12
  path = downloader.get_appdatadir()
8
13
  file_path = os.path.join(path, filename)
9
14
  with open(file_path, 'rb') as f:
10
15
  dictionary = pickle.load(f)
11
16
 
12
- #filename_five = 'five_grams.pickle'
13
- #path =downloader.get_appdatadir()
14
- #file_path = os.path.join(path, filename_five)
15
- #with open(file_path, 'rb') as f:
16
- # #Load the serialized data from the file
17
- # settings.five_grams_dict = pickle.load(f, encoding='utf-8')
18
- #
19
- #
20
- #filename_four = 'four_grams.pickle'
21
- #path =downloader.get_appdatadir()
22
- #file_path = os.path.join(path, filename_four)
23
- #with open(file_path, 'rb') as f:
24
- # #Load the serialized data from the file
25
- # settings.four_grams_dict = pickle.load(f, encoding='utf-8')
26
- #
27
- #
28
- #filename_three = 'three_grams.pickle'
29
- #path =downloader.get_appdatadir()
30
- #file_path = os.path.join(path, filename_three)
31
- #with open(file_path, 'rb') as f:
32
- # #Load the serialized data from the file
33
- # settings.three_grams_dict = pickle.load(f, encoding='utf-8')
34
- #
35
- #
36
- #filename_two = 'two_grams.pickle'
37
- #path =downloader.get_appdatadir()
38
- #file_path = os.path.join(path, filename_two)
39
- #with open(file_path, 'rb') as f:
40
- # #Load the serialized data from the file
41
- # settings.two_grams_dict = pickle.load(f, encoding='utf-8')
42
- #
17
+ filename_five = 'five_grams.pickle'
18
+ path =downloader.get_appdatadir()
19
+ file_path = os.path.join(path, filename_five)
20
+ with open(file_path, 'rb') as f:
21
+ five_grams_dict = pickle.load(f, encoding='utf-8')
22
+
23
+
24
+ filename_four = 'four_grams.pickle'
25
+ path =downloader.get_appdatadir()
26
+ file_path = os.path.join(path, filename_four)
27
+ with open(file_path, 'rb') as f:
28
+ four_grams_dict = pickle.load(f, encoding='utf-8')
29
+
30
+
31
+ filename_three = 'three_grams.pickle'
32
+ path =downloader.get_appdatadir()
33
+ file_path = os.path.join(path, filename_three)
34
+ with open(file_path, 'rb') as f:
35
+ three_grams_dict = pickle.load(f, encoding='utf-8')
36
+
37
+
38
+ filename_two = 'two_grams.pickle'
39
+ path =downloader.get_appdatadir()
40
+ file_path = os.path.join(path, filename_two)
41
+ with open(file_path, 'rb') as f:
42
+ two_grams_dict = pickle.load(f, encoding='utf-8')
43
+
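The four pickle loads above are now active and differ only in the file name; an equivalent, more compact form (a sketch, not the code shipped in 0.1.13) would be:

    # Equivalent loading loop: same pickles, same data, collected in one dict.
    import os
    import pickle
    from sinatools.DataDownload import downloader

    path = downloader.get_appdatadir()
    ngram_dicts = {}
    for name in ("two_grams", "three_grams", "four_grams", "five_grams"):
        with open(os.path.join(path, name + ".pickle"), "rb") as f:
            ngram_dicts[name] = pickle.load(f, encoding="utf-8")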
sinatools/ner/__init__.py CHANGED
@@ -1,6 +1,12 @@
1
1
  from sinatools.DataDownload import downloader
2
2
  import os
3
- from sinatools.ner.utils.helpers import load_checkpoint
3
+ from sinatools.ner.helpers import load_object
4
+ import pickle
5
+ import os
6
+ import torch
7
+ import pickle
8
+ import json
9
+ from argparse import Namespace
4
10
 
5
11
  tagger = None
6
12
  tag_vocab = None
@@ -9,4 +15,24 @@ train_config = None
9
15
  filename = 'Wj27012000.tar'
10
16
  path =downloader.get_appdatadir()
11
17
  model_path = os.path.join(path, filename)
12
- tagger, tag_vocab, train_config = load_checkpoint(model_path)
18
+
19
+ _path = os.path.join(model_path, "tag_vocab.pkl")
20
+
21
+ with open(_path, "rb") as fh:
22
+ tag_vocab = pickle.load(fh)
23
+
24
+ train_config = Namespace()
25
+ args_path = os.path.join(model_path, "args.json")
26
+
27
+ with open(args_path, "r") as fh:
28
+ train_config.__dict__ = json.load(fh)
29
+
30
+ model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
31
+ model = torch.nn.DataParallel(model)
32
+
33
+ if torch.cuda.is_available():
34
+ model = model.cuda()
35
+
36
+ train_config.trainer_config["kwargs"]["model"] = model
37
+ tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
38
+ tagger.load(os.path.join(model_path,"checkpoints"))
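The old single load_checkpoint() call is replaced by explicit loading of the tag vocabulary, the training args, the model, and the trainer. load_object() is presumably a small factory that resolves a dotted path and instantiates it with kwargs; the sketch below is a guess at that behaviour, not code copied from sinatools.ner.helpers:

    # Assumed shape of load_object (a guess, not the packaged implementation):
    # resolve "module.Class" and instantiate it with the given kwargs.
    from importlib import import_module

    def load_object(fn, kwargs):
        module_name, class_name = fn.rsplit(".", 1)
        cls = getattr(import_module(module_name), class_name)
        return cls(**kwargs)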
@@ -0,0 +1 @@
1
+ from sinatools.ner.data.datasets import NestedTagsDataset