SinaTools 0.1.11__py2.py3-none-any.whl → 0.1.13__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/METADATA +2 -3
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/RECORD +47 -26
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/entry_points.txt +7 -3
- sinatools/CLI/DataDownload/download_files.py +0 -10
- sinatools/CLI/ner/corpus_entity_extractor.py +9 -6
- sinatools/CLI/ner/entity_extractor.py +18 -42
- sinatools/CLI/utils/arStrip.py +8 -8
- sinatools/CLI/utils/implication.py +0 -8
- sinatools/CLI/utils/jaccard.py +5 -14
- sinatools/CLI/utils/remove_latin.py +2 -2
- sinatools/CLI/utils/text_dublication_detector.py +25 -0
- sinatools/VERSION +1 -1
- sinatools/morphology/ALMA_multi_word.py +14 -16
- sinatools/morphology/__init__.py +32 -31
- sinatools/ner/__init__.py +28 -2
- sinatools/ner/data/__init__.py +1 -0
- sinatools/ner/data/datasets.py +146 -0
- sinatools/ner/data/transforms.py +118 -0
- sinatools/ner/data.py +124 -0
- sinatools/ner/data_format.py +124 -0
- sinatools/ner/datasets.py +146 -0
- sinatools/ner/entity_extractor.py +34 -54
- sinatools/ner/helpers.py +86 -0
- sinatools/ner/metrics.py +69 -0
- sinatools/ner/nn/BaseModel.py +22 -0
- sinatools/ner/nn/BertNestedTagger.py +34 -0
- sinatools/ner/nn/BertSeqTagger.py +17 -0
- sinatools/ner/nn/__init__.py +3 -0
- sinatools/ner/trainers/BaseTrainer.py +117 -0
- sinatools/ner/trainers/BertNestedTrainer.py +203 -0
- sinatools/ner/trainers/BertTrainer.py +163 -0
- sinatools/ner/trainers/__init__.py +3 -0
- sinatools/ner/transforms.py +119 -0
- sinatools/semantic_relatedness/__init__.py +20 -0
- sinatools/semantic_relatedness/compute_relatedness.py +31 -0
- sinatools/synonyms/__init__.py +18 -0
- sinatools/synonyms/synonyms_generator.py +192 -0
- sinatools/utils/text_dublication_detector.py +110 -0
- sinatools/wsd/__init__.py +11 -0
- sinatools/{salma/views.py → wsd/disambiguator.py} +135 -94
- sinatools/{salma → wsd}/wsd.py +1 -1
- sinatools/CLI/salma/salma_tools.py +0 -68
- sinatools/salma/__init__.py +0 -12
- sinatools/utils/utils.py +0 -2
- {SinaTools-0.1.11.data → SinaTools-0.1.13.data}/data/sinatools/environment.yml +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/AUTHORS.rst +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/LICENSE +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/WHEEL +0 -0
- {SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/top_level.txt +0 -0
- /sinatools/{salma → wsd}/settings.py +0 -0
{SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SinaTools
-Version: 0.1.11
+Version: 0.1.13
 Summary: Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
 Home-page: https://github.com/SinaLab/sinatools
 License: MIT license
@@ -19,9 +19,8 @@ Requires-Dist: torchtext (==0.14.0)
 Requires-Dist: torchvision (==0.14.0)
 Requires-Dist: seqeval (==1.2.2)
 Requires-Dist: natsort (==7.1.1)
-Requires-Dist: pandas (==1.2.4)
 
-
+SinaTools
 ---------
 
 Open-source Python toolkit for Arabic Natural Understanding, allowing people to integrate it in their system workflow.
{SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/RECORD
CHANGED
@@ -1,23 +1,23 @@
-SinaTools-0.1.
-sinatools/VERSION,sha256=
+SinaTools-0.1.13.data/data/sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
+sinatools/VERSION,sha256=7lGv2l4eJuZteaVLIUnlbwoi4W41EwZ01RPRCjudlCI,6
 sinatools/__init__.py,sha256=bEosTU1o-FSpyytS6iVP_82BXHF2yHnzpJxPLYRbeII,135
 sinatools/environment.yml,sha256=OzilhLjZbo_3nU93EQNUFX-6G5O3newiSWrwxvMH2Os,7231
 sinatools/install_env.py,sha256=EODeeE0ZzfM_rz33_JSIruX03Nc4ghyVOM5BHVhsZaQ,404
 sinatools/sinatools.py,sha256=vR5AaF0iel21LvsdcqwheoBz0SIj9K9I_Ub8M8oA98Y,20
-sinatools/CLI/DataDownload/download_files.py,sha256=
+sinatools/CLI/DataDownload/download_files.py,sha256=tkH293ZUSvlvyZClkJmxfNk1x-C3XnlQCTVGdLYCKu0,1946
 sinatools/CLI/morphology/ALMA_multi_word.py,sha256=ZImJ1vtcpSHydI1BjJmK3KcMJbGBZX16kO4L6rxvBvA,2086
 sinatools/CLI/morphology/morph_analyzer.py,sha256=ieIM47QK9Nct3MtCS9uq3h2rZN5r4qNhsLmlVeE6wiE,3503
-sinatools/CLI/ner/corpus_entity_extractor.py,sha256=
-sinatools/CLI/ner/entity_extractor.py,sha256=
-sinatools/CLI/salma/salma_tools.py,sha256=8IDMSXjpM2u8jXc6c5JcI_l2CmiwdCxsUBJVN1Rrfk0,1971
+sinatools/CLI/ner/corpus_entity_extractor.py,sha256=_o0frMSgpsFVXPoztS3mQTK7LjHsgzUv9gfs6iJL424,4024
+sinatools/CLI/ner/entity_extractor.py,sha256=QFGkavZz8ZZGetMTXiTH_OeoN9B2Iyx60EKCYdFtoDY,2811
 sinatools/CLI/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-sinatools/CLI/utils/arStrip.py,sha256=
+sinatools/CLI/utils/arStrip.py,sha256=NLyp8vOu2xv80tL9jiKRvyptmbkRZVg-wcAr-9YyvNY,3264
 sinatools/CLI/utils/corpus_tokenizer.py,sha256=nH0T4h6urr_0Qy6-wN3PquOtnwybj0REde5Ts_OE4U8,1650
-sinatools/CLI/utils/implication.py,sha256=
-sinatools/CLI/utils/jaccard.py,sha256=
-sinatools/CLI/utils/remove_latin.py,sha256=
+sinatools/CLI/utils/implication.py,sha256=nvoiI5UHHaJdd6MICql0pB_-h3L0icYwP1WgJi2h7p0,2854
+sinatools/CLI/utils/jaccard.py,sha256=NoKbWAq6dHDtQ56mAc1kdAnROm8NXEjZ1ecVZ7EYm6Y,4205
+sinatools/CLI/utils/remove_latin.py,sha256=NOaTm2RHxt5IQrV98ySTmD8rTXTmcqSmfbPAwTyaXqU,848
 sinatools/CLI/utils/remove_punctuation.py,sha256=vJAZlEn7WGftZAFVFYnddkRrxdJ_rMmKB9vFZkY-jN4,1097
 sinatools/CLI/utils/sentence_tokenizer.py,sha256=Wli8eiDbWSd_Z8UKpu_JkaS8jImowa1vnRL0oYCSfqw,2823
+sinatools/CLI/utils/text_dublication_detector.py,sha256=dW70O5O20GxeUDDF6zVYn52wWLmJF-HBZgvqIeVL2rQ,1661
 sinatools/CLI/utils/text_transliteration.py,sha256=vz-3kxWf8pNYVCqNAtBAiA6u_efrS5NtWT-ofN1NX6I,2014
 sinatools/DataDownload/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinatools/DataDownload/downloader.py,sha256=MbTPqqlg5vOTErxeVvdMn5k0TsYaG6kef2zHkeBLXlk,6480
@@ -73,29 +73,50 @@ sinatools/arabert/aragpt2/grover/modeling_gpt2.py,sha256=WFpCWn1792yATFzt8rZ0rpW
 sinatools/arabert/aragpt2/grover/optimization_adafactor.py,sha256=1geOsCWuv5xxtSnKDz9a8aY5SVwZ1MGq-xVQDBg4Gpg,9765
 sinatools/arabert/aragpt2/grover/train_tpu.py,sha256=qNgLI_j6-KYkTMJfVoFlh4NIKweY1aPz1XPDw6odld0,7102
 sinatools/arabert/aragpt2/grover/utils.py,sha256=V5wMUxK03r5g_pb7R3_uGLOPqQJfbIB0VaJ8ZDM4XAo,8473
-sinatools/morphology/ALMA_multi_word.py,sha256=
-sinatools/morphology/__init__.py,sha256=
+sinatools/morphology/ALMA_multi_word.py,sha256=hj_-8ojrYYHnfCGk8WKtJdUR8mauzQdma4WUm-okDps,1346
+sinatools/morphology/__init__.py,sha256=I4wVBh8BhyNl-CySVdiI_nUSn6gj1j-gmLKP300RpE0,1216
 sinatools/morphology/morph_analyzer.py,sha256=tA78gWg6iaE_G1c2xqxZoXZWNbvHBJLrTSxPyir5Xn8,6941
-sinatools/ner/__init__.py,sha256=
-sinatools/ner/
-sinatools/
-sinatools/
-sinatools/
-sinatools/
+sinatools/ner/__init__.py,sha256=gSs0x6veWJ8j3_iOs79tynBd_hJP0t44CGpJ0xzoiW4,1048
+sinatools/ner/data.py,sha256=lvOW86dXse8SC75Q0supQaE0rrRffoxNjIA0Qbv5WZY,4354
+sinatools/ner/data_format.py,sha256=7Yt0aOicOn9_YuuyCkM_IYi_rgjGYxR9bCuUaNGM73o,4341
+sinatools/ner/datasets.py,sha256=mG1iwqSm3lXCFHLqE-b4wNi176cpuzNBz8tKaBU6z6M,5059
+sinatools/ner/entity_extractor.py,sha256=fwaPmbg3RaohQu9uu9rMXlvamCnv3am1EYjQMG6tuyY,2270
+sinatools/ner/helpers.py,sha256=dnOoDY5JMyOLTUWVIZLMt8mBn2IbWlVaqHhQyjs1voo,2343
+sinatools/ner/metrics.py,sha256=Irz6SsIvpOzGIA2lWxrEV86xnTnm0TzKm9SUVT4SXUU,2734
+sinatools/ner/transforms.py,sha256=vti3mDdi-IRP8i0aTQ37QqpPlP9hdMmJ6_bAMa0uL-s,4871
+sinatools/ner/data/__init__.py,sha256=W0C1ge_XxTfmdEGz0hkclz57aLI5VFS5t6BjByCfkFk,57
+sinatools/ner/data/datasets.py,sha256=lcdDDenFMEKIGYQmfww2dk_9WKWrJO9HtKptaAEsRmY,5064
+sinatools/ner/data/transforms.py,sha256=URMz1dHzkHjgUGAkDOenCWvQThO1ha8XeQVjoLL9RXM,4874
+sinatools/ner/nn/BaseModel.py,sha256=3GmujQasTZZunOBuFXpY2p1W8W256iI_Uu4hxhOY2Z0,608
+sinatools/ner/nn/BertNestedTagger.py,sha256=_fwAn1kiKmXe6m5y16Ipty3kvXIEFEmiUq74Ad1818U,1219
+sinatools/ner/nn/BertSeqTagger.py,sha256=dFcBBiMw2QCWsyy7aQDe_PS3aRuNn4DOxKIHgTblFvc,504
+sinatools/ner/nn/__init__.py,sha256=UgQD_XLNzQGBNSYc_Bw1aRJZjq4PJsnMT1iZwnJemqE,170
+sinatools/ner/trainers/BaseTrainer.py,sha256=oZgFJW-CawfCKT5gtaBHA7Q7XjNfiyqM62KnFsgVzPU,3919
+sinatools/ner/trainers/BertNestedTrainer.py,sha256=Pb4O2WeBmTvV3hHMT6DXjxrTzgtuh3OrKQZnogYy8RQ,8429
+sinatools/ner/trainers/BertTrainer.py,sha256=B_uVtUwfv_eFwMMPsKQvZgW_ZNLy6XEsX5ePR0s8d-k,6433
+sinatools/ner/trainers/__init__.py,sha256=UDok8pDDpYOpwRBBKVLKaOgSUlmqqb-zHZI1p0xPxzI,188
+sinatools/semantic_relatedness/__init__.py,sha256=S0xrmqtl72L02N56nbNMudPoebnYQgsaIyyX-587DsU,830
+sinatools/semantic_relatedness/compute_relatedness.py,sha256=JvI0cXgukKtuMpmAygMnlocCsPeAJ98LD1jZCP_6SyQ,1110
+sinatools/synonyms/__init__.py,sha256=d4Xq8iFpuXojJ5HL1OpMQvYigNdU601oxwjt9iighOU,568
+sinatools/synonyms/synonyms_generator.py,sha256=FgAiuduSFyM6vJobWZKHg4KNWIQz8T6MGBPVIuVuw-8,6506
 sinatools/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sinatools/utils/charsets.py,sha256=rs82oZJqRqosZdTKXfFAJfJ5t4PxjMM_oAPsiWSWuwU,2817
 sinatools/utils/implication.py,sha256=MsbI6S1LNY-fCxGMxFTuaV639r3QijkkdcfH48rvY7A,27804
 sinatools/utils/jaccard.py,sha256=S7OgvaMqkN5HFgTZkKhMCNAuAnQ0LhRyXPN79jAzmKM,10113
 sinatools/utils/parser.py,sha256=CPPtCrsbxUqsjhY5C9wTOgkAs6iw0k_WvMUxLEPM1IU,6168
 sinatools/utils/readfile.py,sha256=xE4LEaCqXJIk9v37QUSSmWb-aY3UnCFUNb7uVdx3cpM,133
+sinatools/utils/text_dublication_detector.py,sha256=6yAOUtdw4TKiJkUPDDi3oK7CEoIuBDbliJ4PU7kapfo,4249
 sinatools/utils/text_transliteration.py,sha256=NQoXrxI-h0UXnvVtDA3skNJduxIy0IW26r46N4tDxGk,8766
 sinatools/utils/tokenizer.py,sha256=QHyrVqJA_On4rKxexiWR2ovq4pI1-u6iZkdhRbK9tew,6676
 sinatools/utils/tokenizers_words.py,sha256=efNfOil9qDNVJ9yynk_8sqf65PsL-xtsHG7y2SZCkjQ,656
-sinatools/
-
-
-
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
-SinaTools-0.1.
+sinatools/wsd/__init__.py,sha256=5Ondsp-Xe9YxVjRlTc4nLrxu6xiyML7B3bQ3EZ44uEM,327
+sinatools/wsd/disambiguator.py,sha256=BUiIXLd8b9tdZqThBiwacfSZtTkRx9LNnqegibmlbFA,20008
+sinatools/wsd/settings.py,sha256=b_AqTxVWALuGXnsMd9KhnnwIo9-JEoWOTekB-7_xJCU,1111
+sinatools/wsd/wsd.py,sha256=gHIBUFXegoY1z3rRnIlK6TduhYq2BTa_dHakOjOlT4k,4434
+SinaTools-0.1.13.dist-info/AUTHORS.rst,sha256=aTWeWlIdfLi56iLJfIUAwIrmqDcgxXKLji75_Fjzjyg,174
+SinaTools-0.1.13.dist-info/LICENSE,sha256=uwsKYG4TayHXNANWdpfMN2lVW4dimxQjA_7vuCVhD70,1088
+SinaTools-0.1.13.dist-info/METADATA,sha256=OAlD6n0C6DAbu4Kf9Foys8qSgyPzcwk-jwkklU6QkzA,953
+SinaTools-0.1.13.dist-info/WHEEL,sha256=6T3TYZE4YFi2HTS1BeZHNXAi8N52OZT4O-dJ6-ome_4,116
+SinaTools-0.1.13.dist-info/entry_points.txt,sha256=ZwZLolnWog2fjdDrfaHNHob8SE_YtMbD6ayzsOzItxs,1234
+SinaTools-0.1.13.dist-info/top_level.txt,sha256=8tNdPTeJKw3TQCaua8IJIx6N6WpgZZmVekf1OdBNJpE,10
+SinaTools-0.1.13.dist-info/RECORD,,
{SinaTools-0.1.11.dist-info → SinaTools-0.1.13.dist-info}/entry_points.txt
CHANGED
@@ -2,17 +2,21 @@
 alma_multi_word = sinatools.CLI.morphology.ALMA_multi_word:main
 appdatadir = sinatools.CLI.DataDownload.get_appdatadir:main
 arStrip = sinatools.CLI.utils.arStrip:main
-
-arabi_ner2 = sinatools.CLI.ner.corpus_entity_extractor:main
+corpus_entity_extractor = sinatools.CLI.ner.corpus_entity_extractor:main
 corpus_tokenizer = sinatools.CLI.utils.corpus_tokenizer:main
 download_files = sinatools.CLI.DataDownload.download_files:main
+entity_extractor = sinatools.CLI.ner.entity_extractor:main
+evaluate_synonyms = sinatools.CLI.synonyms.evaluate_synonyms:main
+extend_synonyms = sinatools.CLI.synonyms.extend_synonyms:main
 implication = sinatools.CLI.utils.implication:main
 install_env = sinatools.install_env:main
 jaccard_similarity = sinatools.CLI.utils.jaccard:main
 morphology_analyzer = sinatools.CLI.morphology.morph_analyzer:main
 remove_latin = sinatools.CLI.utils.remove_latin:main
 remove_punctuation = sinatools.CLI.utils.remove_punctuation:main
-
+semantic_relatedness = sinatools.CLI.semantic_relatedness.compute_relatedness:main
 sentence_tokenizer = sinatools.CLI.utils.sentence_tokenizer:main
+text_dublication_detector = sinatools.CLI.utils.text_dublication_detector:main
 transliterate = sinatools.CLI.utils.text_transliteration:main
+wsd = sinatools.CLI.wsd.disambiguator:main
 
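For reference, the console scripts registered by this wheel can be listed at runtime. A minimal sketch using only the standard library (not part of SinaTools; entry_points(group=...) requires Python 3.10 or newer):

    # List the SinaTools console scripts installed with the wheel.
    from importlib.metadata import entry_points

    for ep in entry_points(group="console_scripts"):
        if ep.value.startswith("sinatools."):
            print(ep.name, "->", ep.value)  # e.g. entity_extractor -> sinatools.CLI.ner.entity_extractor:main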
sinatools/CLI/DataDownload/download_files.py
CHANGED
@@ -29,16 +29,6 @@ Examples:
 download_files -f morph ner
 This command will download only the `morph` and `ner` files to the default directory.
 
-Note:
------
-
-.. code-block:: none
-
-- The script automatically handles the extraction of zip and tar.gz files after downloading.
-- Ensure you have the necessary permissions to write to the specified directory.
-- The default download directory is based on the operating system and can be obtained using the `get_appdatadir` function.
-
-
 """
 
 import argparse
sinatools/CLI/ner/corpus_entity_extractor.py
CHANGED
@@ -4,20 +4,23 @@ from sinatools.utils.tokenizer import sentence_tokenizer
 from sinatools.utils.tokenizers_words import simple_word_tokenize
 import pandas as pd
 import argparse
-from sinatools.ner.entity_extractor import
+from sinatools.ner.entity_extractor import extract
 
 """
-
+This tool processes a csv file and returns named entites for each token within the text, based on the specified batch size. As follows:
 
 Usage:
 ------
 Run the script with the following command:
 
-
+corpus_entity_extractor input.csv --text-columns "TextColumn1,TextColumn2" --additional-columns "Column3,Column4" --output-csv output.csv
 """
 
-def
-
+def jsons_to_list_of_lists(json_list):
+    return [[d['token'], d['tags']] for d in json_list]
+
+def combine_tags(sentence):
+    output = jsons_to_list_of_lists(extract(sentence))
     return [word[1] for word in output]
 
 
@@ -40,7 +43,7 @@ def corpus_tokenizer(input_csv, output_csv, text_column, additional_columns, row
         words = simple_word_tokenize(sentence)
         global_sentence_id += 1
 
-        tags =
+        tags = combine_tags(sentence)
         for word_position, word in enumerate(words, start=1):
             row_id += 1
             doc_sentence_filename = input_csv.split(".csv")[0]
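Judging from the jsons_to_list_of_lists helper above, extract() returns one dictionary per token with 'token' and 'tags' keys, and combine_tags keeps only the tag column. A minimal sketch of that flattening; the sentence and the tag strings are made-up illustrations, not real model output:

    from sinatools.ner.entity_extractor import extract

    # Hypothetical output shape, inferred from jsons_to_list_of_lists:
    # [{'token': 'ولد', 'tags': 'O'}, {'token': 'محمود', 'tags': 'B-PERS'}, ...]
    output = extract("ولد محمود درويش في قرية البروة")
    pairs = [[d['token'], d['tags']] for d in output]  # jsons_to_list_of_lists
    tags = [pair[1] for pair in pairs]                  # combine_tags keeps only the tags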
sinatools/CLI/ner/entity_extractor.py
CHANGED
@@ -1,16 +1,16 @@
 """
 About:
 ------
-
+This tool processes an input text and returns named entites for each token within the text, based on the specified batch size. As follows:
 
 Usage:
 ------
-Below is the usage information that can be generated by running
+Below is the usage information that can be generated by running entity_extractor --help.
 
 .. code-block:: none
 
-
-
+entity_extractor --text=INPUT_TEXT
+entity_extractor --dir=INPUT_FILE --output_csv=OUTPUT_FILE_NAME
 
 Options:
 --------
@@ -23,37 +23,28 @@ Options:
 File containing the text to be analyzed for Named Entity Recognition.
 --output_csv OUTPUT_FILE_NAME
 A file containing the tokenized text and its Named Entity tags.
-Examples:
----------
 
-.. code-block:: none
 
-
-
-
-Note:
------
+Examples:
+---------
 
 .. code-block:: none
 
-
-
-- The quality and accuracy of the analysis depend on the underlying capabilities of the ArabiNER utility.
+entity_extractor --text "Your text here"
+entity_extractor --dir "/path/to/your/directory" --output_csv "output.csv"
 
 """
 
 import argparse
 import json
 import pandas as pd
-from sinatools.ner.entity_extractor import
+from sinatools.ner.entity_extractor import extract
 from sinatools.utils.tokenizer import corpus_tokenizer
 from sinatools.utils.tokenizers_words import simple_word_tokenize
 
 
-def
-
-    output = ner(sentence)
-    ##print("ner output : ", output)
+def combine_tags(sentence):
+    output = extract(sentence)
     return [word[1] for word in output]
 
 
@@ -67,7 +58,7 @@ def main():
     args = parser.parse_args()
 
     if args.text is not None:
-        results =
+        results = extract(args.text)
         # Print the results in JSON format
         print(json.dumps(results, ensure_ascii=False, indent=4))
     elif args.dir is not None:
@@ -76,28 +67,16 @@ def main():
         df['NER tags'] = None
         i = 0
 
-        # Use drop_duplicates to get unique values based on Row_ID and Sentence
         result = df.drop_duplicates(subset=['Global Sentence ID', 'Sentence'])
-
-        # Get the "Sentence" column as an array
         unique_sentences = result['Sentence'].to_numpy()
-
-
-
-        #print("#############")
-
-        for sentence in unique_sentences: # iterating over unique sentences
-            #print(" Sentence : ", simple_word_tokenize(sentence), len(simple_word_tokenize(sentence)))
-            ner_tags = infer(sentence) # getting all NER tags for the sentence
-            #if len(ner_tags) != len(df[i:i+len(ner_tags)]):
-            # print("Not Equal...", len(ner_tags) , len(df[i:i+len(ner_tags)]))
-            # return
+
+        for sentence in unique_sentences:
+            ner_tags = combine_tags(sentence)
             if len(simple_word_tokenize(sentence)) > 300:
                 print(" Length of this sentence is more than 300 word: ", sentence)
                 return
-
-            df.loc[i:i+len(ner_tags)-1, 'NER tags'] = ner_tags
-            #print("Exit with ner tags = ", ner_tags, " and length : ", len(ner_tags), type(len(ner_tags)), " and df is " , df[i:i+len(ner_tags)], " with length : ", len(df[i:i+len(ner_tags)]), type(len(df[i:i+len(ner_tags)])), " i:i+len(ner_tags) : ", i," , ", i+len(ner_tags))
+
+            df.loc[i:i+len(ner_tags)-1, 'NER tags'] = ner_tags
             i = i + len(ner_tags)
 
         df.to_csv(args.output_csv, index=False)
@@ -107,7 +86,4 @@ def main():
 
 
 if __name__ == '__main__':
-    main()
-
-#arabi_ner --text "Your text here."
-#arabi_ner --dir /path/to/your/directory --output_csv output.csv
+    main()
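The --text path of main() above is equivalent to calling the library directly. A minimal sketch mirroring those lines (it assumes the NER model files have already been fetched with download_files):

    import json
    from sinatools.ner.entity_extractor import extract

    results = extract("Your text here")                       # same call main() makes for --text
    print(json.dumps(results, ensure_ascii=False, indent=4))  # same JSON output as the CLI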
sinatools/CLI/utils/arStrip.py
CHANGED
@@ -26,7 +26,7 @@ Below is the usage information that can be generated by running arStrip --help.
 --diacs BOOL [default=True]
 Indicates whether to strip diacritics.
 
---
+--small_diacs BOOL [default=True]
 Indicates whether to strip small diacritics.
 
 --shaddah BOOL [default=True]
@@ -38,15 +38,15 @@ Below is the usage information that can be generated by running arStrip --help.
 --alif BOOL [default=True]
 Indicates whether to strip alif.
 
---
+--special_chars BOOL [default=True]
 Indicates whether to strip special characters.
 
 Examples:
 ---------
 .. code-block:: none
 
-arStrip --text "مُختَبَر سينا لحوسبة اللغة!" --diacs=True --
-arStrip --file "path/to/your/file.txt" --diacs=True --
+arStrip --text "مُختَبَر سينا لحوسبة اللغة!" --diacs=True --small_diacs=False --shaddah=True --digit=False --alif=False --special_chars=False
+arStrip --file "path/to/your/file.txt" --diacs=True --small_diacs=False --shaddah=True --digit=False --alif=False --special_chars=False
 
 """
 
@@ -60,11 +60,11 @@ def main():
     parser.add_argument('--text', type=str, help='Text to be stripped')
     parser.add_argument('--file', type=str, help='File containing text to be stripped')
     parser.add_argument('--diacs', type=bool, default=True, help='Whether to strip diacritics')
-    parser.add_argument('--
+    parser.add_argument('--small_diacs', type=bool, default=True, help='Whether to strip small diacritics')
     parser.add_argument('--shaddah', type=bool, default=True, help='Whether to strip shaddah')
     parser.add_argument('--digit', type=bool, default=True, help='Whether to strip digits')
     parser.add_argument('--alif', type=bool, default=True, help='Whether to strip alif')
-    parser.add_argument('--
+    parser.add_argument('--special_chars', type=bool, default=True, help='Whether to strip special characters')
 
     args = parser.parse_args()
 
@@ -76,8 +76,8 @@ def main():
         print("Either --text or --file argument must be provided.")
         return
 
-    stripped_text = arStrip(text_content, diacs=args.diacs,
-        shaddah=args.shaddah, digit=args.digit, alif=args.alif,
+    stripped_text = arStrip(text_content, diacs=args.diacs, small_diacs=args.small_diacs,
+        shaddah=args.shaddah, digit=args.digit, alif=args.alif, special_chars=args.special_chars)
 
     print(stripped_text)
 
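The two renamed options map one-to-one onto keyword arguments of sinatools.utils.parser.arStrip, as the updated call in main() shows. A minimal library-level sketch mirroring the CLI example above (keyword names taken from that call; nothing else is assumed):

    from sinatools.utils.parser import arStrip

    # Same switches as the CLI example; keyword names come from the updated call in main().
    stripped = arStrip("مُختَبَر سينا لحوسبة اللغة!",
                       diacs=True, small_diacs=False, shaddah=True,
                       digit=False, alif=False, special_chars=False)
    print(stripped)

Note that argparse's type=bool converts any non-empty string to True, so --small_diacs=False on the command line still evaluates to True; the keyword arguments in the library call behave as expected.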
sinatools/CLI/utils/implication.py
CHANGED
@@ -37,14 +37,6 @@ Examples:
 
 implication --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt"
 
-Note:
------
-
-.. code-block:: none
-
-- The results are based on the underlying logic and data sets present in the `Implication` class of SinaTools.
-- The tool compares the implication between two words, and the relationship might vary based on linguistic nuances.
-
 """
 import argparse
 from sinatools.utils.implication import Implication
sinatools/CLI/utils/jaccard.py
CHANGED
@@ -5,14 +5,14 @@ The jaccard tool computes the Jaccard similarity between two sets of strings. Th
 
 Usage:
 ------
-Below is the usage information that can be generated by running
+Below is the usage information that can be generated by running jaccard_similarity --help.
 
 .. code-block:: none
 
 Usage:
-
+jaccard_similarity --list1="WORD1, WORD2" --list2="WORD1,WORD2" --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
 
-
+jaccard_similarity --file1=File1 --file2=File2 --delimiter="DELIMITER" --selection="SELECTION" [OPTIONS]
 
 .. code-block:: none
 
@@ -39,18 +39,9 @@ Examples:
 
 .. code-block:: none
 
-
+jaccard_similarity --list1 "word1,word2" --list2 "word1, word2" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
 
-
-
-Note:
------
-
-.. code-block:: none
-
-- The Jaccard similarity ranges from 0 to 1. A value of 1 indicates that the sets are identical, while a value of 0 indicates no similarity between the sets.
-- Diacritics refer to the Arabic Diacritics (like fatha, damma, kasra, etc.) and shadda.
-- The two normalization options can be used individually or together. However, the combination will result in both rules being applied, and thus,
+jaccard_similarity --file1 "path/to/your/file1.txt" --file2 "path/to/your/file2.txt" --delimiter "," --selection "jaccardAll" --ignoreAllDiacriticsButNotShadda --ignoreShaddaDiacritic
 
 """
 
sinatools/CLI/utils/remove_latin.py
CHANGED
@@ -14,8 +14,8 @@ Below is the usage information that can be generated by running remove_latin --h
 Examples:
 ---------
 .. code-block:: none
-
-
+remove_latin --text "123test"
+remove_latin --file "path/to/your/file.txt"
 """
 
 import argparse
sinatools/CLI/utils/text_dublication_detector.py
ADDED
@@ -0,0 +1,25 @@
+import argparse
+from sinatools.utils.text_dublication_detector import removal
+
+def main():
+    parser = argparse.ArgumentParser(description='Processes a CSV file of sentences to identify and remove duplicate sentences based on a specified threshold and cosine similarity. It saves the filtered results and the identified duplicates to separate files.')
+
+    parser.add_argument('--csv_file', type=str, help='The path to the input CSV file that will be processed.')
+    parser.add_argument('--column_name', type=str, help='The name of the column from which duplicates will be removed.')
+    parser.add_argument('--final_file_name', type=str, help='The name of the output file that will contain the deduplicated results.')
+    parser.add_argument('--deleted_file_name', type=str, help='The name of the output file that will contain the records that were identified as duplicates and removed.')
+    parser.add_argument('--similarity_threshold', type=float, default=0.8, help='The similarity threshold for determining duplicates. Records with a similarity score above this value will be considered duplicates (default is 0.8).')
+
+    args = parser.parse_args()
+
+    if args.csv_file is None and args.column_name is None:
+        print("Either --csv_file or --column_name argument must be provided.")
+        return
+
+    removal(args.csv_file, args.column_name, args.final_file_name, args.deleted_file_name, args.similarity_threshold)
+
+
+if __name__ == '__main__':
+    main()
+
+# text_dublication_detector --csv_file "text.csv" --column_name "A" --final_file_name "Final.csv" --deleted_file_name "deleted.csv" --similarity_threshold 0.8
sinatools/VERSION
CHANGED
@@ -1 +1 @@
-0.1.11
+0.1.13
sinatools/morphology/ALMA_multi_word.py
CHANGED
@@ -1,33 +1,31 @@
 from sinatools.utils.parser import arStrip
-import
-from . import dictionary
+from . import five_grams_dict, four_grams_dict , three_grams_dict , two_grams_dict
 
-def ALMA_multi_word(multi_word):
+def ALMA_multi_word(multi_word, n):
     undiac_multi_word = arStrip(multi_word, True, True, True, False, True, False) # diacs , smallDiacs , shaddah , digit , alif , specialChars
     result_word = []
-    if
-
+    if n == 2:
+        if undiac_multi_word in two_grams_dict.keys():
+            result_word = two_grams_dict[undiac_multi_word]
+    elif n == 3:
+        if undiac_multi_word in three_grams_dict.keys():
+            result_word = three_grams_dict[undiac_multi_word]
+    elif n == 4:
+        if undiac_multi_word in four_grams_dict.keys():
+            result_word = four_grams_dict[undiac_multi_word]
+    else:
+        if undiac_multi_word in five_grams_dict.keys():
+            result_word = five_grams_dict[undiac_multi_word]
 
     my_json = {}
-    glosses_list = []
     output_list = []
-    concept_count = 0
     my_json['multi_word_lemma'] = multi_word
     my_json['undiac_multi_word_lemma'] = multi_word
     ids = []
     if result_word != []:
-        #my_json['concept_count'] = result_word[0][1] #concept_count
-        #my_json['POS'] = result_word[0][2] #POS
         my_json['POS'] = result_word[0][1] #POS
-
         for result in result_word:
             ids.append(result[3])
-            #if lemma_id in settings.glosses_dic.keys():
-            # value = settings.glosses_dic[lemma_id]
-            # glosses_list.append(json.loads(value[1]))
-            # concept_count = concept_count + value[0]
         my_json['ids'] = ids
-        #my_json['concept_count'] = concept_count
-        #my_json['glosses'] = glosses_list
     output_list.append(my_json)
     return output_list
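ALMA_multi_word now takes the n-gram length explicitly and looks the expression up in the matching dictionary (2-, 3-, 4-, otherwise 5-grams). A minimal usage sketch; the Arabic expression is an arbitrary example and the field values are illustrative, but the keys match the code above:

    from sinatools.morphology.ALMA_multi_word import ALMA_multi_word

    # n selects the dictionary: 2, 3, 4, anything else -> five_grams_dict
    result = ALMA_multi_word("رئيس الوزراء", 2)
    # result is a one-element list; 'multi_word_lemma' and 'undiac_multi_word_lemma'
    # are always present, 'POS' and 'ids' only when the lookup succeeds, e.g.:
    # [{'multi_word_lemma': 'رئيس الوزراء', 'undiac_multi_word_lemma': 'رئيس الوزراء',
    #   'POS': ..., 'ids': [...]}]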
sinatools/morphology/__init__.py
CHANGED
@@ -3,40 +3,41 @@ from sinatools.DataDownload import downloader
 import os
 
 dictionary = {}
+five_grams_dict = {}
+four_grams_dict = {}
+three_grams_dict = {}
+two_grams_dict = {}
+
 filename = 'lemmas_dic.pickle'
 path = downloader.get_appdatadir()
 file_path = os.path.join(path, filename)
 with open(file_path, 'rb') as f:
     dictionary = pickle.load(f)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-#with open(file_path, 'rb') as f:
-# #Load the serialized data from the file
-# settings.two_grams_dict = pickle.load(f, encoding='utf-8')
-#
+filename_five = 'five_grams.pickle'
+path =downloader.get_appdatadir()
+file_path = os.path.join(path, filename_five)
+with open(file_path, 'rb') as f:
+    five_grams_dict = pickle.load(f, encoding='utf-8')
+
+
+filename_four = 'four_grams.pickle'
+path =downloader.get_appdatadir()
+file_path = os.path.join(path, filename_four)
+with open(file_path, 'rb') as f:
+    four_grams_dict = pickle.load(f, encoding='utf-8')
+
+
+filename_three = 'three_grams.pickle'
+path =downloader.get_appdatadir()
+file_path = os.path.join(path, filename_three)
+with open(file_path, 'rb') as f:
+    three_grams_dict = pickle.load(f, encoding='utf-8')
+
+
+filename_two = 'two_grams.pickle'
+path =downloader.get_appdatadir()
+file_path = os.path.join(path, filename_two)
+with open(file_path, 'rb') as f:
+    two_grams_dict = pickle.load(f, encoding='utf-8')
+
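The four new blocks differ only in the pickle filename and the target dictionary, and all run at import time. Purely as a design note, and not the actual package code, the same loading could be expressed as a loop:

    # Equivalent sketch: load each n-gram pickle from the data directory into a dict.
    import os
    import pickle
    from sinatools.DataDownload import downloader

    path = downloader.get_appdatadir()
    ngram_dicts = {}
    for name in ('two_grams', 'three_grams', 'four_grams', 'five_grams'):
        with open(os.path.join(path, f'{name}.pickle'), 'rb') as f:
            ngram_dicts[name] = pickle.load(f, encoding='utf-8')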
sinatools/ner/__init__.py
CHANGED
@@ -1,6 +1,12 @@
 from sinatools.DataDownload import downloader
 import os
-from sinatools.ner.
+from sinatools.ner.helpers import load_object
+import pickle
+import os
+import torch
+import pickle
+import json
+from argparse import Namespace
 
 tagger = None
 tag_vocab = None
@@ -9,4 +15,24 @@ train_config = None
 filename = 'Wj27012000.tar'
 path =downloader.get_appdatadir()
 model_path = os.path.join(path, filename)
-
+
+_path = os.path.join(model_path, "tag_vocab.pkl")
+
+with open(_path, "rb") as fh:
+    tag_vocab = pickle.load(fh)
+
+train_config = Namespace()
+args_path = os.path.join(model_path, "args.json")
+
+with open(args_path, "r") as fh:
+    train_config.__dict__ = json.load(fh)
+
+model = load_object(train_config.network_config["fn"], train_config.network_config["kwargs"])
+model = torch.nn.DataParallel(model)
+
+if torch.cuda.is_available():
+    model = model.cuda()
+
+train_config.trainer_config["kwargs"]["model"] = model
+tagger = load_object(train_config.trainer_config["fn"], train_config.trainer_config["kwargs"])
+tagger.load(os.path.join(model_path,"checkpoints"))
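Everything added above runs at import time, so the first import of sinatools.ner loads the tag vocabulary, the args.json training config, and the checkpointed model (the Wj27012000.tar data must already be present via download_files). A minimal sketch of what is available after the import; the printed values depend on the downloaded model:

    # Assumes the NER model files are already in the app data directory.
    from sinatools import ner

    print(ner.train_config.network_config["fn"])  # class path the tagger was built from
    print(ner.tagger)                             # trainer object with checkpoints loaded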
sinatools/ner/data/__init__.py
ADDED
@@ -0,0 +1 @@
+from sinatools.ner.data.datasets import NestedTagsDataset