dalla-data-processing 0.0.2__tar.gz → 0.0.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/PKG-INFO +5 -6
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/_version.py +3 -3
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/cli.py +9 -1
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/quality/checker.py +20 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/stemming/stemmer.py +22 -9
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing.egg-info/PKG-INFO +5 -6
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing.egg-info/requires.txt +3 -4
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/pyproject.toml +6 -5
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/.dockerignore +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/.github/workflows/ci.yml +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/.github/workflows/release.yml +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/.gitignore +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/.pre-commit-config.yaml +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/MANIFEST.in +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/core/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/core/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/core/dataset.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/core/parallel.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/bin/.gitignore +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/COPYING +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/Makefile +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/Makefile.config +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src/Makefile +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src/Makefile.g +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src/buzhash.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src/buzhash.h +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src/hashdup.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src/hashgen.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src/onion +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src/onion.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src/onion_dup.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src/version.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src/version.h +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/.gitignore +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/Makefile +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/Makefile.g +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/buzhash.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/buzhash.h +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/hashdup +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/hashdup.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/hashgen +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/hashgen.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/onion.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/onion_dup.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/version.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion/src_sc/version.h +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/onion_wrapper.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/postprocessing.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/deduplication/preprocessing.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/packing/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/packing/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/packing/dataset_packer.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/packing/pack_config.example.yaml +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/quality/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/quality/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/readability/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/readability/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/readability/ranking.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/readability/scorer.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/stemming/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/stemming/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/stemming/data/words_al.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/stemming/data/words_al_t.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/stemming/data/words_t.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/utils/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/utils/logger.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/utils/tokenize.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing.egg-info/SOURCES.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing.egg-info/dependency_links.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing.egg-info/entry_points.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing.egg-info/top_level.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/scripts/build_onion.sh +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/scripts/release.sh +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/setup.cfg +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/uv.lock +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dalla-data-processing
|
|
3
|
-
Version: 0.0.2
|
|
3
|
+
Version: 0.0.3
|
|
4
4
|
Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
|
|
5
5
|
Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
|
|
6
6
|
Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
|
|
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3
|
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
16
|
Classifier: Topic :: Text Processing :: Linguistic
|
|
17
|
-
Requires-Python: >=3.12
|
|
17
|
+
Requires-Python: <3.13,>=3.12
|
|
18
18
|
Description-Content-Type: text/markdown
|
|
19
19
|
Requires-Dist: datasets>=2.14.0
|
|
20
20
|
Requires-Dist: transformers>=4.30.0
|
|
@@ -28,18 +28,17 @@ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
|
28
28
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
29
29
|
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
30
30
|
Provides-Extra: dedup
|
|
31
|
-
Requires-Dist: camel-tools
|
|
31
|
+
Requires-Dist: camel-tools==1.5.7; extra == "dedup"
|
|
32
32
|
Provides-Extra: dedup-native
|
|
33
33
|
Requires-Dist: cffi>=1.15.0; extra == "dedup-native"
|
|
34
34
|
Provides-Extra: stem
|
|
35
|
-
Requires-Dist: camel-tools
|
|
35
|
+
Requires-Dist: camel-tools==1.5.7; extra == "stem"
|
|
36
36
|
Provides-Extra: quality
|
|
37
|
-
Requires-Dist: camel-tools
|
|
37
|
+
Requires-Dist: camel-tools==1.5.7; extra == "quality"
|
|
38
38
|
Provides-Extra: readability
|
|
39
39
|
Requires-Dist: textstat>=0.7.0; extra == "readability"
|
|
40
40
|
Provides-Extra: pack
|
|
41
41
|
Requires-Dist: sentencepiece>=0.2.0; extra == "pack"
|
|
42
|
-
Requires-Dist: rbpe; extra == "pack"
|
|
43
42
|
Requires-Dist: pyyaml; extra == "pack"
|
|
44
43
|
Provides-Extra: all
|
|
45
44
|
Requires-Dist: dalla-data-processing[dedup,dedup-native,dev,pack,quality,readability,stem]; extra == "all"
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/_version.py
RENAMED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.0.2'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 0, 2)
|
|
31
|
+
__version__ = version = '0.0.3'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 0, 3)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'g37580acc9'
|
|
@@ -651,7 +651,15 @@ def pack(
|
|
|
651
651
|
tokenizer = RBPETokenizer.from_pretrained(config_data["tokenizer_path"])
|
|
652
652
|
except ImportError:
|
|
653
653
|
logger.error("Missing rbpe package")
|
|
654
|
-
logger.error(
|
|
654
|
+
logger.error(
|
|
655
|
+
"rbpe is not included in the default installation due to "
|
|
656
|
+
"dependency conflicts with camel-tools (transformers version requirements)"
|
|
657
|
+
)
|
|
658
|
+
logger.error("Install separately with: pip install rbpe")
|
|
659
|
+
logger.error(
|
|
660
|
+
"Note: Installing rbpe may require a separate environment "
|
|
661
|
+
"if you also use dedup/stem/quality features"
|
|
662
|
+
)
|
|
655
663
|
sys.exit(1)
|
|
656
664
|
else:
|
|
657
665
|
try:
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/quality/checker.py
RENAMED
|
@@ -11,6 +11,7 @@ from concurrent.futures import TimeoutError as FutureTimeoutError
|
|
|
11
11
|
from types import MethodType
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
|
+
from camel_tools.data.catalogue import Catalogue
|
|
14
15
|
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
|
|
15
16
|
from camel_tools.disambig.mle import MLEDisambiguator
|
|
16
17
|
from datasets import Dataset
|
|
@@ -53,6 +54,25 @@ class QualityChecker:
|
|
|
53
54
|
|
|
54
55
|
def _init_disambiguator(self):
|
|
55
56
|
"""Initialize and configure the disambiguator with caching."""
|
|
57
|
+
# Install required CAMeL Tools packages based on model type
|
|
58
|
+
logger.info("Checking CAMeL Tools data packages...")
|
|
59
|
+
catalogue = Catalogue.load_catalogue()
|
|
60
|
+
|
|
61
|
+
try:
|
|
62
|
+
catalogue.download_package("morphology-db-msa-r13")
|
|
63
|
+
catalogue.download_package("disambig-mle-calima-msa-r13")
|
|
64
|
+
logger.info("msa-r13 packages installed")
|
|
65
|
+
except Exception as e:
|
|
66
|
+
logger.warning(f"Package installation warning: {e}")
|
|
67
|
+
|
|
68
|
+
# Install BERT package if using BERT model
|
|
69
|
+
if self.model == "bert":
|
|
70
|
+
try:
|
|
71
|
+
catalogue.download_package("disambig-bert-unfactored-all")
|
|
72
|
+
logger.info("BERT package installed")
|
|
73
|
+
except Exception as e:
|
|
74
|
+
logger.warning(f"BERT package installation warning: {e}")
|
|
75
|
+
|
|
56
76
|
if self.model == "mle":
|
|
57
77
|
self.disambiguator = MLEDisambiguator.pretrained()
|
|
58
78
|
logger.info("MLE disambiguator loaded")
|
|
@@ -473,12 +473,19 @@ def stem_dataset(
|
|
|
473
473
|
catalogue = Catalogue.load_catalogue()
|
|
474
474
|
try:
|
|
475
475
|
catalogue.download_package("morphology-db-msa-r13")
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
# For BERT, let it download automatically when pretrained() is called
|
|
479
|
-
logger.info("CAMeL Tools data packages ready")
|
|
476
|
+
catalogue.download_package("disambig-mle-calima-msa-r13")
|
|
477
|
+
logger.info("msa-r13 packages installed")
|
|
480
478
|
except Exception as e:
|
|
481
|
-
logger.warning(f"
|
|
479
|
+
logger.warning(f"Package installation warning: {e}")
|
|
480
|
+
|
|
481
|
+
if model == "bert":
|
|
482
|
+
try:
|
|
483
|
+
catalogue.download_package("disambig-bert-unfactored-all")
|
|
484
|
+
logger.info("BERT package installed")
|
|
485
|
+
except Exception as e:
|
|
486
|
+
logger.warning(f"BERT package installation warning: {e}")
|
|
487
|
+
|
|
488
|
+
logger.info("CAMeL Tools data packages ready")
|
|
482
489
|
|
|
483
490
|
logger.info("Loading additional words lists...")
|
|
484
491
|
words_dir = os.path.join(os.path.dirname(__file__), "data")
|
|
@@ -597,15 +604,21 @@ def stem(
|
|
|
597
604
|
if not all(isinstance(t, str) for t in text_list):
|
|
598
605
|
raise TypeError("All items in text list must be strings")
|
|
599
606
|
|
|
600
|
-
# Initialize disambiguator (cached globally if possible)
|
|
601
607
|
logger.info(f"Initializing {model.upper()} disambiguator...")
|
|
602
608
|
catalogue = Catalogue.load_catalogue()
|
|
603
609
|
try:
|
|
604
610
|
catalogue.download_package("morphology-db-msa-r13")
|
|
605
|
-
|
|
606
|
-
|
|
611
|
+
catalogue.download_package("disambig-mle-calima-msa-r13")
|
|
612
|
+
logger.info("msa-r13 packages installed")
|
|
607
613
|
except Exception as e:
|
|
608
|
-
logger.warning(f"
|
|
614
|
+
logger.warning(f"Package installation warning: {e}")
|
|
615
|
+
|
|
616
|
+
if model == "bert":
|
|
617
|
+
try:
|
|
618
|
+
catalogue.download_package("disambig-bert-unfactored-all")
|
|
619
|
+
logger.info("BERT package installed")
|
|
620
|
+
except Exception as e:
|
|
621
|
+
logger.warning(f"BERT package installation warning: {e}")
|
|
609
622
|
|
|
610
623
|
if model == "mle":
|
|
611
624
|
disambiguator = MLEDisambiguator.pretrained("calima-msa-r13", cache_size=1_000_000)
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dalla-data-processing
|
|
3
|
-
Version: 0.0.2
|
|
3
|
+
Version: 0.0.3
|
|
4
4
|
Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
|
|
5
5
|
Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
|
|
6
6
|
Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
|
|
@@ -14,7 +14,7 @@ Classifier: Programming Language :: Python :: 3
|
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.12
|
|
15
15
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
16
|
Classifier: Topic :: Text Processing :: Linguistic
|
|
17
|
-
Requires-Python: >=3.12
|
|
17
|
+
Requires-Python: <3.13,>=3.12
|
|
18
18
|
Description-Content-Type: text/markdown
|
|
19
19
|
Requires-Dist: datasets>=2.14.0
|
|
20
20
|
Requires-Dist: transformers>=4.30.0
|
|
@@ -28,18 +28,17 @@ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
|
28
28
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
29
29
|
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
30
30
|
Provides-Extra: dedup
|
|
31
|
-
Requires-Dist: camel-tools
|
|
31
|
+
Requires-Dist: camel-tools==1.5.7; extra == "dedup"
|
|
32
32
|
Provides-Extra: dedup-native
|
|
33
33
|
Requires-Dist: cffi>=1.15.0; extra == "dedup-native"
|
|
34
34
|
Provides-Extra: stem
|
|
35
|
-
Requires-Dist: camel-tools
|
|
35
|
+
Requires-Dist: camel-tools==1.5.7; extra == "stem"
|
|
36
36
|
Provides-Extra: quality
|
|
37
|
-
Requires-Dist: camel-tools
|
|
37
|
+
Requires-Dist: camel-tools==1.5.7; extra == "quality"
|
|
38
38
|
Provides-Extra: readability
|
|
39
39
|
Requires-Dist: textstat>=0.7.0; extra == "readability"
|
|
40
40
|
Provides-Extra: pack
|
|
41
41
|
Requires-Dist: sentencepiece>=0.2.0; extra == "pack"
|
|
42
|
-
Requires-Dist: rbpe; extra == "pack"
|
|
43
42
|
Requires-Dist: pyyaml; extra == "pack"
|
|
44
43
|
Provides-Extra: all
|
|
45
44
|
Requires-Dist: dalla-data-processing[dedup,dedup-native,dev,pack,quality,readability,stem]; extra == "all"
|
|
@@ -9,7 +9,7 @@ structlog>=24.0.0
|
|
|
9
9
|
dalla-data-processing[dedup,dedup-native,dev,pack,quality,readability,stem]
|
|
10
10
|
|
|
11
11
|
[dedup]
|
|
12
|
-
camel-tools
|
|
12
|
+
camel-tools==1.5.7
|
|
13
13
|
|
|
14
14
|
[dedup-native]
|
|
15
15
|
cffi>=1.15.0
|
|
@@ -22,14 +22,13 @@ pre-commit>=3.0.0
|
|
|
22
22
|
|
|
23
23
|
[pack]
|
|
24
24
|
sentencepiece>=0.2.0
|
|
25
|
-
rbpe
|
|
26
25
|
pyyaml
|
|
27
26
|
|
|
28
27
|
[quality]
|
|
29
|
-
camel-tools
|
|
28
|
+
camel-tools==1.5.7
|
|
30
29
|
|
|
31
30
|
[readability]
|
|
32
31
|
textstat>=0.7.0
|
|
33
32
|
|
|
34
33
|
[stem]
|
|
35
|
-
camel-tools
|
|
34
|
+
camel-tools==1.5.7
|
|
@@ -11,7 +11,7 @@ authors = [
|
|
|
11
11
|
{name = "Digital Research Unit - Arab Center", email = "dru@dohainstitute.edu.qa"}
|
|
12
12
|
]
|
|
13
13
|
readme = "README.md"
|
|
14
|
-
requires-python = ">=3.12"
|
|
14
|
+
requires-python = ">=3.12,<3.13"
|
|
15
15
|
keywords = ["arabic", "nlp", "data-processing", "deduplication", "stemming", "readability", "quality"]
|
|
16
16
|
classifiers = [
|
|
17
17
|
"Intended Audience :: Developers",
|
|
@@ -39,23 +39,24 @@ dev = [
|
|
|
39
39
|
"pre-commit>=3.0.0",
|
|
40
40
|
]
|
|
41
41
|
dedup = [
|
|
42
|
-
"camel-tools
|
|
42
|
+
"camel-tools==1.5.7",
|
|
43
43
|
]
|
|
44
44
|
dedup-native = [
|
|
45
45
|
"cffi>=1.15.0",
|
|
46
46
|
]
|
|
47
47
|
stem = [
|
|
48
|
-
"camel-tools
|
|
48
|
+
"camel-tools==1.5.7",
|
|
49
49
|
]
|
|
50
50
|
quality = [
|
|
51
|
-
"camel-tools
|
|
51
|
+
"camel-tools==1.5.7",
|
|
52
52
|
]
|
|
53
53
|
readability = [
|
|
54
54
|
"textstat>=0.7.0",
|
|
55
55
|
]
|
|
56
56
|
pack = [
|
|
57
57
|
"sentencepiece>=0.2.0",
|
|
58
|
-
"rbpe",
|
|
58
|
+
# "rbpe", # excluded due to transformers version conflict with camel-tools
|
|
59
|
+
# users should install separately if needed: pip install rbpe
|
|
59
60
|
"pyyaml",
|
|
60
61
|
]
|
|
61
62
|
all = [
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/__init__.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/core/README.md
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/core/__init__.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/core/dataset.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/core/parallel.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/packing/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/quality/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/stemming/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/utils/__init__.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/utils/logger.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.3}/dalla_data_processing/utils/tokenize.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|