dalla-data-processing 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/.github/workflows/ci.yml +3 -1
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/.github/workflows/release.yml +33 -32
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/PKG-INFO +8 -6
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/_version.py +3 -3
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/cli.py +9 -1
- {dalla_data_processing-0.0.2/dalla_data_processing/deduplication/onion/src_sc → dalla_data_processing-0.0.4/dalla_data_processing/deduplication/onion/src}/Makefile +1 -1
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/Makefile.g +1 -1
- {dalla_data_processing-0.0.2/dalla_data_processing/deduplication/onion/src → dalla_data_processing-0.0.4/dalla_data_processing/deduplication/onion/src_sc}/Makefile +1 -1
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/Makefile.g +1 -1
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/checker.py +20 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/stemmer.py +22 -9
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/PKG-INFO +8 -6
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/SOURCES.txt +1 -0
- dalla_data_processing-0.0.4/dalla_data_processing.egg-info/not-zip-safe +1 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/requires.txt +3 -4
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/pyproject.toml +12 -6
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/scripts/build_onion.sh +4 -2
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/.dockerignore +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/.gitignore +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/.pre-commit-config.yaml +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/MANIFEST.in +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/core/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/core/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/core/dataset.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/core/parallel.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/bin/.gitignore +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/COPYING +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/Makefile +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/Makefile.config +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/buzhash.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/buzhash.h +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/hashdup.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/hashgen.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/onion +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/onion.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/onion_dup.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/version.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/version.h +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/.gitignore +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/buzhash.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/buzhash.h +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/hashdup +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/hashdup.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/hashgen +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/hashgen.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/onion.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/onion_dup.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/version.c +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/version.h +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion_wrapper.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/postprocessing.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/preprocessing.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/dataset_packer.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/pack_config.example.yaml +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/readability/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/readability/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/readability/ranking.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/readability/scorer.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/README.md +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/data/words_al.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/data/words_al_t.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/data/words_t.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/__init__.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/logger.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/tokenize.py +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/dependency_links.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/entry_points.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/top_level.txt +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/scripts/release.sh +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/setup.cfg +0 -0
- {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/uv.lock +0 -0
|
@@ -55,8 +55,10 @@ jobs:
|
|
|
55
55
|
include:
|
|
56
56
|
- os: ubuntu-latest
|
|
57
57
|
artifact_name: onion-linux-x86_64
|
|
58
|
-
- os: macos-
|
|
58
|
+
- os: macos-13
|
|
59
59
|
artifact_name: onion-darwin-x86_64
|
|
60
|
+
- os: macos-14
|
|
61
|
+
artifact_name: onion-darwin-arm64
|
|
60
62
|
|
|
61
63
|
steps:
|
|
62
64
|
- uses: actions/checkout@v4
|
|
@@ -17,8 +17,10 @@ jobs:
|
|
|
17
17
|
include:
|
|
18
18
|
- os: ubuntu-latest
|
|
19
19
|
artifact_name: onion-linux-x86_64
|
|
20
|
-
- os: macos-
|
|
20
|
+
- os: macos-13
|
|
21
21
|
artifact_name: onion-darwin-x86_64
|
|
22
|
+
- os: macos-14
|
|
23
|
+
artifact_name: onion-darwin-arm64
|
|
22
24
|
|
|
23
25
|
steps:
|
|
24
26
|
- uses: actions/checkout@v4
|
|
@@ -46,19 +48,10 @@ jobs:
|
|
|
46
48
|
path: dalla_data_processing/deduplication/bin/onion-*
|
|
47
49
|
if-no-files-found: error
|
|
48
50
|
|
|
49
|
-
build-
|
|
50
|
-
name: Build
|
|
51
|
+
build-wheel:
|
|
52
|
+
name: Build universal wheel with all platform binaries
|
|
51
53
|
needs: build-onion
|
|
52
|
-
runs-on:
|
|
53
|
-
strategy:
|
|
54
|
-
matrix:
|
|
55
|
-
include:
|
|
56
|
-
- os: ubuntu-latest
|
|
57
|
-
onion_artifact: onion-linux-x86_64
|
|
58
|
-
- os: macos-latest
|
|
59
|
-
onion_artifact: onion-darwin-x86_64
|
|
60
|
-
- os: windows-latest
|
|
61
|
-
onion_artifact: none
|
|
54
|
+
runs-on: ubuntu-latest
|
|
62
55
|
|
|
63
56
|
steps:
|
|
64
57
|
- uses: actions/checkout@v4
|
|
@@ -67,24 +60,30 @@ jobs:
|
|
|
67
60
|
|
|
68
61
|
- name: Create bin directory
|
|
69
62
|
run: mkdir -p dalla_data_processing/deduplication/bin
|
|
70
|
-
shell: bash
|
|
71
63
|
|
|
72
|
-
- name: Download
|
|
73
|
-
|
|
64
|
+
- name: Download Linux binary
|
|
65
|
+
uses: actions/download-artifact@v4
|
|
66
|
+
with:
|
|
67
|
+
name: onion-linux-x86_64
|
|
68
|
+
path: dalla_data_processing/deduplication/bin/
|
|
69
|
+
|
|
70
|
+
- name: Download macOS Intel binary
|
|
71
|
+
uses: actions/download-artifact@v4
|
|
72
|
+
with:
|
|
73
|
+
name: onion-darwin-x86_64
|
|
74
|
+
path: dalla_data_processing/deduplication/bin/
|
|
75
|
+
|
|
76
|
+
- name: Download macOS ARM binary
|
|
74
77
|
uses: actions/download-artifact@v4
|
|
75
78
|
with:
|
|
76
|
-
name:
|
|
79
|
+
name: onion-darwin-arm64
|
|
77
80
|
path: dalla_data_processing/deduplication/bin/
|
|
78
81
|
|
|
79
82
|
- name: Set binary permissions
|
|
80
|
-
if: matrix.onion_artifact != 'none'
|
|
81
83
|
run: chmod +x dalla_data_processing/deduplication/bin/onion-*
|
|
82
|
-
shell: bash
|
|
83
84
|
|
|
84
|
-
- name: List
|
|
85
|
-
if: matrix.onion_artifact != 'none'
|
|
85
|
+
- name: List all binaries
|
|
86
86
|
run: ls -lah dalla_data_processing/deduplication/bin/
|
|
87
|
-
shell: bash
|
|
88
87
|
|
|
89
88
|
- name: Set up Python
|
|
90
89
|
uses: actions/setup-python@v5
|
|
@@ -101,13 +100,16 @@ jobs:
|
|
|
101
100
|
|
|
102
101
|
- name: List wheel contents
|
|
103
102
|
run: |
|
|
104
|
-
|
|
103
|
+
echo "=== Wheel contents (binaries) ==="
|
|
104
|
+
python -m zipfile -l dist/*.whl | grep onion-
|
|
105
|
+
echo "=== Wheel info ==="
|
|
106
|
+
ls -lh dist/
|
|
105
107
|
|
|
106
|
-
- name: Upload
|
|
108
|
+
- name: Upload wheel
|
|
107
109
|
uses: actions/upload-artifact@v4
|
|
108
110
|
with:
|
|
109
|
-
name:
|
|
110
|
-
path: dist
|
|
111
|
+
name: wheel
|
|
112
|
+
path: dist/*.whl
|
|
111
113
|
|
|
112
114
|
build-sdist:
|
|
113
115
|
name: Build source distribution
|
|
@@ -136,15 +138,15 @@ jobs:
|
|
|
136
138
|
|
|
137
139
|
publish-pypi:
|
|
138
140
|
name: Publish to PyPI
|
|
139
|
-
needs: [build-
|
|
141
|
+
needs: [build-wheel, build-sdist]
|
|
140
142
|
runs-on: ubuntu-latest
|
|
141
143
|
environment: release
|
|
142
144
|
|
|
143
145
|
steps:
|
|
144
|
-
#
|
|
146
|
+
# Download the universal wheel with all platform binaries
|
|
145
147
|
- uses: actions/download-artifact@v4
|
|
146
148
|
with:
|
|
147
|
-
name:
|
|
149
|
+
name: wheel
|
|
148
150
|
path: dist/
|
|
149
151
|
|
|
150
152
|
- uses: actions/download-artifact@v4
|
|
@@ -168,7 +170,7 @@ jobs:
|
|
|
168
170
|
|
|
169
171
|
create-release:
|
|
170
172
|
name: Create GitHub Release
|
|
171
|
-
needs: [build-
|
|
173
|
+
needs: [build-wheel, build-sdist]
|
|
172
174
|
runs-on: ubuntu-latest
|
|
173
175
|
|
|
174
176
|
steps:
|
|
@@ -178,8 +180,7 @@ jobs:
|
|
|
178
180
|
|
|
179
181
|
- uses: actions/download-artifact@v4
|
|
180
182
|
with:
|
|
181
|
-
|
|
182
|
-
merge-multiple: true
|
|
183
|
+
name: wheel
|
|
183
184
|
path: dist/
|
|
184
185
|
|
|
185
186
|
- uses: actions/download-artifact@v4
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dalla-data-processing
|
|
3
|
-
Version: 0.0.2
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
|
|
5
5
|
Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
|
|
6
6
|
Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
|
|
@@ -11,10 +11,13 @@ Keywords: arabic,nlp,data-processing,deduplication,stemming,readability,quality
|
|
|
11
11
|
Classifier: Intended Audience :: Developers
|
|
12
12
|
Classifier: Intended Audience :: Science/Research
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
18
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
19
|
Classifier: Topic :: Text Processing :: Linguistic
|
|
17
|
-
Requires-Python: >=3.
|
|
20
|
+
Requires-Python: >=3.10
|
|
18
21
|
Description-Content-Type: text/markdown
|
|
19
22
|
Requires-Dist: datasets>=2.14.0
|
|
20
23
|
Requires-Dist: transformers>=4.30.0
|
|
@@ -28,18 +31,17 @@ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
|
28
31
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
29
32
|
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
30
33
|
Provides-Extra: dedup
|
|
31
|
-
Requires-Dist: camel-tools
|
|
34
|
+
Requires-Dist: camel-tools==1.5.7; extra == "dedup"
|
|
32
35
|
Provides-Extra: dedup-native
|
|
33
36
|
Requires-Dist: cffi>=1.15.0; extra == "dedup-native"
|
|
34
37
|
Provides-Extra: stem
|
|
35
|
-
Requires-Dist: camel-tools
|
|
38
|
+
Requires-Dist: camel-tools==1.5.7; extra == "stem"
|
|
36
39
|
Provides-Extra: quality
|
|
37
|
-
Requires-Dist: camel-tools
|
|
40
|
+
Requires-Dist: camel-tools==1.5.7; extra == "quality"
|
|
38
41
|
Provides-Extra: readability
|
|
39
42
|
Requires-Dist: textstat>=0.7.0; extra == "readability"
|
|
40
43
|
Provides-Extra: pack
|
|
41
44
|
Requires-Dist: sentencepiece>=0.2.0; extra == "pack"
|
|
42
|
-
Requires-Dist: rbpe; extra == "pack"
|
|
43
45
|
Requires-Dist: pyyaml; extra == "pack"
|
|
44
46
|
Provides-Extra: all
|
|
45
47
|
Requires-Dist: dalla-data-processing[dedup,dedup-native,dev,pack,quality,readability,stem]; extra == "all"
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/_version.py
RENAMED
|
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
|
|
|
28
28
|
commit_id: COMMIT_ID
|
|
29
29
|
__commit_id__: COMMIT_ID
|
|
30
30
|
|
|
31
|
-
__version__ = version = '0.0.2'
|
|
32
|
-
__version_tuple__ = version_tuple = (0, 0, 2)
|
|
31
|
+
__version__ = version = '0.0.4'
|
|
32
|
+
__version_tuple__ = version_tuple = (0, 0, 4)
|
|
33
33
|
|
|
34
|
-
__commit_id__ = commit_id = '
|
|
34
|
+
__commit_id__ = commit_id = 'g89939c5cc'
|
|
@@ -651,7 +651,15 @@ def pack(
|
|
|
651
651
|
tokenizer = RBPETokenizer.from_pretrained(config_data["tokenizer_path"])
|
|
652
652
|
except ImportError:
|
|
653
653
|
logger.error("Missing rbpe package")
|
|
654
|
-
logger.error(
|
|
654
|
+
logger.error(
|
|
655
|
+
"rbpe is not included in the default installation due to "
|
|
656
|
+
"dependency conflicts with camel-tools (transformers version requirements)"
|
|
657
|
+
)
|
|
658
|
+
logger.error("Install separately with: pip install rbpe")
|
|
659
|
+
logger.error(
|
|
660
|
+
"Note: Installing rbpe may require a separate environment "
|
|
661
|
+
"if you also use dedup/stem/quality features"
|
|
662
|
+
)
|
|
655
663
|
sys.exit(1)
|
|
656
664
|
else:
|
|
657
665
|
try:
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/checker.py
RENAMED
|
@@ -11,6 +11,7 @@ from concurrent.futures import TimeoutError as FutureTimeoutError
|
|
|
11
11
|
from types import MethodType
|
|
12
12
|
from typing import Any
|
|
13
13
|
|
|
14
|
+
from camel_tools.data.catalogue import Catalogue
|
|
14
15
|
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
|
|
15
16
|
from camel_tools.disambig.mle import MLEDisambiguator
|
|
16
17
|
from datasets import Dataset
|
|
@@ -53,6 +54,25 @@ class QualityChecker:
|
|
|
53
54
|
|
|
54
55
|
def _init_disambiguator(self):
|
|
55
56
|
"""Initialize and configure the disambiguator with caching."""
|
|
57
|
+
# Install required CAMeL Tools packages based on model type
|
|
58
|
+
logger.info("Checking CAMeL Tools data packages...")
|
|
59
|
+
catalogue = Catalogue.load_catalogue()
|
|
60
|
+
|
|
61
|
+
try:
|
|
62
|
+
catalogue.download_package("morphology-db-msa-r13")
|
|
63
|
+
catalogue.download_package("disambig-mle-calima-msa-r13")
|
|
64
|
+
logger.info("msa-r13 packages installed")
|
|
65
|
+
except Exception as e:
|
|
66
|
+
logger.warning(f"Package installation warning: {e}")
|
|
67
|
+
|
|
68
|
+
# Install BERT package if using BERT model
|
|
69
|
+
if self.model == "bert":
|
|
70
|
+
try:
|
|
71
|
+
catalogue.download_package("disambig-bert-unfactored-all")
|
|
72
|
+
logger.info("BERT package installed")
|
|
73
|
+
except Exception as e:
|
|
74
|
+
logger.warning(f"BERT package installation warning: {e}")
|
|
75
|
+
|
|
56
76
|
if self.model == "mle":
|
|
57
77
|
self.disambiguator = MLEDisambiguator.pretrained()
|
|
58
78
|
logger.info("MLE disambiguator loaded")
|
|
@@ -473,12 +473,19 @@ def stem_dataset(
|
|
|
473
473
|
catalogue = Catalogue.load_catalogue()
|
|
474
474
|
try:
|
|
475
475
|
catalogue.download_package("morphology-db-msa-r13")
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
# For BERT, let it download automatically when pretrained() is called
|
|
479
|
-
logger.info("CAMeL Tools data packages ready")
|
|
476
|
+
catalogue.download_package("disambig-mle-calima-msa-r13")
|
|
477
|
+
logger.info("msa-r13 packages installed")
|
|
480
478
|
except Exception as e:
|
|
481
|
-
logger.warning(f"
|
|
479
|
+
logger.warning(f"Package installation warning: {e}")
|
|
480
|
+
|
|
481
|
+
if model == "bert":
|
|
482
|
+
try:
|
|
483
|
+
catalogue.download_package("disambig-bert-unfactored-all")
|
|
484
|
+
logger.info("BERT package installed")
|
|
485
|
+
except Exception as e:
|
|
486
|
+
logger.warning(f"BERT package installation warning: {e}")
|
|
487
|
+
|
|
488
|
+
logger.info("CAMeL Tools data packages ready")
|
|
482
489
|
|
|
483
490
|
logger.info("Loading additional words lists...")
|
|
484
491
|
words_dir = os.path.join(os.path.dirname(__file__), "data")
|
|
@@ -597,15 +604,21 @@ def stem(
|
|
|
597
604
|
if not all(isinstance(t, str) for t in text_list):
|
|
598
605
|
raise TypeError("All items in text list must be strings")
|
|
599
606
|
|
|
600
|
-
# Initialize disambiguator (cached globally if possible)
|
|
601
607
|
logger.info(f"Initializing {model.upper()} disambiguator...")
|
|
602
608
|
catalogue = Catalogue.load_catalogue()
|
|
603
609
|
try:
|
|
604
610
|
catalogue.download_package("morphology-db-msa-r13")
|
|
605
|
-
|
|
606
|
-
|
|
611
|
+
catalogue.download_package("disambig-mle-calima-msa-r13")
|
|
612
|
+
logger.info("msa-r13 packages installed")
|
|
607
613
|
except Exception as e:
|
|
608
|
-
logger.warning(f"
|
|
614
|
+
logger.warning(f"Package installation warning: {e}")
|
|
615
|
+
|
|
616
|
+
if model == "bert":
|
|
617
|
+
try:
|
|
618
|
+
catalogue.download_package("disambig-bert-unfactored-all")
|
|
619
|
+
logger.info("BERT package installed")
|
|
620
|
+
except Exception as e:
|
|
621
|
+
logger.warning(f"BERT package installation warning: {e}")
|
|
609
622
|
|
|
610
623
|
if model == "mle":
|
|
611
624
|
disambiguator = MLEDisambiguator.pretrained("calima-msa-r13", cache_size=1_000_000)
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/PKG-INFO
RENAMED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: dalla-data-processing
|
|
3
|
-
Version: 0.0.2
|
|
3
|
+
Version: 0.0.4
|
|
4
4
|
Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
|
|
5
5
|
Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
|
|
6
6
|
Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
|
|
@@ -11,10 +11,13 @@ Keywords: arabic,nlp,data-processing,deduplication,stemming,readability,quality
|
|
|
11
11
|
Classifier: Intended Audience :: Developers
|
|
12
12
|
Classifier: Intended Audience :: Science/Research
|
|
13
13
|
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
16
|
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
18
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
16
19
|
Classifier: Topic :: Text Processing :: Linguistic
|
|
17
|
-
Requires-Python: >=3.
|
|
20
|
+
Requires-Python: >=3.10
|
|
18
21
|
Description-Content-Type: text/markdown
|
|
19
22
|
Requires-Dist: datasets>=2.14.0
|
|
20
23
|
Requires-Dist: transformers>=4.30.0
|
|
@@ -28,18 +31,17 @@ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
|
28
31
|
Requires-Dist: ruff>=0.1.0; extra == "dev"
|
|
29
32
|
Requires-Dist: pre-commit>=3.0.0; extra == "dev"
|
|
30
33
|
Provides-Extra: dedup
|
|
31
|
-
Requires-Dist: camel-tools
|
|
34
|
+
Requires-Dist: camel-tools==1.5.7; extra == "dedup"
|
|
32
35
|
Provides-Extra: dedup-native
|
|
33
36
|
Requires-Dist: cffi>=1.15.0; extra == "dedup-native"
|
|
34
37
|
Provides-Extra: stem
|
|
35
|
-
Requires-Dist: camel-tools
|
|
38
|
+
Requires-Dist: camel-tools==1.5.7; extra == "stem"
|
|
36
39
|
Provides-Extra: quality
|
|
37
|
-
Requires-Dist: camel-tools
|
|
40
|
+
Requires-Dist: camel-tools==1.5.7; extra == "quality"
|
|
38
41
|
Provides-Extra: readability
|
|
39
42
|
Requires-Dist: textstat>=0.7.0; extra == "readability"
|
|
40
43
|
Provides-Extra: pack
|
|
41
44
|
Requires-Dist: sentencepiece>=0.2.0; extra == "pack"
|
|
42
|
-
Requires-Dist: rbpe; extra == "pack"
|
|
43
45
|
Requires-Dist: pyyaml; extra == "pack"
|
|
44
46
|
Provides-Extra: all
|
|
45
47
|
Requires-Dist: dalla-data-processing[dedup,dedup-native,dev,pack,quality,readability,stem]; extra == "all"
|
|
@@ -14,6 +14,7 @@ dalla_data_processing.egg-info/PKG-INFO
|
|
|
14
14
|
dalla_data_processing.egg-info/SOURCES.txt
|
|
15
15
|
dalla_data_processing.egg-info/dependency_links.txt
|
|
16
16
|
dalla_data_processing.egg-info/entry_points.txt
|
|
17
|
+
dalla_data_processing.egg-info/not-zip-safe
|
|
17
18
|
dalla_data_processing.egg-info/requires.txt
|
|
18
19
|
dalla_data_processing.egg-info/top_level.txt
|
|
19
20
|
dalla_data_processing/core/README.md
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -9,7 +9,7 @@ structlog>=24.0.0
|
|
|
9
9
|
dalla-data-processing[dedup,dedup-native,dev,pack,quality,readability,stem]
|
|
10
10
|
|
|
11
11
|
[dedup]
|
|
12
|
-
camel-tools
|
|
12
|
+
camel-tools==1.5.7
|
|
13
13
|
|
|
14
14
|
[dedup-native]
|
|
15
15
|
cffi>=1.15.0
|
|
@@ -22,14 +22,13 @@ pre-commit>=3.0.0
|
|
|
22
22
|
|
|
23
23
|
[pack]
|
|
24
24
|
sentencepiece>=0.2.0
|
|
25
|
-
rbpe
|
|
26
25
|
pyyaml
|
|
27
26
|
|
|
28
27
|
[quality]
|
|
29
|
-
camel-tools
|
|
28
|
+
camel-tools==1.5.7
|
|
30
29
|
|
|
31
30
|
[readability]
|
|
32
31
|
textstat>=0.7.0
|
|
33
32
|
|
|
34
33
|
[stem]
|
|
35
|
-
camel-tools
|
|
34
|
+
camel-tools==1.5.7
|
|
@@ -11,13 +11,16 @@ authors = [
|
|
|
11
11
|
{name = "Digital Research Unit - Arab Center", email = "dru@dohainstitute.edu.qa"}
|
|
12
12
|
]
|
|
13
13
|
readme = "README.md"
|
|
14
|
-
requires-python = ">=3.
|
|
14
|
+
requires-python = ">=3.10"
|
|
15
15
|
keywords = ["arabic", "nlp", "data-processing", "deduplication", "stemming", "readability", "quality"]
|
|
16
16
|
classifiers = [
|
|
17
17
|
"Intended Audience :: Developers",
|
|
18
18
|
"Intended Audience :: Science/Research",
|
|
19
19
|
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
20
22
|
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
21
24
|
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
22
25
|
"Topic :: Text Processing :: Linguistic",
|
|
23
26
|
]
|
|
@@ -39,23 +42,24 @@ dev = [
|
|
|
39
42
|
"pre-commit>=3.0.0",
|
|
40
43
|
]
|
|
41
44
|
dedup = [
|
|
42
|
-
"camel-tools
|
|
45
|
+
"camel-tools==1.5.7",
|
|
43
46
|
]
|
|
44
47
|
dedup-native = [
|
|
45
48
|
"cffi>=1.15.0",
|
|
46
49
|
]
|
|
47
50
|
stem = [
|
|
48
|
-
"camel-tools
|
|
51
|
+
"camel-tools==1.5.7",
|
|
49
52
|
]
|
|
50
53
|
quality = [
|
|
51
|
-
"camel-tools
|
|
54
|
+
"camel-tools==1.5.7",
|
|
52
55
|
]
|
|
53
56
|
readability = [
|
|
54
57
|
"textstat>=0.7.0",
|
|
55
58
|
]
|
|
56
59
|
pack = [
|
|
57
60
|
"sentencepiece>=0.2.0",
|
|
58
|
-
"rbpe",
|
|
61
|
+
# "rbpe", # excluded due to transformers version conflict with camel-tools
|
|
62
|
+
# users should install separately if needed: pip install rbpe
|
|
59
63
|
"pyyaml",
|
|
60
64
|
]
|
|
61
65
|
all = [
|
|
@@ -75,6 +79,8 @@ Repository = "https://github.com/U4RASD/dalla-data-processing"
|
|
|
75
79
|
packages = ["dalla_data_processing", "dalla_data_processing.core", "dalla_data_processing.deduplication", "dalla_data_processing.packing", "dalla_data_processing.stemming", "dalla_data_processing.quality", "dalla_data_processing.readability", "dalla_data_processing.utils"]
|
|
76
80
|
include-package-data = true
|
|
77
81
|
|
|
82
|
+
zip-safe = false
|
|
83
|
+
|
|
78
84
|
[tool.setuptools.package-data]
|
|
79
85
|
dalla_data_processing = ["py.typed"]
|
|
80
86
|
"dalla_data_processing.stemming" = ["data/*.txt"]
|
|
@@ -82,7 +88,7 @@ dalla_data_processing = ["py.typed"]
|
|
|
82
88
|
|
|
83
89
|
[tool.ruff]
|
|
84
90
|
line-length = 100
|
|
85
|
-
target-version = "
|
|
91
|
+
target-version = "py310"
|
|
86
92
|
src = ["dalla_data_processing"]
|
|
87
93
|
|
|
88
94
|
[tool.ruff.lint]
|
|
@@ -63,14 +63,16 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
|
|
|
63
63
|
# On macOS, add Homebrew paths for sparsehash
|
|
64
64
|
if command -v brew &> /dev/null; then
|
|
65
65
|
BREW_PREFIX=$(brew --prefix)
|
|
66
|
-
|
|
66
|
+
SPARSEHASH_PREFIX=$(brew --prefix google-sparsehash 2>/dev/null || echo "${BREW_PREFIX}")
|
|
67
|
+
EXTRA_CFLAGS="-I${SPARSEHASH_PREFIX}/include"
|
|
67
68
|
echo -e "${YELLOW}Using Homebrew prefix: ${BREW_PREFIX}${NC}"
|
|
69
|
+
echo -e "${YELLOW}Using sparsehash include: ${SPARSEHASH_PREFIX}/include${NC}"
|
|
68
70
|
fi
|
|
69
71
|
fi
|
|
70
72
|
|
|
71
73
|
# Build onion
|
|
72
74
|
echo -e "${YELLOW}Compiling Onion...${NC}"
|
|
73
|
-
if make CFLAGS="-Wall -O3 ${EXTRA_CFLAGS}"; then
|
|
75
|
+
if make CFLAGS="-Wall -O3 -std=c++11 ${EXTRA_CFLAGS}"; then
|
|
74
76
|
echo -e "${GREEN}✓ Compilation successful${NC}"
|
|
75
77
|
else
|
|
76
78
|
echo -e "${RED}✗ Compilation failed${NC}"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/__init__.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/core/README.md
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/core/__init__.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/core/dataset.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/core/parallel.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/README.md
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/__init__.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/logger.py
RENAMED
|
File without changes
|
{dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/tokenize.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|