dalla-data-processing 0.0.2__tar.gz → 0.0.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/.github/workflows/ci.yml +3 -1
  2. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/.github/workflows/release.yml +33 -32
  3. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/PKG-INFO +8 -6
  4. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/_version.py +3 -3
  5. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/cli.py +9 -1
  6. {dalla_data_processing-0.0.2/dalla_data_processing/deduplication/onion/src_sc → dalla_data_processing-0.0.4/dalla_data_processing/deduplication/onion/src}/Makefile +1 -1
  7. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/Makefile.g +1 -1
  8. {dalla_data_processing-0.0.2/dalla_data_processing/deduplication/onion/src → dalla_data_processing-0.0.4/dalla_data_processing/deduplication/onion/src_sc}/Makefile +1 -1
  9. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/Makefile.g +1 -1
  10. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/checker.py +20 -0
  11. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/stemmer.py +22 -9
  12. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/PKG-INFO +8 -6
  13. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/SOURCES.txt +1 -0
  14. dalla_data_processing-0.0.4/dalla_data_processing.egg-info/not-zip-safe +1 -0
  15. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/requires.txt +3 -4
  16. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/pyproject.toml +12 -6
  17. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/scripts/build_onion.sh +4 -2
  18. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/.dockerignore +0 -0
  19. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/.gitignore +0 -0
  20. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/.pre-commit-config.yaml +0 -0
  21. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/MANIFEST.in +0 -0
  22. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/README.md +0 -0
  23. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/__init__.py +0 -0
  24. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/core/README.md +0 -0
  25. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/core/__init__.py +0 -0
  26. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/core/dataset.py +0 -0
  27. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/core/parallel.py +0 -0
  28. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/README.md +0 -0
  29. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/__init__.py +0 -0
  30. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/bin/.gitignore +0 -0
  31. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/COPYING +0 -0
  32. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/Makefile +0 -0
  33. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/Makefile.config +0 -0
  34. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/README.md +0 -0
  35. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/buzhash.c +0 -0
  36. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/buzhash.h +0 -0
  37. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/hashdup.c +0 -0
  38. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/hashgen.c +0 -0
  39. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/onion +0 -0
  40. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/onion.c +0 -0
  41. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/onion_dup.c +0 -0
  42. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/version.c +0 -0
  43. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src/version.h +0 -0
  44. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/.gitignore +0 -0
  45. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/buzhash.c +0 -0
  46. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/buzhash.h +0 -0
  47. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/hashdup +0 -0
  48. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/hashdup.c +0 -0
  49. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/hashgen +0 -0
  50. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/hashgen.c +0 -0
  51. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/onion.c +0 -0
  52. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/onion_dup.c +0 -0
  53. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/version.c +0 -0
  54. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion/src_sc/version.h +0 -0
  55. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/onion_wrapper.py +0 -0
  56. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/postprocessing.py +0 -0
  57. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/deduplication/preprocessing.py +0 -0
  58. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/README.md +0 -0
  59. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/__init__.py +0 -0
  60. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/dataset_packer.py +0 -0
  61. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/packing/pack_config.example.yaml +0 -0
  62. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/README.md +0 -0
  63. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/quality/__init__.py +0 -0
  64. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/readability/README.md +0 -0
  65. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/readability/__init__.py +0 -0
  66. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/readability/ranking.py +0 -0
  67. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/readability/scorer.py +0 -0
  68. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/README.md +0 -0
  69. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/__init__.py +0 -0
  70. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/data/words_al.txt +0 -0
  71. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/data/words_al_t.txt +0 -0
  72. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/stemming/data/words_t.txt +0 -0
  73. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/__init__.py +0 -0
  74. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/logger.py +0 -0
  75. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing/utils/tokenize.py +0 -0
  76. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/dependency_links.txt +0 -0
  77. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/entry_points.txt +0 -0
  78. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/dalla_data_processing.egg-info/top_level.txt +0 -0
  79. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/scripts/release.sh +0 -0
  80. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/setup.cfg +0 -0
  81. {dalla_data_processing-0.0.2 → dalla_data_processing-0.0.4}/uv.lock +0 -0
@@ -55,8 +55,10 @@ jobs:
55
55
  include:
56
56
  - os: ubuntu-latest
57
57
  artifact_name: onion-linux-x86_64
58
- - os: macos-latest
58
+ - os: macos-13
59
59
  artifact_name: onion-darwin-x86_64
60
+ - os: macos-14
61
+ artifact_name: onion-darwin-arm64
60
62
 
61
63
  steps:
62
64
  - uses: actions/checkout@v4
@@ -17,8 +17,10 @@ jobs:
17
17
  include:
18
18
  - os: ubuntu-latest
19
19
  artifact_name: onion-linux-x86_64
20
- - os: macos-latest
20
+ - os: macos-13
21
21
  artifact_name: onion-darwin-x86_64
22
+ - os: macos-14
23
+ artifact_name: onion-darwin-arm64
22
24
 
23
25
  steps:
24
26
  - uses: actions/checkout@v4
@@ -46,19 +48,10 @@ jobs:
46
48
  path: dalla_data_processing/deduplication/bin/onion-*
47
49
  if-no-files-found: error
48
50
 
49
- build-wheels:
50
- name: Build wheels on ${{ matrix.os }}
51
+ build-wheel:
52
+ name: Build universal wheel with all platform binaries
51
53
  needs: build-onion
52
- runs-on: ${{ matrix.os }}
53
- strategy:
54
- matrix:
55
- include:
56
- - os: ubuntu-latest
57
- onion_artifact: onion-linux-x86_64
58
- - os: macos-latest
59
- onion_artifact: onion-darwin-x86_64
60
- - os: windows-latest
61
- onion_artifact: none
54
+ runs-on: ubuntu-latest
62
55
 
63
56
  steps:
64
57
  - uses: actions/checkout@v4
@@ -67,24 +60,30 @@ jobs:
67
60
 
68
61
  - name: Create bin directory
69
62
  run: mkdir -p dalla_data_processing/deduplication/bin
70
- shell: bash
71
63
 
72
- - name: Download Onion binary
73
- if: matrix.onion_artifact != 'none'
64
+ - name: Download Linux binary
65
+ uses: actions/download-artifact@v4
66
+ with:
67
+ name: onion-linux-x86_64
68
+ path: dalla_data_processing/deduplication/bin/
69
+
70
+ - name: Download macOS Intel binary
71
+ uses: actions/download-artifact@v4
72
+ with:
73
+ name: onion-darwin-x86_64
74
+ path: dalla_data_processing/deduplication/bin/
75
+
76
+ - name: Download macOS ARM binary
74
77
  uses: actions/download-artifact@v4
75
78
  with:
76
- name: ${{ matrix.onion_artifact }}
79
+ name: onion-darwin-arm64
77
80
  path: dalla_data_processing/deduplication/bin/
78
81
 
79
82
  - name: Set binary permissions
80
- if: matrix.onion_artifact != 'none'
81
83
  run: chmod +x dalla_data_processing/deduplication/bin/onion-*
82
- shell: bash
83
84
 
84
- - name: List binary files
85
- if: matrix.onion_artifact != 'none'
85
+ - name: List all binaries
86
86
  run: ls -lah dalla_data_processing/deduplication/bin/
87
- shell: bash
88
87
 
89
88
  - name: Set up Python
90
89
  uses: actions/setup-python@v5
@@ -101,13 +100,16 @@ jobs:
101
100
 
102
101
  - name: List wheel contents
103
102
  run: |
104
- python -m zipfile -l dist/*.whl | head -50
103
+ echo "=== Wheel contents (binaries) ==="
104
+ python -m zipfile -l dist/*.whl | grep onion-
105
+ echo "=== Wheel info ==="
106
+ ls -lh dist/
105
107
 
106
- - name: Upload wheels
108
+ - name: Upload wheel
107
109
  uses: actions/upload-artifact@v4
108
110
  with:
109
- name: wheels-${{ matrix.os }}
110
- path: dist/*
111
+ name: wheel
112
+ path: dist/*.whl
111
113
 
112
114
  build-sdist:
113
115
  name: Build source distribution
@@ -136,15 +138,15 @@ jobs:
136
138
 
137
139
  publish-pypi:
138
140
  name: Publish to PyPI
139
- needs: [build-wheels, build-sdist]
141
+ needs: [build-wheel, build-sdist]
140
142
  runs-on: ubuntu-latest
141
143
  environment: release
142
144
 
143
145
  steps:
144
- # Only use Linux wheel to avoid conflicts
146
+ # Download the universal wheel with all platform binaries
145
147
  - uses: actions/download-artifact@v4
146
148
  with:
147
- name: wheels-ubuntu-latest
149
+ name: wheel
148
150
  path: dist/
149
151
 
150
152
  - uses: actions/download-artifact@v4
@@ -168,7 +170,7 @@ jobs:
168
170
 
169
171
  create-release:
170
172
  name: Create GitHub Release
171
- needs: [build-wheels, build-sdist]
173
+ needs: [build-wheel, build-sdist]
172
174
  runs-on: ubuntu-latest
173
175
 
174
176
  steps:
@@ -178,8 +180,7 @@ jobs:
178
180
 
179
181
  - uses: actions/download-artifact@v4
180
182
  with:
181
- pattern: wheels-*
182
- merge-multiple: true
183
+ name: wheel
183
184
  path: dist/
184
185
 
185
186
  - uses: actions/download-artifact@v4
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dalla-data-processing
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
5
5
  Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
6
6
  Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
@@ -11,10 +11,13 @@ Keywords: arabic,nlp,data-processing,deduplication,stemming,readability,quality
11
11
  Classifier: Intended Audience :: Developers
12
12
  Classifier: Intended Audience :: Science/Research
13
13
  Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
14
16
  Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
15
18
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
19
  Classifier: Topic :: Text Processing :: Linguistic
17
- Requires-Python: >=3.12
20
+ Requires-Python: >=3.10
18
21
  Description-Content-Type: text/markdown
19
22
  Requires-Dist: datasets>=2.14.0
20
23
  Requires-Dist: transformers>=4.30.0
@@ -28,18 +31,17 @@ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
28
31
  Requires-Dist: ruff>=0.1.0; extra == "dev"
29
32
  Requires-Dist: pre-commit>=3.0.0; extra == "dev"
30
33
  Provides-Extra: dedup
31
- Requires-Dist: camel-tools>=1.5.0; extra == "dedup"
34
+ Requires-Dist: camel-tools==1.5.7; extra == "dedup"
32
35
  Provides-Extra: dedup-native
33
36
  Requires-Dist: cffi>=1.15.0; extra == "dedup-native"
34
37
  Provides-Extra: stem
35
- Requires-Dist: camel-tools>=1.5.0; extra == "stem"
38
+ Requires-Dist: camel-tools==1.5.7; extra == "stem"
36
39
  Provides-Extra: quality
37
- Requires-Dist: camel-tools>=1.5.0; extra == "quality"
40
+ Requires-Dist: camel-tools==1.5.7; extra == "quality"
38
41
  Provides-Extra: readability
39
42
  Requires-Dist: textstat>=0.7.0; extra == "readability"
40
43
  Provides-Extra: pack
41
44
  Requires-Dist: sentencepiece>=0.2.0; extra == "pack"
42
- Requires-Dist: rbpe; extra == "pack"
43
45
  Requires-Dist: pyyaml; extra == "pack"
44
46
  Provides-Extra: all
45
47
  Requires-Dist: dalla-data-processing[dedup,dedup-native,dev,pack,quality,readability,stem]; extra == "all"
@@ -28,7 +28,7 @@ version_tuple: VERSION_TUPLE
28
28
  commit_id: COMMIT_ID
29
29
  __commit_id__: COMMIT_ID
30
30
 
31
- __version__ = version = '0.0.2'
32
- __version_tuple__ = version_tuple = (0, 0, 2)
31
+ __version__ = version = '0.0.4'
32
+ __version_tuple__ = version_tuple = (0, 0, 4)
33
33
 
34
- __commit_id__ = commit_id = 'g68cee46a4'
34
+ __commit_id__ = commit_id = 'g89939c5cc'
@@ -651,7 +651,15 @@ def pack(
651
651
  tokenizer = RBPETokenizer.from_pretrained(config_data["tokenizer_path"])
652
652
  except ImportError:
653
653
  logger.error("Missing rbpe package")
654
- logger.error("Install with: pip install rbpe")
654
+ logger.error(
655
+ "rbpe is not included in the default installation due to "
656
+ "dependency conflicts with camel-tools (transformers version requirements)"
657
+ )
658
+ logger.error("Install separately with: pip install rbpe")
659
+ logger.error(
660
+ "Note: Installing rbpe may require a separate environment "
661
+ "if you also use dedup/stem/quality features"
662
+ )
655
663
  sys.exit(1)
656
664
  else:
657
665
  try:
@@ -1,7 +1,7 @@
1
1
  include ../Makefile.config
2
2
 
3
3
  CC=g++
4
- CFLAGS=-Wall -O3
4
+ CFLAGS=-Wall -O3 -std=c++11 -I/opt/homebrew/opt/google-sparsehash/include
5
5
 
6
6
  OBJS=version.o buzhash.o
7
7
  TARGETS=hashgen hashdup onion
@@ -2,7 +2,7 @@ include ../Makefile.config
2
2
 
3
3
  CC=g++
4
4
  #CFLAGS=-Wall -O3
5
- CFLAGS=-Wall -g
5
+ CFLAGS=-Wall -g -std=c++11 -I/opt/homebrew/opt/google-sparsehash/include
6
6
 
7
7
  OBJS=version.o buzhash.o
8
8
  TARGETS=hashgen hashdup onion
@@ -1,7 +1,7 @@
1
1
  include ../Makefile.config
2
2
 
3
3
  CC=g++
4
- CFLAGS=-Wall -O3
4
+ CFLAGS=-Wall -O3 -std=c++11 -I/opt/homebrew/opt/google-sparsehash/include
5
5
 
6
6
  OBJS=version.o buzhash.o
7
7
  TARGETS=hashgen hashdup onion
@@ -2,7 +2,7 @@ include ../Makefile.config
2
2
 
3
3
  CC=g++
4
4
  #CFLAGS=-Wall -O3
5
- CFLAGS=-Wall -g
5
+ CFLAGS=-Wall -g -std=c++11 -I/opt/homebrew/opt/google-sparsehash/include
6
6
 
7
7
  OBJS=version.o buzhash.o
8
8
  TARGETS=hashgen hashdup onion
@@ -11,6 +11,7 @@ from concurrent.futures import TimeoutError as FutureTimeoutError
11
11
  from types import MethodType
12
12
  from typing import Any
13
13
 
14
+ from camel_tools.data.catalogue import Catalogue
14
15
  from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
15
16
  from camel_tools.disambig.mle import MLEDisambiguator
16
17
  from datasets import Dataset
@@ -53,6 +54,25 @@ class QualityChecker:
53
54
 
54
55
  def _init_disambiguator(self):
55
56
  """Initialize and configure the disambiguator with caching."""
57
+ # Install required CAMeL Tools packages based on model type
58
+ logger.info("Checking CAMeL Tools data packages...")
59
+ catalogue = Catalogue.load_catalogue()
60
+
61
+ try:
62
+ catalogue.download_package("morphology-db-msa-r13")
63
+ catalogue.download_package("disambig-mle-calima-msa-r13")
64
+ logger.info("msa-r13 packages installed")
65
+ except Exception as e:
66
+ logger.warning(f"Package installation warning: {e}")
67
+
68
+ # Install BERT package if using BERT model
69
+ if self.model == "bert":
70
+ try:
71
+ catalogue.download_package("disambig-bert-unfactored-all")
72
+ logger.info("BERT package installed")
73
+ except Exception as e:
74
+ logger.warning(f"BERT package installation warning: {e}")
75
+
56
76
  if self.model == "mle":
57
77
  self.disambiguator = MLEDisambiguator.pretrained()
58
78
  logger.info("MLE disambiguator loaded")
@@ -473,12 +473,19 @@ def stem_dataset(
473
473
  catalogue = Catalogue.load_catalogue()
474
474
  try:
475
475
  catalogue.download_package("morphology-db-msa-r13")
476
- if model == "mle":
477
- catalogue.download_package("disambig-mle-calima-msa-r13")
478
- # For BERT, let it download automatically when pretrained() is called
479
- logger.info("CAMeL Tools data packages ready")
476
+ catalogue.download_package("disambig-mle-calima-msa-r13")
477
+ logger.info("msa-r13 packages installed")
480
478
  except Exception as e:
481
- logger.warning(f"Could not verify CAMeL packages: {e}")
479
+ logger.warning(f"Package installation warning: {e}")
480
+
481
+ if model == "bert":
482
+ try:
483
+ catalogue.download_package("disambig-bert-unfactored-all")
484
+ logger.info("BERT package installed")
485
+ except Exception as e:
486
+ logger.warning(f"BERT package installation warning: {e}")
487
+
488
+ logger.info("CAMeL Tools data packages ready")
482
489
 
483
490
  logger.info("Loading additional words lists...")
484
491
  words_dir = os.path.join(os.path.dirname(__file__), "data")
@@ -597,15 +604,21 @@ def stem(
597
604
  if not all(isinstance(t, str) for t in text_list):
598
605
  raise TypeError("All items in text list must be strings")
599
606
 
600
- # Initialize disambiguator (cached globally if possible)
601
607
  logger.info(f"Initializing {model.upper()} disambiguator...")
602
608
  catalogue = Catalogue.load_catalogue()
603
609
  try:
604
610
  catalogue.download_package("morphology-db-msa-r13")
605
- if model == "mle":
606
- catalogue.download_package("disambig-mle-calima-msa-r13")
611
+ catalogue.download_package("disambig-mle-calima-msa-r13")
612
+ logger.info("msa-r13 packages installed")
607
613
  except Exception as e:
608
- logger.warning(f"Could not verify CAMeL packages: {e}")
614
+ logger.warning(f"Package installation warning: {e}")
615
+
616
+ if model == "bert":
617
+ try:
618
+ catalogue.download_package("disambig-bert-unfactored-all")
619
+ logger.info("BERT package installed")
620
+ except Exception as e:
621
+ logger.warning(f"BERT package installation warning: {e}")
609
622
 
610
623
  if model == "mle":
611
624
  disambiguator = MLEDisambiguator.pretrained("calima-msa-r13", cache_size=1_000_000)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dalla-data-processing
3
- Version: 0.0.2
3
+ Version: 0.0.4
4
4
  Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
5
5
  Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
6
6
  Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
@@ -11,10 +11,13 @@ Keywords: arabic,nlp,data-processing,deduplication,stemming,readability,quality
11
11
  Classifier: Intended Audience :: Developers
12
12
  Classifier: Intended Audience :: Science/Research
13
13
  Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.10
15
+ Classifier: Programming Language :: Python :: 3.11
14
16
  Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Programming Language :: Python :: 3.13
15
18
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
19
  Classifier: Topic :: Text Processing :: Linguistic
17
- Requires-Python: >=3.12
20
+ Requires-Python: >=3.10
18
21
  Description-Content-Type: text/markdown
19
22
  Requires-Dist: datasets>=2.14.0
20
23
  Requires-Dist: transformers>=4.30.0
@@ -28,18 +31,17 @@ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
28
31
  Requires-Dist: ruff>=0.1.0; extra == "dev"
29
32
  Requires-Dist: pre-commit>=3.0.0; extra == "dev"
30
33
  Provides-Extra: dedup
31
- Requires-Dist: camel-tools>=1.5.0; extra == "dedup"
34
+ Requires-Dist: camel-tools==1.5.7; extra == "dedup"
32
35
  Provides-Extra: dedup-native
33
36
  Requires-Dist: cffi>=1.15.0; extra == "dedup-native"
34
37
  Provides-Extra: stem
35
- Requires-Dist: camel-tools>=1.5.0; extra == "stem"
38
+ Requires-Dist: camel-tools==1.5.7; extra == "stem"
36
39
  Provides-Extra: quality
37
- Requires-Dist: camel-tools>=1.5.0; extra == "quality"
40
+ Requires-Dist: camel-tools==1.5.7; extra == "quality"
38
41
  Provides-Extra: readability
39
42
  Requires-Dist: textstat>=0.7.0; extra == "readability"
40
43
  Provides-Extra: pack
41
44
  Requires-Dist: sentencepiece>=0.2.0; extra == "pack"
42
- Requires-Dist: rbpe; extra == "pack"
43
45
  Requires-Dist: pyyaml; extra == "pack"
44
46
  Provides-Extra: all
45
47
  Requires-Dist: dalla-data-processing[dedup,dedup-native,dev,pack,quality,readability,stem]; extra == "all"
@@ -14,6 +14,7 @@ dalla_data_processing.egg-info/PKG-INFO
14
14
  dalla_data_processing.egg-info/SOURCES.txt
15
15
  dalla_data_processing.egg-info/dependency_links.txt
16
16
  dalla_data_processing.egg-info/entry_points.txt
17
+ dalla_data_processing.egg-info/not-zip-safe
17
18
  dalla_data_processing.egg-info/requires.txt
18
19
  dalla_data_processing.egg-info/top_level.txt
19
20
  dalla_data_processing/core/README.md
@@ -9,7 +9,7 @@ structlog>=24.0.0
9
9
  dalla-data-processing[dedup,dedup-native,dev,pack,quality,readability,stem]
10
10
 
11
11
  [dedup]
12
- camel-tools>=1.5.0
12
+ camel-tools==1.5.7
13
13
 
14
14
  [dedup-native]
15
15
  cffi>=1.15.0
@@ -22,14 +22,13 @@ pre-commit>=3.0.0
22
22
 
23
23
  [pack]
24
24
  sentencepiece>=0.2.0
25
- rbpe
26
25
  pyyaml
27
26
 
28
27
  [quality]
29
- camel-tools>=1.5.0
28
+ camel-tools==1.5.7
30
29
 
31
30
  [readability]
32
31
  textstat>=0.7.0
33
32
 
34
33
  [stem]
35
- camel-tools>=1.5.0
34
+ camel-tools==1.5.7
@@ -11,13 +11,16 @@ authors = [
11
11
  {name = "Digital Research Unit - Arab Center", email = "dru@dohainstitute.edu.qa"}
12
12
  ]
13
13
  readme = "README.md"
14
- requires-python = ">=3.12"
14
+ requires-python = ">=3.10"
15
15
  keywords = ["arabic", "nlp", "data-processing", "deduplication", "stemming", "readability", "quality"]
16
16
  classifiers = [
17
17
  "Intended Audience :: Developers",
18
18
  "Intended Audience :: Science/Research",
19
19
  "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
20
22
  "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
21
24
  "Topic :: Scientific/Engineering :: Artificial Intelligence",
22
25
  "Topic :: Text Processing :: Linguistic",
23
26
  ]
@@ -39,23 +42,24 @@ dev = [
39
42
  "pre-commit>=3.0.0",
40
43
  ]
41
44
  dedup = [
42
- "camel-tools>=1.5.0",
45
+ "camel-tools==1.5.7",
43
46
  ]
44
47
  dedup-native = [
45
48
  "cffi>=1.15.0",
46
49
  ]
47
50
  stem = [
48
- "camel-tools>=1.5.0",
51
+ "camel-tools==1.5.7",
49
52
  ]
50
53
  quality = [
51
- "camel-tools>=1.5.0",
54
+ "camel-tools==1.5.7",
52
55
  ]
53
56
  readability = [
54
57
  "textstat>=0.7.0",
55
58
  ]
56
59
  pack = [
57
60
  "sentencepiece>=0.2.0",
58
- "rbpe",
61
+ # "rbpe", # excluded due to transformers version conflict with camel-tools
62
+ # users should install separately if needed: pip install rbpe
59
63
  "pyyaml",
60
64
  ]
61
65
  all = [
@@ -75,6 +79,8 @@ Repository = "https://github.com/U4RASD/dalla-data-processing"
75
79
  packages = ["dalla_data_processing", "dalla_data_processing.core", "dalla_data_processing.deduplication", "dalla_data_processing.packing", "dalla_data_processing.stemming", "dalla_data_processing.quality", "dalla_data_processing.readability", "dalla_data_processing.utils"]
76
80
  include-package-data = true
77
81
 
82
+ zip-safe = false
83
+
78
84
  [tool.setuptools.package-data]
79
85
  dalla_data_processing = ["py.typed"]
80
86
  "dalla_data_processing.stemming" = ["data/*.txt"]
@@ -82,7 +88,7 @@ dalla_data_processing = ["py.typed"]
82
88
 
83
89
  [tool.ruff]
84
90
  line-length = 100
85
- target-version = "py312"
91
+ target-version = "py310"
86
92
  src = ["dalla_data_processing"]
87
93
 
88
94
  [tool.ruff.lint]
@@ -63,14 +63,16 @@ if [[ "$OSTYPE" == "darwin"* ]]; then
63
63
  # On macOS, add Homebrew paths for sparsehash
64
64
  if command -v brew &> /dev/null; then
65
65
  BREW_PREFIX=$(brew --prefix)
66
- EXTRA_CFLAGS="-I${BREW_PREFIX}/include"
66
+ SPARSEHASH_PREFIX=$(brew --prefix google-sparsehash 2>/dev/null || echo "${BREW_PREFIX}")
67
+ EXTRA_CFLAGS="-I${SPARSEHASH_PREFIX}/include"
67
68
  echo -e "${YELLOW}Using Homebrew prefix: ${BREW_PREFIX}${NC}"
69
+ echo -e "${YELLOW}Using sparsehash include: ${SPARSEHASH_PREFIX}/include${NC}"
68
70
  fi
69
71
  fi
70
72
 
71
73
  # Build onion
72
74
  echo -e "${YELLOW}Compiling Onion...${NC}"
73
- if make CFLAGS="-Wall -O3 ${EXTRA_CFLAGS}"; then
75
+ if make CFLAGS="-Wall -O3 -std=c++11 ${EXTRA_CFLAGS}"; then
74
76
  echo -e "${GREEN}✓ Compilation successful${NC}"
75
77
  else
76
78
  echo -e "${RED}✗ Compilation failed${NC}"