dalla-data-processing 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. dalla_data_processing-0.0.1/MANIFEST.in +8 -0
  2. dalla_data_processing-0.0.1/PKG-INFO +393 -0
  3. dalla_data_processing-0.0.1/README.md +355 -0
  4. dalla_data_processing-0.0.1/dalla/__init__.py +27 -0
  5. dalla_data_processing-0.0.1/dalla/cli.py +453 -0
  6. dalla_data_processing-0.0.1/dalla/core/__init__.py +6 -0
  7. dalla_data_processing-0.0.1/dalla/core/dataset.py +387 -0
  8. dalla_data_processing-0.0.1/dalla/core/parallel.py +279 -0
  9. dalla_data_processing-0.0.1/dalla/deduplication/__init__.py +370 -0
  10. dalla_data_processing-0.0.1/dalla/deduplication/bin/.gitignore +1 -0
  11. dalla_data_processing-0.0.1/dalla/deduplication/onion/COPYING +24 -0
  12. dalla_data_processing-0.0.1/dalla/deduplication/onion/Makefile +21 -0
  13. dalla_data_processing-0.0.1/dalla/deduplication/onion/Makefile.config +3 -0
  14. dalla_data_processing-0.0.1/dalla/deduplication/onion/README.md +21 -0
  15. dalla_data_processing-0.0.1/dalla/deduplication/onion/src/Makefile +22 -0
  16. dalla_data_processing-0.0.1/dalla/deduplication/onion/src/Makefile.g +23 -0
  17. dalla_data_processing-0.0.1/dalla/deduplication/onion/src/buzhash.c +325 -0
  18. dalla_data_processing-0.0.1/dalla/deduplication/onion/src/buzhash.h +30 -0
  19. dalla_data_processing-0.0.1/dalla/deduplication/onion/src/hashdup.c +172 -0
  20. dalla_data_processing-0.0.1/dalla/deduplication/onion/src/hashgen.c +206 -0
  21. dalla_data_processing-0.0.1/dalla/deduplication/onion/src/onion +0 -0
  22. dalla_data_processing-0.0.1/dalla/deduplication/onion/src/onion.c +799 -0
  23. dalla_data_processing-0.0.1/dalla/deduplication/onion/src/onion_dup.c +824 -0
  24. dalla_data_processing-0.0.1/dalla/deduplication/onion/src/version.c +17 -0
  25. dalla_data_processing-0.0.1/dalla/deduplication/onion/src/version.h +10 -0
  26. dalla_data_processing-0.0.1/dalla/deduplication/onion/src_sc/Makefile +22 -0
  27. dalla_data_processing-0.0.1/dalla/deduplication/onion/src_sc/Makefile.g +23 -0
  28. dalla_data_processing-0.0.1/dalla/deduplication/onion/src_sc/buzhash.c +325 -0
  29. dalla_data_processing-0.0.1/dalla/deduplication/onion/src_sc/buzhash.h +30 -0
  30. dalla_data_processing-0.0.1/dalla/deduplication/onion/src_sc/hashdup +0 -0
  31. dalla_data_processing-0.0.1/dalla/deduplication/onion/src_sc/hashdup.c +172 -0
  32. dalla_data_processing-0.0.1/dalla/deduplication/onion/src_sc/hashgen +0 -0
  33. dalla_data_processing-0.0.1/dalla/deduplication/onion/src_sc/hashgen.c +206 -0
  34. dalla_data_processing-0.0.1/dalla/deduplication/onion/src_sc/onion.c +854 -0
  35. dalla_data_processing-0.0.1/dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
  36. dalla_data_processing-0.0.1/dalla/deduplication/onion/src_sc/version.c +17 -0
  37. dalla_data_processing-0.0.1/dalla/deduplication/onion/src_sc/version.h +10 -0
  38. dalla_data_processing-0.0.1/dalla/deduplication/onion_wrapper.py +223 -0
  39. dalla_data_processing-0.0.1/dalla/deduplication/postprocessing.py +216 -0
  40. dalla_data_processing-0.0.1/dalla/deduplication/preprocessing.py +120 -0
  41. dalla_data_processing-0.0.1/dalla/quality/__init__.py +5 -0
  42. dalla_data_processing-0.0.1/dalla/quality/checker.py +354 -0
  43. dalla_data_processing-0.0.1/dalla/readability/__init__.py +197 -0
  44. dalla_data_processing-0.0.1/dalla/readability/ranking.py +165 -0
  45. dalla_data_processing-0.0.1/dalla/readability/scorer.py +148 -0
  46. dalla_data_processing-0.0.1/dalla/stemming/__init__.py +551 -0
  47. dalla_data_processing-0.0.1/dalla/stemming/data/words_al.txt +3414 -0
  48. dalla_data_processing-0.0.1/dalla/stemming/data/words_al_t.txt +885 -0
  49. dalla_data_processing-0.0.1/dalla/stemming/data/words_t.txt +7 -0
  50. dalla_data_processing-0.0.1/dalla/utils/__init__.py +10 -0
  51. dalla_data_processing-0.0.1/dalla/utils/logger.py +128 -0
  52. dalla_data_processing-0.0.1/dalla/utils/tokenize.py +89 -0
  53. dalla_data_processing-0.0.1/dalla_data_processing.egg-info/PKG-INFO +393 -0
  54. dalla_data_processing-0.0.1/dalla_data_processing.egg-info/SOURCES.txt +58 -0
  55. dalla_data_processing-0.0.1/dalla_data_processing.egg-info/dependency_links.txt +1 -0
  56. dalla_data_processing-0.0.1/dalla_data_processing.egg-info/entry_points.txt +2 -0
  57. dalla_data_processing-0.0.1/dalla_data_processing.egg-info/requires.txt +22 -0
  58. dalla_data_processing-0.0.1/dalla_data_processing.egg-info/top_level.txt +1 -0
  59. dalla_data_processing-0.0.1/pyproject.toml +111 -0
  60. dalla_data_processing-0.0.1/setup.cfg +4 -0
@@ -0,0 +1,8 @@
1
+ include README.md
2
+ include LICENSE
3
+ recursive-include dalla *.py
4
+ recursive-include dalla/stemming/data *.txt
5
+ recursive-include dalla/deduplication/bin *
6
+ recursive-include dalla/deduplication/onion *.c *.h Makefile*
7
+ global-exclude __pycache__
8
+ global-exclude *.py[co]
@@ -0,0 +1,393 @@
1
+ Metadata-Version: 2.4
2
+ Name: dalla-data-processing
3
+ Version: 0.0.1
4
+ Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
5
+ Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
6
+ Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
7
+ Project-URL: Documentation, https://github.com/U4RASD/dalla-data-processing#readme
8
+ Project-URL: Repository, https://github.com/U4RASD/dalla-data-processing
9
+ Project-URL: Bug Tracker, https://github.com/U4RASD/dalla-data-processing/issues
10
+ Keywords: arabic,nlp,data-processing,deduplication,stemming,readability,quality
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Topic :: Text Processing :: Linguistic
17
+ Requires-Python: >=3.12
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: datasets>=2.14.0
20
+ Requires-Dist: transformers>=4.30.0
21
+ Requires-Dist: camel-tools>=1.5.0
22
+ Requires-Dist: click>=8.0.0
23
+ Requires-Dist: tqdm>=4.65.0
24
+ Requires-Dist: pandas>=2.0.0
25
+ Requires-Dist: numpy>=1.24.0
26
+ Requires-Dist: pyarrow>=12.0.0
27
+ Requires-Dist: textstat>=0.7.0
28
+ Requires-Dist: structlog>=24.0.0
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
31
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
32
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
33
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
34
+ Provides-Extra: dedup-native
35
+ Requires-Dist: cffi>=1.15.0; extra == "dedup-native"
36
+ Provides-Extra: all
37
+ Requires-Dist: dalla-data-processing[dedup-native,dev]; extra == "all"
38
+
39
+ # Dalla Data Processing (dalla-dp)
40
+
41
+ A comprehensive Arabic data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models.
42
+
43
+ ## Compatibility
44
+
45
+ - **Linux**: Fully supported
46
+ - **macOS**: Fully supported (Intel, or Apple Silicon through Rosetta)
47
+ - **Windows**: Supported only through WSL (Windows Subsystem for Linux). On native Windows, deduplication works after a manual build from source.
48
+
49
+ ## Installation
50
+
51
+ <b>Using uv</b>
52
+
53
+ ```bash
54
+ # Install the package
55
+ uv pip install dalla-data-processing
56
+ ```
57
+
58
+
59
+ <b>Using pip</b>
60
+
61
+ ```bash
62
+ # Install the package
63
+ pip install dalla-data-processing
64
+ ```
65
+
66
+
67
+ <b>From Source</b>
68
+
69
+ ```bash
70
+ git clone https://github.com/U4RASD/dalla-data-processing.git
71
+ cd dalla-data-processing
72
+
73
+ # Using uv
74
+ uv pip install -e .
75
+
76
+ # Or using pip
77
+ pip install -e .
78
+ ```
79
+
80
+ ## Components
81
+
82
+ ### Deduplication
83
+
84
+ **CLI Usage**
85
+
86
+ **Command:** `dalla-dp deduplicate [OPTIONS]`
87
+
88
+ **Arguments:**
89
+ - `-t, --threshold FLOAT` - Similarity threshold (0.0-1.0, default: 0.8)
90
+ - `--return-pairs` / `--filter-duplicates` - Return dataset with duplicate info (default) or filtered dataset
91
+ - `--keep-vert-files` - Keep vertical format files for inspection
92
+ - `--vert-dir PATH` - Directory to store vertical files (useful for placing them on a different disk)
93
+ - `--calculate-scores` - Run phase 2 to calculate similarity scores (slower but more precise)
94
+ - `--onion-binary PATH` - Path to onion binary (auto-detected if not specified)
95
+
96
+ **Examples:**
97
+ ```bash
98
+ # Basic deduplication
99
+ dalla-dp -i ./data/raw -o ./data/deduped deduplicate
100
+
101
+ # With custom threshold
102
+ dalla-dp -i ./data/raw -o ./data/deduped deduplicate --threshold 0.9
103
+
104
+ # Return filtered dataset (removes duplicates)
105
+ dalla-dp -i ./data/raw -o ./data/clean deduplicate --filter-duplicates
106
+
107
+ # Keep intermediate files for inspection
108
+ dalla-dp -i ./data/raw -o ./data/deduped deduplicate --keep-vert-files
109
+
110
+ # Calculate precise similarity scores (slower)
111
+ dalla-dp -i ./data/raw -o ./data/deduped deduplicate --calculate-scores
112
+
113
+ # Use custom onion binary
114
+ dalla-dp -i ./data/raw -o ./data/deduped deduplicate --onion-binary /path/to/onion
115
+ ```
116
+
117
+ **Python API**
118
+
119
+ ```python
120
+ from datasets import load_from_disk
121
+ from dalla.deduplication import deduplicate_dataset
122
+
123
+ # Load dataset
124
+ dataset = load_from_disk("./data/raw")
125
+
126
+ # get duplicate information (adds columns: duplicate_cluster, is_duplicate, duplicate_count)
127
+ result = deduplicate_dataset(dataset, column="text", threshold=0.8, return_pairs=True)
128
+
129
+ # filter to see only duplicates
130
+ duplicates = result.filter(lambda x: x['is_duplicate'])
131
+
132
+ result.save_to_disk("./data/clean")
133
+ ```
134
+
135
+ ### Stemming
136
+
137
+ Apply morphological analysis and stemming using CAMeL Tools.
138
+
139
+ **CLI Usage**
140
+
141
+ **Command:** `dalla-dp stem [OPTIONS]`
142
+
143
+ **Arguments:**
144
+ - `--sep-token TEXT` - Separator token for morphological splits (default: `<+>`)
145
+ - `--normalize` - Apply Arabic normalization
146
+ - `--keep-diacritics` - Keep diacritics in output
147
+ - `--model [mle|bert]` - Disambiguator model: `mle` (default, faster) or `bert` (more accurate)
148
+ - `--use-gpu` - Use GPU for BERT model (only applicable when --model=bert)
149
+
150
+ **Examples:**
151
+ ```bash
152
+ # Basic stemming with MLE model
153
+ dalla-dp -i ./data/raw -o ./data/stemmed stem
154
+
155
+ # Use BERT model
156
+ dalla-dp -i ./data/raw -o ./data/stemmed stem --model bert
157
+
158
+ # Use BERT with GPU acceleration
159
+ dalla-dp -i ./data/raw -o ./data/stemmed stem --model bert --use-gpu
160
+
161
+ # Custom separator token
162
+ dalla-dp -i ./data/raw -o ./data/stemmed stem --sep-token "<SEP>"
163
+
164
+ # Apply normalization
165
+ dalla-dp -i ./data/raw -o ./data/stemmed stem --normalize
166
+
167
+ # Keep diacritics in output
168
+ dalla-dp -i ./data/raw -o ./data/stemmed stem --keep-diacritics
169
+
170
+ ```
171
+
172
+ **Python API**
173
+
174
+ ```python
175
+ from datasets import load_from_disk
176
+ from dalla.stemming import stem_dataset
177
+
178
+ # Load dataset
179
+ dataset = load_from_disk("./data/raw")
180
+
181
+ stemmed = stem_dataset(dataset, column="text")
182
+
183
+ stemmed = stem_dataset(
184
+ dataset,
185
+ column="text",
186
+ model="bert",
187
+ use_gpu=True,
188
+ num_proc=8
189
+ )
190
+
191
+ stemmed = stem_dataset(
192
+ dataset,
193
+ column="content",
194
+ sep_token="<+>",
195
+ normalize=True,
196
+ keep_diacritics=True
197
+ )
198
+
199
+ stemmed.save_to_disk("./data/stemmed")
200
+ ```
201
+
202
+ ### Quality Checking
203
+
204
+ Check text quality using morphological analysis to detect errors and foreign words.
205
+
206
+ **CLI Usage**
207
+
208
+ **Command:** `dalla-dp quality-check [OPTIONS]`
209
+
210
+ **Arguments:**
211
+ - `--min-score FLOAT` - Minimum quality score to keep (0-100, default: 0)
212
+ - `--save-errors` - Save erroneous words to file
213
+ - `--model [mle|bert]` - Disambiguator model: `mle` (default, faster) or `bert` (more accurate)
214
+ - `--use-gpu` - Use GPU for BERT model (only applicable when --model=bert)
215
+
216
+ **Examples:**
217
+ ```bash
218
+ dalla-dp -i ./data/raw -o ./data/quality quality-check
219
+
220
+ # Filter low-quality texts (score < 50)
221
+ dalla-dp -i ./data/raw -o ./data/quality quality-check --min-score 50
222
+
223
+ # Save erroneous words to log
224
+ dalla-dp -i ./data/raw -o ./data/quality quality-check --save-errors
225
+
226
+ # Use BERT model with GPU
227
+ dalla-dp -i ./data/raw -o ./data/quality quality-check --model bert --use-gpu
228
+
229
+ dalla-dp -i ./data/raw -o ./data/quality -c content quality-check
230
+ ```
231
+
232
+ **Python API**
233
+
234
+ ```python
235
+ from datasets import load_from_disk
236
+ from dalla.quality import check_quality
237
+
238
+ dataset = load_from_disk("./data/raw")
239
+
240
+ scored = check_quality(dataset, column="text")
241
+
242
+ high_quality = check_quality(
243
+ dataset,
244
+ column="text",
245
+ min_score=60.0,
246
+ save_errors=True
247
+ )
248
+
249
+ scored = check_quality(
250
+ dataset,
251
+ model="bert",
252
+ use_gpu=True,
253
+ num_workers=4,
254
+ timeout=3600
255
+ )
256
+
257
+ scored.save_to_disk("./data/quality")
258
+ ```
259
+
260
+ ### Readability Scoring
261
+
262
+ Calculate readability scores using Flesch Reading Ease and Osman methods.
263
+
264
+ **CLI Usage**
265
+
266
+ **Command:** `dalla-dp readability [OPTIONS]`
267
+
268
+ **Arguments:**
269
+ - `--add-ranks` / `--no-ranks` - Add ranking and level columns (default: `--add-ranks`)
270
+
271
+ **Examples:**
272
+ ```bash
273
+ dalla-dp -i ./data/raw -o ./data/scored readability
274
+
275
+ dalla-dp -i ./data/raw -o ./data/scored readability --no-ranks
276
+
277
+ dalla-dp -i ./data/raw -o ./data/scored -c content readability
278
+ ```
279
+
280
+ **Python API**
281
+
282
+ ```python
283
+ from datasets import load_from_disk
284
+ from dalla.readability import score_readability
285
+
286
+ # Load dataset
287
+ dataset = load_from_disk("./data/raw")
288
+
289
+ scored = score_readability(dataset, column="text", add_ranks=True)
290
+
291
+ # Save result
292
+ scored.save_to_disk("./data/scored")
293
+ ```
294
+
295
+ **Readability Levels:**
296
+ - `0`: Very Easy
297
+ - `1`: Easy
298
+ - `2`: Medium
299
+ - `3`: Difficult
300
+ - `4`: Very Difficult
301
+
302
+ ### Dataset Management
303
+
304
+ Utilities for loading, saving, and inspecting datasets.
305
+
306
+ **CLI Usage**
307
+
308
+ **Command:** `dalla-dp info [OPTIONS] DATASET_PATH`
309
+
310
+ **Arguments:**
311
+ - `DATASET_PATH` - Path to the dataset (required, positional argument)
312
+ - `--split TEXT` - Specific split to show info for
313
+
314
+ **Examples:**
315
+ ```bash
316
+ # Show dataset information
317
+ dalla-dp info ./data/my_dataset
318
+
319
+ ```
320
+
321
+ **Python API**
322
+
323
+ ```python
324
+ from dalla.core.dataset import DatasetManager
325
+
326
+ dm = DatasetManager()
327
+
328
+ dataset = dm.load("./data/my_dataset")
329
+ train_data = dm.load("./data/my_dataset", split="train")
330
+
331
+
332
+ info = dm.get_info(dataset)
333
+ dm.print_info(dataset)
334
+
335
+ size = dm.get_size(dataset)
336
+
337
+ filtered = dm.filter_dataset(
338
+ dataset,
339
+ lambda x: x['quality_score'] > 80.0,
340
+ num_proc=4
341
+ )
342
+
343
+ scores = [0.95, 0.87, 0.92, ...]
344
+ dataset = dm.add_column(dataset, "my_score", scores)
345
+
346
+ subset = dm.select_columns(dataset, ["text", "quality_score"])
347
+ cleaned = dm.remove_columns(dataset, ["temp_column"])
348
+
349
+ splits = dm.train_test_split(dataset, test_size=0.2, seed=42)
350
+ ```
351
+
352
+ **Working with DatasetDict**
353
+
354
+ ```python
355
+ from datasets import DatasetDict, load_from_disk
356
+ from dalla.quality import check_quality
357
+
358
+ dataset_dict = load_from_disk("./data/my_dataset")
359
+
360
+ processed_dict = DatasetDict({
361
+ split: check_quality(ds, min_score=60.0)
362
+ for split, ds in dataset_dict.items()
363
+ })
364
+
365
+ train_processed = check_quality(dataset_dict['train'], min_score=60.0)
366
+ ```
367
+
368
+ ## Building Onion from Source
369
+
370
+ **Build Instructions**
371
+
372
+ The onion deduplication tool needs to be compiled for your system:
373
+
374
+ ```bash
375
+ cd dalla/deduplication/onion/src_sc
376
+
377
+ # Compile
378
+ make -f Makefile.g
379
+
380
+ ```
381
+
382
+ Alternatively, use the build script from the repository root:
383
+
384
+ ```bash
385
+ chmod +x scripts/build_onion.sh
386
+ ./scripts/build_onion.sh
387
+ ```
388
+
389
+ ## Links
390
+
391
+ - Homepage: https://github.com/U4RASD/dalla-data-processing
392
+ - Issues: https://github.com/U4RASD/dalla-data-processing/issues
393
+ - Documentation: https://github.com/U4RASD/dalla-data-processing#readme