dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. dalla/__init__.py +27 -0
  2. dalla/cli.py +453 -0
  3. dalla/core/__init__.py +6 -0
  4. dalla/core/dataset.py +387 -0
  5. dalla/core/parallel.py +279 -0
  6. dalla/deduplication/__init__.py +370 -0
  7. dalla/deduplication/bin/.gitignore +1 -0
  8. dalla/deduplication/bin/onion-linux-x86_64 +0 -0
  9. dalla/deduplication/onion/COPYING +24 -0
  10. dalla/deduplication/onion/Makefile +21 -0
  11. dalla/deduplication/onion/Makefile.config +3 -0
  12. dalla/deduplication/onion/README.md +21 -0
  13. dalla/deduplication/onion/src/Makefile +22 -0
  14. dalla/deduplication/onion/src/Makefile.g +23 -0
  15. dalla/deduplication/onion/src/buzhash.c +325 -0
  16. dalla/deduplication/onion/src/buzhash.h +30 -0
  17. dalla/deduplication/onion/src/hashdup.c +172 -0
  18. dalla/deduplication/onion/src/hashgen.c +206 -0
  19. dalla/deduplication/onion/src/onion +0 -0
  20. dalla/deduplication/onion/src/onion.c +799 -0
  21. dalla/deduplication/onion/src/onion_dup.c +824 -0
  22. dalla/deduplication/onion/src/version.c +17 -0
  23. dalla/deduplication/onion/src/version.h +10 -0
  24. dalla/deduplication/onion/src_sc/Makefile +22 -0
  25. dalla/deduplication/onion/src_sc/Makefile.g +23 -0
  26. dalla/deduplication/onion/src_sc/buzhash.c +325 -0
  27. dalla/deduplication/onion/src_sc/buzhash.h +30 -0
  28. dalla/deduplication/onion/src_sc/hashdup +0 -0
  29. dalla/deduplication/onion/src_sc/hashdup.c +172 -0
  30. dalla/deduplication/onion/src_sc/hashgen +0 -0
  31. dalla/deduplication/onion/src_sc/hashgen.c +206 -0
  32. dalla/deduplication/onion/src_sc/onion.c +854 -0
  33. dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
  34. dalla/deduplication/onion/src_sc/version.c +17 -0
  35. dalla/deduplication/onion/src_sc/version.h +10 -0
  36. dalla/deduplication/onion_wrapper.py +223 -0
  37. dalla/deduplication/postprocessing.py +216 -0
  38. dalla/deduplication/preprocessing.py +120 -0
  39. dalla/quality/__init__.py +5 -0
  40. dalla/quality/checker.py +354 -0
  41. dalla/readability/__init__.py +197 -0
  42. dalla/readability/ranking.py +165 -0
  43. dalla/readability/scorer.py +148 -0
  44. dalla/stemming/__init__.py +551 -0
  45. dalla/stemming/data/words_al.txt +3414 -0
  46. dalla/stemming/data/words_al_t.txt +885 -0
  47. dalla/stemming/data/words_t.txt +7 -0
  48. dalla/utils/__init__.py +10 -0
  49. dalla/utils/logger.py +128 -0
  50. dalla/utils/tokenize.py +89 -0
  51. dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
  52. dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
  53. dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
  54. dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
  55. dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
dalla/__init__.py ADDED
@@ -0,0 +1,27 @@
1
+ """
2
+ Dalla Data Processing
3
+
4
+ A comprehensive toolkit for processing Arabic text data with support for:
5
+ - Deduplication
6
+ - Stemming and morphological analysis
7
+ - Quality checking
8
+ - Readability scoring
9
+ """
10
+
11
from importlib.metadata import PackageNotFoundError
from importlib.metadata import version as _distribution_version

# `dalla.cli` imports `__version__` from this package, so it must be defined
# here. Prefer the installed distribution metadata and fall back to the release
# number when the package is imported without installed metadata (e.g. from a
# source checkout).
try:
    __version__ = _distribution_version("dalla-data-processing")
except PackageNotFoundError:
    __version__ = "0.0.1"

# Optional re-exports: when a submodule (or one of its dependencies) cannot be
# imported, the name is still bound -- to None -- and the matching `_has_*`
# flag records that the feature is unavailable.
try:
    from dalla.core.dataset import DatasetManager

    _has_dataset = True
except ImportError:
    _has_dataset = False
    DatasetManager = None

try:
    from dalla.utils.tokenize import simple_word_tokenize

    _has_tokenize = True
except ImportError:
    _has_tokenize = False
    simple_word_tokenize = None

__all__ = ["DatasetManager", "simple_word_tokenize", "__version__"]
dalla/cli.py ADDED
@@ -0,0 +1,453 @@
1
+ """
2
+ Main CLI entry point for dalla-process.
3
+
4
+ This module provides the unified command-line interface for all
5
+ Arabic data processing operations.
6
+ """
7
+
8
+ import sys
9
+ from pathlib import Path
10
+
11
+ import click
12
+ from datasets import Dataset, DatasetDict
13
+
14
+ from dalla import __version__
15
+ from dalla.core.dataset import DatasetManager
16
+ from dalla.utils import get_logger, setup_logging
17
+
18
# Configure console logging at INFO level as soon as the module is imported;
# the `cli` group callback calls setup_logging again for --quiet / --verbose.
setup_logging(log_level="INFO", log_format="console")
logger = get_logger(__name__)
20
+
21
+
22
class Context:
    """Mutable state shared between the ``cli`` group and its subcommands.

    The group callback copies the global command-line options onto an
    instance; subcommands receive the same instance via ``pass_context``.
    """

    def __init__(self):
        # Helper the subcommands use to load and save datasets.
        self.dataset_manager = DatasetManager()
        # Slot for a dataset object; the visible code never assigns it
        # after construction.
        self.dataset: Dataset | None = None

        # Global options, overwritten by the `cli` group callback.
        self.input_dataset: Path | None = None
        self.output_dataset: Path | None = None
        self.column: str = "text"
        self.num_workers: int | None = None
        self.verbose: bool = False
        self.overwrite: bool = False


# Decorator that passes the active Context object as the first argument of a
# command callback; ensure=True asks click to create one when none exists yet.
pass_context = click.make_pass_decorator(Context, ensure=True)
37
+
38
+
39
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
@click.version_option(version=__version__, prog_name="dalla-process")
@click.option(
    "--input-dataset",
    "-i",
    type=click.Path(exists=True, path_type=Path),
    help="Path to input HuggingFace dataset",
)
@click.option(
    "--output-dataset",
    "-o",
    type=click.Path(path_type=Path),
    help="Path to save output HuggingFace dataset",
)
@click.option(
    "--column",
    "-c",
    default="text",
    help="Column name to process (default: 'text')",
)
@click.option(
    "--num-workers",
    "-w",
    type=int,
    help="Number of parallel workers (default: auto)",
)
@click.option(
    "--verbose",
    "-v",
    is_flag=True,
    help="Enable verbose output",
)
@click.option(
    "--quiet",
    "-q",
    is_flag=True,
    help="Suppress non-error output",
)
@click.option(
    "--overwrite",
    is_flag=True,
    help="Overwrite output dataset if it already exists",
)
@pass_context
def cli(
    ctx: Context,
    input_dataset: Path | None,
    output_dataset: Path | None,
    column: str,
    num_workers: int | None,
    verbose: bool,
    quiet: bool,
    overwrite: bool,
):
    """
    Dalla Data Processing - Unified Arabic Data Processing Pipeline

    \b
    A comprehensive toolkit for processing Arabic text data with support for:
    - Deduplication using onion algorithm
    - Stemming and morphological analysis
    - Quality checking
    - Readability scoring

    Examples:

    \b
    # Deduplicate a dataset
    dalla-dp -i ./data/raw -o ./data/deduped deduplicate

    \b
    # Stem text with 8 workers
    dalla-dp -i ./data/raw -o ./data/stemmed -w 8 stem

    \b
    # Check quality with custom column
    dalla-dp -i ./data/raw -o ./data/quality -c content quality-check
    """
    # The docstring above is rendered by click as the --help text, so
    # developer notes live in comments instead. Each "\b" line stops click
    # from re-wrapping the paragraph that follows it; without the markers the
    # bullet list and the two-line examples are merged into running text.
    #
    # NOTE(review): the examples invoke `dalla-dp` while --version reports
    # `dalla-process`; confirm which script name entry_points.txt installs.

    # Copy the global options onto the shared context so that subcommands
    # can read them back through `pass_context`.
    ctx.input_dataset = input_dataset
    ctx.output_dataset = output_dataset
    ctx.column = column
    ctx.num_workers = num_workers
    ctx.verbose = verbose
    ctx.overwrite = overwrite

    # --quiet takes precedence when both --quiet and --verbose are given.
    if quiet:
        setup_logging(log_format="console", log_level="ERROR")
    elif verbose:
        setup_logging(log_format="console", log_level="DEBUG")
124
+
125
+
126
@cli.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.option(
    "--threshold",
    "-t",
    type=float,
    default=0.8,
    help="Similarity threshold (0.0-1.0, default: 0.8)",
)
@click.option(
    "--return-pairs/--filter-duplicates",
    default=False,
    help="Return dataset with duplicate info (True) or filtered dataset (False)",
)
@click.option(
    "--keep-vert-files",
    is_flag=True,
    help="Keep vertical format files for inspection",
)
@click.option(
    "--vert-dir",
    type=click.Path(),
    help="Directory to store vertical files (useful for different disk)",
)
@click.option(
    "--calculate-scores",
    is_flag=True,
    help="Run phase 2 to calculate similarity scores (slower but more precise)",
)
@click.option(
    "--onion-binary",
    type=click.Path(exists=True),
    help="Path to onion binary (auto-detected if not specified)",
)
@pass_context
def deduplicate(
    ctx: Context,
    threshold: float,
    return_pairs: bool,
    keep_vert_files: bool,
    vert_dir: str | None,
    calculate_scores: bool,
    onion_binary: str | None,
):
    """Remove duplicate entries using onion algorithm."""
    # The docstring doubles as the command's --help text; keep notes here.
    # Flow: load the input dataset, run `deduplicate_dataset`, save the
    # result, then print a summary of what changed.
    _require_io_paths(ctx)

    click.echo(f"Loading dataset from {ctx.input_dataset}")
    dataset = ctx.dataset_manager.load(ctx.input_dataset)
    dataset = _handle_dataset_dict(dataset)

    mode = "pairs" if return_pairs else "filter"
    click.echo(f"Deduplicating with threshold={threshold}, mode={mode}")
    if calculate_scores:
        click.echo(" Phase 2: ON (calculating similarity scores)")
    else:
        click.echo(" Phase 2: OFF (faster, sufficient for most use cases)")

    # Imported lazily so the deduplication package is only loaded when this
    # subcommand actually runs.
    from dalla.deduplication import deduplicate_dataset

    deduplicated = deduplicate_dataset(
        dataset,
        column=ctx.column,
        threshold=threshold,
        return_pairs=return_pairs,
        keep_vert_files=keep_vert_files,
        vert_dir=Path(vert_dir) if vert_dir else None,
        calculate_scores=calculate_scores,
        onion_binary=Path(onion_binary) if onion_binary else None,
    )

    click.echo(f"Saving deduplicated dataset to {ctx.output_dataset}")
    ctx.dataset_manager.save(deduplicated, ctx.output_dataset, overwrite=ctx.overwrite)

    original_size = DatasetManager.get_size(dataset)
    final_size = DatasetManager.get_size(deduplicated)

    def _share_of_original(count: int) -> float:
        # An empty input dataset would otherwise raise ZeroDivisionError
        # after the output has already been saved.
        return count / original_size * 100 if original_size else 0.0

    click.echo(click.style("✓ Deduplication complete", fg="green"))
    click.echo(f" Original: {original_size:,} examples")

    if return_pairs:
        num_dups = sum(1 for ex in deduplicated if ex.get("is_duplicate", False))
        click.echo(
            f" Documents with duplicates: {num_dups:,} ({_share_of_original(num_dups):.1f}%)"
        )
        click.echo(" Added columns: duplicate_cluster, is_duplicate, duplicate_count")
    else:
        removed = original_size - final_size
        click.echo(f" Removed: {removed:,} duplicates ({_share_of_original(removed):.1f}%)")
        click.echo(f" Final: {final_size:,} examples")
215
+
216
+
217
@cli.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.option(
    "--sep-token",
    default="<+>",
    help="Separator token for morphological splits (default: '<+>')",
)
@click.option(
    "--normalize",
    is_flag=True,
    help="Apply Arabic normalization",
)
@click.option(
    "--keep-diacritics",
    is_flag=True,
    help="Keep diacritics in output",
)
@click.option(
    "--model",
    type=click.Choice(["mle", "bert"], case_sensitive=False),
    default="mle",
    help="Disambiguator model (default: mle, faster | bert: more accurate)",
)
@click.option(
    "--use-gpu",
    is_flag=True,
    help="Use GPU for BERT model (only applicable when --model=bert)",
)
@pass_context
def stem(
    ctx: Context, sep_token: str, normalize: bool, keep_diacritics: bool, model: str, use_gpu: bool
):
    """Apply stemming and morphological analysis."""
    # The docstring doubles as the command's --help text; keep notes here.
    # Flow: load the input dataset, run `stem_dataset` on the selected
    # column, and save the result to the output path.
    _require_io_paths(ctx)

    click.echo(f"Loading dataset from {ctx.input_dataset}")
    source = _handle_dataset_dict(ctx.dataset_manager.load(ctx.input_dataset))

    click.echo(f"Stemming {ctx.column} column (workers={ctx.num_workers or 'auto'})")
    click.echo(f"Model: {model.upper()}{' (GPU enabled)' if model == 'bert' and use_gpu else ''}")

    # Imported lazily so the stemming package is only loaded when this
    # subcommand actually runs.
    from dalla.stemming import stem_dataset

    stemming_options = {
        "column": ctx.column,
        "sep_token": sep_token,
        "normalize": normalize,
        "keep_diacritics": keep_diacritics,
        "num_proc": ctx.num_workers,
        "model": model,
        "use_gpu": use_gpu,
    }
    result = stem_dataset(source, **stemming_options)

    click.echo(f"Saving stemmed dataset to {ctx.output_dataset}")
    ctx.dataset_manager.save(result, ctx.output_dataset, overwrite=ctx.overwrite)

    click.echo(click.style("✓ Stemming complete", fg="green"))
275
+
276
+
277
@cli.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.option(
    "--min-score",
    type=float,
    default=0.0,
    help="Minimum quality score to keep (0-100, default: 0)",
)
@click.option(
    "--save-errors",
    is_flag=True,
    help="Save erroneous words to file",
)
@click.option(
    "--model",
    type=click.Choice(["mle", "bert"], case_sensitive=False),
    default="mle",
    help="Disambiguator model (default: mle, faster | bert: more accurate)",
)
@click.option(
    "--use-gpu",
    is_flag=True,
    help="Use GPU for BERT model (only applicable when --model=bert)",
)
@pass_context
def quality_check(ctx: Context, min_score: float, save_errors: bool, model: str, use_gpu: bool):
    """Check text quality and calculate scores."""
    # The docstring doubles as the command's --help text; keep notes here.
    # Flow: load the input dataset, run `check_quality`, save the result,
    # and report how many examples were filtered when --min-score is set.
    _require_io_paths(ctx)

    click.echo(f"Loading dataset from {ctx.input_dataset}")
    dataset = ctx.dataset_manager.load(ctx.input_dataset)
    dataset = _handle_dataset_dict(dataset)

    click.echo(f"Checking quality of {ctx.column} column")
    click.echo(f"Model: {model.upper()}{' (GPU enabled)' if model == 'bert' and use_gpu else ''}")

    # Imported lazily so the quality package is only loaded when this
    # subcommand actually runs.
    from dalla.quality import check_quality

    scored = check_quality(
        dataset,
        column=ctx.column,
        min_score=min_score,
        save_errors=save_errors,
        num_workers=ctx.num_workers,
        model=model,
        use_gpu=use_gpu,
    )

    click.echo(f"Saving quality-checked dataset to {ctx.output_dataset}")
    ctx.dataset_manager.save(scored, ctx.output_dataset, overwrite=ctx.overwrite)

    original_size = DatasetManager.get_size(dataset)
    final_size = DatasetManager.get_size(scored)

    click.echo(click.style("✓ Quality check complete", fg="green"))
    if min_score > 0:
        removed = original_size - final_size
        # An empty input dataset would otherwise raise ZeroDivisionError
        # after the output has already been saved.
        removed_pct = removed / original_size * 100 if original_size else 0.0
        click.echo(f" Filtered {removed:,} low-quality examples ({removed_pct:.1f}%)")
336
+
337
+
338
@cli.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.option(
    "--add-ranks/--no-ranks",
    default=True,
    help="Add ranking and level columns (default: True)",
)
@pass_context
def readability(ctx: Context, add_ranks: bool):
    """Calculate readability scores using Flesch and Osman methods."""
    # The docstring doubles as the command's --help text; keep notes here.
    # Flow: load the input dataset, run `score_readability`, save the
    # result, then list the columns that were added.
    _require_io_paths(ctx)

    click.echo(f"Loading dataset from {ctx.input_dataset}")
    source = _handle_dataset_dict(ctx.dataset_manager.load(ctx.input_dataset))

    click.echo(f"Calculating readability scores for {ctx.column} column")
    if add_ranks:
        click.echo(" Including ranking and difficulty levels (0-4)")

    # Imported lazily so the readability package is only loaded when this
    # subcommand actually runs.
    from dalla.readability import score_readability

    scoring_options = {
        "column": ctx.column,
        "add_ranks": add_ranks,
        "num_proc": ctx.num_workers,
    }
    result = score_readability(source, **scoring_options)

    click.echo(f"Saving scored dataset to {ctx.output_dataset}")
    ctx.dataset_manager.save(result, ctx.output_dataset, overwrite=ctx.overwrite)

    click.echo(click.style("✓ Readability scoring complete", fg="green"))

    if not add_ranks:
        click.echo(" Added columns: flesch_score, osman_score")
        return

    click.echo(" Added columns: flesch_score, osman_score, flesch_rank, osman_rank,")
    click.echo(" readability_level")
376
+
377
+
378
@cli.command(context_settings={"help_option_names": ["-h", "--help"]})
@click.option(
    "--split",
    help="Specific split to show info for",
)
@click.argument(
    "dataset_path",
    type=click.Path(exists=True, path_type=Path),
)
def info(dataset_path: Path, split: str | None):
    """Display information about a dataset."""
    # The docstring doubles as the command's --help text; keep notes here.
    # This command does not use the shared Context: it takes the dataset
    # path as a positional argument and builds its own DatasetManager.
    manager = DatasetManager()

    try:
        manager.print_info(manager.load(dataset_path, split=split))
    except Exception as e:
        # Any failure is reported on stderr and turned into exit status 1.
        failure = click.style(f"Error loading dataset: {e}", fg="red")
        click.echo(failure, err=True)
        sys.exit(1)
397
+
398
+
399
def _handle_dataset_dict(dataset, split_preference: str = "train"):
    """Reduce a ``DatasetDict`` to a single split; pass anything else through.

    Args:
        dataset: Object returned by the dataset loader. Only ``DatasetDict``
            instances are unpacked; every other value is returned unchanged.
        split_preference: Name of the split to use when it is present.

    Returns:
        ``dataset[split_preference]`` when that split exists, otherwise the
        first split of the ``DatasetDict``; or ``dataset`` itself when it is
        not a ``DatasetDict``.

    Raises:
        click.ClickException: If ``dataset`` is a ``DatasetDict`` without any
            splits, instead of failing with a bare ``IndexError``.
    """
    if not isinstance(dataset, DatasetDict):
        return dataset

    splits = list(dataset.keys())
    if not splits:
        raise click.ClickException("Dataset contains no splits")

    click.echo(f"Dataset has multiple splits: {', '.join(splits)}")

    # Fall back to the first split when the preferred one is missing.
    chosen = split_preference if split_preference in dataset else splits[0]
    click.echo(f"Using '{chosen}' split ({len(dataset[chosen])} examples)")
    return dataset[chosen]
417
+
418
+
419
def _require_io_paths(ctx: Context):
    """Exit with status 1 unless both the input and output paths are set.

    The input path is checked first. For the first missing path the error is
    written to stderr, a usage hint is written to stdout, and the process
    exits.
    """
    checks = (
        (ctx.input_dataset, "Error: --input-dataset is required"),
        (ctx.output_dataset, "Error: --output-dataset is required"),
    )

    for path, complaint in checks:
        if path is not None:
            continue
        click.echo(click.style(complaint, fg="red"), err=True)
        click.echo("Use --help for usage information")
        sys.exit(1)
436
+
437
+
438
def main():
    """Run the CLI and translate failures into process exit codes.

    A ``KeyboardInterrupt`` exits with status 130. Any other ``Exception``
    is printed to stderr and exits with status 1, unless ``--verbose`` or
    ``-v`` appears on the command line, in which case it is re-raised so the
    traceback is shown.
    """
    # NOTE(review): click's default standalone mode appears to convert
    # Ctrl+C raised inside a command into its own "Aborted!" exit, so the
    # KeyboardInterrupt branch may only fire outside click -- confirm.
    try:
        cli(obj=Context())
    except KeyboardInterrupt:
        notice = click.style("Interrupted by user", fg="yellow")
        click.echo("\n" + notice)
        sys.exit(130)
    except Exception as e:
        click.echo(click.style(f"Error: {e}", fg="red"), err=True)
        wants_traceback = any(flag in sys.argv for flag in ("--verbose", "-v"))
        if wants_traceback:
            raise
        sys.exit(1)


if __name__ == "__main__":
    main()
dalla/core/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ """Core utilities for dataset I/O and parallel processing."""
2
+
3
+ from dalla.core.dataset import DatasetManager
4
+ from dalla.core.parallel import ParallelProcessor
5
+
6
+ __all__ = ["DatasetManager", "ParallelProcessor"]