dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. dalla/__init__.py +27 -0
  2. dalla/cli.py +453 -0
  3. dalla/core/__init__.py +6 -0
  4. dalla/core/dataset.py +387 -0
  5. dalla/core/parallel.py +279 -0
  6. dalla/deduplication/__init__.py +370 -0
  7. dalla/deduplication/bin/.gitignore +1 -0
  8. dalla/deduplication/bin/onion-linux-x86_64 +0 -0
  9. dalla/deduplication/onion/COPYING +24 -0
  10. dalla/deduplication/onion/Makefile +21 -0
  11. dalla/deduplication/onion/Makefile.config +3 -0
  12. dalla/deduplication/onion/README.md +21 -0
  13. dalla/deduplication/onion/src/Makefile +22 -0
  14. dalla/deduplication/onion/src/Makefile.g +23 -0
  15. dalla/deduplication/onion/src/buzhash.c +325 -0
  16. dalla/deduplication/onion/src/buzhash.h +30 -0
  17. dalla/deduplication/onion/src/hashdup.c +172 -0
  18. dalla/deduplication/onion/src/hashgen.c +206 -0
  19. dalla/deduplication/onion/src/onion +0 -0
  20. dalla/deduplication/onion/src/onion.c +799 -0
  21. dalla/deduplication/onion/src/onion_dup.c +824 -0
  22. dalla/deduplication/onion/src/version.c +17 -0
  23. dalla/deduplication/onion/src/version.h +10 -0
  24. dalla/deduplication/onion/src_sc/Makefile +22 -0
  25. dalla/deduplication/onion/src_sc/Makefile.g +23 -0
  26. dalla/deduplication/onion/src_sc/buzhash.c +325 -0
  27. dalla/deduplication/onion/src_sc/buzhash.h +30 -0
  28. dalla/deduplication/onion/src_sc/hashdup +0 -0
  29. dalla/deduplication/onion/src_sc/hashdup.c +172 -0
  30. dalla/deduplication/onion/src_sc/hashgen +0 -0
  31. dalla/deduplication/onion/src_sc/hashgen.c +206 -0
  32. dalla/deduplication/onion/src_sc/onion.c +854 -0
  33. dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
  34. dalla/deduplication/onion/src_sc/version.c +17 -0
  35. dalla/deduplication/onion/src_sc/version.h +10 -0
  36. dalla/deduplication/onion_wrapper.py +223 -0
  37. dalla/deduplication/postprocessing.py +216 -0
  38. dalla/deduplication/preprocessing.py +120 -0
  39. dalla/quality/__init__.py +5 -0
  40. dalla/quality/checker.py +354 -0
  41. dalla/readability/__init__.py +197 -0
  42. dalla/readability/ranking.py +165 -0
  43. dalla/readability/scorer.py +148 -0
  44. dalla/stemming/__init__.py +551 -0
  45. dalla/stemming/data/words_al.txt +3414 -0
  46. dalla/stemming/data/words_al_t.txt +885 -0
  47. dalla/stemming/data/words_t.txt +7 -0
  48. dalla/utils/__init__.py +10 -0
  49. dalla/utils/logger.py +128 -0
  50. dalla/utils/tokenize.py +89 -0
  51. dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
  52. dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
  53. dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
  54. dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
  55. dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,7 @@
1
+ الأرسطية
2
+ الراشدية
3
+ القاسمية
4
+ اليوسفية
5
+ الكاظمية
6
+ الشاذلية
7
+ الكاريبية
@@ -0,0 +1,10 @@
1
+ """
2
+ Utility functions for text processing.
3
+
4
+ This module provides utilities for tokenization, text manipulation, and logging.
5
+ """
6
+
7
+ from dalla.utils.logger import get_logger, logger, setup_logging
8
+ from dalla.utils.tokenize import simple_word_tokenize
9
+
10
+ __all__ = ["simple_word_tokenize", "logger", "get_logger", "setup_logging"]
dalla/utils/logger.py ADDED
@@ -0,0 +1,128 @@
1
+ """
2
+ Structured logging configuration using structlog.
3
+
4
+ This module provides a centralized logging setup for the application,
5
+ with support for both JSON and console output formats.
6
+ """
7
+
8
+ import logging
9
+ import sys
10
+
11
+ import structlog
12
+ from structlog.types import EventDict, Processor
13
+
14
+
15
def rename_event_key(_, __, event_dict: EventDict) -> EventDict:
    """
    Move the log text from the `event` key to the `message` key.

    structlog stores the text message under `event`, while some log consumers
    read it from `message`. The dict is modified in place and returned.
    A missing `event` key raises `KeyError`.
    See https://github.com/hynek/structlog/issues/35#issuecomment-591321744
    """
    message_text = event_dict.pop("event")
    event_dict["message"] = message_text
    return event_dict
24
+
25
+
26
def drop_color_message_key(_, __, event_dict: EventDict) -> EventDict:
    """
    Remove the `color_message` key from the event dict when it is present.

    Some loggers attach this redundant field; it is not needed in the output.
    The dict is modified in place and returned; a dict without the key is
    returned unchanged.
    """
    if "color_message" in event_dict:
        del event_dict["color_message"]
    return event_dict
33
+
34
+
35
def setup_logging(log_format: str = "console", log_level: str = "INFO"):
    """
    Configure structured logging for the application.

    Both structlog loggers and plain `logging` loggers are rendered by one
    stream handler attached to the root logger. A `sys.excepthook` is also
    installed so uncaught exceptions are logged.

    The function is safe to call more than once: the handler installed by an
    earlier call is removed before the new one is attached, so records are not
    emitted once per call.

    Args:
        log_format: Output format - "json" selects JSON output; any other value
            uses the console renderer (default: "console")
        log_level: Minimum log level - DEBUG, INFO, WARNING, ERROR, or CRITICAL
            (upper-cased before use, default: "INFO")
    """
    use_json = log_format == "json"
    timestamper = structlog.processors.TimeStamper(fmt="iso")

    shared_processors: list[Processor] = [
        structlog.contextvars.merge_contextvars,
        structlog.stdlib.add_logger_name,
        structlog.stdlib.add_log_level,
        structlog.stdlib.PositionalArgumentsFormatter(),
        structlog.stdlib.ExtraAdder(),
        drop_color_message_key,
        timestamper,
        structlog.processors.StackInfoRenderer(),
    ]

    if use_json:
        # We rename the `event` key to `message` only in JSON logs
        # The pretty ConsoleRenderer looks for `event`
        shared_processors.append(rename_event_key)
        # Format the exception only for JSON logs, as we want to pretty-print them when
        # using the ConsoleRenderer
        shared_processors.append(structlog.processors.format_exc_info)

    structlog.configure(
        processors=shared_processors
        + [
            # Prepare event dict for `ProcessorFormatter`.
            structlog.stdlib.ProcessorFormatter.wrap_for_formatter,
        ],
        logger_factory=structlog.stdlib.LoggerFactory(),
        cache_logger_on_first_use=True,
    )

    log_renderer: structlog.types.Processor
    if use_json:
        log_renderer = structlog.processors.JSONRenderer()
    else:
        log_renderer = structlog.dev.ConsoleRenderer()

    formatter = structlog.stdlib.ProcessorFormatter(
        # These run ONLY on `logging` entries that do NOT originate within
        # structlog.
        foreign_pre_chain=shared_processors,
        # These run on ALL entries after the pre_chain is done.
        processors=[
            # Remove _record & _from_structlog.
            structlog.stdlib.ProcessorFormatter.remove_processors_meta,
            log_renderer,
        ],
    )

    handler = logging.StreamHandler()
    # Use OUR `ProcessorFormatter` to format all `logging` entries.
    handler.setFormatter(formatter)
    # Tag the handler so a later call can recognise and replace it.
    handler._dalla_managed = True

    root_logger = logging.getLogger()
    # Drop handlers left by earlier calls; without this every call stacks
    # another handler and each record is printed multiple times. Handlers
    # installed by other code are left alone.
    stale_handlers = [
        existing
        for existing in root_logger.handlers
        if getattr(existing, "_dalla_managed", False)
    ]
    for stale in stale_handlers:
        root_logger.removeHandler(stale)
    root_logger.addHandler(handler)
    root_logger.setLevel(log_level.upper())

    def handle_exception(exc_type, exc_value, exc_traceback):
        """
        Log any uncaught exception instead of letting it be printed by Python
        (but leave KeyboardInterrupt untouched to allow users to Ctrl+C to stop)
        See https://stackoverflow.com/a/16993115/3641865
        """
        if issubclass(exc_type, KeyboardInterrupt):
            sys.__excepthook__(exc_type, exc_value, exc_traceback)
            return

        root_logger.error("Uncaught exception", exc_info=(exc_type, exc_value, exc_traceback))

    sys.excepthook = handle_exception
112
+
113
+
114
def get_logger(name: str = "dalla") -> structlog.stdlib.BoundLogger:
    """
    Return a structlog logger for the given name.

    Args:
        name: Logger name (default: "dalla")

    Returns:
        The logger returned by `structlog.stdlib.get_logger(name)`
    """
    named_logger = structlog.stdlib.get_logger(name)
    return named_logger


# Shared default logger for the package, created when this module is imported
logger = get_logger("dalla")
@@ -0,0 +1,89 @@
1
+ # MIT License
2
+ #
3
+ # Copyright 2018-2024 New York University Abu Dhabi
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+
24
+ """This module contains utilities for word-boundary tokenization."""
25
+
26
+ import re
27
+
28
+ from camel_tools.utils.charsets import (
29
+ EMOJI_MULTICHAR_CHARSET,
30
+ UNICODE_LETTER_CHARSET,
31
+ UNICODE_LETTER_MARK_NUMBER_CHARSET,
32
+ UNICODE_MARK_CHARSET,
33
+ UNICODE_NUMBER_CHARSET,
34
+ UNICODE_PUNCT_SYMBOL_CHARSET,
35
+ )
36
+
37
+ __all__ = ["simple_word_tokenize"]
38
+
39
+
40
# Every punctuation/symbol/emoji alternative, regex-escaped. Longest entries
# come first so that a longer alternative is tried before a shorter one.
_ALL_PUNCT_SYMBOLS = sorted(
    (re.escape(symbol) for symbol in UNICODE_PUNCT_SYMBOL_CHARSET | EMOJI_MULTICHAR_CHARSET),
    key=len,
    reverse=True,
)
_WHITESPACE_RE = r"\s+"

# Character-class contents, one string per class.
_ALL_NUMBER = "".join(UNICODE_NUMBER_CHARSET)
_ALL_LETTER_MARK = "".join(UNICODE_LETTER_CHARSET | UNICODE_MARK_CHARSET)
_ALL_LETTER_MARK_NUMBER = "".join(UNICODE_LETTER_MARK_NUMBER_CHARSET)

_PUNCT_ALTERNATION = "|".join(_ALL_PUNCT_SYMBOLS)

# Default pattern: punctuation/symbol/emoji, or a run of letters, marks and
# numbers, or a run of whitespace.
_TOKENIZE_RE = re.compile(
    f"{_PUNCT_ALTERNATION}|[{re.escape(_ALL_LETTER_MARK_NUMBER)}]+|{_WHITESPACE_RE}"
)

# Digit-splitting pattern: numbers and letters/marks are separate runs, and
# there is no whitespace alternative.
_TOKENIZE_NUMBER_RE = re.compile(
    f"{_PUNCT_ALTERNATION}|[{re.escape(_ALL_NUMBER)}]+|[{re.escape(_ALL_LETTER_MARK)}]+"
)
63
+
64
+
65
def simple_word_tokenize(sentence, split_digits=False):
    """Tokenize a sentence by separating words from punctuation.

    The resulting tokens are alpha-numeric words, single
    punctuation/symbol/emoji characters, or multi-character emoji sequences.
    This function is language agnostic and splits all characters marked as
    punctuation or symbols in the Unicode specification.

    In the default mode the pattern also matches runs of whitespace, so each
    whitespace run is returned as a token of its own.
    For example, tokenizing :code:`'Hello, world!!!'`
    would yield :code:`['Hello', ',', ' ', 'world', '!', '!', '!']`.

    If split_digits is set to True, numbers are split from letters and
    whitespace is not returned (that pattern has no whitespace alternative).
    For example, tokenizing :code:`'Hello, world123!!!'`
    would yield :code:`['Hello', ',', 'world', '123', '!', '!', '!']`.

    Args:
        sentence (:obj:`str`): Sentence to tokenize.
        split_digits (:obj:`bool`, optional): The flag to split on number.
            Defaults to False.

    Returns:
        :obj:`list` of :obj:`str`: The list of tokens.
    """
    pattern = _TOKENIZE_NUMBER_RE if split_digits else _TOKENIZE_RE
    return pattern.findall(sentence)
@@ -0,0 +1,393 @@
1
+ Metadata-Version: 2.4
2
+ Name: dalla-data-processing
3
+ Version: 0.0.1
4
+ Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
5
+ Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
6
+ Project-URL: Homepage, https://github.com/U4RASD/dalla-data-processing
7
+ Project-URL: Documentation, https://github.com/U4RASD/dalla-data-processing#readme
8
+ Project-URL: Repository, https://github.com/U4RASD/dalla-data-processing
9
+ Project-URL: Bug Tracker, https://github.com/U4RASD/dalla-data-processing/issues
10
+ Keywords: arabic,nlp,data-processing,deduplication,stemming,readability,quality
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
16
+ Classifier: Topic :: Text Processing :: Linguistic
17
+ Requires-Python: >=3.12
18
+ Description-Content-Type: text/markdown
19
+ Requires-Dist: datasets>=2.14.0
20
+ Requires-Dist: transformers>=4.30.0
21
+ Requires-Dist: camel-tools>=1.5.0
22
+ Requires-Dist: click>=8.0.0
23
+ Requires-Dist: tqdm>=4.65.0
24
+ Requires-Dist: pandas>=2.0.0
25
+ Requires-Dist: numpy>=1.24.0
26
+ Requires-Dist: pyarrow>=12.0.0
27
+ Requires-Dist: textstat>=0.7.0
28
+ Requires-Dist: structlog>=24.0.0
29
+ Provides-Extra: dev
30
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
31
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
32
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
33
+ Requires-Dist: pre-commit>=3.0.0; extra == "dev"
34
+ Provides-Extra: dedup-native
35
+ Requires-Dist: cffi>=1.15.0; extra == "dedup-native"
36
+ Provides-Extra: all
37
+ Requires-Dist: dalla-data-processing[dedup-native,dev]; extra == "all"
38
+
39
+ # Dalla Data Processing (dalla-dp)
40
+
41
+ A comprehensive Arabic data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models.
42
+
43
+ ## Compatibility
44
+
45
+ - **Linux**: Fully supported
46
+ - **macOS**: Fully supported (natively on Intel, or through Rosetta)
47
+ - **Windows**: Supported only through WSL (Windows Subsystem for Linux). On native Windows, deduplication works after manually building onion from source.
48
+
49
+ ## Installation
50
+
51
+ <b>Using uv</b>
52
+
53
+ ```bash
54
+ # Install the package
55
+ uv pip install dalla-data-processing
56
+ ```
57
+
58
+
59
+ <b>Using pip</b>
60
+
61
+ ```bash
62
+ # Install the package
63
+ pip install dalla-data-processing
64
+ ```
65
+
66
+
67
+ <b>From Source</b>
68
+
69
+ ```bash
70
+ git clone https://github.com/U4RASD/dalla-data-processing.git
71
+ cd dalla-data-processing
72
+
73
+ # Using uv
74
+ uv pip install -e .
75
+
76
+ # Or using pip
77
+ pip install -e .
78
+ ```
79
+
80
+ ## Components
81
+
82
+ ### Deduplication
83
+
84
+ **CLI Usage**
85
+
86
+ **Command:** `dalla-dp deduplicate [OPTIONS]`
87
+
88
+ **Arguments:**
89
+ - `-t, --threshold FLOAT` - Similarity threshold (0.0-1.0, default: 0.8)
90
+ - `--return-pairs` / `--filter-duplicates` - Return dataset with duplicate info (default) or filtered dataset
91
+ - `--keep-vert-files` - Keep vertical format files for inspection
92
+ - `--vert-dir PATH` - Directory to store vertical files (useful for different disk)
93
+ - `--calculate-scores` - Run phase 2 to calculate similarity scores (slower but more precise)
94
+ - `--onion-binary PATH` - Path to onion binary (auto-detected if not specified)
95
+
96
+ **Examples:**
97
+ ```bash
98
+ # Basic deduplication
99
+ dalla-dp -i ./data/raw -o ./data/deduped deduplicate
100
+
101
+ # With custom threshold
102
+ dalla-dp -i ./data/raw -o ./data/deduped deduplicate --threshold 0.9
103
+
104
+ # Return filtered dataset (removes duplicates)
105
+ dalla-dp -i ./data/raw -o ./data/clean deduplicate --filter-duplicates
106
+
107
+ # Keep intermediate files for inspection
108
+ dalla-dp -i ./data/raw -o ./data/deduped deduplicate --keep-vert-files
109
+
110
+ # Calculate precise similarity scores (slower)
111
+ dalla-dp -i ./data/raw -o ./data/deduped deduplicate --calculate-scores
112
+
113
+ # Use custom onion binary
114
+ dalla-dp -i ./data/raw -o ./data/deduped deduplicate --onion-binary /path/to/onion
115
+ ```
116
+
117
+ **Python API**
118
+
119
+ ```python
120
+ from datasets import load_from_disk
121
+ from dalla.deduplication import deduplicate_dataset
122
+
123
+ # Load dataset
124
+ dataset = load_from_disk("./data/raw")
125
+
126
+ # get duplicate information (adds columns: duplicate_cluster, is_duplicate, duplicate_count)
127
+ result = deduplicate_dataset(dataset, column="text", threshold=0.8, return_pairs=True)
128
+
129
+ # filter to see only duplicates
130
+ duplicates = result.filter(lambda x: x['is_duplicate'])
131
+
132
+ result.save_to_disk("./data/deduped")
133
+ ```
134
+
135
+ ### Stemming
136
+
137
+ Apply morphological analysis and stemming using CAMeL Tools.
138
+
139
+ **CLI Usage**
140
+
141
+ **Command:** `dalla-dp stem [OPTIONS]`
142
+
143
+ **Arguments:**
144
+ - `--sep-token TEXT` - Separator token for morphological splits (default: `<+>`)
145
+ - `--normalize` - Apply Arabic normalization
146
+ - `--keep-diacritics` - Keep diacritics in output
147
+ - `--model [mle|bert]` - Disambiguator model (default: mle, faster | bert: more accurate)
148
+ - `--use-gpu` - Use GPU for BERT model (only applicable when --model=bert)
149
+
150
+ **Examples:**
151
+ ```bash
152
+ # Basic stemming with MLE model
153
+ dalla-dp -i ./data/raw -o ./data/stemmed stem
154
+
155
+ # Use BERT model
156
+ dalla-dp -i ./data/raw -o ./data/stemmed stem --model bert
157
+
158
+ # Use BERT with GPU acceleration
159
+ dalla-dp -i ./data/raw -o ./data/stemmed stem --model bert --use-gpu
160
+
161
+ # Custom separator token
162
+ dalla-dp -i ./data/raw -o ./data/stemmed stem --sep-token "<SEP>"
163
+
164
+ # Apply normalization
165
+ dalla-dp -i ./data/raw -o ./data/stemmed stem --normalize
166
+
167
+ # Keep diacritics in output
168
+ dalla-dp -i ./data/raw -o ./data/stemmed stem --keep-diacritics
169
+
170
+ ```
171
+
172
+ **Python API**
173
+
174
+ ```python
175
+ from datasets import load_from_disk
176
+ from dalla.stemming import stem_dataset
177
+
178
+ # Load dataset
179
+ dataset = load_from_disk("./data/raw")
180
+
181
+ stemmed = stem_dataset(dataset, column="text")
182
+
183
+ stemmed = stem_dataset(
184
+ dataset,
185
+ column="text",
186
+ model="bert",
187
+ use_gpu=True,
188
+ num_proc=8
189
+ )
190
+
191
+ stemmed = stem_dataset(
192
+ dataset,
193
+ column="content",
194
+ sep_token="<+>",
195
+ normalize=True,
196
+ keep_diacritics=True
197
+ )
198
+
199
+ stemmed.save_to_disk("./data/stemmed")
200
+ ```
201
+
202
+ ### Quality Checking
203
+
204
+ Check text quality using morphological analysis to detect errors and foreign words.
205
+
206
+ **CLI Usage**
207
+
208
+ **Command:** `dalla-dp quality-check [OPTIONS]`
209
+
210
+ **Arguments:**
211
+ - `--min-score FLOAT` - Minimum quality score to keep (0-100, default: 0)
212
+ - `--save-errors` - Save erroneous words to file
213
+ - `--model [mle|bert]` - Disambiguator model (default: mle, faster | bert: more accurate)
214
+ - `--use-gpu` - Use GPU for BERT model (only applicable when --model=bert)
215
+
216
+ **Examples:**
217
+ ```bash
218
+ dalla-dp -i ./data/raw -o ./data/quality quality-check
219
+
220
+ # Filter low-quality texts (score < 50)
221
+ dalla-dp -i ./data/raw -o ./data/quality quality-check --min-score 50
222
+
223
+ # Save erroneous words to log
224
+ dalla-dp -i ./data/raw -o ./data/quality quality-check --save-errors
225
+
226
+ # Use BERT model with GPU
227
+ dalla-dp -i ./data/raw -o ./data/quality quality-check --model bert --use-gpu
228
+
229
+ dalla-dp -i ./data/raw -o ./data/quality -c content quality-check
230
+ ```
231
+
232
+ **Python API**
233
+
234
+ ```python
235
+ from datasets import load_from_disk
236
+ from dalla.quality import check_quality
237
+
238
+ dataset = load_from_disk("./data/raw")
239
+
240
+ scored = check_quality(dataset, column="text")
241
+
242
+ high_quality = check_quality(
243
+ dataset,
244
+ column="text",
245
+ min_score=60.0,
246
+ save_errors=True
247
+ )
248
+
249
+ scored = check_quality(
250
+ dataset,
251
+ model="bert",
252
+ use_gpu=True,
253
+ num_workers=4,
254
+ timeout=3600
255
+ )
256
+
257
+ scored.save_to_disk("./data/quality")
258
+ ```
259
+
260
+ ### Readability Scoring
261
+
262
+ Calculate readability scores using Flesch Reading Ease and Osman methods.
263
+
264
+ **CLI Usage**
265
+
266
+ **Command:** `dalla-dp readability [OPTIONS]`
267
+
268
+ **Arguments:**
269
+ - `--add-ranks` / `--no-ranks` - Add ranking and level columns (default: True)
270
+
271
+ **Examples:**
272
+ ```bash
273
+ dalla-dp -i ./data/raw -o ./data/scored readability
274
+
275
+ dalla-dp -i ./data/raw -o ./data/scored readability --no-ranks
276
+
277
+ dalla-dp -i ./data/raw -o ./data/scored -c content readability
278
+ ```
279
+
280
+ **Python API**
281
+
282
+ ```python
283
+ from datasets import load_from_disk
284
+ from dalla.readability import score_readability
285
+
286
+ # Load dataset
287
+ dataset = load_from_disk("./data/raw")
288
+
289
+ scored = score_readability(dataset, column="text", add_ranks=True)
290
+
291
+ # Save result
292
+ scored.save_to_disk("./data/scored")
293
+ ```
294
+
295
+ **Readability Levels:**
296
+ - `0`: Very Easy
297
+ - `1`: Easy
298
+ - `2`: Medium
299
+ - `3`: Difficult
300
+ - `4`: Very Difficult
301
+
302
+ ### Dataset Management
303
+
304
+ Utilities for loading, saving, and inspecting datasets.
305
+
306
+ **CLI Usage**
307
+
308
+ **Command:** `dalla-dp info [OPTIONS] DATASET_PATH`
309
+
310
+ **Arguments:**
311
+ - `DATASET_PATH` - Path to the dataset (required, positional argument)
312
+ - `--split TEXT` - Specific split to show info for
313
+
314
+ **Examples:**
315
+ ```bash
316
+ # Show dataset information
317
+ dalla-dp info ./data/my_dataset
318
+
319
+ ```
320
+
321
+ **Python API**
322
+
323
+ ```python
324
+ from dalla.core.dataset import DatasetManager
325
+
326
+ dm = DatasetManager()
327
+
328
+ dataset = dm.load("./data/my_dataset")
329
+ train_data = dm.load("./data/my_dataset", split="train")
330
+
331
+
332
+ info = dm.get_info(dataset)
333
+ dm.print_info(dataset)
334
+
335
+ size = dm.get_size(dataset)
336
+
337
+ filtered = dm.filter_dataset(
338
+ dataset,
339
+ lambda x: x['quality_score'] > 80.0,
340
+ num_proc=4
341
+ )
342
+
343
+ scores = [0.95, 0.87, 0.92, ...]
344
+ dataset = dm.add_column(dataset, "my_score", scores)
345
+
346
+ subset = dm.select_columns(dataset, ["text", "quality_score"])
347
+ cleaned = dm.remove_columns(dataset, ["temp_column"])
348
+
349
+ splits = dm.train_test_split(dataset, test_size=0.2, seed=42)
350
+ ```
351
+
352
+ **Working with DatasetDict**
353
+
354
+ ```python
355
+ from datasets import DatasetDict, load_from_disk
356
+ from dalla.quality import check_quality
357
+
358
+ dataset_dict = load_from_disk("./data/my_dataset")
359
+
360
+ processed_dict = DatasetDict({
361
+ split: check_quality(ds, min_score=60.0)
362
+ for split, ds in dataset_dict.items()
363
+ })
364
+
365
+ train_processed = check_quality(dataset_dict['train'], min_score=60.0)
366
+ ```
367
+
368
+ ## Building Onion from Source
369
+
370
+ **Build Instructions**
371
+
372
+ The onion deduplication tool needs to be compiled for your system:
373
+
374
+ ```bash
375
+ cd dalla/deduplication/onion/src_sc
376
+
377
+ # Compile
378
+ make -f Makefile.g
379
+
380
+ ```
381
+
382
+ Alternatively, use the build script:
383
+
384
+ ```bash
385
+ chmod +x scripts/build_onion.sh
386
+ ./scripts/build_onion.sh
387
+ ```
388
+
389
+ ## Links
390
+
391
+ - Homepage: https://github.com/U4RASD/dalla-data-processing
392
+ - Issues: https://github.com/U4RASD/dalla-data-processing/issues
393
+ - Documentation: https://github.com/U4RASD/dalla-data-processing#readme
@@ -0,0 +1,55 @@
1
+ dalla/__init__.py,sha256=bkI4Q89iOWvfajFLLN227oILlvLmBs-2XSMe_GU7p9g,574
2
+ dalla/cli.py,sha256=vyTg6QVdTEPgyd10XNj9yHDlO6pWfnlk8QOycoNUjIg,13445
3
+ dalla/core/__init__.py,sha256=Akdu9E15y_w4yVNOiFaNoL4wzozK3bK5DAjRU7Q2Tts,210
4
+ dalla/core/dataset.py,sha256=EVEU-dYu6JNDHEJ3mDQ0Oe2NOts0RJ9jdGqZ7Xfxj0s,12311
5
+ dalla/core/parallel.py,sha256=OjkZu_okQoUmvQTR4_ILcQYXrHlZJdUKIC1LEhB1DZQ,8412
6
+ dalla/deduplication/__init__.py,sha256=2z22nVJ0SiSsoMPRDag5bTUNGnUg9W2daronVyyg-Ds,13191
7
+ dalla/deduplication/onion_wrapper.py,sha256=A_YOc9A_6tyPHro721dqpKkiEJK-INiP9ylM0EhQC9E,6245
8
+ dalla/deduplication/postprocessing.py,sha256=jCOf3-VzcO6sMoPDl7UZOPr1-X2_dWcF317UCFRjJ4E,6276
9
+ dalla/deduplication/preprocessing.py,sha256=-Cf9rj16AcHTA_Xpr-SLnpNmOA-ZDp_tADysO3VcQYU,3071
10
+ dalla/deduplication/bin/.gitignore,sha256=4J55bzduDaecLUxWsg5lYWaYWNdmhJBNQTCNEtYkMFI,8
11
+ dalla/deduplication/bin/onion-linux-x86_64,sha256=JMFQSWLu5c7no41H5aSVZpNoEHC5ZoWI28kZBa5tFGc,116200
12
+ dalla/deduplication/onion/COPYING,sha256=2uaW6t5ZQSZTUzpWPwHk53FcxWx4A4rC5FaPEQejYQg,1428
13
+ dalla/deduplication/onion/Makefile,sha256=cAbImtRPHXmCDlhlm3TJHMDt7_FU5AZrloGGOiNTG18,587
14
+ dalla/deduplication/onion/Makefile.config,sha256=21e5PX_LaLk1Nce0pw4z58BOYF8HiSVCPVBz2Y3F4PM,73
15
+ dalla/deduplication/onion/README.md,sha256=cqoUMWUKQhGG-nnOVS-6cYauA4HzYB4UQanMSavajlI,508
16
+ dalla/deduplication/onion/src/Makefile,sha256=9EsDLZDWHNG0zecIaMZnpW14zGZJcoVT0z3CmVqq8aU,349
17
+ dalla/deduplication/onion/src/Makefile.g,sha256=X_iwBr1UGvr-36fQEnJUuGPq9sBVUI6bpGtAghiJqJs,366
18
+ dalla/deduplication/onion/src/buzhash.c,sha256=UXhOk8t3aE0HqZXNBMF6PgMMb0VkRAMmkeO_R2G9UCY,9281
19
+ dalla/deduplication/onion/src/buzhash.h,sha256=47UdIUVWW0DcBxgxRQsnCpdn6VBYQBelZVPgU0eJ5vk,1206
20
+ dalla/deduplication/onion/src/hashdup.c,sha256=gYylof5UKT0Pd2GUd282OhRV0l9oBMMUnXTcMrpkVnU,4887
21
+ dalla/deduplication/onion/src/hashgen.c,sha256=jHi90u9kJ5x1UbAZ_ksDXlXD1EtarNHvnf1pVsQ0XgY,6494
22
+ dalla/deduplication/onion/src/onion,sha256=01djWtqAWXHu0oBIyuZ-BoxV-qrmasgJYlV2E18KIZE,677200
23
+ dalla/deduplication/onion/src/onion.c,sha256=-8FpfR837Zpx4hmWr6j_QW0z8k_RFHDyLMeGl0tKlLs,26306
24
+ dalla/deduplication/onion/src/onion_dup.c,sha256=tiodAJS50fqarDbN8rNsjFifQm89BwKRfKHuueP-OoA,27581
25
+ dalla/deduplication/onion/src/version.c,sha256=Q2gLN26lVgRNMOsT5fVGa0za3JZO4B2wF2fiCWO7g3g,776
26
+ dalla/deduplication/onion/src/version.h,sha256=ID4eerlhYTU-aDcx_R_eBRotufx1GM-SP5jDOi4Oki0,563
27
+ dalla/deduplication/onion/src_sc/Makefile,sha256=9EsDLZDWHNG0zecIaMZnpW14zGZJcoVT0z3CmVqq8aU,349
28
+ dalla/deduplication/onion/src_sc/Makefile.g,sha256=X_iwBr1UGvr-36fQEnJUuGPq9sBVUI6bpGtAghiJqJs,366
29
+ dalla/deduplication/onion/src_sc/buzhash.c,sha256=UXhOk8t3aE0HqZXNBMF6PgMMb0VkRAMmkeO_R2G9UCY,9281
30
+ dalla/deduplication/onion/src_sc/buzhash.h,sha256=47UdIUVWW0DcBxgxRQsnCpdn6VBYQBelZVPgU0eJ5vk,1206
31
+ dalla/deduplication/onion/src_sc/hashdup,sha256=wWQt0Qn7YrO-6jIV4G0nXlCD2tbyYYuFjU3KypLv5qw,21376
32
+ dalla/deduplication/onion/src_sc/hashdup.c,sha256=gYylof5UKT0Pd2GUd282OhRV0l9oBMMUnXTcMrpkVnU,4887
33
+ dalla/deduplication/onion/src_sc/hashgen,sha256=-zNF_rmTxi-Z9S53wYicAkTToqk8rOcdM3zypXoWd7s,21664
34
+ dalla/deduplication/onion/src_sc/hashgen.c,sha256=jHi90u9kJ5x1UbAZ_ksDXlXD1EtarNHvnf1pVsQ0XgY,6494
35
+ dalla/deduplication/onion/src_sc/onion.c,sha256=1S6DTn2X5a42ZmuBGjvmQyaLQNelr1_QWvD3gMpYx3g,28465
36
+ dalla/deduplication/onion/src_sc/onion_dup.c,sha256=tiodAJS50fqarDbN8rNsjFifQm89BwKRfKHuueP-OoA,27581
37
+ dalla/deduplication/onion/src_sc/version.c,sha256=Q2gLN26lVgRNMOsT5fVGa0za3JZO4B2wF2fiCWO7g3g,776
38
+ dalla/deduplication/onion/src_sc/version.h,sha256=ID4eerlhYTU-aDcx_R_eBRotufx1GM-SP5jDOi4Oki0,563
39
+ dalla/quality/__init__.py,sha256=TAV5rk6UV0sRogoflfuKsaQ4u5opumU1dd7gSIsugRQ,171
40
+ dalla/quality/checker.py,sha256=8n8rLaihhC32gVL4jVj1wH4uVCBM7wPYTrZEVtU--8k,12671
41
+ dalla/readability/__init__.py,sha256=t5dghcv-A5_R7txF9fW9vfdgDc9l_5fFvDwGuJut4OM,6560
42
+ dalla/readability/ranking.py,sha256=dy1UXTHcKmrTwsnT888_-AnoTJTcKjwHBkBhWyfppXw,5382
43
+ dalla/readability/scorer.py,sha256=yFJnH5xXXYo6ga92ceH4z-O01GczrmpB4bIGkIbuMjw,4822
44
+ dalla/stemming/__init__.py,sha256=NbOskEhy4v4GIz2yd7Xcn3gH_YmX_fNaotziqRu8BQ0,19267
45
+ dalla/stemming/data/words_al.txt,sha256=KiekmVjY9B-whpIby7D78C0HSAFy7fxs8RLlOoAVuwc,55876
46
+ dalla/stemming/data/words_al_t.txt,sha256=92WtZbqaM-wCJCKFvckeRxCk6rLJ12ExAnN_NIbDD9M,16047
47
+ dalla/stemming/data/words_t.txt,sha256=V8cpifv1W3XE0lylRkBgRi_Pl8rydIFoB6wVcI25KzA,121
48
+ dalla/utils/__init__.py,sha256=hPQJk-9afcex9zLdfvaTtO_kL-c9mQI-b6S63INYX0Q,326
49
+ dalla/utils/logger.py,sha256=rfnKgwTky5-G_F-GG9l_pZATT-3AMtLSnLU4U43IvDc,4300
50
+ dalla/utils/tokenize.py,sha256=4ZpXqT5faQqh0ySWLDZXr87JUvW8-M-18b6JRfZimZU,3325
51
+ dalla_data_processing-0.0.1.dist-info/METADATA,sha256=W3ZZvo70XF6YhmufQRhbni217r6rNvMMbljBxJMTmxQ,10052
52
+ dalla_data_processing-0.0.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
53
+ dalla_data_processing-0.0.1.dist-info/entry_points.txt,sha256=22WRI5lLipfw7ram-Acp_Uk0wlkn8gC0Dy_JTBeCgxw,44
54
+ dalla_data_processing-0.0.1.dist-info/top_level.txt,sha256=eeE-IlXf7Mecax_Sf1XaZmMw1Yx6Ghs3l4l7zAs8YO0,6
55
+ dalla_data_processing-0.0.1.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (80.9.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ dalla-dp = dalla.cli:main