dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. dalla/__init__.py +27 -0
  2. dalla/cli.py +453 -0
  3. dalla/core/__init__.py +6 -0
  4. dalla/core/dataset.py +387 -0
  5. dalla/core/parallel.py +279 -0
  6. dalla/deduplication/__init__.py +370 -0
  7. dalla/deduplication/bin/.gitignore +1 -0
  8. dalla/deduplication/bin/onion-linux-x86_64 +0 -0
  9. dalla/deduplication/onion/COPYING +24 -0
  10. dalla/deduplication/onion/Makefile +21 -0
  11. dalla/deduplication/onion/Makefile.config +3 -0
  12. dalla/deduplication/onion/README.md +21 -0
  13. dalla/deduplication/onion/src/Makefile +22 -0
  14. dalla/deduplication/onion/src/Makefile.g +23 -0
  15. dalla/deduplication/onion/src/buzhash.c +325 -0
  16. dalla/deduplication/onion/src/buzhash.h +30 -0
  17. dalla/deduplication/onion/src/hashdup.c +172 -0
  18. dalla/deduplication/onion/src/hashgen.c +206 -0
  19. dalla/deduplication/onion/src/onion +0 -0
  20. dalla/deduplication/onion/src/onion.c +799 -0
  21. dalla/deduplication/onion/src/onion_dup.c +824 -0
  22. dalla/deduplication/onion/src/version.c +17 -0
  23. dalla/deduplication/onion/src/version.h +10 -0
  24. dalla/deduplication/onion/src_sc/Makefile +22 -0
  25. dalla/deduplication/onion/src_sc/Makefile.g +23 -0
  26. dalla/deduplication/onion/src_sc/buzhash.c +325 -0
  27. dalla/deduplication/onion/src_sc/buzhash.h +30 -0
  28. dalla/deduplication/onion/src_sc/hashdup +0 -0
  29. dalla/deduplication/onion/src_sc/hashdup.c +172 -0
  30. dalla/deduplication/onion/src_sc/hashgen +0 -0
  31. dalla/deduplication/onion/src_sc/hashgen.c +206 -0
  32. dalla/deduplication/onion/src_sc/onion.c +854 -0
  33. dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
  34. dalla/deduplication/onion/src_sc/version.c +17 -0
  35. dalla/deduplication/onion/src_sc/version.h +10 -0
  36. dalla/deduplication/onion_wrapper.py +223 -0
  37. dalla/deduplication/postprocessing.py +216 -0
  38. dalla/deduplication/preprocessing.py +120 -0
  39. dalla/quality/__init__.py +5 -0
  40. dalla/quality/checker.py +354 -0
  41. dalla/readability/__init__.py +197 -0
  42. dalla/readability/ranking.py +165 -0
  43. dalla/readability/scorer.py +148 -0
  44. dalla/stemming/__init__.py +551 -0
  45. dalla/stemming/data/words_al.txt +3414 -0
  46. dalla/stemming/data/words_al_t.txt +885 -0
  47. dalla/stemming/data/words_t.txt +7 -0
  48. dalla/utils/__init__.py +10 -0
  49. dalla/utils/logger.py +128 -0
  50. dalla/utils/tokenize.py +89 -0
  51. dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
  52. dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
  53. dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
  54. dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
  55. dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,354 @@
1
+ """
2
+ Quality checking implementation for Arabic text using CAMEL Tools.
3
+
4
+ This module provides quality assessment by analyzing morphological features
5
+ and detecting errors in Arabic text.
6
+ """
7
+
8
+ import re
9
+ from concurrent.futures import ThreadPoolExecutor
10
+ from concurrent.futures import TimeoutError as FutureTimeoutError
11
+ from types import MethodType
12
+ from typing import Any
13
+
14
+ from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
15
+ from camel_tools.disambig.mle import MLEDisambiguator
16
+ from datasets import Dataset
17
+
18
+ from dalla.core.parallel import ParallelProcessor
19
+ from dalla.utils.logger import get_logger
20
+
21
+ logger = get_logger(__name__)
22
+
23
# Splits text into word tokens. A delimiter is either a run of ASCII digits,
# whitespace and common ASCII symbols (first alternative) or a run of brackets
# and Latin/Arabic punctuation marks (second alternative). Adjacent runs from
# the two alternatives yield empty strings on split, so callers filter those.
WORD_DELIMITERS = re.compile(r'[0-9#%?:\-+=~()\s\'"/\\*]+|[\[\]{}<>﴿﴾,.٫٪؟«»،؛]+')

# Splits text into sentence-like chunks on runs of line breaks and Latin/Arabic
# clause or sentence punctuation. Whitespace other than line breaks is NOT a
# delimiter here, so chunks may carry leading/trailing spaces.
# NOTE(review): "!" is not treated as a delimiter by either pattern - confirm
# that this is intended.
SENTENCE_DELIMITERS = re.compile(r"[?\n\r.;:,٫٪؟«»،؛]+")
25
+
26
+
27
+ class QualityChecker:
28
+ """Quality checker for Arabic text using CAMEL Tools."""
29
+
30
+ def __init__(self, timeout: int = 3600, model: str = "mle", use_gpu: bool = False):
31
+ """
32
+ Initialize quality checker.
33
+
34
+ Args:
35
+ timeout: Maximum time in seconds for processing a single text (default: 3600)
36
+ model: Disambiguator model to use - "mle" or "bert" (default: "mle")
37
+ use_gpu: Whether to use GPU for BERT model (default: False)
38
+ """
39
+ self.timeout = timeout
40
+ self.model = model.lower()
41
+ self.use_gpu = use_gpu
42
+ self.disambiguator = None
43
+ self.erroneous_words: dict[str, int] = {}
44
+
45
+ if self.model not in ["mle", "bert"]:
46
+ raise ValueError(f"Invalid model '{model}'. Must be 'mle' or 'bert'")
47
+
48
+ logger.info(f"Initializing CAMEL Tools {self.model.upper()} disambiguator...")
49
+ if self.model == "bert" and self.use_gpu:
50
+ logger.info("GPU mode enabled for BERT")
51
+
52
+ self._init_disambiguator()
53
+
54
+ def _init_disambiguator(self):
55
+ """Initialize and configure the disambiguator with caching."""
56
+ if self.model == "mle":
57
+ self.disambiguator = MLEDisambiguator.pretrained()
58
+ logger.info("MLE disambiguator loaded")
59
+ else:
60
+ self.disambiguator = BERTUnfactoredDisambiguator.pretrained(use_gpu=self.use_gpu)
61
+ logger.info(f"BERT disambiguator loaded (GPU: {self.use_gpu})")
62
+
63
+ def cached_scored_analysis(disambiguator, word_dd):
64
+ if word_dd in disambiguator._cache:
65
+ return disambiguator._cache[word_dd]
66
+ result = disambiguator._scored_analyses(word_dd)
67
+ disambiguator._cache[word_dd] = result
68
+ return result
69
+
70
+ self.disambiguator._scored_analyses_cached = MethodType(
71
+ cached_scored_analysis, self.disambiguator
72
+ )
73
+ self.disambiguator._score_fn = self.disambiguator._scored_analyses_cached
74
+
75
+ logger.info("Disambiguator initialized with caching enabled")
76
+
77
+ @staticmethod
78
+ def is_arabic(word: str) -> bool:
79
+ """
80
+ Check if a word is Arabic.
81
+
82
+ Args:
83
+ word: Word to check
84
+
85
+ Returns:
86
+ True if word contains only Arabic characters
87
+ """
88
+ arabic_ranges = [
89
+ (0x0600, 0x06FF), # Arabic
90
+ (0x0750, 0x077F), # Arabic Supplement
91
+ (0x08A0, 0x08FF), # Arabic Extended-A
92
+ (0xFB50, 0xFDFF), # Arabic Presentation Forms-A
93
+ (0xFE70, 0xFEFF), # Arabic Presentation Forms-B
94
+ ]
95
+
96
+ arabic_numbers = range(0x0660, 0x066A)
97
+ return all(
98
+ any(start <= ord(char) <= end for start, end in arabic_ranges) for char in word
99
+ ) and not all(ord(char) in arabic_numbers for char in word)
100
+
101
+ def process_content(
102
+ self, content: str, erroneous_words: dict[str, int]
103
+ ) -> tuple[int, int, int, int]:
104
+ """
105
+ Process content and count errors.
106
+
107
+ Args:
108
+ content: Text content to process
109
+ erroneous_words: Dictionary to track erroneous words
110
+
111
+ Returns:
112
+ Tuple of (total_words, error_count, no_analysis_count, foreign_count)
113
+ """
114
+ arabic_sentence_list = WORD_DELIMITERS.split(content)
115
+ arabic_sentence_list = [word for word in arabic_sentence_list if word]
116
+
117
+ if not arabic_sentence_list:
118
+ return 0, 0, 0, 0
119
+
120
+ morph_features = self.disambiguator.disambiguate(arabic_sentence_list)
121
+
122
+ total_words = len(morph_features)
123
+ err_count = 0
124
+ err_no_analysis = 0
125
+ err_foreign = 0
126
+
127
+ for i, word in enumerate(arabic_sentence_list):
128
+ if morph_features[i] is None or len(morph_features[i].analyses) == 0:
129
+ err_count += 1
130
+ if self.is_arabic(word):
131
+ erroneous_words[word] = erroneous_words.get(word, 0) + 1
132
+ continue
133
+
134
+ analyses = morph_features[i].analyses
135
+ analysis_i = analyses[0].analysis
136
+
137
+ if analysis_i["gloss"] == "NO_ANALYSIS":
138
+ err_count += 1
139
+ err_no_analysis += 1
140
+ if self.is_arabic(word):
141
+ erroneous_words[word] = erroneous_words.get(word, 0) + 1
142
+
143
+ elif analysis_i["gloss"] == word:
144
+ err_count += 1
145
+ err_foreign += 1
146
+ if self.is_arabic(word):
147
+ erroneous_words[word] = erroneous_words.get(word, 0) + 1
148
+
149
+ return total_words, err_count, err_no_analysis, err_foreign
150
+
151
+ def process_full_content(
152
+ self, content: str, erroneous_words: dict[str, int]
153
+ ) -> tuple[float, float, float]:
154
+ """
155
+ Process full content by splitting into sentences.
156
+
157
+ Args:
158
+ content: Full text content
159
+ erroneous_words: Dictionary to track erroneous words
160
+
161
+ Returns:
162
+ Tuple of (quality_score, arabic_error_percent, foreign_error_percent)
163
+ """
164
+ full_content_list = SENTENCE_DELIMITERS.split(content)
165
+
166
+ total_words = 0
167
+ err_count = 0
168
+ err_no_analysis = 0
169
+ err_foreign = 0
170
+
171
+ for sentence in full_content_list:
172
+ if sentence.strip():
173
+ t, ec, ena, ef = self.process_content(sentence, erroneous_words)
174
+ total_words += t
175
+ err_count += ec
176
+ err_no_analysis += ena
177
+ err_foreign += ef
178
+
179
+ if total_words == 0:
180
+ return 0.0, 0.0, 0.0
181
+
182
+ quality_score = 100 * (1 - (err_count / total_words))
183
+ score_ar = 100 * (err_no_analysis / total_words)
184
+ score_foreign = 100 * (err_foreign / total_words)
185
+
186
+ return quality_score, score_ar, score_foreign
187
+
188
+ def check_text_quality(
189
+ self, text: str, erroneous_words: dict[str, int] | None = None
190
+ ) -> dict[str, Any]:
191
+ """
192
+ Check quality of a single text with timeout protection.
193
+
194
+ Args:
195
+ text: Text to check
196
+ erroneous_words: Optional dictionary to track erroneous words
197
+
198
+ Returns:
199
+ Dictionary with quality scores and status
200
+ """
201
+ if erroneous_words is None:
202
+ erroneous_words = {}
203
+
204
+ result = {
205
+ "quality_score": 0.0,
206
+ "arabic_error_percent": 0.0,
207
+ "foreign_error_percent": 0.0,
208
+ "error_code": 0,
209
+ "error_message": None,
210
+ }
211
+
212
+ if not text or not isinstance(text, str):
213
+ result["error_code"] = -1
214
+ result["error_message"] = "Empty or invalid text"
215
+ return result
216
+
217
+ with ThreadPoolExecutor(max_workers=1) as executor:
218
+ future = executor.submit(self.process_full_content, text, erroneous_words)
219
+ try:
220
+ quality_score, score_ar, score_foreign = future.result(timeout=self.timeout)
221
+ result["quality_score"] = quality_score
222
+ result["arabic_error_percent"] = score_ar
223
+ result["foreign_error_percent"] = score_foreign
224
+ except FutureTimeoutError:
225
+ result["error_code"] = -3
226
+ result["error_message"] = f"Processing timeout ({self.timeout}s)"
227
+ logger.warning(f"Text processing timeout after {self.timeout}s")
228
+ except Exception as e:
229
+ result["error_code"] = -2
230
+ result["error_message"] = f"Processing error: {str(e)}"
231
+ logger.error(f"Error processing text: {e}")
232
+
233
+ return result
234
+
235
+ def process_example(self, example: dict[str, Any], column: str) -> dict[str, Any]:
236
+ """
237
+ Process a single example from dataset.
238
+
239
+ Args:
240
+ example: Dataset example
241
+ column: Column name to process
242
+
243
+ Returns:
244
+ Example with added quality scores
245
+ """
246
+ text = example.get(column, "")
247
+
248
+ result = self.check_text_quality(text, self.erroneous_words)
249
+
250
+ example["quality_score"] = result["quality_score"]
251
+ example["arabic_error_percent"] = result["arabic_error_percent"]
252
+ example["foreign_error_percent"] = result["foreign_error_percent"]
253
+ example["quality_error_code"] = result["error_code"]
254
+
255
+ if result["error_message"]:
256
+ example["quality_error_message"] = result["error_message"]
257
+
258
+ return example
259
+
260
+ def get_erroneous_words(self) -> dict[str, int]:
261
+ """
262
+ Get dictionary of erroneous words found during processing.
263
+
264
+ Returns:
265
+ Dictionary mapping erroneous words to their occurrence count
266
+ """
267
+ return self.erroneous_words.copy()
268
+
269
+
270
def check_quality(
    dataset: Dataset,
    column: str = "text",
    min_score: float = 0.0,
    save_errors: bool = False,
    num_workers: int | None = None,
    timeout: int = 3600,
    model: str = "mle",
    use_gpu: bool = False,
) -> Dataset:
    """
    Check quality of texts in dataset and add quality score columns.

    Args:
        dataset: HuggingFace dataset
        column: Column name to check
        min_score: Minimum quality score to keep (0-100); 0 disables filtering
        save_errors: Whether to report erroneous words (logged if True)
        num_workers: Number of parallel workers (None for auto)
        timeout: Timeout per text in seconds
        model: Disambiguator model - "mle" or "bert" (default: "mle")
        use_gpu: Use GPU for BERT model (default: False, only for model="bert")

    Returns:
        Dataset with quality score columns added (and optionally filtered).
        An empty input dataset is returned without scoring statistics.

    Raises:
        ValueError: If ``column`` is not in the dataset, or ``model`` is invalid.

    Example:
        >>> # Using MLE (default, faster)
        >>> scored = check_quality(dataset, min_score=50.0)

        >>> # Using BERT (more accurate, slower)
        >>> scored = check_quality(dataset, model="bert", use_gpu=True)

        >>> # Columns added: quality_score, arabic_error_percent,
        >>> # foreign_error_percent, quality_error_code
    """

    logger.info(f"Checking quality of {len(dataset)} examples")
    logger.info(f"Model: {model.upper()}, Column: {column}, Min score: {min_score}")
    # Compare case-insensitively, matching how QualityChecker normalises ``model``.
    logger.info(f"Timeout: {timeout}s, GPU: {use_gpu if model.lower() == 'bert' else 'N/A'}")

    if column not in dataset.column_names:
        raise ValueError(f"Column '{column}' not found in dataset")

    checker = QualityChecker(timeout=timeout, model=model, use_gpu=use_gpu)

    num_workers = ParallelProcessor.get_optimal_num_workers(num_workers)
    logger.info(f"Processing with {num_workers} workers")

    processed_dataset = dataset.map(
        lambda example: checker.process_example(example, column),
        num_proc=num_workers,
        desc="Quality checking",
    )

    original_size = len(dataset)
    if original_size == 0:
        # Nothing was scored: the average and the removed-percentage below
        # would both divide by zero.
        logger.warning("Dataset is empty; nothing to score")
        return processed_dataset

    avg_score = sum(processed_dataset["quality_score"]) / len(processed_dataset)
    logger.info(f"Average quality score: {avg_score:.2f}")

    if min_score > 0:
        logger.info(f"Filtering examples with score < {min_score}")
        processed_dataset = processed_dataset.filter(
            lambda x: x["quality_score"] >= min_score,
            num_proc=num_workers,
            desc=f"Filtering (min_score={min_score})",
        )

        filtered_size = len(processed_dataset)
        removed = original_size - filtered_size
        logger.info(
            f"Removed {removed:,} low-quality examples ({removed / original_size * 100:.1f}%)"
        )
        logger.info(f"Final dataset size: {filtered_size:,}")

    if save_errors:
        if num_workers and num_workers > 1:
            # With several worker processes each one updates its own copy of
            # ``checker``, so the counts below may be empty or incomplete.
            logger.warning(
                "Erroneous-word counts are collected per worker process and may be "
                "incomplete when num_workers > 1; use num_workers=1 for full counts"
            )

        erroneous_words = checker.get_erroneous_words()
        logger.info(f"Found {len(erroneous_words)} unique erroneous words")

        if erroneous_words:
            sorted_errors = sorted(erroneous_words.items(), key=lambda x: x[1], reverse=True)[:20]
            logger.info("Top 20 erroneous words:")
            for word, count in sorted_errors:
                logger.info(f"  {word}: {count}")

    return processed_dataset
@@ -0,0 +1,197 @@
1
+ """Readability scoring and ranking module using textstat."""
2
+
3
+ from datasets import Dataset
4
+
5
+ from dalla.readability.ranking import compute_ranks_and_levels
6
+ from dalla.readability.scorer import ReadabilityScorer
7
+ from dalla.utils.logger import get_logger
8
+
9
+ logger = get_logger(__name__)
10
+
11
+
12
def score_readability(
    dataset: Dataset,
    column: str = "text",
    add_ranks: bool = True,
    num_proc: int | None = None,
) -> Dataset:
    """
    Score readability using Flesch and Osman methods, with optional ranking.

    Adds columns to dataset:
    - flesch_score: Flesch Reading Ease score
    - osman_score: Osman readability score

    If add_ranks=True, also adds (computed across entire dataset):
    - flesch_rank: Flesch rank (1 = lowest score)
    - osman_rank: Osman rank (1 = lowest score)
    - readability_level: Final readability level (0-4)

    Args:
        dataset: HuggingFace dataset
        column: Column to score
        add_ranks: Whether to add ranking columns (default: True)
        num_proc: Number of parallel processes

    Returns:
        Dataset with readability scores and optional rankings. Examples whose
        text is empty or missing get ``None`` for both scores.

    Example:
        >>> from dalla.readability import score_readability
        >>> scored = score_readability(dataset)
        >>> # Columns: flesch_score, osman_score, readability_level, etc.
    """
    logger.info(f"Scoring readability of {len(dataset)} examples")
    logger.info(f"Column: {column}, Add ranks: {add_ranks}, Workers: {num_proc or 'auto'}")

    # Initialize scorer
    logger.info("Initializing readability scorer...")
    ReadabilityScorer()  # Initialize to verify dependencies are available
    logger.info("Scorer ready")

    # Step 1: Score all texts
    logger.info("Calculating Flesch and Osman scores...")

    def score_example(example):
        # Create scorer inside worker (for multiprocessing compatibility)
        from dalla.readability.scorer import ReadabilityScorer

        worker_scorer = ReadabilityScorer()

        text = example.get(column, "")
        if not text:
            example["osman_score"] = None
            example["flesch_score"] = None
            return example

        osman_score, flesch_score = worker_scorer.score_text(text)
        example["osman_score"] = osman_score
        example["flesch_score"] = flesch_score
        return example

    scored_dataset = dataset.map(score_example, num_proc=num_proc, desc="Scoring readability")

    # Count how many valid scores we got
    total = len(scored_dataset)
    valid_count = sum(
        1
        for ex in scored_dataset
        if ex.get("osman_score") is not None and ex.get("flesch_score") is not None
    )

    logger.info(f"Scoring complete for {total} examples")
    if valid_count == total:
        logger.info(f"Successfully scored all {valid_count} examples")
    else:
        # total > 0 here (valid_count != total), so the percentage is safe.
        logger.info(f"Valid scores: {valid_count}/{total} ({valid_count / total * 100:.1f}%)")
        if valid_count == 0:
            logger.error(
                "Failed to calculate scores for any examples. "
                "This indicates a problem with the text or textstat library."
            )

    if valid_count == 0:
        logger.warning(
            "No valid readability scores calculated. "
            "Common causes: text too short (< 2 sentences), "
            "no complete sentences, or special characters only."
        )

    # Step 2: Add ranks if requested
    if add_ranks:
        logger.info("Computing ranks and readability levels...")
        scored_dataset = _add_ranks_to_dataset(scored_dataset)
        logger.info("Ranks and levels added")

    logger.info("Readability scoring complete!")
    return scored_dataset
112
+
113
+
114
def _add_ranks_to_dataset(dataset: Dataset) -> Dataset:
    """
    Add ranking columns to dataset based on scores.

    This computes ranks across the entire dataset and adds:
    - osman_rank, flesch_rank
    - readability_level (final 0-4 level)

    Examples without both scores get ``None`` in all three columns. When no
    example has valid scores, an error is logged and the three columns are
    still added, filled with ``None``.

    Args:
        dataset: Dataset with osman_score and flesch_score columns

    Returns:
        Dataset with ranking columns added
    """
    # Extract scores, remembering which rows they came from
    osman_scores = []
    flesch_scores = []
    valid_indices = []

    for i, example in enumerate(dataset):
        o_score = example.get("osman_score")
        f_score = example.get("flesch_score")

        # Only include examples with valid scores
        if o_score is not None and f_score is not None:
            osman_scores.append(float(o_score))
            flesch_scores.append(float(f_score))
            valid_indices.append(i)

    logger.info(f"Computing ranks for {len(valid_indices)} valid examples")

    if osman_scores:
        # Compute ranks and levels
        o_ranks, f_ranks, final_levels = compute_ranks_and_levels(osman_scores, flesch_scores)
    else:
        logger.error("No valid scores found - cannot compute ranks")
        logger.error(
            f"All {len(dataset)} examples have None scores. "
            "This should not happen with the fallback scoring system. "
            "Please report this as a bug."
        )
        # Nothing to rank; fall through so the rank columns are still added
        # (all None) and the output has the same columns as the normal case.
        o_ranks, f_ranks, final_levels = [], [], []

    # Create mapping from row index to rank data
    rank_data = {
        idx: {
            "osman_rank": o_r,
            "flesch_rank": f_r,
            "readability_level": final_lvl,
        }
        for idx, o_r, f_r, final_lvl in zip(
            valid_indices,
            o_ranks,
            f_ranks,
            final_levels,
            strict=False,
        )
    }

    # Add columns to dataset
    def add_rank_columns(example, idx):
        if idx in rank_data:
            example.update(rank_data[idx])
        else:
            # No valid scores - set to None
            example["osman_rank"] = None
            example["flesch_rank"] = None
            example["readability_level"] = None
        return example

    dataset = dataset.map(add_rank_columns, with_indices=True, desc="Adding ranks")

    # Log statistics
    if final_levels:
        level_counts = [final_levels.count(i) for i in range(5)]
        logger.info("Readability level distribution:")
        for i, count in enumerate(level_counts):
            pct = (count / len(final_levels)) * 100
            logger.info(f"  Level {i}: {count:,} ({pct:.1f}%)")

    return dataset
195
+
196
+
197
+ __all__ = ["score_readability"]