dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55) hide show
  1. dalla/__init__.py +27 -0
  2. dalla/cli.py +453 -0
  3. dalla/core/__init__.py +6 -0
  4. dalla/core/dataset.py +387 -0
  5. dalla/core/parallel.py +279 -0
  6. dalla/deduplication/__init__.py +370 -0
  7. dalla/deduplication/bin/.gitignore +1 -0
  8. dalla/deduplication/bin/onion-linux-x86_64 +0 -0
  9. dalla/deduplication/onion/COPYING +24 -0
  10. dalla/deduplication/onion/Makefile +21 -0
  11. dalla/deduplication/onion/Makefile.config +3 -0
  12. dalla/deduplication/onion/README.md +21 -0
  13. dalla/deduplication/onion/src/Makefile +22 -0
  14. dalla/deduplication/onion/src/Makefile.g +23 -0
  15. dalla/deduplication/onion/src/buzhash.c +325 -0
  16. dalla/deduplication/onion/src/buzhash.h +30 -0
  17. dalla/deduplication/onion/src/hashdup.c +172 -0
  18. dalla/deduplication/onion/src/hashgen.c +206 -0
  19. dalla/deduplication/onion/src/onion +0 -0
  20. dalla/deduplication/onion/src/onion.c +799 -0
  21. dalla/deduplication/onion/src/onion_dup.c +824 -0
  22. dalla/deduplication/onion/src/version.c +17 -0
  23. dalla/deduplication/onion/src/version.h +10 -0
  24. dalla/deduplication/onion/src_sc/Makefile +22 -0
  25. dalla/deduplication/onion/src_sc/Makefile.g +23 -0
  26. dalla/deduplication/onion/src_sc/buzhash.c +325 -0
  27. dalla/deduplication/onion/src_sc/buzhash.h +30 -0
  28. dalla/deduplication/onion/src_sc/hashdup +0 -0
  29. dalla/deduplication/onion/src_sc/hashdup.c +172 -0
  30. dalla/deduplication/onion/src_sc/hashgen +0 -0
  31. dalla/deduplication/onion/src_sc/hashgen.c +206 -0
  32. dalla/deduplication/onion/src_sc/onion.c +854 -0
  33. dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
  34. dalla/deduplication/onion/src_sc/version.c +17 -0
  35. dalla/deduplication/onion/src_sc/version.h +10 -0
  36. dalla/deduplication/onion_wrapper.py +223 -0
  37. dalla/deduplication/postprocessing.py +216 -0
  38. dalla/deduplication/preprocessing.py +120 -0
  39. dalla/quality/__init__.py +5 -0
  40. dalla/quality/checker.py +354 -0
  41. dalla/readability/__init__.py +197 -0
  42. dalla/readability/ranking.py +165 -0
  43. dalla/readability/scorer.py +148 -0
  44. dalla/stemming/__init__.py +551 -0
  45. dalla/stemming/data/words_al.txt +3414 -0
  46. dalla/stemming/data/words_al_t.txt +885 -0
  47. dalla/stemming/data/words_t.txt +7 -0
  48. dalla/utils/__init__.py +10 -0
  49. dalla/utils/logger.py +128 -0
  50. dalla/utils/tokenize.py +89 -0
  51. dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
  52. dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
  53. dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
  54. dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
  55. dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,165 @@
1
+ """
2
+ Ranking and binning logic for readability scores.
3
+
4
+ Converts raw Flesch and Osman scores into 5-level difficulty rankings.
5
+ """
6
+
7
+ from dalla.utils.logger import get_logger
8
+
9
+ logger = get_logger(__name__)
10
+
11
+
12
def _ranks_descending(scores: list[float]) -> list[int]:
    """Return 1-based ranks for *scores*, where the highest score gets rank 1.

    Ties are broken by original position because ``sorted`` is stable, so every
    document receives a distinct rank in ``1..len(scores)``.
    """
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    ranks = [0] * len(scores)
    for position, doc_idx in enumerate(order, start=1):
        ranks[doc_idx] = position
    return ranks


def compute_ranks_and_levels(
    osman_scores: list[float], flesch_scores: list[float]
) -> tuple[list[int], list[int], list[int]]:
    """
    Compute ranks and final readability levels.

    Methodology:
    1. Rank documents by Osman and by Flesch (highest score = rank 1, easiest)
    2. Bin each set of ranks into 5 levels (0-4) with ``bin_ranks``
    3. Combine the two bins per document with ``decide_final_level``

    Args:
        osman_scores: Osman score of each document
        flesch_scores: Flesch score of each document, aligned index-by-index
            with ``osman_scores``

    Returns:
        Tuple of:
        - o_ranks: Osman ranks (list of ints, 1 = highest score)
        - f_ranks: Flesch ranks (list of ints, 1 = highest score)
        - final_levels: Final readability levels 0-4 (list of ints)

    Raises:
        ValueError: If the two score lists do not have the same length.
    """
    n = len(osman_scores)

    # The two lists describe the same documents position by position; a length
    # mismatch means the inputs are misaligned, so fail loudly instead of
    # silently ignoring the surplus scores or dying inside the sort key.
    if len(flesch_scores) != n:
        raise ValueError(
            "osman_scores and flesch_scores must have the same length "
            f"(got {n} and {len(flesch_scores)})"
        )

    if n == 0:
        return ([], [], [])

    # Determine ranks (highest score => rank=1, easiest)
    o_ranks = _ranks_descending(osman_scores)
    f_ranks = _ranks_descending(flesch_scores)

    # Bin ranks into [0..4]
    o_bins = bin_ranks(o_ranks)
    f_bins = bin_ranks(f_ranks)

    # Decide final level
    final_levels = [decide_final_level(ob, fb) for ob, fb in zip(o_bins, f_bins, strict=True)]

    return (o_ranks, f_ranks, final_levels)
57
+
58
+
59
def bin_ranks(ranks: list[int]) -> list[int]:
    """
    Assign each rank to one of five position-based bins (0..4).

    Entries are ordered by ascending rank and the bin is derived purely from
    the position in that ordering, so the bins are filled as evenly as the
    input length allows regardless of how the rank values are distributed.
    Entries with equal rank keep their original relative order (stable sort)
    and may therefore end up in neighbouring bins.

    Args:
        ranks: List of rank values (lower rank = easier text)

    Returns:
        List of bin assignments aligned with ``ranks``
        (0 = lowest ranks / easiest, 4 = highest ranks / hardest)

    Example:
        >>> bin_ranks([5, 4, 3, 2, 1, 1, 2, 3, 4, 5])
        [4, 3, 2, 1, 0, 0, 1, 2, 3, 4]
    """
    total = len(ranks)
    if total == 0:
        return []

    # Original indices, ordered so that the lowest rank comes first.
    ascending_order = sorted(range(total), key=lambda idx: ranks[idx])

    quintiles = [0] * total
    for position, original_index in enumerate(ascending_order):
        # position < total, hence position * 5 // total is always in 0..4;
        # a single-element input trivially lands in bin 0.
        quintiles[original_index] = position * 5 // total

    return quintiles
112
+
113
+
114
def decide_final_level(o_bin: int, f_bin: int) -> int:
    """
    Combine the Osman and Flesch bins into one readability level.

    Rules, applied in this order:
    1. Osman says hard (bin 3 or 4) -> return the Osman bin
    2. Flesch says easy (bin 0 or 1) -> return the Flesch bin
    3. Bins differ by 2 or more -> return the harder (larger) bin
    4. Otherwise -> return the average of both bins, rounded up

    Args:
        o_bin: Osman bin (0-4, 0=easiest, 4=hardest)
        f_bin: Flesch bin (0-4, 0=easiest, 4=hardest)

    Returns:
        Final level (0-4)

    Examples:
        >>> decide_final_level(4, 0)
        4
        >>> decide_final_level(0, 4)
        4
        >>> decide_final_level(1, 0)
        0
        >>> decide_final_level(3, 4)
        3
        >>> decide_final_level(2, 3)
        3
    """
    osman_says_hard = o_bin >= 3
    if osman_says_hard:
        return o_bin

    flesch_says_easy = f_bin <= 1
    if flesch_says_easy:
        return f_bin

    # For bins in 0..4 this point is only reached with o_bin <= 2 <= f_bin,
    # so both remaining rules evaluate to f_bin; they are kept separate to
    # mirror the documented strategy.
    harder = max(o_bin, f_bin)
    easier = min(o_bin, f_bin)
    if harder - easier >= 2:
        return harder

    # Equal or adjacent bins: rounded-up mean.
    rounded_up_mean = (o_bin + f_bin + 1) // 2
    return rounded_up_mean
@@ -0,0 +1,148 @@
1
+ """
2
+ Readability scoring using textstat library (Flesch Reading Ease).
3
+
4
+ Arabic-specific Osman scores are computed with textstat's ``osman`` function.
5
+ """
6
+
7
+ import textstat
8
+
9
+ from dalla.utils.logger import get_logger
10
+
11
+ logger = get_logger(__name__)
12
+
13
+
14
+ class ReadabilityScorer:
15
+ """Calculate readability scores for Arabic text."""
16
+
17
+ def __init__(self):
18
+ """Initialize the readability scorer."""
19
+ try:
20
+ self.textstat = textstat
21
+ try:
22
+ textstat.set_lang("ar")
23
+ except Exception:
24
+ logger.warning("Arabic language not available in textstat, using default")
25
+ except ImportError as err:
26
+ raise ImportError(
27
+ "textstat library required. Install with: pip install textstat"
28
+ ) from err
29
+
30
+ def score_text(self, text: str) -> tuple[float | None, float | None]:
31
+ """
32
+ Score text using both Flesch and Osman methods.
33
+
34
+ For very short texts where Flesch returns None, we use the Osman score.
35
+ If Osman also fails, we use a simple fallback based on word length.
36
+
37
+ Args:
38
+ text: Text to score
39
+
40
+ Returns:
41
+ Tuple of (osman_score, flesch_score)
42
+ """
43
+ if not text or not text.strip():
44
+ return (None, None)
45
+
46
+ flesch_score = self._calculate_flesch(text)
47
+ osman_score = self._calculate_osman(text)
48
+
49
+ # If Flesch fails but Osman succeeds, use Osman for both
50
+ if flesch_score is None and osman_score is not None:
51
+ logger.info(f"Flesch failed, using Osman score ({osman_score:.1f}) for both metrics")
52
+ flesch_score = osman_score
53
+
54
+ # If both fail, use fallback as last resort
55
+ elif flesch_score is None and osman_score is None:
56
+ flesch_fallback, osman_fallback = self._calculate_fallback_scores(text)
57
+ flesch_score = flesch_fallback
58
+ osman_score = osman_fallback
59
+ logger.info(
60
+ f"Both Flesch and Osman failed, using fallback scores: O={osman_score:.1f}, F={flesch_score:.1f}"
61
+ )
62
+
63
+ return (osman_score, flesch_score)
64
+
65
+ def _calculate_flesch(self, text: str) -> float | None:
66
+ """
67
+ Calculate Flesch Reading Ease score.
68
+
69
+ Score range: 0-100+
70
+
71
+ Args:
72
+ text: Text to score
73
+
74
+ Returns:
75
+ Flesch score or None if error
76
+ """
77
+ try:
78
+ score = self.textstat.flesch_reading_ease(text)
79
+ if score is None:
80
+ logger.debug(f"Flesch score is None for text (length={len(text)})")
81
+ return None
82
+ return float(score)
83
+ except Exception as e:
84
+ logger.warning(f"Error calculating Flesch score: {type(e).__name__}: {e}")
85
+ return None
86
+
87
+ def _calculate_osman(self, text: str) -> float | None:
88
+ """
89
+ Calculate Osman readability score for Arabic.
90
+
91
+ Args:
92
+ text: Text to score
93
+
94
+ Returns:
95
+ Osman score or None if error
96
+ """
97
+ try:
98
+ score = self.textstat.osman(text)
99
+ if score is None:
100
+ logger.debug(f"Osman score is None for text (length={len(text)})")
101
+ return None
102
+ return float(score)
103
+
104
+ except Exception as e:
105
+ logger.warning(f"Error calculating Osman score: {type(e).__name__}: {e}")
106
+ return None
107
+
108
+ def _calculate_fallback_scores(self, text: str) -> tuple[float, float]:
109
+ """
110
+ Calculate simple fallback scores for very short texts.
111
+
112
+ This is used when textstat returns None (usually for texts with < 2 sentences).
113
+ We calculate simple metrics based on word/character counts.
114
+
115
+ Args:
116
+ text: Text to score
117
+
118
+ Returns:
119
+ Tuple of (osman_fallback, flesch_fallback)
120
+ """
121
+ words = text.split()
122
+ num_words = len(words)
123
+ num_chars = len(text.strip())
124
+
125
+ # Average word length
126
+ avg_word_len = num_chars / num_words if num_words > 0 else 0
127
+
128
+ if avg_word_len <= 3:
129
+ flesch_fallback = 90.0 # Very easy
130
+ elif avg_word_len <= 5:
131
+ flesch_fallback = 70.0 # Easy
132
+ elif avg_word_len <= 7:
133
+ flesch_fallback = 50.0 # Medium
134
+ elif avg_word_len <= 9:
135
+ flesch_fallback = 30.0 # Difficult
136
+ else:
137
+ flesch_fallback = 10.0 # Very difficult
138
+
139
+ # Osman fallback: similar logic
140
+ # Osman typically ranges 0-100+ for Arabic
141
+ osman_fallback = flesch_fallback # Use same score for simplicity
142
+
143
+ logger.debug(
144
+ f"Using fallback scores (words={num_words}, avg_word_len={avg_word_len:.1f}): "
145
+ f"Flesch={flesch_fallback}, Osman={osman_fallback}"
146
+ )
147
+
148
+ return (osman_fallback, flesch_fallback)