dalla-data-processing 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dalla/__init__.py +27 -0
- dalla/cli.py +453 -0
- dalla/core/__init__.py +6 -0
- dalla/core/dataset.py +387 -0
- dalla/core/parallel.py +279 -0
- dalla/deduplication/__init__.py +370 -0
- dalla/deduplication/bin/.gitignore +1 -0
- dalla/deduplication/bin/onion-linux-x86_64 +0 -0
- dalla/deduplication/onion/COPYING +24 -0
- dalla/deduplication/onion/Makefile +21 -0
- dalla/deduplication/onion/Makefile.config +3 -0
- dalla/deduplication/onion/README.md +21 -0
- dalla/deduplication/onion/src/Makefile +22 -0
- dalla/deduplication/onion/src/Makefile.g +23 -0
- dalla/deduplication/onion/src/buzhash.c +325 -0
- dalla/deduplication/onion/src/buzhash.h +30 -0
- dalla/deduplication/onion/src/hashdup.c +172 -0
- dalla/deduplication/onion/src/hashgen.c +206 -0
- dalla/deduplication/onion/src/onion +0 -0
- dalla/deduplication/onion/src/onion.c +799 -0
- dalla/deduplication/onion/src/onion_dup.c +824 -0
- dalla/deduplication/onion/src/version.c +17 -0
- dalla/deduplication/onion/src/version.h +10 -0
- dalla/deduplication/onion/src_sc/Makefile +22 -0
- dalla/deduplication/onion/src_sc/Makefile.g +23 -0
- dalla/deduplication/onion/src_sc/buzhash.c +325 -0
- dalla/deduplication/onion/src_sc/buzhash.h +30 -0
- dalla/deduplication/onion/src_sc/hashdup +0 -0
- dalla/deduplication/onion/src_sc/hashdup.c +172 -0
- dalla/deduplication/onion/src_sc/hashgen +0 -0
- dalla/deduplication/onion/src_sc/hashgen.c +206 -0
- dalla/deduplication/onion/src_sc/onion.c +854 -0
- dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
- dalla/deduplication/onion/src_sc/version.c +17 -0
- dalla/deduplication/onion/src_sc/version.h +10 -0
- dalla/deduplication/onion_wrapper.py +223 -0
- dalla/deduplication/postprocessing.py +216 -0
- dalla/deduplication/preprocessing.py +120 -0
- dalla/quality/__init__.py +5 -0
- dalla/quality/checker.py +354 -0
- dalla/readability/__init__.py +197 -0
- dalla/readability/ranking.py +165 -0
- dalla/readability/scorer.py +148 -0
- dalla/stemming/__init__.py +551 -0
- dalla/stemming/data/words_al.txt +3414 -0
- dalla/stemming/data/words_al_t.txt +885 -0
- dalla/stemming/data/words_t.txt +7 -0
- dalla/utils/__init__.py +10 -0
- dalla/utils/logger.py +128 -0
- dalla/utils/tokenize.py +89 -0
- dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
- dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
- dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
- dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
- dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Ranking and binning logic for readability scores.
|
|
3
|
+
|
|
4
|
+
Converts raw Flesch and Osman scores into 5-level difficulty rankings.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from dalla.utils.logger import get_logger
|
|
8
|
+
|
|
9
|
+
logger = get_logger(__name__)
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def _rank_descending(scores: list[float]) -> list[int]:
    """Return 1-based ranks for *scores*, where the highest score gets rank 1.

    ``sorted`` is stable (also with ``reverse=True``), so documents with equal
    scores keep their input order and every document gets a distinct rank.
    """
    ranks = [0] * len(scores)
    order = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
    for position, doc_idx in enumerate(order, start=1):
        ranks[doc_idx] = position
    return ranks


def compute_ranks_and_levels(
    osman_scores: list[float], flesch_scores: list[float]
) -> tuple[list[int], list[int], list[int]]:
    """
    Compute ranks and final readability levels.

    Methodology:
    1. Rank documents by Osman & Flesch (highest score = rank 1, easiest)
    2. Bin ranks into 5 levels (0-4) using position-based quantiles
    3. Combine the two bins per document with ``decide_final_level``

    Args:
        osman_scores: List of Osman scores, one per document
        flesch_scores: List of Flesch scores, aligned index-by-index with
            ``osman_scores``

    Returns:
        Tuple of:
        - o_ranks: Osman ranks (list of ints, 1 = easiest)
        - f_ranks: Flesch ranks (list of ints, 1 = easiest)
        - final_levels: Final readability levels 0-4 (list of ints)

    Raises:
        ValueError: If the two score lists do not have the same length.
        TypeError: If scores cannot be compared with each other (for example
            a ``None`` mixed in with floats); raised by the underlying sort.
    """
    n = len(osman_scores)

    # Both lists describe the same documents. Without this check a longer
    # flesch list would be silently truncated and a shorter one would fail
    # with an opaque IndexError inside the sort key.
    if len(flesch_scores) != n:
        raise ValueError(
            "osman_scores and flesch_scores must have the same length "
            f"(got {n} and {len(flesch_scores)})"
        )

    if n == 0:
        return ([], [], [])

    # Determine ranks (highest score => rank=1, easiest)
    o_ranks = _rank_descending(osman_scores)
    f_ranks = _rank_descending(flesch_scores)

    # Bin ranks into [0..4]
    o_bins = bin_ranks(o_ranks)
    f_bins = bin_ranks(f_ranks)

    # Decide final level per document from its pair of bins
    final_levels = [decide_final_level(ob, fb) for ob, fb in zip(o_bins, f_bins, strict=True)]

    return (o_ranks, f_ranks, final_levels)
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def bin_ranks(ranks: list[int]) -> list[int]:
    """
    Assign each rank to one of five difficulty bins (0 = easiest, 4 = hardest).

    Binning is positional: the entries are ordered by ascending rank and the
    bin is derived from where an entry lands in that ordering, not from the
    rank value itself. Consequences:

    - Bins are filled as evenly as the input size allows (about 20% each).
    - Equal rank values are ordered by input position, so they may be split
      across neighbouring bins.
    - With fewer than five entries not every bin is used, e.g. two entries
      map to bins 0 and 2.

    Args:
        ranks: Rank values; a lower rank means easier text.

    Returns:
        One bin number (0-4) per input entry, in input order.

    Example:
        >>> bin_ranks([5, 4, 3, 2, 1, 1, 2, 3, 4, 5])
        [4, 3, 2, 1, 0, 0, 1, 2, 3, 4]
    """
    total = len(ranks)

    if total == 0:
        return []
    if total == 1:
        return [0]

    # Original indices ordered by ascending rank; the stable sort keeps input
    # order among equal ranks.
    order = sorted(range(total), key=lambda idx: ranks[idx])

    result = [0] * total
    for position, original_index in enumerate(order):
        # position * 5 // total is the quintile of this position; the cap
        # guards the top end of the range.
        result[original_index] = min(4, (position * 5) // total)

    return result
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def decide_final_level(o_bin: int, f_bin: int) -> int:
    """
    Merge the Osman and Flesch bins of one document into a single level.

    Rules, applied in this order (the first match wins):

    1. Osman says hard (``o_bin >= 3``)  -> return the Osman bin.
    2. Flesch says easy (``f_bin <= 1``) -> return the Flesch bin.
    3. Bins differ by 2 or more          -> return the harder (larger) bin.
    4. Otherwise (difference 0 or 1)     -> return the average, rounded up.

    Because rule 1 is checked first, a hard Osman bin overrides an easy
    Flesch bin; because rule 2 precedes rule 3, an easy Flesch bin overrides
    a large disagreement with a mid-range Osman bin.

    Args:
        o_bin: Osman bin (0-4, 0=easiest, 4=hardest)
        f_bin: Flesch bin (0-4, 0=easiest, 4=hardest)

    Returns:
        Final level (0-4)

    Examples:
        >>> decide_final_level(4, 0)  # rule 1: Osman hard
        4
        >>> decide_final_level(0, 4)  # rule 3: large gap, take harder
        4
        >>> decide_final_level(1, 0)  # rule 2: Flesch easy
        0
        >>> decide_final_level(3, 4)  # rule 1: Osman hard
        3
        >>> decide_final_level(2, 3)  # rule 4: (2 + 3 + 1) // 2
        3
    """
    osman_says_hard = o_bin >= 3
    if osman_says_hard:
        return o_bin

    flesch_says_easy = f_bin <= 1
    if flesch_says_easy:
        return f_bin

    gap = abs(o_bin - f_bin)
    # The "+ 1" makes the integer average round up, so a gap of 1 also
    # resolves to the harder of the two bins.
    rounded_up_average = (o_bin + f_bin + 1) // 2

    return max(o_bin, f_bin) if gap >= 2 else rounded_up_average
|
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Readability scoring using textstat library (Flesch Reading Ease).
|
|
3
|
+
|
|
4
|
+
For Arabic-specific Osman scoring, we use a simplified formula.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import textstat
|
|
8
|
+
|
|
9
|
+
from dalla.utils.logger import get_logger
|
|
10
|
+
|
|
11
|
+
logger = get_logger(__name__)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ReadabilityScorer:
|
|
15
|
+
"""Calculate readability scores for Arabic text."""
|
|
16
|
+
|
|
17
|
+
def __init__(self):
|
|
18
|
+
"""Initialize the readability scorer."""
|
|
19
|
+
try:
|
|
20
|
+
self.textstat = textstat
|
|
21
|
+
try:
|
|
22
|
+
textstat.set_lang("ar")
|
|
23
|
+
except Exception:
|
|
24
|
+
logger.warning("Arabic language not available in textstat, using default")
|
|
25
|
+
except ImportError as err:
|
|
26
|
+
raise ImportError(
|
|
27
|
+
"textstat library required. Install with: pip install textstat"
|
|
28
|
+
) from err
|
|
29
|
+
|
|
30
|
+
def score_text(self, text: str) -> tuple[float | None, float | None]:
|
|
31
|
+
"""
|
|
32
|
+
Score text using both Flesch and Osman methods.
|
|
33
|
+
|
|
34
|
+
For very short texts where Flesch returns None, we use the Osman score.
|
|
35
|
+
If Osman also fails, we use a simple fallback based on word length.
|
|
36
|
+
|
|
37
|
+
Args:
|
|
38
|
+
text: Text to score
|
|
39
|
+
|
|
40
|
+
Returns:
|
|
41
|
+
Tuple of (osman_score, flesch_score)
|
|
42
|
+
"""
|
|
43
|
+
if not text or not text.strip():
|
|
44
|
+
return (None, None)
|
|
45
|
+
|
|
46
|
+
flesch_score = self._calculate_flesch(text)
|
|
47
|
+
osman_score = self._calculate_osman(text)
|
|
48
|
+
|
|
49
|
+
# If Flesch fails but Osman succeeds, use Osman for both
|
|
50
|
+
if flesch_score is None and osman_score is not None:
|
|
51
|
+
logger.info(f"Flesch failed, using Osman score ({osman_score:.1f}) for both metrics")
|
|
52
|
+
flesch_score = osman_score
|
|
53
|
+
|
|
54
|
+
# If both fail, use fallback as last resort
|
|
55
|
+
elif flesch_score is None and osman_score is None:
|
|
56
|
+
flesch_fallback, osman_fallback = self._calculate_fallback_scores(text)
|
|
57
|
+
flesch_score = flesch_fallback
|
|
58
|
+
osman_score = osman_fallback
|
|
59
|
+
logger.info(
|
|
60
|
+
f"Both Flesch and Osman failed, using fallback scores: O={osman_score:.1f}, F={flesch_score:.1f}"
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
return (osman_score, flesch_score)
|
|
64
|
+
|
|
65
|
+
def _calculate_flesch(self, text: str) -> float | None:
|
|
66
|
+
"""
|
|
67
|
+
Calculate Flesch Reading Ease score.
|
|
68
|
+
|
|
69
|
+
Score range: 0-100+
|
|
70
|
+
|
|
71
|
+
Args:
|
|
72
|
+
text: Text to score
|
|
73
|
+
|
|
74
|
+
Returns:
|
|
75
|
+
Flesch score or None if error
|
|
76
|
+
"""
|
|
77
|
+
try:
|
|
78
|
+
score = self.textstat.flesch_reading_ease(text)
|
|
79
|
+
if score is None:
|
|
80
|
+
logger.debug(f"Flesch score is None for text (length={len(text)})")
|
|
81
|
+
return None
|
|
82
|
+
return float(score)
|
|
83
|
+
except Exception as e:
|
|
84
|
+
logger.warning(f"Error calculating Flesch score: {type(e).__name__}: {e}")
|
|
85
|
+
return None
|
|
86
|
+
|
|
87
|
+
def _calculate_osman(self, text: str) -> float | None:
|
|
88
|
+
"""
|
|
89
|
+
Calculate Osman readability score for Arabic.
|
|
90
|
+
|
|
91
|
+
Args:
|
|
92
|
+
text: Text to score
|
|
93
|
+
|
|
94
|
+
Returns:
|
|
95
|
+
Osman score or None if error
|
|
96
|
+
"""
|
|
97
|
+
try:
|
|
98
|
+
score = self.textstat.osman(text)
|
|
99
|
+
if score is None:
|
|
100
|
+
logger.debug(f"Osman score is None for text (length={len(text)})")
|
|
101
|
+
return None
|
|
102
|
+
return float(score)
|
|
103
|
+
|
|
104
|
+
except Exception as e:
|
|
105
|
+
logger.warning(f"Error calculating Osman score: {type(e).__name__}: {e}")
|
|
106
|
+
return None
|
|
107
|
+
|
|
108
|
+
def _calculate_fallback_scores(self, text: str) -> tuple[float, float]:
|
|
109
|
+
"""
|
|
110
|
+
Calculate simple fallback scores for very short texts.
|
|
111
|
+
|
|
112
|
+
This is used when textstat returns None (usually for texts with < 2 sentences).
|
|
113
|
+
We calculate simple metrics based on word/character counts.
|
|
114
|
+
|
|
115
|
+
Args:
|
|
116
|
+
text: Text to score
|
|
117
|
+
|
|
118
|
+
Returns:
|
|
119
|
+
Tuple of (osman_fallback, flesch_fallback)
|
|
120
|
+
"""
|
|
121
|
+
words = text.split()
|
|
122
|
+
num_words = len(words)
|
|
123
|
+
num_chars = len(text.strip())
|
|
124
|
+
|
|
125
|
+
# Average word length
|
|
126
|
+
avg_word_len = num_chars / num_words if num_words > 0 else 0
|
|
127
|
+
|
|
128
|
+
if avg_word_len <= 3:
|
|
129
|
+
flesch_fallback = 90.0 # Very easy
|
|
130
|
+
elif avg_word_len <= 5:
|
|
131
|
+
flesch_fallback = 70.0 # Easy
|
|
132
|
+
elif avg_word_len <= 7:
|
|
133
|
+
flesch_fallback = 50.0 # Medium
|
|
134
|
+
elif avg_word_len <= 9:
|
|
135
|
+
flesch_fallback = 30.0 # Difficult
|
|
136
|
+
else:
|
|
137
|
+
flesch_fallback = 10.0 # Very difficult
|
|
138
|
+
|
|
139
|
+
# Osman fallback: similar logic
|
|
140
|
+
# Osman typically ranges 0-100+ for Arabic
|
|
141
|
+
osman_fallback = flesch_fallback # Use same score for simplicity
|
|
142
|
+
|
|
143
|
+
logger.debug(
|
|
144
|
+
f"Using fallback scores (words={num_words}, avg_word_len={avg_word_len:.1f}): "
|
|
145
|
+
f"Flesch={flesch_fallback}, Osman={osman_fallback}"
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
return (osman_fallback, flesch_fallback)
|