dalla-data-processing 0.0.10__tar.gz → 0.0.12__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85) hide show
  1. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/.pre-commit-config.yaml +0 -2
  2. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/PKG-INFO +5 -5
  3. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/README.md +4 -4
  4. dalla_data_processing-0.0.12/dalla_data_processing/_version.py +24 -0
  5. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/readability/__init__.py +17 -5
  6. dalla_data_processing-0.0.12/dalla_data_processing/readability/arabic_flesch.py +147 -0
  7. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/readability/ranking.py +51 -41
  8. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/readability/scorer.py +10 -13
  9. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/utils/__init__.py +1 -5
  10. dalla_data_processing-0.0.12/dalla_data_processing/utils/tokenize.py +79 -0
  11. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/PKG-INFO +5 -5
  12. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/SOURCES.txt +1 -0
  13. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/scripts/build_onion.sh +4 -3
  14. dalla_data_processing-0.0.10/dalla_data_processing/_version.py +0 -34
  15. dalla_data_processing-0.0.10/dalla_data_processing/utils/tokenize.py +0 -89
  16. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/.dockerignore +0 -0
  17. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/.github/workflows/ci.yml +0 -0
  18. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/.github/workflows/release.yml +0 -0
  19. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/.gitignore +0 -0
  20. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/LICENSE +0 -0
  21. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/MANIFEST.in +0 -0
  22. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/__init__.py +0 -0
  23. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/cli.py +0 -0
  24. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/core/README.md +0 -0
  25. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/core/__init__.py +0 -0
  26. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/core/dataset.py +0 -0
  27. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/core/parallel.py +0 -0
  28. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/README.md +0 -0
  29. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/__init__.py +0 -0
  30. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/bin/.gitignore +0 -0
  31. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/COPYING +0 -0
  32. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/Makefile +0 -0
  33. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/Makefile.config +0 -0
  34. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/README.md +0 -0
  35. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/Makefile +0 -0
  36. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/Makefile.g +0 -0
  37. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/buzhash.c +0 -0
  38. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/buzhash.h +0 -0
  39. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/hashdup.c +0 -0
  40. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/hashgen.c +0 -0
  41. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/onion +0 -0
  42. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/onion.c +0 -0
  43. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/onion_dup.c +0 -0
  44. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/version.c +0 -0
  45. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src/version.h +0 -0
  46. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/.gitignore +0 -0
  47. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/Makefile +0 -0
  48. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/Makefile.g +0 -0
  49. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/buzhash.c +0 -0
  50. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/buzhash.h +0 -0
  51. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/hashdup +0 -0
  52. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/hashdup.c +0 -0
  53. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/hashgen +0 -0
  54. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/hashgen.c +0 -0
  55. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/onion.c +0 -0
  56. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/onion_dup.c +0 -0
  57. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/version.c +0 -0
  58. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion/src_sc/version.h +0 -0
  59. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/onion_wrapper.py +0 -0
  60. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/postprocessing.py +0 -0
  61. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/deduplication/preprocessing.py +0 -0
  62. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/packing/README.md +0 -0
  63. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/packing/__init__.py +0 -0
  64. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/packing/dataset_packer.py +0 -0
  65. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/packing/pack_config.example.yaml +0 -0
  66. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/quality/README.md +0 -0
  67. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/quality/__init__.py +0 -0
  68. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/quality/checker.py +0 -0
  69. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/readability/README.md +0 -0
  70. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/stemming/README.md +0 -0
  71. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/stemming/__init__.py +0 -0
  72. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/stemming/data/words_al.txt +0 -0
  73. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/stemming/data/words_al_t.txt +0 -0
  74. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/stemming/data/words_t.txt +0 -0
  75. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/stemming/stemmer.py +0 -0
  76. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/utils/logger.py +0 -0
  77. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/dependency_links.txt +0 -0
  78. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/entry_points.txt +0 -0
  79. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/not-zip-safe +0 -0
  80. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/requires.txt +0 -0
  81. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/top_level.txt +0 -0
  82. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/pyproject.toml +0 -0
  83. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/scripts/release.sh +0 -0
  84. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/setup.cfg +0 -0
  85. {dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/uv.lock +0 -0
@@ -3,7 +3,6 @@
3
3
  # Run manually: pre-commit run --all-files
4
4
 
5
5
  repos:
6
- # Ruff - Fast Python linter and formatter
7
6
  - repo: https://github.com/astral-sh/ruff-pre-commit
8
7
  rev: v0.1.15
9
8
  hooks:
@@ -11,6 +10,5 @@ repos:
11
10
  args: [--fix, --exit-non-zero-on-fix]
12
11
  - id: ruff-format
13
12
 
14
- # Configuration
15
13
  default_language_version:
16
14
  python: python3.12
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dalla-data-processing
3
- Version: 0.0.10
3
+ Version: 0.0.12
4
4
  Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
5
5
  Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
6
6
  License: CC-BY-NC-SA-4.0
@@ -57,7 +57,7 @@ A comprehensive Arabic data processing pipeline with deduplication, stemming, qu
57
57
 
58
58
  - **Linux**: Fully supported
59
59
  - **macOS**: Fully supported (Intel or through rosetta)
60
- - **Windows**: Supported through WSL (Windows Subsystem for Linux) only, for native windows: manual build from source works for deduplication.
60
+ - **Windows**: Supported through WSL, for native windows: manual build from source works for deduplication.
61
61
 
62
62
  ## Installation
63
63
 
@@ -98,7 +98,7 @@ pip install "dalla-data-processing[dedup,stem,quality]"
98
98
 
99
99
  ### Development Installation
100
100
 
101
- <b>From Source (with uv - recommended)</b>
101
+ <b>From Source (with uv)</b>
102
102
 
103
103
  ```bash
104
104
  git clone https://github.com/U4RASD/dalla-data-processing.git
@@ -148,6 +148,6 @@ Pack and prepare datasets for training.
148
148
 
149
149
  ## Links
150
150
 
151
- - Homepage: https://github.com/U4RASD/dalla-data-processing
152
- - Issues: https://github.com/U4RASD/dalla-data-processing/issues
151
+ - Homepage: https://acrps.ai
153
152
  - Documentation: https://github.com/U4RASD/dalla-data-processing#readme
153
+ - ACRPS: https://acr.ps
@@ -6,7 +6,7 @@ A comprehensive Arabic data processing pipeline with deduplication, stemming, qu
6
6
 
7
7
  - **Linux**: Fully supported
8
8
  - **macOS**: Fully supported (Intel or through rosetta)
9
- - **Windows**: Supported through WSL (Windows Subsystem for Linux) only, for native windows: manual build from source works for deduplication.
9
+ - **Windows**: Supported through WSL, for native windows: manual build from source works for deduplication.
10
10
 
11
11
  ## Installation
12
12
 
@@ -47,7 +47,7 @@ pip install "dalla-data-processing[dedup,stem,quality]"
47
47
 
48
48
  ### Development Installation
49
49
 
50
- <b>From Source (with uv - recommended)</b>
50
+ <b>From Source (with uv)</b>
51
51
 
52
52
  ```bash
53
53
  git clone https://github.com/U4RASD/dalla-data-processing.git
@@ -97,6 +97,6 @@ Pack and prepare datasets for training.
97
97
 
98
98
  ## Links
99
99
 
100
- - Homepage: https://github.com/U4RASD/dalla-data-processing
101
- - Issues: https://github.com/U4RASD/dalla-data-processing/issues
100
+ - Homepage: https://acrps.ai
102
101
  - Documentation: https://github.com/U4RASD/dalla-data-processing#readme
102
+ - ACRPS: https://acr.ps
@@ -0,0 +1,24 @@
1
+ # file generated by vcs-versioning
2
+ # don't change, don't track in version control
3
+ from __future__ import annotations
4
+
5
+ __all__ = [
6
+ "__version__",
7
+ "__version_tuple__",
8
+ "version",
9
+ "version_tuple",
10
+ "__commit_id__",
11
+ "commit_id",
12
+ ]
13
+
14
+ version: str
15
+ __version__: str
16
+ __version_tuple__: tuple[int | str, ...]
17
+ version_tuple: tuple[int | str, ...]
18
+ commit_id: str | None
19
+ __commit_id__: str | None
20
+
21
+ __version__ = version = '0.0.12'
22
+ __version_tuple__ = version_tuple = (0, 0, 12)
23
+
24
+ __commit_id__ = commit_id = 'g3a0e013e2'
@@ -2,7 +2,11 @@
2
2
 
3
3
  from datasets import Dataset
4
4
 
5
- from dalla_data_processing.readability.ranking import compute_ranks_and_levels
5
+ from dalla_data_processing.readability.ranking import (
6
+ OSMAN_WEIGHT,
7
+ WEIGHTED,
8
+ compute_ranks_and_levels,
9
+ )
6
10
  from dalla_data_processing.readability.scorer import ReadabilityScorer
7
11
  from dalla_data_processing.utils.logger import get_logger
8
12
 
@@ -14,6 +18,8 @@ def score_readability(
14
18
  column: str = "text",
15
19
  add_ranks: bool = True,
16
20
  num_proc: int | None = None,
21
+ level_method: str = WEIGHTED,
22
+ osman_weight: float = OSMAN_WEIGHT,
17
23
  ) -> Dataset:
18
24
  """
19
25
  Score readability using Flesch and Osman methods, with optional ranking.
@@ -32,6 +38,8 @@ def score_readability(
32
38
  column: Column to score
33
39
  add_ranks: Whether to add ranking columns (default: True)
34
40
  num_proc: Number of parallel processes
41
+ level_method: Bin-combination strategy, "weighted" or "conservative"
42
+ osman_weight: Weight on the Osman bin when level_method="weighted"
35
43
 
36
44
  Returns:
37
45
  Dataset with readability scores and optional rankings
@@ -103,15 +111,17 @@ def score_readability(
103
111
 
104
112
  # Step 2: Add ranks if requested
105
113
  if add_ranks:
106
- logger.info("Computing ranks and readability levels...")
107
- scored_dataset = _add_ranks_to_dataset(scored_dataset)
114
+ logger.info(f"Computing ranks and readability levels (method={level_method})...")
115
+ scored_dataset = _add_ranks_to_dataset(scored_dataset, level_method, osman_weight)
108
116
  logger.info("Ranks and levels added")
109
117
 
110
118
  logger.info("Readability scoring complete!")
111
119
  return scored_dataset
112
120
 
113
121
 
114
- def _add_ranks_to_dataset(dataset: Dataset) -> Dataset:
122
+ def _add_ranks_to_dataset(
123
+ dataset: Dataset, level_method: str = WEIGHTED, osman_weight: float = OSMAN_WEIGHT
124
+ ) -> Dataset:
115
125
  """
116
126
  Add ranking columns to dataset based on scores.
117
127
 
@@ -153,7 +163,9 @@ def _add_ranks_to_dataset(dataset: Dataset) -> Dataset:
153
163
  return dataset
154
164
 
155
165
  # Compute ranks and levels
156
- o_ranks, f_ranks, final_levels = compute_ranks_and_levels(osman_scores, flesch_scores)
166
+ o_ranks, f_ranks, final_levels = compute_ranks_and_levels(
167
+ osman_scores, flesch_scores, method=level_method, osman_weight=osman_weight
168
+ )
157
169
 
158
170
  # Create mapping from index to rank data
159
171
  rank_data = {}
@@ -0,0 +1,147 @@
1
+ """
2
+ Arabic Flesch Reading Ease scoring.
3
+
4
+ textstat's flesch_reading_ease() cannot score Arabic: it counts syllables with
5
+ pyphen, which ships no Arabic hyphenation dictionary and raises KeyError. This
6
+ module instead counts Arabic syllables directly from diacritics, with a
7
+ character-length fallback for undiacritised text, so Flesch is computable for
8
+ Arabic in all cases.
9
+
10
+ Ported from the original OSMAN readability implementation by Mahmoud El-Haj
11
+ (OsmanReadability.java, Syllables.java).
12
+ """
13
+
14
+ import re
15
+
16
# Short vowels (harakat): fatha, damma, kasra.
HARAKAT = ("َ", "ُ", "ِ")
# Long-vowel letters that turn a preceding haraka into a long syllable: alef, waw, yaa.
LONG_LETTERS = ("ا", "و", "ي")
# Stress marks: tanween fath, tanween damm, tanween kasr, shadda.
STRESS_MARKS = ("ً", "ٌ", "ٍ", "ّ")

PUNCT_PATTERN = re.compile(r"[^\w\s]", flags=re.UNICODE)
DIGIT_PATTERN = re.compile(r"\d")
SENTENCE_PATTERN = re.compile(r"\n|(?<!\d)\.(?!\d)")

# Characters that the undiacritised fallback does not count towards the text
# length: alef, alef maksura, a few ASCII punctuation marks, and the space.
_FALLBACK_IGNORED = frozenset(("ا", "ى", "?", ".", "!", ",", " "))


def count_all_syllables(text: str) -> tuple[int, int, int]:
    """
    Count Arabic short, long, and stress syllables.

    A haraka immediately followed by a long-vowel letter is a long syllable;
    every other haraka is a short syllable. Stress syllables are the tanween
    and shadda marks. When no short syllable is found (typically undiacritised
    text), the short count is approximated as the number of characters left
    after dropping the ignored characters, minus two, and never below zero.

    Args:
        text: Text to analyse

    Returns:
        Tuple of (short_syllables, long_syllables, stress_syllables)
    """
    long_count = 0
    short_count = 0
    last_index = len(text) - 1

    # One pass is enough: a character can match at most one haraka.
    for i, char in enumerate(text):
        if char not in HARAKAT:
            continue
        if i < last_index and text[i + 1] in LONG_LETTERS:
            long_count += 1
        else:
            short_count += 1

    stress_count = sum(text.count(mark) for mark in STRESS_MARKS)

    # Fallback for undiacritised text: approximate short syllables from length.
    if short_count == 0:
        remaining = sum(1 for char in text if char not in _FALLBACK_IGNORED)
        # Clamp at zero: very short input would otherwise give a negative
        # syllable count, which inflates the downstream Flesch score.
        short_count = max(remaining - 2, 0)

    return short_count, long_count, stress_count
70
+
71
+
72
def count_syllables(text: str) -> int:
    """
    Return the weighted syllable total for a text.

    Short syllables count once; long and stress syllables count twice each.

    Args:
        text: Text to analyse

    Returns:
        Total syllable count
    """
    shorts, longs, stresses = count_all_syllables(text)
    return shorts + 2 * longs + 2 * stresses
84
+
85
+
86
def count_words(text: str) -> int:
    """
    Count the whitespace-separated words left once digits and punctuation are removed.

    Args:
        text: Text to analyse

    Returns:
        Number of whitespace-separated words (0 when nothing remains)
    """
    without_digits = DIGIT_PATTERN.sub("", text)
    stripped = PUNCT_PATTERN.sub("", without_digits)
    # str.split() with no separator ignores leading/trailing whitespace,
    # collapses runs of whitespace, and yields [] for blank input.
    return len(stripped.split())
100
+
101
+
102
def count_sentences(text: str) -> int:
    """
    Count sentences delimited by newlines and non-decimal periods.

    Args:
        text: Text to analyse

    Returns:
        Number of sentences
    """
    pieces = SENTENCE_PATTERN.split(text)
    # Mirror Java's String.split(regex): trailing empty strings are dropped,
    # so a final period does not add an extra empty sentence. Empty pieces
    # that are not at the end still count.
    for kept in range(len(pieces), 0, -1):
        if pieces[kept - 1] != "":
            return kept
    return 0
118
+
119
+
120
def words_per_sentence(text: str) -> float:
    """Return the mean word count per sentence (the word count itself if there are no sentences)."""
    word_total = count_words(text)
    sentence_total = count_sentences(text)
    if not sentence_total:
        return float(word_total)
    return word_total / sentence_total
125
+
126
+
127
def syllables_per_word(text: str) -> float:
    """Return the mean syllable count per word (0.0 if the text has no words)."""
    word_total = count_words(text)
    if not word_total:
        return 0.0
    return count_syllables(text) / word_total
131
+
132
+
133
def arabic_flesch_reading_ease(text: str) -> float | None:
    """
    Calculate Arabic Flesch Reading Ease.

    Score = 206.835 - 1.015 * (words / sentence) - 84.6 * (syllables / word)

    Each count is computed once per call, so the text is not re-scanned
    for every term of the formula.

    Args:
        text: Text to score

    Returns:
        Flesch Reading Ease score, or None for empty or word-less text
    """
    if not text or not text.strip():
        return None

    words = count_words(text)
    if words == 0:
        return None

    sentences = count_sentences(text)
    # With no sentence found, the whole text is treated as one sentence.
    avg_words_per_sentence = words / sentences if sentences else float(words)
    avg_syllables_per_word = count_syllables(text) / words

    return 206.835 - (1.015 * avg_words_per_sentence) - (84.6 * avg_syllables_per_word)
@@ -8,9 +8,21 @@ from dalla_data_processing.utils.logger import get_logger
8
8
 
9
9
  logger = get_logger(__name__)
10
10
 
11
+ # Strategies for combining the Osman and Flesch bins into a final level.
12
+ WEIGHTED = "weighted" # Osman-dominant weighted average (default)
13
+ CONSERVATIVE = "conservative" # legacy regime-split (Option B3)
14
+ LEVEL_METHODS = (WEIGHTED, CONSERVATIVE)
15
+
16
+ # Default weight on the Osman bin for the "weighted" method. Osman is the more
17
+ # reliable signal for Arabic, so it dominates; Flesch only nudges the result.
18
+ OSMAN_WEIGHT = 0.8
19
+
11
20
 
12
21
  def compute_ranks_and_levels(
13
- osman_scores: list[float], flesch_scores: list[float]
22
+ osman_scores: list[float],
23
+ flesch_scores: list[float],
24
+ method: str = WEIGHTED,
25
+ osman_weight: float = OSMAN_WEIGHT,
14
26
  ) -> tuple[list[int], list[int], list[int]]:
15
27
  """
16
28
  Compute ranks and final readability levels.
@@ -18,11 +30,13 @@ def compute_ranks_and_levels(
18
30
  Methodology:
19
31
  1. Rank documents by Osman & Flesch (highest score = rank 1, easiest)
20
32
  2. Bin ranks into 5 levels (0-4) using quantiles (guarantees balanced bins)
21
- 3. Decide final level using smart conservative logic
33
+ 3. Decide the final level from the two bins (see decide_final_level)
22
34
 
23
35
  Args:
24
36
  osman_scores: List of Osman scores
25
37
  flesch_scores: List of Flesch scores
38
+ method: How to combine the bins ("weighted" or "conservative")
39
+ osman_weight: Weight on the Osman bin when method="weighted"
26
40
 
27
41
  Returns:
28
42
  Tuple of:
@@ -51,7 +65,10 @@ def compute_ranks_and_levels(
51
65
  f_bins = bin_ranks(f_ranks)
52
66
 
53
67
  # Decide final level
54
- final_levels = [decide_final_level(ob, fb) for ob, fb in zip(o_bins, f_bins, strict=True)]
68
+ final_levels = [
69
+ decide_final_level(ob, fb, method=method, osman_weight=osman_weight)
70
+ for ob, fb in zip(o_bins, f_bins, strict=True)
71
+ ]
55
72
 
56
73
  return (o_ranks, f_ranks, final_levels)
57
74
 
@@ -111,55 +128,48 @@ def bin_ranks(ranks: list[int]) -> list[int]:
111
128
  return bins
112
129
 
113
130
 
114
def decide_final_level(
    o_bin: int, f_bin: int, method: str = WEIGHTED, osman_weight: float = OSMAN_WEIGHT
) -> int:
    """
    Decide final readability level from the Osman and Flesch bins.

    Two strategies are available:

    "weighted" (default): an Osman-dominant weighted average,
    osman_weight * o_bin + (1 - osman_weight) * f_bin, rounded to the nearest
    level with ties going to the harder level.
    For Arabic, Osman is the reliable signal (it carries Arabic-specific terms
    such as faseeh and complex/long-word ratios that hold up on undiacritised
    text), whereas Flesch depends on syllable counts that degrade without
    diacritics. Flesch therefore only nudges the level rather than overriding it.

    "conservative": the legacy regime-split (Option B3) — trust Osman when it
    indicates hardness (bins 3-4), trust Flesch when it indicates easiness
    (bins 0-1), take the harder bin on large disagreement, else average.

    Args:
        o_bin: Osman bin (0-4, 0=easiest, 4=hardest)
        f_bin: Flesch bin (0-4, 0=easiest, 4=hardest)
        method: "weighted" or "conservative"
        osman_weight: Weight on the Osman bin when method="weighted";
            must lie in [0, 1]

    Returns:
        Final level (0-4)

    Raises:
        ValueError: If method is unknown, or if method="weighted" and
            osman_weight is outside [0, 1]

    Examples:
        >>> decide_final_level(4, 0)  # weighted, Osman dominates
        3
        >>> decide_final_level(0, 4, method="conservative")  # easy: trust Flesch -> hard
        4
    """
    if method == WEIGHTED:
        # A weight outside [0, 1] would push the result outside the 0-4 range.
        if not 0.0 <= osman_weight <= 1.0:
            raise ValueError(f"osman_weight must be between 0 and 1, got {osman_weight!r}")
        blended = osman_weight * o_bin + (1 - osman_weight) * f_bin
        # Round half up. Built-in round() is half-to-even, which would break
        # ties inconsistently (1.5 -> 2 but 2.5 -> 2) and disagree with the
        # half-up average used by the conservative method.
        return int(blended + 0.5)
    if method == CONSERVATIVE:
        # Strong Osman signal: text is hard (bins 3-4).
        if o_bin >= 3:
            return o_bin
        # Strong Flesch signal: text is easy (bins 0-1).
        if f_bin <= 1:
            return f_bin
        # Large disagreement: take the harder bin.
        if abs(o_bin - f_bin) >= 2:
            return max(o_bin, f_bin)
        # Small disagreement or agreement: average, rounding up.
        return (o_bin + f_bin + 1) // 2
    raise ValueError(f"Unknown level method {method!r}; expected one of {LEVEL_METHODS}")
@@ -1,11 +1,14 @@
1
1
  """
2
- Readability scoring using textstat library (Flesch Reading Ease).
2
+ Readability scoring for Arabic text.
3
3
 
4
- For Arabic-specific Osman scoring, we use a simplified formula.
4
+ Osman scores come from the textstat library. Flesch Reading Ease uses an Arabic
5
+ syllable-based implementation (see arabic_flesch), since textstat's Flesch relies
6
+ on pyphen, which has no Arabic support.
5
7
  """
6
8
 
7
9
  import textstat
8
10
 
11
+ from dalla_data_processing.readability.arabic_flesch import arabic_flesch_reading_ease
9
12
  from dalla_data_processing.utils.logger import get_logger
10
13
 
11
14
  logger = get_logger(__name__)
@@ -31,8 +34,7 @@ class ReadabilityScorer:
31
34
  """
32
35
  Score text using both Flesch and Osman methods.
33
36
 
34
- For very short texts where Flesch returns None, we use the Osman score.
35
- If Osman also fails, we use a simple fallback based on word length.
37
+ If both scores fail, fall back to a simple estimate based on word length.
36
38
 
37
39
  Args:
38
40
  text: Text to score
@@ -46,13 +48,8 @@ class ReadabilityScorer:
46
48
  flesch_score = self._calculate_flesch(text)
47
49
  osman_score = self._calculate_osman(text)
48
50
 
49
- # If Flesch fails but Osman succeeds, use Osman for both
50
- if flesch_score is None and osman_score is not None:
51
- logger.info(f"Flesch failed, using Osman score ({osman_score:.1f}) for both metrics")
52
- flesch_score = osman_score
53
-
54
51
  # If both fail, use fallback as last resort
55
- elif flesch_score is None and osman_score is None:
52
+ if flesch_score is None and osman_score is None:
56
53
  flesch_fallback, osman_fallback = self._calculate_fallback_scores(text)
57
54
  flesch_score = flesch_fallback
58
55
  osman_score = osman_fallback
@@ -64,9 +61,9 @@ class ReadabilityScorer:
64
61
 
65
62
  def _calculate_flesch(self, text: str) -> float | None:
66
63
  """
67
- Calculate Flesch Reading Ease score.
64
+ Calculate Arabic Flesch Reading Ease score.
68
65
 
69
- Score range: 0-100+
66
+ Higher scores indicate easier text (typically 0-100, but unbounded).
70
67
 
71
68
  Args:
72
69
  text: Text to score
@@ -75,7 +72,7 @@ class ReadabilityScorer:
75
72
  Flesch score or None if error
76
73
  """
77
74
  try:
78
- score = self.textstat.flesch_reading_ease(text)
75
+ score = arabic_flesch_reading_ease(text)
79
76
  if score is None:
80
77
  logger.debug(f"Flesch score is None for text (length={len(text)})")
81
78
  return None
@@ -1,8 +1,4 @@
1
- """
2
- Utility functions for text processing.
3
-
4
- This module provides utilities for tokenization, text manipulation, and logging.
5
- """
1
+ """Utility functions for text processing."""
6
2
 
7
3
  from dalla_data_processing.utils.logger import get_logger, logger, setup_logging
8
4
 
@@ -0,0 +1,79 @@
1
+ # MIT License
2
+ #
3
+ # Copyright 2018-2024 New York University Abu Dhabi
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ # of this software and associated documentation files (the "Software"), to deal
7
+ # in the Software without restriction, including without limitation the rights
8
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ # copies of the Software, and to permit persons to whom the Software is
10
+ # furnished to do so, subject to the following conditions:
11
+ #
12
+ # The above copyright notice and this permission notice shall be included in
13
+ # all copies or substantial portions of the Software.
14
+ #
15
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ # SOFTWARE.
22
+
23
+ """Word-boundary tokenization utilities."""
24
+
25
+ import re
26
+
27
+ __all__ = ["simple_word_tokenize"]
28
+
29
# Compact mode: Arabic + Latin + digits
_ARABIC = (
    r"\u0621-\u063A"
    r"\u0641-\u064A"
    r"\u064B-\u0652"
    r"\u0653-\u0655"
    r"\u0670"
    r"\u0671-\u06D3"
    r"\u06D5-\u06FF"
    r"\u0750-\u077F"
    r"\u08A0-\u08FF"
    r"\uFB50-\uFDFF"
    r"\uFE70-\uFEFF"
)
_LATIN = r"a-zA-Z"
_DIGITS = r"0-9\u0660-\u0669\u06F0-\u06F9"
_COMPACT_CHARSET = _ARABIC + _LATIN + _DIGITS


_FULL_CHARSET = r"\w"


# Each pattern yields, in order of preference: a run of word characters,
# a single non-word non-space character, or a run of whitespace.
# The *_SPLIT_RE variants match letter runs and digit runs separately.
_COMPACT_RE = re.compile(f"[{_COMPACT_CHARSET}]+|[^{_COMPACT_CHARSET}\\s]|\\s+")
_COMPACT_SPLIT_RE = re.compile(f"[{_ARABIC}{_LATIN}]+|[{_DIGITS}]+|[^{_COMPACT_CHARSET}\\s]|\\s+")

_FULL_RE = re.compile(r"\w+|[^\w\s]|\s+")
_FULL_SPLIT_RE = re.compile(r"[^\W\d]+|\d+|[^\w\s]|\s+")


def simple_word_tokenize(sentence, split_digits=False, mode="compact"):
    """Split a sentence into word, punctuation and whitespace tokens.

    Runs of word characters become one token, every other non-space
    character becomes its own token, and each run of whitespace is
    returned as a token too.

    Args:
        sentence: Sentence to tokenize.
        split_digits: Split digits from letters. Defaults to False.
        mode: "compact" (Arabic + Latin + digits) or "full" (all Unicode).
            Defaults to "compact".

    Returns:
        List of tokens.

    Raises:
        ValueError: If mode is neither "compact" nor "full".
    """
    # Pick the pattern first so the unknown-mode error is raised before
    # the sentence is touched.
    if mode == "compact":
        pattern = _COMPACT_SPLIT_RE if split_digits else _COMPACT_RE
    elif mode == "full":
        pattern = _FULL_SPLIT_RE if split_digits else _FULL_RE
    else:
        raise ValueError(f"Unknown mode: {mode}. Use 'compact' or 'full'.")
    return pattern.findall(sentence)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: dalla-data-processing
3
- Version: 0.0.10
3
+ Version: 0.0.12
4
4
  Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
5
5
  Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
6
6
  License: CC-BY-NC-SA-4.0
@@ -57,7 +57,7 @@ A comprehensive Arabic data processing pipeline with deduplication, stemming, qu
57
57
 
58
58
  - **Linux**: Fully supported
59
59
  - **macOS**: Fully supported (Intel or through rosetta)
60
- - **Windows**: Supported through WSL (Windows Subsystem for Linux) only, for native windows: manual build from source works for deduplication.
60
+ - **Windows**: Supported through WSL; on native Windows, a manual build from source works for deduplication.
61
61
 
62
62
  ## Installation
63
63
 
@@ -98,7 +98,7 @@ pip install "dalla-data-processing[dedup,stem,quality]"
98
98
 
99
99
  ### Development Installation
100
100
 
101
- <b>From Source (with uv - recommended)</b>
101
+ <b>From Source (with uv)</b>
102
102
 
103
103
  ```bash
104
104
  git clone https://github.com/U4RASD/dalla-data-processing.git
@@ -148,6 +148,6 @@ Pack and prepare datasets for training.
148
148
 
149
149
  ## Links
150
150
 
151
- - Homepage: https://github.com/U4RASD/dalla-data-processing
152
- - Issues: https://github.com/U4RASD/dalla-data-processing/issues
151
+ - Homepage: https://acrps.ai
153
152
  - Documentation: https://github.com/U4RASD/dalla-data-processing#readme
153
+ - ACRPS: https://acr.ps
@@ -65,6 +65,7 @@ dalla_data_processing/quality/__init__.py
65
65
  dalla_data_processing/quality/checker.py
66
66
  dalla_data_processing/readability/README.md
67
67
  dalla_data_processing/readability/__init__.py
68
+ dalla_data_processing/readability/arabic_flesch.py
68
69
  dalla_data_processing/readability/ranking.py
69
70
  dalla_data_processing/readability/scorer.py
70
71
  dalla_data_processing/stemming/README.md
@@ -12,13 +12,14 @@ NC='\033[0m' # No Color
12
12
 
13
13
  echo -e "${GREEN}=== Building Onion Binary ===${NC}"
14
14
 
15
- # Get script directory and project root
15
+
16
+
16
17
  SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
17
18
  PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
18
19
  ONION_SOURCE="$PROJECT_ROOT/dalla_data_processing/deduplication/onion/src_sc"
19
20
  OUTPUT_DIR="$PROJECT_ROOT/dalla_data_processing/deduplication/bin"
20
21
 
21
- # Check if source exists
22
+
22
23
  if [ ! -d "$ONION_SOURCE" ]; then
23
24
  echo -e "${RED}Error: Onion source not found at $ONION_SOURCE${NC}"
24
25
  exit 1
@@ -33,7 +34,7 @@ if ! command -v g++ &> /dev/null; then
33
34
  exit 1
34
35
  fi
35
36
 
36
- # Check for Google sparsehash
37
+
37
38
  echo -e "${YELLOW}Checking for Google sparsehash...${NC}"
38
39
  if ! echo '#include <google/sparse_hash_set>' | g++ -x c++ -c - -o /dev/null 2>/dev/null; then
39
40
  echo -e "${YELLOW}Warning: Google sparsehash headers not found${NC}"
@@ -1,34 +0,0 @@
1
- # file generated by setuptools-scm
2
- # don't change, don't track in version control
3
-
4
- __all__ = [
5
- "__version__",
6
- "__version_tuple__",
7
- "version",
8
- "version_tuple",
9
- "__commit_id__",
10
- "commit_id",
11
- ]
12
-
13
- TYPE_CHECKING = False
14
- if TYPE_CHECKING:
15
- from typing import Tuple
16
- from typing import Union
17
-
18
- VERSION_TUPLE = Tuple[Union[int, str], ...]
19
- COMMIT_ID = Union[str, None]
20
- else:
21
- VERSION_TUPLE = object
22
- COMMIT_ID = object
23
-
24
- version: str
25
- __version__: str
26
- __version_tuple__: VERSION_TUPLE
27
- version_tuple: VERSION_TUPLE
28
- commit_id: COMMIT_ID
29
- __commit_id__: COMMIT_ID
30
-
31
- __version__ = version = '0.0.10'
32
- __version_tuple__ = version_tuple = (0, 0, 10)
33
-
34
- __commit_id__ = commit_id = 'gcc87ead80'
@@ -1,89 +0,0 @@
1
- # MIT License
2
- #
3
- # Copyright 2018-2024 New York University Abu Dhabi
4
- #
5
- # Permission is hereby granted, free of charge, to any person obtaining a copy
6
- # of this software and associated documentation files (the "Software"), to deal
7
- # in the Software without restriction, including without limitation the rights
8
- # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- # copies of the Software, and to permit persons to whom the Software is
10
- # furnished to do so, subject to the following conditions:
11
- #
12
- # The above copyright notice and this permission notice shall be included in
13
- # all copies or substantial portions of the Software.
14
- #
15
- # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- # SOFTWARE.
22
-
23
-
24
- """This module contains utilities for word-boundary tokenization."""
25
-
26
- import re
27
-
28
- from camel_tools.utils.charsets import (
29
- EMOJI_MULTICHAR_CHARSET,
30
- UNICODE_LETTER_CHARSET,
31
- UNICODE_LETTER_MARK_NUMBER_CHARSET,
32
- UNICODE_MARK_CHARSET,
33
- UNICODE_NUMBER_CHARSET,
34
- UNICODE_PUNCT_SYMBOL_CHARSET,
35
- )
36
-
37
- __all__ = ["simple_word_tokenize"]
38
-
39
-
40
- _ALL_PUNCT_SYMBOLS = UNICODE_PUNCT_SYMBOL_CHARSET | EMOJI_MULTICHAR_CHARSET
41
- _ALL_PUNCT_SYMBOLS = [re.escape(x) for x in _ALL_PUNCT_SYMBOLS]
42
- _ALL_PUNCT_SYMBOLS = sorted(_ALL_PUNCT_SYMBOLS, key=len, reverse=True)
43
- _WHITESPACE_RE = r"\s+"
44
- _ALL_NUMBER = "".join(UNICODE_NUMBER_CHARSET)
45
- _ALL_LETTER_MARK = "".join(UNICODE_LETTER_CHARSET | UNICODE_MARK_CHARSET)
46
- _ALL_LETTER_MARK_NUMBER = "".join(UNICODE_LETTER_MARK_NUMBER_CHARSET)
47
-
48
- _TOKENIZE_RE = re.compile(
49
- "|".join(_ALL_PUNCT_SYMBOLS)
50
- + r"|["
51
- + re.escape(_ALL_LETTER_MARK_NUMBER)
52
- + r"]+|"
53
- + _WHITESPACE_RE
54
- )
55
- _TOKENIZE_NUMBER_RE = re.compile(
56
- "|".join(_ALL_PUNCT_SYMBOLS)
57
- + r"|["
58
- + re.escape(_ALL_NUMBER)
59
- + r"]+|["
60
- + re.escape(_ALL_LETTER_MARK)
61
- + r"]+"
62
- )
63
-
64
-
65
- def simple_word_tokenize(sentence, split_digits=False):
66
- """Tokenizes a sentence by splitting on whitespace and seperating
67
- punctuation. The resulting tokens are either alpha-numeric words, single
68
- punctuation/symbol/emoji characters, or multi-character emoji sequences.
69
- This function is language agnostic and splits all characters marked as
70
- punctuation or symbols in the Unicode specification.
71
- For example, tokenizing :code:`'Hello, world!!!'`
72
- would yield :code:`['Hello', ',', 'world', '!', '!', '!']`.
73
- If split_digits is set to True, it also splits on number.
74
- For example, tokenizing :code:`'Hello, world123!!!'`
75
- would yield :code:`['Hello', ',', 'world', '123', '!', '!', '!']`.
76
-
77
- Args:
78
- sentence (:obj:`str`): Sentence to tokenize.
79
- split_digits (:obj:`bool`, optional): The flag to split on number.
80
- Defaults to False.
81
-
82
- Returns:
83
- :obj:`list` of :obj:`str`: The list of tokens.
84
- """
85
-
86
- if split_digits:
87
- return _TOKENIZE_NUMBER_RE.findall(sentence)
88
- else:
89
- return _TOKENIZE_RE.findall(sentence)