PyPI - dalla-data-processing - Versions diffs - 0.0.10__tar.gz → 0.0.12__tar.gz - Mend

dalla-data-processing 0.0.10.tar.gz → 0.0.12.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (85) hide show

{dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/.pre-commit-config.yaml RENAMED Viewed

@@ -3,7 +3,6 @@
 # Run manually: pre-commit run --all-files
 repos:
-  # Ruff - Fast Python linter and formatter
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.1.15
     hooks:
@@ -11,6 +10,5 @@ repos:
         args: [--fix, --exit-non-zero-on-fix]
       - id: ruff-format
-# Configuration
 default_language_version:
   python: python3.12

{dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dalla-data-processing
-Version: 0.0.10
+Version: 0.0.12
 Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
 Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
 License: CC-BY-NC-SA-4.0
@@ -57,7 +57,7 @@ A comprehensive Arabic data processing pipeline with deduplication, stemming, qu
 - **Linux**: Fully supported
 - **macOS**: Fully supported (Intel or through rosetta)
-- **Windows**: Supported through WSL (Windows Subsystem for Linux) only, for native windows: manual build from source works for deduplication.
+- **Windows**: Supported through WSL, for native windows: manual build from source works for deduplication.
 ## Installation
@@ -98,7 +98,7 @@ pip install "dalla-data-processing[dedup,stem,quality]"
 ### Development Installation
-<b>From Source (with uv - recommended)</b>
+<b>From Source (with uv)</b>
 ```bash
 git clone https://github.com/U4RASD/dalla-data-processing.git
@@ -148,6 +148,6 @@ Pack and prepare datasets for training.
 ## Links
-- Homepage: https://github.com/U4RASD/dalla-data-processing
-- Issues: https://github.com/U4RASD/dalla-data-processing/issues
+- Homepage: https://acrps.ai
 - Documentation: https://github.com/U4RASD/dalla-data-processing#readme
+- ACRPS: https://acr.ps

{dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/README.md RENAMED Viewed

@@ -6,7 +6,7 @@ A comprehensive Arabic data processing pipeline with deduplication, stemming, qu
 - **Linux**: Fully supported
 - **macOS**: Fully supported (Intel or through rosetta)
-- **Windows**: Supported through WSL (Windows Subsystem for Linux) only, for native windows: manual build from source works for deduplication.
+- **Windows**: Supported through WSL, for native windows: manual build from source works for deduplication.
 ## Installation
@@ -47,7 +47,7 @@ pip install "dalla-data-processing[dedup,stem,quality]"
 ### Development Installation
-<b>From Source (with uv - recommended)</b>
+<b>From Source (with uv)</b>
 ```bash
 git clone https://github.com/U4RASD/dalla-data-processing.git
@@ -97,6 +97,6 @@ Pack and prepare datasets for training.
 ## Links
-- Homepage: https://github.com/U4RASD/dalla-data-processing
-- Issues: https://github.com/U4RASD/dalla-data-processing/issues
+- Homepage: https://acrps.ai
 - Documentation: https://github.com/U4RASD/dalla-data-processing#readme
+- ACRPS: https://acr.ps

dalla_data_processing-0.0.12/dalla_data_processing/_version.py ADDED Viewed

@@ -0,0 +1,24 @@
+# file generated by vcs-versioning
+# don't change, don't track in version control
+from __future__ import annotations
+__all__ = [
+    "__version__",
+    "__version_tuple__",
+    "version",
+    "version_tuple",
+    "__commit_id__",
+    "commit_id",
+]
+version: str
+__version__: str
+__version_tuple__: tuple[int | str, ...]
+version_tuple: tuple[int | str, ...]
+commit_id: str | None
+__commit_id__: str | None
+__version__ = version = '0.0.12'
+__version_tuple__ = version_tuple = (0, 0, 12)
+__commit_id__ = commit_id = 'g3a0e013e2'

{dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/readability/__init__.py RENAMED Viewed

@@ -2,7 +2,11 @@
 from datasets import Dataset
-from dalla_data_processing.readability.ranking import compute_ranks_and_levels
+from dalla_data_processing.readability.ranking import (
+    OSMAN_WEIGHT,
+    WEIGHTED,
+    compute_ranks_and_levels,
+)
 from dalla_data_processing.readability.scorer import ReadabilityScorer
 from dalla_data_processing.utils.logger import get_logger
@@ -14,6 +18,8 @@ def score_readability(
     column: str = "text",
     add_ranks: bool = True,
     num_proc: int | None = None,
+    level_method: str = WEIGHTED,
+    osman_weight: float = OSMAN_WEIGHT,
 ) -> Dataset:
     """
     Score readability using Flesch and Osman methods, with optional ranking.
@@ -32,6 +38,8 @@ def score_readability(
         column: Column to score
         add_ranks: Whether to add ranking columns (default: True)
         num_proc: Number of parallel processes
+        level_method: Bin-combination strategy, "weighted" or "conservative"
+        osman_weight: Weight on the Osman bin when level_method="weighted"
     Returns:
         Dataset with readability scores and optional rankings
@@ -103,15 +111,17 @@ def score_readability(
     # Step 2: Add ranks if requested
     if add_ranks:
-        logger.info("Computing ranks and readability levels...")
-        scored_dataset = _add_ranks_to_dataset(scored_dataset)
+        logger.info(f"Computing ranks and readability levels (method={level_method})...")
+        scored_dataset = _add_ranks_to_dataset(scored_dataset, level_method, osman_weight)
         logger.info("Ranks and levels added")
     logger.info("Readability scoring complete!")
     return scored_dataset
-def _add_ranks_to_dataset(dataset: Dataset) -> Dataset:
+def _add_ranks_to_dataset(
+    dataset: Dataset, level_method: str = WEIGHTED, osman_weight: float = OSMAN_WEIGHT
+) -> Dataset:
     """
     Add ranking columns to dataset based on scores.
@@ -153,7 +163,9 @@ def _add_ranks_to_dataset(dataset: Dataset) -> Dataset:
         return dataset
     # Compute ranks and levels
-    o_ranks, f_ranks, final_levels = compute_ranks_and_levels(osman_scores, flesch_scores)
+    o_ranks, f_ranks, final_levels = compute_ranks_and_levels(
+        osman_scores, flesch_scores, method=level_method, osman_weight=osman_weight
+    )
     # Create mapping from index to rank data
     rank_data = {}

dalla_data_processing-0.0.12/dalla_data_processing/readability/arabic_flesch.py ADDED Viewed

@@ -0,0 +1,147 @@
+"""
+Arabic Flesch Reading Ease scoring.
+textstat's flesch_reading_ease() cannot score Arabic: it counts syllables with
+pyphen, which ships no Arabic hyphenation dictionary and raises KeyError. This
+module instead counts Arabic syllables directly from diacritics, with a
+character-length fallback for undiacritised text, so Flesch is computable for
+Arabic in all cases.
+Ported from the original OSMAN readability implementation by Mahmoud El-Haj
+(OsmanReadability.java, Syllables.java).
+"""
+import re
+# Short vowels (harakat): fatha, damma, kasra.
+HARAKAT = ("َ", "ُ", "ِ")
+# Long-vowel letters that turn a preceding haraka into a long syllable: alef, waw, yaa.
+LONG_LETTERS = ("ا", "و", "ي")
+# Stress marks: tanween fath, tanween damm, tanween kasr, shadda.
+STRESS_MARKS = ("ً", "ٌ", "ٍ", "ّ")
+PUNCT_PATTERN = re.compile(r"[^\w\s]", flags=re.UNICODE)
+DIGIT_PATTERN = re.compile(r"\d")
+SENTENCE_PATTERN = re.compile(r"\n|(?<!\d)\.(?!\d)")
+def count_all_syllables(text: str) -> tuple[int, int, int]:
+    """
+    Count Arabic short, long, and stress syllables.
+    Long syllables are harakat followed by a long-vowel letter; the remaining
+    harakat are short. Stress syllables are tanween and shadda marks. For
+    undiacritised text (no short syllables found), short syllables are
+    approximated from the stripped character length.
+    Args:
+        text: Text to analyse
+    Returns:
+        Tuple of (short_syllables, long_syllables, stress_syllables)
+    """
+    long_count = 0
+    short_count = 0
+    for haraka in HARAKAT:
+        for i, char in enumerate(text):
+            if char == haraka:
+                if i + 1 < len(text) and text[i + 1] in LONG_LETTERS:
+                    long_count += 1
+                else:
+                    short_count += 1
+    stress_count = sum(text.count(mark) for mark in STRESS_MARKS)
+    # Fallback for undiacritised text: approximate short syllables from length.
+    if short_count == 0:
+        stripped = (
+            text.replace("ا", "")
+            .replace("ى", "")
+            .replace("?", "")
+            .replace(".", "")
+            .replace("!", "")
+            .replace(",", "")
+            .replace(" ", "")
+        )
+        short_count = len(stripped) - 2
+    return short_count, long_count, stress_count
+def count_syllables(text: str) -> int:
+    """
+    Count total syllables, weighting long and stress syllables double.
+    Args:
+        text: Text to analyse
+    Returns:
+        Total syllable count
+    """
+    short_syl, long_syl, stress_syl = count_all_syllables(text)
+    return (long_syl * 2) + short_syl + (stress_syl * 2)
+def count_words(text: str) -> int:
+    """
+    Count words after removing digits and punctuation.
+    Args:
+        text: Text to analyse
+    Returns:
+        Number of whitespace-separated words
+    """
+    cleaned = DIGIT_PATTERN.sub("", text)
+    cleaned = PUNCT_PATTERN.sub("", cleaned)
+    cleaned = re.sub(r" +", " ", cleaned.strip())
+    return len(cleaned.split()) if cleaned else 0
+def count_sentences(text: str) -> int:
+    """
+    Count sentences by splitting on newlines and non-decimal periods.
+    Args:
+        text: Text to analyse
+    Returns:
+        Number of sentences
+    """
+    # Match Java's String.split(regex), which discards trailing empty strings,
+    # so a trailing period does not count as an extra empty sentence.
+    parts = SENTENCE_PATTERN.split(text)
+    while parts and parts[-1] == "":
+        parts.pop()
+    return len(parts)
+def words_per_sentence(text: str) -> float:
+    """Return the average number of words per sentence."""
+    words = count_words(text)
+    sentences = count_sentences(text)
+    return words / sentences if sentences else float(words)
+def syllables_per_word(text: str) -> float:
+    """Return the average number of syllables per word."""
+    words = count_words(text)
+    return count_syllables(text) / words if words else 0.0
+def arabic_flesch_reading_ease(text: str) -> float | None:
+    """
+    Calculate Arabic Flesch Reading Ease.
+    Score = 206.835 - 1.015 * (words / sentence) - 84.6 * (syllables / word)
+    Args:
+        text: Text to score
+    Returns:
+        Flesch Reading Ease score, or None for empty or word-less text
+    """
+    if not text or not text.strip() or count_words(text) == 0:
+        return None
+    return 206.835 - (1.015 * words_per_sentence(text)) - (84.6 * syllables_per_word(text))

{dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/readability/ranking.py RENAMED Viewed

@@ -8,9 +8,21 @@ from dalla_data_processing.utils.logger import get_logger
 logger = get_logger(__name__)
+# Strategies for combining the Osman and Flesch bins into a final level.
+WEIGHTED = "weighted"  # Osman-dominant weighted average (default)
+CONSERVATIVE = "conservative"  # legacy regime-split (Option B3)
+LEVEL_METHODS = (WEIGHTED, CONSERVATIVE)
+# Default weight on the Osman bin for the "weighted" method. Osman is the more
+# reliable signal for Arabic, so it dominates; Flesch only nudges the result.
+OSMAN_WEIGHT = 0.8
 def compute_ranks_and_levels(
-    osman_scores: list[float], flesch_scores: list[float]
+    osman_scores: list[float],
+    flesch_scores: list[float],
+    method: str = WEIGHTED,
+    osman_weight: float = OSMAN_WEIGHT,
 ) -> tuple[list[int], list[int], list[int]]:
     """
     Compute ranks and final readability levels.
@@ -18,11 +30,13 @@ def compute_ranks_and_levels(
     Methodology:
     1. Rank documents by Osman & Flesch (highest score = rank 1, easiest)
     2. Bin ranks into 5 levels (0-4) using quantiles (guarantees balanced bins)
-    3. Decide final level using smart conservative logic
+    3. Decide the final level from the two bins (see decide_final_level)
     Args:
         osman_scores: List of Osman scores
         flesch_scores: List of Flesch scores
+        method: How to combine the bins ("weighted" or "conservative")
+        osman_weight: Weight on the Osman bin when method="weighted"
     Returns:
         Tuple of:
@@ -51,7 +65,10 @@ def compute_ranks_and_levels(
     f_bins = bin_ranks(f_ranks)
     # Decide final level
-    final_levels = [decide_final_level(ob, fb) for ob, fb in zip(o_bins, f_bins, strict=True)]
+    final_levels = [
+        decide_final_level(ob, fb, method=method, osman_weight=osman_weight)
+        for ob, fb in zip(o_bins, f_bins, strict=True)
+    ]
     return (o_ranks, f_ranks, final_levels)
@@ -111,55 +128,48 @@ def bin_ranks(ranks: list[int]) -> list[int]:
     return bins
-def decide_final_level(o_bin: int, f_bin: int) -> int:
+def decide_final_level(
+    o_bin: int, f_bin: int, method: str = WEIGHTED, osman_weight: float = OSMAN_WEIGHT
+) -> int:
     """
-    Decide final readability level from Osman and Flesch bins.
+    Decide final readability level from the Osman and Flesch bins.
+    Two strategies are available:
-    Strategy (Option B3 - Smart Conservative):
-    - Trust Osman when it indicates hardness (bins 3-4)
-    - Trust Flesch when it indicates easiness (bins 0-1)
-    - On complete disagreement (diff >= 2), be conservative (take harder)
-    - On small disagreement (diff = 1), average them
+    "weighted" (default): an Osman-dominant weighted average,
+        round(osman_weight * o_bin + (1 - osman_weight) * f_bin).
+        For Arabic, Osman is the reliable signal (it carries Arabic-specific terms
+        such as faseeh and complex/long-word ratios that hold up on undiacritised
+        text), whereas Flesch depends on syllable counts that degrade without
+        diacritics. Flesch therefore only nudges the level rather than overriding it.
-    Philosophy:
-    - Osman is the expert at identifying hard texts
-    - Flesch is the expert at identifying easy texts
-    - When metrics completely disagree, the text is unusual → mark as harder
-    - When metrics slightly disagree, compromise with average
+    "conservative": the legacy regime-split (Option B3) — trust Osman when it
+        indicates hardness (bins 3-4), trust Flesch when it indicates easiness
+        (bins 0-1), take the harder bin on large disagreement, else average.
     Args:
         o_bin: Osman bin (0-4, 0=easiest, 4=hardest)
         f_bin: Flesch bin (0-4, 0=easiest, 4=hardest)
+        method: "weighted" or "conservative"
+        osman_weight: Weight on the Osman bin when method="weighted"
     Returns:
         Final level (0-4)
     Examples:
-        >>> decide_final_level(4, 0)  # Osman=hard, Flesch=easy → trust Osman
-        4
-        >>> decide_final_level(0, 4)  # Osman=easy, Flesch=hard → trust Flesch (unusual, conservative)
-        4
-        >>> decide_final_level(1, 0)  # Both easy, Flesch=easier → trust Flesch
-        0
-        >>> decide_final_level(3, 4)  # Both hard, Osman=easier → trust Osman
-        3
-        >>> decide_final_level(2, 3)  # Small disagreement → average (2+3+1)//2 = 3
+        >>> decide_final_level(4, 0)                          # weighted, Osman dominates
         3
+        >>> decide_final_level(0, 4, method="conservative")   # easy: trust Flesch -> hard
+        4
     """
-    # Strong Osman signal: text is hard (bins 3-4)
-    if o_bin >= 3:
-        return o_bin
-    # Strong Flesch signal: text is easy (bins 0-1)
-    if f_bin <= 1:
-        return f_bin
-    # Calculate disagreement magnitude
-    diff = abs(o_bin - f_bin)
-    # Complete disagreement (diff >= 2)
-    if diff >= 2:
-        return max(o_bin, f_bin)
-    # Small disagreement (diff = 1) or agreement
-    return (o_bin + f_bin + 1) // 2
+    if method == WEIGHTED:
+        return round(osman_weight * o_bin + (1 - osman_weight) * f_bin)
+    if method == CONSERVATIVE:
+        if o_bin >= 3:
+            return o_bin
+        if f_bin <= 1:
+            return f_bin
+        if abs(o_bin - f_bin) >= 2:
+            return max(o_bin, f_bin)
+        return (o_bin + f_bin + 1) // 2
+    raise ValueError(f"Unknown level method {method!r}; expected one of {LEVEL_METHODS}")

{dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/readability/scorer.py RENAMED Viewed

@@ -1,11 +1,14 @@
 """
-Readability scoring using textstat library (Flesch Reading Ease).
+Readability scoring for Arabic text.
-For Arabic-specific Osman scoring, we use a simplified formula.
+Osman scores come from the textstat library. Flesch Reading Ease uses an Arabic
+syllable-based implementation (see arabic_flesch), since textstat's Flesch relies
+on pyphen, which has no Arabic support.
 """
 import textstat
+from dalla_data_processing.readability.arabic_flesch import arabic_flesch_reading_ease
 from dalla_data_processing.utils.logger import get_logger
 logger = get_logger(__name__)
@@ -31,8 +34,7 @@ class ReadabilityScorer:
         """
         Score text using both Flesch and Osman methods.
-        For very short texts where Flesch returns None, we use the Osman score.
-        If Osman also fails, we use a simple fallback based on word length.
+        If both scores fail, fall back to a simple estimate based on word length.
         Args:
             text: Text to score
@@ -46,13 +48,8 @@ class ReadabilityScorer:
         flesch_score = self._calculate_flesch(text)
         osman_score = self._calculate_osman(text)
-        # If Flesch fails but Osman succeeds, use Osman for both
-        if flesch_score is None and osman_score is not None:
-            logger.info(f"Flesch failed, using Osman score ({osman_score:.1f}) for both metrics")
-            flesch_score = osman_score
         # If both fail, use fallback as last resort
-        elif flesch_score is None and osman_score is None:
+        if flesch_score is None and osman_score is None:
             flesch_fallback, osman_fallback = self._calculate_fallback_scores(text)
             flesch_score = flesch_fallback
             osman_score = osman_fallback
@@ -64,9 +61,9 @@ class ReadabilityScorer:
     def _calculate_flesch(self, text: str) -> float | None:
         """
-        Calculate Flesch Reading Ease score.
+        Calculate Arabic Flesch Reading Ease score.
-        Score range: 0-100+
+        Higher scores indicate easier text (typically 0-100, but unbounded).
         Args:
             text: Text to score
@@ -75,7 +72,7 @@ class ReadabilityScorer:
             Flesch score or None if error
         """
         try:
-            score = self.textstat.flesch_reading_ease(text)
+            score = arabic_flesch_reading_ease(text)
             if score is None:
                 logger.debug(f"Flesch score is None for text (length={len(text)})")
                 return None

{dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing/utils/__init__.py RENAMED Viewed

@@ -1,8 +1,4 @@
-"""
-Utility functions for text processing.
-This module provides utilities for tokenization, text manipulation, and logging.
-"""
+"""Utility functions for text processing."""
 from dalla_data_processing.utils.logger import get_logger, logger, setup_logging

dalla_data_processing-0.0.12/dalla_data_processing/utils/tokenize.py ADDED Viewed

@@ -0,0 +1,79 @@
+# MIT License
+#
+# Copyright 2018-2024 New York University Abu Dhabi
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""Word-boundary tokenization utilities."""
+import re
+__all__ = ["simple_word_tokenize"]
+# Compact mode: Arabic + Latin + digits
+_ARABIC = (
+    r"\u0621-\u063A"
+    r"\u0641-\u064A"
+    r"\u064B-\u0652"
+    r"\u0653-\u0655"
+    r"\u0670"
+    r"\u0671-\u06D3"
+    r"\u06D5-\u06FF"
+    r"\u0750-\u077F"
+    r"\u08A0-\u08FF"
+    r"\uFB50-\uFDFF"
+    r"\uFE70-\uFEFF"
+)
+_LATIN = r"a-zA-Z"
+_DIGITS = r"0-9\u0660-\u0669\u06F0-\u06F9"
+_COMPACT_CHARSET = _ARABIC + _LATIN + _DIGITS
+_FULL_CHARSET = r"\w"
+_COMPACT_RE = re.compile(f"[{_COMPACT_CHARSET}]+|[^{_COMPACT_CHARSET}\\s]|\\s+")
+_COMPACT_SPLIT_RE = re.compile(f"[{_ARABIC}{_LATIN}]+|[{_DIGITS}]+|[^{_COMPACT_CHARSET}\\s]|\\s+")
+_FULL_RE = re.compile(r"\w+|[^\w\s]|\s+")
+_FULL_SPLIT_RE = re.compile(r"[^\W\d]+|\d+|[^\w\s]|\s+")
+def simple_word_tokenize(sentence, split_digits=False, mode="compact"):
+    """Tokenize a sentence by splitting on whitespace and separating punctuation.
+    Args:
+        sentence: Sentence to tokenize.
+        split_digits: Split digits from letters. Defaults to False.
+        mode: "compact" (Arabic + Latin + digits) or "full" (all Unicode).
+            Defaults to "compact".
+    Returns:
+        List of tokens.
+    """
+    if mode == "compact":
+        if split_digits:
+            return _COMPACT_SPLIT_RE.findall(sentence)
+        return _COMPACT_RE.findall(sentence)
+    elif mode == "full":
+        if split_digits:
+            return _FULL_SPLIT_RE.findall(sentence)
+        return _FULL_RE.findall(sentence)
+    else:
+        raise ValueError(f"Unknown mode: {mode}. Use 'compact' or 'full'.")

{dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: dalla-data-processing
-Version: 0.0.10
+Version: 0.0.12
 Summary: data processing pipeline with deduplication, stemming, quality checking, and readability scoring, used for the DALLA Models
 Author-email: Hadi Hamoud <hhamoud@dohainstitute.edu.qa>, Digital Research Unit - Arab Center <dru@dohainstitute.edu.qa>
 License: CC-BY-NC-SA-4.0
@@ -57,7 +57,7 @@ A comprehensive Arabic data processing pipeline with deduplication, stemming, qu
 - **Linux**: Fully supported
 - **macOS**: Fully supported (Intel or through rosetta)
-- **Windows**: Supported through WSL (Windows Subsystem for Linux) only, for native windows: manual build from source works for deduplication.
+- **Windows**: Supported through WSL, for native windows: manual build from source works for deduplication.
 ## Installation
@@ -98,7 +98,7 @@ pip install "dalla-data-processing[dedup,stem,quality]"
 ### Development Installation
-<b>From Source (with uv - recommended)</b>
+<b>From Source (with uv)</b>
 ```bash
 git clone https://github.com/U4RASD/dalla-data-processing.git
@@ -148,6 +148,6 @@ Pack and prepare datasets for training.
 ## Links
-- Homepage: https://github.com/U4RASD/dalla-data-processing
-- Issues: https://github.com/U4RASD/dalla-data-processing/issues
+- Homepage: https://acrps.ai
 - Documentation: https://github.com/U4RASD/dalla-data-processing#readme
+- ACRPS: https://acr.ps

{dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/dalla_data_processing.egg-info/SOURCES.txt RENAMED Viewed

@@ -65,6 +65,7 @@ dalla_data_processing/quality/__init__.py
 dalla_data_processing/quality/checker.py
 dalla_data_processing/readability/README.md
 dalla_data_processing/readability/__init__.py
+dalla_data_processing/readability/arabic_flesch.py
 dalla_data_processing/readability/ranking.py
 dalla_data_processing/readability/scorer.py
 dalla_data_processing/stemming/README.md

{dalla_data_processing-0.0.10 → dalla_data_processing-0.0.12}/scripts/build_onion.sh RENAMED Viewed

@@ -12,13 +12,14 @@ NC='\033[0m' # No Color
 echo -e "${GREEN}=== Building Onion Binary ===${NC}"
-# Get script directory and project root
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
 PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
 ONION_SOURCE="$PROJECT_ROOT/dalla_data_processing/deduplication/onion/src_sc"
 OUTPUT_DIR="$PROJECT_ROOT/dalla_data_processing/deduplication/bin"
-# Check if source exists
 if [ ! -d "$ONION_SOURCE" ]; then
     echo -e "${RED}Error: Onion source not found at $ONION_SOURCE${NC}"
     exit 1
@@ -33,7 +34,7 @@ if ! command -v g++ &> /dev/null; then
     exit 1
 fi
-# Check for Google sparsehash
 echo -e "${YELLOW}Checking for Google sparsehash...${NC}"
 if ! echo '#include <google/sparse_hash_set>' | g++ -x c++ -c - -o /dev/null 2>/dev/null; then
     echo -e "${YELLOW}Warning: Google sparsehash headers not found${NC}"

dalla_data_processing-0.0.10/dalla_data_processing/_version.py DELETED Viewed

@@ -1,34 +0,0 @@
-# file generated by setuptools-scm
-# don't change, don't track in version control
-__all__ = [
-    "__version__",
-    "__version_tuple__",
-    "version",
-    "version_tuple",
-    "__commit_id__",
-    "commit_id",
-]
-TYPE_CHECKING = False
-if TYPE_CHECKING:
-    from typing import Tuple
-    from typing import Union
-    VERSION_TUPLE = Tuple[Union[int, str], ...]
-    COMMIT_ID = Union[str, None]
-else:
-    VERSION_TUPLE = object
-    COMMIT_ID = object
-version: str
-__version__: str
-__version_tuple__: VERSION_TUPLE
-version_tuple: VERSION_TUPLE
-commit_id: COMMIT_ID
-__commit_id__: COMMIT_ID
-__version__ = version = '0.0.10'
-__version_tuple__ = version_tuple = (0, 0, 10)
-__commit_id__ = commit_id = 'gcc87ead80'

dalla_data_processing-0.0.10/dalla_data_processing/utils/tokenize.py DELETED Viewed

@@ -1,89 +0,0 @@
-# MIT License
-#
-# Copyright 2018-2024 New York University Abu Dhabi
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy
-# of this software and associated documentation files (the "Software"), to deal
-# in the Software without restriction, including without limitation the rights
-# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-# copies of the Software, and to permit persons to whom the Software is
-# furnished to do so, subject to the following conditions:
-#
-# The above copyright notice and this permission notice shall be included in
-# all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-# SOFTWARE.
-"""This module contains utilities for word-boundary tokenization."""
-import re
-from camel_tools.utils.charsets import (
-    EMOJI_MULTICHAR_CHARSET,
-    UNICODE_LETTER_CHARSET,
-    UNICODE_LETTER_MARK_NUMBER_CHARSET,
-    UNICODE_MARK_CHARSET,
-    UNICODE_NUMBER_CHARSET,
-    UNICODE_PUNCT_SYMBOL_CHARSET,
-)
-__all__ = ["simple_word_tokenize"]
-_ALL_PUNCT_SYMBOLS = UNICODE_PUNCT_SYMBOL_CHARSET | EMOJI_MULTICHAR_CHARSET
-_ALL_PUNCT_SYMBOLS = [re.escape(x) for x in _ALL_PUNCT_SYMBOLS]
-_ALL_PUNCT_SYMBOLS = sorted(_ALL_PUNCT_SYMBOLS, key=len, reverse=True)
-_WHITESPACE_RE = r"\s+"
-_ALL_NUMBER = "".join(UNICODE_NUMBER_CHARSET)
-_ALL_LETTER_MARK = "".join(UNICODE_LETTER_CHARSET | UNICODE_MARK_CHARSET)
-_ALL_LETTER_MARK_NUMBER = "".join(UNICODE_LETTER_MARK_NUMBER_CHARSET)
-_TOKENIZE_RE = re.compile(
-    "|".join(_ALL_PUNCT_SYMBOLS)
-    + r"|["
-    + re.escape(_ALL_LETTER_MARK_NUMBER)
-    + r"]+|"
-    + _WHITESPACE_RE
-)
-_TOKENIZE_NUMBER_RE = re.compile(
-    "|".join(_ALL_PUNCT_SYMBOLS)
-    + r"|["
-    + re.escape(_ALL_NUMBER)
-    + r"]+|["
-    + re.escape(_ALL_LETTER_MARK)
-    + r"]+"
-)
-def simple_word_tokenize(sentence, split_digits=False):
-    """Tokenizes a sentence by splitting on whitespace and seperating
-    punctuation. The resulting tokens are either alpha-numeric words, single
-    punctuation/symbol/emoji characters, or multi-character emoji sequences.
-    This function is language agnostic and splits all characters marked as
-    punctuation or symbols in the Unicode specification.
-    For example, tokenizing :code:`'Hello,    world!!!'`
-    would yield :code:`['Hello', ',', 'world', '!', '!', '!']`.
-    If split_digits is set to True, it also splits on number.
-    For example, tokenizing :code:`'Hello,    world123!!!'`
-    would yield :code:`['Hello', ',', 'world', '123', '!', '!', '!']`.
-    Args:
-        sentence (:obj:`str`): Sentence to tokenize.
-        split_digits (:obj:`bool`, optional): The flag to split on number.
-            Defaults to False.
-    Returns:
-        :obj:`list` of :obj:`str`: The list of tokens.
-    """
-    if split_digits:
-        return _TOKENIZE_NUMBER_RE.findall(sentence)
-    else:
-        return _TOKENIZE_RE.findall(sentence)