PyPI - dalla-data-processing - Versions diffs - 0.0.1__py3-none-any.whl - Mend

dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

dalla/__init__.py +27 -0
dalla/cli.py +453 -0
dalla/core/__init__.py +6 -0
dalla/core/dataset.py +387 -0
dalla/core/parallel.py +279 -0
dalla/deduplication/__init__.py +370 -0
dalla/deduplication/bin/.gitignore +1 -0
dalla/deduplication/bin/onion-linux-x86_64 +0 -0
dalla/deduplication/onion/COPYING +24 -0
dalla/deduplication/onion/Makefile +21 -0
dalla/deduplication/onion/Makefile.config +3 -0
dalla/deduplication/onion/README.md +21 -0
dalla/deduplication/onion/src/Makefile +22 -0
dalla/deduplication/onion/src/Makefile.g +23 -0
dalla/deduplication/onion/src/buzhash.c +325 -0
dalla/deduplication/onion/src/buzhash.h +30 -0
dalla/deduplication/onion/src/hashdup.c +172 -0
dalla/deduplication/onion/src/hashgen.c +206 -0
dalla/deduplication/onion/src/onion +0 -0
dalla/deduplication/onion/src/onion.c +799 -0
dalla/deduplication/onion/src/onion_dup.c +824 -0
dalla/deduplication/onion/src/version.c +17 -0
dalla/deduplication/onion/src/version.h +10 -0
dalla/deduplication/onion/src_sc/Makefile +22 -0
dalla/deduplication/onion/src_sc/Makefile.g +23 -0
dalla/deduplication/onion/src_sc/buzhash.c +325 -0
dalla/deduplication/onion/src_sc/buzhash.h +30 -0
dalla/deduplication/onion/src_sc/hashdup +0 -0
dalla/deduplication/onion/src_sc/hashdup.c +172 -0
dalla/deduplication/onion/src_sc/hashgen +0 -0
dalla/deduplication/onion/src_sc/hashgen.c +206 -0
dalla/deduplication/onion/src_sc/onion.c +854 -0
dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
dalla/deduplication/onion/src_sc/version.c +17 -0
dalla/deduplication/onion/src_sc/version.h +10 -0
dalla/deduplication/onion_wrapper.py +223 -0
dalla/deduplication/postprocessing.py +216 -0
dalla/deduplication/preprocessing.py +120 -0
dalla/quality/__init__.py +5 -0
dalla/quality/checker.py +354 -0
dalla/readability/__init__.py +197 -0
dalla/readability/ranking.py +165 -0
dalla/readability/scorer.py +148 -0
dalla/stemming/__init__.py +551 -0
dalla/stemming/data/words_al.txt +3414 -0
dalla/stemming/data/words_al_t.txt +885 -0
dalla/stemming/data/words_t.txt +7 -0
dalla/utils/__init__.py +10 -0
dalla/utils/logger.py +128 -0
dalla/utils/tokenize.py +89 -0
dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0

dalla/quality/checker.py ADDED Viewed

@@ -0,0 +1,354 @@
+"""
+Quality checking implementation for Arabic text using CAMEL Tools.
+This module provides quality assessment by analyzing morphological features
+and detecting errors in Arabic text.
+"""
+import re
+from concurrent.futures import ThreadPoolExecutor
+from concurrent.futures import TimeoutError as FutureTimeoutError
+from types import MethodType
+from typing import Any
+from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
+from camel_tools.disambig.mle import MLEDisambiguator
+from datasets import Dataset
+from dalla.core.parallel import ParallelProcessor
+from dalla.utils.logger import get_logger
+# Module-level logger obtained from the package's logging helper.
+logger = get_logger(__name__)
+# Word splitter: matches runs of ASCII digits, whitespace and common ASCII
+# symbols, or runs of brackets and Latin/Arabic punctuation marks.
+WORD_DELIMITERS = re.compile(r'[0-9#%?:\-+=~()\s\'"/\\*]+|[\[\]{}<>﴿﴾,.٫٪؟«»،؛]+')
+# Sentence splitter: matches runs of line breaks and Latin/Arabic punctuation
+# ("." appears twice in the character class, which has no effect).
+SENTENCE_DELIMITERS = re.compile(r"[?\n\r.;:,.٫٪؟«»،؛]+")
+class QualityChecker:
+    """
+    Quality checker for Arabic text using CAMEL Tools.
+    A text is split into sentence-like segments and then into words. Every word
+    is passed to a CAMEL Tools disambiguator and counted as an error when it has
+    no analyses, when its top analysis has the gloss "NO_ANALYSIS", or when the
+    gloss is identical to the word itself. Error words written in Arabic script
+    are tallied in ``erroneous_words``.
+    NOTE(review): ``erroneous_words`` lives on the instance; if the checker is
+    used from several worker processes, each process presumably updates its own
+    copy - confirm against the caller.
+    """
+    def __init__(self, timeout: int = 3600, model: str = "mle", use_gpu: bool = False):
+        """
+        Initialize quality checker and load the disambiguator.
+        Args:
+            timeout: Maximum time in seconds for processing a single text (default: 3600)
+            model: Disambiguator model to use - "mle" or "bert" (default: "mle")
+            use_gpu: Whether to use GPU for BERT model (default: False)
+        Raises:
+            ValueError: If ``model`` is neither "mle" nor "bert" (case-insensitive)
+        """
+        self.timeout = timeout
+        self.model = model.lower()
+        self.use_gpu = use_gpu
+        self.disambiguator = None
+        self.erroneous_words: dict[str, int] = {}
+        if self.model not in ["mle", "bert"]:
+            raise ValueError(f"Invalid model '{model}'. Must be 'mle' or 'bert'")
+        logger.info(f"Initializing CAMEL Tools {self.model.upper()} disambiguator...")
+        if self.model == "bert" and self.use_gpu:
+            logger.info("GPU mode enabled for BERT")
+        self._init_disambiguator()
+    def _init_disambiguator(self):
+        """Load the pretrained disambiguator and install a caching score function."""
+        if self.model == "mle":
+            self.disambiguator = MLEDisambiguator.pretrained()
+            logger.info("MLE disambiguator loaded")
+        else:
+            self.disambiguator = BERTUnfactoredDisambiguator.pretrained(use_gpu=self.use_gpu)
+            logger.info(f"BERT disambiguator loaded (GPU: {self.use_gpu})")
+        # Memoising wrapper: ``_scored_analyses`` is only called on a cache miss.
+        # NOTE(review): relies on the disambiguator's private ``_cache``,
+        # ``_scored_analyses`` and ``_score_fn`` attributes - confirm that both
+        # the MLE and the BERT classes provide them with this call signature.
+        def cached_scored_analysis(disambiguator, word_dd):
+            if word_dd in disambiguator._cache:
+                return disambiguator._cache[word_dd]
+            result = disambiguator._scored_analyses(word_dd)
+            disambiguator._cache[word_dd] = result
+            return result
+        self.disambiguator._scored_analyses_cached = MethodType(
+            cached_scored_analysis, self.disambiguator
+        )
+        self.disambiguator._score_fn = self.disambiguator._scored_analyses_cached
+        logger.info("Disambiguator initialized with caching enabled")
+    @staticmethod
+    def is_arabic(word: str) -> bool:
+        """
+        Check if a word is written in Arabic script.
+        Args:
+            word: Word to check
+        Returns:
+            True if every character falls in one of the Arabic Unicode blocks
+            listed below and the word is not made up solely of Arabic-Indic
+            digits. An empty string yields False.
+        """
+        arabic_ranges = (
+            (0x0600, 0x06FF),  # Arabic
+            (0x0750, 0x077F),  # Arabic Supplement
+            (0x08A0, 0x08FF),  # Arabic Extended-A
+            (0xFB50, 0xFDFF),  # Arabic Presentation Forms-A
+            (0xFE70, 0xFEFF),  # Arabic Presentation Forms-B
+        )
+        arabic_numbers = range(0x0660, 0x066A)
+        code_points = [ord(char) for char in word]
+        only_arabic = all(
+            any(start <= cp <= end for start, end in arabic_ranges) for cp in code_points
+        )
+        # all() over an empty sequence is True, so "" counts as digits-only here.
+        only_digits = all(cp in arabic_numbers for cp in code_points)
+        return only_arabic and not only_digits
+    def process_content(
+        self, content: str, erroneous_words: dict[str, int]
+    ) -> tuple[int, int, int, int]:
+        """
+        Process one segment of text and count its erroneous words.
+        Args:
+            content: Text content to process
+            erroneous_words: Dictionary updated in place with the occurrence
+                count of every Arabic-script error word
+        Returns:
+            Tuple of (total_words, error_count, no_analysis_count, foreign_count)
+        """
+        words = [word for word in WORD_DELIMITERS.split(content) if word]
+        if not words:
+            return 0, 0, 0, 0
+        morph_features = self.disambiguator.disambiguate(words)
+        total_words = len(morph_features)
+        err_count = 0
+        err_no_analysis = 0
+        err_foreign = 0
+        for i, word in enumerate(words):
+            features = morph_features[i]
+            if features is not None and len(features.analyses) > 0:
+                gloss = features.analyses[0].analysis["gloss"]
+                if gloss == "NO_ANALYSIS":
+                    err_no_analysis += 1
+                elif gloss == word:
+                    # A gloss identical to the word is counted as foreign.
+                    err_foreign += 1
+                else:
+                    continue  # the word was analysed successfully
+            # Reached for every kind of error, including words with no analyses
+            # at all (those are counted only in the overall error total).
+            err_count += 1
+            if self.is_arabic(word):
+                erroneous_words[word] = erroneous_words.get(word, 0) + 1
+        return total_words, err_count, err_no_analysis, err_foreign
+    def process_full_content(
+        self, content: str, erroneous_words: dict[str, int]
+    ) -> tuple[float, float, float]:
+        """
+        Process full content by splitting it into sentence-like segments.
+        Args:
+            content: Full text content
+            erroneous_words: Dictionary updated in place with erroneous words
+        Returns:
+            Tuple of (quality_score, arabic_error_percent, foreign_error_percent),
+            each on a 0-100 scale; (0.0, 0.0, 0.0) when the text has no words
+        """
+        total_words = 0
+        err_count = 0
+        err_no_analysis = 0
+        err_foreign = 0
+        for sentence in SENTENCE_DELIMITERS.split(content):
+            if not sentence.strip():
+                continue
+            t, ec, ena, ef = self.process_content(sentence, erroneous_words)
+            total_words += t
+            err_count += ec
+            err_no_analysis += ena
+            err_foreign += ef
+        if total_words == 0:
+            return 0.0, 0.0, 0.0
+        quality_score = 100 * (1 - (err_count / total_words))
+        score_ar = 100 * (err_no_analysis / total_words)
+        score_foreign = 100 * (err_foreign / total_words)
+        return quality_score, score_ar, score_foreign
+    def check_text_quality(
+        self, text: str, erroneous_words: dict[str, int] | None = None
+    ) -> dict[str, Any]:
+        """
+        Check quality of a single text with timeout protection.
+        Args:
+            text: Text to check
+            erroneous_words: Optional dictionary to track erroneous words
+        Returns:
+            Dictionary with the keys "quality_score", "arabic_error_percent",
+            "foreign_error_percent", "error_code" and "error_message".
+            "error_code" is 0 on success, -1 for empty or non-string input,
+            -2 when processing raised an exception and -3 on timeout.
+        """
+        if erroneous_words is None:
+            erroneous_words = {}
+        result = {
+            "quality_score": 0.0,
+            "arabic_error_percent": 0.0,
+            "foreign_error_percent": 0.0,
+            "error_code": 0,
+            "error_message": None,
+        }
+        if not text or not isinstance(text, str):
+            result["error_code"] = -1
+            result["error_message"] = "Empty or invalid text"
+            return result
+        # The executor is managed by hand instead of with a ``with`` block:
+        # leaving a ``with`` block calls shutdown(wait=True), which blocks until
+        # the worker finishes and would therefore defeat the timeout.
+        executor = ThreadPoolExecutor(max_workers=1)
+        try:
+            future = executor.submit(self.process_full_content, text, erroneous_words)
+            try:
+                quality_score, score_ar, score_foreign = future.result(timeout=self.timeout)
+            except FutureTimeoutError:
+                result["error_code"] = -3
+                result["error_message"] = f"Processing timeout ({self.timeout}s)"
+                logger.warning(f"Text processing timeout after {self.timeout}s")
+            except Exception as e:
+                result["error_code"] = -2
+                result["error_message"] = f"Processing error: {str(e)}"
+                logger.error(f"Error processing text: {e}")
+            else:
+                result["quality_score"] = quality_score
+                result["arabic_error_percent"] = score_ar
+                result["foreign_error_percent"] = score_foreign
+        finally:
+            # Do not wait for a timed-out worker. A running thread cannot be
+            # killed, so it carries on in the background until it completes.
+            executor.shutdown(wait=False, cancel_futures=True)
+        return result
+    def process_example(self, example: dict[str, Any], column: str) -> dict[str, Any]:
+        """
+        Process a single example from dataset.
+        Args:
+            example: Dataset example
+            column: Column name to process
+        Returns:
+            The same example with "quality_score", "arabic_error_percent",
+            "foreign_error_percent" and "quality_error_code" added;
+            "quality_error_message" is added only when a message was produced
+        """
+        text = example.get(column, "")
+        result = self.check_text_quality(text, self.erroneous_words)
+        example["quality_score"] = result["quality_score"]
+        example["arabic_error_percent"] = result["arabic_error_percent"]
+        example["foreign_error_percent"] = result["foreign_error_percent"]
+        example["quality_error_code"] = result["error_code"]
+        if result["error_message"]:
+            example["quality_error_message"] = result["error_message"]
+        return example
+    def get_erroneous_words(self) -> dict[str, int]:
+        """
+        Get dictionary of erroneous words found during processing.
+        Returns:
+            A copy of the mapping from erroneous word to occurrence count
+        """
+        return self.erroneous_words.copy()
+def check_quality(
+    dataset: Dataset,
+    column: str = "text",
+    min_score: float = 0.0,
+    save_errors: bool = False,
+    num_workers: int | None = None,
+    timeout: int = 3600,
+    model: str = "mle",
+    use_gpu: bool = False,
+) -> Dataset:
+    """
+    Check quality of texts in dataset and add quality score columns.
+    Args:
+        dataset: HuggingFace dataset
+        column: Column name to check
+        min_score: Minimum quality score to keep (0-100); filtering is applied
+            only when this is greater than 0
+        save_errors: Whether to save erroneous words (logged if True)
+        num_workers: Number of parallel workers (None for auto)
+        timeout: Timeout per text in seconds
+        model: Disambiguator model - "mle" or "bert" (default: "mle")
+        use_gpu: Use GPU for BERT model (default: False, only for model="bert")
+    Returns:
+        Dataset with quality score columns added (and optionally filtered).
+        An empty dataset is returned as produced by ``dataset.map`` without
+        computing statistics.
+    Raises:
+        ValueError: If ``column`` is not in the dataset, or if ``model`` is
+            rejected by ``QualityChecker``
+    Example:
+        >>> # Using MLE (default, faster)
+        >>> scored = check_quality(dataset, min_score=50.0)
+        >>> # Using BERT (more accurate, slower)
+        >>> scored = check_quality(dataset, model="bert", use_gpu=True)
+        >>> # Columns added: quality_score, arabic_error_percent,
+        >>> #                foreign_error_percent, quality_error_code
+    """
+    logger.info(f"Checking quality of {len(dataset)} examples")
+    logger.info(f"Model: {model.upper()}, Column: {column}, Min score: {min_score}")
+    # Compare case-insensitively, matching how QualityChecker normalises the model.
+    logger.info(f"Timeout: {timeout}s, GPU: {use_gpu if model.lower() == 'bert' else 'N/A'}")
+    if column not in dataset.column_names:
+        raise ValueError(f"Column '{column}' not found in dataset")
+    checker = QualityChecker(timeout=timeout, model=model, use_gpu=use_gpu)
+    num_workers = ParallelProcessor.get_optimal_num_workers(num_workers)
+    logger.info(f"Processing with {num_workers} workers")
+    processed_dataset = dataset.map(
+        lambda example: checker.process_example(example, column),
+        num_proc=num_workers,
+        desc="Quality checking",
+    )
+    original_size = len(dataset)
+    if original_size == 0:
+        # Nothing was scored; the average and the removal percentage below
+        # would otherwise divide by zero.
+        logger.warning("Dataset is empty; no quality scores computed")
+        return processed_dataset
+    avg_score = sum(processed_dataset["quality_score"]) / len(processed_dataset)
+    logger.info(f"Average quality score: {avg_score:.2f}")
+    if min_score > 0:
+        logger.info(f"Filtering examples with score < {min_score}")
+        processed_dataset = processed_dataset.filter(
+            lambda x: x["quality_score"] >= min_score,
+            num_proc=num_workers,
+            desc=f"Filtering (min_score={min_score})",
+        )
+        filtered_size = len(processed_dataset)
+        removed = original_size - filtered_size
+        logger.info(
+            f"Removed {removed:,} low-quality examples ({removed / original_size * 100:.1f}%)"
+        )
+        logger.info(f"Final dataset size: {filtered_size:,}")
+    if save_errors:
+        # NOTE(review): with more than one worker the map presumably runs in
+        # separate processes, so the counts collected on this process's
+        # ``checker`` may be empty - confirm.
+        erroneous_words = checker.get_erroneous_words()
+        logger.info(f"Found {len(erroneous_words)} unique erroneous words")
+        if erroneous_words:
+            sorted_errors = sorted(erroneous_words.items(), key=lambda x: x[1], reverse=True)[:20]
+            logger.info("Top 20 erroneous words:")
+            for word, count in sorted_errors:
+                logger.info(f"  {word}: {count}")
+    return processed_dataset

dalla/readability/__init__.py ADDED Viewed

@@ -0,0 +1,197 @@
+"""Readability scoring and ranking module using textstat."""
+from datasets import Dataset
+from dalla.readability.ranking import compute_ranks_and_levels
+from dalla.readability.scorer import ReadabilityScorer
+from dalla.utils.logger import get_logger
+logger = get_logger(__name__)
+def score_readability(
+    dataset: Dataset,
+    column: str = "text",
+    add_ranks: bool = True,
+    num_proc: int | None = None,
+) -> Dataset:
+    """
+    Score readability using Flesch and Osman methods, with optional ranking.
+    Adds columns to dataset:
+    - flesch_score: Flesch Reading Ease score
+    - osman_score: Osman readability score
+    Both are None for examples whose text is missing or empty.
+    If add_ranks=True, also adds (computed across entire dataset):
+    - flesch_rank: Flesch rank (1 = lowest score)
+    - osman_rank: Osman rank (1 = lowest score)
+    - readability_level: Final readability level (0-4)
+    Args:
+        dataset: HuggingFace dataset
+        column: Column to score
+        add_ranks: Whether to add ranking columns (default: True)
+        num_proc: Number of parallel processes
+    Returns:
+        Dataset with readability scores and optional rankings
+    Example:
+        >>> from dalla.readability import score_readability
+        >>> scored = score_readability(dataset)
+        >>> # Columns: flesch_score, osman_score, readability_level, etc.
+    """
+    logger.info(f"Scoring readability of {len(dataset)} examples")
+    logger.info(f"Column: {column}, Add ranks: {add_ranks}, Workers: {num_proc or 'auto'}")
+    # Initialize scorer
+    logger.info("Initializing readability scorer...")
+    ReadabilityScorer()  # Initialize to verify dependencies are available
+    logger.info("Scorer ready")
+    # Step 1: Score all texts
+    logger.info("Calculating Flesch and Osman scores...")
+    def score_example(example):
+        # Create scorer inside worker (for multiprocessing compatibility)
+        from dalla.readability.scorer import ReadabilityScorer
+        worker_scorer = ReadabilityScorer()
+        text = example.get(column, "")
+        if not text:
+            example["osman_score"] = None
+            example["flesch_score"] = None
+            return example
+        osman_score, flesch_score = worker_scorer.score_text(text)
+        example["osman_score"] = osman_score
+        example["flesch_score"] = flesch_score
+        return example
+    scored_dataset = dataset.map(score_example, num_proc=num_proc, desc="Scoring readability")
+    # Count how many valid scores we got
+    total_count = len(scored_dataset)
+    valid_count = sum(
+        1
+        for ex in scored_dataset
+        if ex.get("osman_score") is not None and ex.get("flesch_score") is not None
+    )
+    # The summary is logged once. The percentage is only computed in the branch
+    # where valid_count < total_count, so total_count cannot be zero there.
+    logger.info(f"Scoring complete for {total_count} examples")
+    if valid_count == total_count:
+        logger.info(f"Successfully scored all {valid_count} examples")
+    else:
+        logger.info(
+            f"Valid scores: {valid_count}/{total_count} ({valid_count / total_count * 100:.1f}%)"
+        )
+        if valid_count == 0:
+            logger.error(
+                "Failed to calculate scores for any examples. "
+                "This indicates a problem with the text or textstat library."
+            )
+            logger.warning(
+                "No valid readability scores calculated. "
+                "Common causes: text too short (< 2 sentences), "
+                "no complete sentences, or special characters only."
+            )
+    # Step 2: Add ranks if requested
+    if add_ranks:
+        logger.info("Computing ranks and readability levels...")
+        scored_dataset = _add_ranks_to_dataset(scored_dataset)
+        logger.info("Ranks and levels added")
+    logger.info("Readability scoring complete!")
+    return scored_dataset
+def _add_ranks_to_dataset(dataset: Dataset) -> Dataset:
+    """
+    Add ranking columns to dataset based on scores.
+    This computes ranks across the entire dataset and adds:
+    - osman_rank, flesch_rank
+    - readability_level (final 0-4 level)
+    Examples that lack either score get None in all three columns. When no
+    example has valid scores, the columns are still added, filled with None.
+    Args:
+        dataset: Dataset with osman_score and flesch_score columns
+    Returns:
+        Dataset with ranking columns added
+    """
+    # Extract scores, remembering which rows they came from
+    osman_scores = []
+    flesch_scores = []
+    valid_indices = []
+    for i, example in enumerate(dataset):
+        o_score = example.get("osman_score")
+        f_score = example.get("flesch_score")
+        # Only include examples with valid scores
+        if o_score is not None and f_score is not None:
+            osman_scores.append(float(o_score))
+            flesch_scores.append(float(f_score))
+            valid_indices.append(i)
+    logger.info(f"Computing ranks for {len(valid_indices)} valid examples")
+    # Mapping from row index to its rank data; stays empty without valid scores
+    rank_data = {}
+    final_levels = []
+    if osman_scores:
+        # Compute ranks and levels
+        o_ranks, f_ranks, final_levels = compute_ranks_and_levels(osman_scores, flesch_scores)
+        for idx, o_r, f_r, final_lvl in zip(
+            valid_indices,
+            o_ranks,
+            f_ranks,
+            final_levels,
+            strict=False,
+        ):
+            rank_data[idx] = {
+                "osman_rank": o_r,
+                "flesch_rank": f_r,
+                "readability_level": final_lvl,
+            }
+    else:
+        logger.error("No valid scores found - cannot compute ranks")
+        logger.error(
+            f"All {len(dataset)} examples have None scores. "
+            "This should not happen with the fallback scoring system. "
+            "Please report this as a bug."
+        )
+        # Fall through instead of returning here, so the rank columns are still
+        # added (as None) and callers always see the same schema.
+    # Add columns to dataset
+    def add_rank_columns(example, idx):
+        if idx in rank_data:
+            example.update(rank_data[idx])
+        else:
+            # No valid scores - set to None
+            example["osman_rank"] = None
+            example["flesch_rank"] = None
+            example["readability_level"] = None
+        return example
+    dataset = dataset.map(add_rank_columns, with_indices=True, desc="Adding ranks")
+    # Log statistics
+    if final_levels:
+        level_counts = [final_levels.count(i) for i in range(5)]
+        logger.info("Readability level distribution:")
+        for i, count in enumerate(level_counts):
+            pct = (count / len(final_levels)) * 100
+            logger.info(f"  Level {i}: {count:,} ({pct:.1f}%)")
+    return dataset
+# Names exported by ``from dalla.readability import *``.
+__all__ = ["score_readability"]