PyPI - dalla-data-processing - Versions diffs - 0.0.1__py3-none-any.whl - Mend

dalla-data-processing 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (55) hide show

dalla/__init__.py +27 -0
dalla/cli.py +453 -0
dalla/core/__init__.py +6 -0
dalla/core/dataset.py +387 -0
dalla/core/parallel.py +279 -0
dalla/deduplication/__init__.py +370 -0
dalla/deduplication/bin/.gitignore +1 -0
dalla/deduplication/bin/onion-linux-x86_64 +0 -0
dalla/deduplication/onion/COPYING +24 -0
dalla/deduplication/onion/Makefile +21 -0
dalla/deduplication/onion/Makefile.config +3 -0
dalla/deduplication/onion/README.md +21 -0
dalla/deduplication/onion/src/Makefile +22 -0
dalla/deduplication/onion/src/Makefile.g +23 -0
dalla/deduplication/onion/src/buzhash.c +325 -0
dalla/deduplication/onion/src/buzhash.h +30 -0
dalla/deduplication/onion/src/hashdup.c +172 -0
dalla/deduplication/onion/src/hashgen.c +206 -0
dalla/deduplication/onion/src/onion +0 -0
dalla/deduplication/onion/src/onion.c +799 -0
dalla/deduplication/onion/src/onion_dup.c +824 -0
dalla/deduplication/onion/src/version.c +17 -0
dalla/deduplication/onion/src/version.h +10 -0
dalla/deduplication/onion/src_sc/Makefile +22 -0
dalla/deduplication/onion/src_sc/Makefile.g +23 -0
dalla/deduplication/onion/src_sc/buzhash.c +325 -0
dalla/deduplication/onion/src_sc/buzhash.h +30 -0
dalla/deduplication/onion/src_sc/hashdup +0 -0
dalla/deduplication/onion/src_sc/hashdup.c +172 -0
dalla/deduplication/onion/src_sc/hashgen +0 -0
dalla/deduplication/onion/src_sc/hashgen.c +206 -0
dalla/deduplication/onion/src_sc/onion.c +854 -0
dalla/deduplication/onion/src_sc/onion_dup.c +824 -0
dalla/deduplication/onion/src_sc/version.c +17 -0
dalla/deduplication/onion/src_sc/version.h +10 -0
dalla/deduplication/onion_wrapper.py +223 -0
dalla/deduplication/postprocessing.py +216 -0
dalla/deduplication/preprocessing.py +120 -0
dalla/quality/__init__.py +5 -0
dalla/quality/checker.py +354 -0
dalla/readability/__init__.py +197 -0
dalla/readability/ranking.py +165 -0
dalla/readability/scorer.py +148 -0
dalla/stemming/__init__.py +551 -0
dalla/stemming/data/words_al.txt +3414 -0
dalla/stemming/data/words_al_t.txt +885 -0
dalla/stemming/data/words_t.txt +7 -0
dalla/utils/__init__.py +10 -0
dalla/utils/logger.py +128 -0
dalla/utils/tokenize.py +89 -0
dalla_data_processing-0.0.1.dist-info/METADATA +393 -0
dalla_data_processing-0.0.1.dist-info/RECORD +55 -0
dalla_data_processing-0.0.1.dist-info/WHEEL +5 -0
dalla_data_processing-0.0.1.dist-info/entry_points.txt +2 -0
dalla_data_processing-0.0.1.dist-info/top_level.txt +1 -0

dalla/core/dataset.py ADDED Viewed

@@ -0,0 +1,387 @@
+"""
+Dataset I/O utilities for unified HuggingFace dataset handling.
+This module provides a consistent interface for loading, saving, and manipulating
+HuggingFace datasets across all dalla-process components.
+"""
+from collections.abc import Callable
+from pathlib import Path
+from typing import Any
+from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk
+from dalla.utils.logger import get_logger
+logger = get_logger(__name__)
+class DatasetManager:
+    """Unified manager for HuggingFace dataset operations."""
+    @staticmethod
+    def load(
+        path: str | Path,
+        split: str | None = None,
+        streaming: bool = False,
+    ) -> Dataset | DatasetDict:
+        """
+        Load a HuggingFace dataset from disk.
+        Args:
+            path: Path to the dataset directory
+            split: Optional split name to load (e.g., 'train', 'test')
+            streaming: Whether to use streaming mode for large datasets
+        Returns:
+            Dataset or DatasetDict depending on the structure
+        Example:
+            >>> dm = DatasetManager()
+            >>> dataset = dm.load("./data/my_dataset")
+            >>> train_data = dm.load("./data/my_dataset", split="train")
+        """
+        path = Path(path)
+        if not path.exists():
+            raise FileNotFoundError(f"Dataset path does not exist: {path}")
+        logger.info(f"Loading dataset from {path}")
+        dataset = load_from_disk(str(path))
+        if split is not None:
+            if isinstance(dataset, DatasetDict):
+                if split not in dataset:
+                    raise ValueError(
+                        f"Split '{split}' not found. Available splits: {list(dataset.keys())}"
+                    )
+                dataset = dataset[split]
+            else:
+                logger.warning(f"Split '{split}' specified but dataset has no splits")
+        logger.info(f"Loaded dataset with {DatasetManager.get_size(dataset)} examples")
+        return dataset
+    @staticmethod
+    def save(
+        dataset: Dataset | DatasetDict,
+        path: str | Path,
+        overwrite: bool = False,
+    ) -> None:
+        """
+        Save a HuggingFace dataset to disk.
+        Args:
+            dataset: Dataset or DatasetDict to save
+            path: Path where the dataset will be saved
+            overwrite: Whether to overwrite existing dataset
+        Example:
+            >>> dm = DatasetManager()
+            >>> dm.save(processed_dataset, "./data/processed")
+        """
+        path = Path(path)
+        if path.exists() and not overwrite:
+            raise FileExistsError(
+                f"Dataset path already exists: {path}. Use overwrite=True to replace."
+            )
+        logger.info(f"Saving dataset to {path}")
+        dataset.save_to_disk(str(path))
+        logger.info("Dataset saved successfully")
+    @staticmethod
+    def get_size(dataset: Dataset | DatasetDict) -> int:
+        """
+        Get the total number of examples in a dataset.
+        Args:
+            dataset: Dataset or DatasetDict
+        Returns:
+            Total number of examples
+        """
+        if isinstance(dataset, DatasetDict):
+            return sum(len(ds) for ds in dataset.values())
+        return len(dataset)
+    @staticmethod
+    def get_column_names(dataset: Dataset | DatasetDict) -> list[str]:
+        """
+        Get column names from a dataset.
+        Args:
+            dataset: Dataset or DatasetDict
+        Returns:
+            List of column names
+        """
+        if isinstance(dataset, DatasetDict):
+            # Get columns from first split
+            first_split = next(iter(dataset.values()))
+            return first_split.column_names
+        return dataset.column_names
+    @staticmethod
+    def add_column(
+        dataset: Dataset,
+        column_name: str,
+        data: list[Any],
+    ) -> Dataset:
+        """
+        Add a new column to a dataset.
+        Args:
+            dataset: Dataset to modify
+            column_name: Name of the new column
+            data: List of values for the new column
+        Returns:
+            Dataset with the new column added
+        Example:
+            >>> scores = [0.95, 0.87, 0.92, ...]
+            >>> dataset = dm.add_column(dataset, "quality_score", scores)
+        """
+        if len(data) != len(dataset):
+            raise ValueError(
+                f"Data length ({len(data)}) must match dataset length ({len(dataset)})"
+            )
+        logger.info(f"Adding column '{column_name}' to dataset")
+        return dataset.add_column(column_name, data)
+    @staticmethod
+    def map_column(
+        dataset: Dataset,
+        fn: Callable,
+        input_column: str,
+        output_column: str | None = None,
+        batched: bool = False,
+        batch_size: int = 1000,
+        num_proc: int | None = None,
+        desc: str | None = None,
+    ) -> Dataset:
+        """
+        Apply a function to a column in the dataset.
+        Args:
+            dataset: Dataset to process
+            fn: Function to apply to each example
+            input_column: Name of the input column
+            output_column: Name of the output column (if None, replaces input_column)
+            batched: Whether to process in batches
+            batch_size: Size of batches when batched=True
+            num_proc: Number of processes for parallel processing
+            desc: Description for progress bar
+        Returns:
+            Processed dataset
+        Example:
+            >>> def deduplicate_text(text):
+            ...     return text.strip().lower()
+            >>> dataset = dm.map_column(
+            ...     dataset,
+            ...     deduplicate_text,
+            ...     "text",
+            ...     "cleaned_text",
+            ...     num_proc=4
+            ... )
+        """
+        if input_column not in dataset.column_names:
+            raise ValueError(f"Column '{input_column}' not found in dataset")
+        output_col = output_column or input_column
+        def process_fn(examples):
+            if batched:
+                results = [fn(item) for item in examples[input_column]]
+            else:
+                results = fn(examples[input_column])
+            return {output_col: results}
+        logger.info(f"Mapping function to column '{input_column}'")
+        return dataset.map(
+            process_fn,
+            batched=batched,
+            batch_size=batch_size,
+            num_proc=num_proc,
+            desc=desc or f"Processing {input_column}",
+        )
+    @staticmethod
+    def filter_dataset(
+        dataset: Dataset,
+        fn: Callable,
+        num_proc: int | None = None,
+        desc: str | None = None,
+    ) -> Dataset:
+        """
+        Filter dataset based on a condition.
+        Args:
+            dataset: Dataset to filter
+            fn: Function that returns True for examples to keep
+            num_proc: Number of processes for parallel processing
+            desc: Description for progress bar
+        Returns:
+            Filtered dataset
+        Example:
+            >>> def is_high_quality(example):
+            ...     return example['quality_score'] > 0.8
+            >>> filtered = dm.filter_dataset(dataset, is_high_quality)
+        """
+        logger.info(f"Filtering dataset with {len(dataset)} examples")
+        filtered = dataset.filter(fn, num_proc=num_proc, desc=desc or "Filtering dataset")
+        logger.info(
+            f"Filtered to {len(filtered)} examples ({len(filtered) / len(dataset) * 100:.1f}%)"
+        )
+        return filtered
+    @staticmethod
+    def select_columns(
+        dataset: Dataset | DatasetDict,
+        columns: list[str],
+    ) -> Dataset | DatasetDict:
+        """
+        Select specific columns from a dataset.
+        Args:
+            dataset: Dataset or DatasetDict
+            columns: List of column names to keep
+        Returns:
+            Dataset with only the specified columns
+        """
+        available_columns = DatasetManager.get_column_names(dataset)
+        invalid_columns = set(columns) - set(available_columns)
+        if invalid_columns:
+            raise ValueError(f"Columns not found: {invalid_columns}")
+        logger.info(f"Selecting columns: {columns}")
+        if isinstance(dataset, DatasetDict):
+            return DatasetDict({split: ds.select_columns(columns) for split, ds in dataset.items()})
+        return dataset.select_columns(columns)
+    @staticmethod
+    def remove_columns(
+        dataset: Dataset | DatasetDict,
+        columns: list[str],
+    ) -> Dataset | DatasetDict:
+        """
+        Remove specific columns from a dataset.
+        Args:
+            dataset: Dataset or DatasetDict
+            columns: List of column names to remove
+        Returns:
+            Dataset without the specified columns
+        """
+        logger.info(f"Removing columns: {columns}")
+        if isinstance(dataset, DatasetDict):
+            return DatasetDict({split: ds.remove_columns(columns) for split, ds in dataset.items()})
+        return dataset.remove_columns(columns)
+    @staticmethod
+    def concatenate(datasets: list[Dataset]) -> Dataset:
+        """
+        Concatenate multiple datasets.
+        Args:
+            datasets: List of datasets to concatenate
+        Returns:
+            Concatenated dataset
+        """
+        if not datasets:
+            raise ValueError("Cannot concatenate empty list of datasets")
+        logger.info(f"Concatenating {len(datasets)} datasets")
+        return concatenate_datasets(datasets)
+    @staticmethod
+    def train_test_split(
+        dataset: Dataset,
+        test_size: float = 0.1,
+        seed: int = 42,
+    ) -> DatasetDict:
+        """
+        Split dataset into train and test sets.
+        Args:
+            dataset: Dataset to split
+            test_size: Fraction of data to use for testing
+            seed: Random seed for reproducibility
+        Returns:
+            DatasetDict with 'train' and 'test' splits
+        """
+        logger.info(f"Splitting dataset into train/test (test_size={test_size})")
+        return dataset.train_test_split(test_size=test_size, seed=seed)
+    @staticmethod
+    def get_info(dataset: Dataset | DatasetDict) -> dict[str, Any]:
+        """
+        Get information about a dataset.
+        Args:
+            dataset: Dataset or DatasetDict
+        Returns:
+            Dictionary with dataset information
+        """
+        if isinstance(dataset, DatasetDict):
+            return {
+                "type": "DatasetDict",
+                "splits": list(dataset.keys()),
+                "total_examples": DatasetManager.get_size(dataset),
+                "split_info": {
+                    split: {
+                        "num_examples": len(ds),
+                        "columns": ds.column_names,
+                        "features": str(ds.features),
+                    }
+                    for split, ds in dataset.items()
+                },
+            }
+        else:
+            return {
+                "type": "Dataset",
+                "num_examples": len(dataset),
+                "columns": dataset.column_names,
+                "features": str(dataset.features),
+            }
+    @staticmethod
+    def print_info(dataset: Dataset | DatasetDict) -> None:
+        """
+        Print dataset information in a readable format.
+        Args:
+            dataset: Dataset or DatasetDict
+        """
+        info = DatasetManager.get_info(dataset)
+        if info["type"] == "DatasetDict":
+            print(f"\n{'=' * 60}")
+            print("Dataset Dictionary")
+            print(f"{'=' * 60}")
+            print(f"Total examples: {info['total_examples']:,}")
+            print(f"Splits: {', '.join(info['splits'])}")
+            print()
+            for split, split_info in info["split_info"].items():
+                print(f"  {split}:")
+                print(f"    Examples: {split_info['num_examples']:,}")
+                print(f"    Columns: {', '.join(split_info['columns'])}")
+            print(f"{'=' * 60}\n")
+        else:
+            print(f"\n{'=' * 60}")
+            print("Dataset")
+            print(f"{'=' * 60}")
+            print(f"Examples: {info['num_examples']:,}")
+            print(f"Columns: {', '.join(info['columns'])}")
+            print(f"{'=' * 60}\n")

dalla/core/parallel.py ADDED Viewed

@@ -0,0 +1,279 @@
+"""
+Parallel processing utilities for efficient dataset operations.
+This module provides utilities for parallel processing of datasets,
+including batch processing, multiprocessing, and progress tracking.
+"""
+import multiprocessing
+from collections.abc import Callable
+from typing import Any
+from datasets import Dataset
+from tqdm import tqdm
+from dalla.utils.logger import get_logger
+logger = get_logger(__name__)
+class ParallelProcessor:
+    """Utility class for parallel dataset processing."""
+    @staticmethod
+    def get_optimal_num_workers(num_workers: int | None = None) -> int:
+        """
+        Get optimal number of workers for parallel processing.
+        Args:
+            num_workers: Requested number of workers (None for auto)
+        Returns:
+            Optimal number of workers
+        """
+        cpu_count = multiprocessing.cpu_count()
+        if num_workers is None:
+            return max(1, cpu_count - 1)
+        return min(num_workers, cpu_count)
+    @staticmethod
+    def process_dataset_parallel(
+        dataset: Dataset,
+        process_fn: Callable,
+        num_proc: int | None = None,
+        batched: bool = False,
+        batch_size: int = 1000,
+        desc: str | None = None,
+        remove_columns: list[str] | None = None,
+        **map_kwargs,
+    ) -> Dataset:
+        """
+        Process a dataset in parallel using the map function.
+        Args:
+            dataset: Dataset to process
+            process_fn: Function to apply to each example/batch
+            num_proc: Number of processes (None for auto)
+            batched: Whether to process in batches
+            batch_size: Batch size when batched=True
+            desc: Description for progress bar
+            remove_columns: Columns to remove after processing
+            **map_kwargs: Additional arguments for dataset.map()
+        Returns:
+            Processed dataset
+        Example:
+            >>> def process_text(example):
+            ...     example['processed'] = example['text'].lower()
+            ...     return example
+            >>> processed = ParallelProcessor.process_dataset_parallel(
+            ...     dataset, process_text, num_proc=4
+            ... )
+        """
+        num_workers = ParallelProcessor.get_optimal_num_workers(num_proc)
+        logger.info(f"Processing dataset with {num_workers} workers")
+        logger.info(f"Batched: {batched}, Batch size: {batch_size if batched else 'N/A'}")
+        return dataset.map(
+            process_fn,
+            num_proc=num_workers,
+            batched=batched,
+            batch_size=batch_size,
+            desc=desc or "Processing dataset",
+            remove_columns=remove_columns,
+            **map_kwargs,
+        )
+    @staticmethod
+    def process_in_batches(
+        dataset: Dataset,
+        process_fn: Callable[[list[dict[str, Any]]], list[dict[str, Any]]],
+        batch_size: int = 1000,
+        desc: str | None = None,
+    ) -> Dataset:
+        """
+        Process dataset in batches with custom function.
+        Args:
+            dataset: Dataset to process
+            process_fn: Function that takes a list of examples and returns processed list
+            batch_size: Size of batches
+            desc: Description for progress bar
+        Returns:
+            Processed dataset
+        Example:
+            >>> def batch_process(batch):
+            ...     # Process batch of examples
+            ...     return [{'text': ex['text'].upper()} for ex in batch]
+            >>> result = ParallelProcessor.process_in_batches(
+            ...     dataset, batch_process, batch_size=100
+            ... )
+        """
+        logger.info(f"Processing dataset in batches of {batch_size}")
+        processed_examples = []
+        total_batches = (len(dataset) + batch_size - 1) // batch_size
+        with tqdm(total=total_batches, desc=desc or "Processing batches") as pbar:
+            for i in range(0, len(dataset), batch_size):
+                batch = dataset[i : i + batch_size]
+                batch_list = [
+                    {key: batch[key][j] for key in batch}
+                    for j in range(len(batch[next(iter(batch))]))
+                ]
+                processed_batch = process_fn(batch_list)
+                processed_examples.extend(processed_batch)
+                pbar.update(1)
+        return Dataset.from_list(processed_examples)
+    @staticmethod
+    def create_shards(
+        dataset: Dataset,
+        num_shards: int,
+    ) -> list[Dataset]:
+        """
+        Split dataset into shards for parallel processing.
+        Args:
+            dataset: Dataset to shard
+            num_shards: Number of shards to create
+        Returns:
+            List of dataset shards
+        Example:
+            >>> shards = ParallelProcessor.create_shards(dataset, 4)
+            >>> # Process each shard independently
+        """
+        if num_shards <= 0:
+            raise ValueError("num_shards must be positive")
+        total_size = len(dataset)
+        shard_size = (total_size + num_shards - 1) // num_shards
+        shards = []
+        for i in range(num_shards):
+            start_idx = i * shard_size
+            end_idx = min(start_idx + shard_size, total_size)
+            if start_idx < total_size:
+                shard_indices = list(range(start_idx, end_idx))
+                shards.append(dataset.select(shard_indices))
+        logger.info(f"Created {len(shards)} shards from dataset of {total_size} examples")
+        return shards
+    @staticmethod
+    def process_with_multiprocessing(
+        items: list[Any],
+        process_fn: Callable,
+        num_workers: int | None = None,
+        desc: str | None = None,
+    ) -> list[Any]:
+        """
+        Process a list of items using multiprocessing.
+        Args:
+            items: List of items to process
+            process_fn: Function to apply to each item
+            num_workers: Number of worker processes
+            desc: Description for progress bar
+        Returns:
+            List of processed items
+        Example:
+            >>> def process_item(x):
+            ...     return x * 2
+            >>> results = ParallelProcessor.process_with_multiprocessing(
+            ...     [1, 2, 3, 4], process_item, num_workers=2
+            ... )
+        """
+        num_workers = ParallelProcessor.get_optimal_num_workers(num_workers)
+        logger.info(f"Processing {len(items)} items with {num_workers} workers")
+        if num_workers == 1:
+            return [process_fn(item) for item in tqdm(items, desc=desc or "Processing items")]
+        with multiprocessing.Pool(processes=num_workers) as pool:
+            results = list(
+                tqdm(
+                    pool.imap(process_fn, items),
+                    total=len(items),
+                    desc=desc or "Processing items",
+                )
+            )
+        return results
+class ProgressTracker:
+    """Utility for tracking progress across multiple operations."""
+    def __init__(self, total: int, desc: str | None = None):
+        """
+        Initialize progress tracker.
+        Args:
+            total: Total number of items to track
+            desc: Description for progress bar
+        """
+        self.pbar = tqdm(total=total, desc=desc or "Processing")
+        self.current = 0
+    def update(self, n: int = 1):
+        """Update progress by n items."""
+        self.pbar.update(n)
+        self.current += n
+    def set_description(self, desc: str):
+        """Update progress bar description."""
+        self.pbar.set_description(desc)
+    def close(self):
+        """Close the progress bar."""
+        self.pbar.close()
+    def __enter__(self):
+        """Context manager entry."""
+        return self
+    def __exit__(self, *args):
+        """Context manager exit."""
+        self.close()
+def batch_iterator(iterable, batch_size: int):
+    """
+    Yield batches from an iterable.
+    Args:
+        iterable: Any iterable
+        batch_size: Size of each batch
+    Yields:
+        Batches of items
+    Example:
+        >>> for batch in batch_iterator(range(10), batch_size=3):
+        ...     print(batch)
+        [0, 1, 2]
+        [3, 4, 5]
+        [6, 7, 8]
+        [9]
+    """
+    batch = []
+    for item in iterable:
+        batch.append(item)
+        if len(batch) == batch_size:
+            yield batch
+            batch = []
+    if batch:
+        yield batch