TozaText 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
TozaText/__init__.py ADDED
@@ -0,0 +1,22 @@
+ # TozaText/__init__.py
+
+ from .core import Pipeline
+ from .doc_modifier import DocumentModifier
+
+ # Import some common modifiers so they’re accessible directly
+ from .repetion import WordRepetitionFilter, ParagraphRepetitionFilter
+ from .url_remover import UrlRemover
+ from .tranlitrate import TransliteratorModifier
+ from .unicode_reformattor import UzbekUnicodeCleaner
+ from .line_remover import LineRemover
+
+ __all__ = [
+     "Pipeline",
+     "DocumentModifier",
+     "WordRepetitionFilter",
+     "ParagraphRepetitionFilter",
+     "UrlRemover",
+     "TransliteratorModifier",
+     "UzbekUnicodeCleaner",
+     "LineRemover",
+ ]
TozaText/core.py ADDED
@@ -0,0 +1,93 @@
+ import json
+ import os
+ from datetime import datetime
+ from time import perf_counter
+
+ from datasets import Dataset
+
+ from .utils.logging_utils import setup_logging
+
+ class Pipeline:
+     """Runs a sequence of text modifiers on a text string or a Hugging Face Dataset."""
+
+     def __init__(self, modifiers: list, debug=False):
+         self.modifiers = modifiers
+         self.logger = setup_logging("logs/Pipeline.log", debug=debug)
+         self.summary = {"texts_processed": 0, "modifiers": {}}
+
+     def process_text(self, text: str) -> str:
+         """Apply all modifiers to a single text string."""
+         cleaned_text = text
+         for modifier in self.modifiers:
+             start = perf_counter()
+             new_text = modifier.modify_document(cleaned_text)
+             elapsed = perf_counter() - start
+
+             if not isinstance(new_text, str):
+                 # Revert non-string results; empty strings are kept so that
+                 # filters like ParagraphRepetitionFilter can drop a document.
+                 new_text = cleaned_text
+
+             changed = int(new_text != cleaned_text)
+             name = modifier.__class__.__name__
+             stats = self.summary["modifiers"].setdefault(name, {"changed": 0, "time": 0.0})
+             stats["changed"] += changed
+             stats["time"] += elapsed
+
+             cleaned_text = new_text
+
+         self.summary["texts_processed"] += 1
+         return cleaned_text
+
+     def process_hf_dataset(self, dataset, column: str) -> "Dataset":
+         """Clean a Hugging Face Dataset column; results are stored in a `cleaned_text` column."""
+         self.logger.info(f"Cleaning Hugging Face Dataset with {len(self.modifiers)} modifiers...")
+
+         def _apply(example):
+             text = example.get(column, "")
+             cleaned_text = self.process_text(text)
+             return {"cleaned_text": cleaned_text}
+
+         dataset = dataset.map(
+             _apply,
+             desc="Cleaning HF dataset",
+             load_from_cache_file=False,
+         )
+
+         self._log_summary()
+         return dataset
+
+     def process(self, data, column: str | None = None):
+         """Automatically process a string or a Hugging Face Dataset."""
+         if isinstance(data, str):
+             return self.process_text(data)
+         elif isinstance(data, Dataset):
+             if not column:
+                 raise ValueError("Specify `column` for Dataset input.")
+             return self.process_hf_dataset(data, column)
+         else:
+             raise TypeError(f"Unsupported data type: {type(data)}")
+
+     def _log_summary(self):
+         """Log and save a structured summary of the cleaning session."""
+         total = self.summary["texts_processed"]
+         self.logger.info("Cleaning summary:")
+         self.logger.info(f"Texts processed: {total}")
+
+         for name, stats in self.summary["modifiers"].items():
+             pct = (stats["changed"] / total * 100) if total else 0
+             self.logger.info(
+                 f"  • {name}: changed={stats['changed']} ({pct:.1f}%), time={stats['time']:.2f}s"
+             )
+
+         os.makedirs("logs", exist_ok=True)
+         with open("logs/summary.json", "a", encoding="utf-8") as f:
+             json.dump(
+                 {"timestamp": datetime.now().isoformat(), **self.summary},
+                 f,
+                 ensure_ascii=False,
+             )
+             f.write("\n")
+
+         self.logger.info("Logs saved to logs/Pipeline.log\n")
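+
+ # Example (illustrative):
+ #     from TozaText import Pipeline, UrlRemover
+ #     pipe = Pipeline([UrlRemover()])
+ #     pipe.process("Bu sayt: https://example.com")  # -> "Bu sayt: "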
TozaText/doc_modifier.py ADDED
@@ -0,0 +1,29 @@
+ from abc import ABC, abstractmethod
+
+ class DocumentModifier(ABC):
+     """
+     Abstract base class for text-based document modifiers.
+
+     Subclasses must implement `modify_document` to transform input value(s)
+     and return the modified value. This supports both single-input and
+     multi-input usage:
+     - Single input: `modify_document(value)`
+     - Multiple inputs: `modify_document(**values)` where each input field is
+       expanded as a keyword argument (e.g., `modify_document(column_1=..., column_2=...)`).
+     """
+
+     def __init__(self) -> None:
+         super().__init__()
+         self._name = self.__class__.__name__
+         self._sentences = None
+         self._paragraphs = None
+         self._ngrams = None
+
+     @abstractmethod
+     def modify_document(self, *args: object, **kwargs: object) -> object:
+         """Transform the provided value(s) and return the result."""
+         raise NotImplementedError
+
+     @property
+     def name(self) -> str:
+         return self._name
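+
+ # Example (illustrative): a minimal modifier using the single-input form.
+ #     class Stripper(DocumentModifier):
+ #         def modify_document(self, text: str, *args, **kwargs) -> str:
+ #             return text.strip()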
TozaText/line_remover.py ADDED
@@ -0,0 +1,19 @@
+ from .doc_modifier import DocumentModifier
+
+ class LineRemover(DocumentModifier):
+     """
+     Removes every line of a document whose content exactly matches one of the
+     given strings.
+     """
+
+     def __init__(self, patterns: list[str]):
+         """
+         Args:
+             patterns (list[str]): Lines equal to any of these strings are removed.
+         """
+         super().__init__()
+         self._patterns = patterns
+
+     def modify_document(self, text: str, *args, **kwargs) -> str:
+         lines = text.split("\n")
+         new_lines = [line for line in lines if line not in self._patterns]
+         return "\n".join(new_lines)
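+
+ # Example (illustrative):
+ #     LineRemover(["[Musiqa]"]).modify_document("Salom\n[Musiqa]")  # -> "Salom"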
TozaText/repetion.py ADDED
@@ -0,0 +1,84 @@
+ import re
+ import logging
+ from .doc_modifier import DocumentModifier
+
+ logger = logging.getLogger(__name__)
+
+ class WordRepetitionFilter(DocumentModifier):
+     """Remove consecutive repeated words, even if separated by punctuation."""
+
+     def __init__(self, max_repetition=1):
+         super().__init__()
+         self.max_repetition = max_repetition
+         self.change_count = 0
+
+     def modify_document(self, text, *args, **kwargs):
+         if not isinstance(text, str) or not text.strip():
+             return text
+
+         # Split tokens by whitespace (punctuation stays attached)
+         tokens = text.split()
+         new_tokens = []
+
+         prev_norm = None
+         repetition_count = 0
+
+         for tok in tokens:
+             # Normalize for comparison: strip trailing punctuation and lowercase
+             norm = re.sub(r"[^\w]+$", "", tok).lower()
+
+             if prev_norm == norm and norm != "":
+                 repetition_count += 1
+                 if repetition_count <= self.max_repetition:
+                     new_tokens.append(tok)
+                 else:
+                     # Skip the repeated token (collapse the run)
+                     self.change_count += 1
+                     logger.info(
+                         f"[{self.__class__.__name__}] | skipped repeated '{tok}' (normalized as '{norm}')"
+                     )
+             else:
+                 new_tokens.append(tok)
+                 repetition_count = 1
+             prev_norm = norm
+
+         return " ".join(new_tokens)
+
+
+ class ParagraphRepetitionFilter(DocumentModifier):
+     """Drop texts (by returning an empty string) that contain too many repeated paragraphs."""
+
+     def __init__(self, max_paragraph_dup_ratio=0.3, max_char_dup_ratio=0.2):
+         super().__init__()
+         self.max_paragraph_dup_ratio = max_paragraph_dup_ratio
+         self.max_char_dup_ratio = max_char_dup_ratio
+
+     def modify_document(self, text: str, *args, **kwargs) -> str:
+         # Guard against empty/whitespace-only input (avoids division by zero below)
+         if not isinstance(text, str) or not text.strip():
+             return text
+
+         paragraphs = re.compile(r"\n{2,}").split(text.strip())
+         unique_paragraphs = set()
+         duplicate_chars = 0
+         duplicate_elements = 0
+         for p in paragraphs:
+             if p in unique_paragraphs:
+                 duplicate_chars += len(p)
+                 duplicate_elements += 1
+             else:
+                 unique_paragraphs.add(p)
+
+         if duplicate_elements / len(paragraphs) > self.max_paragraph_dup_ratio:
+             return ""
+         if duplicate_chars / len(text) > self.max_char_dup_ratio:
+             return ""
+
+         return text
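+
+ # Example (illustrative):
+ #     WordRepetitionFilter().modify_document("bu. bu. bu. shu shu qila qila")
+ #     # -> "bu. shu qila"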
TozaText/tranlitrate.py ADDED
@@ -0,0 +1,22 @@
+ from .doc_modifier import DocumentModifier
+
+ try:
+     from UzTransliterator import UzTransliterator
+ except ImportError:
+     raise ImportError(
+         "UzTransliterator is not installed. To use transliteration, run:\n"
+         "pip install git+https://github.com/latofatbobojonova/UzTransliterator.git"
+     )
+
+ class TransliteratorModifier(DocumentModifier):
+     """Transliterates Uzbek text between scripts (default: Cyrillic to Latin)."""
+
+     def __init__(self, from_: str = "cyr", to: str = "lat"):
+         super().__init__()
+         self.translit = UzTransliterator.UzTransliterator()
+         self.from_ = from_
+         self.to = to
+
+     def modify_document(self, text: str, *args, **kwargs) -> str:
+         if not isinstance(text, str):
+             return text
+         return self.translit.transliterate(text, from_=self.from_, to=self.to)
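+
+ # Example (illustrative):
+ #     TransliteratorModifier().modify_document("Салом дунё")  # -> "Salom dunyo"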
TozaText/unicode_reformattor.py ADDED
@@ -0,0 +1,46 @@
+ from .doc_modifier import DocumentModifier
+ import re
+ import ftfy
+ from ftfy import TextFixerConfig
+
+ class UzbekUnicodeCleaner(DocumentModifier):
+     """
+     Cleans up Uzbek text and fixes Unicode errors.
+     """
+
+     def __init__(self):
+         super().__init__()
+         self.config = TextFixerConfig(
+             normalization='NFC',
+             remove_control_chars=True,
+             fix_line_breaks=True,
+             explain=False
+         )
+
+         # Normalize dashes, quotation marks, and Uzbek apostrophes
+         self.uz_replacements = [
+             (r"–", "—"),
+             (r"«", "“"),
+             (r"»", "”"),
+             (r' \\"', " “"),
+             (r'\\" ', "” "),
+             (r"g['‘’`´]", "gʻ"),
+             (r"o['‘’`´]", "oʻ"),
+             (r"G['‘’`´]", "Gʻ"),
+             (r"O['‘’`´]", "Oʻ"),
+             (r"(?<![ogOG])[’'`´ʻ]", "ʼ"),
+             (r" {2,}", " "),
+         ]
+
+     def modify_document(self, text, *args, **kwargs):
+         if not isinstance(text, str):
+             return text
+
+         text = ftfy.fix_text(text, config=self.config)
+         for pattern, repl in self.uz_replacements:
+             text = re.sub(pattern, repl, text)
+
+         return text.strip()
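+
+ # Example (illustrative):
+ #     UzbekUnicodeCleaner().modify_document("O'zbek tili")  # -> "Oʻzbek tili"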
TozaText/url_remover.py ADDED
@@ -0,0 +1,17 @@
+ import re
+
+ from .doc_modifier import DocumentModifier
+
+ URL_REGEX = re.compile(r"https?://\S+|www\.\S+", flags=re.IGNORECASE)
+
+
+ class UrlRemover(DocumentModifier):
+     """
+     Removes all URLs in a document.
+     """
+
+     def __init__(self):
+         super().__init__()
+
+     def modify_document(self, text: str, *args, **kwargs) -> str:
+         return URL_REGEX.sub("", text)
TozaText/utils/__init__.py ADDED (empty file)
TozaText/utils/logging_utils.py ADDED
@@ -0,0 +1,31 @@
+ import logging
+ import os
+
+ def setup_logging(log_file="logs/pipeline.log", debug=False):
+     """
+     Set up logging for the entire TozaText project.
+     Always saves to a file; prints to the console only if debug=True.
+     """
+     os.makedirs(os.path.dirname(log_file) or ".", exist_ok=True)
+
+     log_format = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
+
+     file_handler = logging.FileHandler(log_file, mode="w", encoding="utf-8")
+     file_handler.setFormatter(logging.Formatter(log_format))
+
+     # Configure the root logger (affects all submodules)
+     logging.basicConfig(
+         level=logging.INFO,
+         handlers=[file_handler],
+         format=log_format,
+         force=True,  # clears any existing handlers
+     )
+
+     # Add optional console output
+     if debug:
+         console_handler = logging.StreamHandler()
+         console_handler.setFormatter(logging.Formatter(log_format))
+         logging.getLogger().addHandler(console_handler)
+
+     logger = logging.getLogger(__name__)  # current module name, not fixed
+     return logger
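+
+ # Example (illustrative):
+ #     logger = setup_logging("logs/Pipeline.log", debug=True)
+ #     logger.info("hello")  # written to the log file and echoed to the console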
tozatext-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,70 @@
+ Metadata-Version: 2.4
+ Name: TozaText
+ Version: 0.1.0
+ Summary: TozaText is a cleaning library for preprocessing raw Uzbek and multilingual text data.
+ Author-email: Shohrux Isakov <isakovsh19@gmail.com>
+ License: MIT
+ Requires-Python: >=3.10
+ Requires-Dist: datasets>=4.4.1
+ Requires-Dist: ftfy>=6.3.1
+ Requires-Dist: loguru>=0.7.0
+ Requires-Dist: pandas>=2.0.0
+ Requires-Dist: tqdm>=4.65.0
+ Description-Content-Type: text/markdown
+
+ # 🧹 TozaText
+
+ **TozaText** is a lightweight and extensible text-preprocessing pipeline built for cleaning noisy, transcribed, or user-generated text data.
+ It’s designed around a **modifier-based architecture**: each cleaning rule is a `DocumentModifier` that can be combined into a customizable `Pipeline`.
+
+ ---
+
+ ## Features
+
+ - **Modular design** – add or remove modifiers easily (e.g., repetition removal, transliteration)
+ - **Smart repetition cleaner** – removes consecutive repeated words, even with punctuation or ellipses
+ ---
+ ## Available Modifiers
+
+ TozaText currently includes the following modifiers out of the box:
+
+ | Modifier | Description | Example Input | Example Output |
+ |-----------|--------------|----------------|----------------|
+ | **`WordRepetitionFilter`** | Removes consecutive repeated words, even when separated by punctuation or ellipses. | `bu. bu. bu. shu shu qila qila` | `bu. shu qila` |
+ | **`ParagraphRepetitionFilter`** | Drops entire texts if too many repeated paragraphs or characters are detected (useful for STT data with repeated intros). | `"Salom!\n\nSalom!\n\nSalom!"` | `""` |
+ | **`TransliteratorModifier`** | Converts Uzbek text between **Cyrillic** and **Latin** alphabets using `UzTransliterator`. | `"Салом дунё"` | `"Salom dunyo"` |
+ | **`UrlRemover`** | Removes URLs and links from text. | `"Bu sayt: https://example.com"` | `"Bu sayt: "` |
+ | **`UzbekUnicodeCleaner`** | Fixes Unicode errors and normalizes Uzbek apostrophes and quotation marks. | `"O'zbek tili"` | `"Oʻzbek tili"` |
+ | **`LineRemover`** | Removes lines that exactly match any of the given strings. | `"Salom\n[Musiqa]"` (pattern `"[Musiqa]"`) | `"Salom"` |
+
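+ For example, the Unicode cleanup and line filtering can be combined into one pipeline (illustrative; the `[Musiqa]` pattern is just an example):
+
+ ```python
+ from TozaText import Pipeline, UzbekUnicodeCleaner, LineRemover
+
+ pipe = Pipeline([
+     UzbekUnicodeCleaner(),
+     LineRemover(patterns=["[Musiqa]"]),  # drops lines that exactly match this string
+ ])
+ print(pipe.process("O'zbek tili\n[Musiqa]"))  # -> "Oʻzbek tili"
+ ```
+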
+ All modifiers inherit from:
+ ```python
+ class DocumentModifier:
+     def modify_document(self, text: str, *args, **kwargs) -> str:
+         ...
+ ```
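+
+ A custom modifier is just a subclass (illustrative sketch; `Lowercaser` is not part of the package):
+
+ ```python
+ from TozaText import DocumentModifier, Pipeline
+
+ class Lowercaser(DocumentModifier):
+     def modify_document(self, text: str, *args, **kwargs) -> str:
+         return text.lower()
+
+ pipe = Pipeline([Lowercaser()])
+ print(pipe.process("SALOM"))  # -> "salom"
+ ```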
+
+ ## Installation
+
+ ```bash
+ git clone https://gitlab.adliya.uz/shohrux1sakov/tozatext.git
+ cd TozaText
+ pip install -e .
+ ```
+
+ ## Code Example
+ ```python
+ from datasets import load_dataset
+ from TozaText import Pipeline, WordRepetitionFilter, ParagraphRepetitionFilter
+
+ data = load_dataset("aktrmai/youtube_transcribe_data", split="train")
+
+ pipeline = Pipeline([
+     WordRepetitionFilter(),
+     ParagraphRepetitionFilter(),
+ ])
+
+ cleaned = pipeline.process_hf_dataset(data, column="text")
+ ```
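+
+ The same pipeline also accepts a plain string (illustrative):
+
+ ```python
+ pipeline.process("bu. bu. bu. shu shu qila qila")  # -> "bu. shu qila"
+ ```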
+
tozatext-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+ TozaText/__init__.py,sha256=xzQ1vcj30oSEzB2pljrErJt5kv1hpxZhJmvpZE0yp04,613
+ TozaText/core.py,sha256=ONRWF0n1tQ09U8NlurHn9FixrL9HvS9jgY4IEWLMpLk,3386
+ TozaText/doc_modifier.py,sha256=oGIvFZOYQgupXHYfFbNX3sYJokxeuHL8nGLjbac-OIg,987
+ TozaText/line_remover.py,sha256=6D2Ge18BG4mAIUkNGjWawFIGwy4nM1AhXsp-JT4EUJc,592
+ TozaText/repetion.py,sha256=KYBdC3CDerFM47Zx-yfyKG_vII97XpPI0liy3s8S9hk,2618
+ TozaText/tranlitrate.py,sha256=Sln63dtJUYZo88Gr-V0ZpHN6BCvjSJ49cZjMdMDExU0,762
+ TozaText/unicode_reformattor.py,sha256=t3-sXmyvgYPRN4WQ5Ccn4PFuwgWyFEjdiUA92_TncDk,1239
+ TozaText/url_remover.py,sha256=zXdMTs65OD9uVG0qD-SFHQ2q0qYWEBo-jxyMCLxv-vo,372
+ TozaText/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ TozaText/utils/logging_utils.py,sha256=vCmjPBYkkstnANrtLHYSyV4DEmzQEK1NLeVJZlT5BfQ,1044
+ tozatext-0.1.0.dist-info/METADATA,sha256=vvQU_SpXgZDfGyaYYLFKS8-vbVKG9oCV7zha6_SxT-0,2572
+ tozatext-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ tozatext-0.1.0.dist-info/RECORD,,
tozatext-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any