TozaText 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
TozaText/__init__.py ADDED
@@ -0,0 +1,22 @@
+ # TozaText/__init__.py
+
+ from .core import Pipeline
+ from .doc_modifier import DocumentModifier
+
+ # Import some common modifiers so they’re accessible directly
+ from .repetion import WordRepetitionFilter, ParagraphRepetitionFilter
+ from .url_remover import UrlRemover
+ from .tranlitrate import TransliteratorModifier
+ from .unicode_reformattor import UzbekUnicodeCleaner
+ from .line_remover import LineRemover
+
+ __all__ = [
+     "Pipeline",
+     "DocumentModifier",
+     "WordRepetitionFilter",
+     "ParagraphRepetitionFilter",
+     "UrlRemover",
+     "TransliteratorModifier",
+     "UzbekUnicodeCleaner",
+     "LineRemover",
+ ]
TozaText/core.py ADDED
@@ -0,0 +1,93 @@
+ import json
+ import os
+ from datetime import datetime
+ from time import perf_counter
+
+ from datasets import Dataset
+
+ from .utils.logging_utils import setup_logging
+
+ class Pipeline:
+     """Runs a sequence of text modifiers on a text string or a Hugging Face Dataset."""
+
+     def __init__(self, modifiers: list, debug=False):
+         self.modifiers = modifiers
+         self.logger = setup_logging("logs/Pipeline.log", debug=debug)
+         self.summary = {"texts_processed": 0, "modifiers": {}}
+
+     def process_text(self, text: str) -> str:
+         """Apply all modifiers to a single text string."""
+         cleaned_text = text
+         for modifier in self.modifiers:
+             start = perf_counter()
+             new_text = modifier.modify_document(cleaned_text)
+             elapsed = perf_counter() - start
+
+             if not isinstance(new_text, str):
+                 # Revert non-string results; empty strings are kept so that
+                 # filters like ParagraphRepetitionFilter can drop a document.
+                 new_text = cleaned_text
+
+             changed = int(new_text != cleaned_text)
+             name = modifier.__class__.__name__
+             stats = self.summary["modifiers"].setdefault(name, {"changed": 0, "time": 0.0})
+             stats["changed"] += changed
+             stats["time"] += elapsed
+
+             cleaned_text = new_text
+
+         self.summary["texts_processed"] += 1
+         return cleaned_text
+
+     def process_hf_dataset(self, dataset, column: str) -> "Dataset":
+         """Clean a Hugging Face Dataset column; results are stored in a `cleaned_text` column."""
+         self.logger.info(f"Cleaning Hugging Face Dataset with {len(self.modifiers)} modifiers...")
+
+         def _apply(example):
+             text = example.get(column, "")
+             cleaned_text = self.process_text(text)
+             return {"cleaned_text": cleaned_text}
+
+         dataset = dataset.map(
+             _apply,
+             desc="Cleaning HF dataset",
+             load_from_cache_file=False,
+         )
+
+         self._log_summary()
+         return dataset
+
+     def process(self, data, column: str | None = None):
+         """Automatically process a string or a Hugging Face Dataset."""
+         if isinstance(data, str):
+             return self.process_text(data)
+         elif isinstance(data, Dataset):
+             if not column:
+                 raise ValueError("Specify `column` for Dataset input.")
+             return self.process_hf_dataset(data, column)
+         else:
+             raise TypeError(f"Unsupported data type: {type(data)}")
+
+     def _log_summary(self):
+         """Log and save a structured summary of the cleaning session."""
+         total = self.summary["texts_processed"]
+         self.logger.info("Cleaning summary:")
+         self.logger.info(f"Texts processed: {total}")
+
+         for name, stats in self.summary["modifiers"].items():
+             pct = (stats["changed"] / total * 100) if total else 0
+             self.logger.info(
+                 f"  • {name}: changed={stats['changed']} ({pct:.1f}%), time={stats['time']:.2f}s"
+             )
+
+         os.makedirs("logs", exist_ok=True)
+         with open("logs/summary.json", "a", encoding="utf-8") as f:
+             json.dump(
+                 {"timestamp": datetime.now().isoformat(), **self.summary},
+                 f,
+                 ensure_ascii=False,
+             )
+             f.write("\n")
+
+         self.logger.info("Logs saved to logs/Pipeline.log\n")
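+
+ # Example (illustrative):
+ #     from TozaText import Pipeline, UrlRemover
+ #     pipe = Pipeline([UrlRemover()])
+ #     pipe.process("Bu sayt: https://example.com")  # -> "Bu sayt: "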
TozaText/doc_modifier.py ADDED
@@ -0,0 +1,29 @@
+ from abc import ABC, abstractmethod
+
+ class DocumentModifier(ABC):
+     """
+     Abstract base class for text-based document modifiers.
+
+     Subclasses must implement `modify_document` to transform input value(s)
+     and return the modified value. This supports both single-input and
+     multi-input usage:
+     - Single input: `modify_document(value)`
+     - Multiple inputs: `modify_document(**values)` where each input field is
+       expanded as a keyword argument (e.g., `modify_document(column_1=..., column_2=...)`).
+     """
+
+     def __init__(self) -> None:
+         super().__init__()
+         self._name = self.__class__.__name__
+         self._sentences = None
+         self._paragraphs = None
+         self._ngrams = None
+
+     @abstractmethod
+     def modify_document(self, *args: object, **kwargs: object) -> object:
+         """Transform the provided value(s) and return the result."""
+         raise NotImplementedError
+
+     @property
+     def name(self) -> str:
+         return self._name
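+
+ # Example (illustrative): a minimal modifier using the single-input form.
+ #     class Stripper(DocumentModifier):
+ #         def modify_document(self, text: str, *args, **kwargs) -> str:
+ #             return text.strip()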
TozaText/line_remover.py ADDED
@@ -0,0 +1,19 @@
+ from .doc_modifier import DocumentModifier
+
+ class LineRemover(DocumentModifier):
+     """
+     Removes every line of a document whose content exactly matches one of the
+     given strings.
+     """
+
+     def __init__(self, patterns: list[str]):
+         """
+         Args:
+             patterns (list[str]): Lines equal to any of these strings are removed.
+         """
+         super().__init__()
+         self._patterns = patterns
+
+     def modify_document(self, text: str, *args, **kwargs) -> str:
+         lines = text.split("\n")
+         new_lines = [line for line in lines if line not in self._patterns]
+         return "\n".join(new_lines)
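+
+ # Example (illustrative):
+ #     LineRemover(["[Musiqa]"]).modify_document("Salom\n[Musiqa]")  # -> "Salom"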
TozaText/repetion.py ADDED
@@ -0,0 +1,84 @@
+ import re
+ import logging
+ from .doc_modifier import DocumentModifier
+
+ logger = logging.getLogger(__name__)
+
+ class WordRepetitionFilter(DocumentModifier):
+     """Remove consecutive repeated words, even if separated by punctuation."""
+
+     def __init__(self, max_repetition=1):
+         super().__init__()
+         self.max_repetition = max_repetition
+         self.change_count = 0
+
+     def modify_document(self, text, *args, **kwargs):
+         if not isinstance(text, str) or not text.strip():
+             return text
+
+         # Split tokens by whitespace (punctuation stays attached)
+         tokens = text.split()
+         new_tokens = []
+
+         prev_norm = None
+         repetition_count = 0
+
+         for tok in tokens:
+             # Normalize for comparison: strip trailing punctuation and lowercase
+             norm = re.sub(r"[^\w]+$", "", tok).lower()
+
+             if prev_norm == norm and norm != "":
+                 repetition_count += 1
+                 if repetition_count <= self.max_repetition:
+                     new_tokens.append(tok)
+                 else:
+                     # Skip the repeated token (collapse the run)
+                     self.change_count += 1
+                     logger.info(
+                         f"[{self.__class__.__name__}] | skipped repeated '{tok}' (normalized as '{norm}')"
+                     )
+             else:
+                 new_tokens.append(tok)
+                 repetition_count = 1
+             prev_norm = norm
+
+         return " ".join(new_tokens)
+
+
+ class ParagraphRepetitionFilter(DocumentModifier):
+     """Drop texts (by returning an empty string) that contain too many repeated paragraphs."""
+
+     def __init__(self, max_paragraph_dup_ratio=0.3, max_char_dup_ratio=0.2):
+         super().__init__()
+         self.max_paragraph_dup_ratio = max_paragraph_dup_ratio
+         self.max_char_dup_ratio = max_char_dup_ratio
+
+     def modify_document(self, text: str, *args, **kwargs) -> str:
+         # Guard against empty/whitespace-only input (avoids division by zero below)
+         if not isinstance(text, str) or not text.strip():
+             return text
+
+         paragraphs = re.compile(r"\n{2,}").split(text.strip())
+         unique_paragraphs = set()
+         duplicate_chars = 0
+         duplicate_elements = 0
+         for p in paragraphs:
+             if p in unique_paragraphs:
+                 duplicate_chars += len(p)
+                 duplicate_elements += 1
+             else:
+                 unique_paragraphs.add(p)
+
+         if duplicate_elements / len(paragraphs) > self.max_paragraph_dup_ratio:
+             return ""
+         if duplicate_chars / len(text) > self.max_char_dup_ratio:
+             return ""
+
+         return text
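+
+ # Example (illustrative):
+ #     WordRepetitionFilter().modify_document("bu. bu. bu. shu shu qila qila")
+ #     # -> "bu. shu qila"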
TozaText/tranlitrate.py ADDED
@@ -0,0 +1,22 @@
+ from .doc_modifier import DocumentModifier
+
+ try:
+     from UzTransliterator import UzTransliterator
+ except ImportError:
+     raise ImportError(
+         "UzTransliterator is not installed. To use transliteration, run:\n"
+         "pip install git+https://github.com/latofatbobojonova/UzTransliterator.git"
+     )
+
+ class TransliteratorModifier(DocumentModifier):
+     """Transliterates Uzbek text between scripts (default: Cyrillic to Latin)."""
+
+     def __init__(self, from_: str = "cyr", to: str = "lat"):
+         super().__init__()
+         self.translit = UzTransliterator.UzTransliterator()
+         self.from_ = from_
+         self.to = to
+
+     def modify_document(self, text: str, *args, **kwargs) -> str:
+         if not isinstance(text, str):
+             return text
+         return self.translit.transliterate(text, from_=self.from_, to=self.to)
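+
+ # Example (illustrative):
+ #     TransliteratorModifier().modify_document("Салом дунё")  # -> "Salom dunyo"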
TozaText/unicode_reformattor.py ADDED
@@ -0,0 +1,46 @@
+ from .doc_modifier import DocumentModifier
+ import re
+ import ftfy
+ from ftfy import TextFixerConfig
+
+ class UzbekUnicodeCleaner(DocumentModifier):
+     """
+     Cleans up Uzbek text and fixes Unicode errors.
+     """
+
+     def __init__(self):
+         super().__init__()
+         self.config = TextFixerConfig(
+             normalization='NFC',
+             remove_control_chars=True,
+             fix_line_breaks=True,
+             explain=False
+         )
+
+         # Normalize dashes, quotation marks, and Uzbek apostrophes
+         self.uz_replacements = [
+             (r"–", "—"),
+             (r"«", "“"),
+             (r"»", "”"),
+             (r' \\"', " “"),
+             (r'\\" ', "” "),
+             (r"g['‘’`´]", "gʻ"),
+             (r"o['‘’`´]", "oʻ"),
+             (r"G['‘’`´]", "Gʻ"),
+             (r"O['‘’`´]", "Oʻ"),
+             (r"(?<![ogOG])[’'`´ʻ]", "ʼ"),
+             (r" {2,}", " "),
+         ]
+
+     def modify_document(self, text, *args, **kwargs):
+         if not isinstance(text, str):
+             return text
+
+         text = ftfy.fix_text(text, config=self.config)
+         for pattern, repl in self.uz_replacements:
+             text = re.sub(pattern, repl, text)
+
+         return text.strip()
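+
+ # Example (illustrative):
+ #     UzbekUnicodeCleaner().modify_document("O'zbek tili")  # -> "Oʻzbek tili"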
TozaText/url_remover.py ADDED
@@ -0,0 +1,17 @@
+ import re
+
+ from .doc_modifier import DocumentModifier
+
+ URL_REGEX = re.compile(r"https?://\S+|www\.\S+", flags=re.IGNORECASE)
+
+
+ class UrlRemover(DocumentModifier):
+     """
+     Removes all URLs in a document.
+     """
+
+     def __init__(self):
+         super().__init__()
+
+     def modify_document(self, text: str, *args, **kwargs) -> str:
+         return URL_REGEX.sub("", text)
TozaText/utils/__init__.py ADDED (empty file)
TozaText/utils/logging_utils.py ADDED
@@ -0,0 +1,31 @@
+ import logging
+ import os
+
+ def setup_logging(log_file="logs/pipeline.log", debug=False):
+     """
+     Set up logging for the entire TozaText project.
+     Always saves to a file; prints to the console only if debug=True.
+     """
+     os.makedirs(os.path.dirname(log_file) or ".", exist_ok=True)
+
+     log_format = "%(asctime)s | %(levelname)s | %(name)s | %(message)s"
+
+     file_handler = logging.FileHandler(log_file, mode="w", encoding="utf-8")
+     file_handler.setFormatter(logging.Formatter(log_format))
+
+     # Configure the root logger (affects all submodules)
+     logging.basicConfig(
+         level=logging.INFO,
+         handlers=[file_handler],
+         format=log_format,
+         force=True,  # clears any existing handlers
+     )
+
+     # Add optional console output
+     if debug:
+         console_handler = logging.StreamHandler()
+         console_handler.setFormatter(logging.Formatter(log_format))
+         logging.getLogger().addHandler(console_handler)
+
+     logger = logging.getLogger(__name__)  # current module name, not fixed
+     return logger
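+
+ # Example (illustrative):
+ #     logger = setup_logging("logs/Pipeline.log", debug=True)
+ #     logger.info("hello")  # written to the log file and echoed to the console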
tozatext-0.1.0.dist-info/METADATA ADDED
@@ -0,0 +1,70 @@
+ Metadata-Version: 2.4
+ Name: TozaText
+ Version: 0.1.0
+ Summary: TozaText is a cleaning library for preprocessing raw Uzbek and multilingual text data.
+ Author-email: Shohrux Isakov <isakovsh19@gmail.com>
+ License: MIT
+ Requires-Python: >=3.10
+ Requires-Dist: datasets>=4.4.1
+ Requires-Dist: ftfy>=6.3.1
+ Requires-Dist: loguru>=0.7.0
+ Requires-Dist: pandas>=2.0.0
+ Requires-Dist: tqdm>=4.65.0
+ Description-Content-Type: text/markdown
+
+ # 🧹 TozaText
+
+ **TozaText** is a lightweight and extensible text-preprocessing pipeline built for cleaning noisy, transcribed, or user-generated text data.
+ It’s designed around a **modifier-based architecture**: each cleaning rule is a `DocumentModifier` that can be combined into a customizable `Pipeline`.
+
+ ---
+
+ ## Features
+
+ - **Modular design** – add or remove modifiers easily (e.g., repetition removal, transliteration)
+ - **Smart repetition cleaner** – removes consecutive repeated words, even with punctuation or ellipses
+ ---
+ ## Available Modifiers
+
+ TozaText currently includes the following modifiers out of the box:
+
+ | Modifier | Description | Example Input | Example Output |
+ |-----------|--------------|----------------|----------------|
+ | **`WordRepetitionFilter`** | Removes consecutive repeated words, even when separated by punctuation or ellipses. | `bu. bu. bu. shu shu qila qila` | `bu. shu qila` |
+ | **`ParagraphRepetitionFilter`** | Drops entire texts if too many repeated paragraphs or characters are detected (useful for STT data with repeated intros). | `"Salom!\n\nSalom!\n\nSalom!"` | `""` |
+ | **`TransliteratorModifier`** | Converts Uzbek text between **Cyrillic** and **Latin** alphabets using `UzTransliterator`. | `"Салом дунё"` | `"Salom dunyo"` |
+ | **`UrlRemover`** | Removes URLs and links from text. | `"Bu sayt: https://example.com"` | `"Bu sayt: "` |
+ | **`UzbekUnicodeCleaner`** | Fixes Unicode errors and normalizes Uzbek apostrophes and quotation marks. | `"O'zbek tili"` | `"Oʻzbek tili"` |
+ | **`LineRemover`** | Removes lines that exactly match any of the given strings. | `"Salom\n[Musiqa]"` (pattern `"[Musiqa]"`) | `"Salom"` |
+
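+ For example, the Unicode cleanup and line filtering can be combined into one pipeline (illustrative; the `[Musiqa]` pattern is just an example):
+
+ ```python
+ from TozaText import Pipeline, UzbekUnicodeCleaner, LineRemover
+
+ pipe = Pipeline([
+     UzbekUnicodeCleaner(),
+     LineRemover(patterns=["[Musiqa]"]),  # drops lines that exactly match this string
+ ])
+ print(pipe.process("O'zbek tili\n[Musiqa]"))  # -> "Oʻzbek tili"
+ ```
+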
+ All modifiers inherit from:
+ ```python
+ class DocumentModifier:
+     def modify_document(self, text: str, *args, **kwargs) -> str:
+         ...
+ ```
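+
+ A custom modifier is just a subclass (illustrative sketch; `Lowercaser` is not part of the package):
+
+ ```python
+ from TozaText import DocumentModifier, Pipeline
+
+ class Lowercaser(DocumentModifier):
+     def modify_document(self, text: str, *args, **kwargs) -> str:
+         return text.lower()
+
+ pipe = Pipeline([Lowercaser()])
+ print(pipe.process("SALOM"))  # -> "salom"
+ ```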
+
+ ## Installation
+
+ ```bash
+ git clone https://gitlab.adliya.uz/shohrux1sakov/tozatext.git
+ cd TozaText
+ pip install -e .
+ ```
+
+ ## Code Example
+ ```python
+ from datasets import load_dataset
+ from TozaText import Pipeline, WordRepetitionFilter, ParagraphRepetitionFilter
+
+ data = load_dataset("aktrmai/youtube_transcribe_data", split="train")
+
+ pipeline = Pipeline([
+     WordRepetitionFilter(),
+     ParagraphRepetitionFilter(),
+ ])
+
+ cleaned = pipeline.process_hf_dataset(data, column="text")
+ ```
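+
+ The same pipeline also accepts a plain string (illustrative):
+
+ ```python
+ pipeline.process("bu. bu. bu. shu shu qila qila")  # -> "bu. shu qila"
+ ```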
+
tozatext-0.1.0.dist-info/RECORD ADDED
@@ -0,0 +1,13 @@
+ TozaText/__init__.py,sha256=xzQ1vcj30oSEzB2pljrErJt5kv1hpxZhJmvpZE0yp04,613
+ TozaText/core.py,sha256=ONRWF0n1tQ09U8NlurHn9FixrL9HvS9jgY4IEWLMpLk,3386
+ TozaText/doc_modifier.py,sha256=oGIvFZOYQgupXHYfFbNX3sYJokxeuHL8nGLjbac-OIg,987
+ TozaText/line_remover.py,sha256=6D2Ge18BG4mAIUkNGjWawFIGwy4nM1AhXsp-JT4EUJc,592
+ TozaText/repetion.py,sha256=KYBdC3CDerFM47Zx-yfyKG_vII97XpPI0liy3s8S9hk,2618
+ TozaText/tranlitrate.py,sha256=Sln63dtJUYZo88Gr-V0ZpHN6BCvjSJ49cZjMdMDExU0,762
+ TozaText/unicode_reformattor.py,sha256=t3-sXmyvgYPRN4WQ5Ccn4PFuwgWyFEjdiUA92_TncDk,1239
+ TozaText/url_remover.py,sha256=zXdMTs65OD9uVG0qD-SFHQ2q0qYWEBo-jxyMCLxv-vo,372
+ TozaText/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ TozaText/utils/logging_utils.py,sha256=vCmjPBYkkstnANrtLHYSyV4DEmzQEK1NLeVJZlT5BfQ,1044
+ tozatext-0.1.0.dist-info/METADATA,sha256=vvQU_SpXgZDfGyaYYLFKS8-vbVKG9oCV7zha6_SxT-0,2572
+ tozatext-0.1.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ tozatext-0.1.0.dist-info/RECORD,,
tozatext-0.1.0.dist-info/WHEEL ADDED
@@ -0,0 +1,4 @@
+ Wheel-Version: 1.0
+ Generator: hatchling 1.27.0
+ Root-Is-Purelib: true
+ Tag: py3-none-any