basa 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
basa/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ """
2
+ BASA - Modern NLP for Indonesian and Regional Languages
3
+ =======================================================
4
+
5
+ Top-level package. Re-exports the public API.
6
+
7
+ Quick start:
8
+ >>> from basa import normalize, quick
9
+ >>> normalize("GW GKKKK NGERTIII BNGTTTT!!!!!")
10
+ 'saya tidak mengerti banget!'
11
+ >>> quick("gw gk ngerti bngt sihhhh!!!")
12
+ 'saya tidak mengerti banget sih!'
13
+ """
14
+
15
+ from .core.normalize import normalize
16
+ from .core.quick import quick
17
+ from .core.typo import typo
18
+ from .core.slang import slang as slang_engine
19
+
20
# Public API of the package: the names re-exported by the imports above.
# ``slang_engine`` is the alias given to ``.core.slang.slang`` on import.
__all__ = ["normalize", "quick", "typo", "slang_engine"]
File without changes
basa/augment/noise.py ADDED
File without changes
File without changes
File without changes
basa/core/__init__.py ADDED
File without changes
basa/core/normalize.py ADDED
@@ -0,0 +1,214 @@
1
+ """
2
+ BASA - Top-level Normalization API
3
+ ====================================
4
+ Provides the simple 1-line normalization interface for the library.
5
+
6
+ Pipeline (in order):
7
+ 1. Lowercase – optional, default True
8
+ 2. Slang normalization – replaces slang + reduces repeated chars
9
+ 3. Typo correction – Levenshtein-based, strictly opt-in
10
+ 4. Punctuation reduction – collapses repeated punctuation (!!!! → !)
11
+ 5. Whitespace cleanup – trims the ends and collapses runs of any whitespace (spaces, tabs, newlines) to one space
12
+
13
+ Design philosophy:
14
+ - Conservative by default: only low-risk transforms (lowercasing, slang lookup, punctuation and whitespace collapsing) are enabled. Note that these are not reversible.
15
+ - Destructive features (typo correction) are strictly opt-in.
16
+ - ``_normalize_single`` handles one string; ``normalize`` handles both
17
+ str and List[str] inputs so adding new parameters never breaks the
18
+ batch path.
19
+
20
+ Usage:
21
+ >>> from basa import normalize
22
+ >>> normalize("GW GKKKK NGERTIII BNGTTTT!!!!!")
23
+ 'saya tidak mengerti banget!'
24
+
25
+ >>> normalize("Jokowi pergi ke Jakarta", lowercase=False)
26
+ 'Jokowi pergi ke Jakarta'
27
+
28
+ >>> normalize(["gw mkan", "dia mnum"], apply_slang=True)
29
+ ['saya makan', 'dia minum']
30
+ """
31
+
32
+ from __future__ import annotations
33
+
34
+ import re
35
+ from typing import List, Union
36
+
37
+ from .slang import slang as slang_engine
38
+ from .typo import typo as typo_engine
39
+
40
+
41
+ # ─────────────────────────────────────────────────────────────────────────────
42
+ # INTERNAL HELPERS
43
+ # ─────────────────────────────────────────────────────────────────────────────
44
+
45
+ def _reduce_punctuation(text: str) -> str:
46
+ """
47
+ Collapse runs of repeated punctuation characters to a single character.
48
+
49
+ Only targets common sentence-ending / emphasis punctuation so that
50
+ legitimate patterns (e.g. ellipsis "...") are also collapsed cleanly.
51
+ This is intentionally narrow to avoid touching hyphens, slashes, etc.
52
+
53
+ Affected characters: . , ! ? ~ * - _
54
+
55
+ Examples:
56
+ >>> _reduce_punctuation("bagus banget!!!!!")
57
+ 'bagus banget!'
58
+ >>> _reduce_punctuation("hmmm.....serius???")
59
+ 'hmmm.serius?'
60
+ >>> _reduce_punctuation("seru~~~~ banget~~")
61
+ 'seru~ banget~'
62
+ """
63
+ # Match 2+ consecutive occurrences of the same punctuation character
64
+ return re.sub(r'([.!?,~*\-_])\1+', r'\1', text)
65
+
66
+
67
+ def _normalize_single(
68
+ text: str,
69
+ apply_slang: bool,
70
+ apply_typo: bool,
71
+ lowercase: bool,
72
+ normalize_punctuation: bool,
73
+ normalize_whitespace: bool,
74
+ ) -> str:
75
+ """
76
+ Apply the full normalization pipeline to a single string.
77
+
78
+ This is the internal workhorse. ``normalize()`` delegates here for
79
+ every input string so that adding new parameters only requires
80
+ updating the signature in one place.
81
+
82
+ Args:
83
+ text: Input string (assumed to be a non-empty str).
84
+ apply_slang: If True, run slang normalization.
85
+ apply_typo: If True, run typo correction (opt-in).
86
+ lowercase: If True, convert to lowercase first.
87
+ normalize_punctuation: If True, collapse repeated punctuation.
88
+ normalize_whitespace: If True, trim and collapse multiple spaces.
89
+
90
+ Returns:
91
+ Normalized string.
92
+ """
93
+ # ── Stage 1: Lowercase ───────────────────────────────────────────────────
94
+ # Must come first so slang matching is case-insensitive.
95
+ if lowercase:
96
+ text = text.lower()
97
+
98
+ # ── Stage 2: Slang Normalization ─────────────────────────────────────────
99
+ # Handles repeated-char reduction AND slang dict lookup.
100
+ # e.g. "GKKKK" → (lower) "gkkkk" → (char reduce) "gk" → "tidak"
101
+ # We defer whitespace normalization to Stage 5.
102
+ if apply_slang:
103
+ text = slang_engine.normalize(text, normalize_whitespace=False)
104
+
105
+ # ── Stage 3: Typo Correction (Strictly Opt-in) ───────────────────────────
106
+ # Safeguard: skip if vocab is empty to prevent no-op full-corpus scans.
107
+ if apply_typo and typo_engine.vocab:
108
+ text = typo_engine.correct_text(text)
109
+
110
+ # ── Stage 4: Punctuation Reduction ───────────────────────────────────────
111
+ # e.g. "!!!!!" → "!", "?????" → "?", "....." → "."
112
+ if normalize_punctuation:
113
+ text = _reduce_punctuation(text)
114
+
115
+ # ── Stage 5: Whitespace Cleanup ──────────────────────────────────────────
116
+ if normalize_whitespace:
117
+ text = re.sub(r'\s+', ' ', text).strip()
118
+
119
+ return text
120
+
121
+
122
+ # ─────────────────────────────────────────────────────────────────────────────
123
+ # PUBLIC API
124
+ # ─────────────────────────────────────────────────────────────────────────────
125
+
126
def normalize(
    text: Union[str, List[str]],
    apply_slang: bool = True,
    apply_typo: bool = False,
    lowercase: bool = True,
    normalize_punctuation: bool = True,
    normalize_whitespace: bool = True,
) -> Union[str, List[str]]:
    """
    Normalize informal Indonesian text with a single line of code.

    Applies a configurable pipeline of normalization stages to the input.
    Accepts a single string or a list of strings.

    Design Notes:
        - ``lowercase=True`` by default so slang matching works reliably.
          Set ``lowercase=False`` when case matters (e.g. for NER tasks).
        - ``apply_typo=False`` by default. Typo correction is opt-in because
          it requires a populated vocabulary and can corrupt domain-specific
          terms (e.g. "xgboost", "lightgbm", proper nouns).
        - Batch output is positionally aligned with the input: the returned
          list always has the same length as the input list. Elements that
          are empty strings or not strings are passed through unchanged
          (exactly like the single-value path) rather than being dropped,
          so results can be zipped back against labels, ids, etc.

    Args:
        text:                  Input string or list of strings to normalize.
        apply_slang:           If True (default), apply slang dictionary
                               lookup and repeated-character reduction.
        apply_typo:            If True, apply Levenshtein-based typo
                               correction. Default False. Silently skipped
                               if the typo engine's vocabulary is empty.
        lowercase:             If True (default), convert text to lowercase
                               before processing. Set False to preserve
                               casing (e.g. for NER pipelines).
        normalize_punctuation: If True (default), collapse repeated
                               punctuation marks (e.g. "!!!" -> "!").
        normalize_whitespace:  If True (default), trim leading/trailing
                               whitespace and collapse internal runs of
                               whitespace (spaces, tabs, newlines) to a
                               single space.

    Returns:
        Normalized string if input is a non-empty str; a list of the same
        length as the input if input is a list; any other value (empty
        string, None, non-str, non-list) is returned unchanged.

    Examples:
        >>> from basa import normalize

        >>> normalize("GW GKKKK NGERTIII BNGTTTT!!!!!")
        'saya tidak mengerti banget!'

        >>> normalize("gw gk ngerti bngt sihhhh!!!")
        'saya tidak mengerti banget sih!'

        >>> normalize("Jokowi pergi ke Jakarta", lowercase=False)
        'Jokowi pergi ke Jakarta'

        >>> normalize(["gw mkan", "dia mnum"])
        ['saya makan', 'dia minum']

        >>> normalize("harga naik terus????", normalize_punctuation=True)
        'harga naik terus?'

        >>> normalize(["", None])
        ['', None]
    """
    # Bundle the pipeline switches once so the scalar and batch paths cannot
    # drift apart when a new option is added.
    options = {
        "apply_slang": apply_slang,
        "apply_typo": apply_typo,
        "lowercase": lowercase,
        "normalize_punctuation": normalize_punctuation,
        "normalize_whitespace": normalize_whitespace,
    }

    def _apply(item):
        # Guard: empty or non-string values are returned untouched instead
        # of being fed to the pipeline (which assumes a non-empty str).
        if not isinstance(item, str) or not item:
            return item
        return _normalize_single(item, **options)

    # Batch path: one output per input element, order and length preserved.
    if isinstance(text, list):
        return [_apply(item) for item in text]

    return _apply(text)
basa/core/quick.py ADDED
@@ -0,0 +1,50 @@
1
+ """
2
+ BASA - Quick API
3
+ =================
4
+ Zero-config shorthand for the most common normalize() call.
5
+
6
+ ``quick()`` always uses all default settings:
7
+ - lowercase=True
8
+ - apply_slang=True
9
+ - apply_typo=False
10
+ - normalize_punctuation=True
11
+ - normalize_whitespace=True
12
+
13
+ This is intentionally a thin alias — no new logic lives here.
14
+ For fine-grained control use ``normalize()`` directly.
15
+
16
+ Usage:
17
+ >>> from basa import quick
18
+ >>> quick("GW GKKKK NGERTIII BNGTTTT!!!!!")
19
+ 'saya tidak mengerti banget!'
20
+ """
21
+
22
+ from __future__ import annotations
23
+
24
+ from typing import List, Union
25
+
26
+ from .normalize import normalize
27
+
28
+
29
def quick(text: Union[str, List[str]]) -> Union[str, List[str]]:
    """
    Zero-configuration shortcut for ``normalize()``.

    Forwards ``text`` to ``normalize()`` without overriding any option, so
    every pipeline switch keeps whatever default ``normalize()`` declares.
    Use ``normalize()`` directly when individual stages need to be toggled.

    Args:
        text: Input string or list of strings.

    Returns:
        Whatever ``normalize(text)`` returns: a normalized string, or a
        list of normalized strings for list input.

    Examples:
        >>> from basa import quick
        >>> quick("gw gk ngerti bngt sihhhh!!!")
        'saya tidak mengerti banget sih!'

        >>> quick(["gw mkan", "dia mnum"])
        ['saya makan', 'dia minum']
    """
    normalized = normalize(text)
    return normalized