basa 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- basa/__init__.py +25 -0
- basa/augment/__init__.py +0 -0
- basa/augment/noise.py +0 -0
- basa/augment/paraphrase.py +0 -0
- basa/augment/synthetic.py +0 -0
- basa/core/__init__.py +0 -0
- basa/core/normalize.py +214 -0
- basa/core/quick.py +50 -0
- basa/core/slang.py +700 -0
- basa/core/typo.py +433 -0
- basa/dataset/__init__.py +0 -0
- basa/dataset/builder.py +0 -0
- basa/dataset/cleaner.py +0 -0
- basa/dataset/split.py +0 -0
- basa/dataset/validator.py +0 -0
- basa/evaluate/__init__.py +0 -0
- basa/evaluate/factual.py +0 -0
- basa/evaluate/metrics.py +0 -0
- basa/evaluate/similarity.py +0 -0
- basa/tokenize/__init__.py +0 -0
- basa/tokenize/lang_detect.py +0 -0
- basa/tokenize/sentence.py +0 -0
- basa/tokenize/word.py +0 -0
- basa/translate/__init__.py +0 -0
- basa/translate/jv_id.py +0 -0
- basa/translate/router.py +0 -0
- basa/translate/su_id.py +0 -0
- basa/utils/__init__.py +0 -0
- basa/utils/constants.py +0 -0
- basa/utils/regex.py +0 -0
- basa/utils/text_clean.py +0 -0
- basa-0.1.0a0.dist-info/METADATA +394 -0
- basa-0.1.0a0.dist-info/RECORD +35 -0
- basa-0.1.0a0.dist-info/WHEEL +4 -0
- basa-0.1.0a0.dist-info/licenses/LICENSE +21 -0
basa/__init__.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BASA - Modern NLP for Indonesian and Regional Languages
|
|
3
|
+
=======================================================
|
|
4
|
+
|
|
5
|
+
Top-level package. Re-exports the public API.
|
|
6
|
+
|
|
7
|
+
Quick start:
|
|
8
|
+
>>> from basa import normalize, quick
|
|
9
|
+
>>> normalize("GW GKKKK NGERTIII BNGTTTT!!!!!")
|
|
10
|
+
'saya tidak mengerti banget!'
|
|
11
|
+
>>> quick("gw gk ngerti bngt sihhhh!!!")
|
|
12
|
+
'saya tidak mengerti banget sih!'
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from .core.normalize import normalize
|
|
16
|
+
from .core.quick import quick
|
|
17
|
+
from .core.typo import typo
|
|
18
|
+
from .core.slang import slang as slang_engine
|
|
19
|
+
|
|
20
|
+
# Public API of the package: the names exported by ``from basa import *``.
# Every entry is bound by the relative imports directly above; note that
# ``slang_engine`` is the ``slang`` object from ``.core.slang`` under an alias.
__all__ = [
    "normalize",
    "quick",
    "typo",
    "slang_engine",
]
|
basa/augment/__init__.py
ADDED
|
File without changes
|
basa/augment/noise.py
ADDED
|
File without changes
|
|
File without changes
|
|
File without changes
|
basa/core/__init__.py
ADDED
|
File without changes
|
basa/core/normalize.py
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BASA - Top-level Normalization API
|
|
3
|
+
====================================
|
|
4
|
+
Provides the simple 1-line normalization interface for the library.
|
|
5
|
+
|
|
6
|
+
Pipeline (in order):
|
|
7
|
+
1. Lowercase – optional, default True
|
|
8
|
+
2. Slang normalization – replaces slang + reduces repeated chars
|
|
9
|
+
3. Typo correction – Levenshtein-based, strictly opt-in
|
|
10
|
+
4. Punctuation reduction – collapses repeated punctuation (!!!! → !)
|
|
11
|
+
5. Whitespace cleanup – trims and collapses multiple spaces
|
|
12
|
+
|
|
13
|
+
Design philosophy:
|
|
14
|
+
- Conservative by default: only low-risk transforms are enabled (note that lowercasing and repeated-punctuation collapsing are not reversible).
|
|
15
|
+
- Destructive features (typo correction) are strictly opt-in.
|
|
16
|
+
- ``_normalize_single`` handles one string; ``normalize`` handles both
|
|
17
|
+
str and List[str] inputs so adding new parameters never breaks the
|
|
18
|
+
batch path.
|
|
19
|
+
|
|
20
|
+
Usage:
|
|
21
|
+
>>> from basa import normalize
|
|
22
|
+
>>> normalize("GW GKKKK NGERTIII BNGTTTT!!!!!")
|
|
23
|
+
'saya tidak mengerti banget!'
|
|
24
|
+
|
|
25
|
+
>>> normalize("Jokowi pergi ke Jakarta", lowercase=False)
|
|
26
|
+
'Jokowi pergi ke Jakarta'
|
|
27
|
+
|
|
28
|
+
>>> normalize(["gw mkan", "dia mnum"], apply_slang=True)
|
|
29
|
+
['saya makan', 'dia minum']
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
from __future__ import annotations
|
|
33
|
+
|
|
34
|
+
import re
|
|
35
|
+
from typing import List, Union
|
|
36
|
+
|
|
37
|
+
from .slang import slang as slang_engine
|
|
38
|
+
from .typo import typo as typo_engine
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
42
|
+
# INTERNAL HELPERS
|
|
43
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
44
|
+
|
|
45
|
+
def _reduce_punctuation(text: str) -> str:
|
|
46
|
+
"""
|
|
47
|
+
Collapse runs of repeated punctuation characters to a single character.
|
|
48
|
+
|
|
49
|
+
Only targets common sentence-ending / emphasis punctuation so that
|
|
50
|
+
legitimate patterns (e.g. ellipsis "...") are also collapsed cleanly.
|
|
51
|
+
This is intentionally narrow to avoid touching hyphens, slashes, etc.
|
|
52
|
+
|
|
53
|
+
Affected characters: . , ! ? ~ * - _
|
|
54
|
+
|
|
55
|
+
Examples:
|
|
56
|
+
>>> _reduce_punctuation("bagus banget!!!!!")
|
|
57
|
+
'bagus banget!'
|
|
58
|
+
>>> _reduce_punctuation("hmmm.....serius???")
|
|
59
|
+
'hmmm.serius?'
|
|
60
|
+
>>> _reduce_punctuation("seru~~~~ banget~~")
|
|
61
|
+
'seru~ banget~'
|
|
62
|
+
"""
|
|
63
|
+
# Match 2+ consecutive occurrences of the same punctuation character
|
|
64
|
+
return re.sub(r'([.!?,~*\-_])\1+', r'\1', text)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _normalize_single(
|
|
68
|
+
text: str,
|
|
69
|
+
apply_slang: bool,
|
|
70
|
+
apply_typo: bool,
|
|
71
|
+
lowercase: bool,
|
|
72
|
+
normalize_punctuation: bool,
|
|
73
|
+
normalize_whitespace: bool,
|
|
74
|
+
) -> str:
|
|
75
|
+
"""
|
|
76
|
+
Apply the full normalization pipeline to a single string.
|
|
77
|
+
|
|
78
|
+
This is the internal workhorse. ``normalize()`` delegates here for
|
|
79
|
+
every input string so that adding new parameters only requires
|
|
80
|
+
updating the signature in one place.
|
|
81
|
+
|
|
82
|
+
Args:
|
|
83
|
+
text: Input string (assumed to be a non-empty str).
|
|
84
|
+
apply_slang: If True, run slang normalization.
|
|
85
|
+
apply_typo: If True, run typo correction (opt-in).
|
|
86
|
+
lowercase: If True, convert to lowercase first.
|
|
87
|
+
normalize_punctuation: If True, collapse repeated punctuation.
|
|
88
|
+
normalize_whitespace: If True, trim and collapse multiple spaces.
|
|
89
|
+
|
|
90
|
+
Returns:
|
|
91
|
+
Normalized string.
|
|
92
|
+
"""
|
|
93
|
+
# ── Stage 1: Lowercase ───────────────────────────────────────────────────
|
|
94
|
+
# Must come first so slang matching is case-insensitive.
|
|
95
|
+
if lowercase:
|
|
96
|
+
text = text.lower()
|
|
97
|
+
|
|
98
|
+
# ── Stage 2: Slang Normalization ─────────────────────────────────────────
|
|
99
|
+
# Handles repeated-char reduction AND slang dict lookup.
|
|
100
|
+
# e.g. "GKKKK" → (lower) "gkkkk" → (char reduce) "gk" → "tidak"
|
|
101
|
+
# We defer whitespace normalization to Stage 5.
|
|
102
|
+
if apply_slang:
|
|
103
|
+
text = slang_engine.normalize(text, normalize_whitespace=False)
|
|
104
|
+
|
|
105
|
+
# ── Stage 3: Typo Correction (Strictly Opt-in) ───────────────────────────
|
|
106
|
+
# Safeguard: skip if vocab is empty to prevent no-op full-corpus scans.
|
|
107
|
+
if apply_typo and typo_engine.vocab:
|
|
108
|
+
text = typo_engine.correct_text(text)
|
|
109
|
+
|
|
110
|
+
# ── Stage 4: Punctuation Reduction ───────────────────────────────────────
|
|
111
|
+
# e.g. "!!!!!" → "!", "?????" → "?", "....." → "."
|
|
112
|
+
if normalize_punctuation:
|
|
113
|
+
text = _reduce_punctuation(text)
|
|
114
|
+
|
|
115
|
+
# ── Stage 5: Whitespace Cleanup ──────────────────────────────────────────
|
|
116
|
+
if normalize_whitespace:
|
|
117
|
+
text = re.sub(r'\s+', ' ', text).strip()
|
|
118
|
+
|
|
119
|
+
return text
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
123
|
+
# PUBLIC API
|
|
124
|
+
# ─────────────────────────────────────────────────────────────────────────────
|
|
125
|
+
|
|
126
|
+
def normalize(
    text: Union[str, List[str]],
    apply_slang: bool = True,
    apply_typo: bool = False,
    lowercase: bool = True,
    normalize_punctuation: bool = True,
    normalize_whitespace: bool = True,
) -> Union[str, List[str]]:
    """
    Normalize informal Indonesian text with a single line of code.

    Applies a configurable pipeline of normalization stages to the input.
    Accepts a single string or a list of strings.

    Design Notes:
        - ``lowercase=True`` by default so slang matching works reliably.
          Set ``lowercase=False`` when case matters (e.g. for NER tasks).
        - ``apply_typo=False`` by default. Typo correction is opt-in because
          it requires a populated vocabulary and can corrupt domain-specific
          terms (e.g. "xgboost", "lightgbm", proper nouns). Load a
          vocabulary into the typo engine first (e.g. via
          ``basa.typo.add_to_vocab()``) before enabling it.
        - Batch results are positionally aligned with the input: the
          returned list always has the same length and order as the list
          passed in. Elements that are empty or not strings are passed
          through unchanged instead of being dropped, mirroring what the
          single-value path does with such input.

    Args:
        text:                  Input string or list of strings to normalize.
        apply_slang:           If True (default), apply slang dictionary
                               lookup and repeated-character reduction.
        apply_typo:            If True, apply Levenshtein-based typo
                               correction. Default False. Silently skipped
                               if the typo engine's vocabulary is empty.
        lowercase:             If True (default), convert text to lowercase
                               before processing. Set False to preserve
                               casing (e.g. for NER pipelines).
        normalize_punctuation: If True (default), collapse repeated
                               punctuation marks (e.g. "!!!" becomes "!").
        normalize_whitespace:  If True (default), trim leading/trailing
                               whitespace and collapse internal runs of
                               whitespace to a single space.

    Returns:
        - For a ``str`` input: the normalized string (an empty string is
          returned as-is).
        - For a ``list`` input: a new list of the same length, where every
          non-empty string has been normalized and every other element is
          left untouched.
        - For any other input (e.g. ``None``): the input itself, unchanged.

    Examples:
        >>> from basa import normalize

        >>> normalize("GW GKKKK NGERTIII BNGTTTT!!!!!")
        'saya tidak mengerti banget!'

        >>> normalize("Jokowi pergi ke Jakarta", lowercase=False)
        'Jokowi pergi ke Jakarta'

        >>> normalize(["gw mkan", "dia mnum"])
        ['saya makan', 'dia minum']

        >>> normalize("harga naik terus????", normalize_punctuation=True)
        'harga naik terus?'

        >>> normalize(["", None])
        ['', None]
    """
    # Gather the stage switches once. Both the single and the batch path go
    # through ``_apply`` below, so a new pipeline option only has to be added
    # to this mapping (and to ``_normalize_single``), not to several calls.
    options = {
        "apply_slang": apply_slang,
        "apply_typo": apply_typo,
        "lowercase": lowercase,
        "normalize_punctuation": normalize_punctuation,
        "normalize_whitespace": normalize_whitespace,
    }

    def _apply(item):
        # Only non-empty strings are worth processing. Everything else is
        # handed back untouched; the type check runs first so objects with
        # unusual truthiness are never evaluated as booleans.
        if isinstance(item, str) and item:
            return _normalize_single(item, **options)
        return item

    # Batch path: one output element per input element, order preserved.
    if isinstance(text, list):
        return [_apply(item) for item in text]

    # Single-value path (also covers invalid / empty input).
    return _apply(text)
|
basa/core/quick.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""
|
|
2
|
+
BASA - Quick API
|
|
3
|
+
=================
|
|
4
|
+
Zero-config shorthand for the most common normalize() call.
|
|
5
|
+
|
|
6
|
+
``quick()`` always uses all default settings:
|
|
7
|
+
- lowercase=True
|
|
8
|
+
- apply_slang=True
|
|
9
|
+
- apply_typo=False
|
|
10
|
+
- normalize_punctuation=True
|
|
11
|
+
- normalize_whitespace=True
|
|
12
|
+
|
|
13
|
+
This is intentionally a thin alias — no new logic lives here.
|
|
14
|
+
For fine-grained control use ``normalize()`` directly.
|
|
15
|
+
|
|
16
|
+
Usage:
|
|
17
|
+
>>> from basa import quick
|
|
18
|
+
>>> quick("GW GKKKK NGERTIII BNGTTTT!!!!!")
|
|
19
|
+
'saya tidak mengerti banget!'
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
from __future__ import annotations
|
|
23
|
+
|
|
24
|
+
from typing import List, Union
|
|
25
|
+
|
|
26
|
+
from .normalize import normalize
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def quick(text: Union[str, List[str]]) -> Union[str, List[str]]:
    """
    Zero-configuration shortcut for ``normalize()``.

    Forwards ``text`` to ``normalize()`` without passing any option, so
    every pipeline stage runs with that function's default setting. Reach
    for ``normalize()`` directly when a stage needs to be toggled.

    Args:
        text: A single string, or a list of strings.

    Returns:
        Whatever ``normalize(text)`` returns: a normalized string for a
        string input, or a list for a list input.

    Examples:
        >>> from basa import quick
        >>> quick("gw gk ngerti bngt sihhhh!!!")
        'saya tidak mengerti banget sih!'

        >>> quick(["gw mkan", "dia mnum"])
        ['saya makan', 'dia minum']
    """
    # Deliberately no keyword arguments: this alias must track the defaults
    # of normalize() rather than pin its own copies of them.
    normalized = normalize(text)
    return normalized
|