basa 0.1.0a0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. basa-0.1.0a0/.gitignore +220 -0
  2. basa-0.1.0a0/LICENSE +21 -0
  3. basa-0.1.0a0/PKG-INFO +394 -0
  4. basa-0.1.0a0/README.md +368 -0
  5. basa-0.1.0a0/pyproject.toml +37 -0
  6. basa-0.1.0a0/src/basa/__init__.py +25 -0
  7. basa-0.1.0a0/src/basa/augment/__init__.py +0 -0
  8. basa-0.1.0a0/src/basa/augment/noise.py +0 -0
  9. basa-0.1.0a0/src/basa/augment/paraphrase.py +0 -0
  10. basa-0.1.0a0/src/basa/augment/synthetic.py +0 -0
  11. basa-0.1.0a0/src/basa/core/__init__.py +0 -0
  12. basa-0.1.0a0/src/basa/core/normalize.py +214 -0
  13. basa-0.1.0a0/src/basa/core/quick.py +50 -0
  14. basa-0.1.0a0/src/basa/core/slang.py +700 -0
  15. basa-0.1.0a0/src/basa/core/typo.py +433 -0
  16. basa-0.1.0a0/src/basa/dataset/__init__.py +0 -0
  17. basa-0.1.0a0/src/basa/dataset/builder.py +0 -0
  18. basa-0.1.0a0/src/basa/dataset/cleaner.py +0 -0
  19. basa-0.1.0a0/src/basa/dataset/split.py +0 -0
  20. basa-0.1.0a0/src/basa/dataset/validator.py +0 -0
  21. basa-0.1.0a0/src/basa/evaluate/__init__.py +0 -0
  22. basa-0.1.0a0/src/basa/evaluate/factual.py +0 -0
  23. basa-0.1.0a0/src/basa/evaluate/metrics.py +0 -0
  24. basa-0.1.0a0/src/basa/evaluate/similarity.py +0 -0
  25. basa-0.1.0a0/src/basa/tokenize/__init__.py +0 -0
  26. basa-0.1.0a0/src/basa/tokenize/lang_detect.py +0 -0
  27. basa-0.1.0a0/src/basa/tokenize/sentence.py +0 -0
  28. basa-0.1.0a0/src/basa/tokenize/word.py +0 -0
  29. basa-0.1.0a0/src/basa/translate/__init__.py +0 -0
  30. basa-0.1.0a0/src/basa/translate/jv_id.py +0 -0
  31. basa-0.1.0a0/src/basa/translate/router.py +0 -0
  32. basa-0.1.0a0/src/basa/translate/su_id.py +0 -0
  33. basa-0.1.0a0/src/basa/utils/__init__.py +0 -0
  34. basa-0.1.0a0/src/basa/utils/constants.py +0 -0
  35. basa-0.1.0a0/src/basa/utils/regex.py +0 -0
  36. basa-0.1.0a0/src/basa/utils/text_clean.py +0 -0
  37. basa-0.1.0a0/tests/test_normalize.py +272 -0
@@ -0,0 +1,220 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .venv
11
+ .Python
12
+ .idea/
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ lib/
20
+ lib64/
21
+ parts/
22
+ sdist/
23
+ var/
24
+ wheels/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py.cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+ cover/
55
+
56
+ # Translations
57
+ *.mo
58
+ *.pot
59
+
60
+ # Django stuff:
61
+ *.log
62
+ local_settings.py
63
+ db.sqlite3
64
+ db.sqlite3-journal
65
+
66
+ # Flask stuff:
67
+ instance/
68
+ .webassets-cache
69
+
70
+ # Scrapy stuff:
71
+ .scrapy
72
+
73
+ # Sphinx documentation
74
+ docs/_build/
75
+
76
+ # PyBuilder
77
+ .pybuilder/
78
+ target/
79
+
80
+ # Jupyter Notebook
81
+ .ipynb_checkpoints
82
+
83
+ # IPython
84
+ profile_default/
85
+ ipython_config.py
86
+
87
+ # pyenv
88
+ # For a library or package, you might want to ignore these files since the code is
89
+ # intended to run in multiple environments; otherwise, check them in:
90
+ # .python-version
91
+
92
+ # pipenv
93
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
95
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
96
+ # install all needed dependencies.
97
+ # Pipfile.lock
98
+
99
+ # UV
100
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
101
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
102
+ # commonly ignored for libraries.
103
+ # uv.lock
104
+
105
+ # poetry
106
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
107
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
108
+ # commonly ignored for libraries.
109
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
110
+ # poetry.lock
111
+ # poetry.toml
112
+
113
+ # pdm
114
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
116
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
117
+ # pdm.lock
118
+ # pdm.toml
119
+ .pdm-python
120
+ .pdm-build/
121
+
122
+ # pixi
123
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
124
+ # pixi.lock
125
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
126
+ # in the .venv directory. It is recommended not to include this directory in version control.
127
+ .pixi
128
+
129
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
130
+ __pypackages__/
131
+
132
+ # Celery stuff
133
+ celerybeat-schedule
134
+ celerybeat.pid
135
+
136
+ # Redis
137
+ *.rdb
138
+ *.aof
139
+ *.pid
140
+
141
+ # RabbitMQ
142
+ mnesia/
143
+ rabbitmq/
144
+ rabbitmq-data/
145
+
146
+ # ActiveMQ
147
+ activemq-data/
148
+
149
+ # SageMath parsed files
150
+ *.sage.py
151
+
152
+ # Environments
153
+ .env
154
+ .envrc
155
+ .venv
156
+ env/
157
+ venv/
158
+ ENV/
159
+ env.bak/
160
+ venv.bak/
161
+
162
+ # Spyder project settings
163
+ .spyderproject
164
+ .spyproject
165
+
166
+ # Rope project settings
167
+ .ropeproject
168
+
169
+ # mkdocs documentation
170
+ /site
171
+
172
+ # mypy
173
+ .mypy_cache/
174
+ .dmypy.json
175
+ dmypy.json
176
+
177
+ # Pyre type checker
178
+ .pyre/
179
+
180
+ # pytype static type analyzer
181
+ .pytype/
182
+
183
+ # Cython debug symbols
184
+ cython_debug/
185
+
186
+ # PyCharm
187
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
188
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
189
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
190
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
191
+ # .idea/
192
+
193
+ # Abstra
194
+ # Abstra is an AI-powered process automation framework.
195
+ # Ignore directories containing user credentials, local state, and settings.
196
+ # Learn more at https://abstra.io/docs
197
+ .abstra/
198
+
199
+ # Visual Studio Code
200
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
201
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
202
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
203
+ # you could uncomment the following to ignore the entire vscode folder
204
+ # .vscode/
205
+ # Temporary file for partial code execution
206
+ tempCodeRunnerFile.py
207
+
208
+ # Ruff stuff:
209
+ .ruff_cache/
210
+
211
+ # PyPI configuration file
212
+ .pypirc
213
+
214
+ # Marimo
215
+ marimo/_static/
216
+ marimo/_lsp/
217
+ __marimo__/
218
+
219
+ # Streamlit
220
+ .streamlit/secrets.toml
basa-0.1.0a0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Muanai Khalifah Revindo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
basa-0.1.0a0/PKG-INFO ADDED
@@ -0,0 +1,394 @@
1
+ Metadata-Version: 2.4
2
+ Name: basa
3
+ Version: 0.1.0a0
4
+ Summary: Modern NLP for Indonesian and regional languages
5
+ Author-email: Muanai Khalifah Revindo <muanaikhalifahr@gmail.com>
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Requires-Python: >=3.10
9
+ Requires-Dist: pydantic>=2.0.0
10
+ Requires-Dist: torch>=2.0.0
11
+ Requires-Dist: transformers>=4.30.0
12
+ Provides-Extra: dev
13
+ Requires-Dist: black>=23.0.0; extra == 'dev'
14
+ Requires-Dist: mypy>=1.0.0; extra == 'dev'
15
+ Requires-Dist: pytest-cov>=4.1.0; extra == 'dev'
16
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
17
+ Requires-Dist: ruff>=0.1.0; extra == 'dev'
18
+ Provides-Extra: evaluation
19
+ Requires-Dist: bert-score>=0.3.13; extra == 'evaluation'
20
+ Requires-Dist: rouge-score>=0.1.2; extra == 'evaluation'
21
+ Requires-Dist: seqeval>=1.2.2; extra == 'evaluation'
22
+ Provides-Extra: serving
23
+ Requires-Dist: fastapi>=0.100.0; extra == 'serving'
24
+ Requires-Dist: uvicorn>=0.23.0; extra == 'serving'
25
+ Description-Content-Type: text/markdown
26
+
27
+ # BASA
28
+
29
+ **Modern NLP preprocessing for Indonesian and regional languages.**
30
+
31
+ BASA is a lightweight preprocessing library with a standard-library-only core, designed for real-world Indonesian text — the kind found on Twitter/X, TikTok, WhatsApp, Shopee reviews, and Discord. It normalizes informal slang, collapses expressive character repetition, reduces punctuation noise, and optionally corrects typos, all through a single clean API.
32
+
33
+ ```python
34
+ from basa import normalize
35
+
36
+ normalize("GW GKKKK NGERTIII BNGTTTT!!!!!")
37
+ # → 'saya tidak mengerti banget!'
38
+ ```
39
+
40
+ ---
41
+
42
+ ## Why BASA?
43
+
44
+ Indonesian social media text is notoriously difficult to process with standard NLP tools:
45
+
46
+ | Raw input | After `normalize()` |
47
+ |---|---|
48
+ | `gw gk ngerti bngt sihhhh!!!` | `saya tidak mengerti banget sih!` |
49
+ | `kmrn gamau makan krn baper bgt` | `kemarin tidak mau makan karena bawa perasaan banget` |
50
+ | `otw gan, rekber dlu ya!!!!!` | `dalam perjalanan saudara, rekening bersama dulu ya!` |
51
+ | `GW GKKKK NGERTIII BNGTTTT!!!!!` | `saya tidak mengerti banget!` |
52
+
53
+ Standard tokenizers and language models often fail on this kind of input because they see `"gkkkk"`, `"bngtttt"`, and `"ngertiii"` as unknown tokens. BASA normalizes them first.
54
+
55
+ ---
56
+
57
+ ## Installation
58
+
59
+ ```bash
60
+ pip install basa
61
+ ```
62
+
63
+ > **Requires Python 3.10+**
64
+
65
+ ---
66
+
67
+ ## Quick Start
68
+
69
+ ### One-liner (recommended for most use cases)
70
+
71
+ ```python
72
+ from basa import normalize
73
+
74
+ normalize("gw gk ngerti bngt sihhhh!!!")
75
+ # → 'saya tidak mengerti banget sih!'
76
+ ```
77
+
78
+ ### Zero-config alias
79
+
80
+ ```python
81
+ from basa import quick
82
+
83
+ quick("GW GKKKK NGERTIII BNGTTTT!!!!!")
84
+ # → 'saya tidak mengerti banget!'
85
+ ```
86
+
87
+ `quick()` is a thin alias for `normalize()` with all defaults applied. Use it when you want the shortest possible call.
88
+
89
+ ### Batch processing
90
+
91
+ ```python
92
+ from basa import normalize
93
+
94
+ texts = [
95
+ "gw gk ngerti",
96
+ "lu udh makan??",
97
+ "kmrn gamau pergi krn baper bgt",
98
+ ]
99
+
100
+ normalize(texts)
101
+ # → ['saya tidak mengerti', 'kamu sudah makan?', 'kemarin tidak mau pergi karena bawa perasaan banget']
102
+ ```
103
+
104
+ ---
105
+
106
+ ## API Reference
107
+
108
+ ### `normalize(text, **options)`
109
+
110
+ Normalize informal Indonesian text. Accepts a single string or a list of strings.
111
+
112
+ ```python
113
+ normalize(
114
+ text: Union[str, List[str]],
115
+ apply_slang: bool = True,
116
+ apply_typo: bool = False,
117
+ lowercase: bool = True,
118
+ normalize_punctuation: bool = True,
119
+ normalize_whitespace: bool = True,
120
+ ) -> Union[str, List[str]]
121
+ ```
122
+
123
+ #### Parameters
124
+
125
+ | Parameter | Type | Default | Description |
126
+ |---|---|---|---|
127
+ | `text` | `str` or `List[str]` | — | Input text or list of texts. |
128
+ | `apply_slang` | `bool` | `True` | Expand slang and reduce expressive repeated characters (e.g. `"bngtttt"` → `"banget"`). |
129
+ | `apply_typo` | `bool` | `False` | Correct misspelled words using Levenshtein distance. **Opt-in** — requires a vocabulary to be loaded first. |
130
+ | `lowercase` | `bool` | `True` | Lowercase the text before processing. Set `False` for NER and case-sensitive pipelines. |
131
+ | `normalize_punctuation` | `bool` | `True` | Collapse repeated punctuation marks (`"!!!!!"` → `"!"`). |
132
+ | `normalize_whitespace` | `bool` | `True` | Strip leading/trailing whitespace and collapse internal multiple spaces. |
133
+
134
+ #### Processing pipeline (in order)
135
+
136
+ ```
137
+ 1. lowercase → "GW GK NGERTI" → "gw gk ngerti"
138
+ 2. slang normalization → "gkkkk" → "gk" → "tidak"
139
+ 3. typo correction → "mkan" → "makan" (opt-in)
140
+ 4. punctuation → "!!!!!" → "!"
141
+ 5. whitespace cleanup → " a b " → "a b"
142
+ ```
143
+
144
+ #### Examples
145
+
146
+ ```python
147
+ # Preserve case for NER tasks
148
+ normalize("Jokowi pergi ke Jakarta", lowercase=False)
149
+ # → 'Jokowi pergi ke Jakarta'
150
+
151
+ # Disable slang (pass through raw tokens)
152
+ normalize("gw gk ngerti", apply_slang=False)
153
+ # → 'gw gk ngerti'
154
+
155
+ # Enable typo correction (requires vocab)
156
+ from basa import typo
157
+ typo.add_to_vocab({"makan", "minum", "pergi"})
158
+ normalize("saya mkan dan mnum", apply_typo=True)
159
+ # → 'saya makan dan minum'
160
+ ```
161
+
162
+ ---
163
+
164
+ ### `quick(text)`
165
+
166
+ Zero-config alias for `normalize()` with all default settings.
167
+
168
+ ```python
169
+ from basa import quick
170
+
171
+ quick("gw gamau pergi krn mager")
172
+ # → 'saya tidak mau pergi karena malas bergerak'
173
+ ```
174
+
175
+ ---
176
+
177
+ ### `typo` — Typo Corrector
178
+
179
+ BASA's typo corrector is **vocabulary-driven and opt-in (disabled by default)**. You supply the vocabulary; BASA finds the closest match using Levenshtein distance.
180
+
181
+ ```python
182
+ from basa import typo
183
+
184
+ # Load your domain vocabulary
185
+ typo.add_to_vocab({"makan", "minum", "masak", "pergi", "datang"})
186
+
187
+ typo.correct("mkan") # → 'makan'
188
+ typo.correct("mnum")  # → 'minum'
189
+ typo.correct("ok") # → 'ok' (too short, skipped by default)
190
+
191
+ # Correct a full sentence
192
+ typo.correct_text("saya mkan dan mnum")
193
+ # → 'saya makan dan minum'
194
+
195
+ # Get multiple suggestions
196
+ typo.suggest("mkan", top_k=3)
197
+ # → ['makan', 'masak', 'minum']
198
+ ```
199
+
200
+ #### Why is `apply_typo=False` by default?
201
+
202
+ Typo correction is **destructive** when applied blindly. Without the right vocabulary, domain-specific terms like `xgboost`, `lightgbm`, or `rekber` would be mangled. BASA follows the principle of *conservative by default, destructive features opt-in*.
203
+
204
+ #### Vocabulary management
205
+
206
+ ```python
207
+ from basa import typo
208
+
209
+ typo.add_to_vocab({"kata", "lain"}) # add words
210
+ typo.remove_from_vocab({"kata"}) # remove words
211
+ typo.clear_vocab() # reset entirely
212
+ len(typo) # vocab size
213
+ "makan" in typo # membership check
214
+
215
+ # Check cache statistics (useful for profiling)
216
+ typo.cache_info()
217
+ # → {'hits': 120, 'misses': 35, 'size': 35}
218
+ ```
219
+
220
+ #### Typo corrector options
221
+
222
+ ```python
223
+ from basa.core.typo import TypoCorrector
224
+
225
+ corrector = TypoCorrector(
226
+ vocab={"makan", "minum"},
227
+ min_word_length=4, # tokens shorter than this are skipped (default: 4)
228
+ min_confidence=0.5, # minimum correction confidence in [0, 1] (default: 0.5)
229
+ )
230
+ ```
231
+
232
+ ---
233
+
234
+ ### `slang` — Slang Normalizer
235
+
236
+ Access the underlying slang engine directly for fine-grained control.
237
+
238
+ ```python
239
+ from basa.core.slang import slang, SlangNormalizer
240
+
241
+ # Use the singleton
242
+ slang.normalize("gw gamau pergi krn lg baper bgt")
243
+ # → 'saya tidak mau pergi karena sedang bawa perasaan banget'
244
+
245
+ # Custom dictionary (extend or override defaults)
246
+ custom = SlangNormalizer(custom_mapping={
247
+ "gaskeun": "ayo lakukan",
248
+ "jancok": "ekspresi",
249
+ })
250
+ custom.normalize("gaskeun bro!")
251
+ # → 'ayo lakukan bro!'
252
+
253
+ # Batch normalize
254
+ slang.normalize_batch(["gw makan", "lu minum"])
255
+ # → ['saya makan', 'kamu minum']
256
+ ```
257
+
258
+ #### Slang dictionary categories
259
+
260
+ The built-in dictionary covers **250+ entries** across 13 categories:
261
+
262
+ | Category | Examples |
263
+ |---|---|
264
+ | Pronouns | `gw` → saya, `lu` → kamu, `dy` → dia |
265
+ | Kinship & address | `kk` → kakak, `klg` → keluarga, `ortu` → orang tua |
266
+ | Negation | `ga`, `gak`, `nggak` → tidak |
267
+ | Compound negation | `gamau` → tidak mau, `gabisa` → tidak bisa |
268
+ | Conjunctions | `yg` → yang, `krn` → karena, `tp` → tapi |
269
+ | Verbs | `udah` → sudah, `blm` → belum, `ngerti` → mengerti |
270
+ | Adjectives & adverbs | `bgt` → banget, `bener` → benar, `dikit` → sedikit |
271
+ | Question words | `gmn` → bagaimana, `knp` → kenapa, `kmn` → kemana |
272
+ | Greetings & responses | `makasih` → terima kasih, `sip` → baik |
273
+ | Temporal & location | `skrg` → sekarang, `kmrn` → kemarin, `ntr` → nanti |
274
+ | Internet slang | `otw` → dalam perjalanan, `btw` → omong-omong, `wkwk` → tertawa |
275
+ | E-commerce & finance | `ongkir` → ongkos kirim, `rekber` → rekening bersama, `cod` → bayar di tempat |
276
+ | Youth / Gen-Z | `mager` → malas bergerak, `baper` → bawa perasaan, `gabut` → tidak ada kegiatan |
277
+
278
+ ---
279
+
280
+ ## Real-World Use Cases
281
+
282
+ ### Preprocessing for sentiment analysis
283
+
284
+ ```python
285
+ from basa import normalize
286
+
287
+ reviews = [
288
+ "produknya bagus bgt tp ongkirnya mahal bgt!!!",
289
+ "gw kecewa bngt, barang ga sesuai deskripsi smskali",
290
+ "rekber dlu gan, takut kena tipu",
291
+ ]
292
+
293
+ clean = normalize(reviews)
294
+ # Pass clean into your sentiment model
295
+ ```
296
+
297
+ ### Preprocessing for a custom NLP pipeline
298
+
299
+ ```python
300
+ from basa import normalize, typo
301
+
302
+ # Load your domain vocabulary (e.g., from a word list file)
303
+ with open("vocab.txt") as f:
304
+     domain_vocab = set(f.read().splitlines())
305
+
306
+ typo.add_to_vocab(domain_vocab)
307
+
308
+ def preprocess(text: str) -> str:
309
+     return normalize(text, apply_typo=True)
310
+
311
+ preprocess("gw mkan siang tdi di wrng padang")
312
+ # → 'saya makan siang tadi di warung padang'
313
+ ```
314
+
315
+ ### NER pipeline (preserve casing)
316
+
317
+ ```python
318
+ from basa import normalize
319
+
320
+ text = "Jokowi blg bhw pemerintah akan bantu UMKM"
321
+ normalize(text, lowercase=False)
322
+ # → 'Jokowi bilang bahwa pemerintah akan bantu UMKM'
323
+ ```
324
+
325
+ ---
326
+
327
+ ## Design Philosophy
328
+
329
+ BASA is built around three principles:
330
+
331
+ 1. **Conservative by default.** Only safe, lossless transforms are enabled out of the box. Destructive features (like typo correction) require explicit opt-in.
332
+
333
+ 2. **No bundled vocabularies for correction.** Every domain has different vocabulary needs — fintech, e-commerce, ML, healthcare. Callers supply their own word list via `typo.add_to_vocab()`.
334
+
335
+ 3. **Zero required dependencies for core preprocessing.** The `normalize()` and `slang` modules use only the Python standard library. The `transformers`, `torch`, and `pydantic` dependencies are needed only by the advanced modules (`basa.translate`, `basa.evaluate`), although the package metadata currently declares them as install requirements.
336
+
337
+ ---
338
+
339
+ ## Development
340
+
341
+ ### Setup
342
+
343
+ ```bash
344
+ git clone https://github.com/Muanai/basa.git
345
+ cd basa
346
+ python -m venv .venv
347
+ .venv\Scripts\activate # Windows
348
+ # source .venv/bin/activate # macOS / Linux
349
+ pip install -e ".[dev]"
350
+ ```
351
+
352
+ ### Running tests
353
+
354
+ ```bash
355
+ pytest tests/ -v
356
+ ```
357
+
358
+ ### Optional extras
359
+
360
+ ```bash
361
+ pip install -e ".[serving]" # FastAPI serving
362
+ pip install -e ".[evaluation]" # ROUGE, BERTScore, seqeval
363
+ pip install -e ".[dev]" # pytest, ruff, black, mypy
364
+ ```
365
+
366
+ ---
367
+
368
+ ## Roadmap
369
+
370
+ | Version | Status | Features |
371
+ |---|---|---|
372
+ | **v0.1** | ✅ Current | `normalize()`, `quick()`, slang (250+ entries), typo corrector |
373
+ | **v0.2** | 🔜 Planned | BK-Tree / SymSpell for faster typo correction at large vocab sizes |
374
+ | **v0.3** | 🔜 Planned | Emoji handling, `remove_emoji` flag |
375
+ | **v0.4** | 🔜 Planned | Tokenizer module (`basa.tokenize`) |
376
+ | **v1.0** | 🔜 Planned | Stable API, full docs site, PyPI release |
377
+
378
+ ---
379
+
380
+ ## Contributing
381
+
382
+ Contributions are welcome! In particular:
383
+
384
+ - **Slang dictionary additions** — if you spot a common slang word that's missing, open a PR adding it to the appropriate category in [`src/basa/core/slang.py`](src/basa/core/slang.py).
385
+ - **Bug reports** — please include the exact input string and the unexpected output.
386
+ - **Performance improvements** — especially for the typo correction module.
387
+
388
+ Please open an issue before submitting large changes.
389
+
390
+ ---
391
+
392
+ ## License
393
+
394
+ MIT © 2026 [Muanai Khalifah Revindo](mailto:muanaikhalifahr@gmail.com)