khmerthings 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. khmerthings-0.4.3/.github/PULL_REQUEST_TEMPLATE.md +28 -0
  2. khmerthings-0.4.3/.github/workflows/ci.yml +69 -0
  3. khmerthings-0.4.3/.github/workflows/publish.yml +58 -0
  4. khmerthings-0.4.3/.gitignore +8 -0
  5. khmerthings-0.4.3/AGENTS.md +130 -0
  6. khmerthings-0.4.3/CHANGELOG.md +131 -0
  7. khmerthings-0.4.3/CLAUDE.md +1 -0
  8. khmerthings-0.4.3/DEVELOPMENT_GUIDE.md +123 -0
  9. khmerthings-0.4.3/LICENSE +21 -0
  10. khmerthings-0.4.3/PKG-INFO +101 -0
  11. khmerthings-0.4.3/README.md +85 -0
  12. khmerthings-0.4.3/docs/line-sorter.md +174 -0
  13. khmerthings-0.4.3/docs/word-breaker.md +253 -0
  14. khmerthings-0.4.3/docs/word-counter.md +246 -0
  15. khmerthings-0.4.3/pyproject.toml +58 -0
  16. khmerthings-0.4.3/src/khmerthings/__init__.py +29 -0
  17. khmerthings-0.4.3/src/khmerthings/__main__.py +7 -0
  18. khmerthings-0.4.3/src/khmerthings/chars.py +129 -0
  19. khmerthings-0.4.3/src/khmerthings/cli.py +147 -0
  20. khmerthings-0.4.3/src/khmerthings/clusters.py +77 -0
  21. khmerthings-0.4.3/src/khmerthings/counter.py +73 -0
  22. khmerthings-0.4.3/src/khmerthings/data/modern.txt +44 -0
  23. khmerthings-0.4.3/src/khmerthings/data/names.txt +212 -0
  24. khmerthings-0.4.3/src/khmerthings/data/words.txt +610 -0
  25. khmerthings-0.4.3/src/khmerthings/lexicon.py +126 -0
  26. khmerthings-0.4.3/src/khmerthings/py.typed +0 -0
  27. khmerthings-0.4.3/src/khmerthings/segmenter.py +59 -0
  28. khmerthings-0.4.3/src/khmerthings/sorting.py +79 -0
  29. khmerthings-0.4.3/src/khmerthings/tokenizer.py +125 -0
  30. khmerthings-0.4.3/tests/test_chars.py +135 -0
  31. khmerthings-0.4.3/tests/test_cli.py +197 -0
  32. khmerthings-0.4.3/tests/test_clusters.py +104 -0
  33. khmerthings-0.4.3/tests/test_counter.py +81 -0
  34. khmerthings-0.4.3/tests/test_lexicon.py +135 -0
  35. khmerthings-0.4.3/tests/test_segmenter.py +83 -0
  36. khmerthings-0.4.3/tests/test_sorting.py +93 -0
  37. khmerthings-0.4.3/tests/test_tokenizer.py +118 -0
  38. khmerthings-0.4.3/uv.lock +467 -0
@@ -0,0 +1,28 @@
1
+ # Summary
2
+
3
+ <!-- What does this PR change, and why? -->
4
+
5
+ ## Type of change
6
+
7
+ - [ ] New tool / feature
8
+ - [ ] Bug fix
9
+ - [ ] Lexicon update (`words.txt`)
10
+ - [ ] Tests / CI / tooling
11
+ - [ ] Documentation
12
+
13
+ ## Checklist
14
+
15
+ - [ ] `uv run pytest` passes; new behavior is covered by tests
16
+ - [ ] `uv run mypy src tests` is clean
17
+ - [ ] `uv run ruff check` and `uv run ruff format --check` are clean
18
+ - [ ] Change is fully deterministic (no randomness, no ML/LLM inference)
19
+ - [ ] No third-party Khmer NLP code or data introduced; zero new runtime deps
20
+ - [ ] Lexicon edits (if any): NFC, Khmer-only, no duplicates, both ្ត/្ដ
21
+ variants where applicable
22
+ - [ ] Public API changes re-exported in `__init__.py` and documented in
23
+ `README.md` / `AGENTS.md`
24
+ - [ ] `CHANGELOG.md` updated under `[Unreleased]` (any user-visible change)
25
+ - [ ] Affected docs updated (`README.md`, `AGENTS.md`,
26
+ `DEVELOPMENT_GUIDE.md`, docstrings)
27
+ - [ ] Tool changes: per-tool doc in `docs/<tool>.md` added/updated, with
28
+ real (executed) example outputs
@@ -0,0 +1,69 @@
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+
8
+ permissions:
9
+ contents: read
10
+
11
+ concurrency:
12
+ group: ci-${{ github.workflow }}-${{ github.ref }}
13
+ cancel-in-progress: true
14
+
15
+ jobs:
16
+ lint:
17
+ name: Lint & type check
18
+ runs-on: ubuntu-latest
19
+ steps:
20
+ - uses: actions/checkout@v4
21
+ - uses: astral-sh/setup-uv@v5
22
+ with:
23
+ enable-cache: true
24
+ - name: Install
25
+ run: uv sync --locked
26
+ - name: Ruff lint
27
+ run: uv run ruff check --output-format=github
28
+ - name: Ruff format
29
+ run: uv run ruff format --check
30
+ - name: Mypy
31
+ run: uv run mypy src tests
32
+
33
+ test:
34
+ name: Test (Python ${{ matrix.python-version }})
35
+ runs-on: ubuntu-latest
36
+ strategy:
37
+ fail-fast: false
38
+ matrix:
39
+ python-version: ["3.11", "3.12", "3.13", "3.14"]
40
+ steps:
41
+ - uses: actions/checkout@v4
42
+ - uses: astral-sh/setup-uv@v5
43
+ with:
44
+ enable-cache: true
45
+ python-version: ${{ matrix.python-version }}
46
+ - name: Install
47
+ run: uv sync --locked
48
+ - name: Run tests
49
+ run: uv run pytest --cov=khmerthings --cov-report=term-missing
50
+
51
+ build:
52
+ name: Build package
53
+ runs-on: ubuntu-latest
54
+ steps:
55
+ - uses: actions/checkout@v4
56
+ - uses: astral-sh/setup-uv@v5
57
+ with:
58
+ enable-cache: true
59
+ - name: Build sdist and wheel
60
+ run: uv build
61
+ - name: Smoke test the wheel
62
+ run: |
63
+ uv venv /tmp/smoke
64
+ uv pip install --python /tmp/smoke/bin/python dist/*.whl
65
+ echo "ខ្ញុំស្រឡាញ់ភាសាខ្មែរ" | /tmp/smoke/bin/python -m khmerthings count -
66
+ - uses: actions/upload-artifact@v4
67
+ with:
68
+ name: dist
69
+ path: dist/
@@ -0,0 +1,58 @@
1
+ name: Publish
2
+
3
+ on:
4
+ push:
5
+ tags: ["v*"]
6
+
7
+ permissions:
8
+ contents: read
9
+
10
+ jobs:
11
+ checks:
12
+ name: Full checks
13
+ runs-on: ubuntu-latest
14
+ steps:
15
+ - uses: actions/checkout@v4
16
+ - uses: astral-sh/setup-uv@v5
17
+ with:
18
+ enable-cache: true
19
+ - name: Install
20
+ run: uv sync --locked
21
+ - name: Ruff lint
22
+ run: uv run ruff check --output-format=github
23
+ - name: Ruff format
24
+ run: uv run ruff format --check
25
+ - name: Mypy
26
+ run: uv run mypy src tests
27
+ - name: Run tests
28
+ run: uv run pytest
29
+
30
+ publish:
31
+ name: Build and publish to PyPI
32
+ runs-on: ubuntu-latest
33
+ needs: checks
34
+ environment: pypi
35
+ steps:
36
+ - uses: actions/checkout@v4
37
+ - uses: astral-sh/setup-uv@v5
38
+ with:
39
+ enable-cache: true
40
+ - name: Check tag matches package version
41
+ run: |
42
+ version=$(uv run --no-sync python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
43
+ tag="${GITHUB_REF_NAME#v}"
44
+ if [ "$version" != "$tag" ]; then
45
+ echo "Tag v$tag does not match pyproject.toml version $version" >&2
46
+ exit 1
47
+ fi
48
+ - name: Build sdist and wheel
49
+ run: uv build
50
+ - name: Smoke test the wheel
51
+ run: |
52
+ uv venv /tmp/smoke
53
+ uv pip install --python /tmp/smoke/bin/python dist/*.whl
54
+ echo "ខ្ញុំស្រឡាញ់ភាសាខ្មែរ" | /tmp/smoke/bin/python -m khmerthings count -
55
+ - name: Publish to PyPI
56
+ run: uv publish --token "$PYPI_API_TOKEN"
57
+ env:
58
+ PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
@@ -0,0 +1,8 @@
1
+ .venv/
2
+ __pycache__/
3
+ *.egg-info/
4
+ dist/
5
+ .pytest_cache/
6
+ .mypy_cache/
7
+ .ruff_cache/
8
+ .coverage
@@ -0,0 +1,130 @@
1
+ # khmerthings — agent guide
2
+
3
+ Deterministic Khmer language tools in Python. Note: the repo directory is
4
+ `libkhm`, but the package, CLI, and PyPI name is **khmerthings**.
5
+
6
+ ## Hard constraints (non-negotiable)
7
+
8
+ - **Deterministic only.** Every tool must be rule/algorithm/dictionary-based.
9
+ Same input → same output, always. No probabilistic models, no LLMs, no ML
10
+ inference at runtime.
11
+ - **No third-party Khmer NLP code; word data is self-curated.** Do not add
12
+ dependencies on or copy from other Khmer NLP projects, and never
13
+ bulk-import someone else's wordlist. Web research to *find and verify*
14
+ candidate words/names/slang is allowed (user decision, 2026-07-03) — but
15
+ every entry is curated individually, spellings cross-checked, and sources
16
+ noted in the data file header. Data files under `src/khmerthings/data/`:
17
+ `words.txt` (core), `names.txt` (names, surnames, titles), `modern.txt`
18
+ (slang, informal, loanwords, trending). All three are growable and merged
19
+ via `load_lexicon(*sources)`.
20
+ - **Zero runtime dependencies.** Stdlib only. Dev tools (pytest, ruff, mypy)
21
+ are the only allowed dependencies.
22
+ - **Tests are the top priority.** Every module ships with table-driven unit
23
+ tests and invariant checks. Write/update tests with every change.
24
+
25
+ ## Commands
26
+
27
+ ```sh
28
+ uv sync # env + dev deps
29
+ uv run pytest # tests (must stay green)
30
+ uv run mypy src tests # strict mode, must be clean
31
+ uv run ruff check --fix && uv run ruff format
32
+ uv build # sdist + wheel
33
+ echo "ខ្មែរ" | uv run khmerthings count # CLI smoke test
34
+ ```
35
+
36
+ Run all four checks (pytest, mypy, ruff check, ruff format --check) before
37
+ considering any change done.
38
+
39
+ ## Architecture
40
+
41
+ `src/` layout; modules build bottom-up, each layer a primitive for the next:
42
+
43
+ 1. `chars.py` — Khmer Unicode character classification (pure functions,
44
+ single-character contract: multi-char input raises `ValueError`).
45
+ 2. `clusters.py` — Khmer character-cluster (KCC) segmentation. Cluster
46
+ boundaries are the only legal word boundaries.
47
+ 3. `lexicon.py` + `data/*.txt` — wordlists (`words`/`names`/`modern`) +
48
+ trie keyed by clusters; `longest_match` is the segmentation primitive;
49
+ `load_lexicon(*sources)` merges sources (cached), `--include` on the
50
+ CLI exposes the extra ones.
51
+ 4. `tokenizer.py` — lossless typed tokenization (Khmer words via greedy
52
+ longest-match; unknown Khmer spans become `KHMER_UNKNOWN`, never dropped).
53
+ 5. `counter.py` — word counter tool (`count_words`, `analyze`).
54
+ 6. `segmenter.py` — word breaker tool (`break_words`, `mark_boundaries`),
55
+ a thin first-class wrapper over `tokenize`; invariant
56
+ `len(break_words(t)) == count_words(t)`.
57
+ 7. `sorting.py` — Khmer dictionary-order line sorting (`sort_lines`,
58
+ `khmer_sort_key`: per-cluster key `(base, coengs, vowels, signs)` —
59
+ naive codepoint order is wrong for subscripts).
60
+ 8. `cli.py` — argparse subcommands, one per tool (`khmerthings count ...`,
61
+ `khmerthings segment ...`, `khmerthings sort ...`).
62
+
63
+ Planned tools (spellchecker/spellfixer — blocked on lexicon size, POS
64
+ tagger, intent detector, paragraph categorizer) follow the same pattern:
65
+ new module in `src/khmerthings/`, re-export in `__init__.py`, new CLI
66
+ subcommand in `cli.py`, new `tests/test_<module>.py`, and a
67
+ **per-tool document `docs/<tool>.md`** (see below).
68
+
69
+ ## Invariants to preserve (enforced by tests)
70
+
71
+ - `"".join(segment_clusters(t)) == unicodedata.normalize("NFC", t)` — cluster
72
+ segmentation never drops or reorders characters, even on malformed input.
73
+ - Tokenization is lossless: concatenated token texts equal the NFC input, and
74
+ token offsets are contiguous.
75
+ - All text is NFC-normalized at entry points; lexicon entries must be NFC,
76
+ Khmer-letters-only, and unique (loader raises otherwise).
77
+ - A lexicon match can never split a character cluster.
78
+
79
+ ## Documentation upkeep (do this every change)
80
+
81
+ Docs must always reflect the current state of the code. As part of any
82
+ change — not as an afterthought:
83
+
84
+ - **Self-update this file (AGENTS.md)** when architecture, constraints,
85
+ commands, tools, or conventions change (e.g. a new module or subcommand).
86
+ - **Every main (end-user) tool has its own `docs/<tool>.md`** written for
87
+ community users *and for AI agents driving the tools autonomously*,
88
+ following the shared template: What it does / Quick start / CLI
89
+ reference / Python API / How it works / Guarantees & limitations / Task
90
+ recipes / Related tools. Requirements: **every** CLI flag and API
91
+ parameter gets a concrete example with its output; exit codes and error
92
+ output are documented; a "Task recipes" table maps goals → exact
93
+ commands/calls. Low-level primitives are documented via docstrings only,
94
+ not `docs/`. All example outputs must be real — run the command and
95
+ paste the actual output, never invent it (a wrong "expected" output has
96
+ already been caught this way).
97
+ - **Update the other docs** touched by the change: `README.md` (landing
98
+ page: tool table, roadmap, examples), the affected `docs/*.md`,
99
+ `DEVELOPMENT_GUIDE.md` (workflow, recipes), docstrings.
100
+ - **Keep `CHANGELOG.md` current**: add an entry under `[Unreleased]` for
101
+ every user-visible change (Keep a Changelog format: Added / Changed /
102
+ Fixed / Removed). On release, rename `[Unreleased]` to the version + date.
103
+ - A PR that changes behavior but touches no docs/changelog is incomplete —
104
+ the PR template checklist enforces this.
105
+
106
+ ## Releasing
107
+
108
+ 1. Bump `version` in `pyproject.toml` **and** `__version__` in
109
+ `src/khmerthings/__init__.py` (they must stay in sync).
110
+ 2. Turn the `[Unreleased]` section of `CHANGELOG.md` into `[X.Y.Z] - date`.
111
+ 3. `uv sync` (refresh lockfile), run all four checks, commit.
112
+ 4. `git tag vX.Y.Z && git push origin main --tags`.
113
+ 5. The tag push triggers `.github/workflows/publish.yml`, which re-runs the
114
+ checks, verifies the tag matches `pyproject.toml`, builds, smoke-tests the
115
+ wheel, and publishes to PyPI (needs the `PYPI_API_TOKEN` repo secret,
116
+ configured in repo settings).
117
+
118
+ ## Conventions
119
+
120
+ - Python ≥ 3.11, mypy `strict`, ruff line length 100.
121
+ - Public API re-exported in `__init__.py` with `__all__`; keep `py.typed`.
122
+ - Frozen dataclasses for result types (`Token`, `WordCount`).
123
+ - Wordlist files (`words.txt`, `names.txt`, `modern.txt`): one entry per
124
+ line, UTF-8, NFC, Khmer letters/marks only, `#` comments, grouped by
125
+ category, sources noted in the header. High-frequency words with
126
+ subscript ta/da (្ត/្ដ) spelling variation are listed in both spellings —
127
+ real-world text mixes them. Within a file duplicates are a load error;
128
+ the same entry in different files is fine (merged at load).
129
+ - Khmer test strings: verify codepoints carefully (visually identical strings
130
+ can differ, e.g. ្ត vs ្ដ); assert exact expected values, hand-verified.
@@ -0,0 +1,131 @@
1
+ # Changelog
2
+
3
+ All notable changes to khmerthings are documented here.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.4.3] - 2026-07-03
11
+
12
+ ### Added
13
+
14
+ - GitHub Actions publish workflow: pushing a `vX.Y.Z` tag now runs the full
15
+ checks, verifies the tag matches the package version, builds, smoke-tests
16
+ the wheel, and uploads to PyPI (requires the `PYPI_API_TOKEN` secret).
17
+
18
+ ## [0.4.2] - 2026-07-03
19
+
20
+ ### Added
21
+
22
+ - Lexicon: ខ្សោយ (weak) — reported missing via
23
+ `khmerthings count` showing it as an unknown span.
24
+
25
+ ## [0.4.1] - 2026-07-03
26
+
27
+ ### Changed
28
+
29
+ - Per-tool docs rewritten for autonomous (AI-agent and scripted) use:
30
+ every CLI flag and Python API parameter now has a concrete example with
31
+ real executed output, exit codes and error output are documented, and
32
+ each doc ends with a "Task recipes" table mapping goals to exact
33
+ commands/calls.
34
+
35
+ ### Fixed
36
+
37
+ - CLI: a missing/unreadable input file now exits with code 1 and a
38
+ one-line `khmerthings: error: ...` message instead of a Python traceback.
39
+
40
+ ## [0.4.0] - 2026-07-03
41
+
42
+ ### Added
43
+
44
+ - Two new growable wordlist sources alongside the core vocabulary
45
+ (candidates researched from public sources — Wikipedia's Cambodian-name
46
+ article, Behind the Name, Khmer Wiktionary and Khmer media — and curated
47
+ entry by entry, spellings cross-checked):
48
+ - `names.txt` (200 entries): Khmer surnames, given names, and honorific
49
+ titles (ឯកឧត្តម, សម្តេច, …).
50
+ - `modern.txt` (30 entries): slang (ឡូយ, ស្ទាវ), informal register,
51
+ tech/media loanwords (ហ្វេសប៊ុក, អនឡាញ), and everyday modern loanwords.
52
+ - `load_lexicon(*sources)` public API: merge any combination of `words`,
53
+ `names`, `modern` (cached, per-file validation, cross-file duplicates
54
+ merged); `WORD_SOURCES` lists the available sources.
55
+ - `--include names,modern` flag on `khmerthings count` and
56
+ `khmerthings segment` to match against the extra wordlists.
57
+
58
+ ### Changed
59
+
60
+ - Data policy clarified: web research to find/verify candidate entries is
61
+ allowed; bulk-importing third-party wordlists remains forbidden. Sources
62
+ are noted in each data file header.
63
+ - Total curated entries: 802 across the three sources.
64
+
65
+ ## [0.3.0] - 2026-07-03
66
+
67
+ ### Added
68
+
69
+ - Word breaker tool: `khmerthings.segmenter` with `break_words(text)`
70
+ (words as a list; length always equals `count_words`) and
71
+ `mark_boundaries(text, separator="​")` (insert separators at Khmer
72
+ word boundaries, everything else preserved).
73
+ - `khmerthings segment [files|-] [--separator SEP] [--mark]` CLI subcommand.
74
+ - Per-tool community documentation in `docs/`: `word-breaker.md`,
75
+ `word-counter.md`, `line-sorter.md` — each with quick start, CLI and
76
+ Python API reference, how-it-works, and guarantees/limitations. All
77
+ example outputs are real, executed outputs.
78
+ - Lexicon batch 2: +290 hand-curated words (family & occupations, food &
79
+ drink, verbs, adjectives, connectives, society & learning, time & places);
80
+ now 582 words.
81
+
82
+ ### Changed
83
+
84
+ - `README.md` restructured as a community landing page: tool table linking
85
+ to per-tool docs, roadmap (spellchecker/spellfixer next, pending lexicon
86
+ coverage), contributing pointers.
87
+ - Doc-upkeep rules in `AGENTS.md`/`DEVELOPMENT_GUIDE.md`/PR template now
88
+ require a `docs/<tool>.md` for every main tool, with verified outputs.
89
+
90
+ ## [0.2.0] - 2026-07-03
91
+
92
+ ### Added
93
+
94
+ - Khmer dictionary-order line sorting: `khmerthings.sorting` module with
95
+ `sort_lines(lines, descending=False)` and `khmer_sort_key(text)` (a
96
+ collation key usable directly with `sorted()`). Sorting is per character
97
+ cluster — base consonant, then subscript consonants, then dependent
98
+ vowels, then signs — approximating Chuon Nath dictionary order, which
99
+ naive codepoint sorting gets wrong for subscript consonants.
100
+ - `khmerthings sort [files|-] [--desc]` CLI subcommand (stdin by default,
101
+ multiple files merged).
102
+ - `CHANGELOG.md` (this file) and documentation-upkeep rules in `AGENTS.md`.
103
+
104
+ ## [0.1.0] - 2026-07-03
105
+
106
+ ### Added
107
+
108
+ - Initial release: deterministic Khmer language toolkit with zero runtime
109
+ dependencies, built from first principles.
110
+ - `khmerthings.chars`: Khmer Unicode character classification (consonants,
111
+ vowels, signs, coeng, digits, punctuation, script classes).
112
+ - `khmerthings.clusters`: Khmer character-cluster (KCC) segmentation with a
113
+ losslessness invariant (never drops or reorders characters).
114
+ - `khmerthings.lexicon`: hand-curated seed lexicon (~290 words) with
115
+ trie-based longest-match lookup keyed by clusters.
116
+ - `khmerthings.tokenizer`: lossless typed tokenization of mixed
117
+ Khmer/Latin text; unknown Khmer spans preserved as `KHMER_UNKNOWN`.
118
+ - `khmerthings.counter`: word counter (`count_words`, `analyze`) aware that
119
+ Khmer writes no spaces between words.
120
+ - `khmerthings count [files|-] [--json]` CLI, installable globally.
121
+ - Full test suite (207 tests), strict mypy, ruff, GitHub Actions CI
122
+ (lint, Python 3.11–3.14 test matrix, build + wheel smoke test), MIT
123
+ license, AGENTS.md/CLAUDE.md, DEVELOPMENT_GUIDE.md, PR template.
124
+
125
+ [Unreleased]: https://github.com/spkskx/khmerthings/compare/v0.4.3...HEAD
126
+ [0.4.2]: https://github.com/spkskx/khmerthings/compare/v0.4.1...v0.4.2
127
+ [0.4.1]: https://github.com/spkskx/khmerthings/compare/v0.4.0...v0.4.1
128
+ [0.4.0]: https://github.com/spkskx/khmerthings/compare/v0.3.0...v0.4.0
129
+ [0.3.0]: https://github.com/spkskx/khmerthings/compare/v0.2.0...v0.3.0
130
+ [0.2.0]: https://github.com/spkskx/khmerthings/compare/v0.1.0...v0.2.0
131
+ [0.1.0]: https://github.com/spkskx/khmerthings/releases/tag/v0.1.0
@@ -0,0 +1 @@
1
+ Refer to @AGENTS.md for project guidance, constraints, commands, and conventions.
@@ -0,0 +1,123 @@
1
+ # Development guide
2
+
3
+ How to work on khmerthings. Agent-oriented rules live in [AGENTS.md](AGENTS.md);
4
+ this guide is for humans. The short version: deterministic algorithms only,
5
+ self-owned data only, and tests come first.
6
+
7
+ ## Setup
8
+
9
+ Requires [uv](https://docs.astral.sh/uv/). Python 3.11+ is provisioned
10
+ automatically.
11
+
12
+ ```sh
13
+ git clone git@github.com:spkskx/khmerthings.git
14
+ cd khmerthings
15
+ uv sync # creates .venv with dev dependencies
16
+ ```
17
+
18
+ ## Everyday commands
19
+
20
+ ```sh
21
+ uv run pytest # run the test suite
22
+ uv run pytest tests/test_clusters.py # one file
23
+ uv run mypy src tests # type check (strict)
24
+ uv run ruff check --fix # lint (+autofix)
25
+ uv run ruff format # format
26
+ uv run khmerthings count file.txt # run the CLI from source
27
+ uv build # build sdist + wheel
28
+ ```
29
+
30
+ All of pytest, mypy, `ruff check`, and `ruff format --check` must pass before
31
+ a change is done — CI enforces exactly these on every PR.
32
+
33
+ ## Branching & PRs
34
+
35
+ - `main` is the only long-lived branch and is what CI, tags, and releases
36
+ point at. Work on short-lived feature branches and open a PR into `main`.
37
+ - CI runs lint/type checks, the test matrix (Python 3.11–3.14), a package
38
+ build, and a wheel smoke test. All jobs must be green to merge.
39
+ - Keep `uv.lock` committed; CI installs with `uv sync --locked`.
40
+
41
+ ## Project rules
42
+
43
+ 1. **Deterministic only.** Same input → same output, always. Dictionary- and
44
+ rule-based algorithms; no ML inference, no LLMs, no randomness.
45
+ 2. **Self-owned everything.** No third-party Khmer NLP libraries and no
46
+ bulk-imported wordlists/corpora. Word data is hand-curated entry by
47
+ entry in `src/khmerthings/data/` — web research to find and verify
48
+ candidates is fine; copying someone's wordlist wholesale is not. Note
49
+ candidate sources in the data file header.
50
+ 3. **Zero runtime dependencies.** Stdlib only; dev tooling is the only
51
+ exception.
52
+ 4. **Tests first.** Every behavior change comes with tests. Prefer
53
+ table-driven `pytest.mark.parametrize` cases plus invariant tests (see
54
+ `tests/test_clusters.py::TestInvariants` for the pattern).
55
+
56
+ ## Adding a new tool
57
+
58
+ Follow the existing bottom-up architecture (chars → clusters → lexicon →
59
+ tokenizer → tools). To add a tool, e.g. a spellchecker:
60
+
61
+ 1. Create `src/khmerthings/spellcheck.py`, building on the existing
62
+ primitives (`segment_clusters`, `Lexicon.longest_match`, `tokenize`).
63
+ 2. Re-export the public API in `src/khmerthings/__init__.py` (`__all__`).
64
+ 3. Add a CLI subcommand in `src/khmerthings/cli.py` (`khmerthings spellcheck`).
65
+ 4. Add `tests/test_spellcheck.py` with unit + invariant tests.
66
+ 5. Write its per-tool document `docs/spellcheck.md` following the shared
67
+ template used by the existing docs (What it does / Quick start / CLI
68
+ reference / Python API / How it works / Guarantees & limitations /
69
+ Task recipes / Related tools). All example outputs must be real, verified outputs.
70
+ 6. Add it to the tool table and roadmap in `README.md`, and to `AGENTS.md`.
71
+
72
+ ## Editing the wordlists (`src/khmerthings/data/`)
73
+
74
+ Three growable files, merged on demand via `load_lexicon(*sources)`:
75
+
76
+ | File | Source name | Contents |
77
+ |---|---|---|
78
+ | `words.txt` | `words` | core vocabulary (always loaded) |
79
+ | `names.txt` | `names` | personal names, surnames, honorific titles |
80
+ | `modern.txt` | `modern` | slang, informal register, loanwords, trending terms |
81
+
82
+ Put new entries in the right file: standard dictionary vocabulary → `words`;
83
+ anything people are called → `names`; register that shifts with time
84
+ (slang, borrowings, internet vocabulary) → `modern`.
85
+
86
+ - One entry per line, UTF-8, NFC-normalized, Khmer letters/marks only —
87
+ the loader rejects anything else (including within-file duplicates) at
88
+ load time. The same entry may appear in different files (merged at load).
89
+ - Keep entries in their category section; add a new `# --- section ---` when
90
+ needed. Note research sources in the file header comments.
91
+ - Words with subscript ta/da (្ត/្ដ) spelling variation should be added in
92
+ **both** spellings — the two render identically and real text mixes them.
93
+ - Beware visually identical strings with different codepoints; verify with
94
+ `python -c "print([hex(ord(c)) for c in 'word'])"` when in doubt.
95
+ - Adding words is the highest-leverage accuracy improvement for every tool.
96
+
97
+ ## Khmer script primer (why the code looks like this)
98
+
99
+ - Khmer writes **no spaces between words**; spaces separate phrases. Word
100
+ boundaries must be inferred via the lexicon.
101
+ - A **character cluster** (base consonant + subscript consonants + vowel +
102
+ signs) is the smallest indivisible unit; word boundaries can only fall on
103
+ cluster boundaries. `clusters.py` implements this.
104
+ - The zero-width space (U+200B) is used in real Khmer text as an explicit
105
+ word delimiter; the tokenizer treats it as whitespace.
106
+
107
+ ## Documentation & changelog
108
+
109
+ Docs are part of every change, not a follow-up: update `README.md`,
110
+ `AGENTS.md`, `DEVELOPMENT_GUIDE.md`, and docstrings whenever behavior,
111
+ architecture, or workflow changes, and add an entry to the `[Unreleased]`
112
+ section of `CHANGELOG.md` (Keep a Changelog format) for every user-visible
113
+ change. A behavior-changing PR with no docs/changelog update is incomplete.
114
+
115
+ ## Releasing
116
+
117
+ 1. Bump `version` in `pyproject.toml` and `__version__` in
118
+ `src/khmerthings/__init__.py` (keep them in sync).
119
+ 2. Rename the `[Unreleased]` section of `CHANGELOG.md` to `[X.Y.Z] - date`
120
+ and update the compare links at the bottom.
121
+ 3. Update the lockfile: `uv sync`.
122
+ 4. Commit, then tag: `git tag vX.Y.Z && git push origin main --tags`.
123
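+ 5. The tag push triggers `.github/workflows/publish.yml`, which re-runs the checks, builds, smoke-tests the wheel, and publishes to PyPI (needs the `PYPI_API_TOKEN` repo secret); run `uv build` locally only to inspect the artifacts.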
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 spkskx
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,101 @@
1
+ Metadata-Version: 2.4
2
+ Name: khmerthings
3
+ Version: 0.4.3
4
+ Summary: Deterministic Khmer language tools: word counter, segmentation primitives, and more.
5
+ Author: spkskx
6
+ License: MIT
7
+ License-File: LICENSE
8
+ Keywords: khmer,nlp,segmentation,unicode,word-count
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Topic :: Text Processing :: Linguistic
13
+ Classifier: Typing :: Typed
14
+ Requires-Python: >=3.11
15
+ Description-Content-Type: text/markdown
16
+
17
+ # khmerthings
18
+
19
+ Deterministic Khmer language tools for Python — built as **community
20
+ building blocks**: small, correct, dependency-free primitives you can
21
+ compose into bigger systems.
22
+
23
+ No machine-learning models, no third-party NLP dependencies, no network
24
+ calls. Every result is reproducible and explainable. Khmer script writes no
25
+ spaces between words, so even "simple" operations like counting or sorting
26
+ need real language handling — khmerthings implements that from first
27
+ principles.
28
+
29
+ ## Tools
30
+
31
+ Each tool is available both as a Python API and a CLI subcommand, and has
32
+ its own detailed document:
33
+
34
+ | Tool | CLI | Python | Docs |
35
+ |---|---|---|---|
36
+ | **Word breaker** — split Khmer text into words | `khmerthings segment` | `break_words`, `mark_boundaries` | [docs/word-breaker.md](docs/word-breaker.md) |
37
+ | **Word counter** — count words in Khmer/mixed text | `khmerthings count` | `count_words`, `analyze` | [docs/word-counter.md](docs/word-counter.md) |
38
+ | **Line sorter** — Khmer dictionary-order sorting | `khmerthings sort` | `sort_lines`, `khmer_sort_key` | [docs/line-sorter.md](docs/line-sorter.md) |
39
+
40
+ ## Install
41
+
42
+ ```sh
43
+ pip install khmerthings # library
44
+ uv tool install khmerthings # global CLI
45
+ ```
46
+
47
+ ## A taste
48
+
49
+ ```sh
50
+ $ echo "ខ្ញុំស្រឡាញ់ភាសាខ្មែរ" | khmerthings segment
51
+ ខ្ញុំ ស្រឡាញ់ ភាសា ខ្មែរ
52
+ ```
53
+
54
+ ```python
55
+ from khmerthings import break_words, count_words, sort_lines
56
+
57
+ break_words("ខ្ញុំស្រឡាញ់ភាសាខ្មែរ") # ['ខ្ញុំ', 'ស្រឡាញ់', 'ភាសា', 'ខ្មែរ']
58
+ count_words("ខ្ញុំមានឆ្កែ ២ ក្បាល and 3 cats") # 8
59
+ sort_lines(["ក្រ", "កា", "កក"]) # ['កក', 'កា', 'ក្រ']
60
+ ```
61
+
62
+ ## Design principles
63
+
64
+ - **Deterministic**: same input, same output, always. Rule- and
65
+ dictionary-based algorithms only; nothing probabilistic.
66
+ - **Self-contained**: zero runtime dependencies; all word data is our own
67
+ hand-curated set of growable wordlists — `words` (core vocabulary),
68
+ `names` (people's names & titles), and `modern` (slang, loanwords,
69
+ trending terms) — 802 entries and growing. Candidates are researched from
70
+ public sources and verified entry by entry; no wordlist is imported
71
+ wholesale.
72
+ - **Lossless**: no character is ever dropped — unknown Khmer spans are
73
+ reported, not discarded.
74
+ - **Tested first**: every module ships with table-driven unit tests and
75
+ invariant checks (258 tests as of v0.3.0).
76
+
77
+ Under the hood, the tools share deterministic primitives (character
78
+ classification, character-cluster segmentation, a cluster-keyed lexicon
79
+ trie, lossless tokenization) in `src/khmerthings/` — see the module
80
+ docstrings if you want to build on them directly.
81
+
82
+ ## Roadmap
83
+
84
+ - ✅ Word counter, line sorter, word breaker
85
+ - ⏳ Wordlist growth across all three sources (`words`, `names`, `modern`) —
86
+ hand-curated batches each release; the accuracy lever for every
87
+ dictionary-based tool
88
+ - 🔜 Spellchecker & spellfixer (engine is feasible today; waiting on lexicon
89
+ coverage to make its verdicts trustworthy)
90
+ - Later: part-of-speech tagger, intent detection, paragraph categorization
91
+
92
+ ## Contributing
93
+
94
+ See [DEVELOPMENT_GUIDE.md](DEVELOPMENT_GUIDE.md) for setup, the
95
+ architecture, the rules (determinism, self-owned data, tests first), and
96
+ how to add words to the lexicon — the single most valuable contribution.
97
+ Changes are tracked in [CHANGELOG.md](CHANGELOG.md).
98
+
99
+ ## License
100
+
101
+ [MIT](LICENSE)