khmerthings 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khmerthings-0.4.3/.github/PULL_REQUEST_TEMPLATE.md +28 -0
- khmerthings-0.4.3/.github/workflows/ci.yml +69 -0
- khmerthings-0.4.3/.github/workflows/publish.yml +58 -0
- khmerthings-0.4.3/.gitignore +8 -0
- khmerthings-0.4.3/AGENTS.md +130 -0
- khmerthings-0.4.3/CHANGELOG.md +131 -0
- khmerthings-0.4.3/CLAUDE.md +1 -0
- khmerthings-0.4.3/DEVELOPMENT_GUIDE.md +123 -0
- khmerthings-0.4.3/LICENSE +21 -0
- khmerthings-0.4.3/PKG-INFO +101 -0
- khmerthings-0.4.3/README.md +85 -0
- khmerthings-0.4.3/docs/line-sorter.md +174 -0
- khmerthings-0.4.3/docs/word-breaker.md +253 -0
- khmerthings-0.4.3/docs/word-counter.md +246 -0
- khmerthings-0.4.3/pyproject.toml +58 -0
- khmerthings-0.4.3/src/khmerthings/__init__.py +29 -0
- khmerthings-0.4.3/src/khmerthings/__main__.py +7 -0
- khmerthings-0.4.3/src/khmerthings/chars.py +129 -0
- khmerthings-0.4.3/src/khmerthings/cli.py +147 -0
- khmerthings-0.4.3/src/khmerthings/clusters.py +77 -0
- khmerthings-0.4.3/src/khmerthings/counter.py +73 -0
- khmerthings-0.4.3/src/khmerthings/data/modern.txt +44 -0
- khmerthings-0.4.3/src/khmerthings/data/names.txt +212 -0
- khmerthings-0.4.3/src/khmerthings/data/words.txt +610 -0
- khmerthings-0.4.3/src/khmerthings/lexicon.py +126 -0
- khmerthings-0.4.3/src/khmerthings/py.typed +0 -0
- khmerthings-0.4.3/src/khmerthings/segmenter.py +59 -0
- khmerthings-0.4.3/src/khmerthings/sorting.py +79 -0
- khmerthings-0.4.3/src/khmerthings/tokenizer.py +125 -0
- khmerthings-0.4.3/tests/test_chars.py +135 -0
- khmerthings-0.4.3/tests/test_cli.py +197 -0
- khmerthings-0.4.3/tests/test_clusters.py +104 -0
- khmerthings-0.4.3/tests/test_counter.py +81 -0
- khmerthings-0.4.3/tests/test_lexicon.py +135 -0
- khmerthings-0.4.3/tests/test_segmenter.py +83 -0
- khmerthings-0.4.3/tests/test_sorting.py +93 -0
- khmerthings-0.4.3/tests/test_tokenizer.py +118 -0
- khmerthings-0.4.3/uv.lock +467 -0
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# Summary
|
|
2
|
+
|
|
3
|
+
<!-- What does this PR change, and why? -->
|
|
4
|
+
|
|
5
|
+
## Type of change
|
|
6
|
+
|
|
7
|
+
- [ ] New tool / feature
|
|
8
|
+
- [ ] Bug fix
|
|
9
|
+
- [ ] Lexicon update (`words.txt`, `names.txt`, `modern.txt`)
|
|
10
|
+
- [ ] Tests / CI / tooling
|
|
11
|
+
- [ ] Documentation
|
|
12
|
+
|
|
13
|
+
## Checklist
|
|
14
|
+
|
|
15
|
+
- [ ] `uv run pytest` passes; new behavior is covered by tests
|
|
16
|
+
- [ ] `uv run mypy src tests` is clean
|
|
17
|
+
- [ ] `uv run ruff check` and `uv run ruff format --check` are clean
|
|
18
|
+
- [ ] Change is fully deterministic (no randomness, no ML/LLM inference)
|
|
19
|
+
- [ ] No third-party Khmer NLP code or data introduced; zero new runtime deps
|
|
20
|
+
- [ ] Lexicon edits (if any): NFC, Khmer-only, no duplicates, both ្ត/្ដ
|
|
21
|
+
variants where applicable
|
|
22
|
+
- [ ] Public API changes re-exported in `__init__.py` and documented in
|
|
23
|
+
`README.md` / `AGENTS.md`
|
|
24
|
+
- [ ] `CHANGELOG.md` updated under `[Unreleased]` (any user-visible change)
|
|
25
|
+
- [ ] Affected docs updated (`README.md`, `AGENTS.md`,
|
|
26
|
+
`DEVELOPMENT_GUIDE.md`, docstrings)
|
|
27
|
+
- [ ] Tool changes: per-tool doc in `docs/<tool>.md` added/updated, with
|
|
28
|
+
real (executed) example outputs
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
name: CI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches: [main]
|
|
6
|
+
pull_request:
|
|
7
|
+
|
|
8
|
+
permissions:
|
|
9
|
+
contents: read
|
|
10
|
+
|
|
11
|
+
concurrency:
|
|
12
|
+
group: ci-${{ github.workflow }}-${{ github.ref }}
|
|
13
|
+
cancel-in-progress: true
|
|
14
|
+
|
|
15
|
+
jobs:
|
|
16
|
+
lint:
|
|
17
|
+
name: Lint & type check
|
|
18
|
+
runs-on: ubuntu-latest
|
|
19
|
+
steps:
|
|
20
|
+
- uses: actions/checkout@v4
|
|
21
|
+
- uses: astral-sh/setup-uv@v5
|
|
22
|
+
with:
|
|
23
|
+
enable-cache: true
|
|
24
|
+
- name: Install
|
|
25
|
+
run: uv sync --locked
|
|
26
|
+
- name: Ruff lint
|
|
27
|
+
run: uv run ruff check --output-format=github
|
|
28
|
+
- name: Ruff format
|
|
29
|
+
run: uv run ruff format --check
|
|
30
|
+
- name: Mypy
|
|
31
|
+
run: uv run mypy src tests
|
|
32
|
+
|
|
33
|
+
test:
|
|
34
|
+
name: Test (Python ${{ matrix.python-version }})
|
|
35
|
+
runs-on: ubuntu-latest
|
|
36
|
+
strategy:
|
|
37
|
+
fail-fast: false
|
|
38
|
+
matrix:
|
|
39
|
+
python-version: ["3.11", "3.12", "3.13", "3.14"]
|
|
40
|
+
steps:
|
|
41
|
+
- uses: actions/checkout@v4
|
|
42
|
+
- uses: astral-sh/setup-uv@v5
|
|
43
|
+
with:
|
|
44
|
+
enable-cache: true
|
|
45
|
+
python-version: ${{ matrix.python-version }}
|
|
46
|
+
- name: Install
|
|
47
|
+
run: uv sync --locked
|
|
48
|
+
- name: Run tests
|
|
49
|
+
run: uv run pytest --cov=khmerthings --cov-report=term-missing
|
|
50
|
+
|
|
51
|
+
build:
|
|
52
|
+
name: Build package
|
|
53
|
+
runs-on: ubuntu-latest
|
|
54
|
+
steps:
|
|
55
|
+
- uses: actions/checkout@v4
|
|
56
|
+
- uses: astral-sh/setup-uv@v5
|
|
57
|
+
with:
|
|
58
|
+
enable-cache: true
|
|
59
|
+
- name: Build sdist and wheel
|
|
60
|
+
run: uv build
|
|
61
|
+
- name: Smoke test the wheel
|
|
62
|
+
run: |
|
|
63
|
+
uv venv /tmp/smoke
|
|
64
|
+
uv pip install --python /tmp/smoke/bin/python dist/*.whl
|
|
65
|
+
echo "ខ្ញុំស្រឡាញ់ភាសាខ្មែរ" | /tmp/smoke/bin/python -m khmerthings count -
|
|
66
|
+
- uses: actions/upload-artifact@v4
|
|
67
|
+
with:
|
|
68
|
+
name: dist
|
|
69
|
+
path: dist/
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
name: Publish
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags: ["v*"]
|
|
6
|
+
|
|
7
|
+
permissions:
|
|
8
|
+
contents: read
|
|
9
|
+
|
|
10
|
+
jobs:
|
|
11
|
+
checks:
|
|
12
|
+
name: Full checks
|
|
13
|
+
runs-on: ubuntu-latest
|
|
14
|
+
steps:
|
|
15
|
+
- uses: actions/checkout@v4
|
|
16
|
+
- uses: astral-sh/setup-uv@v5
|
|
17
|
+
with:
|
|
18
|
+
enable-cache: true
|
|
19
|
+
- name: Install
|
|
20
|
+
run: uv sync --locked
|
|
21
|
+
- name: Ruff lint
|
|
22
|
+
run: uv run ruff check --output-format=github
|
|
23
|
+
- name: Ruff format
|
|
24
|
+
run: uv run ruff format --check
|
|
25
|
+
- name: Mypy
|
|
26
|
+
run: uv run mypy src tests
|
|
27
|
+
- name: Run tests
|
|
28
|
+
run: uv run pytest
|
|
29
|
+
|
|
30
|
+
publish:
|
|
31
|
+
name: Build and publish to PyPI
|
|
32
|
+
runs-on: ubuntu-latest
|
|
33
|
+
needs: checks
|
|
34
|
+
environment: pypi
|
|
35
|
+
steps:
|
|
36
|
+
- uses: actions/checkout@v4
|
|
37
|
+
- uses: astral-sh/setup-uv@v5
|
|
38
|
+
with:
|
|
39
|
+
enable-cache: true
|
|
40
|
+
- name: Check tag matches package version
|
|
41
|
+
run: |
|
|
42
|
+
version=$(uv run --no-sync python -c "import tomllib; print(tomllib.load(open('pyproject.toml','rb'))['project']['version'])")
|
|
43
|
+
tag="${GITHUB_REF_NAME#v}"
|
|
44
|
+
if [ "$version" != "$tag" ]; then
|
|
45
|
+
echo "Tag v$tag does not match pyproject.toml version $version" >&2
|
|
46
|
+
exit 1
|
|
47
|
+
fi
|
|
48
|
+
- name: Build sdist and wheel
|
|
49
|
+
run: uv build
|
|
50
|
+
- name: Smoke test the wheel
|
|
51
|
+
run: |
|
|
52
|
+
uv venv /tmp/smoke
|
|
53
|
+
uv pip install --python /tmp/smoke/bin/python dist/*.whl
|
|
54
|
+
echo "ខ្ញុំស្រឡាញ់ភាសាខ្មែរ" | /tmp/smoke/bin/python -m khmerthings count -
|
|
55
|
+
- name: Publish to PyPI
|
|
56
|
+
run: uv publish --token "$PYPI_API_TOKEN"
|
|
57
|
+
env:
|
|
58
|
+
PYPI_API_TOKEN: ${{ secrets.PYPI_API_TOKEN }}
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
# khmerthings — agent guide
|
|
2
|
+
|
|
3
|
+
Deterministic Khmer language tools in Python. Note: the repo directory is
|
|
4
|
+
`libkhm`, but the package, the CLI, and the PyPI project are all named **khmerthings**.
|
|
5
|
+
|
|
6
|
+
## Hard constraints (non-negotiable)
|
|
7
|
+
|
|
8
|
+
- **Deterministic only.** Every tool must be rule/algorithm/dictionary-based.
|
|
9
|
+
Same input → same output, always. No probabilistic models, no LLMs, no ML
|
|
10
|
+
inference at runtime.
|
|
11
|
+
- **No third-party Khmer NLP code; word data is self-curated.** Do not add
|
|
12
|
+
dependencies on or copy from other Khmer NLP projects, and never
|
|
13
|
+
bulk-import someone else's wordlist. Web research to *find and verify*
|
|
14
|
+
candidate words/names/slang is allowed (user decision, 2026-07-03) — but
|
|
15
|
+
every entry is curated individually, spellings cross-checked, and sources
|
|
16
|
+
noted in the data file header. Data files under `src/khmerthings/data/`:
|
|
17
|
+
`words.txt` (core), `names.txt` (names, surnames, titles), `modern.txt`
|
|
18
|
+
(slang, informal, loanwords, trending). All three are growable and merged
|
|
19
|
+
via `load_lexicon(*sources)`.
|
|
20
|
+
- **Zero runtime dependencies.** Stdlib only. Dev tools (pytest, ruff, mypy)
|
|
21
|
+
are the only allowed dependencies.
|
|
22
|
+
- **Tests are the top priority.** Every module ships with table-driven unit
|
|
23
|
+
tests and invariant checks. Write/update tests with every change.
|
|
24
|
+
|
|
25
|
+
## Commands
|
|
26
|
+
|
|
27
|
+
```sh
|
|
28
|
+
uv sync # env + dev deps
|
|
29
|
+
uv run pytest # tests (must stay green)
|
|
30
|
+
uv run mypy src tests # strict mode, must be clean
|
|
31
|
+
uv run ruff check --fix && uv run ruff format
|
|
32
|
+
uv build # sdist + wheel
|
|
33
|
+
echo "ខ្មែរ" | uv run khmerthings count # CLI smoke test
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
Run all four checks (pytest, mypy, ruff check, ruff format --check) before
|
|
37
|
+
considering any change done.
|
|
38
|
+
|
|
39
|
+
## Architecture
|
|
40
|
+
|
|
41
|
+
`src/` layout; modules build bottom-up, each layer a primitive for the next:
|
|
42
|
+
|
|
43
|
+
1. `chars.py` — Khmer Unicode character classification (pure functions,
|
|
44
|
+
single-character contract: multi-char input raises `ValueError`).
|
|
45
|
+
2. `clusters.py` — Khmer character-cluster (KCC) segmentation. Cluster
|
|
46
|
+
boundaries are the only legal word boundaries.
|
|
47
|
+
3. `lexicon.py` + `data/*.txt` — wordlists (`words`/`names`/`modern`) +
|
|
48
|
+
trie keyed by clusters; `longest_match` is the segmentation primitive;
|
|
49
|
+
`load_lexicon(*sources)` merges sources (cached), `--include` on the
|
|
50
|
+
CLI exposes the extra ones.
|
|
51
|
+
4. `tokenizer.py` — lossless typed tokenization (Khmer words via greedy
|
|
52
|
+
longest-match; unknown Khmer spans become `KHMER_UNKNOWN`, never dropped).
|
|
53
|
+
5. `counter.py` — word counter tool (`count_words`, `analyze`).
|
|
54
|
+
6. `segmenter.py` — word breaker tool (`break_words`, `mark_boundaries`),
|
|
55
|
+
a thin first-class wrapper over `tokenize`; invariant
|
|
56
|
+
`len(break_words(t)) == count_words(t)`.
|
|
57
|
+
7. `sorting.py` — Khmer dictionary-order line sorting (`sort_lines`,
|
|
58
|
+
`khmer_sort_key`: per-cluster key `(base, coengs, vowels, signs)` —
|
|
59
|
+
naive codepoint order is wrong for subscripts).
|
|
60
|
+
8. `cli.py` — argparse subcommands, one per tool (`khmerthings count ...`,
|
|
61
|
+
`khmerthings segment ...`, `khmerthings sort ...`).
|
|
62
|
+
|
|
63
|
+
Planned tools (spellchecker/spellfixer — blocked on lexicon size, POS
|
|
64
|
+
tagger, intent detector, paragraph categorizer) follow the same pattern:
|
|
65
|
+
new module in `src/khmerthings/`, re-export in `__init__.py`, new CLI
|
|
66
|
+
subcommand in `cli.py`, new `tests/test_<module>.py`, and a
|
|
67
|
+
**per-tool document `docs/<tool>.md`** (see below).
|
|
68
|
+
|
|
69
|
+
## Invariants to preserve (enforced by tests)
|
|
70
|
+
|
|
71
|
+
- `"".join(segment_clusters(t)) == unicodedata.normalize("NFC", t)` — cluster
|
|
72
|
+
segmentation never drops or reorders characters, even on malformed input.
|
|
73
|
+
- Tokenization is lossless: concatenated token texts equal the NFC input, and
|
|
74
|
+
token offsets are contiguous.
|
|
75
|
+
- All text is NFC-normalized at entry points; lexicon entries must be NFC,
|
|
76
|
+
Khmer-letters-only, and unique (loader raises otherwise).
|
|
77
|
+
- A lexicon match can never split a character cluster.
|
|
78
|
+
|
|
79
|
+
## Documentation upkeep (do this every change)
|
|
80
|
+
|
|
81
|
+
Docs must always reflect the current state of the code. As part of any
|
|
82
|
+
change — not as an afterthought:
|
|
83
|
+
|
|
84
|
+
- **Self-update this file (AGENTS.md)** when architecture, constraints,
|
|
85
|
+
commands, tools, or conventions change (e.g. a new module or subcommand).
|
|
86
|
+
- **Every main (end-user) tool has its own `docs/<tool>.md`** written for
|
|
87
|
+
community users *and for AI agents driving the tools autonomously*,
|
|
88
|
+
following the shared template: What it does / Quick start / CLI
|
|
89
|
+
reference / Python API / How it works / Guarantees & limitations / Task
|
|
90
|
+
recipes / Related tools. Requirements: **every** CLI flag and API
|
|
91
|
+
parameter gets a concrete example with its output; exit codes and error
|
|
92
|
+
output are documented; a "Task recipes" table maps goals → exact
|
|
93
|
+
commands/calls. Low-level primitives are documented via docstrings only,
|
|
94
|
+
not `docs/`. All example outputs must be real — run the command and
|
|
95
|
+
paste the actual output, never invent it (a wrong "expected" output has
|
|
96
|
+
already been caught this way).
|
|
97
|
+
- **Update the other docs** touched by the change: `README.md` (landing
|
|
98
|
+
page: tool table, roadmap, examples), the affected `docs/*.md`,
|
|
99
|
+
`DEVELOPMENT_GUIDE.md` (workflow, recipes), docstrings.
|
|
100
|
+
- **Keep `CHANGELOG.md` current**: add an entry under `[Unreleased]` for
|
|
101
|
+
every user-visible change (Keep a Changelog format: Added / Changed /
|
|
102
|
+
Fixed / Removed). On release, rename `[Unreleased]` to the version + date.
|
|
103
|
+
- A PR that changes behavior but touches no docs/changelog is incomplete —
|
|
104
|
+
the PR template checklist enforces this.
|
|
105
|
+
|
|
106
|
+
## Releasing
|
|
107
|
+
|
|
108
|
+
1. Bump `version` in `pyproject.toml` **and** `__version__` in
|
|
109
|
+
`src/khmerthings/__init__.py` (they must stay in sync).
|
|
110
|
+
2. Turn the `[Unreleased]` section of `CHANGELOG.md` into `[X.Y.Z] - date`.
|
|
111
|
+
3. `uv sync` (refresh lockfile), run all four checks, commit.
|
|
112
|
+
4. `git tag vX.Y.Z && git push origin main --tags`.
|
|
113
|
+
5. The tag push triggers `.github/workflows/publish.yml`, which re-runs the
|
|
114
|
+
checks, verifies the tag matches `pyproject.toml`, builds, smoke-tests the
|
|
115
|
+
wheel, and publishes to PyPI (needs the `PYPI_API_TOKEN` repo secret,
|
|
116
|
+
configured in repo settings).
|
|
117
|
+
|
|
118
|
+
## Conventions
|
|
119
|
+
|
|
120
|
+
- Python ≥ 3.11, mypy `strict`, ruff line length 100.
|
|
121
|
+
- Public API re-exported in `__init__.py` with `__all__`; keep `py.typed`.
|
|
122
|
+
- Frozen dataclasses for result types (`Token`, `WordCount`).
|
|
123
|
+
- Wordlist files (`words.txt`, `names.txt`, `modern.txt`): one entry per
|
|
124
|
+
line, UTF-8, NFC, Khmer letters/marks only, `#` comments, grouped by
|
|
125
|
+
category, sources noted in the header. High-frequency words with
|
|
126
|
+
subscript ta/da (្ត/្ដ) spelling variation are listed in both spellings —
|
|
127
|
+
real-world text mixes them. Within a file duplicates are a load error;
|
|
128
|
+
the same entry in different files is fine (merged at load).
|
|
129
|
+
- Khmer test strings: verify codepoints carefully (visually identical strings
|
|
130
|
+
can differ, e.g. ្ត vs ្ដ); assert exact expected values, hand-verified.
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to khmerthings are documented here.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [Unreleased]
|
|
9
|
+
|
|
10
|
+
## [0.4.3] - 2026-07-03
|
|
11
|
+
|
|
12
|
+
### Added
|
|
13
|
+
|
|
14
|
+
- GitHub Actions publish workflow: pushing a `vX.Y.Z` tag now runs the full
|
|
15
|
+
checks, verifies the tag matches the package version, builds, smoke-tests
|
|
16
|
+
the wheel, and uploads to PyPI (requires the `PYPI_API_TOKEN` secret).
|
|
17
|
+
|
|
18
|
+
## [0.4.2] - 2026-07-03
|
|
19
|
+
|
|
20
|
+
### Added
|
|
21
|
+
|
|
22
|
+
- Lexicon: ខ្សោយ (weak) — reported missing via
|
|
23
|
+
`khmerthings count` showing it as an unknown span.
|
|
24
|
+
|
|
25
|
+
## [0.4.1] - 2026-07-03
|
|
26
|
+
|
|
27
|
+
### Changed
|
|
28
|
+
|
|
29
|
+
- Per-tool docs rewritten for autonomous (AI-agent and scripted) use:
|
|
30
|
+
every CLI flag and Python API parameter now has a concrete example with
|
|
31
|
+
real executed output, exit codes and error output are documented, and
|
|
32
|
+
each doc ends with a "Task recipes" table mapping goals to exact
|
|
33
|
+
commands/calls.
|
|
34
|
+
|
|
35
|
+
### Fixed
|
|
36
|
+
|
|
37
|
+
- CLI: a missing/unreadable input file now exits with code 1 and a
|
|
38
|
+
one-line `khmerthings: error: ...` message instead of a Python traceback.
|
|
39
|
+
|
|
40
|
+
## [0.4.0] - 2026-07-03
|
|
41
|
+
|
|
42
|
+
### Added
|
|
43
|
+
|
|
44
|
+
- Two new growable wordlist sources alongside the core vocabulary
|
|
45
|
+
(candidates researched from public sources — Wikipedia's Cambodian-name
|
|
46
|
+
article, Behind the Name, Khmer Wiktionary and Khmer media — and curated
|
|
47
|
+
entry by entry, spellings cross-checked):
|
|
48
|
+
- `names.txt` (200 entries): Khmer surnames, given names, and honorific
|
|
49
|
+
titles (ឯកឧត្តម, សម្តេច, …).
|
|
50
|
+
- `modern.txt` (30 entries): slang (ឡូយ, ស្ទាវ), informal register,
|
|
51
|
+
tech/media loanwords (ហ្វេសប៊ុក, អនឡាញ), and everyday modern loanwords.
|
|
52
|
+
- `load_lexicon(*sources)` public API: merge any combination of `words`,
|
|
53
|
+
`names`, `modern` (cached, per-file validation, cross-file duplicates
|
|
54
|
+
merged); `WORD_SOURCES` lists the available sources.
|
|
55
|
+
- `--include names,modern` flag on `khmerthings count` and
|
|
56
|
+
`khmerthings segment` to match against the extra wordlists.
|
|
57
|
+
|
|
58
|
+
### Changed
|
|
59
|
+
|
|
60
|
+
- Data policy clarified: web research to find/verify candidate entries is
|
|
61
|
+
allowed; bulk-importing third-party wordlists remains forbidden. Sources
|
|
62
|
+
are noted in each data file header.
|
|
63
|
+
- Total curated entries: 802 across the three sources.
|
|
64
|
+
|
|
65
|
+
## [0.3.0] - 2026-07-03
|
|
66
|
+
|
|
67
|
+
### Added
|
|
68
|
+
|
|
69
|
+
- Word breaker tool: `khmerthings.segmenter` with `break_words(text)`
|
|
70
|
+
(words as a list; length always equals `count_words`) and
|
|
71
|
+
`mark_boundaries(text, separator="")` (insert separators at Khmer
|
|
72
|
+
word boundaries, everything else preserved).
|
|
73
|
+
- `khmerthings segment [files|-] [--separator SEP] [--mark]` CLI subcommand.
|
|
74
|
+
- Per-tool community documentation in `docs/`: `word-breaker.md`,
|
|
75
|
+
`word-counter.md`, `line-sorter.md` — each with quick start, CLI and
|
|
76
|
+
Python API reference, how-it-works, and guarantees/limitations. All
|
|
77
|
+
example outputs are real, executed outputs.
|
|
78
|
+
- Lexicon batch 2: +290 hand-curated words (family & occupations, food &
|
|
79
|
+
drink, verbs, adjectives, connectives, society & learning, time & places);
|
|
80
|
+
now 582 words.
|
|
81
|
+
|
|
82
|
+
### Changed
|
|
83
|
+
|
|
84
|
+
- `README.md` restructured as a community landing page: tool table linking
|
|
85
|
+
to per-tool docs, roadmap (spellchecker/spellfixer next, pending lexicon
|
|
86
|
+
coverage), contributing pointers.
|
|
87
|
+
- Doc-upkeep rules in `AGENTS.md`/`DEVELOPMENT_GUIDE.md`/PR template now
|
|
88
|
+
require a `docs/<tool>.md` for every main tool, with verified outputs.
|
|
89
|
+
|
|
90
|
+
## [0.2.0] - 2026-07-03
|
|
91
|
+
|
|
92
|
+
### Added
|
|
93
|
+
|
|
94
|
+
- Khmer dictionary-order line sorting: `khmerthings.sorting` module with
|
|
95
|
+
`sort_lines(lines, descending=False)` and `khmer_sort_key(text)` (a
|
|
96
|
+
collation key usable directly with `sorted()`). Sorting is per character
|
|
97
|
+
cluster — base consonant, then subscript consonants, then dependent
|
|
98
|
+
vowels, then signs — approximating Chuon Nath dictionary order, which
|
|
99
|
+
naive codepoint sorting gets wrong for subscript consonants.
|
|
100
|
+
- `khmerthings sort [files|-] [--desc]` CLI subcommand (stdin by default,
|
|
101
|
+
multiple files merged).
|
|
102
|
+
- `CHANGELOG.md` (this file) and documentation-upkeep rules in `AGENTS.md`.
|
|
103
|
+
|
|
104
|
+
## [0.1.0] - 2026-07-03
|
|
105
|
+
|
|
106
|
+
### Added
|
|
107
|
+
|
|
108
|
+
- Initial release: deterministic Khmer language toolkit with zero runtime
|
|
109
|
+
dependencies, built from first principles.
|
|
110
|
+
- `khmerthings.chars`: Khmer Unicode character classification (consonants,
|
|
111
|
+
vowels, signs, coeng, digits, punctuation, script classes).
|
|
112
|
+
- `khmerthings.clusters`: Khmer character-cluster (KCC) segmentation with a
|
|
113
|
+
losslessness invariant (never drops or reorders characters).
|
|
114
|
+
- `khmerthings.lexicon`: hand-curated seed lexicon (~290 words) with
|
|
115
|
+
trie-based longest-match lookup keyed by clusters.
|
|
116
|
+
- `khmerthings.tokenizer`: lossless typed tokenization of mixed
|
|
117
|
+
Khmer/Latin text; unknown Khmer spans preserved as `KHMER_UNKNOWN`.
|
|
118
|
+
- `khmerthings.counter`: word counter (`count_words`, `analyze`) aware that
|
|
119
|
+
Khmer writes no spaces between words.
|
|
120
|
+
- `khmerthings count [files|-] [--json]` CLI, installable globally.
|
|
121
|
+
- Full test suite (207 tests), strict mypy, ruff, GitHub Actions CI
|
|
122
|
+
(lint, Python 3.11–3.14 test matrix, build + wheel smoke test), MIT
|
|
123
|
+
license, AGENTS.md/CLAUDE.md, DEVELOPMENT_GUIDE.md, PR template.
|
|
124
|
+
|
|
125
|
+
[Unreleased]: https://github.com/spkskx/khmerthings/compare/v0.4.3...HEAD
[0.4.3]: https://github.com/spkskx/khmerthings/compare/v0.4.2...v0.4.3
|
|
126
|
+
[0.4.2]: https://github.com/spkskx/khmerthings/compare/v0.4.1...v0.4.2
|
|
127
|
+
[0.4.1]: https://github.com/spkskx/khmerthings/compare/v0.4.0...v0.4.1
|
|
128
|
+
[0.4.0]: https://github.com/spkskx/khmerthings/compare/v0.3.0...v0.4.0
|
|
129
|
+
[0.3.0]: https://github.com/spkskx/khmerthings/compare/v0.2.0...v0.3.0
|
|
130
|
+
[0.2.0]: https://github.com/spkskx/khmerthings/compare/v0.1.0...v0.2.0
|
|
131
|
+
[0.1.0]: https://github.com/spkskx/khmerthings/releases/tag/v0.1.0
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
Refer to @AGENTS.md for project guidance, constraints, commands, and conventions.
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# Development guide
|
|
2
|
+
|
|
3
|
+
How to work on khmerthings. Agent-oriented rules live in [AGENTS.md](AGENTS.md);
|
|
4
|
+
this guide is for humans. The short version: deterministic algorithms only,
|
|
5
|
+
self-owned data only, and tests come first.
|
|
6
|
+
|
|
7
|
+
## Setup
|
|
8
|
+
|
|
9
|
+
Requires [uv](https://docs.astral.sh/uv/). Python 3.11+ is provisioned
|
|
10
|
+
automatically.
|
|
11
|
+
|
|
12
|
+
```sh
|
|
13
|
+
git clone git@github.com:spkskx/khmerthings.git
|
|
14
|
+
cd khmerthings
|
|
15
|
+
uv sync # creates .venv with dev dependencies
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Everyday commands
|
|
19
|
+
|
|
20
|
+
```sh
|
|
21
|
+
uv run pytest # run the test suite
|
|
22
|
+
uv run pytest tests/test_clusters.py # one file
|
|
23
|
+
uv run mypy src tests # type check (strict)
|
|
24
|
+
uv run ruff check --fix # lint (+autofix)
|
|
25
|
+
uv run ruff format # format
|
|
26
|
+
uv run khmerthings count file.txt # run the CLI from source
|
|
27
|
+
uv build # build sdist + wheel
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
All of pytest, mypy, `ruff check`, and `ruff format --check` must pass before
|
|
31
|
+
a change is done — CI enforces exactly these on every PR.
|
|
32
|
+
|
|
33
|
+
## Branching & PRs
|
|
34
|
+
|
|
35
|
+
- `main` is the only long-lived branch and is what CI, tags, and releases
|
|
36
|
+
point at. Work on short-lived feature branches and open a PR into `main`.
|
|
37
|
+
- CI runs lint/type checks, the test matrix (Python 3.11–3.14), a package
|
|
38
|
+
build, and a wheel smoke test. All jobs must be green to merge.
|
|
39
|
+
- Keep `uv.lock` committed; CI installs with `uv sync --locked`.
|
|
40
|
+
|
|
41
|
+
## Project rules
|
|
42
|
+
|
|
43
|
+
1. **Deterministic only.** Same input → same output, always. Dictionary- and
|
|
44
|
+
rule-based algorithms; no ML inference, no LLMs, no randomness.
|
|
45
|
+
2. **Self-owned everything.** No third-party Khmer NLP libraries and no
|
|
46
|
+
bulk-imported wordlists/corpora. Word data is hand-curated entry by
|
|
47
|
+
entry in `src/khmerthings/data/` — web research to find and verify
|
|
48
|
+
candidates is fine; copying someone's wordlist wholesale is not. Note
|
|
49
|
+
candidate sources in the data file header.
|
|
50
|
+
3. **Zero runtime dependencies.** Stdlib only; dev tooling is the only
|
|
51
|
+
exception.
|
|
52
|
+
4. **Tests first.** Every behavior change comes with tests. Prefer
|
|
53
|
+
table-driven `pytest.mark.parametrize` cases plus invariant tests (see
|
|
54
|
+
`tests/test_clusters.py::TestInvariants` for the pattern).
|
|
55
|
+
|
|
56
|
+
## Adding a new tool
|
|
57
|
+
|
|
58
|
+
Follow the existing bottom-up architecture (chars → clusters → lexicon →
|
|
59
|
+
tokenizer → tools). To add a tool, e.g. a spellchecker:
|
|
60
|
+
|
|
61
|
+
1. Create `src/khmerthings/spellcheck.py`, building on the existing
|
|
62
|
+
primitives (`segment_clusters`, `Lexicon.longest_match`, `tokenize`).
|
|
63
|
+
2. Re-export the public API in `src/khmerthings/__init__.py` (`__all__`).
|
|
64
|
+
3. Add a CLI subcommand in `src/khmerthings/cli.py` (`khmerthings spellcheck`).
|
|
65
|
+
4. Add `tests/test_spellcheck.py` with unit + invariant tests.
|
|
66
|
+
5. Write its per-tool document `docs/spellcheck.md` following the shared
|
|
67
|
+
template used by the existing docs (What it does / Quick start / CLI
|
|
68
|
+
reference / Python API / How it works / Guarantees & limitations /
|
|
69
|
+
Task recipes / Related tools). All example outputs must be real, verified outputs.
|
|
70
|
+
6. Add it to the tool table and roadmap in `README.md`, and to `AGENTS.md`.
|
|
71
|
+
|
|
72
|
+
## Editing the wordlists (`src/khmerthings/data/`)
|
|
73
|
+
|
|
74
|
+
Three growable files, merged on demand via `load_lexicon(*sources)`:
|
|
75
|
+
|
|
76
|
+
| File | Source name | Contents |
|
|
77
|
+
|---|---|---|
|
|
78
|
+
| `words.txt` | `words` | core vocabulary (always loaded) |
|
|
79
|
+
| `names.txt` | `names` | personal names, surnames, honorific titles |
|
|
80
|
+
| `modern.txt` | `modern` | slang, informal register, loanwords, trending terms |
|
|
81
|
+
|
|
82
|
+
Put new entries in the right file: standard dictionary vocabulary → `words`;
|
|
83
|
+
anything people are called → `names`; register that shifts with time
|
|
84
|
+
(slang, borrowings, internet vocabulary) → `modern`.
|
|
85
|
+
|
|
86
|
+
- One entry per line, UTF-8, NFC-normalized, Khmer letters/marks only —
|
|
87
|
+
the loader rejects anything else (including within-file duplicates) at
|
|
88
|
+
load time. The same entry may appear in different files (merged at load).
|
|
89
|
+
- Keep entries in their category section; add a new `# --- section ---` when
|
|
90
|
+
needed. Note research sources in the file header comments.
|
|
91
|
+
- Words with subscript ta/da (្ត/្ដ) spelling variation should be added in
|
|
92
|
+
**both** spellings — the two render identically and real text mixes them.
|
|
93
|
+
- Beware visually identical strings with different codepoints; verify with
|
|
94
|
+
`python -c "print([hex(ord(c)) for c in 'word'])"` when in doubt.
|
|
95
|
+
- Adding words is the highest-leverage accuracy improvement for every tool.
|
|
96
|
+
|
|
97
|
+
## Khmer script primer (why the code looks like this)
|
|
98
|
+
|
|
99
|
+
- Khmer writes **no spaces between words**; spaces separate phrases. Word
|
|
100
|
+
boundaries must be inferred via the lexicon.
|
|
101
|
+
- A **character cluster** (base consonant + subscript consonants + vowel +
|
|
102
|
+
signs) is the smallest indivisible unit; word boundaries can only fall on
|
|
103
|
+
cluster boundaries. `clusters.py` implements this.
|
|
104
|
+
- The zero-width space (U+200B) is used in real Khmer text as an explicit
|
|
105
|
+
word delimiter; the tokenizer treats it as whitespace.
|
|
106
|
+
|
|
107
|
+
## Documentation & changelog
|
|
108
|
+
|
|
109
|
+
Docs are part of every change, not a follow-up: update `README.md`,
|
|
110
|
+
`AGENTS.md`, `DEVELOPMENT_GUIDE.md`, and docstrings whenever behavior,
|
|
111
|
+
architecture, or workflow changes, and add an entry to the `[Unreleased]`
|
|
112
|
+
section of `CHANGELOG.md` (Keep a Changelog format) for every user-visible
|
|
113
|
+
change. A behavior-changing PR with no docs/changelog update is incomplete.
|
|
114
|
+
|
|
115
|
+
## Releasing
|
|
116
|
+
|
|
117
|
+
1. Bump `version` in `pyproject.toml` and `__version__` in
|
|
118
|
+
`src/khmerthings/__init__.py` (keep them in sync).
|
|
119
|
+
2. Rename the `[Unreleased]` section of `CHANGELOG.md` to `[X.Y.Z] - date`
|
|
120
|
+
and update the compare links at the bottom.
|
|
121
|
+
3. Update the lockfile: `uv sync`.
|
|
122
|
+
4. Commit, then tag: `git tag vX.Y.Z && git push origin main --tags`.
|
|
123
|
+
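5. Pushing the tag triggers `.github/workflows/publish.yml`, which re-runs the checks, verifies the tag matches `pyproject.toml`, builds, smoke-tests the wheel, and publishes to PyPI (needs the `PYPI_API_TOKEN` repo secret). Use `uv build` to build artifacts locally if needed.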
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 spkskx
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: khmerthings
|
|
3
|
+
Version: 0.4.3
|
|
4
|
+
Summary: Deterministic Khmer language tools: word counter, segmentation primitives, and more.
|
|
5
|
+
Author: spkskx
|
|
6
|
+
License: MIT
|
|
7
|
+
License-File: LICENSE
|
|
8
|
+
Keywords: khmer,nlp,segmentation,unicode,word-count
|
|
9
|
+
Classifier: Development Status :: 3 - Alpha
|
|
10
|
+
Classifier: Intended Audience :: Developers
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Topic :: Text Processing :: Linguistic
|
|
13
|
+
Classifier: Typing :: Typed
|
|
14
|
+
Requires-Python: >=3.11
|
|
15
|
+
Description-Content-Type: text/markdown
|
|
16
|
+
|
|
17
|
+
# khmerthings
|
|
18
|
+
|
|
19
|
+
Deterministic Khmer language tools for Python — built as **community
|
|
20
|
+
building blocks**: small, correct, dependency-free primitives you can
|
|
21
|
+
compose into bigger systems.
|
|
22
|
+
|
|
23
|
+
No machine-learning models, no third-party NLP dependencies, no network
|
|
24
|
+
calls. Every result is reproducible and explainable. Khmer script writes no
|
|
25
|
+
spaces between words, so even "simple" operations like counting or sorting
|
|
26
|
+
need real language handling — khmerthings implements that from first
|
|
27
|
+
principles.
|
|
28
|
+
|
|
29
|
+
## Tools
|
|
30
|
+
|
|
31
|
+
Each tool is available both as a Python API and a CLI subcommand, and has
|
|
32
|
+
its own detailed document:
|
|
33
|
+
|
|
34
|
+
| Tool | CLI | Python | Docs |
|
|
35
|
+
|---|---|---|---|
|
|
36
|
+
| **Word breaker** — split Khmer text into words | `khmerthings segment` | `break_words`, `mark_boundaries` | [docs/word-breaker.md](docs/word-breaker.md) |
|
|
37
|
+
| **Word counter** — count words in Khmer/mixed text | `khmerthings count` | `count_words`, `analyze` | [docs/word-counter.md](docs/word-counter.md) |
|
|
38
|
+
| **Line sorter** — Khmer dictionary-order sorting | `khmerthings sort` | `sort_lines`, `khmer_sort_key` | [docs/line-sorter.md](docs/line-sorter.md) |
|
|
39
|
+
|
|
40
|
+
## Install
|
|
41
|
+
|
|
42
|
+
```sh
|
|
43
|
+
pip install khmerthings # library
|
|
44
|
+
uv tool install khmerthings # global CLI
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## A taste
|
|
48
|
+
|
|
49
|
+
```sh
|
|
50
|
+
$ echo "ខ្ញុំស្រឡាញ់ភាសាខ្មែរ" | khmerthings segment
|
|
51
|
+
ខ្ញុំ ស្រឡាញ់ ភាសា ខ្មែរ
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from khmerthings import break_words, count_words, sort_lines
|
|
56
|
+
|
|
57
|
+
break_words("ខ្ញុំស្រឡាញ់ភាសាខ្មែរ") # ['ខ្ញុំ', 'ស្រឡាញ់', 'ភាសា', 'ខ្មែរ']
|
|
58
|
+
count_words("ខ្ញុំមានឆ្កែ ២ ក្បាល and 3 cats") # 8
|
|
59
|
+
sort_lines(["ក្រ", "កា", "កក"]) # ['កក', 'កា', 'ក្រ']
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Design principles
|
|
63
|
+
|
|
64
|
+
- **Deterministic**: same input, same output, always. Rule- and
|
|
65
|
+
dictionary-based algorithms only; nothing probabilistic.
|
|
66
|
+
- **Self-contained**: zero runtime dependencies; all word data is our own
|
|
67
|
+
hand-curated set of growable wordlists — `words` (core vocabulary),
|
|
68
|
+
`names` (people's names & titles), and `modern` (slang, loanwords,
|
|
69
|
+
trending terms) — 802 entries and growing. Candidates are researched from
|
|
70
|
+
public sources and verified entry by entry; no wordlist is imported
|
|
71
|
+
wholesale.
|
|
72
|
+
- **Lossless**: no character is ever dropped — unknown Khmer spans are
|
|
73
|
+
reported, not discarded.
|
|
74
|
+
- **Tested first**: every module ships with table-driven unit tests and
|
|
75
|
+
invariant checks (258 tests as of v0.3.0).
|
|
76
|
+
|
|
77
|
+
Under the hood, the tools share deterministic primitives (character
|
|
78
|
+
classification, character-cluster segmentation, a cluster-keyed lexicon
|
|
79
|
+
trie, lossless tokenization) in `src/khmerthings/` — see the module
|
|
80
|
+
docstrings if you want to build on them directly.
|
|
81
|
+
|
|
82
|
+
## Roadmap
|
|
83
|
+
|
|
84
|
+
- ✅ Word counter, line sorter, word breaker
|
|
85
|
+
- ⏳ Wordlist growth across all three sources (`words`, `names`, `modern`) —
|
|
86
|
+
hand-curated batches each release; the accuracy lever for every
|
|
87
|
+
dictionary-based tool
|
|
88
|
+
- 🔜 Spellchecker & spellfixer (engine is feasible today; waiting on lexicon
|
|
89
|
+
coverage to make its verdicts trustworthy)
|
|
90
|
+
- Later: part-of-speech tagger, intent detection, paragraph categorization
|
|
91
|
+
|
|
92
|
+
## Contributing
|
|
93
|
+
|
|
94
|
+
See [DEVELOPMENT_GUIDE.md](DEVELOPMENT_GUIDE.md) for setup, the
|
|
95
|
+
architecture, the rules (determinism, self-owned data, tests first), and
|
|
96
|
+
how to add words to the lexicon — the single most valuable contribution.
|
|
97
|
+
Changes are tracked in [CHANGELOG.md](CHANGELOG.md).
|
|
98
|
+
|
|
99
|
+
## License
|
|
100
|
+
|
|
101
|
+
[MIT](LICENSE)
|