khmerthings 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,29 @@
1
+ """khmerthings — deterministic Khmer language tools."""
2
+
3
+ from khmerthings.clusters import segment_clusters
4
+ from khmerthings.counter import WordCount, analyze, count_words
5
+ from khmerthings.lexicon import WORD_SOURCES, Lexicon, default_lexicon, load_lexicon
6
+ from khmerthings.segmenter import break_words, mark_boundaries
7
+ from khmerthings.sorting import khmer_sort_key, sort_lines
8
+ from khmerthings.tokenizer import Token, TokenType, tokenize
9
+
10
# Version string of this release of the package.
__version__ = "0.4.3"

# Public API of the package: every name imported from the submodules above,
# plus ``__version__``.
__all__ = [
    "WORD_SOURCES",
    "Lexicon",
    "Token",
    "TokenType",
    "WordCount",
    "__version__",
    "analyze",
    "break_words",
    "count_words",
    "default_lexicon",
    "khmer_sort_key",
    "load_lexicon",
    "mark_boundaries",
    "segment_clusters",
    "sort_lines",
    "tokenize",
]
@@ -0,0 +1,7 @@
1
+ """Allow ``python -m khmerthings``."""
2
+
3
+ import sys
4
+
5
+ from khmerthings.cli import main
6
+
7
+ sys.exit(main())
khmerthings/chars.py ADDED
@@ -0,0 +1,129 @@
1
+ """Khmer Unicode character classification.
2
+
3
+ Covers the Khmer block (U+1780-U+17FF) and Khmer Symbols block
4
+ (U+19E0-U+19FF). All functions are pure and operate on single characters;
5
+ multi-character strings raise ``ValueError`` to catch misuse early.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from enum import Enum
11
+
12
# Subscript-forming sign (U+17D2). Spelled as an escape because a bare
# combining mark is hard to see, select, and review in source code.
COENG = "\u17d2"
# U+200B. Spelled as an escape because the literal character is invisible.
ZERO_WIDTH_SPACE = "\u200b"

# Invisible, deprecated "inherent vowel" format characters (U+17B4, U+17B5)
# that still occur in real-world text; they always belong to the preceding
# cluster. Escaped for the same reason as ZERO_WIDTH_SPACE.
INHERENT_VOWELS = ("\u17b4", "\u17b5")

# Code-point sets used by the predicates below. ``range`` objects support
# O(1) membership tests for ints, so they serve as cheap interval checks.
_CONSONANTS = range(0x1780, 0x17A3)  # ក..អ (U+1780..U+17A2)
_INDEPENDENT_VOWELS = range(0x17A3, 0x17B4)  # ឣ..ឳ (includes deprecated ឣឤ)
_DEPENDENT_VOWELS = range(0x17B6, 0x17C6)  # ា..ៅ
_SIGNS = frozenset(range(0x17C6, 0x17D2)) | {0x17DD}  # ំ..៑ + ៝ (not coeng)
_DIGITS = range(0x17E0, 0x17EA)  # ០..៩
_PUNCTUATION = frozenset(range(0x17D4, 0x17DB)) | {0x17DC}  # ។៕៖ៗ៘៙៚ + ៜ
_KHMER_BLOCK = range(0x1780, 0x1800)
_KHMER_SYMBOLS = range(0x19E0, 0x1A00)
27
+
28
+
29
class ScriptClass(Enum):
    """Coarse script classification of a single character.

    These are the values returned by :func:`script_class`.
    """

    KHMER = "khmer"  # Khmer or Khmer Symbols block (Khmer digits included)
    LATIN = "latin"  # ASCII letters
    DIGIT = "digit"  # ASCII digits only
    OTHER = "other"  # everything else
36
+
37
+
38
def _codepoint(ch: str) -> int:
    """Return the Unicode code point of *ch*.

    Raises ``ValueError`` unless *ch* is exactly one character long (the
    empty string is rejected as well as longer strings).
    """
    if len(ch) == 1:
        return ord(ch)
    raise ValueError(f"expected a single character, got {ch!r}")
42
+
43
+
44
def is_khmer(ch: str) -> bool:
    """Return True when *ch* lies in the Khmer or Khmer Symbols block.

    Raises ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    if code in _KHMER_BLOCK:
        return True
    return code in _KHMER_SYMBOLS
48
+
49
+
50
def is_consonant(ch: str) -> bool:
    """Return True for a Khmer consonant letter, ក (U+1780) through អ (U+17A2).

    The range spans 35 code points. Raises ``ValueError`` if *ch* is not a
    single character.
    """
    code = _codepoint(ch)
    return code in _CONSONANTS
53
+
54
+
55
def is_independent_vowel(ch: str) -> bool:
    """Return True for an independent vowel, ឣ (U+17A3) through ឳ (U+17B3).

    Raises ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    return code in _INDEPENDENT_VOWELS
58
+
59
+
60
def is_dependent_vowel(ch: str) -> bool:
    """Return True for a dependent (combining) vowel, ា (U+17B6) through ៅ (U+17C5).

    Raises ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    return code in _DEPENDENT_VOWELS
63
+
64
+
65
def is_sign(ch: str) -> bool:
    """Return True for a combining sign/diacritic (nikahit, reahmuk, bantoc, ...).

    The coeng (U+17D2) is not a member of this set. Raises ``ValueError`` if
    *ch* is not a single character.
    """
    code = _codepoint(ch)
    return code in _SIGNS
68
+
69
+
70
def is_coeng(ch: str) -> bool:
    """Return True when *ch* is the subscript-forming sign ្ (U+17D2).

    This is a plain equality test: unlike the code-point based predicates in
    this module, a string of any other length yields False instead of raising
    ``ValueError``.
    """
    return COENG == ch
73
+
74
+
75
def is_inherent_vowel(ch: str) -> bool:
    """Return True for the invisible deprecated inherent vowels U+17B4 and U+17B5.

    This is a plain membership test: unlike the code-point based predicates
    in this module, a string of any other length yields False instead of
    raising ``ValueError``.
    """
    return any(ch == vowel for vowel in INHERENT_VOWELS)
78
+
79
+
80
def is_khmer_digit(ch: str) -> bool:
    """Return True for a Khmer digit, ០ (U+17E0) through ៩ (U+17E9).

    Raises ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    return code in _DIGITS
83
+
84
+
85
def is_khmer_punctuation(ch: str) -> bool:
    """Return True for Khmer punctuation such as ។ (khan) and ៕ (bariyoosan).

    The currency sign ៛ (U+17DB) is deliberately not part of the set.
    Raises ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    return code in _PUNCTUATION
91
+
92
+
93
def is_khmer_letter_or_mark(ch: str) -> bool:
    """Return True if *ch* can be part of a Khmer word (letter or combining mark).

    The predicates are tried in a fixed order and evaluation stops at the
    first match. The first one validates the length of *ch*, so anything
    other than a single character raises ``ValueError``.
    """
    predicates = (
        is_consonant,
        is_independent_vowel,
        is_dependent_vowel,
        is_sign,
        is_coeng,
        is_inherent_vowel,
    )
    return any(predicate(ch) for predicate in predicates)
103
+
104
+
105
def khmer_digit_to_int(ch: str) -> int:
    """Return the integer value (0-9) of a single Khmer digit.

    Raises ``ValueError`` when *ch* is not exactly one character or is not a
    Khmer digit.
    """
    code = _codepoint(ch)
    if code in _DIGITS:
        # Khmer digits are contiguous starting at U+17E0 (Khmer zero).
        return code - 0x17E0
    raise ValueError(f"not a Khmer digit: {ch!r}")
114
+
115
+
116
def script_class(ch: str) -> ScriptClass:
    """Classify *ch* as KHMER, LATIN, DIGIT (ASCII only), or OTHER.

    Anything in the Khmer or Khmer Symbols block is KHMER, Khmer digits
    included; use :func:`is_khmer_digit` to tell those apart. Raises
    ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    if code in _KHMER_BLOCK or code in _KHMER_SYMBOLS:
        return ScriptClass.KHMER
    if not ch.isascii():
        # Non-ASCII letters and digits of other scripts are not distinguished.
        return ScriptClass.OTHER
    if ch.isalpha():
        return ScriptClass.LATIN
    if ch.isdigit():
        return ScriptClass.DIGIT
    return ScriptClass.OTHER
khmerthings/cli.py ADDED
@@ -0,0 +1,147 @@
1
+ """Command-line interface: ``khmerthings <tool> ...``.
2
+
3
+ Each library tool is a subcommand; new tools register a subparser here.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ import argparse
9
+ import dataclasses
10
+ import json
11
+ import sys
12
+ from collections.abc import Sequence
13
+
14
+ from khmerthings import __version__
15
+ from khmerthings.counter import analyze
16
+ from khmerthings.lexicon import WORD_SOURCES, Lexicon, load_lexicon
17
+ from khmerthings.segmenter import break_words, mark_boundaries
18
+ from khmerthings.sorting import sort_lines
19
+
20
+ __all__ = ["main"]
21
+
22
+
23
def _read_source(path: str) -> tuple[str, str]:
    """Read one input and return ``(label, text)``.

    A *path* of ``"-"`` reads all of standard input and is labelled
    ``"<stdin>"``; any other value is opened as a UTF-8 text file and
    labelled with the path itself.
    """
    if path != "-":
        with open(path, encoding="utf-8") as handle:
            return path, handle.read()
    return "<stdin>", sys.stdin.read()
29
+
30
+
31
def _lexicon_from_args(args: argparse.Namespace) -> Lexicon:
    """Load the lexicon selected by the ``--include`` option.

    The base ``"words"`` source is always loaded first. ``args.include`` is
    split on commas; surrounding whitespace is stripped and empty items are
    ignored. A missing or empty option adds nothing.
    """
    requested = args.include or ""
    extras: list[str] = []
    for item in requested.split(","):
        name = item.strip()
        if name:
            extras.append(name)
    return load_lexicon("words", *extras)
34
+
35
+
36
def _add_include_option(parser: argparse.ArgumentParser) -> None:
    """Register the ``--include`` option on *parser*.

    The metavar and help text list every entry of ``WORD_SOURCES`` other
    than the always-loaded ``"words"``, in sorted order.
    """
    optional_sources = sorted(name for name in set(WORD_SOURCES) if name != "words")
    available = ", ".join(optional_sources)
    parser.add_argument(
        "--include",
        metavar=",".join(optional_sources),
        help=f"extra wordlists to match against, comma-separated (available: {available})",
    )
43
+
44
+
45
def _cmd_count(args: argparse.Namespace) -> int:
    """Run the ``count`` subcommand and return the process exit code.

    Every input (stdin when no files are given) is analyzed with the lexicon
    chosen via ``--include``. With ``--json`` the results are printed as one
    JSON array; otherwise each statistic is printed as ``name: value``,
    preceded by a ``source:`` header line when there is more than one input.
    """
    paths: list[str] = args.files or ["-"]
    lexicon = _lexicon_from_args(args)
    results = []
    for path in paths:
        source, text = _read_source(path)
        results.append({"source": source, **dataclasses.asdict(analyze(text, lexicon))})

    if args.json:
        print(json.dumps(results, ensure_ascii=False, indent=2))
        return 0

    show_source = len(results) > 1
    for result in results:
        if show_source:
            print(f"{result['source']}:")
        # Each result dict already holds the statistics in dataclass field
        # order (``asdict`` preserves it), so there is no need to run the
        # analyzer on an empty string just to discover the field names.
        for name, value in result.items():
            if name != "source":
                print(f" {name}: {value}")
    return 0
63
+
64
+
65
def _cmd_segment(args: argparse.Namespace) -> int:
    """Run the ``segment`` subcommand and return the process exit code.

    Each input line is printed either as its words joined by the separator
    or, with ``--mark``, as the original line with separators inserted by
    ``mark_boundaries``. When ``--separator`` is not given the default is a
    space, or a zero-width space (U+200B) with ``--mark``.
    """
    paths: list[str] = args.files or ["-"]
    lexicon = _lexicon_from_args(args)
    separator = args.separator
    if separator is None:
        # The ZWSP default is spelled as an escape: the literal character is
        # invisible and easy to lose or corrupt when the file is edited.
        separator = "\u200b" if args.mark else " "
    for path in paths:
        _, text = _read_source(path)
        for line in text.splitlines():
            if args.mark:
                print(mark_boundaries(line, separator, lexicon))
            else:
                print(separator.join(break_words(line, lexicon)))
    return 0
80
+
81
+
82
def _cmd_sort(args: argparse.Namespace) -> int:
    """Run the ``sort`` subcommand and return the process exit code.

    The lines of all inputs (stdin when no files are given) are pooled, then
    passed to ``sort_lines`` and printed one per line; ``--desc`` reverses
    the order.
    """
    pooled: list[str] = []
    for path in args.files or ["-"]:
        pooled += _read_source(path)[1].splitlines()
    ordered = sort_lines(pooled, descending=args.desc)
    for entry in ordered:
        print(entry)
    return 0
91
+
92
+
93
def _build_parser() -> argparse.ArgumentParser:
    """Build the top-level parser with the count, segment and sort subcommands.

    A subcommand is required. Each one stores its handler under ``func`` via
    ``set_defaults`` so the caller can dispatch with ``args.func(args)``.
    """
    files_help = "input files, or '-' for stdin (default)"

    root = argparse.ArgumentParser(
        prog="khmerthings",
        description="Deterministic Khmer language tools.",
    )
    root.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    commands = root.add_subparsers(dest="command", required=True)

    # count
    count_cmd = commands.add_parser("count", help="count words in Khmer (or mixed) text")
    count_cmd.add_argument("files", nargs="*", help=files_help)
    count_cmd.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    _add_include_option(count_cmd)
    count_cmd.set_defaults(func=_cmd_count)

    # segment
    segment_cmd = commands.add_parser(
        "segment", help="break Khmer text into words (word segmentation)"
    )
    segment_cmd.add_argument("files", nargs="*", help=files_help)
    segment_cmd.add_argument(
        "--separator",
        help="word separator (default: space, or ZWSP with --mark)",
    )
    segment_cmd.add_argument(
        "--mark",
        action="store_true",
        help="preserve the line as-is and only insert separators at Khmer word boundaries",
    )
    _add_include_option(segment_cmd)
    segment_cmd.set_defaults(func=_cmd_segment)

    # sort
    sort_cmd = commands.add_parser(
        "sort", help="sort lines in Khmer dictionary order (ascending by default)"
    )
    sort_cmd.add_argument("files", nargs="*", help=files_help)
    sort_cmd.add_argument("--desc", action="store_true", help="sort in descending order")
    sort_cmd.set_defaults(func=_cmd_sort)

    return root
131
+
132
+
133
def main(argv: Sequence[str] | None = None) -> int:
    """Parse *argv* (``sys.argv[1:]`` when None), run the subcommand, return its exit code.

    Input that cannot be read or decoded is reported on stderr with exit
    code 1. Any other ``ValueError`` raised by the subcommand (e.g. an
    unknown ``--include`` source) is reported through ``parser.error``,
    which prints the usage message and exits with status 2.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    try:
        result: int = args.func(args)
    except (OSError, UnicodeDecodeError) as exc:
        # Missing, unreadable, or non-UTF-8 input. UnicodeDecodeError is a
        # ValueError subclass, so it must be caught before the handler below
        # or a bad input file would be reported as a command-line usage error.
        print(f"khmerthings: error: {exc}", file=sys.stderr)
        return 1
    except ValueError as exc:  # e.g. unknown --include source
        parser.error(str(exc))
    return result
144
+
145
+
146
+ if __name__ == "__main__":
147
+ sys.exit(main())
@@ -0,0 +1,77 @@
1
+ """Khmer character-cluster (KCC) segmentation.
2
+
3
+ A Khmer character cluster is the smallest user-visible unit of text: a base
4
+ consonant or independent vowel, optionally followed by subscript (coeng)
5
+ consonants, dependent vowels, and combining signs. Cluster boundaries are the
6
+ only positions where a word boundary can legally occur, which makes this the
7
+ foundation for word segmentation.
8
+
9
+ The segmenter is a deterministic scanner over NFC-normalized text. It never
10
+ drops or reorders characters: ``"".join(segment_clusters(t))`` always equals
11
+ ``unicodedata.normalize("NFC", t)``. Malformed sequences (orphan coeng,
12
+ orphan vowel) are attached to the preceding cluster when it ends in a Khmer
13
+ character, otherwise emitted as standalone clusters.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ import unicodedata
19
+
20
+ from khmerthings.chars import (
21
+ is_coeng as _is_coeng,
22
+ )
23
+ from khmerthings.chars import (
24
+ is_consonant,
25
+ is_dependent_vowel,
26
+ is_independent_vowel,
27
+ is_inherent_vowel,
28
+ is_khmer,
29
+ is_sign,
30
+ )
31
+
32
+ __all__ = ["segment_clusters"]
33
+
34
+
35
def _is_base(ch: str) -> bool:
    """Return True if *ch* can start a cluster (consonant or independent vowel)."""
    if is_consonant(ch):
        return True
    return is_independent_vowel(ch)
37
+
38
+
39
def _is_trailing(ch: str) -> bool:
    """Return True if *ch* extends the current cluster on its own.

    That is a dependent vowel, a combining sign, or an inherent vowel; coeng
    pairs are handled separately by the caller.
    """
    extenders = (is_dependent_vowel, is_sign, is_inherent_vowel)
    return any(check(ch) for check in extenders)
42
+
43
+
44
def segment_clusters(text: str) -> list[str]:
    """Split *text* into Khmer character clusters.

    The input is NFC-normalized first and no character is dropped or
    reordered, so joining the result reproduces the normalized input.
    Characters that neither start nor extend a Khmer cluster come out as
    single-character clusters.
    """
    normalized = unicodedata.normalize("NFC", text)
    length = len(normalized)
    pieces: list[str] = []
    pos = 0
    while pos < length:
        current = normalized[pos]

        if _is_base(current):
            # Consume the base plus everything that belongs to it.
            end = pos + 1
            while end < length:
                following = normalized[end]
                if (
                    _is_coeng(following)
                    and end + 1 < length
                    and _is_base(normalized[end + 1])
                ):
                    # A coeng only counts together with the base it subscripts.
                    end += 2
                elif _is_trailing(following):
                    end += 1
                else:
                    break
            pieces.append(normalized[pos:end])
            pos = end
            continue

        orphan_mark = _is_trailing(current) or _is_coeng(current)
        if orphan_mark and pieces and is_khmer(pieces[-1][-1]):
            # Orphan combining mark: glue it onto the preceding Khmer cluster.
            pieces[-1] += current
        else:
            # Digits, symbols, punctuation, non-Khmer, or leading orphan marks.
            pieces.append(current)
        pos += 1
    return pieces
khmerthings/counter.py ADDED
@@ -0,0 +1,73 @@
1
+ """Khmer-aware word counting."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import unicodedata
6
+ from dataclasses import dataclass
7
+
8
+ from khmerthings.chars import is_khmer
9
+ from khmerthings.clusters import segment_clusters
10
+ from khmerthings.lexicon import Lexicon
11
+ from khmerthings.tokenizer import TokenType, tokenize
12
+
13
+ __all__ = ["WordCount", "analyze", "count_words"]
14
+
15
+
16
@dataclass(frozen=True, slots=True)
class WordCount:
    """Word and character statistics for a text.

    ``total_words`` is the sum of known Khmer words, unknown Khmer word
    groups, Latin words, and number tokens (ASCII or Khmer digits).
    ``characters`` counts characters of the NFC-normalized text.

    Instances are immutable and are produced by :func:`analyze`.
    """

    # khmer_words + unknown_khmer_words + latin_words + numbers
    total_words: int
    # Tokens typed KHMER_WORD.
    khmer_words: int
    # Tokens typed KHMER_UNKNOWN.
    unknown_khmer_words: int
    # Tokens typed LATIN.
    latin_words: int
    # Tokens typed NUMBER or KHMER_DIGIT.
    numbers: int
    # Character clusters in Khmer word tokens (known and unknown), plus one
    # per character of every KHMER_DIGIT token.
    clusters: int
    # Characters of the normalized text for which is_khmer() is true.
    khmer_characters: int
    # Length of the NFC-normalized text.
    characters: int
33
+
34
+
35
def analyze(text: str, lexicon: Lexicon | None = None) -> WordCount:
    """Tokenize the NFC-normalized *text* and tally the statistics in ``WordCount``.

    *lexicon* is passed straight through to ``tokenize``. Tokens of any type
    not handled below do not contribute to the word or cluster totals.
    """
    normalized = unicodedata.normalize("NFC", text)

    known = unknown = latin = numeric = cluster_total = 0
    for token in tokenize(normalized, lexicon):
        kind = token.type
        if kind is TokenType.KHMER_WORD:
            known += 1
            cluster_total += len(segment_clusters(token.text))
        elif kind is TokenType.KHMER_UNKNOWN:
            unknown += 1
            cluster_total += len(segment_clusters(token.text))
        elif kind is TokenType.LATIN:
            latin += 1
        elif kind in (TokenType.NUMBER, TokenType.KHMER_DIGIT):
            numeric += 1
            if kind is TokenType.KHMER_DIGIT:
                # Every Khmer digit counts as one cluster.
                cluster_total += len(token.text)

    khmer_chars = sum(1 for ch in normalized if is_khmer(ch))
    return WordCount(
        total_words=known + unknown + latin + numeric,
        khmer_words=known,
        unknown_khmer_words=unknown,
        latin_words=latin,
        numbers=numeric,
        clusters=cluster_total,
        khmer_characters=khmer_chars,
        characters=len(normalized),
    )
69
+
70
+
71
def count_words(text: str, lexicon: Lexicon | None = None) -> int:
    """Return the total word count of *text*.

    Shorthand for the ``total_words`` field of :func:`analyze`: Khmer words
    (known and unknown), Latin words, and numbers.
    """
    stats = analyze(text, lexicon)
    return stats.total_words
@@ -0,0 +1,44 @@
1
+ # khmerthings modern lexicon: slang, informal register, loanwords, and
2
+ # trending vocabulary. One entry per line, UTF-8, NFC, Khmer letters/marks only.
3
+ # Hand-curated entry by entry; candidates researched from public sources and
4
+ # contemporary Khmer media usage (spellings cross-checked, no wordlist
5
+ # imported wholesale):
6
+ # - https://km.wiktionary.org/wiki/ស្ទាវ
7
+ # - https://thmeythmey.com/detail/130843 (ឡូយ)
8
+ # - https://ling-app.com/blog/khmer-slang-words/
9
+ # This register shifts quickly; entries here are expected to grow and be
10
+ # revised more often than words.txt.
11
+ #
12
+ # --- slang & informal ---
13
+ ឡូយ
14
+ ស្ទាវ
15
+ ឡប់
16
+ ភ្លើ
17
+ ចង្រៃ
18
+ ម៉ង
19
+ អេម
20
+ ប្រូ
21
+ អញ
22
+ ហែង
23
+ # --- internet, tech & media loanwords ---
24
+ ហ្វេសប៊ុក
25
+ យូធូប
26
+ តេឡេក្រាម
27
+ អនឡាញ
28
+ អ៊ីនធឺណិត
29
+ វីដេអូ
30
+ កាមេរ៉ា
31
+ ហ្គេម
32
+ ស្មាតហ្វូន
33
+ ឡាយ
34
+ សែលហ្វី
35
+ # --- everyday modern loanwords ---
36
+ កូវីដ
37
+ វ៉ាក់សាំង
38
+ ដុល្លារ
39
+ ម៉ាស៊ីន
40
+ ម៉ូដ
41
+ សាំង
42
+ កាដូ
43
+ ភីហ្សា
44
+ ប៊ិច