khmerthings 0.4.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- khmerthings/__init__.py +29 -0
- khmerthings/__main__.py +7 -0
- khmerthings/chars.py +129 -0
- khmerthings/cli.py +147 -0
- khmerthings/clusters.py +77 -0
- khmerthings/counter.py +73 -0
- khmerthings/data/modern.txt +44 -0
- khmerthings/data/names.txt +212 -0
- khmerthings/data/words.txt +610 -0
- khmerthings/lexicon.py +126 -0
- khmerthings/py.typed +0 -0
- khmerthings/segmenter.py +59 -0
- khmerthings/sorting.py +79 -0
- khmerthings/tokenizer.py +125 -0
- khmerthings-0.4.3.dist-info/METADATA +101 -0
- khmerthings-0.4.3.dist-info/RECORD +19 -0
- khmerthings-0.4.3.dist-info/WHEEL +4 -0
- khmerthings-0.4.3.dist-info/entry_points.txt +2 -0
- khmerthings-0.4.3.dist-info/licenses/LICENSE +21 -0
khmerthings/__init__.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""khmerthings — deterministic Khmer language tools."""
|
|
2
|
+
|
|
3
|
+
from khmerthings.clusters import segment_clusters
|
|
4
|
+
from khmerthings.counter import WordCount, analyze, count_words
|
|
5
|
+
from khmerthings.lexicon import WORD_SOURCES, Lexicon, default_lexicon, load_lexicon
|
|
6
|
+
from khmerthings.segmenter import break_words, mark_boundaries
|
|
7
|
+
from khmerthings.sorting import khmer_sort_key, sort_lines
|
|
8
|
+
from khmerthings.tokenizer import Token, TokenType, tokenize
|
|
9
|
+
|
|
10
|
+
# Version string of this release.
__version__ = "0.4.3"

# Public names re-exported by the package (order kept as published).
__all__ = [
    "WORD_SOURCES",
    "Lexicon",
    "Token",
    "TokenType",
    "WordCount",
    "__version__",
    "analyze",
    "break_words",
    "count_words",
    "default_lexicon",
    "khmer_sort_key",
    "load_lexicon",
    "mark_boundaries",
    "segment_clusters",
    "sort_lines",
    "tokenize",
]
|
khmerthings/__main__.py
ADDED
khmerthings/chars.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""Khmer Unicode character classification.
|
|
2
|
+
|
|
3
|
+
Covers the Khmer block (U+1780-U+17FF) and Khmer Symbols block
|
|
4
|
+
(U+19E0-U+19FF). All functions are pure and operate on single characters;
|
|
5
|
+
multi-character strings raise ``ValueError`` to catch misuse early.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from enum import Enum
|
|
11
|
+
|
|
12
|
+
# Invisible or combining characters are written as escapes so that editors,
# diff viewers, and copy/paste cannot silently drop or alter them.

# KHMER SIGN COENG (U+17D2): turns the following base into a subscript.
COENG = "\u17d2"
# ZERO WIDTH SPACE (U+200B).
ZERO_WIDTH_SPACE = "\u200b"

# Invisible, deprecated "inherent vowel" format characters (U+17B4, U+17B5)
# that still occur in real-world text; they always belong to the preceding
# cluster.
INHERENT_VOWELS = ("\u17b4", "\u17b5")

_CONSONANTS = range(0x1780, 0x17A3)  # ក..អ (U+1780..U+17A2)
_INDEPENDENT_VOWELS = range(0x17A3, 0x17B4)  # ឣ..ឳ (includes deprecated ឣឤ)
_DEPENDENT_VOWELS = range(0x17B6, 0x17C6)  # ា..ៅ (U+17B6..U+17C5)
_SIGNS = frozenset(range(0x17C6, 0x17D2)) | {0x17DD}  # ំ..៑ + ៝ (not coeng)
_DIGITS = range(0x17E0, 0x17EA)  # ០..៩
_PUNCTUATION = frozenset(range(0x17D4, 0x17DB)) | {0x17DC}  # ។៕៖ៗ៘៙៚ + ៜ
_KHMER_BLOCK = range(0x1780, 0x1800)
_KHMER_SYMBOLS = range(0x19E0, 0x1A00)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ScriptClass(Enum):
    """Broad script bucket that a single character falls into."""

    # Any code point in the Khmer or Khmer Symbols blocks.
    KHMER = "khmer"
    # ASCII letters.
    LATIN = "latin"
    # ASCII digits.
    DIGIT = "digit"
    # Everything else.
    OTHER = "other"
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _codepoint(ch: str) -> int:
    """Return the code point of *ch*, which must be exactly one character.

    Raises ``ValueError`` for empty or multi-character strings.
    """
    if len(ch) == 1:
        return ord(ch)
    raise ValueError(f"expected a single character, got {ch!r}")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def is_khmer(ch: str) -> bool:
    """Report whether *ch* lies in the Khmer or Khmer Symbols Unicode block.

    Raises ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    if code in _KHMER_BLOCK:
        return True
    return code in _KHMER_SYMBOLS
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def is_consonant(ch: str) -> bool:
    """True for the Khmer consonant letters ក (U+1780) through អ (U+17A2).

    The range spans 35 code points: the 33 consonants of modern Khmer plus
    ឝ (U+179D) and ឞ (U+179E), which Unicode encodes for Pali/Sanskrit
    transliteration.

    Raises ``ValueError`` if *ch* is not a single character.
    """
    return _codepoint(ch) in _CONSONANTS
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def is_independent_vowel(ch: str) -> bool:
    """Report whether *ch* is an independent vowel, ឣ (U+17A3) to ឳ (U+17B3).

    Raises ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    return code in _INDEPENDENT_VOWELS
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def is_dependent_vowel(ch: str) -> bool:
    """Report whether *ch* is a dependent (combining) vowel, ា (U+17B6) to ៅ (U+17C5).

    Raises ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    return code in _DEPENDENT_VOWELS
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def is_sign(ch: str) -> bool:
    """Report whether *ch* is a combining sign/diacritic (nikahit, reahmuk, bantoc, ...).

    The coeng (U+17D2) is not a sign here; use :func:`is_coeng` for it.
    Raises ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    return code in _SIGNS
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def is_coeng(ch: str) -> bool:
    """True for the subscript-forming sign ្ (U+17D2).

    Raises ``ValueError`` if *ch* is not a single character, matching the
    other predicates in this module.
    """
    # Validate first so misuse (empty or multi-character input) is reported
    # instead of silently returning False.
    _codepoint(ch)
    return ch == COENG
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def is_inherent_vowel(ch: str) -> bool:
    """True for the invisible deprecated inherent vowels U+17B4 and U+17B5.

    Raises ``ValueError`` if *ch* is not a single character, matching the
    other predicates in this module.
    """
    # Validate first so misuse (empty or multi-character input) is reported
    # instead of silently returning False.
    _codepoint(ch)
    return ch in INHERENT_VOWELS
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def is_khmer_digit(ch: str) -> bool:
    """Report whether *ch* is a Khmer digit, ០ (U+17E0) to ៩ (U+17E9).

    Raises ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    return code in _DIGITS
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def is_khmer_punctuation(ch: str) -> bool:
    """Report whether *ch* is Khmer punctuation, e.g. ។ (khan) or ៕ (bariyoosan).

    The currency sign ៛ (U+17DB) is intentionally not treated as punctuation.
    Raises ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    return code in _PUNCTUATION
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def is_khmer_letter_or_mark(ch: str) -> bool:
    """Report whether *ch* may appear inside a Khmer word (a letter or combining mark)."""
    # Predicates are tried in this order and evaluation stops at the first hit.
    predicates = (
        is_consonant,
        is_independent_vowel,
        is_dependent_vowel,
        is_sign,
        is_coeng,
        is_inherent_vowel,
    )
    return any(predicate(ch) for predicate in predicates)
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def khmer_digit_to_int(ch: str) -> int:
    """Return the integer value (0-9) of the single Khmer digit *ch*.

    Raises ``ValueError`` if *ch* is not exactly one Khmer digit.
    """
    code = _codepoint(ch)
    if code in _DIGITS:
        # Khmer digits are contiguous starting at ០ (U+17E0).
        return code - 0x17E0
    raise ValueError(f"not a Khmer digit: {ch!r}")
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
def script_class(ch: str) -> ScriptClass:
    """Return the coarse script bucket of *ch*: KHMER, LATIN, DIGIT (ASCII), or OTHER.

    Khmer digits fall under KHMER; call :func:`is_khmer_digit` to tell them
    apart. Raises ``ValueError`` if *ch* is not a single character.
    """
    code = _codepoint(ch)
    if code in _KHMER_BLOCK or code in _KHMER_SYMBOLS:
        return ScriptClass.KHMER
    if ch.isascii():
        # Only ASCII letters and digits get their own class.
        if ch.isalpha():
            return ScriptClass.LATIN
        if ch.isdigit():
            return ScriptClass.DIGIT
    return ScriptClass.OTHER
|
khmerthings/cli.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
"""Command-line interface: ``khmerthings <tool> ...``.
|
|
2
|
+
|
|
3
|
+
Each library tool is a subcommand; new tools register a subparser here.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
import argparse
|
|
9
|
+
import dataclasses
|
|
10
|
+
import json
|
|
11
|
+
import sys
|
|
12
|
+
from collections.abc import Sequence
|
|
13
|
+
|
|
14
|
+
from khmerthings import __version__
|
|
15
|
+
from khmerthings.counter import analyze
|
|
16
|
+
from khmerthings.lexicon import WORD_SOURCES, Lexicon, load_lexicon
|
|
17
|
+
from khmerthings.segmenter import break_words, mark_boundaries
|
|
18
|
+
from khmerthings.sorting import sort_lines
|
|
19
|
+
|
|
20
|
+
__all__ = ["main"]
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def _read_source(path: str) -> tuple[str, str]:
    """Read one input and return ``(label, text)``.

    ``"-"`` reads standard input and is labelled ``"<stdin>"``; any other
    value is opened as a UTF-8 text file and labelled with the path itself.
    """
    if path != "-":
        with open(path, encoding="utf-8") as handle:
            content = handle.read()
        return path, content
    return "<stdin>", sys.stdin.read()
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _lexicon_from_args(args: argparse.Namespace) -> Lexicon:
    """Load the ``words`` lexicon plus any sources named in ``--include``.

    ``args.include`` is a comma-separated string (or ``None``); blank entries
    and surrounding whitespace are ignored.
    """
    raw = args.include or ""
    names = (part.strip() for part in raw.split(","))
    extras = tuple(name for name in names if name)
    return load_lexicon("words", *extras)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _add_include_option(parser: argparse.ArgumentParser) -> None:
    """Register ``--include`` on *parser*, advertising every source except ``words``."""
    optional_sources = sorted(name for name in set(WORD_SOURCES) if name != "words")
    listing = ", ".join(optional_sources)
    parser.add_argument(
        "--include",
        metavar=",".join(optional_sources),
        help=f"extra wordlists to match against, comma-separated (available: {listing})",
    )
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _cmd_count(args: argparse.Namespace) -> int:
    """Run ``khmerthings count``: print word/character statistics per input.

    Reads every path in ``args.files`` (stdin when none is given), analyzes
    each text against the lexicon selected by ``--include``, and prints either
    a JSON array (``--json``) or a plain ``name: value`` listing. When more
    than one input is processed, each listing is preceded by its source label.

    Returns the process exit status (always 0).
    """
    paths: list[str] = args.files or ["-"]
    lexicon = _lexicon_from_args(args)
    results = []
    for path in paths:
        source, text = _read_source(path)
        results.append({"source": source, **dataclasses.asdict(analyze(text, lexicon))})

    if args.json:
        print(json.dumps(results, ensure_ascii=False, indent=2))
        return 0

    show_source = len(results) > 1
    for result in results:
        if show_source:
            print(f"{result['source']}:")
        # ``asdict`` already yields the statistics in field-declaration order,
        # so iterate the result itself instead of running ``analyze("")`` on
        # every pass merely to recover the field names.
        for name, value in result.items():
            if name != "source":
                print(f" {name}: {value}")
    return 0
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _cmd_segment(args: argparse.Namespace) -> int:
    """Run ``khmerthings segment``: print each input line broken into words.

    With ``--mark`` the line is passed to ``mark_boundaries`` together with
    the separator; otherwise the words returned by ``break_words`` are joined
    with the separator. When ``--separator`` is not given, the default is a
    zero-width space (U+200B) with ``--mark`` and a plain space without it.

    Returns the process exit status (always 0).
    """
    paths: list[str] = args.files or ["-"]
    lexicon = _lexicon_from_args(args)

    separator = args.separator
    if separator is None:
        # U+200B is spelled as an escape: the literal character is invisible
        # and is easily lost, which would make --mark insert nothing at all.
        separator = "\u200b" if args.mark else " "

    for path in paths:
        _, text = _read_source(path)
        for line in text.splitlines():
            if args.mark:
                print(mark_boundaries(line, separator, lexicon))
            else:
                print(separator.join(break_words(line, lexicon)))
    return 0
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _cmd_sort(args: argparse.Namespace) -> int:
    """Run ``khmerthings sort``: print the lines of all inputs in sorted order.

    Lines from every input (stdin when no file is given) are pooled, ordered
    by ``sort_lines`` (descending when ``--desc`` is set), and printed one
    per line. Returns the process exit status (always 0).
    """
    sources: list[str] = args.files or ["-"]
    pooled: list[str] = []
    for source in sources:
        content = _read_source(source)[1]
        pooled += content.splitlines()
    ordered = sort_lines(pooled, descending=args.desc)
    for entry in ordered:
        print(entry)
    return 0
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _build_parser() -> argparse.ArgumentParser:
    """Assemble the top-level parser with the ``count``, ``segment`` and ``sort`` subcommands.

    Each subparser stores its handler in ``func`` so the caller can dispatch
    with ``args.func(args)``.
    """
    files_help = "input files, or '-' for stdin (default)"

    root = argparse.ArgumentParser(
        prog="khmerthings",
        description="Deterministic Khmer language tools.",
    )
    root.add_argument("--version", action="version", version=f"%(prog)s {__version__}")
    commands = root.add_subparsers(dest="command", required=True)

    # count
    count_cmd = commands.add_parser("count", help="count words in Khmer (or mixed) text")
    count_cmd.add_argument("files", nargs="*", help=files_help)
    count_cmd.add_argument("--json", action="store_true", help="emit machine-readable JSON")
    _add_include_option(count_cmd)
    count_cmd.set_defaults(func=_cmd_count)

    # segment
    segment_cmd = commands.add_parser(
        "segment",
        help="break Khmer text into words (word segmentation)",
    )
    segment_cmd.add_argument("files", nargs="*", help=files_help)
    segment_cmd.add_argument(
        "--separator",
        help="word separator (default: space, or ZWSP with --mark)",
    )
    segment_cmd.add_argument(
        "--mark",
        action="store_true",
        help="preserve the line as-is and only insert separators at Khmer word boundaries",
    )
    _add_include_option(segment_cmd)
    segment_cmd.set_defaults(func=_cmd_segment)

    # sort
    sort_cmd = commands.add_parser(
        "sort",
        help="sort lines in Khmer dictionary order (ascending by default)",
    )
    sort_cmd.add_argument("files", nargs="*", help=files_help)
    sort_cmd.add_argument("--desc", action="store_true", help="sort in descending order")
    sort_cmd.set_defaults(func=_cmd_sort)

    return root
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def main(argv: Sequence[str] | None = None) -> int:
    """Parse *argv* (``sys.argv[1:]`` when ``None``) and run the chosen subcommand.

    Returns the subcommand's exit status, or 1 when an input cannot be read
    or decoded. Invalid arguments (including a ``ValueError`` raised by the
    subcommand, e.g. an unknown ``--include`` source) are reported through
    ``parser.error``, which prints usage and exits with status 2.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    try:
        result: int = args.func(args)
    except UnicodeDecodeError as exc:
        # UnicodeDecodeError is a ValueError subclass, so it must be caught
        # first: a file that is not valid UTF-8 is an input problem, not a
        # command-line usage error.
        print(f"khmerthings: error: input is not valid UTF-8: {exc}", file=sys.stderr)
        return 1
    except ValueError as exc:  # e.g. unknown --include source
        parser.error(str(exc))
    except OSError as exc:  # e.g. missing or unreadable input file
        print(f"khmerthings: error: {exc}", file=sys.stderr)
        return 1
    return result
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    raise SystemExit(main())
|
khmerthings/clusters.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
"""Khmer character-cluster (KCC) segmentation.
|
|
2
|
+
|
|
3
|
+
A Khmer character cluster is the smallest user-visible unit of text: a base
|
|
4
|
+
consonant or independent vowel, optionally followed by subscript (coeng)
|
|
5
|
+
consonants, dependent vowels, and combining signs. Cluster boundaries are the
|
|
6
|
+
only positions where a word boundary can legally occur, which makes this the
|
|
7
|
+
foundation for word segmentation.
|
|
8
|
+
|
|
9
|
+
The segmenter is a deterministic scanner over NFC-normalized text. It never
|
|
10
|
+
drops or reorders characters: ``"".join(segment_clusters(t))`` always equals
|
|
11
|
+
``unicodedata.normalize("NFC", t)``. Malformed sequences (orphan coeng,
|
|
12
|
+
orphan vowel) are attached to the preceding cluster when one exists,
|
|
13
|
+
otherwise emitted as standalone clusters.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import unicodedata
|
|
19
|
+
|
|
20
|
+
from khmerthings.chars import (
|
|
21
|
+
is_coeng as _is_coeng,
|
|
22
|
+
)
|
|
23
|
+
from khmerthings.chars import (
|
|
24
|
+
is_consonant,
|
|
25
|
+
is_dependent_vowel,
|
|
26
|
+
is_independent_vowel,
|
|
27
|
+
is_inherent_vowel,
|
|
28
|
+
is_khmer,
|
|
29
|
+
is_sign,
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
__all__ = ["segment_clusters"]
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _is_base(ch: str) -> bool:
    """Report whether *ch* can start a cluster (consonant or independent vowel)."""
    if is_consonant(ch):
        return True
    return is_independent_vowel(ch)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _is_trailing(ch: str) -> bool:
    """Report whether *ch* extends the current cluster on its own.

    Covers dependent vowels, signs, and inherent vowels; coeng pairs are
    handled separately by the caller.
    """
    checks = (is_dependent_vowel, is_sign, is_inherent_vowel)
    return any(check(ch) for check in checks)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def segment_clusters(text: str) -> list[str]:
    """Break *text* into Khmer character clusters.

    The text is NFC-normalized first, and joining the returned pieces gives
    back that normalized text. A cluster starts at a base character and
    absorbs any following coeng+base pairs and trailing marks. A mark or coeng
    that does not follow a base is appended to the previous piece when that
    piece ends in a Khmer character. Every other character becomes a piece of
    its own.
    """
    normalized = unicodedata.normalize("NFC", text)
    length = len(normalized)
    pieces: list[str] = []
    pos = 0
    while pos < length:
        head = normalized[pos]

        if _is_base(head):
            # Extend the cluster as far as subscripts and marks allow.
            end = pos + 1
            while end < length:
                mark = normalized[end]
                if _is_coeng(mark) and end + 1 < length and _is_base(normalized[end + 1]):
                    end += 2  # coeng plus the base it subscripts
                    continue
                if not _is_trailing(mark):
                    break
                end += 1
            pieces.append(normalized[pos:end])
            pos = end
            continue

        if (_is_trailing(head) or _is_coeng(head)) and pieces and is_khmer(pieces[-1][-1]):
            # Orphan combining mark: glue it onto the preceding Khmer piece.
            pieces[-1] += head
        else:
            # Digits, symbols, punctuation, non-Khmer, or leading orphan marks.
            pieces.append(head)
        pos += 1
    return pieces
|
khmerthings/counter.py
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"""Khmer-aware word counting."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import unicodedata
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
from khmerthings.chars import is_khmer
|
|
9
|
+
from khmerthings.clusters import segment_clusters
|
|
10
|
+
from khmerthings.lexicon import Lexicon
|
|
11
|
+
from khmerthings.tokenizer import TokenType, tokenize
|
|
12
|
+
|
|
13
|
+
__all__ = ["WordCount", "analyze", "count_words"]
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
@dataclass(frozen=True, slots=True)
|
|
17
|
+
class WordCount:
|
|
18
|
+
"""Word and character statistics for a text.
|
|
19
|
+
|
|
20
|
+
``total_words`` is the sum of known Khmer words, unknown Khmer word
|
|
21
|
+
groups, Latin words, and number tokens (ASCII or Khmer digits).
|
|
22
|
+
``characters`` counts characters of the NFC-normalized text.
|
|
23
|
+
"""
|
|
24
|
+
|
|
25
|
+
total_words: int
|
|
26
|
+
khmer_words: int
|
|
27
|
+
unknown_khmer_words: int
|
|
28
|
+
latin_words: int
|
|
29
|
+
numbers: int
|
|
30
|
+
clusters: int
|
|
31
|
+
khmer_characters: int
|
|
32
|
+
characters: int
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def analyze(text: str, lexicon: Lexicon | None = None) -> WordCount:
    """Tokenize *text* and return its word and character statistics.

    The text is NFC-normalized before tokenizing. Cluster counts come from
    Khmer word tokens (known or unknown) via ``segment_clusters``, plus one
    per character of every Khmer-digit token.
    """
    normalized = unicodedata.normalize("NFC", text)
    tokens = tokenize(normalized, lexicon)

    known = unknown = latin = numbers = clusters = 0
    for token in tokens:
        kind = token.type
        if kind is TokenType.KHMER_WORD:
            known += 1
            clusters += len(segment_clusters(token.text))
        elif kind is TokenType.KHMER_UNKNOWN:
            unknown += 1
            clusters += len(segment_clusters(token.text))
        elif kind is TokenType.LATIN:
            latin += 1
        elif kind in (TokenType.NUMBER, TokenType.KHMER_DIGIT):
            numbers += 1
            # Only Khmer digits add to the cluster total, one per character.
            if kind is TokenType.KHMER_DIGIT:
                clusters += len(token.text)

    khmer_chars = len([ch for ch in normalized if is_khmer(ch)])
    return WordCount(
        total_words=known + unknown + latin + numbers,
        khmer_words=known,
        unknown_khmer_words=unknown,
        latin_words=latin,
        numbers=numbers,
        clusters=clusters,
        khmer_characters=khmer_chars,
        characters=len(normalized),
    )
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def count_words(text: str, lexicon: Lexicon | None = None) -> int:
    """Return the total word count of *text*: Khmer words, Latin words, and numbers."""
    stats = analyze(text, lexicon)
    return stats.total_words
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# khmerthings modern lexicon: slang, informal register, loanwords, and
|
|
2
|
+
# trending vocabulary. One entry per line, UTF-8, NFC, Khmer letters/marks only.
|
|
3
|
+
# Hand-curated entry by entry; candidates researched from public sources and
|
|
4
|
+
# contemporary Khmer media usage (spellings cross-checked, no wordlist
|
|
5
|
+
# imported wholesale):
|
|
6
|
+
# - https://km.wiktionary.org/wiki/ស្ទាវ
|
|
7
|
+
# - https://thmeythmey.com/detail/130843 (ឡូយ)
|
|
8
|
+
# - https://ling-app.com/blog/khmer-slang-words/
|
|
9
|
+
# This register shifts quickly; entries here are expected to grow and be
|
|
10
|
+
# revised more often than words.txt.
|
|
11
|
+
#
|
|
12
|
+
# --- slang & informal ---
|
|
13
|
+
ឡូយ
|
|
14
|
+
ស្ទាវ
|
|
15
|
+
ឡប់
|
|
16
|
+
ភ្លើ
|
|
17
|
+
ចង្រៃ
|
|
18
|
+
ម៉ង
|
|
19
|
+
អេម
|
|
20
|
+
ប្រូ
|
|
21
|
+
អញ
|
|
22
|
+
ហែង
|
|
23
|
+
# --- internet, tech & media loanwords ---
|
|
24
|
+
ហ្វេសប៊ុក
|
|
25
|
+
យូធូប
|
|
26
|
+
តេឡេក្រាម
|
|
27
|
+
អនឡាញ
|
|
28
|
+
អ៊ីនធឺណិត
|
|
29
|
+
វីដេអូ
|
|
30
|
+
កាមេរ៉ា
|
|
31
|
+
ហ្គេម
|
|
32
|
+
ស្មាតហ្វូន
|
|
33
|
+
ឡាយ
|
|
34
|
+
សែលហ្វី
|
|
35
|
+
# --- everyday modern loanwords ---
|
|
36
|
+
កូវីដ
|
|
37
|
+
វ៉ាក់សាំង
|
|
38
|
+
ដុល្លារ
|
|
39
|
+
ម៉ាស៊ីន
|
|
40
|
+
ម៉ូដ
|
|
41
|
+
សាំង
|
|
42
|
+
កាដូ
|
|
43
|
+
ភីហ្សា
|
|
44
|
+
ប៊ិច
|