normalize-kap-orthography 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Keith Manaloto
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,127 @@
1
+ Metadata-Version: 2.4
2
+ Name: normalize-kap-orthography
3
+ Version: 0.1.0
4
+ Summary: Normalize Kapampangan words from Spanish-era (1730s) orthography to modern K-based orthography
5
+ Author: Keith Manaloto
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/keithmanaloto/normalize-kap-orthography
8
+ Project-URL: Issues, https://github.com/keithmanaloto/normalize-kap-orthography
9
+ Keywords: kapampangan,pampanga,orthography,nlp,linguistics
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Text Processing :: Linguistic
15
+ Requires-Python: >=3.10
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Dynamic: license-file
19
+
20
+ # normalize-kap-orthography
21
+
22
+ A Python utility to normalize Kapampangan words from Spanish-era (1730s) orthography to modern K-based orthography.
23
+
24
+ Built to make historical Kapampangan texts — like Bergaño's 1732 *Vocabulario de la Lengua Pampanga* — more accessible to modern readers, researchers, and NLP pipelines.
25
+
26
+ ## Background
27
+
28
+ Before the Spanish conquest, Kapampangans used their own indigenous writing system (Kulitan). Spanish missionaries romanized the language using Spanish orthographic conventions (C, Q, Ñ, LL, etc.). Over the past century, multiple competing romanized orthographies have emerged; they are summarized in the table that follows the disclaimer below.
29
+
30
+ ## Disclaimer
31
+
32
+ I am not a linguist — I'm a native Kapampangan speaker who happens to be a computer science graduate. The conversion rules in this tool were identified through patterns I recognized while cleaning historical dictionary data, not through formal linguistic analysis. The script was spot-checked against the dataset and appears accurate, but it has not been exhaustively verified. If you spot errors or have linguistic expertise to contribute, please open an issue or PR.
33
+
34
+ | System | Also known as | Key features |
35
+ |---|---|---|
36
+ | Spanish-era ("Q & C") | *Súlat Bacúlud*, Old Orthography | Uses QU, C, Ñ, LL — the system used in colonial-era texts |
37
+ | ABAKADA ("K") | *Súlat Wáwâ*, New Orthography | K-based, aligned with the Philippine national orthography |
38
+ | Samson Hybrid | *Ámung Samson* | Retains C before a/o/u, replaces QU→K, adds diacritical marks |
39
+ | Batiáuan Revised | *Súlat Wáwâ a alâng WA* | K-based without W, with diacritical marks |
40
+
41
+ This tool converts from the **Spanish-era system** to a **modern K-based form** (closest to ABAKADA). For more on the orthography dispute, see [Pangilinan (2006)](https://sil-philippines-languages.org/ical/papers/pangilinan-Dispute%20on%20Orthography.pdf).
42
+
43
+ ## What it does
44
+
45
+ The converter applies two phases of transformation:
46
+
47
+ **Phase 1 — Spanish letter substitutions:**
48
+ - `QUI` → `KI`, `QUE` → `KE`
49
+ - `C` → `K` (except after `SI`)
50
+ - `Ñ` → `N`, `LL` → `L`
51
+ - Word-initial `V` → `W`
52
+
53
+ **Phase 2 — Vowel cluster and diphthong normalization:**
54
+ - `AO` → `O`, `AI`/`AY` → `E` (word-final, non-initial)
55
+ - `UA` → `WA` (word-final, not after `L`), `UO` → `WO` (not after `B`)
56
+ - Various other diphthong simplifications
57
+
58
+ An **exceptions table** handles words that don't follow general patterns, and a **two-pass conversion** catches cascading transformations.
59
+
60
+ ## Installation
61
+
62
+ ```shell
63
+ pip install normalize-kap-orthography
64
+ ```
65
+
66
+ Or just copy `normalize_orthography.py` into your project.
67
+
68
+ ## Usage
69
+
70
+ ```python
71
+ from normalize_orthography import convert_orthography
72
+
73
+ convert_orthography("QUINANG") # → "KINANG"
74
+
75
+ convert_orthography("VATAUAT") # → "WATAWAT"
76
+
77
+ convert_orthography("QUECAI") # → "KEKE"
78
+
79
+ convert_orthography("KINANG") # → None (already modern)
80
+ ```
81
+
82
+ Returns the normalized form, or `None` if no conversion is needed.
83
+
84
+ ### CLI
85
+
86
+ ```shell
87
+ python normalize_orthography.py
88
+ ```
89
+
90
+
91
+ Runs a small set of built-in test cases.
92
+
93
+ ## Limitations
94
+
95
+ - **Not linguistically verified.** The rules were identified through pattern recognition by a native speaker, not through formal linguistic analysis. The script was spot-checked against dictionary data but not exhaustively validated.
96
+ - **No diacritical marks.** The script does not handle stress marking, which is important in Kapampangan — e.g., *masakit* (painful) vs. *masákit* (difficult) vs. *másakit* (ill) are three distinct words.
97
+ - **One-directional.** Currently only converts Spanish-era → modern. Reverse conversion is not supported.
98
+ - **Uppercase only.** Input is converted to uppercase internally; output is always uppercase.
99
+
100
+ ## Origin
101
+
102
+ Originally written in Dart as part of the v2 of [Learn Kulitan](https://github.com/keithliam/learn-kulitan-app), then rewritten in Python with **Claude Code Opus 4.6**.
103
+
104
+ ## Real-World Usage
105
+
106
+ This script was originally used to normalize ~5,000 words extracted from [*Vocabulario de la Lengua Pampanga*](https://archive.org/details/aqn8189.0001.001.umich.edu/page/1/mode/2up) by Fray Diego Bergaño, originally published in 1732 — one of the earliest known dictionaries of the Kapampángan language. About 40% of entries (1,989 out of 4,971) had their orthography normalized.
107
+
108
+ The raw, uncleaned entries and their cleaned, normalized versions are available as part of an open dataset on Hugging Face:
109
+
110
+ **[keithmanaloto/kapampangan-dictionary-embeddings](https://huggingface.co/datasets/keithmanaloto/kapampangan-dictionary-embeddings)**
111
+
112
+ The dataset also includes LLM-enriched metadata and pre-computed embeddings across multiple models — designed for semantic search, retrieval, and clustering over Kapampángan vocabulary. Both the original 1730s spelling and the normalized modern form are preserved in the dataset.
113
+
114
+ For the full story behind the dataset and what I learned building it, see the article:
115
+ [From a 300-Year-Old Dictionary to Hugging Face: I Built Kapampángan's First Embedding Dataset](https://keithmanaloto.medium.com/from-a-300-year-old-dictionary-to-hugging-face-i-built-kapampángans-first-embedding-dataset-dce2b877bd83)
116
+
117
+ ## Contributing
118
+
119
+ Contributions are welcome, especially:
120
+ - Expanding the exceptions table
121
+ - Adding test coverage against known word lists
122
+ - Adding diacritical mark support
123
+ - Supporting additional orthographic target systems
124
+
125
+ ## License
126
+
127
+ MIT
@@ -0,0 +1,108 @@
1
+ # normalize-kap-orthography
2
+
3
+ A Python utility to normalize Kapampangan words from Spanish-era (1730s) orthography to modern K-based orthography.
4
+
5
+ Built to make historical Kapampangan texts — like Bergaño's 1732 *Vocabulario de la Lengua Pampanga* — more accessible to modern readers, researchers, and NLP pipelines.
6
+
7
+ ## Background
8
+
9
+ Before the Spanish conquest, Kapampangans used their own indigenous writing system (Kulitan). Spanish missionaries romanized the language using Spanish orthographic conventions (C, Q, Ñ, LL, etc.). Over the past century, multiple competing romanized orthographies have emerged; they are summarized in the table that follows the disclaimer below.
10
+
11
+ ## Disclaimer
12
+
13
+ I am not a linguist — I'm a native Kapampangan speaker who happens to be a computer science graduate. The conversion rules in this tool were identified through patterns I recognized while cleaning historical dictionary data, not through formal linguistic analysis. The script was spot-checked against the dataset and appears accurate, but it has not been exhaustively verified. If you spot errors or have linguistic expertise to contribute, please open an issue or PR.
14
+
15
+ | System | Also known as | Key features |
16
+ |---|---|---|
17
+ | Spanish-era ("Q & C") | *Súlat Bacúlud*, Old Orthography | Uses QU, C, Ñ, LL — the system used in colonial-era texts |
18
+ | ABAKADA ("K") | *Súlat Wáwâ*, New Orthography | K-based, aligned with the Philippine national orthography |
19
+ | Samson Hybrid | *Ámung Samson* | Retains C before a/o/u, replaces QU→K, adds diacritical marks |
20
+ | Batiáuan Revised | *Súlat Wáwâ a alâng WA* | K-based without W, with diacritical marks |
21
+
22
+ This tool converts from the **Spanish-era system** to a **modern K-based form** (closest to ABAKADA). For more on the orthography dispute, see [Pangilinan (2006)](https://sil-philippines-languages.org/ical/papers/pangilinan-Dispute%20on%20Orthography.pdf).
23
+
24
+ ## What it does
25
+
26
+ The converter applies two phases of transformation:
27
+
28
+ **Phase 1 — Spanish letter substitutions:**
29
+ - `QUI` → `KI`, `QUE` → `KE`
30
+ - `C` → `K` (except after `SI`)
31
+ - `Ñ` → `N`, `LL` → `L`
32
+ - Word-initial `V` → `W`
33
+
34
+ **Phase 2 — Vowel cluster and diphthong normalization:**
35
+ - `AO` → `O`, `AI`/`AY` → `E` (word-final, non-initial)
36
+ - `UA` → `WA` (word-final, not after `L`), `UO` → `WO` (not after `B`)
37
+ - Various other diphthong simplifications
38
+
39
+ An **exceptions table** handles words that don't follow general patterns, and a **two-pass conversion** catches cascading transformations.
40
+
41
+ ## Installation
42
+
43
+ ```shell
44
+ pip install normalize-kap-orthography
45
+ ```
46
+
47
+ Or just copy `normalize_orthography.py` into your project.
48
+
49
+ ## Usage
50
+
51
+ ```python
52
+ from normalize_orthography import convert_orthography
53
+
54
+ convert_orthography("QUINANG") # → "KINANG"
55
+
56
+ convert_orthography("VATAUAT") # → "WATAWAT"
57
+
58
+ convert_orthography("QUECAI") # → "KEKE"
59
+
60
+ convert_orthography("KINANG") # → None (already modern)
61
+ ```
62
+
63
+ Returns the normalized form, or `None` if no conversion is needed.
64
+
65
+ ### CLI
66
+
67
+ ```shell
68
+ python normalize_orthography.py
69
+ ```
70
+
71
+
72
+ Runs a small set of built-in test cases.
73
+
74
+ ## Limitations
75
+
76
+ - **Not linguistically verified.** The rules were identified through pattern recognition by a native speaker, not through formal linguistic analysis. The script was spot-checked against dictionary data but not exhaustively validated.
77
+ - **No diacritical marks.** The script does not handle stress marking, which is important in Kapampangan — e.g., *masakit* (painful) vs. *masákit* (difficult) vs. *másakit* (ill) are three distinct words.
78
+ - **One-directional.** Currently only converts Spanish-era → modern. Reverse conversion is not supported.
79
+ - **Uppercase only.** Input is converted to uppercase internally; output is always uppercase.
80
+
81
+ ## Origin
82
+
83
+ Originally written in Dart as part of the v2 of [Learn Kulitan](https://github.com/keithliam/learn-kulitan-app), then rewritten in Python with **Claude Code Opus 4.6**.
84
+
85
+ ## Real-World Usage
86
+
87
+ This script was originally used to normalize ~5,000 words extracted from [*Vocabulario de la Lengua Pampanga*](https://archive.org/details/aqn8189.0001.001.umich.edu/page/1/mode/2up) by Fray Diego Bergaño, originally published in 1732 — one of the earliest known dictionaries of the Kapampángan language. About 40% of entries (1,989 out of 4,971) had their orthography normalized.
88
+
89
+ The raw, uncleaned entries and their cleaned, normalized versions are available as part of an open dataset on Hugging Face:
90
+
91
+ **[keithmanaloto/kapampangan-dictionary-embeddings](https://huggingface.co/datasets/keithmanaloto/kapampangan-dictionary-embeddings)**
92
+
93
+ The dataset also includes LLM-enriched metadata and pre-computed embeddings across multiple models — designed for semantic search, retrieval, and clustering over Kapampángan vocabulary. Both the original 1730s spelling and the normalized modern form are preserved in the dataset.
94
+
95
+ For the full story behind the dataset and what I learned building it, see the article:
96
+ [From a 300-Year-Old Dictionary to Hugging Face: I Built Kapampángan's First Embedding Dataset](https://keithmanaloto.medium.com/from-a-300-year-old-dictionary-to-hugging-face-i-built-kapampángans-first-embedding-dataset-dce2b877bd83)
97
+
98
+ ## Contributing
99
+
100
+ Contributions are welcome, especially:
101
+ - Expanding the exceptions table
102
+ - Adding test coverage against known word lists
103
+ - Adding diacritical mark support
104
+ - Supporting additional orthographic target systems
105
+
106
+ ## License
107
+
108
+ MIT
@@ -0,0 +1,127 @@
1
+ Metadata-Version: 2.4
2
+ Name: normalize-kap-orthography
3
+ Version: 0.1.0
4
+ Summary: Normalize Kapampangan words from Spanish-era (1730s) orthography to modern K-based orthography
5
+ Author: Keith Manaloto
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/keithmanaloto/normalize-kap-orthography
8
+ Project-URL: Issues, https://github.com/keithmanaloto/normalize-kap-orthography
9
+ Keywords: kapampangan,pampanga,orthography,nlp,linguistics
10
+ Classifier: Development Status :: 3 - Alpha
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Science/Research
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Text Processing :: Linguistic
15
+ Requires-Python: >=3.10
16
+ Description-Content-Type: text/markdown
17
+ License-File: LICENSE
18
+ Dynamic: license-file
19
+
20
+ # normalize-kap-orthography
21
+
22
+ A Python utility to normalize Kapampangan words from Spanish-era (1730s) orthography to modern K-based orthography.
23
+
24
+ Built to make historical Kapampangan texts — like Bergaño's 1732 *Vocabulario de la Lengua Pampanga* — more accessible to modern readers, researchers, and NLP pipelines.
25
+
26
+ ## Background
27
+
28
+ Before the Spanish conquest, Kapampangans used their own indigenous writing system (Kulitan). Spanish missionaries romanized the language using Spanish orthographic conventions (C, Q, Ñ, LL, etc.). Over the past century, multiple competing romanized orthographies have emerged; they are summarized in the table that follows the disclaimer below.
29
+
30
+ ## Disclaimer
31
+
32
+ I am not a linguist — I'm a native Kapampangan speaker who happens to be a computer science graduate. The conversion rules in this tool were identified through patterns I recognized while cleaning historical dictionary data, not through formal linguistic analysis. The script was spot-checked against the dataset and appears accurate, but it has not been exhaustively verified. If you spot errors or have linguistic expertise to contribute, please open an issue or PR.
33
+
34
+ | System | Also known as | Key features |
35
+ |---|---|---|
36
+ | Spanish-era ("Q & C") | *Súlat Bacúlud*, Old Orthography | Uses QU, C, Ñ, LL — the system used in colonial-era texts |
37
+ | ABAKADA ("K") | *Súlat Wáwâ*, New Orthography | K-based, aligned with the Philippine national orthography |
38
+ | Samson Hybrid | *Ámung Samson* | Retains C before a/o/u, replaces QU→K, adds diacritical marks |
39
+ | Batiáuan Revised | *Súlat Wáwâ a alâng WA* | K-based without W, with diacritical marks |
40
+
41
+ This tool converts from the **Spanish-era system** to a **modern K-based form** (closest to ABAKADA). For more on the orthography dispute, see [Pangilinan (2006)](https://sil-philippines-languages.org/ical/papers/pangilinan-Dispute%20on%20Orthography.pdf).
42
+
43
+ ## What it does
44
+
45
+ The converter applies two phases of transformation:
46
+
47
+ **Phase 1 — Spanish letter substitutions:**
48
+ - `QUI` → `KI`, `QUE` → `KE`
49
+ - `C` → `K` (except after `SI`)
50
+ - `Ñ` → `N`, `LL` → `L`
51
+ - Word-initial `V` → `W`
52
+
53
+ **Phase 2 — Vowel cluster and diphthong normalization:**
54
+ - `AO` → `O`, `AI`/`AY` → `E` (word-final, non-initial)
55
+ - `UA` → `WA` (word-final, not after `L`), `UO` → `WO` (not after `B`)
56
+ - Various other diphthong simplifications
57
+
58
+ An **exceptions table** handles words that don't follow general patterns, and a **two-pass conversion** catches cascading transformations.
59
+
60
+ ## Installation
61
+
62
+ ```shell
63
+ pip install normalize-kap-orthography
64
+ ```
65
+
66
+ Or just copy `normalize_orthography.py` into your project.
67
+
68
+ ## Usage
69
+
70
+ ```python
71
+ from normalize_orthography import convert_orthography
72
+
73
+ convert_orthography("QUINANG") # → "KINANG"
74
+
75
+ convert_orthography("VATAUAT") # → "WATAWAT"
76
+
77
+ convert_orthography("QUECAI") # → "KEKE"
78
+
79
+ convert_orthography("KINANG") # → None (already modern)
80
+ ```
81
+
82
+ Returns the normalized form, or `None` if no conversion is needed.
83
+
84
+ ### CLI
85
+
86
+ ```shell
87
+ python normalize_orthography.py
88
+ ```
89
+
90
+
91
+ Runs a small set of built-in test cases.
92
+
93
+ ## Limitations
94
+
95
+ - **Not linguistically verified.** The rules were identified through pattern recognition by a native speaker, not through formal linguistic analysis. The script was spot-checked against dictionary data but not exhaustively validated.
96
+ - **No diacritical marks.** The script does not handle stress marking, which is important in Kapampangan — e.g., *masakit* (painful) vs. *masákit* (difficult) vs. *másakit* (ill) are three distinct words.
97
+ - **One-directional.** Currently only converts Spanish-era → modern. Reverse conversion is not supported.
98
+ - **Uppercase only.** Input is converted to uppercase internally; output is always uppercase.
99
+
100
+ ## Origin
101
+
102
+ Originally written in Dart as part of the v2 of [Learn Kulitan](https://github.com/keithliam/learn-kulitan-app), then rewritten in Python with **Claude Code Opus 4.6**.
103
+
104
+ ## Real-World Usage
105
+
106
+ This script was originally used to normalize ~5,000 words extracted from [*Vocabulario de la Lengua Pampanga*](https://archive.org/details/aqn8189.0001.001.umich.edu/page/1/mode/2up) by Fray Diego Bergaño, originally published in 1732 — one of the earliest known dictionaries of the Kapampángan language. About 40% of entries (1,989 out of 4,971) had their orthography normalized.
107
+
108
+ The raw, uncleaned entries and their cleaned, normalized versions are available as part of an open dataset on Hugging Face:
109
+
110
+ **[keithmanaloto/kapampangan-dictionary-embeddings](https://huggingface.co/datasets/keithmanaloto/kapampangan-dictionary-embeddings)**
111
+
112
+ The dataset also includes LLM-enriched metadata and pre-computed embeddings across multiple models — designed for semantic search, retrieval, and clustering over Kapampángan vocabulary. Both the original 1730s spelling and the normalized modern form are preserved in the dataset.
113
+
114
+ For the full story behind the dataset and what I learned building it, see the article:
115
+ [From a 300-Year-Old Dictionary to Hugging Face: I Built Kapampángan's First Embedding Dataset](https://keithmanaloto.medium.com/from-a-300-year-old-dictionary-to-hugging-face-i-built-kapampángans-first-embedding-dataset-dce2b877bd83)
116
+
117
+ ## Contributing
118
+
119
+ Contributions are welcome, especially:
120
+ - Expanding the exceptions table
121
+ - Adding test coverage against known word lists
122
+ - Adding diacritical mark support
123
+ - Supporting additional orthographic target systems
124
+
125
+ ## License
126
+
127
+ MIT
@@ -0,0 +1,8 @@
1
+ LICENSE
2
+ README.md
3
+ normalize_orthography.py
4
+ pyproject.toml
5
+ normalize_kap_orthography.egg-info/PKG-INFO
6
+ normalize_kap_orthography.egg-info/SOURCES.txt
7
+ normalize_kap_orthography.egg-info/dependency_links.txt
8
+ normalize_kap_orthography.egg-info/top_level.txt
@@ -0,0 +1,129 @@
1
+ """
2
+ Normalize Kapampangan orthography from 1730s Spanish-influenced spelling to modern form.
3
+
4
+ Ported from learn-kulitan-app-v2/lib/utils/utils.dart (TextUtils.convertOrthography).
5
+
6
+ Usage as module:
7
+ from normalize_orthography import convert_orthography
8
+ result = convert_orthography("QUINANG") # -> "KINANG"
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import re
14
+
15
+
16
+ EXCEPTIONS = {  # whole-word overrides; convert_orthography looks the upper-cased word up here before applying any rule
17
+ "CAI": "KAYI",
18
+ "AIA": "AYA",
19
+ "VATAUAT": "WATAWAT",
20
+ "VALI": "WALI",
21
+ "PASIBAIO": "PASIBAYO",
22
+ "MAIUTPUT": "MAYUTPUT",
23
+ "OGNAY": "UGNE",
24
+ "BABAY": "BABAYI",
25
+ "IUAD": "IWAD",
26
+ "DALIUAUAT": "DALYAWAT",
27
+ "GUIUA": "GIWA",
28
+ "SAGUESAI": "SAGESE",
29
+ "MAUI": "MAWI",
30
+ "QUECAI": "KEKE",
31
+ "CABAYIAN": "KABAYIAN",
32
+ # Manual edge case
33
+ "PAYNG": "PAING",
34
+ }
35
+
36
+ # Phase 1: Spanish orthography normalization (pairs are applied top to bottom)
37
+ REPLACEMENTS_1 = [
38
+ ("QUI", "KI"),
39
+ ("QUE", "KE"),
40
+ (re.compile(r"(?<!SI)C"), "K"),  # C -> K unless immediately preceded by "SI"
41
+ ("Ñ", "N"),
42
+ ("LL", "L"),
43
+ ("LLA", "LA"),  # can only match when 3+ consecutive Ls were present; pairs were already collapsed above
44
+ ("LL", "L"),  # likewise only affects what is left of runs of 3+ Ls
45
+ ("ÑY", "NY"),  # NOTE(review): looks unreachable, every Ñ was already replaced above - confirm
46
+ ("NN", "N"),
47
+ (re.compile(r"^V"), "W"),  # word-initial V -> W
48
+ ]
49
+
50
+ # Phase 2: Vowel cluster and diphthong normalization
51
+ # Note: Dart supports variable-width lookbehinds but Python's re module doesn't.
52
+ # Patterns like (?<=..+) ("preceded by 2+ chars") are rewritten as fixed-width
53
+ # lookbehinds such as (?<=..), which accept the same positions within a word.
54
+ REPLACEMENTS_2 = [
55
+ (re.compile(r"^O(?!U)"), "U"), # O -> U (word-initial, not before U)
56
+ (re.compile(r"(?<=..)AO$"), "O"), # AO -> O (word-final, 2+ chars before)
57
+ (re.compile(r"(?<=.)AI$"), "E"), # AI -> E (word-final, 1+ char before)
58
+ (re.compile(r"(?<=.)AY$"), "E"), # AY -> E (word-final, 1+ char before)
59
+ (re.compile(r"AU$"), "AW"), # AU -> AW (word-final)
60
+ (re.compile(r"(?<!L)UA$"), "WA"), # UA -> WA (word-final, not after L)
61
+ (re.compile(r"(?<!B)UO"), "WO"), # UO -> WO (not after B)
62
+ (re.compile(r"(?<=..)IA"), "YA"), # IA -> YA (2+ chars before)
63
+ (re.compile(r"IU(?=A)"), "IW"), # IU -> IW (before A)
64
+ (re.compile(r"IU(?=E)"), "IW"), # IU -> IW (before E)
65
+ (re.compile(r"(?<=.)IU(?!A)"), "YU"), # IU -> YU (1+ char before, not before A)
66
+ (re.compile(r"(?<=..)UI$"), "I"), # UI -> I (2+ chars before, word-final)
67
+ (re.compile(r"(?<=..)IO$"), "YO"), # IO -> YO (2+ chars before, word-final)
68
+ (re.compile(r"(?<=..z)IY$"), "I"), # IY -> I (word-final, preceded by any 2 chars + a literal lowercase z). NOTE(review): convert_orthography upper-cases its input, so this looks unreachable - confirm intent
69
+ (re.compile(r"IE$"), "YE"), # IE -> YE (word-final)
70
+ ("AUA", "AWA"),
71
+ ("AUI", "AWI"),
72
+ ("EUA", "EWA"),
73
+ ("UE", "WE"),
74
+ ("KK", "K"),
75
+ ]
76
+
77
+
78
+ def _apply_replacements(replacements: list, word: str) -> str:  # run every (pattern, repl) pair over word, in list order
79
+ for pattern, repl in replacements:
80
+ if isinstance(pattern, str):
81
+ word = word.replace(pattern, repl)  # plain string: replace every literal occurrence
82
+ else:
83
+ word = pattern.sub(repl, word)  # anything else is used as a compiled regex
84
+ return word
85
+
86
+
87
+ def _convert_orthography(word: str) -> str:  # one full pass: phase 1, hyphen clean-up, then phase 2
88
+ new_word = _apply_replacements(REPLACEMENTS_1, word)
89
+ # Remove gemination across hyphens: a word character repeated on both sides of "-" collapses to one (K-K -> K)
90
+ new_word = re.sub(r"(\w)-\1", r"\1", new_word)
91
+ return _apply_replacements(REPLACEMENTS_2, new_word)
92
+
93
+
94
+ def convert_orthography(word: str) -> str | None:
95
+ """Convert a word from 1730s orthography to modern form.
96
+
97
+ Returns the normalized form in upper case, or None if no conversion is
98
+ needed (i.e., the rules leave the upper-cased word unchanged).
99
+ """
100
+ upper = word.upper()  # all matching is done on the upper-cased word
101
+
102
+ if upper in EXCEPTIONS:  # whole-word overrides bypass the rule tables
103
+ return EXCEPTIONS[upper]
104
+
105
+ converted = _convert_orthography(upper)
106
+ if converted == upper:
107
+ return None
108
+
109
+ # Second pass: rerun both phases on the first result to catch cascading changes
110
+ converted2 = _convert_orthography(converted)
111
+ if converted2 == upper:  # the second pass undid the first, so nothing changed overall
112
+ return None
113
+
114
+ return converted2
115
+
116
+
117
+ if __name__ == "__main__":
118
+ # Quick self-check: print every sample conversion followed by OK or FAIL
119
+ test_cases = [
120
+ ("QUINANG", "KINANG"),
121
+ ("CAI", "KAYI"),
122
+ ("VATAUAT", "WATAWAT"),
123
+ ("PAYNG", "PAING"),
124
+ ("QUECAI", "KEKE"),
125
+ ]
126
+ for original, expected in test_cases:
127
+ result = convert_orthography(original)
128
+ status = "OK" if result == expected else f"FAIL (got {result})"
129
+ print(f" {original} -> {result} {status}")
@@ -0,0 +1,29 @@
1
+ [build-system]
2
+ requires = ["setuptools>=77.0"]  # 77+ is needed for the SPDX string form of `license` used in [project]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "normalize-kap-orthography"
7
+ version = "0.1.0"
8
+ description = "Normalize Kapampangan words from Spanish-era (1730s) orthography to modern K-based orthography"
9
+ readme = "README.md"
10
+ license = "MIT"  # SPDX license expression
11
+ requires-python = ">=3.10"
12
+ authors = [
13
+ { name = "Keith Manaloto" },
14
+ ]
15
+ keywords = ["kapampangan", "pampanga", "orthography", "nlp", "linguistics"]
16
+ classifiers = [
17
+ "Development Status :: 3 - Alpha",
18
+ "Intended Audience :: Developers",
19
+ "Intended Audience :: Science/Research",
20
+ "Programming Language :: Python :: 3",
21
+ "Topic :: Text Processing :: Linguistic",
22
+ ]
23
+
24
+ [tool.setuptools]
25
+ py-modules = ["normalize_orthography"]  # shipped as a single top-level module (normalize_orthography.py)
26
+
27
+ [project.urls]
28
+ Homepage = "https://github.com/keithmanaloto/normalize-kap-orthography"
29
+ Issues = "https://github.com/keithmanaloto/normalize-kap-orthography/issues"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+