mlsynth 0.0.2__tar.gz → 0.0.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mlsynth-0.0.2 → mlsynth-0.0.4}/LIMITATIONS.md +9 -13
- {mlsynth-0.0.2 → mlsynth-0.0.4}/PKG-INFO +13 -13
- {mlsynth-0.0.2 → mlsynth-0.0.4}/README.md +10 -10
- {mlsynth-0.0.2 → mlsynth-0.0.4}/pyproject.toml +3 -3
- {mlsynth-0.0.2 → mlsynth-0.0.4}/src/mlsynth/__init__.py +5 -2
- mlsynth-0.0.4/src/mlsynth/gender.py +39 -0
- {mlsynth-0.0.2 → mlsynth-0.0.4}/src/mlsynth/nouns.py +71 -1
- {mlsynth-0.0.2 → mlsynth-0.0.4}/src/mlsynth/rules.py +11 -1
- mlsynth-0.0.4/tests/test_gender.py +36 -0
- {mlsynth-0.0.2 → mlsynth-0.0.4}/tests/test_nouns.py +5 -6
- mlsynth-0.0.4/tests/test_plurals.py +174 -0
- {mlsynth-0.0.2 → mlsynth-0.0.4}/.gitignore +0 -0
- {mlsynth-0.0.2 → mlsynth-0.0.4}/LICENSE +0 -0
- {mlsynth-0.0.2 → mlsynth-0.0.4}/NOTICE +0 -0
- {mlsynth-0.0.2 → mlsynth-0.0.4}/REFERENCES.md +0 -0
- {mlsynth-0.0.2 → mlsynth-0.0.4}/src/mlsynth/pronouns.py +0 -0
- {mlsynth-0.0.2 → mlsynth-0.0.4}/src/mlsynth/py.typed +0 -0
- {mlsynth-0.0.2 → mlsynth-0.0.4}/src/mlsynth/types.py +0 -0
- {mlsynth-0.0.2 → mlsynth-0.0.4}/tests/test_pronouns.py +0 -0
|
@@ -2,25 +2,21 @@
|
|
|
2
2
|
|
|
3
3
|
mlsynth never emits a form it has not verified; when it cannot, it raises
|
|
4
4
|
`NotImplementedError` (a supported class, but that case or number is not encoded) or
|
|
5
|
-
`UnsupportedRoot` (the root fits no class). The items below are genuine constraints
|
|
6
|
-
|
|
7
|
-
variants) are tracked as
|
|
8
|
-
|
|
9
|
-
## Plurals: partial coverage
|
|
10
|
-
|
|
11
|
-
- The classes `am_neuter`, `vowel_anuswara`, `i_vowel`, `u_vowel`, and `ṭ_geminate` have
|
|
12
|
-
plurals. `a_stem` and the chillu classes do not yet: their plurals are
|
|
13
|
-
animacy-conditioned (`-മാർ` human versus `-കൾ` inanimate, with irregulars like `മക്കൾ`),
|
|
14
|
-
which is not predictable from the stem shape and is not yet modelled. Requesting one
|
|
15
|
-
raises `NotImplementedError`.
|
|
5
|
+
`UnsupportedRoot` (the root fits no class). The items below are genuine constraints:
|
|
6
|
+
distinctions the orthography does not carry, and problems outside this package's scope.
|
|
7
|
+
Planned-but-unbuilt features (clitics, postpositions, stylistic variants) are tracked as
|
|
8
|
+
future work in the README, not here.
|
|
16
9
|
|
|
17
10
|
## Distinctions the input does not carry
|
|
18
11
|
|
|
12
|
+
- **Animacy is not recoverable from spelling.** `a_stem` and chillu plurals therefore
|
|
13
|
+
require an explicit `animacy` and raise without it, since `-മാർ` (human) versus `-കൾ`
|
|
14
|
+
(inanimate) is not predictable from the stem. For the same reason the accusative
|
|
15
|
+
defaults to the overt form; the zero-marked inanimate accusative needs
|
|
16
|
+
`animacy=INANIMATE`.
|
|
19
17
|
- **`u_vowel` and `a_stem` match broadly.** A malformed lemma (a samvruthokaram half-u
|
|
20
18
|
written as a bare `-ു` instead of `-്`, or truncated text) is inflected rather than
|
|
21
19
|
rejected, so callers must pass well-formed lemmas.
|
|
22
|
-
- Without an `animacy` argument the accusative defaults to the overt form; the zero-marked
|
|
23
|
-
inanimate accusative needs `animacy=INANIMATE`.
|
|
24
20
|
|
|
25
21
|
## Out of scope
|
|
26
22
|
|
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: mlsynth
|
|
3
|
-
Version: 0.0.2
|
|
4
|
-
Summary:
|
|
3
|
+
Version: 0.0.4
|
|
4
|
+
Summary: Rule-based Malayalam morphological synthesizer (noun inflection generation).
|
|
5
5
|
Project-URL: Homepage, https://github.com/jayashankarvr/mlsynth
|
|
6
6
|
Project-URL: Source, https://github.com/jayashankarvr/mlsynth
|
|
7
7
|
Project-URL: Issues, https://github.com/jayashankarvr/mlsynth/issues
|
|
8
|
-
Author-email: Jayashankar R <
|
|
8
|
+
Author-email: Jayashankar R <56070307+jayashankarvr@users.noreply.github.com>
|
|
9
9
|
License-Expression: Apache-2.0
|
|
10
10
|
License-File: LICENSE
|
|
11
11
|
License-File: NOTICE
|
|
@@ -23,7 +23,7 @@ Description-Content-Type: text/markdown
|
|
|
23
23
|
|
|
24
24
|
# mlsynth
|
|
25
25
|
|
|
26
|
-
A
|
|
26
|
+
A rule-based Malayalam morphological synthesizer. It does forward
|
|
27
27
|
morphological generation: given a root and grammatical features, it produces the
|
|
28
28
|
inflected surface form (the counterpart to morphological analysis/segmentation).
|
|
29
29
|
|
|
@@ -66,16 +66,18 @@ outside the supported classes raise rather than guess. Five classes (`am_neuter`
|
|
|
66
66
|
`vowel_anuswara` കലാം, `i_vowel`
|
|
67
67
|
കുട്ടി/സ്ത്രീ, `u_vowel` പശു, `ṭ_geminate` വീട്) are complete in singular and plural;
|
|
68
68
|
`a_stem` (അമ്മ) and the chillu classes (`അവൻ`, `മകൾ`, `കാർ`, `കാൽ`, `തൂൺ`) are
|
|
69
|
-
singular-complete
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
69
|
+
singular-complete; their plurals are animacy-conditioned across the full paradigm
|
|
70
|
+
(inanimate `-കൾ`/`-ഉകൾ`, human `-മാർ`/`-ന്മാർ`/`-കാർ`, animate `-കൾ`). Suppletive personal
|
|
71
|
+
pronouns (ഞാൻ, നീ, അവർ, നാം, താൻ, ഇവൻ) are handled through an exception table rather than
|
|
72
|
+
the rule engine. A `derive_feminine` helper builds a feminine lemma from a masculine base
|
|
73
|
+
(എഴുത്തുകാരൻ → എഴുത്തുകാരി) before inflection. Includes differential object marking and a
|
|
74
|
+
synthetic/colloquial register for the instrumental. See [`LIMITATIONS.md`](LIMITATIONS.md)
|
|
75
|
+
for the precise gaps. Clitics/postpositions, stylistic variants, and verbs are future work.
|
|
74
76
|
|
|
75
77
|
## Install
|
|
76
78
|
|
|
77
79
|
```bash
|
|
78
|
-
pip install mlsynth
|
|
80
|
+
pip install mlsynth
|
|
79
81
|
# from source:
|
|
80
82
|
pip install -e ".[dev]"
|
|
81
83
|
```
|
|
@@ -85,6 +87,4 @@ pip install -e ".[dev]"
|
|
|
85
87
|
Apache-2.0. See `LICENSE` and `NOTICE`. Contributions are accepted under Apache-2.0
|
|
86
88
|
§5 (inbound = outbound); no separate CLA is required.
|
|
87
89
|
|
|
88
|
-
|
|
89
|
-
text, tables, code, or data is reproduced. Sources are credited in `REFERENCES.md` as
|
|
90
|
-
scholarship; that implies no endorsement and creates no license obligation.
|
|
90
|
+
Linguistic sources are credited in `REFERENCES.md`.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# mlsynth
|
|
2
2
|
|
|
3
|
-
A
|
|
3
|
+
A rule-based Malayalam morphological synthesizer. It does forward
|
|
4
4
|
morphological generation: given a root and grammatical features, it produces the
|
|
5
5
|
inflected surface form (the counterpart to morphological analysis/segmentation).
|
|
6
6
|
|
|
@@ -43,16 +43,18 @@ outside the supported classes raise rather than guess. Five classes (`am_neuter`
|
|
|
43
43
|
`vowel_anuswara` കലാം, `i_vowel`
|
|
44
44
|
കുട്ടി/സ്ത്രീ, `u_vowel` പശു, `ṭ_geminate` വീട്) are complete in singular and plural;
|
|
45
45
|
`a_stem` (അമ്മ) and the chillu classes (`അവൻ`, `മകൾ`, `കാർ`, `കാൽ`, `തൂൺ`) are
|
|
46
|
-
singular-complete
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
46
|
+
singular-complete; their plurals are animacy-conditioned across the full paradigm
|
|
47
|
+
(inanimate `-കൾ`/`-ഉകൾ`, human `-മാർ`/`-ന്മാർ`/`-കാർ`, animate `-കൾ`). Suppletive personal
|
|
48
|
+
pronouns (ഞാൻ, നീ, അവർ, നാം, താൻ, ഇവൻ) are handled through an exception table rather than
|
|
49
|
+
the rule engine. A `derive_feminine` helper builds a feminine lemma from a masculine base
|
|
50
|
+
(എഴുത്തുകാരൻ → എഴുത്തുകാരി) before inflection. Includes differential object marking and a
|
|
51
|
+
synthetic/colloquial register for the instrumental. See [`LIMITATIONS.md`](LIMITATIONS.md)
|
|
52
|
+
for the precise gaps. Clitics/postpositions, stylistic variants, and verbs are future work.
|
|
51
53
|
|
|
52
54
|
## Install
|
|
53
55
|
|
|
54
56
|
```bash
|
|
55
|
-
pip install mlsynth
|
|
57
|
+
pip install mlsynth
|
|
56
58
|
# from source:
|
|
57
59
|
pip install -e ".[dev]"
|
|
58
60
|
```
|
|
@@ -62,6 +64,4 @@ pip install -e ".[dev]"
|
|
|
62
64
|
Apache-2.0. See `LICENSE` and `NOTICE`. Contributions are accepted under Apache-2.0
|
|
63
65
|
§5 (inbound = outbound); no separate CLA is required.
|
|
64
66
|
|
|
65
|
-
|
|
66
|
-
text, tables, code, or data is reproduced. Sources are credited in `REFERENCES.md` as
|
|
67
|
-
scholarship; that implies no endorsement and creates no license obligation.
|
|
67
|
+
Linguistic sources are credited in `REFERENCES.md`.
|
|
@@ -4,13 +4,13 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "mlsynth"
|
|
7
|
-
version = "0.0.2"
|
|
8
|
-
description = "
|
|
7
|
+
version = "0.0.4"
|
|
8
|
+
description = "Rule-based Malayalam morphological synthesizer (noun inflection generation)."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9"
|
|
11
11
|
license = "Apache-2.0"
|
|
12
12
|
license-files = ["LICENSE", "NOTICE"]
|
|
13
|
-
authors = [{ name = "Jayashankar R", email = "
|
|
13
|
+
authors = [{ name = "Jayashankar R", email = "56070307+jayashankarvr@users.noreply.github.com" }]
|
|
14
14
|
keywords = ["malayalam", "morphology", "nlp", "inflection", "dravidian", "agglutinative", "synthesis"]
|
|
15
15
|
classifiers = [
|
|
16
16
|
"Development Status :: 3 - Alpha",
|
|
@@ -1,17 +1,18 @@
|
|
|
1
1
|
# SPDX-License-Identifier: Apache-2.0
|
|
2
2
|
# Copyright 2026 Jayashankar R
|
|
3
|
-
"""A
|
|
3
|
+
"""A rule-based Malayalam morphological synthesizer.
|
|
4
4
|
|
|
5
5
|
Forward morphological generation (root + features -> inflected surface form),
|
|
6
6
|
starting with noun inflection. Rules are declarative and provenance-tagged; see
|
|
7
7
|
REFERENCES.md for the linguistic sources and the project's native-ratification
|
|
8
8
|
workflow.
|
|
9
9
|
"""
|
|
10
|
+
from .gender import UnsupportedDerivation, derive_feminine
|
|
10
11
|
from .nouns import UnsupportedRoot, list_supported_classes, synthesize_noun
|
|
11
12
|
from .pronouns import is_pronoun, list_pronouns
|
|
12
13
|
from .types import Animacy, Case, Gender, NounFeatures, Number, Register, SynthResult
|
|
13
14
|
|
|
14
|
-
__version__ = "0.0.2"
|
|
15
|
+
__version__ = "0.0.4"
|
|
15
16
|
__all__ = [
|
|
16
17
|
"Animacy",
|
|
17
18
|
"Case",
|
|
@@ -24,6 +25,8 @@ __all__ = [
|
|
|
24
25
|
"list_supported_classes",
|
|
25
26
|
"is_pronoun",
|
|
26
27
|
"list_pronouns",
|
|
28
|
+
"derive_feminine",
|
|
29
|
+
"UnsupportedDerivation",
|
|
27
30
|
"UnsupportedRoot",
|
|
28
31
|
"__version__",
|
|
29
32
|
]
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2026 Jayashankar R
|
|
3
|
+
"""Gender derivation: build a feminine stem from a masculine one (pre-inflection).
|
|
4
|
+
|
|
5
|
+
Malayalam marks gender on animate nouns only, and a feminine noun is *derived* from its
|
|
6
|
+
masculine base before any pluralization or case inflection. Two ratified patterns:
|
|
7
|
+
* a masculine ending in the chillu -ൻ takes -ഇ (എഴുത്തുകാരൻ -> എഴുത്തുകാരി);
|
|
8
|
+
* a masculine ending in -ഇ takes -ഇനി (വിദ്യാർത്ഥി -> വിദ്യാർത്ഥിനി).
|
|
9
|
+
|
|
10
|
+
``derive_feminine`` returns a plain root string, ready to feed back into
|
|
11
|
+
``synthesize_noun``. It is deliberately a separate step: derivation is lexical (it builds
|
|
12
|
+
a new lemma), inflection is grammatical. A base that fits neither pattern raises rather
|
|
13
|
+
than guess.
|
|
14
|
+
"""
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class UnsupportedDerivation(ValueError):
|
|
19
|
+
"""The masculine base fits no known feminine-derivation pattern."""
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def derive_feminine(masculine: str) -> str:
|
|
23
|
+
"""Derive the feminine lemma from a masculine animate base.
|
|
24
|
+
|
|
25
|
+
``-ൻ`` -> ``-ഇ`` (എഴുത്തുകാരൻ -> എഴുത്തുകാരി); ``-ഇ`` -> ``-ഇനി``
|
|
26
|
+
(വിദ്യാർത്ഥി -> വിദ്യാർത്ഥിനി). The caller is responsible for passing a genuinely
|
|
27
|
+
animate masculine noun; an unrecognised ending raises :class:`UnsupportedDerivation`.
|
|
28
|
+
"""
|
|
29
|
+
if masculine.endswith("ൻ"):
|
|
30
|
+
stem = masculine[:-1] # drop the chillu; the bare consonant takes -ഇ
|
|
31
|
+
if stem and "ക" <= stem[-1] <= "ഺ":
|
|
32
|
+
return stem + "ി"
|
|
33
|
+
elif masculine.endswith("ി"):
|
|
34
|
+
if len(masculine) > 1 and "ക" <= masculine[-2] <= "ഺ": # -ഇ on a real consonant
|
|
35
|
+
return masculine + "നി" # -ഇ -> -ഇനി
|
|
36
|
+
raise UnsupportedDerivation(
|
|
37
|
+
f"cannot derive a feminine form from {masculine!r}; expected a masculine base "
|
|
38
|
+
f"ending in -ൻ (-> -ഇ) or -ഇ (-> -ഇനി)"
|
|
39
|
+
)
|
|
@@ -18,7 +18,7 @@ from __future__ import annotations
|
|
|
18
18
|
from typing import List, Optional
|
|
19
19
|
|
|
20
20
|
from .pronouns import PRONOUN_FORMS
|
|
21
|
-
from .rules import CLASSES, NounClass
|
|
21
|
+
from .rules import CLASSES, NounClass, _plural_cases
|
|
22
22
|
from .types import Animacy, Case, Gender, NounFeatures, Number, Register, SynthResult
|
|
23
23
|
|
|
24
24
|
# Chillu -> base consonant (a chillu reverts before a following vowel sign).
|
|
@@ -26,6 +26,41 @@ _CHILLU_BASE = {
|
|
|
26
26
|
"ൺ": "ണ", "ൻ": "ന", "ർ": "ര", "ൽ": "ല", "ൾ": "ള", "ൿ": "ക",
|
|
27
27
|
}
|
|
28
28
|
|
|
29
|
+
# Shared oblique plural suffixes for a plural stem ending in a chillu (ൾ for inanimate
|
|
30
|
+
# -കൾ/-ഉകൾ and the irregular മക്കൾ; ർ for the human -മാർ/-ന്മാർ/-കാർ). _attach reverts the
|
|
31
|
+
# chillu before a vowel suffix, so the same paradigm serves every animacy.
|
|
32
|
+
_PLURAL_OBLIQUE = _plural_cases("native-2026", True)
|
|
33
|
+
|
|
34
|
+
# Suppletive / kinship plurals: root -> plural nominative, checked before the regular
|
|
35
|
+
# animacy rules. Only the plural nominative is stored; oblique cases are built from it via _PLURAL_OBLIQUE.
|
|
36
|
+
_IRREGULAR_PLURALS = {
|
|
37
|
+
"മകൾ": "മക്കൾ", # daughter -> children
|
|
38
|
+
"മകൻ": "മക്കൾ", # son -> children
|
|
39
|
+
"അവൻ": "അവർ", # he -> they (-അൻ -> -അർ, lexical)
|
|
40
|
+
"മനുഷ്യൻ": "മനുഷ്യർ", # man -> people (-അൻ -> -അർ, lexical; not *മനുഷ്യന്മാർ)
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _inanimate_plural_stem(cls: NounClass, root: str) -> str:
|
|
45
|
+
"""Inanimate plural: a chillu drops to its *lexical* base consonant via the class
|
|
46
|
+
oblique + -ഉകൾ (കാർ -> കാറുകൾ, with the retroflex റ, not the dental ര of -മാർ); a
|
|
47
|
+
bare-consonant (-അ) stem just takes -കൾ (പുഴ -> പുഴകൾ)."""
|
|
48
|
+
if root and root[-1] in _CHILLU_BASE:
|
|
49
|
+
return cls.oblique.apply(root) + "ുകൾ"
|
|
50
|
+
return root + "കൾ"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _human_plural_stem(root: str) -> str:
|
|
54
|
+
"""Human plural (native spec, agentive-aware): the agentive -കാരൻ takes -കാർ
|
|
55
|
+
(എഴുത്തുകാരൻ -> എഴുത്തുകാർ); any other -ൻ takes -ന്മാർ (അധ്യാപകൻ -> അധ്യാപകന്മാർ);
|
|
56
|
+
everything else takes -മാർ (അമ്മ -> അമ്മമാർ, ഡോക്ടർ -> ഡോക്ടർമാർ). Lexical -ർ
|
|
57
|
+
exceptions (മനുഷ്യൻ -> മനുഷ്യർ) belong in the irregular-override table."""
|
|
58
|
+
if root.endswith("കാരൻ"):
|
|
59
|
+
return root[:-2] + "ർ"
|
|
60
|
+
if root.endswith("ൻ"):
|
|
61
|
+
return root[:-1] + "ന്മാർ"
|
|
62
|
+
return root + "മാർ"
|
|
63
|
+
|
|
29
64
|
|
|
30
65
|
def _is_vowel_sign(ch: str) -> bool:
|
|
31
66
|
o = ord(ch)
|
|
@@ -173,6 +208,41 @@ def synthesize_noun(
|
|
|
173
208
|
stem_class=cls.name, provenance=provenance, verified=verified, analytic=analytic,
|
|
174
209
|
)
|
|
175
210
|
|
|
211
|
+
# Animacy-conditioned plurals (a_stem + chillu), computed here rather than table-driven
|
|
212
|
+
# because the marker depends on animacy. Every plural stem ends in a chillu (ർ for
|
|
213
|
+
# -മാർ/-ന്മാർ/-കാർ/അവർ/മനുഷ്യർ, ൾ for -കൾ/-ഉകൾ/മക്കൾ), and _attach reverts it before a
|
|
214
|
+
# vowel suffix (ർ -> ര: അമ്മമാർ + ുടെ -> അമ്മമാരുടെ; ൾ -> ള: മക്കൾ + ുടെ -> മക്കളുടെ), so
|
|
215
|
+
# one ratified oblique paradigm serves all three animacies. Animacy must be given:
|
|
216
|
+
# without it -മാർ vs -കൾ is unpredictable, so we raise rather than guess. Non-human
|
|
217
|
+
# animates pluralize like inanimates (-കൾ) but keep the overt accusative.
|
|
218
|
+
if number is Number.PLURAL and cls.plural_requires_animacy:
|
|
219
|
+
if root in _IRREGULAR_PLURALS:
|
|
220
|
+
plural_nom = _IRREGULAR_PLURALS[root]
|
|
221
|
+
elif animacy in (Animacy.INANIMATE, Animacy.ANIMATE):
|
|
222
|
+
plural_nom = _inanimate_plural_stem(cls, root)
|
|
223
|
+
elif animacy is Animacy.HUMAN:
|
|
224
|
+
plural_nom = _human_plural_stem(root)
|
|
225
|
+
else:
|
|
226
|
+
raise NotImplementedError(
|
|
227
|
+
f"plural of class {cls.name!r} requires an explicit animacy "
|
|
228
|
+
f"(HUMAN, ANIMATE, or INANIMATE); got {animacy!r}"
|
|
229
|
+
)
|
|
230
|
+
if case is Case.INSTRUMENTAL and register is Register.COLLOQUIAL:
|
|
231
|
+
return _result(plural_nom + " കൊണ്ട്", [plural_nom, "കൊണ്ട്"],
|
|
232
|
+
"native-2026", True, analytic=True)
|
|
233
|
+
if case is Case.NOMINATIVE:
|
|
234
|
+
return _result(plural_nom, [plural_nom], "native-2026", True)
|
|
235
|
+
# DOM zero-marks the inanimate accusative only. Irregulars are human/animate (their
|
|
236
|
+
# override stem is checked before animacy), so they keep the overt accusative even
|
|
237
|
+
# if a contradictory animacy=INANIMATE is passed.
|
|
238
|
+
if (case is Case.ACCUSATIVE and animacy is Animacy.INANIMATE
|
|
239
|
+
and not marked and root not in _IRREGULAR_PLURALS):
|
|
240
|
+
return _result(plural_nom, [plural_nom], "native-2026", True)
|
|
241
|
+
suffix = _PLURAL_OBLIQUE[case].suffix
|
|
242
|
+
surface = _attach(plural_nom, suffix)
|
|
243
|
+
morphemes = [plural_nom, suffix] if suffix else [plural_nom]
|
|
244
|
+
return _result(surface, morphemes, "native-2026", True)
|
|
245
|
+
|
|
176
246
|
table = cls.plural_cases if number is Number.PLURAL else cls.cases
|
|
177
247
|
try:
|
|
178
248
|
rule = table[case]
|
|
@@ -52,6 +52,9 @@ class NounClass:
|
|
|
52
52
|
plural_cases: Dict[Case, CaseRule]
|
|
53
53
|
pre: str = "" # "" | "consonant" | "vowel" (what precedes the ending)
|
|
54
54
|
ends_consonant: bool = False # match roots ending in a bare consonant (-അ stems)
|
|
55
|
+
# When True the plural is animacy-conditioned (-മാർ human vs -കൾ inanimate), so it is
|
|
56
|
+
# computed in nouns.py rather than read from plural_marker/plural_cases (a_stem + chillu).
|
|
57
|
+
plural_requires_animacy: bool = False
|
|
55
58
|
|
|
56
59
|
|
|
57
60
|
_GEMINATE_T = StemTransform("ട്", "ട്ട") # ട് -> ട്ട for the ṭ_geminate spatial cases
|
|
@@ -212,11 +215,13 @@ CLASSES: Dict[str, NounClass] = {
|
|
|
212
215
|
},
|
|
213
216
|
plural_marker=StemTransform("", ""),
|
|
214
217
|
plural_cases={},
|
|
218
|
+
plural_requires_animacy=True,
|
|
215
219
|
),
|
|
216
220
|
# Chillu-final classes. The chillu reverts to its base consonant for vowel suffixes;
|
|
217
221
|
# each chillu takes a different suffix set (native worksheet).
|
|
218
222
|
# ൻ/ൾ take -ഓട് sociative and -ആൽ instrumental (on the reverted stem);
|
|
219
|
-
# ർ/ൽ/ൺ take the -ഇന- linker (sociative -ഇനോട്, instrumental -ഇനാൽ). Plurals
|
|
223
|
+
# ർ/ൽ/ൺ take the -ഇന- linker (sociative -ഇനോട്, instrumental -ഇനാൽ). Plurals are
|
|
224
|
+
# animacy-conditioned and computed in nouns.py (plural_requires_animacy).
|
|
220
225
|
"chillu_n": NounClass(
|
|
221
226
|
name="chillu_n",
|
|
222
227
|
description="-ൻ final nouns (e.g. അവൻ, നടൻ).",
|
|
@@ -237,6 +242,7 @@ CLASSES: Dict[str, NounClass] = {
|
|
|
237
242
|
},
|
|
238
243
|
plural_marker=StemTransform("", ""),
|
|
239
244
|
plural_cases={},
|
|
245
|
+
plural_requires_animacy=True,
|
|
240
246
|
),
|
|
241
247
|
"chillu_ll": NounClass(
|
|
242
248
|
name="chillu_ll",
|
|
@@ -258,6 +264,7 @@ CLASSES: Dict[str, NounClass] = {
|
|
|
258
264
|
},
|
|
259
265
|
plural_marker=StemTransform("", ""),
|
|
260
266
|
plural_cases={},
|
|
267
|
+
plural_requires_animacy=True,
|
|
261
268
|
),
|
|
262
269
|
"chillu_r": NounClass(
|
|
263
270
|
name="chillu_r",
|
|
@@ -279,6 +286,7 @@ CLASSES: Dict[str, NounClass] = {
|
|
|
279
286
|
},
|
|
280
287
|
plural_marker=StemTransform("", ""),
|
|
281
288
|
plural_cases={},
|
|
289
|
+
plural_requires_animacy=True,
|
|
282
290
|
),
|
|
283
291
|
"chillu_l": NounClass(
|
|
284
292
|
name="chillu_l",
|
|
@@ -300,6 +308,7 @@ CLASSES: Dict[str, NounClass] = {
|
|
|
300
308
|
},
|
|
301
309
|
plural_marker=StemTransform("", ""),
|
|
302
310
|
plural_cases={},
|
|
311
|
+
plural_requires_animacy=True,
|
|
303
312
|
),
|
|
304
313
|
"chillu_nn": NounClass(
|
|
305
314
|
name="chillu_nn",
|
|
@@ -321,5 +330,6 @@ CLASSES: Dict[str, NounClass] = {
|
|
|
321
330
|
},
|
|
322
331
|
plural_marker=StemTransform("", ""),
|
|
323
332
|
plural_cases={},
|
|
333
|
+
plural_requires_animacy=True,
|
|
324
334
|
),
|
|
325
335
|
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2026 Jayashankar R
|
|
3
|
+
"""Feminine derivation (pre-inflection) and its composition with case inflection."""
|
|
4
|
+
import pytest
|
|
5
|
+
|
|
6
|
+
from mlsynth import (
|
|
7
|
+
Case,
|
|
8
|
+
UnsupportedDerivation,
|
|
9
|
+
derive_feminine,
|
|
10
|
+
synthesize_noun,
|
|
11
|
+
)
|
|
12
|
+
|
|
13
|
+
# (masculine base, feminine derivation)
|
|
14
|
+
PAIRS = [
|
|
15
|
+
("എഴുത്തുകാരൻ", "എഴുത്തുകാരി"), # -ൻ -> -ഇ (agentive writer)
|
|
16
|
+
("വിദ്യാർത്ഥി", "വിദ്യാർത്ഥിനി"), # -ഇ -> -ഇനി (student)
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@pytest.mark.parametrize("masc, fem", PAIRS)
|
|
21
|
+
def test_derive_feminine(masc, fem):
|
|
22
|
+
assert derive_feminine(masc) == fem
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def test_derive_then_inflect():
|
|
26
|
+
# Derivation is pre-inflection: the feminine lemma feeds straight back in and
|
|
27
|
+
# inflects as an ordinary -ഇ (i_vowel) stem.
|
|
28
|
+
fem = derive_feminine("എഴുത്തുകാരൻ") # എഴുത്തുകാരി
|
|
29
|
+
assert synthesize_noun(fem, Case.GENITIVE).surface == "എഴുത്തുകാരിയുടെ"
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@pytest.mark.parametrize("bad", ["മരം", "പശു", "വീട്", "അമ്മ", "ൻ", "ി"])
|
|
33
|
+
def test_underivable_base_raises(bad):
|
|
34
|
+
# A bare chillu ("ൻ") or a bare matra ("ി") must raise, not emit a lone-matra non-word.
|
|
35
|
+
with pytest.raises(UnsupportedDerivation):
|
|
36
|
+
derive_feminine(bad)
|
|
@@ -181,14 +181,13 @@ def test_explicit_mismatched_class_raises():
|
|
|
181
181
|
synthesize_noun(AM, Case.ACCUSATIVE, stem_class="i_vowel")
|
|
182
182
|
|
|
183
183
|
|
|
184
|
-
def
|
|
184
|
+
def test_a_stem_chillu_plural_needs_animacy():
|
|
185
|
+
# a_stem/chillu plurals are animacy-conditioned; without animacy they raise rather
|
|
186
|
+
# than guess (-മാർ vs -കൾ). Full coverage lives in test_plurals.py.
|
|
185
187
|
with pytest.raises(NotImplementedError):
|
|
186
|
-
synthesize_noun("അമ്മ", Case.GENITIVE, number=Number.PLURAL)
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
def test_unencoded_plural_raises():
|
|
188
|
+
synthesize_noun("അമ്മ", Case.GENITIVE, number=Number.PLURAL)
|
|
190
189
|
with pytest.raises(NotImplementedError):
|
|
191
|
-
synthesize_noun("
|
|
190
|
+
synthesize_noun("കാർ", Case.GENITIVE, number=Number.PLURAL)
|
|
192
191
|
|
|
193
192
|
|
|
194
193
|
@pytest.mark.parametrize("root,case,expected", [
|
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2026 Jayashankar R
|
|
3
|
+
"""Animacy-conditioned plurals for a_stem + chillu classes (0.0.3).
|
|
4
|
+
|
|
5
|
+
Inanimate plurals (-കൾ / -ഉകൾ) have the full case paradigm; human plurals
|
|
6
|
+
(-മാർ / -ന്മാർ / -കാർ) and irregulars take the same oblique suffixes on their plural stem; an unknown animacy
|
|
7
|
+
raises rather than guess -മാർ vs -കൾ.
|
|
8
|
+
"""
|
|
9
|
+
import pytest
|
|
10
|
+
|
|
11
|
+
from mlsynth import Animacy, Case, Number, synthesize_noun
|
|
12
|
+
|
|
13
|
+
PL = Number.PLURAL
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _pl(root, case, **kw):
|
|
17
|
+
return synthesize_noun(root, case, number=PL, **kw)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# --- Inanimate plurals: full paradigm ---------------------------------------
|
|
21
|
+
|
|
22
|
+
# (root, case, expected) for the inanimate full paradigm.
|
|
23
|
+
INANIMATE_FORMS = [
|
|
24
|
+
("പുഴ", Case.NOMINATIVE, "പുഴകൾ"),
|
|
25
|
+
("പുഴ", Case.GENITIVE, "പുഴകളുടെ"),
|
|
26
|
+
("പുഴ", Case.DATIVE, "പുഴകൾക്ക്"),
|
|
27
|
+
("പുഴ", Case.LOCATIVE, "പുഴകളിൽ"),
|
|
28
|
+
("പുഴ", Case.SOCIATIVE, "പുഴകളോട്"),
|
|
29
|
+
# chillu: the lexical base is restored (retroflex റ, not the dental ര of -മാർ).
|
|
30
|
+
("കാർ", Case.NOMINATIVE, "കാറുകൾ"),
|
|
31
|
+
("കാർ", Case.GENITIVE, "കാറുകളുടെ"),
|
|
32
|
+
("കാൽ", Case.NOMINATIVE, "കാലുകൾ"),
|
|
33
|
+
("കാൽ", Case.GENITIVE, "കാലുകളുടെ"),
|
|
34
|
+
("തൂൺ", Case.NOMINATIVE, "തൂണുകൾ"),
|
|
35
|
+
("തൂൺ", Case.GENITIVE, "തൂണുകളുടെ"),
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@pytest.mark.parametrize("root, case, expected", INANIMATE_FORMS)
|
|
40
|
+
def test_inanimate_plural_paradigm(root, case, expected):
|
|
41
|
+
r = _pl(root, case, animacy=Animacy.INANIMATE)
|
|
42
|
+
assert r.surface == expected
|
|
43
|
+
assert r.verified is True
|
|
44
|
+
assert r.provenance == "native-2026"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def test_inanimate_plural_accusative_is_dom_bare():
|
|
48
|
+
# DOM: the inanimate accusative defaults to the bare plural nominative.
|
|
49
|
+
assert _pl("പുഴ", Case.ACCUSATIVE, animacy=Animacy.INANIMATE).surface == "പുഴകൾ"
|
|
50
|
+
assert _pl("കാർ", Case.ACCUSATIVE, animacy=Animacy.INANIMATE).surface == "കാറുകൾ"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def test_inanimate_plural_accusative_marked_is_overt():
|
|
54
|
+
assert _pl(
|
|
55
|
+
"പുഴ", Case.ACCUSATIVE, animacy=Animacy.INANIMATE, marked=True
|
|
56
|
+
).surface == "പുഴകളെ"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
# --- Human plurals: nominative, then the oblique paradigm --------------------
|
|
60
|
+
|
|
61
|
+
# (root, expected nominative plural) per the agentive-aware rule.
|
|
62
|
+
HUMAN_NOMINATIVES = [
|
|
63
|
+
("അമ്മ", "അമ്മമാർ"), # -അ stem -> -മാർ
|
|
64
|
+
("അധ്യാപകൻ", "അധ്യാപകന്മാർ"), # -ൻ -> -ന്മാർ
|
|
65
|
+
("എഴുത്തുകാരൻ", "എഴുത്തുകാർ"), # agentive -കാരൻ -> -കാർ
|
|
66
|
+
("ഡോക്ടർ", "ഡോക്ടർമാർ"), # -ർ human -> -മാർ
|
|
67
|
+
]
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
@pytest.mark.parametrize("root, expected", HUMAN_NOMINATIVES)
|
|
71
|
+
def test_human_plural_nominative(root, expected):
|
|
72
|
+
r = _pl(root, Case.NOMINATIVE, animacy=Animacy.HUMAN)
|
|
73
|
+
assert r.surface == expected
|
|
74
|
+
assert r.verified is True
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
# The full human-plural oblique paradigm (ർ -> ര before a vowel suffix), native-ratified.
|
|
78
|
+
HUMAN_OBLIQUE = [
|
|
79
|
+
(Case.ACCUSATIVE, "അമ്മമാരെ"),
|
|
80
|
+
(Case.GENITIVE, "അമ്മമാരുടെ"),
|
|
81
|
+
(Case.DATIVE, "അമ്മമാർക്ക്"), # consonant suffix: chillu stays intact
|
|
82
|
+
(Case.LOCATIVE, "അമ്മമാരിൽ"),
|
|
83
|
+
(Case.SOCIATIVE, "അമ്മമാരോട്"),
|
|
84
|
+
(Case.INSTRUMENTAL, "അമ്മമാരാൽ"),
|
|
85
|
+
(Case.VOCATIVE, "അമ്മമാരേ"),
|
|
86
|
+
]
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
@pytest.mark.parametrize("case, expected", HUMAN_OBLIQUE)
|
|
90
|
+
def test_human_plural_oblique(case, expected):
|
|
91
|
+
r = _pl("അമ്മ", case, animacy=Animacy.HUMAN)
|
|
92
|
+
assert r.surface == expected
|
|
93
|
+
assert r.verified is True
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def test_human_plural_accusative_is_overt():
|
|
97
|
+
# Humans take the overt accusative (DOM zero-marks inanimates only).
|
|
98
|
+
assert _pl("അമ്മ", Case.ACCUSATIVE, animacy=Animacy.HUMAN).surface == "അമ്മമാരെ"
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
# --- Irregular / suppletive plurals (no animacy needed) ----------------------
|
|
102
|
+
|
|
103
|
+
IRREGULARS = [
|
|
104
|
+
("മകൾ", "മക്കൾ"),
|
|
105
|
+
("മകൻ", "മക്കൾ"),
|
|
106
|
+
("അവൻ", "അവർ"),
|
|
107
|
+
("മനുഷ്യൻ", "മനുഷ്യർ"), # -അൻ -> -അർ lexical; not the *മനുഷ്യന്മാർ default
|
|
108
|
+
]
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@pytest.mark.parametrize("root, expected", IRREGULARS)
|
|
112
|
+
def test_irregular_plural(root, expected):
|
|
113
|
+
r = _pl(root, Case.NOMINATIVE) # irregulars are unambiguous; no animacy required
|
|
114
|
+
assert r.surface == expected
|
|
115
|
+
assert r.verified is True
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# Irregular oblique cases: ർ-final reverts to ര (അവർ), ൾ-final to ള (മക്കൾ).
|
|
119
|
+
IRREGULAR_OBLIQUE = [
|
|
120
|
+
("അവൻ", Case.ACCUSATIVE, "അവരെ"),
|
|
121
|
+
("അവൻ", Case.GENITIVE, "അവരുടെ"),
|
|
122
|
+
("അവൻ", Case.DATIVE, "അവർക്ക്"),
|
|
123
|
+
("മനുഷ്യൻ", Case.ACCUSATIVE, "മനുഷ്യരെ"),
|
|
124
|
+
("മനുഷ്യൻ", Case.GENITIVE, "മനുഷ്യരുടെ"),
|
|
125
|
+
("മകൾ", Case.GENITIVE, "മക്കളുടെ"),
|
|
126
|
+
("മകൾ", Case.DATIVE, "മക്കൾക്ക്"),
|
|
127
|
+
("മകൾ", Case.LOCATIVE, "മക്കളിൽ"),
|
|
128
|
+
]
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
@pytest.mark.parametrize("root, case, expected", IRREGULAR_OBLIQUE)
|
|
132
|
+
def test_irregular_plural_oblique(root, case, expected):
|
|
133
|
+
r = _pl(root, case)
|
|
134
|
+
assert r.surface == expected
|
|
135
|
+
assert r.verified is True
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def test_irregular_accusative_stays_overt_under_contradictory_animacy():
|
|
139
|
+
# Irregulars are human/animate; a contradictory animacy=INANIMATE must NOT DOM
|
|
140
|
+
# zero-mark them (അവർ -> overt അവരെ, never the bare അവർ).
|
|
141
|
+
assert _pl("അവൻ", Case.ACCUSATIVE, animacy=Animacy.INANIMATE).surface == "അവരെ"
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
# --- ANIMATE (non-human) plurals: -കൾ like inanimate, but overt accusative ---
|
|
145
|
+
|
|
146
|
+
ANIMATE_FORMS = [
|
|
147
|
+
(Case.NOMINATIVE, "ആനകൾ"),
|
|
148
|
+
(Case.ACCUSATIVE, "ആനകളെ"), # overt: DOM zero-marks inanimates only
|
|
149
|
+
(Case.GENITIVE, "ആനകളുടെ"),
|
|
150
|
+
(Case.DATIVE, "ആനകൾക്ക്"),
|
|
151
|
+
(Case.SOCIATIVE, "ആനകളോട്"),
|
|
152
|
+
]
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
@pytest.mark.parametrize("case, expected", ANIMATE_FORMS)
|
|
156
|
+
def test_animate_nonhuman_plural(case, expected):
|
|
157
|
+
r = _pl("ആന", case, animacy=Animacy.ANIMATE)
|
|
158
|
+
assert r.surface == expected
|
|
159
|
+
assert r.verified is True
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
# --- Soundness: animacy is still required -----------------------------------
|
|
163
|
+
|
|
164
|
+
@pytest.mark.parametrize("root", ["അമ്മ", "കാർ", "അധ്യാപകൻ", "തൂൺ"])
|
|
165
|
+
def test_plural_without_animacy_raises(root):
|
|
166
|
+
with pytest.raises(NotImplementedError):
|
|
167
|
+
_pl(root, Case.NOMINATIVE)
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def test_done_classes_plurals_unaffected():
|
|
171
|
+
# The five already-complete classes keep their animacy-blind plurals.
|
|
172
|
+
assert synthesize_noun("മരം", Case.NOMINATIVE, number=PL).surface == "മരങ്ങൾ"
|
|
173
|
+
assert synthesize_noun("കുട്ടി", Case.NOMINATIVE, number=PL).surface == "കുട്ടികൾ"
|
|
174
|
+
assert synthesize_noun("വീട്", Case.GENITIVE, number=PL).surface == "വീടുകളുടെ"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|