bead-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bead/__init__.py +11 -0
- bead/__main__.py +11 -0
- bead/active_learning/__init__.py +15 -0
- bead/active_learning/config.py +231 -0
- bead/active_learning/loop.py +566 -0
- bead/active_learning/models/__init__.py +24 -0
- bead/active_learning/models/base.py +852 -0
- bead/active_learning/models/binary.py +910 -0
- bead/active_learning/models/categorical.py +943 -0
- bead/active_learning/models/cloze.py +862 -0
- bead/active_learning/models/forced_choice.py +956 -0
- bead/active_learning/models/free_text.py +773 -0
- bead/active_learning/models/lora.py +365 -0
- bead/active_learning/models/magnitude.py +835 -0
- bead/active_learning/models/multi_select.py +795 -0
- bead/active_learning/models/ordinal_scale.py +811 -0
- bead/active_learning/models/peft_adapter.py +155 -0
- bead/active_learning/models/random_effects.py +639 -0
- bead/active_learning/selection.py +354 -0
- bead/active_learning/strategies.py +391 -0
- bead/active_learning/trainers/__init__.py +26 -0
- bead/active_learning/trainers/base.py +210 -0
- bead/active_learning/trainers/data_collator.py +172 -0
- bead/active_learning/trainers/dataset_utils.py +261 -0
- bead/active_learning/trainers/huggingface.py +304 -0
- bead/active_learning/trainers/lightning.py +324 -0
- bead/active_learning/trainers/metrics.py +424 -0
- bead/active_learning/trainers/mixed_effects.py +551 -0
- bead/active_learning/trainers/model_wrapper.py +509 -0
- bead/active_learning/trainers/registry.py +104 -0
- bead/adapters/__init__.py +11 -0
- bead/adapters/huggingface.py +61 -0
- bead/behavioral/__init__.py +116 -0
- bead/behavioral/analytics.py +646 -0
- bead/behavioral/extraction.py +343 -0
- bead/behavioral/merging.py +343 -0
- bead/cli/__init__.py +11 -0
- bead/cli/active_learning.py +513 -0
- bead/cli/active_learning_commands.py +779 -0
- bead/cli/completion.py +359 -0
- bead/cli/config.py +624 -0
- bead/cli/constraint_builders.py +286 -0
- bead/cli/deployment.py +859 -0
- bead/cli/deployment_trials.py +493 -0
- bead/cli/deployment_ui.py +332 -0
- bead/cli/display.py +378 -0
- bead/cli/items.py +960 -0
- bead/cli/items_factories.py +776 -0
- bead/cli/list_constraints.py +714 -0
- bead/cli/lists.py +490 -0
- bead/cli/main.py +430 -0
- bead/cli/models.py +877 -0
- bead/cli/resource_loaders.py +621 -0
- bead/cli/resources.py +1036 -0
- bead/cli/shell.py +356 -0
- bead/cli/simulate.py +840 -0
- bead/cli/templates.py +1158 -0
- bead/cli/training.py +1080 -0
- bead/cli/utils.py +614 -0
- bead/cli/workflow.py +1273 -0
- bead/config/__init__.py +68 -0
- bead/config/active_learning.py +1009 -0
- bead/config/config.py +192 -0
- bead/config/defaults.py +118 -0
- bead/config/deployment.py +217 -0
- bead/config/env.py +147 -0
- bead/config/item.py +45 -0
- bead/config/list.py +193 -0
- bead/config/loader.py +149 -0
- bead/config/logging.py +42 -0
- bead/config/model.py +49 -0
- bead/config/paths.py +46 -0
- bead/config/profiles.py +320 -0
- bead/config/resources.py +47 -0
- bead/config/serialization.py +210 -0
- bead/config/simulation.py +206 -0
- bead/config/template.py +238 -0
- bead/config/validation.py +267 -0
- bead/data/__init__.py +65 -0
- bead/data/base.py +87 -0
- bead/data/identifiers.py +97 -0
- bead/data/language_codes.py +61 -0
- bead/data/metadata.py +270 -0
- bead/data/range.py +123 -0
- bead/data/repository.py +358 -0
- bead/data/serialization.py +249 -0
- bead/data/timestamps.py +89 -0
- bead/data/validation.py +349 -0
- bead/data_collection/__init__.py +11 -0
- bead/data_collection/jatos.py +223 -0
- bead/data_collection/merger.py +154 -0
- bead/data_collection/prolific.py +198 -0
- bead/deployment/__init__.py +5 -0
- bead/deployment/distribution.py +402 -0
- bead/deployment/jatos/__init__.py +1 -0
- bead/deployment/jatos/api.py +200 -0
- bead/deployment/jatos/exporter.py +210 -0
- bead/deployment/jspsych/__init__.py +9 -0
- bead/deployment/jspsych/biome.json +44 -0
- bead/deployment/jspsych/config.py +411 -0
- bead/deployment/jspsych/generator.py +598 -0
- bead/deployment/jspsych/package.json +51 -0
- bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
- bead/deployment/jspsych/randomizer.py +299 -0
- bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
- bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
- bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
- bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
- bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
- bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
- bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
- bead/deployment/jspsych/src/plugins/rating.ts +248 -0
- bead/deployment/jspsych/src/slopit/index.ts +9 -0
- bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
- bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
- bead/deployment/jspsych/templates/experiment.css +1 -0
- bead/deployment/jspsych/templates/experiment.js.template +289 -0
- bead/deployment/jspsych/templates/index.html +51 -0
- bead/deployment/jspsych/templates/randomizer.js +241 -0
- bead/deployment/jspsych/templates/randomizer.js.template +313 -0
- bead/deployment/jspsych/trials.py +723 -0
- bead/deployment/jspsych/tsconfig.json +23 -0
- bead/deployment/jspsych/tsup.config.ts +30 -0
- bead/deployment/jspsych/ui/__init__.py +1 -0
- bead/deployment/jspsych/ui/components.py +383 -0
- bead/deployment/jspsych/ui/styles.py +411 -0
- bead/dsl/__init__.py +80 -0
- bead/dsl/ast.py +168 -0
- bead/dsl/context.py +178 -0
- bead/dsl/errors.py +71 -0
- bead/dsl/evaluator.py +570 -0
- bead/dsl/grammar.lark +81 -0
- bead/dsl/parser.py +231 -0
- bead/dsl/stdlib.py +929 -0
- bead/evaluation/__init__.py +13 -0
- bead/evaluation/convergence.py +485 -0
- bead/evaluation/interannotator.py +398 -0
- bead/items/__init__.py +40 -0
- bead/items/adapters/__init__.py +70 -0
- bead/items/adapters/anthropic.py +224 -0
- bead/items/adapters/api_utils.py +167 -0
- bead/items/adapters/base.py +216 -0
- bead/items/adapters/google.py +259 -0
- bead/items/adapters/huggingface.py +1074 -0
- bead/items/adapters/openai.py +323 -0
- bead/items/adapters/registry.py +202 -0
- bead/items/adapters/sentence_transformers.py +224 -0
- bead/items/adapters/togetherai.py +309 -0
- bead/items/binary.py +515 -0
- bead/items/cache.py +558 -0
- bead/items/categorical.py +593 -0
- bead/items/cloze.py +757 -0
- bead/items/constructor.py +784 -0
- bead/items/forced_choice.py +413 -0
- bead/items/free_text.py +681 -0
- bead/items/generation.py +432 -0
- bead/items/item.py +396 -0
- bead/items/item_template.py +787 -0
- bead/items/magnitude.py +573 -0
- bead/items/multi_select.py +621 -0
- bead/items/ordinal_scale.py +569 -0
- bead/items/scoring.py +448 -0
- bead/items/validation.py +723 -0
- bead/lists/__init__.py +30 -0
- bead/lists/balancer.py +263 -0
- bead/lists/constraints.py +1067 -0
- bead/lists/experiment_list.py +286 -0
- bead/lists/list_collection.py +378 -0
- bead/lists/partitioner.py +1141 -0
- bead/lists/stratification.py +254 -0
- bead/participants/__init__.py +73 -0
- bead/participants/collection.py +699 -0
- bead/participants/merging.py +312 -0
- bead/participants/metadata_spec.py +491 -0
- bead/participants/models.py +276 -0
- bead/resources/__init__.py +29 -0
- bead/resources/adapters/__init__.py +19 -0
- bead/resources/adapters/base.py +104 -0
- bead/resources/adapters/cache.py +128 -0
- bead/resources/adapters/glazing.py +508 -0
- bead/resources/adapters/registry.py +117 -0
- bead/resources/adapters/unimorph.py +796 -0
- bead/resources/classification.py +856 -0
- bead/resources/constraint_builders.py +329 -0
- bead/resources/constraints.py +165 -0
- bead/resources/lexical_item.py +223 -0
- bead/resources/lexicon.py +744 -0
- bead/resources/loaders.py +209 -0
- bead/resources/template.py +441 -0
- bead/resources/template_collection.py +707 -0
- bead/resources/template_generation.py +349 -0
- bead/simulation/__init__.py +29 -0
- bead/simulation/annotators/__init__.py +15 -0
- bead/simulation/annotators/base.py +175 -0
- bead/simulation/annotators/distance_based.py +135 -0
- bead/simulation/annotators/lm_based.py +114 -0
- bead/simulation/annotators/oracle.py +182 -0
- bead/simulation/annotators/random.py +181 -0
- bead/simulation/dsl_extension/__init__.py +3 -0
- bead/simulation/noise_models/__init__.py +13 -0
- bead/simulation/noise_models/base.py +42 -0
- bead/simulation/noise_models/random_noise.py +82 -0
- bead/simulation/noise_models/systematic.py +132 -0
- bead/simulation/noise_models/temperature.py +86 -0
- bead/simulation/runner.py +144 -0
- bead/simulation/strategies/__init__.py +23 -0
- bead/simulation/strategies/base.py +123 -0
- bead/simulation/strategies/binary.py +103 -0
- bead/simulation/strategies/categorical.py +123 -0
- bead/simulation/strategies/cloze.py +224 -0
- bead/simulation/strategies/forced_choice.py +127 -0
- bead/simulation/strategies/free_text.py +105 -0
- bead/simulation/strategies/magnitude.py +116 -0
- bead/simulation/strategies/multi_select.py +129 -0
- bead/simulation/strategies/ordinal_scale.py +131 -0
- bead/templates/__init__.py +27 -0
- bead/templates/adapters/__init__.py +17 -0
- bead/templates/adapters/base.py +128 -0
- bead/templates/adapters/cache.py +178 -0
- bead/templates/adapters/huggingface.py +312 -0
- bead/templates/combinatorics.py +103 -0
- bead/templates/filler.py +605 -0
- bead/templates/renderers.py +177 -0
- bead/templates/resolver.py +178 -0
- bead/templates/strategies.py +1806 -0
- bead/templates/streaming.py +195 -0
- bead-0.1.0.dist-info/METADATA +212 -0
- bead-0.1.0.dist-info/RECORD +231 -0
- bead-0.1.0.dist-info/WHEEL +4 -0
- bead-0.1.0.dist-info/entry_points.txt +2 -0
- bead-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,796 @@
"""Adapter for UniMorph morphological paradigms.

This module provides an adapter to fetch morphological paradigms from UniMorph
data and convert them to LexicalItem format with morphological features.
"""

from __future__ import annotations

from typing import Any

import langcodes
import pandas as pd
import unimorph
from unimorph import load_dataset

from bead.data.language_codes import LanguageCode
from bead.resources.adapters.base import ResourceAdapter
from bead.resources.adapters.cache import AdapterCache
from bead.resources.lexical_item import LexicalItem


class UniMorphAdapter(ResourceAdapter):
    """Adapter for UniMorph morphological paradigms.

    This adapter fetches morphological paradigms from UniMorph and converts
    them to LexicalItem format. Morphological features are stored in the
    features field using UniMorph feature schema.

    Parameters
    ----------
    cache : AdapterCache | None
        Optional cache instance. If None, no caching is performed.

    Examples
    --------
    >>> adapter = UniMorphAdapter()
    >>> items = adapter.fetch_items(query="walk", language_code="en")
    >>> all(item.language_code == "en" for item in items)
    True
    >>> all("tense" in item.features for item in items if item.features)
    True
    """

    def __init__(self, cache: AdapterCache | None = None) -> None:
        """Initialize UniMorph adapter.

        Parameters
        ----------
        cache : AdapterCache | None
            Optional cache instance.
        """
        self.cache = cache
        self._datasets: dict[str, pd.DataFrame] = {}  # Cache datasets by language

    def fetch_items(
        self,
        query: str | None = None,
        language_code: LanguageCode = None,
        **kwargs: Any,
    ) -> list[LexicalItem]:
        """Fetch morphological paradigms from UniMorph.

        Parameters
        ----------
        query : str | None
            Lemma to query (e.g., "walk", "먹다", "hamba").
        language_code : LanguageCode
            **Required** language code (e.g., "en", "ko", "zu"). UniMorph
            is organized by language, so this parameter is essential.
        **kwargs : Any
            Additional parameters (e.g., pos="VERB").

        Returns
        -------
        list[LexicalItem]
            Lexical items representing inflected forms with morphological
            features in the features field.

        Raises
        ------
        ValueError
            If language_code is None (required for UniMorph).
        RuntimeError
            If UniMorph access fails.

        Examples
        --------
        >>> adapter = UniMorphAdapter()
        >>> items = adapter.fetch_items(query="walk", language_code="en")
        >>> len(items) > 0
        True
        >>> items[0].features.get("pos") == "VERB"
        True
        """
        if language_code is None:
            raise ValueError("UniMorphAdapter requires language_code parameter")

        # Normalize to ISO 639-3 (3-letter code) for UniMorph
        # UniMorph uses 3-letter codes (language_code is guaranteed non-None here)
        lang_code = self._normalize_language_code(language_code)

        # Check cache
        cache_key = None
        if self.cache:
            cache_key = self.cache.make_key(
                "unimorph", query=query, language_code=lang_code, **kwargs
            )
            cached = self.cache.get(cache_key)
            if cached is not None:
                return cached

        # Fetch from UniMorph
        try:
            # Load dataset for language (cached at instance level)
            if lang_code not in self._datasets:
                self._datasets[lang_code] = load_dataset(lang_code)

            dataset = self._datasets[lang_code]

            # Filter by lemma if query provided
            if query:
                dataset = dataset[dataset["lemma"] == query]

            # Convert to LexicalItem objects
            items: list[LexicalItem] = []
            for _, row in dataset.iterrows():
                # Skip rows with NaN values
                if (
                    row["lemma"] is None
                    or row["form"] is None
                    or row["features"] is None
                    or str(row["lemma"]) == "nan"
                    or str(row["form"]) == "nan"
                    or str(row["features"]) == "nan"
                ):
                    continue

                # Parse features string (e.g., "V;PRS;3;SG")
                features_dict = self._parse_features(str(row["features"]))

                item = LexicalItem(
                    lemma=str(row["lemma"]),
                    form=str(row["form"]),
                    language_code=language_code,
                    features=features_dict,
                    source="UniMorph",
                )
                items.append(item)

            # Cache result
            if self.cache and cache_key:
                self.cache.set(cache_key, items)

            return items

        except Exception as e:
            raise RuntimeError(f"Failed to fetch from UniMorph: {e}") from e

    def _normalize_language_code(self, language_code: LanguageCode) -> str:
        """Normalize language code to ISO 639-3 (3-letter) format.

        Uses the langcodes package to properly convert ISO 639-1 (2-letter) codes
        to ISO 639-3 (3-letter) codes.

        Parameters
        ----------
        language_code : LanguageCode
            Language code (2 or 3 letters, non-None).

        Returns
        -------
        str
            ISO 639-3 (3-letter) language code.

        Raises
        ------
        ValueError
            If language_code is None.
        """
        if language_code is None:
            raise ValueError(
                "language_code cannot be None when normalizing. "
                "This should be checked by the caller."
            )

        # Use langcodes package to normalize
        try:
            # If it's already 3 letters, return as-is
            if len(language_code) == 3:
                return language_code

            # For 2-letter codes, use langcodes to get the 3-letter equivalent
            lang = langcodes.Language.get(language_code)
            return lang.to_alpha3()
        except Exception:
            # If conversion fails, return as-is
            return language_code

    def _get_tag_dimension(self, tag: str) -> str:
        """Get the dimension for a UniMorph tag.

        Based on analysis of 173 languages and 575 tags from
        the actual UniMorph data.

        Parameters
        ----------
        tag : str
            UniMorph feature tag.

        Returns
        -------
        str
            Dimension name, or "unknown" if tag is not recognized.
        """
        # Language-specific tags
        if tag.startswith("LGSPEC") or tag.startswith("LGSPE"):
            return "lgspec"

        # Tag-to-dimension mapping
        # Build lookup lazily to avoid repeating this logic
        if not hasattr(self, "_tag_map"):
            self._tag_map = self._build_tag_map()

        return self._tag_map.get(tag, "unknown")

    def _build_tag_map(self) -> dict[str, str]:
        """Build complete tag-to-dimension mapping.

        Returns
        -------
        dict[str, str]
            Mapping from tag to dimension name.
        """
        mapping: dict[str, str] = {}

        # Part of speech
        for tag in [
            "N",
            "V",
            "ADJ",
            "ADV",
            "PRO",
            "ART",
            "DET",
            "ADP",
            "CONJ",
            "INTJ",
            "NUM",
            "PRON",
            "PROPN",
            "PRT",
        ]:
            mapping[tag] = "pos"

        # Person (including complex)
        for tag in ["0", "1", "2", "3", "4", "5", "1+2", "2+3", "1+EXCL", "1+INCL"]:
            mapping[tag] = "person"

        # Number
        for tag in ["SG", "DU", "PL", "SG+PL", "DU/PL", "SG/DU/PL"]:
            mapping[tag] = "number"

        # Tense (including variants and whitespace)
        for tag in [
            "PRS",
            "PST",
            "FUT",
            "PRES",
            "PAST",
            "NFUT",
            "NPST",
            "PRS ",
            "PRS  ",
            "PRS+FUT",
            "PRS/FUT",
            "PRS+IMMED",
            "PST+IMMED",
            "PRS/PST+IMMED",
            "FUT+IMMED",
            "FUT+RMT",
            "PST+RCT",
            "PST+RMT",
            "RCT",
            "RMT",
            "IMMED",
            "FUT:ELEV",
            "PST:ELEV",
            "3:PRS",
            "V:PST:3:PL",
            "non{PRS}",
            "non{PST}",
            "PL,FUTS",
        ]:
            mapping[tag] = "tense"

        # Aspect
        for tag in [
            "PFV",
            "IPFV",
            "PRF",
            "PROG",
            "HAB",
            "ITER",
            "PROSP",
            "DUR",
            "INCH",
            "SEMEL",
            "FREQ",
            "HAB+IPFV",
            "HAB+PRF",
            "HAB+PROG",
            "IPFV/PROG",
            "PFV/PRF",
            "PRF+PROG",
            "PROSP+PROG",
        ]:
            mapping[tag] = "aspect"

        # Mood (many combinations)
        for tag in [
            "IND",
            "SBJV",
            "IMP",
            "COND",
            "OPT",
            "POT",
            "DEB",
            "OBLIG",
            "PERM",
            "ADM",
            "REAL",
            "IRR",
            "HYP",
            "INFER",
            "LKLY",
        ] + [
            "COND+IND",
            "COND+IND+OPT",
            "COND+POT",
            "COND+POT+OPT",
            "COND+SBJV",
            "COND+SBJV+OPT",
            "IND+IMP",
            "IND+OPT",
            "IND+POT",
            "IND+POT+OPT",
            "IMP+OPT",
            "IMP+SBJV",
            "POT+OPT",
            "SBJV+OPT",
            "SBJV+POT",
            "SBJV+POT+OPT",
            "ADM+OPT",
            "ADM+POT",
            "ADM+POT+OPT",
        ]:
            mapping[tag] = "mood"

        # Voice
        for tag in [
            "ACT",
            "PASS",
            "MID",
            "ANTIP",
            "REFL",
            "RECP",
            "CAUS",
            "APPL",
            "ACT+PASS",
            "MID+PASS",
            "REFL/RECP",
            "CAUSV",
            "COMPV",
            "EXCLV",
            "MASV",
        ]:
            mapping[tag] = "voice"

        # Gender (including complex combinations)
        for tag in [
            "MASC",
            "FEM",
            "NEUT",
            "MASC+FEM",
            "MASC+NEUT",
            "FEM+NEUT",
            "MASC+FEM+NEUT",
            "FEM+FEM",
            "FEM+MASC",
            "MASC+MASC",
            "NEUT+MASC",
            "MASC/FEM",
            "MASC+FEM+MASC",
        ]:
            mapping[tag] = "gender"

        # Animacy
        for tag in ["ANIM", "INAN", "HUM"]:
            mapping[tag] = "animacy"

        # Finiteness
        for tag in ["FIN", "NFIN"]:
            mapping[tag] = "finiteness"

        # Definiteness
        for tag in [
            "DEF",
            "INDF",
            "NDEF",
            "INDF1",
            "INDF2",
            "INDF3",
            "DEF/INDF",
            "DEF/LGSPEC1",
        ]:
            mapping[tag] = "definiteness"

        # Comparison
        for tag in [
            "POS",
            "CMPR",
            "EQTV",
            "SPRL",
            "SUP",
            "EQTV+ABL",
            "EQTV+ACC",
            "EQTV+DAT",
        ]:
            mapping[tag] = "comparison"

        # Politeness (including Korean)
        for tag in [
            "INFM",
            "FORM",
            "FORM2",
            "POL",
            "HUMB",
            "ELEV",
            "MPOL",
            "FRML",
            "INFM:LGSPEC1",
            "POL:LGSPEC1",
            "Formal polite(하십시오체)",
            "Formal non-polite(해라체)",
            "Informal polite(해요체)",
            "Informal non-polite(해체)",
        ]:
            mapping[tag] = "politeness"

        # Evidentiality
        for tag in ["FH", "NFH", "VIS", "QUOT", "RPRT", "INFR"]:
            mapping[tag] = "evidentiality"

        # Switch-reference
        for tag in ["SS", "DS", "SIMMA"]:
            mapping[tag] = "switch_reference"

        # Deixis
        for tag in ["PROX", "MED", "REMT"]:
            mapping[tag] = "deixis"

        # Interrogativity
        for tag in ["INT", "DECL"]:
            mapping[tag] = "interrogativity"

        # Valency
        for tag in ["INTR", "TR", "DISTR"]:
            mapping[tag] = "valency"

        # Polarity
        for tag in ["NEG", "YES", "NO"]:
            mapping[tag] = "polarity"

        # Information structure
        for tag in ["TOP", "FOC", "AGFOC", "PFOC"]:
            mapping[tag] = "information_structure"

        # Aktionsart
        for tag in ["STAT", "ACTY", "TEL", "TAXIS", "SIM"]:
            mapping[tag] = "aktionsart"

        # Verb forms
        for tag in [
            "V.PTCP",
            "V.CVB",
            "V.MSDR",
            "V.NFIN",
            "V.CV",
            "V.PCTP",
            "V.PTCP.PRS",
            "V.PTCP.PST",
            "ADJ.PTCP",
            "ADJ.CVB",
            "ADJ.MSDR",
            "PTCP",
            "CVB",
            "MSDR",
            "INF",
            "INFN",
        ]:
            mapping[tag] = "verb_form"

        # Bantu noun classes
        for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17]:
            mapping[f"BANTU{i}"] = "bantu_class"

        # Possessive markers
        pss_tags = ["PSS", "PSS0", "PSS1", "PSS2", "PSS3", "PSS4"]
        for base in ["PSS1", "PSS2", "PSS3"]:
            for suffix in [
                "D",
                "I",
                "P",
                "PE",
                "PI",
                "PL",
                "S",
                "SM",
                "F",
                "M",
                "PF",
                "PM",
                "SF",
            ]:
                pss_tags.append(f"{base}{suffix}")
        pss_tags += ["PSS3S/PSS3P", "PSS{2/3}D", "PSSD", "PSSRP", "PSSRS", "PSSS"]
        for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17]:
            pss_tags.append(f"PSSB{i}")
        pss_tags += [
            "NALN+PSS3S",
            "ALN+PSS1PE",
            "ALN+PSS1PI",
            "ALN+PSS1S",
            "ALN+PSS2S",
            "ALN+PSS3P",
            "ALN+PSS3S",
            "ALN+PSSRP",
            "ALN+PSSRS",
        ]
        for tag in pss_tags:
            mapping[tag] = "possessive"

        # Case (all combinations)
        case_tags = [
            "NOM",
            "ACC",
            "ERG",
            "ABS",
            "DAT",
            "GEN",
            "INS",
            "INST",
            "ABL",
            "ALL",
            "ESS",
            "LOC",
            "VOC",
            "COM",
            "BEN",
            "AB",
            "AT",
            "IN",
            "ON",
            "PROL",
            "TERM",
            "VERS",
            "OBL",
            "SUB",
            "ELEV",
            "FROM",
            "TO",
            "APPRX",
            "PRIV",
            "PROPR",
            "BYWAY",
            "DIR",
        ] + [
            "ACC+COM",
            "ACC/DAT",
            "AT+ABL",
            "AT+ALL",
            "AT+ESS",
            "COM+TERM",
            "DAT/GEN",
            "DAT:FEM",
            "GEN+DAT",
            "GEN/DAT",
            "IN+ABL",
            "IN+ALL",
            "IN+ESS",
            "LOC+APPRX",
            "NOM+VOC",
            "NOM/ACC",
            "NOM/ACC/DAT",
            "OBL+VOC",
            "ON+ABL",
            "ON+ALL",
            "ON+ESS",
            "PSSRP+ACC",
            "PSSRS+ACC",
            "VOC+GEN",
            "(non)NOM",
            "non{NOM/ACC}",
            "non{NOM}",
            "not{NOM}",
        ]
        for tag in case_tags:
            mapping[tag] = "case"

        # Argument markers (all observed tags with whitespace variants)
        arg_prefixes = [
            "ARGAB",
            "ARGAC",
            "ARGBE",
            "ARGDA",
            "ARGER",
            "ARGERG",
            "ARGIO",
            "ARGNO",
        ]
        for prefix in arg_prefixes:
            for suffix in [
                "",
                "1",
                "2",
                "3",
                "1P",
                "1S",
                "2P",
                "2S",
                "3P",
                "3S",
                "23S",
                "S1",
                "S2",
                "S3",
                "INFM",
                "PL",
                "SG",
                "FEM",
                "MASC",
                "1DU",
                "2DU",
                "3DU",
                "1PL",
                "2PL",
                "3PL",
                "1SG",
                "3SG",
                "3SGHUM",
                "{D/P}",
                "S",
            ]:
                mapping[f"{prefix}{suffix}"] = "argument"
        # Add specific combinations and whitespace variants
        for tag in (
            [
                "ARG1",
                "ARG2",
                "ARG3",
                "ARG1P",
                "ARG1S",
                "ARG3P",
                "ARG3S",
                "ARGAB3P",
                "ARGAB3P ",
                "ARGAB3P  ",
                "ARGAB3P   ",
                "ARGAB3P    ",
                "ARGAB3S ",
                "ARGAB3S  ",
                "ARGAB3S   ",
                "ARGAB3S    ",
                "ARGDU",
                "ARGEXCL",
                "ARGINCL",
                "ARGPL",
                "ARGSG",
                "ARBAB1S",
                "ARBAB3S",
                "ARBEB1P",
                "ARBEB1S",
            ]
            + [
                "ARGAC1P+ARGNO1P",
                "ARGAC1S+ARGNO1S",
                "ARGAC2P+ARGNO2P",
                "ARGAC2S+ARGNO2S",
                "ARGAC3P+ARGNO3P",
                "ARGAC3S+ARGNO1P",
                "ARGAC3S+ARGNO1S",
                "ARGAC3S+ARGNO2P",
                "ARGAC3S+ARGNO2S",
                "ARGAC3S+ARGNO3P",
                "ARGAC3S+ARGNO3S",
                "ARGNO{2/3}",
                "ARGNO{D/P}",
                "ARGAC{D/P}",
            ]
            + [
                f"NO{x}"
                for x in [
                    "",
                    "1",
                    "2",
                    "3",
                    "1P",
                    "1PE",
                    "1PI",
                    "1S",
                    "2P",
                    "2S",
                    "3F",
                    "3M",
                    "3P",
                    "3PA",
                    "3S",
                    "3SA",
                    "3SI",
                ]
            ]
            + [f"DA{x}" for x in ["1PE", "1PI", "1S", "2P", "2S", "3P", "3S"]]
            + ["ALN", "NALN+PSS3S"]
        ):
            mapping[tag] = "argument"

        return mapping

    def _parse_features(self, features_str: str) -> dict[str, str]:
        """Parse UniMorph features string into dictionary.

        Maps UniMorph feature tags to their dimensions based on
        analysis of 173 languages and 575 unique tags from actual UniMorph data.

        Parameters
        ----------
        features_str : str
            UniMorph features string (e.g., "V;PRS;3;SG").

        Returns
        -------
        dict[str, str]
            Parsed features dictionary with dimension names as keys.
        """
        features_dict: dict[str, str] = {}

        # Split by semicolon
        parts = features_str.split(";")

        # Map each tag to its dimension
        for part in parts:
            part = part.strip()
            if not part:  # Skip empty parts
                continue

            dimension = self._get_tag_dimension(part)

            # Store tag under its dimension
            if dimension == "unknown":
                # Preserve unknown tags with sanitized key
                safe_key = (
                    part.lower().replace(" ", "_").replace("+", "_").replace("/", "_")
                )
                features_dict[f"unknown_{safe_key}"] = part
            elif dimension == "lgspec":
                # Language-specific features
                features_dict[f"lgspec_{part.lower()}"] = part
            else:
                # Known dimension - store the tag value
                features_dict[dimension] = part

        # Always store the original feature string
        features_dict["unimorph_features"] = features_str

        return features_dict

    def is_available(self) -> bool:
        """Check if UniMorph package is available.

        Returns
        -------
        bool
            True if unimorph can be imported and accessed, False otherwise.

        Examples
        --------
        >>> adapter = UniMorphAdapter()
        >>> adapter.is_available()
        True
        """
        try:
            # Verify unimorph is accessible
            unimorph.get_list_of_datasets()
            return True
        except Exception:
            return False
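For orientation, a minimal usage sketch pieced together from the docstrings above; it assumes the released wheel is installed along with its unimorph and langcodes dependencies, and that the unimorph package can reach the English dataset locally:

from bead.resources.adapters.unimorph import UniMorphAdapter

adapter = UniMorphAdapter()  # no cache; an AdapterCache instance could be passed to memoize queries
if adapter.is_available():
    # All inflected forms of the lemma "walk" from the English UniMorph paradigms
    items = adapter.fetch_items(query="walk", language_code="en")
    for item in items:
        # Parsed dimension keys (e.g. "tense", "person") plus the raw tag string
        print(item.form, item.features.get("tense"), item.features["unimorph_features"])

As _parse_features shows, each returned LexicalItem carries the parsed dimension keys (such as "pos", "tense", "person", "number") alongside the original semicolon-separated tag string under "unimorph_features".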