bead 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. bead/__init__.py +11 -0
  2. bead/__main__.py +11 -0
  3. bead/active_learning/__init__.py +15 -0
  4. bead/active_learning/config.py +231 -0
  5. bead/active_learning/loop.py +566 -0
  6. bead/active_learning/models/__init__.py +24 -0
  7. bead/active_learning/models/base.py +852 -0
  8. bead/active_learning/models/binary.py +910 -0
  9. bead/active_learning/models/categorical.py +943 -0
  10. bead/active_learning/models/cloze.py +862 -0
  11. bead/active_learning/models/forced_choice.py +956 -0
  12. bead/active_learning/models/free_text.py +773 -0
  13. bead/active_learning/models/lora.py +365 -0
  14. bead/active_learning/models/magnitude.py +835 -0
  15. bead/active_learning/models/multi_select.py +795 -0
  16. bead/active_learning/models/ordinal_scale.py +811 -0
  17. bead/active_learning/models/peft_adapter.py +155 -0
  18. bead/active_learning/models/random_effects.py +639 -0
  19. bead/active_learning/selection.py +354 -0
  20. bead/active_learning/strategies.py +391 -0
  21. bead/active_learning/trainers/__init__.py +26 -0
  22. bead/active_learning/trainers/base.py +210 -0
  23. bead/active_learning/trainers/data_collator.py +172 -0
  24. bead/active_learning/trainers/dataset_utils.py +261 -0
  25. bead/active_learning/trainers/huggingface.py +304 -0
  26. bead/active_learning/trainers/lightning.py +324 -0
  27. bead/active_learning/trainers/metrics.py +424 -0
  28. bead/active_learning/trainers/mixed_effects.py +551 -0
  29. bead/active_learning/trainers/model_wrapper.py +509 -0
  30. bead/active_learning/trainers/registry.py +104 -0
  31. bead/adapters/__init__.py +11 -0
  32. bead/adapters/huggingface.py +61 -0
  33. bead/behavioral/__init__.py +116 -0
  34. bead/behavioral/analytics.py +646 -0
  35. bead/behavioral/extraction.py +343 -0
  36. bead/behavioral/merging.py +343 -0
  37. bead/cli/__init__.py +11 -0
  38. bead/cli/active_learning.py +513 -0
  39. bead/cli/active_learning_commands.py +779 -0
  40. bead/cli/completion.py +359 -0
  41. bead/cli/config.py +624 -0
  42. bead/cli/constraint_builders.py +286 -0
  43. bead/cli/deployment.py +859 -0
  44. bead/cli/deployment_trials.py +493 -0
  45. bead/cli/deployment_ui.py +332 -0
  46. bead/cli/display.py +378 -0
  47. bead/cli/items.py +960 -0
  48. bead/cli/items_factories.py +776 -0
  49. bead/cli/list_constraints.py +714 -0
  50. bead/cli/lists.py +490 -0
  51. bead/cli/main.py +430 -0
  52. bead/cli/models.py +877 -0
  53. bead/cli/resource_loaders.py +621 -0
  54. bead/cli/resources.py +1036 -0
  55. bead/cli/shell.py +356 -0
  56. bead/cli/simulate.py +840 -0
  57. bead/cli/templates.py +1158 -0
  58. bead/cli/training.py +1080 -0
  59. bead/cli/utils.py +614 -0
  60. bead/cli/workflow.py +1273 -0
  61. bead/config/__init__.py +68 -0
  62. bead/config/active_learning.py +1009 -0
  63. bead/config/config.py +192 -0
  64. bead/config/defaults.py +118 -0
  65. bead/config/deployment.py +217 -0
  66. bead/config/env.py +147 -0
  67. bead/config/item.py +45 -0
  68. bead/config/list.py +193 -0
  69. bead/config/loader.py +149 -0
  70. bead/config/logging.py +42 -0
  71. bead/config/model.py +49 -0
  72. bead/config/paths.py +46 -0
  73. bead/config/profiles.py +320 -0
  74. bead/config/resources.py +47 -0
  75. bead/config/serialization.py +210 -0
  76. bead/config/simulation.py +206 -0
  77. bead/config/template.py +238 -0
  78. bead/config/validation.py +267 -0
  79. bead/data/__init__.py +65 -0
  80. bead/data/base.py +87 -0
  81. bead/data/identifiers.py +97 -0
  82. bead/data/language_codes.py +61 -0
  83. bead/data/metadata.py +270 -0
  84. bead/data/range.py +123 -0
  85. bead/data/repository.py +358 -0
  86. bead/data/serialization.py +249 -0
  87. bead/data/timestamps.py +89 -0
  88. bead/data/validation.py +349 -0
  89. bead/data_collection/__init__.py +11 -0
  90. bead/data_collection/jatos.py +223 -0
  91. bead/data_collection/merger.py +154 -0
  92. bead/data_collection/prolific.py +198 -0
  93. bead/deployment/__init__.py +5 -0
  94. bead/deployment/distribution.py +402 -0
  95. bead/deployment/jatos/__init__.py +1 -0
  96. bead/deployment/jatos/api.py +200 -0
  97. bead/deployment/jatos/exporter.py +210 -0
  98. bead/deployment/jspsych/__init__.py +9 -0
  99. bead/deployment/jspsych/biome.json +44 -0
  100. bead/deployment/jspsych/config.py +411 -0
  101. bead/deployment/jspsych/generator.py +598 -0
  102. bead/deployment/jspsych/package.json +51 -0
  103. bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
  104. bead/deployment/jspsych/randomizer.py +299 -0
  105. bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
  106. bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
  107. bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
  108. bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
  109. bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
  110. bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
  111. bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
  112. bead/deployment/jspsych/src/plugins/rating.ts +248 -0
  113. bead/deployment/jspsych/src/slopit/index.ts +9 -0
  114. bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
  115. bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
  116. bead/deployment/jspsych/templates/experiment.css +1 -0
  117. bead/deployment/jspsych/templates/experiment.js.template +289 -0
  118. bead/deployment/jspsych/templates/index.html +51 -0
  119. bead/deployment/jspsych/templates/randomizer.js +241 -0
  120. bead/deployment/jspsych/templates/randomizer.js.template +313 -0
  121. bead/deployment/jspsych/trials.py +723 -0
  122. bead/deployment/jspsych/tsconfig.json +23 -0
  123. bead/deployment/jspsych/tsup.config.ts +30 -0
  124. bead/deployment/jspsych/ui/__init__.py +1 -0
  125. bead/deployment/jspsych/ui/components.py +383 -0
  126. bead/deployment/jspsych/ui/styles.py +411 -0
  127. bead/dsl/__init__.py +80 -0
  128. bead/dsl/ast.py +168 -0
  129. bead/dsl/context.py +178 -0
  130. bead/dsl/errors.py +71 -0
  131. bead/dsl/evaluator.py +570 -0
  132. bead/dsl/grammar.lark +81 -0
  133. bead/dsl/parser.py +231 -0
  134. bead/dsl/stdlib.py +929 -0
  135. bead/evaluation/__init__.py +13 -0
  136. bead/evaluation/convergence.py +485 -0
  137. bead/evaluation/interannotator.py +398 -0
  138. bead/items/__init__.py +40 -0
  139. bead/items/adapters/__init__.py +70 -0
  140. bead/items/adapters/anthropic.py +224 -0
  141. bead/items/adapters/api_utils.py +167 -0
  142. bead/items/adapters/base.py +216 -0
  143. bead/items/adapters/google.py +259 -0
  144. bead/items/adapters/huggingface.py +1074 -0
  145. bead/items/adapters/openai.py +323 -0
  146. bead/items/adapters/registry.py +202 -0
  147. bead/items/adapters/sentence_transformers.py +224 -0
  148. bead/items/adapters/togetherai.py +309 -0
  149. bead/items/binary.py +515 -0
  150. bead/items/cache.py +558 -0
  151. bead/items/categorical.py +593 -0
  152. bead/items/cloze.py +757 -0
  153. bead/items/constructor.py +784 -0
  154. bead/items/forced_choice.py +413 -0
  155. bead/items/free_text.py +681 -0
  156. bead/items/generation.py +432 -0
  157. bead/items/item.py +396 -0
  158. bead/items/item_template.py +787 -0
  159. bead/items/magnitude.py +573 -0
  160. bead/items/multi_select.py +621 -0
  161. bead/items/ordinal_scale.py +569 -0
  162. bead/items/scoring.py +448 -0
  163. bead/items/validation.py +723 -0
  164. bead/lists/__init__.py +30 -0
  165. bead/lists/balancer.py +263 -0
  166. bead/lists/constraints.py +1067 -0
  167. bead/lists/experiment_list.py +286 -0
  168. bead/lists/list_collection.py +378 -0
  169. bead/lists/partitioner.py +1141 -0
  170. bead/lists/stratification.py +254 -0
  171. bead/participants/__init__.py +73 -0
  172. bead/participants/collection.py +699 -0
  173. bead/participants/merging.py +312 -0
  174. bead/participants/metadata_spec.py +491 -0
  175. bead/participants/models.py +276 -0
  176. bead/resources/__init__.py +29 -0
  177. bead/resources/adapters/__init__.py +19 -0
  178. bead/resources/adapters/base.py +104 -0
  179. bead/resources/adapters/cache.py +128 -0
  180. bead/resources/adapters/glazing.py +508 -0
  181. bead/resources/adapters/registry.py +117 -0
  182. bead/resources/adapters/unimorph.py +796 -0
  183. bead/resources/classification.py +856 -0
  184. bead/resources/constraint_builders.py +329 -0
  185. bead/resources/constraints.py +165 -0
  186. bead/resources/lexical_item.py +223 -0
  187. bead/resources/lexicon.py +744 -0
  188. bead/resources/loaders.py +209 -0
  189. bead/resources/template.py +441 -0
  190. bead/resources/template_collection.py +707 -0
  191. bead/resources/template_generation.py +349 -0
  192. bead/simulation/__init__.py +29 -0
  193. bead/simulation/annotators/__init__.py +15 -0
  194. bead/simulation/annotators/base.py +175 -0
  195. bead/simulation/annotators/distance_based.py +135 -0
  196. bead/simulation/annotators/lm_based.py +114 -0
  197. bead/simulation/annotators/oracle.py +182 -0
  198. bead/simulation/annotators/random.py +181 -0
  199. bead/simulation/dsl_extension/__init__.py +3 -0
  200. bead/simulation/noise_models/__init__.py +13 -0
  201. bead/simulation/noise_models/base.py +42 -0
  202. bead/simulation/noise_models/random_noise.py +82 -0
  203. bead/simulation/noise_models/systematic.py +132 -0
  204. bead/simulation/noise_models/temperature.py +86 -0
  205. bead/simulation/runner.py +144 -0
  206. bead/simulation/strategies/__init__.py +23 -0
  207. bead/simulation/strategies/base.py +123 -0
  208. bead/simulation/strategies/binary.py +103 -0
  209. bead/simulation/strategies/categorical.py +123 -0
  210. bead/simulation/strategies/cloze.py +224 -0
  211. bead/simulation/strategies/forced_choice.py +127 -0
  212. bead/simulation/strategies/free_text.py +105 -0
  213. bead/simulation/strategies/magnitude.py +116 -0
  214. bead/simulation/strategies/multi_select.py +129 -0
  215. bead/simulation/strategies/ordinal_scale.py +131 -0
  216. bead/templates/__init__.py +27 -0
  217. bead/templates/adapters/__init__.py +17 -0
  218. bead/templates/adapters/base.py +128 -0
  219. bead/templates/adapters/cache.py +178 -0
  220. bead/templates/adapters/huggingface.py +312 -0
  221. bead/templates/combinatorics.py +103 -0
  222. bead/templates/filler.py +605 -0
  223. bead/templates/renderers.py +177 -0
  224. bead/templates/resolver.py +178 -0
  225. bead/templates/strategies.py +1806 -0
  226. bead/templates/streaming.py +195 -0
  227. bead-0.1.0.dist-info/METADATA +212 -0
  228. bead-0.1.0.dist-info/RECORD +231 -0
  229. bead-0.1.0.dist-info/WHEEL +4 -0
  230. bead-0.1.0.dist-info/entry_points.txt +2 -0
  231. bead-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,796 @@
1
+ """Adapter for UniMorph morphological paradigms.
2
+
3
+ This module provides an adapter to fetch morphological paradigms from UniMorph
4
+ data and convert them to LexicalItem format with morphological features.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Any
10
+
11
+ import langcodes
12
+ import pandas as pd
13
+ import unimorph
14
+ from unimorph import load_dataset
15
+
16
+ from bead.data.language_codes import LanguageCode
17
+ from bead.resources.adapters.base import ResourceAdapter
18
+ from bead.resources.adapters.cache import AdapterCache
19
+ from bead.resources.lexical_item import LexicalItem
20
+
21
+
22
+ class UniMorphAdapter(ResourceAdapter):
23
+ """Adapter for UniMorph morphological paradigms.
24
+
25
+ This adapter fetches morphological paradigms from UniMorph and converts
26
+ them to LexicalItem format. Morphological features are stored in the
27
+ features field using UniMorph feature schema.
28
+
29
+ Parameters
30
+ ----------
31
+ cache : AdapterCache | None
32
+ Optional cache instance. If None, no caching is performed.
33
+
34
+ Examples
35
+ --------
36
+ >>> adapter = UniMorphAdapter()
37
+ >>> items = adapter.fetch_items(query="walk", language_code="en")
38
+ >>> all(item.language_code == "en" for item in items)
39
+ True
40
+ >>> all("tense" in item.features for item in items if item.features)
41
+ True
42
+ """
43
+
44
+ def __init__(self, cache: AdapterCache | None = None) -> None:
45
+ """Initialize UniMorph adapter.
46
+
47
+ Parameters
48
+ ----------
49
+ cache : AdapterCache | None
50
+ Optional cache instance.
51
+ """
52
+ self.cache = cache
53
+ self._datasets: dict[str, pd.DataFrame] = {} # Cache datasets by language
54
+
55
+ def fetch_items(
56
+ self,
57
+ query: str | None = None,
58
+ language_code: LanguageCode = None,
59
+ **kwargs: Any,
60
+ ) -> list[LexicalItem]:
61
+ """Fetch morphological paradigms from UniMorph.
62
+
63
+ Parameters
64
+ ----------
65
+ query : str | None
66
+ Lemma to query (e.g., "walk", "먹다", "hamba").
67
+ language_code : LanguageCode
68
+ **Required** language code (e.g., "en", "ko", "zu"). UniMorph
69
+ is organized by language, so this parameter is essential.
70
+ **kwargs : Any
71
+ Additional parameters (e.g., pos="VERB").
72
+
73
+ Returns
74
+ -------
75
+ list[LexicalItem]
76
+ Lexical items representing inflected forms with morphological
77
+ features in the features field.
78
+
79
+ Raises
80
+ ------
81
+ ValueError
82
+ If language_code is None (required for UniMorph).
83
+ RuntimeError
84
+ If UniMorph access fails.
85
+
86
+ Examples
87
+ --------
88
+ >>> adapter = UniMorphAdapter()
89
+ >>> items = adapter.fetch_items(query="walk", language_code="en")
90
+ >>> len(items) > 0
91
+ True
92
+ >>> items[0].features.get("pos") == "VERB"
93
+ True
94
+ """
95
+ if language_code is None:
96
+ raise ValueError("UniMorphAdapter requires language_code parameter")
97
+
98
+ # Normalize to ISO 639-3 (3-letter code) for UniMorph
99
+ # UniMorph uses 3-letter codes (language_code is guaranteed non-None here)
100
+ lang_code = self._normalize_language_code(language_code)
101
+
102
+ # Check cache
103
+ cache_key = None
104
+ if self.cache:
105
+ cache_key = self.cache.make_key(
106
+ "unimorph", query=query, language_code=lang_code, **kwargs
107
+ )
108
+ cached = self.cache.get(cache_key)
109
+ if cached is not None:
110
+ return cached
111
+
112
+ # Fetch from UniMorph
113
+ try:
114
+ # Load dataset for language (cached at instance level)
115
+ if lang_code not in self._datasets:
116
+ self._datasets[lang_code] = load_dataset(lang_code)
117
+
118
+ dataset = self._datasets[lang_code]
119
+
120
+ # Filter by lemma if query provided
121
+ if query:
122
+ dataset = dataset[dataset["lemma"] == query]
123
+
124
+ # Convert to LexicalItem objects
125
+ items: list[LexicalItem] = []
126
+ for _, row in dataset.iterrows():
127
+ # Skip rows with NaN values
128
+ if (
129
+ row["lemma"] is None
130
+ or row["form"] is None
131
+ or row["features"] is None
132
+ or str(row["lemma"]) == "nan"
133
+ or str(row["form"]) == "nan"
134
+ or str(row["features"]) == "nan"
135
+ ):
136
+ continue
137
+
138
+ # Parse features string (e.g., "V;PRS;3;SG")
139
+ features_dict = self._parse_features(str(row["features"]))
140
+
141
+ item = LexicalItem(
142
+ lemma=str(row["lemma"]),
143
+ form=str(row["form"]),
144
+ language_code=language_code,
145
+ features=features_dict,
146
+ source="UniMorph",
147
+ )
148
+ items.append(item)
149
+
150
+ # Cache result
151
+ if self.cache and cache_key:
152
+ self.cache.set(cache_key, items)
153
+
154
+ return items
155
+
156
+ except Exception as e:
157
+ raise RuntimeError(f"Failed to fetch from UniMorph: {e}") from e
158
+
159
+ def _normalize_language_code(self, language_code: LanguageCode) -> str:
160
+ """Normalize language code to ISO 639-3 (3-letter) format.
161
+
162
+ Uses the langcodes package to properly convert ISO 639-1 (2-letter) codes
163
+ to ISO 639-3 (3-letter) codes.
164
+
165
+ Parameters
166
+ ----------
167
+ language_code : LanguageCode
168
+ Language code (2 or 3 letters, non-None).
169
+
170
+ Returns
171
+ -------
172
+ str
173
+ ISO 639-3 (3-letter) language code.
174
+
175
+ Raises
176
+ ------
177
+ ValueError
178
+ If language_code is None.
179
+ """
180
+ if language_code is None:
181
+ raise ValueError(
182
+ "language_code cannot be None when normalizing. "
183
+ "This should be checked by the caller."
184
+ )
185
+
186
+ # Use langcodes package to normalize
187
+ try:
188
+ # If it's already 3 letters, return as-is
189
+ if len(language_code) == 3:
190
+ return language_code
191
+
192
+ # For 2-letter codes, use langcodes to get the 3-letter equivalent
193
+ lang = langcodes.Language.get(language_code)
194
+ return lang.to_alpha3()
195
+ except Exception:
196
+ # If conversion fails, return as-is
197
+ return language_code
198
+
199
+ def _get_tag_dimension(self, tag: str) -> str:
200
+ """Get the dimension for a UniMorph tag.
201
+
202
+ Based on analysis of 173 languages and 575 tags from
203
+ the actual UniMorph data.
204
+
205
+ Parameters
206
+ ----------
207
+ tag : str
208
+ UniMorph feature tag.
209
+
210
+ Returns
211
+ -------
212
+ str
213
+ Dimension name, or "unknown" if tag is not recognized.
214
+ """
215
+ # Language-specific tags
216
+ if tag.startswith("LGSPEC") or tag.startswith("LGSPE"):
217
+ return "lgspec"
218
+
219
+ # Tag-to-dimension mapping
220
+ # Build lookup lazily to avoid repeating this logic
221
+ if not hasattr(self, "_tag_map"):
222
+ self._tag_map = self._build_tag_map()
223
+
224
+ return self._tag_map.get(tag, "unknown")
225
+
226
    def _build_tag_map(self) -> dict[str, str]:
        """Build complete tag-to-dimension mapping.

        The table below is a literal transcription of tags observed in the
        UniMorph data (173 languages, 575 unique tags). Some entries carry
        trailing whitespace or duplicate an earlier key on purpose: the raw
        data contains those variants, and duplicate assignments to the same
        dimension are harmless. Callers strip tags before lookup, so the
        whitespace-variant keys act as documentation of the raw data.

        Returns
        -------
        dict[str, str]
            Mapping from tag to dimension name.
        """
        mapping: dict[str, str] = {}

        # Part of speech
        for tag in [
            "N",
            "V",
            "ADJ",
            "ADV",
            "PRO",
            "ART",
            "DET",
            "ADP",
            "CONJ",
            "INTJ",
            "NUM",
            "PRON",
            "PROPN",
            "PRT",
        ]:
            mapping[tag] = "pos"

        # Person (including complex)
        for tag in ["0", "1", "2", "3", "4", "5", "1+2", "2+3", "1+EXCL", "1+INCL"]:
            mapping[tag] = "person"

        # Number
        for tag in ["SG", "DU", "PL", "SG+PL", "DU/PL", "SG/DU/PL"]:
            mapping[tag] = "number"

        # Tense (including variants and whitespace)
        for tag in [
            "PRS",
            "PST",
            "FUT",
            "PRES",
            "PAST",
            "NFUT",
            "NPST",
            "PRS ",
            "PRS ",
            "PRS+FUT",
            "PRS/FUT",
            "PRS+IMMED",
            "PST+IMMED",
            "PRS/PST+IMMED",
            "FUT+IMMED",
            "FUT+RMT",
            "PST+RCT",
            "PST+RMT",
            "RCT",
            "RMT",
            "IMMED",
            "FUT:ELEV",
            "PST:ELEV",
            "3:PRS",
            "V:PST:3:PL",
            "non{PRS}",
            "non{PST}",
            "PL,FUTS",
        ]:
            mapping[tag] = "tense"

        # Aspect
        for tag in [
            "PFV",
            "IPFV",
            "PRF",
            "PROG",
            "HAB",
            "ITER",
            "PROSP",
            "DUR",
            "INCH",
            "SEMEL",
            "FREQ",
            "HAB+IPFV",
            "HAB+PRF",
            "HAB+PROG",
            "IPFV/PROG",
            "PFV/PRF",
            "PRF+PROG",
            "PROSP+PROG",
        ]:
            mapping[tag] = "aspect"

        # Mood (many combinations)
        for tag in [
            "IND",
            "SBJV",
            "IMP",
            "COND",
            "OPT",
            "POT",
            "DEB",
            "OBLIG",
            "PERM",
            "ADM",
            "REAL",
            "IRR",
            "HYP",
            "INFER",
            "LKLY",
        ] + [
            "COND+IND",
            "COND+IND+OPT",
            "COND+POT",
            "COND+POT+OPT",
            "COND+SBJV",
            "COND+SBJV+OPT",
            "IND+IMP",
            "IND+OPT",
            "IND+POT",
            "IND+POT+OPT",
            "IMP+OPT",
            "IMP+SBJV",
            "POT+OPT",
            "SBJV+OPT",
            "SBJV+POT",
            "SBJV+POT+OPT",
            "ADM+OPT",
            "ADM+POT",
            "ADM+POT+OPT",
        ]:
            mapping[tag] = "mood"

        # Voice
        for tag in [
            "ACT",
            "PASS",
            "MID",
            "ANTIP",
            "REFL",
            "RECP",
            "CAUS",
            "APPL",
            "ACT+PASS",
            "MID+PASS",
            "REFL/RECP",
            "CAUSV",
            "COMPV",
            "EXCLV",
            "MASV",
        ]:
            mapping[tag] = "voice"

        # Gender (including complex combinations)
        for tag in [
            "MASC",
            "FEM",
            "NEUT",
            "MASC+FEM",
            "MASC+NEUT",
            "FEM+NEUT",
            "MASC+FEM+NEUT",
            "FEM+FEM",
            "FEM+MASC",
            "MASC+MASC",
            "NEUT+MASC",
            "MASC/FEM",
            "MASC+FEM+MASC",
        ]:
            mapping[tag] = "gender"

        # Animacy
        for tag in ["ANIM", "INAN", "HUM"]:
            mapping[tag] = "animacy"

        # Finiteness
        for tag in ["FIN", "NFIN"]:
            mapping[tag] = "finiteness"

        # Definiteness
        for tag in [
            "DEF",
            "INDF",
            "NDEF",
            "INDF1",
            "INDF2",
            "INDF3",
            "DEF/INDF",
            "DEF/LGSPEC1",
        ]:
            mapping[tag] = "definiteness"

        # Comparison
        for tag in [
            "POS",
            "CMPR",
            "EQTV",
            "SPRL",
            "SUP",
            "EQTV+ABL",
            "EQTV+ACC",
            "EQTV+DAT",
        ]:
            mapping[tag] = "comparison"

        # Politeness (including Korean speech-level labels, kept verbatim
        # from the source data)
        for tag in [
            "INFM",
            "FORM",
            "FORM2",
            "POL",
            "HUMB",
            "ELEV",
            "MPOL",
            "FRML",
            "INFM:LGSPEC1",
            "POL:LGSPEC1",
            "Formal polite(하십시오체)",
            "Formal non-polite(해라체)",
            "Informal polite(해요체)",
            "Informal non-polite(해체)",
        ]:
            mapping[tag] = "politeness"

        # Evidentiality
        for tag in ["FH", "NFH", "VIS", "QUOT", "RPRT", "INFR"]:
            mapping[tag] = "evidentiality"

        # Switch-reference
        for tag in ["SS", "DS", "SIMMA"]:
            mapping[tag] = "switch_reference"

        # Deixis
        for tag in ["PROX", "MED", "REMT"]:
            mapping[tag] = "deixis"

        # Interrogativity
        for tag in ["INT", "DECL"]:
            mapping[tag] = "interrogativity"

        # Valency
        for tag in ["INTR", "TR", "DISTR"]:
            mapping[tag] = "valency"

        # Polarity
        for tag in ["NEG", "YES", "NO"]:
            mapping[tag] = "polarity"

        # Information structure
        for tag in ["TOP", "FOC", "AGFOC", "PFOC"]:
            mapping[tag] = "information_structure"

        # Aktionsart
        for tag in ["STAT", "ACTY", "TEL", "TAXIS", "SIM"]:
            mapping[tag] = "aktionsart"

        # Verb forms
        for tag in [
            "V.PTCP",
            "V.CVB",
            "V.MSDR",
            "V.NFIN",
            "V.CV",
            "V.PCTP",
            "V.PTCP.PRS",
            "V.PTCP.PST",
            "ADJ.PTCP",
            "ADJ.CVB",
            "ADJ.MSDR",
            "PTCP",
            "CVB",
            "MSDR",
            "INF",
            "INFN",
        ]:
            mapping[tag] = "verb_form"

        # Bantu noun classes (classes 12, 13 and 16 are absent from the data)
        for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17]:
            mapping[f"BANTU{i}"] = "bantu_class"

        # Possessive markers: base tags, then generated person/number
        # combinations, then irregular and Bantu-class-specific forms.
        pss_tags = ["PSS", "PSS0", "PSS1", "PSS2", "PSS3", "PSS4"]
        for base in ["PSS1", "PSS2", "PSS3"]:
            for suffix in [
                "D",
                "I",
                "P",
                "PE",
                "PI",
                "PL",
                "S",
                "SM",
                "F",
                "M",
                "PF",
                "PM",
                "SF",
            ]:
                pss_tags.append(f"{base}{suffix}")
        pss_tags += ["PSS3S/PSS3P", "PSS{2/3}D", "PSSD", "PSSRP", "PSSRS", "PSSS"]
        for i in [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17]:
            pss_tags.append(f"PSSB{i}")
        pss_tags += [
            "NALN+PSS3S",
            "ALN+PSS1PE",
            "ALN+PSS1PI",
            "ALN+PSS1S",
            "ALN+PSS2S",
            "ALN+PSS3P",
            "ALN+PSS3S",
            "ALN+PSSRP",
            "ALN+PSSRS",
        ]
        for tag in pss_tags:
            mapping[tag] = "possessive"

        # Case (all combinations)
        case_tags = [
            "NOM",
            "ACC",
            "ERG",
            "ABS",
            "DAT",
            "GEN",
            "INS",
            "INST",
            "ABL",
            "ALL",
            "ESS",
            "LOC",
            "VOC",
            "COM",
            "BEN",
            "AB",
            "AT",
            "IN",
            "ON",
            "PROL",
            "TERM",
            "VERS",
            "OBL",
            "SUB",
            "ELEV",
            "FROM",
            "TO",
            "APPRX",
            "PRIV",
            "PROPR",
            "BYWAY",
            "DIR",
        ] + [
            "ACC+COM",
            "ACC/DAT",
            "AT+ABL",
            "AT+ALL",
            "AT+ESS",
            "COM+TERM",
            "DAT/GEN",
            "DAT:FEM",
            "GEN+DAT",
            "GEN/DAT",
            "IN+ABL",
            "IN+ALL",
            "IN+ESS",
            "LOC+APPRX",
            "NOM+VOC",
            "NOM/ACC",
            "NOM/ACC/DAT",
            "OBL+VOC",
            "ON+ABL",
            "ON+ALL",
            "ON+ESS",
            "PSSRP+ACC",
            "PSSRS+ACC",
            "VOC+GEN",
            "(non)NOM",
            "non{NOM/ACC}",
            "non{NOM}",
            "not{NOM}",
        ]
        for tag in case_tags:
            mapping[tag] = "case"

        # Argument markers (all observed tags with whitespace variants).
        # NOTE: "ELEV" already maps to "case" above; re-listing a tag under
        # a later dimension would overwrite the earlier entry, so ordering
        # of these sections is significant.
        arg_prefixes = [
            "ARGAB",
            "ARGAC",
            "ARGBE",
            "ARGDA",
            "ARGER",
            "ARGERG",
            "ARGIO",
            "ARGNO",
        ]
        for prefix in arg_prefixes:
            for suffix in [
                "",
                "1",
                "2",
                "3",
                "1P",
                "1S",
                "2P",
                "2S",
                "3P",
                "3S",
                "23S",
                "S1",
                "S2",
                "S3",
                "INFM",
                "PL",
                "SG",
                "FEM",
                "MASC",
                "1DU",
                "2DU",
                "3DU",
                "1PL",
                "2PL",
                "3PL",
                "1SG",
                "3SG",
                "3SGHUM",
                "{D/P}",
                "S",
            ]:
                mapping[f"{prefix}{suffix}"] = "argument"
        # Add specific combinations and whitespace variants
        for tag in (
            [
                "ARG1",
                "ARG2",
                "ARG3",
                "ARG1P",
                "ARG1S",
                "ARG3P",
                "ARG3S",
                "ARGAB3P",
                "ARGAB3P ",
                "ARGAB3P ",
                "ARGAB3P ",
                "ARGAB3P ",
                "ARGAB3S ",
                "ARGAB3S ",
                "ARGAB3S ",
                "ARGAB3S ",
                "ARGDU",
                "ARGEXCL",
                "ARGINCL",
                "ARGPL",
                "ARGSG",
                "ARBAB1S",
                "ARBAB3S",
                "ARBEB1P",
                "ARBEB1S",
            ]
            + [
                "ARGAC1P+ARGNO1P",
                "ARGAC1S+ARGNO1S",
                "ARGAC2P+ARGNO2P",
                "ARGAC2S+ARGNO2S",
                "ARGAC3P+ARGNO3P",
                "ARGAC3S+ARGNO1P",
                "ARGAC3S+ARGNO1S",
                "ARGAC3S+ARGNO2P",
                "ARGAC3S+ARGNO2S",
                "ARGAC3S+ARGNO3P",
                "ARGAC3S+ARGNO3S",
                "ARGNO{2/3}",
                "ARGNO{D/P}",
                "ARGAC{D/P}",
            ]
            + [
                f"NO{x}"
                for x in [
                    "",
                    "1",
                    "2",
                    "3",
                    "1P",
                    "1PE",
                    "1PI",
                    "1S",
                    "2P",
                    "2S",
                    "3F",
                    "3M",
                    "3P",
                    "3PA",
                    "3S",
                    "3SA",
                    "3SI",
                ]
            ]
            + [f"DA{x}" for x in ["1PE", "1PI", "1S", "2P", "2S", "3P", "3S"]]
            + ["ALN", "NALN+PSS3S"]
        ):
            mapping[tag] = "argument"

        return mapping
728
+
729
+ def _parse_features(self, features_str: str) -> dict[str, str]:
730
+ """Parse UniMorph features string into dictionary.
731
+
732
+ Maps UniMorph feature tags to their dimensions based on
733
+ analysis of 173 languages and 575 unique tags from actual UniMorph data.
734
+
735
+ Parameters
736
+ ----------
737
+ features_str : str
738
+ UniMorph features string (e.g., "V;PRS;3;SG").
739
+
740
+ Returns
741
+ -------
742
+ dict[str, str]
743
+ Parsed features dictionary with dimension names as keys.
744
+ """
745
+ features_dict: dict[str, str] = {}
746
+
747
+ # Split by semicolon
748
+ parts = features_str.split(";")
749
+
750
+ # Map each tag to its dimension
751
+ for part in parts:
752
+ part = part.strip()
753
+ if not part: # Skip empty parts
754
+ continue
755
+
756
+ dimension = self._get_tag_dimension(part)
757
+
758
+ # Store tag under its dimension
759
+ if dimension == "unknown":
760
+ # Preserve unknown tags with sanitized key
761
+ safe_key = (
762
+ part.lower().replace(" ", "_").replace("+", "_").replace("/", "_")
763
+ )
764
+ features_dict[f"unknown_{safe_key}"] = part
765
+ elif dimension == "lgspec":
766
+ # Language-specific features
767
+ features_dict[f"lgspec_{part.lower()}"] = part
768
+ else:
769
+ # Known dimension - store the tag value
770
+ features_dict[dimension] = part
771
+
772
+ # Always store the original feature string
773
+ features_dict["unimorph_features"] = features_str
774
+
775
+ return features_dict
776
+
777
+ def is_available(self) -> bool:
778
+ """Check if UniMorph package is available.
779
+
780
+ Returns
781
+ -------
782
+ bool
783
+ True if unimorph can be imported and accessed, False otherwise.
784
+
785
+ Examples
786
+ --------
787
+ >>> adapter = UniMorphAdapter()
788
+ >>> adapter.is_available()
789
+ True
790
+ """
791
+ try:
792
+ # Verify unimorph is accessible
793
+ unimorph.get_list_of_datasets()
794
+ return True
795
+ except Exception:
796
+ return False