bead 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231)
  1. bead/__init__.py +11 -0
  2. bead/__main__.py +11 -0
  3. bead/active_learning/__init__.py +15 -0
  4. bead/active_learning/config.py +231 -0
  5. bead/active_learning/loop.py +566 -0
  6. bead/active_learning/models/__init__.py +24 -0
  7. bead/active_learning/models/base.py +852 -0
  8. bead/active_learning/models/binary.py +910 -0
  9. bead/active_learning/models/categorical.py +943 -0
  10. bead/active_learning/models/cloze.py +862 -0
  11. bead/active_learning/models/forced_choice.py +956 -0
  12. bead/active_learning/models/free_text.py +773 -0
  13. bead/active_learning/models/lora.py +365 -0
  14. bead/active_learning/models/magnitude.py +835 -0
  15. bead/active_learning/models/multi_select.py +795 -0
  16. bead/active_learning/models/ordinal_scale.py +811 -0
  17. bead/active_learning/models/peft_adapter.py +155 -0
  18. bead/active_learning/models/random_effects.py +639 -0
  19. bead/active_learning/selection.py +354 -0
  20. bead/active_learning/strategies.py +391 -0
  21. bead/active_learning/trainers/__init__.py +26 -0
  22. bead/active_learning/trainers/base.py +210 -0
  23. bead/active_learning/trainers/data_collator.py +172 -0
  24. bead/active_learning/trainers/dataset_utils.py +261 -0
  25. bead/active_learning/trainers/huggingface.py +304 -0
  26. bead/active_learning/trainers/lightning.py +324 -0
  27. bead/active_learning/trainers/metrics.py +424 -0
  28. bead/active_learning/trainers/mixed_effects.py +551 -0
  29. bead/active_learning/trainers/model_wrapper.py +509 -0
  30. bead/active_learning/trainers/registry.py +104 -0
  31. bead/adapters/__init__.py +11 -0
  32. bead/adapters/huggingface.py +61 -0
  33. bead/behavioral/__init__.py +116 -0
  34. bead/behavioral/analytics.py +646 -0
  35. bead/behavioral/extraction.py +343 -0
  36. bead/behavioral/merging.py +343 -0
  37. bead/cli/__init__.py +11 -0
  38. bead/cli/active_learning.py +513 -0
  39. bead/cli/active_learning_commands.py +779 -0
  40. bead/cli/completion.py +359 -0
  41. bead/cli/config.py +624 -0
  42. bead/cli/constraint_builders.py +286 -0
  43. bead/cli/deployment.py +859 -0
  44. bead/cli/deployment_trials.py +493 -0
  45. bead/cli/deployment_ui.py +332 -0
  46. bead/cli/display.py +378 -0
  47. bead/cli/items.py +960 -0
  48. bead/cli/items_factories.py +776 -0
  49. bead/cli/list_constraints.py +714 -0
  50. bead/cli/lists.py +490 -0
  51. bead/cli/main.py +430 -0
  52. bead/cli/models.py +877 -0
  53. bead/cli/resource_loaders.py +621 -0
  54. bead/cli/resources.py +1036 -0
  55. bead/cli/shell.py +356 -0
  56. bead/cli/simulate.py +840 -0
  57. bead/cli/templates.py +1158 -0
  58. bead/cli/training.py +1080 -0
  59. bead/cli/utils.py +614 -0
  60. bead/cli/workflow.py +1273 -0
  61. bead/config/__init__.py +68 -0
  62. bead/config/active_learning.py +1009 -0
  63. bead/config/config.py +192 -0
  64. bead/config/defaults.py +118 -0
  65. bead/config/deployment.py +217 -0
  66. bead/config/env.py +147 -0
  67. bead/config/item.py +45 -0
  68. bead/config/list.py +193 -0
  69. bead/config/loader.py +149 -0
  70. bead/config/logging.py +42 -0
  71. bead/config/model.py +49 -0
  72. bead/config/paths.py +46 -0
  73. bead/config/profiles.py +320 -0
  74. bead/config/resources.py +47 -0
  75. bead/config/serialization.py +210 -0
  76. bead/config/simulation.py +206 -0
  77. bead/config/template.py +238 -0
  78. bead/config/validation.py +267 -0
  79. bead/data/__init__.py +65 -0
  80. bead/data/base.py +87 -0
  81. bead/data/identifiers.py +97 -0
  82. bead/data/language_codes.py +61 -0
  83. bead/data/metadata.py +270 -0
  84. bead/data/range.py +123 -0
  85. bead/data/repository.py +358 -0
  86. bead/data/serialization.py +249 -0
  87. bead/data/timestamps.py +89 -0
  88. bead/data/validation.py +349 -0
  89. bead/data_collection/__init__.py +11 -0
  90. bead/data_collection/jatos.py +223 -0
  91. bead/data_collection/merger.py +154 -0
  92. bead/data_collection/prolific.py +198 -0
  93. bead/deployment/__init__.py +5 -0
  94. bead/deployment/distribution.py +402 -0
  95. bead/deployment/jatos/__init__.py +1 -0
  96. bead/deployment/jatos/api.py +200 -0
  97. bead/deployment/jatos/exporter.py +210 -0
  98. bead/deployment/jspsych/__init__.py +9 -0
  99. bead/deployment/jspsych/biome.json +44 -0
  100. bead/deployment/jspsych/config.py +411 -0
  101. bead/deployment/jspsych/generator.py +598 -0
  102. bead/deployment/jspsych/package.json +51 -0
  103. bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
  104. bead/deployment/jspsych/randomizer.py +299 -0
  105. bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
  106. bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
  107. bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
  108. bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
  109. bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
  110. bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
  111. bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
  112. bead/deployment/jspsych/src/plugins/rating.ts +248 -0
  113. bead/deployment/jspsych/src/slopit/index.ts +9 -0
  114. bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
  115. bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
  116. bead/deployment/jspsych/templates/experiment.css +1 -0
  117. bead/deployment/jspsych/templates/experiment.js.template +289 -0
  118. bead/deployment/jspsych/templates/index.html +51 -0
  119. bead/deployment/jspsych/templates/randomizer.js +241 -0
  120. bead/deployment/jspsych/templates/randomizer.js.template +313 -0
  121. bead/deployment/jspsych/trials.py +723 -0
  122. bead/deployment/jspsych/tsconfig.json +23 -0
  123. bead/deployment/jspsych/tsup.config.ts +30 -0
  124. bead/deployment/jspsych/ui/__init__.py +1 -0
  125. bead/deployment/jspsych/ui/components.py +383 -0
  126. bead/deployment/jspsych/ui/styles.py +411 -0
  127. bead/dsl/__init__.py +80 -0
  128. bead/dsl/ast.py +168 -0
  129. bead/dsl/context.py +178 -0
  130. bead/dsl/errors.py +71 -0
  131. bead/dsl/evaluator.py +570 -0
  132. bead/dsl/grammar.lark +81 -0
  133. bead/dsl/parser.py +231 -0
  134. bead/dsl/stdlib.py +929 -0
  135. bead/evaluation/__init__.py +13 -0
  136. bead/evaluation/convergence.py +485 -0
  137. bead/evaluation/interannotator.py +398 -0
  138. bead/items/__init__.py +40 -0
  139. bead/items/adapters/__init__.py +70 -0
  140. bead/items/adapters/anthropic.py +224 -0
  141. bead/items/adapters/api_utils.py +167 -0
  142. bead/items/adapters/base.py +216 -0
  143. bead/items/adapters/google.py +259 -0
  144. bead/items/adapters/huggingface.py +1074 -0
  145. bead/items/adapters/openai.py +323 -0
  146. bead/items/adapters/registry.py +202 -0
  147. bead/items/adapters/sentence_transformers.py +224 -0
  148. bead/items/adapters/togetherai.py +309 -0
  149. bead/items/binary.py +515 -0
  150. bead/items/cache.py +558 -0
  151. bead/items/categorical.py +593 -0
  152. bead/items/cloze.py +757 -0
  153. bead/items/constructor.py +784 -0
  154. bead/items/forced_choice.py +413 -0
  155. bead/items/free_text.py +681 -0
  156. bead/items/generation.py +432 -0
  157. bead/items/item.py +396 -0
  158. bead/items/item_template.py +787 -0
  159. bead/items/magnitude.py +573 -0
  160. bead/items/multi_select.py +621 -0
  161. bead/items/ordinal_scale.py +569 -0
  162. bead/items/scoring.py +448 -0
  163. bead/items/validation.py +723 -0
  164. bead/lists/__init__.py +30 -0
  165. bead/lists/balancer.py +263 -0
  166. bead/lists/constraints.py +1067 -0
  167. bead/lists/experiment_list.py +286 -0
  168. bead/lists/list_collection.py +378 -0
  169. bead/lists/partitioner.py +1141 -0
  170. bead/lists/stratification.py +254 -0
  171. bead/participants/__init__.py +73 -0
  172. bead/participants/collection.py +699 -0
  173. bead/participants/merging.py +312 -0
  174. bead/participants/metadata_spec.py +491 -0
  175. bead/participants/models.py +276 -0
  176. bead/resources/__init__.py +29 -0
  177. bead/resources/adapters/__init__.py +19 -0
  178. bead/resources/adapters/base.py +104 -0
  179. bead/resources/adapters/cache.py +128 -0
  180. bead/resources/adapters/glazing.py +508 -0
  181. bead/resources/adapters/registry.py +117 -0
  182. bead/resources/adapters/unimorph.py +796 -0
  183. bead/resources/classification.py +856 -0
  184. bead/resources/constraint_builders.py +329 -0
  185. bead/resources/constraints.py +165 -0
  186. bead/resources/lexical_item.py +223 -0
  187. bead/resources/lexicon.py +744 -0
  188. bead/resources/loaders.py +209 -0
  189. bead/resources/template.py +441 -0
  190. bead/resources/template_collection.py +707 -0
  191. bead/resources/template_generation.py +349 -0
  192. bead/simulation/__init__.py +29 -0
  193. bead/simulation/annotators/__init__.py +15 -0
  194. bead/simulation/annotators/base.py +175 -0
  195. bead/simulation/annotators/distance_based.py +135 -0
  196. bead/simulation/annotators/lm_based.py +114 -0
  197. bead/simulation/annotators/oracle.py +182 -0
  198. bead/simulation/annotators/random.py +181 -0
  199. bead/simulation/dsl_extension/__init__.py +3 -0
  200. bead/simulation/noise_models/__init__.py +13 -0
  201. bead/simulation/noise_models/base.py +42 -0
  202. bead/simulation/noise_models/random_noise.py +82 -0
  203. bead/simulation/noise_models/systematic.py +132 -0
  204. bead/simulation/noise_models/temperature.py +86 -0
  205. bead/simulation/runner.py +144 -0
  206. bead/simulation/strategies/__init__.py +23 -0
  207. bead/simulation/strategies/base.py +123 -0
  208. bead/simulation/strategies/binary.py +103 -0
  209. bead/simulation/strategies/categorical.py +123 -0
  210. bead/simulation/strategies/cloze.py +224 -0
  211. bead/simulation/strategies/forced_choice.py +127 -0
  212. bead/simulation/strategies/free_text.py +105 -0
  213. bead/simulation/strategies/magnitude.py +116 -0
  214. bead/simulation/strategies/multi_select.py +129 -0
  215. bead/simulation/strategies/ordinal_scale.py +131 -0
  216. bead/templates/__init__.py +27 -0
  217. bead/templates/adapters/__init__.py +17 -0
  218. bead/templates/adapters/base.py +128 -0
  219. bead/templates/adapters/cache.py +178 -0
  220. bead/templates/adapters/huggingface.py +312 -0
  221. bead/templates/combinatorics.py +103 -0
  222. bead/templates/filler.py +605 -0
  223. bead/templates/renderers.py +177 -0
  224. bead/templates/resolver.py +178 -0
  225. bead/templates/strategies.py +1806 -0
  226. bead/templates/streaming.py +195 -0
  227. bead-0.1.0.dist-info/METADATA +212 -0
  228. bead-0.1.0.dist-info/RECORD +231 -0
  229. bead-0.1.0.dist-info/WHEEL +4 -0
  230. bead-0.1.0.dist-info/entry_points.txt +2 -0
  231. bead-0.1.0.dist-info/licenses/LICENSE +21 -0
bead/resources/lexicon.py
@@ -0,0 +1,744 @@
+ """Lexicon management for collections of lexical items.
+
+ This module provides the Lexicon class for managing, querying, and manipulating
+ collections of lexical items. It supports filtering, searching, merging, and
+ conversion to/from pandas and polars DataFrames.
+ """
+
+ from __future__ import annotations
+
+ import json
+ from collections.abc import Callable, Iterator
+ from pathlib import Path
+ from typing import Any, Literal
+ from uuid import UUID
+
+ import pandas as pd
+ import polars as pl
+ from pydantic import Field
+
+ from bead.data.base import BeadBaseModel
+ from bead.data.language_codes import LanguageCode
+ from bead.resources.lexical_item import LexicalItem
+
+ # Type alias for supported DataFrame types
+ DataFrame = pd.DataFrame | pl.DataFrame
+
+
+ def _empty_str_list() -> list[str]:
+     """Create an empty string list."""
+     return []
+
+
+ def _empty_item_dict() -> dict[UUID, LexicalItem]:
+     """Create an empty item dictionary."""
+     return {}
+
+
+ class Lexicon(BeadBaseModel):
+     """A collection of lexical items with operations for filtering and analysis.
+
+     The Lexicon class manages collections of LexicalItem objects and provides
+     methods for:
+     - Adding and removing items (CRUD operations)
+     - Filtering by properties, features, and attributes
+     - Searching by text
+     - Merging with other lexicons
+     - Converting to/from pandas and polars DataFrames
+     - Serialization to JSONLines
+
+     Attributes
+     ----------
+     name : str
+         Name of the lexicon.
+     description : str | None
+         Optional description of the lexicon's purpose.
+     language_code : LanguageCode | None
+         ISO 639-1 (2-letter) or ISO 639-3 (3-letter) language code.
+         Examples: "en", "eng", "ko", "kor", "zu", "zul".
+         Automatically validated and normalized to lowercase.
+     items : dict[UUID, LexicalItem]
+         Dictionary of items indexed by their UUIDs.
+     tags : list[str]
+         Tags for categorizing the lexicon.
+
+     Examples
+     --------
+     >>> lexicon = Lexicon(name="verbs")
+     >>> item = LexicalItem(lemma="walk", pos="VERB")
+     >>> lexicon.add(item)
+     >>> len(lexicon)
+     1
+     >>> verbs = lexicon.filter_by_pos("VERB")
+     >>> len(verbs.items)
+     1
+     """
+
+     name: str
+     description: str | None = None
+     language_code: LanguageCode | None = None
+     items: dict[UUID, LexicalItem] = Field(default_factory=_empty_item_dict)
+     tags: list[str] = Field(default_factory=_empty_str_list)
+
+     def __len__(self) -> int:
+         """Return number of items in lexicon.
+
+         Returns
+         -------
+         int
+             Number of items in the lexicon.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> len(lexicon)
+         0
+         >>> lexicon.add(LexicalItem(lemma="test"))
+         >>> len(lexicon)
+         1
+         """
+         return len(self.items)
+
+     def __iter__(self) -> Iterator[LexicalItem]:  # type: ignore[override]
+         """Iterate over items in lexicon.
+
+         Returns
+         -------
+         Iterator[LexicalItem]
+             Iterator over lexical items.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> lexicon.add(LexicalItem(lemma="walk"))
+         >>> lexicon.add(LexicalItem(lemma="run"))
+         >>> [item.lemma for item in lexicon]
+         ['walk', 'run']
+         """
+         return iter(self.items.values())
+
+     def __contains__(self, item_id: UUID) -> bool:
+         """Check if item ID is in lexicon.
+
+         Parameters
+         ----------
+         item_id : UUID
+             The item ID to check.
+
+         Returns
+         -------
+         bool
+             True if item ID exists in lexicon.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> item = LexicalItem(lemma="test")
+         >>> lexicon.add(item)
+         >>> item.id in lexicon
+         True
+         """
+         return item_id in self.items
+
+     def add(self, item: LexicalItem) -> None:
+         """Add a lexical item to the lexicon.
+
+         Parameters
+         ----------
+         item : LexicalItem
+             The item to add.
+
+         Raises
+         ------
+         ValueError
+             If item with same ID already exists.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> item = LexicalItem(lemma="walk")
+         >>> lexicon.add(item)
+         >>> len(lexicon)
+         1
+         """
+         if item.id in self.items:
+             raise ValueError(f"Item with ID {item.id} already exists in lexicon")
+         self.items[item.id] = item
+         self.update_modified_time()
+
+     def add_many(self, items: list[LexicalItem]) -> None:
+         """Add multiple items to the lexicon.
+
+         Parameters
+         ----------
+         items : list[LexicalItem]
+             The items to add.
+
+         Raises
+         ------
+         ValueError
+             If any item with same ID already exists.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> items = [LexicalItem(lemma="walk"), LexicalItem(lemma="run")]
+         >>> lexicon.add_many(items)
+         >>> len(lexicon)
+         2
+         """
+         for item in items:
+             self.add(item)
+
+     def remove(self, item_id: UUID) -> LexicalItem:
+         """Remove and return an item by ID.
+
+         Parameters
+         ----------
+         item_id : UUID
+             The ID of the item to remove.
+
+         Returns
+         -------
+         LexicalItem
+             The removed item.
+
+         Raises
+         ------
+         KeyError
+             If item ID not found.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> item = LexicalItem(lemma="walk")
+         >>> lexicon.add(item)
+         >>> removed = lexicon.remove(item.id)
+         >>> removed.lemma
+         'walk'
+         >>> len(lexicon)
+         0
+         """
+         if item_id not in self.items:
+             raise KeyError(f"Item with ID {item_id} not found in lexicon")
+         item = self.items.pop(item_id)
+         self.update_modified_time()
+         return item
+
+     def get(self, item_id: UUID) -> LexicalItem | None:
+         """Get an item by ID, or None if not found.
+
+         Parameters
+         ----------
+         item_id : UUID
+             The ID of the item to get.
+
+         Returns
+         -------
+         LexicalItem | None
+             The item if found, None otherwise.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> item = LexicalItem(lemma="walk")
+         >>> lexicon.add(item)
+         >>> retrieved = lexicon.get(item.id)
+         >>> retrieved.lemma  # doctest: +SKIP
+         'walk'
+         >>> from uuid import uuid4
+         >>> lexicon.get(uuid4()) is None
+         True
+         """
+         return self.items.get(item_id)
+
+     def filter(self, predicate: Callable[[LexicalItem], bool]) -> Lexicon:
+         """Filter items by a predicate function.
+
+         Creates a new lexicon containing only items that satisfy the predicate.
+
+         Parameters
+         ----------
+         predicate : Callable[[LexicalItem], bool]
+             Function that returns True for items to include.
+
+         Returns
+         -------
+         Lexicon
+             New lexicon with filtered items.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> lexicon.add(LexicalItem(lemma="walk", pos="VERB"))
+         >>> lexicon.add(LexicalItem(lemma="dog", pos="NOUN"))
+         >>> verbs = lexicon.filter(lambda item: item.pos == "VERB")
+         >>> len(verbs.items)
+         1
+         """
+         filtered = Lexicon(
+             name=f"{self.name}_filtered",
+             description=self.description,
+             language_code=self.language_code,
+             tags=self.tags.copy(),
+         )
+         filtered.items = {
+             item_id: item for item_id, item in self.items.items() if predicate(item)
+         }
+         return filtered
+
+     def filter_by_pos(self, pos: str) -> Lexicon:
+         """Filter items by part of speech.
+
+         Parameters
+         ----------
+         pos : str
+             The part of speech to filter by.
+
+         Returns
+         -------
+         Lexicon
+             New lexicon with items matching the POS.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test", language_code="eng")
+         >>> lexicon.add(LexicalItem(
+         ...     lemma="walk", language_code="eng", features={"pos": "VERB"}
+         ... ))
+         >>> lexicon.add(LexicalItem(
+         ...     lemma="dog", language_code="eng", features={"pos": "NOUN"}
+         ... ))
+         >>> verbs = lexicon.filter_by_pos("VERB")
+         >>> len(verbs.items)
+         1
+         """
+         return self.filter(
+             lambda item: (
+                 item.features.get("pos") is not None and item.features.get("pos") == pos
+             )
+         )
+
+     def filter_by_lemma(self, lemma: str) -> Lexicon:
+         """Filter items by lemma (exact match).
+
+         Parameters
+         ----------
+         lemma : str
+             The lemma to filter by.
+
+         Returns
+         -------
+         Lexicon
+             New lexicon with items matching the lemma.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> lexicon.add(LexicalItem(lemma="walk"))
+         >>> lexicon.add(LexicalItem(lemma="run"))
+         >>> results = lexicon.filter_by_lemma("walk")
+         >>> len(results.items)
+         1
+         """
+         return self.filter(lambda item: item.lemma == lemma)
+
+     def filter_by_feature(self, feature_name: str, feature_value: Any) -> Lexicon:
+         """Filter items by a specific feature value.
+
+         Parameters
+         ----------
+         feature_name : str
+             The name of the feature.
+         feature_value : Any
+             The value to match.
+
+         Returns
+         -------
+         Lexicon
+             New lexicon with items having the specified feature value.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> lexicon.add(LexicalItem(lemma="walk", features={"tense": "present"}))
+         >>> lexicon.add(LexicalItem(lemma="walked", features={"tense": "past"}))
+         >>> present = lexicon.filter_by_feature("tense", "present")
+         >>> len(present.items)
+         1
+         """
+         return self.filter(
+             lambda item: (
+                 feature_name in item.features
+                 and item.features[feature_name] == feature_value
+             )
+         )
+
+     def filter_by_attribute(self, attr_name: str, attr_value: Any) -> Lexicon:
+         """Filter items by a specific attribute value.
+
+         Parameters
+         ----------
+         attr_name : str
+             The name of the attribute.
+         attr_value : Any
+             The value to match.
+
+         Returns
+         -------
+         Lexicon
+             New lexicon with items having the specified attribute value.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> lexicon.add(LexicalItem(
+         ...     lemma="walk", language_code="eng", features={"frequency": 1000}
+         ... ))
+         >>> lexicon.add(LexicalItem(
+         ...     lemma="saunter", language_code="eng", features={"frequency": 10}
+         ... ))
+         >>> high_freq = lexicon.filter_by_attribute("frequency", 1000)
+         >>> len(high_freq.items)
+         1
+         """
+         return self.filter(
+             lambda item: (
+                 attr_name in item.features and item.features[attr_name] == attr_value
+             )
+         )
+
+     def search(self, query: str, field: str = "lemma") -> Lexicon:
+         """Search for items containing query string in specified field.
+
+         Parameters
+         ----------
+         query : str
+             Search string (case-insensitive substring match).
+         field : str
+             Field to search in ("lemma", "pos", "form").
+
+         Returns
+         -------
+         Lexicon
+             New lexicon with matching items.
+
+         Raises
+         ------
+         ValueError
+             If field is not a valid searchable field.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> lexicon.add(LexicalItem(lemma="walk"))
+         >>> lexicon.add(LexicalItem(lemma="run"))
+         >>> results = lexicon.search("wa")
+         >>> len(results.items)
+         1
+         """
+         query_lower = query.lower()
+
+         if field == "lemma":
+             return self.filter(lambda item: query_lower in item.lemma.lower())
+         elif field == "pos":
+             return self.filter(
+                 lambda item: (
+                     item.features.get("pos") is not None
+                     and query_lower in str(item.features.get("pos")).lower()
+                 )
+             )
+         elif field == "form":
+             return self.filter(
+                 lambda item: item.form is not None and query_lower in item.form.lower()
+             )
+         else:
+             raise ValueError(
+                 f"Invalid field '{field}'. Must be 'lemma', 'pos', or 'form'."
+             )
+
+     def merge(
+         self,
+         other: Lexicon,
+         strategy: Literal["keep_first", "keep_second", "error"] = "keep_first",
+     ) -> Lexicon:
+         """Merge with another lexicon.
+
+         Parameters
+         ----------
+         other : Lexicon
+             The lexicon to merge with.
+         strategy : Literal["keep_first", "keep_second", "error"]
+             How to handle duplicate IDs:
+             - "keep_first": Keep item from self
+             - "keep_second": Keep item from other
+             - "error": Raise error on duplicates
+
+         Returns
+         -------
+         Lexicon
+             New merged lexicon.
+
+         Raises
+         ------
+         ValueError
+             If strategy is "error" and duplicates found.
+
+         Examples
+         --------
+         >>> lex1 = Lexicon(name="lex1")
+         >>> lex1.add(LexicalItem(lemma="walk"))
+         >>> lex2 = Lexicon(name="lex2")
+         >>> lex2.add(LexicalItem(lemma="run"))
+         >>> merged = lex1.merge(lex2)
+         >>> len(merged.items)
+         2
+         """
+         # Check for duplicates if strategy is "error"
+         if strategy == "error":
+             duplicates = set(self.items.keys()) & set(other.items.keys())
+             if duplicates:
+                 raise ValueError(
+                     f"Duplicate item IDs found: {duplicates}. "
+                     "Use strategy='keep_first' or 'keep_second' to resolve."
+                 )
+
+         # Create merged lexicon
+         # Use language_code from self, or other if self's is None
+         language_code = self.language_code or other.language_code
+
+         merged = Lexicon(
+             name=f"{self.name}_merged",
+             description=self.description,
+             language_code=language_code,
+             tags=list(set(self.tags + other.tags)),
+         )
+
+         # Add items based on strategy
+         if strategy == "keep_first":
+             merged.items = {**other.items, **self.items}
+         elif strategy == "keep_second":
+             merged.items = {**self.items, **other.items}
+         else:  # strategy == "error" already handled above
+             merged.items = {**self.items, **other.items}
+
+         return merged
+
+     def to_dataframe(
+         self, backend: Literal["pandas", "polars"] = "pandas"
+     ) -> DataFrame:
+         """Convert lexicon to DataFrame.
+
+         Parameters
+         ----------
+         backend : Literal["pandas", "polars"]
+             DataFrame backend to use (default: "pandas").
+
+         Returns
+         -------
+         DataFrame
+             pandas or polars DataFrame with columns: id, lemma, pos, form,
+             source, created_at, modified_at, plus separate columns for
+             each feature and attribute.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> lexicon.add(LexicalItem(lemma="walk", pos="VERB"))
+         >>> df = lexicon.to_dataframe()
+         >>> "lemma" in df.columns
+         True
+         >>> "pos" in df.columns
+         True
+         """
+         if not self.items:
+             # Return empty DataFrame with expected columns
+             columns = [
+                 "id",
+                 "lemma",
+                 "pos",
+                 "form",
+                 "source",
+                 "created_at",
+                 "modified_at",
+             ]
+             if backend == "pandas":
+                 return pd.DataFrame(columns=columns)
+             else:
+                 schema: dict[str, type[pl.Utf8]] = dict.fromkeys(columns, pl.Utf8)
+                 return pl.DataFrame(schema=schema)
+
+         rows = []
+         for item in self.items.values():
+             row = {
+                 "id": str(item.id),
+                 "lemma": item.lemma,
+                 "form": item.form,
+                 "language_code": item.language_code,
+                 "source": item.source,
+                 "created_at": item.created_at.isoformat(),
+                 "modified_at": item.modified_at.isoformat(),
+             }
+
+             # Add features with "feature_" prefix
+             for key, value in item.features.items():
+                 row[f"feature_{key}"] = value
+
+             rows.append(row)  # type: ignore[arg-type]
+
+         if backend == "pandas":
+             return pd.DataFrame(rows)
+         else:
+             return pl.DataFrame(rows)
+
+     @classmethod
+     def from_dataframe(cls, df: DataFrame, name: str) -> Lexicon:
+         """Create lexicon from DataFrame.
+
+         Parameters
+         ----------
+         df : DataFrame
+             pandas or polars DataFrame with at minimum a 'lemma' column.
+         name : str
+             Name for the lexicon.
+
+         Returns
+         -------
+         Lexicon
+             New lexicon created from DataFrame.
+
+         Raises
+         ------
+         ValueError
+             If DataFrame does not have a 'lemma' column.
+
+         Examples
+         --------
+         >>> import pandas as pd
+         >>> df = pd.DataFrame({"lemma": ["walk", "run"], "pos": ["VERB", "VERB"]})
+         >>> lexicon = Lexicon.from_dataframe(df, "verbs")
+         >>> len(lexicon.items)
+         2
+         """
+         # Check if it's a polars DataFrame
+         is_polars = isinstance(df, pl.DataFrame)
+
+         # Get columns, handling both pandas and polars
+         if is_polars:
+             assert isinstance(df, pl.DataFrame)
+             columns_list: list[str] = df.columns
+         else:
+             assert isinstance(df, pd.DataFrame)
+             columns_list = list(df.columns)
+
+         if "lemma" not in columns_list:
+             raise ValueError("DataFrame must have a 'lemma' column")
+
+         lexicon = cls(name=name)
+
+         # Convert to dict format for iteration
+         rows: list[dict[str, Any]]
+         if is_polars:
+             assert isinstance(df, pl.DataFrame)
+             rows = df.to_dicts()
+         else:
+             assert isinstance(df, pd.DataFrame)
+             rows = df.to_dict("records")  # type: ignore[assignment]
+
+         for row in rows:
+             # Extract base fields
+             item_data: dict[str, Any] = {"lemma": row["lemma"]}
+
+             # Helper function to check for null values
+             def is_not_null(value: Any) -> bool:
+                 if is_polars:
+                     return value is not None
+                 else:
+                     return pd.notna(value)  # type: ignore[no-any-return]
+
+             # Handle language_code (required field)
+             if "language_code" in row and is_not_null(row["language_code"]):
+                 item_data["language_code"] = row["language_code"]
+             else:
+                 item_data["language_code"] = "eng"  # Default to English
+
+             if "form" in row and is_not_null(row["form"]):
+                 item_data["form"] = row["form"]
+             if "source" in row and is_not_null(row["source"]):
+                 item_data["source"] = row["source"]
+
+             # Extract features (columns with "feature_" prefix, "pos", or "attr_" prefix)  # noqa: E501
+             features: dict[str, Any] = {}
+             if "pos" in row and is_not_null(row["pos"]):
+                 features["pos"] = row["pos"]
+             for col in columns_list:
+                 if col.startswith("feature_") and is_not_null(row[col]):
+                     feature_name: str = col[len("feature_") :]
+                     features[feature_name] = row[col]
+                 elif col.startswith("attr_") and is_not_null(row[col]):
+                     attr_name: str = col[len("attr_") :]
+                     features[attr_name] = row[col]
+
+             if features:
+                 item_data["features"] = features
+
+             item = LexicalItem(**item_data)
+             lexicon.add(item)
+
+         return lexicon
+
+     def to_jsonl(self, path: str) -> None:
+         """Save lexicon to JSONLines file (one item per line).
+
+         Parameters
+         ----------
+         path : str
+             Path to the output file.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon(name="test")
+         >>> lexicon.add(LexicalItem(lemma="walk"))
+         >>> lexicon.to_jsonl("/tmp/lexicon.jsonl")  # doctest: +SKIP
+         """
+         file_path = Path(path)
+         file_path.parent.mkdir(parents=True, exist_ok=True)
+
+         with open(file_path, "w", encoding="utf-8") as f:
+             for item in self.items.values():
+                 f.write(item.model_dump_json() + "\n")
+
+     @classmethod
+     def from_jsonl(cls, path: str, name: str) -> Lexicon:
+         """Load lexicon from JSONLines file.
+
+         Parameters
+         ----------
+         path : str
+             Path to the input file.
+         name : str
+             Name for the lexicon.
+
+         Returns
+         -------
+         Lexicon
+             New lexicon loaded from file.
+
+         Examples
+         --------
+         >>> lexicon = Lexicon.from_jsonl(
+         ...     "/tmp/lexicon.jsonl", "loaded"
+         ... )  # doctest: +SKIP
+         """
+         lexicon = cls(name=name)
+         file_path = Path(path)
+
+         with open(file_path, encoding="utf-8") as f:
+             for line in f:
+                 line = line.strip()
+                 if line:
+                     item_data = json.loads(line)
+                     item = LexicalItem(**item_data)
+                     lexicon.add(item)
+
+         return lexicon
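
For orientation, here is a minimal usage sketch of the Lexicon API added in bead/resources/lexicon.py above. It is not part of the package itself, and it assumes that LexicalItem accepts lemma, language_code, and features keyword arguments and that Lexicon accepts a plain "eng" language code string, as this module's docstrings and from_dataframe implementation suggest.

# Hypothetical usage sketch for the Lexicon class shown in the diff above (not part of bead).
import pandas as pd

from bead.resources.lexical_item import LexicalItem
from bead.resources.lexicon import Lexicon

# Build a lexicon from a DataFrame; the "pos" column and "feature_"-prefixed
# columns are stored as item features by Lexicon.from_dataframe.
df = pd.DataFrame(
    {
        "lemma": ["walk", "walked", "dog"],
        "language_code": ["eng", "eng", "eng"],
        "pos": ["VERB", "VERB", "NOUN"],
        "feature_tense": ["present", "past", None],
    }
)
lexicon = Lexicon.from_dataframe(df, name="demo")

# Filtering and searching return new Lexicon objects rather than mutating in place.
verbs = lexicon.filter_by_pos("VERB")
past_verbs = verbs.filter_by_feature("tense", "past")
matches = lexicon.search("wal", field="lemma")

# Merge with another lexicon and persist as JSONLines.
extra = Lexicon(name="extra", language_code="eng")
extra.add(LexicalItem(lemma="run", language_code="eng", features={"pos": "VERB"}))
merged = lexicon.merge(extra, strategy="keep_first")
merged.to_jsonl("demo_lexicon.jsonl")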