bead 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- bead/__init__.py +11 -0
- bead/__main__.py +11 -0
- bead/active_learning/__init__.py +15 -0
- bead/active_learning/config.py +231 -0
- bead/active_learning/loop.py +566 -0
- bead/active_learning/models/__init__.py +24 -0
- bead/active_learning/models/base.py +852 -0
- bead/active_learning/models/binary.py +910 -0
- bead/active_learning/models/categorical.py +943 -0
- bead/active_learning/models/cloze.py +862 -0
- bead/active_learning/models/forced_choice.py +956 -0
- bead/active_learning/models/free_text.py +773 -0
- bead/active_learning/models/lora.py +365 -0
- bead/active_learning/models/magnitude.py +835 -0
- bead/active_learning/models/multi_select.py +795 -0
- bead/active_learning/models/ordinal_scale.py +811 -0
- bead/active_learning/models/peft_adapter.py +155 -0
- bead/active_learning/models/random_effects.py +639 -0
- bead/active_learning/selection.py +354 -0
- bead/active_learning/strategies.py +391 -0
- bead/active_learning/trainers/__init__.py +26 -0
- bead/active_learning/trainers/base.py +210 -0
- bead/active_learning/trainers/data_collator.py +172 -0
- bead/active_learning/trainers/dataset_utils.py +261 -0
- bead/active_learning/trainers/huggingface.py +304 -0
- bead/active_learning/trainers/lightning.py +324 -0
- bead/active_learning/trainers/metrics.py +424 -0
- bead/active_learning/trainers/mixed_effects.py +551 -0
- bead/active_learning/trainers/model_wrapper.py +509 -0
- bead/active_learning/trainers/registry.py +104 -0
- bead/adapters/__init__.py +11 -0
- bead/adapters/huggingface.py +61 -0
- bead/behavioral/__init__.py +116 -0
- bead/behavioral/analytics.py +646 -0
- bead/behavioral/extraction.py +343 -0
- bead/behavioral/merging.py +343 -0
- bead/cli/__init__.py +11 -0
- bead/cli/active_learning.py +513 -0
- bead/cli/active_learning_commands.py +779 -0
- bead/cli/completion.py +359 -0
- bead/cli/config.py +624 -0
- bead/cli/constraint_builders.py +286 -0
- bead/cli/deployment.py +859 -0
- bead/cli/deployment_trials.py +493 -0
- bead/cli/deployment_ui.py +332 -0
- bead/cli/display.py +378 -0
- bead/cli/items.py +960 -0
- bead/cli/items_factories.py +776 -0
- bead/cli/list_constraints.py +714 -0
- bead/cli/lists.py +490 -0
- bead/cli/main.py +430 -0
- bead/cli/models.py +877 -0
- bead/cli/resource_loaders.py +621 -0
- bead/cli/resources.py +1036 -0
- bead/cli/shell.py +356 -0
- bead/cli/simulate.py +840 -0
- bead/cli/templates.py +1158 -0
- bead/cli/training.py +1080 -0
- bead/cli/utils.py +614 -0
- bead/cli/workflow.py +1273 -0
- bead/config/__init__.py +68 -0
- bead/config/active_learning.py +1009 -0
- bead/config/config.py +192 -0
- bead/config/defaults.py +118 -0
- bead/config/deployment.py +217 -0
- bead/config/env.py +147 -0
- bead/config/item.py +45 -0
- bead/config/list.py +193 -0
- bead/config/loader.py +149 -0
- bead/config/logging.py +42 -0
- bead/config/model.py +49 -0
- bead/config/paths.py +46 -0
- bead/config/profiles.py +320 -0
- bead/config/resources.py +47 -0
- bead/config/serialization.py +210 -0
- bead/config/simulation.py +206 -0
- bead/config/template.py +238 -0
- bead/config/validation.py +267 -0
- bead/data/__init__.py +65 -0
- bead/data/base.py +87 -0
- bead/data/identifiers.py +97 -0
- bead/data/language_codes.py +61 -0
- bead/data/metadata.py +270 -0
- bead/data/range.py +123 -0
- bead/data/repository.py +358 -0
- bead/data/serialization.py +249 -0
- bead/data/timestamps.py +89 -0
- bead/data/validation.py +349 -0
- bead/data_collection/__init__.py +11 -0
- bead/data_collection/jatos.py +223 -0
- bead/data_collection/merger.py +154 -0
- bead/data_collection/prolific.py +198 -0
- bead/deployment/__init__.py +5 -0
- bead/deployment/distribution.py +402 -0
- bead/deployment/jatos/__init__.py +1 -0
- bead/deployment/jatos/api.py +200 -0
- bead/deployment/jatos/exporter.py +210 -0
- bead/deployment/jspsych/__init__.py +9 -0
- bead/deployment/jspsych/biome.json +44 -0
- bead/deployment/jspsych/config.py +411 -0
- bead/deployment/jspsych/generator.py +598 -0
- bead/deployment/jspsych/package.json +51 -0
- bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
- bead/deployment/jspsych/randomizer.py +299 -0
- bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
- bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
- bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
- bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
- bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
- bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
- bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
- bead/deployment/jspsych/src/plugins/rating.ts +248 -0
- bead/deployment/jspsych/src/slopit/index.ts +9 -0
- bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
- bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
- bead/deployment/jspsych/templates/experiment.css +1 -0
- bead/deployment/jspsych/templates/experiment.js.template +289 -0
- bead/deployment/jspsych/templates/index.html +51 -0
- bead/deployment/jspsych/templates/randomizer.js +241 -0
- bead/deployment/jspsych/templates/randomizer.js.template +313 -0
- bead/deployment/jspsych/trials.py +723 -0
- bead/deployment/jspsych/tsconfig.json +23 -0
- bead/deployment/jspsych/tsup.config.ts +30 -0
- bead/deployment/jspsych/ui/__init__.py +1 -0
- bead/deployment/jspsych/ui/components.py +383 -0
- bead/deployment/jspsych/ui/styles.py +411 -0
- bead/dsl/__init__.py +80 -0
- bead/dsl/ast.py +168 -0
- bead/dsl/context.py +178 -0
- bead/dsl/errors.py +71 -0
- bead/dsl/evaluator.py +570 -0
- bead/dsl/grammar.lark +81 -0
- bead/dsl/parser.py +231 -0
- bead/dsl/stdlib.py +929 -0
- bead/evaluation/__init__.py +13 -0
- bead/evaluation/convergence.py +485 -0
- bead/evaluation/interannotator.py +398 -0
- bead/items/__init__.py +40 -0
- bead/items/adapters/__init__.py +70 -0
- bead/items/adapters/anthropic.py +224 -0
- bead/items/adapters/api_utils.py +167 -0
- bead/items/adapters/base.py +216 -0
- bead/items/adapters/google.py +259 -0
- bead/items/adapters/huggingface.py +1074 -0
- bead/items/adapters/openai.py +323 -0
- bead/items/adapters/registry.py +202 -0
- bead/items/adapters/sentence_transformers.py +224 -0
- bead/items/adapters/togetherai.py +309 -0
- bead/items/binary.py +515 -0
- bead/items/cache.py +558 -0
- bead/items/categorical.py +593 -0
- bead/items/cloze.py +757 -0
- bead/items/constructor.py +784 -0
- bead/items/forced_choice.py +413 -0
- bead/items/free_text.py +681 -0
- bead/items/generation.py +432 -0
- bead/items/item.py +396 -0
- bead/items/item_template.py +787 -0
- bead/items/magnitude.py +573 -0
- bead/items/multi_select.py +621 -0
- bead/items/ordinal_scale.py +569 -0
- bead/items/scoring.py +448 -0
- bead/items/validation.py +723 -0
- bead/lists/__init__.py +30 -0
- bead/lists/balancer.py +263 -0
- bead/lists/constraints.py +1067 -0
- bead/lists/experiment_list.py +286 -0
- bead/lists/list_collection.py +378 -0
- bead/lists/partitioner.py +1141 -0
- bead/lists/stratification.py +254 -0
- bead/participants/__init__.py +73 -0
- bead/participants/collection.py +699 -0
- bead/participants/merging.py +312 -0
- bead/participants/metadata_spec.py +491 -0
- bead/participants/models.py +276 -0
- bead/resources/__init__.py +29 -0
- bead/resources/adapters/__init__.py +19 -0
- bead/resources/adapters/base.py +104 -0
- bead/resources/adapters/cache.py +128 -0
- bead/resources/adapters/glazing.py +508 -0
- bead/resources/adapters/registry.py +117 -0
- bead/resources/adapters/unimorph.py +796 -0
- bead/resources/classification.py +856 -0
- bead/resources/constraint_builders.py +329 -0
- bead/resources/constraints.py +165 -0
- bead/resources/lexical_item.py +223 -0
- bead/resources/lexicon.py +744 -0
- bead/resources/loaders.py +209 -0
- bead/resources/template.py +441 -0
- bead/resources/template_collection.py +707 -0
- bead/resources/template_generation.py +349 -0
- bead/simulation/__init__.py +29 -0
- bead/simulation/annotators/__init__.py +15 -0
- bead/simulation/annotators/base.py +175 -0
- bead/simulation/annotators/distance_based.py +135 -0
- bead/simulation/annotators/lm_based.py +114 -0
- bead/simulation/annotators/oracle.py +182 -0
- bead/simulation/annotators/random.py +181 -0
- bead/simulation/dsl_extension/__init__.py +3 -0
- bead/simulation/noise_models/__init__.py +13 -0
- bead/simulation/noise_models/base.py +42 -0
- bead/simulation/noise_models/random_noise.py +82 -0
- bead/simulation/noise_models/systematic.py +132 -0
- bead/simulation/noise_models/temperature.py +86 -0
- bead/simulation/runner.py +144 -0
- bead/simulation/strategies/__init__.py +23 -0
- bead/simulation/strategies/base.py +123 -0
- bead/simulation/strategies/binary.py +103 -0
- bead/simulation/strategies/categorical.py +123 -0
- bead/simulation/strategies/cloze.py +224 -0
- bead/simulation/strategies/forced_choice.py +127 -0
- bead/simulation/strategies/free_text.py +105 -0
- bead/simulation/strategies/magnitude.py +116 -0
- bead/simulation/strategies/multi_select.py +129 -0
- bead/simulation/strategies/ordinal_scale.py +131 -0
- bead/templates/__init__.py +27 -0
- bead/templates/adapters/__init__.py +17 -0
- bead/templates/adapters/base.py +128 -0
- bead/templates/adapters/cache.py +178 -0
- bead/templates/adapters/huggingface.py +312 -0
- bead/templates/combinatorics.py +103 -0
- bead/templates/filler.py +605 -0
- bead/templates/renderers.py +177 -0
- bead/templates/resolver.py +178 -0
- bead/templates/strategies.py +1806 -0
- bead/templates/streaming.py +195 -0
- bead-0.1.0.dist-info/METADATA +212 -0
- bead-0.1.0.dist-info/RECORD +231 -0
- bead-0.1.0.dist-info/WHEEL +4 -0
- bead-0.1.0.dist-info/entry_points.txt +2 -0
- bead-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,744 @@
|
|
|
1
|
+
"""Lexicon management for collections of lexical items.
|
|
2
|
+
|
|
3
|
+
This module provides the Lexicon class for managing, querying, and manipulating
|
|
4
|
+
collections of lexical items. It supports filtering, searching, merging, and
|
|
5
|
+
conversion to/from pandas and polars DataFrames.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import json
|
|
11
|
+
from collections.abc import Callable, Iterator
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any, Literal
|
|
14
|
+
from uuid import UUID
|
|
15
|
+
|
|
16
|
+
import pandas as pd
|
|
17
|
+
import polars as pl
|
|
18
|
+
from pydantic import Field
|
|
19
|
+
|
|
20
|
+
from bead.data.base import BeadBaseModel
|
|
21
|
+
from bead.data.language_codes import LanguageCode
|
|
22
|
+
from bead.resources.lexical_item import LexicalItem
|
|
23
|
+
|
|
24
|
+
# Type alias for the DataFrame backends accepted/returned by
# Lexicon.to_dataframe and Lexicon.from_dataframe.
DataFrame = pd.DataFrame | pl.DataFrame
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _empty_str_list() -> list[str]:
    """Return a fresh, empty list of strings.

    Module-level factory used as a pydantic ``default_factory`` so every
    model instance receives its own list (a lambda would work too, but a
    named function keeps the model definition introspectable).
    """
    fresh: list[str] = []
    return fresh
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def _empty_item_dict() -> dict[UUID, LexicalItem]:
    """Return a fresh, empty UUID-to-item mapping.

    Module-level factory used as a pydantic ``default_factory`` so every
    model instance receives its own dictionary rather than sharing one
    mutable default.
    """
    fresh: dict[UUID, LexicalItem] = {}
    return fresh
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class Lexicon(BeadBaseModel):
    """A collection of lexical items with operations for filtering and analysis.

    The Lexicon class manages collections of LexicalItem objects and provides
    methods for:
    - Adding and removing items (CRUD operations)
    - Filtering by properties, features, and attributes
    - Searching by text
    - Merging with other lexicons
    - Converting to/from pandas and polars DataFrames
    - Serialization to JSONLines

    Attributes
    ----------
    name : str
        Name of the lexicon.
    description : str | None
        Optional description of the lexicon's purpose.
    language_code : LanguageCode | None
        ISO 639-1 (2-letter) or ISO 639-3 (3-letter) language code.
        Examples: "en", "eng", "ko", "kor", "zu", "zul".
        Automatically validated and normalized to lowercase.
    items : dict[UUID, LexicalItem]
        Dictionary of items indexed by their UUIDs.
    tags : list[str]
        Tags for categorizing the lexicon.

    Examples
    --------
    >>> lexicon = Lexicon(name="verbs")
    >>> item = LexicalItem(lemma="walk", pos="VERB")
    >>> lexicon.add(item)
    >>> len(lexicon)
    1
    >>> verbs = lexicon.filter_by_pos("VERB")
    >>> len(verbs.items)
    1
    """

    name: str
    description: str | None = None
    language_code: LanguageCode | None = None
    # Mutable defaults go through module-level factories so each instance
    # gets its own container (never a shared default).
    items: dict[UUID, LexicalItem] = Field(default_factory=_empty_item_dict)
    tags: list[str] = Field(default_factory=_empty_str_list)
|
|
82
|
+
|
|
83
|
+
def __len__(self) -> int:
    """Return how many lexical items the lexicon holds.

    Returns
    -------
    int
        Number of stored items.
    """
    item_count = len(self.items)
    return item_count
|
|
101
|
+
|
|
102
|
+
def __iter__(self) -> Iterator[LexicalItem]:  # type: ignore[override]
    """Iterate over the stored lexical items (not their UUID keys).

    Returns
    -------
    Iterator[LexicalItem]
        Items in insertion order.
    """
    yield from self.items.values()
|
|
119
|
+
|
|
120
|
+
def __contains__(self, item_id: UUID) -> bool:
    """Report whether an item with the given UUID exists in the lexicon.

    Parameters
    ----------
    item_id : UUID
        Identifier to test for membership.

    Returns
    -------
    bool
        True if an item is stored under ``item_id``.
    """
    found = item_id in self.items
    return found
|
|
142
|
+
|
|
143
|
+
def add(self, item: LexicalItem) -> None:
    """Insert a lexical item, rejecting duplicate IDs.

    Updates the lexicon's modification timestamp on success.

    Parameters
    ----------
    item : LexicalItem
        Item to insert; its ``id`` must not already be present.

    Raises
    ------
    ValueError
        If an item with the same ID already exists.
    """
    # Guard clause: refuse silent overwrites of an existing item.
    if item.id in self.items:
        raise ValueError(f"Item with ID {item.id} already exists in lexicon")
    self.items[item.id] = item
    self.update_modified_time()
|
|
168
|
+
|
|
169
|
+
def add_many(self, items: list[LexicalItem]) -> None:
    """Insert every item in *items*, failing fast on the first duplicate.

    Parameters
    ----------
    items : list[LexicalItem]
        Items to insert; each one goes through :meth:`add`.

    Raises
    ------
    ValueError
        If any item's ID is already present. Items added before the
        duplicate remain in the lexicon.
    """
    for entry in items:
        self.add(entry)
|
|
192
|
+
|
|
193
|
+
def remove(self, item_id: UUID) -> LexicalItem:
    """Remove and return the item stored under *item_id*.

    Updates the lexicon's modification timestamp on success.

    Parameters
    ----------
    item_id : UUID
        The ID of the item to remove.

    Returns
    -------
    LexicalItem
        The removed item.

    Raises
    ------
    KeyError
        If item ID not found.

    Examples
    --------
    >>> lexicon = Lexicon(name="test")
    >>> item = LexicalItem(lemma="walk")
    >>> lexicon.add(item)
    >>> removed = lexicon.remove(item.id)
    >>> removed.lemma
    'walk'
    >>> len(lexicon)
    0
    """
    # EAFP: a single pop() replaces the membership-test-then-pop pair,
    # avoiding a double dict lookup while keeping the same error message.
    try:
        item = self.items.pop(item_id)
    except KeyError:
        raise KeyError(f"Item with ID {item_id} not found in lexicon") from None
    self.update_modified_time()
    return item
|
|
227
|
+
|
|
228
|
+
def get(self, item_id: UUID) -> LexicalItem | None:
    """Look up an item by ID, returning None when absent.

    Parameters
    ----------
    item_id : UUID
        Identifier of the item to fetch.

    Returns
    -------
    LexicalItem | None
        The matching item, or None if no item has that ID.
    """
    # dict.get already implements the "None when missing" contract.
    result = self.items.get(item_id)
    return result
|
|
254
|
+
|
|
255
|
+
def filter(self, predicate: Callable[[LexicalItem], bool]) -> Lexicon:
    """Build a new lexicon from the items that satisfy *predicate*.

    The original lexicon is untouched; the result's name gets a
    ``_filtered`` suffix while description, language code, and a copy of
    the tags carry over.

    Parameters
    ----------
    predicate : Callable[[LexicalItem], bool]
        Returns True for items to keep.

    Returns
    -------
    Lexicon
        New lexicon containing only the matching items.
    """
    subset = Lexicon(
        name=f"{self.name}_filtered",
        description=self.description,
        language_code=self.language_code,
        tags=list(self.tags),
    )
    subset.items = {
        uid: entry for uid, entry in self.items.items() if predicate(entry)
    }
    return subset
|
|
289
|
+
|
|
290
|
+
def filter_by_pos(self, pos: str) -> Lexicon:
    """Filter items by part of speech (stored under ``features["pos"]``).

    Parameters
    ----------
    pos : str
        The part of speech to filter by.

    Returns
    -------
    Lexicon
        New lexicon with items matching the POS.

    Examples
    --------
    >>> lexicon = Lexicon(name="test", language_code="eng")
    >>> lexicon.add(LexicalItem(
    ...     lemma="walk", language_code="eng", features={"pos": "VERB"}
    ... ))
    >>> lexicon.add(LexicalItem(
    ...     lemma="dog", language_code="eng", features={"pos": "NOUN"}
    ... ))
    >>> verbs = lexicon.filter_by_pos("VERB")
    >>> len(verbs.items)
    1
    """
    def has_pos(item: LexicalItem) -> bool:
        # Single dict lookup instead of calling .get("pos") twice; the
        # explicit None guard keeps items without a POS excluded even if
        # a caller passes pos=None.
        value = item.features.get("pos")
        return value is not None and value == pos

    return self.filter(has_pos)
|
|
321
|
+
|
|
322
|
+
def filter_by_lemma(self, lemma: str) -> Lexicon:
    """Filter items whose lemma exactly equals *lemma*.

    Parameters
    ----------
    lemma : str
        Lemma to match (exact, case-sensitive).

    Returns
    -------
    Lexicon
        New lexicon with only the matching items.
    """
    def matches(entry: LexicalItem) -> bool:
        return entry.lemma == lemma

    return self.filter(matches)
|
|
345
|
+
|
|
346
|
+
def filter_by_feature(self, feature_name: str, feature_value: Any) -> Lexicon:
    """Filter items whose *feature_name* feature equals *feature_value*.

    Items lacking the feature entirely are excluded, even when
    *feature_value* is None.

    Parameters
    ----------
    feature_name : str
        Name of the feature to inspect.
    feature_value : Any
        Value the feature must equal.

    Returns
    -------
    Lexicon
        New lexicon with only the matching items.
    """
    def matches(entry: LexicalItem) -> bool:
        feats = entry.features
        return feature_name in feats and feats[feature_name] == feature_value

    return self.filter(matches)
|
|
376
|
+
|
|
377
|
+
def filter_by_attribute(self, attr_name: str, attr_value: Any) -> Lexicon:
    """Filter items whose attribute *attr_name* equals *attr_value*.

    NOTE: attributes are stored in the same ``features`` mapping as
    linguistic features, so this behaves like
    :meth:`filter_by_feature` with different parameter names.

    Parameters
    ----------
    attr_name : str
        Name of the attribute to inspect.
    attr_value : Any
        Value the attribute must equal.

    Returns
    -------
    Lexicon
        New lexicon with only the matching items.
    """
    def matches(entry: LexicalItem) -> bool:
        feats = entry.features
        return attr_name in feats and feats[attr_name] == attr_value

    return self.filter(matches)
|
|
410
|
+
|
|
411
|
+
def search(self, query: str, field: str = "lemma") -> Lexicon:
    """Case-insensitive substring search over one item field.

    Parameters
    ----------
    query : str
        Substring to look for (case-insensitive).
    field : str
        Field to search in: "lemma", "pos" (read from features), or
        "form".

    Returns
    -------
    Lexicon
        New lexicon with matching items.

    Raises
    ------
    ValueError
        If *field* is not one of the searchable fields.
    """
    needle = query.lower()

    def match_lemma(entry: LexicalItem) -> bool:
        return needle in entry.lemma.lower()

    def match_pos(entry: LexicalItem) -> bool:
        pos = entry.features.get("pos")
        return pos is not None and needle in str(pos).lower()

    def match_form(entry: LexicalItem) -> bool:
        return entry.form is not None and needle in entry.form.lower()

    # Dispatch table instead of an if/elif chain.
    predicates = {
        "lemma": match_lemma,
        "pos": match_pos,
        "form": match_form,
    }
    try:
        predicate = predicates[field]
    except KeyError:
        raise ValueError(
            f"Invalid field '{field}'. Must be 'lemma', 'pos', or 'form'."
        ) from None
    return self.filter(predicate)
|
|
459
|
+
|
|
460
|
+
def merge(
    self,
    other: Lexicon,
    strategy: Literal["keep_first", "keep_second", "error"] = "keep_first",
) -> Lexicon:
    """Merge with another lexicon.

    Parameters
    ----------
    other : Lexicon
        The lexicon to merge with.
    strategy : Literal["keep_first", "keep_second", "error"]
        How to handle duplicate IDs:
        - "keep_first": Keep item from self
        - "keep_second": Keep item from other
        - "error": Raise error on duplicates

    Returns
    -------
    Lexicon
        New merged lexicon.

    Raises
    ------
    ValueError
        If strategy is "error" and duplicates found.

    Examples
    --------
    >>> lex1 = Lexicon(name="lex1")
    >>> lex1.add(LexicalItem(lemma="walk"))
    >>> lex2 = Lexicon(name="lex2")
    >>> lex2.add(LexicalItem(lemma="run"))
    >>> merged = lex1.merge(lex2)
    >>> len(merged.items)
    2
    """
    # Check for duplicates if strategy is "error"
    if strategy == "error":
        duplicates = set(self.items.keys()) & set(other.items.keys())
        if duplicates:
            raise ValueError(
                f"Duplicate item IDs found: {duplicates}. "
                "Use strategy='keep_first' or 'keep_second' to resolve."
            )

    # Create merged lexicon
    # Use language_code from self, or other if self's is None
    language_code = self.language_code or other.language_code

    # Tag union; note set() drops duplicates but also loses ordering.
    merged = Lexicon(
        name=f"{self.name}_merged",
        description=self.description,
        language_code=language_code,
        tags=list(set(self.tags + other.tags)),
    )

    # Add items based on strategy.
    # In dict unpacking the LATER mapping wins on key collisions, so the
    # lexicon whose copy should survive is unpacked second.
    if strategy == "keep_first":
        merged.items = {**other.items, **self.items}
    elif strategy == "keep_second":
        merged.items = {**self.items, **other.items}
    else:  # strategy == "error" already handled above
        merged.items = {**self.items, **other.items}

    return merged
|
|
526
|
+
|
|
527
|
+
def to_dataframe(
    self, backend: Literal["pandas", "polars"] = "pandas"
) -> DataFrame:
    """Convert lexicon to a DataFrame.

    Parameters
    ----------
    backend : Literal["pandas", "polars"]
        DataFrame backend to use (default: "pandas").

    Returns
    -------
    DataFrame
        pandas or polars DataFrame with columns: id, lemma, form,
        language_code, source, created_at, modified_at, plus one
        "feature_<name>" column per item feature.

    Examples
    --------
    >>> lexicon = Lexicon(name="test")
    >>> lexicon.add(LexicalItem(lemma="walk"))
    >>> df = lexicon.to_dataframe()
    >>> "lemma" in df.columns
    True
    """
    # One schema definition shared by the empty and populated paths, so
    # downstream code always sees consistent base columns. (Previously
    # the empty frame had a "pos" column and no "language_code", which
    # disagreed with the populated output.)
    base_columns = [
        "id",
        "lemma",
        "form",
        "language_code",
        "source",
        "created_at",
        "modified_at",
    ]

    if not self.items:
        if backend == "pandas":
            return pd.DataFrame(columns=base_columns)
        else:
            schema: dict[str, type[pl.Utf8]] = dict.fromkeys(base_columns, pl.Utf8)
            return pl.DataFrame(schema=schema)

    rows = []
    for item in self.items.values():
        row = {
            "id": str(item.id),
            "lemma": item.lemma,
            "form": item.form,
            "language_code": item.language_code,
            "source": item.source,
            # Timestamps serialized as ISO-8601 strings for portability.
            "created_at": item.created_at.isoformat(),
            "modified_at": item.modified_at.isoformat(),
        }

        # Each feature becomes its own column, namespaced with "feature_"
        # (the prefix from_dataframe strips back off).
        for key, value in item.features.items():
            row[f"feature_{key}"] = value

        rows.append(row)  # type: ignore[arg-type]

    if backend == "pandas":
        return pd.DataFrame(rows)
    else:
        return pl.DataFrame(rows)
|
|
593
|
+
|
|
594
|
+
@classmethod
def from_dataframe(cls, df: DataFrame, name: str) -> Lexicon:
    """Create lexicon from DataFrame.

    Parameters
    ----------
    df : DataFrame
        pandas or polars DataFrame with at minimum a 'lemma' column.
    name : str
        Name for the lexicon.

    Returns
    -------
    Lexicon
        New lexicon created from DataFrame.

    Raises
    ------
    ValueError
        If DataFrame does not have a 'lemma' column.

    Examples
    --------
    >>> import pandas as pd
    >>> df = pd.DataFrame({"lemma": ["walk", "run"], "pos": ["VERB", "VERB"]})
    >>> lexicon = Lexicon.from_dataframe(df, "verbs")
    >>> len(lexicon.items)
    2
    """
    # Check if it's a polars DataFrame
    is_polars = isinstance(df, pl.DataFrame)

    # Get columns, handling both pandas and polars
    if is_polars:
        assert isinstance(df, pl.DataFrame)
        columns_list: list[str] = df.columns
    else:
        assert isinstance(df, pd.DataFrame)
        columns_list = list(df.columns)

    if "lemma" not in columns_list:
        raise ValueError("DataFrame must have a 'lemma' column")

    # Null check differs by backend: polars rows carry real None while
    # pandas uses NaN/NaT, which pd.notna() detects. Loop-invariant, so
    # defined once here rather than redefined for every row.
    def is_not_null(value: Any) -> bool:
        if is_polars:
            return value is not None
        else:
            return pd.notna(value)  # type: ignore[no-any-return]

    lexicon = cls(name=name)

    # Convert to dict format for iteration
    rows: list[dict[str, Any]]
    if is_polars:
        assert isinstance(df, pl.DataFrame)
        rows = df.to_dicts()
    else:
        assert isinstance(df, pd.DataFrame)
        rows = df.to_dict("records")  # type: ignore[assignment]

    for row in rows:
        # Extract base fields
        item_data: dict[str, Any] = {"lemma": row["lemma"]}

        # language_code is required on LexicalItem; fall back to English
        # when the column is missing or null.
        if "language_code" in row and is_not_null(row["language_code"]):
            item_data["language_code"] = row["language_code"]
        else:
            item_data["language_code"] = "eng"  # Default to English

        if "form" in row and is_not_null(row["form"]):
            item_data["form"] = row["form"]
        if "source" in row and is_not_null(row["source"]):
            item_data["source"] = row["source"]

        # Features come from a bare "pos" column plus any column using
        # the "feature_" or "attr_" prefix (the prefix is stripped).
        features: dict[str, Any] = {}
        if "pos" in row and is_not_null(row["pos"]):
            features["pos"] = row["pos"]
        for col in columns_list:
            if col.startswith("feature_") and is_not_null(row[col]):
                feature_name: str = col[len("feature_") :]
                features[feature_name] = row[col]
            elif col.startswith("attr_") and is_not_null(row[col]):
                attr_name: str = col[len("attr_") :]
                features[attr_name] = row[col]

        if features:
            item_data["features"] = features

        item = LexicalItem(**item_data)
        lexicon.add(item)

    return lexicon
|
|
689
|
+
|
|
690
|
+
def to_jsonl(self, path: str) -> None:
    """Write the lexicon to a JSONLines file, one item per line.

    Parent directories are created as needed.

    Parameters
    ----------
    path : str
        Destination file path.
    """
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)

    # Serialize everything first, then batch the writes.
    serialized = [entry.model_dump_json() + "\n" for entry in self.items.values()]
    with open(target, "w", encoding="utf-8") as handle:
        handle.writelines(serialized)
|
|
710
|
+
|
|
711
|
+
@classmethod
def from_jsonl(cls, path: str, name: str) -> Lexicon:
    """Load a lexicon from a JSONLines file (one item per line).

    Blank lines are skipped.

    Parameters
    ----------
    path : str
        Source file path.
    name : str
        Name for the new lexicon.

    Returns
    -------
    Lexicon
        New lexicon populated from the file.
    """
    lexicon = cls(name=name)

    with open(Path(path), encoding="utf-8") as handle:
        for raw_line in handle:
            payload = raw_line.strip()
            if not payload:
                continue
            lexicon.add(LexicalItem(**json.loads(payload)))

    return lexicon
|