bead 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (231) hide show
  1. bead/__init__.py +11 -0
  2. bead/__main__.py +11 -0
  3. bead/active_learning/__init__.py +15 -0
  4. bead/active_learning/config.py +231 -0
  5. bead/active_learning/loop.py +566 -0
  6. bead/active_learning/models/__init__.py +24 -0
  7. bead/active_learning/models/base.py +852 -0
  8. bead/active_learning/models/binary.py +910 -0
  9. bead/active_learning/models/categorical.py +943 -0
  10. bead/active_learning/models/cloze.py +862 -0
  11. bead/active_learning/models/forced_choice.py +956 -0
  12. bead/active_learning/models/free_text.py +773 -0
  13. bead/active_learning/models/lora.py +365 -0
  14. bead/active_learning/models/magnitude.py +835 -0
  15. bead/active_learning/models/multi_select.py +795 -0
  16. bead/active_learning/models/ordinal_scale.py +811 -0
  17. bead/active_learning/models/peft_adapter.py +155 -0
  18. bead/active_learning/models/random_effects.py +639 -0
  19. bead/active_learning/selection.py +354 -0
  20. bead/active_learning/strategies.py +391 -0
  21. bead/active_learning/trainers/__init__.py +26 -0
  22. bead/active_learning/trainers/base.py +210 -0
  23. bead/active_learning/trainers/data_collator.py +172 -0
  24. bead/active_learning/trainers/dataset_utils.py +261 -0
  25. bead/active_learning/trainers/huggingface.py +304 -0
  26. bead/active_learning/trainers/lightning.py +324 -0
  27. bead/active_learning/trainers/metrics.py +424 -0
  28. bead/active_learning/trainers/mixed_effects.py +551 -0
  29. bead/active_learning/trainers/model_wrapper.py +509 -0
  30. bead/active_learning/trainers/registry.py +104 -0
  31. bead/adapters/__init__.py +11 -0
  32. bead/adapters/huggingface.py +61 -0
  33. bead/behavioral/__init__.py +116 -0
  34. bead/behavioral/analytics.py +646 -0
  35. bead/behavioral/extraction.py +343 -0
  36. bead/behavioral/merging.py +343 -0
  37. bead/cli/__init__.py +11 -0
  38. bead/cli/active_learning.py +513 -0
  39. bead/cli/active_learning_commands.py +779 -0
  40. bead/cli/completion.py +359 -0
  41. bead/cli/config.py +624 -0
  42. bead/cli/constraint_builders.py +286 -0
  43. bead/cli/deployment.py +859 -0
  44. bead/cli/deployment_trials.py +493 -0
  45. bead/cli/deployment_ui.py +332 -0
  46. bead/cli/display.py +378 -0
  47. bead/cli/items.py +960 -0
  48. bead/cli/items_factories.py +776 -0
  49. bead/cli/list_constraints.py +714 -0
  50. bead/cli/lists.py +490 -0
  51. bead/cli/main.py +430 -0
  52. bead/cli/models.py +877 -0
  53. bead/cli/resource_loaders.py +621 -0
  54. bead/cli/resources.py +1036 -0
  55. bead/cli/shell.py +356 -0
  56. bead/cli/simulate.py +840 -0
  57. bead/cli/templates.py +1158 -0
  58. bead/cli/training.py +1080 -0
  59. bead/cli/utils.py +614 -0
  60. bead/cli/workflow.py +1273 -0
  61. bead/config/__init__.py +68 -0
  62. bead/config/active_learning.py +1009 -0
  63. bead/config/config.py +192 -0
  64. bead/config/defaults.py +118 -0
  65. bead/config/deployment.py +217 -0
  66. bead/config/env.py +147 -0
  67. bead/config/item.py +45 -0
  68. bead/config/list.py +193 -0
  69. bead/config/loader.py +149 -0
  70. bead/config/logging.py +42 -0
  71. bead/config/model.py +49 -0
  72. bead/config/paths.py +46 -0
  73. bead/config/profiles.py +320 -0
  74. bead/config/resources.py +47 -0
  75. bead/config/serialization.py +210 -0
  76. bead/config/simulation.py +206 -0
  77. bead/config/template.py +238 -0
  78. bead/config/validation.py +267 -0
  79. bead/data/__init__.py +65 -0
  80. bead/data/base.py +87 -0
  81. bead/data/identifiers.py +97 -0
  82. bead/data/language_codes.py +61 -0
  83. bead/data/metadata.py +270 -0
  84. bead/data/range.py +123 -0
  85. bead/data/repository.py +358 -0
  86. bead/data/serialization.py +249 -0
  87. bead/data/timestamps.py +89 -0
  88. bead/data/validation.py +349 -0
  89. bead/data_collection/__init__.py +11 -0
  90. bead/data_collection/jatos.py +223 -0
  91. bead/data_collection/merger.py +154 -0
  92. bead/data_collection/prolific.py +198 -0
  93. bead/deployment/__init__.py +5 -0
  94. bead/deployment/distribution.py +402 -0
  95. bead/deployment/jatos/__init__.py +1 -0
  96. bead/deployment/jatos/api.py +200 -0
  97. bead/deployment/jatos/exporter.py +210 -0
  98. bead/deployment/jspsych/__init__.py +9 -0
  99. bead/deployment/jspsych/biome.json +44 -0
  100. bead/deployment/jspsych/config.py +411 -0
  101. bead/deployment/jspsych/generator.py +598 -0
  102. bead/deployment/jspsych/package.json +51 -0
  103. bead/deployment/jspsych/pnpm-lock.yaml +2141 -0
  104. bead/deployment/jspsych/randomizer.py +299 -0
  105. bead/deployment/jspsych/src/lib/list-distributor.test.ts +327 -0
  106. bead/deployment/jspsych/src/lib/list-distributor.ts +1282 -0
  107. bead/deployment/jspsych/src/lib/randomizer.test.ts +232 -0
  108. bead/deployment/jspsych/src/lib/randomizer.ts +367 -0
  109. bead/deployment/jspsych/src/plugins/cloze-dropdown.ts +252 -0
  110. bead/deployment/jspsych/src/plugins/forced-choice.ts +265 -0
  111. bead/deployment/jspsych/src/plugins/plugins.test.ts +141 -0
  112. bead/deployment/jspsych/src/plugins/rating.ts +248 -0
  113. bead/deployment/jspsych/src/slopit/index.ts +9 -0
  114. bead/deployment/jspsych/src/types/jatos.d.ts +256 -0
  115. bead/deployment/jspsych/src/types/jspsych.d.ts +228 -0
  116. bead/deployment/jspsych/templates/experiment.css +1 -0
  117. bead/deployment/jspsych/templates/experiment.js.template +289 -0
  118. bead/deployment/jspsych/templates/index.html +51 -0
  119. bead/deployment/jspsych/templates/randomizer.js +241 -0
  120. bead/deployment/jspsych/templates/randomizer.js.template +313 -0
  121. bead/deployment/jspsych/trials.py +723 -0
  122. bead/deployment/jspsych/tsconfig.json +23 -0
  123. bead/deployment/jspsych/tsup.config.ts +30 -0
  124. bead/deployment/jspsych/ui/__init__.py +1 -0
  125. bead/deployment/jspsych/ui/components.py +383 -0
  126. bead/deployment/jspsych/ui/styles.py +411 -0
  127. bead/dsl/__init__.py +80 -0
  128. bead/dsl/ast.py +168 -0
  129. bead/dsl/context.py +178 -0
  130. bead/dsl/errors.py +71 -0
  131. bead/dsl/evaluator.py +570 -0
  132. bead/dsl/grammar.lark +81 -0
  133. bead/dsl/parser.py +231 -0
  134. bead/dsl/stdlib.py +929 -0
  135. bead/evaluation/__init__.py +13 -0
  136. bead/evaluation/convergence.py +485 -0
  137. bead/evaluation/interannotator.py +398 -0
  138. bead/items/__init__.py +40 -0
  139. bead/items/adapters/__init__.py +70 -0
  140. bead/items/adapters/anthropic.py +224 -0
  141. bead/items/adapters/api_utils.py +167 -0
  142. bead/items/adapters/base.py +216 -0
  143. bead/items/adapters/google.py +259 -0
  144. bead/items/adapters/huggingface.py +1074 -0
  145. bead/items/adapters/openai.py +323 -0
  146. bead/items/adapters/registry.py +202 -0
  147. bead/items/adapters/sentence_transformers.py +224 -0
  148. bead/items/adapters/togetherai.py +309 -0
  149. bead/items/binary.py +515 -0
  150. bead/items/cache.py +558 -0
  151. bead/items/categorical.py +593 -0
  152. bead/items/cloze.py +757 -0
  153. bead/items/constructor.py +784 -0
  154. bead/items/forced_choice.py +413 -0
  155. bead/items/free_text.py +681 -0
  156. bead/items/generation.py +432 -0
  157. bead/items/item.py +396 -0
  158. bead/items/item_template.py +787 -0
  159. bead/items/magnitude.py +573 -0
  160. bead/items/multi_select.py +621 -0
  161. bead/items/ordinal_scale.py +569 -0
  162. bead/items/scoring.py +448 -0
  163. bead/items/validation.py +723 -0
  164. bead/lists/__init__.py +30 -0
  165. bead/lists/balancer.py +263 -0
  166. bead/lists/constraints.py +1067 -0
  167. bead/lists/experiment_list.py +286 -0
  168. bead/lists/list_collection.py +378 -0
  169. bead/lists/partitioner.py +1141 -0
  170. bead/lists/stratification.py +254 -0
  171. bead/participants/__init__.py +73 -0
  172. bead/participants/collection.py +699 -0
  173. bead/participants/merging.py +312 -0
  174. bead/participants/metadata_spec.py +491 -0
  175. bead/participants/models.py +276 -0
  176. bead/resources/__init__.py +29 -0
  177. bead/resources/adapters/__init__.py +19 -0
  178. bead/resources/adapters/base.py +104 -0
  179. bead/resources/adapters/cache.py +128 -0
  180. bead/resources/adapters/glazing.py +508 -0
  181. bead/resources/adapters/registry.py +117 -0
  182. bead/resources/adapters/unimorph.py +796 -0
  183. bead/resources/classification.py +856 -0
  184. bead/resources/constraint_builders.py +329 -0
  185. bead/resources/constraints.py +165 -0
  186. bead/resources/lexical_item.py +223 -0
  187. bead/resources/lexicon.py +744 -0
  188. bead/resources/loaders.py +209 -0
  189. bead/resources/template.py +441 -0
  190. bead/resources/template_collection.py +707 -0
  191. bead/resources/template_generation.py +349 -0
  192. bead/simulation/__init__.py +29 -0
  193. bead/simulation/annotators/__init__.py +15 -0
  194. bead/simulation/annotators/base.py +175 -0
  195. bead/simulation/annotators/distance_based.py +135 -0
  196. bead/simulation/annotators/lm_based.py +114 -0
  197. bead/simulation/annotators/oracle.py +182 -0
  198. bead/simulation/annotators/random.py +181 -0
  199. bead/simulation/dsl_extension/__init__.py +3 -0
  200. bead/simulation/noise_models/__init__.py +13 -0
  201. bead/simulation/noise_models/base.py +42 -0
  202. bead/simulation/noise_models/random_noise.py +82 -0
  203. bead/simulation/noise_models/systematic.py +132 -0
  204. bead/simulation/noise_models/temperature.py +86 -0
  205. bead/simulation/runner.py +144 -0
  206. bead/simulation/strategies/__init__.py +23 -0
  207. bead/simulation/strategies/base.py +123 -0
  208. bead/simulation/strategies/binary.py +103 -0
  209. bead/simulation/strategies/categorical.py +123 -0
  210. bead/simulation/strategies/cloze.py +224 -0
  211. bead/simulation/strategies/forced_choice.py +127 -0
  212. bead/simulation/strategies/free_text.py +105 -0
  213. bead/simulation/strategies/magnitude.py +116 -0
  214. bead/simulation/strategies/multi_select.py +129 -0
  215. bead/simulation/strategies/ordinal_scale.py +131 -0
  216. bead/templates/__init__.py +27 -0
  217. bead/templates/adapters/__init__.py +17 -0
  218. bead/templates/adapters/base.py +128 -0
  219. bead/templates/adapters/cache.py +178 -0
  220. bead/templates/adapters/huggingface.py +312 -0
  221. bead/templates/combinatorics.py +103 -0
  222. bead/templates/filler.py +605 -0
  223. bead/templates/renderers.py +177 -0
  224. bead/templates/resolver.py +178 -0
  225. bead/templates/strategies.py +1806 -0
  226. bead/templates/streaming.py +195 -0
  227. bead-0.1.0.dist-info/METADATA +212 -0
  228. bead-0.1.0.dist-info/RECORD +231 -0
  229. bead-0.1.0.dist-info/WHEEL +4 -0
  230. bead-0.1.0.dist-info/entry_points.txt +2 -0
  231. bead-0.1.0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,432 @@
1
+ """Utilities for generating cross-product items from templates and lexicons.
2
+
3
+ This module provides language-agnostic utilities for generating items
4
+ by combining templates with lexical resources in various patterns.
5
+
6
+ RELATIONSHIP TO ItemConstructor:
7
+ - This module (generation.py): Generates cross-product combinations of
8
+ templates × lexical items BEFORE template filling. Creates lightweight
9
+ Item objects with just template_id, metadata, and unfilled information.
10
+ Use when: You want to systematically explore all combinations of a lexical
11
+ property (e.g., every verb in every frame).
12
+
13
+ - ItemConstructor (constructor.py): Builds Items FROM ItemTemplates +
14
+ FilledTemplates with constraint evaluation and model scoring. Takes filled
15
+ templates and combines them into experimental items with multi-slot
16
+ constraints checked.
17
+ Use when: You have filled templates and want to construct experimental
18
+ items with model-based constraint checking.
19
+
20
+ These modules are COMPLEMENTARY, not redundant. Typical pipeline:
21
+ 1. generation.py: Generate cross-product → unfilled item specifications
22
+ 2. Template filling: Fill template slots → FilledTemplates
23
+ 3. constructor.py: Construct items → Items with constraints checked
24
+ """
25
+
26
+ from __future__ import annotations
27
+
28
+ from collections import defaultdict
29
+ from collections.abc import Callable, Iterator
30
+ from pathlib import Path
31
+
32
+ from bead.items.item import Item, MetadataValue
33
+ from bead.resources.lexical_item import LexicalItem
34
+ from bead.resources.lexicon import Lexicon
35
+ from bead.resources.template import Template
36
+
37
+
38
def create_cross_product_items(
    templates: list[Template],
    lexicons: dict[str, Lexicon],
    *,
    cross_product_slot: str = "verb",
    metadata_extractor: (
        Callable[[Template, LexicalItem], dict[str, MetadataValue]] | None
    ) = None,
    filter_fn: Callable[[Template, LexicalItem], bool] | None = None,
) -> Iterator[Item]:
    """Lazily yield one item per (template, lexical item) pair.

    Every template is paired with every entry of the lexicon registered
    under ``cross_product_slot``. Templates form the outer loop and lexical
    items the inner loop, so all items for the first template are produced
    before any item for the second. Nothing is materialized up front, which
    keeps memory flat for large cross-products.

    Parameters
    ----------
    templates : list[Template]
        Templates to cross with the lexicon.
    lexicons : dict[str, Lexicon]
        Lexicons keyed by slot name. Only the entry for
        ``cross_product_slot`` is read.
    cross_product_slot : str
        Name of the slot whose lexicon is crossed with the templates
        (default: "verb"). It also prefixes the ``<slot>_lemma`` and
        ``<slot>_form`` keys of each item's rendered elements.
    metadata_extractor : Callable[[Template, LexicalItem], dict] | None
        Builds ``item_metadata`` from ``(template, lexical_item)``. When
        omitted, ``_default_metadata_extractor`` is used.
    filter_fn : Callable[[Template, LexicalItem], bool] | None
        Predicate over ``(template, lexical_item)``; pairs for which it
        returns a falsy value are skipped.

    Yields
    ------
    Item
        One item per retained pair. ``rendered_elements`` holds the template
        name and string plus the lemma and form of the lexical item (the
        lemma stands in when the form is empty).

    Raises
    ------
    ValueError
        If ``lexicons`` has no entry for ``cross_product_slot``. Because
        this is a generator, the error surfaces on first iteration.

    Examples
    --------
    Basic verb × template cross-product:
    >>> templates = [
    ...     Template(
    ...         name="transitive",
    ...         template_string="{subject} {verb} {object}.",
    ...         slots={}
    ...     )
    ... ]
    >>> verb_lex = Lexicon(name="verbs")
    >>> verb_lex.add(LexicalItem(lemma="walk"))
    >>> verb_lex.add(LexicalItem(lemma="eat"))
    >>> lexicons = {"verb": verb_lex}
    >>> items = list(create_cross_product_items(templates, lexicons))
    >>> len(items)
    2

    Keeping only some combinations:
    >>> def transitive_only(template, item):
    ...     return "transitive" in template.name
    >>> items = list(create_cross_product_items(
    ...     templates,
    ...     lexicons,
    ...     filter_fn=transitive_only
    ... ))  # doctest: +SKIP
    """
    if cross_product_slot not in lexicons:
        message = (
            f"Lexicon for slot '{cross_product_slot}' not found. "
            f"Available: {list(lexicons.keys())}"
        )
        raise ValueError(message)

    slot_lexicon = lexicons[cross_product_slot]

    # The slot-specific key names do not depend on the pair being processed.
    lemma_key = f"{cross_product_slot}_lemma"
    form_key = f"{cross_product_slot}_form"

    for template in templates:
        for entry in slot_lexicon:
            keep = not filter_fn or filter_fn(template, entry)
            if not keep:
                continue

            # Metadata is computed before the rendered elements are read.
            item_metadata = (
                metadata_extractor(template, entry)
                if metadata_extractor
                else _default_metadata_extractor(template, entry)
            )

            yield Item(
                item_template_id=template.id,
                rendered_elements={
                    "template_name": template.name,
                    "template_string": template.template_string,
                    lemma_key: entry.lemma,
                    form_key: entry.form or entry.lemma,
                },
                item_metadata=item_metadata,
            )
158
+
159
+
160
def _default_metadata_extractor(
    template: Template, lexical_item: LexicalItem
) -> dict[str, MetadataValue]:
    """Build the fallback metadata for one template × lexical item pair.

    Parameters
    ----------
    template
        Template taking part in the combination.
    lexical_item
        Lexical item taking part in the combination.

    Returns
    -------
    dict[str, MetadataValue]
        Identifying fields for the template and the lexical item, a fixed
        ``combination_type`` of ``"cross_product"``, and, when the lexical
        item has features, every feature twice: once under
        ``lexical_feature_<key>`` and once under ``lexical_attr_<key>``.
    """
    metadata: dict[str, MetadataValue] = dict(
        template_id=str(template.id),
        template_name=template.name,
        template_structure=template.template_string,
        lexical_item_id=str(lexical_item.id),
        lexical_item_lemma=lexical_item.lemma,
        combination_type="cross_product",
    )

    features = lexical_item.features
    if features:
        # Both prefixes are filled from the same features mapping; the
        # prefix is the outer loop so all "feature" keys precede all "attr"
        # keys in the resulting dict.
        for prefix in ("lexical_feature_", "lexical_attr_"):
            for key, value in features.items():
                metadata[f"{prefix}{key}"] = value

    return metadata
197
+
198
+
199
def create_filtered_cross_product_items(
    templates: list[Template],
    lexicons: dict[str, Lexicon],
    *,
    cross_product_slot: str = "verb",
    template_filter: Callable[[Template], bool] | None = None,
    item_filter: Callable[[LexicalItem], bool] | None = None,
    combination_filter: Callable[[Template, LexicalItem], bool] | None = None,
    metadata_extractor: (
        Callable[[Template, LexicalItem], dict[str, MetadataValue]] | None
    ) = None,
) -> Iterator[Item]:
    """Yield cross-product items after filtering at three levels.

    Templates and lexical items are each narrowed by their own predicate
    first; the surviving templates and a temporary lexicon holding the
    surviving lexical items are then handed to
    ``create_cross_product_items``, which applies ``combination_filter`` to
    every pair.

    Parameters
    ----------
    templates : list[Template]
        Candidate templates.
    lexicons : dict[str, Lexicon]
        Lexicons keyed by slot name.
    cross_product_slot : str
        Slot whose lexicon is crossed with the templates.
    template_filter : Callable[[Template], bool] | None
        Keeps a template when it returns a truthy value. Run on every
        template before any lexical item is examined.
    item_filter : Callable[[LexicalItem], bool] | None
        Keeps a lexical item when it returns a truthy value. Run on every
        lexical item before any pair is formed.
    combination_filter : Callable[[Template, LexicalItem], bool] | None
        Per-pair predicate, forwarded as ``filter_fn``.
    metadata_extractor : Callable[[Template, LexicalItem], dict] | None
        Metadata builder, forwarded unchanged.

    Yields
    ------
    Item
        Items for the pairs that pass all supplied filters.

    Raises
    ------
    ValueError
        If ``lexicons`` has no entry for ``cross_product_slot``. Raised on
        first iteration, before any filter is called.

    Examples
    --------
    >>> def template_filter(t):
    ...     return "transitive" in t.name
    >>> def item_filter(i):
    ...     return i.pos == "VERB"
    >>> items = list(create_filtered_cross_product_items(
    ...     templates,
    ...     lexicons,
    ...     template_filter=template_filter,
    ...     item_filter=item_filter,
    ... ))  # doctest: +SKIP
    """
    if cross_product_slot not in lexicons:
        message = (
            f"Lexicon for slot '{cross_product_slot}' not found. "
            f"Available: {list(lexicons.keys())}"
        )
        raise ValueError(message)

    slot_lexicon = lexicons[cross_product_slot]

    # Templates are narrowed before the lexicon is touched.
    kept_templates = (
        list(filter(template_filter, templates)) if template_filter else templates
    )

    candidates = list(slot_lexicon)
    kept_entries = list(filter(item_filter, candidates)) if item_filter else candidates

    # The pre-filtered entries are wrapped in a throwaway lexicon so the
    # basic generator can be reused for the pairing and per-pair filtering.
    narrowed_lexicons = {cross_product_slot: _create_temp_lexicon(kept_entries)}

    yield from create_cross_product_items(
        kept_templates,
        narrowed_lexicons,
        cross_product_slot=cross_product_slot,
        metadata_extractor=metadata_extractor,
        filter_fn=combination_filter,
    )
284
+
285
+
286
def _create_temp_lexicon(items: list[LexicalItem]) -> Lexicon:
    """Wrap a list of lexical items in a throwaway lexicon named "temp".

    Parameters
    ----------
    items : list[LexicalItem]
        Lexical items to add, in order.

    Returns
    -------
    Lexicon
        New lexicon containing the given items.
    """
    scratch = Lexicon(name="temp")
    add_entry = scratch.add
    for entry in items:
        add_entry(entry)
    return scratch
303
+
304
+
305
def create_stratified_cross_product_items(
    templates: list[Template],
    lexicons: dict[str, Lexicon],
    *,
    cross_product_slot: str = "verb",
    stratify_by: Callable[[LexicalItem], str],
    items_per_stratum: int,
    metadata_extractor: (
        Callable[[Template, LexicalItem], dict[str, MetadataValue]] | None
    ) = None,
) -> Iterator[Item]:
    """Generate cross-product items from a per-stratum subset of the lexicon.

    Lexical items are grouped by the key returned from ``stratify_by``. From
    each group the first ``items_per_stratum`` items are kept (all of them
    if the group is smaller), and the kept items are crossed with every
    template. Selection is deterministic: it follows the lexicon's iteration
    order and involves no randomness.

    Parameters
    ----------
    templates : list[Template]
        Templates to use for generation.
    lexicons : dict[str, Lexicon]
        Lexicons keyed by slot name.
    cross_product_slot : str
        Slot name to vary across items.
    stratify_by : Callable[[LexicalItem], str]
        Function returning the stratum key of a lexical item.
    items_per_stratum : int
        Maximum number of items to keep from each stratum. Must be
        non-negative; ``0`` yields no items.
    metadata_extractor : Callable[[Template, LexicalItem], dict] | None
        Metadata extraction function, forwarded unchanged.

    Yields
    ------
    Item
        Cross-product items for the selected lexical items.

    Raises
    ------
    ValueError
        If ``lexicons`` has no entry for ``cross_product_slot``, or if
        ``items_per_stratum`` is negative. Because this is a generator, the
        error surfaces on first iteration.

    Examples
    --------
    Select verbs stratified by frequency:
    >>> def stratify_by_frequency(item):
    ...     freq = item.features.get("frequency", 0)
    ...     if freq > 1000:
    ...         return "high"
    ...     elif freq > 100:
    ...         return "medium"
    ...     else:
    ...         return "low"
    >>> items = list(create_stratified_cross_product_items(
    ...     templates,
    ...     lexicons,
    ...     stratify_by=stratify_by_frequency,
    ...     items_per_stratum=10
    ... ))  # doctest: +SKIP
    """
    if cross_product_slot not in lexicons:
        raise ValueError(
            f"Lexicon for slot '{cross_product_slot}' not found. "
            f"Available: {list(lexicons.keys())}"
        )

    # A negative count would make the slice below count from the end of each
    # stratum ("all but the last n") instead of limiting its size.
    if items_per_stratum < 0:
        raise ValueError(
            f"items_per_stratum must be non-negative, got {items_per_stratum}"
        )

    # group items by stratum, preserving lexicon order within each group
    strata: dict[str, list[LexicalItem]] = defaultdict(list)
    for lexical_item in lexicons[cross_product_slot]:
        strata[stratify_by(lexical_item)].append(lexical_item)

    # keep the leading items of each stratum; slicing already copes with
    # strata shorter than the requested size
    sampled_items: list[LexicalItem] = []
    for stratum_items in strata.values():
        sampled_items.extend(stratum_items[:items_per_stratum])

    # generate cross-product with the selected items
    yield from create_cross_product_items(
        templates,
        {cross_product_slot: _create_temp_lexicon(sampled_items)},
        cross_product_slot=cross_product_slot,
        metadata_extractor=metadata_extractor,
    )
391
+
392
+
393
def items_to_jsonl(
    items: Iterator[Item], output_path: str, progress_interval: int = 1000
) -> int:
    """Stream items to a JSONL file, one JSON document per line.

    Items are consumed one at a time, so arbitrarily large iterators can be
    written without holding them in memory. Missing parent directories of
    the output file are created, and an existing file is overwritten.

    Parameters
    ----------
    items : Iterator[Item]
        Items to write. Each must provide ``model_dump_json()``.
    output_path : str
        Path to output JSONL file.
    progress_interval : int
        Print a progress line every N items (default: 1000). A value of
        ``0`` turns progress output off.

    Returns
    -------
    int
        Number of items written.

    Examples
    --------
    >>> items = create_cross_product_items(templates, lexicons)  # doctest: +SKIP
    >>> n = items_to_jsonl(items, "output.jsonl")  # doctest: +SKIP
    >>> print(f"Wrote {n} items")  # doctest: +SKIP
    """
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    # Stays 0 when the iterator is empty and the loop body never runs.
    count = 0
    with output_file.open("w", encoding="utf-8") as f:
        for count, item in enumerate(items, start=1):
            f.write(item.model_dump_json() + "\n")

            # The truthiness guard keeps a zero interval from reaching the
            # modulo, which would raise ZeroDivisionError.
            if progress_interval and count % progress_interval == 0:
                print(f" Progress: {count:,} items written...")

    return count