openai_sdk_helpers-0.4.2-py3-none-any.whl → openai_sdk_helpers-0.5.0-py3-none-any.whl

This diff shows the changes between publicly available package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (68)
  1. openai_sdk_helpers/__init__.py +45 -41
  2. openai_sdk_helpers/agent/__init__.py +4 -6
  3. openai_sdk_helpers/agent/base.py +110 -191
  4. openai_sdk_helpers/agent/{config.py → configuration.py} +24 -32
  5. openai_sdk_helpers/agent/{coordination.py → coordinator.py} +22 -23
  6. openai_sdk_helpers/agent/runner.py +3 -45
  7. openai_sdk_helpers/agent/search/base.py +54 -76
  8. openai_sdk_helpers/agent/search/vector.py +92 -108
  9. openai_sdk_helpers/agent/search/web.py +104 -82
  10. openai_sdk_helpers/agent/summarizer.py +22 -28
  11. openai_sdk_helpers/agent/translator.py +22 -24
  12. openai_sdk_helpers/agent/{validation.py → validator.py} +19 -23
  13. openai_sdk_helpers/cli.py +8 -22
  14. openai_sdk_helpers/environment.py +8 -13
  15. openai_sdk_helpers/errors.py +9 -0
  16. openai_sdk_helpers/extract/__init__.py +23 -0
  17. openai_sdk_helpers/extract/extractor.py +157 -0
  18. openai_sdk_helpers/extract/generator.py +476 -0
  19. openai_sdk_helpers/prompt/extractor_config_agent_instructions.jinja +6 -0
  20. openai_sdk_helpers/prompt/extractor_config_generator.jinja +37 -0
  21. openai_sdk_helpers/prompt/extractor_config_generator_instructions.jinja +9 -0
  22. openai_sdk_helpers/prompt/extractor_prompt_optimizer_agent_instructions.jinja +4 -0
  23. openai_sdk_helpers/prompt/extractor_prompt_optimizer_request.jinja +11 -0
  24. openai_sdk_helpers/prompt/vector_planner.jinja +7 -0
  25. openai_sdk_helpers/prompt/vector_search.jinja +6 -0
  26. openai_sdk_helpers/prompt/vector_writer.jinja +7 -0
  27. openai_sdk_helpers/response/__init__.py +3 -7
  28. openai_sdk_helpers/response/base.py +89 -98
  29. openai_sdk_helpers/response/{config.py → configuration.py} +45 -20
  30. openai_sdk_helpers/response/files.py +2 -0
  31. openai_sdk_helpers/response/planner.py +1 -1
  32. openai_sdk_helpers/response/prompter.py +1 -1
  33. openai_sdk_helpers/response/runner.py +1 -48
  34. openai_sdk_helpers/response/tool_call.py +0 -141
  35. openai_sdk_helpers/response/vector_store.py +8 -5
  36. openai_sdk_helpers/streamlit_app/__init__.py +1 -1
  37. openai_sdk_helpers/streamlit_app/app.py +17 -18
  38. openai_sdk_helpers/streamlit_app/{config.py → configuration.py} +13 -13
  39. openai_sdk_helpers/structure/__init__.py +16 -0
  40. openai_sdk_helpers/structure/base.py +239 -278
  41. openai_sdk_helpers/structure/extraction.py +1228 -0
  42. openai_sdk_helpers/structure/plan/plan.py +0 -20
  43. openai_sdk_helpers/structure/plan/task.py +0 -33
  44. openai_sdk_helpers/structure/prompt.py +16 -0
  45. openai_sdk_helpers/structure/responses.py +2 -2
  46. openai_sdk_helpers/structure/web_search.py +0 -10
  47. openai_sdk_helpers/tools.py +346 -99
  48. openai_sdk_helpers/types.py +3 -3
  49. openai_sdk_helpers/utils/__init__.py +9 -6
  50. openai_sdk_helpers/utils/json/base_model.py +316 -33
  51. openai_sdk_helpers/utils/json/data_class.py +1 -1
  52. openai_sdk_helpers/utils/langextract.py +194 -0
  53. openai_sdk_helpers/utils/registry.py +19 -15
  54. openai_sdk_helpers/vector_storage/storage.py +1 -1
  55. {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/METADATA +25 -11
  56. openai_sdk_helpers-0.5.0.dist-info/RECORD +95 -0
  57. openai_sdk_helpers/agent/prompt_utils.py +0 -15
  58. openai_sdk_helpers/context_manager.py +0 -241
  59. openai_sdk_helpers/deprecation.py +0 -167
  60. openai_sdk_helpers/retry.py +0 -175
  61. openai_sdk_helpers/streamlit_app/streamlit_web_search.py +0 -75
  62. openai_sdk_helpers/utils/deprecation.py +0 -167
  63. openai_sdk_helpers-0.4.2.dist-info/RECORD +0 -88
  64. /openai_sdk_helpers/{logging_config.py → logging.py} +0 -0
  65. /openai_sdk_helpers/{config.py → settings.py} +0 -0
  66. {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/WHEEL +0 -0
  67. {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/entry_points.txt +0 -0
  68. {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/licenses/LICENSE +0 -0
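
The largest addition in this release is the new openai_sdk_helpers/structure/extraction.py module (+1228 lines), whose diff is reproduced in full below; a brief, illustrative usage sketch follows the diff.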
@@ -0,0 +1,1228 @@
+ """Structured extraction result models."""
+
+ from __future__ import annotations
+
+ from typing import Any, Sequence
+ import uuid
+ from enum import Enum, IntEnum
+ from langextract.core import format_handler as lx_format_handler
+ from langextract.core.data import (
+     AlignmentStatus as LXAlignmentStatus,
+     AnnotatedDocument as LXAnnotatedDocument,
+     CharInterval as LXCharInterval,
+     Document as LXDocument,
+     ExampleData as LXExampleData,
+     Extraction as LXExtraction,
+ )
+
+ from langextract.core import tokenizer as LXtokenizer
+ from .base import StructureBase, spec_field
+
+
+ class CharInterval(StructureBase):
+     """Class for representing a character interval.
+
+     Attributes
+     ----------
+     start_pos: The starting position of the interval (inclusive).
+     end_pos: The ending position of the interval (exclusive).
+
+     Methods
+     -------
+     to_dataclass()
+         Convert to a LangExtract ``CharInterval`` dataclass.
+     from_dataclass(data)
+         Create a CharInterval from a LangExtract dataclass.
+     """
+
+     start_pos: int | None = spec_field(
+         "start_pos",
+         description="The starting position of the interval (inclusive).",
+     )
+     end_pos: int | None = spec_field(
+         "end_pos",
+         description="The ending position of the interval (exclusive).",
+     )
+
+     def to_dataclass(self) -> LXCharInterval:
+         """Convert to LangExtract CharInterval dataclass.
+
+         Returns
+         -------
+         LXCharInterval
+             LangExtract character interval dataclass instance.
+         """
+         return LXCharInterval(
+             start_pos=self.start_pos,
+             end_pos=self.end_pos,
+         )
+
+     @classmethod
+     def from_dataclass(cls, data: LXCharInterval) -> "CharInterval":
+         """Create a CharInterval from a LangExtract dataclass.
+
+         Parameters
+         ----------
+         data : LXCharInterval
+             LangExtract CharInterval dataclass instance.
+
+         Returns
+         -------
+         CharInterval
+             Structured character interval model.
+         """
+         return cls(
+             start_pos=data.start_pos,
+             end_pos=data.end_pos,
+         )
+
+
+ class AlignmentStatus(Enum):
+     """Represent alignment status values for extracted items.
+
+     Methods
+     -------
+     to_dataclass()
+         Convert to a LangExtract ``AlignmentStatus`` enum member.
+     from_dataclass(data)
+         Create an AlignmentStatus from a LangExtract enum member.
+     """
+
+     MATCH_EXACT = "match_exact"
+     MATCH_GREATER = "match_greater"
+     MATCH_LESSER = "match_lesser"
+     MATCH_FUZZY = "match_fuzzy"
+
+     def to_dataclass(self) -> LXAlignmentStatus:
+         """Convert to the LangExtract AlignmentStatus enum.
+
+         Returns
+         -------
+         LXAlignmentStatus
+             LangExtract alignment status enum member.
+         """
+         return LXAlignmentStatus(self.value)
+
+     @classmethod
+     def from_dataclass(cls, data: LXAlignmentStatus) -> "AlignmentStatus":
+         """Create an AlignmentStatus from a LangExtract enum member.
+
+         Parameters
+         ----------
+         data : LXAlignmentStatus
+             LangExtract alignment status enum member.
+
+         Returns
+         -------
+         AlignmentStatus
+             Structured alignment status value.
+         """
+         return cls(data.value)
+
+
+ class TokenCharInterval(StructureBase):
+     """Represents an interval over characters in tokenized text.
+
+     The interval is defined by a start position (inclusive) and an end position
+     (exclusive).
+
+     Attributes
+     ----------
+     start_pos: The starting position of the interval (inclusive).
+     end_pos: The ending position of the interval (exclusive).
+
+     Methods
+     -------
+     to_dataclass()
+         Convert to a LangExtract ``CharInterval`` dataclass.
+     from_dataclass(data)
+         Create a TokenCharInterval from a LangExtract dataclass.
+     """
+
+     start_pos: int = spec_field(
+         "start_pos",
+         description="The starting position of the interval (inclusive).",
+         default=0,
+     )
+     end_pos: int = spec_field(
+         "end_pos",
+         description="The ending position of the interval (exclusive).",
+         default=0,
+     )
+
+     def to_dataclass(self) -> LXtokenizer.CharInterval:
+         """Convert to LangExtract CharInterval dataclass.
+
+         Returns
+         -------
+         LXtokenizer.CharInterval
+             LangExtract character interval dataclass instance.
+         """
+         return LXtokenizer.CharInterval(
+             start_pos=self.start_pos,
+             end_pos=self.end_pos,
+         )
+
+     @classmethod
+     def from_dataclass(cls, data: LXtokenizer.CharInterval) -> "TokenCharInterval":
+         """Create a TokenCharInterval from a LangExtract dataclass.
+
+         Parameters
+         ----------
+         data : LXtokenizer.CharInterval
+             LangExtract CharInterval dataclass instance.
+
+         Returns
+         -------
+         TokenCharInterval
+             Structured token character interval model.
+         """
+         return cls(
+             start_pos=data.start_pos,
+             end_pos=data.end_pos,
+         )
+
+
+ class TokenInterval(StructureBase):
+     """Represents an interval over tokens in tokenized text.
+
+     The interval is defined by a start index (inclusive) and an end index
+     (exclusive).
+
+     Attributes
+     ----------
+     start_index: The index of the first token in the interval.
+     end_index: The index one past the last token in the interval.
+
+     Methods
+     -------
+     to_dataclass()
+         Convert to a LangExtract ``TokenInterval`` dataclass.
+     from_dataclass(data)
+         Create a TokenInterval from a LangExtract dataclass.
+     """
+
+     start_index: int = spec_field(
+         "start_index",
+         description="The index of the first token in the interval.",
+         default=0,
+     )
+     end_index: int = spec_field(
+         "end_index",
+         description="The index one past the last token in the interval.",
+         default=0,
+     )
+
+     def to_dataclass(self) -> LXtokenizer.TokenInterval:
+         """Convert to LangExtract TokenInterval dataclass.
+
+         Returns
+         -------
+         LXtokenizer.TokenInterval
+             LangExtract token interval dataclass instance.
+         """
+         return LXtokenizer.TokenInterval(
+             start_index=self.start_index,
+             end_index=self.end_index,
+         )
+
+     @classmethod
+     def from_dataclass(cls, data: LXtokenizer.TokenInterval) -> "TokenInterval":
+         """Create a TokenInterval from a LangExtract dataclass.
+
+         Parameters
+         ----------
+         data : LXtokenizer.TokenInterval
+             LangExtract TokenInterval dataclass instance.
+
+         Returns
+         -------
+         TokenInterval
+             Structured token interval model.
+         """
+         return cls(
+             start_index=data.start_index,
+             end_index=data.end_index,
+         )
+
+
+ class TokenType(IntEnum):
+     """Enumeration of token types produced during tokenization.
+
+     Attributes
+     ----------
+     WORD: Represents an alphabetical word token.
+     NUMBER: Represents a numeric token.
+     PUNCTUATION: Represents punctuation characters.
+
+     Methods
+     -------
+     to_dataclass()
+         Convert to a LangExtract ``TokenType`` enum member.
+     from_dataclass(data)
+         Create a TokenType from a LangExtract enum member.
+     """
+
+     WORD = 0
+     NUMBER = 1
+     PUNCTUATION = 2
+
+     def to_dataclass(self) -> LXtokenizer.TokenType:
+         """Convert to the LangExtract TokenType enum.
+
+         Returns
+         -------
+         LXtokenizer.TokenType
+             LangExtract token type enum member.
+         """
+         return LXtokenizer.TokenType(self.value)
+
+     @classmethod
+     def from_dataclass(cls, data: LXtokenizer.TokenType) -> "TokenType":
+         """Create a TokenType from a LangExtract enum member.
+
+         Parameters
+         ----------
+         data : LXtokenizer.TokenType
+             LangExtract token type enum member.
+
+         Returns
+         -------
+         TokenType
+             Structured token type value.
+         """
+         return cls(data.value)
+
+
+ class Token(StructureBase):
+     """Represents a token extracted from text.
+
+     Each token is assigned an index and classified into a type (word, number,
+     or punctuation). The token also records the range of characters (its
+     CharInterval) that corresponds to the substring from the original text.
+     Additionally, it tracks whether it follows a newline.
+
+     Attributes
+     ----------
+     index: The position of the token in the sequence of tokens.
+     token_type: The type of the token, as defined by TokenType.
+     char_interval: The character interval within the original text that this
+         token spans.
+     first_token_after_newline: True if the token immediately follows a newline
+         or carriage return.
+
+     Methods
+     -------
+     to_dataclass()
+         Convert to a LangExtract ``Token`` dataclass.
+     from_dataclass(data)
+         Create a Token from a LangExtract dataclass.
+     from_dataclass_list(data)
+         Create structured tokens from LangExtract dataclasses.
+     to_dataclass_list(data)
+         Convert structured tokens to LangExtract dataclasses.
+     """
+
+     index: int = spec_field(
+         "index",
+         description="The position of the token in the sequence of tokens.",
+     )
+     token_type: TokenType = spec_field(
+         "token_type",
+         description="The type of the token, as defined by TokenType.",
+     )
+     char_interval: TokenCharInterval | None = spec_field(
+         "char_interval",
+         description="The character interval within the original text that this token spans.",
+         allow_null=True,
+     )
+     first_token_after_newline: bool = spec_field(
+         "first_token_after_newline",
+         description="True if the token immediately follows a newline or carriage return.",
+         default=False,
+     )
+
+     def to_dataclass(self) -> LXtokenizer.Token:
+         """Convert to LangExtract Token dataclass.
+
+         Returns
+         -------
+         LXtokenizer.Token
+             LangExtract token dataclass instance.
+         """
+         token = LXtokenizer.Token(
+             index=self.index,
+             token_type=LXtokenizer.TokenType(self.token_type),
+             first_token_after_newline=self.first_token_after_newline,
+         )
+         if self.char_interval is not None:
+             token.char_interval = self.char_interval.to_dataclass()
+         return token
+
+     @classmethod
+     def from_dataclass(cls, data: LXtokenizer.Token) -> "Token":
+         """Create a Token from a LangExtract dataclass.
+
+         Parameters
+         ----------
+         data : LXtokenizer.Token
+             LangExtract token dataclass instance.
+
+         Returns
+         -------
+         Token
+             Structured token model.
+         """
+         char_interval = (
+             TokenCharInterval.from_dataclass(data.char_interval)
+             if data.char_interval is not None
+             else None
+         )
+         return cls(
+             index=data.index,
+             token_type=TokenType.from_dataclass(data.token_type),
+             char_interval=char_interval,
+             first_token_after_newline=data.first_token_after_newline,
+         )
+
+     @staticmethod
+     def from_dataclass_list(data: list[LXtokenizer.Token]) -> list["Token"]:
+         """Create a list of Tokens from a list of LangExtract dataclasses.
+
+         Parameters
+         ----------
+         data : list[LXtokenizer.Token]
+             List of LangExtract token dataclass instances.
+
+         Returns
+         -------
+         list[Token]
+             List of structured token models.
+         """
+         return [Token.from_dataclass(item) for item in data]
+
+     @staticmethod
+     def to_dataclass_list(data: list["Token"]) -> list[LXtokenizer.Token]:
+         """Convert a list of Tokens to LangExtract Token dataclasses.
+
+         Parameters
+         ----------
+         data : list[Token]
+             List of structured token models.
+
+         Returns
+         -------
+         list[LXtokenizer.Token]
+             List of LangExtract token dataclass instances.
+         """
+         return [item.to_dataclass() for item in data]
+
+
+ class TokenizedText(StructureBase):
+     """Holds the result of tokenizing a text string.
+
+     Attributes
+     ----------
+     text: The text that was tokenized. For UnicodeTokenizer, this is
+         NOT normalized to NFC (to preserve indices).
+     tokens: A list of Token objects extracted from the text.
+
+     Methods
+     -------
+     to_dataclass()
+         Convert to a LangExtract ``TokenizedText`` dataclass.
+     from_dataclass(data)
+         Create a TokenizedText from a LangExtract dataclass.
+     """
+
+     text: str = spec_field(
+         "text",
+         description="The text that was tokenized.",
+         allow_null=False,
+     )
+     tokens: list[Token] = spec_field(
+         "tokens",
+         description="A list of Token objects extracted from the text.",
+         allow_null=True,
+         default_factory=list,
+     )
+
+     def to_dataclass(self) -> LXtokenizer.TokenizedText:
+         """Convert to LangExtract TokenizedText dataclass.
+
+         Returns
+         -------
+         LXtokenizer.TokenizedText
+             LangExtract tokenized text dataclass instance.
+         """
+         return LXtokenizer.TokenizedText(
+             text=self.text,
+             tokens=Token.to_dataclass_list(self.tokens),
+         )
+
+     @classmethod
+     def from_dataclass(cls, data: LXtokenizer.TokenizedText) -> "TokenizedText":
+         """Create a TokenizedText from a LangExtract dataclass.
+
+         Parameters
+         ----------
+         data : LXtokenizer.TokenizedText
+             LangExtract TokenizedText dataclass instance.
+
+         Returns
+         -------
+         TokenizedText
+             Structured tokenized text model.
+         """
+         return cls(
+             text=data.text,
+             tokens=Token.from_dataclass_list(data.tokens),
+         )
+
+
+ class AttributeStructure(StructureBase):
+     """Represent an extraction attribute as a key/value pair.
+
+     Attributes
+     ----------
+     key : str
+         Attribute key.
+     value : str | int | float | dict | list | None
+         Attribute value.
+
+     Methods
+     -------
+     to_pair()
+         Convert the attribute to a tuple of ``(key, value)``.
+     from_pair(key, value)
+         Build an attribute from a key/value pair.
+     """
+
+     key: str = spec_field(
+         "key",
+         allow_null=False,
+         description="Attribute key.",
+     )
+     value: lx_format_handler.ExtractionValueType = spec_field(
+         "value",
+         allow_null=True,
+         description="Attribute value.",
+     )
+
+     def to_pair(self) -> tuple[str, lx_format_handler.ExtractionValueType]:
+         """Convert the attribute to a key/value pair.
+
+         Returns
+         -------
+         tuple[str, str | int | float | dict | list | None]
+             Tuple containing the attribute key and value.
+         """
+         return self.key, self.value
+
+     @classmethod
+     def from_pair(
+         cls, key: str, value: lx_format_handler.ExtractionValueType
+     ) -> "AttributeStructure":
+         """Build an attribute from a key/value pair.
+
+         Parameters
+         ----------
+         key : str
+             Attribute key.
+         value : str | int | float | dict | list | None
+             Attribute value to store.
+
+         Returns
+         -------
+         AttributeStructure
+             Structured attribute instance.
+         """
+         return cls(key=key, value=value)
+
+
+ def _attributes_to_dict(
+     attributes: list[AttributeStructure] | None,
+ ) -> dict[str, Any] | None:
+     """Convert structured attributes to a dictionary.
+
+     Parameters
+     ----------
+     attributes : list[AttributeStructure] or None
+         Structured attributes to convert.
+
+     Returns
+     -------
+     dict[str, Any] or None
+         Mapping of attribute keys to values.
+     """
+     if attributes is None:
+         return None
+     return {attribute.key: attribute.value for attribute in attributes}
+
+
+ def _attributes_from_dict(
+     attributes: dict[str, Any] | None,
+ ) -> list[AttributeStructure] | None:
+     """Convert an attribute dictionary into structured attributes.
+
+     Parameters
+     ----------
+     attributes : dict[str, Any] or None
+         Attributes mapping to convert.
+
+     Returns
+     -------
+     list[AttributeStructure] or None
+         Structured attribute list.
+     """
+     if attributes is None:
+         return None
+     return [
+         AttributeStructure.from_pair(key, value) for key, value in attributes.items()
+     ]
+
+
+ class ExtractionStructure(StructureBase):
+     """Represent a single extraction from a document.
+
+     Attributes
+     ----------
+     extraction_class : str
+         Label or class assigned to the extracted item.
+     extraction_text : str
+         Raw text captured for the extracted item.
+     description : str | None
+         Optional description of the extracted item.
+     attributes : list[AttributeStructure] | None
+         Additional attributes attached to the item.
+     char_interval : CharInterval | None
+         Character interval in the source text.
+     alignment_status : AlignmentStatus | None
+         Alignment status of the extracted item.
+     extraction_index : int | None
+         Index of the extraction in the list of extractions.
+     group_index : int | None
+         Index of the group this item belongs to, if applicable.
+     token_interval : TokenInterval | None
+         Token interval of the extracted item.
+
+     Methods
+     -------
+     to_dataclass()
+         Convert to a LangExtract ``Extraction`` dataclass.
+     to_dataclass_list(data)
+         Convert structured extractions to LangExtract dataclasses.
+     from_dataclass(data)
+         Create an extraction from a LangExtract dataclass.
+     from_dataclass_list(data)
+         Create structured extractions from LangExtract dataclasses.
+     """
+
+     extraction_class: str = spec_field(
+         "extraction_class",
+         allow_null=False,
+         description="Label or class for the extracted item.",
+     )
+     extraction_text: str = spec_field(
+         "extraction_text",
+         allow_null=False,
+         description="Raw text captured for the extracted item.",
+     )
+     description: str | None = spec_field(
+         "description",
+         allow_null=True,
+         description="Optional description of the extracted item.",
+     )
+     attributes: list[AttributeStructure] | None = spec_field(
+         "attributes",
+         default=None,
+         description="Additional attributes attached to the item.",
+     )
+     char_interval: CharInterval | None = spec_field(
+         "char_interval",
+         allow_null=True,
+         description="Character interval of the extracted item in the source text.",
+     )
+     alignment_status: AlignmentStatus | None = spec_field(
+         "alignment_status",
+         allow_null=True,
+         description="Alignment status of the extracted item.",
+     )
+     extraction_index: int | None = spec_field(
+         "extraction_index",
+         description="Index of the extraction in the list of extractions.",
+         allow_null=True,
+     )
+     group_index: int | None = spec_field(
+         "group_index",
+         description="Index of the group this item belongs to, if applicable.",
+         allow_null=True,
+     )
+     token_interval: TokenInterval | None = spec_field(
+         "token_interval",
+         description="Token interval of the extracted item.",
+         allow_null=True,
+     )
+
+     def to_dataclass(self) -> LXExtraction:
+         """Convert to LangExtract Extraction dataclass.
+
+         Returns
+         -------
+         LXExtraction
+             LangExtract extraction dataclass instance.
+         """
+         char_interval = (
+             self.char_interval.to_dataclass()
+             if self.char_interval is not None
+             else None
+         )
+         alignment_status = (
+             self.alignment_status.to_dataclass()
+             if self.alignment_status is not None
+             else None
+         )
+         token_interval = (
+             self.token_interval.to_dataclass()
+             if self.token_interval is not None
+             else None
+         )
+         return LXExtraction(
+             extraction_class=self.extraction_class,
+             extraction_text=self.extraction_text,
+             char_interval=char_interval,
+             alignment_status=alignment_status,
+             extraction_index=self.extraction_index,
+             group_index=self.group_index,
+             description=self.description,
+             attributes=_attributes_to_dict(self.attributes),
+             token_interval=token_interval,
+         )
+
+     @staticmethod
+     def to_dataclass_list(
+         data: Sequence["ExtractionStructure"],
+     ) -> list[LXExtraction]:
+         """Convert a list of Extractions to LangExtract Extraction dataclasses.
+
+         Parameters
+         ----------
+         data : Sequence[ExtractionStructure]
+             List of structured extraction models.
+
+         Returns
+         -------
+         list[LXExtraction]
+             List of LangExtract extraction dataclass instances.
+         """
+         return [item.to_dataclass() for item in data]
+
+     @classmethod
+     def from_dataclass(cls, data: LXExtraction) -> "ExtractionStructure":
+         """Create an extraction from a LangExtract dataclass.
+
+         Parameters
+         ----------
+         data : LXExtraction
+             LangExtract extraction dataclass instance.
+
+         Returns
+         -------
+         ExtractionStructure
+             Structured extraction model.
+         """
+         char_interval = (
+             CharInterval.from_dataclass(data.char_interval)
+             if data.char_interval is not None
+             else None
+         )
+         alignment_status = (
+             AlignmentStatus.from_dataclass(data.alignment_status)
+             if data.alignment_status is not None
+             else None
+         )
+         token_interval = (
+             TokenInterval.from_dataclass(data.token_interval)
+             if data.token_interval is not None
+             else None
+         )
+         return cls(
+             extraction_class=data.extraction_class,
+             extraction_text=data.extraction_text,
+             char_interval=char_interval,
+             alignment_status=alignment_status,
+             extraction_index=data.extraction_index,
+             group_index=data.group_index,
+             description=data.description,
+             attributes=_attributes_from_dict(data.attributes),
+             token_interval=token_interval,
+         )
+
+     @staticmethod
+     def from_dataclass_list(
+         data: list[LXExtraction] | None,
+     ) -> list["ExtractionStructure"]:
+         """Create a list of extractions from a list of LangExtract dataclasses.
+
+         Parameters
+         ----------
+         data : list[LXExtraction] or None
+             List of LangExtract extraction dataclass instances.
+
+         Returns
+         -------
+         list[ExtractionStructure]
+             List of structured extraction models.
+         """
+         if data is None:
+             return []
+         return [ExtractionStructure.from_dataclass(item) for item in data]
+
+
+ class ExampleDataStructure(StructureBase):
+     """Represent example data for structured prompting.
+
+     Attributes
+     ----------
+     text : str
+         Raw text for the example.
+     extractions : list[ExtractionStructure]
+         Extractions associated with the text. Default is an empty list.
+
+     Methods
+     -------
+     to_dataclass()
+         Convert to a LangExtract ``ExampleData`` dataclass.
+     to_dataclass_list(data)
+         Convert structured example data to LangExtract dataclasses.
+     from_dataclass(data)
+         Create example data from a LangExtract dataclass.
+     from_dataclass_list(data)
+         Create structured examples from LangExtract dataclasses.
+     """
+
+     text: str = spec_field(
+         "text",
+         allow_null=False,
+         description="Raw text for the example.",
+     )
+     extractions: list[ExtractionStructure] = spec_field(
+         "extractions",
+         description="Extractions associated with the text.",
+         default_factory=list,
+     )
+
+     def to_dataclass(self) -> LXExampleData:
+         """Convert to LangExtract ExampleData dataclass.
+
+         Returns
+         -------
+         LXExampleData
+             LangExtract example dataclass instance.
+         """
+         return LXExampleData(
+             text=self.text,
+             extractions=ExtractionStructure.to_dataclass_list(self.extractions),
+         )
+
+     @staticmethod
+     def to_dataclass_list(
+         data: Sequence["ExampleDataStructure"],
+     ) -> list[LXExampleData]:
+         """Convert structured examples to LangExtract dataclasses.
+
+         Parameters
+         ----------
+         data : Sequence[ExampleDataStructure]
+             List of structured example data models.
+
+         Returns
+         -------
+         list[LXExampleData]
+             List of LangExtract example dataclass instances.
+         """
+         return [item.to_dataclass() for item in data]
+
+     @classmethod
+     def from_dataclass(cls, data: LXExampleData) -> "ExampleDataStructure":
+         """Create example data from a LangExtract dataclass.
+
+         Parameters
+         ----------
+         data : LXExampleData
+             LangExtract example dataclass instance.
+
+         Returns
+         -------
+         ExampleDataStructure
+             Structured example data model.
+         """
+         extractions = ExtractionStructure.from_dataclass_list(data.extractions)
+         return cls(text=data.text, extractions=extractions)
+
+     @staticmethod
+     def from_dataclass_list(
+         data: list[LXExampleData] | None,
+     ) -> list["ExampleDataStructure"]:
+         """Create structured examples from LangExtract dataclasses.
+
+         Parameters
+         ----------
+         data : list[LXExampleData] or None
+             List of LangExtract example dataclass instances.
+
+         Returns
+         -------
+         list[ExampleDataStructure]
+             List of structured example data models.
+         """
+         if data is None:
+             return []
+         return [ExampleDataStructure.from_dataclass(item) for item in data]
+
+
+ class AnnotatedDocumentStructure(StructureBase):
+     """Represent a document annotated with extractions.
+
+     Attributes
+     ----------
+     document_id : str | None
+         Identifier for the document.
+     extractions : list[ExtractionStructure] | None
+         Extractions associated with the document.
+     text : str | None
+         Raw text representation of the document.
+     tokenized_text : TokenizedText | None
+         Tokenized text for the document.
+
+     Methods
+     -------
+     to_dataclass()
+         Convert to a LangExtract ``AnnotatedDocument`` dataclass.
+     from_dataclass(data)
+         Create an annotated document from a LangExtract dataclass.
+     """
+
+     document_id: str | None = spec_field(
+         "document_id",
+         description="Identifier for the document.",
+         allow_null=True,
+     )
+     extractions: list[ExtractionStructure] | None = spec_field(
+         "extractions",
+         description="Extractions associated with the document.",
+         allow_null=True,
+         default_factory=list,
+     )
+     text: str | None = spec_field(
+         "text",
+         description="Raw text representation of the document.",
+         allow_null=True,
+     )
+     tokenized_text: TokenizedText | None = spec_field(
+         "tokenized_text",
+         description="Tokenized representation of the document text.",
+         allow_null=True,
+     )
+
+     def model_post_init(self, __context: Any) -> None:
+         """Populate default identifiers and tokenized text after validation."""
+         if self.document_id is None:
+             self.document_id = f"doc_{uuid.uuid4().hex[:8]}"
+         if self.text and self.tokenized_text is None:
+             tokenized = LXtokenizer.tokenize(self.text)
+             self.tokenized_text = TokenizedText.from_dataclass(tokenized)
+
+     def to_dataclass(self) -> LXAnnotatedDocument:
+         """Convert to LangExtract AnnotatedDocument dataclass.
+
+         Returns
+         -------
+         LXAnnotatedDocument
+             LangExtract annotated document dataclass instance.
+         """
+         lx_extractions = (
+             ExtractionStructure.to_dataclass_list(self.extractions)
+             if self.extractions is not None
+             else None
+         )
+         lx_doc = LXAnnotatedDocument(
+             document_id=self.document_id,
+             extractions=lx_extractions,
+             text=self.text,
+         )
+         if self.tokenized_text is not None:
+             lx_doc.tokenized_text = self.tokenized_text.to_dataclass()
+         return lx_doc
+
+     @classmethod
+     def from_dataclass(cls, data: LXAnnotatedDocument) -> "AnnotatedDocumentStructure":
+         """Create an annotated document from a LangExtract dataclass.
+
+         Parameters
+         ----------
+         data : LXAnnotatedDocument
+             LangExtract annotated document dataclass instance.
+
+         Returns
+         -------
+         AnnotatedDocumentStructure
+             Structured annotated document model.
+         """
+         extractions = (
+             ExtractionStructure.from_dataclass_list(data.extractions)
+             if data.extractions is not None
+             else None
+         )
+         tokenized_text = (
+             TokenizedText.from_dataclass(data.tokenized_text)
+             if data.tokenized_text is not None
+             else None
+         )
+         return cls(
+             document_id=data.document_id,
+             extractions=extractions,
+             text=data.text,
+             tokenized_text=tokenized_text,
+         )
+
+
+ class DocumentStructure(StructureBase):
+     """Represent a source document supplied for extraction.
+
+     Attributes
+     ----------
+     text : str
+         Raw text representation for the document.
+     document_id : str | None
+         Identifier for the source document.
+     additional_context : str | None
+         Additional context to supplement prompt instructions.
+     tokenized_text : TokenizedText | None
+         Tokenized representation of the document text.
+
+     Methods
+     -------
+     to_dataclass()
+         Convert to a LangExtract ``Document`` dataclass.
+     to_dataclass_list(data)
+         Convert structured documents to LangExtract dataclasses.
+     from_dataclass(data)
+         Create a document from a LangExtract dataclass.
+     from_dataclass_list(data)
+         Create structured documents from LangExtract dataclasses.
+     """
+
+     text: str = spec_field(
+         "text",
+         allow_null=False,
+         description="Raw text representation for the document.",
+     )
+     document_id: str | None = spec_field(
+         "document_id",
+         description="Identifier for the source document.",
+         allow_null=True,
+     )
+     additional_context: str | None = spec_field(
+         "additional_context",
+         description="Additional context to supplement prompt instructions.",
+         allow_null=True,
+     )
+     tokenized_text: TokenizedText | None = spec_field(
+         "tokenized_text",
+         description="Tokenized representation of the document text.",
+         allow_null=True,
+     )
+
+     def model_post_init(self, __context: Any) -> None:
+         """Populate default identifiers and tokenized text after validation."""
+         if self.document_id is None:
+             self.document_id = f"doc_{uuid.uuid4().hex[:8]}"
+         if self.tokenized_text is None and self.text:
+             tokenized = LXtokenizer.tokenize(self.text)
+             self.tokenized_text = TokenizedText.from_dataclass(tokenized)
+
+     def to_dataclass(self) -> LXDocument:
+         """Convert to LangExtract Document dataclass.
+
+         Returns
+         -------
+         LXDocument
+             LangExtract document dataclass instance.
+         """
+         lx_doc = LXDocument(
+             text=self.text,
+             document_id=self.document_id,
+             additional_context=self.additional_context,
+         )
+         if self.tokenized_text is not None:
+             lx_doc.tokenized_text = self.tokenized_text.to_dataclass()
+         return lx_doc
+
+     @staticmethod
+     def to_dataclass_list(
+         data: Sequence["DocumentStructure"],
+     ) -> list[LXDocument]:
+         """Convert structured documents to LangExtract dataclasses.
+
+         Parameters
+         ----------
+         data : Sequence[DocumentStructure]
+             List of structured document models.
+
+         Returns
+         -------
+         list[LXDocument]
+             List of LangExtract document dataclass instances.
+         """
+         return [item.to_dataclass() for item in data]
+
+     @classmethod
+     def from_dataclass(cls, data: LXDocument) -> "DocumentStructure":
+         """Create a document from a LangExtract dataclass.
+
+         Parameters
+         ----------
+         data : LXDocument
+             LangExtract document dataclass instance.
+
+         Returns
+         -------
+         DocumentStructure
+             Structured document model.
+         """
+         tokenized_text = (
+             TokenizedText.from_dataclass(data.tokenized_text)
+             if data.tokenized_text is not None
+             else None
+         )
+         return cls(
+             text=data.text,
+             document_id=data.document_id,
+             additional_context=data.additional_context,
+             tokenized_text=tokenized_text,
+         )
+
+     @staticmethod
+     def from_dataclass_list(
+         data: list[LXDocument] | None,
+     ) -> list["DocumentStructure"]:
+         """Create structured documents from LangExtract dataclasses.
+
+         Parameters
+         ----------
+         data : list[LXDocument] or None
+             List of LangExtract document dataclass instances.
+
+         Returns
+         -------
+         list[DocumentStructure]
+             List of structured document models.
+         """
+         if data is None:
+             return []
+         return [DocumentStructure.from_dataclass(item) for item in data]
+
+
+ class DocumentExtractorConfig(StructureBase):
+     """Configuration settings for the extractor.
+
+     Attributes
+     ----------
+     name : str
+         Name used to store and reuse extractor configurations.
+     prompt_description : str
+         Prompt description used by LangExtract.
+     extraction_classes : list[str]
+         List of extraction classes to be extracted.
+     examples : list[ExampleDataStructure]
+         Example payloads supplied to LangExtract.
+
+     Methods
+     -------
+     to_json()
+         Return a JSON-compatible dict representation.
+     to_json_file(filepath)
+         Write serialized JSON data to a file path.
+     """
+
+     name: str = spec_field(
+         "name",
+         allow_null=False,
+         description="Name used to store and reuse extractor configurations.",
+         examples=["invoice_entity_extractor"],
+     )
+     prompt_description: str = spec_field(
+         "prompt_description",
+         allow_null=False,
+         description="Prompt description used by LangExtract.",
+         examples=[
+             "Extract characters, emotions, and relationships in order of appearance. "
+             "Use exact text for extractions. Do not paraphrase or overlap entities. "
+             "Provide meaningful attributes for each entity to add context."
+         ],
+     )
+     extraction_classes: list[str] = spec_field(
+         "extraction_classes",
+         description="List of extraction classes to be extracted.",
+         default_factory=list,
+         examples=[["character", "emotion", "relationship"]],
+     )
+     examples: list[ExampleDataStructure] = spec_field(
+         "examples",
+         description="Example payloads supplied to LangExtract.",
+         default_factory=list,
+         examples=[
+             [
+                 ExampleDataStructure(
+                     text=(
+                         "ROMEO. But soft! What light through yonder window breaks? "
+                         "It is the east, and Juliet is the sun."
+                     ),
+                     extractions=[
+                         ExtractionStructure(
+                             extraction_class="character",
+                             extraction_text="ROMEO",
+                             attributes=[
+                                 AttributeStructure(
+                                     key="emotional_state",
+                                     value="wonder",
+                                 )
+                             ],
+                         ),
+                         ExtractionStructure(
+                             extraction_class="emotion",
+                             extraction_text="But soft!",
+                             attributes=[
+                                 AttributeStructure(
+                                     key="feeling",
+                                     value="gentle awe",
+                                 )
+                             ],
+                         ),
+                         ExtractionStructure(
+                             extraction_class="relationship",
+                             extraction_text="Juliet is the sun",
+                             attributes=[
+                                 AttributeStructure(
+                                     key="type",
+                                     value="metaphor",
+                                 )
+                             ],
+                         ),
+                     ],
+                 )
+             ]
+         ],
+     )
+
+
+ __all__ = [
+     "AnnotatedDocumentStructure",
+     "AttributeStructure",
+     "DocumentStructure",
+     "ExampleDataStructure",
+     "ExtractionStructure",
+     "DocumentExtractorConfig",
+ ]
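
As promised above, here is a minimal usage sketch for the new module. It is illustrative only and not part of the released wheel: it assumes the 0.5.0 wheel and its langextract dependency are installed, that StructureBase models accept keyword construction (as the example payloads embedded in DocumentExtractorConfig do), and the variable names and the "character_extractor" label are ours.

from openai_sdk_helpers.structure.extraction import (
    AttributeStructure,
    DocumentExtractorConfig,
    DocumentStructure,
    ExampleDataStructure,
    ExtractionStructure,
)

# model_post_init fills in a document_id ("doc_" + 8 hex chars) and a
# tokenized_text built with LXtokenizer.tokenize when only text is given.
doc = DocumentStructure(
    text="ROMEO. But soft! What light through yonder window breaks?"
)
assert doc.document_id is not None
assert doc.tokenized_text is not None

# Round-trip through the LangExtract layer: to_dataclass produces a
# langextract.core.data.Document; from_dataclass converts it back.
lx_doc = doc.to_dataclass()
assert DocumentStructure.from_dataclass(lx_doc).text == doc.text

# An extractor configuration mirroring the Romeo example embedded in the
# field specs above; the name is a hypothetical label.
config = DocumentExtractorConfig(
    name="character_extractor",
    prompt_description=(
        "Extract characters, emotions, and relationships in order of "
        "appearance. Use exact text for extractions."
    ),
    extraction_classes=["character", "emotion", "relationship"],
    examples=[
        ExampleDataStructure(
            text="ROMEO. But soft! What light through yonder window breaks?",
            extractions=[
                ExtractionStructure(
                    extraction_class="character",
                    extraction_text="ROMEO",
                    attributes=[
                        AttributeStructure(key="emotional_state", value="wonder")
                    ],
                )
            ],
        )
    ],
)

# Per the class docstring, to_json() returns a JSON-compatible dict, and
# ExampleDataStructure.to_dataclass_list yields langextract ExampleData
# objects ready for LangExtract's extraction entry points.
payload = config.to_json()
lx_examples = ExampleDataStructure.to_dataclass_list(config.examples)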