openai-sdk-helpers 0.4.3__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openai_sdk_helpers/__init__.py +41 -7
- openai_sdk_helpers/agent/__init__.py +1 -2
- openai_sdk_helpers/agent/base.py +169 -190
- openai_sdk_helpers/agent/configuration.py +12 -20
- openai_sdk_helpers/agent/coordinator.py +14 -17
- openai_sdk_helpers/agent/runner.py +3 -45
- openai_sdk_helpers/agent/search/base.py +49 -71
- openai_sdk_helpers/agent/search/vector.py +82 -110
- openai_sdk_helpers/agent/search/web.py +103 -81
- openai_sdk_helpers/agent/summarizer.py +20 -28
- openai_sdk_helpers/agent/translator.py +17 -23
- openai_sdk_helpers/agent/validator.py +17 -23
- openai_sdk_helpers/errors.py +9 -0
- openai_sdk_helpers/extract/__init__.py +23 -0
- openai_sdk_helpers/extract/extractor.py +157 -0
- openai_sdk_helpers/extract/generator.py +476 -0
- openai_sdk_helpers/files_api.py +1 -0
- openai_sdk_helpers/logging.py +12 -1
- openai_sdk_helpers/prompt/extractor_config_agent_instructions.jinja +6 -0
- openai_sdk_helpers/prompt/extractor_config_generator.jinja +37 -0
- openai_sdk_helpers/prompt/extractor_config_generator_instructions.jinja +9 -0
- openai_sdk_helpers/prompt/extractor_prompt_optimizer_agent_instructions.jinja +4 -0
- openai_sdk_helpers/prompt/extractor_prompt_optimizer_request.jinja +11 -0
- openai_sdk_helpers/response/__init__.py +2 -6
- openai_sdk_helpers/response/base.py +233 -164
- openai_sdk_helpers/response/configuration.py +39 -14
- openai_sdk_helpers/response/files.py +41 -2
- openai_sdk_helpers/response/runner.py +1 -48
- openai_sdk_helpers/response/tool_call.py +0 -141
- openai_sdk_helpers/response/vector_store.py +8 -5
- openai_sdk_helpers/streamlit_app/app.py +1 -9
- openai_sdk_helpers/structure/__init__.py +16 -0
- openai_sdk_helpers/structure/base.py +239 -278
- openai_sdk_helpers/structure/extraction.py +1228 -0
- openai_sdk_helpers/structure/plan/plan.py +0 -20
- openai_sdk_helpers/structure/plan/task.py +0 -33
- openai_sdk_helpers/structure/prompt.py +16 -0
- openai_sdk_helpers/structure/responses.py +2 -2
- openai_sdk_helpers/structure/web_search.py +0 -10
- openai_sdk_helpers/tools.py +346 -99
- openai_sdk_helpers/utils/__init__.py +7 -0
- openai_sdk_helpers/utils/json/base_model.py +315 -32
- openai_sdk_helpers/utils/langextract.py +194 -0
- openai_sdk_helpers/vector_storage/cleanup.py +7 -2
- openai_sdk_helpers/vector_storage/storage.py +37 -7
- {openai_sdk_helpers-0.4.3.dist-info → openai_sdk_helpers-0.5.1.dist-info}/METADATA +21 -6
- openai_sdk_helpers-0.5.1.dist-info/RECORD +95 -0
- openai_sdk_helpers/streamlit_app/streamlit_web_search.py +0 -75
- openai_sdk_helpers-0.4.3.dist-info/RECORD +0 -86
- {openai_sdk_helpers-0.4.3.dist-info → openai_sdk_helpers-0.5.1.dist-info}/WHEEL +0 -0
- {openai_sdk_helpers-0.4.3.dist-info → openai_sdk_helpers-0.5.1.dist-info}/entry_points.txt +0 -0
- {openai_sdk_helpers-0.4.3.dist-info → openai_sdk_helpers-0.5.1.dist-info}/licenses/LICENSE +0 -0
openai_sdk_helpers/structure/extraction.py (new file, +1228 lines)

```diff
@@ -0,0 +1,1228 @@
+"""Structured extraction result models."""
+
+from __future__ import annotations
+
+from typing import Any, Sequence
+import uuid
+from enum import Enum, IntEnum
+from langextract.core import format_handler as lx_format_handler
+from langextract.core.data import (
+    AlignmentStatus as LXAlignmentStatus,
+    AnnotatedDocument as LXAnnotatedDocument,
+    CharInterval as LXCharInterval,
+    Document as LXDocument,
+    ExampleData as LXExampleData,
+    Extraction as LXExtraction,
+)
+
+from langextract.core import tokenizer as LXtokenizer
+from .base import StructureBase, spec_field
+
+
+class CharInterval(StructureBase):
+    """Class for representing a character interval.
+
+    Attributes
+    ----------
+    start_pos: The starting position of the interval (inclusive).
+    end_pos: The ending position of the interval (exclusive).
+
+    Methods
+    -------
+    to_dataclass()
+        Convert to a LangExtract ``CharInterval`` dataclass.
+    from_dataclass(data)
+        Create a CharInterval from a LangExtract dataclass.
+    """
+
+    start_pos: int | None = spec_field(
+        "start_pos",
+        description="The starting position of the interval (inclusive).",
+    )
+    end_pos: int | None = spec_field(
+        "end_pos",
+        description="The ending position of the interval (exclusive).",
+    )
+
+    def to_dataclass(self) -> LXCharInterval:
+        """Convert to LangExtract CharInterval dataclass.
+
+        Returns
+        -------
+        LXCharInterval
+            LangExtract character interval dataclass instance.
+        """
+        return LXCharInterval(
+            start_pos=self.start_pos,
+            end_pos=self.end_pos,
+        )
+
+    @classmethod
+    def from_dataclass(cls, data: LXCharInterval) -> "CharInterval":
+        """Create a CharInterval from a LangExtract dataclass.
+
+        Parameters
+        ----------
+        data : LXCharInterval
+            LangExtract CharInterval dataclass instance.
+
+        Returns
+        -------
+        CharInterval
+            Structured character interval model.
+        """
+        return cls(
+            start_pos=data.start_pos,
+            end_pos=data.end_pos,
+        )
+
+
+class AlignmentStatus(Enum):
+    """Represent alignment status values for extracted items.
+
+    Methods
+    -------
+    to_dataclass()
+        Convert to a LangExtract ``AlignmentStatus`` dataclass.
+    from_dataclass(data)
+        Create an AlignmentStatus from a LangExtract dataclass.
+    """
+
+    MATCH_EXACT = "match_exact"
+    MATCH_GREATER = "match_greater"
+    MATCH_LESSER = "match_lesser"
+    MATCH_FUZZY = "match_fuzzy"
+
+    def to_dataclass(self) -> LXAlignmentStatus:
+        """Convert to LangExtract AlignmentStatus dataclass.
+
+        Returns
+        -------
+        LXAlignmentStatus
+            LangExtract alignment status dataclass instance.
+        """
+        return LXAlignmentStatus(self.value)
+
+    @classmethod
+    def from_dataclass(cls, data: LXAlignmentStatus) -> "AlignmentStatus":
+        """Create an AlignmentStatus from a LangExtract dataclass.
+
+        Parameters
+        ----------
+        data : LXAlignmentStatus
+            LangExtract alignment status dataclass instance.
+
+        Returns
+        -------
+        AlignmentStatus
+            Structured alignment status value.
+        """
+        return cls(data.value)
+
+
+class TokenCharInterval(StructureBase):
+    """Represents an interval over characters in tokenized text.
+
+    The interval is defined by a start position (inclusive) and an end position
+    (exclusive).
+
+    Attributes
+    ----------
+    start_pos: The starting position of the interval (inclusive).
+    end_pos: The ending position of the interval (exclusive).
+
+    Methods
+    -------
+    to_dataclass()
+        Convert to a LangExtract ``CharInterval`` dataclass.
+    from_dataclass(data)
+        Create a TokenCharInterval from a LangExtract dataclass.
+    """
+
+    start_pos: int = spec_field(
+        "start_pos",
+        description="The starting position of the interval (inclusive).",
+        default=0,
+    )
+    end_pos: int = spec_field(
+        "end_pos",
+        description="The ending position of the interval (exclusive).",
+        default=0,
+    )
+
+    def to_dataclass(self) -> LXtokenizer.CharInterval:
+        """Convert to LangExtract CharInterval dataclass.
+
+        Returns
+        -------
+        LXtokenizer.CharInterval
+            LangExtract character interval dataclass instance.
+        """
+        return LXtokenizer.CharInterval(
+            start_pos=self.start_pos,
+            end_pos=self.end_pos,
+        )
+
+    @classmethod
+    def from_dataclass(cls, data: LXtokenizer.CharInterval) -> "TokenCharInterval":
+        """Create a TokenCharInterval from a LangExtract dataclass.
+
+        Parameters
+        ----------
+        data : LXtokenizer.CharInterval
+            LangExtract CharInterval dataclass instance.
+
+        Returns
+        -------
+        TokenCharInterval
+            Structured token character interval model.
+        """
+        return cls(
+            start_pos=data.start_pos,
+            end_pos=data.end_pos,
+        )
+
+
+class TokenInterval(StructureBase):
+    """Represents an interval over tokens in tokenized text.
+
+    The interval is defined by a start index (inclusive) and an end index
+    (exclusive).
+
+    Attributes
+    ----------
+    start_index: The index of the first token in the interval.
+    end_index: The index one past the last token in the interval.
+
+    Methods
+    -------
+    to_dataclass()
+        Convert to a LangExtract ``TokenInterval`` dataclass.
+    from_dataclass(data)
+        Create a TokenInterval from a LangExtract dataclass.
+    """
+
+    start_index: int = spec_field(
+        "start_index",
+        description="The index of the first token in the interval.",
+        default=0,
+    )
+    end_index: int = spec_field(
+        "end_index",
+        description="The index one past the last token in the interval.",
+        default=0,
+    )
+
+    def to_dataclass(self) -> LXtokenizer.TokenInterval:
+        """Convert to LangExtract TokenInterval dataclass.
+
+        Returns
+        -------
+        LXtokenizer.TokenInterval
+            LangExtract token interval dataclass instance.
+        """
+        return LXtokenizer.TokenInterval(
+            start_index=self.start_index,
+            end_index=self.end_index,
+        )
+
+    @classmethod
+    def from_dataclass(cls, data: LXtokenizer.TokenInterval) -> "TokenInterval":
+        """Create a TokenInterval from a LangExtract dataclass.
+
+        Parameters
+        ----------
+        data : LXtokenizer.TokenInterval
+            LangExtract TokenInterval dataclass instance.
+
+        Returns
+        -------
+        TokenInterval
+            Structured token interval model.
+        """
+        return cls(
+            start_index=data.start_index,
+            end_index=data.end_index,
+        )
+
+
+class TokenType(IntEnum):
+    """Enumeration of token types produced during tokenization.
+
+    Attributes
+    ----------
+    WORD: Represents an alphabetical word token.
+    NUMBER: Represents a numeric token.
+    PUNCTUATION: Represents punctuation characters.
+
+    Methods
+    -------
+    to_dataclass()
+        Convert to a LangExtract ``TokenType`` dataclass.
+    from_dataclass(data)
+        Create a TokenType from a LangExtract dataclass.
+    """
+
+    WORD = 0
+    NUMBER = 1
+    PUNCTUATION = 2
+
+    def to_dataclass(self) -> LXtokenizer.TokenType:
+        """Convert to LangExtract TokenType dataclass.
+
+        Returns
+        -------
+        LXtokenizer.TokenType
+            LangExtract token type dataclass instance.
+        """
+        return LXtokenizer.TokenType(self.value)
+
+    @classmethod
+    def from_dataclass(cls, data: LXtokenizer.TokenType) -> "TokenType":
+        """Create a TokenType from a LangExtract dataclass.
+
+        Parameters
+        ----------
+        data : LXtokenizer.TokenType
+            LangExtract token type dataclass instance.
+
+        Returns
+        -------
+        TokenType
+            Structured token type value.
+        """
+        return cls(data.value)
+
+
+class Token(StructureBase):
+    """Represents a token extracted from text.
+
+    Each token is assigned an index and classified into a type (word, number,
+    punctuation, or acronym). The token also records the range of characters
+    (its CharInterval) that correspond to the substring from the original text.
+    Additionally, it tracks whether it follows a newline.
+
+    Attributes
+    ----------
+    index: The position of the token in the sequence of tokens.
+    token_type: The type of the token, as defined by TokenType.
+    char_interval: The character interval within the original text that this
+        token spans.
+    first_token_after_newline: True if the token immediately follows a newline
+        or carriage return.
+
+    Methods
+    -------
+    to_dataclass()
+        Convert to a LangExtract ``Token`` dataclass.
+    from_dataclass(data)
+        Create a Token from a LangExtract dataclass.
+    from_dataclass_list(data)
+        Create structured tokens from LangExtract dataclasses.
+    to_dataclass_list(data)
+        Convert structured tokens to LangExtract dataclasses.
+    """
+
+    index: int = spec_field(
+        "index",
+        description="The position of the token in the sequence of tokens.",
+    )
+    token_type: TokenType = spec_field(
+        "token_type",
+        description="The type of the token, as defined by TokenType.",
+    )
+    char_interval: TokenCharInterval | None = spec_field(
+        "char_interval",
+        description="The character interval within the original text that this token spans.",
+        allow_null=True,
+    )
+    first_token_after_newline: bool = spec_field(
+        "first_token_after_newline",
+        description="True if the token immediately follows a newline or carriage return.",
+        default=False,
+    )
+
+    def to_dataclass(self) -> LXtokenizer.Token:
+        """Convert to LangExtract Token dataclass.
+
+        Returns
+        -------
+        LXtokenizer.Token
+            LangExtract token dataclass instance.
+        """
+        token = LXtokenizer.Token(
+            index=self.index,
+            token_type=LXtokenizer.TokenType(self.token_type),
+            first_token_after_newline=self.first_token_after_newline,
+        )
+        if self.char_interval is not None:
+            token.char_interval = self.char_interval.to_dataclass()
+        return token
+
+    @classmethod
+    def from_dataclass(cls, data: LXtokenizer.Token) -> "Token":
+        """Create a Token from a LangExtract dataclass.
+
+        Parameters
+        ----------
+        data : LXtokenizer.Token
+            LangExtract token dataclass instance.
+
+        Returns
+        -------
+        Token
+            Structured token model.
+        """
+        char_interval = (
+            TokenCharInterval.from_dataclass(data.char_interval)
+            if data.char_interval is not None
+            else None
+        )
+        return cls(
+            index=data.index,
+            token_type=TokenType.from_dataclass(data.token_type),
+            char_interval=char_interval,
+            first_token_after_newline=data.first_token_after_newline,
+        )
+
+    @staticmethod
+    def from_dataclass_list(data: list[LXtokenizer.Token]) -> list["Token"]:
+        """Create a list of Tokens from a list of LangExtract dataclasses.
+
+        Parameters
+        ----------
+        data : list[LXtokenizer.Token]
+            List of LangExtract token dataclass instances.
+
+        Returns
+        -------
+        list[Token]
+            List of structured token models.
+        """
+        return [Token.from_dataclass(item) for item in data]
+
+    @staticmethod
+    def to_dataclass_list(data: list["Token"]) -> list[LXtokenizer.Token]:
+        """Convert a list of Tokens to LangExtract Token dataclasses.
+
+        Parameters
+        ----------
+        data : list[Token]
+            List of structured token models.
+
+        Returns
+        -------
+        list[LXtokenizer.Token]
+            List of LangExtract token dataclass instances.
+        """
+        return [item.to_dataclass() for item in data]
+
+
+class TokenizedText(StructureBase):
+    """Holds the result of tokenizing a text string.
+
+    Attributes
+    ----------
+    text: The text that was tokenized. For UnicodeTokenizer, this is
+        NOT normalized to NFC (to preserve indices).
+    tokens: A list of Token objects extracted from the text.
+
+    Methods
+    -------
+    to_dataclass()
+        Convert to a LangExtract ``TokenizedText`` dataclass.
+    from_dataclass(data)
+        Create a TokenizedText from a LangExtract dataclass.
+    """
+
+    text: str = spec_field(
+        "text",
+        description="The text that was tokenized.",
+        allow_null=False,
+    )
+    tokens: list[Token] = spec_field(
+        "tokens",
+        description="A list of Token objects extracted from the text.",
+        allow_null=True,
+        default_factory=list,
+    )
+
+    def to_dataclass(self) -> LXtokenizer.TokenizedText:
+        """Convert to LangExtract TokenizedText dataclass.
+
+        Returns
+        -------
+        LXtokenizer.TokenizedText
+            LangExtract tokenized text dataclass instance.
+        """
+        return LXtokenizer.TokenizedText(
+            text=self.text,
+            tokens=Token.to_dataclass_list(self.tokens),
+        )
+
+    @classmethod
+    def from_dataclass(cls, data: LXtokenizer.TokenizedText) -> "TokenizedText":
+        """Create a TokenizedText from a LangExtract dataclass.
+
+        Parameters
+        ----------
+        data : LXtokenizer.TokenizedText
+            LangExtract TokenizedText dataclass instance.
+
+        Returns
+        -------
+        TokenizedText
+            Structured tokenized text model.
+        """
+        return cls(
+            text=data.text,
+            tokens=Token.from_dataclass_list(data.tokens),
+        )
+
+
+class AttributeStructure(StructureBase):
+    """Represent an extraction attribute as a key/value pair.
+
+    Attributes
+    ----------
+    key : str
+        Attribute key.
+    value : str | int | float | dict | list | None
+        Attribute value.
+
+    Methods
+    -------
+    to_pair()
+        Convert the attribute to a tuple of ``(key, value)``.
+    from_pair(key, value)
+        Build an attribute from a key/value pair.
+    """
+
+    key: str = spec_field(
+        "key",
+        allow_null=False,
+        description="Attribute key.",
+    )
+    value: lx_format_handler.ExtractionValueType = spec_field(
+        "value",
+        allow_null=True,
+        description="Attribute value.",
+    )
+
+    def to_pair(self) -> tuple[str, lx_format_handler.ExtractionValueType]:
+        """Convert the attribute to a key/value pair.
+
+        Returns
+        -------
+        tuple[str, str | int | float | dict | list | None]
+            Tuple containing the attribute key and value.
+        """
+        return self.key, self.value
+
+    @classmethod
+    def from_pair(
+        cls, key: str, value: lx_format_handler.ExtractionValueType
+    ) -> "AttributeStructure":
+        """Build an attribute from a key/value pair.
+
+        Parameters
+        ----------
+        key : str
+            Attribute key.
+        value : str | int | float | dict | list | None
+            Attribute value to store.
+
+        Returns
+        -------
+        AttributeStructure
+            Structured attribute instance.
+        """
+        return cls(key=key, value=value)
+
+
+def _attributes_to_dict(
+    attributes: list[AttributeStructure] | None,
+) -> dict[str, Any] | None:
+    """Convert structured attributes to a dictionary.
+
+    Parameters
+    ----------
+    attributes : list[AttributeStructure] or None
+        Structured attributes to convert.
+
+    Returns
+    -------
+    dict[str, Any] or None
+        Mapping of attribute keys to values.
+    """
+    if attributes is None:
+        return None
+    return {attribute.key: attribute.value for attribute in attributes}
+
+
+def _attributes_from_dict(
+    attributes: dict[str, Any] | None,
+) -> list[AttributeStructure] | None:
+    """Convert an attribute dictionary into structured attributes.
+
+    Parameters
+    ----------
+    attributes : dict[str, Any] or None
+        Attributes mapping to convert.
+
+    Returns
+    -------
+    list[AttributeStructure] or None
+        Structured attribute list.
+    """
+    if attributes is None:
+        return None
+    return [
+        AttributeStructure.from_pair(key, value) for key, value in attributes.items()
+    ]
+
+
+class ExtractionStructure(StructureBase):
+    """Represent a single extraction from a document.
+
+    Attributes
+    ----------
+    extraction_class : str
+        Label or class assigned to the extracted item.
+    extraction_text : str
+        Raw text captured for the extracted item.
+    description : str | None
+        Optional description of the extracted item.
+    attributes : list[AttributeStructure] | None
+        Additional attributes attached to the item.
+    char_interval : CharInterval | None
+        Character interval in the source text.
+    alignment_status : AlignmentStatus | None
+        Alignment status of the extracted item.
+    extraction_index : int | None
+        Index of the extraction in the list of extractions.
+    group_index : int | None
+        Index of the group this item belongs to, if applicable.
+    token_interval : TokenInterval | None
+        Token interval of the extracted item.
+
+    Methods
+    -------
+    to_dataclass()
+        Convert to a LangExtract ``Extraction`` dataclass.
+    to_dataclass_list(data)
+        Convert structured extractions to LangExtract dataclasses.
+    from_dataclass(data)
+        Create an extraction from a LangExtract dataclass.
+    from_dataclass_list(data)
+        Create structured extractions from LangExtract dataclasses.
+    """
+
+    extraction_class: str = spec_field(
+        "extraction_class",
+        allow_null=False,
+        description="Label or class for the extracted item.",
+    )
+    extraction_text: str = spec_field(
+        "extraction_text",
+        allow_null=False,
+        description="Raw text captured for the extracted item.",
+    )
+    description: str | None = spec_field(
+        "description",
+        allow_null=True,
+        description="Optional description of the extracted item.",
+    )
+    attributes: list[AttributeStructure] | None = spec_field(
+        "attributes",
+        default=None,
+        description="Additional attributes attached to the item.",
+    )
+    char_interval: CharInterval | None = spec_field(
+        "char_interval",
+        allow_null=True,
+        description="Character interval of the extracted item in the source text.",
+    )
+    alignment_status: AlignmentStatus | None = spec_field(
+        "alignment_status",
+        allow_null=True,
+        description="Alignment status of the extracted item.",
+    )
+    extraction_index: int | None = spec_field(
+        "extraction_index",
+        description="Index of the extraction in the list of extractions.",
+        allow_null=True,
+    )
+    group_index: int | None = spec_field(
+        "group_index",
+        description="Index of the group this item belongs to, if applicable.",
+        allow_null=True,
+    )
+
+    token_interval: TokenInterval | None = spec_field(
+        "token_interval",
+        description="Token interval of the extracted item.",
+        allow_null=True,
+    )
+
+    def to_dataclass(self) -> LXExtraction:
+        """Convert to LangExtract Extraction dataclass.
+
+        Returns
+        -------
+        LXExtraction
+            LangExtract extraction dataclass instance.
+        """
+        char_interval = (
+            self.char_interval.to_dataclass()
+            if self.char_interval is not None
+            else None
+        )
+        alignment_status = (
+            self.alignment_status.to_dataclass()
+            if self.alignment_status is not None
+            else None
+        )
+        token_interval = (
+            self.token_interval.to_dataclass()
+            if self.token_interval is not None
+            else None
+        )
+        return LXExtraction(
+            extraction_class=self.extraction_class,
+            extraction_text=self.extraction_text,
+            char_interval=char_interval,
+            alignment_status=alignment_status,
+            extraction_index=self.extraction_index,
+            group_index=self.group_index,
+            description=self.description,
+            attributes=_attributes_to_dict(self.attributes),
+            token_interval=token_interval,
+        )
+
+    @staticmethod
+    def to_dataclass_list(
+        data: Sequence["ExtractionStructure"],
+    ) -> list[LXExtraction]:
+        """Convert a list of Extractions to LangExtract Extraction dataclasses.
+
+        Parameters
+        ----------
+        data : Sequence[ExtractionStructure]
+            List of structured extraction models.
+
+        Returns
+        -------
+        list[LXExtraction]
+            List of LangExtract extraction dataclass instances.
+        """
+        return [item.to_dataclass() for item in data]
+
+    @classmethod
+    def from_dataclass(cls, data: LXExtraction) -> "ExtractionStructure":
+        """Create an extraction from a LangExtract dataclass.
+
+        Parameters
+        ----------
+        data : LXExtraction
+            LangExtract extraction dataclass instance.
+
+        Returns
+        -------
+        ExtractionStructure
+            Structured extraction model.
+        """
+        char_interval = (
+            CharInterval.from_dataclass(data.char_interval)
+            if data.char_interval is not None
+            else None
+        )
+        alignment_status = (
+            AlignmentStatus.from_dataclass(data.alignment_status)
+            if data.alignment_status is not None
+            else None
+        )
+        token_interval = (
+            TokenInterval.from_dataclass(data.token_interval)
+            if data.token_interval is not None
+            else None
+        )
+        return cls(
+            extraction_class=data.extraction_class,
+            extraction_text=data.extraction_text,
+            char_interval=char_interval,
+            alignment_status=alignment_status,
+            extraction_index=data.extraction_index,
+            group_index=data.group_index,
+            description=data.description,
+            attributes=_attributes_from_dict(data.attributes),
+            token_interval=token_interval,
+        )
+
+    @staticmethod
+    def from_dataclass_list(
+        data: list[LXExtraction] | None,
+    ) -> list["ExtractionStructure"]:
+        """Create a list of extractions from a list of LangExtract dataclasses.
+
+        Parameters
+        ----------
+        data : list[LXExtraction]
+            List of LangExtract extraction dataclass instances.
+
+        Returns
+        -------
+        list[ExtractionStructure]
+            List of structured extraction models.
+        """
+        if data is None:
+            return []
+        return [ExtractionStructure.from_dataclass(item) for item in data]
+
+
+class ExampleDataStructure(StructureBase):
+    """Represent example data for structured prompting.
+
+    Attributes
+    ----------
+    text : str
+        Raw text for the example.
+    extractions : list[ExtractionStructure]
+        Extractions associated with the text. Default is an empty list.
+
+    Methods
+    -------
+    to_dataclass()
+        Convert to a LangExtract ``ExampleData`` dataclass.
+    to_dataclass_list(data)
+        Convert structured example data to LangExtract dataclasses.
+    from_dataclass(data)
+        Create example data from a LangExtract dataclass.
+    from_dataclass_list(data)
+        Create structured examples from LangExtract dataclasses.
+    """
+
+    text: str = spec_field(
+        "text",
+        allow_null=False,
+        description="Raw text for the example.",
+    )
+    extractions: list[ExtractionStructure] = spec_field(
+        "extractions",
+        description="Extractions associated with the text.",
+        default_factory=list,
+    )
+
+    def to_dataclass(self) -> LXExampleData:
+        """Convert to LangExtract ExampleData dataclass.
+
+        Returns
+        -------
+        LXExampleData
+            LangExtract example dataclass instance.
+        """
+        return LXExampleData(
+            text=self.text,
+            extractions=ExtractionStructure.to_dataclass_list(self.extractions),
+        )
+
+    @staticmethod
+    def to_dataclass_list(
+        data: Sequence["ExampleDataStructure"],
+    ) -> list[LXExampleData]:
+        """Convert structured examples to LangExtract dataclasses.
+
+        Parameters
+        ----------
+        data : Sequence[ExampleDataStructure]
+            List of structured example data models.
+
+        Returns
+        -------
+        list[LXExampleData]
+            List of LangExtract example dataclass instances.
+        """
+        return [item.to_dataclass() for item in data]
+
+    @classmethod
+    def from_dataclass(cls, data: LXExampleData) -> "ExampleDataStructure":
+        """Create example data from a LangExtract dataclass.
+
+        Parameters
+        ----------
+        data : LXExampleData
+            LangExtract example dataclass instance.
+
+        Returns
+        -------
+        ExampleDataStructure
+            Structured example data model.
+        """
+        extractions = ExtractionStructure.from_dataclass_list(data.extractions)
+        return cls(text=data.text, extractions=extractions)
+
+    @staticmethod
+    def from_dataclass_list(
+        data: list[LXExampleData] | None,
+    ) -> list["ExampleDataStructure"]:
+        """Create structured examples from LangExtract dataclasses.
+
+        Parameters
+        ----------
+        data : list[LXExampleData] or None
+            List of LangExtract example dataclass instances.
+
+        Returns
+        -------
+        list[ExampleDataStructure]
+            List of structured example data models.
+        """
+        if data is None:
+            return []
+        return [ExampleDataStructure.from_dataclass(item) for item in data]
+
+
+class AnnotatedDocumentStructure(StructureBase):
+    """Represent a document annotated with extractions.
+
+    Attributes
+    ----------
+    document_id : str | None
+        Identifier for the document.
+    extractions : list[ExtractionStructure] | None
+        Extractions associated with the document.
+    text : str | None
+        Raw text representation of the document.
+    tokenized_text : TokenizedText | None
+        Tokenized text for the document.
+
+    Methods
+    -------
+    to_dataclass()
+        Convert to a LangExtract ``AnnotatedDocument`` dataclass.
+    from_dataclass(data)
+        Create an annotated document from a LangExtract dataclass.
+    """
+
+    document_id: str | None = spec_field(
+        "document_id",
+        description="Identifier for the document.",
+        allow_null=True,
+    )
+    extractions: list[ExtractionStructure] | None = spec_field(
+        "extractions",
+        description="Extractions associated with the document.",
+        allow_null=True,
+        default_factory=list,
+    )
+    text: str | None = spec_field(
+        "text",
+        description="Raw text representation of the document.",
+        allow_null=True,
+    )
+    tokenized_text: TokenizedText | None = spec_field(
+        "tokenized_text",
+        description="Tokenized representation of the document text.",
+        allow_null=True,
+    )
+
+    def model_post_init(self, __context: Any) -> None:
+        """Populate default identifiers and tokenized text after validation."""
+        if self.document_id is None:
+            self.document_id = f"doc_{uuid.uuid4().hex[:8]}"
+        if self.text and self.tokenized_text is None:
+            tokenized = LXtokenizer.tokenize(self.text)
+            self.tokenized_text = TokenizedText.from_dataclass(tokenized)
+
+    def to_dataclass(self) -> LXAnnotatedDocument:
+        """Convert to LangExtract AnnotatedDocument dataclass.
+
+        Returns
+        -------
+        LXAnnotatedDocument
+            LangExtract annotated document dataclass instance.
+        """
+        lx_extractions = (
+            ExtractionStructure.to_dataclass_list(self.extractions)
+            if self.extractions is not None
+            else None
+        )
+        lx_doc = LXAnnotatedDocument(
+            document_id=self.document_id,
+            extractions=lx_extractions,
+            text=self.text,
+        )
+        if self.tokenized_text is not None:
+            lx_doc.tokenized_text = self.tokenized_text.to_dataclass()
+        return lx_doc
+
+    @classmethod
+    def from_dataclass(cls, data: LXAnnotatedDocument) -> "AnnotatedDocumentStructure":
+        """Create an annotated document from a LangExtract dataclass.
+
+        Parameters
+        ----------
+        data : LXAnnotatedDocument
+            LangExtract annotated document dataclass instance.
+
+        Returns
+        -------
+        AnnotatedDocumentStructure
+            Structured annotated document model.
+        """
+        extractions = (
+            ExtractionStructure.from_dataclass_list(data.extractions)
+            if data.extractions is not None
+            else None
+        )
+        tokenized_text = (
+            TokenizedText.from_dataclass(data.tokenized_text)
+            if data.tokenized_text is not None
+            else None
+        )
+        return cls(
+            document_id=data.document_id,
+            extractions=extractions,
+            text=data.text,
+            tokenized_text=tokenized_text,
+        )
+
+
+class DocumentStructure(StructureBase):
+    """Store extraction results for a document.
+
+    Attributes
+    ----------
+    text : str
+        Raw text representation for the document.
+    document_id : str | None
+        Identifier for the source document.
+    additional_context : str | None
+        Additional context to supplement prompt instructions.
+    tokenized_text : TokenizedText | None
+        Tokenized representation of the document text.
+
+    Methods
+    -------
+    to_dataclass()
+        Convert to a LangExtract ``Document`` dataclass.
+    to_dataclass_list(data)
+        Convert structured documents to LangExtract dataclasses.
+    from_dataclass(data)
+        Create a document from a LangExtract dataclass.
+    from_dataclass_list(data)
+        Create structured documents from LangExtract dataclasses.
+    """
+
+    text: str = spec_field(
+        "text",
+        allow_null=False,
+        description="Raw text representation for the document.",
+    )
+    document_id: str | None = spec_field(
+        "document_id",
+        description="Identifier for the source document.",
+        allow_null=True,
+    )
+    additional_context: str | None = spec_field(
+        "additional_context",
+        description="Additional context to supplement prompt instructions.",
+        allow_null=True,
+    )
+    tokenized_text: TokenizedText | None = spec_field(
+        "tokenized_text",
+        description="Tokenized representation of the document text.",
+        allow_null=True,
+    )
+
+    def model_post_init(self, __context: Any) -> None:
+        """Populate default identifiers and tokenized text after validation."""
+        if self.document_id is None:
+            self.document_id = f"doc_{uuid.uuid4().hex[:8]}"
+        if self.tokenized_text is None and self.text:
+            tokenized = LXtokenizer.tokenize(self.text)
+            self.tokenized_text = TokenizedText.from_dataclass(tokenized)
+
+    def to_dataclass(self) -> LXDocument:
+        """Convert to LangExtract Document dataclass.
+
+        Returns
+        -------
+        LXDocument
+            LangExtract document dataclass instance.
+        """
+        lx_doc = LXDocument(
+            text=self.text,
+            document_id=self.document_id,
+            additional_context=self.additional_context,
+        )
+        if self.tokenized_text is not None:
+            lx_doc.tokenized_text = self.tokenized_text.to_dataclass()
+        return lx_doc
+
+    @staticmethod
+    def to_dataclass_list(
+        data: Sequence["DocumentStructure"],
+    ) -> list[LXDocument]:
+        """Convert structured documents to LangExtract dataclasses.
+
+        Parameters
+        ----------
+        data : Sequence[DocumentStructure]
+            List of structured document models.
+
+        Returns
+        -------
+        list[LXDocument]
+            List of LangExtract document dataclass instances.
+        """
+        return [item.to_dataclass() for item in data]
+
+    @classmethod
+    def from_dataclass(cls, data: LXDocument) -> "DocumentStructure":
+        """Create a document from a LangExtract dataclass.
+
+        Parameters
+        ----------
+        data : LXDocument
+            LangExtract document dataclass instance.
+
+        Returns
+        -------
+        DocumentStructure
+            Structured document model.
+        """
+        tokenized_text = (
+            TokenizedText.from_dataclass(data.tokenized_text)
+            if data.tokenized_text is not None
+            else None
+        )
+        return cls(
+            text=data.text,
+            document_id=data.document_id,
+            additional_context=data.additional_context,
+            tokenized_text=tokenized_text,
+        )
+
+    @staticmethod
+    def from_dataclass_list(
+        data: list[LXDocument] | None,
+    ) -> list["DocumentStructure"]:
+        """Create structured documents from LangExtract dataclasses.
+
+        Parameters
+        ----------
+        data : list[LXDocument] or None
+            List of LangExtract document dataclass instances.
+
+        Returns
+        -------
+        list[DocumentStructure]
+            List of structured document models.
+        """
+        if data is None:
+            return []
+        return [DocumentStructure.from_dataclass(item) for item in data]
+
+
+class DocumentExtractorConfig(StructureBase):
+    """Configuration settings for the extractor.
+
+    Attributes
+    ----------
+    name : str
+        Name used to store and reuse extractor configurations.
+    prompt_description : str
+        Prompt description used by LangExtract.
+    extraction_classes : list[str]
+        List of extraction classes to be extracted.
+    examples : list[ExampleDataStructure]
+        Example payloads supplied to LangExtract.
+
+    Methods
+    -------
+    to_json()
+        Return a JSON-compatible dict representation.
+    to_json_file(filepath)
+        Write serialized JSON data to a file path.
+    """
+
+    name: str = spec_field(
+        "name",
+        allow_null=False,
+        description="Name used to store and reuse extractor configurations.",
+        examples=["invoice_entity_extractor"],
+    )
+    prompt_description: str = spec_field(
+        "prompt_description",
+        allow_null=False,
+        description="Prompt description used by LangExtract.",
+        examples=[
+            "Extract characters, emotions, and relationships in order of appearance. "
+            "Use exact text for extractions. Do not paraphrase or overlap entities. "
+            "Provide meaningful attributes for each entity to add context."
+        ],
+    )
+    extraction_classes: list[str] = spec_field(
+        "extraction_classes",
+        description="List of extraction classes to be extracted.",
+        default_factory=list,
+        examples=[["character", "emotion", "relationship"]],
+    )
+    examples: list[ExampleDataStructure] = spec_field(
+        "examples",
+        description="Example payloads supplied to LangExtract.",
+        default_factory=list,
+        examples=[
+            [
+                ExampleDataStructure(
+                    text=(
+                        "ROMEO. But soft! What light through yonder window breaks? "
+                        "It is the east, and Juliet is the sun."
+                    ),
+                    extractions=[
+                        ExtractionStructure(
+                            extraction_class="character",
+                            extraction_text="ROMEO",
+                            attributes=[
+                                AttributeStructure(
+                                    key="emotional_state",
+                                    value="wonder",
+                                )
+                            ],
+                        ),
+                        ExtractionStructure(
+                            extraction_class="emotion",
+                            extraction_text="But soft!",
+                            attributes=[
+                                AttributeStructure(
+                                    key="feeling",
+                                    value="gentle awe",
+                                )
+                            ],
+                        ),
+                        ExtractionStructure(
+                            extraction_class="relationship",
+                            extraction_text="Juliet is the sun",
+                            attributes=[
+                                AttributeStructure(
+                                    key="type",
+                                    value="metaphor",
+                                )
+                            ],
+                        ),
+                    ],
+                )
+            ]
+        ],
+    )
+
+
+__all__ = [
+    "AnnotatedDocumentStructure",
+    "AttributeStructure",
+    "DocumentStructure",
+    "ExampleDataStructure",
+    "ExtractionStructure",
+    "DocumentExtractorConfig",
+]
```
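For orientation, here is a minimal usage sketch of the conversion pattern this new module implements: each structured model mirrors a LangExtract dataclass and round-trips through `to_dataclass()` and `from_dataclass()`. This sketch is not part of the diff. It assumes `langextract` is installed, that `StructureBase` behaves like a Pydantic model whose `spec_field` makes nullable fields optional, and that these names are importable from `openai_sdk_helpers.structure.extraction`; none of that is confirmed beyond what the diff shows.

```python
# Illustrative sketch under the assumptions stated above.
from openai_sdk_helpers.structure.extraction import (
    AttributeStructure,
    CharInterval,
    DocumentStructure,
    ExtractionStructure,
)

# Build a structured extraction; attributes are key/value models rather
# than a plain dict.
extraction = ExtractionStructure(
    extraction_class="character",
    extraction_text="ROMEO",
    attributes=[AttributeStructure(key="emotional_state", value="wonder")],
    char_interval=CharInterval(start_pos=0, end_pos=5),
)

# to_dataclass() produces the LangExtract Extraction dataclass; the
# attribute list is flattened back into a dict by _attributes_to_dict.
lx_extraction = extraction.to_dataclass()
assert lx_extraction.attributes == {"emotional_state": "wonder"}

# from_dataclass() restores the structured model, completing the round trip.
restored = ExtractionStructure.from_dataclass(lx_extraction)
assert restored.extraction_text == "ROMEO"

# DocumentStructure.model_post_init fills in a doc_<hex> identifier and a
# tokenized_text when they are not supplied.
doc = DocumentStructure(text="But soft! What light through yonder window breaks?")
assert doc.document_id is not None
assert doc.tokenized_text is not None
```

The split between `AttributeStructure` lists here and plain dicts on the LangExtract side appears to exist so that attributes remain expressible as a strict schema (for example, for OpenAI structured outputs) while converting losslessly to the dict form LangExtract expects.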