openai-sdk-helpers 0.4.3__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openai_sdk_helpers/__init__.py +41 -7
- openai_sdk_helpers/agent/__init__.py +1 -2
- openai_sdk_helpers/agent/base.py +89 -173
- openai_sdk_helpers/agent/configuration.py +12 -20
- openai_sdk_helpers/agent/coordinator.py +14 -17
- openai_sdk_helpers/agent/runner.py +3 -45
- openai_sdk_helpers/agent/search/base.py +49 -71
- openai_sdk_helpers/agent/search/vector.py +82 -110
- openai_sdk_helpers/agent/search/web.py +103 -81
- openai_sdk_helpers/agent/summarizer.py +20 -28
- openai_sdk_helpers/agent/translator.py +17 -23
- openai_sdk_helpers/agent/validator.py +17 -23
- openai_sdk_helpers/errors.py +9 -0
- openai_sdk_helpers/extract/__init__.py +23 -0
- openai_sdk_helpers/extract/extractor.py +157 -0
- openai_sdk_helpers/extract/generator.py +476 -0
- openai_sdk_helpers/prompt/extractor_config_agent_instructions.jinja +6 -0
- openai_sdk_helpers/prompt/extractor_config_generator.jinja +37 -0
- openai_sdk_helpers/prompt/extractor_config_generator_instructions.jinja +9 -0
- openai_sdk_helpers/prompt/extractor_prompt_optimizer_agent_instructions.jinja +4 -0
- openai_sdk_helpers/prompt/extractor_prompt_optimizer_request.jinja +11 -0
- openai_sdk_helpers/response/__init__.py +2 -6
- openai_sdk_helpers/response/base.py +85 -94
- openai_sdk_helpers/response/configuration.py +39 -14
- openai_sdk_helpers/response/files.py +2 -0
- openai_sdk_helpers/response/runner.py +1 -48
- openai_sdk_helpers/response/tool_call.py +0 -141
- openai_sdk_helpers/response/vector_store.py +8 -5
- openai_sdk_helpers/streamlit_app/app.py +1 -1
- openai_sdk_helpers/structure/__init__.py +16 -0
- openai_sdk_helpers/structure/base.py +239 -278
- openai_sdk_helpers/structure/extraction.py +1228 -0
- openai_sdk_helpers/structure/plan/plan.py +0 -20
- openai_sdk_helpers/structure/plan/task.py +0 -33
- openai_sdk_helpers/structure/prompt.py +16 -0
- openai_sdk_helpers/structure/responses.py +2 -2
- openai_sdk_helpers/structure/web_search.py +0 -10
- openai_sdk_helpers/tools.py +346 -99
- openai_sdk_helpers/utils/__init__.py +7 -0
- openai_sdk_helpers/utils/json/base_model.py +315 -32
- openai_sdk_helpers/utils/langextract.py +194 -0
- {openai_sdk_helpers-0.4.3.dist-info → openai_sdk_helpers-0.5.0.dist-info}/METADATA +18 -4
- {openai_sdk_helpers-0.4.3.dist-info → openai_sdk_helpers-0.5.0.dist-info}/RECORD +46 -37
- openai_sdk_helpers/streamlit_app/streamlit_web_search.py +0 -75
- {openai_sdk_helpers-0.4.3.dist-info → openai_sdk_helpers-0.5.0.dist-info}/WHEEL +0 -0
- {openai_sdk_helpers-0.4.3.dist-info → openai_sdk_helpers-0.5.0.dist-info}/entry_points.txt +0 -0
- {openai_sdk_helpers-0.4.3.dist-info → openai_sdk_helpers-0.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Document extraction helpers powered by LangExtract."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import typing
|
|
8
|
+
|
|
9
|
+
import langextract as lx
|
|
10
|
+
from langextract.core import format_handler as lx_format_handler
|
|
11
|
+
from langextract.core.data import AnnotatedDocument as LXAnnotatedDocument
|
|
12
|
+
|
|
13
|
+
from ..errors import ExtractionError
|
|
14
|
+
from ..structure.extraction import (
|
|
15
|
+
AnnotatedDocumentStructure,
|
|
16
|
+
DocumentStructure,
|
|
17
|
+
ExampleDataStructure,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DocumentExtractor:
    """Extract structured data from documents using LangExtract.

    Parameters
    ----------
    prompt_description : str
        Prompt description used by LangExtract.
    examples : Sequence[ExampleDataStructure]
        Example payloads supplied to LangExtract. At least one is required.
    model_id : str
        Model identifier to pass to LangExtract.
    max_workers : int, optional
        Maximum number of workers for concurrent extraction. Default is 1.

    Methods
    -------
    extract(input_text)
        Extract structured data from one or more documents.
    """

    def __init__(
        self,
        prompt_description: str,
        examples: typing.Sequence[ExampleDataStructure],
        model_id: str,
        max_workers: int = 1,
    ) -> None:
        """Initialize the extractor.

        Parameters
        ----------
        prompt_description : str
            Prompt description used by LangExtract.
        examples : Sequence[ExampleDataStructure]
            Example payloads supplied to LangExtract.
        model_id : str
            Model identifier to pass to LangExtract.
        max_workers : int, optional
            Maximum number of workers for concurrent extraction. Default is 1.

        Raises
        ------
        ValueError
            If ``examples`` is empty.
        """
        if not examples:
            raise ValueError(
                "Examples are required for reliable extraction. "
                "Provide at least one ExampleDataStructure instance."
            )
        self.model_id = model_id
        self.prompt = prompt_description
        self.examples = examples
        self.max_workers = max_workers

    def extract(
        self, input_text: DocumentStructure | list[DocumentStructure]
    ) -> list[AnnotatedDocumentStructure]:
        """Run the extraction.

        Parameters
        ----------
        input_text : DocumentStructure | list[DocumentStructure]
            Document or list of documents to extract data from.

        Returns
        -------
        list[AnnotatedDocumentStructure]
            Extracted items for the provided documents.
        """
        if isinstance(input_text, DocumentStructure):
            input_documents = [input_text]
        else:
            input_documents = input_text
        documents = DocumentStructure.to_dataclass_list(input_documents)
        examples = ExampleDataStructure.to_dataclass_list(self.examples)
        resolver_params = {"format_handler": _SanitizingFormatHandler()}
        result = lx.extract(
            text_or_documents=documents,
            prompt_description=self.prompt,
            examples=examples,
            model_id=self.model_id,  # Automatically selects OpenAI provider
            api_key=os.environ.get("OPENAI_API_KEY"),
            fence_output=True,
            use_schema_constraints=False,
            # BUG FIX: max_workers was stored by __init__ but never
            # forwarded, so "concurrent" extraction silently ran with
            # LangExtract's default worker count.
            max_workers=self.max_workers,
            resolver_params=resolver_params,
        )

        def _convert(data: typing.Any) -> AnnotatedDocumentStructure:
            # LangExtract may hand back its own dataclass or a plain mapping.
            if isinstance(data, LXAnnotatedDocument):
                return AnnotatedDocumentStructure.from_dataclass(data)
            return AnnotatedDocumentStructure.model_validate(data)

        if isinstance(result, list):
            return [_convert(doc) for doc in result]

        return [_convert(result)]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _sanitize_extraction_items(
    items: typing.Sequence[typing.Mapping[str, lx_format_handler.ExtractionValueType]],
    attribute_suffix: str,
) -> list[dict[str, lx_format_handler.ExtractionValueType]]:
    """Return copies of *items* with every value cleaned for the resolver.

    Each mapping is rebuilt field by field through
    ``_sanitize_extraction_value``; fields it rejects are omitted and all
    other fields carry the cleaned replacement value.
    """
    cleaned_items: list[dict[str, lx_format_handler.ExtractionValueType]] = []
    for raw_item in items:
        cleaned_item: dict[str, lx_format_handler.ExtractionValueType] = {}
        for field_name, field_value in raw_item.items():
            keep, replacement = _sanitize_extraction_value(
                field_name, field_value, attribute_suffix
            )
            if keep:
                cleaned_item[field_name] = replacement
        cleaned_items.append(cleaned_item)
    return cleaned_items
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _sanitize_extraction_value(
    key: str,
    value: lx_format_handler.ExtractionValueType,
    attribute_suffix: str,
) -> tuple[bool, lx_format_handler.ExtractionValueType]:
    """Decide whether to keep *value* and, if so, in what form.

    Returns a ``(keep, cleaned)`` pair. ``None`` values are always
    dropped. Attribute fields (keys ending in *attribute_suffix*) are
    kept only when they hold a dict. Plain scalars pass through
    unchanged; every other value is serialized to a JSON string.
    """
    if value is None:
        return False, None
    if key.endswith(attribute_suffix):
        # Attribute payloads must be mappings; anything else is discarded.
        return (True, value) if isinstance(value, dict) else (False, None)
    if isinstance(value, (str, int, float)):
        return True, value
    return True, json.dumps(value, ensure_ascii=False)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class _SanitizingFormatHandler(lx_format_handler.FormatHandler):
    """Sanitize LangExtract output before the resolver validates types."""

    def parse_output(
        self, text: str, *, strict: bool | None = None
    ) -> typing.Sequence[typing.Mapping[str, lx_format_handler.ExtractionValueType]]:
        """Parse *text* with the base handler, then clean every item."""
        raw_items = super().parse_output(text, strict=strict)
        return _sanitize_extraction_items(raw_items, self.attribute_suffix)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
# Public API of this module; ExtractionError is re-exported alongside the extractor.
__all__ = ["DocumentExtractor", "ExtractionError"]
|
|
@@ -0,0 +1,476 @@
|
|
|
1
|
+
"""Prompt optimization and configuration helpers for document extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Sequence
|
|
8
|
+
|
|
9
|
+
from ..agent.base import AgentBase
|
|
10
|
+
from ..agent.configuration import AgentConfiguration
|
|
11
|
+
from ..prompt import PromptRenderer
|
|
12
|
+
from ..response.configuration import ResponseConfiguration
|
|
13
|
+
from ..response.prompter import PROMPTER
|
|
14
|
+
from ..settings import OpenAISettings
|
|
15
|
+
from ..structure.extraction import DocumentExtractorConfig, ExampleDataStructure
|
|
16
|
+
from ..structure.prompt import PromptStructure
|
|
17
|
+
|
|
18
|
+
# Template file names, resolved by PromptRenderer against the package's
# prompt directory.
EXTRACTOR_CONFIG_TEMPLATE_NAME = "extractor_config_generator.jinja"
EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS_TEMPLATE = (
    "extractor_config_agent_instructions.jinja"
)
EXTRACTOR_CONFIG_GENERATOR_INSTRUCTIONS_TEMPLATE = (
    "extractor_config_generator_instructions.jinja"
)
EXTRACTOR_PROMPT_OPTIMIZER_INSTRUCTIONS_TEMPLATE = (
    "extractor_prompt_optimizer_agent_instructions.jinja"
)
EXTRACTOR_PROMPT_OPTIMIZER_REQUEST_TEMPLATE = "extractor_prompt_optimizer_request.jinja"
# Single shared renderer used by every formatting helper in this module.
PROMPT_RENDERER = PromptRenderer()

# Number of examples requested when the caller does not specify a count.
DEFAULT_EXAMPLE_COUNT = 3
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _render_prompt_template(
    template_name: str,
    context: dict[str, object] | None = None,
) -> str:
    """Render a prompt template from the prompt directory.

    Parameters
    ----------
    template_name : str
        Prompt template file name.
    context : dict[str, object] or None, default None
        Context variables for template rendering.

    Returns
    -------
    str
        Rendered prompt content.
    """
    render_context = {} if not context else context
    return PROMPT_RENDERER.render(template_name, context=render_context)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# Response configuration that asks the model to emit a DocumentExtractorConfig.
# The instructions template is rendered at import time, so a missing or broken
# template surfaces on import rather than at first use.
EXTRACTOR_CONFIG_GENERATOR = ResponseConfiguration(
    name="document_extractor_config_generator",
    instructions=_render_prompt_template(
        EXTRACTOR_CONFIG_GENERATOR_INSTRUCTIONS_TEMPLATE
    ),
    tools=None,
    input_structure=None,
    output_structure=DocumentExtractorConfig,
    add_output_instructions=True,
)

# Agent instructions for prompt optimization; the PromptStructure schema is
# embedded so the agent knows the exact output shape expected of it.
PROMPT_OPTIMIZER_AGENT_INSTRUCTIONS = _render_prompt_template(
    EXTRACTOR_PROMPT_OPTIMIZER_INSTRUCTIONS_TEMPLATE,
    context={"prompt_schema": PromptStructure.get_prompt()},
)

# Agent instructions for config generation; embeds the DocumentExtractorConfig
# schema for the same reason.
EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS = _render_prompt_template(
    EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS_TEMPLATE,
    context={"config_schema": DocumentExtractorConfig.get_prompt()},
)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _format_extractor_prompt_request(
    prompt: str,
    extraction_classes: Sequence[str],
    additional_context: str | None,
) -> str:
    """Format the prompt-optimization request payload.

    Parameters
    ----------
    prompt : str
        User-provided prompt content.
    extraction_classes : Sequence[str]
        Extraction classes to include.
    additional_context : str or None
        Optional extra context to include.

    Returns
    -------
    str
        Formatted prompt optimization request.
    """
    template_context = {
        "prompt": prompt,
        "extraction_classes": list(extraction_classes),
        "additional_context": additional_context,
    }
    return _render_prompt_template(
        EXTRACTOR_PROMPT_OPTIMIZER_REQUEST_TEMPLATE, context=template_context
    )
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _format_extractor_config_request(
    name: str,
    prompt_description: str,
    extraction_classes: Sequence[str],
    *,
    example_files: Sequence[str | Path] | None = None,
    example_count: int = DEFAULT_EXAMPLE_COUNT,
) -> str:
    """Format the extractor configuration request payload.

    Parameters
    ----------
    name : str
        Name for the extractor configuration.
    prompt_description : str
        Optimized prompt description to use.
    extraction_classes : Sequence[str]
        Extraction classes to include.
    example_files : Sequence[str or Path] or None, default None
        Optional file paths to ground the generated examples.
    example_count : int, default 3
        Number of examples to generate.

    Returns
    -------
    str
        Formatted configuration request.
    """
    # Route through _render_prompt_template for consistency with the other
    # request formatters in this module (behavior is identical to calling
    # PROMPT_RENDERER.render directly).
    return _render_prompt_template(
        EXTRACTOR_CONFIG_TEMPLATE_NAME,
        context={
            "name": name,
            "prompt_description": prompt_description,
            "extraction_classes": list(extraction_classes),
            "example_count": example_count,
            "example_files": _load_example_files(example_files),
            "examples_json": "- None provided. You must generate examples.",
            "example_requirements": [
                f"Generate {example_count} high-quality examples that align with the prompt.",
                "Ensure each example includes realistic source text and extractions.",
                "Cover every extraction class across the examples.",
            ],
        },
    )
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _format_extractor_config_request_with_examples(
    name: str,
    prompt_description: str,
    extraction_classes: Sequence[str],
    examples: Sequence[ExampleDataStructure],
) -> str:
    """Format the extractor configuration request payload with examples.

    Parameters
    ----------
    name : str
        Name for the extractor configuration.
    prompt_description : str
        Optimized prompt description to use.
    extraction_classes : Sequence[str]
        Extraction classes to include.
    examples : Sequence[ExampleDataStructure]
        Example payloads to include.

    Returns
    -------
    str
        Formatted configuration request.
    """
    serialized_examples = [example.to_json() for example in examples]
    examples_json = json.dumps(serialized_examples, indent=2)
    # Route through _render_prompt_template for consistency with the other
    # request formatters in this module (behavior is identical to calling
    # PROMPT_RENDERER.render directly).
    return _render_prompt_template(
        EXTRACTOR_CONFIG_TEMPLATE_NAME,
        context={
            "name": name,
            "prompt_description": prompt_description,
            "extraction_classes": list(extraction_classes),
            "example_count": DEFAULT_EXAMPLE_COUNT,
            "example_files": [],
            "examples_json": examples_json,
            "example_requirements": ["Use the provided examples exactly as written."],
        },
    )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _load_example_files(
    example_files: Sequence[str | Path] | None,
) -> list[dict[str, str]]:
    """Load optional example files for grounded extraction generation.

    Parameters
    ----------
    example_files : Sequence[str or Path] or None
        File paths to load for grounding examples.

    Returns
    -------
    list of dict[str, str]
        Loaded file metadata including path and content.

    Raises
    ------
    FileNotFoundError
        If any provided file does not exist.
    """
    if not example_files:
        return []
    # Read as UTF-8 explicitly so results do not depend on the host
    # locale's default encoding.
    return [
        {"path": str(path), "content": path.read_text(encoding="utf-8")}
        for path in map(Path, example_files)
    ]
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def optimize_extractor_prompt(
    openai_settings: OpenAISettings,
    prompt: str,
    extraction_classes: Sequence[str],
    *,
    additional_context: str | None = None,
) -> str:
    """Generate an optimized prompt description for extraction.

    Parameters
    ----------
    openai_settings : OpenAISettings
        Settings used to configure the OpenAI client.
    prompt : str
        User-supplied prompt content.
    extraction_classes : Sequence[str]
        Extraction classes to include in the optimized prompt.
    additional_context : str or None, default None
        Optional context that should influence prompt generation.

    Returns
    -------
    str
        Optimized prompt description.

    Raises
    ------
    TypeError
        If the prompter response does not return a prompt string.
    """
    request_text = _format_extractor_prompt_request(
        prompt, extraction_classes, additional_context
    )
    response = PROMPTER.gen_response(openai_settings=openai_settings)
    try:
        result = response.run_sync(request_text)
    finally:
        # Release response resources even when the run raises.
        response.close()

    if isinstance(result, PromptStructure):
        return result.prompt
    if isinstance(result, str):
        return result
    raise TypeError("Prompter response must return a PromptStructure or string.")
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def optimize_extractor_prompt_with_agent(
    openai_settings: OpenAISettings,
    prompt: str,
    extraction_classes: Sequence[str],
    *,
    additional_context: str | None = None,
) -> str:
    """Generate an optimized prompt description using AgentBase.

    Parameters
    ----------
    openai_settings : OpenAISettings
        Settings used to configure the agent model.
    prompt : str
        User-supplied prompt content.
    extraction_classes : Sequence[str]
        Extraction classes to include in the optimized prompt.
    additional_context : str or None, default None
        Optional context that should influence prompt generation.

    Returns
    -------
    str
        Optimized prompt description.

    Raises
    ------
    TypeError
        If the agent response does not return a prompt string.
    ValueError
        If no default model is configured.
    """
    if not openai_settings.default_model:
        raise ValueError("OpenAISettings.default_model is required for agent runs.")
    request_text = _format_extractor_prompt_request(
        prompt, extraction_classes, additional_context
    )
    optimizer = AgentBase(
        configuration=AgentConfiguration(
            name="extractor_prompt_optimizer",
            description="Optimize extraction prompt descriptions.",
            model=openai_settings.default_model,
            instructions=PROMPT_OPTIMIZER_AGENT_INSTRUCTIONS,
            output_structure=PromptStructure,
        )
    )
    result = optimizer.run_sync(request_text)

    if isinstance(result, PromptStructure):
        return result.prompt
    if isinstance(result, str):
        return result
    raise TypeError("Agent response must return a PromptStructure or string.")
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def generate_document_extractor_config(
    openai_settings: OpenAISettings,
    name: str,
    prompt: str,
    extraction_classes: Sequence[str],
    *,
    example_files: Sequence[str | Path] | None = None,
    example_count: int = DEFAULT_EXAMPLE_COUNT,
    additional_context: str | None = None,
) -> DocumentExtractorConfig:
    """Generate a DocumentExtractorConfig using response-based helpers.

    Parameters
    ----------
    openai_settings : OpenAISettings
        Settings used to configure the OpenAI client.
    name : str
        Name for the extractor configuration.
    prompt : str
        User-supplied prompt content.
    extraction_classes : Sequence[str]
        Extraction classes to include in the configuration.
    example_files : Sequence[str or Path] or None, default None
        Optional file paths used to ground the generated examples.
    example_count : int, default 3
        Number of examples to generate.
    additional_context : str or None, default None
        Optional context that should influence prompt generation.

    Returns
    -------
    DocumentExtractorConfig
        Generated extractor configuration.

    Raises
    ------
    TypeError
        If the generator response does not return a DocumentExtractorConfig.
    """
    optimized_prompt = optimize_extractor_prompt(
        openai_settings,
        prompt,
        extraction_classes,
        additional_context=additional_context,
    )
    request_text = _format_extractor_config_request(
        name,
        optimized_prompt,
        extraction_classes,
        example_files=example_files,
        example_count=example_count,
    )
    response = EXTRACTOR_CONFIG_GENERATOR.gen_response(openai_settings=openai_settings)
    try:
        generated = response.run_sync(request_text)
    finally:
        # Release response resources even when the run raises.
        response.close()

    if isinstance(generated, DocumentExtractorConfig):
        return generated
    if isinstance(generated, dict):
        return DocumentExtractorConfig.model_validate(generated)
    raise TypeError(
        "Extractor config generator must return a DocumentExtractorConfig or dict."
    )
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def generate_document_extractor_config_with_agent(
    openai_settings: OpenAISettings,
    name: str,
    prompt: str,
    extraction_classes: Sequence[str],
    examples: Sequence[ExampleDataStructure],
    *,
    additional_context: str | None = None,
) -> DocumentExtractorConfig:
    """Generate a DocumentExtractorConfig using AgentBase workflows.

    Parameters
    ----------
    openai_settings : OpenAISettings
        Settings used to configure the agent model.
    name : str
        Name for the extractor configuration.
    prompt : str
        User-supplied prompt content.
    extraction_classes : Sequence[str]
        Extraction classes to include in the configuration.
    examples : Sequence[ExampleDataStructure]
        Example payloads supplied to LangExtract.
    additional_context : str or None, default None
        Optional context that should influence prompt generation.

    Returns
    -------
    DocumentExtractorConfig
        Generated extractor configuration.

    Raises
    ------
    TypeError
        If the agent response does not return a DocumentExtractorConfig.
    ValueError
        If no examples are provided.
    """
    # Guard clauses: both preconditions must hold before any model call.
    if not examples:
        raise ValueError("At least one ExampleDataStructure instance is required.")
    if not openai_settings.default_model:
        raise ValueError("OpenAISettings.default_model is required for agent runs.")
    optimized_prompt = optimize_extractor_prompt_with_agent(
        openai_settings,
        prompt,
        extraction_classes,
        additional_context=additional_context,
    )
    request_text = _format_extractor_config_request_with_examples(
        name,
        optimized_prompt,
        extraction_classes,
        examples,
    )
    generator_agent = AgentBase(
        configuration=AgentConfiguration(
            name="extractor_config_generator",
            description="Generate DocumentExtractorConfig instances.",
            model=openai_settings.default_model,
            instructions=EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS,
            output_structure=DocumentExtractorConfig,
        )
    )
    result = generator_agent.run_sync(request_text)

    if isinstance(result, DocumentExtractorConfig):
        return result
    if isinstance(result, dict):
        return DocumentExtractorConfig.model_validate(result)
    raise TypeError(
        "Agent config generator must return a DocumentExtractorConfig or dict."
    )
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
# Public API of this module.
__all__ = [
    "EXTRACTOR_CONFIG_GENERATOR",
    "EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS",
    "PROMPT_OPTIMIZER_AGENT_INSTRUCTIONS",
    "generate_document_extractor_config",
    "generate_document_extractor_config_with_agent",
    "optimize_extractor_prompt",
    "optimize_extractor_prompt_with_agent",
]
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
Generate a DocumentExtractorConfig using the provided details. Follow the example
|
|
2
|
+
approach: examples should be high-quality and match the prompt. Set the configuration
|
|
3
|
+
name exactly as provided. Preserve the prompt description, extraction classes, and
|
|
4
|
+
examples. Include meaningful attributes when applicable.
|
|
5
|
+
|
|
6
|
+
{{ config_schema }}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
Build a DocumentExtractorConfig using the details below:
|
|
2
|
+
Name: {{ name }}
|
|
3
|
+
Prompt description: {{ prompt_description }}
|
|
4
|
+
Extraction classes:
|
|
5
|
+
{% for item in extraction_classes -%}
|
|
6
|
+
- {{ item }}
|
|
7
|
+
{% else -%}
|
|
8
|
+
- None provided
|
|
9
|
+
{% endfor %}
|
|
10
|
+
|
|
11
|
+
Example requirements:
|
|
12
|
+
{% for requirement in example_requirements -%}
|
|
13
|
+
- {{ requirement }}
|
|
14
|
+
{% endfor %}
|
|
15
|
+
|
|
16
|
+
Attributes guidance:
|
|
17
|
+
- Every extraction must include an "attributes" object.
|
|
18
|
+
- Use attributes to capture meaningful structured details (e.g., confidence, type, qualifiers).
|
|
19
|
+
- If no attributes apply, provide an empty object {}.
|
|
20
|
+
|
|
21
|
+
Examples (JSON):
|
|
22
|
+
{{ examples_json }}
|
|
23
|
+
|
|
24
|
+
Source files for grounding:
|
|
25
|
+
{% if example_files -%}
|
|
26
|
+
{% for file in example_files -%}
|
|
27
|
+
- Path: {{ file.path }}
|
|
28
|
+
Content:
|
|
29
|
+
{{ file.content }}
|
|
30
|
+
{% endfor %}
|
|
31
|
+
{% else -%}
|
|
32
|
+
- None provided.
|
|
33
|
+
{% endif %}
|
|
34
|
+
|
|
35
|
+
Grounding requirements:
|
|
36
|
+
- Use source file content to craft example text when available.
|
|
37
|
+
- Prefer quoting or lightly paraphrasing source text over inventing details.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
Generate a DocumentExtractorConfig using the provided inputs.
|
|
2
|
+
Requirements:
|
|
3
|
+
- Generate high-quality examples that match the prompt and extraction classes.
|
|
4
|
+
- Ensure examples include realistic source text and cover all extraction classes.
|
|
5
|
+
- Include meaningful attributes on each extraction when applicable.
|
|
6
|
+
- If source files are provided, ground example text in that content.
|
|
7
|
+
- Set the configuration name exactly as provided.
|
|
8
|
+
- Preserve the provided prompt description and extraction classes.
|
|
9
|
+
- Do not add or remove extraction classes.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Optimize the extraction prompt using the details below:
|
|
2
|
+
User prompt: {{ prompt }}
|
|
3
|
+
Extraction classes:
|
|
4
|
+
{% for item in extraction_classes -%}
|
|
5
|
+
- {{ item }}
|
|
6
|
+
{% else -%}
|
|
7
|
+
- None provided
|
|
8
|
+
{% endfor %}
|
|
9
|
+
{% if additional_context -%}
|
|
10
|
+
Additional context: {{ additional_context }}
|
|
11
|
+
{% endif %}
|