openai-sdk-helpers 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openai_sdk_helpers/__init__.py +45 -41
- openai_sdk_helpers/agent/__init__.py +4 -6
- openai_sdk_helpers/agent/base.py +110 -191
- openai_sdk_helpers/agent/{config.py → configuration.py} +24 -32
- openai_sdk_helpers/agent/{coordination.py → coordinator.py} +22 -23
- openai_sdk_helpers/agent/runner.py +3 -45
- openai_sdk_helpers/agent/search/base.py +54 -76
- openai_sdk_helpers/agent/search/vector.py +92 -108
- openai_sdk_helpers/agent/search/web.py +104 -82
- openai_sdk_helpers/agent/summarizer.py +22 -28
- openai_sdk_helpers/agent/translator.py +22 -24
- openai_sdk_helpers/agent/{validation.py → validator.py} +19 -23
- openai_sdk_helpers/cli.py +8 -22
- openai_sdk_helpers/environment.py +8 -13
- openai_sdk_helpers/errors.py +9 -0
- openai_sdk_helpers/extract/__init__.py +23 -0
- openai_sdk_helpers/extract/extractor.py +157 -0
- openai_sdk_helpers/extract/generator.py +476 -0
- openai_sdk_helpers/prompt/extractor_config_agent_instructions.jinja +6 -0
- openai_sdk_helpers/prompt/extractor_config_generator.jinja +37 -0
- openai_sdk_helpers/prompt/extractor_config_generator_instructions.jinja +9 -0
- openai_sdk_helpers/prompt/extractor_prompt_optimizer_agent_instructions.jinja +4 -0
- openai_sdk_helpers/prompt/extractor_prompt_optimizer_request.jinja +11 -0
- openai_sdk_helpers/prompt/vector_planner.jinja +7 -0
- openai_sdk_helpers/prompt/vector_search.jinja +6 -0
- openai_sdk_helpers/prompt/vector_writer.jinja +7 -0
- openai_sdk_helpers/response/__init__.py +3 -7
- openai_sdk_helpers/response/base.py +89 -98
- openai_sdk_helpers/response/{config.py → configuration.py} +45 -20
- openai_sdk_helpers/response/files.py +2 -0
- openai_sdk_helpers/response/planner.py +1 -1
- openai_sdk_helpers/response/prompter.py +1 -1
- openai_sdk_helpers/response/runner.py +1 -48
- openai_sdk_helpers/response/tool_call.py +0 -141
- openai_sdk_helpers/response/vector_store.py +8 -5
- openai_sdk_helpers/streamlit_app/__init__.py +1 -1
- openai_sdk_helpers/streamlit_app/app.py +17 -18
- openai_sdk_helpers/streamlit_app/{config.py → configuration.py} +13 -13
- openai_sdk_helpers/structure/__init__.py +16 -0
- openai_sdk_helpers/structure/base.py +239 -278
- openai_sdk_helpers/structure/extraction.py +1228 -0
- openai_sdk_helpers/structure/plan/plan.py +0 -20
- openai_sdk_helpers/structure/plan/task.py +0 -33
- openai_sdk_helpers/structure/prompt.py +16 -0
- openai_sdk_helpers/structure/responses.py +2 -2
- openai_sdk_helpers/structure/web_search.py +0 -10
- openai_sdk_helpers/tools.py +346 -99
- openai_sdk_helpers/types.py +3 -3
- openai_sdk_helpers/utils/__init__.py +9 -6
- openai_sdk_helpers/utils/json/base_model.py +316 -33
- openai_sdk_helpers/utils/json/data_class.py +1 -1
- openai_sdk_helpers/utils/langextract.py +194 -0
- openai_sdk_helpers/utils/registry.py +19 -15
- openai_sdk_helpers/vector_storage/storage.py +1 -1
- {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/METADATA +25 -11
- openai_sdk_helpers-0.5.0.dist-info/RECORD +95 -0
- openai_sdk_helpers/agent/prompt_utils.py +0 -15
- openai_sdk_helpers/context_manager.py +0 -241
- openai_sdk_helpers/deprecation.py +0 -167
- openai_sdk_helpers/retry.py +0 -175
- openai_sdk_helpers/streamlit_app/streamlit_web_search.py +0 -75
- openai_sdk_helpers/utils/deprecation.py +0 -167
- openai_sdk_helpers-0.4.2.dist-info/RECORD +0 -88
- /openai_sdk_helpers/{logging_config.py → logging.py} +0 -0
- /openai_sdk_helpers/{config.py → settings.py} +0 -0
- {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/WHEEL +0 -0
- {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/entry_points.txt +0 -0
- {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,476 @@
|
|
|
1
|
+
"""Prompt optimization and configuration helpers for document extraction."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Sequence
|
|
8
|
+
|
|
9
|
+
from ..agent.base import AgentBase
|
|
10
|
+
from ..agent.configuration import AgentConfiguration
|
|
11
|
+
from ..prompt import PromptRenderer
|
|
12
|
+
from ..response.configuration import ResponseConfiguration
|
|
13
|
+
from ..response.prompter import PROMPTER
|
|
14
|
+
from ..settings import OpenAISettings
|
|
15
|
+
from ..structure.extraction import DocumentExtractorConfig, ExampleDataStructure
|
|
16
|
+
from ..structure.prompt import PromptStructure
|
|
17
|
+
|
|
18
|
+
# File names of the Jinja prompt templates used by this module.

# Request template for building a DocumentExtractorConfig.
EXTRACTOR_CONFIG_TEMPLATE_NAME = "extractor_config_generator.jinja"

# Instruction templates for the agent-based and response-based config generators.
EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS_TEMPLATE = (
    "extractor_config_agent_instructions.jinja"
)
EXTRACTOR_CONFIG_GENERATOR_INSTRUCTIONS_TEMPLATE = (
    "extractor_config_generator_instructions.jinja"
)

# Instruction and request templates for the prompt optimizer.
EXTRACTOR_PROMPT_OPTIMIZER_INSTRUCTIONS_TEMPLATE = (
    "extractor_prompt_optimizer_agent_instructions.jinja"
)
EXTRACTOR_PROMPT_OPTIMIZER_REQUEST_TEMPLATE = "extractor_prompt_optimizer_request.jinja"
|
|
29
|
+
# Module-level renderer shared by every template helper in this module.
PROMPT_RENDERER = PromptRenderer()

# Default for the ``example_count`` parameters; also the count reported in the
# template context when the caller supplies the examples.
DEFAULT_EXAMPLE_COUNT = 3
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _render_prompt_template(
    template_name: str,
    context: dict[str, object] | None = None,
) -> str:
    """Render a named prompt template with the shared renderer.

    Parameters
    ----------
    template_name : str
        File name of the prompt template to render.
    context : dict[str, object] or None, default None
        Variables made available to the template. A falsy value (``None``
        or an empty mapping) is replaced by a new empty dict.

    Returns
    -------
    str
        Output of ``PROMPT_RENDERER.render`` for the template.
    """
    render_context = context if context else {}
    return PROMPT_RENDERER.render(template_name, context=render_context)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# The three objects below are built when the module is imported, so each
# instruction template is rendered once at import time.

# Response configuration whose structured output is a DocumentExtractorConfig.
EXTRACTOR_CONFIG_GENERATOR = ResponseConfiguration(
    name="document_extractor_config_generator",
    instructions=_render_prompt_template(
        EXTRACTOR_CONFIG_GENERATOR_INSTRUCTIONS_TEMPLATE,
    ),
    tools=None,
    input_structure=None,
    output_structure=DocumentExtractorConfig,
    add_output_instructions=True,
)

# Agent instructions for prompt optimization, with the PromptStructure prompt
# text injected as ``prompt_schema``.
PROMPT_OPTIMIZER_AGENT_INSTRUCTIONS = _render_prompt_template(
    EXTRACTOR_PROMPT_OPTIMIZER_INSTRUCTIONS_TEMPLATE,
    {"prompt_schema": PromptStructure.get_prompt()},
)

# Agent instructions for config generation, with the DocumentExtractorConfig
# prompt text injected as ``config_schema``.
EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS = _render_prompt_template(
    EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS_TEMPLATE,
    {"config_schema": DocumentExtractorConfig.get_prompt()},
)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def _format_extractor_prompt_request(
    prompt: str,
    extraction_classes: Sequence[str],
    additional_context: str | None,
) -> str:
    """Render the request text sent to the prompt optimizer.

    Parameters
    ----------
    prompt : str
        Prompt content supplied by the user.
    extraction_classes : Sequence[str]
        Extraction classes to list in the request; copied into a list
        before rendering.
    additional_context : str or None
        Extra context passed through to the template unchanged.

    Returns
    -------
    str
        Rendered prompt-optimization request.
    """
    template_context: dict[str, object] = {
        "prompt": prompt,
        "extraction_classes": list(extraction_classes),
        "additional_context": additional_context,
    }
    return _render_prompt_template(
        EXTRACTOR_PROMPT_OPTIMIZER_REQUEST_TEMPLATE,
        context=template_context,
    )
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _format_extractor_config_request(
    name: str,
    prompt_description: str,
    extraction_classes: Sequence[str],
    *,
    example_files: Sequence[str | Path] | None = None,
    example_count: int = DEFAULT_EXAMPLE_COUNT,
) -> str:
    """Render the config request for the case where examples must be generated.

    Parameters
    ----------
    name : str
        Name for the extractor configuration.
    prompt_description : str
        Optimized prompt description to embed in the request.
    extraction_classes : Sequence[str]
        Extraction classes to list in the request.
    example_files : Sequence[str or Path] or None, default None
        Optional file paths whose contents are loaded and passed to the
        template for grounding.
    example_count : int, default 3
        Number of examples the request asks for.

    Returns
    -------
    str
        Rendered configuration request.
    """
    class_names = list(extraction_classes)
    grounding_files = _load_example_files(example_files)
    requirements = [
        f"Generate {example_count} high-quality examples that align with the prompt.",
        "Ensure each example includes realistic source text and extractions.",
        "Cover every extraction class across the examples.",
    ]
    template_context = {
        "name": name,
        "prompt_description": prompt_description,
        "extraction_classes": class_names,
        "example_count": example_count,
        "example_files": grounding_files,
        # No caller-supplied examples on this path; the placeholder tells the
        # model to produce them itself.
        "examples_json": "- None provided. You must generate examples.",
        "example_requirements": requirements,
    }
    return PROMPT_RENDERER.render(
        EXTRACTOR_CONFIG_TEMPLATE_NAME,
        context=template_context,
    )
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
def _format_extractor_config_request_with_examples(
    name: str,
    prompt_description: str,
    extraction_classes: Sequence[str],
    examples: Sequence[ExampleDataStructure],
) -> str:
    """Render the config request for the case where the caller supplies examples.

    Parameters
    ----------
    name : str
        Name for the extractor configuration.
    prompt_description : str
        Optimized prompt description to embed in the request.
    extraction_classes : Sequence[str]
        Extraction classes to list in the request.
    examples : Sequence[ExampleDataStructure]
        Examples to embed; each is converted with ``to_json()`` and the
        resulting list is dumped as indented JSON.

    Returns
    -------
    str
        Rendered configuration request.
    """
    examples_json = json.dumps(
        [example.to_json() for example in examples],
        indent=2,
    )
    template_context = {
        "name": name,
        "prompt_description": prompt_description,
        "extraction_classes": list(extraction_classes),
        "example_count": DEFAULT_EXAMPLE_COUNT,
        # Grounding files are not used on this path.
        "example_files": [],
        "examples_json": examples_json,
        "example_requirements": ["Use the provided examples exactly as written."],
    }
    return PROMPT_RENDERER.render(
        EXTRACTOR_CONFIG_TEMPLATE_NAME,
        context=template_context,
    )
|
|
192
|
+
|
|
193
|
+
|
|
194
|
+
def _load_example_files(
|
|
195
|
+
example_files: Sequence[str | Path] | None,
|
|
196
|
+
) -> list[dict[str, str]]:
|
|
197
|
+
"""Load optional example files for grounded extraction generation.
|
|
198
|
+
|
|
199
|
+
Parameters
|
|
200
|
+
----------
|
|
201
|
+
example_files : Sequence[str or Path] or None
|
|
202
|
+
File paths to load for grounding examples.
|
|
203
|
+
|
|
204
|
+
Returns
|
|
205
|
+
-------
|
|
206
|
+
list of dict[str, str]
|
|
207
|
+
Loaded file metadata including path and content.
|
|
208
|
+
|
|
209
|
+
Raises
|
|
210
|
+
------
|
|
211
|
+
FileNotFoundError
|
|
212
|
+
If any provided file does not exist.
|
|
213
|
+
"""
|
|
214
|
+
if not example_files:
|
|
215
|
+
return []
|
|
216
|
+
loaded_files: list[dict[str, str]] = []
|
|
217
|
+
for file_path in example_files:
|
|
218
|
+
path = Path(file_path)
|
|
219
|
+
content = path.read_text()
|
|
220
|
+
loaded_files.append({"path": str(path), "content": content})
|
|
221
|
+
return loaded_files
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def optimize_extractor_prompt(
    openai_settings: OpenAISettings,
    prompt: str,
    extraction_classes: Sequence[str],
    *,
    additional_context: str | None = None,
) -> str:
    """Produce an optimized extraction prompt via the ``PROMPTER`` response.

    Parameters
    ----------
    openai_settings : OpenAISettings
        Settings passed to ``PROMPTER.gen_response``.
    prompt : str
        Prompt content supplied by the user.
    extraction_classes : Sequence[str]
        Extraction classes to include in the optimized prompt.
    additional_context : str or None, default None
        Optional context included in the optimization request.

    Returns
    -------
    str
        The ``prompt`` field of a returned ``PromptStructure``, or the
        result itself when it is already a string.

    Raises
    ------
    TypeError
        If the response result is neither a ``PromptStructure`` nor a string.
    """
    request_text = _format_extractor_prompt_request(
        prompt,
        extraction_classes,
        additional_context,
    )
    response = PROMPTER.gen_response(openai_settings=openai_settings)
    try:
        result = response.run_sync(request_text)
    finally:
        # Close the response whether or not the run succeeded.
        response.close()

    if not isinstance(result, (PromptStructure, str)):
        raise TypeError("Prompter response must return a PromptStructure or string.")
    return result.prompt if isinstance(result, PromptStructure) else result
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def optimize_extractor_prompt_with_agent(
    openai_settings: OpenAISettings,
    prompt: str,
    extraction_classes: Sequence[str],
    *,
    additional_context: str | None = None,
) -> str:
    """Produce an optimized extraction prompt by running an ``AgentBase``.

    Parameters
    ----------
    openai_settings : OpenAISettings
        Settings whose ``default_model`` selects the agent model.
    prompt : str
        Prompt content supplied by the user.
    extraction_classes : Sequence[str]
        Extraction classes to include in the optimized prompt.
    additional_context : str or None, default None
        Optional context included in the optimization request.

    Returns
    -------
    str
        The ``prompt`` field of a returned ``PromptStructure``, or the
        result itself when it is already a string.

    Raises
    ------
    ValueError
        If ``openai_settings.default_model`` is falsy.
    TypeError
        If the agent result is neither a ``PromptStructure`` nor a string.
    """
    model = openai_settings.default_model
    if not model:
        raise ValueError("OpenAISettings.default_model is required for agent runs.")

    request_text = _format_extractor_prompt_request(
        prompt,
        extraction_classes,
        additional_context,
    )
    agent = AgentBase(
        configuration=AgentConfiguration(
            name="extractor_prompt_optimizer",
            description="Optimize extraction prompt descriptions.",
            model=model,
            instructions=PROMPT_OPTIMIZER_AGENT_INSTRUCTIONS,
            output_structure=PromptStructure,
        )
    )
    result = agent.run_sync(request_text)

    if not isinstance(result, (PromptStructure, str)):
        raise TypeError("Agent response must return a PromptStructure or string.")
    return result.prompt if isinstance(result, PromptStructure) else result
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def generate_document_extractor_config(
    openai_settings: OpenAISettings,
    name: str,
    prompt: str,
    extraction_classes: Sequence[str],
    *,
    example_files: Sequence[str | Path] | None = None,
    example_count: int = DEFAULT_EXAMPLE_COUNT,
    additional_context: str | None = None,
) -> DocumentExtractorConfig:
    """Build a DocumentExtractorConfig through the response-based workflow.

    The prompt is first optimized with ``optimize_extractor_prompt``; the
    optimized description is then sent to ``EXTRACTOR_CONFIG_GENERATOR``.

    Parameters
    ----------
    openai_settings : OpenAISettings
        Settings passed to both response runs.
    name : str
        Name for the extractor configuration.
    prompt : str
        Prompt content supplied by the user.
    extraction_classes : Sequence[str]
        Extraction classes to include in the configuration.
    example_files : Sequence[str or Path] or None, default None
        Optional file paths whose contents ground the generated examples.
    example_count : int, default 3
        Number of examples to request.
    additional_context : str or None, default None
        Optional context included in the prompt-optimization request.

    Returns
    -------
    DocumentExtractorConfig
        The generated configuration; a ``dict`` result is validated into one.

    Raises
    ------
    TypeError
        If the generator result is neither a ``DocumentExtractorConfig``
        nor a ``dict``.
    """
    optimized_description = optimize_extractor_prompt(
        openai_settings,
        prompt,
        extraction_classes,
        additional_context=additional_context,
    )
    request_text = _format_extractor_config_request(
        name,
        optimized_description,
        extraction_classes,
        example_files=example_files,
        example_count=example_count,
    )
    response = EXTRACTOR_CONFIG_GENERATOR.gen_response(openai_settings=openai_settings)
    try:
        result = response.run_sync(request_text)
    finally:
        # Close the response whether or not the run succeeded.
        response.close()

    if isinstance(result, DocumentExtractorConfig):
        return result
    if not isinstance(result, dict):
        raise TypeError(
            "Extractor config generator must return a DocumentExtractorConfig or dict."
        )
    return DocumentExtractorConfig.model_validate(result)
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
def generate_document_extractor_config_with_agent(
    openai_settings: OpenAISettings,
    name: str,
    prompt: str,
    extraction_classes: Sequence[str],
    examples: Sequence[ExampleDataStructure],
    *,
    additional_context: str | None = None,
) -> DocumentExtractorConfig:
    """Build a DocumentExtractorConfig through the agent-based workflow.

    The prompt is first optimized with ``optimize_extractor_prompt_with_agent``;
    the optimized description and the supplied examples are then sent to an
    ``AgentBase`` configured to output a ``DocumentExtractorConfig``.

    Parameters
    ----------
    openai_settings : OpenAISettings
        Settings whose ``default_model`` selects the agent model.
    name : str
        Name for the extractor configuration.
    prompt : str
        Prompt content supplied by the user.
    extraction_classes : Sequence[str]
        Extraction classes to include in the configuration.
    examples : Sequence[ExampleDataStructure]
        Examples embedded in the request; at least one is required.
    additional_context : str or None, default None
        Optional context included in the prompt-optimization request.

    Returns
    -------
    DocumentExtractorConfig
        The generated configuration; a ``dict`` result is validated into one.

    Raises
    ------
    ValueError
        If ``examples`` is empty, or if ``openai_settings.default_model``
        is falsy.
    TypeError
        If the agent result is neither a ``DocumentExtractorConfig`` nor
        a ``dict``.
    """
    if not examples:
        raise ValueError("At least one ExampleDataStructure instance is required.")
    model = openai_settings.default_model
    if not model:
        raise ValueError("OpenAISettings.default_model is required for agent runs.")

    optimized_description = optimize_extractor_prompt_with_agent(
        openai_settings,
        prompt,
        extraction_classes,
        additional_context=additional_context,
    )
    request_text = _format_extractor_config_request_with_examples(
        name,
        optimized_description,
        extraction_classes,
        examples,
    )
    agent = AgentBase(
        configuration=AgentConfiguration(
            name="extractor_config_generator",
            description="Generate DocumentExtractorConfig instances.",
            model=model,
            instructions=EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS,
            output_structure=DocumentExtractorConfig,
        )
    )
    result = agent.run_sync(request_text)

    if isinstance(result, DocumentExtractorConfig):
        return result
    if not isinstance(result, dict):
        raise TypeError(
            "Agent config generator must return a DocumentExtractorConfig or dict."
        )
    return DocumentExtractorConfig.model_validate(result)
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
# Public API of this module.
__all__ = [
    # Pre-built configuration and rendered instruction text.
    "EXTRACTOR_CONFIG_GENERATOR",
    "EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS",
    "PROMPT_OPTIMIZER_AGENT_INSTRUCTIONS",
    # Entry points.
    "generate_document_extractor_config",
    "generate_document_extractor_config_with_agent",
    "optimize_extractor_prompt",
    "optimize_extractor_prompt_with_agent",
]
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
Generate a DocumentExtractorConfig using the provided details. Follow the example
|
|
2
|
+
approach: examples should be high-quality and match the prompt. Set the configuration
|
|
3
|
+
name exactly as provided. Preserve the prompt description, extraction classes, and
|
|
4
|
+
examples. Include meaningful attributes when applicable.
|
|
5
|
+
|
|
6
|
+
{{ config_schema }}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
Build a DocumentExtractorConfig using the details below:
|
|
2
|
+
Name: {{ name }}
|
|
3
|
+
Prompt description: {{ prompt_description }}
|
|
4
|
+
Extraction classes:
|
|
5
|
+
{% for item in extraction_classes -%}
|
|
6
|
+
- {{ item }}
|
|
7
|
+
{% else -%}
|
|
8
|
+
- None provided
|
|
9
|
+
{% endfor %}
|
|
10
|
+
|
|
11
|
+
Example requirements:
|
|
12
|
+
{% for requirement in example_requirements -%}
|
|
13
|
+
- {{ requirement }}
|
|
14
|
+
{% endfor %}
|
|
15
|
+
|
|
16
|
+
Attributes guidance:
|
|
17
|
+
- Every extraction must include an "attributes" object.
|
|
18
|
+
- Use attributes to capture meaningful structured details (e.g., confidence, type, qualifiers).
|
|
19
|
+
- If no attributes apply, provide an empty object {}.
|
|
20
|
+
|
|
21
|
+
Examples (JSON):
|
|
22
|
+
{{ examples_json }}
|
|
23
|
+
|
|
24
|
+
Source files for grounding:
|
|
25
|
+
{% if example_files -%}
|
|
26
|
+
{% for file in example_files -%}
|
|
27
|
+
- Path: {{ file.path }}
|
|
28
|
+
Content:
|
|
29
|
+
{{ file.content }}
|
|
30
|
+
{% endfor %}
|
|
31
|
+
{% else -%}
|
|
32
|
+
- None provided.
|
|
33
|
+
{% endif %}
|
|
34
|
+
|
|
35
|
+
Grounding requirements:
|
|
36
|
+
- Use source file content to craft example text when available.
|
|
37
|
+
- Prefer quoting or lightly paraphrasing source text over inventing details.
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
Generate a DocumentExtractorConfig using the provided inputs.
|
|
2
|
+
Requirements:
|
|
3
|
+
- Generate high-quality examples that match the prompt and extraction classes.
|
|
4
|
+
- Ensure examples include realistic source text and cover all extraction classes.
|
|
5
|
+
- Include meaningful attributes on each extraction when applicable.
|
|
6
|
+
- If source files are provided, ground example text in that content.
|
|
7
|
+
- Set the configuration name exactly as provided.
|
|
8
|
+
- Preserve the provided prompt description and extraction classes.
|
|
9
|
+
- Do not add or remove extraction classes.
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
Optimize the extraction prompt using the details below:
|
|
2
|
+
User prompt: {{ prompt }}
|
|
3
|
+
Extraction classes:
|
|
4
|
+
{% for item in extraction_classes -%}
|
|
5
|
+
- {{ item }}
|
|
6
|
+
{% else -%}
|
|
7
|
+
- None provided
|
|
8
|
+
{% endfor %}
|
|
9
|
+
{% if additional_context -%}
|
|
10
|
+
Additional context: {{ additional_context }}
|
|
11
|
+
{% endif %}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
You are a vector search planner.
|
|
2
|
+
|
|
3
|
+
Instructions:
|
|
4
|
+
- Break the user query into 1-5 focused vector search queries.
|
|
5
|
+
- Prefer short, keyword-rich queries optimized for semantic retrieval.
|
|
6
|
+
- Avoid web-search phrasing like "site:" or "latest news."
|
|
7
|
+
- Provide a clear reason for each query.
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
You are a vector search report writer.
|
|
2
|
+
|
|
3
|
+
Instructions:
|
|
4
|
+
- Use only the provided vector search results.
|
|
5
|
+
- Summarize findings without referencing the public web.
|
|
6
|
+
- If sources are requested, describe them as vector store entries (e.g., file names or "vector store chunk").
|
|
7
|
+
- Keep the report focused on information grounded in the retrieved texts.
|
|
@@ -24,8 +24,6 @@ run_sync
|
|
|
24
24
|
Execute a response workflow synchronously with resource cleanup.
|
|
25
25
|
run_async
|
|
26
26
|
Execute a response workflow asynchronously with resource cleanup.
|
|
27
|
-
run_streamed
|
|
28
|
-
Execute a response workflow and return the asynchronous result.
|
|
29
27
|
attach_vector_store
|
|
30
28
|
Attach vector stores to a response's file_search tool.
|
|
31
29
|
process_files
|
|
@@ -35,11 +33,11 @@ process_files
|
|
|
35
33
|
from __future__ import annotations
|
|
36
34
|
|
|
37
35
|
from .base import ResponseBase
|
|
38
|
-
from .
|
|
36
|
+
from .configuration import ResponseConfiguration, ResponseRegistry, get_default_registry
|
|
39
37
|
from .files import process_files
|
|
40
38
|
from .messages import ResponseMessage, ResponseMessages
|
|
41
|
-
from .runner import run_async,
|
|
42
|
-
from .tool_call import ResponseToolCall
|
|
39
|
+
from .runner import run_async, run_sync
|
|
40
|
+
from .tool_call import ResponseToolCall
|
|
43
41
|
from .vector_store import attach_vector_store
|
|
44
42
|
|
|
45
43
|
__all__ = [
|
|
@@ -51,9 +49,7 @@ __all__ = [
|
|
|
51
49
|
"ResponseMessages",
|
|
52
50
|
"run_sync",
|
|
53
51
|
"run_async",
|
|
54
|
-
"run_streamed",
|
|
55
52
|
"ResponseToolCall",
|
|
56
|
-
"parse_tool_arguments",
|
|
57
53
|
"attach_vector_store",
|
|
58
54
|
"process_files",
|
|
59
55
|
]
|