openai-sdk-helpers 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- openai_sdk_helpers/__init__.py +45 -41
- openai_sdk_helpers/agent/__init__.py +4 -6
- openai_sdk_helpers/agent/base.py +110 -191
- openai_sdk_helpers/agent/{config.py → configuration.py} +24 -32
- openai_sdk_helpers/agent/{coordination.py → coordinator.py} +22 -23
- openai_sdk_helpers/agent/runner.py +3 -45
- openai_sdk_helpers/agent/search/base.py +54 -76
- openai_sdk_helpers/agent/search/vector.py +92 -108
- openai_sdk_helpers/agent/search/web.py +104 -82
- openai_sdk_helpers/agent/summarizer.py +22 -28
- openai_sdk_helpers/agent/translator.py +22 -24
- openai_sdk_helpers/agent/{validation.py → validator.py} +19 -23
- openai_sdk_helpers/cli.py +8 -22
- openai_sdk_helpers/environment.py +8 -13
- openai_sdk_helpers/errors.py +9 -0
- openai_sdk_helpers/extract/__init__.py +23 -0
- openai_sdk_helpers/extract/extractor.py +157 -0
- openai_sdk_helpers/extract/generator.py +476 -0
- openai_sdk_helpers/prompt/extractor_config_agent_instructions.jinja +6 -0
- openai_sdk_helpers/prompt/extractor_config_generator.jinja +37 -0
- openai_sdk_helpers/prompt/extractor_config_generator_instructions.jinja +9 -0
- openai_sdk_helpers/prompt/extractor_prompt_optimizer_agent_instructions.jinja +4 -0
- openai_sdk_helpers/prompt/extractor_prompt_optimizer_request.jinja +11 -0
- openai_sdk_helpers/prompt/vector_planner.jinja +7 -0
- openai_sdk_helpers/prompt/vector_search.jinja +6 -0
- openai_sdk_helpers/prompt/vector_writer.jinja +7 -0
- openai_sdk_helpers/response/__init__.py +3 -7
- openai_sdk_helpers/response/base.py +89 -98
- openai_sdk_helpers/response/{config.py → configuration.py} +45 -20
- openai_sdk_helpers/response/files.py +2 -0
- openai_sdk_helpers/response/planner.py +1 -1
- openai_sdk_helpers/response/prompter.py +1 -1
- openai_sdk_helpers/response/runner.py +1 -48
- openai_sdk_helpers/response/tool_call.py +0 -141
- openai_sdk_helpers/response/vector_store.py +8 -5
- openai_sdk_helpers/streamlit_app/__init__.py +1 -1
- openai_sdk_helpers/streamlit_app/app.py +17 -18
- openai_sdk_helpers/streamlit_app/{config.py → configuration.py} +13 -13
- openai_sdk_helpers/structure/__init__.py +16 -0
- openai_sdk_helpers/structure/base.py +239 -278
- openai_sdk_helpers/structure/extraction.py +1228 -0
- openai_sdk_helpers/structure/plan/plan.py +0 -20
- openai_sdk_helpers/structure/plan/task.py +0 -33
- openai_sdk_helpers/structure/prompt.py +16 -0
- openai_sdk_helpers/structure/responses.py +2 -2
- openai_sdk_helpers/structure/web_search.py +0 -10
- openai_sdk_helpers/tools.py +346 -99
- openai_sdk_helpers/types.py +3 -3
- openai_sdk_helpers/utils/__init__.py +9 -6
- openai_sdk_helpers/utils/json/base_model.py +316 -33
- openai_sdk_helpers/utils/json/data_class.py +1 -1
- openai_sdk_helpers/utils/langextract.py +194 -0
- openai_sdk_helpers/utils/registry.py +19 -15
- openai_sdk_helpers/vector_storage/storage.py +1 -1
- {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/METADATA +25 -11
- openai_sdk_helpers-0.5.0.dist-info/RECORD +95 -0
- openai_sdk_helpers/agent/prompt_utils.py +0 -15
- openai_sdk_helpers/context_manager.py +0 -241
- openai_sdk_helpers/deprecation.py +0 -167
- openai_sdk_helpers/retry.py +0 -175
- openai_sdk_helpers/streamlit_app/streamlit_web_search.py +0 -75
- openai_sdk_helpers/utils/deprecation.py +0 -167
- openai_sdk_helpers-0.4.2.dist-info/RECORD +0 -88
- /openai_sdk_helpers/{logging_config.py → logging.py} +0 -0
- /openai_sdk_helpers/{config.py → settings.py} +0 -0
- {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/WHEEL +0 -0
- {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/entry_points.txt +0 -0
- {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,13 +3,12 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
from pathlib import Path
|
|
6
|
-
from typing import Any, Dict, Optional
|
|
6
|
+
from typing import Any, Dict, Optional
|
|
7
7
|
|
|
8
8
|
from ..structure import SummaryStructure
|
|
9
9
|
from ..structure.base import StructureBase
|
|
10
10
|
from .base import AgentBase
|
|
11
|
-
from .
|
|
12
|
-
from .prompt_utils import DEFAULT_PROMPT_DIR
|
|
11
|
+
from .configuration import AgentConfiguration
|
|
13
12
|
|
|
14
13
|
|
|
15
14
|
class SummarizerAgent(AgentBase):
|
|
@@ -21,11 +20,10 @@ class SummarizerAgent(AgentBase):
|
|
|
21
20
|
|
|
22
21
|
Parameters
|
|
23
22
|
----------
|
|
24
|
-
|
|
25
|
-
Optional
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
Fallback model identifier when not specified elsewhere.
|
|
23
|
+
template_path : Path | str | None, default=None
|
|
24
|
+
Optional template file path for prompt rendering.
|
|
25
|
+
model : str | None, default=None
|
|
26
|
+
Model identifier to use for summarization.
|
|
29
27
|
output_structure : type[StructureBase], default=SummaryStructure
|
|
30
28
|
Type describing the expected summary output.
|
|
31
29
|
|
|
@@ -34,7 +32,7 @@ class SummarizerAgent(AgentBase):
|
|
|
34
32
|
Basic usage with default settings:
|
|
35
33
|
|
|
36
34
|
>>> from openai_sdk_helpers.agent import SummarizerAgent
|
|
37
|
-
>>> summarizer = SummarizerAgent(
|
|
35
|
+
>>> summarizer = SummarizerAgent(model="gpt-4o-mini")
|
|
38
36
|
>>> summary = summarizer.run_sync("Long text to summarize...")
|
|
39
37
|
>>> print(summary.text)
|
|
40
38
|
|
|
@@ -42,7 +40,7 @@ class SummarizerAgent(AgentBase):
|
|
|
42
40
|
|
|
43
41
|
>>> import asyncio
|
|
44
42
|
>>> async def main():
|
|
45
|
-
... summarizer = SummarizerAgent(
|
|
43
|
+
... summarizer = SummarizerAgent(model="gpt-4o-mini")
|
|
46
44
|
... result = await summarizer.run_agent(
|
|
47
45
|
... text="Article content...",
|
|
48
46
|
... metadata={"source": "news.txt", "date": "2025-01-01"}
|
|
@@ -59,42 +57,38 @@ class SummarizerAgent(AgentBase):
|
|
|
59
57
|
def __init__(
|
|
60
58
|
self,
|
|
61
59
|
*,
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
output_structure: Type[StructureBase] = SummaryStructure,
|
|
60
|
+
template_path: Path | str | None = None,
|
|
61
|
+
model: str | None = None,
|
|
65
62
|
) -> None:
|
|
66
63
|
"""Initialize the summarizer agent configuration.
|
|
67
64
|
|
|
68
65
|
Parameters
|
|
69
66
|
----------
|
|
70
|
-
|
|
71
|
-
Optional
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
Fallback model identifier when not specified elsewhere.
|
|
75
|
-
output_structure : type[StructureBase], default=SummaryStructure
|
|
76
|
-
Type describing the expected summary output.
|
|
67
|
+
template_path : Path | str | None, default=None
|
|
68
|
+
Optional template file path for prompt rendering.
|
|
69
|
+
model : str | None, default=None
|
|
70
|
+
Model identifier to use for summarization.
|
|
77
71
|
|
|
78
72
|
Raises
|
|
79
73
|
------
|
|
80
74
|
ValueError
|
|
81
|
-
If the
|
|
75
|
+
If the model is not provided.
|
|
82
76
|
|
|
83
77
|
Examples
|
|
84
78
|
--------
|
|
85
|
-
>>> summarizer = SummarizerAgent(
|
|
79
|
+
>>> summarizer = SummarizerAgent(model="gpt-4o-mini")
|
|
86
80
|
"""
|
|
87
|
-
|
|
81
|
+
configuration = AgentConfiguration(
|
|
88
82
|
name="summarizer",
|
|
89
83
|
instructions="Agent instructions",
|
|
90
84
|
description="Summarize passages into concise findings.",
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
super().__init__(
|
|
95
|
-
config=config, prompt_dir=prompt_directory, default_model=default_model
|
|
85
|
+
template_path=template_path,
|
|
86
|
+
output_structure=SummaryStructure,
|
|
87
|
+
model=model,
|
|
96
88
|
)
|
|
97
89
|
|
|
90
|
+
super().__init__(configuration=configuration)
|
|
91
|
+
|
|
98
92
|
async def run_agent(
|
|
99
93
|
self, text: str, metadata: Optional[Dict[str, Any]] = None
|
|
100
94
|
) -> Any:
|
|
@@ -5,12 +5,13 @@ from __future__ import annotations
|
|
|
5
5
|
from pathlib import Path
|
|
6
6
|
from typing import Any, Dict, Optional
|
|
7
7
|
|
|
8
|
-
|
|
9
|
-
from .config import AgentConfiguration
|
|
10
|
-
from .prompt_utils import DEFAULT_PROMPT_DIR
|
|
8
|
+
|
|
11
9
|
from ..structure import TranslationStructure
|
|
12
10
|
from ..structure.base import StructureBase
|
|
13
11
|
|
|
12
|
+
from .base import AgentBase
|
|
13
|
+
from .configuration import AgentConfiguration
|
|
14
|
+
|
|
14
15
|
|
|
15
16
|
class TranslatorAgent(AgentBase):
|
|
16
17
|
"""Translate text into a target language.
|
|
@@ -20,18 +21,17 @@ class TranslatorAgent(AgentBase):
|
|
|
20
21
|
|
|
21
22
|
Parameters
|
|
22
23
|
----------
|
|
23
|
-
|
|
24
|
-
Optional
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
Fallback model identifier when not specified elsewhere.
|
|
24
|
+
template_path : Path | str | None, default=None
|
|
25
|
+
Optional template file path for prompt rendering.
|
|
26
|
+
model : str | None, default=None
|
|
27
|
+
Model identifier to use for translation.
|
|
28
28
|
|
|
29
29
|
Examples
|
|
30
30
|
--------
|
|
31
31
|
Basic translation:
|
|
32
32
|
|
|
33
33
|
>>> from openai_sdk_helpers.agent import TranslatorAgent
|
|
34
|
-
>>> translator = TranslatorAgent(
|
|
34
|
+
>>> translator = TranslatorAgent(model="gpt-4o-mini")
|
|
35
35
|
>>> result = translator.run_sync("Hello world", target_language="Spanish")
|
|
36
36
|
>>> print(result.text)
|
|
37
37
|
'Hola mundo'
|
|
@@ -40,7 +40,7 @@ class TranslatorAgent(AgentBase):
|
|
|
40
40
|
|
|
41
41
|
>>> import asyncio
|
|
42
42
|
>>> async def main():
|
|
43
|
-
... translator = TranslatorAgent(
|
|
43
|
+
... translator = TranslatorAgent(model="gpt-4o-mini")
|
|
44
44
|
... result = await translator.run_agent(
|
|
45
45
|
... text="Good morning",
|
|
46
46
|
... target_language="French",
|
|
@@ -60,38 +60,36 @@ class TranslatorAgent(AgentBase):
|
|
|
60
60
|
def __init__(
|
|
61
61
|
self,
|
|
62
62
|
*,
|
|
63
|
-
|
|
64
|
-
|
|
63
|
+
template_path: Path | str | None = None,
|
|
64
|
+
model: str | None = None,
|
|
65
65
|
) -> None:
|
|
66
66
|
"""Initialize the translation agent configuration.
|
|
67
67
|
|
|
68
68
|
Parameters
|
|
69
69
|
----------
|
|
70
|
-
|
|
71
|
-
Optional
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
Fallback model identifier when not specified elsewhere.
|
|
70
|
+
template_path : Path | str | None, default=None
|
|
71
|
+
Optional template file path for prompt rendering.
|
|
72
|
+
model : str | None, default=None
|
|
73
|
+
Model identifier to use for translation.
|
|
75
74
|
|
|
76
75
|
Raises
|
|
77
76
|
------
|
|
78
77
|
ValueError
|
|
79
|
-
If the
|
|
78
|
+
If the model is not provided.
|
|
80
79
|
|
|
81
80
|
Examples
|
|
82
81
|
--------
|
|
83
|
-
>>> translator = TranslatorAgent(
|
|
82
|
+
>>> translator = TranslatorAgent(model="gpt-4o-mini")
|
|
84
83
|
"""
|
|
85
|
-
|
|
84
|
+
configuration = AgentConfiguration(
|
|
86
85
|
name="translator",
|
|
87
86
|
instructions="Agent instructions",
|
|
88
87
|
description="Translate text into the requested language.",
|
|
88
|
+
template_path=template_path,
|
|
89
89
|
output_structure=TranslationStructure,
|
|
90
|
+
model=model,
|
|
90
91
|
)
|
|
91
|
-
|
|
92
|
-
super().__init__(
|
|
93
|
-
config=config, prompt_dir=prompt_directory, default_model=default_model
|
|
94
|
-
)
|
|
92
|
+
super().__init__(configuration=configuration)
|
|
95
93
|
|
|
96
94
|
async def run_agent(
|
|
97
95
|
self,
|
|
@@ -7,8 +7,7 @@ from typing import Any, Dict, Optional
|
|
|
7
7
|
|
|
8
8
|
from ..structure.validation import ValidationResultStructure
|
|
9
9
|
from .base import AgentBase
|
|
10
|
-
from .
|
|
11
|
-
from .prompt_utils import DEFAULT_PROMPT_DIR
|
|
10
|
+
from .configuration import AgentConfiguration
|
|
12
11
|
|
|
13
12
|
|
|
14
13
|
class ValidatorAgent(AgentBase):
|
|
@@ -20,18 +19,17 @@ class ValidatorAgent(AgentBase):
|
|
|
20
19
|
|
|
21
20
|
Parameters
|
|
22
21
|
----------
|
|
23
|
-
|
|
24
|
-
Optional
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
Fallback model identifier when not specified elsewhere.
|
|
22
|
+
template_path : Path | str | None, default=None
|
|
23
|
+
Optional template file path for prompt rendering.
|
|
24
|
+
model : str | None, default=None
|
|
25
|
+
Model identifier to use for validation.
|
|
28
26
|
|
|
29
27
|
Examples
|
|
30
28
|
--------
|
|
31
29
|
Validate user input:
|
|
32
30
|
|
|
33
31
|
>>> from openai_sdk_helpers.agent import ValidatorAgent
|
|
34
|
-
>>> validator = ValidatorAgent(
|
|
32
|
+
>>> validator = ValidatorAgent(model="gpt-4o-mini")
|
|
35
33
|
>>> result = validator.run_sync("Tell me about Python programming")
|
|
36
34
|
>>> print(result.input_safe) # True
|
|
37
35
|
>>> print(result.violations) # []
|
|
@@ -40,7 +38,7 @@ class ValidatorAgent(AgentBase):
|
|
|
40
38
|
|
|
41
39
|
>>> import asyncio
|
|
42
40
|
>>> async def main():
|
|
43
|
-
... validator = ValidatorAgent(
|
|
41
|
+
... validator = ValidatorAgent(model="gpt-4o-mini")
|
|
44
42
|
... result = await validator.run_agent(
|
|
45
43
|
... user_input="Summarize this document",
|
|
46
44
|
... agent_output="Summary containing PII...",
|
|
@@ -59,38 +57,36 @@ class ValidatorAgent(AgentBase):
|
|
|
59
57
|
def __init__(
|
|
60
58
|
self,
|
|
61
59
|
*,
|
|
62
|
-
|
|
63
|
-
|
|
60
|
+
template_path: Path | str | None = None,
|
|
61
|
+
model: str | None = None,
|
|
64
62
|
) -> None:
|
|
65
63
|
"""Initialize the validator agent configuration.
|
|
66
64
|
|
|
67
65
|
Parameters
|
|
68
66
|
----------
|
|
69
|
-
|
|
70
|
-
Optional
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
Fallback model identifier when not specified elsewhere.
|
|
67
|
+
template_path : Path | str | None, default=None
|
|
68
|
+
Optional template file path for prompt rendering.
|
|
69
|
+
model : str | None, default=None
|
|
70
|
+
Model identifier to use for validation.
|
|
74
71
|
|
|
75
72
|
Raises
|
|
76
73
|
------
|
|
77
74
|
ValueError
|
|
78
|
-
If the
|
|
75
|
+
If the model is not provided.
|
|
79
76
|
|
|
80
77
|
Examples
|
|
81
78
|
--------
|
|
82
|
-
>>> validator = ValidatorAgent(
|
|
79
|
+
>>> validator = ValidatorAgent(model="gpt-4o-mini")
|
|
83
80
|
"""
|
|
84
|
-
|
|
81
|
+
configuration = AgentConfiguration(
|
|
85
82
|
name="validator",
|
|
86
83
|
instructions="Agent instructions",
|
|
87
84
|
description="Validate user input and agent output against guardrails.",
|
|
85
|
+
template_path=template_path,
|
|
88
86
|
output_structure=ValidationResultStructure,
|
|
87
|
+
model=model,
|
|
89
88
|
)
|
|
90
|
-
|
|
91
|
-
super().__init__(
|
|
92
|
-
config=config, prompt_dir=prompt_directory, default_model=default_model
|
|
93
|
-
)
|
|
89
|
+
super().__init__(configuration=configuration)
|
|
94
90
|
|
|
95
91
|
async def run_agent(
|
|
96
92
|
self,
|
openai_sdk_helpers/cli.py
CHANGED
|
@@ -18,17 +18,8 @@ registry inspect
|
|
|
18
18
|
from __future__ import annotations
|
|
19
19
|
|
|
20
20
|
import argparse
|
|
21
|
-
import json
|
|
22
21
|
import sys
|
|
23
22
|
from pathlib import Path
|
|
24
|
-
from typing import Any
|
|
25
|
-
|
|
26
|
-
try:
|
|
27
|
-
import openai_sdk_helpers
|
|
28
|
-
|
|
29
|
-
__version__ = getattr(openai_sdk_helpers, "__version__", "unknown")
|
|
30
|
-
except ImportError:
|
|
31
|
-
__version__ = "unknown"
|
|
32
23
|
|
|
33
24
|
|
|
34
25
|
def cmd_agent_test(args: argparse.Namespace) -> int:
|
|
@@ -159,8 +150,8 @@ def cmd_registry_list(args: argparse.Namespace) -> int:
|
|
|
159
150
|
|
|
160
151
|
print("Registered configurations:")
|
|
161
152
|
for name in sorted(names):
|
|
162
|
-
|
|
163
|
-
tools_count = len(
|
|
153
|
+
configuration = registry.get(name)
|
|
154
|
+
tools_count = len(configuration.tools) if configuration.tools else 0
|
|
164
155
|
print(f" - {name} ({tools_count} tools)")
|
|
165
156
|
|
|
166
157
|
return 0
|
|
@@ -199,7 +190,7 @@ def cmd_registry_inspect(args: argparse.Namespace) -> int:
|
|
|
199
190
|
registry = get_default_registry()
|
|
200
191
|
|
|
201
192
|
try:
|
|
202
|
-
|
|
193
|
+
configuration = registry.get(args.config_name)
|
|
203
194
|
except KeyError:
|
|
204
195
|
print(f"Error: Configuration '{args.config_name}' not found", file=sys.stderr)
|
|
205
196
|
print("\nAvailable configurations:")
|
|
@@ -207,17 +198,17 @@ def cmd_registry_inspect(args: argparse.Namespace) -> int:
|
|
|
207
198
|
print(f" - {name}")
|
|
208
199
|
return 1
|
|
209
200
|
|
|
210
|
-
print(f"Configuration: {
|
|
211
|
-
instructions_str = str(
|
|
201
|
+
print(f"Configuration: {configuration.name}")
|
|
202
|
+
instructions_str = str(configuration.instructions)
|
|
212
203
|
instructions_preview = (
|
|
213
204
|
instructions_str[:100] if len(instructions_str) > 100 else instructions_str
|
|
214
205
|
)
|
|
215
206
|
print(f"Instructions: {instructions_preview}...")
|
|
216
|
-
print(f"Tools: {len(
|
|
207
|
+
print(f"Tools: {len(configuration.tools) if configuration.tools else 0}")
|
|
217
208
|
|
|
218
|
-
if
|
|
209
|
+
if configuration.tools:
|
|
219
210
|
print("\nTool names:")
|
|
220
|
-
for tool in
|
|
211
|
+
for tool in configuration.tools:
|
|
221
212
|
tool_name = tool.get("function", {}).get("name", "unknown")
|
|
222
213
|
print(f" - {tool_name}")
|
|
223
214
|
|
|
@@ -245,11 +236,6 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
245
236
|
prog="openai-helpers",
|
|
246
237
|
description="OpenAI SDK Helpers CLI",
|
|
247
238
|
)
|
|
248
|
-
parser.add_argument(
|
|
249
|
-
"--version",
|
|
250
|
-
action="version",
|
|
251
|
-
version=f"openai-sdk-helpers {__version__}",
|
|
252
|
-
)
|
|
253
239
|
|
|
254
240
|
subparsers = parser.add_subparsers(dest="command", help="Commands")
|
|
255
241
|
|
|
@@ -18,8 +18,6 @@ get_data_path(name)
|
|
|
18
18
|
|
|
19
19
|
from __future__ import annotations
|
|
20
20
|
|
|
21
|
-
import os
|
|
22
|
-
import os
|
|
23
21
|
from pathlib import Path
|
|
24
22
|
from dotenv import load_dotenv
|
|
25
23
|
|
|
@@ -61,18 +59,15 @@ def get_data_path(name: str) -> Path:
|
|
|
61
59
|
return ensure_directory(path)
|
|
62
60
|
|
|
63
61
|
|
|
64
|
-
def
|
|
65
|
-
"""Return the
|
|
62
|
+
def get_package_path() -> Path:
|
|
63
|
+
"""Return the root path of the openai-sdk-helpers package.
|
|
66
64
|
|
|
67
65
|
Returns
|
|
68
66
|
-------
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
Examples
|
|
73
|
-
--------
|
|
74
|
-
>>> from openai_sdk_helpers.environment import _get_default_model
|
|
75
|
-
>>> _get_default_model()
|
|
76
|
-
'gpt-4o-mini'
|
|
67
|
+
Path
|
|
68
|
+
Root directory path of the openai-sdk-helpers package.
|
|
77
69
|
"""
|
|
78
|
-
return
|
|
70
|
+
return Path(__file__).parent
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
DEFAULT_PROMPT_DIR = get_package_path() / "prompt"
|
openai_sdk_helpers/errors.py
CHANGED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""LangExtract-powered document extraction helpers."""
|
|
2
|
+
|
|
3
|
+
from .extractor import DocumentExtractor
|
|
4
|
+
from .generator import (
|
|
5
|
+
EXTRACTOR_CONFIG_GENERATOR,
|
|
6
|
+
EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS,
|
|
7
|
+
PROMPT_OPTIMIZER_AGENT_INSTRUCTIONS,
|
|
8
|
+
generate_document_extractor_config,
|
|
9
|
+
generate_document_extractor_config_with_agent,
|
|
10
|
+
optimize_extractor_prompt,
|
|
11
|
+
optimize_extractor_prompt_with_agent,
|
|
12
|
+
)
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"DocumentExtractor",
|
|
16
|
+
"EXTRACTOR_CONFIG_GENERATOR",
|
|
17
|
+
"EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS",
|
|
18
|
+
"PROMPT_OPTIMIZER_AGENT_INSTRUCTIONS",
|
|
19
|
+
"generate_document_extractor_config",
|
|
20
|
+
"generate_document_extractor_config_with_agent",
|
|
21
|
+
"optimize_extractor_prompt",
|
|
22
|
+
"optimize_extractor_prompt_with_agent",
|
|
23
|
+
]
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
"""Document extraction helpers powered by LangExtract."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
import typing
|
|
8
|
+
|
|
9
|
+
import langextract as lx
|
|
10
|
+
from langextract.core import format_handler as lx_format_handler
|
|
11
|
+
from langextract.core.data import AnnotatedDocument as LXAnnotatedDocument
|
|
12
|
+
|
|
13
|
+
from ..errors import ExtractionError
|
|
14
|
+
from ..structure.extraction import (
|
|
15
|
+
AnnotatedDocumentStructure,
|
|
16
|
+
DocumentStructure,
|
|
17
|
+
ExampleDataStructure,
|
|
18
|
+
)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class DocumentExtractor:
    """Extract structured data from documents using LangExtract.

    Parameters
    ----------
    prompt_description : str
        Prompt description used by LangExtract.
    examples : Sequence[ExampleDataStructure]
        Example payloads supplied to LangExtract. Must not be empty.
    model_id : str
        Model identifier to pass to LangExtract.
    max_workers : int, optional
        Maximum number of workers for concurrent extraction. Default is 1.

    Methods
    -------
    extract(input_text)
        Extract structured data from one or more documents.
    """

    def __init__(
        self,
        prompt_description: str,
        examples: typing.Sequence[ExampleDataStructure],
        model_id: str,
        max_workers: int = 1,
    ) -> None:
        """Initialize the extractor.

        Parameters
        ----------
        prompt_description : str
            Prompt description used by LangExtract.
        examples : Sequence[ExampleDataStructure]
            Example payloads supplied to LangExtract.
        model_id : str
            Model identifier to pass to LangExtract.
        max_workers : int, optional
            Maximum number of workers for concurrent extraction. Default is 1.

        Raises
        ------
        ValueError
            If ``examples`` is empty.
        """
        if not examples:
            raise ValueError(
                "Examples are required for reliable extraction. "
                "Provide at least one ExampleDataStructure instance."
            )
        self.model_id = model_id
        self.prompt = prompt_description
        self.examples = examples
        self.max_workers = max_workers

    def extract(
        self, input_text: DocumentStructure | list[DocumentStructure]
    ) -> list[AnnotatedDocumentStructure]:
        """Run the extraction.

        Parameters
        ----------
        input_text : DocumentStructure | list[DocumentStructure]
            Document or list of documents to extract data from.

        Returns
        -------
        list[AnnotatedDocumentStructure]
            Extracted items for the provided documents. A single
            (non-list) LangExtract result is wrapped in a one-element list.
        """
        # Normalize the single-document case so the rest of the method only
        # deals with a list.
        if isinstance(input_text, DocumentStructure):
            input_documents = [input_text]
        else:
            input_documents = input_text
        documents = DocumentStructure.to_dataclass_list(input_documents)
        examples = ExampleDataStructure.to_dataclass_list(self.examples)
        # The custom handler cleans parsed items before the resolver sees them.
        resolver_params = {"format_handler": _SanitizingFormatHandler()}
        result = lx.extract(
            text_or_documents=documents,
            prompt_description=self.prompt,
            examples=examples,
            model_id=self.model_id,  # Automatically selects OpenAI provider
            api_key=os.environ.get("OPENAI_API_KEY"),
            fence_output=True,
            use_schema_constraints=False,
            # Forward the configured worker count; previously it was stored
            # on the instance but never reached LangExtract.
            max_workers=self.max_workers,
            resolver_params=resolver_params,
        )

        def _convert(data: typing.Any) -> AnnotatedDocumentStructure:
            # LangExtract dataclass results go through the dataclass adapter;
            # anything else is validated as a plain payload.
            if isinstance(data, LXAnnotatedDocument):
                return AnnotatedDocumentStructure.from_dataclass(data)
            return AnnotatedDocumentStructure.model_validate(data)

        if isinstance(result, list):
            return [_convert(doc) for doc in result]

        return [_convert(result)]
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def _sanitize_extraction_items(
    items: typing.Sequence[typing.Mapping[str, lx_format_handler.ExtractionValueType]],
    attribute_suffix: str,
) -> list[dict[str, lx_format_handler.ExtractionValueType]]:
    """Return a cleaned copy of each parsed extraction item.

    Every key/value pair is passed through ``_sanitize_extraction_value``;
    pairs it rejects are omitted and the rest keep their cleaned value.
    One output dict is produced per input item, in the same order.
    """
    cleaned_items: list[dict[str, lx_format_handler.ExtractionValueType]] = []
    for raw_item in items:
        verdicts = (
            (name, _sanitize_extraction_value(name, raw_value, attribute_suffix))
            for name, raw_value in raw_item.items()
        )
        cleaned_items.append(
            {name: cleaned for name, (keep, cleaned) in verdicts if keep}
        )
    return cleaned_items
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _sanitize_extraction_value(
|
|
132
|
+
key: str,
|
|
133
|
+
value: lx_format_handler.ExtractionValueType,
|
|
134
|
+
attribute_suffix: str,
|
|
135
|
+
) -> tuple[bool, lx_format_handler.ExtractionValueType]:
|
|
136
|
+
if value is None:
|
|
137
|
+
return False, None
|
|
138
|
+
if key.endswith(attribute_suffix):
|
|
139
|
+
if isinstance(value, dict):
|
|
140
|
+
return True, value
|
|
141
|
+
return False, None
|
|
142
|
+
if isinstance(value, (str, int, float)):
|
|
143
|
+
return True, value
|
|
144
|
+
return True, json.dumps(value, ensure_ascii=False)
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
class _SanitizingFormatHandler(lx_format_handler.FormatHandler):
    """Format handler that cleans parsed LangExtract items before resolution."""

    def parse_output(
        self, text: str, *, strict: bool | None = None
    ) -> typing.Sequence[typing.Mapping[str, lx_format_handler.ExtractionValueType]]:
        """Parse ``text`` with the base handler, then sanitize each item.

        The parent implementation does the actual parsing; its result is
        passed through ``_sanitize_extraction_items`` together with this
        handler's ``attribute_suffix``.
        """
        parsed = super().parse_output(text, strict=strict)
        suffix = self.attribute_suffix
        return _sanitize_extraction_items(parsed, suffix)
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
__all__ = ["DocumentExtractor", "ExtractionError"]
|