openai-sdk-helpers 0.4.2__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68)
  1. openai_sdk_helpers/__init__.py +45 -41
  2. openai_sdk_helpers/agent/__init__.py +4 -6
  3. openai_sdk_helpers/agent/base.py +110 -191
  4. openai_sdk_helpers/agent/{config.py → configuration.py} +24 -32
  5. openai_sdk_helpers/agent/{coordination.py → coordinator.py} +22 -23
  6. openai_sdk_helpers/agent/runner.py +3 -45
  7. openai_sdk_helpers/agent/search/base.py +54 -76
  8. openai_sdk_helpers/agent/search/vector.py +92 -108
  9. openai_sdk_helpers/agent/search/web.py +104 -82
  10. openai_sdk_helpers/agent/summarizer.py +22 -28
  11. openai_sdk_helpers/agent/translator.py +22 -24
  12. openai_sdk_helpers/agent/{validation.py → validator.py} +19 -23
  13. openai_sdk_helpers/cli.py +8 -22
  14. openai_sdk_helpers/environment.py +8 -13
  15. openai_sdk_helpers/errors.py +9 -0
  16. openai_sdk_helpers/extract/__init__.py +23 -0
  17. openai_sdk_helpers/extract/extractor.py +157 -0
  18. openai_sdk_helpers/extract/generator.py +476 -0
  19. openai_sdk_helpers/prompt/extractor_config_agent_instructions.jinja +6 -0
  20. openai_sdk_helpers/prompt/extractor_config_generator.jinja +37 -0
  21. openai_sdk_helpers/prompt/extractor_config_generator_instructions.jinja +9 -0
  22. openai_sdk_helpers/prompt/extractor_prompt_optimizer_agent_instructions.jinja +4 -0
  23. openai_sdk_helpers/prompt/extractor_prompt_optimizer_request.jinja +11 -0
  24. openai_sdk_helpers/prompt/vector_planner.jinja +7 -0
  25. openai_sdk_helpers/prompt/vector_search.jinja +6 -0
  26. openai_sdk_helpers/prompt/vector_writer.jinja +7 -0
  27. openai_sdk_helpers/response/__init__.py +3 -7
  28. openai_sdk_helpers/response/base.py +89 -98
  29. openai_sdk_helpers/response/{config.py → configuration.py} +45 -20
  30. openai_sdk_helpers/response/files.py +2 -0
  31. openai_sdk_helpers/response/planner.py +1 -1
  32. openai_sdk_helpers/response/prompter.py +1 -1
  33. openai_sdk_helpers/response/runner.py +1 -48
  34. openai_sdk_helpers/response/tool_call.py +0 -141
  35. openai_sdk_helpers/response/vector_store.py +8 -5
  36. openai_sdk_helpers/streamlit_app/__init__.py +1 -1
  37. openai_sdk_helpers/streamlit_app/app.py +17 -18
  38. openai_sdk_helpers/streamlit_app/{config.py → configuration.py} +13 -13
  39. openai_sdk_helpers/structure/__init__.py +16 -0
  40. openai_sdk_helpers/structure/base.py +239 -278
  41. openai_sdk_helpers/structure/extraction.py +1228 -0
  42. openai_sdk_helpers/structure/plan/plan.py +0 -20
  43. openai_sdk_helpers/structure/plan/task.py +0 -33
  44. openai_sdk_helpers/structure/prompt.py +16 -0
  45. openai_sdk_helpers/structure/responses.py +2 -2
  46. openai_sdk_helpers/structure/web_search.py +0 -10
  47. openai_sdk_helpers/tools.py +346 -99
  48. openai_sdk_helpers/types.py +3 -3
  49. openai_sdk_helpers/utils/__init__.py +9 -6
  50. openai_sdk_helpers/utils/json/base_model.py +316 -33
  51. openai_sdk_helpers/utils/json/data_class.py +1 -1
  52. openai_sdk_helpers/utils/langextract.py +194 -0
  53. openai_sdk_helpers/utils/registry.py +19 -15
  54. openai_sdk_helpers/vector_storage/storage.py +1 -1
  55. {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/METADATA +25 -11
  56. openai_sdk_helpers-0.5.0.dist-info/RECORD +95 -0
  57. openai_sdk_helpers/agent/prompt_utils.py +0 -15
  58. openai_sdk_helpers/context_manager.py +0 -241
  59. openai_sdk_helpers/deprecation.py +0 -167
  60. openai_sdk_helpers/retry.py +0 -175
  61. openai_sdk_helpers/streamlit_app/streamlit_web_search.py +0 -75
  62. openai_sdk_helpers/utils/deprecation.py +0 -167
  63. openai_sdk_helpers-0.4.2.dist-info/RECORD +0 -88
  64. /openai_sdk_helpers/{logging_config.py → logging.py} +0 -0
  65. /openai_sdk_helpers/{config.py → settings.py} +0 -0
  66. {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/WHEEL +0 -0
  67. {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/entry_points.txt +0 -0
  68. {openai_sdk_helpers-0.4.2.dist-info → openai_sdk_helpers-0.5.0.dist-info}/licenses/LICENSE +0 -0
openai_sdk_helpers/agent/summarizer.py CHANGED
@@ -3,13 +3,12 @@
 from __future__ import annotations
 
 from pathlib import Path
-from typing import Any, Dict, Optional, Type
+from typing import Any, Dict, Optional
 
 from ..structure import SummaryStructure
 from ..structure.base import StructureBase
 from .base import AgentBase
-from .config import AgentConfiguration
-from .prompt_utils import DEFAULT_PROMPT_DIR
+from .configuration import AgentConfiguration
 
 
 class SummarizerAgent(AgentBase):
@@ -21,11 +20,10 @@ class SummarizerAgent(AgentBase):
 
     Parameters
     ----------
-    prompt_dir : Path or None, default=None
-        Optional directory containing Jinja prompt templates. Defaults to the
-        packaged ``prompt`` directory when not provided.
-    default_model : str or None, default=None
-        Fallback model identifier when not specified elsewhere.
+    template_path : Path | str | None, default=None
+        Optional template file path for prompt rendering.
+    model : str | None, default=None
+        Model identifier to use for summarization.
     output_structure : type[StructureBase], default=SummaryStructure
         Type describing the expected summary output.
 
@@ -34,7 +32,7 @@
     Basic usage with default settings:
 
     >>> from openai_sdk_helpers.agent import SummarizerAgent
-    >>> summarizer = SummarizerAgent(default_model="gpt-4o-mini")
+    >>> summarizer = SummarizerAgent(model="gpt-4o-mini")
     >>> summary = summarizer.run_sync("Long text to summarize...")
     >>> print(summary.text)
 
@@ -42,7 +40,7 @@
 
     >>> import asyncio
     >>> async def main():
-    ...     summarizer = SummarizerAgent(default_model="gpt-4o-mini")
+    ...     summarizer = SummarizerAgent(model="gpt-4o-mini")
     ...     result = await summarizer.run_agent(
     ...         text="Article content...",
     ...         metadata={"source": "news.txt", "date": "2025-01-01"}
@@ -59,42 +57,38 @@ class SummarizerAgent(AgentBase):
     def __init__(
         self,
         *,
-        prompt_dir: Optional[Path] = None,
-        default_model: Optional[str] = None,
-        output_structure: Type[StructureBase] = SummaryStructure,
+        template_path: Path | str | None = None,
+        model: str | None = None,
     ) -> None:
         """Initialize the summarizer agent configuration.
 
         Parameters
         ----------
-        prompt_dir : Path or None, default=None
-            Optional directory containing Jinja prompt templates. Defaults to the
-            packaged ``prompt`` directory when not provided.
-        default_model : str or None, default=None
-            Fallback model identifier when not specified elsewhere.
-        output_structure : type[StructureBase], default=SummaryStructure
-            Type describing the expected summary output.
+        template_path : Path | str | None, default=None
+            Optional template file path for prompt rendering.
+        model : str | None, default=None
+            Model identifier to use for summarization.
 
         Raises
         ------
         ValueError
-            If the default model is not provided.
+            If the model is not provided.
 
         Examples
        --------
-        >>> summarizer = SummarizerAgent(default_model="gpt-4o-mini")
+        >>> summarizer = SummarizerAgent(model="gpt-4o-mini")
         """
-        config = AgentConfiguration(
+        configuration = AgentConfiguration(
             name="summarizer",
             instructions="Agent instructions",
             description="Summarize passages into concise findings.",
-            output_structure=output_structure,
-        )
-        prompt_directory = prompt_dir or DEFAULT_PROMPT_DIR
-        super().__init__(
-            config=config, prompt_dir=prompt_directory, default_model=default_model
+            template_path=template_path,
+            output_structure=SummaryStructure,
+            model=model,
         )
 
+        super().__init__(configuration=configuration)
+
     async def run_agent(
         self, text: str, metadata: Optional[Dict[str, Any]] = None
     ) -> Any:
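The same constructor rename (`prompt_dir`/`default_model` → `template_path`/`model`, now routed through `AgentConfiguration`) appears again in the translator and validator diffs below. A minimal migration sketch based only on the signatures shown in this diff; the template path is illustrative, not a file shipped by the package:

from openai_sdk_helpers.agent import SummarizerAgent

# 0.4.2 style (removed in 0.5.0):
#   SummarizerAgent(prompt_dir=Path("prompts"), default_model="gpt-4o-mini")
# 0.5.0 style:
summarizer = SummarizerAgent(
    model="gpt-4o-mini",                    # replaces default_model
    template_path="prompts/summary.jinja",  # replaces prompt_dir; optional
)
summary = summarizer.run_sync("Long text to summarize...")
print(summary.text)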
openai_sdk_helpers/agent/translator.py CHANGED
@@ -5,12 +5,13 @@ from __future__ import annotations
 from pathlib import Path
 from typing import Any, Dict, Optional
 
-from .base import AgentBase
-from .config import AgentConfiguration
-from .prompt_utils import DEFAULT_PROMPT_DIR
+
 from ..structure import TranslationStructure
 from ..structure.base import StructureBase
 
+from .base import AgentBase
+from .configuration import AgentConfiguration
+
 
 class TranslatorAgent(AgentBase):
     """Translate text into a target language.
@@ -20,18 +21,17 @@ class TranslatorAgent(AgentBase):
 
     Parameters
     ----------
-    prompt_dir : Path or None, default=None
-        Optional directory containing Jinja prompt templates. Defaults to the
-        packaged ``prompt`` directory when not provided.
-    default_model : str or None, default=None
-        Fallback model identifier when not specified elsewhere.
+    template_path : Path | str | None, default=None
+        Optional template file path for prompt rendering.
+    model : str | None, default=None
+        Model identifier to use for translation.
 
     Examples
     --------
     Basic translation:
 
     >>> from openai_sdk_helpers.agent import TranslatorAgent
-    >>> translator = TranslatorAgent(default_model="gpt-4o-mini")
+    >>> translator = TranslatorAgent(model="gpt-4o-mini")
     >>> result = translator.run_sync("Hello world", target_language="Spanish")
     >>> print(result.text)
     'Hola mundo'
@@ -40,7 +40,7 @@ class TranslatorAgent(AgentBase):
 
     >>> import asyncio
     >>> async def main():
-    ...     translator = TranslatorAgent(default_model="gpt-4o-mini")
+    ...     translator = TranslatorAgent(model="gpt-4o-mini")
     ...     result = await translator.run_agent(
     ...         text="Good morning",
     ...         target_language="French",
@@ -60,38 +60,36 @@ class TranslatorAgent(AgentBase):
     def __init__(
         self,
         *,
-        prompt_dir: Optional[Path] = None,
-        default_model: Optional[str] = None,
+        template_path: Path | str | None = None,
+        model: str | None = None,
     ) -> None:
         """Initialize the translation agent configuration.
 
         Parameters
         ----------
-        prompt_dir : Path or None, default=None
-            Optional directory containing Jinja prompt templates. Defaults to the
-            packaged ``prompt`` directory when not provided.
-        default_model : str or None, default=None
-            Fallback model identifier when not specified elsewhere.
+        template_path : Path | str | None, default=None
+            Optional template file path for prompt rendering.
+        model : str | None, default=None
+            Model identifier to use for translation.
 
         Raises
         ------
         ValueError
-            If the default model is not provided.
+            If the model is not provided.
 
         Examples
         --------
-        >>> translator = TranslatorAgent(default_model="gpt-4o-mini")
+        >>> translator = TranslatorAgent(model="gpt-4o-mini")
         """
-        config = AgentConfiguration(
+        configuration = AgentConfiguration(
             name="translator",
             instructions="Agent instructions",
             description="Translate text into the requested language.",
+            template_path=template_path,
             output_structure=TranslationStructure,
+            model=model,
         )
-        prompt_directory = prompt_dir or DEFAULT_PROMPT_DIR
-        super().__init__(
-            config=config, prompt_dir=prompt_directory, default_model=default_model
-        )
+        super().__init__(configuration=configuration)
 
     async def run_agent(
         self,
openai_sdk_helpers/agent/validator.py CHANGED
@@ -7,8 +7,7 @@ from typing import Any, Dict, Optional
 
 from ..structure.validation import ValidationResultStructure
 from .base import AgentBase
-from .config import AgentConfiguration
-from .prompt_utils import DEFAULT_PROMPT_DIR
+from .configuration import AgentConfiguration
 
 
 class ValidatorAgent(AgentBase):
@@ -20,18 +19,17 @@ class ValidatorAgent(AgentBase):
 
     Parameters
     ----------
-    prompt_dir : Path or None, default=None
-        Optional directory containing Jinja prompt templates. Defaults to the
-        packaged ``prompt`` directory when not provided.
-    default_model : str or None, default=None
-        Fallback model identifier when not specified elsewhere.
+    template_path : Path | str | None, default=None
+        Optional template file path for prompt rendering.
+    model : str | None, default=None
+        Model identifier to use for validation.
 
     Examples
     --------
     Validate user input:
 
     >>> from openai_sdk_helpers.agent import ValidatorAgent
-    >>> validator = ValidatorAgent(default_model="gpt-4o-mini")
+    >>> validator = ValidatorAgent(model="gpt-4o-mini")
     >>> result = validator.run_sync("Tell me about Python programming")
     >>> print(result.input_safe)  # True
     >>> print(result.violations)  # []
@@ -40,7 +38,7 @@ class ValidatorAgent(AgentBase):
 
     >>> import asyncio
     >>> async def main():
-    ...     validator = ValidatorAgent(default_model="gpt-4o-mini")
+    ...     validator = ValidatorAgent(model="gpt-4o-mini")
     ...     result = await validator.run_agent(
     ...         user_input="Summarize this document",
     ...         agent_output="Summary containing PII...",
@@ -59,38 +57,36 @@ class ValidatorAgent(AgentBase):
     def __init__(
         self,
         *,
-        prompt_dir: Optional[Path] = None,
-        default_model: Optional[str] = None,
+        template_path: Path | str | None = None,
+        model: str | None = None,
     ) -> None:
         """Initialize the validator agent configuration.
 
         Parameters
         ----------
-        prompt_dir : Path or None, default=None
-            Optional directory containing Jinja prompt templates. Defaults to the
-            packaged ``prompt`` directory when not provided.
-        default_model : str or None, default=None
-            Fallback model identifier when not specified elsewhere.
+        template_path : Path | str | None, default=None
+            Optional template file path for prompt rendering.
+        model : str | None, default=None
+            Model identifier to use for validation.
 
         Raises
        ------
         ValueError
-            If the default model is not provided.
+            If the model is not provided.
 
         Examples
         --------
-        >>> validator = ValidatorAgent(default_model="gpt-4o-mini")
+        >>> validator = ValidatorAgent(model="gpt-4o-mini")
         """
-        config = AgentConfiguration(
+        configuration = AgentConfiguration(
             name="validator",
             instructions="Agent instructions",
             description="Validate user input and agent output against guardrails.",
+            template_path=template_path,
             output_structure=ValidationResultStructure,
+            model=model,
        )
-        prompt_directory = prompt_dir or DEFAULT_PROMPT_DIR
-        super().__init__(
-            config=config, prompt_dir=prompt_directory, default_model=default_model
-        )
+        super().__init__(configuration=configuration)
 
     async def run_agent(
         self,
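Taken together, the docstrings above suggest a simple guardrail pipeline. A sketch assuming `run_agent` returns the structures shown in those docstrings (`summary.text`, `result.input_safe`, `result.violations`):

import asyncio

from openai_sdk_helpers.agent import SummarizerAgent, ValidatorAgent

async def main():
    summarizer = SummarizerAgent(model="gpt-4o-mini")
    validator = ValidatorAgent(model="gpt-4o-mini")
    summary = await summarizer.run_agent(text="Article content...")
    check = await validator.run_agent(
        user_input="Summarize this document",
        agent_output=summary.text,
    )
    if not check.input_safe:
        print(check.violations)  # guardrail violations, [] when clean

asyncio.run(main())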
openai_sdk_helpers/cli.py CHANGED
@@ -18,17 +18,8 @@ registry inspect
 from __future__ import annotations
 
 import argparse
-import json
 import sys
 from pathlib import Path
-from typing import Any
-
-try:
-    import openai_sdk_helpers
-
-    __version__ = getattr(openai_sdk_helpers, "__version__", "unknown")
-except ImportError:
-    __version__ = "unknown"
 
 
 def cmd_agent_test(args: argparse.Namespace) -> int:
@@ -159,8 +150,8 @@ def cmd_registry_list(args: argparse.Namespace) -> int:
 
     print("Registered configurations:")
     for name in sorted(names):
-        config = registry.get(name)
-        tools_count = len(config.tools) if config.tools else 0
+        configuration = registry.get(name)
+        tools_count = len(configuration.tools) if configuration.tools else 0
         print(f" - {name} ({tools_count} tools)")
 
     return 0
@@ -199,7 +190,7 @@ def cmd_registry_inspect(args: argparse.Namespace) -> int:
     registry = get_default_registry()
 
     try:
-        config = registry.get(args.config_name)
+        configuration = registry.get(args.config_name)
     except KeyError:
         print(f"Error: Configuration '{args.config_name}' not found", file=sys.stderr)
         print("\nAvailable configurations:")
@@ -207,17 +198,17 @@ def cmd_registry_inspect(args: argparse.Namespace) -> int:
             print(f" - {name}")
         return 1
 
-    print(f"Configuration: {config.name}")
-    instructions_str = str(config.instructions)
+    print(f"Configuration: {configuration.name}")
+    instructions_str = str(configuration.instructions)
     instructions_preview = (
         instructions_str[:100] if len(instructions_str) > 100 else instructions_str
     )
     print(f"Instructions: {instructions_preview}...")
-    print(f"Tools: {len(config.tools) if config.tools else 0}")
+    print(f"Tools: {len(configuration.tools) if configuration.tools else 0}")
 
-    if config.tools:
+    if configuration.tools:
         print("\nTool names:")
-        for tool in config.tools:
+        for tool in configuration.tools:
             tool_name = tool.get("function", {}).get("name", "unknown")
             print(f" - {tool_name}")
 
@@ -245,11 +236,6 @@ def main(argv: list[str] | None = None) -> int:
         prog="openai-helpers",
         description="OpenAI SDK Helpers CLI",
     )
-    parser.add_argument(
-        "--version",
-        action="version",
-        version=f"openai-sdk-helpers {__version__}",
-    )
 
     subparsers = parser.add_subparsers(dest="command", help="Commands")
 
openai_sdk_helpers/environment.py CHANGED
@@ -18,8 +18,6 @@ get_data_path(name)
 
 from __future__ import annotations
 
-import os
-import os
 from pathlib import Path
 from dotenv import load_dotenv
 
@@ -61,18 +59,15 @@ def get_data_path(name: str) -> Path:
     return ensure_directory(path)
 
 
-def get_model() -> str:
-    """Return the default model identifier.
+def get_package_path() -> Path:
+    """Return the root path of the openai-sdk-helpers package.
 
     Returns
     -------
-    str
-        Default OpenAI model identifier.
-
-    Examples
-    --------
-    >>> from openai_sdk_helpers.environment import _get_default_model
-    >>> _get_default_model()
-    'gpt-4o-mini'
+    Path
+        Root directory path of the openai-sdk-helpers package.
     """
-    return os.getenv("DEFAULT_MODEL", DEFAULT_MODEL)
+    return Path(__file__).parent
+
+
+DEFAULT_PROMPT_DIR = get_package_path() / "prompt"
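`get_model()` is gone; the module now exposes the package root and a bundled-prompt directory instead. A short sketch using only what the hunk above defines (the template name comes from the file list at the top of this diff):

from openai_sdk_helpers.environment import DEFAULT_PROMPT_DIR, get_package_path

print(get_package_path())                          # installed package root
print(DEFAULT_PROMPT_DIR / "vector_search.jinja")  # path to a bundled Jinja template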
openai_sdk_helpers/errors.py CHANGED
@@ -119,3 +119,12 @@ class ResourceCleanupError(OpenAISDKError):
     """
 
     pass
+
+
+class ExtractionError(OpenAISDKError):
+    """Extraction execution failed.
+
+    Raised when LangExtract operations fail or output validation fails.
+    """
+
+    pass
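Because `ExtractionError` subclasses `OpenAISDKError`, callers can catch either. A minimal sketch, assuming extraction failures from the new `extract` helpers (introduced below) surface as this exception; `extractor` and `documents` are placeholders:

from openai_sdk_helpers.errors import ExtractionError

try:
    annotated = extractor.extract(documents)  # placeholder DocumentExtractor call; see below
except ExtractionError as exc:
    print(f"extraction failed: {exc}")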
openai_sdk_helpers/extract/__init__.py ADDED
@@ -0,0 +1,23 @@
+"""LangExtract-powered document extraction helpers."""
+
+from .extractor import DocumentExtractor
+from .generator import (
+    EXTRACTOR_CONFIG_GENERATOR,
+    EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS,
+    PROMPT_OPTIMIZER_AGENT_INSTRUCTIONS,
+    generate_document_extractor_config,
+    generate_document_extractor_config_with_agent,
+    optimize_extractor_prompt,
+    optimize_extractor_prompt_with_agent,
+)
+
+__all__ = [
+    "DocumentExtractor",
+    "EXTRACTOR_CONFIG_GENERATOR",
+    "EXTRACTOR_CONFIG_AGENT_INSTRUCTIONS",
+    "PROMPT_OPTIMIZER_AGENT_INSTRUCTIONS",
+    "generate_document_extractor_config",
+    "generate_document_extractor_config_with_agent",
+    "optimize_extractor_prompt",
+    "optimize_extractor_prompt_with_agent",
+]
openai_sdk_helpers/extract/extractor.py ADDED
@@ -0,0 +1,157 @@
+"""Document extraction helpers powered by LangExtract."""
+
+from __future__ import annotations
+
+import json
+import os
+import typing
+
+import langextract as lx
+from langextract.core import format_handler as lx_format_handler
+from langextract.core.data import AnnotatedDocument as LXAnnotatedDocument
+
+from ..errors import ExtractionError
+from ..structure.extraction import (
+    AnnotatedDocumentStructure,
+    DocumentStructure,
+    ExampleDataStructure,
+)
+
+
+class DocumentExtractor:
+    """Extract structured data from documents using LangExtract.
+
+    Parameters
+    ----------
+    prompt_description : str
+        Prompt description used by LangExtract.
+    examples : Sequence[ExampleDataStructure]
+        Example payloads supplied to LangExtract.
+    model_id : str
+        Model identifier to pass to LangExtract.
+    max_workers : int, optional
+        Maximum number of workers for concurrent extraction. Default is 1.
+
+    Methods
+    -------
+    extract(input_text)
+        Extract structured data from one or more documents.
+    """
+
+    def __init__(
+        self,
+        prompt_description: str,
+        examples: typing.Sequence[ExampleDataStructure],
+        model_id: str,
+        max_workers: int = 1,
+    ) -> None:
+        """Initialize the extractor.
+
+        Parameters
+        ----------
+        prompt_description : str
+            Prompt description used by LangExtract.
+        examples : Sequence[ExampleDataStructure]
+            Example payloads supplied to LangExtract.
+        model_id : str
+            Model identifier to pass to LangExtract.
+        max_workers : int, optional
+            Maximum number of workers for concurrent extraction. Default is 1.
+        """
+        if not examples:
+            raise ValueError(
+                "Examples are required for reliable extraction. "
+                "Provide at least one ExampleDataStructure instance."
+            )
+        self.model_id = model_id
+        self.prompt = prompt_description
+        self.examples = examples
+        self.max_workers = max_workers
+
+    def extract(
+        self, input_text: DocumentStructure | list[DocumentStructure]
+    ) -> list[AnnotatedDocumentStructure]:
+        """Run the extraction.
+
+        Parameters
+        ----------
+        input_text : DocumentStructure | list[DocumentStructure]
+            Document or list of documents to extract data from.
+
+        Returns
+        -------
+        list[AnnotatedDocumentStructure]
+            Extracted items for the provided documents.
+        """
+        if isinstance(input_text, DocumentStructure):
+            input_documents = [input_text]
+        else:
+            input_documents = input_text
+        documents = DocumentStructure.to_dataclass_list(input_documents)
+        examples = ExampleDataStructure.to_dataclass_list(self.examples)
+        resolver_params = {"format_handler": _SanitizingFormatHandler()}
+        result = lx.extract(
+            text_or_documents=documents,
+            prompt_description=self.prompt,
+            examples=examples,
+            model_id=self.model_id,  # Automatically selects OpenAI provider
+            api_key=os.environ.get("OPENAI_API_KEY"),
+            fence_output=True,
+            use_schema_constraints=False,
+            resolver_params=resolver_params,
+        )
+
+        def _convert(data: typing.Any) -> AnnotatedDocumentStructure:
+            if isinstance(data, LXAnnotatedDocument):
+                return AnnotatedDocumentStructure.from_dataclass(data)
+            return AnnotatedDocumentStructure.model_validate(data)
+
+        if isinstance(result, list):
+            return [_convert(doc) for doc in result]
+
+        return [_convert(result)]
+
+
+def _sanitize_extraction_items(
+    items: typing.Sequence[typing.Mapping[str, lx_format_handler.ExtractionValueType]],
+    attribute_suffix: str,
+) -> list[dict[str, lx_format_handler.ExtractionValueType]]:
+    sanitized: list[dict[str, lx_format_handler.ExtractionValueType]] = []
+    for item in items:
+        updated: dict[str, lx_format_handler.ExtractionValueType] = {}
+        for key, value in item.items():
+            keep, cleaned = _sanitize_extraction_value(key, value, attribute_suffix)
+            if not keep:
+                continue
+            updated[key] = cleaned
+        sanitized.append(updated)
+    return sanitized
+
+
+def _sanitize_extraction_value(
+    key: str,
+    value: lx_format_handler.ExtractionValueType,
+    attribute_suffix: str,
+) -> tuple[bool, lx_format_handler.ExtractionValueType]:
+    if value is None:
+        return False, None
+    if key.endswith(attribute_suffix):
+        if isinstance(value, dict):
+            return True, value
+        return False, None
+    if isinstance(value, (str, int, float)):
+        return True, value
+    return True, json.dumps(value, ensure_ascii=False)
+
+
+class _SanitizingFormatHandler(lx_format_handler.FormatHandler):
+    """Sanitize LangExtract output before the resolver validates types."""
+
+    def parse_output(
+        self, text: str, *, strict: bool | None = None
+    ) -> typing.Sequence[typing.Mapping[str, lx_format_handler.ExtractionValueType]]:
+        items = super().parse_output(text, strict=strict)
+        return _sanitize_extraction_items(items, self.attribute_suffix)
+
+
+__all__ = ["DocumentExtractor", "ExtractionError"]
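Only `DocumentExtractor`'s signature comes from this diff; the `DocumentStructure`/`ExampleDataStructure` field names in the sketch below (`text`, `extractions`) are assumptions mirroring LangExtract's own `ExampleData`, since `structure/extraction.py` is not shown here. `OPENAI_API_KEY` must be set in the environment, because `extract()` reads it directly.

from openai_sdk_helpers.extract import DocumentExtractor
from openai_sdk_helpers.structure.extraction import (  # field names below are assumed
    DocumentStructure,
    ExampleDataStructure,
)

example = ExampleDataStructure(  # assumed fields: text, extractions
    text="Invoice 123 issued to ACME Corp.",
    extractions=[{"extraction_class": "invoice_id", "extraction_text": "123"}],
)

extractor = DocumentExtractor(
    prompt_description="Extract invoice numbers and customer names.",
    examples=[example],      # must be non-empty, otherwise ValueError
    model_id="gpt-4o-mini",  # forwarded to lx.extract(model_id=...)
    max_workers=1,
)

# extract() accepts one DocumentStructure or a list and always returns a
# list[AnnotatedDocumentStructure].
annotated = extractor.extract(DocumentStructure(text="Invoice 456 for Globex."))
for document in annotated:
    print(document)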