gaik-0.2.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gaik/__init__.py ADDED
@@ -0,0 +1,23 @@
+ """General AI Kit (GAIK) - AI/ML toolkit for Python.
+
+ Multi-provider LLM support with structured data extraction.
+
+ Modules:
+     - gaik.extract: Structured data extraction
+     - gaik.providers: LLM provider interface (OpenAI, Anthropic, Azure, Google)
+     - gaik.parsers: PDF to Markdown parsing (vision models)
+
+ Example:
+     >>> from gaik.extract import SchemaExtractor
+     >>> extractor = SchemaExtractor("Extract name and age", provider="anthropic")
+     >>> results = extractor.extract(["Alice is 25"])
+ """
+
+ import importlib.metadata
+
+ try:
+     __version__ = importlib.metadata.version("gaik")
+ except importlib.metadata.PackageNotFoundError:
+     __version__ = "0.0.0.dev"
+
+ __all__ = ["__version__"]
gaik/extract/__init__.py ADDED
@@ -0,0 +1,26 @@
+ """Dynamic structured data extraction with LLM providers.
+
+ Extract structured data from unstructured text using Pydantic schemas
+ and provider-enforced structured outputs (OpenAI, Anthropic, Azure, Google).
+
+ Example:
+     >>> from gaik.extract import SchemaExtractor
+     >>> extractor = SchemaExtractor("Extract name and age")
+     >>> results = extractor.extract(["Alice is 25 years old"])
+ """
+
+ from gaik.extract.extractor import SchemaExtractor, dynamic_extraction_workflow
+ from gaik.extract.models import ExtractionRequirements, FieldSpec
+ from gaik.extract.utils import create_extraction_model, sanitize_model_name
+
+ __all__ = [
+     # Main API
+     "SchemaExtractor",
+     "dynamic_extraction_workflow",
+     # Models
+     "FieldSpec",
+     "ExtractionRequirements",
+     # Utilities
+     "create_extraction_model",
+     "sanitize_model_name",
+ ]
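
Putting the re-exports above together, a usage sketch based on the module's own docstring (illustrative; assumes a provider API key such as OPENAI_API_KEY is set in the environment):

    from gaik.extract import SchemaExtractor

    # One LLM call parses the description into field specs; each extract()
    # call then applies the generated schema to a document.
    extractor = SchemaExtractor("Extract name and age")
    results = extractor.extract(["Alice is 25 years old"])
    # e.g. [{"name": "Alice", "age": 25}]  # illustrative; field names are model-inferred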
gaik/extract/extractor.py ADDED
@@ -0,0 +1,314 @@
+ """Dynamic schema extraction with LangChain structured outputs.
+
+ This module provides the main API for extracting structured data from documents
+ using dynamically created Pydantic schemas and LangChain's structured outputs.
+ """
+
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any, Literal, cast
+
+ from gaik.extract.models import ExtractionRequirements
+ from gaik.extract.utils import create_extraction_model
+ from gaik.providers import get_provider
+ from langchain_core.language_models import BaseChatModel
+ from pydantic import BaseModel
+
+ if TYPE_CHECKING:
+     from gaik.extract.models import FieldSpec
+
+ # Type alias for supported providers
+ ProviderType = Literal["openai", "anthropic", "google", "azure"]
+
+
+ def _get_llm_client(
+     provider: ProviderType = "openai",
+     model: str | None = None,
+     api_key: str | None = None,
+     client: BaseChatModel | None = None,
+     **kwargs: Any,
+ ) -> BaseChatModel:
+     """Get or create an LLM client using the named provider.
+
+     Args:
+         provider: Provider name (e.g., "openai", "anthropic", "azure", "google").
+             Defaults to "openai".
+         model: Model name. If None, uses the provider's default model.
+         api_key: API key for authentication. If None, uses the environment variable.
+         client: Optional existing LangChain client to return as-is.
+         **kwargs: Additional provider-specific parameters.
+
+     Returns:
+         BaseChatModel: LangChain chat model instance.
+
+     Raises:
+         ValueError: If the provider is not recognized.
+     """
+     if client is not None:
+         return client
+
+     provider_obj = get_provider(provider)
+     return provider_obj.create_chat_model(model=model, api_key=api_key, **kwargs)
+
+
+ def _parse_user_requirements(
+     user_description: str,
+     llm_client: BaseChatModel,
+ ) -> ExtractionRequirements:
+     """Parse the user's natural language into structured field specifications.
+
+     Uses LangChain's structured outputs to ensure the response matches our schema.
+
+     Args:
+         user_description: Natural language description of what to extract.
+         llm_client: LangChain chat model instance.
+
+     Returns:
+         Parsed extraction requirements with field specifications.
+     """
+     structured_model = llm_client.with_structured_output(ExtractionRequirements)
+     response = structured_model.invoke(user_description)
+     return cast(ExtractionRequirements, response)
+
+
+ def _extract_from_document(
+     document_text: str,
+     extraction_model: type[BaseModel],
+     llm_client: BaseChatModel,
+ ) -> BaseModel:
+     """Extract structured data from a document using structured outputs.
+
+     The schema is enforced by LangChain's structured outputs API.
+
+     Args:
+         document_text: The document text to extract data from.
+         extraction_model: Pydantic model defining the extraction schema.
+         llm_client: LangChain chat model instance.
+
+     Returns:
+         Extracted data as a Pydantic model instance.
+     """
+     structured_model = llm_client.with_structured_output(extraction_model)
+     response = structured_model.invoke(document_text)
+     # LangChain's with_structured_output guarantees a BaseModel return
+     return cast(BaseModel, response)
+
+
+ class SchemaExtractor:
+     """Dynamic schema extractor using LangChain structured outputs.
+
+     This class lets you define extraction requirements once and reuse them
+     across multiple documents. It is more efficient than calling the workflow
+     function repeatedly when processing multiple documents with the same schema.
+
+     Attributes:
+         requirements: The parsed extraction requirements.
+         model: The dynamically created Pydantic model for extraction.
+         client: LangChain chat model instance.
+
+     Example:
+         >>> # Using the default OpenAI provider
+         >>> extractor = SchemaExtractor('''
+         ... Extract from invoices:
+         ... - Invoice number
+         ... - Date
+         ... - Total amount in USD
+         ... - Vendor name
+         ... ''')
+         >>> results = extractor.extract(documents)
+
+         >>> # Using the Anthropic provider
+         >>> extractor = SchemaExtractor(
+         ...     "Extract name and age",
+         ...     provider="anthropic"
+         ... )
+
+         >>> # Custom model
+         >>> extractor = SchemaExtractor(
+         ...     "Extract fields",
+         ...     provider="openai",
+         ...     model="gpt-4o"
+         ... )
+     """
+
+     def __init__(
+         self,
+         user_description: str | None = None,
+         *,
+         provider: ProviderType = "openai",
+         model: str | None = None,
+         api_key: str | None = None,
+         client: BaseChatModel | None = None,
+         requirements: ExtractionRequirements | None = None,
+         **kwargs: Any,
+     ):
+         """Initialize the schema extractor.
+
+         Args:
+             user_description: Natural language description of what to extract.
+                 Required if requirements is not provided.
+             provider: Provider name (e.g., "openai", "anthropic", "azure", "google").
+                 Defaults to "openai".
+             model: Model name. If None, uses the provider's default model.
+             api_key: API key for authentication. If None, uses the environment variable.
+             client: Optional custom LangChain chat model. If provided, provider,
+                 model, and api_key are ignored.
+             requirements: Optional pre-parsed extraction requirements. If provided,
+                 user_description is not needed.
+             **kwargs: Additional provider-specific parameters.
+
+         Raises:
+             ValueError: If neither user_description nor requirements is provided.
+         """
+         self.client = _get_llm_client(
+             provider=provider,
+             model=model,
+             api_key=api_key,
+             client=client,
+             **kwargs,
+         )
+
+         if requirements is not None:
+             self.requirements = requirements
+         elif user_description is not None:
+             self.requirements = _parse_user_requirements(user_description, self.client)
+         else:
+             raise ValueError("Either 'user_description' or 'requirements' must be provided")
+
+         self.model = create_extraction_model(self.requirements)
+
+     @property
+     def field_names(self) -> list[str]:
+         """Get the list of field names that will be extracted."""
+         return [f.field_name for f in self.requirements.fields]
+
+     @property
+     def fields(self) -> list[FieldSpec]:
+         """Get the field specifications for this extractor."""
+         return self.requirements.fields
+
+     def extract(self, documents: list[str]) -> list[dict]:
+         """Extract structured data from multiple documents.
+
+         Args:
+             documents: List of document texts to extract data from.
+
+         Returns:
+             List of extracted data as dictionaries.
+         """
+         results = []
+         for doc in documents:
+             extracted = _extract_from_document(doc, self.model, self.client)
+             results.append(extracted.model_dump())
+         return results
+
+     def extract_one(self, document: str) -> dict:
+         """Extract structured data from a single document.
+
+         Args:
+             document: Document text to extract data from.
+
+         Returns:
+             Extracted data as a dictionary.
+         """
+         extracted = _extract_from_document(document, self.model, self.client)
+         return extracted.model_dump()
+
+
+ def dynamic_extraction_workflow(
+     user_description: str,
+     documents: list[str],
+     *,
+     provider: ProviderType = "openai",
+     model: str | None = None,
+     api_key: str | None = None,
+     client: BaseChatModel | None = None,
+     verbose: bool = False,
+     **kwargs: Any,
+ ) -> list[dict]:
+     """Complete workflow from natural language description to structured extraction.
+
+     This convenience function combines all steps:
+     1. Parse user requirements into field specifications
+     2. Create a dynamic Pydantic schema from the specifications
+     3. Extract data using structured outputs (guaranteed format)
+
+     For better performance when processing multiple batches with the same schema,
+     use SchemaExtractor instead.
+
+     Args:
+         user_description: Natural language description of what to extract.
+         documents: List of document texts to extract data from.
+         provider: Provider name (e.g., "openai", "anthropic", "azure", "google").
+             Defaults to "openai".
+         model: Model name. If None, uses the provider's default model.
+         api_key: API key for authentication. If None, uses the environment variable.
+         client: Optional custom LangChain chat model. If provided, provider,
+             model, and api_key are ignored.
+         verbose: If True, prints progress information.
+         **kwargs: Additional provider-specific parameters.
+
+     Returns:
+         List of extracted data as dictionaries.
+
+     Example:
+         >>> # Using the default OpenAI provider
+         >>> results = dynamic_extraction_workflow(
+         ...     user_description='''
+         ...     Extract project title, budget in euros, and partner countries
+         ...     ''',
+         ...     documents=[doc1, doc2, doc3]
+         ... )
+
+         >>> # Using the Anthropic provider
+         >>> results = dynamic_extraction_workflow(
+         ...     user_description="Extract name and age",
+         ...     documents=documents,
+         ...     provider="anthropic"
+         ... )
+
+     Advantages:
+         - Reliable: the API enforces schema compliance
+         - Efficient: minimal API calls needed
+         - Safe: no code execution or eval()
+         - Type-safe: full Pydantic validation
+     """
+     llm_client = _get_llm_client(
+         provider=provider,
+         model=model,
+         api_key=api_key,
+         client=client,
+         **kwargs,
+     )
+
+     if verbose:
+         print("Step 1: Parsing user requirements...")
+
+     requirements = _parse_user_requirements(user_description, llm_client)
+
+     if verbose:
+         print(f"[OK] Identified {len(requirements.fields)} fields to extract")
+         print(f"     Fields: {[f.field_name for f in requirements.fields]}")
+         print("\nStep 2: Creating dynamic Pydantic schema...")
+
+     extraction_model = create_extraction_model(requirements)
+
+     if verbose:
+         print(f"[OK] Created schema: {extraction_model.__name__}")
+         print(f"     Schema: {extraction_model.model_json_schema()}")
+         print("\nStep 3: Extracting from documents...")
+
+     results = []
+     for i, doc in enumerate(documents):
+         if verbose:
+             print(f"  Processing document {i + 1}/{len(documents)}...")
+         extracted = _extract_from_document(doc, extraction_model, llm_client)
+         results.append(extracted.model_dump())
+
+     if verbose:
+         print(f"[OK] Extracted data from {len(documents)} documents")
+
+     return results
+
+
+ __all__ = ["SchemaExtractor", "dynamic_extraction_workflow"]
gaik/extract/models.py ADDED
@@ -0,0 +1,44 @@
+ """Pydantic models for dynamic schema extraction.
+
+ This module defines the data structures used to specify extraction requirements
+ and field specifications for creating dynamic Pydantic schemas.
+ """
+
+ from __future__ import annotations
+
+ from typing import Literal
+
+ from pydantic import BaseModel, Field
+
+
+ class FieldSpec(BaseModel):
+     """Specification for a single field to extract.
+
+     Attributes:
+         field_name: Snake_case field name (e.g., 'project_title')
+         field_type: Python type for this field
+         description: What this field represents and how to extract it
+         required: Whether this field is required in the extraction
+     """
+
+     field_name: str = Field(description="Snake_case field name (e.g., 'project_title')")
+     field_type: Literal["str", "int", "float", "bool", "list[str]"] = Field(
+         description="Python type for this field"
+     )
+     description: str = Field(description="What this field represents")
+     required: bool = Field(default=True, description="Whether this field is required")
+
+
+ class ExtractionRequirements(BaseModel):
+     """Parsed extraction requirements from user input.
+
+     Attributes:
+         use_case_name: Name for this extraction use case
+         fields: List of fields to extract from documents
+     """
+
+     use_case_name: str = Field(description="Name for this extraction use case")
+     fields: list[FieldSpec] = Field(description="List of fields to extract")
+
+
+ __all__ = ["FieldSpec", "ExtractionRequirements"]
gaik/extract/utils.py ADDED
@@ -0,0 +1,119 @@
+ """Utility functions for creating dynamic Pydantic models.
+
+ This module provides helper functions for sanitizing model names and creating
+ dynamic Pydantic schemas from field specifications.
+ """
+
+ from __future__ import annotations
+
+ import re
+ from typing import TYPE_CHECKING
+
+ from pydantic import BaseModel, Field, create_model
+
+ if TYPE_CHECKING:
+     from gaik.extract.models import ExtractionRequirements
+
+
+ def sanitize_model_name(name: str) -> str:
+     """Sanitize a model name to match OpenAI's requirements.
+
+     Only alphanumeric characters, underscores, and hyphens are allowed.
+     Removes invalid characters and normalizes the name.
+
+     Args:
+         name: The raw model name to sanitize
+
+     Returns:
+         A sanitized model name safe for the OpenAI API
+
+     Example:
+         >>> sanitize_model_name("My Project! (2024)")
+         'My_Project_2024'
+     """
+     # Replace spaces and other invalid characters with underscores
+     sanitized = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
+     # Collapse consecutive underscores into one
+     sanitized = re.sub(r"_+", "_", sanitized)
+     # Remove leading/trailing underscores
+     sanitized = sanitized.strip("_")
+     return sanitized
+
+
+ def create_extraction_model(requirements: ExtractionRequirements) -> type[BaseModel]:
+     """Create a Pydantic model dynamically from field specifications.
+
+     This is type-safe and doesn't require code generation or eval().
+     The resulting model can be used with OpenAI's structured outputs API.
+
+     Args:
+         requirements: Extraction requirements containing field specifications
+
+     Returns:
+         A dynamically created Pydantic model class
+
+     Example:
+         >>> from gaik.extract.models import ExtractionRequirements, FieldSpec
+         >>> requirements = ExtractionRequirements(
+         ...     use_case_name="Invoice",
+         ...     fields=[
+         ...         FieldSpec(
+         ...             field_name="invoice_number",
+         ...             field_type="str",
+         ...             description="The invoice number",
+         ...             required=True
+         ...         ),
+         ...         FieldSpec(
+         ...             field_name="amount",
+         ...             field_type="float",
+         ...             description="Total amount in USD",
+         ...             required=True
+         ...         )
+         ...     ]
+         ... )
+         >>> invoice_model = create_extraction_model(requirements)
+         >>> invoice_model.__name__
+         'Invoice_Extraction'
+     """
+     # Map string type names to actual Python types
+     type_mapping = {
+         "str": str,
+         "int": int,
+         "float": float,
+         "bool": bool,
+         "list[str]": list[str],
+     }
+
+     # Build field definitions for create_model()
+     field_definitions = {}
+
+     for field_spec in requirements.fields:
+         python_type = type_mapping[field_spec.field_type]
+
+         if field_spec.required:
+             # Required field
+             field_definitions[field_spec.field_name] = (
+                 python_type,
+                 Field(description=field_spec.description),
+             )
+         else:
+             # Optional field
+             field_definitions[field_spec.field_name] = (
+                 python_type | None,
+                 Field(default=None, description=field_spec.description),
+             )
+
+     # Sanitize the model name for OpenAI compatibility
+     model_name = sanitize_model_name(requirements.use_case_name) + "_Extraction"
+
+     # Create the model dynamically using Pydantic's built-in factory
+     dynamic_model = create_model(
+         model_name,
+         __doc__=f"Extraction model for {requirements.use_case_name}",
+         **field_definitions,
+     )
+
+     return dynamic_model
+
+
+ __all__ = ["sanitize_model_name", "create_extraction_model"]
gaik/parsers/__init__.py ADDED
@@ -0,0 +1,9 @@
+ """Reusable vision-oriented parsers.
+
+ The submodule currently exposes :class:`VisionParser` for converting PDFs to
+ Markdown using OpenAI vision models.
+ """
+
+ from .vision import OpenAIConfig, VisionParser, get_openai_config
+
+ __all__ = ["OpenAIConfig", "VisionParser", "get_openai_config"]