gaik 0.2.17__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- gaik-0.2.17/.gitignore +1 -0
- gaik-0.2.17/LICENSE +21 -0
- gaik-0.2.17/PKG-INFO +291 -0
- gaik-0.2.17/README.md +214 -0
- gaik-0.2.17/pyproject.toml +131 -0
- gaik-0.2.17/scripts/README.md +28 -0
- gaik-0.2.17/scripts/verify_installation.py +67 -0
- gaik-0.2.17/setup.cfg +4 -0
- gaik-0.2.17/src/gaik/__init__.py +23 -0
- gaik-0.2.17/src/gaik/_version.py +34 -0
- gaik-0.2.17/src/gaik/parsers/__init__.py +21 -0
- gaik-0.2.17/src/gaik/parsers/docling.py +460 -0
- gaik-0.2.17/src/gaik/parsers/pymypdf.py +232 -0
- gaik-0.2.17/src/gaik/parsers/vision.py +378 -0
- gaik-0.2.17/src/gaik.egg-info/PKG-INFO +291 -0
- gaik-0.2.17/src/gaik.egg-info/SOURCES.txt +17 -0
- gaik-0.2.17/src/gaik.egg-info/dependency_links.txt +1 -0
- gaik-0.2.17/src/gaik.egg-info/requires.txt +36 -0
- gaik-0.2.17/src/gaik.egg-info/top_level.txt +1 -0
gaik-0.2.17/.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
src/gaik/_version.py
|
gaik-0.2.17/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 GAIK - GenAI for knowledge mgt
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
gaik-0.2.17/PKG-INFO
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: gaik
|
|
3
|
+
Version: 0.2.17
|
|
4
|
+
Summary: General AI Kit - Reusable AI/ML components for Python
|
|
5
|
+
Author: GAIK Project
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2025 GAIK - GenAI for knowledge mgt
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://gaik.ai/
|
|
29
|
+
Project-URL: Repository, https://github.com/GAIK-project/gaik-toolkit
|
|
30
|
+
Project-URL: Documentation, https://github.com/GAIK-project/gaik-toolkit/tree/main/packages/python/gaik
|
|
31
|
+
Project-URL: Issues, https://github.com/GAIK-project/gaik-toolkit/issues
|
|
32
|
+
Keywords: ai,ml,langchain,openai,anthropic,google,structured-outputs,pydantic,schema,extraction
|
|
33
|
+
Classifier: Development Status :: 3 - Alpha
|
|
34
|
+
Classifier: Intended Audience :: Developers
|
|
35
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
36
|
+
Classifier: Programming Language :: Python :: 3
|
|
37
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
38
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
39
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
40
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
41
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
42
|
+
Requires-Python: >=3.10
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
License-File: LICENSE
|
|
45
|
+
Requires-Dist: pydantic>=2.12.4
|
|
46
|
+
Provides-Extra: extract
|
|
47
|
+
Provides-Extra: parser
|
|
48
|
+
Requires-Dist: openai>=2.7; extra == "parser"
|
|
49
|
+
Requires-Dist: PyMuPDF>=1.23.0; extra == "parser"
|
|
50
|
+
Requires-Dist: python-dotenv>=1.0.0; extra == "parser"
|
|
51
|
+
Requires-Dist: docling; extra == "parser"
|
|
52
|
+
Requires-Dist: psutil; extra == "parser"
|
|
53
|
+
Provides-Extra: all
|
|
54
|
+
Requires-Dist: gaik[extract]; extra == "all"
|
|
55
|
+
Requires-Dist: gaik[parser]; extra == "all"
|
|
56
|
+
Provides-Extra: dev
|
|
57
|
+
Requires-Dist: ruff>=0.14.1; extra == "dev"
|
|
58
|
+
Requires-Dist: build>=1.0; extra == "dev"
|
|
59
|
+
Requires-Dist: twine>=4.0; extra == "dev"
|
|
60
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
61
|
+
Requires-Dist: pytest-cov>=4.1; extra == "dev"
|
|
62
|
+
Requires-Dist: pytest-mock>=3.12; extra == "dev"
|
|
63
|
+
Requires-Dist: tomli>=2.0.1; extra == "dev"
|
|
64
|
+
Provides-Extra: ci
|
|
65
|
+
Requires-Dist: langchain-core>=1.0.3; extra == "ci"
|
|
66
|
+
Requires-Dist: langchain-openai>=1.0.2; extra == "ci"
|
|
67
|
+
Requires-Dist: langchain-anthropic>=1.0.1; extra == "ci"
|
|
68
|
+
Requires-Dist: langchain-google-genai>=3.0.1; extra == "ci"
|
|
69
|
+
Requires-Dist: ruff>=0.14.1; extra == "ci"
|
|
70
|
+
Requires-Dist: build>=1.0; extra == "ci"
|
|
71
|
+
Requires-Dist: twine>=4.0; extra == "ci"
|
|
72
|
+
Requires-Dist: pytest>=8.0; extra == "ci"
|
|
73
|
+
Requires-Dist: pytest-cov>=4.1; extra == "ci"
|
|
74
|
+
Requires-Dist: pytest-mock>=3.12; extra == "ci"
|
|
75
|
+
Requires-Dist: tomli>=2.0.1; extra == "ci"
|
|
76
|
+
Dynamic: license-file
|
|
77
|
+
|
|
78
|
+
# GAIK - General AI Kit
|
|
79
|
+
|
|
80
|
+
Multi-provider AI toolkit for Python with structured data extraction and document parsing.
|
|
81
|
+
|
|
82
|
+
## Installation
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
# Extract features (OpenAI, Anthropic, Google, Azure)
|
|
86
|
+
pip install "gaik[extract]"
|
|
87
|
+
|
|
88
|
+
# PDF parsing
|
|
89
|
+
pip install "gaik[parser]"
|
|
90
|
+
|
|
91
|
+
# All features
|
|
92
|
+
pip install "gaik[all]"
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
## Quick Start
|
|
96
|
+
|
|
97
|
+
### Extract Data
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
from gaik.extract import SchemaExtractor
|
|
101
|
+
|
|
102
|
+
# Set API key first: export OPENAI_API_KEY='sk-...'
|
|
103
|
+
extractor = SchemaExtractor("Extract name and age from text")
|
|
104
|
+
result = extractor.extract_one("Alice is 25 years old")
|
|
105
|
+
print(result) # {'name': 'Alice', 'age': 25}
|
|
106
|
+
|
|
107
|
+
# Switch provider
|
|
108
|
+
extractor = SchemaExtractor("Extract name and age", provider="anthropic") # or "google", "azure"
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Parse PDF to Markdown
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
from gaik.parsers import VisionParser, get_openai_config
|
|
115
|
+
|
|
116
|
+
# Set environment: AZURE_API_KEY, AZURE_ENDPOINT, AZURE_DEPLOYMENT
|
|
117
|
+
config = get_openai_config(use_azure=True)
|
|
118
|
+
parser = VisionParser(config)
|
|
119
|
+
|
|
120
|
+
pages = parser.convert_pdf("invoice.pdf", clean_output=True)
|
|
121
|
+
markdown = "\n\n".join(pages)
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
### Fast Local PDF Parsing
|
|
125
|
+
|
|
126
|
+
```python
|
|
127
|
+
from gaik.parsers import PyMuPDFParser
|
|
128
|
+
|
|
129
|
+
parser = PyMuPDFParser()
|
|
130
|
+
result = parser.parse_document("document.pdf")
|
|
131
|
+
print(result["text_content"])
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
## Features
|
|
135
|
+
|
|
136
|
+
### 🔍 Structured Data Extraction
|
|
137
|
+
|
|
138
|
+
- **Multi-provider** - OpenAI, Anthropic, Google, Azure
|
|
139
|
+
- **Type-safe** - Full Pydantic validation
|
|
140
|
+
- **API-enforced** - Guaranteed schema compliance
|
|
141
|
+
- **Simple** - Natural language to structured data
|
|
142
|
+
|
|
143
|
+
### 📄 Document Parsing
|
|
144
|
+
|
|
145
|
+
- **VisionParser** - PDF to Markdown using vision models
|
|
146
|
+
- **PyMuPDFParser** - Fast local text extraction
|
|
147
|
+
- **No external binaries** - Pure Python dependencies
|
|
148
|
+
|
|
149
|
+
## API Reference
|
|
150
|
+
|
|
151
|
+
### Extraction
|
|
152
|
+
|
|
153
|
+
```python
|
|
154
|
+
SchemaExtractor(
|
|
155
|
+
user_description: str,
|
|
156
|
+
provider: Literal["openai", "anthropic", "google", "azure"] = "openai",
|
|
157
|
+
model: str | None = None,
|
|
158
|
+
api_key: str | None = None,
|
|
159
|
+
)
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
**Methods:**
|
|
163
|
+
- `extract_one(text: str) -> dict` - Extract from single text
|
|
164
|
+
- `extract(texts: list[str]) -> list[dict]` - Batch extraction
|
|
165
|
+
- `field_names` - List of field names
|
|
166
|
+
- `model` - Generated Pydantic model
|
|
167
|
+
|
|
168
|
+
### Vision Parser
|
|
169
|
+
|
|
170
|
+
```python
|
|
171
|
+
VisionParser(
|
|
172
|
+
config: OpenAIConfig,
|
|
173
|
+
custom_prompt: str | None = None,
|
|
174
|
+
use_context: bool = True,
|
|
175
|
+
max_tokens: int = 16_000,
|
|
176
|
+
)
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
**Methods:**
|
|
180
|
+
- `convert_pdf(pdf_path: str, dpi: int = 200, clean_output: bool = True) -> list[str]`
|
|
181
|
+
- `save_markdown(pages: list[str], output_path: str)`
|
|
182
|
+
|
|
183
|
+
**Config Helper:**
|
|
184
|
+
```python
|
|
185
|
+
get_openai_config(use_azure: bool = True) -> OpenAIConfig
|
|
186
|
+
```
|
|
187
|
+
|
|
188
|
+
### PyMuPDF Parser
|
|
189
|
+
|
|
190
|
+
```python
|
|
191
|
+
PyMuPDFParser()
|
|
192
|
+
```
|
|
193
|
+
|
|
194
|
+
**Methods:**
|
|
195
|
+
- `parse_document(file_path: str) -> dict` - Extract text and metadata
|
|
196
|
+
|
|
197
|
+
## Environment Variables
|
|
198
|
+
|
|
199
|
+
| Provider | Variables |
|
|
200
|
+
|----------|-----------|
|
|
201
|
+
| OpenAI | `OPENAI_API_KEY` |
|
|
202
|
+
| Anthropic | `ANTHROPIC_API_KEY` |
|
|
203
|
+
| Google | `GOOGLE_API_KEY` |
|
|
204
|
+
| Azure | `AZURE_API_KEY`, `AZURE_ENDPOINT`, `AZURE_DEPLOYMENT` |
|
|
205
|
+
|
|
206
|
+
## Default Models
|
|
207
|
+
|
|
208
|
+
| Provider | Model |
|
|
209
|
+
|----------|-------|
|
|
210
|
+
| OpenAI | `gpt-4.1` |
|
|
211
|
+
| Anthropic | `claude-sonnet-4-5-20250929` |
|
|
212
|
+
| Google | `gemini-2.5-flash` |
|
|
213
|
+
| Azure | User's deployment |
|
|
214
|
+
|
|
215
|
+
## Batch Processing
|
|
216
|
+
|
|
217
|
+
```python
|
|
218
|
+
extractor = SchemaExtractor("""
|
|
219
|
+
Extract:
|
|
220
|
+
- invoice_number: Invoice ID
|
|
221
|
+
- amount: Total in USD
|
|
222
|
+
- vendor: Company name
|
|
223
|
+
""")
|
|
224
|
+
|
|
225
|
+
documents = [
|
|
226
|
+
"Invoice #12345 from Acme Corp. Total: $1,500",
|
|
227
|
+
"INV-67890, Supplier: TechCo, Amount: $2,750"
|
|
228
|
+
]
|
|
229
|
+
|
|
230
|
+
results = extractor.extract(documents)
|
|
231
|
+
for result in results:
|
|
232
|
+
print(f"Invoice: {result['invoice_number']}, Amount: ${result['amount']}")
|
|
233
|
+
```
|
|
234
|
+
|
|
235
|
+
## Schema Inspection
|
|
236
|
+
|
|
237
|
+
```python
|
|
238
|
+
extractor = SchemaExtractor("Extract name and age")
|
|
239
|
+
|
|
240
|
+
# Field names
|
|
241
|
+
print(extractor.field_names) # ['name', 'age']
|
|
242
|
+
|
|
243
|
+
# JSON schema
|
|
244
|
+
schema = extractor.model.model_json_schema()
|
|
245
|
+
|
|
246
|
+
# Field specs
|
|
247
|
+
for field in extractor.fields:
|
|
248
|
+
print(f"{field.field_name}: {field.field_type}")
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
## Advanced Usage
|
|
252
|
+
|
|
253
|
+
### Custom Prompt for Vision Parser
|
|
254
|
+
|
|
255
|
+
```python
|
|
256
|
+
custom_prompt = """
|
|
257
|
+
Convert document to markdown:
|
|
258
|
+
- Preserve all tables
|
|
259
|
+
- Include headers and footers
|
|
260
|
+
- Maintain layout structure
|
|
261
|
+
"""
|
|
262
|
+
|
|
263
|
+
parser = VisionParser(config, custom_prompt=custom_prompt)
|
|
264
|
+
```
|
|
265
|
+
|
|
266
|
+
### Pre-defined Schema
|
|
267
|
+
|
|
268
|
+
```python
|
|
269
|
+
from gaik.extract import FieldSpec, ExtractionRequirements, SchemaExtractor, create_extraction_model
|
|
270
|
+
|
|
271
|
+
requirements = ExtractionRequirements(
|
|
272
|
+
use_case_name="Invoice",
|
|
273
|
+
fields=[
|
|
274
|
+
FieldSpec("invoice_number", "str", "Invoice ID", required=True),
|
|
275
|
+
FieldSpec("amount", "float", "Total amount", required=True),
|
|
276
|
+
]
|
|
277
|
+
)
|
|
278
|
+
|
|
279
|
+
InvoiceModel = create_extraction_model(requirements)
|
|
280
|
+
extractor = SchemaExtractor(requirements=requirements)
|
|
281
|
+
```
|
|
282
|
+
|
|
283
|
+
## Resources
|
|
284
|
+
|
|
285
|
+
- **Examples**: [examples/](https://github.com/GAIK-project/gaik-toolkit/tree/main/packages/examples)
|
|
286
|
+
- **Repository**: [github.com/GAIK-project/gaik-toolkit](https://github.com/GAIK-project/gaik-toolkit)
|
|
287
|
+
- **Contributing**: [CONTRIBUTING.md](https://github.com/GAIK-project/gaik-toolkit/blob/main/CONTRIBUTING.md)
|
|
288
|
+
|
|
289
|
+
## License
|
|
290
|
+
|
|
291
|
+
MIT - see [LICENSE](https://github.com/GAIK-project/gaik-toolkit/blob/main/LICENSE)
|
gaik-0.2.17/README.md
ADDED
|
@@ -0,0 +1,214 @@
|
|
|
1
|
+
# GAIK - General AI Kit
|
|
2
|
+
|
|
3
|
+
Multi-provider AI toolkit for Python with structured data extraction and document parsing.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Extract features (OpenAI, Anthropic, Google, Azure)
|
|
9
|
+
pip install "gaik[extract]"
|
|
10
|
+
|
|
11
|
+
# PDF parsing
|
|
12
|
+
pip install "gaik[parser]"
|
|
13
|
+
|
|
14
|
+
# All features
|
|
15
|
+
pip install "gaik[all]"
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Quick Start
|
|
19
|
+
|
|
20
|
+
### Extract Data
|
|
21
|
+
|
|
22
|
+
```python
|
|
23
|
+
from gaik.extract import SchemaExtractor
|
|
24
|
+
|
|
25
|
+
# Set API key first: export OPENAI_API_KEY='sk-...'
|
|
26
|
+
extractor = SchemaExtractor("Extract name and age from text")
|
|
27
|
+
result = extractor.extract_one("Alice is 25 years old")
|
|
28
|
+
print(result) # {'name': 'Alice', 'age': 25}
|
|
29
|
+
|
|
30
|
+
# Switch provider
|
|
31
|
+
extractor = SchemaExtractor("Extract name and age", provider="anthropic") # or "google", "azure"
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### Parse PDF to Markdown
|
|
35
|
+
|
|
36
|
+
```python
|
|
37
|
+
from gaik.parsers import VisionParser, get_openai_config
|
|
38
|
+
|
|
39
|
+
# Set environment: AZURE_API_KEY, AZURE_ENDPOINT, AZURE_DEPLOYMENT
|
|
40
|
+
config = get_openai_config(use_azure=True)
|
|
41
|
+
parser = VisionParser(config)
|
|
42
|
+
|
|
43
|
+
pages = parser.convert_pdf("invoice.pdf", clean_output=True)
|
|
44
|
+
markdown = "\n\n".join(pages)
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
### Fast Local PDF Parsing
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from gaik.parsers import PyMuPDFParser
|
|
51
|
+
|
|
52
|
+
parser = PyMuPDFParser()
|
|
53
|
+
result = parser.parse_document("document.pdf")
|
|
54
|
+
print(result["text_content"])
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Features
|
|
58
|
+
|
|
59
|
+
### 🔍 Structured Data Extraction
|
|
60
|
+
|
|
61
|
+
- **Multi-provider** - OpenAI, Anthropic, Google, Azure
|
|
62
|
+
- **Type-safe** - Full Pydantic validation
|
|
63
|
+
- **API-enforced** - Guaranteed schema compliance
|
|
64
|
+
- **Simple** - Natural language to structured data
|
|
65
|
+
|
|
66
|
+
### 📄 Document Parsing
|
|
67
|
+
|
|
68
|
+
- **VisionParser** - PDF to Markdown using vision models
|
|
69
|
+
- **PyMuPDFParser** - Fast local text extraction
|
|
70
|
+
- **No external binaries** - Pure Python dependencies
|
|
71
|
+
|
|
72
|
+
## API Reference
|
|
73
|
+
|
|
74
|
+
### Extraction
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
SchemaExtractor(
|
|
78
|
+
user_description: str,
|
|
79
|
+
provider: Literal["openai", "anthropic", "google", "azure"] = "openai",
|
|
80
|
+
model: str | None = None,
|
|
81
|
+
api_key: str | None = None,
|
|
82
|
+
)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
**Methods:**
|
|
86
|
+
- `extract_one(text: str) -> dict` - Extract from single text
|
|
87
|
+
- `extract(texts: list[str]) -> list[dict]` - Batch extraction
|
|
88
|
+
- `field_names` - List of field names
|
|
89
|
+
- `model` - Generated Pydantic model
|
|
90
|
+
|
|
91
|
+
### Vision Parser
|
|
92
|
+
|
|
93
|
+
```python
|
|
94
|
+
VisionParser(
|
|
95
|
+
config: OpenAIConfig,
|
|
96
|
+
custom_prompt: str | None = None,
|
|
97
|
+
use_context: bool = True,
|
|
98
|
+
max_tokens: int = 16_000,
|
|
99
|
+
)
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
**Methods:**
|
|
103
|
+
- `convert_pdf(pdf_path: str, dpi: int = 200, clean_output: bool = True) -> list[str]`
|
|
104
|
+
- `save_markdown(pages: list[str], output_path: str)`
|
|
105
|
+
|
|
106
|
+
**Config Helper:**
|
|
107
|
+
```python
|
|
108
|
+
get_openai_config(use_azure: bool = True) -> OpenAIConfig
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### PyMuPDF Parser
|
|
112
|
+
|
|
113
|
+
```python
|
|
114
|
+
PyMuPDFParser()
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
**Methods:**
|
|
118
|
+
- `parse_document(file_path: str) -> dict` - Extract text and metadata
|
|
119
|
+
|
|
120
|
+
## Environment Variables
|
|
121
|
+
|
|
122
|
+
| Provider | Variables |
|
|
123
|
+
|----------|-----------|
|
|
124
|
+
| OpenAI | `OPENAI_API_KEY` |
|
|
125
|
+
| Anthropic | `ANTHROPIC_API_KEY` |
|
|
126
|
+
| Google | `GOOGLE_API_KEY` |
|
|
127
|
+
| Azure | `AZURE_API_KEY`, `AZURE_ENDPOINT`, `AZURE_DEPLOYMENT` |
|
|
128
|
+
|
|
129
|
+
## Default Models
|
|
130
|
+
|
|
131
|
+
| Provider | Model |
|
|
132
|
+
|----------|-------|
|
|
133
|
+
| OpenAI | `gpt-4.1` |
|
|
134
|
+
| Anthropic | `claude-sonnet-4-5-20250929` |
|
|
135
|
+
| Google | `gemini-2.5-flash` |
|
|
136
|
+
| Azure | User's deployment |
|
|
137
|
+
|
|
138
|
+
## Batch Processing
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
extractor = SchemaExtractor("""
|
|
142
|
+
Extract:
|
|
143
|
+
- invoice_number: Invoice ID
|
|
144
|
+
- amount: Total in USD
|
|
145
|
+
- vendor: Company name
|
|
146
|
+
""")
|
|
147
|
+
|
|
148
|
+
documents = [
|
|
149
|
+
"Invoice #12345 from Acme Corp. Total: $1,500",
|
|
150
|
+
"INV-67890, Supplier: TechCo, Amount: $2,750"
|
|
151
|
+
]
|
|
152
|
+
|
|
153
|
+
results = extractor.extract(documents)
|
|
154
|
+
for result in results:
|
|
155
|
+
print(f"Invoice: {result['invoice_number']}, Amount: ${result['amount']}")
|
|
156
|
+
```
|
|
157
|
+
|
|
158
|
+
## Schema Inspection
|
|
159
|
+
|
|
160
|
+
```python
|
|
161
|
+
extractor = SchemaExtractor("Extract name and age")
|
|
162
|
+
|
|
163
|
+
# Field names
|
|
164
|
+
print(extractor.field_names) # ['name', 'age']
|
|
165
|
+
|
|
166
|
+
# JSON schema
|
|
167
|
+
schema = extractor.model.model_json_schema()
|
|
168
|
+
|
|
169
|
+
# Field specs
|
|
170
|
+
for field in extractor.fields:
|
|
171
|
+
print(f"{field.field_name}: {field.field_type}")
|
|
172
|
+
```
|
|
173
|
+
|
|
174
|
+
## Advanced Usage
|
|
175
|
+
|
|
176
|
+
### Custom Prompt for Vision Parser
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
custom_prompt = """
|
|
180
|
+
Convert document to markdown:
|
|
181
|
+
- Preserve all tables
|
|
182
|
+
- Include headers and footers
|
|
183
|
+
- Maintain layout structure
|
|
184
|
+
"""
|
|
185
|
+
|
|
186
|
+
parser = VisionParser(config, custom_prompt=custom_prompt)
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
### Pre-defined Schema
|
|
190
|
+
|
|
191
|
+
```python
|
|
192
|
+
from gaik.extract import FieldSpec, ExtractionRequirements, SchemaExtractor, create_extraction_model
|
|
193
|
+
|
|
194
|
+
requirements = ExtractionRequirements(
|
|
195
|
+
use_case_name="Invoice",
|
|
196
|
+
fields=[
|
|
197
|
+
FieldSpec("invoice_number", "str", "Invoice ID", required=True),
|
|
198
|
+
FieldSpec("amount", "float", "Total amount", required=True),
|
|
199
|
+
]
|
|
200
|
+
)
|
|
201
|
+
|
|
202
|
+
InvoiceModel = create_extraction_model(requirements)
|
|
203
|
+
extractor = SchemaExtractor(requirements=requirements)
|
|
204
|
+
```
|
|
205
|
+
|
|
206
|
+
## Resources
|
|
207
|
+
|
|
208
|
+
- **Examples**: [examples/](https://github.com/GAIK-project/gaik-toolkit/tree/main/packages/examples)
|
|
209
|
+
- **Repository**: [github.com/GAIK-project/gaik-toolkit](https://github.com/GAIK-project/gaik-toolkit)
|
|
210
|
+
- **Contributing**: [CONTRIBUTING.md](https://github.com/GAIK-project/gaik-toolkit/blob/main/CONTRIBUTING.md)
|
|
211
|
+
|
|
212
|
+
## License
|
|
213
|
+
|
|
214
|
+
MIT - see [LICENSE](https://github.com/GAIK-project/gaik-toolkit/blob/main/LICENSE)
|
gaik-0.2.17/pyproject.toml
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "gaik"
|
|
3
|
+
dynamic = ["version"]
|
|
4
|
+
description = "General AI Kit - Reusable AI/ML components for Python"
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.10"
|
|
7
|
+
license = { file = "LICENSE" }
|
|
8
|
+
authors = [{ name = "GAIK Project" }]
|
|
9
|
+
keywords = [
|
|
10
|
+
"ai",
|
|
11
|
+
"ml",
|
|
12
|
+
"langchain",
|
|
13
|
+
"openai",
|
|
14
|
+
"anthropic",
|
|
15
|
+
"google",
|
|
16
|
+
"structured-outputs",
|
|
17
|
+
"pydantic",
|
|
18
|
+
"schema",
|
|
19
|
+
"extraction",
|
|
20
|
+
]
|
|
21
|
+
classifiers = [
|
|
22
|
+
"Development Status :: 3 - Alpha",
|
|
23
|
+
"Intended Audience :: Developers",
|
|
24
|
+
"License :: OSI Approved :: MIT License",
|
|
25
|
+
"Programming Language :: Python :: 3",
|
|
26
|
+
"Programming Language :: Python :: 3.10",
|
|
27
|
+
"Programming Language :: Python :: 3.11",
|
|
28
|
+
"Programming Language :: Python :: 3.12",
|
|
29
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
30
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
31
|
+
]
|
|
32
|
+
|
|
33
|
+
dependencies = [
|
|
34
|
+
# Core runtime requirement shared across all installs
|
|
35
|
+
"pydantic>=2.12.4",
|
|
36
|
+
]
|
|
37
|
+
|
|
38
|
+
[project.optional-dependencies]
|
|
39
|
+
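# Data extraction with all LLM providers (OpenAI, Anthropic, Google, Azure). NOTE(review): this extra currently lists no packages; the langchain provider packages appear only under "ci" - confirm this is intended.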
|
|
40
|
+
extract = [
|
|
41
|
+
]
|
|
42
|
+
|
|
43
|
+
# Document parsing (PDF, images, etc.)
|
|
44
|
+
parser = [
|
|
45
|
+
# Required to call OpenAI or Azure OpenAI vision endpoints
|
|
46
|
+
"openai>=2.7",
|
|
47
|
+
# PDF parsing and image conversion (no Pillow required)
|
|
48
|
+
"PyMuPDF>=1.23.0",
|
|
49
|
+
# Environment variable loading
|
|
50
|
+
"python-dotenv>=1.0.0",
|
|
51
|
+
"docling",
|
|
52
|
+
"psutil"
|
|
53
|
+
]
|
|
54
|
+
|
|
55
|
+
# All features
|
|
56
|
+
all = [
|
|
57
|
+
"gaik[extract]",
|
|
58
|
+
"gaik[parser]",
|
|
59
|
+
]
|
|
60
|
+
|
|
61
|
+
# Development tooling for maintainers only (pip install -e ".[dev]")
|
|
62
|
+
dev = [
|
|
63
|
+
"ruff>=0.14.1", # Linter/formatter: ruff check . / ruff format .
|
|
64
|
+
"build>=1.0", # Package builder: python -m build
|
|
65
|
+
"twine>=4.0", # PyPI publisher: twine upload dist/*
|
|
66
|
+
"pytest>=8.0", # Test runner: pytest
|
|
67
|
+
"pytest-cov>=4.1", # Coverage reports (optional)
|
|
68
|
+
"pytest-mock>=3.12", # Mock fixtures for testing
|
|
69
|
+
"tomli>=2.0.1", # TOML parser for Python 3.10 tooling
|
|
70
|
+
]
|
|
71
|
+
|
|
72
|
+
# Continuous integration dependencies (dev + provider extras)
|
|
73
|
+
ci = [
|
|
74
|
+
# Extraction extras
|
|
75
|
+
"langchain-core>=1.0.3",
|
|
76
|
+
"langchain-openai>=1.0.2",
|
|
77
|
+
"langchain-anthropic>=1.0.1",
|
|
78
|
+
"langchain-google-genai>=3.0.1",
|
|
79
|
+
# Development/test tooling
|
|
80
|
+
"ruff>=0.14.1",
|
|
81
|
+
"build>=1.0",
|
|
82
|
+
"twine>=4.0",
|
|
83
|
+
"pytest>=8.0",
|
|
84
|
+
"pytest-cov>=4.1",
|
|
85
|
+
"pytest-mock>=3.12",
|
|
86
|
+
"tomli>=2.0.1",
|
|
87
|
+
]
|
|
88
|
+
|
|
89
|
+
[project.urls]
|
|
90
|
+
Homepage = "https://gaik.ai/"
|
|
91
|
+
Repository = "https://github.com/GAIK-project/gaik-toolkit"
|
|
92
|
+
Documentation = "https://github.com/GAIK-project/gaik-toolkit/tree/main/packages/python/gaik"
|
|
93
|
+
Issues = "https://github.com/GAIK-project/gaik-toolkit/issues"
|
|
94
|
+
|
|
95
|
+
# Build system configuration (not installed, used during `python -m build`)
|
|
96
|
+
[build-system]
|
|
97
|
+
requires = ["setuptools>=61.0", "wheel", "setuptools-scm>=8.0"]
|
|
98
|
+
build-backend = "setuptools.build_meta"
|
|
99
|
+
|
|
100
|
+
# Source code location (not installed, directs setuptools behavior)
|
|
101
|
+
[tool.setuptools.packages.find]
|
|
102
|
+
where = ["src"]
|
|
103
|
+
exclude = ["gaik.tests", "gaik.*.tests"]
|
|
104
|
+
[tool.setuptools_scm]
|
|
105
|
+
version_file = "src/gaik/_version.py"
|
|
106
|
+
root = "../../.."
|
|
107
|
+
tag_regex = "^v(?P<version>\\d+\\.\\d+\\.\\d+)$"
|
|
108
|
+
fallback_version = "0.0.0"
|
|
109
|
+
version_scheme = "no-guess-dev"
|
|
110
|
+
local_scheme = "no-local-version"
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# Include type hints marker file (py.typed included in package)
|
|
114
|
+
[tool.setuptools.package-data]
|
|
115
|
+
gaik = ["py.typed"]
|
|
116
|
+
|
|
117
|
+
# Ruff linter/formatter settings (not installed, used when running `ruff check`)
|
|
118
|
+
[tool.ruff]
|
|
119
|
+
line-length = 100
|
|
120
|
+
target-version = "py310"
|
|
121
|
+
extend-exclude = ["scripts"] # Exclude CI/CD scripts from linting
|
|
122
|
+
|
|
123
|
+
# Ruff lint rules configuration
|
|
124
|
+
[tool.ruff.lint]
|
|
125
|
+
select = ["E", "F", "I", "N", "W", "UP"]
|
|
126
|
+
ignore = []
|
|
127
|
+
|
|
128
|
+
# Pytest test runner configuration
|
|
129
|
+
[tool.pytest.ini_options]
|
|
130
|
+
testpaths = ["src/gaik"]
|
|
131
|
+
addopts = ["-v", "--strict-markers"]
|
gaik-0.2.17/scripts/README.md
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
# CI/CD Scripts
|
|
2
|
+
|
|
3
|
+
This directory stores helper scripts that CI workflows call after building or installing the package. Keep them lightweight and dependency-free so they can run in both local and GitHub-hosted environments.
|
|
4
|
+
|
|
5
|
+
## Available scripts
|
|
6
|
+
|
|
7
|
+
| Script | Purpose |
|
|
8
|
+
| ------------------------ | -------------------------------------------------------------------------- |
|
|
9
|
+
| `verify_installation.py` | Smoke test that imports and basic utilities work after `pip install gaik`. |
|
|
10
|
+
|
|
11
|
+
### verify_installation.py
|
|
12
|
+
|
|
13
|
+
This script performs quick runtime checks without making network calls:
|
|
14
|
+
|
|
15
|
+
- Imports the public API (`gaik`, `gaik.extract`, `gaik.providers`).
|
|
16
|
+
- Instantiates a few Pydantic models (no LLM providers needed).
|
|
17
|
+
- Confirms required providers are registered.
|
|
18
|
+
- Prints a ✅ success message and exits 0.
|
|
19
|
+
|
|
20
|
+
**Used by:** `test.yml` (after unit tests) and `publish.yml` (after uploading to PyPI).
|
|
21
|
+
|
|
22
|
+
Run manually from the repo root:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
python packages/python/gaik/scripts/verify_installation.py
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
> Unit tests now live next to the modules they cover (e.g., `src/gaik/extract/tests`). These scripts are only for CI smoke checks.
|