doc-intelligence 0.0.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- doc_intelligence-0.0.7/.github/workflows/deploy-docs.yml +40 -0
- doc_intelligence-0.0.7/.github/workflows/release.yml +29 -0
- doc_intelligence-0.0.7/.gitignore +13 -0
- doc_intelligence-0.0.7/.pre-commit-config.yaml +17 -0
- doc_intelligence-0.0.7/.python-version +1 -0
- doc_intelligence-0.0.7/.vscode/settings.json +15 -0
- doc_intelligence-0.0.7/PKG-INFO +151 -0
- doc_intelligence-0.0.7/README.md +133 -0
- doc_intelligence-0.0.7/docs/index.md +5 -0
- doc_intelligence-0.0.7/docs/quickstart.md +125 -0
- doc_intelligence-0.0.7/document_ai/__init__.py +1 -0
- doc_intelligence-0.0.7/document_ai/base.py +64 -0
- doc_intelligence-0.0.7/document_ai/config.py +7 -0
- doc_intelligence-0.0.7/document_ai/extractor.py +76 -0
- doc_intelligence-0.0.7/document_ai/formatter.py +62 -0
- doc_intelligence-0.0.7/document_ai/llm.py +49 -0
- doc_intelligence-0.0.7/document_ai/parser.py +36 -0
- doc_intelligence-0.0.7/document_ai/processer.py +109 -0
- doc_intelligence-0.0.7/document_ai/pydantic_to_json_instance_schema.py +408 -0
- doc_intelligence-0.0.7/document_ai/schemas/__init__.py +0 -0
- doc_intelligence-0.0.7/document_ai/schemas/core.py +42 -0
- doc_intelligence-0.0.7/document_ai/schemas/pdf.py +31 -0
- doc_intelligence-0.0.7/document_ai/types/pdf.py +6 -0
- doc_intelligence-0.0.7/document_ai/utils.py +292 -0
- doc_intelligence-0.0.7/main.py +6 -0
- doc_intelligence-0.0.7/mkdocs.yml +50 -0
- doc_intelligence-0.0.7/notebooks/architecture.drawio +145 -0
- doc_intelligence-0.0.7/notebooks/general.ipynb +532 -0
- doc_intelligence-0.0.7/notebooks/lab.ipynb +255 -0
- doc_intelligence-0.0.7/notebooks/pdf/digital_pdf.ipynb +257 -0
- doc_intelligence-0.0.7/pyproject.toml +44 -0
- doc_intelligence-0.0.7/tests/__init__.py +0 -0
- doc_intelligence-0.0.7/tests/test_pydantic_to_json_instance_schema.py +483 -0
- doc_intelligence-0.0.7/uv.lock +2630 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
name: Deploy MkDocs to GitHub Pages
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
branches:
|
|
6
|
+
- main
|
|
7
|
+
workflow_dispatch:
|
|
8
|
+
|
|
9
|
+
permissions:
|
|
10
|
+
contents: write
|
|
11
|
+
|
|
12
|
+
jobs:
|
|
13
|
+
deploy:
|
|
14
|
+
runs-on: ubuntu-latest
|
|
15
|
+
steps:
|
|
16
|
+
- name: Checkout repository
|
|
17
|
+
uses: actions/checkout@v4
|
|
18
|
+
with:
|
|
19
|
+
fetch-depth: 0
|
|
20
|
+
|
|
21
|
+
- name: Setup Python
|
|
22
|
+
uses: actions/setup-python@v5
|
|
23
|
+
with:
|
|
24
|
+
python-version: '3.10'
|
|
25
|
+
|
|
26
|
+
- name: Install uv
|
|
27
|
+
uses: astral-sh/setup-uv@v5
|
|
28
|
+
with:
|
|
29
|
+
enable-cache: true
|
|
30
|
+
|
|
31
|
+
- name: Install dependencies
|
|
32
|
+
run: uv sync
|
|
33
|
+
|
|
34
|
+
- name: Configure Git
|
|
35
|
+
run: |
|
|
36
|
+
git config user.name github-actions
|
|
37
|
+
git config user.email github-actions@github.com
|
|
38
|
+
|
|
39
|
+
- name: Deploy to GitHub Pages
|
|
40
|
+
run: uv run mkdocs gh-deploy --force
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
name: Publish to PyPI
|
|
2
|
+
|
|
3
|
+
on:
|
|
4
|
+
push:
|
|
5
|
+
tags:
|
|
6
|
+
- 'v*'
|
|
7
|
+
|
|
8
|
+
jobs:
|
|
9
|
+
release:
|
|
10
|
+
name: Publish to PyPI
|
|
11
|
+
runs-on: ubuntu-latest
|
|
12
|
+
environment: pypi
|
|
13
|
+
permissions:
|
|
14
|
+
id-token: write
|
|
15
|
+
steps:
|
|
16
|
+
- name: Checkout repository
|
|
17
|
+
uses: actions/checkout@v6
|
|
18
|
+
|
|
19
|
+
- name: Install uv and setup the python version
|
|
20
|
+
uses: astral-sh/setup-uv@v7
|
|
21
|
+
|
|
22
|
+
- name: Install Python 3.10
|
|
23
|
+
run: uv python install 3.10
|
|
24
|
+
|
|
25
|
+
- name: Build
|
|
26
|
+
run: uv build
|
|
27
|
+
|
|
28
|
+
- name: Publish package
|
|
29
|
+
run: uv publish
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
repos:
|
|
2
|
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
|
3
|
+
# Ruff version.
|
|
4
|
+
rev: v0.14.13
|
|
5
|
+
hooks:
|
|
6
|
+
# Run the linter.
|
|
7
|
+
- id: ruff-check
|
|
8
|
+
args: [ --fix ]
|
|
9
|
+
# Run the formatter.
|
|
10
|
+
- id: ruff-format
|
|
11
|
+
# - repo: https://github.com/facebook/pyrefly-pre-commit
|
|
12
|
+
# rev: 0.48.1 # Note: this is the version of the pre-commit hook, NOT the pyrefly version used for type checking
|
|
13
|
+
# hooks:
|
|
14
|
+
# - id: pyrefly-check
|
|
15
|
+
# name: Pyrefly (type checking)
|
|
16
|
+
# pass_filenames: false # Recommended to do full repo checks. However, you can change this to `true` to only check changed files
|
|
17
|
+
# language: system # Use system-installed pyrefly
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
3.10
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
{
|
|
2
|
+
"[python]": {
|
|
3
|
+
"editor.formatOnSave": true,
|
|
4
|
+
"editor.codeActionsOnSave": {
|
|
5
|
+
"source.fixAll": "explicit",
|
|
6
|
+
"source.organizeImports": "explicit"
|
|
7
|
+
},
|
|
8
|
+
"editor.defaultFormatter": "charliermarsh.ruff"
|
|
9
|
+
},
|
|
10
|
+
"ruff.enable": true,
|
|
11
|
+
"ruff.organizeImports": true,
|
|
12
|
+
"ruff.fixAll": true
|
|
13
|
+
}
|
|
14
|
+
|
|
15
|
+
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: doc-intelligence
|
|
3
|
+
Version: 0.0.7
|
|
4
|
+
Summary: Document AI - Intelligent document processing and extraction
|
|
5
|
+
Requires-Python: >=3.10
|
|
6
|
+
Requires-Dist: google-genai>=1.57.0
|
|
7
|
+
Requires-Dist: langchain>=1.2.9
|
|
8
|
+
Requires-Dist: loguru>=0.7.3
|
|
9
|
+
Requires-Dist: mkdocs-material>=9.7.1
|
|
10
|
+
Requires-Dist: mkdocs>=1.6.1
|
|
11
|
+
Requires-Dist: openai>=2.15.0
|
|
12
|
+
Requires-Dist: pdfplumber>=0.11.9
|
|
13
|
+
Requires-Dist: pre-commit>=4.5.1
|
|
14
|
+
Requires-Dist: pymupdf>=1.26.7
|
|
15
|
+
Requires-Dist: pytest>=9.0.2
|
|
16
|
+
Requires-Dist: python-dotenv>=1.2.1
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
|
|
19
|
+
# Document AI
|
|
20
|
+
|
|
21
|
+
**Documentation:** [https://zeel-04.github.io/document-ai/](https://zeel-04.github.io/document-ai/)
|
|
22
|
+
|
|
23
|
+
A library for parsing, formatting, and processing documents that can be used to build AI-powered document processing pipelines with structured data extraction and citation support.
|
|
24
|
+
|
|
25
|
+
## Features
|
|
26
|
+
|
|
27
|
+
- Extract structured data from PDF documents using LLMs
|
|
28
|
+
- Automatic citation tracking with page numbers, line numbers, and bounding boxes
|
|
29
|
+
- Support for digital PDFs
|
|
30
|
+
- Type-safe data models using Pydantic
|
|
31
|
+
- OpenAI integration with support for reasoning models
|
|
32
|
+
|
|
33
|
+
## Installation
|
|
34
|
+
|
|
35
|
+
### Requirements
|
|
36
|
+
|
|
37
|
+
- Python >= 3.10
|
|
38
|
+
- OpenAI API key
|
|
39
|
+
|
|
40
|
+
### Install uv
|
|
41
|
+
|
|
42
|
+
First, install [uv](https://docs.astral.sh/uv/) if you haven't already:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Install from Source
|
|
49
|
+
|
|
50
|
+
Clone the repository and install the package:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
git clone https://github.com/zeel-04/document-ai.git
|
|
54
|
+
cd document-ai
|
|
55
|
+
uv sync
|
|
56
|
+
```
|
|
57
|
+
|
|
58
|
+
### Install from Git (Alternative)
|
|
59
|
+
|
|
60
|
+
You can also install directly from the git repository:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
uv pip install git+https://github.com/zeel-04/document-ai.git
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Quick Start
|
|
67
|
+
|
|
68
|
+
Set up your OpenAI API key:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
echo "OPENAI_API_KEY=your-api-key-here" > .env
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Here's a simple example to extract structured data from a PDF:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from dotenv import load_dotenv
|
|
78
|
+
from document_ai.processer import DocumentProcessor
|
|
79
|
+
from document_ai.llm import OpenAILLM
|
|
80
|
+
from pydantic import BaseModel
|
|
81
|
+
|
|
82
|
+
# Load environment variables
|
|
83
|
+
load_dotenv()
|
|
84
|
+
|
|
85
|
+
# Initialize the LLM
|
|
86
|
+
llm = OpenAILLM()
|
|
87
|
+
|
|
88
|
+
# Create a processor from a PDF file
|
|
89
|
+
processor = DocumentProcessor.from_digital_pdf(
|
|
90
|
+
uri="path/to/your/document.pdf",
|
|
91
|
+
llm=llm,
|
|
92
|
+
)
|
|
93
|
+
|
|
94
|
+
# Define your data model
|
|
95
|
+
class Balance(BaseModel):
|
|
96
|
+
ending_balance: float
|
|
97
|
+
|
|
98
|
+
# Configure extraction with citations
|
|
99
|
+
config = {
|
|
100
|
+
"response_format": Balance,
|
|
101
|
+
"llm_config": {
|
|
102
|
+
"model": "gpt-5",
|
|
103
|
+
"reasoning": {"effort": "minimal"},
|
|
104
|
+
},
|
|
105
|
+
"extraction_config": {
|
|
106
|
+
"include_citations": True,
|
|
107
|
+
"extraction_mode": "single_pass",
|
|
108
|
+
"page_numbers": [0, 1], # Optional: specify which pages to process
|
|
109
|
+
}
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
# Extract structured data
|
|
113
|
+
response = processor.extract(config)
|
|
114
|
+
|
|
115
|
+
# Get the extracted data and citations
|
|
116
|
+
data, citations = response
|
|
117
|
+
print(f"Extracted data: {data}")
|
|
118
|
+
print(f"Citations: {citations}")
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### Sample Output
|
|
122
|
+
|
|
123
|
+
The `extract` method returns a tuple containing the extracted data and citation information:
|
|
124
|
+
|
|
125
|
+
```python
|
|
126
|
+
(Balance(ending_balance=111.61),
|
|
127
|
+
{'ending_balance': {'value': 111.61,
|
|
128
|
+
'citations': [{'page': 0,
|
|
129
|
+
'bboxes': [{'x0': 0.058823529411764705,
|
|
130
|
+
'top': 0.6095707475757575,
|
|
131
|
+
'x1': 0.5635455037254902,
|
|
132
|
+
'bottom': 0.6221969596969696}]}]}})
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Documentation
|
|
136
|
+
|
|
137
|
+
For more detailed documentation, see the [docs](./docs) directory or visit the [documentation site](https://zeel-04.github.io/document-ai/).
|
|
138
|
+
|
|
139
|
+
## Development Setup
|
|
140
|
+
|
|
141
|
+
Prerequisites:
|
|
142
|
+
|
|
143
|
+
- Python 3.10+
|
|
144
|
+
- uv
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
git clone https://github.com/zeel-04/document-ai.git
|
|
148
|
+
cd document-ai
|
|
149
|
+
uv venv
|
|
150
|
+
uv sync
|
|
151
|
+
```
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
# Document AI
|
|
2
|
+
|
|
3
|
+
**Documentation:** [https://zeel-04.github.io/document-ai/](https://zeel-04.github.io/document-ai/)
|
|
4
|
+
|
|
5
|
+
A library for parsing, formatting, and processing documents that can be used to build AI-powered document processing pipelines with structured data extraction and citation support.
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
- Extract structured data from PDF documents using LLMs
|
|
10
|
+
- Automatic citation tracking with page numbers, line numbers, and bounding boxes
|
|
11
|
+
- Support for digital PDFs
|
|
12
|
+
- Type-safe data models using Pydantic
|
|
13
|
+
- OpenAI integration with support for reasoning models
|
|
14
|
+
|
|
15
|
+
## Installation
|
|
16
|
+
|
|
17
|
+
### Requirements
|
|
18
|
+
|
|
19
|
+
- Python >= 3.10
|
|
20
|
+
- OpenAI API key
|
|
21
|
+
|
|
22
|
+
### Install uv
|
|
23
|
+
|
|
24
|
+
First, install [uv](https://docs.astral.sh/uv/) if you haven't already:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Install from Source
|
|
31
|
+
|
|
32
|
+
Clone the repository and install the package:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
git clone https://github.com/zeel-04/document-ai.git
|
|
36
|
+
cd document-ai
|
|
37
|
+
uv sync
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
### Install from Git (Alternative)
|
|
41
|
+
|
|
42
|
+
You can also install directly from the git repository:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
uv pip install git+https://github.com/zeel-04/document-ai.git
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## Quick Start
|
|
49
|
+
|
|
50
|
+
Set up your OpenAI API key:
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
echo "OPENAI_API_KEY=your-api-key-here" > .env
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
Here's a simple example to extract structured data from a PDF:
|
|
57
|
+
|
|
58
|
+
```python
|
|
59
|
+
from dotenv import load_dotenv
|
|
60
|
+
from document_ai.processer import DocumentProcessor
|
|
61
|
+
from document_ai.llm import OpenAILLM
|
|
62
|
+
from pydantic import BaseModel
|
|
63
|
+
|
|
64
|
+
# Load environment variables
|
|
65
|
+
load_dotenv()
|
|
66
|
+
|
|
67
|
+
# Initialize the LLM
|
|
68
|
+
llm = OpenAILLM()
|
|
69
|
+
|
|
70
|
+
# Create a processor from a PDF file
|
|
71
|
+
processor = DocumentProcessor.from_digital_pdf(
|
|
72
|
+
uri="path/to/your/document.pdf",
|
|
73
|
+
llm=llm,
|
|
74
|
+
)
|
|
75
|
+
|
|
76
|
+
# Define your data model
|
|
77
|
+
class Balance(BaseModel):
|
|
78
|
+
ending_balance: float
|
|
79
|
+
|
|
80
|
+
# Configure extraction with citations
|
|
81
|
+
config = {
|
|
82
|
+
"response_format": Balance,
|
|
83
|
+
"llm_config": {
|
|
84
|
+
"model": "gpt-5",
|
|
85
|
+
"reasoning": {"effort": "minimal"},
|
|
86
|
+
},
|
|
87
|
+
"extraction_config": {
|
|
88
|
+
"include_citations": True,
|
|
89
|
+
"extraction_mode": "single_pass",
|
|
90
|
+
"page_numbers": [0, 1], # Optional: specify which pages to process
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
# Extract structured data
|
|
95
|
+
response = processor.extract(config)
|
|
96
|
+
|
|
97
|
+
# Get the extracted data and citations
|
|
98
|
+
data, citations = response
|
|
99
|
+
print(f"Extracted data: {data}")
|
|
100
|
+
print(f"Citations: {citations}")
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Sample Output
|
|
104
|
+
|
|
105
|
+
The `extract` method returns a tuple containing the extracted data and citation information:
|
|
106
|
+
|
|
107
|
+
```python
|
|
108
|
+
(Balance(ending_balance=111.61),
|
|
109
|
+
{'ending_balance': {'value': 111.61,
|
|
110
|
+
'citations': [{'page': 0,
|
|
111
|
+
'bboxes': [{'x0': 0.058823529411764705,
|
|
112
|
+
'top': 0.6095707475757575,
|
|
113
|
+
'x1': 0.5635455037254902,
|
|
114
|
+
'bottom': 0.6221969596969696}]}]}})
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## Documentation
|
|
118
|
+
|
|
119
|
+
For more detailed documentation, see the [docs](./docs) directory or visit the [documentation site](https://zeel-04.github.io/document-ai/).
|
|
120
|
+
|
|
121
|
+
## Development Setup
|
|
122
|
+
|
|
123
|
+
Prerequisites:
|
|
124
|
+
|
|
125
|
+
- Python 3.10+
|
|
126
|
+
- uv
|
|
127
|
+
|
|
128
|
+
```bash
|
|
129
|
+
git clone https://github.com/zeel-04/document-ai.git
|
|
130
|
+
cd document-ai
|
|
131
|
+
uv venv
|
|
132
|
+
uv sync
|
|
133
|
+
```
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
# Quickstart
|
|
2
|
+
|
|
3
|
+
This guide will help you get started with Document AI.
|
|
4
|
+
|
|
5
|
+
## Installation
|
|
6
|
+
|
|
7
|
+
### Requirements
|
|
8
|
+
|
|
9
|
+
- Python >= 3.10
|
|
10
|
+
- OpenAI API key
|
|
11
|
+
|
|
12
|
+
### Install uv
|
|
13
|
+
|
|
14
|
+
First, install [uv](https://docs.astral.sh/uv/) if you haven't already:
|
|
15
|
+
|
|
16
|
+
```bash
|
|
17
|
+
curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
### Install from Source
|
|
21
|
+
|
|
22
|
+
Clone the repository and install the package:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# Clone the repository
|
|
26
|
+
git clone https://github.com/zeel-04/document-ai.git
|
|
27
|
+
cd document-ai
|
|
28
|
+
|
|
29
|
+
# Install the package with uv
|
|
30
|
+
uv sync
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Install from Git (Alternative)
|
|
34
|
+
|
|
35
|
+
You can also install directly from the git repository:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
uv pip install git+https://github.com/zeel-04/document-ai.git
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
## Environment Setup
|
|
42
|
+
|
|
43
|
+
Document AI uses OpenAI's API for document processing. Set up your API key:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
# Create a .env file
|
|
47
|
+
echo "OPENAI_API_KEY=your-api-key-here" > .env
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
## Basic Usage
|
|
51
|
+
|
|
52
|
+
Here's a simple example to extract structured data from a PDF document:
|
|
53
|
+
|
|
54
|
+
```python
|
|
55
|
+
from dotenv import load_dotenv
|
|
56
|
+
from document_ai.processer import DocumentProcessor
|
|
57
|
+
from document_ai.llm import OpenAILLM
|
|
58
|
+
from pydantic import BaseModel
|
|
59
|
+
|
|
60
|
+
# Load environment variables
|
|
61
|
+
load_dotenv()
|
|
62
|
+
|
|
63
|
+
# Initialize the LLM
|
|
64
|
+
llm = OpenAILLM()
|
|
65
|
+
|
|
66
|
+
# Create a processor from a PDF file
|
|
67
|
+
processor = DocumentProcessor.from_digital_pdf(
|
|
68
|
+
uri="path/to/your/document.pdf",
|
|
69
|
+
llm=llm,
|
|
70
|
+
)
|
|
71
|
+
|
|
72
|
+
# Define your data model
|
|
73
|
+
class Balance(BaseModel):
|
|
74
|
+
ending_balance: float
|
|
75
|
+
|
|
76
|
+
# Configure extraction with citations
|
|
77
|
+
config = {
|
|
78
|
+
"response_format": Balance,
|
|
79
|
+
"llm_config": {
|
|
80
|
+
"model": "gpt-5",
|
|
81
|
+
"reasoning": {"effort": "minimal"},
|
|
82
|
+
},
|
|
83
|
+
"extraction_config": {
|
|
84
|
+
"include_citations": True,
|
|
85
|
+
"extraction_mode": "single_pass",
|
|
86
|
+
"page_numbers": [0, 1], # Optional: specify which pages to process
|
|
87
|
+
}
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
# Extract structured data
|
|
91
|
+
response = processor.extract(config)
|
|
92
|
+
|
|
93
|
+
# Get the extracted data and citations
|
|
94
|
+
data, citations = response
|
|
95
|
+
print(f"Extracted data: {data}")
|
|
96
|
+
print(f"Citations: {citations}")
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### Sample Output
|
|
100
|
+
|
|
101
|
+
The `extract` method returns a tuple containing:
|
|
102
|
+
1. The extracted data as a Pydantic model instance
|
|
103
|
+
2. A dictionary with citation information for each field
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
# Example output
|
|
107
|
+
(Balance(ending_balance=111.61),
|
|
108
|
+
{'ending_balance': {'value': 111.61,
|
|
109
|
+
'citations': [{'page': 0,
|
|
110
|
+
'bboxes': [{'x0': 0.058823529411764705,
|
|
111
|
+
'top': 0.6095707475757575,
|
|
112
|
+
'x1': 0.5635455037254902,
|
|
113
|
+
'bottom': 0.6221969596969696}]}]}})
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
### Configuration Options
|
|
117
|
+
|
|
118
|
+
- **response_format**: Your Pydantic model class
|
|
119
|
+
- **llm_config**:
|
|
120
|
+
- `model`: The OpenAI model to use (e.g., "gpt-5", "gpt-4o")
|
|
121
|
+
- `reasoning`: Optional reasoning configuration with `effort` level ("minimal", "low", "medium", "high")
|
|
122
|
+
- **extraction_config**:
|
|
123
|
+
- `include_citations`: Set to `True` to get citation information
|
|
124
|
+
- `extraction_mode`: "single_pass" for single-pass extraction (multi-pass extraction is not implemented yet)
|
|
125
|
+
- `page_numbers`: Optional list of page indices to process (0-indexed)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
from document_ai.base import BaseFormatter, BaseParser, Document
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
from abc import ABC, abstractmethod
|
|
2
|
+
from typing import Any
|
|
3
|
+
|
|
4
|
+
from langchain_core.output_parsers import JsonOutputParser
|
|
5
|
+
|
|
6
|
+
from .schemas.core import Document, PydanticModel
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BaseParser(ABC):
    """Abstract interface for document parsers.

    Concrete subclasses must implement :meth:`parse`.
    """

    @abstractmethod
    def parse(self, document: Document) -> PydanticModel:
        """Parse ``document`` and return the result as a ``PydanticModel``.

        Args:
            document: The document to parse.

        Returns:
            The parsed representation; its concrete model type is chosen by
            the subclass.
        """
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BaseFormatter(ABC):
    """Abstract interface for turning a ``Document`` into LLM-ready text.

    Concrete subclasses must implement :meth:`format_document_for_llm`.
    """

    @abstractmethod
    def format_document_for_llm(self, document: Document, **kwargs) -> str:
        """Render ``document`` as a string to be placed in an LLM prompt.

        Args:
            document: The document to format.
            **kwargs: Formatter-specific options; which keys are honoured is
                up to the subclass.

        Returns:
            The textual representation of the document.
        """
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class BaseLLM(ABC):
    """Abstract interface for the LLM backends used by extractors.

    Subclasses provide both a structured-output call and a plain-text call.
    """

    @abstractmethod
    def generate_structured_output(
        self,
        model: str,
        messages: list[dict[str, str]],
        reasoning: Any,
        output_format: type[PydanticModel],
        openai_text: dict[str, Any] | None = None,
    ) -> PydanticModel | None:
        """Ask the model for a response shaped like ``output_format``.

        Args:
            model: Name of the model to call.
            messages: Chat messages, each a string-to-string mapping.
            reasoning: Backend-specific reasoning configuration.
            output_format: Model class describing the expected output.
            openai_text: Optional extra text options; presumably
                OpenAI-specific, confirm against the concrete backend.

        Returns:
            An instance of ``output_format``, or ``None`` when the backend
            yields no structured result.
        """

    @abstractmethod
    def generate_text(self, system_prompt: str, user_prompt: str, **kwargs) -> str:
        """Return the model's free-text answer to the given prompts.

        Args:
            system_prompt: The system-level instruction.
            user_prompt: The user-level request.
            **kwargs: Backend-specific call options.

        Returns:
            The generated text.
        """
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class BaseExtractor(ABC):
    """Abstract base for extractors that turn documents into structured data.

    Holds the LLM backend and a ``JsonOutputParser`` that subclasses can use
    to decode the model's JSON response.
    """

    def __init__(self, llm: BaseLLM):
        """Store the LLM backend and create the JSON output parser."""
        self.json_parser = JsonOutputParser()
        self.llm = llm

    @abstractmethod
    def extract(
        self,
        document: Document,
        llm_config: dict[str, Any],
        extraction_config: dict[str, Any],
        formatter: BaseFormatter,
        response_format: type[PydanticModel],
    ) -> tuple[PydanticModel, dict[str, Any] | None]:
        """Extract data shaped like ``response_format`` from ``document``.

        Args:
            document: The document to extract from.
            llm_config: Options for the LLM call.
            extraction_config: Options controlling the extraction.
            formatter: Formatter that renders the document for the LLM.
            response_format: Model class describing the expected output.

        Returns:
            A pair of the populated ``response_format`` instance and an
            optional metadata dictionary (``None`` when there is none).
        """
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
from typing import Any
|
|
2
|
+
|
|
3
|
+
from loguru import logger
|
|
4
|
+
|
|
5
|
+
from .base import BaseExtractor, BaseFormatter
|
|
6
|
+
from .llm import BaseLLM
|
|
7
|
+
from .pydantic_to_json_instance_schema import (
|
|
8
|
+
pydantic_to_json_instance_schema,
|
|
9
|
+
stringify_schema,
|
|
10
|
+
)
|
|
11
|
+
from .schemas.core import Document, PydanticModel
|
|
12
|
+
from .types.pdf import PDFExtractionMode
|
|
13
|
+
from .utils import enrich_citations_with_bboxes, strip_citations
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class DigitalPDFExtractor(BaseExtractor):
    """Extract structured data from a digital PDF ``Document`` using an LLM.

    The extractor renders the document to text, sends it to the LLM together
    with a JSON instance schema derived from the requested response model,
    parses the JSON answer and, when citations are enabled on the document,
    enriches them with bounding boxes.
    """

    def __init__(self, llm: BaseLLM):
        """Store the LLM backend and define the prompt templates."""
        super().__init__(llm)

        self.system_prompt = """Act as an expert in the field of document extraction and information extraction from documents."""
        # The prompt wording is sent to the model and is kept verbatim; the
        # adjacent literals concatenate into one multi-line template with the
        # ``{content_text}`` and ``{schema}`` placeholders.
        self.user_prompt = (
            "Your job is to extract structured mentioned in schema data from a document given below.\n"
            "\n"
            "DOCUMENT:\n"
            "{content_text}\n"
            "\n"
            "OUTPUT SCHEMA:\n"
            "{schema}\n"
            "\n"
            "Generate output in JSON format.\n"
        )

    def extract(
        self,
        document: Document,
        llm_config: dict[str, Any],
        extraction_config: dict[str, Any],
        formatter: BaseFormatter,
        response_format: type[PydanticModel],
    ) -> tuple[PydanticModel, dict[str, Any] | None]:
        """Run a single-pass extraction of ``response_format`` from ``document``.

        Args:
            document: Document to extract from; its ``extraction_mode`` and
                ``include_citations`` attributes drive the behaviour.
            llm_config: Keyword arguments forwarded to ``llm.generate_text``.
            extraction_config: Keyword arguments forwarded to
                ``formatter.format_document_for_llm``.
            formatter: Formatter that renders the document as prompt text.
            response_format: Model class to instantiate from the LLM answer.

        Returns:
            A pair ``(model, metadata)``. ``metadata`` is the citation-enriched
            response when ``document.include_citations`` is truthy, otherwise
            ``None``.

        Raises:
            NotImplementedError: If the document requests multi-pass extraction.
            ValueError: If the document's extraction mode is not supported.
        """
        mode = document.extraction_mode

        # Validate the mode before doing any work. Previously an unrecognised
        # mode fell through to the JSON parsing step with ``response`` never
        # assigned, surfacing as an opaque UnboundLocalError.
        if mode == PDFExtractionMode.MULTI_PASS:
            raise NotImplementedError("Multi-pass extraction is not implemented yet")
        if not (mode == PDFExtractionMode.SINGLE_PASS):
            raise ValueError(f"Unsupported extraction mode: {mode!r}")

        json_instance_schema = stringify_schema(
            pydantic_to_json_instance_schema(
                response_format,
                citation=document.include_citations,
                citation_level="line",
            )
        )
        # Brace-style arguments let loguru skip formatting the (potentially
        # large) payload when no handler accepts DEBUG messages.
        logger.debug(
            "DigitalPDFExtractor: extract: json_instance_schema: {}",
            json_instance_schema,
        )

        content_text = formatter.format_document_for_llm(document, **extraction_config)
        logger.debug("DigitalPDFExtractor: extract: content_text: {}", content_text)

        user_prompt = self.user_prompt.format(
            content_text=content_text, schema=json_instance_schema
        )
        response = self.llm.generate_text(
            system_prompt=self.system_prompt,
            user_prompt=user_prompt,
            **llm_config,
        )

        response_dict = self.json_parser.parse(response)

        if document.include_citations:
            # Keep the enriched structure as metadata and validate the model
            # against a copy with the citation wrappers removed.
            response_metadata = enrich_citations_with_bboxes(response_dict, document)
            response_dict = strip_citations(response_metadata)
        else:
            response_metadata = None

        return response_format(**response_dict), response_metadata
|