lexoid 0.1.6__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lexoid-0.1.6 → lexoid-0.1.8}/PKG-INFO +41 -19
- {lexoid-0.1.6 → lexoid-0.1.8}/README.md +37 -18
- lexoid-0.1.8/lexoid/core/parse_type/llm_parser.py +363 -0
- {lexoid-0.1.6 → lexoid-0.1.8}/lexoid/core/parse_type/static_parser.py +79 -27
- {lexoid-0.1.6 → lexoid-0.1.8}/lexoid/core/prompt_templates.py +19 -0
- {lexoid-0.1.6 → lexoid-0.1.8}/pyproject.toml +5 -6
- lexoid-0.1.6/lexoid/core/parse_type/llm_parser.py +0 -200
- {lexoid-0.1.6 → lexoid-0.1.8}/LICENSE +0 -0
- {lexoid-0.1.6 → lexoid-0.1.8}/lexoid/api.py +0 -0
- {lexoid-0.1.6 → lexoid-0.1.8}/lexoid/core/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: lexoid
|
3
|
-
Version: 0.1.6
|
3
|
+
Version: 0.1.8
|
4
4
|
Summary:
|
5
5
|
Requires-Python: >=3.10,<4.0
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
@@ -11,6 +11,7 @@ Classifier: Programming Language :: Python :: 3.13
|
|
11
11
|
Requires-Dist: bs4 (>=0.0.2,<0.0.3)
|
12
12
|
Requires-Dist: docx2pdf (>=0.1.8,<0.2.0)
|
13
13
|
Requires-Dist: google-generativeai (>=0.8.1,<0.9.0)
|
14
|
+
Requires-Dist: huggingface-hub (>=0.27.0,<0.28.0)
|
14
15
|
Requires-Dist: loguru (>=0.7.2,<0.8.0)
|
15
16
|
Requires-Dist: markdown (>=3.7,<4.0)
|
16
17
|
Requires-Dist: markdownify (>=0.13.1,<0.14.0)
|
@@ -22,6 +23,8 @@ Requires-Dist: pdfplumber (>=0.11.4,<0.12.0)
|
|
22
23
|
Requires-Dist: pikepdf (>=9.3.0,<10.0.0)
|
23
24
|
Requires-Dist: playwright (>=1.49.0,<2.0.0)
|
24
25
|
Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
|
26
|
+
Requires-Dist: pyqt5 (>=5.15.11,<6.0.0) ; platform_system != "debian"
|
27
|
+
Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
|
25
28
|
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
26
29
|
Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
|
27
30
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
@@ -29,47 +32,59 @@ Description-Content-Type: text/markdown
|
|
29
32
|
|
30
33
|
# Lexoid
|
31
34
|
|
35
|
+
[Open in Colab](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
|
36
|
+
[License](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
|
37
|
+
[PyPI](https://pypi.org/project/lexoid/)
|
38
|
+
|
32
39
|
Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing.
|
33
40
|
|
34
41
|
## Motivation:
|
35
42
|
- Use the multi-modal advancement of LLMs
|
36
|
-
- Enable convenience for users
|
43
|
+
- Enable convenience for users
|
37
44
|
- Collaborate with a permissive license
|
38
45
|
|
39
46
|
## Installation
|
40
|
-
|
47
|
+
### Installing with pip
|
41
48
|
```
|
42
|
-
|
49
|
+
pip install lexoid
|
43
50
|
```
|
44
|
-
|
51
|
+
|
52
|
+
To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
|
45
53
|
```
|
46
|
-
|
54
|
+
OPENAI_API_KEY=""
|
55
|
+
GOOGLE_API_KEY=""
|
47
56
|
```
|
48
57
|
|
49
|
-
|
58
|
+
Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library):
|
50
59
|
```
|
51
|
-
|
60
|
+
playwright install --with-deps --only-shell chromium
|
52
61
|
```
|
53
62
|
|
54
|
-
|
63
|
+
### Building `.whl` from source
|
55
64
|
```
|
56
|
-
|
57
|
-
GOOGLE_API_KEY=""
|
65
|
+
make build
|
58
66
|
```
|
59
67
|
|
60
|
-
|
68
|
+
### Creating a local installation
|
69
|
+
To install dependencies:
|
70
|
+
```
|
71
|
+
make install
|
72
|
+
```
|
73
|
+
or, to install with dev-dependencies:
|
61
74
|
```
|
62
|
-
|
75
|
+
make dev
|
63
76
|
```
|
64
77
|
|
65
|
-
|
78
|
+
To activate virtual environment:
|
66
79
|
```
|
67
|
-
|
80
|
+
source .venv/bin/activate
|
68
81
|
```
|
69
82
|
|
70
83
|
## Usage
|
71
84
|
[Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)
|
72
85
|
|
86
|
+
[Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)
|
87
|
+
|
73
88
|
Here's a quick example to parse documents using Lexoid:
|
74
89
|
``` python
|
75
90
|
from lexoid.api import parse
|
@@ -86,17 +101,24 @@ print(parsed_md)
|
|
86
101
|
### Parameters
|
87
102
|
- path (str): The file path or URL.
|
88
103
|
- parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
|
89
|
-
- raw (bool, optional):
|
104
|
+
- raw (bool, optional): Return raw text or structured data. Defaults to False.
|
90
105
|
- pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
|
91
106
|
- max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
|
92
107
|
- **kwargs: Additional arguments for the parser.
|
93
108
|
|
94
109
|
## Benchmark
|
95
110
|
Initial results (_more updates soon_)
|
111
|
+
_Note:_ Benchmarks are currently run in a zero-shot scenario.
|
96
112
|
| Rank | Model/Framework | Similarity | Time (s) |
|
97
113
|
|------|-----------|------------|----------|
|
98
114
|
| 1 | gpt-4o | 0.799 | 21.77|
|
99
|
-
| 2 | gemini-
|
100
|
-
| 3 |
|
101
|
-
| 4 | gemini-1.5-
|
115
|
+
| 2 | gemini-2.0-flash-exp | 0.797 | 13.47 |
|
116
|
+
| 3 | gemini-exp-1121 | 0.779 | 30.88 |
|
117
|
+
| 4 | gemini-1.5-pro | 0.742 | 15.77 |
|
118
|
+
| 5 | gpt-4o-mini | 0.721 | 14.86 |
|
119
|
+
| 6 | gemini-1.5-flash | 0.702 | 4.56 |
|
120
|
+
| 7 | Llama-3.2-11B-Vision-Instruct (via HF) | 0.582 | 21.74 |
|
121
|
+
| 8 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.556 | 4.58 |
|
122
|
+
| 9 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.527 | 10.57 |
|
123
|
+
| 10 | Llama-Vision-Free (via Together AI) | 0.435 | 8.42 |
|
102
124
|
|
@@ -1,46 +1,58 @@
|
|
1
1
|
# Lexoid
|
2
2
|
|
3
|
+
[Open in Colab](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
|
4
|
+
[License](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
|
5
|
+
[PyPI](https://pypi.org/project/lexoid/)
|
6
|
+
|
3
7
|
Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing.
|
4
8
|
|
5
9
|
## Motivation:
|
6
10
|
- Use the multi-modal advancement of LLMs
|
7
|
-
- Enable convenience for users
|
11
|
+
- Enable convenience for users
|
8
12
|
- Collaborate with a permissive license
|
9
13
|
|
10
14
|
## Installation
|
11
|
-
|
15
|
+
### Installing with pip
|
12
16
|
```
|
13
|
-
|
17
|
+
pip install lexoid
|
14
18
|
```
|
15
|
-
|
19
|
+
|
20
|
+
To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
|
16
21
|
```
|
17
|
-
|
22
|
+
OPENAI_API_KEY=""
|
23
|
+
GOOGLE_API_KEY=""
|
18
24
|
```
|
19
25
|
|
20
|
-
|
26
|
+
Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library):
|
21
27
|
```
|
22
|
-
|
28
|
+
playwright install --with-deps --only-shell chromium
|
23
29
|
```
|
24
30
|
|
25
|
-
|
31
|
+
### Building `.whl` from source
|
26
32
|
```
|
27
|
-
|
28
|
-
GOOGLE_API_KEY=""
|
33
|
+
make build
|
29
34
|
```
|
30
35
|
|
31
|
-
|
36
|
+
### Creating a local installation
|
37
|
+
To install dependencies:
|
38
|
+
```
|
39
|
+
make install
|
40
|
+
```
|
41
|
+
or, to install with dev-dependencies:
|
32
42
|
```
|
33
|
-
|
43
|
+
make dev
|
34
44
|
```
|
35
45
|
|
36
|
-
|
46
|
+
To activate virtual environment:
|
37
47
|
```
|
38
|
-
|
48
|
+
source .venv/bin/activate
|
39
49
|
```
|
40
50
|
|
41
51
|
## Usage
|
42
52
|
[Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)
|
43
53
|
|
54
|
+
[Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)
|
55
|
+
|
44
56
|
Here's a quick example to parse documents using Lexoid:
|
45
57
|
``` python
|
46
58
|
from lexoid.api import parse
|
@@ -57,16 +69,23 @@ print(parsed_md)
|
|
57
69
|
### Parameters
|
58
70
|
- path (str): The file path or URL.
|
59
71
|
- parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
|
60
|
-
- raw (bool, optional):
|
72
|
+
- raw (bool, optional): Return raw text or structured data. Defaults to False.
|
61
73
|
- pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
|
62
74
|
- max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
|
63
75
|
- **kwargs: Additional arguments for the parser.
|
64
76
|
|
65
77
|
## Benchmark
|
66
78
|
Initial results (_more updates soon_)
|
79
|
+
_Note:_ Benchmarks are currently run in a zero-shot scenario.
|
67
80
|
| Rank | Model/Framework | Similarity | Time (s) |
|
68
81
|
|------|-----------|------------|----------|
|
69
82
|
| 1 | gpt-4o | 0.799 | 21.77|
|
70
|
-
| 2 | gemini-
|
71
|
-
| 3 |
|
72
|
-
| 4 | gemini-1.5-
|
83
|
+
| 2 | gemini-2.0-flash-exp | 0.797 | 13.47 |
|
84
|
+
| 3 | gemini-exp-1121 | 0.779 | 30.88 |
|
85
|
+
| 4 | gemini-1.5-pro | 0.742 | 15.77 |
|
86
|
+
| 5 | gpt-4o-mini | 0.721 | 14.86 |
|
87
|
+
| 6 | gemini-1.5-flash | 0.702 | 4.56 |
|
88
|
+
| 7 | Llama-3.2-11B-Vision-Instruct (via HF) | 0.582 | 21.74 |
|
89
|
+
| 8 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.556 | 4.58 |
|
90
|
+
| 9 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.527 | 10.57 |
|
91
|
+
| 10 | Llama-Vision-Free (via Together AI) | 0.435 | 8.42 |
|
@@ -0,0 +1,363 @@
|
|
1
|
+
import base64
|
2
|
+
import io
|
3
|
+
import mimetypes
|
4
|
+
import os
|
5
|
+
import time
|
6
|
+
import pypdfium2 as pdfium
|
7
|
+
import requests
|
8
|
+
from functools import wraps
|
9
|
+
from requests.exceptions import HTTPError
|
10
|
+
from typing import Dict, List
|
11
|
+
|
12
|
+
from lexoid.core.prompt_templates import (
|
13
|
+
INSTRUCTIONS_ADD_PG_BREAK,
|
14
|
+
OPENAI_USER_PROMPT,
|
15
|
+
PARSER_PROMPT,
|
16
|
+
LLAMA_PARSER_PROMPT,
|
17
|
+
)
|
18
|
+
from lexoid.core.utils import convert_image_to_pdf
|
19
|
+
from loguru import logger
|
20
|
+
from openai import OpenAI
|
21
|
+
from huggingface_hub import InferenceClient
|
22
|
+
|
23
|
+
|
24
|
+
def retry_on_http_error(func):
    """Retry ``func`` once after an ``HTTPError``, then fall back to an empty result.

    The wrapped callable is invoked with the caller's arguments. If it raises
    ``HTTPError``, the error is logged, the wrapper sleeps for 10 seconds and
    calls it again with the same arguments. If the second call also raises
    ``HTTPError``, that error is logged and an empty placeholder is returned
    instead of propagating the exception:

    * ``""`` when ``raw`` is truthy;
    * otherwise a one-element list holding a dict with ``metadata``
      (``title`` and ``page``) and an empty ``content`` string.

    Exceptions other than ``HTTPError`` propagate unchanged.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except HTTPError as first_error:
            logger.error(
                f"HTTPError encountered: {first_error}. Retrying in 10 seconds..."
            )

        time.sleep(10)
        try:
            return func(*args, **kwargs)
        except HTTPError as retry_error:
            logger.error(f"Retry failed: {retry_error}")

        # ``raw`` may arrive by keyword or positionally. The parser decorated
        # in this module has the signature (path, raw, **kwargs), so a
        # positional ``raw`` is the second argument.
        if "raw" in kwargs:
            raw = kwargs["raw"]
        elif len(args) > 1:
            raw = args[1]
        else:
            raw = False

        if raw:
            return ""
        return [
            {
                "metadata": {
                    # .get(): a missing title must not replace the already
                    # logged HTTP failure with a KeyError.
                    "title": kwargs.get("title"),
                    "page": kwargs.get("start", 0),
                },
                "content": "",
            }
        ]

    return wrapper
|
49
|
+
|
50
|
+
|
51
|
+
@retry_on_http_error
def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    """Dispatch a document to the parser backend selected by ``kwargs["model"]``.

    When no model is given, ``"gemini-1.5-flash"`` is stored in ``kwargs`` so
    the chosen backend receives it. Routing is by model-name prefix:

    * ``gemini*`` -> ``parse_with_gemini``
    * ``gpt*`` -> ``parse_with_api`` with ``api="openai"``
    * ``meta-llama*`` -> ``parse_with_together`` when the name ends in
      ``Turbo`` or equals ``meta-llama/Llama-Vision-Free``; otherwise
      ``parse_with_api`` with ``api="huggingface"``

    Raises:
        ValueError: If the model name matches none of the prefixes above.
    """
    model = kwargs.setdefault("model", "gemini-1.5-flash")

    if model.startswith("gemini"):
        return parse_with_gemini(path, raw, **kwargs)
    if model.startswith("gpt"):
        return parse_with_api(path, raw, api="openai", **kwargs)
    if not model.startswith("meta-llama"):
        raise ValueError(f"Unsupported model: {model}")

    served_by_together = (
        model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free"
    )
    if served_by_together:
        return parse_with_together(path, raw, **kwargs)
    return parse_with_api(path, raw, api="huggingface", **kwargs)
|
65
|
+
|
66
|
+
|
67
|
+
def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    """Parse a document by sending it to the Gemini ``generateContent`` endpoint.

    Files whose guessed MIME type starts with ``image`` are converted with
    ``convert_image_to_pdf`` and sent as ``application/pdf``; any other file
    is read from disk and sent with its guessed MIME type.

    Args:
        path (str): Path to the document to parse.
        raw (bool): If True, return the extracted text; if False, return one
            dict per ``<page-break>``-separated chunk.
        **kwargs: Must contain ``model`` and ``pages_per_split_``, plus
            ``title`` when ``raw`` is False. ``temperature`` (default 0.7)
            and ``start`` (default 0) are optional.

    Returns:
        List[Dict] | str: The extracted text, or a list of
        ``{"metadata": {"title", "page"}, "content"}`` dicts whose page
        numbers are ``start`` plus the 1-based chunk index.

    Raises:
        ValueError: If ``GOOGLE_API_KEY`` is not set.
        requests.exceptions.HTTPError: If the API returns an error status.
    """
    api_key = os.environ.get("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY environment variable is not set")

    # The key is sent as a header (below) rather than as a ``?key=`` query
    # parameter: the URL is included in the HTTPError message produced by
    # raise_for_status(), so a key in the URL would end up in error logs.
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent"

    # Check if the file is an image and convert to PDF if necessary
    mime_type, _ = mimetypes.guess_type(path)
    if mime_type and mime_type.startswith("image"):
        file_content = convert_image_to_pdf(path)
        mime_type = "application/pdf"
    else:
        with open(path, "rb") as file:
            file_content = file.read()
    base64_file = base64.b64encode(file_content).decode("utf-8")

    # Ideally, we do this ourselves. But, for now this might be good enough.
    pages_in_split = kwargs["pages_per_split_"]
    if pages_in_split == 1:
        # A single page needs no page-break instruction.
        custom_instruction = ""
    else:
        custom_instruction = f"""- Total number of pages: {pages_in_split}. {INSTRUCTIONS_ADD_PG_BREAK}"""

    payload = {
        "contents": [
            {
                "parts": [
                    {
                        "text": PARSER_PROMPT.format(
                            custom_instructions=custom_instruction
                        )
                    },
                    {"inline_data": {"mime_type": mime_type, "data": base64_file}},
                ]
            }
        ],
        "generationConfig": {
            "temperature": kwargs.get("temperature", 0.7),
        },
    }

    headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}

    response = requests.post(url, json=payload, headers=headers)
    response.raise_for_status()

    response_data = response.json()

    # Concatenate every text part of every candidate.
    raw_text = "".join(
        part["text"]
        for candidate in response_data.get("candidates", [])
        for part in candidate.get("content", {}).get("parts", [])
        if "text" in part
    )

    # Keep only what is between the output tags when they are present. When
    # the model omits them, fall back to the full text (as the other parsers
    # in this module do) instead of discarding the response.
    result = raw_text
    if "<output>" in raw_text:
        result = raw_text.split("<output>")[1].strip()
    if "</output>" in result:
        result = result.split("</output>")[0].strip()

    if raw:
        return result

    return [
        {
            "metadata": {
                "title": kwargs["title"],
                "page": kwargs.get("start", 0) + page_no,
            },
            "content": page,
        }
        for page_no, page in enumerate(result.split("<page-break>"), start=1)
    ]
|
141
|
+
|
142
|
+
|
143
|
+
def convert_pdf_page_to_base64(
    pdf_document: pdfium.PdfDocument, page_number: int
) -> str:
    """Render one page of ``pdf_document`` to PNG and return it base64-encoded.

    ``page_number`` is used directly as an index into ``pdf_document``.
    """
    # 4x scaling is used for better rendering quality.
    rendered_page = pdf_document[page_number].render(scale=4).to_pil()

    with io.BytesIO() as png_buffer:
        rendered_page.save(png_buffer, format="PNG")
        png_bytes = png_buffer.getvalue()

    return base64.b64encode(png_bytes).decode("utf-8")
|
156
|
+
|
157
|
+
|
158
|
+
def parse_with_together(path: str, raw: bool, **kwargs) -> List[Dict] | str:
    """Parse a document page by page through the Together chat-completions API.

    A file whose guessed MIME type starts with ``image`` is sent as a single
    data URL; any other file is opened as a PDF and each page is rendered to
    a PNG data URL. One request is made per image.

    Args:
        path (str): Path to the document to parse.
        raw (bool): If True, return the page texts joined by ``<page-break>``;
            if False, return one dict per page.
        **kwargs: Must contain ``model``, plus ``title`` when ``raw`` is
            False. ``max_tokens`` (default 1024), ``temperature``
            (default 0.7), ``verbose`` and ``start`` (default 0) are optional.

    Returns:
        List[Dict] | str: The combined text, or a list of
        ``{"metadata": {"title", "page"}, "content"}`` dicts whose page
        numbers are ``start`` plus the 1-based page index.

    Raises:
        ValueError: If ``TOGETHER_API_KEY`` is not set.
        requests.exceptions.HTTPError: If the API returns an error status.
    """
    api_key = os.environ.get("TOGETHER_API_KEY")
    if not api_key:
        raise ValueError("TOGETHER_API_KEY environment variable is not set")

    url = "https://api.together.xyz/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    mime_type, _ = mimetypes.guess_type(path)
    if mime_type and mime_type.startswith("image"):
        with open(path, "rb") as img_file:
            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
        images = [(0, f"data:{mime_type};base64,{image_base64}")]
    else:
        pdf_document = pdfium.PdfDocument(path)
        try:
            images = [
                (
                    page_num,
                    f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
                )
                for page_num in range(len(pdf_document))
            ]
        finally:
            # All pages are rendered at this point; release the PDF handle
            # instead of leaving it open until garbage collection.
            pdf_document.close()

    # ``images`` is already in page order, so the texts are collected in
    # page order as well.
    all_texts = []
    for page_num, image_url in images:
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": LLAMA_PARSER_PROMPT},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ]

        payload = {
            "model": kwargs["model"],
            "messages": messages,
            "max_tokens": kwargs.get("max_tokens", 1024),
            "temperature": kwargs.get("temperature", 0.7),
        }

        response = requests.post(url, json=payload, headers=headers)
        response.raise_for_status()
        response_data = response.json()

        page_text = response_data["choices"][0]["message"]["content"]
        if kwargs.get("verbose", None):
            logger.debug(f"Page {page_num + 1} response: {page_text}")

        # Keep only what is between the output tags when they are present;
        # otherwise keep the full response text.
        result = page_text
        if "<output>" in page_text:
            result = page_text.split("<output>")[1].strip()
        if "</output>" in result:
            result = result.split("</output>")[0].strip()
        all_texts.append(result)

    if raw:
        return "<page-break>".join(all_texts)

    return [
        {
            "metadata": {
                "title": kwargs["title"],
                "page": kwargs.get("start", 0) + page_no,
            },
            "content": page,
        }
        for page_no, page in enumerate(all_texts, start=1)
    ]
|
235
|
+
|
236
|
+
|
237
|
+
def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
|
238
|
+
"""
|
239
|
+
Parse documents (PDFs or images) using various vision model APIs.
|
240
|
+
|
241
|
+
Args:
|
242
|
+
path (str): Path to the document to parse
|
243
|
+
raw (bool): If True, return raw text; if False, return structured data
|
244
|
+
api (str): Which API to use ("openai" or "huggingface")
|
245
|
+
**kwargs: Additional arguments including model, temperature, title, etc.
|
246
|
+
|
247
|
+
Returns:
|
248
|
+
List[Dict] | str: Parsed content either as raw text or structured data
|
249
|
+
"""
|
250
|
+
# Initialize appropriate client
|
251
|
+
clients = {
|
252
|
+
"openai": lambda: OpenAI(),
|
253
|
+
"huggingface": lambda: InferenceClient(
|
254
|
+
token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
|
255
|
+
),
|
256
|
+
}
|
257
|
+
assert api in clients, f"Unsupported API: {api}"
|
258
|
+
logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
|
259
|
+
client = clients[api]()
|
260
|
+
|
261
|
+
# Handle different input types
|
262
|
+
mime_type, _ = mimetypes.guess_type(path)
|
263
|
+
if mime_type and mime_type.startswith("image"):
|
264
|
+
# Single image processing
|
265
|
+
with open(path, "rb") as img_file:
|
266
|
+
image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
|
267
|
+
images = [(0, f"data:{mime_type};base64,{image_base64}")]
|
268
|
+
else:
|
269
|
+
# PDF processing
|
270
|
+
pdf_document = pdfium.PdfDocument(path)
|
271
|
+
images = [
|
272
|
+
(
|
273
|
+
page_num,
|
274
|
+
f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
|
275
|
+
)
|
276
|
+
for page_num in range(len(pdf_document))
|
277
|
+
]
|
278
|
+
|
279
|
+
# API-specific message formatting
|
280
|
+
def get_messages(page_num: int, image_url: str) -> List[Dict]:
|
281
|
+
base_message = {
|
282
|
+
"type": "text",
|
283
|
+
"text": LLAMA_PARSER_PROMPT,
|
284
|
+
}
|
285
|
+
image_message = {
|
286
|
+
"type": "image_url",
|
287
|
+
"image_url": {"url": image_url},
|
288
|
+
}
|
289
|
+
|
290
|
+
if api == "openai":
|
291
|
+
return [
|
292
|
+
{
|
293
|
+
"role": "system",
|
294
|
+
"content": PARSER_PROMPT.format(
|
295
|
+
custom_instructions=INSTRUCTIONS_ADD_PG_BREAK
|
296
|
+
),
|
297
|
+
},
|
298
|
+
{
|
299
|
+
"role": "user",
|
300
|
+
"content": [
|
301
|
+
{
|
302
|
+
"type": "text",
|
303
|
+
"text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
|
304
|
+
},
|
305
|
+
image_message,
|
306
|
+
],
|
307
|
+
},
|
308
|
+
]
|
309
|
+
else:
|
310
|
+
return [
|
311
|
+
{
|
312
|
+
"role": "user",
|
313
|
+
"content": [base_message, image_message],
|
314
|
+
}
|
315
|
+
]
|
316
|
+
|
317
|
+
# Process each page/image
|
318
|
+
all_results = []
|
319
|
+
for page_num, image_url in images:
|
320
|
+
messages = get_messages(page_num, image_url)
|
321
|
+
|
322
|
+
# Common completion parameters
|
323
|
+
completion_params = {
|
324
|
+
"model": kwargs["model"],
|
325
|
+
"messages": messages,
|
326
|
+
"max_tokens": kwargs.get("max_tokens", 1024),
|
327
|
+
"temperature": kwargs.get("temperature", 0.7),
|
328
|
+
}
|
329
|
+
|
330
|
+
# Get completion from selected API
|
331
|
+
response = client.chat.completions.create(**completion_params)
|
332
|
+
|
333
|
+
# Extract the response text
|
334
|
+
page_text = response.choices[0].message.content
|
335
|
+
if kwargs.get("verbose", None):
|
336
|
+
logger.debug(f"Page {page_num + 1} response: {page_text}")
|
337
|
+
|
338
|
+
# Extract content between output tags if present
|
339
|
+
result = page_text
|
340
|
+
if "<output>" in page_text:
|
341
|
+
result = page_text.split("<output>")[1].strip()
|
342
|
+
if "</output>" in result:
|
343
|
+
result = result.split("</output>")[0].strip()
|
344
|
+
all_results.append((page_num, result))
|
345
|
+
|
346
|
+
# Sort results by page number and combine
|
347
|
+
all_results.sort(key=lambda x: x[0])
|
348
|
+
all_texts = [text for _, text in all_results]
|
349
|
+
combined_text = "<page-break>".join(all_texts)
|
350
|
+
|
351
|
+
if raw:
|
352
|
+
return combined_text
|
353
|
+
|
354
|
+
return [
|
355
|
+
{
|
356
|
+
"metadata": {
|
357
|
+
"title": kwargs["title"],
|
358
|
+
"page": kwargs.get("start", 0) + page_no,
|
359
|
+
},
|
360
|
+
"content": page,
|
361
|
+
}
|
362
|
+
for page_no, page in enumerate(all_texts, start=1)
|
363
|
+
]
|
@@ -89,15 +89,21 @@ def process_table(table) -> str:
|
|
89
89
|
|
90
90
|
# Convert to DataFrame and handle empty cells
|
91
91
|
df = pd.DataFrame(table_data)
|
92
|
+
df.replace("", pd.NA, inplace=True)
|
93
|
+
df = df.dropna(how="all", axis=0)
|
94
|
+
df = df.dropna(how="all", axis=1)
|
92
95
|
df = df.fillna("")
|
96
|
+
if len(df) == 0:
|
97
|
+
return ""
|
93
98
|
|
94
99
|
# Use first row as header and clean it up
|
95
100
|
df.columns = df.iloc[0]
|
96
|
-
df = df.drop(0)
|
101
|
+
df = df.drop(df.index[0])
|
102
|
+
df.replace(r"\n", "<br>", regex=True, inplace=True)
|
97
103
|
|
98
104
|
# Convert to markdown with some formatting options
|
99
105
|
markdown_table = df.to_markdown(index=False, tablefmt="pipe")
|
100
|
-
return f"\n{markdown_table}\n\n"
|
106
|
+
return f"\n{markdown_table}\n\n"
|
101
107
|
|
102
108
|
|
103
109
|
def embed_links_in_text(page, text, links):
|
@@ -157,8 +163,20 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
157
163
|
x_tolerance = kwargs.get("x_tolerance", 1)
|
158
164
|
y_tolerance = kwargs.get("y_tolerance", 5)
|
159
165
|
|
160
|
-
#
|
161
|
-
|
166
|
+
# Table settings
|
167
|
+
vertical_strategy = kwargs.get("vertical_strategy", "lines")
|
168
|
+
horizontal_strategy = kwargs.get("horizontal_strategy", "lines")
|
169
|
+
snap_x_tolerance = kwargs.get("snap_x_tolerance", 10)
|
170
|
+
snap_y_tolerance = kwargs.get("snap_y_tolerance", 0)
|
171
|
+
|
172
|
+
tables = page.find_tables(
|
173
|
+
table_settings={
|
174
|
+
"vertical_strategy": vertical_strategy,
|
175
|
+
"horizontal_strategy": horizontal_strategy,
|
176
|
+
"snap_x_tolerance": snap_x_tolerance,
|
177
|
+
"snap_y_tolerance": snap_y_tolerance,
|
178
|
+
}
|
179
|
+
)
|
162
180
|
table_zones = [(table.bbox, process_table(table)) for table in tables]
|
163
181
|
|
164
182
|
# Create a filtered page excluding table areas
|
@@ -171,12 +189,46 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
171
189
|
words = filtered_page.extract_words(
|
172
190
|
x_tolerance=x_tolerance,
|
173
191
|
y_tolerance=y_tolerance,
|
174
|
-
extra_attrs=["size", "top", "bottom"],
|
192
|
+
extra_attrs=["size", "top", "bottom", "fontname"],
|
175
193
|
)
|
176
194
|
|
177
|
-
def format_paragraph(
|
178
|
-
|
179
|
-
|
195
|
+
def format_paragraph(text_elements):
    """Return the paragraph's words, individually styled, as one Markdown block.

    Each element's ``"text"`` is wrapped according to its detected font
    style; the words are joined with single spaces and followed by a blank
    line.
    """
    styled_words = [
        apply_markdown_formatting(element["text"], get_text_formatting(element))
        for element in text_elements
    ]
    joined = " ".join(styled_words)
    return f"{joined}\n\n"
|
203
|
+
|
204
|
+
def get_text_formatting(word):
    """
    Detect text formatting based on font properties.

    Looks at the lower-cased ``"fontname"`` entry of ``word`` for common
    style indicators: ``bold``, ``heavy`` or ``black`` mark bold text, and
    ``italic`` or ``oblique`` mark italic text. A missing or ``None`` font
    name yields no formatting.

    Returns a dict with boolean ``"bold"`` and ``"italic"`` entries.
    """
    # ``or ""`` also covers a fontname that is present but None.
    font_name = (word.get("fontname") or "").lower()

    return {
        "bold": any(style in font_name for style in ("bold", "heavy", "black")),
        "italic": any(style in font_name for style in ("italic", "oblique")),
    }
|
222
|
+
|
223
|
+
def apply_markdown_formatting(text, formatting):
    """Apply markdown formatting to text based on detected styles.

    ``formatting`` is a dict with boolean ``"bold"`` and ``"italic"``
    entries. Bold and italic together wrap the text in ``***``, bold alone
    in ``**`` and italic alone in ``*``. Empty or whitespace-only text is
    returned unchanged so no bare markers (e.g. ``****``) are emitted.
    """
    if not text.strip():
        return text

    if formatting["bold"] and formatting["italic"]:
        marker = "***"
    elif formatting["bold"]:
        marker = "**"
    elif formatting["italic"]:
        marker = "*"
    else:
        return text
    return f"{marker}{text}{marker}"
|
180
232
|
|
181
233
|
def detect_heading_level(font_size):
|
182
234
|
if font_size >= 24:
|
@@ -205,17 +257,18 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
205
257
|
while tables and word["bottom"] > tables[0][1]["bottom"]:
|
206
258
|
content_elements.append(tables.pop(0))
|
207
259
|
content_elements.append(("word", word))
|
260
|
+
content_elements.extend(tables)
|
208
261
|
|
209
262
|
for element_type, element in content_elements:
|
210
263
|
if element_type == "table":
|
211
264
|
# If there are any pending paragraphs or headings, add them first
|
212
265
|
if current_heading:
|
213
266
|
level = detect_heading_level(current_heading[0]["size"])
|
214
|
-
heading_text =
|
215
|
-
markdown_content.append(f"{'#' * level} {heading_text}
|
267
|
+
heading_text = format_paragraph(current_heading)
|
268
|
+
markdown_content.append(f"{'#' * level} {heading_text}")
|
216
269
|
current_heading = []
|
217
270
|
if current_paragraph:
|
218
|
-
markdown_content.append(format_paragraph(
|
271
|
+
markdown_content.append(format_paragraph(current_paragraph))
|
219
272
|
current_paragraph = []
|
220
273
|
# Add the table
|
221
274
|
markdown_content.append(element["content"])
|
@@ -233,46 +286,42 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
233
286
|
# If we were collecting a heading
|
234
287
|
if current_heading:
|
235
288
|
level = detect_heading_level(current_heading[0]["size"])
|
236
|
-
heading_text =
|
237
|
-
markdown_content.append(f"{'#' * level} {heading_text}
|
289
|
+
heading_text = format_paragraph(current_heading)
|
290
|
+
markdown_content.append(f"{'#' * level} {heading_text}")
|
238
291
|
current_heading = []
|
239
292
|
|
240
293
|
# If we were collecting a paragraph
|
241
294
|
if current_paragraph:
|
242
|
-
markdown_content.append(
|
243
|
-
format_paragraph(" ".join(current_paragraph))
|
244
|
-
)
|
295
|
+
markdown_content.append(format_paragraph(current_paragraph))
|
245
296
|
current_paragraph = []
|
246
297
|
|
247
298
|
# Add word to appropriate collection
|
248
299
|
if heading_level:
|
249
300
|
if current_paragraph: # Flush any pending paragraph
|
250
|
-
markdown_content.append(
|
251
|
-
format_paragraph(" ".join(current_paragraph))
|
252
|
-
)
|
301
|
+
markdown_content.append(format_paragraph(current_paragraph))
|
253
302
|
current_paragraph = []
|
254
|
-
current_heading.append(
|
303
|
+
current_heading.append(word)
|
255
304
|
else:
|
256
305
|
if current_heading: # Flush any pending heading
|
257
306
|
level = detect_heading_level(current_heading[0]["size"])
|
258
|
-
heading_text =
|
259
|
-
markdown_content.append(f"{'#' * level} {heading_text}
|
307
|
+
heading_text = format_paragraph(current_heading)
|
308
|
+
markdown_content.append(f"{'#' * level} {heading_text}")
|
260
309
|
current_heading = []
|
261
|
-
current_paragraph.append(word
|
310
|
+
current_paragraph.append(word)
|
262
311
|
|
263
312
|
last_y = word["top"]
|
264
313
|
|
265
314
|
# Handle remaining content
|
266
315
|
if current_heading:
|
267
316
|
level = detect_heading_level(current_heading[0]["size"])
|
268
|
-
heading_text =
|
269
|
-
markdown_content.append(f"{'#' * level} {heading_text}
|
317
|
+
heading_text = format_paragraph(current_heading)
|
318
|
+
markdown_content.append(f"{'#' * level} {heading_text}")
|
270
319
|
|
271
320
|
if current_paragraph:
|
272
|
-
markdown_content.append(format_paragraph(
|
321
|
+
markdown_content.append(format_paragraph(current_paragraph))
|
273
322
|
|
274
323
|
# Process links for the page
|
275
|
-
content = "".join(markdown_content)
|
324
|
+
content = "".join(markdown_content)
|
276
325
|
if page.annots:
|
277
326
|
links = []
|
278
327
|
for annot in page.annots:
|
@@ -283,6 +332,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
283
332
|
if links:
|
284
333
|
content = embed_links_in_text(page, content, links)
|
285
334
|
|
335
|
+
# Remove redundant formatting
|
336
|
+
content = content.replace("** **", " ").replace("* *", " ")
|
337
|
+
|
286
338
|
return content
|
287
339
|
|
288
340
|
|
@@ -76,3 +76,22 @@ Ensure accurate representation of all content, including tables and visual eleme
|
|
76
76
|
"""
|
77
77
|
|
78
78
|
INSTRUCTIONS_ADD_PG_BREAK = "Insert a `<page-break>` tag between the content of each page to maintain the original page structure."
|
79
|
+
|
80
|
+
LLAMA_PARSER_PROMPT = """\
|
81
|
+
You are a document conversion assistant. Your task is to accurately reproduce the content of an image in Markdown and HTML format, maintaining the visual structure and layout of the original document as closely as possible.
|
82
|
+
|
83
|
+
Instructions:
|
84
|
+
1. Use a combination of Markdown and HTML to replicate the document's layout and formatting.
|
85
|
+
2. Reproduce all text content exactly as it appears, including preserving capitalization, punctuation, and any apparent errors or inconsistencies in the original.
|
86
|
+
3. Use appropriate Markdown syntax for headings, emphasis (bold, italic), and lists where applicable.
|
87
|
+
4. Always use HTML (`<table>`, `<tr>`, `<td>`) to represent tabular data. Include `colspan` and `rowspan` attributes if needed.
|
88
|
+
5. For figures, graphs, or diagrams, represent them using `<img>` tags and use appropriate `alt` text.
|
89
|
+
6. For handwritten documents, reproduce the content as typed text, maintaining the original structure and layout.
|
90
|
+
7. Do not include any descriptions of the document's appearance, paper type, or writing implements used.
|
91
|
+
8. Do not add any explanatory notes, comments, or additional information outside of the converted content.
|
92
|
+
9. Ensure all special characters, symbols, and equations are accurately represented.
|
93
|
+
10. Provide the output only once, without any duplication.
|
94
|
+
11. Enclose the entire output within <output> and </output> tags.
|
95
|
+
|
96
|
+
Output the converted content directly in Markdown and HTML without any additional explanations, descriptions, or notes.
|
97
|
+
"""
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "lexoid"
|
3
|
-
version = "0.1.6"
|
3
|
+
version = "0.1.8"
|
4
4
|
description = ""
|
5
5
|
authors = []
|
6
6
|
readme = "README.md"
|
@@ -24,16 +24,15 @@ playwright = "^1.49.0"
|
|
24
24
|
docx2pdf = "^0.1.8"
|
25
25
|
python-docx = "^1.1.2"
|
26
26
|
nest-asyncio ="^1.6.0"
|
27
|
+
pyqt5 = {version = "^5.15.11", markers = "platform_system != 'debian'"}
|
28
|
+
pyqtwebengine = {version = "^5.15.7", markers = "platform_system != 'debian'"}
|
29
|
+
huggingface-hub = "^0.27.0"
|
27
30
|
|
28
31
|
[tool.poetry.group.dev.dependencies]
|
29
32
|
ipykernel = "^6.29.5"
|
30
33
|
pytest-asyncio = "^0.23.8"
|
31
34
|
pytest = "^8.3.2"
|
32
35
|
|
33
|
-
[tool.poetry.group.qt5.dependencies]
|
34
|
-
pyqt5 = "^5.15.11"
|
35
|
-
pyqtwebengine = "^5.15.7"
|
36
|
-
|
37
36
|
[build-system]
|
38
|
-
requires = ["poetry-core"]
|
37
|
+
requires = ["poetry-core", "wheel"]
|
39
38
|
build-backend = "poetry.core.masonry.api"
|
@@ -1,200 +0,0 @@
|
|
1
|
-
import base64
|
2
|
-
import io
|
3
|
-
import mimetypes
|
4
|
-
import os
|
5
|
-
from typing import Dict, List
|
6
|
-
|
7
|
-
import pypdfium2 as pdfium
|
8
|
-
import requests
|
9
|
-
from lexoid.core.prompt_templates import (
|
10
|
-
INSTRUCTIONS_ADD_PG_BREAK,
|
11
|
-
OPENAI_USER_PROMPT,
|
12
|
-
PARSER_PROMPT,
|
13
|
-
)
|
14
|
-
from lexoid.core.utils import convert_image_to_pdf
|
15
|
-
from loguru import logger
|
16
|
-
from openai import OpenAI
|
17
|
-
|
18
|
-
|
19
|
-
def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
|
20
|
-
if "model" not in kwargs:
|
21
|
-
kwargs["model"] = "gemini-1.5-flash"
|
22
|
-
model = kwargs.get("model")
|
23
|
-
if model.startswith("gemini"):
|
24
|
-
return parse_with_gemini(path, raw, **kwargs)
|
25
|
-
elif model.startswith("gpt"):
|
26
|
-
return parse_with_gpt(path, raw, **kwargs)
|
27
|
-
else:
|
28
|
-
raise ValueError(f"Unsupported model: {model}")
|
29
|
-
|
30
|
-
|
31
|
-
def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
|
32
|
-
api_key = os.environ.get("GOOGLE_API_KEY")
|
33
|
-
if not api_key:
|
34
|
-
raise ValueError("GOOGLE_API_KEY environment variable is not set")
|
35
|
-
|
36
|
-
url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}"
|
37
|
-
|
38
|
-
# Check if the file is an image and convert to PDF if necessary
|
39
|
-
mime_type, _ = mimetypes.guess_type(path)
|
40
|
-
if mime_type and mime_type.startswith("image"):
|
41
|
-
pdf_content = convert_image_to_pdf(path)
|
42
|
-
mime_type = "application/pdf"
|
43
|
-
base64_file = base64.b64encode(pdf_content).decode("utf-8")
|
44
|
-
else:
|
45
|
-
with open(path, "rb") as file:
|
46
|
-
file_content = file.read()
|
47
|
-
base64_file = base64.b64encode(file_content).decode("utf-8")
|
48
|
-
|
49
|
-
# Ideally, we do this ourselves. But for now, this might be good enough.
|
50
|
-
custom_instruction = f"""- Total number of pages: {kwargs["pages_per_split_"]}. {INSTRUCTIONS_ADD_PG_BREAK}"""
|
51
|
-
if kwargs["pages_per_split_"] == 1:
|
52
|
-
custom_instruction = ""
|
53
|
-
|
54
|
-
payload = {
|
55
|
-
"contents": [
|
56
|
-
{
|
57
|
-
"parts": [
|
58
|
-
{
|
59
|
-
"text": PARSER_PROMPT.format(
|
60
|
-
custom_instructions=custom_instruction
|
61
|
-
)
|
62
|
-
},
|
63
|
-
{"inline_data": {"mime_type": mime_type, "data": base64_file}},
|
64
|
-
]
|
65
|
-
}
|
66
|
-
],
|
67
|
-
"generationConfig": {
|
68
|
-
"temperature": kwargs.get("temperature", 0.7),
|
69
|
-
},
|
70
|
-
}
|
71
|
-
|
72
|
-
headers = {"Content-Type": "application/json"}
|
73
|
-
|
74
|
-
response = requests.post(url, json=payload, headers=headers)
|
75
|
-
response.raise_for_status()
|
76
|
-
|
77
|
-
result = response.json()
|
78
|
-
|
79
|
-
raw_text = "".join(
|
80
|
-
part["text"]
|
81
|
-
for candidate in result.get("candidates", [])
|
82
|
-
for part in candidate.get("content", {}).get("parts", [])
|
83
|
-
if "text" in part
|
84
|
-
)
|
85
|
-
|
86
|
-
result = ""
|
87
|
-
if "<output>" in raw_text:
|
88
|
-
result = raw_text.split("<output>")[1].strip()
|
89
|
-
if "</output>" in result:
|
90
|
-
result = result.split("</output>")[0].strip()
|
91
|
-
|
92
|
-
if raw:
|
93
|
-
return result
|
94
|
-
|
95
|
-
return [
|
96
|
-
{
|
97
|
-
"metadata": {
|
98
|
-
"title": kwargs["title"],
|
99
|
-
"page": kwargs.get("start", 0) + page_no,
|
100
|
-
},
|
101
|
-
"content": page,
|
102
|
-
}
|
103
|
-
for page_no, page in enumerate(result.split("<page-break>"), start=1)
|
104
|
-
if page.strip()
|
105
|
-
]
|
106
|
-
|
107
|
-
|
108
|
-
def convert_pdf_page_to_base64(
|
109
|
-
pdf_document: pdfium.PdfDocument, page_number: int
|
110
|
-
) -> str:
|
111
|
-
"""Convert a PDF page to a base64-encoded PNG string."""
|
112
|
-
page = pdf_document[page_number]
|
113
|
-
# Render with 4x scaling for better quality
|
114
|
-
pil_image = page.render(scale=4).to_pil()
|
115
|
-
|
116
|
-
# Convert to base64
|
117
|
-
img_byte_arr = io.BytesIO()
|
118
|
-
pil_image.save(img_byte_arr, format="PNG")
|
119
|
-
img_byte_arr.seek(0)
|
120
|
-
return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
|
121
|
-
|
122
|
-
|
123
|
-
def parse_with_gpt(path: str, raw: bool, **kwargs) -> List[Dict] | str:
|
124
|
-
client = OpenAI()
|
125
|
-
|
126
|
-
# Handle different input types
|
127
|
-
mime_type, _ = mimetypes.guess_type(path)
|
128
|
-
if mime_type and mime_type.startswith("image"):
|
129
|
-
# Single image processing
|
130
|
-
with open(path, "rb") as img_file:
|
131
|
-
image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
|
132
|
-
images = [(0, image_base64)]
|
133
|
-
else:
|
134
|
-
# PDF processing
|
135
|
-
pdf_document = pdfium.PdfDocument(path)
|
136
|
-
images = [
|
137
|
-
(page_num, convert_pdf_page_to_base64(pdf_document, page_num))
|
138
|
-
for page_num in range(len(pdf_document))
|
139
|
-
]
|
140
|
-
|
141
|
-
# Process each page/image
|
142
|
-
all_results = []
|
143
|
-
for page_num, image_base64 in images:
|
144
|
-
messages = [
|
145
|
-
{
|
146
|
-
"role": "system",
|
147
|
-
"content": PARSER_PROMPT,
|
148
|
-
},
|
149
|
-
{
|
150
|
-
"role": "user",
|
151
|
-
"content": [
|
152
|
-
{
|
153
|
-
"type": "text",
|
154
|
-
"text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
|
155
|
-
},
|
156
|
-
{
|
157
|
-
"type": "image_url",
|
158
|
-
"image_url": {"url": f"data:image/png;base64,{image_base64}"},
|
159
|
-
},
|
160
|
-
],
|
161
|
-
},
|
162
|
-
]
|
163
|
-
|
164
|
-
# Get completion from GPT-4 Vision
|
165
|
-
response = client.chat.completions.create(
|
166
|
-
model=kwargs["model"],
|
167
|
-
temperature=kwargs.get("temperature", 0.7),
|
168
|
-
messages=messages,
|
169
|
-
)
|
170
|
-
|
171
|
-
# Extract the response text
|
172
|
-
page_text = response.choices[0].message.content
|
173
|
-
if kwargs.get("verbose", None):
|
174
|
-
logger.debug(f"Page {page_num + 1} response: {page_text}")
|
175
|
-
result = ""
|
176
|
-
if "<output>" in page_text:
|
177
|
-
result = page_text.split("<output>")[1].strip()
|
178
|
-
if "</output>" in result:
|
179
|
-
result = result.split("</output>")[0].strip()
|
180
|
-
all_results.append((page_num, result))
|
181
|
-
|
182
|
-
# Sort results by page number and combine
|
183
|
-
all_results.sort(key=lambda x: x[0])
|
184
|
-
all_texts = [text for _, text in all_results]
|
185
|
-
combined_text = "<page-break>".join(all_texts)
|
186
|
-
|
187
|
-
if raw:
|
188
|
-
return combined_text
|
189
|
-
|
190
|
-
return [
|
191
|
-
{
|
192
|
-
"metadata": {
|
193
|
-
"title": kwargs["title"],
|
194
|
-
"page": kwargs.get("start", 0) + page_no,
|
195
|
-
},
|
196
|
-
"content": page,
|
197
|
-
}
|
198
|
-
for page_no, page in enumerate(all_texts, start=1)
|
199
|
-
if page.strip()
|
200
|
-
]
|
File without changes
|
File without changes
|
File without changes
|