lexoid 0.1.13__tar.gz → 0.1.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lexoid-0.1.13 → lexoid-0.1.14}/PKG-INFO +25 -20
- {lexoid-0.1.13 → lexoid-0.1.14}/README.md +24 -19
- {lexoid-0.1.13 → lexoid-0.1.14}/lexoid/api.py +110 -38
- {lexoid-0.1.13 → lexoid-0.1.14}/lexoid/core/parse_type/llm_parser.py +144 -66
- {lexoid-0.1.13 → lexoid-0.1.14}/lexoid/core/prompt_templates.py +2 -1
- {lexoid-0.1.13 → lexoid-0.1.14}/lexoid/core/utils.py +15 -7
- {lexoid-0.1.13 → lexoid-0.1.14}/pyproject.toml +1 -1
- {lexoid-0.1.13 → lexoid-0.1.14}/LICENSE +0 -0
- {lexoid-0.1.13 → lexoid-0.1.14}/lexoid/core/parse_type/static_parser.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: lexoid
|
3
|
-
Version: 0.1.13
|
3
|
+
Version: 0.1.14
|
4
4
|
Summary:
|
5
5
|
Requires-Python: >=3.10,<4.0
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
@@ -49,7 +49,8 @@ Description-Content-Type: text/markdown
|
|
49
49
|
</div>
|
50
50
|
|
51
51
|
[](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
|
52
|
-
[](https://huggingface.co/spaces/oidlabs/Lexoid)
|
53
|
+
[](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
|
53
54
|
[](https://pypi.org/project/lexoid/)
|
54
55
|
[](https://oidlabs-com.github.io/Lexoid/)
|
55
56
|
|
@@ -144,6 +145,7 @@ print(parsed_md)
|
|
144
145
|
* Hugging Face
|
145
146
|
* Together AI
|
146
147
|
* OpenRouter
|
148
|
+
* Fireworks
|
147
149
|
|
148
150
|
## Benchmark
|
149
151
|
|
@@ -151,22 +153,25 @@ Results aggregated across 5 iterations each for 5 documents.
|
|
151
153
|
|
152
154
|
_Note:_ Benchmarks are currently done in the zero-shot setting.
|
153
155
|
|
154
|
-
| Rank | Model
|
155
|
-
|
|
156
|
-
| 1
|
157
|
-
| 2
|
158
|
-
| 3
|
159
|
-
| 4
|
160
|
-
| 5
|
161
|
-
| 6
|
162
|
-
| 7
|
163
|
-
| 8
|
164
|
-
| 9
|
165
|
-
| 10
|
166
|
-
| 11
|
167
|
-
| 12
|
168
|
-
| 13
|
169
|
-
| 14
|
170
|
-
| 15
|
171
|
-
| 16
|
156
|
+
| Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
|
157
|
+
| --- | --- | --- | --- | --- | --- |
|
158
|
+
| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
|
159
|
+
| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
|
160
|
+
| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
|
161
|
+
| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
|
162
|
+
| 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
|
163
|
+
| 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
|
164
|
+
| 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
|
165
|
+
| 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
|
166
|
+
| 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
|
167
|
+
| 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
|
168
|
+
| 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
|
169
|
+
| 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
|
170
|
+
| 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
|
171
|
+
| 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
|
172
|
+
| 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
|
173
|
+
| 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
|
174
|
+
| 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
|
175
|
+
| 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
|
176
|
+
| 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
|
172
177
|
|
@@ -14,7 +14,8 @@
|
|
14
14
|
</div>
|
15
15
|
|
16
16
|
[](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
|
17
|
-
[](https://huggingface.co/spaces/oidlabs/Lexoid)
|
18
|
+
[](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
|
18
19
|
[](https://pypi.org/project/lexoid/)
|
19
20
|
[](https://oidlabs-com.github.io/Lexoid/)
|
20
21
|
|
@@ -109,6 +110,7 @@ print(parsed_md)
|
|
109
110
|
* Hugging Face
|
110
111
|
* Together AI
|
111
112
|
* OpenRouter
|
113
|
+
* Fireworks
|
112
114
|
|
113
115
|
## Benchmark
|
114
116
|
|
@@ -116,21 +118,24 @@ Results aggregated across 5 iterations each for 5 documents.
|
|
116
118
|
|
117
119
|
_Note:_ Benchmarks are currently done in the zero-shot setting.
|
118
120
|
|
119
|
-
| Rank | Model
|
120
|
-
|
|
121
|
-
| 1
|
122
|
-
| 2
|
123
|
-
| 3
|
124
|
-
| 4
|
125
|
-
| 5
|
126
|
-
| 6
|
127
|
-
| 7
|
128
|
-
| 8
|
129
|
-
| 9
|
130
|
-
| 10
|
131
|
-
| 11
|
132
|
-
| 12
|
133
|
-
| 13
|
134
|
-
| 14
|
135
|
-
| 15
|
136
|
-
| 16
|
121
|
+
| Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
|
122
|
+
| --- | --- | --- | --- | --- | --- |
|
123
|
+
| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
|
124
|
+
| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
|
125
|
+
| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
|
126
|
+
| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
|
127
|
+
| 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
|
128
|
+
| 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
|
129
|
+
| 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
|
130
|
+
| 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
|
131
|
+
| 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
|
132
|
+
| 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
|
133
|
+
| 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
|
134
|
+
| 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
|
135
|
+
| 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
|
136
|
+
| 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
|
137
|
+
| 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
|
138
|
+
| 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
|
139
|
+
| 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
|
140
|
+
| 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
|
141
|
+
| 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
|
@@ -10,7 +10,11 @@ from typing import Union, Dict, List
|
|
10
10
|
|
11
11
|
from loguru import logger
|
12
12
|
|
13
|
-
from lexoid.core.parse_type.llm_parser import
|
13
|
+
from lexoid.core.parse_type.llm_parser import (
|
14
|
+
parse_llm_doc,
|
15
|
+
create_response,
|
16
|
+
convert_doc_to_base64_images,
|
17
|
+
)
|
14
18
|
from lexoid.core.parse_type.static_parser import parse_static_doc
|
15
19
|
from lexoid.core.utils import (
|
16
20
|
convert_to_pdf,
|
@@ -49,6 +53,7 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
|
|
49
53
|
- parent_title: Title of parent doc if recursively parsed
|
50
54
|
- recursive_docs: List of dictionaries for recursively parsed documents
|
51
55
|
- token_usage: Dictionary containing token usage statistics
|
56
|
+
- parser_used: Which parser was actually used
|
52
57
|
"""
|
53
58
|
if parser_type == ParserType.AUTO:
|
54
59
|
router_priority = kwargs.get("router_priority", "speed")
|
@@ -60,10 +65,13 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
|
|
60
65
|
)
|
61
66
|
if parser_type == ParserType.STATIC_PARSE:
|
62
67
|
logger.debug("Using static parser")
|
63
|
-
|
68
|
+
result = parse_static_doc(path, **kwargs)
|
64
69
|
else:
|
65
70
|
logger.debug("Using LLM parser")
|
66
|
-
|
71
|
+
result = parse_llm_doc(path, **kwargs)
|
72
|
+
|
73
|
+
result["parser_used"] = parser_type
|
74
|
+
return result
|
67
75
|
|
68
76
|
|
69
77
|
def parse_chunk_list(
|
@@ -82,15 +90,18 @@ def parse_chunk_list(
|
|
82
90
|
"""
|
83
91
|
combined_segments = []
|
84
92
|
raw_texts = []
|
85
|
-
token_usage = {"input": 0, "output": 0, "
|
93
|
+
token_usage = {"input": 0, "output": 0, "llm_page_count": 0}
|
86
94
|
for file_path in file_paths:
|
87
95
|
result = parse_chunk(file_path, parser_type, **kwargs)
|
88
96
|
combined_segments.extend(result["segments"])
|
89
97
|
raw_texts.append(result["raw"])
|
90
|
-
if
|
98
|
+
if (
|
99
|
+
result.get("parser_used") == ParserType.LLM_PARSE
|
100
|
+
and "token_usage" in result
|
101
|
+
):
|
91
102
|
token_usage["input"] += result["token_usage"]["input"]
|
92
103
|
token_usage["output"] += result["token_usage"]["output"]
|
93
|
-
token_usage["
|
104
|
+
token_usage["llm_page_count"] += len(result["segments"])
|
94
105
|
token_usage["total"] = token_usage["input"] + token_usage["output"]
|
95
106
|
|
96
107
|
return {
|
@@ -136,7 +147,7 @@ def parse(
|
|
136
147
|
as_pdf = kwargs.get("as_pdf", False)
|
137
148
|
depth = kwargs.get("depth", 1)
|
138
149
|
|
139
|
-
if type(parser_type)
|
150
|
+
if type(parser_type) is str:
|
140
151
|
parser_type = ParserType[parser_type]
|
141
152
|
if (
|
142
153
|
path.lower().endswith((".doc", ".docx"))
|
@@ -184,7 +195,7 @@ def parse(
|
|
184
195
|
|
185
196
|
if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
|
186
197
|
kwargs["split"] = False
|
187
|
-
result =
|
198
|
+
result = parse_chunk_list([path], parser_type, kwargs)
|
188
199
|
else:
|
189
200
|
kwargs["split"] = True
|
190
201
|
split_dir = os.path.join(temp_dir, "splits/")
|
@@ -219,42 +230,43 @@ def parse(
|
|
219
230
|
"token_usage": {
|
220
231
|
"input": sum(r["token_usage"]["input"] for r in chunk_results),
|
221
232
|
"output": sum(r["token_usage"]["output"] for r in chunk_results),
|
222
|
-
"
|
223
|
-
r["token_usage"]["
|
233
|
+
"llm_page_count": sum(
|
234
|
+
r["token_usage"]["llm_page_count"] for r in chunk_results
|
224
235
|
),
|
225
236
|
"total": sum(r["token_usage"]["total"] for r in chunk_results),
|
226
237
|
},
|
227
238
|
}
|
228
239
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
240
|
+
if "api_cost_mapping" in kwargs and "token_usage" in result:
|
241
|
+
api_cost_mapping = kwargs["api_cost_mapping"]
|
242
|
+
if isinstance(api_cost_mapping, dict):
|
243
|
+
api_cost_mapping = api_cost_mapping
|
244
|
+
elif isinstance(api_cost_mapping, str) and os.path.exists(api_cost_mapping):
|
245
|
+
with open(api_cost_mapping, "r") as f:
|
246
|
+
api_cost_mapping = json.load(f)
|
247
|
+
else:
|
248
|
+
raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
|
249
|
+
|
250
|
+
api_cost = api_cost_mapping.get(
|
251
|
+
kwargs.get("model", "gemini-2.0-flash"), None
|
252
|
+
)
|
253
|
+
if api_cost:
|
254
|
+
token_usage = result["token_usage"]
|
255
|
+
token_cost = {
|
256
|
+
"input": token_usage["input"] * api_cost["input"] / 1_000_000,
|
257
|
+
"input-image": api_cost.get("input-image", 0)
|
258
|
+
* token_usage.get("llm_page_count", 0),
|
259
|
+
"output": token_usage["output"] * api_cost["output"] / 1_000_000,
|
260
|
+
}
|
261
|
+
token_cost["total"] = (
|
262
|
+
token_cost["input"]
|
263
|
+
+ token_cost["input-image"]
|
264
|
+
+ token_cost["output"]
|
243
265
|
)
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
+ api_cost.get("input-image", 0) * token_usage["image_count"],
|
249
|
-
"output": token_usage["output"]
|
250
|
-
* api_cost["output"]
|
251
|
-
/ 1_000_000,
|
252
|
-
}
|
253
|
-
token_cost["total"] = token_cost["input"] + token_cost["output"]
|
254
|
-
result["token_cost"] = token_cost
|
255
|
-
|
256
|
-
if as_pdf:
|
257
|
-
result["pdf_path"] = path
|
266
|
+
result["token_cost"] = token_cost
|
267
|
+
|
268
|
+
if as_pdf:
|
269
|
+
result["pdf_path"] = path
|
258
270
|
|
259
271
|
if depth > 1:
|
260
272
|
recursive_docs = []
|
@@ -285,3 +297,63 @@ def parse(
|
|
285
297
|
result["recursive_docs"] = recursive_docs
|
286
298
|
|
287
299
|
return result
|
300
|
+
|
301
|
+
|
302
|
+
def parse_with_schema(
|
303
|
+
path: str, schema: Dict, api: str = "openai", model: str = "gpt-4o-mini", **kwargs
|
304
|
+
) -> List[List[Dict]]:
|
305
|
+
"""
|
306
|
+
Parses a PDF using an LLM to generate structured output conforming to a given JSON schema.
|
307
|
+
|
308
|
+
Args:
|
309
|
+
path (str): Path to the PDF file.
|
310
|
+
schema (Dict): JSON schema to which the parsed output should conform.
|
311
|
+
api (str, optional): LLM API provider (One of "openai", "huggingface", "together", "openrouter", and "fireworks").
|
312
|
+
model (str, optional): LLM model name.
|
313
|
+
**kwargs: Additional arguments for the parser (e.g.: temperature, max_tokens).
|
314
|
+
|
315
|
+
Returns:
|
316
|
+
List[List[Dict]]: List of dictionaries for each page, each conforming to the provided schema.
|
317
|
+
"""
|
318
|
+
system_prompt = f"""
|
319
|
+
The output should be formatted as a JSON instance that conforms to the JSON schema below.
|
320
|
+
|
321
|
+
As an example, for the schema {{
|
322
|
+
"properties": {{
|
323
|
+
"foo": {{
|
324
|
+
"title": "Foo",
|
325
|
+
"description": "a list of strings",
|
326
|
+
"type": "array",
|
327
|
+
"items": {{"type": "string"}}
|
328
|
+
}}
|
329
|
+
}},
|
330
|
+
"required": ["foo"]
|
331
|
+
}}, the object {{"foo": ["bar", "baz"]}} is valid. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not.
|
332
|
+
|
333
|
+
Here is the output schema:
|
334
|
+
{json.dumps(schema, indent=2)}
|
335
|
+
|
336
|
+
"""
|
337
|
+
|
338
|
+
user_prompt = "You are an AI agent that parses documents and returns them in the specified JSON format. Please parse the document and return it in the required format."
|
339
|
+
|
340
|
+
responses = []
|
341
|
+
images = convert_doc_to_base64_images(path)
|
342
|
+
for i, (page_num, image) in enumerate(images):
|
343
|
+
resp_dict = create_response(
|
344
|
+
api=api,
|
345
|
+
model=model,
|
346
|
+
user_prompt=user_prompt,
|
347
|
+
system_prompt=system_prompt,
|
348
|
+
image_url=image,
|
349
|
+
temperature=kwargs.get("temperature", 0.0),
|
350
|
+
max_tokens=kwargs.get("max_tokens", 1024),
|
351
|
+
)
|
352
|
+
|
353
|
+
response = resp_dict.get("response", "")
|
354
|
+
response = response.split("```json")[-1].split("```")[0].strip()
|
355
|
+
logger.debug(f"Processing page {page_num + 1} with response: {response}")
|
356
|
+
new_dict = json.loads(response)
|
357
|
+
responses.append(new_dict)
|
358
|
+
|
359
|
+
return responses
|
@@ -3,23 +3,24 @@ import io
|
|
3
3
|
import mimetypes
|
4
4
|
import os
|
5
5
|
import time
|
6
|
+
from functools import wraps
|
7
|
+
from typing import Dict, List, Optional, Tuple
|
8
|
+
|
6
9
|
import pypdfium2 as pdfium
|
7
10
|
import requests
|
8
|
-
from
|
11
|
+
from huggingface_hub import InferenceClient
|
12
|
+
from loguru import logger
|
13
|
+
from openai import OpenAI
|
9
14
|
from requests.exceptions import HTTPError
|
10
|
-
from
|
15
|
+
from together import Together
|
11
16
|
|
12
17
|
from lexoid.core.prompt_templates import (
|
13
18
|
INSTRUCTIONS_ADD_PG_BREAK,
|
19
|
+
LLAMA_PARSER_PROMPT,
|
14
20
|
OPENAI_USER_PROMPT,
|
15
21
|
PARSER_PROMPT,
|
16
|
-
LLAMA_PARSER_PROMPT,
|
17
22
|
)
|
18
23
|
from lexoid.core.utils import convert_image_to_pdf
|
19
|
-
from loguru import logger
|
20
|
-
from openai import OpenAI
|
21
|
-
from together import Together
|
22
|
-
from huggingface_hub import InferenceClient
|
23
24
|
|
24
25
|
|
25
26
|
def retry_on_http_error(func):
|
@@ -65,10 +66,13 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
|
|
65
66
|
return parse_with_api(path, api="huggingface", **kwargs)
|
66
67
|
if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
|
67
68
|
return parse_with_api(path, api="openrouter", **kwargs)
|
69
|
+
if model.startswith("accounts/fireworks"):
|
70
|
+
return parse_with_api(path, api="fireworks", **kwargs)
|
68
71
|
raise ValueError(f"Unsupported model: {model}")
|
69
72
|
|
70
73
|
|
71
74
|
def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
75
|
+
logger.debug(f"Parsing with Gemini API and model {kwargs['model']}")
|
72
76
|
api_key = os.environ.get("GOOGLE_API_KEY")
|
73
77
|
if not api_key:
|
74
78
|
raise ValueError("GOOGLE_API_KEY environment variable is not set")
|
@@ -105,7 +109,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
|
105
109
|
}
|
106
110
|
],
|
107
111
|
"generationConfig": {
|
108
|
-
"temperature": kwargs.get("temperature", 0.
|
112
|
+
"temperature": kwargs.get("temperature", 0.2),
|
109
113
|
},
|
110
114
|
}
|
111
115
|
|
@@ -127,7 +131,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
|
127
131
|
|
128
132
|
combined_text = ""
|
129
133
|
if "<output>" in raw_text:
|
130
|
-
combined_text = raw_text.split("<output>")[1].strip()
|
134
|
+
combined_text = raw_text.split("<output>")[-1].strip()
|
131
135
|
if "</output>" in result:
|
132
136
|
combined_text = result.split("</output>")[0].strip()
|
133
137
|
|
@@ -169,18 +173,54 @@ def convert_pdf_page_to_base64(
|
|
169
173
|
return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
|
170
174
|
|
171
175
|
|
172
|
-
def
|
173
|
-
|
174
|
-
|
176
|
+
def get_messages(
|
177
|
+
system_prompt: Optional[str], user_prompt: Optional[str], image_url: Optional[str]
|
178
|
+
) -> List[Dict]:
|
179
|
+
messages = []
|
180
|
+
if system_prompt:
|
181
|
+
messages.append(
|
182
|
+
{
|
183
|
+
"role": "system",
|
184
|
+
"content": system_prompt,
|
185
|
+
}
|
186
|
+
)
|
187
|
+
base_message = (
|
188
|
+
[
|
189
|
+
{"type": "text", "text": user_prompt},
|
190
|
+
]
|
191
|
+
if user_prompt
|
192
|
+
else []
|
193
|
+
)
|
194
|
+
image_message = (
|
195
|
+
[
|
196
|
+
{
|
197
|
+
"type": "image_url",
|
198
|
+
"image_url": {"url": image_url},
|
199
|
+
}
|
200
|
+
]
|
201
|
+
if image_url
|
202
|
+
else []
|
203
|
+
)
|
175
204
|
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
205
|
+
messages.append(
|
206
|
+
{
|
207
|
+
"role": "user",
|
208
|
+
"content": base_message + image_message,
|
209
|
+
}
|
210
|
+
)
|
180
211
|
|
181
|
-
|
182
|
-
|
183
|
-
|
212
|
+
return messages
|
213
|
+
|
214
|
+
|
215
|
+
def create_response(
|
216
|
+
api: str,
|
217
|
+
model: str,
|
218
|
+
system_prompt: Optional[str] = None,
|
219
|
+
user_prompt: Optional[str] = None,
|
220
|
+
image_url: Optional[str] = None,
|
221
|
+
temperature: float = 0.2,
|
222
|
+
max_tokens: int = 1024,
|
223
|
+
) -> Dict:
|
184
224
|
# Initialize appropriate client
|
185
225
|
clients = {
|
186
226
|
"openai": lambda: OpenAI(),
|
@@ -192,11 +232,52 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
|
|
192
232
|
base_url="https://openrouter.ai/api/v1",
|
193
233
|
api_key=os.environ["OPENROUTER_API_KEY"],
|
194
234
|
),
|
235
|
+
"fireworks": lambda: OpenAI(
|
236
|
+
base_url="https://api.fireworks.ai/inference/v1",
|
237
|
+
api_key=os.environ["FIREWORKS_API_KEY"],
|
238
|
+
),
|
195
239
|
}
|
196
240
|
assert api in clients, f"Unsupported API: {api}"
|
197
|
-
logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
|
198
241
|
client = clients[api]()
|
199
242
|
|
243
|
+
# Prepare messages for the API call
|
244
|
+
messages = get_messages(system_prompt, user_prompt, image_url)
|
245
|
+
|
246
|
+
# Common completion parameters
|
247
|
+
completion_params = {
|
248
|
+
"model": model,
|
249
|
+
"messages": messages,
|
250
|
+
"max_tokens": max_tokens,
|
251
|
+
"temperature": temperature,
|
252
|
+
}
|
253
|
+
|
254
|
+
# Get completion from selected API
|
255
|
+
response = client.chat.completions.create(**completion_params)
|
256
|
+
token_usage = response.usage
|
257
|
+
|
258
|
+
# Extract the response text
|
259
|
+
page_text = response.choices[0].message.content
|
260
|
+
|
261
|
+
return {
|
262
|
+
"response": page_text,
|
263
|
+
"usage": token_usage,
|
264
|
+
}
|
265
|
+
|
266
|
+
|
267
|
+
def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
|
268
|
+
"""
|
269
|
+
Parse documents (PDFs or images) using various vision model APIs.
|
270
|
+
|
271
|
+
Args:
|
272
|
+
path (str): Path to the document to parse
|
273
|
+
api (str): Which API to use ("openai", "huggingface", or "together")
|
274
|
+
**kwargs: Additional arguments including model, temperature, title, etc.
|
275
|
+
|
276
|
+
Returns:
|
277
|
+
Dict: Dictionary containing parsed document data
|
278
|
+
"""
|
279
|
+
logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
|
280
|
+
|
200
281
|
# Handle different input types
|
201
282
|
mime_type, _ = mimetypes.guess_type(path)
|
202
283
|
if mime_type and mime_type.startswith("image"):
|
@@ -215,67 +296,39 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
|
|
215
296
|
for page_num in range(len(pdf_document))
|
216
297
|
]
|
217
298
|
|
218
|
-
#
|
219
|
-
|
220
|
-
|
221
|
-
"type": "image_url",
|
222
|
-
"image_url": {"url": image_url},
|
223
|
-
}
|
224
|
-
|
299
|
+
# Process each page/image
|
300
|
+
all_results = []
|
301
|
+
for page_num, image_url in images:
|
225
302
|
if api == "openai":
|
226
303
|
system_prompt = kwargs.get(
|
227
304
|
"system_prompt", PARSER_PROMPT.format(custom_instructions="")
|
228
305
|
)
|
229
306
|
user_prompt = kwargs.get("user_prompt", OPENAI_USER_PROMPT)
|
230
|
-
return [
|
231
|
-
{
|
232
|
-
"role": "system",
|
233
|
-
"content": system_prompt,
|
234
|
-
},
|
235
|
-
{
|
236
|
-
"role": "user",
|
237
|
-
"content": [
|
238
|
-
{"type": "text", "text": user_prompt},
|
239
|
-
image_message,
|
240
|
-
],
|
241
|
-
},
|
242
|
-
]
|
243
307
|
else:
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
messages = get_messages(page_num, image_url)
|
257
|
-
|
258
|
-
# Common completion parameters
|
259
|
-
completion_params = {
|
260
|
-
"model": kwargs["model"],
|
261
|
-
"messages": messages,
|
262
|
-
"max_tokens": kwargs.get("max_tokens", 1024),
|
263
|
-
"temperature": kwargs.get("temperature", 0.7),
|
264
|
-
}
|
308
|
+
system_prompt = kwargs.get("system_prompt", None)
|
309
|
+
user_prompt = kwargs.get("user_prompt", LLAMA_PARSER_PROMPT)
|
310
|
+
|
311
|
+
response = create_response(
|
312
|
+
api=api,
|
313
|
+
model=kwargs["model"],
|
314
|
+
system_prompt=system_prompt,
|
315
|
+
user_prompt=user_prompt,
|
316
|
+
image_url=image_url,
|
317
|
+
temperature=kwargs.get("temperature", 0.2),
|
318
|
+
max_tokens=kwargs.get("max_tokens", 1024),
|
319
|
+
)
|
265
320
|
|
266
321
|
# Get completion from selected API
|
267
|
-
|
268
|
-
token_usage = response
|
322
|
+
page_text = response["response"]
|
323
|
+
token_usage = response["usage"]
|
269
324
|
|
270
|
-
# Extract the response text
|
271
|
-
page_text = response.choices[0].message.content
|
272
325
|
if kwargs.get("verbose", None):
|
273
326
|
logger.debug(f"Page {page_num + 1} response: {page_text}")
|
274
327
|
|
275
328
|
# Extract content between output tags if present
|
276
329
|
result = page_text
|
277
330
|
if "<output>" in page_text:
|
278
|
-
result = page_text.split("<output>")[1].strip()
|
331
|
+
result = page_text.split("<output>")[-1].strip()
|
279
332
|
if "</output>" in result:
|
280
333
|
result = result.split("</output>")[0].strip()
|
281
334
|
all_results.append(
|
@@ -319,3 +372,28 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
|
|
319
372
|
"total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
|
320
373
|
},
|
321
374
|
}
|
375
|
+
|
376
|
+
|
377
|
+
def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
|
378
|
+
"""
|
379
|
+
Converts a document (PDF or image) to a base64 encoded string.
|
380
|
+
|
381
|
+
Args:
|
382
|
+
path (str): Path to the PDF file.
|
383
|
+
|
384
|
+
Returns:
|
385
|
+
str: Base64 encoded string of the PDF content.
|
386
|
+
"""
|
387
|
+
if path.endswith(".pdf"):
|
388
|
+
pdf_document = pdfium.PdfDocument(path)
|
389
|
+
return [
|
390
|
+
(
|
391
|
+
page_num,
|
392
|
+
f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
|
393
|
+
)
|
394
|
+
for page_num in range(len(pdf_document))
|
395
|
+
]
|
396
|
+
elif mimetypes.guess_type(path)[0].startswith("image"):
|
397
|
+
with open(path, "rb") as img_file:
|
398
|
+
image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
|
399
|
+
return [(0, f"data:image/png;base64,{image_base64}")]
|
@@ -41,7 +41,8 @@ Think step-by-step.
|
|
41
41
|
'0' is typically more oval than 'O'
|
42
42
|
'8' has a more angular top than 'B'
|
43
43
|
{custom_instructions}
|
44
|
-
- Return only the correct markdown without additional text or explanations.
|
44
|
+
- Return only the correct markdown without additional text or explanations.
|
45
|
+
- DO NOT use code blocks such as "```html" or "```markdown" in the output unless there is a code block in the content.
|
45
46
|
- Think before generating the output in <thinking></thinking> tags.
|
46
47
|
|
47
48
|
Remember, your primary objective is to create an output that, when rendered, structurally replicates the original document's content as closely as possible without losing any textual details.
|
@@ -345,7 +345,7 @@ def get_webpage_soup(url: str) -> BeautifulSoup:
|
|
345
345
|
# Additional wait for any dynamic content
|
346
346
|
try:
|
347
347
|
await page.wait_for_selector("body", timeout=30000)
|
348
|
-
except:
|
348
|
+
except Exception:
|
349
349
|
pass
|
350
350
|
|
351
351
|
html = await page.content()
|
@@ -561,24 +561,32 @@ def router(path: str, priority: str = "speed") -> str:
|
|
561
561
|
priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
|
562
562
|
"""
|
563
563
|
file_type = get_file_type(path)
|
564
|
-
if
|
564
|
+
if (
|
565
|
+
file_type.startswith("text/")
|
566
|
+
or "spreadsheet" in file_type
|
567
|
+
or "presentation" in file_type
|
568
|
+
):
|
565
569
|
return "STATIC_PARSE"
|
566
570
|
|
567
571
|
if priority == "accuracy":
|
568
572
|
# If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
|
569
573
|
# Otherwise, use LLM_PARSE
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
and
|
574
|
-
):
|
574
|
+
has_image = has_image_in_pdf(path)
|
575
|
+
has_hyperlink = has_hyperlink_in_pdf(path)
|
576
|
+
if file_type == "application/pdf" and not has_image and has_hyperlink:
|
577
|
+
logger.debug("Using STATIC_PARSE for PDF with hyperlinks and no images.")
|
575
578
|
return "STATIC_PARSE"
|
579
|
+
logger.debug(
|
580
|
+
f"Using LLM_PARSE because PDF has image ({has_image}) or has no hyperlink ({has_hyperlink})."
|
581
|
+
)
|
576
582
|
return "LLM_PARSE"
|
577
583
|
else:
|
578
584
|
# If the file is a PDF without images, use STATIC_PARSE
|
579
585
|
# Otherwise, use LLM_PARSE
|
580
586
|
if file_type == "application/pdf" and not has_image_in_pdf(path):
|
587
|
+
logger.debug("Using STATIC_PARSE for PDF without images.")
|
581
588
|
return "STATIC_PARSE"
|
589
|
+
logger.debug("Using LLM_PARSE because PDF has images")
|
582
590
|
return "LLM_PARSE"
|
583
591
|
|
584
592
|
|
File without changes
|
File without changes
|