lexoid 0.1.13__tar.gz → 0.1.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lexoid-0.1.13 → lexoid-0.1.15}/PKG-INFO +26 -20
- {lexoid-0.1.13 → lexoid-0.1.15}/README.md +24 -19
- {lexoid-0.1.13 → lexoid-0.1.15}/lexoid/api.py +166 -44
- {lexoid-0.1.13 → lexoid-0.1.15}/lexoid/core/parse_type/llm_parser.py +246 -91
- {lexoid-0.1.13 → lexoid-0.1.15}/lexoid/core/parse_type/static_parser.py +34 -0
- {lexoid-0.1.13 → lexoid-0.1.15}/lexoid/core/prompt_templates.py +2 -1
- {lexoid-0.1.13 → lexoid-0.1.15}/lexoid/core/utils.py +48 -10
- {lexoid-0.1.13 → lexoid-0.1.15}/pyproject.toml +3 -1
- {lexoid-0.1.13 → lexoid-0.1.15}/LICENSE +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: lexoid
|
3
|
-
Version: 0.1.13
|
3
|
+
Version: 0.1.15
|
4
4
|
Summary:
|
5
5
|
Requires-Python: >=3.10,<4.0
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
@@ -8,6 +8,7 @@ Classifier: Programming Language :: Python :: 3.10
|
|
8
8
|
Classifier: Programming Language :: Python :: 3.11
|
9
9
|
Classifier: Programming Language :: Python :: 3.12
|
10
10
|
Classifier: Programming Language :: Python :: 3.13
|
11
|
+
Requires-Dist: anthropic (>=0.55.0,<0.56.0)
|
11
12
|
Requires-Dist: bs4 (>=0.0.2,<0.0.3)
|
12
13
|
Requires-Dist: docx2pdf (>=0.1.8,<0.2.0)
|
13
14
|
Requires-Dist: google-generativeai (>=0.8.1,<0.9.0)
|
@@ -49,7 +50,8 @@ Description-Content-Type: text/markdown
|
|
49
50
|
</div>
|
50
51
|
|
51
52
|
[](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
|
52
|
-
[](https://huggingface.co/spaces/oidlabs/Lexoid)
|
54
|
+
[](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
|
53
55
|
[](https://pypi.org/project/lexoid/)
|
54
56
|
[](https://oidlabs-com.github.io/Lexoid/)
|
55
57
|
|
@@ -144,6 +146,7 @@ print(parsed_md)
|
|
144
146
|
* Hugging Face
|
145
147
|
* Together AI
|
146
148
|
* OpenRouter
|
149
|
+
* Fireworks
|
147
150
|
|
148
151
|
## Benchmark
|
149
152
|
|
@@ -151,22 +154,25 @@ Results aggregated across 5 iterations each for 5 documents.
|
|
151
154
|
|
152
155
|
_Note:_ Benchmarks are currently done in the zero-shot setting.
|
153
156
|
|
154
|
-
| Rank | Model
|
155
|
-
|
|
156
|
-
| 1
|
157
|
-
| 2
|
158
|
-
| 3
|
159
|
-
| 4
|
160
|
-
| 5
|
161
|
-
| 6
|
162
|
-
| 7
|
163
|
-
| 8
|
164
|
-
| 9
|
165
|
-
| 10
|
166
|
-
| 11
|
167
|
-
| 12
|
168
|
-
| 13
|
169
|
-
| 14
|
170
|
-
| 15
|
171
|
-
| 16
|
157
|
+
| Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
|
158
|
+
| --- | --- | --- | --- | --- | --- |
|
159
|
+
| 1 | AUTO | 0.906 | 0.112 | 9.56 | 0.00068 |
|
160
|
+
| 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
|
161
|
+
| 3 | gemini-2.5-flash | 0.895 | 0.148 | 54.10 | 0.01051 |
|
162
|
+
| 4 | gemini-1.5-pro | 0.868 | 0.283 | 15.03 | 0.00637 |
|
163
|
+
| 5 | gemini-1.5-flash | 0.864 | 0.194 | 15.47 | 0.00044 |
|
164
|
+
| 6 | claude-3-5-sonnet-20241022 | 0.851 | 0.209 | 15.99 | 0.01758 |
|
165
|
+
| 7 | gemini-2.5-pro | 0.849 | 0.298 | 101.95 | 0.01859 |
|
166
|
+
| 8 | claude-sonnet-4-20250514 | 0.804 | 0.190 | 19.27 | 0.02071 |
|
167
|
+
| 9 | claude-opus-4-20250514 | 0.772 | 0.238 | 20.03 | 0.09207 |
|
168
|
+
| 10 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.768 | 0.234 | 12.12 | 0.00150 |
|
169
|
+
| 11 | gpt-4o | 0.748 | 0.284 | 26.80 | 0.01478 |
|
170
|
+
| 12 | gpt-4o-mini | 0.733 | 0.231 | 18.18 | 0.00650 |
|
171
|
+
| 13 | gpt-4.1-mini | 0.723 | 0.269 | 20.91 | 0.00351 |
|
172
|
+
| 14 | google/gemma-3-27b-it | 0.681 | 0.334 | 19.41 | 0.00027 |
|
173
|
+
| 15 | gpt-4.1 | 0.650 | 0.342 | 33.72 | 0.01443 |
|
174
|
+
| 16 | claude-3-7-sonnet-20250219 | 0.633 | 0.369 | 14.24 | 0.01763 |
|
175
|
+
| 17 | microsoft/phi-4-multimodal-instruct | 0.622 | 0.320 | 13.15 | 0.00050 |
|
176
|
+
| 18 | qwen/qwen-2.5-vl-7b-instruct | 0.559 | 0.348 | 17.71 | 0.00086 |
|
177
|
+
| 19 | meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo | 0.546 | 0.239 | 29.26 | 0.01103 |
|
172
178
|
|
@@ -14,7 +14,8 @@
|
|
14
14
|
</div>
|
15
15
|
|
16
16
|
[](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
|
17
|
-
[](https://huggingface.co/spaces/oidlabs/Lexoid)
|
18
|
+
[](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
|
18
19
|
[](https://pypi.org/project/lexoid/)
|
19
20
|
[](https://oidlabs-com.github.io/Lexoid/)
|
20
21
|
|
@@ -109,6 +110,7 @@ print(parsed_md)
|
|
109
110
|
* Hugging Face
|
110
111
|
* Together AI
|
111
112
|
* OpenRouter
|
113
|
+
* Fireworks
|
112
114
|
|
113
115
|
## Benchmark
|
114
116
|
|
@@ -116,21 +118,24 @@ Results aggregated across 5 iterations each for 5 documents.
|
|
116
118
|
|
117
119
|
_Note:_ Benchmarks are currently done in the zero-shot setting.
|
118
120
|
|
119
|
-
| Rank | Model
|
120
|
-
|
|
121
|
-
| 1
|
122
|
-
| 2
|
123
|
-
| 3
|
124
|
-
| 4
|
125
|
-
| 5
|
126
|
-
| 6
|
127
|
-
| 7
|
128
|
-
| 8
|
129
|
-
| 9
|
130
|
-
| 10
|
131
|
-
| 11
|
132
|
-
| 12
|
133
|
-
| 13
|
134
|
-
| 14
|
135
|
-
| 15
|
136
|
-
| 16
|
121
|
+
| Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
|
122
|
+
| --- | --- | --- | --- | --- | --- |
|
123
|
+
| 1 | AUTO | 0.906 | 0.112 | 9.56 | 0.00068 |
|
124
|
+
| 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
|
125
|
+
| 3 | gemini-2.5-flash | 0.895 | 0.148 | 54.10 | 0.01051 |
|
126
|
+
| 4 | gemini-1.5-pro | 0.868 | 0.283 | 15.03 | 0.00637 |
|
127
|
+
| 5 | gemini-1.5-flash | 0.864 | 0.194 | 15.47 | 0.00044 |
|
128
|
+
| 6 | claude-3-5-sonnet-20241022 | 0.851 | 0.209 | 15.99 | 0.01758 |
|
129
|
+
| 7 | gemini-2.5-pro | 0.849 | 0.298 | 101.95 | 0.01859 |
|
130
|
+
| 8 | claude-sonnet-4-20250514 | 0.804 | 0.190 | 19.27 | 0.02071 |
|
131
|
+
| 9 | claude-opus-4-20250514 | 0.772 | 0.238 | 20.03 | 0.09207 |
|
132
|
+
| 10 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.768 | 0.234 | 12.12 | 0.00150 |
|
133
|
+
| 11 | gpt-4o | 0.748 | 0.284 | 26.80 | 0.01478 |
|
134
|
+
| 12 | gpt-4o-mini | 0.733 | 0.231 | 18.18 | 0.00650 |
|
135
|
+
| 13 | gpt-4.1-mini | 0.723 | 0.269 | 20.91 | 0.00351 |
|
136
|
+
| 14 | google/gemma-3-27b-it | 0.681 | 0.334 | 19.41 | 0.00027 |
|
137
|
+
| 15 | gpt-4.1 | 0.650 | 0.342 | 33.72 | 0.01443 |
|
138
|
+
| 16 | claude-3-7-sonnet-20250219 | 0.633 | 0.369 | 14.24 | 0.01763 |
|
139
|
+
| 17 | microsoft/phi-4-multimodal-instruct | 0.622 | 0.320 | 13.15 | 0.00050 |
|
140
|
+
| 18 | qwen/qwen-2.5-vl-7b-instruct | 0.559 | 0.348 | 17.71 | 0.00086 |
|
141
|
+
| 19 | meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo | 0.546 | 0.239 | 29.26 | 0.01103 |
|
@@ -4,13 +4,19 @@ import re
|
|
4
4
|
import tempfile
|
5
5
|
from concurrent.futures import ProcessPoolExecutor
|
6
6
|
from enum import Enum
|
7
|
+
from functools import wraps
|
7
8
|
from glob import glob
|
8
9
|
from time import time
|
9
|
-
from typing import Union, Dict, List
|
10
|
+
from typing import Optional, Union, Dict, List
|
10
11
|
|
11
12
|
from loguru import logger
|
12
13
|
|
13
|
-
from lexoid.core.parse_type.llm_parser import
|
14
|
+
from lexoid.core.parse_type.llm_parser import (
|
15
|
+
parse_llm_doc,
|
16
|
+
create_response,
|
17
|
+
convert_doc_to_base64_images,
|
18
|
+
get_api_provider_for_model,
|
19
|
+
)
|
14
20
|
from lexoid.core.parse_type.static_parser import parse_static_doc
|
15
21
|
from lexoid.core.utils import (
|
16
22
|
convert_to_pdf,
|
@@ -31,6 +37,51 @@ class ParserType(Enum):
|
|
31
37
|
AUTO = "AUTO"
|
32
38
|
|
33
39
|
|
40
|
+
def retry_with_different_parser_type(func):
|
41
|
+
@wraps(func)
|
42
|
+
def wrapper(*args, **kwargs):
|
43
|
+
try:
|
44
|
+
if len(args) > 0:
|
45
|
+
kwargs["path"] = args[0]
|
46
|
+
if len(args) > 1:
|
47
|
+
router_priority = kwargs.get("router_priority", "speed")
|
48
|
+
if args[1] == ParserType.AUTO:
|
49
|
+
parser_type = ParserType[router(kwargs["path"], router_priority)]
|
50
|
+
logger.debug(f"Auto-detected parser type: {parser_type}")
|
51
|
+
kwargs["routed"] = True
|
52
|
+
else:
|
53
|
+
parser_type = args[1]
|
54
|
+
kwargs["parser_type"] = parser_type
|
55
|
+
return func(**kwargs)
|
56
|
+
except Exception as e:
|
57
|
+
if kwargs.get("parser_type") == ParserType.LLM_PARSE and kwargs.get(
|
58
|
+
"routed", False
|
59
|
+
):
|
60
|
+
logger.warning(
|
61
|
+
f"LLM_PARSE failed with error: {e}. Retrying with STATIC_PARSE."
|
62
|
+
)
|
63
|
+
kwargs["parser_type"] = ParserType.STATIC_PARSE
|
64
|
+
kwargs["routed"] = False
|
65
|
+
return func(**kwargs)
|
66
|
+
elif kwargs.get("parser_type") == ParserType.STATIC_PARSE and kwargs.get(
|
67
|
+
"routed", False
|
68
|
+
):
|
69
|
+
logger.warning(
|
70
|
+
f"STATIC_PARSE failed with error: {e}. Retrying with LLM_PARSE."
|
71
|
+
)
|
72
|
+
kwargs["parser_type"] = ParserType.LLM_PARSE
|
73
|
+
kwargs["routed"] = False
|
74
|
+
return func(**kwargs)
|
75
|
+
else:
|
76
|
+
logger.error(
|
77
|
+
f"Parsing failed with error: {e}. No fallback parser available."
|
78
|
+
)
|
79
|
+
raise e
|
80
|
+
|
81
|
+
return wrapper
|
82
|
+
|
83
|
+
|
84
|
+
@retry_with_different_parser_type
|
34
85
|
def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
|
35
86
|
"""
|
36
87
|
Parses a file using the specified parser type.
|
@@ -49,21 +100,20 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
|
|
49
100
|
- parent_title: Title of parent doc if recursively parsed
|
50
101
|
- recursive_docs: List of dictionaries for recursively parsed documents
|
51
102
|
- token_usage: Dictionary containing token usage statistics
|
103
|
+
- parser_used: Which parser was actually used
|
52
104
|
"""
|
53
|
-
if parser_type == ParserType.AUTO:
|
54
|
-
router_priority = kwargs.get("router_priority", "speed")
|
55
|
-
parser_type = ParserType[router(path, router_priority)]
|
56
|
-
logger.debug(f"Auto-detected parser type: {parser_type}")
|
57
|
-
|
58
105
|
kwargs["start"] = (
|
59
106
|
int(os.path.basename(path).split("_")[1]) - 1 if kwargs.get("split") else 0
|
60
107
|
)
|
61
108
|
if parser_type == ParserType.STATIC_PARSE:
|
62
109
|
logger.debug("Using static parser")
|
63
|
-
|
110
|
+
result = parse_static_doc(path, **kwargs)
|
64
111
|
else:
|
65
112
|
logger.debug("Using LLM parser")
|
66
|
-
|
113
|
+
result = parse_llm_doc(path, **kwargs)
|
114
|
+
|
115
|
+
result["parser_used"] = parser_type
|
116
|
+
return result
|
67
117
|
|
68
118
|
|
69
119
|
def parse_chunk_list(
|
@@ -82,15 +132,18 @@ def parse_chunk_list(
|
|
82
132
|
"""
|
83
133
|
combined_segments = []
|
84
134
|
raw_texts = []
|
85
|
-
token_usage = {"input": 0, "output": 0, "image_count": 0}
|
135
|
+
token_usage = {"input": 0, "output": 0, "llm_page_count": 0}
|
86
136
|
for file_path in file_paths:
|
87
137
|
result = parse_chunk(file_path, parser_type, **kwargs)
|
88
138
|
combined_segments.extend(result["segments"])
|
89
139
|
raw_texts.append(result["raw"])
|
90
|
-
if
|
140
|
+
if (
|
141
|
+
result.get("parser_used") == ParserType.LLM_PARSE
|
142
|
+
and "token_usage" in result
|
143
|
+
):
|
91
144
|
token_usage["input"] += result["token_usage"]["input"]
|
92
145
|
token_usage["output"] += result["token_usage"]["output"]
|
93
|
-
token_usage["
|
146
|
+
token_usage["llm_page_count"] += len(result["segments"])
|
94
147
|
token_usage["total"] = token_usage["input"] + token_usage["output"]
|
95
148
|
|
96
149
|
return {
|
@@ -136,7 +189,7 @@ def parse(
|
|
136
189
|
as_pdf = kwargs.get("as_pdf", False)
|
137
190
|
depth = kwargs.get("depth", 1)
|
138
191
|
|
139
|
-
if type(parser_type)
|
192
|
+
if type(parser_type) is str:
|
140
193
|
parser_type = ParserType[parser_type]
|
141
194
|
if (
|
142
195
|
path.lower().endswith((".doc", ".docx"))
|
@@ -182,9 +235,9 @@ def parse(
|
|
182
235
|
sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
|
183
236
|
path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
|
184
237
|
|
185
|
-
if not path.lower().endswith(".pdf")
|
238
|
+
if not path.lower().endswith(".pdf"):
|
186
239
|
kwargs["split"] = False
|
187
|
-
result =
|
240
|
+
result = parse_chunk_list([path], parser_type, kwargs)
|
188
241
|
else:
|
189
242
|
kwargs["split"] = True
|
190
243
|
split_dir = os.path.join(temp_dir, "splits/")
|
@@ -219,42 +272,43 @@ def parse(
|
|
219
272
|
"token_usage": {
|
220
273
|
"input": sum(r["token_usage"]["input"] for r in chunk_results),
|
221
274
|
"output": sum(r["token_usage"]["output"] for r in chunk_results),
|
222
|
-
"
|
223
|
-
r["token_usage"]["image_count"] for r in chunk_results
|
275
|
+
"llm_page_count": sum(
|
276
|
+
r["token_usage"]["llm_page_count"] for r in chunk_results
|
224
277
|
),
|
225
278
|
"total": sum(r["token_usage"]["total"] for r in chunk_results),
|
226
279
|
},
|
227
280
|
}
|
228
281
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
else:
|
239
|
-
raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
|
282
|
+
if "api_cost_mapping" in kwargs and "token_usage" in result:
|
283
|
+
api_cost_mapping = kwargs["api_cost_mapping"]
|
284
|
+
if isinstance(api_cost_mapping, dict):
|
285
|
+
api_cost_mapping = api_cost_mapping
|
286
|
+
elif isinstance(api_cost_mapping, str) and os.path.exists(api_cost_mapping):
|
287
|
+
with open(api_cost_mapping, "r") as f:
|
288
|
+
api_cost_mapping = json.load(f)
|
289
|
+
else:
|
290
|
+
raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
|
240
291
|
|
241
|
-
|
242
|
-
|
292
|
+
api_cost = api_cost_mapping.get(
|
293
|
+
kwargs.get("model", "gemini-2.0-flash"), None
|
294
|
+
)
|
295
|
+
if api_cost:
|
296
|
+
token_usage = result["token_usage"]
|
297
|
+
token_cost = {
|
298
|
+
"input": token_usage["input"] * api_cost["input"] / 1_000_000,
|
299
|
+
"input-image": api_cost.get("input-image", 0)
|
300
|
+
* token_usage.get("llm_page_count", 0),
|
301
|
+
"output": token_usage["output"] * api_cost["output"] / 1_000_000,
|
302
|
+
}
|
303
|
+
token_cost["total"] = (
|
304
|
+
token_cost["input"]
|
305
|
+
+ token_cost["input-image"]
|
306
|
+
+ token_cost["output"]
|
243
307
|
)
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
+ api_cost.get("input-image", 0) * token_usage["image_count"],
|
249
|
-
"output": token_usage["output"]
|
250
|
-
* api_cost["output"]
|
251
|
-
/ 1_000_000,
|
252
|
-
}
|
253
|
-
token_cost["total"] = token_cost["input"] + token_cost["output"]
|
254
|
-
result["token_cost"] = token_cost
|
255
|
-
|
256
|
-
if as_pdf:
|
257
|
-
result["pdf_path"] = path
|
308
|
+
result["token_cost"] = token_cost
|
309
|
+
|
310
|
+
if as_pdf:
|
311
|
+
result["pdf_path"] = path
|
258
312
|
|
259
313
|
if depth > 1:
|
260
314
|
recursive_docs = []
|
@@ -285,3 +339,71 @@ def parse(
|
|
285
339
|
result["recursive_docs"] = recursive_docs
|
286
340
|
|
287
341
|
return result
|
342
|
+
|
343
|
+
|
344
|
+
def parse_with_schema(
|
345
|
+
path: str,
|
346
|
+
schema: Dict,
|
347
|
+
api: Optional[str] = None,
|
348
|
+
model: str = "gpt-4o-mini",
|
349
|
+
**kwargs,
|
350
|
+
) -> List[List[Dict]]:
|
351
|
+
"""
|
352
|
+
Parses a PDF using an LLM to generate structured output conforming to a given JSON schema.
|
353
|
+
|
354
|
+
Args:
|
355
|
+
path (str): Path to the PDF file.
|
356
|
+
schema (Dict): JSON schema to which the parsed output should conform.
|
357
|
+
api (str, optional): LLM API provider (One of "openai", "huggingface", "together", "openrouter", and "fireworks").
|
358
|
+
model (str, optional): LLM model name.
|
359
|
+
**kwargs: Additional arguments for the parser (e.g.: temperature, max_tokens).
|
360
|
+
|
361
|
+
Returns:
|
362
|
+
List[List[Dict]]: List of dictionaries for each page, each conforming to the provided schema.
|
363
|
+
"""
|
364
|
+
if not api:
|
365
|
+
api = get_api_provider_for_model(model)
|
366
|
+
logger.debug(f"Using API provider: {api}")
|
367
|
+
|
368
|
+
system_prompt = f"""
|
369
|
+
The output should be formatted as a JSON instance that conforms to the JSON schema below.
|
370
|
+
|
371
|
+
As an example, for the schema {{
|
372
|
+
"properties": {{
|
373
|
+
"foo": {{
|
374
|
+
"title": "Foo",
|
375
|
+
"description": "a list of strings",
|
376
|
+
"type": "array",
|
377
|
+
"items": {{"type": "string"}}
|
378
|
+
}}
|
379
|
+
}},
|
380
|
+
"required": ["foo"]
|
381
|
+
}}, the object {{"foo": ["bar", "baz"]}} is valid. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not.
|
382
|
+
|
383
|
+
Here is the output schema:
|
384
|
+
{json.dumps(schema, indent=2)}
|
385
|
+
|
386
|
+
"""
|
387
|
+
|
388
|
+
user_prompt = "You are an AI agent that parses documents and returns them in the specified JSON format. Please parse the document and return it in the required format."
|
389
|
+
|
390
|
+
responses = []
|
391
|
+
images = convert_doc_to_base64_images(path)
|
392
|
+
for i, (page_num, image) in enumerate(images):
|
393
|
+
resp_dict = create_response(
|
394
|
+
api=api,
|
395
|
+
model=model,
|
396
|
+
user_prompt=user_prompt,
|
397
|
+
system_prompt=system_prompt,
|
398
|
+
image_url=image,
|
399
|
+
temperature=kwargs.get("temperature", 0.0),
|
400
|
+
max_tokens=kwargs.get("max_tokens", 1024),
|
401
|
+
)
|
402
|
+
|
403
|
+
response = resp_dict.get("response", "")
|
404
|
+
response = response.split("```json")[-1].split("```")[0].strip()
|
405
|
+
logger.debug(f"Processing page {page_num + 1} with response: {response}")
|
406
|
+
new_dict = json.loads(response)
|
407
|
+
responses.append(new_dict)
|
408
|
+
|
409
|
+
return responses
|
@@ -3,23 +3,25 @@ import io
|
|
3
3
|
import mimetypes
|
4
4
|
import os
|
5
5
|
import time
|
6
|
+
from functools import wraps
|
7
|
+
from typing import Dict, List, Optional, Tuple
|
8
|
+
|
6
9
|
import pypdfium2 as pdfium
|
7
10
|
import requests
|
8
|
-
from
|
11
|
+
from anthropic import Anthropic
|
12
|
+
from huggingface_hub import InferenceClient
|
13
|
+
from loguru import logger
|
14
|
+
from openai import OpenAI
|
9
15
|
from requests.exceptions import HTTPError
|
10
|
-
from
|
16
|
+
from together import Together
|
11
17
|
|
12
18
|
from lexoid.core.prompt_templates import (
|
13
19
|
INSTRUCTIONS_ADD_PG_BREAK,
|
20
|
+
LLAMA_PARSER_PROMPT,
|
14
21
|
OPENAI_USER_PROMPT,
|
15
22
|
PARSER_PROMPT,
|
16
|
-
LLAMA_PARSER_PROMPT,
|
17
23
|
)
|
18
24
|
from lexoid.core.utils import convert_image_to_pdf
|
19
|
-
from loguru import logger
|
20
|
-
from openai import OpenAI
|
21
|
-
from together import Together
|
22
|
-
from huggingface_hub import InferenceClient
|
23
25
|
|
24
26
|
|
25
27
|
def retry_on_http_error(func):
|
@@ -48,33 +50,41 @@ def retry_on_http_error(func):
|
|
48
50
|
return wrapper
|
49
51
|
|
50
52
|
|
51
|
-
|
52
|
-
def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
|
53
|
-
if "api_provider" in kwargs and kwargs["api_provider"]:
|
54
|
-
return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
|
55
|
-
if "model" not in kwargs:
|
56
|
-
kwargs["model"] = "gemini-2.0-flash"
|
57
|
-
model = kwargs.get("model")
|
53
|
+
def get_api_provider_for_model(model: str) -> str:
|
58
54
|
if model.startswith("gemini"):
|
59
|
-
return
|
55
|
+
return "gemini"
|
60
56
|
if model.startswith("gpt"):
|
61
|
-
return
|
57
|
+
return "openai"
|
62
58
|
if model.startswith("meta-llama"):
|
63
59
|
if "Turbo" in model or model == "meta-llama/Llama-Vision-Free":
|
64
|
-
return
|
65
|
-
return
|
60
|
+
return "together"
|
61
|
+
return "huggingface"
|
66
62
|
if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
|
67
|
-
return
|
63
|
+
return "openrouter"
|
64
|
+
if model.startswith("accounts/fireworks"):
|
65
|
+
return "fireworks"
|
66
|
+
if model.startswith("claude"):
|
67
|
+
return "anthropic"
|
68
68
|
raise ValueError(f"Unsupported model: {model}")
|
69
69
|
|
70
70
|
|
71
|
-
|
72
|
-
|
73
|
-
if
|
74
|
-
|
71
|
+
@retry_on_http_error
|
72
|
+
def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
|
73
|
+
if "api_provider" in kwargs and kwargs["api_provider"]:
|
74
|
+
return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
|
75
75
|
|
76
|
-
|
76
|
+
model = kwargs.get("model", "gemini-2.0-flash")
|
77
|
+
kwargs["model"] = model
|
78
|
+
|
79
|
+
api_provider = get_api_provider_for_model(model)
|
77
80
|
|
81
|
+
if api_provider == "gemini":
|
82
|
+
return parse_with_gemini(path, **kwargs)
|
83
|
+
else:
|
84
|
+
return parse_with_api(path, api=api_provider, **kwargs)
|
85
|
+
|
86
|
+
|
87
|
+
def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
78
88
|
# Check if the file is an image and convert to PDF if necessary
|
79
89
|
mime_type, _ = mimetypes.guess_type(path)
|
80
90
|
if mime_type and mime_type.startswith("image"):
|
@@ -86,6 +96,20 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
|
86
96
|
file_content = file.read()
|
87
97
|
base64_file = base64.b64encode(file_content).decode("utf-8")
|
88
98
|
|
99
|
+
return parse_image_with_gemini(
|
100
|
+
base64_file=base64_file, mime_type=mime_type, **kwargs
|
101
|
+
)
|
102
|
+
|
103
|
+
|
104
|
+
def parse_image_with_gemini(
|
105
|
+
base64_file: str, mime_type: str = "image/png", **kwargs
|
106
|
+
) -> List[Dict] | str:
|
107
|
+
api_key = os.environ.get("GOOGLE_API_KEY")
|
108
|
+
if not api_key:
|
109
|
+
raise ValueError("GOOGLE_API_KEY environment variable is not set")
|
110
|
+
|
111
|
+
url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}"
|
112
|
+
|
89
113
|
if "system_prompt" in kwargs:
|
90
114
|
prompt = kwargs["system_prompt"]
|
91
115
|
else:
|
@@ -105,7 +129,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
|
105
129
|
}
|
106
130
|
],
|
107
131
|
"generationConfig": {
|
108
|
-
"temperature": kwargs.get("temperature", 0
|
132
|
+
"temperature": kwargs.get("temperature", 0),
|
109
133
|
},
|
110
134
|
}
|
111
135
|
|
@@ -125,24 +149,23 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
|
125
149
|
if "text" in part
|
126
150
|
)
|
127
151
|
|
128
|
-
combined_text =
|
152
|
+
combined_text = raw_text
|
129
153
|
if "<output>" in raw_text:
|
130
|
-
combined_text = raw_text.split("<output>")[1].strip()
|
131
|
-
if "</output>" in
|
132
|
-
combined_text =
|
154
|
+
combined_text = raw_text.split("<output>")[-1].strip()
|
155
|
+
if "</output>" in combined_text:
|
156
|
+
combined_text = combined_text.split("</output>")[0].strip()
|
133
157
|
|
134
158
|
token_usage = result["usageMetadata"]
|
135
159
|
input_tokens = token_usage.get("promptTokenCount", 0)
|
136
160
|
output_tokens = token_usage.get("candidatesTokenCount", 0)
|
137
161
|
total_tokens = input_tokens + output_tokens
|
138
|
-
|
139
162
|
return {
|
140
163
|
"raw": combined_text.replace("<page-break>", "\n\n"),
|
141
164
|
"segments": [
|
142
165
|
{"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
|
143
166
|
for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
|
144
167
|
],
|
145
|
-
"title": kwargs
|
168
|
+
"title": kwargs.get("title", ""),
|
146
169
|
"url": kwargs.get("url", ""),
|
147
170
|
"parent_title": kwargs.get("parent_title", ""),
|
148
171
|
"recursive_docs": [],
|
@@ -169,18 +192,54 @@ def convert_pdf_page_to_base64(
|
|
169
192
|
return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
|
170
193
|
|
171
194
|
|
172
|
-
def
|
173
|
-
|
174
|
-
|
195
|
+
def get_messages(
|
196
|
+
system_prompt: Optional[str], user_prompt: Optional[str], image_url: Optional[str]
|
197
|
+
) -> List[Dict]:
|
198
|
+
messages = []
|
199
|
+
if system_prompt:
|
200
|
+
messages.append(
|
201
|
+
{
|
202
|
+
"role": "system",
|
203
|
+
"content": system_prompt,
|
204
|
+
}
|
205
|
+
)
|
206
|
+
base_message = (
|
207
|
+
[
|
208
|
+
{"type": "text", "text": user_prompt},
|
209
|
+
]
|
210
|
+
if user_prompt
|
211
|
+
else []
|
212
|
+
)
|
213
|
+
image_message = (
|
214
|
+
[
|
215
|
+
{
|
216
|
+
"type": "image_url",
|
217
|
+
"image_url": {"url": image_url},
|
218
|
+
}
|
219
|
+
]
|
220
|
+
if image_url
|
221
|
+
else []
|
222
|
+
)
|
175
223
|
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
224
|
+
messages.append(
|
225
|
+
{
|
226
|
+
"role": "user",
|
227
|
+
"content": base_message + image_message,
|
228
|
+
}
|
229
|
+
)
|
180
230
|
|
181
|
-
|
182
|
-
|
183
|
-
|
231
|
+
return messages
|
232
|
+
|
233
|
+
|
234
|
+
def create_response(
|
235
|
+
api: str,
|
236
|
+
model: str,
|
237
|
+
system_prompt: Optional[str] = None,
|
238
|
+
user_prompt: Optional[str] = None,
|
239
|
+
image_url: Optional[str] = None,
|
240
|
+
temperature: float = 0.0,
|
241
|
+
max_tokens: int = 1024,
|
242
|
+
) -> Dict:
|
184
243
|
# Initialize appropriate client
|
185
244
|
clients = {
|
186
245
|
"openai": lambda: OpenAI(),
|
@@ -192,11 +251,110 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
|
|
192
251
|
base_url="https://openrouter.ai/api/v1",
|
193
252
|
api_key=os.environ["OPENROUTER_API_KEY"],
|
194
253
|
),
|
254
|
+
"fireworks": lambda: OpenAI(
|
255
|
+
base_url="https://api.fireworks.ai/inference/v1",
|
256
|
+
api_key=os.environ["FIREWORKS_API_KEY"],
|
257
|
+
),
|
258
|
+
"anthropic": lambda: Anthropic(
|
259
|
+
api_key=os.environ["ANTHROPIC_API_KEY"],
|
260
|
+
),
|
261
|
+
"gemini": lambda: None, # Gemini is handled separately
|
195
262
|
}
|
196
263
|
assert api in clients, f"Unsupported API: {api}"
|
197
|
-
|
264
|
+
|
265
|
+
if api == "gemini":
|
266
|
+
image_url = image_url.split("data:image/png;base64,")[1]
|
267
|
+
response = parse_image_with_gemini(
|
268
|
+
base64_file=image_url,
|
269
|
+
model=model,
|
270
|
+
temperature=temperature,
|
271
|
+
max_tokens=max_tokens,
|
272
|
+
system_prompt=system_prompt,
|
273
|
+
)
|
274
|
+
return {
|
275
|
+
"response": response["raw"],
|
276
|
+
"usage": response["token_usage"],
|
277
|
+
}
|
278
|
+
|
198
279
|
client = clients[api]()
|
199
280
|
|
281
|
+
if api == "anthropic":
|
282
|
+
image_media_type = image_url.split(";")[0].split(":")[1]
|
283
|
+
image_data = image_url.split(",")[1]
|
284
|
+
response = client.messages.create(
|
285
|
+
model=model,
|
286
|
+
messages=[
|
287
|
+
{
|
288
|
+
"role": "user",
|
289
|
+
"content": [
|
290
|
+
{
|
291
|
+
"type": "image",
|
292
|
+
"source": {
|
293
|
+
"type": "base64",
|
294
|
+
"media_type": image_media_type,
|
295
|
+
"data": image_data,
|
296
|
+
},
|
297
|
+
},
|
298
|
+
{"type": "text", "text": user_prompt},
|
299
|
+
],
|
300
|
+
}
|
301
|
+
],
|
302
|
+
max_tokens=max_tokens,
|
303
|
+
temperature=temperature,
|
304
|
+
)
|
305
|
+
|
306
|
+
return {
|
307
|
+
"response": response.content[0].text,
|
308
|
+
"usage": {
|
309
|
+
"input_tokens": response.usage.input_tokens,
|
310
|
+
"output_tokens": response.usage.output_tokens,
|
311
|
+
"total_tokens": response.usage.input_tokens
|
312
|
+
+ response.usage.output_tokens,
|
313
|
+
},
|
314
|
+
}
|
315
|
+
|
316
|
+
# Prepare messages for the API call
|
317
|
+
messages = get_messages(system_prompt, user_prompt, image_url)
|
318
|
+
|
319
|
+
# Common completion parameters
|
320
|
+
completion_params = {
|
321
|
+
"model": model,
|
322
|
+
"messages": messages,
|
323
|
+
"max_tokens": max_tokens,
|
324
|
+
"temperature": temperature,
|
325
|
+
}
|
326
|
+
|
327
|
+
# Get completion from selected API
|
328
|
+
response = client.chat.completions.create(**completion_params)
|
329
|
+
token_usage = response.usage
|
330
|
+
|
331
|
+
# Extract the response text
|
332
|
+
page_text = response.choices[0].message.content
|
333
|
+
|
334
|
+
return {
|
335
|
+
"response": page_text,
|
336
|
+
"usage": {
|
337
|
+
"input_tokens": token_usage.prompt_tokens,
|
338
|
+
"output_tokens": token_usage.completion_tokens,
|
339
|
+
"total_tokens": token_usage.total_tokens,
|
340
|
+
},
|
341
|
+
}
|
342
|
+
|
343
|
+
|
344
|
+
def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
|
345
|
+
"""
|
346
|
+
Parse documents (PDFs or images) using various vision model APIs.
|
347
|
+
|
348
|
+
Args:
|
349
|
+
path (str): Path to the document to parse
|
350
|
+
api (str): Which API to use ("openai", "huggingface", or "together")
|
351
|
+
**kwargs: Additional arguments including model, temperature, title, etc.
|
352
|
+
|
353
|
+
Returns:
|
354
|
+
Dict: Dictionary containing parsed document data
|
355
|
+
"""
|
356
|
+
logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
|
357
|
+
|
200
358
|
# Handle different input types
|
201
359
|
mime_type, _ = mimetypes.guess_type(path)
|
202
360
|
if mime_type and mime_type.startswith("image"):
|
@@ -215,76 +373,48 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
|
|
215
373
|
for page_num in range(len(pdf_document))
|
216
374
|
]
|
217
375
|
|
218
|
-
#
|
219
|
-
|
220
|
-
|
221
|
-
"type": "image_url",
|
222
|
-
"image_url": {"url": image_url},
|
223
|
-
}
|
224
|
-
|
376
|
+
# Process each page/image
|
377
|
+
all_results = []
|
378
|
+
for page_num, image_url in images:
|
225
379
|
if api == "openai":
|
226
380
|
system_prompt = kwargs.get(
|
227
381
|
"system_prompt", PARSER_PROMPT.format(custom_instructions="")
|
228
382
|
)
|
229
383
|
user_prompt = kwargs.get("user_prompt", OPENAI_USER_PROMPT)
|
230
|
-
return [
|
231
|
-
{
|
232
|
-
"role": "system",
|
233
|
-
"content": system_prompt,
|
234
|
-
},
|
235
|
-
{
|
236
|
-
"role": "user",
|
237
|
-
"content": [
|
238
|
-
{"type": "text", "text": user_prompt},
|
239
|
-
image_message,
|
240
|
-
],
|
241
|
-
},
|
242
|
-
]
|
243
384
|
else:
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
messages = get_messages(page_num, image_url)
|
257
|
-
|
258
|
-
# Common completion parameters
|
259
|
-
completion_params = {
|
260
|
-
"model": kwargs["model"],
|
261
|
-
"messages": messages,
|
262
|
-
"max_tokens": kwargs.get("max_tokens", 1024),
|
263
|
-
"temperature": kwargs.get("temperature", 0.7),
|
264
|
-
}
|
385
|
+
system_prompt = kwargs.get("system_prompt", None)
|
386
|
+
user_prompt = kwargs.get("user_prompt", LLAMA_PARSER_PROMPT)
|
387
|
+
|
388
|
+
response = create_response(
|
389
|
+
api=api,
|
390
|
+
model=kwargs["model"],
|
391
|
+
system_prompt=system_prompt,
|
392
|
+
user_prompt=user_prompt,
|
393
|
+
image_url=image_url,
|
394
|
+
temperature=kwargs.get("temperature", 0.0),
|
395
|
+
max_tokens=kwargs.get("max_tokens", 1024),
|
396
|
+
)
|
265
397
|
|
266
398
|
# Get completion from selected API
|
267
|
-
|
268
|
-
token_usage = response
|
399
|
+
page_text = response["response"]
|
400
|
+
token_usage = response["usage"]
|
269
401
|
|
270
|
-
# Extract the response text
|
271
|
-
page_text = response.choices[0].message.content
|
272
402
|
if kwargs.get("verbose", None):
|
273
403
|
logger.debug(f"Page {page_num + 1} response: {page_text}")
|
274
404
|
|
275
405
|
# Extract content between output tags if present
|
276
406
|
result = page_text
|
277
407
|
if "<output>" in page_text:
|
278
|
-
result = page_text.split("<output>")[1].strip()
|
408
|
+
result = page_text.split("<output>")[-1].strip()
|
279
409
|
if "</output>" in result:
|
280
410
|
result = result.split("</output>")[0].strip()
|
281
411
|
all_results.append(
|
282
412
|
(
|
283
413
|
page_num,
|
284
414
|
result,
|
285
|
-
token_usage
|
286
|
-
token_usage
|
287
|
-
token_usage
|
415
|
+
token_usage["input_tokens"],
|
416
|
+
token_usage["output_tokens"],
|
417
|
+
token_usage["total_tokens"],
|
288
418
|
)
|
289
419
|
)
|
290
420
|
|
@@ -319,3 +449,28 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
|
|
319
449
|
"total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
|
320
450
|
},
|
321
451
|
}
|
452
|
+
|
453
|
+
|
454
|
+
def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
    """
    Converts a document (PDF or image) to base64-encoded image data URLs.

    Args:
        path (str): Path to a PDF file or to a single image file.

    Returns:
        List[Tuple[int, str]]: One ``(page_num, data_url)`` pair per page.
            PDF pages are numbered from 0 and labelled ``image/png``; an
            image file yields a single entry for page 0 whose data URL
            carries the MIME type guessed from the file name.

    Raises:
        ValueError: If the path is neither a PDF nor a recognised image type.
    """
    # Match the extension case-insensitively so "SCAN.PDF" is still a PDF.
    if path.lower().endswith(".pdf"):
        pdf_document = pdfium.PdfDocument(path)
        return [
            (
                page_num,
                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
            )
            for page_num in range(len(pdf_document))
        ]

    # guess_type() returns (None, None) for unknown extensions, so the MIME
    # type must be checked for None before calling str methods on it.
    mime_type = mimetypes.guess_type(path)[0]
    if mime_type is None or not mime_type.startswith("image"):
        raise ValueError(
            f"Unsupported file type for image conversion: {path} (detected type: {mime_type})"
        )

    with open(path, "rb") as img_file:
        image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
    # Label the payload with its real MIME type instead of always claiming PNG.
    return [(0, f"data:{mime_type};base64,{image_base64}")]
|
@@ -1,12 +1,14 @@
|
|
1
1
|
import os
|
2
2
|
import re
|
3
3
|
import tempfile
|
4
|
+
from functools import wraps
|
4
5
|
from time import time
|
5
6
|
from typing import Dict, List
|
6
7
|
|
7
8
|
import pandas as pd
|
8
9
|
import pdfplumber
|
9
10
|
from docx import Document
|
11
|
+
from loguru import logger
|
10
12
|
from pdfminer.high_level import extract_pages
|
11
13
|
from pdfminer.layout import LTTextContainer
|
12
14
|
from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
|
@@ -22,6 +24,38 @@ from lexoid.core.utils import (
|
|
22
24
|
)
|
23
25
|
|
24
26
|
|
27
|
+
def retry_with_different_parser(func):
    """
    Decorator that retries a failed parse once with the alternate PDF framework.

    When the wrapped function raises, the ``framework`` keyword argument
    (default ``"pdfplumber"``) is inspected: a value containing
    ``"pdfplumber"`` is retried with ``"pdfminer"``, and a value containing
    ``"pdfminer"`` is retried with ``"pdfplumber"``. No retry is attempted
    when ``routed`` is truthy or when the framework is neither of those two;
    the original exception is then re-raised unchanged.

    Args:
        func: The parsing callable to wrap. It is called with the same
            positional arguments on retry and with ``framework`` replaced.

    Returns:
        The wrapped callable.
    """
    # Checked in order; each marker is matched as a substring of the
    # requested framework name, as the keyword may carry a longer label.
    fallback_for = (("pdfplumber", "pdfminer"), ("pdfminer", "pdfplumber"))

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception as e:
            # Capture the requested framework BEFORE overwriting it so the
            # warning below reports the framework that actually failed.
            original = kwargs.get("framework", "pdfplumber")
            fallback = None
            # A non-string framework cannot be matched; fall through and
            # re-raise the real error instead of failing inside the handler.
            if isinstance(original, str) and not kwargs.get("routed", False):
                fallback = next(
                    (alt for marker, alt in fallback_for if marker in original),
                    None,
                )

            if fallback is None:
                logger.error(
                    f"Failed to parse document; no fallback parser attempted (framework: {original}): {e}"
                )
                raise

            logger.warning(
                f"Retrying with {fallback} due to error: {e}. Original framework: {original}"
            )
            kwargs["framework"] = fallback
            try:
                return func(*args, **kwargs)
            except Exception as retry_error:
                # Only here have both frameworks genuinely been tried.
                logger.error(
                    f"Failed to parse document with both pdfplumber and pdfminer: {retry_error}"
                )
                raise

    return wrapper
|
56
|
+
|
57
|
+
|
58
|
+
@retry_with_different_parser
|
25
59
|
def parse_static_doc(path: str, **kwargs) -> Dict:
|
26
60
|
"""
|
27
61
|
Parses a document using static parsing methods.
|
@@ -41,7 +41,8 @@ Think step-by-step.
|
|
41
41
|
'0' is typically more oval than 'O'
|
42
42
|
'8' has a more angular top than 'B'
|
43
43
|
{custom_instructions}
|
44
|
-
- Return only the correct markdown without additional text or explanations.
|
44
|
+
- Return only the correct markdown without additional text or explanations.
|
45
|
+
- DO NOT use code blocks such as "```html" or "```markdown" in the output unless there is a code block in the content.
|
45
46
|
- Think before generating the output in <thinking></thinking> tags.
|
46
47
|
|
47
48
|
Remember, your primary objective is to create an output that, when rendered, structurally replicates the original document's content as closely as possible without losing any textual details.
|
@@ -69,15 +69,45 @@ def convert_image_to_pdf(image_path: str) -> bytes:
|
|
69
69
|
|
70
70
|
def remove_html_tags(text: str):
    """Render ``text`` as markdown (with table support) and replace each match of HTML_TAG_PATTERN with a space."""
    rendered = markdown(text, extensions=["tables"])
    stripped = re.sub(HTML_TAG_PATTERN, " ", rendered)
    return stripped
|
73
73
|
|
74
74
|
|
75
|
-
def
|
75
|
+
def clean_text(txt):
    """
    Normalise text to alphanumeric tokens separated by single spaces.

    LaTeX commands (with their optional ``[...]`` and ``{...}`` arguments) are
    dropped, every remaining character other than ASCII letters, digits and
    spaces becomes a space, and runs of spaces are collapsed. The function is
    idempotent: cleaning already-cleaned text returns it unchanged.

    Args:
        txt (str): Text to normalise.

    Returns:
        str: The normalised text with no leading or trailing whitespace.
    """
    # Remove LaTeX commands (e.g. \command, \command[args]{args})
    txt = re.sub(r"\\[a-zA-Z]+(\[[^\]]*\])?(\{[^}]*\})?", " ", txt)

    # Replace all non-alphanumeric characters except spaces (this includes
    # tabs and newlines) with a space
    txt = re.sub(r"[^a-zA-Z0-9 ]", " ", txt)

    # Collapse whitespace last, so the spaces introduced by the replacements
    # above are merged too and a single pass is enough
    txt = re.sub(r"\s+", " ", txt)

    return txt.strip()


def calculate_similarity(
    text1: str, text2: str, ignore_html: bool = True, diff_save_path: str = ""
) -> float:
    """
    Calculate similarity ratio between two texts using SequenceMatcher.

    Args:
        text1 (str): First text.
        text2 (str): Second text.
        ignore_html (bool): When True, both texts are passed through
            ``remove_html_tags`` before comparison.
        diff_save_path (str): When non-empty, the normalised texts and their
            non-equal opcodes are written to this path for debugging.

    Returns:
        float: ``SequenceMatcher.ratio()`` of the two normalised texts.
    """
    if ignore_html:
        text1 = remove_html_tags(text1)
        text2 = remove_html_tags(text2)

    text1 = clean_text(text1)
    text2 = clean_text(text2)

    sm = SequenceMatcher(None, text1, text2)
    # Save the diff and the texts for debugging
    if diff_save_path:
        with open(diff_save_path, "w", encoding="utf-8") as f:
            f.write(f"Text 1:\n{text1}\n\n")
            f.write(f"Text 2:\n{text2}\n\n")
            f.write("Differences:\n")
            for tag, i1, i2, j1, j2 in sm.get_opcodes():
                if tag == "equal":
                    continue
                f.write(f"{tag} {text1[i1:i2]} -> {text2[j1:j2]}\n")
    return sm.ratio()
|
81
111
|
|
82
112
|
|
83
113
|
def convert_pdf_page_to_image(
|
@@ -345,7 +375,7 @@ def get_webpage_soup(url: str) -> BeautifulSoup:
|
|
345
375
|
# Additional wait for any dynamic content
|
346
376
|
try:
|
347
377
|
await page.wait_for_selector("body", timeout=30000)
|
348
|
-
except:
|
378
|
+
except Exception:
|
349
379
|
pass
|
350
380
|
|
351
381
|
html = await page.content()
|
@@ -561,24 +591,32 @@ def router(path: str, priority: str = "speed") -> str:
|
|
561
591
|
priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
|
562
592
|
"""
|
563
593
|
file_type = get_file_type(path)
|
564
|
-
if
|
594
|
+
if (
|
595
|
+
file_type.startswith("text/")
|
596
|
+
or "spreadsheet" in file_type
|
597
|
+
or "presentation" in file_type
|
598
|
+
):
|
565
599
|
return "STATIC_PARSE"
|
566
600
|
|
567
601
|
if priority == "accuracy":
|
568
602
|
# If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
|
569
603
|
# Otherwise, use LLM_PARSE
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
and
|
574
|
-
):
|
604
|
+
has_image = has_image_in_pdf(path)
|
605
|
+
has_hyperlink = has_hyperlink_in_pdf(path)
|
606
|
+
if file_type == "application/pdf" and not has_image and has_hyperlink:
|
607
|
+
logger.debug("Using STATIC_PARSE for PDF with hyperlinks and no images.")
|
575
608
|
return "STATIC_PARSE"
|
609
|
+
logger.debug(
|
610
|
+
f"Using LLM_PARSE because PDF has image ({has_image}) or has no hyperlink ({has_hyperlink})."
|
611
|
+
)
|
576
612
|
return "LLM_PARSE"
|
577
613
|
else:
|
578
614
|
# If the file is a PDF without images, use STATIC_PARSE
|
579
615
|
# Otherwise, use LLM_PARSE
|
580
616
|
if file_type == "application/pdf" and not has_image_in_pdf(path):
|
617
|
+
logger.debug("Using STATIC_PARSE for PDF without images.")
|
581
618
|
return "STATIC_PARSE"
|
619
|
+
logger.debug("Using LLM_PARSE because PDF has images")
|
582
620
|
return "LLM_PARSE"
|
583
621
|
|
584
622
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "lexoid"
|
3
|
-
version = "0.1.13"
|
3
|
+
version = "0.1.15"
|
4
4
|
description = ""
|
5
5
|
authors = []
|
6
6
|
readme = "README.md"
|
@@ -30,6 +30,7 @@ huggingface-hub = "^0.27.0"
|
|
30
30
|
together = "^1.4.0"
|
31
31
|
openpyxl = "^3.1.5"
|
32
32
|
pptx2md = "^2.0.6"
|
33
|
+
anthropic = "^0.55.0"
|
33
34
|
|
34
35
|
[tool.poetry.group.dev.dependencies]
|
35
36
|
ipykernel = "^6.29.5"
|
@@ -40,6 +41,7 @@ pytest = "^8.3.2"
|
|
40
41
|
[tool.poetry.group.docs.dependencies]
|
41
42
|
sphinx = "^8.1.3"
|
42
43
|
pydata-sphinx-theme = "^0.16.1"
|
44
|
+
docutils = "^0.21.2"
|
43
45
|
|
44
46
|
[build-system]
|
45
47
|
requires = ["poetry-core", "wheel"]
|
File without changes
|