lexoid 0.1.14__tar.gz → 0.1.15__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lexoid-0.1.14 → lexoid-0.1.15}/PKG-INFO +21 -20
- {lexoid-0.1.14 → lexoid-0.1.15}/README.md +19 -19
- {lexoid-0.1.14 → lexoid-0.1.15}/lexoid/api.py +58 -8
- {lexoid-0.1.14 → lexoid-0.1.15}/lexoid/core/parse_type/llm_parser.py +108 -31
- {lexoid-0.1.14 → lexoid-0.1.15}/lexoid/core/parse_type/static_parser.py +34 -0
- {lexoid-0.1.14 → lexoid-0.1.15}/lexoid/core/utils.py +33 -3
- {lexoid-0.1.14 → lexoid-0.1.15}/pyproject.toml +3 -1
- {lexoid-0.1.14 → lexoid-0.1.15}/LICENSE +0 -0
- {lexoid-0.1.14 → lexoid-0.1.15}/lexoid/core/prompt_templates.py +0 -0
{lexoid-0.1.14 → lexoid-0.1.15}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.14
+Version: 0.1.15
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -8,6 +8,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: anthropic (>=0.55.0,<0.56.0)
 Requires-Dist: bs4 (>=0.0.2,<0.0.3)
 Requires-Dist: docx2pdf (>=0.1.8,<0.2.0)
 Requires-Dist: google-generativeai (>=0.8.1,<0.9.0)
@@ -155,23 +156,23 @@ _Note:_ Benchmarks are currently done in the zero-shot setting.
 
 | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
 | --- | --- | --- | --- | --- | --- |
-| 1 |
-| 2 | gemini-2.0-flash
-| 3 | gemini-
-| 4 | gemini-
-| 5 |
-| 6 |
-| 7 | gemini-
-| 8 |
-| 9 |
-| 10 | accounts/fireworks/models/llama4-
-| 11 | gpt-4o
-| 12 |
-| 13 |
-| 14 |
-| 15 |
-| 16 |
-| 17 |
-| 18 |
-| 19 |
+| 1 | AUTO | 0.906 | 0.112 | 9.56 | 0.00068 |
+| 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
+| 3 | gemini-2.5-flash | 0.895 | 0.148 | 54.10 | 0.01051 |
+| 4 | gemini-1.5-pro | 0.868 | 0.283 | 15.03 | 0.00637 |
+| 5 | gemini-1.5-flash | 0.864 | 0.194 | 15.47 | 0.00044 |
+| 6 | claude-3-5-sonnet-20241022 | 0.851 | 0.209 | 15.99 | 0.01758 |
+| 7 | gemini-2.5-pro | 0.849 | 0.298 | 101.95 | 0.01859 |
+| 8 | claude-sonnet-4-20250514 | 0.804 | 0.190 | 19.27 | 0.02071 |
+| 9 | claude-opus-4-20250514 | 0.772 | 0.238 | 20.03 | 0.09207 |
+| 10 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.768 | 0.234 | 12.12 | 0.00150 |
+| 11 | gpt-4o | 0.748 | 0.284 | 26.80 | 0.01478 |
+| 12 | gpt-4o-mini | 0.733 | 0.231 | 18.18 | 0.00650 |
+| 13 | gpt-4.1-mini | 0.723 | 0.269 | 20.91 | 0.00351 |
+| 14 | google/gemma-3-27b-it | 0.681 | 0.334 | 19.41 | 0.00027 |
+| 15 | gpt-4.1 | 0.650 | 0.342 | 33.72 | 0.01443 |
+| 16 | claude-3-7-sonnet-20250219 | 0.633 | 0.369 | 14.24 | 0.01763 |
+| 17 | microsoft/phi-4-multimodal-instruct | 0.622 | 0.320 | 13.15 | 0.00050 |
+| 18 | qwen/qwen-2.5-vl-7b-instruct | 0.559 | 0.348 | 17.71 | 0.00086 |
+| 19 | meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo | 0.546 | 0.239 | 29.26 | 0.01103 |
 
```
{lexoid-0.1.14 → lexoid-0.1.15}/README.md

```diff
@@ -120,22 +120,22 @@ _Note:_ Benchmarks are currently done in the zero-shot setting.
 
 | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
 | --- | --- | --- | --- | --- | --- |
-| 1 |
-| 2 | gemini-2.0-flash
-| 3 | gemini-
-| 4 | gemini-
-| 5 |
-| 6 |
-| 7 | gemini-
-| 8 |
-| 9 |
-| 10 | accounts/fireworks/models/llama4-
-| 11 | gpt-4o
-| 12 |
-| 13 |
-| 14 |
-| 15 |
-| 16 |
-| 17 |
-| 18 |
-| 19 |
+| 1 | AUTO | 0.906 | 0.112 | 9.56 | 0.00068 |
+| 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
+| 3 | gemini-2.5-flash | 0.895 | 0.148 | 54.10 | 0.01051 |
+| 4 | gemini-1.5-pro | 0.868 | 0.283 | 15.03 | 0.00637 |
+| 5 | gemini-1.5-flash | 0.864 | 0.194 | 15.47 | 0.00044 |
+| 6 | claude-3-5-sonnet-20241022 | 0.851 | 0.209 | 15.99 | 0.01758 |
+| 7 | gemini-2.5-pro | 0.849 | 0.298 | 101.95 | 0.01859 |
+| 8 | claude-sonnet-4-20250514 | 0.804 | 0.190 | 19.27 | 0.02071 |
+| 9 | claude-opus-4-20250514 | 0.772 | 0.238 | 20.03 | 0.09207 |
+| 10 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.768 | 0.234 | 12.12 | 0.00150 |
+| 11 | gpt-4o | 0.748 | 0.284 | 26.80 | 0.01478 |
+| 12 | gpt-4o-mini | 0.733 | 0.231 | 18.18 | 0.00650 |
+| 13 | gpt-4.1-mini | 0.723 | 0.269 | 20.91 | 0.00351 |
+| 14 | google/gemma-3-27b-it | 0.681 | 0.334 | 19.41 | 0.00027 |
+| 15 | gpt-4.1 | 0.650 | 0.342 | 33.72 | 0.01443 |
+| 16 | claude-3-7-sonnet-20250219 | 0.633 | 0.369 | 14.24 | 0.01763 |
+| 17 | microsoft/phi-4-multimodal-instruct | 0.622 | 0.320 | 13.15 | 0.00050 |
+| 18 | qwen/qwen-2.5-vl-7b-instruct | 0.559 | 0.348 | 17.71 | 0.00086 |
+| 19 | meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo | 0.546 | 0.239 | 29.26 | 0.01103 |
```
{lexoid-0.1.14 → lexoid-0.1.15}/lexoid/api.py

```diff
@@ -4,9 +4,10 @@ import re
 import tempfile
 from concurrent.futures import ProcessPoolExecutor
 from enum import Enum
+from functools import wraps
 from glob import glob
 from time import time
-from typing import Union, Dict, List
+from typing import Optional, Union, Dict, List
 
 from loguru import logger
 
@@ -14,6 +15,7 @@ from lexoid.core.parse_type.llm_parser import (
     parse_llm_doc,
     create_response,
     convert_doc_to_base64_images,
+    get_api_provider_for_model,
 )
 from lexoid.core.parse_type.static_parser import parse_static_doc
 from lexoid.core.utils import (
@@ -35,6 +37,51 @@ class ParserType(Enum):
     AUTO = "AUTO"
 
 
+def retry_with_different_parser_type(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            if len(args) > 0:
+                kwargs["path"] = args[0]
+            if len(args) > 1:
+                router_priority = kwargs.get("router_priority", "speed")
+                if args[1] == ParserType.AUTO:
+                    parser_type = ParserType[router(kwargs["path"], router_priority)]
+                    logger.debug(f"Auto-detected parser type: {parser_type}")
+                    kwargs["routed"] = True
+                else:
+                    parser_type = args[1]
+                kwargs["parser_type"] = parser_type
+            return func(**kwargs)
+        except Exception as e:
+            if kwargs.get("parser_type") == ParserType.LLM_PARSE and kwargs.get(
+                "routed", False
+            ):
+                logger.warning(
+                    f"LLM_PARSE failed with error: {e}. Retrying with STATIC_PARSE."
+                )
+                kwargs["parser_type"] = ParserType.STATIC_PARSE
+                kwargs["routed"] = False
+                return func(**kwargs)
+            elif kwargs.get("parser_type") == ParserType.STATIC_PARSE and kwargs.get(
+                "routed", False
+            ):
+                logger.warning(
+                    f"STATIC_PARSE failed with error: {e}. Retrying with LLM_PARSE."
+                )
+                kwargs["parser_type"] = ParserType.LLM_PARSE
+                kwargs["routed"] = False
+                return func(**kwargs)
+            else:
+                logger.error(
+                    f"Parsing failed with error: {e}. No fallback parser available."
+                )
+                raise e
+
+    return wrapper
+
+
+@retry_with_different_parser_type
 def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
     """
     Parses a file using the specified parser type.
```
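The decorator above moves AUTO routing out of `parse_chunk` itself and adds a one-shot fallback: when a routed `LLM_PARSE` call fails it is retried as `STATIC_PARSE`, and vice versa. A minimal usage sketch (not part of the diff; assumes lexoid 0.1.15 is installed, a local `report.pdf` exists, and the relevant API key is in the environment):

```python
from lexoid.api import parse_chunk, ParserType

# AUTO lets the router pick STATIC_PARSE or LLM_PARSE; because the wrapper
# marks the call as routed, a failure in the chosen parser triggers exactly
# one retry with the other parser before any exception propagates.
result = parse_chunk("report.pdf", ParserType.AUTO, router_priority="speed")
print(result["parser_used"], result["token_usage"])  # keys per the docstring above
```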
```diff
@@ -55,11 +102,6 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
         - token_usage: Dictionary containing token usage statistics
         - parser_used: Which parser was actually used
     """
-    if parser_type == ParserType.AUTO:
-        router_priority = kwargs.get("router_priority", "speed")
-        parser_type = ParserType[router(path, router_priority)]
-        logger.debug(f"Auto-detected parser type: {parser_type}")
-
     kwargs["start"] = (
         int(os.path.basename(path).split("_")[1]) - 1 if kwargs.get("split") else 0
     )
@@ -193,7 +235,7 @@ def parse(
         sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
         path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
 
-    if not path.lower().endswith(".pdf")
+    if not path.lower().endswith(".pdf"):
         kwargs["split"] = False
         result = parse_chunk_list([path], parser_type, kwargs)
     else:
@@ -300,7 +342,11 @@ def parse(
 
 
 def parse_with_schema(
-    path: str,
+    path: str,
+    schema: Dict,
+    api: Optional[str] = None,
+    model: str = "gpt-4o-mini",
+    **kwargs,
 ) -> List[List[Dict]]:
     """
     Parses a PDF using an LLM to generate structured output conforming to a given JSON schema.
@@ -315,6 +361,10 @@ def parse_with_schema(
     Returns:
         List[List[Dict]]: List of dictionaries for each page, each conforming to the provided schema.
     """
+    if not api:
+        api = get_api_provider_for_model(model)
+        logger.debug(f"Using API provider: {api}")
+
     system_prompt = f"""
     The output should be formatted as a JSON instance that conforms to the JSON schema below.
 
```
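`parse_with_schema` now spells out its full signature, and when `api` is omitted it is inferred from the model name. An illustrative sketch (the schema and file name are made up; assumes `OPENAI_API_KEY` is set for the `gpt-4o-mini` route):

```python
from lexoid.api import parse_with_schema

# Hypothetical JSON schema for the structured output.
invoice_schema = {
    "type": "object",
    "properties": {
        "invoice_number": {"type": "string"},
        "total": {"type": "number"},
    },
}

# "api" can be left out: get_api_provider_for_model("gpt-4o-mini")
# resolves it to "openai" before the request is built.
pages = parse_with_schema("invoice.pdf", schema=invoice_schema, model="gpt-4o-mini")
```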
{lexoid-0.1.14 → lexoid-0.1.15}/lexoid/core/parse_type/llm_parser.py

```diff
@@ -8,6 +8,7 @@ from typing import Dict, List, Optional, Tuple
 
 import pypdfium2 as pdfium
 import requests
+from anthropic import Anthropic
 from huggingface_hub import InferenceClient
 from loguru import logger
 from openai import OpenAI
@@ -49,36 +50,41 @@ def retry_on_http_error(func):
     return wrapper
 
 
-
-def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
-    if "api_provider" in kwargs and kwargs["api_provider"]:
-        return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
-    if "model" not in kwargs:
-        kwargs["model"] = "gemini-2.0-flash"
-    model = kwargs.get("model")
+def get_api_provider_for_model(model: str) -> str:
     if model.startswith("gemini"):
-        return
+        return "gemini"
     if model.startswith("gpt"):
-        return
+        return "openai"
     if model.startswith("meta-llama"):
         if "Turbo" in model or model == "meta-llama/Llama-Vision-Free":
-            return
-        return
+            return "together"
+        return "huggingface"
     if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
-        return
+        return "openrouter"
     if model.startswith("accounts/fireworks"):
-        return
+        return "fireworks"
+    if model.startswith("claude"):
+        return "anthropic"
     raise ValueError(f"Unsupported model: {model}")
 
 
-
-
-
-
-        raise ValueError("GOOGLE_API_KEY environment variable is not set")
+@retry_on_http_error
+def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
+    if "api_provider" in kwargs and kwargs["api_provider"]:
+        return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
 
-
+    model = kwargs.get("model", "gemini-2.0-flash")
+    kwargs["model"] = model
 
+    api_provider = get_api_provider_for_model(model)
+
+    if api_provider == "gemini":
+        return parse_with_gemini(path, **kwargs)
+    else:
+        return parse_with_api(path, api=api_provider, **kwargs)
+
+
+def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     # Check if the file is an image and convert to PDF if necessary
     mime_type, _ = mimetypes.guess_type(path)
     if mime_type and mime_type.startswith("image"):
```
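The provider routing that was inlined in `parse_llm_doc` is now the reusable `get_api_provider_for_model`, with a new `claude` prefix mapping to the Anthropic backend. Its prefix mapping, exercised directly on model names from the benchmark table:

```python
from lexoid.core.parse_type.llm_parser import get_api_provider_for_model

assert get_api_provider_for_model("gemini-2.0-flash") == "gemini"
assert get_api_provider_for_model("gpt-4o-mini") == "openai"
assert get_api_provider_for_model("claude-sonnet-4-20250514") == "anthropic"  # new in 0.1.15
assert get_api_provider_for_model("google/gemma-3-27b-it") == "openrouter"
```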
```diff
@@ -90,6 +96,20 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
         file_content = file.read()
     base64_file = base64.b64encode(file_content).decode("utf-8")
 
+    return parse_image_with_gemini(
+        base64_file=base64_file, mime_type=mime_type, **kwargs
+    )
+
+
+def parse_image_with_gemini(
+    base64_file: str, mime_type: str = "image/png", **kwargs
+) -> List[Dict] | str:
+    api_key = os.environ.get("GOOGLE_API_KEY")
+    if not api_key:
+        raise ValueError("GOOGLE_API_KEY environment variable is not set")
+
+    url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}"
+
     if "system_prompt" in kwargs:
         prompt = kwargs["system_prompt"]
     else:
```
|
|
109
129
|
}
|
110
130
|
],
|
111
131
|
"generationConfig": {
|
112
|
-
"temperature": kwargs.get("temperature", 0
|
132
|
+
"temperature": kwargs.get("temperature", 0),
|
113
133
|
},
|
114
134
|
}
|
115
135
|
|
@@ -129,24 +149,23 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
|
129
149
|
if "text" in part
|
130
150
|
)
|
131
151
|
|
132
|
-
combined_text =
|
152
|
+
combined_text = raw_text
|
133
153
|
if "<output>" in raw_text:
|
134
154
|
combined_text = raw_text.split("<output>")[-1].strip()
|
135
|
-
if "</output>" in
|
136
|
-
combined_text =
|
155
|
+
if "</output>" in combined_text:
|
156
|
+
combined_text = combined_text.split("</output>")[0].strip()
|
137
157
|
|
138
158
|
token_usage = result["usageMetadata"]
|
139
159
|
input_tokens = token_usage.get("promptTokenCount", 0)
|
140
160
|
output_tokens = token_usage.get("candidatesTokenCount", 0)
|
141
161
|
total_tokens = input_tokens + output_tokens
|
142
|
-
|
143
162
|
return {
|
144
163
|
"raw": combined_text.replace("<page-break>", "\n\n"),
|
145
164
|
"segments": [
|
146
165
|
{"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
|
147
166
|
for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
|
148
167
|
],
|
149
|
-
"title": kwargs
|
168
|
+
"title": kwargs.get("title", ""),
|
150
169
|
"url": kwargs.get("url", ""),
|
151
170
|
"parent_title": kwargs.get("parent_title", ""),
|
152
171
|
"recursive_docs": [],
|
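The rewritten tag handling falls back to the full response when the model omits the `<output>`/`</output>` markers instead of assuming they are present. The same logic rerun standalone on a made-up response:

```python
raw_text = "Some reasoning.\n<output>Page one<page-break>Page two</output>"

combined_text = raw_text  # fallback when no tags are present
if "<output>" in raw_text:
    combined_text = raw_text.split("<output>")[-1].strip()
    if "</output>" in combined_text:
        combined_text = combined_text.split("</output>")[0].strip()

print(combined_text.split("<page-break>"))  # ['Page one', 'Page two']
```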
```diff
@@ -218,7 +237,7 @@ def create_response(
     system_prompt: Optional[str] = None,
     user_prompt: Optional[str] = None,
     image_url: Optional[str] = None,
-    temperature: float = 0.
+    temperature: float = 0.0,
     max_tokens: int = 1024,
 ) -> Dict:
     # Initialize appropriate client
@@ -236,10 +255,64 @@ def create_response(
             base_url="https://api.fireworks.ai/inference/v1",
             api_key=os.environ["FIREWORKS_API_KEY"],
         ),
+        "anthropic": lambda: Anthropic(
+            api_key=os.environ["ANTHROPIC_API_KEY"],
+        ),
+        "gemini": lambda: None,  # Gemini is handled separately
     }
     assert api in clients, f"Unsupported API: {api}"
+
+    if api == "gemini":
+        image_url = image_url.split("data:image/png;base64,")[1]
+        response = parse_image_with_gemini(
+            base64_file=image_url,
+            model=model,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            system_prompt=system_prompt,
+        )
+        return {
+            "response": response["raw"],
+            "usage": response["token_usage"],
+        }
+
     client = clients[api]()
 
+    if api == "anthropic":
+        image_media_type = image_url.split(";")[0].split(":")[1]
+        image_data = image_url.split(",")[1]
+        response = client.messages.create(
+            model=model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image",
+                            "source": {
+                                "type": "base64",
+                                "media_type": image_media_type,
+                                "data": image_data,
+                            },
+                        },
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ],
+            max_tokens=max_tokens,
+            temperature=temperature,
+        )
+
+        return {
+            "response": response.content[0].text,
+            "usage": {
+                "input_tokens": response.usage.input_tokens,
+                "output_tokens": response.usage.output_tokens,
+                "total_tokens": response.usage.input_tokens
+                + response.usage.output_tokens,
+            },
+        }
+
     # Prepare messages for the API call
     messages = get_messages(system_prompt, user_prompt, image_url)
 
```
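Both new branches receive the image as a data URL and slice it apart before calling the provider. The anthropic branch's splitting, shown on a truncated illustrative payload:

```python
image_url = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUg..."  # truncated example

image_media_type = image_url.split(";")[0].split(":")[1]  # "image/png"
image_data = image_url.split(",")[1]                      # bare base64 payload
```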
```diff
@@ -260,7 +333,11 @@
 
     return {
         "response": page_text,
-        "usage":
+        "usage": {
+            "input_tokens": token_usage.prompt_tokens,
+            "output_tokens": token_usage.completion_tokens,
+            "total_tokens": token_usage.total_tokens,
+        },
     }
 
 
@@ -314,7 +391,7 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
             system_prompt=system_prompt,
             user_prompt=user_prompt,
             image_url=image_url,
-            temperature=kwargs.get("temperature", 0.
+            temperature=kwargs.get("temperature", 0.0),
             max_tokens=kwargs.get("max_tokens", 1024),
         )
 
@@ -335,9 +412,9 @@
         (
             page_num,
             result,
-            token_usage
-            token_usage
-            token_usage
+            token_usage["input_tokens"],
+            token_usage["output_tokens"],
+            token_usage["total_tokens"],
         )
     )
 
```
{lexoid-0.1.14 → lexoid-0.1.15}/lexoid/core/parse_type/static_parser.py

```diff
@@ -1,12 +1,14 @@
 import os
 import re
 import tempfile
+from functools import wraps
 from time import time
 from typing import Dict, List
 
 import pandas as pd
 import pdfplumber
 from docx import Document
+from loguru import logger
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer
 from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
@@ -22,6 +24,38 @@ from lexoid.core.utils import (
 )
 
 
+def retry_with_different_parser(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except Exception as e:
+            if "pdfplumber" in kwargs.get("framework", "pdfplumber") and not kwargs.get(
+                "routed", False
+            ):
+                kwargs["framework"] = "pdfminer"
+                logger.warning(
+                    f"Retrying with pdfminer due to error: {e}. Original framework: {kwargs['framework']}"
+                )
+                return func(*args, **kwargs)
+            elif "pdfminer" in kwargs.get("framework", "pdfplumber") and not kwargs.get(
+                "routed", False
+            ):
+                kwargs["framework"] = "pdfplumber"
+                logger.warning(
+                    f"Retrying with pdfplumber due to error: {e}. Original framework: {kwargs['framework']}"
+                )
+                return func(*args, **kwargs)
+            else:
+                logger.error(
+                    f"Failed to parse document with both pdfplumber and pdfminer: {e}"
+                )
+                raise e
+
+    return wrapper
+
+
+@retry_with_different_parser
 def parse_static_doc(path: str, **kwargs) -> Dict:
     """
     Parses a document using static parsing methods.
```
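`parse_static_doc` gets the same one-shot fallback at the framework level: an unrouted pdfplumber failure is retried with pdfminer, and vice versa. A usage sketch (file name hypothetical; when called outside `parse_chunk`, `parse_static_doc` may expect further kwargs that the higher-level API normally supplies):

```python
from lexoid.core.parse_type.static_parser import parse_static_doc

# If pdfplumber raises anywhere inside, the decorator swaps the framework
# to "pdfminer" and retries once; a second failure propagates.
doc = parse_static_doc("scanned.pdf", framework="pdfplumber")
```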
{lexoid-0.1.14 → lexoid-0.1.15}/lexoid/core/utils.py

```diff
@@ -69,15 +69,45 @@ def convert_image_to_pdf(image_path: str) -> bytes:
 
 def remove_html_tags(text: str):
     html = markdown(text, extensions=["tables"])
-    return re.sub(HTML_TAG_PATTERN, "", html)
+    return re.sub(HTML_TAG_PATTERN, " ", html)
 
 
-def
+def clean_text(txt):
+    # Remove LaTeX commands (e.g. \command, \command[args]{args})
+    txt = re.sub(r"\\[a-zA-Z]+(\[[^\]]*\])?(\{[^}]*\})?", " ", txt)
+
+    # Replace all blocks of whitespace (including tabs and newlines) with a single space
+    txt = re.sub(r"\s+", " ", txt)
+
+    # Remove all non-alphanumeric characters except spaces
+    txt = re.sub(r"[^a-zA-Z0-9 ]", " ", txt)
+
+    return txt.strip()
+
+
+def calculate_similarity(
+    text1: str, text2: str, ignore_html: bool = True, diff_save_path: str = ""
+) -> float:
     """Calculate similarity ratio between two texts using SequenceMatcher."""
     if ignore_html:
         text1 = remove_html_tags(text1)
         text2 = remove_html_tags(text2)
-
+
+    text1 = clean_text(clean_text(text1))
+    text2 = clean_text(clean_text(text2))
+
+    sm = SequenceMatcher(None, text1, text2)
+    # Save the diff and the texts for debugging
+    if diff_save_path:
+        with open(diff_save_path, "w") as f:
+            f.write(f"Text 1:\n{text1}\n\n")
+            f.write(f"Text 2:\n{text2}\n\n")
+            f.write("Differences:\n")
+            for tag, i1, i2, j1, j2 in sm.get_opcodes():
+                if tag == "equal":
+                    continue
+                f.write(f"{tag} {text1[i1:i2]} -> {text2[j1:j2]}\n")
+    return sm.ratio()
 
 
 def convert_pdf_page_to_image(
```
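With `clean_text` stripping LaTeX commands, collapsing whitespace, and dropping punctuation (applied twice, which re-collapses the spaces the punctuation replacement introduces), `calculate_similarity` now compares normalized text and can dump an opcode-level diff for debugging. An illustrative run on made-up strings:

```python
from lexoid.core.utils import calculate_similarity

a = "<b>Revenue</b> grew 12% in 2024."
b = "Revenue grew 12 percent in 2024."

score = calculate_similarity(a, b, diff_save_path="similarity_diff.txt")
print(f"{score:.3f}")  # non-equal SequenceMatcher opcodes land in similarity_diff.txt
```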
{lexoid-0.1.14 → lexoid-0.1.15}/pyproject.toml

```diff
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "lexoid"
-version = "0.1.14"
+version = "0.1.15"
 description = ""
 authors = []
 readme = "README.md"
@@ -30,6 +30,7 @@ huggingface-hub = "^0.27.0"
 together = "^1.4.0"
 openpyxl = "^3.1.5"
 pptx2md = "^2.0.6"
+anthropic = "^0.55.0"
 
 [tool.poetry.group.dev.dependencies]
 ipykernel = "^6.29.5"
@@ -40,6 +41,7 @@ pytest = "^8.3.2"
 [tool.poetry.group.docs.dependencies]
 sphinx = "^8.1.3"
 pydata-sphinx-theme = "^0.16.1"
+docutils = "^0.21.2"
 
 [build-system]
 requires = ["poetry-core", "wheel"]
```
- {lexoid-0.1.14 → lexoid-0.1.15}/LICENSE: file without changes
- {lexoid-0.1.14 → lexoid-0.1.15}/lexoid/core/prompt_templates.py: file without changes