lexoid 0.1.11__py3-none-any.whl → 0.1.12__py3-none-any.whl
This diff shows the changes between publicly available package versions released to one of the supported registries, exactly as they appear in those registries. It is provided for informational purposes only.
- lexoid/api.py +48 -8
- lexoid/core/parse_type/llm_parser.py +34 -26
- lexoid/core/parse_type/static_parser.py +41 -6
- lexoid/core/utils.py +15 -11
- {lexoid-0.1.11.dist-info → lexoid-0.1.12.dist-info}/METADATA +60 -20
- lexoid-0.1.12.dist-info/RECORD +9 -0
- lexoid-0.1.11.dist-info/RECORD +0 -9
- {lexoid-0.1.11.dist-info → lexoid-0.1.12.dist-info}/LICENSE +0 -0
- {lexoid-0.1.11.dist-info → lexoid-0.1.12.dist-info}/WHEEL +0 -0
lexoid/api.py
CHANGED
@@ -1,3 +1,4 @@
+import json
 import os
 import re
 import tempfile
@@ -50,7 +51,8 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
         - token_usage: Dictionary containing token usage statistics
     """
     if parser_type == ParserType.AUTO:
-        parser_type = ParserType[router(path)]
+        router_priority = kwargs.get("router_priority", "speed")
+        parser_type = ParserType[router(path, router_priority)]
        logger.debug(f"Auto-detected parser type: {parser_type}")

    kwargs["start"] = (
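With this change, AUTO routing accepts an optional `router_priority` keyword (defaulting to `"speed"`) that is forwarded to `router()` in `utils.py`. A minimal sketch of opting into accuracy-first routing; the file name is a placeholder:

```python
from lexoid.api import parse

# "speed" (the default) prefers STATIC_PARSE; "accuracy" prefers LLM_PARSE
# for PDFs that contain images. "sample.pdf" is a hypothetical path.
result = parse("sample.pdf", parser_type="AUTO", router_priority="accuracy")
```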
@@ -80,7 +82,7 @@ def parse_chunk_list(
     """
     combined_segments = []
     raw_texts = []
-    token_usage = {"input": 0, "output": 0}
+    token_usage = {"input": 0, "output": 0, "image_count": 0}
     for file_path in file_paths:
         result = parse_chunk(file_path, parser_type, **kwargs)
         combined_segments.extend(result["segments"])
@@ -88,6 +90,7 @@ def parse_chunk_list(
         if "token_usage" in result:
             token_usage["input"] += result["token_usage"]["input"]
             token_usage["output"] += result["token_usage"]["output"]
+            token_usage["image_count"] += len(result["segments"])
     token_usage["total"] = token_usage["input"] + token_usage["output"]

     return {
@@ -135,14 +138,20 @@ def parse(

     if type(parser_type) == str:
         parser_type = ParserType[parser_type]
+    if (
+        path.lower().endswith((".doc", ".docx"))
+        and parser_type != ParserType.STATIC_PARSE
+    ):
+        as_pdf = True
+    if path.lower().endswith(".xlsx") and parser_type == ParserType.LLM_PARSE:
+        logger.warning("LLM_PARSE does not support .xlsx files. Using STATIC_PARSE.")
+        parser_type = ParserType.STATIC_PARSE
+    if path.lower().endswith(".pptx") and parser_type == ParserType.LLM_PARSE:
+        logger.warning("LLM_PARSE does not support .pptx files. Using STATIC_PARSE.")
+        parser_type = ParserType.STATIC_PARSE

     with tempfile.TemporaryDirectory() as temp_dir:
-        if (
-            path.lower().endswith((".doc", ".docx"))
-            and parser_type != ParserType.STATIC_PARSE
-        ):
-            as_pdf = True
-
+        kwargs["temp_dir"] = temp_dir
         if path.startswith(("http://", "https://")):
             kwargs["url"] = path
             download_dir = kwargs.get("save_dir", os.path.join(temp_dir, "downloads/"))
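`parse()` now normalizes Office formats up front: .doc/.docx are flagged for PDF conversion unless STATIC_PARSE is forced, and .xlsx/.pptx requests for LLM_PARSE are downgraded to STATIC_PARSE with a warning. A sketch of the resulting behavior, with hypothetical file names:

```python
from lexoid.api import parse

# .docx is converted to PDF first (as_pdf=True) unless STATIC_PARSE is forced.
docx_result = parse("report.docx", parser_type="LLM_PARSE")

# These two fall back to STATIC_PARSE and log a warning.
xlsx_result = parse("table.xlsx", parser_type="LLM_PARSE")
pptx_result = parse("slides.pptx", parser_type="LLM_PARSE")
```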
@@ -210,9 +219,40 @@ def parse(
             "token_usage": {
                 "input": sum(r["token_usage"]["input"] for r in chunk_results),
                 "output": sum(r["token_usage"]["output"] for r in chunk_results),
+                "image_count": sum(
+                    r["token_usage"]["image_count"] for r in chunk_results
+                ),
                 "total": sum(r["token_usage"]["total"] for r in chunk_results),
             },
         }
+
+        if "api_cost_mapping" in kwargs:
+            api_cost_mapping = kwargs["api_cost_mapping"]
+            if isinstance(api_cost_mapping, dict):
+                api_cost_mapping = api_cost_mapping
+            elif isinstance(api_cost_mapping, str) and os.path.exists(
+                api_cost_mapping
+            ):
+                with open(api_cost_mapping, "r") as f:
+                    api_cost_mapping = json.load(f)
+            else:
+                raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
+
+            api_cost = api_cost_mapping.get(
+                kwargs.get("model", "gemini-2.0-flash"), None
+            )
+            if api_cost:
+                token_usage = result["token_usage"]
+                token_cost = {
+                    "input": token_usage["input"] * api_cost["input"] / 1_000_000
+                    + api_cost.get("input-image", 0) * token_usage["image_count"],
+                    "output": token_usage["output"]
+                    * api_cost["output"]
+                    / 1_000_000,
+                }
+                token_cost["total"] = token_cost["input"] + token_cost["output"]
+                result["token_cost"] = token_cost
+
         if as_pdf:
             result["pdf_path"] = path

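The new `api_cost_mapping` kwarg (a dict, or a path to a JSON file of the same shape) lets `parse()` attach a `token_cost` estimate to the result. Per the code above, rates are per million tokens, and an optional `input-image` rate is charged once per counted image. A sketch with made-up rates:

```python
from lexoid.api import parse

# Hypothetical rates: $0.10/M input tokens, $0.40/M output tokens,
# plus a flat $0.0001 per input image.
cost_map = {"gemini-2.0-flash": {"input": 0.10, "output": 0.40, "input-image": 0.0001}}

result = parse("sample.pdf", parser_type="LLM_PARSE", api_cost_mapping=cost_map)

# For, say, 50,000 input tokens, 8,000 output tokens, and 4 images:
#   input  = 50_000 * 0.10 / 1_000_000 + 0.0001 * 4 = 0.0054
#   output =  8_000 * 0.40 / 1_000_000              = 0.0032
print(result["token_cost"])  # {"input": ..., "output": ..., "total": ...}
```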
lexoid/core/parse_type/llm_parser.py
CHANGED
@@ -31,6 +31,7 @@ def retry_on_http_error(func):
             logger.error(f"HTTPError encountered: {e}. Retrying in 10 seconds...")
             time.sleep(10)
             try:
+                logger.debug(f"Retry {func.__name__}")
                 return func(*args, **kwargs)
             except HTTPError as e:
                 logger.error(f"Retry failed: {e}")
@@ -49,6 +50,8 @@ def retry_on_http_error(func):

 @retry_on_http_error
 def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
+    if "api_provider" in kwargs and kwargs["api_provider"]:
+        return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
     if "model" not in kwargs:
         kwargs["model"] = "gemini-2.0-flash"
     model = kwargs.get("model")
@@ -57,9 +60,11 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if model.startswith("gpt"):
         return parse_with_api(path, api="openai", **kwargs)
     if model.startswith("meta-llama"):
-        if
+        if "Turbo" in model or model == "meta-llama/Llama-Vision-Free":
             return parse_with_api(path, api="together", **kwargs)
         return parse_with_api(path, api="huggingface", **kwargs)
+    if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
+        return parse_with_api(path, api="openrouter", **kwargs)
     raise ValueError(f"Unsupported model: {model}")

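Provider selection can now be pinned explicitly with the `api_provider` kwarg (see the previous hunk), which short-circuits the model-prefix rules above ("gpt" → OpenAI, "meta-llama" → Together or Hugging Face, "microsoft"/"google"/"qwen" → OpenRouter). A sketch, assuming the kwarg is threaded through from `parse()`; the path is hypothetical:

```python
from lexoid.api import parse

# The "qwen" prefix alone routes to OpenRouter...
result = parse("sample.pdf", parser_type="LLM_PARSE",
               model="qwen/qwen-2.5-vl-7b-instruct")

# ...but api_provider makes the choice explicit and skips prefix matching.
result = parse("sample.pdf", parser_type="LLM_PARSE",
               model="qwen/qwen-2.5-vl-7b-instruct", api_provider="openrouter")
```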
@@ -81,20 +86,20 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
         file_content = file.read()
         base64_file = base64.b64encode(file_content).decode("utf-8")

-    # Ideally, we do this ourselves. But, for now this might be a good enough.
-    custom_instruction = f"""- Total number of pages: {kwargs["pages_per_split_"]}. {INSTRUCTIONS_ADD_PG_BREAK}"""
-    if kwargs["pages_per_split_"] == 1:
-        custom_instruction = ""
+    if "system_prompt" in kwargs:
+        prompt = kwargs["system_prompt"]
+    else:
+        # Ideally, we do this ourselves. But, for now this might be a good enough.
+        custom_instruction = f"""- Total number of pages: {kwargs["pages_per_split_"]}. {INSTRUCTIONS_ADD_PG_BREAK}"""
+        if kwargs["pages_per_split_"] == 1:
+            custom_instruction = ""
+        prompt = PARSER_PROMPT.format(custom_instructions=custom_instruction)

     payload = {
         "contents": [
             {
                 "parts": [
-                    {
-                        "text": PARSER_PROMPT.format(
-                            custom_instructions=custom_instruction
-                        )
-                    },
+                    {"text": prompt},
                     {"inline_data": {"mime_type": mime_type, "data": base64_file}},
                 ]
             }
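`parse_with_gemini` now honors a caller-supplied `system_prompt` and only falls back to `PARSER_PROMPT` (with the page-count/page-break instruction) when none is given. A sketch; the prompt text is illustrative:

```python
from lexoid.api import parse

# A custom system_prompt replaces PARSER_PROMPT entirely, so it should carry
# any formatting instructions you still want (e.g., page-break markers).
result = parse(
    "sample.pdf",
    parser_type="LLM_PARSE",
    model="gemini-2.0-flash",
    system_prompt="Transcribe this document into GitHub-flavored Markdown.",
)
```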
@@ -105,9 +110,11 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     }

     headers = {"Content-Type": "application/json"}
-
-    response = requests.post(url, json=payload, headers=headers)
-    response.raise_for_status()
+    try:
+        response = requests.post(url, json=payload, headers=headers, timeout=120)
+        response.raise_for_status()
+    except requests.Timeout as e:
+        raise HTTPError(f"Timeout error occurred: {e}")

     result = response.json()

@@ -130,7 +137,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     total_tokens = input_tokens + output_tokens

     return {
-        "raw": combined_text,
+        "raw": combined_text.replace("<page-break>", "\n\n"),
         "segments": [
             {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
             for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
@@ -181,6 +188,10 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
             token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
         ),
         "together": lambda: Together(),
+        "openrouter": lambda: OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=os.environ["OPENROUTER_API_KEY"],
+        ),
     }
     assert api in clients, f"Unsupported API: {api}"
     logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
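The OpenRouter client is the standard OpenAI SDK pointed at OpenRouter's OpenAI-compatible endpoint, and the API key is read eagerly from the environment when the client is built. A sketch with a placeholder key:

```python
import os

# Must be set before parsing with an OpenRouter-served model, alongside the
# existing GOOGLE_API_KEY / OPENAI_API_KEY style variables.
os.environ["OPENROUTER_API_KEY"] = "sk-or-..."  # placeholder value
```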
@@ -206,35 +217,32 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:

     # API-specific message formatting
     def get_messages(page_num: int, image_url: str) -> List[Dict]:
-        base_message = {
-            "type": "text",
-            "text": LLAMA_PARSER_PROMPT,
-        }
         image_message = {
             "type": "image_url",
             "image_url": {"url": image_url},
         }

         if api == "openai":
+            system_prompt = kwargs.get(
+                "system_prompt", PARSER_PROMPT.format(custom_instructions="")
+            )
+            user_prompt = kwargs.get("user_prompt", OPENAI_USER_PROMPT)
             return [
                 {
                     "role": "system",
-                    "content": PARSER_PROMPT.format(
-                        custom_instructions=INSTRUCTIONS_ADD_PG_BREAK
-                    ),
+                    "content": system_prompt,
                 },
                 {
                     "role": "user",
                     "content": [
-                        {
-                            "type": "text",
-                            "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
-                        },
+                        {"type": "text", "text": user_prompt},
                         image_message,
                     ],
                 },
             ]
         else:
+            prompt = kwargs.get("system_prompt", LLAMA_PARSER_PROMPT)
+            base_message = {"type": "text", "text": prompt}
             return [
                 {
                     "role": "user",
@@ -283,7 +291,7 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
     # Sort results by page number and combine
     all_results.sort(key=lambda x: x[0])
     all_texts = [text for _, text, _, _, _ in all_results]
-    combined_text = "
+    combined_text = "\n\n".join(all_texts)

     return {
         "raw": combined_text,
lexoid/core/parse_type/static_parser.py
CHANGED
@@ -1,12 +1,23 @@
+import os
 import tempfile
+from time import time
+from typing import List, Dict
+
 import pandas as pd
 import pdfplumber
-from
-from lexoid.core.utils import get_file_type, get_uri_rect, html_to_markdown, split_pdf
+from docx import Document
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer
 from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
-from
+from pptx2md import convert, ConversionConfig
+
+from lexoid.core.utils import (
+    get_file_type,
+    get_uri_rect,
+    html_to_markdown,
+    split_pdf,
+    split_md_by_headings,
+)


 def parse_static_doc(path: str, **kwargs) -> Dict:
@@ -47,8 +58,11 @@ def parse_static_doc(path: str, **kwargs) -> Dict:
             "parent_title": kwargs.get("parent_title", ""),
             "recursive_docs": [],
         }
-    elif file_type == "text/csv":
-        df = pd.read_csv(path)
+    elif file_type == "text/csv" or "spreadsheet" in file_type:
+        if "spreadsheet" in file_type:
+            df = pd.read_excel(path)
+        else:
+            df = pd.read_csv(path)
         content = df.to_markdown(index=False)
         return {
             "raw": content,
@@ -58,6 +72,27 @@ def parse_static_doc(path: str, **kwargs) -> Dict:
             "parent_title": kwargs.get("parent_title", ""),
             "recursive_docs": [],
         }
+    elif "presentation" in file_type:
+        md_path = os.path.join(kwargs["temp_dir"], f"{int(time())}.md")
+        convert(
+            ConversionConfig(
+                pptx_path=path,
+                output_path=md_path,
+                image_dir=None,
+                disable_image=True,
+                disable_notes=True,
+            )
+        )
+        with open(md_path, "r") as f:
+            content = f.read()
+        return {
+            "raw": content,
+            "segments": split_md_by_headings(content, "#"),
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
     else:
         raise ValueError(f"Unsupported file type: {file_type}")

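The presentation branch converts the deck to Markdown via pptx2md and then segments it by headings. Because the intermediate file lands in `kwargs["temp_dir"]` (which `parse()` now always sets, per the api.py hunk above), a direct call to `parse_static_doc` must supply that key itself. A sketch with a hypothetical file:

```python
import tempfile

from lexoid.core.parse_type.static_parser import parse_static_doc

# parse() provides temp_dir and title automatically; a direct call must not omit them.
with tempfile.TemporaryDirectory() as tmp:
    result = parse_static_doc("slides.pptx", temp_dir=tmp, title="slides.pptx")
    print(len(result["segments"]), "heading-level segments")
```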
@@ -389,7 +424,7 @@ def parse_with_pdfplumber(path: str, **kwargs) -> Dict:
     ]

     return {
-        "raw": "
+        "raw": "\n\n".join(page_texts),
         "segments": segments,
         "title": kwargs["title"],
         "url": kwargs.get("url", ""),
lexoid/core/utils.py
CHANGED
@@ -46,7 +46,7 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):


 def create_sub_pdf(
-    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...]|int] = None
+    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...] | int] = None
 ) -> str:
     if isinstance(page_nums, int):
         page_nums = (page_nums,)
@@ -106,6 +106,8 @@ def is_supported_file_type(path: str) -> bool:
     if (
         file_type == "application/pdf"
         or "wordprocessing" in file_type
+        or "spreadsheet" in file_type
+        or "presentation" in file_type
         or file_type.startswith("image/")
         or file_type.startswith("text")
     ):
@@ -217,7 +219,7 @@ def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Di
         pattern = r"^([^\n]+)\n-+$"
         sections = re.split(pattern, markdown_content, flags=re.MULTILINE)
         # Remove empty sections and strip whitespace
-        sections = [section.strip() for section in sections
+        sections = [section.strip() for section in sections]

         # Handle content before first heading if it exists
         if sections and not re.match(r"^[^\n]+\n-+$", sections[0], re.MULTILINE):
@@ -244,7 +246,7 @@ def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Di
         headings = re.findall(regex, markdown_content, flags=re.MULTILINE)

         # Remove empty sections and strip whitespace
-        sections = [section.strip() for section in sections
+        sections = [section.strip() for section in sections]

         # Handle content before first heading if it exists
         if len(sections) > len(headings):
@@ -299,6 +301,7 @@ def html_to_markdown(html: str, title: str, url: str) -> str:

     return content

+
 def get_webpage_soup(url: str) -> BeautifulSoup:
     try:
         from playwright.async_api import async_playwright
@@ -473,7 +476,10 @@ def save_webpage_as_pdf(url: str, output_path: str) -> str:
     Returns:
         str: The path to the saved PDF file.
     """
-    app = QApplication(sys.argv)
+    if not QApplication.instance():
+        app = QApplication(sys.argv)
+    else:
+        app = QApplication.instance()
     web = QWebEngineView()
     web.load(QUrl(url))

@@ -546,7 +552,7 @@ def has_hyperlink_in_pdf(path: str):
     )


-def router(path: str, priority: str = "accuracy") -> str:
+def router(path: str, priority: str = "speed") -> str:
     """
     Routes the file path to the appropriate parser based on the file type.

@@ -555,9 +561,9 @@ def router(path: str, priority: str = "accuracy") -> str:
         priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
     """
     file_type = get_file_type(path)
-    if file_type.startswith("text/"):
+    if file_type.startswith("text/") or "spreadsheet" in file_type or "presentation" in file_type:
         return "STATIC_PARSE"
-
+
     if priority == "accuracy":
         # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
         # Otherwise, use LLM_PARSE
@@ -571,13 +577,11 @@ def router(path: str, priority: str = "accuracy") -> str:
     else:
         # If the file is a PDF without images, use STATIC_PARSE
         # Otherwise, use LLM_PARSE
-        if (
-            file_type == "application/pdf"
-            and not has_image_in_pdf(path)
-        ):
+        if file_type == "application/pdf" and not has_image_in_pdf(path):
             return "STATIC_PARSE"
         return "LLM_PARSE"

+
 def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
     temp_path = os.path.join(
         temp_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
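With the default priority flipped from "accuracy" to "speed", `router()` now also sends spreadsheets and presentations straight to STATIC_PARSE, and image-free PDFs stay static unless the caller opts back into accuracy. A sketch; the path is hypothetical:

```python
from lexoid.core.utils import router

# Default is now priority="speed" (it was "accuracy" in 0.1.11).
print(router("sample.pdf"))                       # STATIC_PARSE if the PDF has no images
print(router("sample.pdf", priority="accuracy"))  # LLM_PARSE, unless the PDF has no
                                                  # images but does contain hyperlinks
```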
{lexoid-0.1.11.dist-info → lexoid-0.1.12.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.11
+Version: 0.1.12
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -18,10 +18,12 @@ Requires-Dist: markdownify (>=0.13.1,<0.14.0)
 Requires-Dist: nest-asyncio (>=1.6.0,<2.0.0)
 Requires-Dist: openai (>=1.47.0,<2.0.0)
 Requires-Dist: opencv-python (>=4.10.0.84,<5.0.0.0)
+Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: pdfplumber (>=0.11.4,<0.12.0)
 Requires-Dist: pikepdf (>=9.3.0,<10.0.0)
 Requires-Dist: playwright (>=1.49.0,<2.0.0)
+Requires-Dist: pptx2md (>=2.0.6,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
 Requires-Dist: pyqt5 (>=5.15.11,<6.0.0) ; platform_system != "debian"
 Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
@@ -31,7 +33,20 @@ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: together (>=1.4.0,<2.0.0)
 Description-Content-Type: text/markdown

-
+<div align="center">
+
+```
+ ___      _______  __   __  _______  ___   ______
+|   |    |       ||  |_|  ||       ||   | |      |
+|   |    |    ___||       ||   _   ||   | |  _    |
+|   |    |   |___ |       ||  | |  ||   | | | |   |
+|   |___ |    ___| |     | |  |_|  ||   | | |_|   |
+|       ||   |___ |   _   ||       ||   | |       |
+|_______||_______||__| |__||_______||___| |______|
+
+```
+
+</div>

 [](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
 [](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
@@ -43,54 +58,67 @@ Lexoid is an efficient document parsing library that supports both LLM-based and
 [Documentation](https://oidlabs-com.github.io/Lexoid/)

 ## Motivation:
+
 - Use the multi-modal advancement of LLMs
 - Enable convenience for users
 - Collaborate with a permissive license

 ## Installation
+
 ### Installing with pip
+
 ```
 pip install lexoid
 ```

 To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
+
 ```
 OPENAI_API_KEY=""
 GOOGLE_API_KEY=""
 ```

 Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library):
+
 ```
 playwright install --with-deps --only-shell chromium
 ```

 ### Building `.whl` from source
+
 ```
 make build
 ```

 ### Creating a local installation
+
 To install dependencies:
+
 ```
 make install
 ```
+
 or, to install with dev-dependencies:
+
 ```
 make dev
 ```

 To activate virtual environment:
+
 ```
 source .venv/bin/activate
 ```

 ## Usage
+
 [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)

-[Example Colab Notebook](https://
+[Example Colab Notebook](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)

 Here's a quick example to parse documents using Lexoid:
-
+
+```python
 from lexoid.api import parse
 from lexoid.api import ParserType

@@ -103,30 +131,42 @@ print(parsed_md)
 ```

 ### Parameters
+
 - path (str): The file path or URL.
 - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
 - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
 - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
-- **kwargs: Additional arguments for the parser.
+- \*\*kwargs: Additional arguments for the parser.
+
+## Supported API Providers
+* Google
+* OpenAI
+* Hugging Face
+* Together AI
+* OpenRouter

 ## Benchmark
+
 Results aggregated across 5 iterations each for 5 documents.

 _Note:_ Benchmarks are currently done in the zero-shot setting.

-| Rank | Model
-|
-| 1
-| 2
-| 3
-| 4
-| 5
-| 6
-| 7
-| 8
-| 9
-| 10
-| 11
-| 12
-| 13
+| Rank | Model                                                 | Mean Similarity | Std. Dev. | Time (s) | Cost($)  |
+| ---- | ----------------------------------------------------- | --------------- | --------- | -------- | -------- |
+| 1    | gemini-2.0-flash                                      | 0.829           | 0.102     | 7.41     | 0.000480 |
+| 2    | gemini-2.0-flash-001                                  | 0.814           | 0.176     | 6.85     | 0.000421 |
+| 3    | gemini-1.5-flash                                      | 0.797           | 0.143     | 9.54     | 0.000238 |
+| 4    | gemini-2.0-pro-exp                                    | 0.764           | 0.227     | 11.95    | TBA      |
+| 5    | gemini-2.0-flash-thinking-exp                         | 0.746           | 0.266     | 10.46    | TBA      |
+| 6    | gemini-1.5-pro                                        | 0.732           | 0.265     | 11.44    | 0.003332 |
+| 7    | gpt-4o                                                | 0.687           | 0.247     | 10.16    | 0.004736 |
+| 8    | gpt-4o-mini                                           | 0.642           | 0.213     | 9.71     | 0.000275 |
+| 9    | gemma-3-27b-it (via OpenRouter)                       | 0.628           | 0.299     | 18.79    | 0.000096 |
+| 10   | gemini-1.5-flash-8b                                   | 0.551           | 0.223     | 3.91     | 0.000055 |
+| 11   | Llama-Vision-Free (via Together AI)                   | 0.531           | 0.198     | 6.93     | 0        |
+| 12   | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524           | 0.192     | 3.68     | 0.000060 |
+| 13   | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter)         | 0.482           | 0.209     | 11.53    | 0.000052 |
+| 14   | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461           | 0.306     | 19.26    | 0.000426 |
+| 15   | Llama-3.2-11B-Vision-Instruct (via Hugging Face)      | 0.451           | 0.257     | 4.54     | 0        |
+| 16   | microsoft/phi-4-multimodal-instruct (via OpenRouter)  | 0.366           | 0.287     | 10.80    | 0.000019 |

lexoid-0.1.12.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=lTkUcbGML29JrWJv4pE_ZqbzeJuHUE8b6OnijoLBEfU,11350
+lexoid/core/parse_type/llm_parser.py,sha256=rrc1Lwp-6ZAi8IVp3672mHAHUs1JefhT2rnYyQ1gA5E,11292
+lexoid/core/parse_type/static_parser.py,sha256=v4GWUmZVBBIF9TnbkhPBt2gspk0Oq_ujtNGnXZHLBr8,15055
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=6s24X3-4Y57u70HzjIS798Tg8qx6Z3mLATf4xtENE-8,19718
+lexoid-0.1.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.12.dist-info/METADATA,sha256=XMHFMqwDj2DgSaZcZjXU881NxdPsRGBAsUyPyRsJvyU,6809
+lexoid-0.1.12.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.12.dist-info/RECORD,,
lexoid-0.1.11.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=CIZBNvh38PJbD0OwK1Mp0qqkWxkAEBw2L_FkoCmagXA,9288
-lexoid/core/parse_type/llm_parser.py,sha256=XfsN6RAtb14p31U2jL-9QyRKpkNAGXXiK3urWJIFi2U,10625
-lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=1If_3XoUhPQRY5XMzLJBsHdyjtLgD734eYBYvsg8w5Y,19569
-lexoid-0.1.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.11.dist-info/METADATA,sha256=kipDZLbUz_wkJUrzPGH2VppBNMHmaJadHR5_BAqHgjU,4838
-lexoid-0.1.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.11.dist-info/RECORD,,
File without changes
|
File without changes
|