lexoid 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/api.py +13 -3
- lexoid/core/parse_type/llm_parser.py +7 -4
- lexoid/core/utils.py +59 -27
- {lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/METADATA +17 -14
- lexoid-0.1.11.dist-info/RECORD +9 -0
- lexoid-0.1.9.dist-info/RECORD +0 -9
- {lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/LICENSE +0 -0
- {lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/WHEEL +0 -0
lexoid/api.py
CHANGED
@@ -19,6 +19,8 @@ from lexoid.core.utils import (
     recursive_read_html,
     router,
     split_pdf,
+    create_sub_pdf,
+    get_webpage_soup,
 )
 
 
@@ -83,8 +85,9 @@ def parse_chunk_list(
         result = parse_chunk(file_path, parser_type, **kwargs)
         combined_segments.extend(result["segments"])
         raw_texts.append(result["raw"])
-        token_usage
-
+        if "token_usage" in result:
+            token_usage["input"] += result["token_usage"]["input"]
+            token_usage["output"] += result["token_usage"]["output"]
     token_usage["total"] = token_usage["input"] + token_usage["output"]
 
     return {
@@ -100,7 +103,7 @@ def parse_chunk_list(
 
 def parse(
     path: str,
-    parser_type: Union[str, ParserType] = "
+    parser_type: Union[str, ParserType] = "AUTO",
     pages_per_split: int = 4,
     max_processes: int = 4,
     **kwargs,
@@ -147,6 +150,7 @@ def parse(
     if is_supported_url_file_type(path):
        path = download_file(path, download_dir)
     elif as_pdf:
+        kwargs["title"] = get_webpage_soup(path).title.string.strip()
         pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf")
         if not pdf_filename.endswith(".pdf"):
             pdf_filename += ".pdf"
@@ -163,6 +167,12 @@ def parse(
         pdf_path = os.path.join(temp_dir, "converted.pdf")
         path = convert_to_pdf(path, pdf_path)
 
+    if "page_nums" in kwargs and path.lower().endswith(".pdf"):
+        sub_pdf_dir = os.path.join(temp_dir, "sub_pdfs")
+        os.makedirs(sub_pdf_dir, exist_ok=True)
+        sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
+        path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
+
     if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
         kwargs["split"] = False
         result = parse_chunk(path, parser_type, **kwargs)
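Taken together, the api.py hunks add page selection to parse(): when page_nums is supplied and the input resolves to a PDF, a sub-PDF is built in a temp directory and parsed in place of the original. A minimal sketch of the resulting call, assuming parse() surfaces the same keys ("raw", "segments", "token_usage") that parse_chunk_list aggregates; the file path is a placeholder:

    from lexoid.api import parse

    # Parse only pages 2 and 5 of a PDF ("report.pdf" is a placeholder path).
    # parser_type now defaults to "AUTO", which appears to defer to router() below.
    result = parse("report.pdf", parser_type="AUTO", page_nums=(2, 5))
    print(result["raw"])                   # combined markdown for the selected pages
    print(result["token_usage"]["total"])  # present when an LLM parser handled the chunks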
lexoid/core/parse_type/llm_parser.py
CHANGED
@@ -50,7 +50,7 @@ def retry_on_http_error(func):
 @retry_on_http_error
 def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if "model" not in kwargs:
-        kwargs["model"] = "gemini-
+        kwargs["model"] = "gemini-2.0-flash"
     model = kwargs.get("model")
     if model.startswith("gemini"):
         return parse_with_gemini(path, **kwargs)
@@ -125,6 +125,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     combined_text = result.split("</output>")[0].strip()
 
     token_usage = result["usageMetadata"]
+    input_tokens = token_usage.get("promptTokenCount", 0)
+    output_tokens = token_usage.get("candidatesTokenCount", 0)
+    total_tokens = input_tokens + output_tokens
 
     return {
         "raw": combined_text,
@@ -137,9 +140,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
         "parent_title": kwargs.get("parent_title", ""),
         "recursive_docs": [],
         "token_usage": {
-            "input":
-            "output":
-            "total":
+            "input": input_tokens,
+            "output": output_tokens,
+            "total": total_tokens,
         },
     }
 
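These llm_parser.py hunks make Gemini token accounting explicit: usageMetadata's promptTokenCount becomes "input", candidatesTokenCount becomes "output", and parse_chunk_list (api.py hunk above) now sums these per chunk, skipping chunks that carry no "token_usage" key. A self-contained sketch of that aggregation pattern:

    from typing import Dict, List

    def aggregate_token_usage(results: List[Dict]) -> Dict[str, int]:
        # Mirrors the parse_chunk_list change: sum per-chunk counts, then derive total.
        usage = {"input": 0, "output": 0}
        for result in results:
            if "token_usage" in result:
                usage["input"] += result["token_usage"]["input"]
                usage["output"] += result["token_usage"]["output"]
        usage["total"] = usage["input"] + usage["output"]
        return usage

    chunks = [
        {"token_usage": {"input": 1200, "output": 450}},  # an LLM_PARSE chunk
        {},                                               # a STATIC_PARSE chunk: no usage key
    ]
    print(aggregate_token_usage(chunks))  # {'input': 1200, 'output': 450, 'total': 1650}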
lexoid/core/utils.py
CHANGED
@@ -6,7 +6,7 @@ import re
 import sys
 from difflib import SequenceMatcher
 from hashlib import md5
-from typing import Dict, List
+from typing import Dict, List, Optional
 from urllib.parse import urlparse
 
 import nest_asyncio
@@ -45,6 +45,20 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
     return paths
 
 
+def create_sub_pdf(
+    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...]|int] = None
+) -> str:
+    if isinstance(page_nums, int):
+        page_nums = (page_nums,)
+    page_nums = tuple(sorted(set(page_nums)))
+    with pikepdf.open(input_path) as pdf:
+        indices = page_nums if page_nums else range(len(pdf.pages))
+        with pikepdf.new() as new_pdf:
+            new_pdf.pages.extend([pdf.pages[i - 1] for i in indices])
+            new_pdf.save(output_path)
+    return output_path
+
+
 def convert_image_to_pdf(image_path: str) -> bytes:
     with Image.open(image_path) as img:
         img_rgb = img.convert("RGB")
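A note on create_sub_pdf: page numbers are 1-based (note pdf.pages[i - 1]), duplicates are dropped, and order is normalized by sorted(set(...)). Despite the Optional default, passing page_nums=None would raise a TypeError at set(page_nums) before the indices fallback is reached, so callers are evidently expected to pass page numbers, as api.py does. A usage sketch with placeholder paths:

    from lexoid.core.utils import create_sub_pdf

    create_sub_pdf("input.pdf", "pages_1_3.pdf", page_nums=(3, 1, 1))  # keeps pages 1 and 3
    create_sub_pdf("input.pdf", "page_2.pdf", page_nums=2)             # a bare int becomes (2,)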
@@ -285,18 +299,7 @@ def html_to_markdown(html: str, title: str, url: str) -> str:
 
     return content
 
-
-def read_html_content(url: str) -> Dict:
-    """
-    Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
-
-    Args:
-        url (str): The URL of the HTML page.
-
-    Returns:
-        Dict: Dictionary containing parsed document data
-    """
-
+def get_webpage_soup(url: str) -> BeautifulSoup:
     try:
         from playwright.async_api import async_playwright
 
@@ -357,6 +360,21 @@ def read_html_content(url: str) -> Dict:
         soup = BeautifulSoup(
             response.content, "html.parser", from_encoding="iso-8859-1"
         )
+    return soup
+
+
+def read_html_content(url: str) -> Dict:
+    """
+    Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
+
+    Args:
+        url (str): The URL of the HTML page.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
+
+    soup = get_webpage_soup(url)
     title = soup.title.string.strip() if soup.title else "No title"
     url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
     full_title = f"{title} - {url_hash}"
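This refactor splits soup retrieval out of read_html_content so api.py can fetch a page title before rendering a webpage to PDF (the elif as_pdf hunk above). The two call sites differ: read_html_content guards against a missing <title>, while the api.py call chains .title.string.strip() unguarded and would raise on a title-less page. A sketch of the guarded pattern, with a placeholder URL:

    from lexoid.core.utils import get_webpage_soup

    soup = get_webpage_soup("https://example.com")
    title = soup.title.string.strip() if soup.title else "No title"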
@@ -528,23 +546,37 @@ def has_hyperlink_in_pdf(path: str):
     )
 
 
-def router(path: str):
+def router(path: str, priority: str = "accuracy") -> str:
+    """
+    Routes the file path to the appropriate parser based on the file type.
+
+    Args:
+        path (str): The file path to route.
+        priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
+    """
     file_type = get_file_type(path)
     if file_type.startswith("text/"):
         return "STATIC_PARSE"
-
-
-
-
-
-
-
-
-
-
-        return "
-
-
+
+    if priority == "accuracy":
+        # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
+        # Otherwise, use LLM_PARSE
+        if (
+            file_type == "application/pdf"
+            and not has_image_in_pdf(path)
+            and has_hyperlink_in_pdf(path)
+        ):
+            return "STATIC_PARSE"
+        return "LLM_PARSE"
+    else:
+        # If the file is a PDF without images, use STATIC_PARSE
+        # Otherwise, use LLM_PARSE
+        if (
+            file_type == "application/pdf"
+            and not has_image_in_pdf(path)
+        ):
+            return "STATIC_PARSE"
+        return "LLM_PARSE"
 
 def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
     temp_path = os.path.join(
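router() gains a priority switch: "accuracy" (the default) routes to STATIC_PARSE only for image-free PDFs that also contain hyperlinks, while "speed" routes any image-free PDF there. A usage sketch with a placeholder path:

    from lexoid.core.utils import router

    print(router("contract.pdf"))                    # "LLM_PARSE" unless image-free with hyperlinks
    print(router("contract.pdf", priority="speed"))  # "STATIC_PARSE" for any image-free PDF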
{lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.9
+Version: 0.1.11
 Summary: 
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -110,20 +110,23 @@ print(parsed_md)
 - **kwargs: Additional arguments for the parser.
 
 ## Benchmark
-
+Results aggregated across 5 iterations each for 5 documents.
 
 _Note:_ Benchmarks are currently done in the zero-shot setting.
 
-| Rank | Model
-
-| 1 | 
-| 2 | gemini-2.0-flash-
-| 3 | gemini-
-| 4 | gemini-
-| 5 | 
-| 6 | gemini-1.5-
-| 7 | 
-| 8 | 
-| 9 | 
-| 10 | Llama-Vision-Free (via Together AI) | 0.
+| Rank | Model | Mean Similarity | Std. Dev. | Time (s) |
+|---|---|---|---|---|
+| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 |
+| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 |
+| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 |
+| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 |
+| 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 |
+| 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 |
+| 7 | gpt-4o | 0.687 | 0.247 | 10.16 |
+| 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 |
+| 9 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 |
+| 10 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 |
+| 11 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 |
+| 12 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 |
+| 13 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 |
 
lexoid-0.1.11.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=CIZBNvh38PJbD0OwK1Mp0qqkWxkAEBw2L_FkoCmagXA,9288
+lexoid/core/parse_type/llm_parser.py,sha256=XfsN6RAtb14p31U2jL-9QyRKpkNAGXXiK3urWJIFi2U,10625
+lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=1If_3XoUhPQRY5XMzLJBsHdyjtLgD734eYBYvsg8w5Y,19569
+lexoid-0.1.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.11.dist-info/METADATA,sha256=kipDZLbUz_wkJUrzPGH2VppBNMHmaJadHR5_BAqHgjU,4838
+lexoid-0.1.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.11.dist-info/RECORD,,
lexoid-0.1.9.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=EYyKwfdrjM94bslqTb7Db_wz0R2WioFPkJAqeDJJchY,8790
-lexoid/core/parse_type/llm_parser.py,sha256=eu6zcl_uHVJ7-t506yfQT4jHpg2QGHV2CznS9X12lLQ,10515
-lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=coVab6fCSSDpIN39WLQ6ciZVRiIx3qTsqjn2EbTmMks,18428
-lexoid-0.1.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.9.dist-info/METADATA,sha256=EegftW7ka6fSzaEos97N2-JPjkpO3tt4wyuL9oha014,4575
-lexoid-0.1.9.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.9.dist-info/RECORD,,
{lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/LICENSE
File without changes

{lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/WHEEL
File without changes