lexoid 0.1.10.tar.gz → 0.1.11.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lexoid-0.1.10 → lexoid-0.1.11}/PKG-INFO +17 -14
- {lexoid-0.1.10 → lexoid-0.1.11}/README.md +16 -13
- {lexoid-0.1.10 → lexoid-0.1.11}/lexoid/api.py +3 -1
- {lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/parse_type/llm_parser.py +1 -1
- {lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/utils.py +44 -26
- {lexoid-0.1.10 → lexoid-0.1.11}/pyproject.toml +1 -1
- lexoid-0.1.10/lexoid/core/__pycache__/prompt_templates.cpython-310.pyc +0 -0
- lexoid-0.1.10/lexoid/core/__pycache__/prompt_templates.cpython-312.pyc +0 -0
- lexoid-0.1.10/lexoid/core/__pycache__/utils.cpython-310.pyc +0 -0
- lexoid-0.1.10/lexoid/core/__pycache__/utils.cpython-312.pyc +0 -0
- lexoid-0.1.10/lexoid/core/parse_type/__pycache__/llm_parser.cpython-310.pyc +0 -0
- lexoid-0.1.10/lexoid/core/parse_type/__pycache__/llm_parser.cpython-312.pyc +0 -0
- lexoid-0.1.10/lexoid/core/parse_type/__pycache__/static_parser.cpython-310.pyc +0 -0
- lexoid-0.1.10/lexoid/core/parse_type/__pycache__/static_parser.cpython-312.pyc +0 -0
- {lexoid-0.1.10 → lexoid-0.1.11}/LICENSE +0 -0
- {lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/parse_type/static_parser.py +0 -0
- {lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/prompt_templates.py +0 -0
{lexoid-0.1.10 → lexoid-0.1.11}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.10
+Version: 0.1.11
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -110,20 +110,23 @@ print(parsed_md)
 - **kwargs: Additional arguments for the parser.

 ## Benchmark
-…
+Results aggregated across 5 iterations each for 5 documents.

 _Note:_ Benchmarks are currently done in the zero-shot setting.

-| Rank | Model …
-…
-| 1 | …
-| 2 | gemini-2.0-flash-…
-| 3 | gemini-…
-| 4 | gemini-…
-| 5 | …
-| 6 | gemini-1.5-…
-| 7 | …
-| 8 | …
-| 9 | …
-| 10 | Llama-Vision-Free (via Together AI) | 0.…
+| Rank | Model | Mean Similarity | Std. Dev. | Time (s) |
+|---|---|---|---|---|
+| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 |
+| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 |
+| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 |
+| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 |
+| 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 |
+| 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 |
+| 7 | gpt-4o | 0.687 | 0.247 | 10.16 |
+| 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 |
+| 9 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 |
+| 10 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 |
+| 11 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 |
+| 12 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 |
+| 13 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 |

{lexoid-0.1.10 → lexoid-0.1.11}/README.md

@@ -77,19 +77,22 @@ print(parsed_md)
 - **kwargs: Additional arguments for the parser.

 ## Benchmark
-…
+Results aggregated across 5 iterations each for 5 documents.

 _Note:_ Benchmarks are currently done in the zero-shot setting.

-| Rank | Model …
-…
-| 1 | …
-| 2 | gemini-2.0-flash-…
-| 3 | gemini-…
-| 4 | gemini-…
-| 5 | …
-| 6 | gemini-1.5-…
-| 7 | …
-| 8 | …
-| 9 | …
-| 10 | Llama-Vision-Free (via Together AI) | 0.…
+| Rank | Model | Mean Similarity | Std. Dev. | Time (s) |
+|---|---|---|---|---|
+| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 |
+| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 |
+| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 |
+| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 |
+| 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 |
+| 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 |
+| 7 | gpt-4o | 0.687 | 0.247 | 10.16 |
+| 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 |
+| 9 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 |
+| 10 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 |
+| 11 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 |
+| 12 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 |
+| 13 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 |
{lexoid-0.1.10 → lexoid-0.1.11}/lexoid/api.py

@@ -20,6 +20,7 @@ from lexoid.core.utils import (
     router,
     split_pdf,
     create_sub_pdf,
+    get_webpage_soup,
 )


@@ -102,7 +103,7 @@ def parse_chunk_list(

 def parse(
     path: str,
-    parser_type: Union[str, ParserType] = "…
+    parser_type: Union[str, ParserType] = "AUTO",
     pages_per_split: int = 4,
     max_processes: int = 4,
     **kwargs,
@@ -149,6 +150,7 @@ def parse(
         if is_supported_url_file_type(path):
             path = download_file(path, download_dir)
         elif as_pdf:
+            kwargs["title"] = get_webpage_soup(path).title.string.strip()
             pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf")
             if not pdf_filename.endswith(".pdf"):
                 pdf_filename += ".pdf"
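Taken together, the `api.py` changes make `"AUTO"` the default `parser_type` and, when a web page is rendered to PDF, pre-fill `kwargs["title"]` from the page's `<title>` via the new `get_webpage_soup` helper. A minimal usage sketch of the updated entry point, assuming `as_pdf` is accepted as a keyword argument (as the hunk above suggests) and using placeholder URLs:

```python
from lexoid.api import parse

# parser_type now defaults to "AUTO", which lets the router choose
# between static and LLM parsing per document.
result = parse("https://example.com/sample.pdf")  # placeholder URL

# Rendering a web page to PDF first; with this release the page <title>
# is fetched via get_webpage_soup and passed along as the document title.
result = parse("https://example.com/article", as_pdf=True)  # placeholder URL
```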
{lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/parse_type/llm_parser.py

@@ -50,7 +50,7 @@ def retry_on_http_error(func):
 @retry_on_http_error
 def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if "model" not in kwargs:
-        kwargs["model"] = "gemini-…
+        kwargs["model"] = "gemini-2.0-flash"
     model = kwargs.get("model")
     if model.startswith("gemini"):
         return parse_with_gemini(path, **kwargs)
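The only change in `llm_parser.py` is the fallback model: a call to `parse_llm_doc` without an explicit `model` now uses `gemini-2.0-flash`. A hedged sketch of the two call paths (the file path is a placeholder and a configured API key for the chosen provider is assumed):

```python
from lexoid.core.parse_type.llm_parser import parse_llm_doc

# No model given: per this release the call falls back to "gemini-2.0-flash".
doc = parse_llm_doc("sample.pdf")  # placeholder path

# Model given explicitly: the fallback never applies.
doc = parse_llm_doc("sample.pdf", model="gpt-4o")  # placeholder path
```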
{lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/utils.py

@@ -299,18 +299,7 @@ def html_to_markdown(html: str, title: str, url: str) -> str:

     return content

-
-def read_html_content(url: str) -> Dict:
-    """
-    Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
-
-    Args:
-        url (str): The URL of the HTML page.
-
-    Returns:
-        Dict: Dictionary containing parsed document data
-    """
-
+def get_webpage_soup(url: str) -> BeautifulSoup:
     try:
         from playwright.async_api import async_playwright

@@ -371,6 +360,21 @@ def read_html_content(url: str) -> Dict:
         soup = BeautifulSoup(
             response.content, "html.parser", from_encoding="iso-8859-1"
         )
+    return soup
+
+
+def read_html_content(url: str) -> Dict:
+    """
+    Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
+
+    Args:
+        url (str): The URL of the HTML page.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
+
+    soup = get_webpage_soup(url)
     title = soup.title.string.strip() if soup.title else "No title"
     url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
     full_title = f"{title} - {url_hash}"
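These two hunks split the page-fetching logic out of `read_html_content` into a reusable `get_webpage_soup(url) -> BeautifulSoup` helper, which `read_html_content` (and now `parse` in `api.py`) build on. A minimal sketch of the resulting call pattern, with a placeholder URL:

```python
from lexoid.core.utils import get_webpage_soup, read_html_content

url = "https://example.com"  # placeholder URL

# New helper: returns a BeautifulSoup object for the fetched page.
soup = get_webpage_soup(url)
title = soup.title.string.strip() if soup.title else "No title"

# Existing entry point, now implemented on top of get_webpage_soup.
doc = read_html_content(url)
```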
{lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/utils.py

@@ -542,23 +546,37 @@ def has_hyperlink_in_pdf(path: str):
     )


-def router(path: str):
+def router(path: str, priority: str = "accuracy") -> str:
+    """
+    Routes the file path to the appropriate parser based on the file type.
+
+    Args:
+        path (str): The file path to route.
+        priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
+    """
     file_type = get_file_type(path)
     if file_type.startswith("text/"):
         return "STATIC_PARSE"
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-    return "…
-…
-…
+
+    if priority == "accuracy":
+        # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
+        # Otherwise, use LLM_PARSE
+        if (
+            file_type == "application/pdf"
+            and not has_image_in_pdf(path)
+            and has_hyperlink_in_pdf(path)
+        ):
+            return "STATIC_PARSE"
+        return "LLM_PARSE"
+    else:
+        # If the file is a PDF without images, use STATIC_PARSE
+        # Otherwise, use LLM_PARSE
+        if (
+            file_type == "application/pdf"
+            and not has_image_in_pdf(path)
+        ):
+            return "STATIC_PARSE"
+        return "LLM_PARSE"

 def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
     temp_path = os.path.join(
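`router` now accepts a `priority` argument: with the default `"accuracy"`, only image-free PDFs that contain hyperlinks are routed to the static parser, while `"speed"` routes any image-free PDF there. A hedged usage sketch (the PDF path is a placeholder; actual routing depends on the file's contents):

```python
from lexoid.core.utils import router

pdf_path = "docs/report.pdf"  # placeholder path

# Default priority="accuracy": prefers LLM_PARSE unless the PDF has
# no images but does contain hyperlinks.
print(router(pdf_path))                    # e.g. "LLM_PARSE"

# priority="speed": prefers STATIC_PARSE for any PDF without images.
print(router(pdf_path, priority="speed"))  # e.g. "STATIC_PARSE"
```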