lexoid-0.1.7-py3-none-any.whl → lexoid-0.1.8.post1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/core/parse_type/llm_parser.py +114 -8
- lexoid/core/utils.py +37 -2
- {lexoid-0.1.7.dist-info → lexoid-0.1.8.post1.dist-info}/METADATA +15 -6
- lexoid-0.1.8.post1.dist-info/RECORD +9 -0
- lexoid-0.1.7.dist-info/RECORD +0 -9
- {lexoid-0.1.7.dist-info → lexoid-0.1.8.post1.dist-info}/LICENSE +0 -0
- {lexoid-0.1.7.dist-info → lexoid-0.1.8.post1.dist-info}/WHEEL +0 -0
lexoid/core/parse_type/llm_parser.py
CHANGED

@@ -2,10 +2,13 @@ import base64
 import io
 import mimetypes
 import os
-
-
+import time
 import pypdfium2 as pdfium
 import requests
+from functools import wraps
+from requests.exceptions import HTTPError
+from typing import Dict, List
+
 from lexoid.core.prompt_templates import (
     INSTRUCTIONS_ADD_PG_BREAK,
     OPENAI_USER_PROMPT,
@@ -16,9 +19,36 @@ from lexoid.core.utils import convert_image_to_pdf
 from loguru import logger
 from openai import OpenAI
 from huggingface_hub import InferenceClient
-from together import Together
 
 
+def retry_on_http_error(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except HTTPError as e:
+            logger.error(f"HTTPError encountered: {e}. Retrying in 10 seconds...")
+            time.sleep(10)
+            try:
+                return func(*args, **kwargs)
+            except HTTPError as e:
+                logger.error(f"Retry failed: {e}")
+                if kwargs.get("raw", False):
+                    return ""
+                return [
+                    {
+                        "metadata": {
+                            "title": kwargs["title"],
+                            "page": kwargs.get("start", 0),
+                        },
+                        "content": "",
+                    }
+                ]
+
+    return wrapper
+
+
+@retry_on_http_error
 def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
     if "model" not in kwargs:
         kwargs["model"] = "gemini-1.5-flash"
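For reference, a minimal sketch of the fallback behavior this decorator adds. `flaky_parse` and its arguments are hypothetical; only `retry_on_http_error` comes from the module above.

```python
from requests.exceptions import HTTPError

from lexoid.core.parse_type.llm_parser import retry_on_http_error


@retry_on_http_error
def flaky_parse(path, **kwargs):
    # Hypothetical parser that always fails with an HTTP error.
    raise HTTPError("429 Too Many Requests")


# The wrapper logs the error, sleeps 10 s, retries once, and on the second
# failure returns an empty-document fallback instead of raising.
result = flaky_parse("doc.pdf", title="doc.pdf", raw=False)
# -> [{"metadata": {"title": "doc.pdf", "page": 0}, "content": ""}]
```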
@@ -29,7 +59,7 @@ def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
         return parse_with_api(path, raw, api="openai", **kwargs)
     if model.startswith("meta-llama"):
         if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
-            return
+            return parse_with_together(path, raw, **kwargs)
         return parse_with_api(path, raw, api="huggingface", **kwargs)
     raise ValueError(f"Unsupported model: {model}")
 
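A quick sketch of how the routing above plays out for the two meta-llama branches. The second model name is an illustrative placeholder, and the relevant API keys are assumed to be set.

```python
from lexoid.core.parse_type.llm_parser import parse_llm_doc

# Ends with "Turbo" (or is the free Vision model) -> parse_with_together
docs = parse_llm_doc("doc.pdf", False, title="doc.pdf",
                     model="meta-llama/Llama-Vision-Free")

# Any other meta-llama model -> parse_with_api(api="huggingface")
docs = parse_llm_doc("doc.pdf", False, title="doc.pdf",
                     model="meta-llama/Llama-3.2-11B-Vision-Instruct")
```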
@@ -107,7 +137,6 @@ def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
             "content": page,
         }
         for page_no, page in enumerate(result.split("<page-break>"), start=1)
-        if page.strip()
     ]
 
 
@@ -126,6 +155,85 @@ def convert_pdf_page_to_base64(
     return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
 
 
+def parse_with_together(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+    api_key = os.environ.get("TOGETHER_API_KEY")
+    if not api_key:
+        raise ValueError("TOGETHER_API_KEY environment variable is not set")
+
+    url = "https://api.together.xyz/v1/chat/completions"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+
+    mime_type, _ = mimetypes.guess_type(path)
+    if mime_type and mime_type.startswith("image"):
+        with open(path, "rb") as img_file:
+            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+        images = [(0, f"data:{mime_type};base64,{image_base64}")]
+    else:
+        pdf_document = pdfium.PdfDocument(path)
+        images = [
+            (
+                page_num,
+                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
+            )
+            for page_num in range(len(pdf_document))
+        ]
+
+    all_results = []
+    for page_num, image_url in images:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": LLAMA_PARSER_PROMPT},
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                ],
+            }
+        ]
+
+        payload = {
+            "model": kwargs["model"],
+            "messages": messages,
+            "max_tokens": kwargs.get("max_tokens", 1024),
+            "temperature": kwargs.get("temperature", 0.7),
+        }
+
+        response = requests.post(url, json=payload, headers=headers)
+        response.raise_for_status()
+        response_data = response.json()
+
+        page_text = response_data["choices"][0]["message"]["content"]
+        if kwargs.get("verbose", None):
+            logger.debug(f"Page {page_num + 1} response: {page_text}")
+
+        result = page_text
+        if "<output>" in page_text:
+            result = page_text.split("<output>")[1].strip()
+        if "</output>" in result:
+            result = result.split("</output>")[0].strip()
+        all_results.append((page_num, result))
+
+    all_results.sort(key=lambda x: x[0])
+    all_texts = [text for _, text in all_results]
+    combined_text = "<page-break>".join(all_texts)
+
+    if raw:
+        return combined_text
+
+    return [
+        {
+            "metadata": {
+                "title": kwargs["title"],
+                "page": kwargs.get("start", 0) + page_no,
+            },
+            "content": page,
+        }
+        for page_no, page in enumerate(all_texts, start=1)
+    ]
+
+
 def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
     """
     Parse documents (PDFs or images) using various vision model APIs.
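A minimal usage sketch for `parse_with_together`. The file and model names are placeholders; `title` is required on the structured path because the function reads `kwargs["title"]`, and TOGETHER_API_KEY must be exported beforehand.

```python
from lexoid.core.parse_type.llm_parser import parse_with_together

# Placeholder file and model name; TOGETHER_API_KEY must be set.
pages = parse_with_together(
    "invoice.pdf",
    raw=False,
    model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
    title="invoice.pdf",
)
for page in pages:
    print(page["metadata"]["page"], page["content"][:80])
```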
@@ -133,7 +241,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
     Args:
         path (str): Path to the document to parse
         raw (bool): If True, return raw text; if False, return structured data
-        api (str): Which API to use ("openai"
+        api (str): Which API to use ("openai" or "huggingface")
         **kwargs: Additional arguments including model, temperature, title, etc.
 
     Returns:
@@ -145,7 +253,6 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
         "huggingface": lambda: InferenceClient(
             token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
         ),
-        "together": lambda: Together(),
     }
     assert api in clients, f"Unsupported API: {api}"
     logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
@@ -253,5 +360,4 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
             "content": page,
         }
         for page_no, page in enumerate(all_texts, start=1)
-        if page.strip()
     ]
lexoid/core/utils.py
CHANGED

@@ -298,9 +298,44 @@ def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
 
     async def fetch_page():
         async with async_playwright() as p:
-            browser = await p.chromium.launch(
-
+            browser = await p.chromium.launch(
+                headless=True,
+                args=[
+                    "--disable-blink-features=AutomationControlled",
+                    "--no-sandbox",
+                    "--window-size=1920,1080",
+                ],
+            )
+            context = await browser.new_context(
+                viewport={"width": 1920, "height": 1080},
+                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                bypass_csp=True,
+            )
+            page = await context.new_page()
+
+            # Add headers to appear more like a real browser
+            await page.set_extra_http_headers(
+                {
+                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+                    "Accept-Language": "en-US,en;q=0.5",
+                    "Sec-Fetch-Dest": "document",
+                    "Sec-Fetch-Mode": "navigate",
+                    "Sec-Fetch-Site": "none",
+                    "Sec-Fetch-User": "?1",
+                }
+            )
+
             await page.goto(url)
+
+            # Wait for Cloudflare check to complete
+            await page.wait_for_load_state("networkidle")
+
+            # Additional wait for any dynamic content
+            try:
+                await page.wait_for_selector("body", timeout=30000)
+            except:
+                pass
+
             html = await page.content()
             await browser.close()
             return html
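For reference, a minimal sketch of exercising this code path through the public helper. The URL is a placeholder, and Playwright plus its Chromium build are assumed to be installed.

```python
from lexoid.core.utils import read_html_content

# Placeholder URL; with raw=True the helper returns plain text, per its
# Union[str, List[Dict]] return annotation.
text = read_html_content("https://example.com", raw=True)
print(text[:200])
```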
{lexoid-0.1.7.dist-info → lexoid-0.1.8.post1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.7
+Version: 0.1.8.post1
 Summary: 
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -28,16 +28,22 @@ Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
-Requires-Dist: together (>=1.3.10,<2.0.0)
 Description-Content-Type: text/markdown
 
 # Lexoid
 
+[](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
+[](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
+[](https://pypi.org/project/lexoid/)
+[](https://oidlabs-com.github.io/Lexoid/)
+
 Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing.
 
+[Documentation](https://oidlabs-com.github.io/Lexoid/)
+
 ## Motivation:
 - Use the multi-modal advancement of LLMs
-- Enable convenience for users
+- Enable convenience for users
 - Collaborate with a permissive license
 
 ## Installation
@@ -52,13 +58,12 @@ OPENAI_API_KEY=""
 GOOGLE_API_KEY=""
 ```
 
-Optionally, to use `Playwright` for retrieving web content
+Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library):
 ```
 playwright install --with-deps --only-shell chromium
 ```
 
 ### Building `.whl` from source
-To create `.whl`:
 ```
 make build
 ```
@@ -80,6 +85,7 @@ source .venv/bin/activate
 
 ## Usage
 [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)
+
 [Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)
 
 Here's a quick example to parse documents using Lexoid:
@@ -98,13 +104,16 @@ print(parsed_md)
 ### Parameters
 - path (str): The file path or URL.
 - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
-- raw (bool, optional):
+- raw (bool, optional): Return raw text or structured data. Defaults to False.
 - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
 - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
 - **kwargs: Additional arguments for the parser.
 
 ## Benchmark
 Initial results (_more updates soon_)
+
+_Note:_ Benchmarks are currently done in the zero-shot setting.
+
 | Rank | Model/Framework | Similarity | Time (s) |
 |------|-----------|------------|----------|
 | 1 | gpt-4o | 0.799 | 21.77|
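To make the parameter list above concrete, a short sketch. The file name is a placeholder, and the `parse` entry point is assumed to live in `lexoid.api` as listed in the RECORD below.

```python
from lexoid.api import parse

# Placeholder path; the keyword arguments mirror the documented parameters.
parsed_md = parse(
    "sample.pdf",
    parser_type="LLM_PARSE",
    raw=True,            # raw markdown text instead of structured output
    pages_per_split=4,
    max_threads=4,
)
print(parsed_md)
```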
lexoid-0.1.8.post1.dist-info/RECORD
ADDED

@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
+lexoid/core/parse_type/llm_parser.py,sha256=JsrVALlK4h2j8URSgNIhdWPB6chWXrNrMlImtxVTyyU,11833
+lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=peWuMVTk90-j0aSDaRnwigpoAz_Q5y8vSosCDc6Zl3g,18642
+lexoid-0.1.8.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.8.post1.dist-info/METADATA,sha256=mz8A_92-GrLfOmT8UYcIxWIEkcskad_9vSnNnlbE4dI,4625
+lexoid-0.1.8.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.8.post1.dist-info/RECORD,,
lexoid-0.1.7.dist-info/RECORD
DELETED

@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
-lexoid/core/parse_type/llm_parser.py,sha256=i_iidoP_qExGTScRPMBX5X3RnjIf6XqAS_NhLkz0_LM,8464
-lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=rd8sf2OZqMv_oHGxM1redpSwU8f_sBJ-0tzlbp8U3_A,17193
-lexoid-0.1.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.7.dist-info/METADATA,sha256=yOwsqpA5U-2Z2CXr5Cnrs2a6HtqY-4WryVfYDTI7X08,4092
-lexoid-0.1.7.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.7.dist-info/RECORD,,
File without changes
|
File without changes
|