lexoid 0.1.7__tar.gz → 0.1.8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lexoid-0.1.7 → lexoid-0.1.8}/PKG-INFO +10 -6
- {lexoid-0.1.7 → lexoid-0.1.8}/README.md +9 -4
- {lexoid-0.1.7 → lexoid-0.1.8}/lexoid/core/parse_type/llm_parser.py +114 -8
- {lexoid-0.1.7 → lexoid-0.1.8}/pyproject.toml +1 -2
- {lexoid-0.1.7 → lexoid-0.1.8}/LICENSE +0 -0
- {lexoid-0.1.7 → lexoid-0.1.8}/lexoid/api.py +0 -0
- {lexoid-0.1.7 → lexoid-0.1.8}/lexoid/core/parse_type/static_parser.py +0 -0
- {lexoid-0.1.7 → lexoid-0.1.8}/lexoid/core/prompt_templates.py +0 -0
- {lexoid-0.1.7 → lexoid-0.1.8}/lexoid/core/utils.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: lexoid
|
3
|
-
Version: 0.1.7
|
3
|
+
Version: 0.1.8
|
4
4
|
Summary:
|
5
5
|
Requires-Python: >=3.10,<4.0
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
@@ -28,16 +28,19 @@ Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
|
|
28
28
|
Requires-Dist: python-docx (>=1.1.2,<2.0.0)
|
29
29
|
Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
|
30
30
|
Requires-Dist: tabulate (>=0.9.0,<0.10.0)
|
31
|
-
Requires-Dist: together (>=1.3.10,<2.0.0)
|
32
31
|
Description-Content-Type: text/markdown
|
33
32
|
|
34
33
|
# Lexoid
|
35
34
|
|
35
|
+
[Open in Colab](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
|
36
|
+
[License](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
|
37
|
+
[PyPI](https://pypi.org/project/lexoid/)
|
38
|
+
|
36
39
|
Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing.
|
37
40
|
|
38
41
|
## Motivation:
|
39
42
|
- Use the multi-modal advancement of LLMs
|
40
|
-
- Enable convenience for users
|
43
|
+
- Enable convenience for users
|
41
44
|
- Collaborate with a permissive license
|
42
45
|
|
43
46
|
## Installation
|
@@ -52,13 +55,12 @@ OPENAI_API_KEY=""
|
|
52
55
|
GOOGLE_API_KEY=""
|
53
56
|
```
|
54
57
|
|
55
|
-
Optionally, to use `Playwright` for retrieving web content
|
58
|
+
Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library):
|
56
59
|
```
|
57
60
|
playwright install --with-deps --only-shell chromium
|
58
61
|
```
|
59
62
|
|
60
63
|
### Building `.whl` from source
|
61
|
-
To create `.whl`:
|
62
64
|
```
|
63
65
|
make build
|
64
66
|
```
|
@@ -80,6 +82,7 @@ source .venv/bin/activate
|
|
80
82
|
|
81
83
|
## Usage
|
82
84
|
[Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)
|
85
|
+
|
83
86
|
[Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)
|
84
87
|
|
85
88
|
Here's a quick example to parse documents using Lexoid:
|
@@ -98,13 +101,14 @@ print(parsed_md)
|
|
98
101
|
### Parameters
|
99
102
|
- path (str): The file path or URL.
|
100
103
|
- parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
|
101
|
-
- raw (bool, optional):
|
104
|
+
- raw (bool, optional): Return raw text or structured data. Defaults to False.
|
102
105
|
- pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
|
103
106
|
- max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
|
104
107
|
- **kwargs: Additional arguments for the parser.
|
105
108
|
|
106
109
|
## Benchmark
|
107
110
|
Initial results (_more updates soon_)
|
111
|
+
_Note:_ Benchmarks done in zero-shot scenario currently
|
108
112
|
| Rank | Model/Framework | Similarity | Time (s) |
|
109
113
|
|------|-----------|------------|----------|
|
110
114
|
| 1 | gpt-4o | 0.799 | 21.77|
|
@@ -1,10 +1,14 @@
|
|
1
1
|
# Lexoid
|
2
2
|
|
3
|
+
[Open in Colab](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
|
4
|
+
[License](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
|
5
|
+
[PyPI](https://pypi.org/project/lexoid/)
|
6
|
+
|
3
7
|
Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing.
|
4
8
|
|
5
9
|
## Motivation:
|
6
10
|
- Use the multi-modal advancement of LLMs
|
7
|
-
- Enable convenience for users
|
11
|
+
- Enable convenience for users
|
8
12
|
- Collaborate with a permissive license
|
9
13
|
|
10
14
|
## Installation
|
@@ -19,13 +23,12 @@ OPENAI_API_KEY=""
|
|
19
23
|
GOOGLE_API_KEY=""
|
20
24
|
```
|
21
25
|
|
22
|
-
Optionally, to use `Playwright` for retrieving web content
|
26
|
+
Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library):
|
23
27
|
```
|
24
28
|
playwright install --with-deps --only-shell chromium
|
25
29
|
```
|
26
30
|
|
27
31
|
### Building `.whl` from source
|
28
|
-
To create `.whl`:
|
29
32
|
```
|
30
33
|
make build
|
31
34
|
```
|
@@ -47,6 +50,7 @@ source .venv/bin/activate
|
|
47
50
|
|
48
51
|
## Usage
|
49
52
|
[Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)
|
53
|
+
|
50
54
|
[Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)
|
51
55
|
|
52
56
|
Here's a quick example to parse documents using Lexoid:
|
@@ -65,13 +69,14 @@ print(parsed_md)
|
|
65
69
|
### Parameters
|
66
70
|
- path (str): The file path or URL.
|
67
71
|
- parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
|
68
|
-
- raw (bool, optional):
|
72
|
+
- raw (bool, optional): Return raw text or structured data. Defaults to False.
|
69
73
|
- pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
|
70
74
|
- max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
|
71
75
|
- **kwargs: Additional arguments for the parser.
|
72
76
|
|
73
77
|
## Benchmark
|
74
78
|
Initial results (_more updates soon_)
|
79
|
+
_Note:_ Benchmarks done in zero-shot scenario currently
|
75
80
|
| Rank | Model/Framework | Similarity | Time (s) |
|
76
81
|
|------|-----------|------------|----------|
|
77
82
|
| 1 | gpt-4o | 0.799 | 21.77|
|
@@ -2,10 +2,13 @@ import base64
|
|
2
2
|
import io
|
3
3
|
import mimetypes
|
4
4
|
import os
|
5
|
-
|
6
|
-
|
5
|
+
import time
|
7
6
|
import pypdfium2 as pdfium
|
8
7
|
import requests
|
8
|
+
from functools import wraps
|
9
|
+
from requests.exceptions import HTTPError
|
10
|
+
from typing import Dict, List
|
11
|
+
|
9
12
|
from lexoid.core.prompt_templates import (
|
10
13
|
INSTRUCTIONS_ADD_PG_BREAK,
|
11
14
|
OPENAI_USER_PROMPT,
|
@@ -16,9 +19,36 @@ from lexoid.core.utils import convert_image_to_pdf
|
|
16
19
|
from loguru import logger
|
17
20
|
from openai import OpenAI
|
18
21
|
from huggingface_hub import InferenceClient
|
19
|
-
from together import Together
|
20
22
|
|
21
23
|
|
24
|
+
def retry_on_http_error(func):
    """Retry ``func`` once after an ``HTTPError``, then degrade to an empty result.

    The wrapped callable is invoked normally. If it raises ``HTTPError``, the
    error is logged, the wrapper sleeps for 10 seconds and calls ``func`` a
    second time with the same arguments. If that second attempt also raises
    ``HTTPError``, the error is logged and an empty placeholder is returned
    instead of propagating the exception:

    * ``""`` when the call asked for raw output (``raw`` is truthy), or
    * a one-element list containing a document dict with empty ``content``.

    Exceptions other than ``HTTPError`` are not caught.

    Args:
        func: The callable to wrap. The placeholder result is built from the
            ``raw``, ``title`` and ``start`` arguments of the call, whether
            they were passed positionally or by keyword.

    Returns:
        The wrapping callable, carrying ``func``'s metadata via ``wraps``.
    """
    import inspect  # function-scope import keeps this block self-contained

    try:
        signature = inspect.signature(func)
    except (TypeError, ValueError):
        # Some callables cannot be introspected; fall back to keyword lookup.
        signature = None

    def _argument(name, default, args, kwargs):
        """Return the value supplied for ``name``, positionally or by keyword."""
        if name in kwargs:
            return kwargs[name]
        if signature is None:
            return default
        try:
            bound = signature.bind_partial(*args, **kwargs)
        except TypeError:
            return default
        return bound.arguments.get(name, default)

    @wraps(func)
    def wrapper(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except HTTPError as e:
            logger.error(f"HTTPError encountered: {e}. Retrying in 10 seconds...")
            time.sleep(10)

        try:
            return func(*args, **kwargs)
        except HTTPError as e:
            logger.error(f"Retry failed: {e}")

        # ``raw`` is a positional parameter of the decorated parser, so looking
        # only in ``kwargs`` would miss it and return the wrong result shape.
        if _argument("raw", False, args, kwargs):
            return ""
        return [
            {
                "metadata": {
                    # A missing title must not raise here: a KeyError would
                    # hide the HTTP failure this fallback is reporting.
                    "title": _argument("title", "", args, kwargs),
                    "page": _argument("start", 0, args, kwargs),
                },
                "content": "",
            }
        ]

    return wrapper
|
49
|
+
|
50
|
+
|
51
|
+
@retry_on_http_error
|
22
52
|
def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
|
23
53
|
if "model" not in kwargs:
|
24
54
|
kwargs["model"] = "gemini-1.5-flash"
|
@@ -29,7 +59,7 @@ def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
|
|
29
59
|
return parse_with_api(path, raw, api="openai", **kwargs)
|
30
60
|
if model.startswith("meta-llama"):
|
31
61
|
if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
|
32
|
-
return
|
62
|
+
return parse_with_together(path, raw, **kwargs)
|
33
63
|
return parse_with_api(path, raw, api="huggingface", **kwargs)
|
34
64
|
raise ValueError(f"Unsupported model: {model}")
|
35
65
|
|
@@ -107,7 +137,6 @@ def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
|
|
107
137
|
"content": page,
|
108
138
|
}
|
109
139
|
for page_no, page in enumerate(result.split("<page-break>"), start=1)
|
110
|
-
if page.strip()
|
111
140
|
]
|
112
141
|
|
113
142
|
|
@@ -126,6 +155,85 @@ def convert_pdf_page_to_base64(
|
|
126
155
|
return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
|
127
156
|
|
128
157
|
|
158
|
+
def parse_with_together(path: str, raw: bool, **kwargs) -> List[Dict] | str:
|
159
|
+
api_key = os.environ.get("TOGETHER_API_KEY")
|
160
|
+
if not api_key:
|
161
|
+
raise ValueError("TOGETHER_API_KEY environment variable is not set")
|
162
|
+
|
163
|
+
url = "https://api.together.xyz/v1/chat/completions"
|
164
|
+
headers = {
|
165
|
+
"Authorization": f"Bearer {api_key}",
|
166
|
+
"Content-Type": "application/json",
|
167
|
+
}
|
168
|
+
|
169
|
+
mime_type, _ = mimetypes.guess_type(path)
|
170
|
+
if mime_type and mime_type.startswith("image"):
|
171
|
+
with open(path, "rb") as img_file:
|
172
|
+
image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
|
173
|
+
images = [(0, f"data:{mime_type};base64,{image_base64}")]
|
174
|
+
else:
|
175
|
+
pdf_document = pdfium.PdfDocument(path)
|
176
|
+
images = [
|
177
|
+
(
|
178
|
+
page_num,
|
179
|
+
f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
|
180
|
+
)
|
181
|
+
for page_num in range(len(pdf_document))
|
182
|
+
]
|
183
|
+
|
184
|
+
all_results = []
|
185
|
+
for page_num, image_url in images:
|
186
|
+
messages = [
|
187
|
+
{
|
188
|
+
"role": "user",
|
189
|
+
"content": [
|
190
|
+
{"type": "text", "text": LLAMA_PARSER_PROMPT},
|
191
|
+
{"type": "image_url", "image_url": {"url": image_url}},
|
192
|
+
],
|
193
|
+
}
|
194
|
+
]
|
195
|
+
|
196
|
+
payload = {
|
197
|
+
"model": kwargs["model"],
|
198
|
+
"messages": messages,
|
199
|
+
"max_tokens": kwargs.get("max_tokens", 1024),
|
200
|
+
"temperature": kwargs.get("temperature", 0.7),
|
201
|
+
}
|
202
|
+
|
203
|
+
response = requests.post(url, json=payload, headers=headers)
|
204
|
+
response.raise_for_status()
|
205
|
+
response_data = response.json()
|
206
|
+
|
207
|
+
page_text = response_data["choices"][0]["message"]["content"]
|
208
|
+
if kwargs.get("verbose", None):
|
209
|
+
logger.debug(f"Page {page_num + 1} response: {page_text}")
|
210
|
+
|
211
|
+
result = page_text
|
212
|
+
if "<output>" in page_text:
|
213
|
+
result = page_text.split("<output>")[1].strip()
|
214
|
+
if "</output>" in result:
|
215
|
+
result = result.split("</output>")[0].strip()
|
216
|
+
all_results.append((page_num, result))
|
217
|
+
|
218
|
+
all_results.sort(key=lambda x: x[0])
|
219
|
+
all_texts = [text for _, text in all_results]
|
220
|
+
combined_text = "<page-break>".join(all_texts)
|
221
|
+
|
222
|
+
if raw:
|
223
|
+
return combined_text
|
224
|
+
|
225
|
+
return [
|
226
|
+
{
|
227
|
+
"metadata": {
|
228
|
+
"title": kwargs["title"],
|
229
|
+
"page": kwargs.get("start", 0) + page_no,
|
230
|
+
},
|
231
|
+
"content": page,
|
232
|
+
}
|
233
|
+
for page_no, page in enumerate(all_texts, start=1)
|
234
|
+
]
|
235
|
+
|
236
|
+
|
129
237
|
def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
|
130
238
|
"""
|
131
239
|
Parse documents (PDFs or images) using various vision model APIs.
|
@@ -133,7 +241,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
|
|
133
241
|
Args:
|
134
242
|
path (str): Path to the document to parse
|
135
243
|
raw (bool): If True, return raw text; if False, return structured data
|
136
|
-
api (str): Which API to use ("openai"
|
244
|
+
api (str): Which API to use ("openai" or "huggingface")
|
137
245
|
**kwargs: Additional arguments including model, temperature, title, etc.
|
138
246
|
|
139
247
|
Returns:
|
@@ -145,7 +253,6 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
|
|
145
253
|
"huggingface": lambda: InferenceClient(
|
146
254
|
token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
|
147
255
|
),
|
148
|
-
"together": lambda: Together(),
|
149
256
|
}
|
150
257
|
assert api in clients, f"Unsupported API: {api}"
|
151
258
|
logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
|
@@ -253,5 +360,4 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
|
|
253
360
|
"content": page,
|
254
361
|
}
|
255
362
|
for page_no, page in enumerate(all_texts, start=1)
|
256
|
-
if page.strip()
|
257
363
|
]
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "lexoid"
|
3
|
-
version = "0.1.7"
|
3
|
+
version = "0.1.8"
|
4
4
|
description = ""
|
5
5
|
authors = []
|
6
6
|
readme = "README.md"
|
@@ -27,7 +27,6 @@ nest-asyncio ="^1.6.0"
|
|
27
27
|
pyqt5 = {version = "^5.15.11", markers = "platform_system != 'debian'"}
|
28
28
|
pyqtwebengine = {version = "^5.15.7", markers = "platform_system != 'debian'"}
|
29
29
|
huggingface-hub = "^0.27.0"
|
30
|
-
together = "^1.3.10"
|
31
30
|
|
32
31
|
[tool.poetry.group.dev.dependencies]
|
33
32
|
ipykernel = "^6.29.5"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|