lexoid 0.1.6.post1__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/core/parse_type/llm_parser.py +92 -35
- lexoid/core/parse_type/static_parser.py +79 -27
- lexoid/core/prompt_templates.py +19 -0
- {lexoid-0.1.6.post1.dist-info → lexoid-0.1.7.dist-info}/METADATA +33 -17
- lexoid-0.1.7.dist-info/RECORD +9 -0
- lexoid-0.1.6.post1.dist-info/RECORD +0 -9
- {lexoid-0.1.6.post1.dist-info → lexoid-0.1.7.dist-info}/LICENSE +0 -0
- {lexoid-0.1.6.post1.dist-info → lexoid-0.1.7.dist-info}/WHEEL +0 -0
lexoid/core/parse_type/llm_parser.py
CHANGED
@@ -10,10 +10,13 @@ from lexoid.core.prompt_templates import (
     INSTRUCTIONS_ADD_PG_BREAK,
     OPENAI_USER_PROMPT,
     PARSER_PROMPT,
+    LLAMA_PARSER_PROMPT,
 )
 from lexoid.core.utils import convert_image_to_pdf
 from loguru import logger
 from openai import OpenAI
+from huggingface_hub import InferenceClient
+from together import Together


 def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
@@ -22,10 +25,13 @@ def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
     model = kwargs.get("model")
     if model.startswith("gemini"):
         return parse_with_gemini(path, raw, **kwargs)
-
-        return
-
-
+    if model.startswith("gpt"):
+        return parse_with_api(path, raw, api="openai", **kwargs)
+    if model.startswith("meta-llama"):
+        if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
+            return parse_with_api(path, raw, api="together", **kwargs)
+        return parse_with_api(path, raw, api="huggingface", **kwargs)
+    raise ValueError(f"Unsupported model: {model}")


 def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
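The new routing is purely name-based: `gemini-*` models keep the dedicated Gemini path, `gpt-*` models go to OpenAI, and `meta-llama/*` models are split between Together AI (Turbo variants and `Llama-Vision-Free`) and Hugging Face. A minimal usage sketch of this dispatch, assuming lexoid is installed with the matching API key in the environment (the input file name is hypothetical):

```python
from lexoid.core.parse_type.llm_parser import parse_llm_doc

# Routes to parse_with_api(api="together") because the model name is a
# meta-llama "Turbo" variant; a "gpt-*" name would route to api="openai".
markdown = parse_llm_doc(
    "sample.pdf",  # hypothetical input document
    raw=True,      # return raw text instead of structured data
    model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
)
```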
@@ -120,8 +126,30 @@ def convert_pdf_page_to_base64(
     return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")


-def
-
+def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
+    """
+    Parse documents (PDFs or images) using various vision model APIs.
+
+    Args:
+        path (str): Path to the document to parse
+        raw (bool): If True, return raw text; if False, return structured data
+        api (str): Which API to use ("openai", "huggingface", or "together")
+        **kwargs: Additional arguments including model, temperature, title, etc.
+
+    Returns:
+        List[Dict] | str: Parsed content either as raw text or structured data
+    """
+    # Initialize appropriate client
+    clients = {
+        "openai": lambda: OpenAI(),
+        "huggingface": lambda: InferenceClient(
+            token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
+        ),
+        "together": lambda: Together(),
+    }
+    assert api in clients, f"Unsupported API: {api}"
+    logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
+    client = clients[api]()

     # Handle different input types
     mime_type, _ = mimetypes.guess_type(path)
@@ -129,50 +157,79 @@ def parse_with_gpt(path: str, raw: bool, **kwargs) -> List[Dict] | str:
         # Single image processing
         with open(path, "rb") as img_file:
             image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
-            images = [(0, image_base64)]
+            images = [(0, f"data:{mime_type};base64,{image_base64}")]
     else:
         # PDF processing
         pdf_document = pdfium.PdfDocument(path)
         images = [
-            (
+            (
+                page_num,
+                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
+            )
             for page_num in range(len(pdf_document))
         ]

+    # API-specific message formatting
+    def get_messages(page_num: int, image_url: str) -> List[Dict]:
+        base_message = {
+            "type": "text",
+            "text": LLAMA_PARSER_PROMPT,
+        }
+        image_message = {
+            "type": "image_url",
+            "image_url": {"url": image_url},
+        }
+
+        if api == "openai":
+            return [
+                {
+                    "role": "system",
+                    "content": PARSER_PROMPT.format(
+                        custom_instructions=INSTRUCTIONS_ADD_PG_BREAK
+                    ),
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
+                        },
+                        image_message,
+                    ],
+                },
+            ]
+        else:
+            return [
+                {
+                    "role": "user",
+                    "content": [base_message, image_message],
+                }
+            ]
+
     # Process each page/image
     all_results = []
-    for page_num,
-        messages =
-            {
-                "role": "system",
-                "content": PARSER_PROMPT,
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                    },
-                ],
-            },
-        ]
+    for page_num, image_url in images:
+        messages = get_messages(page_num, image_url)

-        #
-
-        model
-
-
-
+        # Common completion parameters
+        completion_params = {
+            "model": kwargs["model"],
+            "messages": messages,
+            "max_tokens": kwargs.get("max_tokens", 1024),
+            "temperature": kwargs.get("temperature", 0.7),
+        }
+
+        # Get completion from selected API
+        response = client.chat.completions.create(**completion_params)

         # Extract the response text
         page_text = response.choices[0].message.content
         if kwargs.get("verbose", None):
             logger.debug(f"Page {page_num + 1} response: {page_text}")
-
+
+        # Extract content between output tags if present
+        result = page_text
         if "<output>" in page_text:
             result = page_text.split("<output>")[1].strip()
             if "</output>" in result:
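Both branches now produce `data:` URLs rather than bare base64 strings, so a single `image_url` message shape works across the OpenAI, Hugging Face, and Together clients. A self-contained sketch of the same encoding (the helper name and input file are hypothetical):

```python
import base64
import mimetypes

def to_data_url(path: str) -> str:
    """Build a data URL of the form parse_with_api puts into image_url entries."""
    mime_type, _ = mimetypes.guess_type(path)
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"

# For a PNG input this yields e.g. "data:image/png;base64,iVBORw0KG..."
print(to_data_url("page.png")[:40])
```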
lexoid/core/parse_type/static_parser.py
CHANGED
@@ -89,15 +89,21 @@ def process_table(table) -> str:

     # Convert to DataFrame and handle empty cells
     df = pd.DataFrame(table_data)
+    df.replace("", pd.NA, inplace=True)
+    df = df.dropna(how="all", axis=0)
+    df = df.dropna(how="all", axis=1)
     df = df.fillna("")
+    if len(df) == 0:
+        return ""

     # Use first row as header and clean it up
     df.columns = df.iloc[0]
-    df = df.drop(0)
+    df = df.drop(df.index[0])
+    df.replace(r"\n", "<br>", regex=True, inplace=True)

     # Convert to markdown with some formatting options
     markdown_table = df.to_markdown(index=False, tablefmt="pipe")
-    return f"\n{markdown_table}\n\n"
+    return f"\n{markdown_table}\n\n"


 def embed_links_in_text(page, text, links):
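The reworked `process_table` drops fully empty rows and columns before promoting the first remaining row to the header, and switches from `df.drop(0)` to `df.drop(df.index[0])` so the drop is positional rather than label-based (label `0` may no longer exist once empty rows are removed). A standalone sketch of the same pandas steps, with invented sample data:

```python
import pandas as pd

# Invented sample shaped like pdfplumber table output, with an empty row and column.
table_data = [
    ["Name", "Qty", ""],
    ["apples", "4", ""],
    ["", "", ""],
]

df = pd.DataFrame(table_data)
df.replace("", pd.NA, inplace=True)
df = df.dropna(how="all", axis=0)  # drop fully empty rows
df = df.dropna(how="all", axis=1)  # drop fully empty columns
df = df.fillna("")

df.columns = df.iloc[0]    # first surviving row becomes the header
df = df.drop(df.index[0])  # positional drop: safe even if label 0 was removed
print(df.to_markdown(index=False, tablefmt="pipe"))  # needs the `tabulate` dependency
```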
@@ -157,8 +163,20 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     x_tolerance = kwargs.get("x_tolerance", 1)
     y_tolerance = kwargs.get("y_tolerance", 5)

-    #
-
+    # Table settings
+    vertical_strategy = kwargs.get("vertical_strategy", "lines")
+    horizontal_strategy = kwargs.get("horizontal_strategy", "lines")
+    snap_x_tolerance = kwargs.get("snap_x_tolerance", 10)
+    snap_y_tolerance = kwargs.get("snap_y_tolerance", 0)
+
+    tables = page.find_tables(
+        table_settings={
+            "vertical_strategy": vertical_strategy,
+            "horizontal_strategy": horizontal_strategy,
+            "snap_x_tolerance": snap_x_tolerance,
+            "snap_y_tolerance": snap_y_tolerance,
+        }
+    )
     table_zones = [(table.bbox, process_table(table)) for table in tables]

     # Create a filtered page excluding table areas
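Since the strategies and snap tolerances are read from plain kwargs, table detection can now be tuned per document. A hedged pdfplumber-only sketch (the input path is hypothetical; `find_tables` and these `table_settings` keys are standard pdfplumber options), switching to text-based edge detection for borderless tables:

```python
import pdfplumber

with pdfplumber.open("report.pdf") as pdf:  # hypothetical input
    page = pdf.pages[0]
    # "text" strategies infer table edges from word alignment rather than
    # ruled lines, which often helps with borderless tables.
    tables = page.find_tables(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }
    )
    for table in tables:
        print(table.bbox, len(table.extract()), "rows")
```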
@@ -171,12 +189,46 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     words = filtered_page.extract_words(
         x_tolerance=x_tolerance,
         y_tolerance=y_tolerance,
-        extra_attrs=["size", "top", "bottom"],
+        extra_attrs=["size", "top", "bottom", "fontname"],
     )

-    def format_paragraph(
-
-
+    def format_paragraph(text_elements):
+        """Format a paragraph with styling applied to individual words"""
+        formatted_words = []
+        for element in text_elements:
+            text = element["text"]
+            formatting = get_text_formatting(element)
+            formatted_words.append(apply_markdown_formatting(text, formatting))
+        return f"{' '.join(formatted_words)}\n\n"
+
+    def get_text_formatting(word):
+        """
+        Detect text formatting based on font properties
+        Returns a dict of formatting attributes
+        """
+        formatting = {
+            "bold": False,
+            "italic": False,
+        }
+
+        # Check font name for common bold/italic indicators
+        font_name = word.get("fontname", "").lower()
+        if any(style in font_name for style in ["bold", "heavy", "black"]):
+            formatting["bold"] = True
+        if any(style in font_name for style in ["italic", "oblique"]):
+            formatting["italic"] = True
+
+        return formatting
+
+    def apply_markdown_formatting(text, formatting):
+        """Apply markdown formatting to text based on detected styles"""
+        if formatting["bold"] and formatting["italic"]:
+            text = f"***{text}***"
+        elif formatting["bold"]:
+            text = f"**{text}**"
+        elif formatting["italic"]:
+            text = f"*{text}*"
+        return text

     def detect_heading_level(font_size):
         if font_size >= 24:
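The styling pass keys entirely off the `fontname` attribute newly requested from `extract_words`, so its effect is easy to trace by hand. A self-contained walk-through of the same checks on an invented word dict (in the package these helpers are nested inside `process_pdf_page_with_pdfplumber`; the logic below restates them inline):

```python
# A word shaped like extract_words() output once "fontname" is in extra_attrs.
word = {"text": "Revenue", "fontname": "Helvetica-BoldOblique", "size": 11.0}

# get_text_formatting: substring checks against the lowercased font name.
font_name = word["fontname"].lower()
formatting = {
    "bold": any(s in font_name for s in ["bold", "heavy", "black"]),
    "italic": any(s in font_name for s in ["italic", "oblique"]),
}
print(formatting)  # {'bold': True, 'italic': True}

# apply_markdown_formatting: bold + italic wraps the text in triple asterisks.
text = word["text"]
if formatting["bold"] and formatting["italic"]:
    text = f"***{text}***"
print(text)  # ***Revenue***
```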
@@ -205,17 +257,18 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         while tables and word["bottom"] > tables[0][1]["bottom"]:
             content_elements.append(tables.pop(0))
         content_elements.append(("word", word))
+    content_elements.extend(tables)

     for element_type, element in content_elements:
         if element_type == "table":
             # If there are any pending paragraphs or headings, add them first
             if current_heading:
                 level = detect_heading_level(current_heading[0]["size"])
-                heading_text =
-                markdown_content.append(f"{'#' * level} {heading_text}
+                heading_text = format_paragraph(current_heading)
+                markdown_content.append(f"{'#' * level} {heading_text}")
                 current_heading = []
             if current_paragraph:
-                markdown_content.append(format_paragraph(
+                markdown_content.append(format_paragraph(current_paragraph))
                 current_paragraph = []
             # Add the table
             markdown_content.append(element["content"])
@@ -233,46 +286,42 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         # If we were collecting a heading
         if current_heading:
             level = detect_heading_level(current_heading[0]["size"])
-            heading_text =
-            markdown_content.append(f"{'#' * level} {heading_text}
+            heading_text = format_paragraph(current_heading)
+            markdown_content.append(f"{'#' * level} {heading_text}")
             current_heading = []

         # If we were collecting a paragraph
         if current_paragraph:
-            markdown_content.append(
-                format_paragraph(" ".join(current_paragraph))
-            )
+            markdown_content.append(format_paragraph(current_paragraph))
             current_paragraph = []

         # Add word to appropriate collection
         if heading_level:
             if current_paragraph:  # Flush any pending paragraph
-                markdown_content.append(
-                    format_paragraph(" ".join(current_paragraph))
-                )
+                markdown_content.append(format_paragraph(current_paragraph))
                 current_paragraph = []
-            current_heading.append(
+            current_heading.append(word)
         else:
             if current_heading:  # Flush any pending heading
                 level = detect_heading_level(current_heading[0]["size"])
-                heading_text =
-                markdown_content.append(f"{'#' * level} {heading_text}
+                heading_text = format_paragraph(current_heading)
+                markdown_content.append(f"{'#' * level} {heading_text}")
                 current_heading = []
-            current_paragraph.append(word
+            current_paragraph.append(word)

         last_y = word["top"]

     # Handle remaining content
     if current_heading:
         level = detect_heading_level(current_heading[0]["size"])
-        heading_text =
-        markdown_content.append(f"{'#' * level} {heading_text}
+        heading_text = format_paragraph(current_heading)
+        markdown_content.append(f"{'#' * level} {heading_text}")

     if current_paragraph:
-        markdown_content.append(format_paragraph(
+        markdown_content.append(format_paragraph(current_paragraph))

     # Process links for the page
-    content = "".join(markdown_content)
+    content = "".join(markdown_content)
     if page.annots:
         links = []
         for annot in page.annots:
@@ -283,6 +332,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     if links:
         content = embed_links_in_text(page, content, links)

+    # Remove redundant formatting
+    content = content.replace("** **", " ").replace("* *", " ")
+
     return content

lexoid/core/prompt_templates.py
CHANGED
@@ -76,3 +76,22 @@ Ensure accurate representation of all content, including tables and visual elements
 """

 INSTRUCTIONS_ADD_PG_BREAK = "Insert a `<page-break>` tag between the content of each page to maintain the original page structure."
+
+LLAMA_PARSER_PROMPT = """\
+You are a document conversion assistant. Your task is to accurately reproduce the content of an image in Markdown and HTML format, maintaining the visual structure and layout of the original document as closely as possible.
+
+Instructions:
+1. Use a combination of Markdown and HTML to replicate the document's layout and formatting.
+2. Reproduce all text content exactly as it appears, including preserving capitalization, punctuation, and any apparent errors or inconsistencies in the original.
+3. Use appropriate Markdown syntax for headings, emphasis (bold, italic), and lists where applicable.
+4. Always use HTML (`<table>`, `<tr>`, `<td>`) to represent tabular data. Include `colspan` and `rowspan` attributes if needed.
+5. For figures, graphs, or diagrams, represent them using `<img>` tags and use appropriate `alt` text.
+6. For handwritten documents, reproduce the content as typed text, maintaining the original structure and layout.
+7. Do not include any descriptions of the document's appearance, paper type, or writing implements used.
+8. Do not add any explanatory notes, comments, or additional information outside of the converted content.
+9. Ensure all special characters, symbols, and equations are accurately represented.
+10. Provide the output only once, without any duplication.
+11. Enclose the entire output within <output> and </output> tags.
+
+Output the converted content directly in Markdown and HTML without any additional explanations, descriptions, or notes.
+"""
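Instruction 11 is what the post-processing in `parse_with_api` keys on: the response is split on `<output>`, and, assuming the continuation the hunk cuts off mirrors the opening tag, a trailing `</output>` is stripped as well. A minimal sketch of that extraction:

```python
def extract_output(page_text: str) -> str:
    """Strip the <output>...</output> wrapper, falling back to the raw text."""
    result = page_text
    if "<output>" in page_text:
        result = page_text.split("<output>")[1].strip()
        if "</output>" in result:
            # Assumed continuation; the diff hunk ends at the inner `if`.
            result = result.split("</output>")[0].strip()
    return result

print(extract_output("<output># Title\n\nBody text</output>"))
```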
{lexoid-0.1.6.post1.dist-info → lexoid-0.1.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.6.post1
+Version: 0.1.7
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -11,6 +11,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: bs4 (>=0.0.2,<0.0.3)
 Requires-Dist: docx2pdf (>=0.1.8,<0.2.0)
 Requires-Dist: google-generativeai (>=0.8.1,<0.9.0)
+Requires-Dist: huggingface-hub (>=0.27.0,<0.28.0)
 Requires-Dist: loguru (>=0.7.2,<0.8.0)
 Requires-Dist: markdown (>=3.7,<4.0)
 Requires-Dist: markdownify (>=0.13.1,<0.14.0)
@@ -27,6 +28,7 @@ Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+Requires-Dist: together (>=1.3.10,<2.0.0)
 Description-Content-Type: text/markdown

 # Lexoid
@@ -39,38 +41,46 @@ Lexoid is an efficient document parsing library that supports both LLM-based and
 - Collaborate with a permissive license

 ## Installation
-
+### Installing with pip
 ```
-
+pip install lexoid
 ```
-
+
+To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
 ```
-
+OPENAI_API_KEY=""
+GOOGLE_API_KEY=""
 ```

-
+Optionally, to use `Playwright` for retrieving web content with the `.whl` package (else regular requests will be used by default):
 ```
-
+playwright install --with-deps --only-shell chromium
 ```

-
+### Building `.whl` from source
+To create `.whl`:
 ```
-
-GOOGLE_API_KEY=""
+make build
 ```

-
+### Creating a local installation
+To install dependencies:
+```
+make install
+```
+or, to install with dev-dependencies:
 ```
-
+make dev
 ```

-
+To activate virtual environment:
 ```
-
+source .venv/bin/activate
 ```

 ## Usage
 [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)
+[Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)

 Here's a quick example to parse documents using Lexoid:
 ``` python
@@ -98,7 +108,13 @@ Initial results (_more updates soon_)
 | Rank | Model/Framework | Similarity | Time (s) |
 |------|-----------|------------|----------|
 | 1 | gpt-4o | 0.799 | 21.77 |
-| 2 | gemini-
-| 3 |
-| 4 | gemini-1.5-
+| 2 | gemini-2.0-flash-exp | 0.797 | 13.47 |
+| 3 | gemini-exp-1121 | 0.779 | 30.88 |
+| 4 | gemini-1.5-pro | 0.742 | 15.77 |
+| 5 | gpt-4o-mini | 0.721 | 14.86 |
+| 6 | gemini-1.5-flash | 0.702 | 4.56 |
+| 7 | Llama-3.2-11B-Vision-Instruct (via HF) | 0.582 | 21.74 |
+| 8 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.556 | 4.58 |
+| 9 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.527 | 10.57 |
+| 10 | Llama-Vision-Free (via Together AI) | 0.435 | 8.42 |

lexoid-0.1.7.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
+lexoid/core/parse_type/llm_parser.py,sha256=i_iidoP_qExGTScRPMBX5X3RnjIf6XqAS_NhLkz0_LM,8464
+lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=rd8sf2OZqMv_oHGxM1redpSwU8f_sBJ-0tzlbp8U3_A,17193
+lexoid-0.1.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.7.dist-info/METADATA,sha256=yOwsqpA5U-2Z2CXr5Cnrs2a6HtqY-4WryVfYDTI7X08,4092
+lexoid-0.1.7.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.7.dist-info/RECORD,,
lexoid-0.1.6.post1.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
-lexoid/core/parse_type/llm_parser.py,sha256=R-0HoXATCBnMJpyjOmMw_EdvVS_PKhhgC7z3NoKzhrs,6311
-lexoid/core/parse_type/static_parser.py,sha256=uFmuz_1JQHUp8FZADPhLBPEv1La2AnZ4j2Vj6SlH0fo,11993
-lexoid/core/prompt_templates.py,sha256=0KXHGNunMfrRZh5QfENcxY1s30VioY2fsu3wELc-3z8,4794
-lexoid/core/utils.py,sha256=rd8sf2OZqMv_oHGxM1redpSwU8f_sBJ-0tzlbp8U3_A,17193
-lexoid-0.1.6.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.6.post1.dist-info/METADATA,sha256=tPhhqCNwJGR5LNSH-J9hCJf2O4AN6QJhFiXHbUcRizM,3436
-lexoid-0.1.6.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.6.post1.dist-info/RECORD,,
{lexoid-0.1.6.post1.dist-info → lexoid-0.1.7.dist-info}/LICENSE
File without changes

{lexoid-0.1.6.post1.dist-info → lexoid-0.1.7.dist-info}/WHEEL
File without changes