lexoid 0.1.12__py3-none-any.whl → 0.1.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/api.py +110 -38
- lexoid/core/parse_type/llm_parser.py +144 -66
- lexoid/core/parse_type/static_parser.py +198 -26
- lexoid/core/prompt_templates.py +2 -1
- lexoid/core/utils.py +15 -7
- {lexoid-0.1.12.dist-info → lexoid-0.1.14.dist-info}/METADATA +25 -20
- lexoid-0.1.14.dist-info/RECORD +9 -0
- lexoid-0.1.12.dist-info/RECORD +0 -9
- {lexoid-0.1.12.dist-info → lexoid-0.1.14.dist-info}/LICENSE +0 -0
- {lexoid-0.1.12.dist-info → lexoid-0.1.14.dist-info}/WHEEL +0 -0
lexoid/api.py
CHANGED
@@ -10,7 +10,11 @@ from typing import Union, Dict, List
|
|
10
10
|
|
11
11
|
from loguru import logger
|
12
12
|
|
13
|
-
from lexoid.core.parse_type.llm_parser import
|
13
|
+
from lexoid.core.parse_type.llm_parser import (
|
14
|
+
parse_llm_doc,
|
15
|
+
create_response,
|
16
|
+
convert_doc_to_base64_images,
|
17
|
+
)
|
14
18
|
from lexoid.core.parse_type.static_parser import parse_static_doc
|
15
19
|
from lexoid.core.utils import (
|
16
20
|
convert_to_pdf,
|
@@ -49,6 +53,7 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
|
|
49
53
|
- parent_title: Title of parent doc if recursively parsed
|
50
54
|
- recursive_docs: List of dictionaries for recursively parsed documents
|
51
55
|
- token_usage: Dictionary containing token usage statistics
|
56
|
+
- parser_used: Which parser was actually used
|
52
57
|
"""
|
53
58
|
if parser_type == ParserType.AUTO:
|
54
59
|
router_priority = kwargs.get("router_priority", "speed")
|
@@ -60,10 +65,13 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
|
|
60
65
|
)
|
61
66
|
if parser_type == ParserType.STATIC_PARSE:
|
62
67
|
logger.debug("Using static parser")
|
63
|
-
|
68
|
+
result = parse_static_doc(path, **kwargs)
|
64
69
|
else:
|
65
70
|
logger.debug("Using LLM parser")
|
66
|
-
|
71
|
+
result = parse_llm_doc(path, **kwargs)
|
72
|
+
|
73
|
+
result["parser_used"] = parser_type
|
74
|
+
return result
|
67
75
|
|
68
76
|
|
69
77
|
def parse_chunk_list(
|
@@ -82,15 +90,18 @@ def parse_chunk_list(
|
|
82
90
|
"""
|
83
91
|
combined_segments = []
|
84
92
|
raw_texts = []
|
85
|
-
token_usage = {"input": 0, "output": 0, "
|
93
|
+
token_usage = {"input": 0, "output": 0, "llm_page_count": 0}
|
86
94
|
for file_path in file_paths:
|
87
95
|
result = parse_chunk(file_path, parser_type, **kwargs)
|
88
96
|
combined_segments.extend(result["segments"])
|
89
97
|
raw_texts.append(result["raw"])
|
90
|
-
if
|
98
|
+
if (
|
99
|
+
result.get("parser_used") == ParserType.LLM_PARSE
|
100
|
+
and "token_usage" in result
|
101
|
+
):
|
91
102
|
token_usage["input"] += result["token_usage"]["input"]
|
92
103
|
token_usage["output"] += result["token_usage"]["output"]
|
93
|
-
token_usage["
|
104
|
+
token_usage["llm_page_count"] += len(result["segments"])
|
94
105
|
token_usage["total"] = token_usage["input"] + token_usage["output"]
|
95
106
|
|
96
107
|
return {
|
@@ -136,7 +147,7 @@ def parse(
|
|
136
147
|
as_pdf = kwargs.get("as_pdf", False)
|
137
148
|
depth = kwargs.get("depth", 1)
|
138
149
|
|
139
|
-
if type(parser_type)
|
150
|
+
if type(parser_type) is str:
|
140
151
|
parser_type = ParserType[parser_type]
|
141
152
|
if (
|
142
153
|
path.lower().endswith((".doc", ".docx"))
|
@@ -184,7 +195,7 @@ def parse(
|
|
184
195
|
|
185
196
|
if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
|
186
197
|
kwargs["split"] = False
|
187
|
-
result =
|
198
|
+
result = parse_chunk_list([path], parser_type, kwargs)
|
188
199
|
else:
|
189
200
|
kwargs["split"] = True
|
190
201
|
split_dir = os.path.join(temp_dir, "splits/")
|
@@ -219,42 +230,43 @@ def parse(
|
|
219
230
|
"token_usage": {
|
220
231
|
"input": sum(r["token_usage"]["input"] for r in chunk_results),
|
221
232
|
"output": sum(r["token_usage"]["output"] for r in chunk_results),
|
222
|
-
"
|
223
|
-
r["token_usage"]["
|
233
|
+
"llm_page_count": sum(
|
234
|
+
r["token_usage"]["llm_page_count"] for r in chunk_results
|
224
235
|
),
|
225
236
|
"total": sum(r["token_usage"]["total"] for r in chunk_results),
|
226
237
|
},
|
227
238
|
}
|
228
239
|
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
240
|
+
if "api_cost_mapping" in kwargs and "token_usage" in result:
|
241
|
+
api_cost_mapping = kwargs["api_cost_mapping"]
|
242
|
+
if isinstance(api_cost_mapping, dict):
|
243
|
+
api_cost_mapping = api_cost_mapping
|
244
|
+
elif isinstance(api_cost_mapping, str) and os.path.exists(api_cost_mapping):
|
245
|
+
with open(api_cost_mapping, "r") as f:
|
246
|
+
api_cost_mapping = json.load(f)
|
247
|
+
else:
|
248
|
+
raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
|
249
|
+
|
250
|
+
api_cost = api_cost_mapping.get(
|
251
|
+
kwargs.get("model", "gemini-2.0-flash"), None
|
252
|
+
)
|
253
|
+
if api_cost:
|
254
|
+
token_usage = result["token_usage"]
|
255
|
+
token_cost = {
|
256
|
+
"input": token_usage["input"] * api_cost["input"] / 1_000_000,
|
257
|
+
"input-image": api_cost.get("input-image", 0)
|
258
|
+
* token_usage.get("llm_page_count", 0),
|
259
|
+
"output": token_usage["output"] * api_cost["output"] / 1_000_000,
|
260
|
+
}
|
261
|
+
token_cost["total"] = (
|
262
|
+
token_cost["input"]
|
263
|
+
+ token_cost["input-image"]
|
264
|
+
+ token_cost["output"]
|
243
265
|
)
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
+ api_cost.get("input-image", 0) * token_usage["image_count"],
|
249
|
-
"output": token_usage["output"]
|
250
|
-
* api_cost["output"]
|
251
|
-
/ 1_000_000,
|
252
|
-
}
|
253
|
-
token_cost["total"] = token_cost["input"] + token_cost["output"]
|
254
|
-
result["token_cost"] = token_cost
|
255
|
-
|
256
|
-
if as_pdf:
|
257
|
-
result["pdf_path"] = path
|
266
|
+
result["token_cost"] = token_cost
|
267
|
+
|
268
|
+
if as_pdf:
|
269
|
+
result["pdf_path"] = path
|
258
270
|
|
259
271
|
if depth > 1:
|
260
272
|
recursive_docs = []
|
@@ -285,3 +297,63 @@ def parse(
|
|
285
297
|
result["recursive_docs"] = recursive_docs
|
286
298
|
|
287
299
|
return result
|
300
|
+
|
301
|
+
|
302
|
+
def parse_with_schema(
|
303
|
+
path: str, schema: Dict, api: str = "openai", model: str = "gpt-4o-mini", **kwargs
|
304
|
+
) -> List[List[Dict]]:
|
305
|
+
"""
|
306
|
+
Parses a PDF using an LLM to generate structured output conforming to a given JSON schema.
|
307
|
+
|
308
|
+
Args:
|
309
|
+
path (str): Path to the PDF file.
|
310
|
+
schema (Dict): JSON schema to which the parsed output should conform.
|
311
|
+
api (str, optional): LLM API provider (One of "openai", "huggingface", "together", "openrouter", and "fireworks").
|
312
|
+
model (str, optional): LLM model name.
|
313
|
+
**kwargs: Additional arguments for the parser (e.g.: temperature, max_tokens).
|
314
|
+
|
315
|
+
Returns:
|
316
|
+
List[List[Dict]]: List of dictionaries for each page, each conforming to the provided schema.
|
317
|
+
"""
|
318
|
+
system_prompt = f"""
|
319
|
+
The output should be formatted as a JSON instance that conforms to the JSON schema below.
|
320
|
+
|
321
|
+
As an example, for the schema {{
|
322
|
+
"properties": {{
|
323
|
+
"foo": {{
|
324
|
+
"title": "Foo",
|
325
|
+
"description": "a list of strings",
|
326
|
+
"type": "array",
|
327
|
+
"items": {{"type": "string"}}
|
328
|
+
}}
|
329
|
+
}},
|
330
|
+
"required": ["foo"]
|
331
|
+
}}, the object {{"foo": ["bar", "baz"]}} is valid. The object {{"properties": {{"foo": ["bar", "baz"]}}}} is not.
|
332
|
+
|
333
|
+
Here is the output schema:
|
334
|
+
{json.dumps(schema, indent=2)}
|
335
|
+
|
336
|
+
"""
|
337
|
+
|
338
|
+
user_prompt = "You are an AI agent that parses documents and returns them in the specified JSON format. Please parse the document and return it in the required format."
|
339
|
+
|
340
|
+
responses = []
|
341
|
+
images = convert_doc_to_base64_images(path)
|
342
|
+
for i, (page_num, image) in enumerate(images):
|
343
|
+
resp_dict = create_response(
|
344
|
+
api=api,
|
345
|
+
model=model,
|
346
|
+
user_prompt=user_prompt,
|
347
|
+
system_prompt=system_prompt,
|
348
|
+
image_url=image,
|
349
|
+
temperature=kwargs.get("temperature", 0.0),
|
350
|
+
max_tokens=kwargs.get("max_tokens", 1024),
|
351
|
+
)
|
352
|
+
|
353
|
+
response = resp_dict.get("response", "")
|
354
|
+
response = response.split("```json")[-1].split("```")[0].strip()
|
355
|
+
logger.debug(f"Processing page {page_num + 1} with response: {response}")
|
356
|
+
new_dict = json.loads(response)
|
357
|
+
responses.append(new_dict)
|
358
|
+
|
359
|
+
return responses
|
@@ -3,23 +3,24 @@ import io
|
|
3
3
|
import mimetypes
|
4
4
|
import os
|
5
5
|
import time
|
6
|
+
from functools import wraps
|
7
|
+
from typing import Dict, List, Optional, Tuple
|
8
|
+
|
6
9
|
import pypdfium2 as pdfium
|
7
10
|
import requests
|
8
|
-
from
|
11
|
+
from huggingface_hub import InferenceClient
|
12
|
+
from loguru import logger
|
13
|
+
from openai import OpenAI
|
9
14
|
from requests.exceptions import HTTPError
|
10
|
-
from
|
15
|
+
from together import Together
|
11
16
|
|
12
17
|
from lexoid.core.prompt_templates import (
|
13
18
|
INSTRUCTIONS_ADD_PG_BREAK,
|
19
|
+
LLAMA_PARSER_PROMPT,
|
14
20
|
OPENAI_USER_PROMPT,
|
15
21
|
PARSER_PROMPT,
|
16
|
-
LLAMA_PARSER_PROMPT,
|
17
22
|
)
|
18
23
|
from lexoid.core.utils import convert_image_to_pdf
|
19
|
-
from loguru import logger
|
20
|
-
from openai import OpenAI
|
21
|
-
from together import Together
|
22
|
-
from huggingface_hub import InferenceClient
|
23
24
|
|
24
25
|
|
25
26
|
def retry_on_http_error(func):
|
@@ -65,10 +66,13 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
|
|
65
66
|
return parse_with_api(path, api="huggingface", **kwargs)
|
66
67
|
if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
|
67
68
|
return parse_with_api(path, api="openrouter", **kwargs)
|
69
|
+
if model.startswith("accounts/fireworks"):
|
70
|
+
return parse_with_api(path, api="fireworks", **kwargs)
|
68
71
|
raise ValueError(f"Unsupported model: {model}")
|
69
72
|
|
70
73
|
|
71
74
|
def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
75
|
+
logger.debug(f"Parsing with Gemini API and model {kwargs['model']}")
|
72
76
|
api_key = os.environ.get("GOOGLE_API_KEY")
|
73
77
|
if not api_key:
|
74
78
|
raise ValueError("GOOGLE_API_KEY environment variable is not set")
|
@@ -105,7 +109,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
|
105
109
|
}
|
106
110
|
],
|
107
111
|
"generationConfig": {
|
108
|
-
"temperature": kwargs.get("temperature", 0.
|
112
|
+
"temperature": kwargs.get("temperature", 0.2),
|
109
113
|
},
|
110
114
|
}
|
111
115
|
|
@@ -127,7 +131,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
|
127
131
|
|
128
132
|
combined_text = ""
|
129
133
|
if "<output>" in raw_text:
|
130
|
-
combined_text = raw_text.split("<output>")[1].strip()
|
134
|
+
combined_text = raw_text.split("<output>")[-1].strip()
|
131
135
|
if "</output>" in result:
|
132
136
|
combined_text = result.split("</output>")[0].strip()
|
133
137
|
|
@@ -169,18 +173,54 @@ def convert_pdf_page_to_base64(
|
|
169
173
|
return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
|
170
174
|
|
171
175
|
|
172
|
-
def
|
173
|
-
|
174
|
-
|
176
|
+
def get_messages(
|
177
|
+
system_prompt: Optional[str], user_prompt: Optional[str], image_url: Optional[str]
|
178
|
+
) -> List[Dict]:
|
179
|
+
messages = []
|
180
|
+
if system_prompt:
|
181
|
+
messages.append(
|
182
|
+
{
|
183
|
+
"role": "system",
|
184
|
+
"content": system_prompt,
|
185
|
+
}
|
186
|
+
)
|
187
|
+
base_message = (
|
188
|
+
[
|
189
|
+
{"type": "text", "text": user_prompt},
|
190
|
+
]
|
191
|
+
if user_prompt
|
192
|
+
else []
|
193
|
+
)
|
194
|
+
image_message = (
|
195
|
+
[
|
196
|
+
{
|
197
|
+
"type": "image_url",
|
198
|
+
"image_url": {"url": image_url},
|
199
|
+
}
|
200
|
+
]
|
201
|
+
if image_url
|
202
|
+
else []
|
203
|
+
)
|
175
204
|
|
176
|
-
|
177
|
-
|
178
|
-
|
179
|
-
|
205
|
+
messages.append(
|
206
|
+
{
|
207
|
+
"role": "user",
|
208
|
+
"content": base_message + image_message,
|
209
|
+
}
|
210
|
+
)
|
180
211
|
|
181
|
-
|
182
|
-
|
183
|
-
|
212
|
+
return messages
|
213
|
+
|
214
|
+
|
215
|
+
def create_response(
|
216
|
+
api: str,
|
217
|
+
model: str,
|
218
|
+
system_prompt: Optional[str] = None,
|
219
|
+
user_prompt: Optional[str] = None,
|
220
|
+
image_url: Optional[str] = None,
|
221
|
+
temperature: float = 0.2,
|
222
|
+
max_tokens: int = 1024,
|
223
|
+
) -> Dict:
|
184
224
|
# Initialize appropriate client
|
185
225
|
clients = {
|
186
226
|
"openai": lambda: OpenAI(),
|
@@ -192,11 +232,52 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
|
|
192
232
|
base_url="https://openrouter.ai/api/v1",
|
193
233
|
api_key=os.environ["OPENROUTER_API_KEY"],
|
194
234
|
),
|
235
|
+
"fireworks": lambda: OpenAI(
|
236
|
+
base_url="https://api.fireworks.ai/inference/v1",
|
237
|
+
api_key=os.environ["FIREWORKS_API_KEY"],
|
238
|
+
),
|
195
239
|
}
|
196
240
|
assert api in clients, f"Unsupported API: {api}"
|
197
|
-
logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
|
198
241
|
client = clients[api]()
|
199
242
|
|
243
|
+
# Prepare messages for the API call
|
244
|
+
messages = get_messages(system_prompt, user_prompt, image_url)
|
245
|
+
|
246
|
+
# Common completion parameters
|
247
|
+
completion_params = {
|
248
|
+
"model": model,
|
249
|
+
"messages": messages,
|
250
|
+
"max_tokens": max_tokens,
|
251
|
+
"temperature": temperature,
|
252
|
+
}
|
253
|
+
|
254
|
+
# Get completion from selected API
|
255
|
+
response = client.chat.completions.create(**completion_params)
|
256
|
+
token_usage = response.usage
|
257
|
+
|
258
|
+
# Extract the response text
|
259
|
+
page_text = response.choices[0].message.content
|
260
|
+
|
261
|
+
return {
|
262
|
+
"response": page_text,
|
263
|
+
"usage": token_usage,
|
264
|
+
}
|
265
|
+
|
266
|
+
|
267
|
+
def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
|
268
|
+
"""
|
269
|
+
Parse documents (PDFs or images) using various vision model APIs.
|
270
|
+
|
271
|
+
Args:
|
272
|
+
path (str): Path to the document to parse
|
273
|
+
api (str): Which API to use ("openai", "huggingface", or "together")
|
274
|
+
**kwargs: Additional arguments including model, temperature, title, etc.
|
275
|
+
|
276
|
+
Returns:
|
277
|
+
Dict: Dictionary containing parsed document data
|
278
|
+
"""
|
279
|
+
logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
|
280
|
+
|
200
281
|
# Handle different input types
|
201
282
|
mime_type, _ = mimetypes.guess_type(path)
|
202
283
|
if mime_type and mime_type.startswith("image"):
|
@@ -215,67 +296,39 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
|
|
215
296
|
for page_num in range(len(pdf_document))
|
216
297
|
]
|
217
298
|
|
218
|
-
#
|
219
|
-
|
220
|
-
|
221
|
-
"type": "image_url",
|
222
|
-
"image_url": {"url": image_url},
|
223
|
-
}
|
224
|
-
|
299
|
+
# Process each page/image
|
300
|
+
all_results = []
|
301
|
+
for page_num, image_url in images:
|
225
302
|
if api == "openai":
|
226
303
|
system_prompt = kwargs.get(
|
227
304
|
"system_prompt", PARSER_PROMPT.format(custom_instructions="")
|
228
305
|
)
|
229
306
|
user_prompt = kwargs.get("user_prompt", OPENAI_USER_PROMPT)
|
230
|
-
return [
|
231
|
-
{
|
232
|
-
"role": "system",
|
233
|
-
"content": system_prompt,
|
234
|
-
},
|
235
|
-
{
|
236
|
-
"role": "user",
|
237
|
-
"content": [
|
238
|
-
{"type": "text", "text": user_prompt},
|
239
|
-
image_message,
|
240
|
-
],
|
241
|
-
},
|
242
|
-
]
|
243
307
|
else:
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
messages = get_messages(page_num, image_url)
|
257
|
-
|
258
|
-
# Common completion parameters
|
259
|
-
completion_params = {
|
260
|
-
"model": kwargs["model"],
|
261
|
-
"messages": messages,
|
262
|
-
"max_tokens": kwargs.get("max_tokens", 1024),
|
263
|
-
"temperature": kwargs.get("temperature", 0.7),
|
264
|
-
}
|
308
|
+
system_prompt = kwargs.get("system_prompt", None)
|
309
|
+
user_prompt = kwargs.get("user_prompt", LLAMA_PARSER_PROMPT)
|
310
|
+
|
311
|
+
response = create_response(
|
312
|
+
api=api,
|
313
|
+
model=kwargs["model"],
|
314
|
+
system_prompt=system_prompt,
|
315
|
+
user_prompt=user_prompt,
|
316
|
+
image_url=image_url,
|
317
|
+
temperature=kwargs.get("temperature", 0.2),
|
318
|
+
max_tokens=kwargs.get("max_tokens", 1024),
|
319
|
+
)
|
265
320
|
|
266
321
|
# Get completion from selected API
|
267
|
-
|
268
|
-
token_usage = response
|
322
|
+
page_text = response["response"]
|
323
|
+
token_usage = response["usage"]
|
269
324
|
|
270
|
-
# Extract the response text
|
271
|
-
page_text = response.choices[0].message.content
|
272
325
|
if kwargs.get("verbose", None):
|
273
326
|
logger.debug(f"Page {page_num + 1} response: {page_text}")
|
274
327
|
|
275
328
|
# Extract content between output tags if present
|
276
329
|
result = page_text
|
277
330
|
if "<output>" in page_text:
|
278
|
-
result = page_text.split("<output>")[1].strip()
|
331
|
+
result = page_text.split("<output>")[-1].strip()
|
279
332
|
if "</output>" in result:
|
280
333
|
result = result.split("</output>")[0].strip()
|
281
334
|
all_results.append(
|
@@ -319,3 +372,28 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
|
|
319
372
|
"total": sum(total_tokens for _, _, _, _, total_tokens in all_results),
|
320
373
|
},
|
321
374
|
}
|
375
|
+
|
376
|
+
|
377
|
+
def convert_doc_to_base64_images(path: str) -> List[Tuple[int, str]]:
|
378
|
+
"""
|
379
|
+
Converts a document (PDF or image) to a base64 encoded string.
|
380
|
+
|
381
|
+
Args:
|
382
|
+
path (str): Path to the PDF file.
|
383
|
+
|
384
|
+
Returns:
|
385
|
+
str: Base64 encoded string of the PDF content.
|
386
|
+
"""
|
387
|
+
if path.endswith(".pdf"):
|
388
|
+
pdf_document = pdfium.PdfDocument(path)
|
389
|
+
return [
|
390
|
+
(
|
391
|
+
page_num,
|
392
|
+
f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
|
393
|
+
)
|
394
|
+
for page_num in range(len(pdf_document))
|
395
|
+
]
|
396
|
+
elif mimetypes.guess_type(path)[0].startswith("image"):
|
397
|
+
with open(path, "rb") as img_file:
|
398
|
+
image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
|
399
|
+
return [(0, f"data:image/png;base64,{image_base64}")]
|
@@ -1,7 +1,8 @@
|
|
1
1
|
import os
|
2
|
+
import re
|
2
3
|
import tempfile
|
3
4
|
from time import time
|
4
|
-
from typing import
|
5
|
+
from typing import Dict, List
|
5
6
|
|
6
7
|
import pandas as pd
|
7
8
|
import pdfplumber
|
@@ -9,14 +10,15 @@ from docx import Document
|
|
9
10
|
from pdfminer.high_level import extract_pages
|
10
11
|
from pdfminer.layout import LTTextContainer
|
11
12
|
from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
|
12
|
-
from pptx2md import
|
13
|
+
from pptx2md import ConversionConfig, convert
|
14
|
+
|
13
15
|
|
14
16
|
from lexoid.core.utils import (
|
15
17
|
get_file_type,
|
16
18
|
get_uri_rect,
|
17
19
|
html_to_markdown,
|
18
|
-
split_pdf,
|
19
20
|
split_md_by_headings,
|
21
|
+
split_pdf,
|
20
22
|
)
|
21
23
|
|
22
24
|
|
@@ -203,6 +205,25 @@ def embed_links_in_text(page, text, links):
|
|
203
205
|
return text
|
204
206
|
|
205
207
|
|
208
|
+
def detect_indentation_level(word, base_left_position):
|
209
|
+
"""Determine indentation level based on left position difference."""
|
210
|
+
left_diff = word["x0"] - base_left_position
|
211
|
+
if left_diff < 5:
|
212
|
+
return 0
|
213
|
+
return int(left_diff // 25) + 1
|
214
|
+
|
215
|
+
|
216
|
+
def embed_email_links(text: str) -> str:
|
217
|
+
"""
|
218
|
+
Detect email addresses in text and wrap them in angle brackets.
|
219
|
+
For example, 'mail@example.com' becomes '<mail@example.com>'.
|
220
|
+
"""
|
221
|
+
email_pattern = re.compile(
|
222
|
+
r"(?<![<\[])(?P<email>\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)(?![>\]])"
|
223
|
+
)
|
224
|
+
return email_pattern.sub(lambda match: f"<{match.group('email')}>", text)
|
225
|
+
|
226
|
+
|
206
227
|
def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
207
228
|
"""
|
208
229
|
Process a single page's content and return formatted markdown text.
|
@@ -213,7 +234,26 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
213
234
|
last_y = None
|
214
235
|
x_tolerance = kwargs.get("x_tolerance", 1)
|
215
236
|
y_tolerance = kwargs.get("y_tolerance", 5)
|
216
|
-
|
237
|
+
next_h_line_idx = 0
|
238
|
+
|
239
|
+
# First detect horizontal lines that could be markdown rules
|
240
|
+
horizontal_lines = []
|
241
|
+
if hasattr(page, "lines"):
|
242
|
+
for line in page.lines:
|
243
|
+
# Check if line is approximately horizontal (within 5 degrees)
|
244
|
+
if (
|
245
|
+
abs(line["height"]) < 0.1
|
246
|
+
or abs(line["width"]) > abs(line["height"]) * 20
|
247
|
+
):
|
248
|
+
# Consider it a horizontal rule candidate
|
249
|
+
horizontal_lines.append(
|
250
|
+
{
|
251
|
+
"top": line["top"],
|
252
|
+
"bottom": line["bottom"],
|
253
|
+
"x0": line["x0"],
|
254
|
+
"x1": line["x1"],
|
255
|
+
}
|
256
|
+
)
|
217
257
|
# Table settings
|
218
258
|
vertical_strategy = kwargs.get("vertical_strategy", "lines")
|
219
259
|
horizontal_strategy = kwargs.get("horizontal_strategy", "lines")
|
@@ -243,14 +283,43 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
243
283
|
extra_attrs=["size", "top", "bottom", "fontname"],
|
244
284
|
)
|
245
285
|
|
246
|
-
|
247
|
-
""
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
286
|
+
if words:
|
287
|
+
font_sizes = [w.get("size", 12) for w in words]
|
288
|
+
body_font_size = max(set(font_sizes), key=font_sizes.count)
|
289
|
+
else:
|
290
|
+
body_font_size = 12
|
291
|
+
|
292
|
+
left_positions = []
|
293
|
+
prev_bottom = None
|
294
|
+
|
295
|
+
for word in words:
|
296
|
+
# Check if this is likely a new line (first word in line)
|
297
|
+
if prev_bottom is None or abs(word["top"] - prev_bottom) > y_tolerance:
|
298
|
+
left_positions.append(word["x0"])
|
299
|
+
prev_bottom = word["top"]
|
300
|
+
|
301
|
+
# Find the most common minimum left position (mode)
|
302
|
+
if left_positions:
|
303
|
+
base_left = max(set(left_positions), key=left_positions.count)
|
304
|
+
else:
|
305
|
+
base_left = 0
|
306
|
+
|
307
|
+
for line in horizontal_lines:
|
308
|
+
# Check each word to see if it overlaps with this line
|
309
|
+
for word in words:
|
310
|
+
# Get word bounding box coordinates
|
311
|
+
word_left = word["x0"]
|
312
|
+
word_right = word["x1"]
|
313
|
+
word_top = word["top"]
|
314
|
+
word_bottom = word["bottom"]
|
315
|
+
|
316
|
+
# Check if word overlaps with line in both x and y dimensions
|
317
|
+
x_overlap = (word_left <= line["x1"]) and (word_right >= line["x0"])
|
318
|
+
y_overlap = (word_top <= line["bottom"]) and (word_bottom >= line["top"])
|
319
|
+
|
320
|
+
if x_overlap and y_overlap:
|
321
|
+
word["text"] = f"~~{word['text']}~~"
|
322
|
+
break
|
254
323
|
|
255
324
|
def get_text_formatting(word):
|
256
325
|
"""
|
@@ -260,19 +329,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
260
329
|
formatting = {
|
261
330
|
"bold": False,
|
262
331
|
"italic": False,
|
332
|
+
"monospace": False,
|
263
333
|
}
|
264
|
-
|
265
334
|
# Check font name for common bold/italic indicators
|
266
335
|
font_name = word.get("fontname", "").lower()
|
267
336
|
if any(style in font_name for style in ["bold", "heavy", "black"]):
|
268
337
|
formatting["bold"] = True
|
269
338
|
if any(style in font_name for style in ["italic", "oblique"]):
|
270
339
|
formatting["italic"] = True
|
271
|
-
|
340
|
+
if "mono" in font_name: # Detect monospace fonts
|
341
|
+
formatting["monospace"] = True
|
272
342
|
return formatting
|
273
343
|
|
274
344
|
def apply_markdown_formatting(text, formatting):
|
275
345
|
"""Apply markdown formatting to text based on detected styles"""
|
346
|
+
if formatting["monospace"]:
|
347
|
+
text = f"`{text}`"
|
276
348
|
if formatting["bold"] and formatting["italic"]:
|
277
349
|
text = f"***{text}***"
|
278
350
|
elif formatting["bold"]:
|
@@ -281,12 +353,64 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
281
353
|
text = f"*{text}*"
|
282
354
|
return text
|
283
355
|
|
284
|
-
def
|
285
|
-
|
356
|
+
def format_paragraph(text_elements):
|
357
|
+
"""
|
358
|
+
Format a paragraph with styling applied to individual words.
|
359
|
+
If all words are monospace, treat the paragraph as a code block.
|
360
|
+
Otherwise, wrap monospace words with backticks (`).
|
361
|
+
"""
|
362
|
+
|
363
|
+
all_monospace = True
|
364
|
+
formatted_words = []
|
365
|
+
|
366
|
+
for element in text_elements:
|
367
|
+
if isinstance(element, tuple) and element[0] == "indent":
|
368
|
+
indent = " " * element[1] * 3
|
369
|
+
formatted_words.append(indent)
|
370
|
+
continue
|
371
|
+
|
372
|
+
text = element["text"]
|
373
|
+
formatting = get_text_formatting(element)
|
374
|
+
|
375
|
+
if formatting.get("monospace", False):
|
376
|
+
# Wrap monospace words with backticks
|
377
|
+
formatted_words.append(f"`{text}`")
|
378
|
+
else:
|
379
|
+
all_monospace = False
|
380
|
+
# Apply other markdown formatting
|
381
|
+
formatted_words.append(apply_markdown_formatting(text, formatting))
|
382
|
+
|
383
|
+
# If all words are monospace, format as a code block
|
384
|
+
if all_monospace:
|
385
|
+
if isinstance(text_elements[0], tuple):
|
386
|
+
indent_str = " " * text_elements[0][1]
|
387
|
+
if len(text_elements) > 1:
|
388
|
+
text_elements = text_elements[1:]
|
389
|
+
text_elements[0]["text"] = indent_str + text_elements[0]["text"]
|
390
|
+
else:
|
391
|
+
return indent_str
|
392
|
+
code_content = " ".join([element["text"] for element in text_elements])
|
393
|
+
return f"```\n{code_content}\n```\n\n"
|
394
|
+
|
395
|
+
# Otherwise, return the formatted paragraph
|
396
|
+
return f"{' '.join(formatted_words)}\n\n"
|
397
|
+
|
398
|
+
def detect_heading_level(font_size, body_font_size):
|
399
|
+
"""Determine heading level based on font size ratio.
|
400
|
+
|
401
|
+
Args:
|
402
|
+
font_size: The font size to evaluate
|
403
|
+
body_font_size: The base body font size for comparison
|
404
|
+
|
405
|
+
Returns:
|
406
|
+
int: The heading level (1-3) or None if not a heading
|
407
|
+
"""
|
408
|
+
size_ratio = font_size / body_font_size
|
409
|
+
if size_ratio >= 2:
|
286
410
|
return 1
|
287
|
-
elif
|
411
|
+
elif size_ratio >= 1.4:
|
288
412
|
return 2
|
289
|
-
elif
|
413
|
+
elif size_ratio >= 1.2:
|
290
414
|
return 3
|
291
415
|
return None
|
292
416
|
|
@@ -303,18 +427,41 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
303
427
|
)
|
304
428
|
)
|
305
429
|
tables.sort(key=lambda x: x[1]["bottom"])
|
430
|
+
|
306
431
|
content_elements = []
|
307
|
-
for
|
432
|
+
for line in horizontal_lines:
|
433
|
+
content_elements.append(
|
434
|
+
(
|
435
|
+
"horizontal_line",
|
436
|
+
{
|
437
|
+
"top": line["top"],
|
438
|
+
"bottom": line["bottom"],
|
439
|
+
"x0": line["x0"],
|
440
|
+
"x1": line["x1"],
|
441
|
+
},
|
442
|
+
)
|
443
|
+
)
|
444
|
+
|
445
|
+
for i, word in enumerate(words):
|
308
446
|
while tables and word["bottom"] > tables[0][1]["bottom"]:
|
309
447
|
content_elements.append(tables.pop(0))
|
448
|
+
|
449
|
+
# Equate position of words on the same line
|
450
|
+
if i > 0 and abs(word["top"] - words[i - 1]["top"]) < 3:
|
451
|
+
word["top"] = words[i - 1]["top"]
|
452
|
+
|
310
453
|
content_elements.append(("word", word))
|
311
454
|
content_elements.extend(tables)
|
312
455
|
|
456
|
+
content_elements.sort(
|
457
|
+
key=lambda x: x[1]["top"] if isinstance(x[1], dict) and "top" in x[1] else 0
|
458
|
+
)
|
459
|
+
|
313
460
|
for element_type, element in content_elements:
|
461
|
+
# If there are any pending paragraphs or headings, add them first
|
314
462
|
if element_type == "table":
|
315
|
-
# If there are any pending paragraphs or headings, add them first
|
316
463
|
if current_heading:
|
317
|
-
level = detect_heading_level(current_heading[0]["size"])
|
464
|
+
level = detect_heading_level(current_heading[0]["size"], body_font_size)
|
318
465
|
heading_text = format_paragraph(current_heading)
|
319
466
|
markdown_content.append(f"{'#' * level} {heading_text}")
|
320
467
|
current_heading = []
|
@@ -324,11 +471,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
324
471
|
# Add the table
|
325
472
|
markdown_content.append(element["content"])
|
326
473
|
last_y = element["bottom"]
|
474
|
+
elif element_type == "horizontal_line":
|
475
|
+
while (next_h_line_idx < len(horizontal_lines)) and (
|
476
|
+
last_y is not None
|
477
|
+
and horizontal_lines[next_h_line_idx]["top"] <= last_y
|
478
|
+
):
|
479
|
+
# Insert the horizontal rule *after* the preceding text
|
480
|
+
if current_paragraph: # Flush any pending paragraph
|
481
|
+
markdown_content.append(format_paragraph(current_paragraph))
|
482
|
+
current_paragraph = []
|
483
|
+
markdown_content.append("\n---\n\n") # Add the rule
|
484
|
+
next_h_line_idx += 1
|
327
485
|
else:
|
328
486
|
# Process word
|
329
487
|
word = element
|
330
488
|
# Check if this might be a heading
|
331
|
-
heading_level = detect_heading_level(word["size"])
|
489
|
+
heading_level = detect_heading_level(word["size"], body_font_size)
|
332
490
|
|
333
491
|
# Detect new line based on vertical position
|
334
492
|
is_new_line = last_y is not None and abs(word["top"] - last_y) > y_tolerance
|
@@ -336,7 +494,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
336
494
|
if is_new_line:
|
337
495
|
# If we were collecting a heading
|
338
496
|
if current_heading:
|
339
|
-
level = detect_heading_level(
|
497
|
+
level = detect_heading_level(
|
498
|
+
current_heading[0]["size"], body_font_size
|
499
|
+
)
|
340
500
|
heading_text = format_paragraph(current_heading)
|
341
501
|
markdown_content.append(f"{'#' * level} {heading_text}")
|
342
502
|
current_heading = []
|
@@ -346,6 +506,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
346
506
|
markdown_content.append(format_paragraph(current_paragraph))
|
347
507
|
current_paragraph = []
|
348
508
|
|
509
|
+
indent_level = detect_indentation_level(word, base_left)
|
510
|
+
current_paragraph.append(("indent", indent_level))
|
511
|
+
|
349
512
|
# Add word to appropriate collection
|
350
513
|
if heading_level:
|
351
514
|
if current_paragraph: # Flush any pending paragraph
|
@@ -354,7 +517,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
354
517
|
current_heading.append(word)
|
355
518
|
else:
|
356
519
|
if current_heading: # Flush any pending heading
|
357
|
-
level = detect_heading_level(
|
520
|
+
level = detect_heading_level(
|
521
|
+
current_heading[0]["size"], body_font_size
|
522
|
+
)
|
358
523
|
heading_text = format_paragraph(current_heading)
|
359
524
|
markdown_content.append(f"{'#' * level} {heading_text}")
|
360
525
|
current_heading = []
|
@@ -364,7 +529,7 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
364
529
|
|
365
530
|
# Handle remaining content
|
366
531
|
if current_heading:
|
367
|
-
level = detect_heading_level(current_heading[0]["size"])
|
532
|
+
level = detect_heading_level(current_heading[0]["size"], body_font_size)
|
368
533
|
heading_text = format_paragraph(current_heading)
|
369
534
|
markdown_content.append(f"{'#' * level} {heading_text}")
|
370
535
|
|
@@ -383,8 +548,15 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
383
548
|
if links:
|
384
549
|
content = embed_links_in_text(page, content, links)
|
385
550
|
|
551
|
+
content = embed_email_links(content)
|
552
|
+
|
386
553
|
# Remove redundant formatting
|
387
|
-
content =
|
554
|
+
content = (
|
555
|
+
content.replace("** **", " ")
|
556
|
+
.replace("* *", " ")
|
557
|
+
.replace("` `", " ")
|
558
|
+
.replace("\n```\n\n```", "")
|
559
|
+
)
|
388
560
|
|
389
561
|
return content
|
390
562
|
|
lexoid/core/prompt_templates.py
CHANGED
@@ -41,7 +41,8 @@ Think step-by-step.
|
|
41
41
|
'0' is typically more oval than 'O'
|
42
42
|
'8' has a more angular top than 'B'
|
43
43
|
{custom_instructions}
|
44
|
-
- Return only the correct markdown without additional text or explanations.
|
44
|
+
- Return only the correct markdown without additional text or explanations.
|
45
|
+
- DO NOT use code blocks such as "```html" or "```markdown" in the output unless there is a code block in the content.
|
45
46
|
- Think before generating the output in <thinking></thinking> tags.
|
46
47
|
|
47
48
|
Remember, your primary objective is to create an output that, when rendered, structurally replicates the original document's content as closely as possible without losing any textual details.
|
lexoid/core/utils.py
CHANGED
@@ -345,7 +345,7 @@ def get_webpage_soup(url: str) -> BeautifulSoup:
|
|
345
345
|
# Additional wait for any dynamic content
|
346
346
|
try:
|
347
347
|
await page.wait_for_selector("body", timeout=30000)
|
348
|
-
except:
|
348
|
+
except Exception:
|
349
349
|
pass
|
350
350
|
|
351
351
|
html = await page.content()
|
@@ -561,24 +561,32 @@ def router(path: str, priority: str = "speed") -> str:
|
|
561
561
|
priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
|
562
562
|
"""
|
563
563
|
file_type = get_file_type(path)
|
564
|
-
if
|
564
|
+
if (
|
565
|
+
file_type.startswith("text/")
|
566
|
+
or "spreadsheet" in file_type
|
567
|
+
or "presentation" in file_type
|
568
|
+
):
|
565
569
|
return "STATIC_PARSE"
|
566
570
|
|
567
571
|
if priority == "accuracy":
|
568
572
|
# If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
|
569
573
|
# Otherwise, use LLM_PARSE
|
570
|
-
|
571
|
-
|
572
|
-
|
573
|
-
and
|
574
|
-
):
|
574
|
+
has_image = has_image_in_pdf(path)
|
575
|
+
has_hyperlink = has_hyperlink_in_pdf(path)
|
576
|
+
if file_type == "application/pdf" and not has_image and has_hyperlink:
|
577
|
+
logger.debug("Using STATIC_PARSE for PDF with hyperlinks and no images.")
|
575
578
|
return "STATIC_PARSE"
|
579
|
+
logger.debug(
|
580
|
+
f"Using LLM_PARSE because PDF has image ({has_image}) or has no hyperlink ({has_hyperlink})."
|
581
|
+
)
|
576
582
|
return "LLM_PARSE"
|
577
583
|
else:
|
578
584
|
# If the file is a PDF without images, use STATIC_PARSE
|
579
585
|
# Otherwise, use LLM_PARSE
|
580
586
|
if file_type == "application/pdf" and not has_image_in_pdf(path):
|
587
|
+
logger.debug("Using STATIC_PARSE for PDF without images.")
|
581
588
|
return "STATIC_PARSE"
|
589
|
+
logger.debug("Using LLM_PARSE because PDF has images")
|
582
590
|
return "LLM_PARSE"
|
583
591
|
|
584
592
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: lexoid
|
3
|
-
Version: 0.1.12
|
3
|
+
Version: 0.1.14
|
4
4
|
Summary:
|
5
5
|
Requires-Python: >=3.10,<4.0
|
6
6
|
Classifier: Programming Language :: Python :: 3
|
@@ -49,7 +49,8 @@ Description-Content-Type: text/markdown
|
|
49
49
|
</div>
|
50
50
|
|
51
51
|
[Open in Colab](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
|
52
|
-
[Hugging Face Space](https://huggingface.co/spaces/oidlabs/Lexoid)
|
53
|
+
[License](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
|
53
54
|
[PyPI](https://pypi.org/project/lexoid/)
|
54
55
|
[Documentation](https://oidlabs-com.github.io/Lexoid/)
|
55
56
|
|
@@ -144,6 +145,7 @@ print(parsed_md)
|
|
144
145
|
* Hugging Face
|
145
146
|
* Together AI
|
146
147
|
* OpenRouter
|
148
|
+
* Fireworks
|
147
149
|
|
148
150
|
## Benchmark
|
149
151
|
|
@@ -151,22 +153,25 @@ Results aggregated across 5 iterations each for 5 documents.
|
|
151
153
|
|
152
154
|
_Note:_ Benchmarks are currently done in the zero-shot setting.
|
153
155
|
|
154
|
-
| Rank | Model
|
155
|
-
|
|
156
|
-
| 1
|
157
|
-
| 2
|
158
|
-
| 3
|
159
|
-
| 4
|
160
|
-
| 5
|
161
|
-
| 6
|
162
|
-
| 7
|
163
|
-
| 8
|
164
|
-
| 9
|
165
|
-
| 10
|
166
|
-
| 11
|
167
|
-
| 12
|
168
|
-
| 13
|
169
|
-
| 14
|
170
|
-
| 15
|
171
|
-
| 16
|
156
|
+
| Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
|
157
|
+
| --- | --- | --- | --- | --- | --- |
|
158
|
+
| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
|
159
|
+
| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
|
160
|
+
| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
|
161
|
+
| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
|
162
|
+
| 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
|
163
|
+
| 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
|
164
|
+
| 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
|
165
|
+
| 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
|
166
|
+
| 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
|
167
|
+
| 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
|
168
|
+
| 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
|
169
|
+
| 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
|
170
|
+
| 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
|
171
|
+
| 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
|
172
|
+
| 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
|
173
|
+
| 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
|
174
|
+
| 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
|
175
|
+
| 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
|
176
|
+
| 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
|
172
177
|
|
@@ -0,0 +1,9 @@
|
|
1
|
+
lexoid/api.py,sha256=le24OVKpPZ4UYs3X77HsFjfRTTu50jdcLLDcp_Vn5-M,13924
|
2
|
+
lexoid/core/parse_type/llm_parser.py,sha256=gkjIuBa1IJ59FejLqMcwuvpzlm2VIotWQk02INQbiA4,13234
|
3
|
+
lexoid/core/parse_type/static_parser.py,sha256=IovvF1GCLWFPh2-mwcgv6DpJmSVQBLnGcoIq7bwQ39Q,21299
|
4
|
+
lexoid/core/prompt_templates.py,sha256=zftophGU0DNt0AKwYqN8jnAtH90-KlFTyJhs-I2yDiE,6351
|
5
|
+
lexoid/core/utils.py,sha256=R0zyOzprtDIwwR4XNhE0qzi1FBuLZxFsR4uKUCrtWtI,20138
|
6
|
+
lexoid-0.1.14.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
7
|
+
lexoid-0.1.14.dist-info/METADATA,sha256=o5543YUA6EF6iN7dTNWneQfzPeIeadpHTSqtnq3IV7Q,6383
|
8
|
+
lexoid-0.1.14.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
9
|
+
lexoid-0.1.14.dist-info/RECORD,,
|
lexoid-0.1.12.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
|
|
1
|
-
lexoid/api.py,sha256=lTkUcbGML29JrWJv4pE_ZqbzeJuHUE8b6OnijoLBEfU,11350
|
2
|
-
lexoid/core/parse_type/llm_parser.py,sha256=rrc1Lwp-6ZAi8IVp3672mHAHUs1JefhT2rnYyQ1gA5E,11292
|
3
|
-
lexoid/core/parse_type/static_parser.py,sha256=v4GWUmZVBBIF9TnbkhPBt2gspk0Oq_ujtNGnXZHLBr8,15055
|
4
|
-
lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
|
5
|
-
lexoid/core/utils.py,sha256=6s24X3-4Y57u70HzjIS798Tg8qx6Z3mLATf4xtENE-8,19718
|
6
|
-
lexoid-0.1.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
7
|
-
lexoid-0.1.12.dist-info/METADATA,sha256=XMHFMqwDj2DgSaZcZjXU881NxdPsRGBAsUyPyRsJvyU,6809
|
8
|
-
lexoid-0.1.12.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
9
|
-
lexoid-0.1.12.dist-info/RECORD,,
|
File without changes
|
File without changes
|