lexoid 0.1.11.post1__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/api.py +48 -8
- lexoid/core/parse_type/llm_parser.py +34 -26
- lexoid/core/parse_type/static_parser.py +236 -29
- lexoid/core/utils.py +11 -10
- {lexoid-0.1.11.post1.dist-info → lexoid-0.1.13.dist-info}/METADATA +60 -20
- lexoid-0.1.13.dist-info/RECORD +9 -0
- lexoid-0.1.11.post1.dist-info/RECORD +0 -9
- {lexoid-0.1.11.post1.dist-info → lexoid-0.1.13.dist-info}/LICENSE +0 -0
- {lexoid-0.1.11.post1.dist-info → lexoid-0.1.13.dist-info}/WHEEL +0 -0
lexoid/api.py
CHANGED
@@ -1,3 +1,4 @@
+import json
 import os
 import re
 import tempfile
@@ -50,7 +51,8 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
             - token_usage: Dictionary containing token usage statistics
     """
     if parser_type == ParserType.AUTO:
-
+        router_priority = kwargs.get("router_priority", "speed")
+        parser_type = ParserType[router(path, router_priority)]
         logger.debug(f"Auto-detected parser type: {parser_type}")
 
     kwargs["start"] = (
@@ -80,7 +82,7 @@ def parse_chunk_list(
     """
     combined_segments = []
     raw_texts = []
-    token_usage = {"input": 0, "output": 0}
+    token_usage = {"input": 0, "output": 0, "image_count": 0}
     for file_path in file_paths:
         result = parse_chunk(file_path, parser_type, **kwargs)
         combined_segments.extend(result["segments"])
@@ -88,6 +90,7 @@ def parse_chunk_list(
         if "token_usage" in result:
             token_usage["input"] += result["token_usage"]["input"]
             token_usage["output"] += result["token_usage"]["output"]
+            token_usage["image_count"] += len(result["segments"])
     token_usage["total"] = token_usage["input"] + token_usage["output"]
 
     return {
@@ -135,14 +138,20 @@ def parse(
 
     if type(parser_type) == str:
         parser_type = ParserType[parser_type]
+    if (
+        path.lower().endswith((".doc", ".docx"))
+        and parser_type != ParserType.STATIC_PARSE
+    ):
+        as_pdf = True
+    if path.lower().endswith(".xlsx") and parser_type == ParserType.LLM_PARSE:
+        logger.warning("LLM_PARSE does not support .xlsx files. Using STATIC_PARSE.")
+        parser_type = ParserType.STATIC_PARSE
+    if path.lower().endswith(".pptx") and parser_type == ParserType.LLM_PARSE:
+        logger.warning("LLM_PARSE does not support .pptx files. Using STATIC_PARSE.")
+        parser_type = ParserType.STATIC_PARSE
 
     with tempfile.TemporaryDirectory() as temp_dir:
-
-            path.lower().endswith((".doc", ".docx"))
-            and parser_type != ParserType.STATIC_PARSE
-        ):
-            as_pdf = True
-
+        kwargs["temp_dir"] = temp_dir
         if path.startswith(("http://", "https://")):
             kwargs["url"] = path
             download_dir = kwargs.get("save_dir", os.path.join(temp_dir, "downloads/"))
@@ -210,9 +219,40 @@ def parse(
             "token_usage": {
                 "input": sum(r["token_usage"]["input"] for r in chunk_results),
                 "output": sum(r["token_usage"]["output"] for r in chunk_results),
+                "image_count": sum(
+                    r["token_usage"]["image_count"] for r in chunk_results
+                ),
                 "total": sum(r["token_usage"]["total"] for r in chunk_results),
             },
         }
+
+        if "api_cost_mapping" in kwargs:
+            api_cost_mapping = kwargs["api_cost_mapping"]
+            if isinstance(api_cost_mapping, dict):
+                api_cost_mapping = api_cost_mapping
+            elif isinstance(api_cost_mapping, str) and os.path.exists(
+                api_cost_mapping
+            ):
+                with open(api_cost_mapping, "r") as f:
+                    api_cost_mapping = json.load(f)
+            else:
+                raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
+
+            api_cost = api_cost_mapping.get(
+                kwargs.get("model", "gemini-2.0-flash"), None
+            )
+            if api_cost:
+                token_usage = result["token_usage"]
+                token_cost = {
+                    "input": token_usage["input"] * api_cost["input"] / 1_000_000
+                    + api_cost.get("input-image", 0) * token_usage["image_count"],
+                    "output": token_usage["output"]
+                    * api_cost["output"]
+                    / 1_000_000,
+                }
+                token_cost["total"] = token_cost["input"] + token_cost["output"]
+                result["token_cost"] = token_cost
+
     if as_pdf:
         result["pdf_path"] = path
 
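Note on the new cost-tracking path in `parse()` above: `api_cost_mapping` may be a dict or a path to a JSON file, keyed by model name, holding per-million-token `input`/`output` rates and an optional per-image `input-image` rate. A minimal sketch of how a caller might use it; the document path and rate values below are placeholders, not actual provider pricing.

```python
from lexoid.api import parse

# Hypothetical pricing table: "input"/"output" are USD per 1M tokens,
# "input-image" is USD per input image. All numbers are placeholders.
cost_map = {
    "gemini-2.0-flash": {"input": 0.10, "output": 0.40, "input-image": 0.0},
}

result = parse(
    "sample.pdf",               # placeholder document path
    parser_type="LLM_PARSE",
    model="gemini-2.0-flash",
    api_cost_mapping=cost_map,  # or a path to a JSON file with the same shape
)
print(result["token_usage"])     # now also includes "image_count"
print(result.get("token_cost"))  # only set when the model has an entry in the mapping
```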
lexoid/core/parse_type/llm_parser.py
CHANGED
@@ -31,6 +31,7 @@ def retry_on_http_error(func):
             logger.error(f"HTTPError encountered: {e}. Retrying in 10 seconds...")
             time.sleep(10)
             try:
+                logger.debug(f"Retry {func.__name__}")
                 return func(*args, **kwargs)
             except HTTPError as e:
                 logger.error(f"Retry failed: {e}")
@@ -49,6 +50,8 @@ def retry_on_http_error(func):
 
 @retry_on_http_error
 def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
+    if "api_provider" in kwargs and kwargs["api_provider"]:
+        return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
     if "model" not in kwargs:
         kwargs["model"] = "gemini-2.0-flash"
     model = kwargs.get("model")
@@ -57,9 +60,11 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if model.startswith("gpt"):
         return parse_with_api(path, api="openai", **kwargs)
     if model.startswith("meta-llama"):
-        if
+        if "Turbo" in model or model == "meta-llama/Llama-Vision-Free":
             return parse_with_api(path, api="together", **kwargs)
         return parse_with_api(path, api="huggingface", **kwargs)
+    if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
+        return parse_with_api(path, api="openrouter", **kwargs)
     raise ValueError(f"Unsupported model: {model}")
 
 
@@ -81,20 +86,20 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
         file_content = file.read()
     base64_file = base64.b64encode(file_content).decode("utf-8")
 
-
-
-
-
+    if "system_prompt" in kwargs:
+        prompt = kwargs["system_prompt"]
+    else:
+        # Ideally, we do this ourselves. But, for now this might be a good enough.
+        custom_instruction = f"""- Total number of pages: {kwargs["pages_per_split_"]}. {INSTRUCTIONS_ADD_PG_BREAK}"""
+        if kwargs["pages_per_split_"] == 1:
+            custom_instruction = ""
+        prompt = PARSER_PROMPT.format(custom_instructions=custom_instruction)
 
     payload = {
         "contents": [
             {
                 "parts": [
-                    {
-                        "text": PARSER_PROMPT.format(
-                            custom_instructions=custom_instruction
-                        )
-                    },
+                    {"text": prompt},
                     {"inline_data": {"mime_type": mime_type, "data": base64_file}},
                 ]
             }
@@ -105,9 +110,11 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     }
 
     headers = {"Content-Type": "application/json"}
-
-
-
+    try:
+        response = requests.post(url, json=payload, headers=headers, timeout=120)
+        response.raise_for_status()
+    except requests.Timeout as e:
+        raise HTTPError(f"Timeout error occurred: {e}")
 
     result = response.json()
 
@@ -130,7 +137,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     total_tokens = input_tokens + output_tokens
 
     return {
-        "raw": combined_text,
+        "raw": combined_text.replace("<page-break>", "\n\n"),
         "segments": [
             {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
             for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
@@ -181,6 +188,10 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
             token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
         ),
         "together": lambda: Together(),
+        "openrouter": lambda: OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=os.environ["OPENROUTER_API_KEY"],
+        ),
     }
     assert api in clients, f"Unsupported API: {api}"
     logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
@@ -206,35 +217,32 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
 
     # API-specific message formatting
     def get_messages(page_num: int, image_url: str) -> List[Dict]:
-        base_message = {
-            "type": "text",
-            "text": LLAMA_PARSER_PROMPT,
-        }
         image_message = {
             "type": "image_url",
             "image_url": {"url": image_url},
         }
 
         if api == "openai":
+            system_prompt = kwargs.get(
+                "system_prompt", PARSER_PROMPT.format(custom_instructions="")
+            )
+            user_prompt = kwargs.get("user_prompt", OPENAI_USER_PROMPT)
             return [
                 {
                     "role": "system",
-                    "content":
-                        custom_instructions=INSTRUCTIONS_ADD_PG_BREAK
-                    ),
+                    "content": system_prompt,
                 },
                 {
                     "role": "user",
                     "content": [
-                        {
-                            "type": "text",
-                            "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
-                        },
+                        {"type": "text", "text": user_prompt},
                         image_message,
                     ],
                 },
             ]
         else:
+            prompt = kwargs.get("system_prompt", LLAMA_PARSER_PROMPT)
+            base_message = {"type": "text", "text": prompt}
             return [
                 {
                     "role": "user",
@@ -283,7 +291,7 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
     # Sort results by page number and combine
     all_results.sort(key=lambda x: x[0])
     all_texts = [text for _, text, _, _, _ in all_results]
-    combined_text = "
+    combined_text = "\n\n".join(all_texts)
 
     return {
         "raw": combined_text,
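The `parse_llm_doc` changes above add two routing paths: an explicit `api_provider` override, and prefix-based routing of `microsoft/…`, `google/…`, and `qwen/…` model names to OpenRouter (which reads `OPENROUTER_API_KEY`). A rough sketch through the public `parse()` entry point; the document path and key below are placeholders.

```python
import os
from lexoid.api import parse

os.environ.setdefault("OPENROUTER_API_KEY", "sk-or-...")  # placeholder key

# Prefix-based routing: "qwen/...", "google/...", and "microsoft/..." model names
# are sent to the OpenRouter-backed OpenAI client.
result = parse("sample.pdf", parser_type="LLM_PARSE",
               model="qwen/qwen-2.5-vl-7b-instruct")

# Or pin the provider explicitly, regardless of the model name prefix.
result = parse("sample.pdf", parser_type="LLM_PARSE",
               model="google/gemma-3-27b-it", api_provider="openrouter")
```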
lexoid/core/parse_type/static_parser.py
CHANGED
@@ -1,12 +1,25 @@
+import os
+import re
 import tempfile
+from time import time
+from typing import Dict, List
+
 import pandas as pd
 import pdfplumber
-from
-from lexoid.core.utils import get_file_type, get_uri_rect, html_to_markdown, split_pdf
+from docx import Document
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer
 from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
-from
+from pptx2md import ConversionConfig, convert
+
+
+from lexoid.core.utils import (
+    get_file_type,
+    get_uri_rect,
+    html_to_markdown,
+    split_md_by_headings,
+    split_pdf,
+)
 
 
 def parse_static_doc(path: str, **kwargs) -> Dict:
@@ -47,8 +60,11 @@ def parse_static_doc(path: str, **kwargs) -> Dict:
             "parent_title": kwargs.get("parent_title", ""),
             "recursive_docs": [],
         }
-    elif file_type == "text/csv":
-
+    elif file_type == "text/csv" or "spreadsheet" in file_type:
+        if "spreadsheet" in file_type:
+            df = pd.read_excel(path)
+        else:
+            df = pd.read_csv(path)
         content = df.to_markdown(index=False)
         return {
             "raw": content,
@@ -58,6 +74,27 @@ def parse_static_doc(path: str, **kwargs) -> Dict:
             "parent_title": kwargs.get("parent_title", ""),
             "recursive_docs": [],
         }
+    elif "presentation" in file_type:
+        md_path = os.path.join(kwargs["temp_dir"], f"{int(time())}.md")
+        convert(
+            ConversionConfig(
+                pptx_path=path,
+                output_path=md_path,
+                image_dir=None,
+                disable_image=True,
+                disable_notes=True,
+            )
+        )
+        with open(md_path, "r") as f:
+            content = f.read()
+        return {
+            "raw": content,
+            "segments": split_md_by_headings(content, "#"),
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
     else:
         raise ValueError(f"Unsupported file type: {file_type}")
 
@@ -168,6 +205,25 @@ def embed_links_in_text(page, text, links):
     return text
 
 
+def detect_indentation_level(word, base_left_position):
+    """Determine indentation level based on left position difference."""
+    left_diff = word["x0"] - base_left_position
+    if left_diff < 5:
+        return 0
+    return int(left_diff // 25) + 1
+
+
+def embed_email_links(text: str) -> str:
+    """
+    Detect email addresses in text and wrap them in angle brackets.
+    For example, 'mail@example.com' becomes '<mail@example.com>'.
+    """
+    email_pattern = re.compile(
+        r"(?<![<\[])(?P<email>\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)(?![>\]])"
+    )
+    return email_pattern.sub(lambda match: f"<{match.group('email')}>", text)
+
+
 def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     """
     Process a single page's content and return formatted markdown text.
@@ -178,7 +234,26 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     last_y = None
     x_tolerance = kwargs.get("x_tolerance", 1)
     y_tolerance = kwargs.get("y_tolerance", 5)
-
+    next_h_line_idx = 0
+
+    # First detect horizontal lines that could be markdown rules
+    horizontal_lines = []
+    if hasattr(page, "lines"):
+        for line in page.lines:
+            # Check if line is approximately horizontal (within 5 degrees)
+            if (
+                abs(line["height"]) < 0.1
+                or abs(line["width"]) > abs(line["height"]) * 20
+            ):
+                # Consider it a horizontal rule candidate
+                horizontal_lines.append(
+                    {
+                        "top": line["top"],
+                        "bottom": line["bottom"],
+                        "x0": line["x0"],
+                        "x1": line["x1"],
+                    }
+                )
     # Table settings
     vertical_strategy = kwargs.get("vertical_strategy", "lines")
     horizontal_strategy = kwargs.get("horizontal_strategy", "lines")
@@ -208,14 +283,43 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         extra_attrs=["size", "top", "bottom", "fontname"],
     )
 
-
-        ""
-
-
-
-
-
-
+    if words:
+        font_sizes = [w.get("size", 12) for w in words]
+        body_font_size = max(set(font_sizes), key=font_sizes.count)
+    else:
+        body_font_size = 12
+
+    left_positions = []
+    prev_bottom = None
+
+    for word in words:
+        # Check if this is likely a new line (first word in line)
+        if prev_bottom is None or abs(word["top"] - prev_bottom) > y_tolerance:
+            left_positions.append(word["x0"])
+        prev_bottom = word["top"]
+
+    # Find the most common minimum left position (mode)
+    if left_positions:
+        base_left = max(set(left_positions), key=left_positions.count)
+    else:
+        base_left = 0
+
+    for line in horizontal_lines:
+        # Check each word to see if it overlaps with this line
+        for word in words:
+            # Get word bounding box coordinates
+            word_left = word["x0"]
+            word_right = word["x1"]
+            word_top = word["top"]
+            word_bottom = word["bottom"]
+
+            # Check if word overlaps with line in both x and y dimensions
+            x_overlap = (word_left <= line["x1"]) and (word_right >= line["x0"])
+            y_overlap = (word_top <= line["bottom"]) and (word_bottom >= line["top"])
+
+            if x_overlap and y_overlap:
+                word["text"] = f"~~{word['text']}~~"
+                break
 
     def get_text_formatting(word):
         """
@@ -225,19 +329,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         formatting = {
             "bold": False,
             "italic": False,
+            "monospace": False,
         }
-
         # Check font name for common bold/italic indicators
         font_name = word.get("fontname", "").lower()
        if any(style in font_name for style in ["bold", "heavy", "black"]):
             formatting["bold"] = True
         if any(style in font_name for style in ["italic", "oblique"]):
             formatting["italic"] = True
-
+        if "mono" in font_name:  # Detect monospace fonts
+            formatting["monospace"] = True
         return formatting
 
     def apply_markdown_formatting(text, formatting):
         """Apply markdown formatting to text based on detected styles"""
+        if formatting["monospace"]:
+            text = f"`{text}`"
         if formatting["bold"] and formatting["italic"]:
             text = f"***{text}***"
         elif formatting["bold"]:
@@ -246,12 +353,64 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
             text = f"*{text}*"
         return text
 
-    def
-
+    def format_paragraph(text_elements):
+        """
+        Format a paragraph with styling applied to individual words.
+        If all words are monospace, treat the paragraph as a code block.
+        Otherwise, wrap monospace words with backticks (`).
+        """
+
+        all_monospace = True
+        formatted_words = []
+
+        for element in text_elements:
+            if isinstance(element, tuple) and element[0] == "indent":
+                indent = " " * element[1] * 3
+                formatted_words.append(indent)
+                continue
+
+            text = element["text"]
+            formatting = get_text_formatting(element)
+
+            if formatting.get("monospace", False):
+                # Wrap monospace words with backticks
+                formatted_words.append(f"`{text}`")
+            else:
+                all_monospace = False
+                # Apply other markdown formatting
+                formatted_words.append(apply_markdown_formatting(text, formatting))
+
+        # If all words are monospace, format as a code block
+        if all_monospace:
+            if isinstance(text_elements[0], tuple):
+                indent_str = " " * text_elements[0][1]
+                if len(text_elements) > 1:
+                    text_elements = text_elements[1:]
+                    text_elements[0]["text"] = indent_str + text_elements[0]["text"]
+                else:
+                    return indent_str
+            code_content = " ".join([element["text"] for element in text_elements])
+            return f"```\n{code_content}\n```\n\n"
+
+        # Otherwise, return the formatted paragraph
+        return f"{' '.join(formatted_words)}\n\n"
+
+    def detect_heading_level(font_size, body_font_size):
+        """Determine heading level based on font size ratio.
+
+        Args:
+            font_size: The font size to evaluate
+            body_font_size: The base body font size for comparison
+
+        Returns:
+            int: The heading level (1-3) or None if not a heading
+        """
+        size_ratio = font_size / body_font_size
+        if size_ratio >= 2:
             return 1
-        elif
+        elif size_ratio >= 1.4:
             return 2
-        elif
+        elif size_ratio >= 1.2:
             return 3
         return None
 
@@ -268,18 +427,41 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         )
     )
     tables.sort(key=lambda x: x[1]["bottom"])
+
     content_elements = []
-    for
+    for line in horizontal_lines:
+        content_elements.append(
+            (
+                "horizontal_line",
+                {
+                    "top": line["top"],
+                    "bottom": line["bottom"],
+                    "x0": line["x0"],
+                    "x1": line["x1"],
+                },
+            )
+        )
+
+    for i, word in enumerate(words):
         while tables and word["bottom"] > tables[0][1]["bottom"]:
             content_elements.append(tables.pop(0))
+
+        # Equate position of words on the same line
+        if i > 0 and abs(word["top"] - words[i - 1]["top"]) < 3:
+            word["top"] = words[i - 1]["top"]
+
         content_elements.append(("word", word))
     content_elements.extend(tables)
 
+    content_elements.sort(
+        key=lambda x: x[1]["top"] if isinstance(x[1], dict) and "top" in x[1] else 0
+    )
+
     for element_type, element in content_elements:
+        # If there are any pending paragraphs or headings, add them first
         if element_type == "table":
-            # If there are any pending paragraphs or headings, add them first
             if current_heading:
-                level = detect_heading_level(current_heading[0]["size"])
+                level = detect_heading_level(current_heading[0]["size"], body_font_size)
                 heading_text = format_paragraph(current_heading)
                 markdown_content.append(f"{'#' * level} {heading_text}")
                 current_heading = []
@@ -289,11 +471,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
             # Add the table
             markdown_content.append(element["content"])
             last_y = element["bottom"]
+        elif element_type == "horizontal_line":
+            while (next_h_line_idx < len(horizontal_lines)) and (
+                last_y is not None
+                and horizontal_lines[next_h_line_idx]["top"] <= last_y
+            ):
+                # Insert the horizontal rule *after* the preceding text
+                if current_paragraph:  # Flush any pending paragraph
+                    markdown_content.append(format_paragraph(current_paragraph))
+                    current_paragraph = []
+                markdown_content.append("\n---\n\n")  # Add the rule
+                next_h_line_idx += 1
         else:
             # Process word
             word = element
             # Check if this might be a heading
-            heading_level = detect_heading_level(word["size"])
+            heading_level = detect_heading_level(word["size"], body_font_size)
 
             # Detect new line based on vertical position
             is_new_line = last_y is not None and abs(word["top"] - last_y) > y_tolerance
@@ -301,7 +494,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
             if is_new_line:
                 # If we were collecting a heading
                 if current_heading:
-                    level = detect_heading_level(
+                    level = detect_heading_level(
+                        current_heading[0]["size"], body_font_size
+                    )
                     heading_text = format_paragraph(current_heading)
                     markdown_content.append(f"{'#' * level} {heading_text}")
                     current_heading = []
@@ -311,6 +506,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
                     markdown_content.append(format_paragraph(current_paragraph))
                     current_paragraph = []
 
+                indent_level = detect_indentation_level(word, base_left)
+                current_paragraph.append(("indent", indent_level))
+
             # Add word to appropriate collection
             if heading_level:
                 if current_paragraph:  # Flush any pending paragraph
@@ -319,7 +517,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
                 current_heading.append(word)
             else:
                 if current_heading:  # Flush any pending heading
-                    level = detect_heading_level(
+                    level = detect_heading_level(
+                        current_heading[0]["size"], body_font_size
+                    )
                     heading_text = format_paragraph(current_heading)
                     markdown_content.append(f"{'#' * level} {heading_text}")
                     current_heading = []
@@ -329,7 +529,7 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
 
     # Handle remaining content
     if current_heading:
-        level = detect_heading_level(current_heading[0]["size"])
+        level = detect_heading_level(current_heading[0]["size"], body_font_size)
         heading_text = format_paragraph(current_heading)
         markdown_content.append(f"{'#' * level} {heading_text}")
 
@@ -348,8 +548,15 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     if links:
         content = embed_links_in_text(page, content, links)
 
+    content = embed_email_links(content)
+
     # Remove redundant formatting
-    content =
+    content = (
+        content.replace("** **", " ")
+        .replace("* *", " ")
+        .replace("` `", " ")
+        .replace("\n```\n\n```", "")
+    )
 
     return content
 
@@ -389,7 +596,7 @@ def parse_with_pdfplumber(path: str, **kwargs) -> Dict:
     ]
 
     return {
-        "raw": "
+        "raw": "\n\n".join(page_texts),
         "segments": segments,
         "title": kwargs["title"],
         "url": kwargs.get("url", ""),
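With the new branches in `parse_static_doc` above, spreadsheets are read via `pandas.read_excel` and rendered as markdown tables, presentations are converted with `pptx2md` and split on `#` headings, and PDF heading levels are now computed relative to the detected body font size. A small sketch of the spreadsheet/presentation path through the public API, assuming placeholder file names and the usual top-level `raw`/`segments` keys returned by `parse`:

```python
from lexoid.api import parse

# Both formats are routed to STATIC_PARSE, even if LLM_PARSE is requested explicitly.
sheet = parse("report.xlsx")   # placeholder: spreadsheet -> markdown table via pandas
deck = parse("slides.pptx")    # placeholder: presentation -> markdown via pptx2md

print(sheet["raw"][:200])
print(len(deck["segments"]), "segments split on '#' headings")
```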
lexoid/core/utils.py
CHANGED
@@ -46,7 +46,7 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
 
 
 def create_sub_pdf(
-    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...]|int] = None
+    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...] | int] = None
 ) -> str:
     if isinstance(page_nums, int):
         page_nums = (page_nums,)
@@ -106,6 +106,8 @@ def is_supported_file_type(path: str) -> bool:
     if (
         file_type == "application/pdf"
         or "wordprocessing" in file_type
+        or "spreadsheet" in file_type
+        or "presentation" in file_type
         or file_type.startswith("image/")
         or file_type.startswith("text")
     ):
@@ -217,7 +219,7 @@ def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Di
         pattern = r"^([^\n]+)\n-+$"
         sections = re.split(pattern, markdown_content, flags=re.MULTILINE)
         # Remove empty sections and strip whitespace
-        sections = [section.strip() for section in sections
+        sections = [section.strip() for section in sections]
 
         # Handle content before first heading if it exists
         if sections and not re.match(r"^[^\n]+\n-+$", sections[0], re.MULTILINE):
@@ -244,7 +246,7 @@ def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Di
         headings = re.findall(regex, markdown_content, flags=re.MULTILINE)
 
         # Remove empty sections and strip whitespace
-        sections = [section.strip() for section in sections
+        sections = [section.strip() for section in sections]
 
         # Handle content before first heading if it exists
         if len(sections) > len(headings):
@@ -299,6 +301,7 @@ def html_to_markdown(html: str, title: str, url: str) -> str:
 
     return content
 
+
 def get_webpage_soup(url: str) -> BeautifulSoup:
     try:
         from playwright.async_api import async_playwright
@@ -549,7 +552,7 @@ def has_hyperlink_in_pdf(path: str):
     )
 
 
-def router(path: str, priority: str = "accuracy") -> str:
+def router(path: str, priority: str = "speed") -> str:
     """
     Routes the file path to the appropriate parser based on the file type.
 
@@ -558,9 +561,9 @@ def router(path: str, priority: str = "accuracy") -> str:
     priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
     """
     file_type = get_file_type(path)
-    if file_type.startswith("text/"):
+    if file_type.startswith("text/") or "spreadsheet" in file_type or "presentation" in file_type:
         return "STATIC_PARSE"
-
+
     if priority == "accuracy":
         # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
         # Otherwise, use LLM_PARSE
@@ -574,13 +577,11 @@ def router(path: str, priority: str = "accuracy") -> str:
     else:
         # If the file is a PDF without images, use STATIC_PARSE
        # Otherwise, use LLM_PARSE
-        if (
-            file_type == "application/pdf"
-            and not has_image_in_pdf(path)
-        ):
+        if file_type == "application/pdf" and not has_image_in_pdf(path):
            return "STATIC_PARSE"
     return "LLM_PARSE"
 
+
 def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
     temp_path = os.path.join(
         temp_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
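The `router` changes above flip the default `priority` to "speed" and send spreadsheets and presentations straight to the static parser. A quick sketch of the resulting decisions; file names are placeholders, and the PDF outcomes depend on whether the file contains images or hyperlinks:

```python
from lexoid.core.utils import router

router("notes.txt")    # "STATIC_PARSE": text/* always goes static
router("report.xlsx")  # "STATIC_PARSE": spreadsheets now go static
router("deck.pptx")    # "STATIC_PARSE": presentations now go static

# With the default "speed" priority, an image-free PDF goes static;
# anything else falls through to "LLM_PARSE".
router("scan.pdf")

# With "accuracy", only an image-free PDF that contains hyperlinks goes static.
router("report.pdf", priority="accuracy")
```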
{lexoid-0.1.11.post1.dist-info → lexoid-0.1.13.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.11.post1
+Version: 0.1.13
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -18,10 +18,12 @@ Requires-Dist: markdownify (>=0.13.1,<0.14.0)
 Requires-Dist: nest-asyncio (>=1.6.0,<2.0.0)
 Requires-Dist: openai (>=1.47.0,<2.0.0)
 Requires-Dist: opencv-python (>=4.10.0.84,<5.0.0.0)
+Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: pdfplumber (>=0.11.4,<0.12.0)
 Requires-Dist: pikepdf (>=9.3.0,<10.0.0)
 Requires-Dist: playwright (>=1.49.0,<2.0.0)
+Requires-Dist: pptx2md (>=2.0.6,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
 Requires-Dist: pyqt5 (>=5.15.11,<6.0.0) ; platform_system != "debian"
 Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
@@ -31,7 +33,20 @@ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: together (>=1.4.0,<2.0.0)
 Description-Content-Type: text/markdown
 
-
+<div align="center">
+
+```
+ ___ _______ __ __ _______ ___ ______
+| | | || |_| || || | | |
+| | | ___|| || _ || | | _ |
+| | | |___ | || | | || | | | | |
+| |___ | ___| | | | |_| || | | |_| |
+| || |___ | _ || || | | |
+|_______||_______||__| |__||_______||___| |______|
+
+```
+
+</div>
 
 [](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
 [](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
@@ -43,54 +58,67 @@ Lexoid is an efficient document parsing library that supports both LLM-based and
 [Documentation](https://oidlabs-com.github.io/Lexoid/)
 
 ## Motivation:
+
 - Use the multi-modal advancement of LLMs
 - Enable convenience for users
 - Collaborate with a permissive license
 
 ## Installation
+
 ### Installing with pip
+
 ```
 pip install lexoid
 ```
 
 To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
+
 ```
 OPENAI_API_KEY=""
 GOOGLE_API_KEY=""
 ```
 
 Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library):
+
 ```
 playwright install --with-deps --only-shell chromium
 ```
 
 ### Building `.whl` from source
+
 ```
 make build
 ```
 
 ### Creating a local installation
+
 To install dependencies:
+
 ```
 make install
 ```
+
 or, to install with dev-dependencies:
+
 ```
 make dev
 ```
 
 To activate virtual environment:
+
 ```
 source .venv/bin/activate
 ```
 
 ## Usage
+
 [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)
 
-[Example Colab Notebook](https://
+[Example Colab Notebook](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
 
 Here's a quick example to parse documents using Lexoid:
-
+
+```python
 from lexoid.api import parse
 from lexoid.api import ParserType
 
@@ -103,30 +131,42 @@ print(parsed_md)
 ```
 
 ### Parameters
+
 - path (str): The file path or URL.
 - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
 - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
 - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
--
+- \*\*kwargs: Additional arguments for the parser.
+
+## Supported API Providers
+* Google
+* OpenAI
+* Hugging Face
+* Together AI
+* OpenRouter
 
 ## Benchmark
+
 Results aggregated across 5 iterations each for 5 documents.
 
 _Note:_ Benchmarks are currently done in the zero-shot setting.
 
-| Rank | Model
-
-| 1
-| 2
-| 3
-| 4
-| 5
-| 6
-| 7
-| 8
-| 9
-| 10
-| 11
-| 12
-| 13
+| Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost($) |
+| ---- | ----------------------------------------------------- | --------------- | --------- | -------- | -------- |
+| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.000480 |
+| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
+| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
+| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
+| 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
+| 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
+| 7 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
+| 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
+| 9 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
+| 10 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
+| 11 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
+| 12 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.000060 |
+| 13 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
+| 14 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
+| 15 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
+| 16 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.80 | 0.000019 |
 
lexoid-0.1.13.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=lTkUcbGML29JrWJv4pE_ZqbzeJuHUE8b6OnijoLBEfU,11350
+lexoid/core/parse_type/llm_parser.py,sha256=rrc1Lwp-6ZAi8IVp3672mHAHUs1JefhT2rnYyQ1gA5E,11292
+lexoid/core/parse_type/static_parser.py,sha256=IovvF1GCLWFPh2-mwcgv6DpJmSVQBLnGcoIq7bwQ39Q,21299
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=6s24X3-4Y57u70HzjIS798Tg8qx6Z3mLATf4xtENE-8,19718
+lexoid-0.1.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.13.dist-info/METADATA,sha256=GHODqox4lX6qf_gjSy8ULYJZhaKKQ1BDKEUAOMi7R2U,6809
+lexoid-0.1.13.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.13.dist-info/RECORD,,
lexoid-0.1.11.post1.dist-info/RECORD
REMOVED
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=CIZBNvh38PJbD0OwK1Mp0qqkWxkAEBw2L_FkoCmagXA,9288
-lexoid/core/parse_type/llm_parser.py,sha256=XfsN6RAtb14p31U2jL-9QyRKpkNAGXXiK3urWJIFi2U,10625
-lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=ZB-HnSsQLmbg0zx1uHlIDnLuitENylRVCIt1nVcYrCc,19657
-lexoid-0.1.11.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.11.post1.dist-info/METADATA,sha256=b_XJEbQBQuvYNkEkJY1CYByVj1BMayP2g1H_Ybjo0VU,4844
-lexoid-0.1.11.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.11.post1.dist-info/RECORD,,
{lexoid-0.1.11.post1.dist-info → lexoid-0.1.13.dist-info}/LICENSE
File without changes
{lexoid-0.1.11.post1.dist-info → lexoid-0.1.13.dist-info}/WHEEL
File without changes