lexoid 0.1.11.post1__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
lexoid/api.py CHANGED
@@ -1,3 +1,4 @@
+import json
 import os
 import re
 import tempfile
@@ -50,7 +51,8 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
         - token_usage: Dictionary containing token usage statistics
     """
     if parser_type == ParserType.AUTO:
-        parser_type = ParserType[router(path)]
+        router_priority = kwargs.get("router_priority", "speed")
+        parser_type = ParserType[router(path, router_priority)]
         logger.debug(f"Auto-detected parser type: {parser_type}")

     kwargs["start"] = (
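For illustration, a minimal sketch of how the new `router_priority` option can be exercised from the public API, assuming (as the surrounding code suggests) that keyword arguments passed to `parse` are forwarded down to `parse_chunk`; the file name is a placeholder:

```python
from lexoid.api import parse

# "speed" is the new default routing priority; "accuracy" makes the router
# prefer LLM_PARSE for PDFs that would otherwise be parsed statically.
result = parse("report.pdf", parser_type="AUTO", router_priority="accuracy")
```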
@@ -80,7 +82,7 @@ def parse_chunk_list(
     """
     combined_segments = []
     raw_texts = []
-    token_usage = {"input": 0, "output": 0}
+    token_usage = {"input": 0, "output": 0, "image_count": 0}
     for file_path in file_paths:
         result = parse_chunk(file_path, parser_type, **kwargs)
         combined_segments.extend(result["segments"])
@@ -88,6 +90,7 @@ def parse_chunk_list(
         if "token_usage" in result:
             token_usage["input"] += result["token_usage"]["input"]
             token_usage["output"] += result["token_usage"]["output"]
+            token_usage["image_count"] += len(result["segments"])
             token_usage["total"] = token_usage["input"] + token_usage["output"]

     return {
@@ -135,14 +138,20 @@ def parse(

     if type(parser_type) == str:
         parser_type = ParserType[parser_type]
+    if (
+        path.lower().endswith((".doc", ".docx"))
+        and parser_type != ParserType.STATIC_PARSE
+    ):
+        as_pdf = True
+    if path.lower().endswith(".xlsx") and parser_type == ParserType.LLM_PARSE:
+        logger.warning("LLM_PARSE does not support .xlsx files. Using STATIC_PARSE.")
+        parser_type = ParserType.STATIC_PARSE
+    if path.lower().endswith(".pptx") and parser_type == ParserType.LLM_PARSE:
+        logger.warning("LLM_PARSE does not support .pptx files. Using STATIC_PARSE.")
+        parser_type = ParserType.STATIC_PARSE

     with tempfile.TemporaryDirectory() as temp_dir:
-        if (
-            path.lower().endswith((".doc", ".docx"))
-            and parser_type != ParserType.STATIC_PARSE
-        ):
-            as_pdf = True
-
+        kwargs["temp_dir"] = temp_dir
         if path.startswith(("http://", "https://")):
            kwargs["url"] = path
            download_dir = kwargs.get("save_dir", os.path.join(temp_dir, "downloads/"))
@@ -210,9 +219,40 @@ def parse(
             "token_usage": {
                 "input": sum(r["token_usage"]["input"] for r in chunk_results),
                 "output": sum(r["token_usage"]["output"] for r in chunk_results),
+                "image_count": sum(
+                    r["token_usage"]["image_count"] for r in chunk_results
+                ),
                 "total": sum(r["token_usage"]["total"] for r in chunk_results),
             },
         }
+
+        if "api_cost_mapping" in kwargs:
+            api_cost_mapping = kwargs["api_cost_mapping"]
+            if isinstance(api_cost_mapping, dict):
+                api_cost_mapping = api_cost_mapping
+            elif isinstance(api_cost_mapping, str) and os.path.exists(
+                api_cost_mapping
+            ):
+                with open(api_cost_mapping, "r") as f:
+                    api_cost_mapping = json.load(f)
+            else:
+                raise ValueError(f"Unsupported API cost value: {api_cost_mapping}.")
+
+            api_cost = api_cost_mapping.get(
+                kwargs.get("model", "gemini-2.0-flash"), None
+            )
+            if api_cost:
+                token_usage = result["token_usage"]
+                token_cost = {
+                    "input": token_usage["input"] * api_cost["input"] / 1_000_000
+                    + api_cost.get("input-image", 0) * token_usage["image_count"],
+                    "output": token_usage["output"]
+                    * api_cost["output"]
+                    / 1_000_000,
+                }
+                token_cost["total"] = token_cost["input"] + token_cost["output"]
+                result["token_cost"] = token_cost
+
         if as_pdf:
             result["pdf_path"] = path

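A hedged sketch of the cost accounting added above; the prices below are placeholders rather than published rates, and `api_cost_mapping` may also be a path to a JSON file with the same structure:

```python
from lexoid.api import parse

# Hypothetical per-million-token prices; "input-image" is charged per image.
api_costs = {
    "gemini-2.0-flash": {"input": 0.10, "output": 0.40, "input-image": 0.0}
}

result = parse(
    "sample.pdf",
    parser_type="LLM_PARSE",
    model="gemini-2.0-flash",
    api_cost_mapping=api_costs,
)
print(result["token_usage"])     # now also reports "image_count"
print(result.get("token_cost"))  # {"input": ..., "output": ..., "total": ...}
```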
lexoid/core/parse_type/llm_parser.py CHANGED
@@ -31,6 +31,7 @@ def retry_on_http_error(func):
             logger.error(f"HTTPError encountered: {e}. Retrying in 10 seconds...")
             time.sleep(10)
             try:
+                logger.debug(f"Retry {func.__name__}")
                 return func(*args, **kwargs)
             except HTTPError as e:
                 logger.error(f"Retry failed: {e}")
@@ -49,6 +50,8 @@ def retry_on_http_error(func):

 @retry_on_http_error
 def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
+    if "api_provider" in kwargs and kwargs["api_provider"]:
+        return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
     if "model" not in kwargs:
         kwargs["model"] = "gemini-2.0-flash"
     model = kwargs.get("model")
@@ -57,9 +60,11 @@ def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if model.startswith("gpt"):
         return parse_with_api(path, api="openai", **kwargs)
     if model.startswith("meta-llama"):
-        if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
+        if "Turbo" in model or model == "meta-llama/Llama-Vision-Free":
             return parse_with_api(path, api="together", **kwargs)
         return parse_with_api(path, api="huggingface", **kwargs)
+    if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
+        return parse_with_api(path, api="openrouter", **kwargs)
     raise ValueError(f"Unsupported model: {model}")


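Per the routing above, model names prefixed with `microsoft`, `google`, or `qwen` are sent to OpenRouter (which needs `OPENROUTER_API_KEY` set), and an explicit `api_provider` keyword bypasses the prefix checks entirely. A small sketch, assuming kwargs are forwarded from `parse` down to `parse_llm_doc`; file and model names are examples only:

```python
from lexoid.api import parse

# Routed to OpenRouter because of the "qwen" prefix.
result = parse("scan.pdf", parser_type="LLM_PARSE",
               model="qwen/qwen-2.5-vl-7b-instruct")

# Explicit provider selection, skipping the prefix-based routing.
result = parse("scan.pdf", parser_type="LLM_PARSE",
               api_provider="openrouter", model="google/gemma-3-27b-it")
```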
@@ -81,20 +86,20 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
         file_content = file.read()
     base64_file = base64.b64encode(file_content).decode("utf-8")

-    # Ideally, we do this ourselves. But, for now this might be a good enough.
-    custom_instruction = f"""- Total number of pages: {kwargs["pages_per_split_"]}. {INSTRUCTIONS_ADD_PG_BREAK}"""
-    if kwargs["pages_per_split_"] == 1:
-        custom_instruction = ""
+    if "system_prompt" in kwargs:
+        prompt = kwargs["system_prompt"]
+    else:
+        # Ideally, we do this ourselves. But, for now this might be a good enough.
+        custom_instruction = f"""- Total number of pages: {kwargs["pages_per_split_"]}. {INSTRUCTIONS_ADD_PG_BREAK}"""
+        if kwargs["pages_per_split_"] == 1:
+            custom_instruction = ""
+        prompt = PARSER_PROMPT.format(custom_instructions=custom_instruction)

     payload = {
         "contents": [
             {
                 "parts": [
-                    {
-                        "text": PARSER_PROMPT.format(
-                            custom_instructions=custom_instruction
-                        )
-                    },
+                    {"text": prompt},
                     {"inline_data": {"mime_type": mime_type, "data": base64_file}},
                 ]
             }
@@ -105,9 +110,11 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     }

     headers = {"Content-Type": "application/json"}
-
-    response = requests.post(url, json=payload, headers=headers)
-    response.raise_for_status()
+    try:
+        response = requests.post(url, json=payload, headers=headers, timeout=120)
+        response.raise_for_status()
+    except requests.Timeout as e:
+        raise HTTPError(f"Timeout error occurred: {e}")

     result = response.json()

@@ -130,7 +137,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     total_tokens = input_tokens + output_tokens

     return {
-        "raw": combined_text,
+        "raw": combined_text.replace("<page-break>", "\n\n"),
         "segments": [
             {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
             for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
@@ -181,6 +188,10 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
             token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
         ),
         "together": lambda: Together(),
+        "openrouter": lambda: OpenAI(
+            base_url="https://openrouter.ai/api/v1",
+            api_key=os.environ["OPENROUTER_API_KEY"],
+        ),
     }
     assert api in clients, f"Unsupported API: {api}"
     logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
@@ -206,35 +217,32 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:

     # API-specific message formatting
     def get_messages(page_num: int, image_url: str) -> List[Dict]:
-        base_message = {
-            "type": "text",
-            "text": LLAMA_PARSER_PROMPT,
-        }
         image_message = {
             "type": "image_url",
             "image_url": {"url": image_url},
         }

         if api == "openai":
+            system_prompt = kwargs.get(
+                "system_prompt", PARSER_PROMPT.format(custom_instructions="")
+            )
+            user_prompt = kwargs.get("user_prompt", OPENAI_USER_PROMPT)
             return [
                 {
                     "role": "system",
-                    "content": PARSER_PROMPT.format(
-                        custom_instructions=INSTRUCTIONS_ADD_PG_BREAK
-                    ),
+                    "content": system_prompt,
                 },
                 {
                     "role": "user",
                     "content": [
-                        {
-                            "type": "text",
-                            "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
-                        },
+                        {"type": "text", "text": user_prompt},
                         image_message,
                     ],
                 },
             ]
         else:
+            prompt = kwargs.get("system_prompt", LLAMA_PARSER_PROMPT)
+            base_message = {"type": "text", "text": prompt}
             return [
                 {
                     "role": "user",
@@ -283,7 +291,7 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
     # Sort results by page number and combine
     all_results.sort(key=lambda x: x[0])
     all_texts = [text for _, text, _, _, _ in all_results]
-    combined_text = "<page-break>".join(all_texts)
+    combined_text = "\n\n".join(all_texts)

     return {
         "raw": combined_text,
lexoid/core/parse_type/static_parser.py CHANGED
@@ -1,12 +1,25 @@
+import os
+import re
 import tempfile
+from time import time
+from typing import Dict, List
+
 import pandas as pd
 import pdfplumber
-from typing import List, Dict
-from lexoid.core.utils import get_file_type, get_uri_rect, html_to_markdown, split_pdf
+from docx import Document
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer
 from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
-from docx import Document
+from pptx2md import ConversionConfig, convert
+
+
+from lexoid.core.utils import (
+    get_file_type,
+    get_uri_rect,
+    html_to_markdown,
+    split_md_by_headings,
+    split_pdf,
+)


 def parse_static_doc(path: str, **kwargs) -> Dict:
@@ -47,8 +60,11 @@ def parse_static_doc(path: str, **kwargs) -> Dict:
             "parent_title": kwargs.get("parent_title", ""),
             "recursive_docs": [],
         }
-    elif file_type == "text/csv":
-        df = pd.read_csv(path)
+    elif file_type == "text/csv" or "spreadsheet" in file_type:
+        if "spreadsheet" in file_type:
+            df = pd.read_excel(path)
+        else:
+            df = pd.read_csv(path)
         content = df.to_markdown(index=False)
         return {
             "raw": content,
@@ -58,6 +74,27 @@ def parse_static_doc(path: str, **kwargs) -> Dict:
             "parent_title": kwargs.get("parent_title", ""),
             "recursive_docs": [],
         }
+    elif "presentation" in file_type:
+        md_path = os.path.join(kwargs["temp_dir"], f"{int(time())}.md")
+        convert(
+            ConversionConfig(
+                pptx_path=path,
+                output_path=md_path,
+                image_dir=None,
+                disable_image=True,
+                disable_notes=True,
+            )
+        )
+        with open(md_path, "r") as f:
+            content = f.read()
+        return {
+            "raw": content,
+            "segments": split_md_by_headings(content, "#"),
+            "title": kwargs["title"],
+            "url": kwargs.get("url", ""),
+            "parent_title": kwargs.get("parent_title", ""),
+            "recursive_docs": [],
+        }
     else:
         raise ValueError(f"Unsupported file type: {file_type}")

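Roughly how the new presentation branch is reached from the public API: `.pptx` input is converted to Markdown with pptx2md and split into per-heading segments, and (per the api.py changes above) LLM_PARSE requests on `.pptx` fall back to STATIC_PARSE anyway. A sketch with a placeholder file name, assuming the combined Markdown is exposed under `"raw"` as in the per-chunk results:

```python
from lexoid.api import parse

# The deck is converted via pptx2md inside a temporary directory and the
# resulting Markdown is segmented with split_md_by_headings(content, "#").
result = parse("slides.pptx", parser_type="STATIC_PARSE")
print(result["raw"])
```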
@@ -168,6 +205,25 @@ def embed_links_in_text(page, text, links):
     return text


+def detect_indentation_level(word, base_left_position):
+    """Determine indentation level based on left position difference."""
+    left_diff = word["x0"] - base_left_position
+    if left_diff < 5:
+        return 0
+    return int(left_diff // 25) + 1
+
+
+def embed_email_links(text: str) -> str:
+    """
+    Detect email addresses in text and wrap them in angle brackets.
+    For example, 'mail@example.com' becomes '<mail@example.com>'.
+    """
+    email_pattern = re.compile(
+        r"(?<![<\[])(?P<email>\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)(?![>\]])"
+    )
+    return email_pattern.sub(lambda match: f"<{match.group('email')}>", text)
+
+
 def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     """
     Process a single page's content and return formatted markdown text.
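For illustration, a standalone copy of the new `embed_email_links` helper showing its behaviour on a small input (addresses already wrapped in `<...>` or `[...]` are left alone):

```python
import re


def embed_email_links(text: str) -> str:
    # Same pattern as in the hunk above: the negative lookbehind/lookahead
    # skip addresses that are already wrapped.
    email_pattern = re.compile(
        r"(?<![<\[])(?P<email>\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)(?![>\]])"
    )
    return email_pattern.sub(lambda m: f"<{m.group('email')}>", text)


print(embed_email_links("Contact mail@example.com or <already@wrapped.io>."))
# Contact <mail@example.com> or <already@wrapped.io>.
```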
@@ -178,7 +234,26 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     last_y = None
     x_tolerance = kwargs.get("x_tolerance", 1)
     y_tolerance = kwargs.get("y_tolerance", 5)
-
+    next_h_line_idx = 0
+
+    # First detect horizontal lines that could be markdown rules
+    horizontal_lines = []
+    if hasattr(page, "lines"):
+        for line in page.lines:
+            # Check if line is approximately horizontal (within 5 degrees)
+            if (
+                abs(line["height"]) < 0.1
+                or abs(line["width"]) > abs(line["height"]) * 20
+            ):
+                # Consider it a horizontal rule candidate
+                horizontal_lines.append(
+                    {
+                        "top": line["top"],
+                        "bottom": line["bottom"],
+                        "x0": line["x0"],
+                        "x1": line["x1"],
+                    }
+                )
     # Table settings
     vertical_strategy = kwargs.get("vertical_strategy", "lines")
     horizontal_strategy = kwargs.get("horizontal_strategy", "lines")
@@ -208,14 +283,43 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         extra_attrs=["size", "top", "bottom", "fontname"],
     )

-    def format_paragraph(text_elements):
-        """Format a paragraph with styling applied to individual words"""
-        formatted_words = []
-        for element in text_elements:
-            text = element["text"]
-            formatting = get_text_formatting(element)
-            formatted_words.append(apply_markdown_formatting(text, formatting))
-        return f"{' '.join(formatted_words)}\n\n"
+    if words:
+        font_sizes = [w.get("size", 12) for w in words]
+        body_font_size = max(set(font_sizes), key=font_sizes.count)
+    else:
+        body_font_size = 12
+
+    left_positions = []
+    prev_bottom = None
+
+    for word in words:
+        # Check if this is likely a new line (first word in line)
+        if prev_bottom is None or abs(word["top"] - prev_bottom) > y_tolerance:
+            left_positions.append(word["x0"])
+        prev_bottom = word["top"]
+
+    # Find the most common minimum left position (mode)
+    if left_positions:
+        base_left = max(set(left_positions), key=left_positions.count)
+    else:
+        base_left = 0
+
+    for line in horizontal_lines:
+        # Check each word to see if it overlaps with this line
+        for word in words:
+            # Get word bounding box coordinates
+            word_left = word["x0"]
+            word_right = word["x1"]
+            word_top = word["top"]
+            word_bottom = word["bottom"]
+
+            # Check if word overlaps with line in both x and y dimensions
+            x_overlap = (word_left <= line["x1"]) and (word_right >= line["x0"])
+            y_overlap = (word_top <= line["bottom"]) and (word_bottom >= line["top"])
+
+            if x_overlap and y_overlap:
+                word["text"] = f"~~{word['text']}~~"
+                break

     def get_text_formatting(word):
         """
@@ -225,19 +329,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         formatting = {
             "bold": False,
             "italic": False,
+            "monospace": False,
         }
-
         # Check font name for common bold/italic indicators
         font_name = word.get("fontname", "").lower()
         if any(style in font_name for style in ["bold", "heavy", "black"]):
             formatting["bold"] = True
         if any(style in font_name for style in ["italic", "oblique"]):
             formatting["italic"] = True
-
+        if "mono" in font_name:  # Detect monospace fonts
+            formatting["monospace"] = True
         return formatting

     def apply_markdown_formatting(text, formatting):
         """Apply markdown formatting to text based on detected styles"""
+        if formatting["monospace"]:
+            text = f"`{text}`"
         if formatting["bold"] and formatting["italic"]:
             text = f"***{text}***"
         elif formatting["bold"]:
@@ -246,12 +353,64 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
             text = f"*{text}*"
         return text

-    def detect_heading_level(font_size):
-        if font_size >= 24:
+    def format_paragraph(text_elements):
+        """
+        Format a paragraph with styling applied to individual words.
+        If all words are monospace, treat the paragraph as a code block.
+        Otherwise, wrap monospace words with backticks (`).
+        """
+
+        all_monospace = True
+        formatted_words = []
+
+        for element in text_elements:
+            if isinstance(element, tuple) and element[0] == "indent":
+                indent = "&nbsp;" * element[1] * 3
+                formatted_words.append(indent)
+                continue
+
+            text = element["text"]
+            formatting = get_text_formatting(element)
+
+            if formatting.get("monospace", False):
+                # Wrap monospace words with backticks
+                formatted_words.append(f"`{text}`")
+            else:
+                all_monospace = False
+                # Apply other markdown formatting
+                formatted_words.append(apply_markdown_formatting(text, formatting))
+
+        # If all words are monospace, format as a code block
+        if all_monospace:
+            if isinstance(text_elements[0], tuple):
+                indent_str = " " * text_elements[0][1]
+                if len(text_elements) > 1:
+                    text_elements = text_elements[1:]
+                    text_elements[0]["text"] = indent_str + text_elements[0]["text"]
+                else:
+                    return indent_str
+            code_content = " ".join([element["text"] for element in text_elements])
+            return f"```\n{code_content}\n```\n\n"
+
+        # Otherwise, return the formatted paragraph
+        return f"{' '.join(formatted_words)}\n\n"
+
+    def detect_heading_level(font_size, body_font_size):
+        """Determine heading level based on font size ratio.
+
+        Args:
+            font_size: The font size to evaluate
+            body_font_size: The base body font size for comparison
+
+        Returns:
+            int: The heading level (1-3) or None if not a heading
+        """
+        size_ratio = font_size / body_font_size
+        if size_ratio >= 2:
             return 1
-        elif font_size >= 20:
+        elif size_ratio >= 1.4:
             return 2
-        elif font_size >= 16:
+        elif size_ratio >= 1.2:
             return 3
         return None

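Heading detection now works from the ratio of a word's font size to the page's dominant (body) font size instead of absolute point sizes. A standalone copy of the helper to show where the thresholds land, assuming an 11 pt body font:

```python
def detect_heading_level(font_size, body_font_size):
    """Same ratio thresholds as in the hunk above."""
    size_ratio = font_size / body_font_size
    if size_ratio >= 2:
        return 1
    elif size_ratio >= 1.4:
        return 2
    elif size_ratio >= 1.2:
        return 3
    return None


assert detect_heading_level(22, 11) == 1     # ratio 2.0
assert detect_heading_level(16, 11) == 2     # ratio ~1.45
assert detect_heading_level(13.5, 11) == 3   # ratio ~1.23
assert detect_heading_level(11, 11) is None  # body text
```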
@@ -268,18 +427,41 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
            )
        )
    tables.sort(key=lambda x: x[1]["bottom"])
+
     content_elements = []
-    for word in words:
+    for line in horizontal_lines:
+        content_elements.append(
+            (
+                "horizontal_line",
+                {
+                    "top": line["top"],
+                    "bottom": line["bottom"],
+                    "x0": line["x0"],
+                    "x1": line["x1"],
+                },
+            )
+        )
+
+    for i, word in enumerate(words):
         while tables and word["bottom"] > tables[0][1]["bottom"]:
             content_elements.append(tables.pop(0))
+
+        # Equate position of words on the same line
+        if i > 0 and abs(word["top"] - words[i - 1]["top"]) < 3:
+            word["top"] = words[i - 1]["top"]
+
         content_elements.append(("word", word))
     content_elements.extend(tables)

+    content_elements.sort(
+        key=lambda x: x[1]["top"] if isinstance(x[1], dict) and "top" in x[1] else 0
+    )
+
     for element_type, element in content_elements:
+        # If there are any pending paragraphs or headings, add them first
         if element_type == "table":
-            # If there are any pending paragraphs or headings, add them first
             if current_heading:
-                level = detect_heading_level(current_heading[0]["size"])
+                level = detect_heading_level(current_heading[0]["size"], body_font_size)
                 heading_text = format_paragraph(current_heading)
                 markdown_content.append(f"{'#' * level} {heading_text}")
                 current_heading = []
@@ -289,11 +471,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
             # Add the table
             markdown_content.append(element["content"])
             last_y = element["bottom"]
+        elif element_type == "horizontal_line":
+            while (next_h_line_idx < len(horizontal_lines)) and (
+                last_y is not None
+                and horizontal_lines[next_h_line_idx]["top"] <= last_y
+            ):
+                # Insert the horizontal rule *after* the preceding text
+                if current_paragraph:  # Flush any pending paragraph
+                    markdown_content.append(format_paragraph(current_paragraph))
+                    current_paragraph = []
+                markdown_content.append("\n---\n\n")  # Add the rule
+                next_h_line_idx += 1
         else:
             # Process word
             word = element
             # Check if this might be a heading
-            heading_level = detect_heading_level(word["size"])
+            heading_level = detect_heading_level(word["size"], body_font_size)

             # Detect new line based on vertical position
             is_new_line = last_y is not None and abs(word["top"] - last_y) > y_tolerance
@@ -301,7 +494,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
             if is_new_line:
                 # If we were collecting a heading
                 if current_heading:
-                    level = detect_heading_level(current_heading[0]["size"])
+                    level = detect_heading_level(
+                        current_heading[0]["size"], body_font_size
+                    )
                     heading_text = format_paragraph(current_heading)
                     markdown_content.append(f"{'#' * level} {heading_text}")
                     current_heading = []
@@ -311,6 +506,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
                     markdown_content.append(format_paragraph(current_paragraph))
                     current_paragraph = []

+                indent_level = detect_indentation_level(word, base_left)
+                current_paragraph.append(("indent", indent_level))
+
             # Add word to appropriate collection
             if heading_level:
                 if current_paragraph:  # Flush any pending paragraph
@@ -319,7 +517,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
                 current_heading.append(word)
             else:
                 if current_heading:  # Flush any pending heading
-                    level = detect_heading_level(current_heading[0]["size"])
+                    level = detect_heading_level(
+                        current_heading[0]["size"], body_font_size
+                    )
                     heading_text = format_paragraph(current_heading)
                     markdown_content.append(f"{'#' * level} {heading_text}")
                     current_heading = []
@@ -329,7 +529,7 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):

     # Handle remaining content
     if current_heading:
-        level = detect_heading_level(current_heading[0]["size"])
+        level = detect_heading_level(current_heading[0]["size"], body_font_size)
         heading_text = format_paragraph(current_heading)
         markdown_content.append(f"{'#' * level} {heading_text}")

@@ -348,8 +548,15 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     if links:
         content = embed_links_in_text(page, content, links)

+    content = embed_email_links(content)
+
     # Remove redundant formatting
-    content = content.replace("** **", " ").replace("* *", " ")
+    content = (
+        content.replace("** **", " ")
+        .replace("* *", " ")
+        .replace("` `", " ")
+        .replace("\n```\n\n```", "")
+    )

     return content

@@ -389,7 +596,7 @@ def parse_with_pdfplumber(path: str, **kwargs) -> Dict:
     ]

     return {
-        "raw": "<page-break>".join(page_texts),
+        "raw": "\n\n".join(page_texts),
         "segments": segments,
         "title": kwargs["title"],
         "url": kwargs.get("url", ""),
lexoid/core/utils.py CHANGED
@@ -46,7 +46,7 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):


 def create_sub_pdf(
-    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...]|int] = None
+    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...] | int] = None
 ) -> str:
     if isinstance(page_nums, int):
         page_nums = (page_nums,)
@@ -106,6 +106,8 @@ def is_supported_file_type(path: str) -> bool:
     if (
         file_type == "application/pdf"
         or "wordprocessing" in file_type
+        or "spreadsheet" in file_type
+        or "presentation" in file_type
         or file_type.startswith("image/")
         or file_type.startswith("text")
     ):
@@ -217,7 +219,7 @@ def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Di
         pattern = r"^([^\n]+)\n-+$"
         sections = re.split(pattern, markdown_content, flags=re.MULTILINE)
         # Remove empty sections and strip whitespace
-        sections = [section.strip() for section in sections if section.strip()]
+        sections = [section.strip() for section in sections]

         # Handle content before first heading if it exists
         if sections and not re.match(r"^[^\n]+\n-+$", sections[0], re.MULTILINE):
@@ -244,7 +246,7 @@ def split_md_by_headings(markdown_content: str, heading_pattern: str) -> List[Di
     headings = re.findall(regex, markdown_content, flags=re.MULTILINE)

     # Remove empty sections and strip whitespace
-    sections = [section.strip() for section in sections if section.strip()]
+    sections = [section.strip() for section in sections]

     # Handle content before first heading if it exists
     if len(sections) > len(headings):
@@ -299,6 +301,7 @@ def html_to_markdown(html: str, title: str, url: str) -> str:

     return content

+
 def get_webpage_soup(url: str) -> BeautifulSoup:
     try:
         from playwright.async_api import async_playwright
@@ -549,7 +552,7 @@ def has_hyperlink_in_pdf(path: str):
     )


-def router(path: str, priority: str = "accuracy") -> str:
+def router(path: str, priority: str = "speed") -> str:
     """
     Routes the file path to the appropriate parser based on the file type.

@@ -558,9 +561,9 @@ def router(path: str, priority: str = "accuracy") -> str:
         priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
     """
     file_type = get_file_type(path)
-    if file_type.startswith("text/"):
+    if file_type.startswith("text/") or "spreadsheet" in file_type or "presentation" in file_type:
         return "STATIC_PARSE"
-
+
     if priority == "accuracy":
         # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
         # Otherwise, use LLM_PARSE
@@ -574,13 +577,11 @@ def router(path: str, priority: str = "accuracy") -> str:
     else:
         # If the file is a PDF without images, use STATIC_PARSE
         # Otherwise, use LLM_PARSE
-        if (
-            file_type == "application/pdf"
-            and not has_image_in_pdf(path)
-        ):
+        if file_type == "application/pdf" and not has_image_in_pdf(path):
             return "STATIC_PARSE"
         return "LLM_PARSE"

+
 def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
     temp_path = os.path.join(
         temp_dir, os.path.splitext(os.path.basename(input_path))[0] + ".pdf"
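With the default priority flipped to "speed", the router now sends text, spreadsheet, and presentation files (and PDFs without images) to STATIC_PARSE unless "accuracy" is requested. A small sketch; the file names are placeholders and the PDF results depend on the actual file contents:

```python
from lexoid.core.utils import router

print(router("tables_only.pdf"))                        # likely "STATIC_PARSE"
print(router("scanned_form.pdf", priority="accuracy"))  # likely "LLM_PARSE"
print(router("slides.pptx"))                            # "STATIC_PARSE"
```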
lexoid-0.1.11.post1.dist-info/METADATA → lexoid-0.1.13.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.11.post1
+Version: 0.1.13
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -18,10 +18,12 @@ Requires-Dist: markdownify (>=0.13.1,<0.14.0)
 Requires-Dist: nest-asyncio (>=1.6.0,<2.0.0)
 Requires-Dist: openai (>=1.47.0,<2.0.0)
 Requires-Dist: opencv-python (>=4.10.0.84,<5.0.0.0)
+Requires-Dist: openpyxl (>=3.1.5,<4.0.0)
 Requires-Dist: pandas (>=2.2.3,<3.0.0)
 Requires-Dist: pdfplumber (>=0.11.4,<0.12.0)
 Requires-Dist: pikepdf (>=9.3.0,<10.0.0)
 Requires-Dist: playwright (>=1.49.0,<2.0.0)
+Requires-Dist: pptx2md (>=2.0.6,<3.0.0)
 Requires-Dist: pypdfium2 (>=4.30.0,<5.0.0)
 Requires-Dist: pyqt5 (>=5.15.11,<6.0.0) ; platform_system != "debian"
 Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
@@ -31,7 +33,20 @@ Requires-Dist: tabulate (>=0.9.0,<0.10.0)
 Requires-Dist: together (>=1.4.0,<2.0.0)
 Description-Content-Type: text/markdown

-# Lexoid
+<div align="center">
+
+```
+ ___ _______ __ __ _______ ___ ______
+| | | || |_| || || | | |
+| | | ___|| || _ || | | _ |
+| | | |___ | || | | || | | | | |
+| |___ | ___| | | | |_| || | | |_| |
+| || |___ | _ || || | | |
+|_______||_______||__| |__||_______||___| |______|
+
+```
+
+</div>

 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
 [![GitHub license](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
@@ -43,54 +58,67 @@ Lexoid is an efficient document parsing library that supports both LLM-based and
 [Documentation](https://oidlabs-com.github.io/Lexoid/)

 ## Motivation:
+
 - Use the multi-modal advancement of LLMs
 - Enable convenience for users
 - Collaborate with a permissive license

 ## Installation
+
 ### Installing with pip
+
 ```
 pip install lexoid
 ```

 To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
+
 ```
 OPENAI_API_KEY=""
 GOOGLE_API_KEY=""
 ```

 Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library):
+
 ```
 playwright install --with-deps --only-shell chromium
 ```

 ### Building `.whl` from source
+
 ```
 make build
 ```

 ### Creating a local installation
+
 To install dependencies:
+
 ```
 make install
 ```
+
 or, to install with dev-dependencies:
+
 ```
 make dev
 ```

 To activate virtual environment:
+
 ```
 source .venv/bin/activate
 ```

 ## Usage
+
 [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)

-[Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)
+[Example Colab Notebook](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)

 Here's a quick example to parse documents using Lexoid:
-``` python
+
+```python
 from lexoid.api import parse
 from lexoid.api import ParserType

@@ -103,30 +131,42 @@ print(parsed_md)
 ```

 ### Parameters
+
 - path (str): The file path or URL.
 - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
 - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
 - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
-- **kwargs: Additional arguments for the parser.
+- \*\*kwargs: Additional arguments for the parser.
+
+## Supported API Providers
+* Google
+* OpenAI
+* Hugging Face
+* Together AI
+* OpenRouter

 ## Benchmark
+
 Results aggregated across 5 iterations each for 5 documents.

 _Note:_ Benchmarks are currently done in the zero-shot setting.

-| Rank | Model | Mean Similarity | Std. Dev. | Time (s) |
-|---|---|---|---|---|
-| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 |
-| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 |
-| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 |
-| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 |
-| 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 |
-| 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 |
-| 7 | gpt-4o | 0.687 | 0.247 | 10.16 |
-| 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 |
-| 9 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 |
-| 10 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 |
-| 11 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 |
-| 12 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 |
-| 13 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 |
+| Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost($) |
+| ---- | ----------------------------------------------------- | --------------- | --------- | -------- | -------- |
+| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.000480 |
+| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
+| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
+| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
+| 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
+| 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
+| 7 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
+| 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
+| 9 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
+| 10 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
+| 11 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
+| 12 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.000060 |
+| 13 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
+| 14 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
+| 15 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
+| 16 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.80 | 0.000019 |

lexoid-0.1.11.post1.dist-info/RECORD → lexoid-0.1.13.dist-info/RECORD
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=lTkUcbGML29JrWJv4pE_ZqbzeJuHUE8b6OnijoLBEfU,11350
+lexoid/core/parse_type/llm_parser.py,sha256=rrc1Lwp-6ZAi8IVp3672mHAHUs1JefhT2rnYyQ1gA5E,11292
+lexoid/core/parse_type/static_parser.py,sha256=IovvF1GCLWFPh2-mwcgv6DpJmSVQBLnGcoIq7bwQ39Q,21299
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=6s24X3-4Y57u70HzjIS798Tg8qx6Z3mLATf4xtENE-8,19718
+lexoid-0.1.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.13.dist-info/METADATA,sha256=GHODqox4lX6qf_gjSy8ULYJZhaKKQ1BDKEUAOMi7R2U,6809
+lexoid-0.1.13.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.13.dist-info/RECORD,,
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=CIZBNvh38PJbD0OwK1Mp0qqkWxkAEBw2L_FkoCmagXA,9288
-lexoid/core/parse_type/llm_parser.py,sha256=XfsN6RAtb14p31U2jL-9QyRKpkNAGXXiK3urWJIFi2U,10625
-lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=ZB-HnSsQLmbg0zx1uHlIDnLuitENylRVCIt1nVcYrCc,19657
-lexoid-0.1.11.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.11.post1.dist-info/METADATA,sha256=b_XJEbQBQuvYNkEkJY1CYByVj1BMayP2g1H_Ybjo0VU,4844
-lexoid-0.1.11.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.11.post1.dist-info/RECORD,,