lexoid 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,8 @@
1
1
  import os
2
+ import re
2
3
  import tempfile
3
4
  from time import time
4
- from typing import List, Dict
5
+ from typing import Dict, List
5
6
 
6
7
  import pandas as pd
7
8
  import pdfplumber
@@ -9,14 +10,15 @@ from docx import Document
9
10
  from pdfminer.high_level import extract_pages
10
11
  from pdfminer.layout import LTTextContainer
11
12
  from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
12
- from pptx2md import convert, ConversionConfig
13
+ from pptx2md import ConversionConfig, convert
14
+
13
15
 
14
16
  from lexoid.core.utils import (
15
17
  get_file_type,
16
18
  get_uri_rect,
17
19
  html_to_markdown,
18
- split_pdf,
19
20
  split_md_by_headings,
21
+ split_pdf,
20
22
  )
21
23
 
22
24
 
@@ -203,6 +205,25 @@ def embed_links_in_text(page, text, links):
203
205
  return text
204
206
 
205
207
 
208
def detect_indentation_level(word, base_left_position, tolerance=5, indent_width=25):
    """Determine indentation level based on left position difference.

    Args:
        word: Mapping with an "x0" entry giving the word's left edge.
        base_left_position: Left edge that counts as indentation level 0.
        tolerance: Offsets smaller than this are treated as level 0, so
            small horizontal jitter does not register as indentation.
        indent_width: Horizontal distance covered by each further level.

    Returns:
        int: 0 when the offset is below ``tolerance`` (including negative
        offsets); otherwise ``offset // indent_width + 1``. With the
        defaults, offsets in [5, 25) give level 1 and [25, 50) give level 2.

    Raises:
        ValueError: If ``indent_width`` is not positive.
    """
    if indent_width <= 0:
        raise ValueError("indent_width must be positive")

    left_diff = word["x0"] - base_left_position
    if left_diff < tolerance:
        return 0
    # The +1 makes any offset at or past the tolerance at least level 1.
    return int(left_diff // indent_width) + 1
214
+
215
+
216
# An optional opening/closing bracket is consumed together with the address so
# that an already-delimited address is matched as one unit. Without this, the
# scan could restart inside the address (e.g. after a "." in the local part)
# and wrap only a fragment of it.
_EMAIL_PATTERN = re.compile(
    r"(?P<open>[<\[])?"
    r"(?P<email>\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)"
    r"(?P<close>[>\]])?"
)


def embed_email_links(text: str) -> str:
    """
    Detect email addresses in text and wrap them in angle brackets.
    For example, 'mail@example.com' becomes '<mail@example.com>'.

    Addresses that are already preceded by '<' or '[' or followed by '>' or
    ']' are left exactly as they are, including addresses with a dotted
    local part and a multi-label domain such as
    '<first.last@example.co.uk>'.

    Args:
        text: The text to scan.

    Returns:
        The text with every undelimited email address wrapped in '<' and '>'.
    """

    def _wrap(match):
        # Any adjacent bracket means the address is already delimited.
        if match.group("open") or match.group("close"):
            return match.group(0)
        return f"<{match.group('email')}>"

    return _EMAIL_PATTERN.sub(_wrap, text)
225
+
226
+
206
227
  def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
207
228
  """
208
229
  Process a single page's content and return formatted markdown text.
@@ -213,7 +234,26 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
213
234
  last_y = None
214
235
  x_tolerance = kwargs.get("x_tolerance", 1)
215
236
  y_tolerance = kwargs.get("y_tolerance", 5)
216
-
237
+ next_h_line_idx = 0
238
+
239
+ # First detect horizontal lines that could be markdown rules
240
+ horizontal_lines = []
241
+ if hasattr(page, "lines"):
242
+ for line in page.lines:
243
+ # Check if line is approximately horizontal (within 5 degrees)
244
+ if (
245
+ abs(line["height"]) < 0.1
246
+ or abs(line["width"]) > abs(line["height"]) * 20
247
+ ):
248
+ # Consider it a horizontal rule candidate
249
+ horizontal_lines.append(
250
+ {
251
+ "top": line["top"],
252
+ "bottom": line["bottom"],
253
+ "x0": line["x0"],
254
+ "x1": line["x1"],
255
+ }
256
+ )
217
257
  # Table settings
218
258
  vertical_strategy = kwargs.get("vertical_strategy", "lines")
219
259
  horizontal_strategy = kwargs.get("horizontal_strategy", "lines")
@@ -243,14 +283,43 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
243
283
  extra_attrs=["size", "top", "bottom", "fontname"],
244
284
  )
245
285
 
246
- def format_paragraph(text_elements):
247
- """Format a paragraph with styling applied to individual words"""
248
- formatted_words = []
249
- for element in text_elements:
250
- text = element["text"]
251
- formatting = get_text_formatting(element)
252
- formatted_words.append(apply_markdown_formatting(text, formatting))
253
- return f"{' '.join(formatted_words)}\n\n"
286
+ if words:
287
+ font_sizes = [w.get("size", 12) for w in words]
288
+ body_font_size = max(set(font_sizes), key=font_sizes.count)
289
+ else:
290
+ body_font_size = 12
291
+
292
+ left_positions = []
293
+ prev_bottom = None
294
+
295
+ for word in words:
296
+ # Check if this is likely a new line (first word in line)
297
+ if prev_bottom is None or abs(word["top"] - prev_bottom) > y_tolerance:
298
+ left_positions.append(word["x0"])
299
+ prev_bottom = word["top"]
300
+
301
+ # Find the most common minimum left position (mode)
302
+ if left_positions:
303
+ base_left = max(set(left_positions), key=left_positions.count)
304
+ else:
305
+ base_left = 0
306
+
307
+ for line in horizontal_lines:
308
+ # Check each word to see if it overlaps with this line
309
+ for word in words:
310
+ # Get word bounding box coordinates
311
+ word_left = word["x0"]
312
+ word_right = word["x1"]
313
+ word_top = word["top"]
314
+ word_bottom = word["bottom"]
315
+
316
+ # Check if word overlaps with line in both x and y dimensions
317
+ x_overlap = (word_left <= line["x1"]) and (word_right >= line["x0"])
318
+ y_overlap = (word_top <= line["bottom"]) and (word_bottom >= line["top"])
319
+
320
+ if x_overlap and y_overlap:
321
+ word["text"] = f"~~{word['text']}~~"
322
+ break
254
323
 
255
324
  def get_text_formatting(word):
256
325
  """
@@ -260,19 +329,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
260
329
  formatting = {
261
330
  "bold": False,
262
331
  "italic": False,
332
+ "monospace": False,
263
333
  }
264
-
265
334
  # Check font name for common bold/italic indicators
266
335
  font_name = word.get("fontname", "").lower()
267
336
  if any(style in font_name for style in ["bold", "heavy", "black"]):
268
337
  formatting["bold"] = True
269
338
  if any(style in font_name for style in ["italic", "oblique"]):
270
339
  formatting["italic"] = True
271
-
340
+ if "mono" in font_name: # Detect monospace fonts
341
+ formatting["monospace"] = True
272
342
  return formatting
273
343
 
274
344
  def apply_markdown_formatting(text, formatting):
275
345
  """Apply markdown formatting to text based on detected styles"""
346
+ if formatting["monospace"]:
347
+ text = f"`{text}`"
276
348
  if formatting["bold"] and formatting["italic"]:
277
349
  text = f"***{text}***"
278
350
  elif formatting["bold"]:
@@ -281,12 +353,64 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
281
353
  text = f"*{text}*"
282
354
  return text
283
355
 
284
- def detect_heading_level(font_size):
285
- if font_size >= 24:
356
def format_paragraph(text_elements):
    """
    Format a paragraph with styling applied to individual words.

    ``text_elements`` holds word dicts (each with a "text" entry) and may
    also hold ``("indent", level)`` marker tuples.

    If all words are monospace, the paragraph is returned as a fenced code
    block; a leading indent marker is rendered as ``level`` spaces in front
    of the code. Otherwise monospace words are wrapped in backticks, other
    words get their markdown formatting, and each indent marker is rendered
    as ``level * 3`` "&nbsp;" entities.

    A paragraph with no words at all returns only the leading indent spaces
    (an empty string when there is no leading marker), and the input word
    dicts are never modified.
    """
    formatted_words = []
    word_texts = []
    all_monospace = True

    for element in text_elements:
        if isinstance(element, tuple):
            # Marker tuples carry no text; only "indent" markers render.
            if element[0] == "indent":
                formatted_words.append("&nbsp;" * element[1] * 3)
            continue

        text = element["text"]
        word_texts.append(text)
        formatting = get_text_formatting(element)

        if formatting.get("monospace", False):
            # Wrap monospace words with backticks
            formatted_words.append(f"`{text}`")
        else:
            all_monospace = False
            # Apply other markdown formatting
            formatted_words.append(apply_markdown_formatting(text, formatting))

    if all_monospace:
        # Inside a code block the indent is literal spaces, not &nbsp;.
        first = text_elements[0] if text_elements else None
        leading = " " * first[1] if isinstance(first, tuple) else ""
        if not word_texts:
            return leading
        code_content = leading + " ".join(word_texts)
        return f"```\n{code_content}\n```\n\n"

    # Otherwise, return the formatted paragraph
    return f"{' '.join(formatted_words)}\n\n"
397
+
398
def detect_heading_level(font_size, body_font_size):
    """Determine heading level based on font size ratio.

    Args:
        font_size: The font size to evaluate
        body_font_size: The base body font size for comparison

    Returns:
        int: The heading level (1-3) or None if not a heading. None is also
        returned when body_font_size is missing or not positive, because no
        meaningful ratio can be computed in that case.
    """
    # Guard the division: a zero body size would raise ZeroDivisionError.
    if not body_font_size or body_font_size <= 0:
        return None

    size_ratio = font_size / body_font_size
    if size_ratio >= 2:
        return 1
    elif size_ratio >= 1.4:
        return 2
    elif size_ratio >= 1.2:
        return 3
    return None
292
416
 
@@ -303,18 +427,41 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
303
427
  )
304
428
  )
305
429
  tables.sort(key=lambda x: x[1]["bottom"])
430
+
306
431
  content_elements = []
307
- for word in words:
432
+ for line in horizontal_lines:
433
+ content_elements.append(
434
+ (
435
+ "horizontal_line",
436
+ {
437
+ "top": line["top"],
438
+ "bottom": line["bottom"],
439
+ "x0": line["x0"],
440
+ "x1": line["x1"],
441
+ },
442
+ )
443
+ )
444
+
445
+ for i, word in enumerate(words):
308
446
  while tables and word["bottom"] > tables[0][1]["bottom"]:
309
447
  content_elements.append(tables.pop(0))
448
+
449
+ # Equate position of words on the same line
450
+ if i > 0 and abs(word["top"] - words[i - 1]["top"]) < 3:
451
+ word["top"] = words[i - 1]["top"]
452
+
310
453
  content_elements.append(("word", word))
311
454
  content_elements.extend(tables)
312
455
 
456
+ content_elements.sort(
457
+ key=lambda x: x[1]["top"] if isinstance(x[1], dict) and "top" in x[1] else 0
458
+ )
459
+
313
460
  for element_type, element in content_elements:
461
+ # If there are any pending paragraphs or headings, add them first
314
462
  if element_type == "table":
315
- # If there are any pending paragraphs or headings, add them first
316
463
  if current_heading:
317
- level = detect_heading_level(current_heading[0]["size"])
464
+ level = detect_heading_level(current_heading[0]["size"], body_font_size)
318
465
  heading_text = format_paragraph(current_heading)
319
466
  markdown_content.append(f"{'#' * level} {heading_text}")
320
467
  current_heading = []
@@ -324,11 +471,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
324
471
  # Add the table
325
472
  markdown_content.append(element["content"])
326
473
  last_y = element["bottom"]
474
+ elif element_type == "horizontal_line":
475
+ while (next_h_line_idx < len(horizontal_lines)) and (
476
+ last_y is not None
477
+ and horizontal_lines[next_h_line_idx]["top"] <= last_y
478
+ ):
479
+ # Insert the horizontal rule *after* the preceding text
480
+ if current_paragraph: # Flush any pending paragraph
481
+ markdown_content.append(format_paragraph(current_paragraph))
482
+ current_paragraph = []
483
+ markdown_content.append("\n---\n\n") # Add the rule
484
+ next_h_line_idx += 1
327
485
  else:
328
486
  # Process word
329
487
  word = element
330
488
  # Check if this might be a heading
331
- heading_level = detect_heading_level(word["size"])
489
+ heading_level = detect_heading_level(word["size"], body_font_size)
332
490
 
333
491
  # Detect new line based on vertical position
334
492
  is_new_line = last_y is not None and abs(word["top"] - last_y) > y_tolerance
@@ -336,7 +494,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
336
494
  if is_new_line:
337
495
  # If we were collecting a heading
338
496
  if current_heading:
339
- level = detect_heading_level(current_heading[0]["size"])
497
+ level = detect_heading_level(
498
+ current_heading[0]["size"], body_font_size
499
+ )
340
500
  heading_text = format_paragraph(current_heading)
341
501
  markdown_content.append(f"{'#' * level} {heading_text}")
342
502
  current_heading = []
@@ -346,6 +506,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
346
506
  markdown_content.append(format_paragraph(current_paragraph))
347
507
  current_paragraph = []
348
508
 
509
+ indent_level = detect_indentation_level(word, base_left)
510
+ current_paragraph.append(("indent", indent_level))
511
+
349
512
  # Add word to appropriate collection
350
513
  if heading_level:
351
514
  if current_paragraph: # Flush any pending paragraph
@@ -354,7 +517,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
354
517
  current_heading.append(word)
355
518
  else:
356
519
  if current_heading: # Flush any pending heading
357
- level = detect_heading_level(current_heading[0]["size"])
520
+ level = detect_heading_level(
521
+ current_heading[0]["size"], body_font_size
522
+ )
358
523
  heading_text = format_paragraph(current_heading)
359
524
  markdown_content.append(f"{'#' * level} {heading_text}")
360
525
  current_heading = []
@@ -364,7 +529,7 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
364
529
 
365
530
  # Handle remaining content
366
531
  if current_heading:
367
- level = detect_heading_level(current_heading[0]["size"])
532
+ level = detect_heading_level(current_heading[0]["size"], body_font_size)
368
533
  heading_text = format_paragraph(current_heading)
369
534
  markdown_content.append(f"{'#' * level} {heading_text}")
370
535
 
@@ -383,8 +548,15 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
383
548
  if links:
384
549
  content = embed_links_in_text(page, content, links)
385
550
 
551
+ content = embed_email_links(content)
552
+
386
553
  # Remove redundant formatting
387
- content = content.replace("** **", " ").replace("* *", " ")
554
+ content = (
555
+ content.replace("** **", " ")
556
+ .replace("* *", " ")
557
+ .replace("` `", " ")
558
+ .replace("\n```\n\n```", "")
559
+ )
388
560
 
389
561
  return content
390
562
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lexoid
3
- Version: 0.1.12
3
+ Version: 0.1.13
4
4
  Summary:
5
5
  Requires-Python: >=3.10,<4.0
6
6
  Classifier: Programming Language :: Python :: 3
@@ -1,9 +1,9 @@
1
1
  lexoid/api.py,sha256=lTkUcbGML29JrWJv4pE_ZqbzeJuHUE8b6OnijoLBEfU,11350
2
2
  lexoid/core/parse_type/llm_parser.py,sha256=rrc1Lwp-6ZAi8IVp3672mHAHUs1JefhT2rnYyQ1gA5E,11292
3
- lexoid/core/parse_type/static_parser.py,sha256=v4GWUmZVBBIF9TnbkhPBt2gspk0Oq_ujtNGnXZHLBr8,15055
3
+ lexoid/core/parse_type/static_parser.py,sha256=IovvF1GCLWFPh2-mwcgv6DpJmSVQBLnGcoIq7bwQ39Q,21299
4
4
  lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
5
5
  lexoid/core/utils.py,sha256=6s24X3-4Y57u70HzjIS798Tg8qx6Z3mLATf4xtENE-8,19718
6
- lexoid-0.1.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
7
- lexoid-0.1.12.dist-info/METADATA,sha256=XMHFMqwDj2DgSaZcZjXU881NxdPsRGBAsUyPyRsJvyU,6809
8
- lexoid-0.1.12.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
9
- lexoid-0.1.12.dist-info/RECORD,,
6
+ lexoid-0.1.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
7
+ lexoid-0.1.13.dist-info/METADATA,sha256=GHODqox4lX6qf_gjSy8ULYJZhaKKQ1BDKEUAOMi7R2U,6809
8
+ lexoid-0.1.13.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
9
+ lexoid-0.1.13.dist-info/RECORD,,