lexoid 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/core/parse_type/static_parser.py +198 -26
- {lexoid-0.1.12.dist-info → lexoid-0.1.13.dist-info}/METADATA +1 -1
- {lexoid-0.1.12.dist-info → lexoid-0.1.13.dist-info}/RECORD +5 -5
- {lexoid-0.1.12.dist-info → lexoid-0.1.13.dist-info}/LICENSE +0 -0
- {lexoid-0.1.12.dist-info → lexoid-0.1.13.dist-info}/WHEEL +0 -0
@@ -1,7 +1,8 @@
|
|
1
1
|
import os
|
2
|
+
import re
|
2
3
|
import tempfile
|
3
4
|
from time import time
|
4
|
-
from typing import
|
5
|
+
from typing import Dict, List
|
5
6
|
|
6
7
|
import pandas as pd
|
7
8
|
import pdfplumber
|
@@ -9,14 +10,15 @@ from docx import Document
|
|
9
10
|
from pdfminer.high_level import extract_pages
|
10
11
|
from pdfminer.layout import LTTextContainer
|
11
12
|
from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
|
12
|
-
from pptx2md import
|
13
|
+
from pptx2md import ConversionConfig, convert
|
14
|
+
|
13
15
|
|
14
16
|
from lexoid.core.utils import (
|
15
17
|
get_file_type,
|
16
18
|
get_uri_rect,
|
17
19
|
html_to_markdown,
|
18
|
-
split_pdf,
|
19
20
|
split_md_by_headings,
|
21
|
+
split_pdf,
|
20
22
|
)
|
21
23
|
|
22
24
|
|
@@ -203,6 +205,25 @@ def embed_links_in_text(page, text, links):
|
|
203
205
|
return text
|
204
206
|
|
205
207
|
|
208
|
+
def detect_indentation_level(word, base_left_position, tolerance=5, indent_width=25):
    """Determine indentation level based on left position difference.

    Args:
        word: Mapping with an ``"x0"`` entry giving the word's left edge.
        base_left_position: Left edge treated as "no indentation", in the
            same units as ``word["x0"]``.
        tolerance: Offsets strictly smaller than this (including negative
            offsets) count as level 0. Defaults to 5.
        indent_width: Horizontal distance that makes up one additional
            indentation level. Defaults to 25.

    Returns:
        int: 0 when the word sits within ``tolerance`` of the base position,
        otherwise ``offset // indent_width + 1``.

    Raises:
        ValueError: If ``indent_width`` is not positive.
    """
    if indent_width <= 0:
        raise ValueError("indent_width must be positive")

    left_diff = word["x0"] - base_left_position
    if left_diff < tolerance:
        return 0
    # Any offset past the tolerance is at least level 1; each further full
    # ``indent_width`` adds one more level.
    return int(left_diff // indent_width) + 1
|
214
|
+
|
215
|
+
|
216
|
+
# Compiled once at import time so repeated page processing reuses the pattern.
_EMAIL_PATTERN = re.compile(
    # Do not start right after an opening bracket, and do not start in the
    # middle of a local part (otherwise "<john.doe@...>" could match from
    # ".doe" onwards and corrupt an address that is already wrapped).
    r"(?<![<\[A-Za-z0-9._%+-])"
    # Leave the target of an existing mailto: link untouched.
    r"(?<!mailto:)"
    r"(?P<email>[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)"
    # Reject the match if the address, possibly continued by further domain
    # labels, runs into a closing bracket. Without the label part the engine
    # could backtrack to a shorter domain and wrap only a prefix of it.
    r"(?!(?:\.[A-Za-z0-9-]+)*[>\]])"
)


def embed_email_links(text: str) -> str:
    """
    Detect email addresses in text and wrap them in angle brackets.
    For example, 'mail@example.com' becomes '<mail@example.com>'.

    Addresses that are already enclosed in ``<...>`` or ``[...]`` and
    addresses that directly follow ``mailto:`` are left unchanged.

    Args:
        text: The text to scan.

    Returns:
        The text with each bare email address wrapped in ``<`` and ``>``.
    """
    return _EMAIL_PATTERN.sub(r"<\g<email>>", text)
|
225
|
+
|
226
|
+
|
206
227
|
def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
207
228
|
"""
|
208
229
|
Process a single page's content and return formatted markdown text.
|
@@ -213,7 +234,26 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
213
234
|
last_y = None
|
214
235
|
x_tolerance = kwargs.get("x_tolerance", 1)
|
215
236
|
y_tolerance = kwargs.get("y_tolerance", 5)
|
216
|
-
|
237
|
+
next_h_line_idx = 0
|
238
|
+
|
239
|
+
# First detect horizontal lines that could be markdown rules
|
240
|
+
horizontal_lines = []
|
241
|
+
if hasattr(page, "lines"):
|
242
|
+
for line in page.lines:
|
243
|
+
# Check if line is approximately horizontal (within 5 degrees)
|
244
|
+
if (
|
245
|
+
abs(line["height"]) < 0.1
|
246
|
+
or abs(line["width"]) > abs(line["height"]) * 20
|
247
|
+
):
|
248
|
+
# Consider it a horizontal rule candidate
|
249
|
+
horizontal_lines.append(
|
250
|
+
{
|
251
|
+
"top": line["top"],
|
252
|
+
"bottom": line["bottom"],
|
253
|
+
"x0": line["x0"],
|
254
|
+
"x1": line["x1"],
|
255
|
+
}
|
256
|
+
)
|
217
257
|
# Table settings
|
218
258
|
vertical_strategy = kwargs.get("vertical_strategy", "lines")
|
219
259
|
horizontal_strategy = kwargs.get("horizontal_strategy", "lines")
|
@@ -243,14 +283,43 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
243
283
|
extra_attrs=["size", "top", "bottom", "fontname"],
|
244
284
|
)
|
245
285
|
|
246
|
-
|
247
|
-
""
|
248
|
-
|
249
|
-
|
250
|
-
|
251
|
-
|
252
|
-
|
253
|
-
|
286
|
+
if words:
|
287
|
+
font_sizes = [w.get("size", 12) for w in words]
|
288
|
+
body_font_size = max(set(font_sizes), key=font_sizes.count)
|
289
|
+
else:
|
290
|
+
body_font_size = 12
|
291
|
+
|
292
|
+
left_positions = []
|
293
|
+
prev_bottom = None
|
294
|
+
|
295
|
+
for word in words:
|
296
|
+
# Check if this is likely a new line (first word in line)
|
297
|
+
if prev_bottom is None or abs(word["top"] - prev_bottom) > y_tolerance:
|
298
|
+
left_positions.append(word["x0"])
|
299
|
+
prev_bottom = word["top"]
|
300
|
+
|
301
|
+
# Find the most common minimum left position (mode)
|
302
|
+
if left_positions:
|
303
|
+
base_left = max(set(left_positions), key=left_positions.count)
|
304
|
+
else:
|
305
|
+
base_left = 0
|
306
|
+
|
307
|
+
for line in horizontal_lines:
|
308
|
+
# Check each word to see if it overlaps with this line
|
309
|
+
for word in words:
|
310
|
+
# Get word bounding box coordinates
|
311
|
+
word_left = word["x0"]
|
312
|
+
word_right = word["x1"]
|
313
|
+
word_top = word["top"]
|
314
|
+
word_bottom = word["bottom"]
|
315
|
+
|
316
|
+
# Check if word overlaps with line in both x and y dimensions
|
317
|
+
x_overlap = (word_left <= line["x1"]) and (word_right >= line["x0"])
|
318
|
+
y_overlap = (word_top <= line["bottom"]) and (word_bottom >= line["top"])
|
319
|
+
|
320
|
+
if x_overlap and y_overlap:
|
321
|
+
word["text"] = f"~~{word['text']}~~"
|
322
|
+
break
|
254
323
|
|
255
324
|
def get_text_formatting(word):
|
256
325
|
"""
|
@@ -260,19 +329,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
260
329
|
formatting = {
|
261
330
|
"bold": False,
|
262
331
|
"italic": False,
|
332
|
+
"monospace": False,
|
263
333
|
}
|
264
|
-
|
265
334
|
# Check font name for common bold/italic indicators
|
266
335
|
font_name = word.get("fontname", "").lower()
|
267
336
|
if any(style in font_name for style in ["bold", "heavy", "black"]):
|
268
337
|
formatting["bold"] = True
|
269
338
|
if any(style in font_name for style in ["italic", "oblique"]):
|
270
339
|
formatting["italic"] = True
|
271
|
-
|
340
|
+
if "mono" in font_name: # Detect monospace fonts
|
341
|
+
formatting["monospace"] = True
|
272
342
|
return formatting
|
273
343
|
|
274
344
|
def apply_markdown_formatting(text, formatting):
|
275
345
|
"""Apply markdown formatting to text based on detected styles"""
|
346
|
+
if formatting["monospace"]:
|
347
|
+
text = f"`{text}`"
|
276
348
|
if formatting["bold"] and formatting["italic"]:
|
277
349
|
text = f"***{text}***"
|
278
350
|
elif formatting["bold"]:
|
@@ -281,12 +353,64 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
281
353
|
text = f"*{text}*"
|
282
354
|
return text
|
283
355
|
|
284
|
-
def
|
285
|
-
|
356
|
+
def format_paragraph(text_elements):
    """
    Format a paragraph with styling applied to individual words.
    If all words are monospace, treat the paragraph as a code block.
    Otherwise, wrap monospace words with backticks (`).

    Args:
        text_elements: Sequence of word dicts (each with a ``"text"`` entry)
            optionally preceded by an ``("indent", level)`` tuple.

    Returns:
        str: The formatted paragraph. An empty input yields ``""``; input
        holding only an indent marker yields just the indentation spaces.
        The input elements are not modified.
    """
    if not text_elements:
        # Nothing to format; avoids indexing into an empty sequence below.
        return ""

    all_monospace = True
    formatted_words = []

    for element in text_elements:
        if isinstance(element, tuple) and element[0] == "indent":
            # Inline paragraphs use three spaces per indentation level.
            formatted_words.append(" " * element[1] * 3)
            continue

        text = element["text"]
        formatting = get_text_formatting(element)

        if formatting.get("monospace", False):
            # Wrap monospace words with backticks
            formatted_words.append(f"`{text}`")
        else:
            all_monospace = False
            # Apply other markdown formatting
            formatted_words.append(apply_markdown_formatting(text, formatting))

    if not all_monospace:
        return f"{' '.join(formatted_words)}\n\n"

    # Every word is monospace (or there are no words): emit a code block.
    code_elements = list(text_elements)
    indent_str = ""
    if isinstance(code_elements[0], tuple):
        # Code blocks use one space per indentation level.
        indent_str = " " * code_elements[0][1]
        code_elements = code_elements[1:]
        if not code_elements:
            return indent_str

    # Prefix the indentation onto the joined text instead of writing it back
    # into the first word dict, so the caller's data stays untouched.
    code_content = indent_str + " ".join(
        element["text"] for element in code_elements
    )
    return f"```\n{code_content}\n```\n\n"
|
397
|
+
|
398
|
+
def detect_heading_level(font_size, body_font_size):
    """Determine heading level based on font size ratio.

    Args:
        font_size: The font size to evaluate
        body_font_size: The base body font size for comparison

    Returns:
        int: The heading level (1-3) or None if not a heading. None is also
        returned when ``body_font_size`` is missing or not positive, since no
        meaningful ratio can be computed in that case.
    """
    if not body_font_size or body_font_size <= 0:
        # Guard against division by zero for degenerate font sizes.
        return None

    size_ratio = font_size / body_font_size
    if size_ratio >= 2:
        return 1
    elif size_ratio >= 1.4:
        return 2
    elif size_ratio >= 1.2:
        return 3
    return None
|
292
416
|
|
@@ -303,18 +427,41 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
303
427
|
)
|
304
428
|
)
|
305
429
|
tables.sort(key=lambda x: x[1]["bottom"])
|
430
|
+
|
306
431
|
content_elements = []
|
307
|
-
for
|
432
|
+
for line in horizontal_lines:
|
433
|
+
content_elements.append(
|
434
|
+
(
|
435
|
+
"horizontal_line",
|
436
|
+
{
|
437
|
+
"top": line["top"],
|
438
|
+
"bottom": line["bottom"],
|
439
|
+
"x0": line["x0"],
|
440
|
+
"x1": line["x1"],
|
441
|
+
},
|
442
|
+
)
|
443
|
+
)
|
444
|
+
|
445
|
+
for i, word in enumerate(words):
|
308
446
|
while tables and word["bottom"] > tables[0][1]["bottom"]:
|
309
447
|
content_elements.append(tables.pop(0))
|
448
|
+
|
449
|
+
# Equate position of words on the same line
|
450
|
+
if i > 0 and abs(word["top"] - words[i - 1]["top"]) < 3:
|
451
|
+
word["top"] = words[i - 1]["top"]
|
452
|
+
|
310
453
|
content_elements.append(("word", word))
|
311
454
|
content_elements.extend(tables)
|
312
455
|
|
456
|
+
content_elements.sort(
|
457
|
+
key=lambda x: x[1]["top"] if isinstance(x[1], dict) and "top" in x[1] else 0
|
458
|
+
)
|
459
|
+
|
313
460
|
for element_type, element in content_elements:
|
461
|
+
# If there are any pending paragraphs or headings, add them first
|
314
462
|
if element_type == "table":
|
315
|
-
# If there are any pending paragraphs or headings, add them first
|
316
463
|
if current_heading:
|
317
|
-
level = detect_heading_level(current_heading[0]["size"])
|
464
|
+
level = detect_heading_level(current_heading[0]["size"], body_font_size)
|
318
465
|
heading_text = format_paragraph(current_heading)
|
319
466
|
markdown_content.append(f"{'#' * level} {heading_text}")
|
320
467
|
current_heading = []
|
@@ -324,11 +471,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
324
471
|
# Add the table
|
325
472
|
markdown_content.append(element["content"])
|
326
473
|
last_y = element["bottom"]
|
474
|
+
elif element_type == "horizontal_line":
|
475
|
+
while (next_h_line_idx < len(horizontal_lines)) and (
|
476
|
+
last_y is not None
|
477
|
+
and horizontal_lines[next_h_line_idx]["top"] <= last_y
|
478
|
+
):
|
479
|
+
# Insert the horizontal rule *after* the preceding text
|
480
|
+
if current_paragraph: # Flush any pending paragraph
|
481
|
+
markdown_content.append(format_paragraph(current_paragraph))
|
482
|
+
current_paragraph = []
|
483
|
+
markdown_content.append("\n---\n\n") # Add the rule
|
484
|
+
next_h_line_idx += 1
|
327
485
|
else:
|
328
486
|
# Process word
|
329
487
|
word = element
|
330
488
|
# Check if this might be a heading
|
331
|
-
heading_level = detect_heading_level(word["size"])
|
489
|
+
heading_level = detect_heading_level(word["size"], body_font_size)
|
332
490
|
|
333
491
|
# Detect new line based on vertical position
|
334
492
|
is_new_line = last_y is not None and abs(word["top"] - last_y) > y_tolerance
|
@@ -336,7 +494,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
336
494
|
if is_new_line:
|
337
495
|
# If we were collecting a heading
|
338
496
|
if current_heading:
|
339
|
-
level = detect_heading_level(
|
497
|
+
level = detect_heading_level(
|
498
|
+
current_heading[0]["size"], body_font_size
|
499
|
+
)
|
340
500
|
heading_text = format_paragraph(current_heading)
|
341
501
|
markdown_content.append(f"{'#' * level} {heading_text}")
|
342
502
|
current_heading = []
|
@@ -346,6 +506,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
346
506
|
markdown_content.append(format_paragraph(current_paragraph))
|
347
507
|
current_paragraph = []
|
348
508
|
|
509
|
+
indent_level = detect_indentation_level(word, base_left)
|
510
|
+
current_paragraph.append(("indent", indent_level))
|
511
|
+
|
349
512
|
# Add word to appropriate collection
|
350
513
|
if heading_level:
|
351
514
|
if current_paragraph: # Flush any pending paragraph
|
@@ -354,7 +517,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
354
517
|
current_heading.append(word)
|
355
518
|
else:
|
356
519
|
if current_heading: # Flush any pending heading
|
357
|
-
level = detect_heading_level(
|
520
|
+
level = detect_heading_level(
|
521
|
+
current_heading[0]["size"], body_font_size
|
522
|
+
)
|
358
523
|
heading_text = format_paragraph(current_heading)
|
359
524
|
markdown_content.append(f"{'#' * level} {heading_text}")
|
360
525
|
current_heading = []
|
@@ -364,7 +529,7 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
364
529
|
|
365
530
|
# Handle remaining content
|
366
531
|
if current_heading:
|
367
|
-
level = detect_heading_level(current_heading[0]["size"])
|
532
|
+
level = detect_heading_level(current_heading[0]["size"], body_font_size)
|
368
533
|
heading_text = format_paragraph(current_heading)
|
369
534
|
markdown_content.append(f"{'#' * level} {heading_text}")
|
370
535
|
|
@@ -383,8 +548,15 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
|
|
383
548
|
if links:
|
384
549
|
content = embed_links_in_text(page, content, links)
|
385
550
|
|
551
|
+
content = embed_email_links(content)
|
552
|
+
|
386
553
|
# Remove redundant formatting
|
387
|
-
content =
|
554
|
+
content = (
|
555
|
+
content.replace("** **", " ")
|
556
|
+
.replace("* *", " ")
|
557
|
+
.replace("` `", " ")
|
558
|
+
.replace("\n```\n\n```", "")
|
559
|
+
)
|
388
560
|
|
389
561
|
return content
|
390
562
|
|
@@ -1,9 +1,9 @@
|
|
1
1
|
lexoid/api.py,sha256=lTkUcbGML29JrWJv4pE_ZqbzeJuHUE8b6OnijoLBEfU,11350
|
2
2
|
lexoid/core/parse_type/llm_parser.py,sha256=rrc1Lwp-6ZAi8IVp3672mHAHUs1JefhT2rnYyQ1gA5E,11292
|
3
|
-
lexoid/core/parse_type/static_parser.py,sha256=
|
3
|
+
lexoid/core/parse_type/static_parser.py,sha256=IovvF1GCLWFPh2-mwcgv6DpJmSVQBLnGcoIq7bwQ39Q,21299
|
4
4
|
lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
|
5
5
|
lexoid/core/utils.py,sha256=6s24X3-4Y57u70HzjIS798Tg8qx6Z3mLATf4xtENE-8,19718
|
6
|
-
lexoid-0.1.
|
7
|
-
lexoid-0.1.
|
8
|
-
lexoid-0.1.
|
9
|
-
lexoid-0.1.
|
6
|
+
lexoid-0.1.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
7
|
+
lexoid-0.1.13.dist-info/METADATA,sha256=GHODqox4lX6qf_gjSy8ULYJZhaKKQ1BDKEUAOMi7R2U,6809
|
8
|
+
lexoid-0.1.13.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
9
|
+
lexoid-0.1.13.dist-info/RECORD,,
|
File without changes
|
File without changes
|