PyPI - lexoid - Versions diffs - 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl - Mend

lexoid 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

lexoid/core/parse_type/static_parser.py CHANGED Viewed

@@ -1,7 +1,8 @@
 import os
+import re
 import tempfile
 from time import time
-from typing import List, Dict
+from typing import Dict, List
 import pandas as pd
 import pdfplumber
@@ -9,14 +10,15 @@ from docx import Document
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer
 from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
-from pptx2md import convert, ConversionConfig
+from pptx2md import ConversionConfig, convert
 from lexoid.core.utils import (
     get_file_type,
     get_uri_rect,
     html_to_markdown,
-    split_pdf,
     split_md_by_headings,
+    split_pdf,
 )
@@ -203,6 +205,25 @@ def embed_links_in_text(page, text, links):
     return text
+def detect_indentation_level(word, base_left_position):
+    """Determine indentation level based on left position difference."""
+    left_diff = word["x0"] - base_left_position
+    if left_diff < 5:
+        return 0
+    return int(left_diff // 25) + 1
+def embed_email_links(text: str) -> str:
+    """
+    Detect email addresses in text and wrap them in angle brackets.
+    For example, 'mail@example.com' becomes '<mail@example.com>'.
+    """
+    email_pattern = re.compile(
+        r"(?<![<\[])(?P<email>\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b)(?![>\]])"
+    )
+    return email_pattern.sub(lambda match: f"<{match.group('email')}>", text)
 def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     """
     Process a single page's content and return formatted markdown text.
@@ -213,7 +234,26 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     last_y = None
     x_tolerance = kwargs.get("x_tolerance", 1)
     y_tolerance = kwargs.get("y_tolerance", 5)
+    next_h_line_idx = 0
+    # First detect horizontal lines that could be markdown rules
+    horizontal_lines = []
+    if hasattr(page, "lines"):
+        for line in page.lines:
+            # Check if line is approximately horizontal (within 5 degrees)
+            if (
+                abs(line["height"]) < 0.1
+                or abs(line["width"]) > abs(line["height"]) * 20
+            ):
+                # Consider it a horizontal rule candidate
+                horizontal_lines.append(
+                    {
+                        "top": line["top"],
+                        "bottom": line["bottom"],
+                        "x0": line["x0"],
+                        "x1": line["x1"],
+                    }
+                )
     # Table settings
     vertical_strategy = kwargs.get("vertical_strategy", "lines")
     horizontal_strategy = kwargs.get("horizontal_strategy", "lines")
@@ -243,14 +283,43 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         extra_attrs=["size", "top", "bottom", "fontname"],
     )
-    def format_paragraph(text_elements):
-        """Format a paragraph with styling applied to individual words"""
-        formatted_words = []
-        for element in text_elements:
-            text = element["text"]
-            formatting = get_text_formatting(element)
-            formatted_words.append(apply_markdown_formatting(text, formatting))
-        return f"{' '.join(formatted_words)}\n\n"
+    if words:
+        font_sizes = [w.get("size", 12) for w in words]
+        body_font_size = max(set(font_sizes), key=font_sizes.count)
+    else:
+        body_font_size = 12
+    left_positions = []
+    prev_bottom = None
+    for word in words:
+        # Check if this is likely a new line (first word in line)
+        if prev_bottom is None or abs(word["top"] - prev_bottom) > y_tolerance:
+            left_positions.append(word["x0"])
+        prev_bottom = word["top"]
+    # Find the most common minimum left position (mode)
+    if left_positions:
+        base_left = max(set(left_positions), key=left_positions.count)
+    else:
+        base_left = 0
+    for line in horizontal_lines:
+        # Check each word to see if it overlaps with this line
+        for word in words:
+            # Get word bounding box coordinates
+            word_left = word["x0"]
+            word_right = word["x1"]
+            word_top = word["top"]
+            word_bottom = word["bottom"]
+            # Check if word overlaps with line in both x and y dimensions
+            x_overlap = (word_left <= line["x1"]) and (word_right >= line["x0"])
+            y_overlap = (word_top <= line["bottom"]) and (word_bottom >= line["top"])
+            if x_overlap and y_overlap:
+                word["text"] = f"~~{word['text']}~~"
+                break
     def get_text_formatting(word):
         """
@@ -260,19 +329,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         formatting = {
             "bold": False,
             "italic": False,
+            "monospace": False,
         }
         # Check font name for common bold/italic indicators
         font_name = word.get("fontname", "").lower()
         if any(style in font_name for style in ["bold", "heavy", "black"]):
             formatting["bold"] = True
         if any(style in font_name for style in ["italic", "oblique"]):
             formatting["italic"] = True
+        if "mono" in font_name:  # Detect monospace fonts
+            formatting["monospace"] = True
         return formatting
     def apply_markdown_formatting(text, formatting):
         """Apply markdown formatting to text based on detected styles"""
+        if formatting["monospace"]:
+            text = f"`{text}`"
         if formatting["bold"] and formatting["italic"]:
             text = f"***{text}***"
         elif formatting["bold"]:
@@ -281,12 +353,64 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
             text = f"*{text}*"
         return text
-    def detect_heading_level(font_size):
-        if font_size >= 24:
+    def format_paragraph(text_elements):
+        """
+        Format a paragraph with styling applied to individual words.
+        If all words are monospace, treat the paragraph as a code block.
+        Otherwise, wrap monospace words with backticks (`).
+        """
+        all_monospace = True
+        formatted_words = []
+        for element in text_elements:
+            if isinstance(element, tuple) and element[0] == "indent":
+                indent = "&nbsp;" * element[1] * 3
+                formatted_words.append(indent)
+                continue
+            text = element["text"]
+            formatting = get_text_formatting(element)
+            if formatting.get("monospace", False):
+                # Wrap monospace words with backticks
+                formatted_words.append(f"`{text}`")
+            else:
+                all_monospace = False
+                # Apply other markdown formatting
+                formatted_words.append(apply_markdown_formatting(text, formatting))
+        # If all words are monospace, format as a code block
+        if all_monospace:
+            if isinstance(text_elements[0], tuple):
+                indent_str = " " * text_elements[0][1]
+                if len(text_elements) > 1:
+                    text_elements = text_elements[1:]
+                    text_elements[0]["text"] = indent_str + text_elements[0]["text"]
+                else:
+                    return indent_str
+            code_content = " ".join([element["text"] for element in text_elements])
+            return f"```\n{code_content}\n```\n\n"
+        # Otherwise, return the formatted paragraph
+        return f"{' '.join(formatted_words)}\n\n"
+    def detect_heading_level(font_size, body_font_size):
+        """Determine heading level based on font size ratio.
+        Args:
+            font_size: The font size to evaluate
+            body_font_size: The base body font size for comparison
+        Returns:
+            int: The heading level (1-3) or None if not a heading
+        """
+        size_ratio = font_size / body_font_size
+        if size_ratio >= 2:
             return 1
-        elif font_size >= 20:
+        elif size_ratio >= 1.4:
             return 2
-        elif font_size >= 16:
+        elif size_ratio >= 1.2:
             return 3
         return None
@@ -303,18 +427,41 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
             )
         )
     tables.sort(key=lambda x: x[1]["bottom"])
     content_elements = []
-    for word in words:
+    for line in horizontal_lines:
+        content_elements.append(
+            (
+                "horizontal_line",
+                {
+                    "top": line["top"],
+                    "bottom": line["bottom"],
+                    "x0": line["x0"],
+                    "x1": line["x1"],
+                },
+            )
+        )
+    for i, word in enumerate(words):
         while tables and word["bottom"] > tables[0][1]["bottom"]:
             content_elements.append(tables.pop(0))
+        # Equate position of words on the same line
+        if i > 0 and abs(word["top"] - words[i - 1]["top"]) < 3:
+            word["top"] = words[i - 1]["top"]
         content_elements.append(("word", word))
     content_elements.extend(tables)
+    content_elements.sort(
+        key=lambda x: x[1]["top"] if isinstance(x[1], dict) and "top" in x[1] else 0
+    )
     for element_type, element in content_elements:
+        # If there are any pending paragraphs or headings, add them first
         if element_type == "table":
-            # If there are any pending paragraphs or headings, add them first
             if current_heading:
-                level = detect_heading_level(current_heading[0]["size"])
+                level = detect_heading_level(current_heading[0]["size"], body_font_size)
                 heading_text = format_paragraph(current_heading)
                 markdown_content.append(f"{'#' * level} {heading_text}")
                 current_heading = []
@@ -324,11 +471,22 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
             # Add the table
             markdown_content.append(element["content"])
             last_y = element["bottom"]
+        elif element_type == "horizontal_line":
+            while (next_h_line_idx < len(horizontal_lines)) and (
+                last_y is not None
+                and horizontal_lines[next_h_line_idx]["top"] <= last_y
+            ):
+                # Insert the horizontal rule *after* the preceding text
+                if current_paragraph:  # Flush any pending paragraph
+                    markdown_content.append(format_paragraph(current_paragraph))
+                    current_paragraph = []
+                markdown_content.append("\n---\n\n")  # Add the rule
+                next_h_line_idx += 1
         else:
             # Process word
             word = element
             # Check if this might be a heading
-            heading_level = detect_heading_level(word["size"])
+            heading_level = detect_heading_level(word["size"], body_font_size)
             # Detect new line based on vertical position
             is_new_line = last_y is not None and abs(word["top"] - last_y) > y_tolerance
@@ -336,7 +494,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
             if is_new_line:
                 # If we were collecting a heading
                 if current_heading:
-                    level = detect_heading_level(current_heading[0]["size"])
+                    level = detect_heading_level(
+                        current_heading[0]["size"], body_font_size
+                    )
                     heading_text = format_paragraph(current_heading)
                     markdown_content.append(f"{'#' * level} {heading_text}")
                     current_heading = []
@@ -346,6 +506,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
                     markdown_content.append(format_paragraph(current_paragraph))
                     current_paragraph = []
+                indent_level = detect_indentation_level(word, base_left)
+                current_paragraph.append(("indent", indent_level))
             # Add word to appropriate collection
             if heading_level:
                 if current_paragraph:  # Flush any pending paragraph
@@ -354,7 +517,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
                 current_heading.append(word)
             else:
                 if current_heading:  # Flush any pending heading
-                    level = detect_heading_level(current_heading[0]["size"])
+                    level = detect_heading_level(
+                        current_heading[0]["size"], body_font_size
+                    )
                     heading_text = format_paragraph(current_heading)
                     markdown_content.append(f"{'#' * level} {heading_text}")
                     current_heading = []
@@ -364,7 +529,7 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     # Handle remaining content
     if current_heading:
-        level = detect_heading_level(current_heading[0]["size"])
+        level = detect_heading_level(current_heading[0]["size"], body_font_size)
         heading_text = format_paragraph(current_heading)
         markdown_content.append(f"{'#' * level} {heading_text}")
@@ -383,8 +548,15 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         if links:
             content = embed_links_in_text(page, content, links)
+    content = embed_email_links(content)
     # Remove redundant formatting
-    content = content.replace("** **", " ").replace("* *", " ")
+    content = (
+        content.replace("** **", " ")
+        .replace("* *", " ")
+        .replace("` `", " ")
+        .replace("\n```\n\n```", "")
+    )
     return content

{lexoid-0.1.12.dist-info → lexoid-0.1.13.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.12
+Version: 0.1.13
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3

{lexoid-0.1.12.dist-info → lexoid-0.1.13.dist-info}/RECORD RENAMED Viewed

@@ -1,9 +1,9 @@
 lexoid/api.py,sha256=lTkUcbGML29JrWJv4pE_ZqbzeJuHUE8b6OnijoLBEfU,11350
 lexoid/core/parse_type/llm_parser.py,sha256=rrc1Lwp-6ZAi8IVp3672mHAHUs1JefhT2rnYyQ1gA5E,11292
-lexoid/core/parse_type/static_parser.py,sha256=v4GWUmZVBBIF9TnbkhPBt2gspk0Oq_ujtNGnXZHLBr8,15055
+lexoid/core/parse_type/static_parser.py,sha256=IovvF1GCLWFPh2-mwcgv6DpJmSVQBLnGcoIq7bwQ39Q,21299
 lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
 lexoid/core/utils.py,sha256=6s24X3-4Y57u70HzjIS798Tg8qx6Z3mLATf4xtENE-8,19718
-lexoid-0.1.12.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.12.dist-info/METADATA,sha256=XMHFMqwDj2DgSaZcZjXU881NxdPsRGBAsUyPyRsJvyU,6809
-lexoid-0.1.12.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.12.dist-info/RECORD,,
+lexoid-0.1.13.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.13.dist-info/METADATA,sha256=GHODqox4lX6qf_gjSy8ULYJZhaKKQ1BDKEUAOMi7R2U,6809
+lexoid-0.1.13.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.13.dist-info/RECORD,,

{lexoid-0.1.12.dist-info → lexoid-0.1.13.dist-info}/LICENSE RENAMED Viewed

File without changes

{lexoid-0.1.12.dist-info → lexoid-0.1.13.dist-info}/WHEEL RENAMED Viewed

File without changes

lexoid 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl

lexoid 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl