lexoid 0.1.6.post1__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/core/parse_type/llm_parser.py +92 -35
- lexoid/core/parse_type/static_parser.py +79 -27
- lexoid/core/prompt_templates.py +19 -0
- {lexoid-0.1.6.post1.dist-info → lexoid-0.1.7.dist-info}/METADATA +33 -17
- lexoid-0.1.7.dist-info/RECORD +9 -0
- lexoid-0.1.6.post1.dist-info/RECORD +0 -9
- {lexoid-0.1.6.post1.dist-info → lexoid-0.1.7.dist-info}/LICENSE +0 -0
- {lexoid-0.1.6.post1.dist-info → lexoid-0.1.7.dist-info}/WHEEL +0 -0
lexoid/core/parse_type/llm_parser.py
CHANGED
@@ -10,10 +10,13 @@ from lexoid.core.prompt_templates import (
     INSTRUCTIONS_ADD_PG_BREAK,
     OPENAI_USER_PROMPT,
     PARSER_PROMPT,
+    LLAMA_PARSER_PROMPT,
 )
 from lexoid.core.utils import convert_image_to_pdf
 from loguru import logger
 from openai import OpenAI
+from huggingface_hub import InferenceClient
+from together import Together


 def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
@@ -22,10 +25,13 @@ def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
     model = kwargs.get("model")
     if model.startswith("gemini"):
         return parse_with_gemini(path, raw, **kwargs)
-
-        return
-
-
+    if model.startswith("gpt"):
+        return parse_with_api(path, raw, api="openai", **kwargs)
+    if model.startswith("meta-llama"):
+        if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
+            return parse_with_api(path, raw, api="together", **kwargs)
+        return parse_with_api(path, raw, api="huggingface", **kwargs)
+    raise ValueError(f"Unsupported model: {model}")


 def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
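The new routing is purely name-based: `gemini-*` models keep the dedicated Gemini path, `gpt-*` models go to OpenAI, and `meta-llama/*` models are split between Together AI (Turbo variants and `Llama-Vision-Free`) and Hugging Face. A minimal usage sketch of this dispatch, assuming lexoid is installed with the matching API key in the environment (the input file name is hypothetical):

```python
from lexoid.core.parse_type.llm_parser import parse_llm_doc

# Routes to parse_with_api(api="together") because the model name is a
# meta-llama "Turbo" variant; a "gpt-*" name would route to api="openai".
markdown = parse_llm_doc(
    "sample.pdf",  # hypothetical input document
    raw=True,      # return raw text instead of structured data
    model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
)
```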
@@ -120,8 +126,30 @@ def convert_pdf_page_to_base64(
     return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")


-def
-
+def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
+    """
+    Parse documents (PDFs or images) using various vision model APIs.
+
+    Args:
+        path (str): Path to the document to parse
+        raw (bool): If True, return raw text; if False, return structured data
+        api (str): Which API to use ("openai", "huggingface", or "together")
+        **kwargs: Additional arguments including model, temperature, title, etc.
+
+    Returns:
+        List[Dict] | str: Parsed content either as raw text or structured data
+    """
+    # Initialize appropriate client
+    clients = {
+        "openai": lambda: OpenAI(),
+        "huggingface": lambda: InferenceClient(
+            token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
+        ),
+        "together": lambda: Together(),
+    }
+    assert api in clients, f"Unsupported API: {api}"
+    logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
+    client = clients[api]()

     # Handle different input types
     mime_type, _ = mimetypes.guess_type(path)
@@ -129,50 +157,79 @@ def parse_with_gpt(path: str, raw: bool, **kwargs) -> List[Dict] | str:
         # Single image processing
         with open(path, "rb") as img_file:
             image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
-            images = [(0, image_base64)]
+            images = [(0, f"data:{mime_type};base64,{image_base64}")]
     else:
         # PDF processing
         pdf_document = pdfium.PdfDocument(path)
         images = [
-            (
+            (
+                page_num,
+                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
+            )
             for page_num in range(len(pdf_document))
         ]

+    # API-specific message formatting
+    def get_messages(page_num: int, image_url: str) -> List[Dict]:
+        base_message = {
+            "type": "text",
+            "text": LLAMA_PARSER_PROMPT,
+        }
+        image_message = {
+            "type": "image_url",
+            "image_url": {"url": image_url},
+        }
+
+        if api == "openai":
+            return [
+                {
+                    "role": "system",
+                    "content": PARSER_PROMPT.format(
+                        custom_instructions=INSTRUCTIONS_ADD_PG_BREAK
+                    ),
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
+                        },
+                        image_message,
+                    ],
+                },
+            ]
+        else:
+            return [
+                {
+                    "role": "user",
+                    "content": [base_message, image_message],
+                }
+            ]
+
     # Process each page/image
     all_results = []
-    for page_num,
-        messages =
-            {
-                "role": "system",
-                "content": PARSER_PROMPT,
-            },
-            {
-                "role": "user",
-                "content": [
-                    {
-                        "type": "text",
-                        "text": f"{OPENAI_USER_PROMPT} (Page {page_num + 1})",
-                    },
-                    {
-                        "type": "image_url",
-                        "image_url": {"url": f"data:image/png;base64,{image_base64}"},
-                    },
-                ],
-            },
-        ]
+    for page_num, image_url in images:
+        messages = get_messages(page_num, image_url)

-        #
-
-        model
-
-
-
+        # Common completion parameters
+        completion_params = {
+            "model": kwargs["model"],
+            "messages": messages,
+            "max_tokens": kwargs.get("max_tokens", 1024),
+            "temperature": kwargs.get("temperature", 0.7),
+        }
+
+        # Get completion from selected API
+        response = client.chat.completions.create(**completion_params)

         # Extract the response text
         page_text = response.choices[0].message.content
         if kwargs.get("verbose", None):
             logger.debug(f"Page {page_num + 1} response: {page_text}")
-
+
+        # Extract content between output tags if present
+        result = page_text
         if "<output>" in page_text:
             result = page_text.split("<output>")[1].strip()
             if "</output>" in result:
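Both branches now produce `data:` URLs rather than bare base64 strings, so a single `image_url` message shape works across the OpenAI, Hugging Face, and Together clients. A self-contained sketch of the same encoding (the helper name and input file are hypothetical):

```python
import base64
import mimetypes

def to_data_url(path: str) -> str:
    """Build a data URL of the form parse_with_api puts into image_url entries."""
    mime_type, _ = mimetypes.guess_type(path)
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime_type};base64,{encoded}"

# For a PNG input this yields e.g. "data:image/png;base64,iVBORw0KG..."
print(to_data_url("page.png")[:40])
```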
lexoid/core/parse_type/static_parser.py
CHANGED
@@ -89,15 +89,21 @@ def process_table(table) -> str:

     # Convert to DataFrame and handle empty cells
     df = pd.DataFrame(table_data)
+    df.replace("", pd.NA, inplace=True)
+    df = df.dropna(how="all", axis=0)
+    df = df.dropna(how="all", axis=1)
     df = df.fillna("")
+    if len(df) == 0:
+        return ""

     # Use first row as header and clean it up
     df.columns = df.iloc[0]
-    df = df.drop(0)
+    df = df.drop(df.index[0])
+    df.replace(r"\n", "<br>", regex=True, inplace=True)

     # Convert to markdown with some formatting options
     markdown_table = df.to_markdown(index=False, tablefmt="pipe")
-    return f"\n{markdown_table}\n\n"
+    return f"\n{markdown_table}\n\n"


 def embed_links_in_text(page, text, links):
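The reworked `process_table` drops fully empty rows and columns before promoting the first remaining row to the header, and switches from `df.drop(0)` to `df.drop(df.index[0])` so the drop is positional rather than label-based (label `0` may no longer exist once empty rows are removed). A standalone sketch of the same pandas steps, with invented sample data:

```python
import pandas as pd

# Invented sample shaped like pdfplumber table output, with an empty row and column.
table_data = [
    ["Name", "Qty", ""],
    ["apples", "4", ""],
    ["", "", ""],
]

df = pd.DataFrame(table_data)
df.replace("", pd.NA, inplace=True)
df = df.dropna(how="all", axis=0)  # drop fully empty rows
df = df.dropna(how="all", axis=1)  # drop fully empty columns
df = df.fillna("")

df.columns = df.iloc[0]    # first surviving row becomes the header
df = df.drop(df.index[0])  # positional drop: safe even if label 0 was removed
print(df.to_markdown(index=False, tablefmt="pipe"))  # needs the `tabulate` dependency
```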
@@ -157,8 +163,20 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     x_tolerance = kwargs.get("x_tolerance", 1)
     y_tolerance = kwargs.get("y_tolerance", 5)

-    #
-
+    # Table settings
+    vertical_strategy = kwargs.get("vertical_strategy", "lines")
+    horizontal_strategy = kwargs.get("horizontal_strategy", "lines")
+    snap_x_tolerance = kwargs.get("snap_x_tolerance", 10)
+    snap_y_tolerance = kwargs.get("snap_y_tolerance", 0)
+
+    tables = page.find_tables(
+        table_settings={
+            "vertical_strategy": vertical_strategy,
+            "horizontal_strategy": horizontal_strategy,
+            "snap_x_tolerance": snap_x_tolerance,
+            "snap_y_tolerance": snap_y_tolerance,
+        }
+    )
     table_zones = [(table.bbox, process_table(table)) for table in tables]

     # Create a filtered page excluding table areas
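Since the strategies and snap tolerances are read from plain kwargs, table detection can now be tuned per document. A hedged pdfplumber-only sketch (the input path is hypothetical; `find_tables` and these `table_settings` keys are standard pdfplumber options), switching to text-based edge detection for borderless tables:

```python
import pdfplumber

with pdfplumber.open("report.pdf") as pdf:  # hypothetical input
    page = pdf.pages[0]
    # "text" strategies infer table edges from word alignment rather than
    # ruled lines, which often helps with borderless tables.
    tables = page.find_tables(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }
    )
    for table in tables:
        print(table.bbox, len(table.extract()), "rows")
```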
@@ -171,12 +189,46 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     words = filtered_page.extract_words(
         x_tolerance=x_tolerance,
         y_tolerance=y_tolerance,
-        extra_attrs=["size", "top", "bottom"],
+        extra_attrs=["size", "top", "bottom", "fontname"],
     )

-    def format_paragraph(
-
-
+    def format_paragraph(text_elements):
+        """Format a paragraph with styling applied to individual words"""
+        formatted_words = []
+        for element in text_elements:
+            text = element["text"]
+            formatting = get_text_formatting(element)
+            formatted_words.append(apply_markdown_formatting(text, formatting))
+        return f"{' '.join(formatted_words)}\n\n"
+
+    def get_text_formatting(word):
+        """
+        Detect text formatting based on font properties
+        Returns a dict of formatting attributes
+        """
+        formatting = {
+            "bold": False,
+            "italic": False,
+        }
+
+        # Check font name for common bold/italic indicators
+        font_name = word.get("fontname", "").lower()
+        if any(style in font_name for style in ["bold", "heavy", "black"]):
+            formatting["bold"] = True
+        if any(style in font_name for style in ["italic", "oblique"]):
+            formatting["italic"] = True
+
+        return formatting
+
+    def apply_markdown_formatting(text, formatting):
+        """Apply markdown formatting to text based on detected styles"""
+        if formatting["bold"] and formatting["italic"]:
+            text = f"***{text}***"
+        elif formatting["bold"]:
+            text = f"**{text}**"
+        elif formatting["italic"]:
+            text = f"*{text}*"
+        return text

     def detect_heading_level(font_size):
         if font_size >= 24:
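The styling pass keys entirely off the `fontname` attribute newly requested from `extract_words`, so its effect is easy to trace by hand. A self-contained walk-through of the same checks on an invented word dict (in the package these helpers are nested inside `process_pdf_page_with_pdfplumber`; the logic below restates them inline):

```python
# A word shaped like extract_words() output once "fontname" is in extra_attrs.
word = {"text": "Revenue", "fontname": "Helvetica-BoldOblique", "size": 11.0}

# get_text_formatting: substring checks against the lowercased font name.
font_name = word["fontname"].lower()
formatting = {
    "bold": any(s in font_name for s in ["bold", "heavy", "black"]),
    "italic": any(s in font_name for s in ["italic", "oblique"]),
}
print(formatting)  # {'bold': True, 'italic': True}

# apply_markdown_formatting: bold + italic wraps the text in triple asterisks.
text = word["text"]
if formatting["bold"] and formatting["italic"]:
    text = f"***{text}***"
print(text)  # ***Revenue***
```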
@@ -205,17 +257,18 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         while tables and word["bottom"] > tables[0][1]["bottom"]:
             content_elements.append(tables.pop(0))
         content_elements.append(("word", word))
+    content_elements.extend(tables)

     for element_type, element in content_elements:
         if element_type == "table":
             # If there are any pending paragraphs or headings, add them first
             if current_heading:
                 level = detect_heading_level(current_heading[0]["size"])
-                heading_text =
-                markdown_content.append(f"{'#' * level} {heading_text}
+                heading_text = format_paragraph(current_heading)
+                markdown_content.append(f"{'#' * level} {heading_text}")
                 current_heading = []
             if current_paragraph:
-                markdown_content.append(format_paragraph(
+                markdown_content.append(format_paragraph(current_paragraph))
                 current_paragraph = []
             # Add the table
             markdown_content.append(element["content"])
@@ -233,46 +286,42 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
         # If we were collecting a heading
         if current_heading:
             level = detect_heading_level(current_heading[0]["size"])
-            heading_text =
-            markdown_content.append(f"{'#' * level} {heading_text}
+            heading_text = format_paragraph(current_heading)
+            markdown_content.append(f"{'#' * level} {heading_text}")
             current_heading = []

         # If we were collecting a paragraph
         if current_paragraph:
-            markdown_content.append(
-                format_paragraph(" ".join(current_paragraph))
-            )
+            markdown_content.append(format_paragraph(current_paragraph))
             current_paragraph = []

         # Add word to appropriate collection
         if heading_level:
             if current_paragraph:  # Flush any pending paragraph
-                markdown_content.append(
-                    format_paragraph(" ".join(current_paragraph))
-                )
+                markdown_content.append(format_paragraph(current_paragraph))
                 current_paragraph = []
-            current_heading.append(
+            current_heading.append(word)
         else:
             if current_heading:  # Flush any pending heading
                 level = detect_heading_level(current_heading[0]["size"])
-                heading_text =
-                markdown_content.append(f"{'#' * level} {heading_text}
+                heading_text = format_paragraph(current_heading)
+                markdown_content.append(f"{'#' * level} {heading_text}")
                 current_heading = []
-            current_paragraph.append(word
+            current_paragraph.append(word)

         last_y = word["top"]

     # Handle remaining content
     if current_heading:
         level = detect_heading_level(current_heading[0]["size"])
-        heading_text =
-        markdown_content.append(f"{'#' * level} {heading_text}
+        heading_text = format_paragraph(current_heading)
+        markdown_content.append(f"{'#' * level} {heading_text}")

     if current_paragraph:
-        markdown_content.append(format_paragraph(
+        markdown_content.append(format_paragraph(current_paragraph))

     # Process links for the page
-    content = "".join(markdown_content)
+    content = "".join(markdown_content)
     if page.annots:
         links = []
         for annot in page.annots:
@@ -283,6 +332,9 @@ def process_pdf_page_with_pdfplumber(page, uri_rects, **kwargs):
     if links:
         content = embed_links_in_text(page, content, links)

+    # Remove redundant formatting
+    content = content.replace("** **", " ").replace("* *", " ")
+
     return content

lexoid/core/prompt_templates.py
CHANGED
@@ -76,3 +76,22 @@ Ensure accurate representation of all content, including tables and visual elements
 """

 INSTRUCTIONS_ADD_PG_BREAK = "Insert a `<page-break>` tag between the content of each page to maintain the original page structure."
+
+LLAMA_PARSER_PROMPT = """\
+You are a document conversion assistant. Your task is to accurately reproduce the content of an image in Markdown and HTML format, maintaining the visual structure and layout of the original document as closely as possible.
+
+Instructions:
+1. Use a combination of Markdown and HTML to replicate the document's layout and formatting.
+2. Reproduce all text content exactly as it appears, including preserving capitalization, punctuation, and any apparent errors or inconsistencies in the original.
+3. Use appropriate Markdown syntax for headings, emphasis (bold, italic), and lists where applicable.
+4. Always use HTML (`<table>`, `<tr>`, `<td>`) to represent tabular data. Include `colspan` and `rowspan` attributes if needed.
+5. For figures, graphs, or diagrams, represent them using `<img>` tags and use appropriate `alt` text.
+6. For handwritten documents, reproduce the content as typed text, maintaining the original structure and layout.
+7. Do not include any descriptions of the document's appearance, paper type, or writing implements used.
+8. Do not add any explanatory notes, comments, or additional information outside of the converted content.
+9. Ensure all special characters, symbols, and equations are accurately represented.
+10. Provide the output only once, without any duplication.
+11. Enclose the entire output within <output> and </output> tags.
+
+Output the converted content directly in Markdown and HTML without any additional explanations, descriptions, or notes.
+"""
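Instruction 11 is what the post-processing in `parse_with_api` keys on: the response is split on `<output>`, and, assuming the continuation the hunk cuts off mirrors the opening tag, a trailing `</output>` is stripped as well. A minimal sketch of that extraction:

```python
def extract_output(page_text: str) -> str:
    """Strip the <output>...</output> wrapper, falling back to the raw text."""
    result = page_text
    if "<output>" in page_text:
        result = page_text.split("<output>")[1].strip()
        if "</output>" in result:
            # Assumed continuation; the diff hunk ends at the inner `if`.
            result = result.split("</output>")[0].strip()
    return result

print(extract_output("<output># Title\n\nBody text</output>"))
```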
{lexoid-0.1.6.post1.dist-info → lexoid-0.1.7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.6.post1
+Version: 0.1.7
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -11,6 +11,7 @@ Classifier: Programming Language :: Python :: 3.13
 Requires-Dist: bs4 (>=0.0.2,<0.0.3)
 Requires-Dist: docx2pdf (>=0.1.8,<0.2.0)
 Requires-Dist: google-generativeai (>=0.8.1,<0.9.0)
+Requires-Dist: huggingface-hub (>=0.27.0,<0.28.0)
 Requires-Dist: loguru (>=0.7.2,<0.8.0)
 Requires-Dist: markdown (>=3.7,<4.0)
 Requires-Dist: markdownify (>=0.13.1,<0.14.0)
@@ -27,6 +28,7 @@ Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
+Requires-Dist: together (>=1.3.10,<2.0.0)
 Description-Content-Type: text/markdown

 # Lexoid
@@ -39,38 +41,46 @@ Lexoid is an efficient document parsing library that supports both LLM-based and
 - Collaborate with a permissive license

 ## Installation
-
+### Installing with pip
 ```
-
+pip install lexoid
 ```
-
+
+To use LLM-based parsing, define the following environment variables or create a `.env` file with the following definitions
 ```
-
+OPENAI_API_KEY=""
+GOOGLE_API_KEY=""
 ```

-
+Optionally, to use `Playwright` for retrieving web content with the `.whl` package (else regular requests will be used by default):
 ```
-
+playwright install --with-deps --only-shell chromium
 ```

-
+### Building `.whl` from source
+To create `.whl`:
 ```
-
-GOOGLE_API_KEY=""
+make build
 ```

-
+### Creating a local installation
+To install dependencies:
+```
+make install
+```
+or, to install with dev-dependencies:
 ```
-
+make dev
 ```

-
+To activate virtual environment:
 ```
-
+source .venv/bin/activate
 ```

 ## Usage
 [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)
+[Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)

 Here's a quick example to parse documents using Lexoid:
 ``` python
@@ -98,7 +108,13 @@ Initial results (_more updates soon_)
 | Rank | Model/Framework | Similarity | Time (s) |
 |------|-----------|------------|----------|
 | 1 | gpt-4o | 0.799 | 21.77 |
-| 2 | gemini-
-| 3 |
-| 4 | gemini-1.5-
+| 2 | gemini-2.0-flash-exp | 0.797 | 13.47 |
+| 3 | gemini-exp-1121 | 0.779 | 30.88 |
+| 4 | gemini-1.5-pro | 0.742 | 15.77 |
+| 5 | gpt-4o-mini | 0.721 | 14.86 |
+| 6 | gemini-1.5-flash | 0.702 | 4.56 |
+| 7 | Llama-3.2-11B-Vision-Instruct (via HF) | 0.582 | 21.74 |
+| 8 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.556 | 4.58 |
+| 9 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.527 | 10.57 |
+| 10 | Llama-Vision-Free (via Together AI) | 0.435 | 8.42 |

lexoid-0.1.7.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
+lexoid/core/parse_type/llm_parser.py,sha256=i_iidoP_qExGTScRPMBX5X3RnjIf6XqAS_NhLkz0_LM,8464
+lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=rd8sf2OZqMv_oHGxM1redpSwU8f_sBJ-0tzlbp8U3_A,17193
+lexoid-0.1.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.7.dist-info/METADATA,sha256=yOwsqpA5U-2Z2CXr5Cnrs2a6HtqY-4WryVfYDTI7X08,4092
+lexoid-0.1.7.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.7.dist-info/RECORD,,
lexoid-0.1.6.post1.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
-lexoid/core/parse_type/llm_parser.py,sha256=R-0HoXATCBnMJpyjOmMw_EdvVS_PKhhgC7z3NoKzhrs,6311
-lexoid/core/parse_type/static_parser.py,sha256=uFmuz_1JQHUp8FZADPhLBPEv1La2AnZ4j2Vj6SlH0fo,11993
-lexoid/core/prompt_templates.py,sha256=0KXHGNunMfrRZh5QfENcxY1s30VioY2fsu3wELc-3z8,4794
-lexoid/core/utils.py,sha256=rd8sf2OZqMv_oHGxM1redpSwU8f_sBJ-0tzlbp8U3_A,17193
-lexoid-0.1.6.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.6.post1.dist-info/METADATA,sha256=tPhhqCNwJGR5LNSH-J9hCJf2O4AN6QJhFiXHbUcRizM,3436
-lexoid-0.1.6.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.6.post1.dist-info/RECORD,,
{lexoid-0.1.6.post1.dist-info → lexoid-0.1.7.dist-info}/LICENSE
File without changes

{lexoid-0.1.6.post1.dist-info → lexoid-0.1.7.dist-info}/WHEEL
File without changes