lexoid 0.1.9__py3-none-any.whl → 0.1.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/api.py +13 -3
- lexoid/core/parse_type/llm_parser.py +7 -4
- lexoid/core/utils.py +59 -27
- {lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/METADATA +17 -14
- lexoid-0.1.11.dist-info/RECORD +9 -0
- lexoid-0.1.9.dist-info/RECORD +0 -9
- {lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/LICENSE +0 -0
- {lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/WHEEL +0 -0
lexoid/api.py
CHANGED
@@ -19,6 +19,8 @@ from lexoid.core.utils import (
     recursive_read_html,
     router,
     split_pdf,
+    create_sub_pdf,
+    get_webpage_soup,
 )
 
 
@@ -83,8 +85,9 @@ def parse_chunk_list(
         result = parse_chunk(file_path, parser_type, **kwargs)
         combined_segments.extend(result["segments"])
         raw_texts.append(result["raw"])
-        token_usage
-
+        if "token_usage" in result:
+            token_usage["input"] += result["token_usage"]["input"]
+            token_usage["output"] += result["token_usage"]["output"]
     token_usage["total"] = token_usage["input"] + token_usage["output"]
 
     return {
@@ -100,7 +103,7 @@ def parse_chunk_list(
 
 def parse(
     path: str,
-    parser_type: Union[str, ParserType] = "
+    parser_type: Union[str, ParserType] = "AUTO",
     pages_per_split: int = 4,
     max_processes: int = 4,
     **kwargs,
@@ -147,6 +150,7 @@ def parse(
     if is_supported_url_file_type(path):
        path = download_file(path, download_dir)
     elif as_pdf:
+        kwargs["title"] = get_webpage_soup(path).title.string.strip()
         pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf")
         if not pdf_filename.endswith(".pdf"):
             pdf_filename += ".pdf"
@@ -163,6 +167,12 @@ def parse(
         pdf_path = os.path.join(temp_dir, "converted.pdf")
         path = convert_to_pdf(path, pdf_path)
 
+    if "page_nums" in kwargs and path.lower().endswith(".pdf"):
+        sub_pdf_dir = os.path.join(temp_dir, "sub_pdfs")
+        os.makedirs(sub_pdf_dir, exist_ok=True)
+        sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
+        path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
+
     if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
         kwargs["split"] = False
         result = parse_chunk(path, parser_type, **kwargs)
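Taken together, the api.py hunks add page selection to parse(): when page_nums is supplied and the input resolves to a PDF, a sub-PDF is built in a temp directory and parsed in place of the original. A minimal sketch of the resulting call, assuming parse() surfaces the same keys ("raw", "segments", "token_usage") that parse_chunk_list aggregates; the file path is a placeholder:

    from lexoid.api import parse

    # Parse only pages 2 and 5 of a PDF ("report.pdf" is a placeholder path).
    # parser_type now defaults to "AUTO", which appears to defer to router() below.
    result = parse("report.pdf", parser_type="AUTO", page_nums=(2, 5))
    print(result["raw"])                   # combined markdown for the selected pages
    print(result["token_usage"]["total"])  # present when an LLM parser handled the chunks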
lexoid/core/parse_type/llm_parser.py
CHANGED
@@ -50,7 +50,7 @@ def retry_on_http_error(func):
 @retry_on_http_error
 def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if "model" not in kwargs:
-        kwargs["model"] = "gemini-
+        kwargs["model"] = "gemini-2.0-flash"
     model = kwargs.get("model")
     if model.startswith("gemini"):
         return parse_with_gemini(path, **kwargs)
@@ -125,6 +125,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     combined_text = result.split("</output>")[0].strip()
 
     token_usage = result["usageMetadata"]
+    input_tokens = token_usage.get("promptTokenCount", 0)
+    output_tokens = token_usage.get("candidatesTokenCount", 0)
+    total_tokens = input_tokens + output_tokens
 
     return {
         "raw": combined_text,
@@ -137,9 +140,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
         "parent_title": kwargs.get("parent_title", ""),
         "recursive_docs": [],
         "token_usage": {
-            "input":
-            "output":
-            "total":
+            "input": input_tokens,
+            "output": output_tokens,
+            "total": total_tokens,
         },
     }
 
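These llm_parser.py hunks make Gemini token accounting explicit: usageMetadata's promptTokenCount becomes "input", candidatesTokenCount becomes "output", and parse_chunk_list (api.py hunk above) now sums these per chunk, skipping chunks that carry no "token_usage" key. A self-contained sketch of that aggregation pattern:

    from typing import Dict, List

    def aggregate_token_usage(results: List[Dict]) -> Dict[str, int]:
        # Mirrors the parse_chunk_list change: sum per-chunk counts, then derive total.
        usage = {"input": 0, "output": 0}
        for result in results:
            if "token_usage" in result:
                usage["input"] += result["token_usage"]["input"]
                usage["output"] += result["token_usage"]["output"]
        usage["total"] = usage["input"] + usage["output"]
        return usage

    chunks = [
        {"token_usage": {"input": 1200, "output": 450}},  # an LLM_PARSE chunk
        {},                                               # a STATIC_PARSE chunk: no usage key
    ]
    print(aggregate_token_usage(chunks))  # {'input': 1200, 'output': 450, 'total': 1650}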
lexoid/core/utils.py
CHANGED
@@ -6,7 +6,7 @@ import re
 import sys
 from difflib import SequenceMatcher
 from hashlib import md5
-from typing import Dict, List
+from typing import Dict, List, Optional
 from urllib.parse import urlparse
 
 import nest_asyncio
@@ -45,6 +45,20 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
     return paths
 
 
+def create_sub_pdf(
+    input_path: str, output_path: str, page_nums: Optional[tuple[int, ...]|int] = None
+) -> str:
+    if isinstance(page_nums, int):
+        page_nums = (page_nums,)
+    page_nums = tuple(sorted(set(page_nums)))
+    with pikepdf.open(input_path) as pdf:
+        indices = page_nums if page_nums else range(len(pdf.pages))
+        with pikepdf.new() as new_pdf:
+            new_pdf.pages.extend([pdf.pages[i - 1] for i in indices])
+            new_pdf.save(output_path)
+    return output_path
+
+
 def convert_image_to_pdf(image_path: str) -> bytes:
     with Image.open(image_path) as img:
         img_rgb = img.convert("RGB")
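A note on create_sub_pdf: page numbers are 1-based (note pdf.pages[i - 1]), duplicates are dropped, and order is normalized by sorted(set(...)). Despite the Optional default, passing page_nums=None would raise a TypeError at set(page_nums) before the indices fallback is reached, so callers are evidently expected to pass page numbers, as api.py does. A usage sketch with placeholder paths:

    from lexoid.core.utils import create_sub_pdf

    create_sub_pdf("input.pdf", "pages_1_3.pdf", page_nums=(3, 1, 1))  # keeps pages 1 and 3
    create_sub_pdf("input.pdf", "page_2.pdf", page_nums=2)             # a bare int becomes (2,)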
@@ -285,18 +299,7 @@ def html_to_markdown(html: str, title: str, url: str) -> str:
 
     return content
 
-
-def read_html_content(url: str) -> Dict:
-    """
-    Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
-
-    Args:
-        url (str): The URL of the HTML page.
-
-    Returns:
-        Dict: Dictionary containing parsed document data
-    """
-
+def get_webpage_soup(url: str) -> BeautifulSoup:
     try:
         from playwright.async_api import async_playwright
 
@@ -357,6 +360,21 @@ def read_html_content(url: str) -> Dict:
         soup = BeautifulSoup(
             response.content, "html.parser", from_encoding="iso-8859-1"
         )
+    return soup
+
+
+def read_html_content(url: str) -> Dict:
+    """
+    Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
+
+    Args:
+        url (str): The URL of the HTML page.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
+
+    soup = get_webpage_soup(url)
     title = soup.title.string.strip() if soup.title else "No title"
     url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
     full_title = f"{title} - {url_hash}"
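This refactor splits soup retrieval out of read_html_content so api.py can fetch a page title before rendering a webpage to PDF (the elif as_pdf hunk above). The two call sites differ: read_html_content guards against a missing <title>, while the api.py call chains .title.string.strip() unguarded and would raise on a title-less page. A sketch of the guarded pattern, with a placeholder URL:

    from lexoid.core.utils import get_webpage_soup

    soup = get_webpage_soup("https://example.com")
    title = soup.title.string.strip() if soup.title else "No title"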
@@ -528,23 +546,37 @@ def has_hyperlink_in_pdf(path: str):
     )
 
 
-def router(path: str):
+def router(path: str, priority: str = "accuracy") -> str:
+    """
+    Routes the file path to the appropriate parser based on the file type.
+
+    Args:
+        path (str): The file path to route.
+        priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
+    """
     file_type = get_file_type(path)
     if file_type.startswith("text/"):
         return "STATIC_PARSE"
-
-
-
-
-
-
-
-
-
-
-        return "
-
-
+
+    if priority == "accuracy":
+        # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
+        # Otherwise, use LLM_PARSE
+        if (
+            file_type == "application/pdf"
+            and not has_image_in_pdf(path)
+            and has_hyperlink_in_pdf(path)
+        ):
+            return "STATIC_PARSE"
+        return "LLM_PARSE"
+    else:
+        # If the file is a PDF without images, use STATIC_PARSE
+        # Otherwise, use LLM_PARSE
+        if (
+            file_type == "application/pdf"
+            and not has_image_in_pdf(path)
+        ):
+            return "STATIC_PARSE"
+        return "LLM_PARSE"
 
 def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
     temp_path = os.path.join(
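router() gains a priority switch: "accuracy" (the default) routes to STATIC_PARSE only for image-free PDFs that also contain hyperlinks, while "speed" routes any image-free PDF there. A usage sketch with a placeholder path:

    from lexoid.core.utils import router

    print(router("contract.pdf"))                    # "LLM_PARSE" unless image-free with hyperlinks
    print(router("contract.pdf", priority="speed"))  # "STATIC_PARSE" for any image-free PDF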
{lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.9
+Version: 0.1.11
 Summary: 
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -110,20 +110,23 @@ print(parsed_md)
 - **kwargs: Additional arguments for the parser.
 
 ## Benchmark
-
+Results aggregated across 5 iterations each for 5 documents.
 
 _Note:_ Benchmarks are currently done in the zero-shot setting.
 
-| Rank | Model
-
-| 1 | 
-| 2 | gemini-2.0-flash-
-| 3 | gemini-
-| 4 | gemini-
-| 5 | 
-| 6 | gemini-1.5-
-| 7 | 
-| 8 | 
-| 9 | 
-| 10 | Llama-Vision-Free (via Together AI) | 0.
+| Rank | Model | Mean Similarity | Std. Dev. | Time (s) |
+|---|---|---|---|---|
+| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 |
+| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 |
+| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 |
+| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 |
+| 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 |
+| 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 |
+| 7 | gpt-4o | 0.687 | 0.247 | 10.16 |
+| 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 |
+| 9 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 |
+| 10 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 |
+| 11 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 |
+| 12 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 |
+| 13 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 |
 
lexoid-0.1.11.dist-info/RECORD
ADDED
@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=CIZBNvh38PJbD0OwK1Mp0qqkWxkAEBw2L_FkoCmagXA,9288
+lexoid/core/parse_type/llm_parser.py,sha256=XfsN6RAtb14p31U2jL-9QyRKpkNAGXXiK3urWJIFi2U,10625
+lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=1If_3XoUhPQRY5XMzLJBsHdyjtLgD734eYBYvsg8w5Y,19569
+lexoid-0.1.11.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.11.dist-info/METADATA,sha256=kipDZLbUz_wkJUrzPGH2VppBNMHmaJadHR5_BAqHgjU,4838
+lexoid-0.1.11.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.11.dist-info/RECORD,,
lexoid-0.1.9.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=EYyKwfdrjM94bslqTb7Db_wz0R2WioFPkJAqeDJJchY,8790
-lexoid/core/parse_type/llm_parser.py,sha256=eu6zcl_uHVJ7-t506yfQT4jHpg2QGHV2CznS9X12lLQ,10515
-lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=coVab6fCSSDpIN39WLQ6ciZVRiIx3qTsqjn2EbTmMks,18428
-lexoid-0.1.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.9.dist-info/METADATA,sha256=EegftW7ka6fSzaEos97N2-JPjkpO3tt4wyuL9oha014,4575
-lexoid-0.1.9.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.9.dist-info/RECORD,,
{lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/LICENSE
File without changes

{lexoid-0.1.9.dist-info → lexoid-0.1.11.dist-info}/WHEEL
File without changes