lexoid 0.1.10.tar.gz → 0.1.11.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {lexoid-0.1.10 → lexoid-0.1.11}/PKG-INFO +17 -14
- {lexoid-0.1.10 → lexoid-0.1.11}/README.md +16 -13
- {lexoid-0.1.10 → lexoid-0.1.11}/lexoid/api.py +3 -1
- {lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/parse_type/llm_parser.py +1 -1
- {lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/utils.py +44 -26
- {lexoid-0.1.10 → lexoid-0.1.11}/pyproject.toml +1 -1
- lexoid-0.1.10/lexoid/core/__pycache__/prompt_templates.cpython-310.pyc +0 -0
- lexoid-0.1.10/lexoid/core/__pycache__/prompt_templates.cpython-312.pyc +0 -0
- lexoid-0.1.10/lexoid/core/__pycache__/utils.cpython-310.pyc +0 -0
- lexoid-0.1.10/lexoid/core/__pycache__/utils.cpython-312.pyc +0 -0
- lexoid-0.1.10/lexoid/core/parse_type/__pycache__/llm_parser.cpython-310.pyc +0 -0
- lexoid-0.1.10/lexoid/core/parse_type/__pycache__/llm_parser.cpython-312.pyc +0 -0
- lexoid-0.1.10/lexoid/core/parse_type/__pycache__/static_parser.cpython-310.pyc +0 -0
- lexoid-0.1.10/lexoid/core/parse_type/__pycache__/static_parser.cpython-312.pyc +0 -0
- {lexoid-0.1.10 → lexoid-0.1.11}/LICENSE +0 -0
- {lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/parse_type/static_parser.py +0 -0
- {lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/prompt_templates.py +0 -0
{lexoid-0.1.10 → lexoid-0.1.11}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.10
+Version: 0.1.11
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -110,20 +110,23 @@ print(parsed_md)
 - **kwargs: Additional arguments for the parser.

 ## Benchmark
-…
+Results aggregated across 5 iterations each for 5 documents.

 _Note:_ Benchmarks are currently done in the zero-shot setting.

-| Rank | Model …
-…
-| 1 | …
-| 2 | gemini-2.0-flash-…
-| 3 | gemini-…
-| 4 | gemini-…
-| 5 | …
-| 6 | gemini-1.5-…
-| 7 | …
-| 8 | …
-| 9 | …
-| 10 | Llama-Vision-Free (via Together AI) | 0.…
+| Rank | Model | Mean Similarity | Std. Dev. | Time (s) |
+|---|---|---|---|---|
+| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 |
+| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 |
+| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 |
+| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 |
+| 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 |
+| 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 |
+| 7 | gpt-4o | 0.687 | 0.247 | 10.16 |
+| 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 |
+| 9 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 |
+| 10 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 |
+| 11 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 |
+| 12 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 |
+| 13 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 |

{lexoid-0.1.10 → lexoid-0.1.11}/README.md

@@ -77,19 +77,22 @@ print(parsed_md)
 - **kwargs: Additional arguments for the parser.

 ## Benchmark
-…
+Results aggregated across 5 iterations each for 5 documents.

 _Note:_ Benchmarks are currently done in the zero-shot setting.

-| Rank | Model …
-…
-| 1 | …
-| 2 | gemini-2.0-flash-…
-| 3 | gemini-…
-| 4 | gemini-…
-| 5 | …
-| 6 | gemini-1.5-…
-| 7 | …
-| 8 | …
-| 9 | …
-| 10 | Llama-Vision-Free (via Together AI) | 0.…
+| Rank | Model | Mean Similarity | Std. Dev. | Time (s) |
+|---|---|---|---|---|
+| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 |
+| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 |
+| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 |
+| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 |
+| 5 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 |
+| 6 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 |
+| 7 | gpt-4o | 0.687 | 0.247 | 10.16 |
+| 8 | gpt-4o-mini | 0.642 | 0.213 | 9.71 |
+| 9 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 |
+| 10 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 |
+| 11 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 |
+| 12 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 |
+| 13 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 |
{lexoid-0.1.10 → lexoid-0.1.11}/lexoid/api.py

@@ -20,6 +20,7 @@ from lexoid.core.utils import (
     router,
     split_pdf,
     create_sub_pdf,
+    get_webpage_soup,
 )


@@ -102,7 +103,7 @@ def parse_chunk_list(

 def parse(
     path: str,
-    parser_type: Union[str, ParserType] = "…
+    parser_type: Union[str, ParserType] = "AUTO",
     pages_per_split: int = 4,
     max_processes: int = 4,
     **kwargs,
@@ -149,6 +150,7 @@ def parse(
         if is_supported_url_file_type(path):
             path = download_file(path, download_dir)
         elif as_pdf:
+            kwargs["title"] = get_webpage_soup(path).title.string.strip()
             pdf_filename = kwargs.get("save_filename", f"webpage_{int(time())}.pdf")
             if not pdf_filename.endswith(".pdf"):
                 pdf_filename += ".pdf"
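Taken together, the `api.py` changes make `"AUTO"` the default `parser_type` and, when a web page is rendered to PDF, pre-fill `kwargs["title"]` from the page's `<title>` via the new `get_webpage_soup` helper. A minimal usage sketch of the updated entry point, assuming `as_pdf` is accepted as a keyword argument (as the hunk above suggests) and using placeholder URLs:

```python
from lexoid.api import parse

# parser_type now defaults to "AUTO", which lets the router choose
# between static and LLM parsing per document.
result = parse("https://example.com/sample.pdf")  # placeholder URL

# Rendering a web page to PDF first; with this release the page <title>
# is fetched via get_webpage_soup and passed along as the document title.
result = parse("https://example.com/article", as_pdf=True)  # placeholder URL
```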
{lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/parse_type/llm_parser.py

@@ -50,7 +50,7 @@ def retry_on_http_error(func):
 @retry_on_http_error
 def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
     if "model" not in kwargs:
-        kwargs["model"] = "gemini-…
+        kwargs["model"] = "gemini-2.0-flash"
     model = kwargs.get("model")
     if model.startswith("gemini"):
         return parse_with_gemini(path, **kwargs)
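The only change in `llm_parser.py` is the fallback model: a call to `parse_llm_doc` without an explicit `model` now uses `gemini-2.0-flash`. A hedged sketch of the two call paths (the file path is a placeholder and a configured API key for the chosen provider is assumed):

```python
from lexoid.core.parse_type.llm_parser import parse_llm_doc

# No model given: per this release the call falls back to "gemini-2.0-flash".
doc = parse_llm_doc("sample.pdf")  # placeholder path

# Model given explicitly: the fallback never applies.
doc = parse_llm_doc("sample.pdf", model="gpt-4o")  # placeholder path
```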
{lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/utils.py

@@ -299,18 +299,7 @@ def html_to_markdown(html: str, title: str, url: str) -> str:

     return content

-
-def read_html_content(url: str) -> Dict:
-    """
-    Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
-
-    Args:
-        url (str): The URL of the HTML page.
-
-    Returns:
-        Dict: Dictionary containing parsed document data
-    """
-
+def get_webpage_soup(url: str) -> BeautifulSoup:
     try:
         from playwright.async_api import async_playwright

@@ -371,6 +360,21 @@ def read_html_content(url: str) -> Dict:
         soup = BeautifulSoup(
             response.content, "html.parser", from_encoding="iso-8859-1"
         )
+    return soup
+
+
+def read_html_content(url: str) -> Dict:
+    """
+    Reads the content of an HTML page from the given URL and converts it to markdown or structured content.
+
+    Args:
+        url (str): The URL of the HTML page.
+
+    Returns:
+        Dict: Dictionary containing parsed document data
+    """
+
+    soup = get_webpage_soup(url)
     title = soup.title.string.strip() if soup.title else "No title"
     url_hash = md5(url.encode("utf-8")).hexdigest()[:8]
     full_title = f"{title} - {url_hash}"
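These two hunks split the page-fetching logic out of `read_html_content` into a reusable `get_webpage_soup(url) -> BeautifulSoup` helper, which `read_html_content` (and now `parse` in `api.py`) build on. A minimal sketch of the resulting call pattern, with a placeholder URL:

```python
from lexoid.core.utils import get_webpage_soup, read_html_content

url = "https://example.com"  # placeholder URL

# New helper: returns a BeautifulSoup object for the fetched page.
soup = get_webpage_soup(url)
title = soup.title.string.strip() if soup.title else "No title"

# Existing entry point, now implemented on top of get_webpage_soup.
doc = read_html_content(url)
```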
{lexoid-0.1.10 → lexoid-0.1.11}/lexoid/core/utils.py

@@ -542,23 +546,37 @@ def has_hyperlink_in_pdf(path: str):
     )


-def router(path: str):
+def router(path: str, priority: str = "accuracy") -> str:
+    """
+    Routes the file path to the appropriate parser based on the file type.
+
+    Args:
+        path (str): The file path to route.
+        priority (str): The priority for routing: "accuracy" (preference to LLM_PARSE) or "speed" (preference to STATIC_PARSE).
+    """
     file_type = get_file_type(path)
     if file_type.startswith("text/"):
         return "STATIC_PARSE"
-…
-…
-…
-…
-…
-…
-…
-…
-…
-…
-    return "…
-…
-…
+
+    if priority == "accuracy":
+        # If the file is a PDF without images but has hyperlinks, use STATIC_PARSE
+        # Otherwise, use LLM_PARSE
+        if (
+            file_type == "application/pdf"
+            and not has_image_in_pdf(path)
+            and has_hyperlink_in_pdf(path)
+        ):
+            return "STATIC_PARSE"
+        return "LLM_PARSE"
+    else:
+        # If the file is a PDF without images, use STATIC_PARSE
+        # Otherwise, use LLM_PARSE
+        if (
+            file_type == "application/pdf"
+            and not has_image_in_pdf(path)
+        ):
+            return "STATIC_PARSE"
+        return "LLM_PARSE"

 def convert_doc_to_pdf(input_path: str, temp_dir: str) -> str:
     temp_path = os.path.join(
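`router` now accepts a `priority` argument: with the default `"accuracy"`, only image-free PDFs that contain hyperlinks are routed to the static parser, while `"speed"` routes any image-free PDF there. A hedged usage sketch (the PDF path is a placeholder; actual routing depends on the file's contents):

```python
from lexoid.core.utils import router

pdf_path = "docs/report.pdf"  # placeholder path

# Default priority="accuracy": prefers LLM_PARSE unless the PDF has
# no images but does contain hyperlinks.
print(router(pdf_path))                    # e.g. "LLM_PARSE"

# priority="speed": prefers STATIC_PARSE for any PDF without images.
print(router(pdf_path, priority="speed"))  # e.g. "STATIC_PARSE"
```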