PyPI - lexoid - Versions diffs - 0.1.14__tar.gz → 0.1.15__tar.gz - Mend

lexoid 0.1.14__tar.gz → 0.1.15__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

{lexoid-0.1.14 → lexoid-0.1.15}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.14
+Version: 0.1.15
 Summary:
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -8,6 +8,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Requires-Dist: anthropic (>=0.55.0,<0.56.0)
 Requires-Dist: bs4 (>=0.0.2,<0.0.3)
 Requires-Dist: docx2pdf (>=0.1.8,<0.2.0)
 Requires-Dist: google-generativeai (>=0.8.1,<0.9.0)
@@ -155,23 +156,23 @@ _Note:_ Benchmarks are currently done in the zero-shot setting.
 | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
 | --- | --- | --- | --- | --- | --- |
-| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
-| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
-| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
-| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
-| 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
-| 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
-| 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
-| 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
-| 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
-| 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
-| 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
-| 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
-| 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
-| 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
-| 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
-| 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
-| 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
-| 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
-| 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
+| 1 | AUTO | 0.906 | 0.112 | 9.56 | 0.00068 |
+| 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
+| 3 | gemini-2.5-flash | 0.895 | 0.148 | 54.10 | 0.01051 |
+| 4 | gemini-1.5-pro | 0.868 | 0.283 | 15.03 | 0.00637 |
+| 5 | gemini-1.5-flash | 0.864 | 0.194 | 15.47 | 0.00044 |
+| 6 | claude-3-5-sonnet-20241022 | 0.851 | 0.209 | 15.99 | 0.01758 |
+| 7 | gemini-2.5-pro | 0.849 | 0.298 | 101.95 | 0.01859 |
+| 8 | claude-sonnet-4-20250514 | 0.804 | 0.190 | 19.27 | 0.02071 |
+| 9 | claude-opus-4-20250514 | 0.772 | 0.238 | 20.03 | 0.09207 |
+| 10 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.768 | 0.234 | 12.12 | 0.00150 |
+| 11 | gpt-4o | 0.748 | 0.284 | 26.80 | 0.01478 |
+| 12 | gpt-4o-mini | 0.733 | 0.231 | 18.18 | 0.00650 |
+| 13 | gpt-4.1-mini | 0.723 | 0.269 | 20.91 | 0.00351 |
+| 14 | google/gemma-3-27b-it | 0.681 | 0.334 | 19.41 | 0.00027 |
+| 15 | gpt-4.1 | 0.650 | 0.342 | 33.72 | 0.01443 |
+| 16 | claude-3-7-sonnet-20250219 | 0.633 | 0.369 | 14.24 | 0.01763 |
+| 17 | microsoft/phi-4-multimodal-instruct | 0.622 | 0.320 | 13.15 | 0.00050 |
+| 18 | qwen/qwen-2.5-vl-7b-instruct | 0.559 | 0.348 | 17.71 | 0.00086 |
+| 19 | meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo | 0.546 | 0.239 | 29.26 | 0.01103 |

{lexoid-0.1.14 → lexoid-0.1.15}/README.md RENAMED Viewed

@@ -120,22 +120,22 @@ _Note:_ Benchmarks are currently done in the zero-shot setting.
 | Rank | Model | Mean Similarity | Std. Dev. | Time (s) | Cost ($) |
 | --- | --- | --- | --- | --- | --- |
-| 1 | gemini-2.0-flash | 0.829 | 0.102 | 7.41 | 0.00048 |
-| 2 | gemini-2.0-flash-001 | 0.814 | 0.176 | 6.85 | 0.000421 |
-| 3 | gemini-1.5-flash | 0.797 | 0.143 | 9.54 | 0.000238 |
-| 4 | gemini-2.0-pro-exp | 0.764 | 0.227 | 11.95 | TBA |
-| 5 | AUTO | 0.76 | 0.184 | 5.14 | 0.000217 |
-| 6 | gemini-2.0-flash-thinking-exp | 0.746 | 0.266 | 10.46 | TBA |
-| 7 | gemini-1.5-pro | 0.732 | 0.265 | 11.44 | 0.003332 |
-| 8 | accounts/fireworks/models/llama4-maverick-instruct-basic (via Fireworks) | 0.687 | 0.221 | 8.07 | 0.000419 |
-| 9 | gpt-4o | 0.687 | 0.247 | 10.16 | 0.004736 |
-| 10 | accounts/fireworks/models/llama4-scout-instruct-basic (via Fireworks) | 0.675 | 0.184 | 5.98 | 0.000226 |
-| 11 | gpt-4o-mini | 0.642 | 0.213 | 9.71 | 0.000275 |
-| 12 | gemma-3-27b-it (via OpenRouter) | 0.628 | 0.299 | 18.79 | 0.000096 |
-| 13 | gemini-1.5-flash-8b | 0.551 | 0.223 | 3.91 | 0.000055 |
-| 14 | Llama-Vision-Free (via Together AI) | 0.531 | 0.198 | 6.93 | 0 |
-| 15 | Llama-3.2-11B-Vision-Instruct-Turbo (via Together AI) | 0.524 | 0.192 | 3.68 | 0.00006 |
-| 16 | qwen/qwen-2.5-vl-7b-instruct (via OpenRouter) | 0.482 | 0.209 | 11.53 | 0.000052 |
-| 17 | Llama-3.2-90B-Vision-Instruct-Turbo (via Together AI) | 0.461 | 0.306 | 19.26 | 0.000426 |
-| 18 | Llama-3.2-11B-Vision-Instruct (via Hugging Face) | 0.451 | 0.257 | 4.54 | 0 |
-| 19 | microsoft/phi-4-multimodal-instruct (via OpenRouter) | 0.366 | 0.287 | 10.8 | 0.000019 |
+| 1 | AUTO | 0.906 | 0.112 | 9.56 | 0.00068 |
+| 2 | gemini-2.0-flash | 0.897 | 0.126 | 9.91 | 0.00078 |
+| 3 | gemini-2.5-flash | 0.895 | 0.148 | 54.10 | 0.01051 |
+| 4 | gemini-1.5-pro | 0.868 | 0.283 | 15.03 | 0.00637 |
+| 5 | gemini-1.5-flash | 0.864 | 0.194 | 15.47 | 0.00044 |
+| 6 | claude-3-5-sonnet-20241022 | 0.851 | 0.209 | 15.99 | 0.01758 |
+| 7 | gemini-2.5-pro | 0.849 | 0.298 | 101.95 | 0.01859 |
+| 8 | claude-sonnet-4-20250514 | 0.804 | 0.190 | 19.27 | 0.02071 |
+| 9 | claude-opus-4-20250514 | 0.772 | 0.238 | 20.03 | 0.09207 |
+| 10 | accounts/fireworks/models/llama4-maverick-instruct-basic | 0.768 | 0.234 | 12.12 | 0.00150 |
+| 11 | gpt-4o | 0.748 | 0.284 | 26.80 | 0.01478 |
+| 12 | gpt-4o-mini | 0.733 | 0.231 | 18.18 | 0.00650 |
+| 13 | gpt-4.1-mini | 0.723 | 0.269 | 20.91 | 0.00351 |
+| 14 | google/gemma-3-27b-it | 0.681 | 0.334 | 19.41 | 0.00027 |
+| 15 | gpt-4.1 | 0.650 | 0.342 | 33.72 | 0.01443 |
+| 16 | claude-3-7-sonnet-20250219 | 0.633 | 0.369 | 14.24 | 0.01763 |
+| 17 | microsoft/phi-4-multimodal-instruct | 0.622 | 0.320 | 13.15 | 0.00050 |
+| 18 | qwen/qwen-2.5-vl-7b-instruct | 0.559 | 0.348 | 17.71 | 0.00086 |
+| 19 | meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo | 0.546 | 0.239 | 29.26 | 0.01103 |

{lexoid-0.1.14 → lexoid-0.1.15}/lexoid/api.py RENAMED Viewed

@@ -4,9 +4,10 @@ import re
 import tempfile
 from concurrent.futures import ProcessPoolExecutor
 from enum import Enum
+from functools import wraps
 from glob import glob
 from time import time
-from typing import Union, Dict, List
+from typing import Optional, Union, Dict, List
 from loguru import logger
@@ -14,6 +15,7 @@ from lexoid.core.parse_type.llm_parser import (
     parse_llm_doc,
     create_response,
     convert_doc_to_base64_images,
+    get_api_provider_for_model,
 )
 from lexoid.core.parse_type.static_parser import parse_static_doc
 from lexoid.core.utils import (
@@ -35,6 +37,51 @@ class ParserType(Enum):
     AUTO = "AUTO"
+def retry_with_different_parser_type(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            if len(args) > 0:
+                kwargs["path"] = args[0]
+            if len(args) > 1:
+                router_priority = kwargs.get("router_priority", "speed")
+                if args[1] == ParserType.AUTO:
+                    parser_type = ParserType[router(kwargs["path"], router_priority)]
+                    logger.debug(f"Auto-detected parser type: {parser_type}")
+                    kwargs["routed"] = True
+                else:
+                    parser_type = args[1]
+                kwargs["parser_type"] = parser_type
+            return func(**kwargs)
+        except Exception as e:
+            if kwargs.get("parser_type") == ParserType.LLM_PARSE and kwargs.get(
+                "routed", False
+            ):
+                logger.warning(
+                    f"LLM_PARSE failed with error: {e}. Retrying with STATIC_PARSE."
+                )
+                kwargs["parser_type"] = ParserType.STATIC_PARSE
+                kwargs["routed"] = False
+                return func(**kwargs)
+            elif kwargs.get("parser_type") == ParserType.STATIC_PARSE and kwargs.get(
+                "routed", False
+            ):
+                logger.warning(
+                    f"STATIC_PARSE failed with error: {e}. Retrying with LLM_PARSE."
+                )
+                kwargs["parser_type"] = ParserType.LLM_PARSE
+                kwargs["routed"] = False
+                return func(**kwargs)
+            else:
+                logger.error(
+                    f"Parsing failed with error: {e}. No fallback parser available."
+                )
+                raise e
+    return wrapper
+@retry_with_different_parser_type
 def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
     """
     Parses a file using the specified parser type.
@@ -55,11 +102,6 @@ def parse_chunk(path: str, parser_type: ParserType, **kwargs) -> Dict:
             - token_usage: Dictionary containing token usage statistics
             - parser_used: Which parser was actually used
     """
-    if parser_type == ParserType.AUTO:
-        router_priority = kwargs.get("router_priority", "speed")
-        parser_type = ParserType[router(path, router_priority)]
-        logger.debug(f"Auto-detected parser type: {parser_type}")
     kwargs["start"] = (
         int(os.path.basename(path).split("_")[1]) - 1 if kwargs.get("split") else 0
     )
@@ -193,7 +235,7 @@ def parse(
             sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
             path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
-        if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
+        if not path.lower().endswith(".pdf"):
             kwargs["split"] = False
             result = parse_chunk_list([path], parser_type, kwargs)
         else:
@@ -300,7 +342,11 @@ def parse(
 def parse_with_schema(
-    path: str, schema: Dict, api: str = "openai", model: str = "gpt-4o-mini", **kwargs
+    path: str,
+    schema: Dict,
+    api: Optional[str] = None,
+    model: str = "gpt-4o-mini",
+    **kwargs,
 ) -> List[List[Dict]]:
     """
     Parses a PDF using an LLM to generate structured output conforming to a given JSON schema.
@@ -315,6 +361,10 @@ def parse_with_schema(
     Returns:
         List[List[Dict]]: List of dictionaries for each page, each conforming to the provided schema.
     """
+    if not api:
+        api = get_api_provider_for_model(model)
+        logger.debug(f"Using API provider: {api}")
     system_prompt = f"""
         The output should be formatted as a JSON instance that conforms to the JSON schema below.

{lexoid-0.1.14 → lexoid-0.1.15}/lexoid/core/parse_type/llm_parser.py RENAMED Viewed

@@ -8,6 +8,7 @@ from typing import Dict, List, Optional, Tuple
 import pypdfium2 as pdfium
 import requests
+from anthropic import Anthropic
 from huggingface_hub import InferenceClient
 from loguru import logger
 from openai import OpenAI
@@ -49,36 +50,41 @@ def retry_on_http_error(func):
     return wrapper
-@retry_on_http_error
-def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
-    if "api_provider" in kwargs and kwargs["api_provider"]:
-        return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
-    if "model" not in kwargs:
-        kwargs["model"] = "gemini-2.0-flash"
-    model = kwargs.get("model")
+def get_api_provider_for_model(model: str) -> str:
     if model.startswith("gemini"):
-        return parse_with_gemini(path, **kwargs)
+        return "gemini"
     if model.startswith("gpt"):
-        return parse_with_api(path, api="openai", **kwargs)
+        return "openai"
     if model.startswith("meta-llama"):
         if "Turbo" in model or model == "meta-llama/Llama-Vision-Free":
-            return parse_with_api(path, api="together", **kwargs)
-        return parse_with_api(path, api="huggingface", **kwargs)
+            return "together"
+        return "huggingface"
     if any(model.startswith(prefix) for prefix in ["microsoft", "google", "qwen"]):
-        return parse_with_api(path, api="openrouter", **kwargs)
+        return "openrouter"
     if model.startswith("accounts/fireworks"):
-        return parse_with_api(path, api="fireworks", **kwargs)
+        return "fireworks"
+    if model.startswith("claude"):
+        return "anthropic"
     raise ValueError(f"Unsupported model: {model}")
-def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
-    logger.debug(f"Parsing with Gemini API and model {kwargs['model']}")
-    api_key = os.environ.get("GOOGLE_API_KEY")
-    if not api_key:
-        raise ValueError("GOOGLE_API_KEY environment variable is not set")
+@retry_on_http_error
+def parse_llm_doc(path: str, **kwargs) -> List[Dict] | str:
+    if "api_provider" in kwargs and kwargs["api_provider"]:
+        return parse_with_api(path, api=kwargs["api_provider"], **kwargs)
-    url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}"
+    model = kwargs.get("model", "gemini-2.0-flash")
+    kwargs["model"] = model
+    api_provider = get_api_provider_for_model(model)
+    if api_provider == "gemini":
+        return parse_with_gemini(path, **kwargs)
+    else:
+        return parse_with_api(path, api=api_provider, **kwargs)
+def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
     # Check if the file is an image and convert to PDF if necessary
     mime_type, _ = mimetypes.guess_type(path)
     if mime_type and mime_type.startswith("image"):
@@ -90,6 +96,20 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
             file_content = file.read()
         base64_file = base64.b64encode(file_content).decode("utf-8")
+    return parse_image_with_gemini(
+        base64_file=base64_file, mime_type=mime_type, **kwargs
+    )
+def parse_image_with_gemini(
+    base64_file: str, mime_type: str = "image/png", **kwargs
+) -> List[Dict] | str:
+    api_key = os.environ.get("GOOGLE_API_KEY")
+    if not api_key:
+        raise ValueError("GOOGLE_API_KEY environment variable is not set")
+    url = f"https://generativelanguage.googleapis.com/v1beta/models/{kwargs['model']}:generateContent?key={api_key}"
     if "system_prompt" in kwargs:
         prompt = kwargs["system_prompt"]
     else:
@@ -109,7 +129,7 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
             }
         ],
         "generationConfig": {
-            "temperature": kwargs.get("temperature", 0.2),
+            "temperature": kwargs.get("temperature", 0),
         },
     }
@@ -129,24 +149,23 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
         if "text" in part
     )
-    combined_text = ""
+    combined_text = raw_text
     if "<output>" in raw_text:
         combined_text = raw_text.split("<output>")[-1].strip()
-    if "</output>" in result:
-        combined_text = result.split("</output>")[0].strip()
+    if "</output>" in combined_text:
+        combined_text = combined_text.split("</output>")[0].strip()
     token_usage = result["usageMetadata"]
     input_tokens = token_usage.get("promptTokenCount", 0)
     output_tokens = token_usage.get("candidatesTokenCount", 0)
     total_tokens = input_tokens + output_tokens
     return {
         "raw": combined_text.replace("<page-break>", "\n\n"),
         "segments": [
             {"metadata": {"page": kwargs.get("start", 0) + page_no}, "content": page}
             for page_no, page in enumerate(combined_text.split("<page-break>"), start=1)
         ],
-        "title": kwargs["title"],
+        "title": kwargs.get("title", ""),
         "url": kwargs.get("url", ""),
         "parent_title": kwargs.get("parent_title", ""),
         "recursive_docs": [],
@@ -218,7 +237,7 @@ def create_response(
     system_prompt: Optional[str] = None,
     user_prompt: Optional[str] = None,
     image_url: Optional[str] = None,
-    temperature: float = 0.2,
+    temperature: float = 0.0,
     max_tokens: int = 1024,
 ) -> Dict:
     # Initialize appropriate client
@@ -236,10 +255,64 @@ def create_response(
             base_url="https://api.fireworks.ai/inference/v1",
             api_key=os.environ["FIREWORKS_API_KEY"],
         ),
+        "anthropic": lambda: Anthropic(
+            api_key=os.environ["ANTHROPIC_API_KEY"],
+        ),
+        "gemini": lambda: None,  # Gemini is handled separately
     }
     assert api in clients, f"Unsupported API: {api}"
+    if api == "gemini":
+        image_url = image_url.split("data:image/png;base64,")[1]
+        response = parse_image_with_gemini(
+            base64_file=image_url,
+            model=model,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            system_prompt=system_prompt,
+        )
+        return {
+            "response": response["raw"],
+            "usage": response["token_usage"],
+        }
     client = clients[api]()
+    if api == "anthropic":
+        image_media_type = image_url.split(";")[0].split(":")[1]
+        image_data = image_url.split(",")[1]
+        response = client.messages.create(
+            model=model,
+            messages=[
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image",
+                            "source": {
+                                "type": "base64",
+                                "media_type": image_media_type,
+                                "data": image_data,
+                            },
+                        },
+                        {"type": "text", "text": user_prompt},
+                    ],
+                }
+            ],
+            max_tokens=max_tokens,
+            temperature=temperature,
+        )
+        return {
+            "response": response.content[0].text,
+            "usage": {
+                "input_tokens": response.usage.input_tokens,
+                "output_tokens": response.usage.output_tokens,
+                "total_tokens": response.usage.input_tokens
+                + response.usage.output_tokens,
+            },
+        }
     # Prepare messages for the API call
     messages = get_messages(system_prompt, user_prompt, image_url)
@@ -260,7 +333,11 @@ def create_response(
     return {
         "response": page_text,
-        "usage": token_usage,
+        "usage": {
+            "input_tokens": token_usage.prompt_tokens,
+            "output_tokens": token_usage.completion_tokens,
+            "total_tokens": token_usage.total_tokens,
+        },
     }
@@ -314,7 +391,7 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
             system_prompt=system_prompt,
             user_prompt=user_prompt,
             image_url=image_url,
-            temperature=kwargs.get("temperature", 0.2),
+            temperature=kwargs.get("temperature", 0.0),
             max_tokens=kwargs.get("max_tokens", 1024),
         )
@@ -335,9 +412,9 @@ def parse_with_api(path: str, api: str, **kwargs) -> List[Dict] | str:
             (
                 page_num,
                 result,
-                token_usage.prompt_tokens,
-                token_usage.completion_tokens,
-                token_usage.total_tokens,
+                token_usage["input_tokens"],
+                token_usage["output_tokens"],
+                token_usage["total_tokens"],
             )
         )

{lexoid-0.1.14 → lexoid-0.1.15}/lexoid/core/parse_type/static_parser.py RENAMED Viewed

@@ -1,12 +1,14 @@
 import os
 import re
 import tempfile
+from functools import wraps
 from time import time
 from typing import Dict, List
 import pandas as pd
 import pdfplumber
 from docx import Document
+from loguru import logger
 from pdfminer.high_level import extract_pages
 from pdfminer.layout import LTTextContainer
 from pdfplumber.utils import get_bbox_overlap, obj_to_bbox
@@ -22,6 +24,38 @@ from lexoid.core.utils import (
 )
+def retry_with_different_parser(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except Exception as e:
+            if "pdfplumber" in kwargs.get("framework", "pdfplumber") and not kwargs.get(
+                "routed", False
+            ):
+                kwargs["framework"] = "pdfminer"
+                logger.warning(
+                    f"Retrying with pdfminer due to error: {e}. Original framework: {kwargs['framework']}"
+                )
+                return func(*args, **kwargs)
+            elif "pdfminer" in kwargs.get("framework", "pdfplumber") and not kwargs.get(
+                "routed", False
+            ):
+                kwargs["framework"] = "pdfplumber"
+                logger.warning(
+                    f"Retrying with pdfplumber due to error: {e}. Original framework: {kwargs['framework']}"
+                )
+                return func(*args, **kwargs)
+            else:
+                logger.error(
+                    f"Failed to parse document with both pdfplumber and pdfminer: {e}"
+                )
+                raise e
+    return wrapper
+@retry_with_different_parser
 def parse_static_doc(path: str, **kwargs) -> Dict:
     """
     Parses a document using static parsing methods.

{lexoid-0.1.14 → lexoid-0.1.15}/lexoid/core/utils.py RENAMED Viewed

@@ -69,15 +69,45 @@ def convert_image_to_pdf(image_path: str) -> bytes:
 def remove_html_tags(text: str):
     html = markdown(text, extensions=["tables"])
-    return re.sub(HTML_TAG_PATTERN, "", html)
+    return re.sub(HTML_TAG_PATTERN, " ", html)
-def calculate_similarity(text1: str, text2: str, ignore_html=True) -> float:
+def clean_text(txt):
+    # Remove LaTeX commands (e.g. \command, \command[args]{args})
+    txt = re.sub(r"\\[a-zA-Z]+(\[[^\]]*\])?(\{[^}]*\})?", " ", txt)
+    # Replace all blocks of whitespace (including tabs and newlines) with a single space
+    txt = re.sub(r"\s+", " ", txt)
+    # Remove all non-alphanumeric characters except spaces
+    txt = re.sub(r"[^a-zA-Z0-9 ]", " ", txt)
+    return txt.strip()
+def calculate_similarity(
+    text1: str, text2: str, ignore_html: bool = True, diff_save_path: str = ""
+) -> float:
     """Calculate similarity ratio between two texts using SequenceMatcher."""
     if ignore_html:
         text1 = remove_html_tags(text1)
         text2 = remove_html_tags(text2)
-    return SequenceMatcher(None, text1, text2).ratio()
+    text1 = clean_text(clean_text(text1))
+    text2 = clean_text(clean_text(text2))
+    sm = SequenceMatcher(None, text1, text2)
+    # Save the diff and the texts for debugging
+    if diff_save_path:
+        with open(diff_save_path, "w") as f:
+            f.write(f"Text 1:\n{text1}\n\n")
+            f.write(f"Text 2:\n{text2}\n\n")
+            f.write("Differences:\n")
+            for tag, i1, i2, j1, j2 in sm.get_opcodes():
+                if tag == "equal":
+                    continue
+                f.write(f"{tag} {text1[i1:i2]} -> {text2[j1:j2]}\n")
+    return sm.ratio()
 def convert_pdf_page_to_image(

{lexoid-0.1.14 → lexoid-0.1.15}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "lexoid"
-version = "0.1.14"
+version = "0.1.15"
 description = ""
 authors = []
 readme = "README.md"
@@ -30,6 +30,7 @@ huggingface-hub = "^0.27.0"
 together = "^1.4.0"
 openpyxl = "^3.1.5"
 pptx2md = "^2.0.6"
+anthropic = "^0.55.0"
 [tool.poetry.group.dev.dependencies]
 ipykernel = "^6.29.5"
@@ -40,6 +41,7 @@ pytest = "^8.3.2"
 [tool.poetry.group.docs.dependencies]
 sphinx = "^8.1.3"
 pydata-sphinx-theme = "^0.16.1"
+docutils = "^0.21.2"
 [build-system]
 requires = ["poetry-core", "wheel"]

{lexoid-0.1.14 → lexoid-0.1.15}/LICENSE RENAMED Viewed

File without changes

{lexoid-0.1.14 → lexoid-0.1.15}/lexoid/core/prompt_templates.py RENAMED Viewed

File without changes

lexoid 0.1.14__tar.gz → 0.1.15__tar.gz

lexoid 0.1.14__tar.gz → 0.1.15__tar.gz