lexoid-0.1.7-py3-none-any.whl → lexoid-0.1.8.post1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/core/parse_type/llm_parser.py +114 -8
- lexoid/core/utils.py +37 -2
- {lexoid-0.1.7.dist-info → lexoid-0.1.8.post1.dist-info}/METADATA +15 -6
- lexoid-0.1.8.post1.dist-info/RECORD +9 -0
- lexoid-0.1.7.dist-info/RECORD +0 -9
- {lexoid-0.1.7.dist-info → lexoid-0.1.8.post1.dist-info}/LICENSE +0 -0
- {lexoid-0.1.7.dist-info → lexoid-0.1.8.post1.dist-info}/WHEEL +0 -0
lexoid/core/parse_type/llm_parser.py
CHANGED

@@ -2,10 +2,13 @@ import base64
 import io
 import mimetypes
 import os
-
-
+import time
 import pypdfium2 as pdfium
 import requests
+from functools import wraps
+from requests.exceptions import HTTPError
+from typing import Dict, List
+
 from lexoid.core.prompt_templates import (
     INSTRUCTIONS_ADD_PG_BREAK,
     OPENAI_USER_PROMPT,
@@ -16,9 +19,36 @@ from lexoid.core.utils import convert_image_to_pdf
 from loguru import logger
 from openai import OpenAI
 from huggingface_hub import InferenceClient
-from together import Together
 
 
+def retry_on_http_error(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        try:
+            return func(*args, **kwargs)
+        except HTTPError as e:
+            logger.error(f"HTTPError encountered: {e}. Retrying in 10 seconds...")
+            time.sleep(10)
+            try:
+                return func(*args, **kwargs)
+            except HTTPError as e:
+                logger.error(f"Retry failed: {e}")
+                if kwargs.get("raw", False):
+                    return ""
+                return [
+                    {
+                        "metadata": {
+                            "title": kwargs["title"],
+                            "page": kwargs.get("start", 0),
+                        },
+                        "content": "",
+                    }
+                ]
+
+    return wrapper
+
+
+@retry_on_http_error
 def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
     if "model" not in kwargs:
         kwargs["model"] = "gemini-1.5-flash"
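For reference, a minimal sketch of the fallback behavior this decorator adds. `flaky_parse` and its arguments are hypothetical; only `retry_on_http_error` comes from the module above.

```python
from requests.exceptions import HTTPError

from lexoid.core.parse_type.llm_parser import retry_on_http_error


@retry_on_http_error
def flaky_parse(path, **kwargs):
    # Hypothetical parser that always fails with an HTTP error.
    raise HTTPError("429 Too Many Requests")


# The wrapper logs the error, sleeps 10 s, retries once, and on the second
# failure returns an empty-document fallback instead of raising.
result = flaky_parse("doc.pdf", title="doc.pdf", raw=False)
# -> [{"metadata": {"title": "doc.pdf", "page": 0}, "content": ""}]
```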
@@ -29,7 +59,7 @@ def parse_llm_doc(path: str, raw: bool, **kwargs) -> List[Dict] | str:
         return parse_with_api(path, raw, api="openai", **kwargs)
     if model.startswith("meta-llama"):
         if model.endswith("Turbo") or model == "meta-llama/Llama-Vision-Free":
-            return
+            return parse_with_together(path, raw, **kwargs)
         return parse_with_api(path, raw, api="huggingface", **kwargs)
     raise ValueError(f"Unsupported model: {model}")
 
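A quick sketch of how the routing above plays out for the two meta-llama branches. The second model name is an illustrative placeholder, and the relevant API keys are assumed to be set.

```python
from lexoid.core.parse_type.llm_parser import parse_llm_doc

# Ends with "Turbo" (or is the free Vision model) -> parse_with_together
docs = parse_llm_doc("doc.pdf", False, title="doc.pdf",
                     model="meta-llama/Llama-Vision-Free")

# Any other meta-llama model -> parse_with_api(api="huggingface")
docs = parse_llm_doc("doc.pdf", False, title="doc.pdf",
                     model="meta-llama/Llama-3.2-11B-Vision-Instruct")
```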
@@ -107,7 +137,6 @@ def parse_with_gemini(path: str, raw: bool, **kwargs) -> List[Dict] | str:
             "content": page,
         }
         for page_no, page in enumerate(result.split("<page-break>"), start=1)
-        if page.strip()
     ]
 
 
@@ -126,6 +155,85 @@ def convert_pdf_page_to_base64(
     return base64.b64encode(img_byte_arr.getvalue()).decode("utf-8")
 
 
+def parse_with_together(path: str, raw: bool, **kwargs) -> List[Dict] | str:
+    api_key = os.environ.get("TOGETHER_API_KEY")
+    if not api_key:
+        raise ValueError("TOGETHER_API_KEY environment variable is not set")
+
+    url = "https://api.together.xyz/v1/chat/completions"
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "Content-Type": "application/json",
+    }
+
+    mime_type, _ = mimetypes.guess_type(path)
+    if mime_type and mime_type.startswith("image"):
+        with open(path, "rb") as img_file:
+            image_base64 = base64.b64encode(img_file.read()).decode("utf-8")
+        images = [(0, f"data:{mime_type};base64,{image_base64}")]
+    else:
+        pdf_document = pdfium.PdfDocument(path)
+        images = [
+            (
+                page_num,
+                f"data:image/png;base64,{convert_pdf_page_to_base64(pdf_document, page_num)}",
+            )
+            for page_num in range(len(pdf_document))
+        ]
+
+    all_results = []
+    for page_num, image_url in images:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": LLAMA_PARSER_PROMPT},
+                    {"type": "image_url", "image_url": {"url": image_url}},
+                ],
+            }
+        ]
+
+        payload = {
+            "model": kwargs["model"],
+            "messages": messages,
+            "max_tokens": kwargs.get("max_tokens", 1024),
+            "temperature": kwargs.get("temperature", 0.7),
+        }
+
+        response = requests.post(url, json=payload, headers=headers)
+        response.raise_for_status()
+        response_data = response.json()
+
+        page_text = response_data["choices"][0]["message"]["content"]
+        if kwargs.get("verbose", None):
+            logger.debug(f"Page {page_num + 1} response: {page_text}")
+
+        result = page_text
+        if "<output>" in page_text:
+            result = page_text.split("<output>")[1].strip()
+        if "</output>" in result:
+            result = result.split("</output>")[0].strip()
+        all_results.append((page_num, result))
+
+    all_results.sort(key=lambda x: x[0])
+    all_texts = [text for _, text in all_results]
+    combined_text = "<page-break>".join(all_texts)
+
+    if raw:
+        return combined_text
+
+    return [
+        {
+            "metadata": {
+                "title": kwargs["title"],
+                "page": kwargs.get("start", 0) + page_no,
+            },
+            "content": page,
+        }
+        for page_no, page in enumerate(all_texts, start=1)
+    ]
+
+
 def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str:
     """
     Parse documents (PDFs or images) using various vision model APIs.
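A minimal usage sketch for `parse_with_together`. The file and model names are placeholders; `title` is required on the structured path because the function reads `kwargs["title"]`, and TOGETHER_API_KEY must be exported beforehand.

```python
from lexoid.core.parse_type.llm_parser import parse_with_together

# Placeholder file and model name; TOGETHER_API_KEY must be set.
pages = parse_with_together(
    "invoice.pdf",
    raw=False,
    model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo",
    title="invoice.pdf",
)
for page in pages:
    print(page["metadata"]["page"], page["content"][:80])
```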
@@ -133,7 +241,7 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
     Args:
         path (str): Path to the document to parse
         raw (bool): If True, return raw text; if False, return structured data
-        api (str): Which API to use ("openai"
+        api (str): Which API to use ("openai" or "huggingface")
         **kwargs: Additional arguments including model, temperature, title, etc.
 
     Returns:
@@ -145,7 +253,6 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
         "huggingface": lambda: InferenceClient(
             token=os.environ["HUGGINGFACEHUB_API_TOKEN"]
         ),
-        "together": lambda: Together(),
     }
     assert api in clients, f"Unsupported API: {api}"
     logger.debug(f"Parsing with {api} API and model {kwargs['model']}")
@@ -253,5 +360,4 @@ def parse_with_api(path: str, raw: bool, api: str, **kwargs) -> List[Dict] | str
             "content": page,
         }
         for page_no, page in enumerate(all_texts, start=1)
-        if page.strip()
     ]
lexoid/core/utils.py
CHANGED

@@ -298,9 +298,44 @@ def read_html_content(url: str, raw: bool = False) -> Union[str, List[Dict]]:
 
     async def fetch_page():
         async with async_playwright() as p:
-            browser = await p.chromium.launch(
-
+            browser = await p.chromium.launch(
+                headless=True,
+                args=[
+                    "--disable-blink-features=AutomationControlled",
+                    "--no-sandbox",
+                    "--window-size=1920,1080",
+                ],
+            )
+            context = await browser.new_context(
+                viewport={"width": 1920, "height": 1080},
+                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+                bypass_csp=True,
+            )
+            page = await context.new_page()
+
+            # Add headers to appear more like a real browser
+            await page.set_extra_http_headers(
+                {
+                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+                    "Accept-Language": "en-US,en;q=0.5",
+                    "Sec-Fetch-Dest": "document",
+                    "Sec-Fetch-Mode": "navigate",
+                    "Sec-Fetch-Site": "none",
+                    "Sec-Fetch-User": "?1",
+                }
+            )
+
             await page.goto(url)
+
+            # Wait for Cloudflare check to complete
+            await page.wait_for_load_state("networkidle")
+
+            # Additional wait for any dynamic content
+            try:
+                await page.wait_for_selector("body", timeout=30000)
+            except:
+                pass
+
             html = await page.content()
             await browser.close()
             return html
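For reference, a minimal sketch of exercising this code path through the public helper. The URL is a placeholder, and Playwright plus its Chromium build are assumed to be installed.

```python
from lexoid.core.utils import read_html_content

# Placeholder URL; with raw=True the helper returns plain text, per its
# Union[str, List[Dict]] return annotation.
text = read_html_content("https://example.com", raw=True)
print(text[:200])
```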
{lexoid-0.1.7.dist-info → lexoid-0.1.8.post1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: lexoid
-Version: 0.1.7
+Version: 0.1.8.post1
 Summary: 
 Requires-Python: >=3.10,<4.0
 Classifier: Programming Language :: Python :: 3
@@ -28,16 +28,22 @@ Requires-Dist: pyqtwebengine (>=5.15.7,<6.0.0) ; platform_system != "debian"
 Requires-Dist: python-docx (>=1.1.2,<2.0.0)
 Requires-Dist: python-dotenv (>=1.0.0,<2.0.0)
 Requires-Dist: tabulate (>=0.9.0,<0.10.0)
-Requires-Dist: together (>=1.3.10,<2.0.0)
 Description-Content-Type: text/markdown
 
 # Lexoid
 
+[](https://colab.research.google.com/github/oidlabs-com/Lexoid/blob/main/examples/example_notebook_colab.ipynb)
+[](https://github.com/oidlabs-com/Lexoid/blob/main/LICENSE)
+[](https://pypi.org/project/lexoid/)
+[](https://oidlabs-com.github.io/Lexoid/)
+
 Lexoid is an efficient document parsing library that supports both LLM-based and non-LLM-based (static) PDF document parsing.
 
+[Documentation](https://oidlabs-com.github.io/Lexoid/)
+
 ## Motivation:
 - Use the multi-modal advancement of LLMs
-- Enable convenience for users
+- Enable convenience for users
 - Collaborate with a permissive license
 
 ## Installation
@@ -52,13 +58,12 @@ OPENAI_API_KEY=""
 GOOGLE_API_KEY=""
 ```
 
-Optionally, to use `Playwright` for retrieving web content
+Optionally, to use `Playwright` for retrieving web content (instead of the `requests` library):
 ```
 playwright install --with-deps --only-shell chromium
 ```
 
 ### Building `.whl` from source
-To create `.whl`:
 ```
 make build
 ```
@@ -80,6 +85,7 @@ source .venv/bin/activate
 
 ## Usage
 [Example Notebook](https://github.com/oidlabs-com/Lexoid/blob/main/examples/example_notebook.ipynb)
+
 [Example Colab Notebook](https://drive.google.com/file/d/1v9R6VOUp9CEGalgZGeg5G57XzHqh_tB6/view?usp=sharing)
 
 Here's a quick example to parse documents using Lexoid:
@@ -98,13 +104,16 @@ print(parsed_md)
 ### Parameters
 - path (str): The file path or URL.
 - parser_type (str, optional): The type of parser to use ("LLM_PARSE" or "STATIC_PARSE"). Defaults to "AUTO".
-- raw (bool, optional):
+- raw (bool, optional): Return raw text or structured data. Defaults to False.
 - pages_per_split (int, optional): Number of pages per split for chunking. Defaults to 4.
 - max_threads (int, optional): Maximum number of threads for parallel processing. Defaults to 4.
 - **kwargs: Additional arguments for the parser.
 
 ## Benchmark
 Initial results (_more updates soon_)
+
+_Note:_ Benchmarks are currently done in the zero-shot setting.
+
 | Rank | Model/Framework | Similarity | Time (s) |
 |------|-----------|------------|----------|
 | 1 | gpt-4o | 0.799 | 21.77|
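To make the parameter list above concrete, a short sketch. The file name is a placeholder, and the `parse` entry point is assumed to live in `lexoid.api` as listed in the RECORD below.

```python
from lexoid.api import parse

# Placeholder path; the keyword arguments mirror the documented parameters.
parsed_md = parse(
    "sample.pdf",
    parser_type="LLM_PARSE",
    raw=True,            # raw markdown text instead of structured output
    pages_per_split=4,
    max_threads=4,
)
print(parsed_md)
```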
lexoid-0.1.8.post1.dist-info/RECORD
ADDED

@@ -0,0 +1,9 @@
+lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
+lexoid/core/parse_type/llm_parser.py,sha256=JsrVALlK4h2j8URSgNIhdWPB6chWXrNrMlImtxVTyyU,11833
+lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
+lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
+lexoid/core/utils.py,sha256=peWuMVTk90-j0aSDaRnwigpoAz_Q5y8vSosCDc6Zl3g,18642
+lexoid-0.1.8.post1.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+lexoid-0.1.8.post1.dist-info/METADATA,sha256=mz8A_92-GrLfOmT8UYcIxWIEkcskad_9vSnNnlbE4dI,4625
+lexoid-0.1.8.post1.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+lexoid-0.1.8.post1.dist-info/RECORD,,
lexoid-0.1.7.dist-info/RECORD
DELETED

@@ -1,9 +0,0 @@
-lexoid/api.py,sha256=0we4A-U_nWMg57ZusthN27f1zXde2igi9jQujrjXthw,7120
-lexoid/core/parse_type/llm_parser.py,sha256=i_iidoP_qExGTScRPMBX5X3RnjIf6XqAS_NhLkz0_LM,8464
-lexoid/core/parse_type/static_parser.py,sha256=NlAE_WMMNvNnVo2aQrA9mN3fwJ6ZrshMC8S9kG0h8CA,13772
-lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
-lexoid/core/utils.py,sha256=rd8sf2OZqMv_oHGxM1redpSwU8f_sBJ-0tzlbp8U3_A,17193
-lexoid-0.1.7.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-lexoid-0.1.7.dist-info/METADATA,sha256=yOwsqpA5U-2Z2CXr5Cnrs2a6HtqY-4WryVfYDTI7X08,4092
-lexoid-0.1.7.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-lexoid-0.1.7.dist-info/RECORD,,
File without changes
|
File without changes
|