lexoid 0.1.9__tar.gz → 0.1.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lexoid
3
- Version: 0.1.9
3
+ Version: 0.1.10
4
4
  Summary:
5
5
  Requires-Python: >=3.10,<4.0
6
6
  Classifier: Programming Language :: Python :: 3
@@ -19,6 +19,7 @@ from lexoid.core.utils import (
19
19
  recursive_read_html,
20
20
  router,
21
21
  split_pdf,
22
+ create_sub_pdf,
22
23
  )
23
24
 
24
25
 
@@ -83,8 +84,9 @@ def parse_chunk_list(
83
84
  result = parse_chunk(file_path, parser_type, **kwargs)
84
85
  combined_segments.extend(result["segments"])
85
86
  raw_texts.append(result["raw"])
86
- token_usage["input"] += result["token_usage"]["input"]
87
- token_usage["output"] += result["token_usage"]["output"]
87
+ if "token_usage" in result:
88
+ token_usage["input"] += result["token_usage"]["input"]
89
+ token_usage["output"] += result["token_usage"]["output"]
88
90
  token_usage["total"] = token_usage["input"] + token_usage["output"]
89
91
 
90
92
  return {
@@ -163,6 +165,12 @@ def parse(
163
165
  pdf_path = os.path.join(temp_dir, "converted.pdf")
164
166
  path = convert_to_pdf(path, pdf_path)
165
167
 
168
+ if "page_nums" in kwargs and path.lower().endswith(".pdf"):
169
+ sub_pdf_dir = os.path.join(temp_dir, "sub_pdfs")
170
+ os.makedirs(sub_pdf_dir, exist_ok=True)
171
+ sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
172
+ path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
173
+
166
174
  if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
167
175
  kwargs["split"] = False
168
176
  result = parse_chunk(path, parser_type, **kwargs)
@@ -125,6 +125,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
125
125
  combined_text = result.split("</output>")[0].strip()
126
126
 
127
127
  token_usage = result["usageMetadata"]
128
+ input_tokens = token_usage.get("promptTokenCount", 0)
129
+ output_tokens = token_usage.get("candidatesTokenCount", 0)
130
+ total_tokens = input_tokens + output_tokens
128
131
 
129
132
  return {
130
133
  "raw": combined_text,
@@ -137,9 +140,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
137
140
  "parent_title": kwargs.get("parent_title", ""),
138
141
  "recursive_docs": [],
139
142
  "token_usage": {
140
- "input": token_usage["promptTokenCount"],
141
- "output": token_usage["candidatesTokenCount"],
142
- "total": token_usage["totalTokenCount"],
143
+ "input": input_tokens,
144
+ "output": output_tokens,
145
+ "total": total_tokens,
143
146
  },
144
147
  }
145
148
 
@@ -6,7 +6,7 @@ import re
6
6
  import sys
7
7
  from difflib import SequenceMatcher
8
8
  from hashlib import md5
9
- from typing import Dict, List
9
+ from typing import Dict, List, Optional
10
10
  from urllib.parse import urlparse
11
11
 
12
12
  import nest_asyncio
@@ -45,6 +45,20 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
45
45
  return paths
46
46
 
47
47
 
48
def create_sub_pdf(
    input_path: str,
    output_path: str,
    page_nums: Optional[tuple[int, ...] | int] = None,
) -> str:
    """Write a PDF containing a subset of the pages of *input_path*.

    Args:
        input_path: Path of the source PDF.
        output_path: Path where the sub-PDF is written.
        page_nums: 1-based page number(s) to keep (the extraction below uses
            ``pdf.pages[i - 1]``). A single int selects one page; a tuple
            selects several — duplicates are dropped and pages are emitted in
            ascending order. ``None`` (or an empty tuple) copies every page.

    Returns:
        The *output_path* that was written.
    """
    if isinstance(page_nums, int):
        page_nums = (page_nums,)
    if page_nums:
        # De-duplicate and sort so the output page order is deterministic.
        # Guarded: the original ran this unconditionally, so the documented
        # page_nums=None default crashed on set(None) before reaching the
        # "all pages" branch below.
        page_nums = tuple(sorted(set(page_nums)))
    with pikepdf.open(input_path) as pdf:
        # Default to ALL pages using 1-based numbers so the shared `i - 1`
        # indexing stays correct. The original default, range(len(pdf.pages)),
        # was 0-based: index 0 wrapped to the last page and every page came
        # out shifted by one.
        indices = page_nums if page_nums else range(1, len(pdf.pages) + 1)
        with pikepdf.new() as new_pdf:
            new_pdf.pages.extend([pdf.pages[i - 1] for i in indices])
            new_pdf.save(output_path)
    return output_path
60
+
61
+
48
62
  def convert_image_to_pdf(image_path: str) -> bytes:
49
63
  with Image.open(image_path) as img:
50
64
  img_rgb = img.convert("RGB")
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "lexoid"
3
- version = "0.1.9"
3
+ version = "0.1.10"
4
4
  description = ""
5
5
  authors = []
6
6
  readme = "README.md"
File without changes
File without changes