lexoid 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/api.py +10 -2
- lexoid/core/parse_type/llm_parser.py +6 -3
- lexoid/core/utils.py +15 -1
- {lexoid-0.1.9.dist-info → lexoid-0.1.10.dist-info}/METADATA +1 -1
- lexoid-0.1.10.dist-info/RECORD +9 -0
- lexoid-0.1.9.dist-info/RECORD +0 -9
- {lexoid-0.1.9.dist-info → lexoid-0.1.10.dist-info}/LICENSE +0 -0
- {lexoid-0.1.9.dist-info → lexoid-0.1.10.dist-info}/WHEEL +0 -0
lexoid/api.py
CHANGED
@@ -19,6 +19,7 @@ from lexoid.core.utils import (
|
|
19
19
|
recursive_read_html,
|
20
20
|
router,
|
21
21
|
split_pdf,
|
22
|
+
create_sub_pdf,
|
22
23
|
)
|
23
24
|
|
24
25
|
|
@@ -83,8 +84,9 @@ def parse_chunk_list(
|
|
83
84
|
result = parse_chunk(file_path, parser_type, **kwargs)
|
84
85
|
combined_segments.extend(result["segments"])
|
85
86
|
raw_texts.append(result["raw"])
|
86
|
-
token_usage
|
87
|
-
|
87
|
+
if "token_usage" in result:
|
88
|
+
token_usage["input"] += result["token_usage"]["input"]
|
89
|
+
token_usage["output"] += result["token_usage"]["output"]
|
88
90
|
token_usage["total"] = token_usage["input"] + token_usage["output"]
|
89
91
|
|
90
92
|
return {
|
@@ -163,6 +165,12 @@ def parse(
|
|
163
165
|
pdf_path = os.path.join(temp_dir, "converted.pdf")
|
164
166
|
path = convert_to_pdf(path, pdf_path)
|
165
167
|
|
168
|
+
if "page_nums" in kwargs and path.lower().endswith(".pdf"):
|
169
|
+
sub_pdf_dir = os.path.join(temp_dir, "sub_pdfs")
|
170
|
+
os.makedirs(sub_pdf_dir, exist_ok=True)
|
171
|
+
sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
|
172
|
+
path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
|
173
|
+
|
166
174
|
if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
|
167
175
|
kwargs["split"] = False
|
168
176
|
result = parse_chunk(path, parser_type, **kwargs)
|
@@ -125,6 +125,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
|
125
125
|
combined_text = result.split("</output>")[0].strip()
|
126
126
|
|
127
127
|
token_usage = result["usageMetadata"]
|
128
|
+
input_tokens = token_usage.get("promptTokenCount", 0)
|
129
|
+
output_tokens = token_usage.get("candidatesTokenCount", 0)
|
130
|
+
total_tokens = input_tokens + output_tokens
|
128
131
|
|
129
132
|
return {
|
130
133
|
"raw": combined_text,
|
@@ -137,9 +140,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
|
137
140
|
"parent_title": kwargs.get("parent_title", ""),
|
138
141
|
"recursive_docs": [],
|
139
142
|
"token_usage": {
|
140
|
-
"input":
|
141
|
-
"output":
|
142
|
-
"total":
|
143
|
+
"input": input_tokens,
|
144
|
+
"output": output_tokens,
|
145
|
+
"total": total_tokens,
|
143
146
|
},
|
144
147
|
}
|
145
148
|
|
lexoid/core/utils.py
CHANGED
@@ -6,7 +6,7 @@ import re
|
|
6
6
|
import sys
|
7
7
|
from difflib import SequenceMatcher
|
8
8
|
from hashlib import md5
|
9
|
-
from typing import Dict, List
|
9
|
+
from typing import Dict, List, Optional
|
10
10
|
from urllib.parse import urlparse
|
11
11
|
|
12
12
|
import nest_asyncio
|
@@ -45,6 +45,20 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
|
|
45
45
|
return paths
|
46
46
|
|
47
47
|
|
48
|
+
def create_sub_pdf(
    input_path: str,
    output_path: str,
    page_nums: Optional[tuple[int, ...] | int] = None,
) -> str:
    """Write a new PDF at *output_path* containing only the selected pages.

    Args:
        input_path: Path to the source PDF.
        output_path: Destination path for the generated sub-PDF.
        page_nums: 1-based page number(s) to keep. A single int selects one
            page; an iterable selects several (duplicates removed, output in
            ascending page order). None (or empty) keeps every page.

    Returns:
        output_path, so callers can chain on the resulting path.
    """
    if isinstance(page_nums, int):
        page_nums = (page_nums,)
    # BUG FIX: the original unconditionally did tuple(sorted(set(page_nums))),
    # which raises TypeError when page_nums is None (the documented default).
    selected: Optional[tuple[int, ...]] = (
        tuple(sorted(set(page_nums))) if page_nums else None
    )
    with pikepdf.open(input_path) as pdf:
        # BUG FIX: the original fallback was range(len(pdf.pages)) (0-based),
        # but pages are indexed below as pdf.pages[i - 1] (1-based input), so
        # the "all pages" case emitted the last page first and shifted every
        # other page. Use 1-based numbering consistently.
        indices = selected if selected else range(1, len(pdf.pages) + 1)
        with pikepdf.new() as new_pdf:
            # pdf.pages is 0-indexed; incoming page numbers are 1-based.
            new_pdf.pages.extend([pdf.pages[i - 1] for i in indices])
            new_pdf.save(output_path)
    return output_path
|
60
|
+
|
61
|
+
|
48
62
|
def convert_image_to_pdf(image_path: str) -> bytes:
|
49
63
|
with Image.open(image_path) as img:
|
50
64
|
img_rgb = img.convert("RGB")
|
@@ -0,0 +1,9 @@
|
|
1
|
+
lexoid/api.py,sha256=45nkTuQcxdppeUiRsiyioJtvlVeWeoq_WgKtGCthIBY,9193
|
2
|
+
lexoid/core/parse_type/llm_parser.py,sha256=tH19B0w78OowkDdqJg3rom0kQmyuTaTfDP98Qnwufo0,10625
|
3
|
+
lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
|
4
|
+
lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
|
5
|
+
lexoid/core/utils.py,sha256=HT37qmdhPpUNN6O571G7ItE5K2Mv8SreBHmxrhdiXA8,18951
|
6
|
+
lexoid-0.1.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
7
|
+
lexoid-0.1.10.dist-info/METADATA,sha256=4uhJ_IaHEKPl9lxKg8RRrBQ5dn7oB23XCnJNG5sNpH4,4576
|
8
|
+
lexoid-0.1.10.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
9
|
+
lexoid-0.1.10.dist-info/RECORD,,
|
lexoid-0.1.9.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
|
|
1
|
-
lexoid/api.py,sha256=EYyKwfdrjM94bslqTb7Db_wz0R2WioFPkJAqeDJJchY,8790
|
2
|
-
lexoid/core/parse_type/llm_parser.py,sha256=eu6zcl_uHVJ7-t506yfQT4jHpg2QGHV2CznS9X12lLQ,10515
|
3
|
-
lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
|
4
|
-
lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
|
5
|
-
lexoid/core/utils.py,sha256=coVab6fCSSDpIN39WLQ6ciZVRiIx3qTsqjn2EbTmMks,18428
|
6
|
-
lexoid-0.1.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
7
|
-
lexoid-0.1.9.dist-info/METADATA,sha256=EegftW7ka6fSzaEos97N2-JPjkpO3tt4wyuL9oha014,4575
|
8
|
-
lexoid-0.1.9.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
9
|
-
lexoid-0.1.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|