lexoid 0.1.9__tar.gz → 0.1.10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: lexoid
3
- Version: 0.1.9
3
+ Version: 0.1.10
4
4
  Summary:
5
5
  Requires-Python: >=3.10,<4.0
6
6
  Classifier: Programming Language :: Python :: 3
@@ -19,6 +19,7 @@ from lexoid.core.utils import (
19
19
  recursive_read_html,
20
20
  router,
21
21
  split_pdf,
22
+ create_sub_pdf,
22
23
  )
23
24
 
24
25
 
@@ -83,8 +84,9 @@ def parse_chunk_list(
83
84
  result = parse_chunk(file_path, parser_type, **kwargs)
84
85
  combined_segments.extend(result["segments"])
85
86
  raw_texts.append(result["raw"])
86
- token_usage["input"] += result["token_usage"]["input"]
87
- token_usage["output"] += result["token_usage"]["output"]
87
+ if "token_usage" in result:
88
+ token_usage["input"] += result["token_usage"]["input"]
89
+ token_usage["output"] += result["token_usage"]["output"]
88
90
  token_usage["total"] = token_usage["input"] + token_usage["output"]
89
91
 
90
92
  return {
@@ -163,6 +165,12 @@ def parse(
163
165
  pdf_path = os.path.join(temp_dir, "converted.pdf")
164
166
  path = convert_to_pdf(path, pdf_path)
165
167
 
168
+ if "page_nums" in kwargs and path.lower().endswith(".pdf"):
169
+ sub_pdf_dir = os.path.join(temp_dir, "sub_pdfs")
170
+ os.makedirs(sub_pdf_dir, exist_ok=True)
171
+ sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
172
+ path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
173
+
166
174
  if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
167
175
  kwargs["split"] = False
168
176
  result = parse_chunk(path, parser_type, **kwargs)
@@ -125,6 +125,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
125
125
  combined_text = result.split("</output>")[0].strip()
126
126
 
127
127
  token_usage = result["usageMetadata"]
128
+ input_tokens = token_usage.get("promptTokenCount", 0)
129
+ output_tokens = token_usage.get("candidatesTokenCount", 0)
130
+ total_tokens = input_tokens + output_tokens
128
131
 
129
132
  return {
130
133
  "raw": combined_text,
@@ -137,9 +140,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
137
140
  "parent_title": kwargs.get("parent_title", ""),
138
141
  "recursive_docs": [],
139
142
  "token_usage": {
140
- "input": token_usage["promptTokenCount"],
141
- "output": token_usage["candidatesTokenCount"],
142
- "total": token_usage["totalTokenCount"],
143
+ "input": input_tokens,
144
+ "output": output_tokens,
145
+ "total": total_tokens,
143
146
  },
144
147
  }
145
148
 
@@ -6,7 +6,7 @@ import re
6
6
  import sys
7
7
  from difflib import SequenceMatcher
8
8
  from hashlib import md5
9
- from typing import Dict, List
9
+ from typing import Dict, List, Optional
10
10
  from urllib.parse import urlparse
11
11
 
12
12
  import nest_asyncio
@@ -45,6 +45,20 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
45
45
  return paths
46
46
 
47
47
 
48
def create_sub_pdf(
    input_path: str,
    output_path: str,
    page_nums: Optional[tuple[int, ...] | int] = None,
) -> str:
    """Write a PDF containing a subset of the pages of *input_path*.

    Args:
        input_path: Path of the source PDF.
        output_path: Path where the sub-PDF is written.
        page_nums: 1-based page number(s) to keep (the extraction below uses
            ``pdf.pages[i - 1]``). A single int selects one page; a tuple
            selects several — duplicates are dropped and pages are emitted in
            ascending order. ``None`` (or an empty tuple) copies every page.

    Returns:
        The *output_path* that was written.
    """
    if isinstance(page_nums, int):
        page_nums = (page_nums,)
    if page_nums:
        # De-duplicate and sort so the output page order is deterministic.
        # Guarded: the original ran this unconditionally, so the documented
        # page_nums=None default crashed on set(None) before reaching the
        # "all pages" branch below.
        page_nums = tuple(sorted(set(page_nums)))
    with pikepdf.open(input_path) as pdf:
        # Default to ALL pages using 1-based numbers so the shared `i - 1`
        # indexing stays correct. The original default, range(len(pdf.pages)),
        # was 0-based: index 0 wrapped to the last page and every page came
        # out shifted by one.
        indices = page_nums if page_nums else range(1, len(pdf.pages) + 1)
        with pikepdf.new() as new_pdf:
            new_pdf.pages.extend([pdf.pages[i - 1] for i in indices])
            new_pdf.save(output_path)
    return output_path
60
+
61
+
48
62
  def convert_image_to_pdf(image_path: str) -> bytes:
49
63
  with Image.open(image_path) as img:
50
64
  img_rgb = img.convert("RGB")
@@ -1,6 +1,6 @@
1
1
  [tool.poetry]
2
2
  name = "lexoid"
3
- version = "0.1.9"
3
+ version = "0.1.10"
4
4
  description = ""
5
5
  authors = []
6
6
  readme = "README.md"
File without changes
File without changes