lexoid 0.1.9__py3-none-any.whl → 0.1.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lexoid/api.py +10 -2
- lexoid/core/parse_type/llm_parser.py +6 -3
- lexoid/core/utils.py +15 -1
- {lexoid-0.1.9.dist-info → lexoid-0.1.10.dist-info}/METADATA +1 -1
- lexoid-0.1.10.dist-info/RECORD +9 -0
- lexoid-0.1.9.dist-info/RECORD +0 -9
- {lexoid-0.1.9.dist-info → lexoid-0.1.10.dist-info}/LICENSE +0 -0
- {lexoid-0.1.9.dist-info → lexoid-0.1.10.dist-info}/WHEEL +0 -0
lexoid/api.py
CHANGED
@@ -19,6 +19,7 @@ from lexoid.core.utils import (
|
|
19
19
|
recursive_read_html,
|
20
20
|
router,
|
21
21
|
split_pdf,
|
22
|
+
create_sub_pdf,
|
22
23
|
)
|
23
24
|
|
24
25
|
|
@@ -83,8 +84,9 @@ def parse_chunk_list(
|
|
83
84
|
result = parse_chunk(file_path, parser_type, **kwargs)
|
84
85
|
combined_segments.extend(result["segments"])
|
85
86
|
raw_texts.append(result["raw"])
|
86
|
-
token_usage
|
87
|
-
|
87
|
+
if "token_usage" in result:
|
88
|
+
token_usage["input"] += result["token_usage"]["input"]
|
89
|
+
token_usage["output"] += result["token_usage"]["output"]
|
88
90
|
token_usage["total"] = token_usage["input"] + token_usage["output"]
|
89
91
|
|
90
92
|
return {
|
@@ -163,6 +165,12 @@ def parse(
|
|
163
165
|
pdf_path = os.path.join(temp_dir, "converted.pdf")
|
164
166
|
path = convert_to_pdf(path, pdf_path)
|
165
167
|
|
168
|
+
if "page_nums" in kwargs and path.lower().endswith(".pdf"):
|
169
|
+
sub_pdf_dir = os.path.join(temp_dir, "sub_pdfs")
|
170
|
+
os.makedirs(sub_pdf_dir, exist_ok=True)
|
171
|
+
sub_pdf_path = os.path.join(sub_pdf_dir, f"{os.path.basename(path)}")
|
172
|
+
path = create_sub_pdf(path, sub_pdf_path, kwargs["page_nums"])
|
173
|
+
|
166
174
|
if not path.lower().endswith(".pdf") or parser_type == ParserType.STATIC_PARSE:
|
167
175
|
kwargs["split"] = False
|
168
176
|
result = parse_chunk(path, parser_type, **kwargs)
|
@@ -125,6 +125,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
|
125
125
|
combined_text = result.split("</output>")[0].strip()
|
126
126
|
|
127
127
|
token_usage = result["usageMetadata"]
|
128
|
+
input_tokens = token_usage.get("promptTokenCount", 0)
|
129
|
+
output_tokens = token_usage.get("candidatesTokenCount", 0)
|
130
|
+
total_tokens = input_tokens + output_tokens
|
128
131
|
|
129
132
|
return {
|
130
133
|
"raw": combined_text,
|
@@ -137,9 +140,9 @@ def parse_with_gemini(path: str, **kwargs) -> List[Dict] | str:
|
|
137
140
|
"parent_title": kwargs.get("parent_title", ""),
|
138
141
|
"recursive_docs": [],
|
139
142
|
"token_usage": {
|
140
|
-
"input":
|
141
|
-
"output":
|
142
|
-
"total":
|
143
|
+
"input": input_tokens,
|
144
|
+
"output": output_tokens,
|
145
|
+
"total": total_tokens,
|
143
146
|
},
|
144
147
|
}
|
145
148
|
|
lexoid/core/utils.py
CHANGED
@@ -6,7 +6,7 @@ import re
|
|
6
6
|
import sys
|
7
7
|
from difflib import SequenceMatcher
|
8
8
|
from hashlib import md5
|
9
|
-
from typing import Dict, List
|
9
|
+
from typing import Dict, List, Optional
|
10
10
|
from urllib.parse import urlparse
|
11
11
|
|
12
12
|
import nest_asyncio
|
@@ -45,6 +45,20 @@ def split_pdf(input_path: str, output_dir: str, pages_per_split: int):
|
|
45
45
|
return paths
|
46
46
|
|
47
47
|
|
48
|
+
def create_sub_pdf(
    input_path: str,
    output_path: str,
    page_nums: Optional[tuple[int, ...] | int] = None,
) -> str:
    """Write a new PDF at *output_path* containing only the selected pages.

    Args:
        input_path: Path to the source PDF.
        output_path: Destination path for the generated sub-PDF.
        page_nums: 1-based page number(s) to keep. A single int selects one
            page; an iterable selects several (duplicates removed, output in
            ascending page order). None (or empty) keeps every page.

    Returns:
        output_path, so callers can chain on the resulting path.
    """
    if isinstance(page_nums, int):
        page_nums = (page_nums,)
    # BUG FIX: the original unconditionally did tuple(sorted(set(page_nums))),
    # which raises TypeError when page_nums is None (the documented default).
    selected: Optional[tuple[int, ...]] = (
        tuple(sorted(set(page_nums))) if page_nums else None
    )
    with pikepdf.open(input_path) as pdf:
        # BUG FIX: the original fallback was range(len(pdf.pages)) (0-based),
        # but pages are indexed below as pdf.pages[i - 1] (1-based input), so
        # the "all pages" case emitted the last page first and shifted every
        # other page. Use 1-based numbering consistently.
        indices = selected if selected else range(1, len(pdf.pages) + 1)
        with pikepdf.new() as new_pdf:
            # pdf.pages is 0-indexed; incoming page numbers are 1-based.
            new_pdf.pages.extend([pdf.pages[i - 1] for i in indices])
            new_pdf.save(output_path)
    return output_path
|
60
|
+
|
61
|
+
|
48
62
|
def convert_image_to_pdf(image_path: str) -> bytes:
|
49
63
|
with Image.open(image_path) as img:
|
50
64
|
img_rgb = img.convert("RGB")
|
@@ -0,0 +1,9 @@
|
|
1
|
+
lexoid/api.py,sha256=45nkTuQcxdppeUiRsiyioJtvlVeWeoq_WgKtGCthIBY,9193
|
2
|
+
lexoid/core/parse_type/llm_parser.py,sha256=tH19B0w78OowkDdqJg3rom0kQmyuTaTfDP98Qnwufo0,10625
|
3
|
+
lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
|
4
|
+
lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
|
5
|
+
lexoid/core/utils.py,sha256=HT37qmdhPpUNN6O571G7ItE5K2Mv8SreBHmxrhdiXA8,18951
|
6
|
+
lexoid-0.1.10.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
7
|
+
lexoid-0.1.10.dist-info/METADATA,sha256=4uhJ_IaHEKPl9lxKg8RRrBQ5dn7oB23XCnJNG5sNpH4,4576
|
8
|
+
lexoid-0.1.10.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
9
|
+
lexoid-0.1.10.dist-info/RECORD,,
|
lexoid-0.1.9.dist-info/RECORD
DELETED
@@ -1,9 +0,0 @@
|
|
1
|
-
lexoid/api.py,sha256=EYyKwfdrjM94bslqTb7Db_wz0R2WioFPkJAqeDJJchY,8790
|
2
|
-
lexoid/core/parse_type/llm_parser.py,sha256=eu6zcl_uHVJ7-t506yfQT4jHpg2QGHV2CznS9X12lLQ,10515
|
3
|
-
lexoid/core/parse_type/static_parser.py,sha256=j3khirFnXq2j3IFEu0TsYWA5sHMpe_oQLFM9Uv3hScM,14100
|
4
|
-
lexoid/core/prompt_templates.py,sha256=svSMH0yhm6ZjtOeTtUUEiCYi81ebVY9EZKPKP0Q921o,6311
|
5
|
-
lexoid/core/utils.py,sha256=coVab6fCSSDpIN39WLQ6ciZVRiIx3qTsqjn2EbTmMks,18428
|
6
|
-
lexoid-0.1.9.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
|
7
|
-
lexoid-0.1.9.dist-info/METADATA,sha256=EegftW7ka6fSzaEos97N2-JPjkpO3tt4wyuL9oha014,4575
|
8
|
-
lexoid-0.1.9.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
|
9
|
-
lexoid-0.1.9.dist-info/RECORD,,
|
File without changes
|
File without changes
|