knowhere-python-sdk 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowhere/__init__.py +8 -0
- knowhere/_exceptions.py +21 -3
- knowhere/_version.py +1 -1
- knowhere/lib/result_parser.py +32 -0
- knowhere/types/__init__.py +8 -0
- knowhere/types/result.py +100 -0
- {knowhere_python_sdk-0.2.0.dist-info → knowhere_python_sdk-0.2.1.dist-info}/METADATA +1 -1
- {knowhere_python_sdk-0.2.0.dist-info → knowhere_python_sdk-0.2.1.dist-info}/RECORD +9 -9
- {knowhere_python_sdk-0.2.0.dist-info → knowhere_python_sdk-0.2.1.dist-info}/WHEEL +0 -0
knowhere/__init__.py
CHANGED
|
@@ -46,6 +46,10 @@ from knowhere.types.result import (
|
|
|
46
46
|
ImageFileInfo,
|
|
47
47
|
Manifest,
|
|
48
48
|
ParseResult,
|
|
49
|
+
ProcessingCost,
|
|
50
|
+
ProcessingMetadata,
|
|
51
|
+
ProcessingTiming,
|
|
52
|
+
SlimChunk,
|
|
49
53
|
Statistics,
|
|
50
54
|
TableChunk,
|
|
51
55
|
TableFileInfo,
|
|
@@ -91,6 +95,10 @@ __all__: list[str] = [
|
|
|
91
95
|
"FileIndex",
|
|
92
96
|
"ImageFileInfo",
|
|
93
97
|
"TableFileInfo",
|
|
98
|
+
"ProcessingCost",
|
|
99
|
+
"ProcessingMetadata",
|
|
100
|
+
"ProcessingTiming",
|
|
101
|
+
"SlimChunk",
|
|
94
102
|
"BaseChunk",
|
|
95
103
|
"TextChunk",
|
|
96
104
|
"ImageChunk",
|
knowhere/_exceptions.py
CHANGED
|
@@ -387,11 +387,29 @@ def makeStatusError(
|
|
|
387
387
|
response=response,
|
|
388
388
|
)
|
|
389
389
|
|
|
390
|
-
if exception_class
|
|
391
|
-
return
|
|
390
|
+
if exception_class is RateLimitError:
|
|
391
|
+
return RateLimitError(
|
|
392
392
|
status_code,
|
|
393
393
|
**common_kwargs,
|
|
394
|
-
retry_after=retry_after,
|
|
394
|
+
retry_after=retry_after,
|
|
395
|
+
limit=limit,
|
|
396
|
+
period=period,
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
if exception_class is ServiceUnavailableError:
|
|
400
|
+
return ServiceUnavailableError(
|
|
401
|
+
status_code,
|
|
402
|
+
**common_kwargs,
|
|
403
|
+
retry_after=retry_after,
|
|
404
|
+
limit=limit,
|
|
405
|
+
period=period,
|
|
406
|
+
)
|
|
407
|
+
|
|
408
|
+
if exception_class is GatewayTimeoutError:
|
|
409
|
+
return GatewayTimeoutError(
|
|
410
|
+
status_code,
|
|
411
|
+
**common_kwargs,
|
|
412
|
+
retry_after=retry_after,
|
|
395
413
|
limit=limit,
|
|
396
414
|
period=period,
|
|
397
415
|
)
|
knowhere/_version.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "0.2.0" # x-release-please-version
|
|
1
|
+
__version__ = "0.2.1" # x-release-please-version
|
knowhere/lib/result_parser.py
CHANGED
|
@@ -16,6 +16,7 @@ from knowhere.types.result import (
|
|
|
16
16
|
ImageChunk,
|
|
17
17
|
Manifest,
|
|
18
18
|
ParseResult,
|
|
19
|
+
SlimChunk,
|
|
19
20
|
TableChunk,
|
|
20
21
|
TextChunk,
|
|
21
22
|
TextChunkTokens,
|
|
@@ -134,6 +135,7 @@ def _buildChunks(
|
|
|
134
135
|
type="image",
|
|
135
136
|
content=raw.get("content", ""),
|
|
136
137
|
path=raw.get("path"),
|
|
138
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
137
139
|
length=metadata.get("length", raw.get("length", 0)),
|
|
138
140
|
file_path=file_path,
|
|
139
141
|
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
@@ -151,6 +153,7 @@ def _buildChunks(
|
|
|
151
153
|
type="table",
|
|
152
154
|
content=raw.get("content", ""),
|
|
153
155
|
path=raw.get("path"),
|
|
156
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
154
157
|
length=metadata.get("length", raw.get("length", 0)),
|
|
155
158
|
file_path=file_path,
|
|
156
159
|
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
@@ -167,10 +170,12 @@ def _buildChunks(
|
|
|
167
170
|
type="text",
|
|
168
171
|
content=raw.get("content", ""),
|
|
169
172
|
path=raw.get("path"),
|
|
173
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
170
174
|
length=metadata.get("length", raw.get("length", 0)),
|
|
171
175
|
tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
|
|
172
176
|
keywords=metadata.get("keywords", raw.get("keywords")),
|
|
173
177
|
summary=metadata.get("summary", raw.get("summary")),
|
|
178
|
+
connect_to=metadata.get("connect_to", raw.get("connect_to")),
|
|
174
179
|
relationships=metadata.get("relationships", raw.get("relationships")),
|
|
175
180
|
)
|
|
176
181
|
|
|
@@ -230,12 +235,39 @@ def parseResultZip(
|
|
|
230
235
|
json.loads(hierarchy_text) if hierarchy_text else None
|
|
231
236
|
)
|
|
232
237
|
|
|
238
|
+
# -- Optimized sidecar files --
|
|
239
|
+
chunks_slim_text: Optional[str] = _readZipText(zf, "chunks_slim.json")
|
|
240
|
+
parsed_chunks_slim: Any = json.loads(chunks_slim_text) if chunks_slim_text else None
|
|
241
|
+
if isinstance(parsed_chunks_slim, dict) and "chunks" in parsed_chunks_slim:
|
|
242
|
+
raw_chunks_slim: List[Dict[str, Any]] = parsed_chunks_slim["chunks"]
|
|
243
|
+
elif isinstance(parsed_chunks_slim, list):
|
|
244
|
+
raw_chunks_slim = parsed_chunks_slim
|
|
245
|
+
else:
|
|
246
|
+
raw_chunks_slim = []
|
|
247
|
+
chunks_slim: Optional[List[SlimChunk]] = (
|
|
248
|
+
[SlimChunk.model_validate(chunk) for chunk in raw_chunks_slim]
|
|
249
|
+
if chunks_slim_text is not None
|
|
250
|
+
else None
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
toc_hierarchies_text: Optional[str] = _readZipText(zf, "toc_hierarchies.json")
|
|
254
|
+
toc_hierarchies: Optional[Any] = (
|
|
255
|
+
json.loads(toc_hierarchies_text) if toc_hierarchies_text else None
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
kb_csv: Optional[str] = _readZipText(zf, "kb.csv")
|
|
259
|
+
hierarchy_view_html: Optional[str] = _readZipText(zf, "hierarchy_view.html")
|
|
260
|
+
|
|
233
261
|
zf.close()
|
|
234
262
|
|
|
235
263
|
return ParseResult(
|
|
236
264
|
manifest=manifest,
|
|
237
265
|
chunks=chunks,
|
|
266
|
+
chunks_slim=chunks_slim,
|
|
238
267
|
full_markdown=full_markdown,
|
|
239
268
|
hierarchy=hierarchy,
|
|
269
|
+
toc_hierarchies=toc_hierarchies,
|
|
270
|
+
kb_csv=kb_csv,
|
|
271
|
+
hierarchy_view_html=hierarchy_view_html,
|
|
240
272
|
raw_zip=zip_bytes,
|
|
241
273
|
)
|
knowhere/types/__init__.py
CHANGED
|
@@ -13,6 +13,10 @@ from knowhere.types.result import (
|
|
|
13
13
|
ImageFileInfo,
|
|
14
14
|
Manifest,
|
|
15
15
|
ParseResult,
|
|
16
|
+
ProcessingCost,
|
|
17
|
+
ProcessingMetadata,
|
|
18
|
+
ProcessingTiming,
|
|
19
|
+
SlimChunk,
|
|
16
20
|
Statistics,
|
|
17
21
|
TableChunk,
|
|
18
22
|
TableFileInfo,
|
|
@@ -36,6 +40,10 @@ __all__: list[str] = [
|
|
|
36
40
|
"ImageFileInfo",
|
|
37
41
|
"Manifest",
|
|
38
42
|
"ParseResult",
|
|
43
|
+
"ProcessingCost",
|
|
44
|
+
"ProcessingMetadata",
|
|
45
|
+
"ProcessingTiming",
|
|
46
|
+
"SlimChunk",
|
|
39
47
|
"Statistics",
|
|
40
48
|
"TableChunk",
|
|
41
49
|
"TableFileInfo",
|
knowhere/types/result.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import os
|
|
6
|
+
import json
|
|
6
7
|
import re
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
from typing import Any, Dict, List, Optional, Union
|
|
@@ -92,12 +93,39 @@ class FileIndex(BaseModel):
|
|
|
92
93
|
|
|
93
94
|
chunks: Optional[str] = None
|
|
94
95
|
markdown: Optional[str] = None
|
|
96
|
+
chunks_slim: Optional[str] = None
|
|
95
97
|
kb_csv: Optional[str] = None
|
|
96
98
|
hierarchy: Optional[str] = None
|
|
99
|
+
toc_hierarchies: Optional[str] = None
|
|
100
|
+
hierarchy_view_html: Optional[str] = None
|
|
97
101
|
images: List[ImageFileInfo] = Field(default_factory=list)
|
|
98
102
|
tables: List[TableFileInfo] = Field(default_factory=list)
|
|
99
103
|
|
|
100
104
|
|
|
105
|
+
class ProcessingCost(BaseModel):
|
|
106
|
+
"""Billing details emitted by manifest v2."""
|
|
107
|
+
|
|
108
|
+
micro_dollars: Optional[int] = None
|
|
109
|
+
credits: Optional[float] = None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class ProcessingTiming(BaseModel):
|
|
113
|
+
"""Timing details emitted by manifest v2."""
|
|
114
|
+
|
|
115
|
+
started_at: Optional[str] = None
|
|
116
|
+
completed_at: Optional[str] = None
|
|
117
|
+
duration_ms: Optional[int] = None
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class ProcessingMetadata(BaseModel):
|
|
121
|
+
"""Worker-side processing metadata emitted by manifest v2."""
|
|
122
|
+
|
|
123
|
+
page_count: Optional[int] = None
|
|
124
|
+
billing_status: Optional[str] = None
|
|
125
|
+
cost: Optional[ProcessingCost] = None
|
|
126
|
+
timing: Optional[ProcessingTiming] = None
|
|
127
|
+
|
|
128
|
+
|
|
101
129
|
class Manifest(BaseModel):
|
|
102
130
|
"""Top-level manifest describing the result ZIP contents."""
|
|
103
131
|
|
|
@@ -106,6 +134,7 @@ class Manifest(BaseModel):
|
|
|
106
134
|
data_id: Optional[str] = None
|
|
107
135
|
source_file_name: Optional[str] = None
|
|
108
136
|
processing_date: Optional[str] = None
|
|
137
|
+
processing: Optional[ProcessingMetadata] = None
|
|
109
138
|
checksum: Optional[Checksum] = None
|
|
110
139
|
statistics: Optional[Statistics] = None
|
|
111
140
|
files: Optional[FileIndex] = None
|
|
@@ -123,6 +152,7 @@ class BaseChunk(BaseModel):
|
|
|
123
152
|
type: str
|
|
124
153
|
content: str = ""
|
|
125
154
|
path: Optional[str] = None
|
|
155
|
+
page_nums: Optional[List[int]] = None
|
|
126
156
|
|
|
127
157
|
|
|
128
158
|
TextChunkTokens: TypeAlias = List[str]
|
|
@@ -136,6 +166,7 @@ class TextChunk(BaseChunk):
|
|
|
136
166
|
tokens: Optional[TextChunkTokens] = None
|
|
137
167
|
keywords: Optional[List[str]] = None
|
|
138
168
|
summary: Optional[str] = None
|
|
169
|
+
connect_to: Optional[List[Dict[str, Any]]] = None
|
|
139
170
|
relationships: Optional[List[Union[Dict[str, Any], str]]] = None
|
|
140
171
|
|
|
141
172
|
|
|
@@ -210,6 +241,15 @@ class TableChunk(BaseChunk):
|
|
|
210
241
|
Chunk = Union[TextChunk, ImageChunk, TableChunk]
|
|
211
242
|
|
|
212
243
|
|
|
244
|
+
class SlimChunk(BaseModel):
|
|
245
|
+
"""Minimal chunk entry emitted in chunks_slim.json."""
|
|
246
|
+
|
|
247
|
+
type: str
|
|
248
|
+
path: Optional[str] = None
|
|
249
|
+
content: str = ""
|
|
250
|
+
summary: Optional[str] = None
|
|
251
|
+
|
|
252
|
+
|
|
213
253
|
# ---------------------------------------------------------------------------
|
|
214
254
|
# ParseResult — the top-level object returned to the user
|
|
215
255
|
# ---------------------------------------------------------------------------
|
|
@@ -225,8 +265,12 @@ class ParseResult:
|
|
|
225
265
|
|
|
226
266
|
manifest: Manifest
|
|
227
267
|
chunks: List[Chunk]
|
|
268
|
+
chunks_slim: Optional[List[SlimChunk]]
|
|
228
269
|
full_markdown: str
|
|
229
270
|
hierarchy: Optional[Any]
|
|
271
|
+
toc_hierarchies: Optional[Any]
|
|
272
|
+
kb_csv: Optional[str]
|
|
273
|
+
hierarchy_view_html: Optional[str]
|
|
230
274
|
raw_zip: bytes
|
|
231
275
|
|
|
232
276
|
def __init__(
|
|
@@ -234,14 +278,22 @@ class ParseResult:
|
|
|
234
278
|
*,
|
|
235
279
|
manifest: Manifest,
|
|
236
280
|
chunks: List[Chunk],
|
|
281
|
+
chunks_slim: Optional[List[SlimChunk]],
|
|
237
282
|
full_markdown: str,
|
|
238
283
|
hierarchy: Optional[Any],
|
|
284
|
+
toc_hierarchies: Optional[Any],
|
|
285
|
+
kb_csv: Optional[str],
|
|
286
|
+
hierarchy_view_html: Optional[str],
|
|
239
287
|
raw_zip: bytes,
|
|
240
288
|
) -> None:
|
|
241
289
|
self.manifest = manifest
|
|
242
290
|
self.chunks = chunks
|
|
291
|
+
self.chunks_slim = chunks_slim
|
|
243
292
|
self.full_markdown = full_markdown
|
|
244
293
|
self.hierarchy = hierarchy
|
|
294
|
+
self.toc_hierarchies = toc_hierarchies
|
|
295
|
+
self.kb_csv = kb_csv
|
|
296
|
+
self.hierarchy_view_html = hierarchy_view_html
|
|
245
297
|
self.raw_zip = raw_zip
|
|
246
298
|
|
|
247
299
|
# -- convenience properties --
|
|
@@ -296,10 +348,58 @@ class ParseResult:
|
|
|
296
348
|
dir_path: Path = Path(directory)
|
|
297
349
|
dir_path.mkdir(parents=True, exist_ok=True)
|
|
298
350
|
|
|
351
|
+
# Manifest / chunks
|
|
352
|
+
manifest_path: Path = dir_path / "manifest.json"
|
|
353
|
+
manifest_path.write_text(
|
|
354
|
+
self.manifest.model_dump_json(indent=2),
|
|
355
|
+
encoding="utf-8",
|
|
356
|
+
)
|
|
357
|
+
|
|
358
|
+
chunks_path: Path = dir_path / "chunks.json"
|
|
359
|
+
chunks_path.write_text(
|
|
360
|
+
json.dumps([chunk.model_dump() for chunk in self.chunks], indent=2),
|
|
361
|
+
encoding="utf-8",
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
if self.chunks_slim is not None:
|
|
365
|
+
chunks_slim_path: Path = dir_path / "chunks_slim.json"
|
|
366
|
+
chunks_slim_path.write_text(
|
|
367
|
+
json.dumps(
|
|
368
|
+
{"chunks": [chunk.model_dump() for chunk in self.chunks_slim]},
|
|
369
|
+
indent=2,
|
|
370
|
+
),
|
|
371
|
+
encoding="utf-8",
|
|
372
|
+
)
|
|
373
|
+
|
|
299
374
|
# Full markdown
|
|
300
375
|
md_path: Path = dir_path / "full.md"
|
|
301
376
|
md_path.write_text(self.full_markdown, encoding="utf-8")
|
|
302
377
|
|
|
378
|
+
if self.hierarchy is not None:
|
|
379
|
+
hierarchy_path: Path = dir_path / "hierarchy.json"
|
|
380
|
+
hierarchy_path.write_text(
|
|
381
|
+
json.dumps(self.hierarchy, indent=2),
|
|
382
|
+
encoding="utf-8",
|
|
383
|
+
)
|
|
384
|
+
|
|
385
|
+
if self.toc_hierarchies is not None:
|
|
386
|
+
toc_hierarchies_path: Path = dir_path / "toc_hierarchies.json"
|
|
387
|
+
toc_hierarchies_path.write_text(
|
|
388
|
+
json.dumps(self.toc_hierarchies, indent=2),
|
|
389
|
+
encoding="utf-8",
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
if self.kb_csv is not None:
|
|
393
|
+
kb_csv_path: Path = dir_path / "kb.csv"
|
|
394
|
+
kb_csv_path.write_text(self.kb_csv, encoding="utf-8")
|
|
395
|
+
|
|
396
|
+
if self.hierarchy_view_html is not None:
|
|
397
|
+
hierarchy_view_path: Path = dir_path / "hierarchy_view.html"
|
|
398
|
+
hierarchy_view_path.write_text(
|
|
399
|
+
self.hierarchy_view_html,
|
|
400
|
+
encoding="utf-8",
|
|
401
|
+
)
|
|
402
|
+
|
|
303
403
|
# Images
|
|
304
404
|
if self.image_chunks:
|
|
305
405
|
images_dir: Path = dir_path / "images"
|
|
@@ -1,25 +1,25 @@
|
|
|
1
|
-
knowhere/__init__.py,sha256=
|
|
1
|
+
knowhere/__init__.py,sha256=NFNOUllG-7TZ-NVx7_g1vUPv15zQp1lvAXjb0BQotB4,2513
|
|
2
2
|
knowhere/_base_client.py,sha256=ddeRR1lWLhes5ipvYX6-TMEecjjiEBGfQdPw_vnSNqA,17978
|
|
3
3
|
knowhere/_client.py,sha256=MGU1QsyjKrzTiitm891wgNCq6JLf3DR7y7zhkil_p2E,8027
|
|
4
4
|
knowhere/_constants.py,sha256=ZNCFQC00NpUZIyc_XZ0uemjJE-E8uKAbv3BDa3po9cg,885
|
|
5
|
-
knowhere/_exceptions.py,sha256=
|
|
5
|
+
knowhere/_exceptions.py,sha256=NflH7phh_bNFOJmQ758V4mZCAFQskpGXACMz2JIfFNU,11896
|
|
6
6
|
knowhere/_logging.py,sha256=tNqEA1dLv-adTT6qRq5RBeO35FoWrnS3gwt7gKChLTA,1376
|
|
7
7
|
knowhere/_response.py,sha256=EsrM794qxCykvl82UkszeqjJzm9_OSq7nsyzaSCnx0I,1415
|
|
8
8
|
knowhere/_types.py,sha256=8-JFaRcxgBJbw2mV9BwnmCktFVph41a1mduwtXlYidI,1775
|
|
9
|
-
knowhere/_version.py,sha256=
|
|
9
|
+
knowhere/_version.py,sha256=5IhDnbb-SxjydsfhOSqft_BBCgSQNKdMjw7ElLASiGo,50
|
|
10
10
|
knowhere/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
11
11
|
knowhere/lib/__init__.py,sha256=e953V5ny3VmDtCw7y_4uPwdTkwwNpe_Y6o4AEgz3ujw,50
|
|
12
12
|
knowhere/lib/polling.py,sha256=s0EPHozAvNhXLqr5uwU8YXkkwAdF0ji_nIN0QfR6avY,4500
|
|
13
|
-
knowhere/lib/result_parser.py,sha256=
|
|
13
|
+
knowhere/lib/result_parser.py,sha256=dR3knoMq-AFMAe0M3l0YgOM-OrtSmofSLaKZO0tgYao,9882
|
|
14
14
|
knowhere/lib/upload.py,sha256=eT-O9_wB2WkWUAsUd7VzaKY6DVfNeA6WMHRdwm0HM0o,7849
|
|
15
15
|
knowhere/resources/__init__.py,sha256=_x391t8qxwkGbOmbkzcp7rR10Q8uoDLQaAkZxCq_oM8,170
|
|
16
16
|
knowhere/resources/_base.py,sha256=tgKphNTsgMhktWp6_rhyVOZyee4CYlDmD5O1_jWVvYo,1829
|
|
17
17
|
knowhere/resources/jobs.py,sha256=45P4rZ9HMnTdgcso2AwQ6lDA9U80HGsgOU0jZLBIMFU,8460
|
|
18
|
-
knowhere/types/__init__.py,sha256=
|
|
18
|
+
knowhere/types/__init__.py,sha256=2Qp2bIY7CyVieBdSfQnowyKG-ErMI3wF37-neBdwTBU,961
|
|
19
19
|
knowhere/types/job.py,sha256=8shCqvgzKKkEPOpEHdk7CnDbPQiDzy3wEd5Jngw94ZM,2362
|
|
20
20
|
knowhere/types/params.py,sha256=7DyBd4xMxtLPch-A1130-gI0ajKOv2G5tbSMkE8n6-E,543
|
|
21
|
-
knowhere/types/result.py,sha256=
|
|
21
|
+
knowhere/types/result.py,sha256=UmoxaFmxt2bhrP-2O6jYL89C2WuwZh2xcyyHl46Q1_Y,12925
|
|
22
22
|
knowhere/types/shared.py,sha256=K5ezX212othxgCviiE2WnwWFY2MS08pXKJ8Km1ZWmjc,104
|
|
23
|
-
knowhere_python_sdk-0.2.
|
|
24
|
-
knowhere_python_sdk-0.2.
|
|
25
|
-
knowhere_python_sdk-0.2.
|
|
23
|
+
knowhere_python_sdk-0.2.1.dist-info/METADATA,sha256=xf35vXtOtg7ubZWh4QNrqcjTpERpJO8kYuPXKmcmz_w,6115
|
|
24
|
+
knowhere_python_sdk-0.2.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
25
|
+
knowhere_python_sdk-0.2.1.dist-info/RECORD,,
|
|
File without changes
|