knowhere-python-sdk 0.2.0__tar.gz → 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- knowhere_python_sdk-0.2.1/.release-please-manifest.json +3 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/CHANGELOG.md +10 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/PKG-INFO +1 -1
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/pyproject.toml +1 -1
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/__init__.py +8 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_exceptions.py +21 -3
- knowhere_python_sdk-0.2.1/src/knowhere/_version.py +1 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/lib/result_parser.py +32 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/types/__init__.py +8 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/types/result.py +100 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_models.py +57 -2
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_result_parser.py +195 -0
- knowhere_python_sdk-0.2.0/.release-please-manifest.json +0 -3
- knowhere_python_sdk-0.2.0/src/knowhere/_version.py +0 -1
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/.github/workflows/ci.yml +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/.github/workflows/publish-pypi.yml +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/.github/workflows/publish.yml +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/.gitignore +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/README.md +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/docs/usage.md +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/examples/async_usage.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/examples/error_handling.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/examples/parse_file.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/examples/parse_url.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/examples/step_by_step.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/release-please-config.json +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_base_client.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_client.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_constants.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_logging.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_response.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_types.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/lib/__init__.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/lib/polling.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/lib/upload.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/py.typed +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/resources/__init__.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/resources/_base.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/resources/jobs.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/types/job.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/types/params.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/types/shared.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/__init__.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/conftest.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/fixtures/real_result.zip +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_client.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_exceptions.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_jobs.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_logging.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_parse.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_polling.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_retry.py +0 -0
- {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_upload.py +0 -0
|
@@ -1,5 +1,15 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.2.1](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.2.0...v0.2.1) (2026-04-09)
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
### Bug Fixes
|
|
7
|
+
|
|
8
|
+
* narrow status error constructors ([c8fc035](https://github.com/Ontos-AI/knowhere-python-sdk/commit/c8fc035dade768c5364e50de890bde0fb280586e))
|
|
9
|
+
* remove stale mypy ignore ([150336a](https://github.com/Ontos-AI/knowhere-python-sdk/commit/150336a5dc0497b287437dffa6e1506f4bcf8fbf))
|
|
10
|
+
* sync optimized parse result payload ([a7903ad](https://github.com/Ontos-AI/knowhere-python-sdk/commit/a7903ad53fb5ab142c5835134c9a942eb5cdfe21))
|
|
11
|
+
* sync parse result payload with current API schema ([430b067](https://github.com/Ontos-AI/knowhere-python-sdk/commit/430b067b37ce0b2eb8bd3c81cfca56b1df657376))
|
|
12
|
+
|
|
3
13
|
## [0.2.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.1.0...v0.2.0) (2026-03-18)
|
|
4
14
|
|
|
5
15
|
|
|
@@ -46,6 +46,10 @@ from knowhere.types.result import (
|
|
|
46
46
|
ImageFileInfo,
|
|
47
47
|
Manifest,
|
|
48
48
|
ParseResult,
|
|
49
|
+
ProcessingCost,
|
|
50
|
+
ProcessingMetadata,
|
|
51
|
+
ProcessingTiming,
|
|
52
|
+
SlimChunk,
|
|
49
53
|
Statistics,
|
|
50
54
|
TableChunk,
|
|
51
55
|
TableFileInfo,
|
|
@@ -91,6 +95,10 @@ __all__: list[str] = [
|
|
|
91
95
|
"FileIndex",
|
|
92
96
|
"ImageFileInfo",
|
|
93
97
|
"TableFileInfo",
|
|
98
|
+
"ProcessingCost",
|
|
99
|
+
"ProcessingMetadata",
|
|
100
|
+
"ProcessingTiming",
|
|
101
|
+
"SlimChunk",
|
|
94
102
|
"BaseChunk",
|
|
95
103
|
"TextChunk",
|
|
96
104
|
"ImageChunk",
|
|
@@ -387,11 +387,29 @@ def makeStatusError(
|
|
|
387
387
|
response=response,
|
|
388
388
|
)
|
|
389
389
|
|
|
390
|
-
if exception_class
|
|
391
|
-
return
|
|
390
|
+
if exception_class is RateLimitError:
|
|
391
|
+
return RateLimitError(
|
|
392
392
|
status_code,
|
|
393
393
|
**common_kwargs,
|
|
394
|
-
retry_after=retry_after,
|
|
394
|
+
retry_after=retry_after,
|
|
395
|
+
limit=limit,
|
|
396
|
+
period=period,
|
|
397
|
+
)
|
|
398
|
+
|
|
399
|
+
if exception_class is ServiceUnavailableError:
|
|
400
|
+
return ServiceUnavailableError(
|
|
401
|
+
status_code,
|
|
402
|
+
**common_kwargs,
|
|
403
|
+
retry_after=retry_after,
|
|
404
|
+
limit=limit,
|
|
405
|
+
period=period,
|
|
406
|
+
)
|
|
407
|
+
|
|
408
|
+
if exception_class is GatewayTimeoutError:
|
|
409
|
+
return GatewayTimeoutError(
|
|
410
|
+
status_code,
|
|
411
|
+
**common_kwargs,
|
|
412
|
+
retry_after=retry_after,
|
|
395
413
|
limit=limit,
|
|
396
414
|
period=period,
|
|
397
415
|
)
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.2.1" # x-release-please-version
|
|
@@ -16,6 +16,7 @@ from knowhere.types.result import (
|
|
|
16
16
|
ImageChunk,
|
|
17
17
|
Manifest,
|
|
18
18
|
ParseResult,
|
|
19
|
+
SlimChunk,
|
|
19
20
|
TableChunk,
|
|
20
21
|
TextChunk,
|
|
21
22
|
TextChunkTokens,
|
|
@@ -134,6 +135,7 @@ def _buildChunks(
|
|
|
134
135
|
type="image",
|
|
135
136
|
content=raw.get("content", ""),
|
|
136
137
|
path=raw.get("path"),
|
|
138
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
137
139
|
length=metadata.get("length", raw.get("length", 0)),
|
|
138
140
|
file_path=file_path,
|
|
139
141
|
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
@@ -151,6 +153,7 @@ def _buildChunks(
|
|
|
151
153
|
type="table",
|
|
152
154
|
content=raw.get("content", ""),
|
|
153
155
|
path=raw.get("path"),
|
|
156
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
154
157
|
length=metadata.get("length", raw.get("length", 0)),
|
|
155
158
|
file_path=file_path,
|
|
156
159
|
original_name=metadata.get("original_name", raw.get("original_name")),
|
|
@@ -167,10 +170,12 @@ def _buildChunks(
|
|
|
167
170
|
type="text",
|
|
168
171
|
content=raw.get("content", ""),
|
|
169
172
|
path=raw.get("path"),
|
|
173
|
+
page_nums=metadata.get("page_nums", raw.get("page_nums")),
|
|
170
174
|
length=metadata.get("length", raw.get("length", 0)),
|
|
171
175
|
tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
|
|
172
176
|
keywords=metadata.get("keywords", raw.get("keywords")),
|
|
173
177
|
summary=metadata.get("summary", raw.get("summary")),
|
|
178
|
+
connect_to=metadata.get("connect_to", raw.get("connect_to")),
|
|
174
179
|
relationships=metadata.get("relationships", raw.get("relationships")),
|
|
175
180
|
)
|
|
176
181
|
|
|
@@ -230,12 +235,39 @@ def parseResultZip(
|
|
|
230
235
|
json.loads(hierarchy_text) if hierarchy_text else None
|
|
231
236
|
)
|
|
232
237
|
|
|
238
|
+
# -- Optimized sidecar files --
|
|
239
|
+
chunks_slim_text: Optional[str] = _readZipText(zf, "chunks_slim.json")
|
|
240
|
+
parsed_chunks_slim: Any = json.loads(chunks_slim_text) if chunks_slim_text else None
|
|
241
|
+
if isinstance(parsed_chunks_slim, dict) and "chunks" in parsed_chunks_slim:
|
|
242
|
+
raw_chunks_slim: List[Dict[str, Any]] = parsed_chunks_slim["chunks"]
|
|
243
|
+
elif isinstance(parsed_chunks_slim, list):
|
|
244
|
+
raw_chunks_slim = parsed_chunks_slim
|
|
245
|
+
else:
|
|
246
|
+
raw_chunks_slim = []
|
|
247
|
+
chunks_slim: Optional[List[SlimChunk]] = (
|
|
248
|
+
[SlimChunk.model_validate(chunk) for chunk in raw_chunks_slim]
|
|
249
|
+
if chunks_slim_text is not None
|
|
250
|
+
else None
|
|
251
|
+
)
|
|
252
|
+
|
|
253
|
+
toc_hierarchies_text: Optional[str] = _readZipText(zf, "toc_hierarchies.json")
|
|
254
|
+
toc_hierarchies: Optional[Any] = (
|
|
255
|
+
json.loads(toc_hierarchies_text) if toc_hierarchies_text else None
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
kb_csv: Optional[str] = _readZipText(zf, "kb.csv")
|
|
259
|
+
hierarchy_view_html: Optional[str] = _readZipText(zf, "hierarchy_view.html")
|
|
260
|
+
|
|
233
261
|
zf.close()
|
|
234
262
|
|
|
235
263
|
return ParseResult(
|
|
236
264
|
manifest=manifest,
|
|
237
265
|
chunks=chunks,
|
|
266
|
+
chunks_slim=chunks_slim,
|
|
238
267
|
full_markdown=full_markdown,
|
|
239
268
|
hierarchy=hierarchy,
|
|
269
|
+
toc_hierarchies=toc_hierarchies,
|
|
270
|
+
kb_csv=kb_csv,
|
|
271
|
+
hierarchy_view_html=hierarchy_view_html,
|
|
240
272
|
raw_zip=zip_bytes,
|
|
241
273
|
)
|
|
@@ -13,6 +13,10 @@ from knowhere.types.result import (
|
|
|
13
13
|
ImageFileInfo,
|
|
14
14
|
Manifest,
|
|
15
15
|
ParseResult,
|
|
16
|
+
ProcessingCost,
|
|
17
|
+
ProcessingMetadata,
|
|
18
|
+
ProcessingTiming,
|
|
19
|
+
SlimChunk,
|
|
16
20
|
Statistics,
|
|
17
21
|
TableChunk,
|
|
18
22
|
TableFileInfo,
|
|
@@ -36,6 +40,10 @@ __all__: list[str] = [
|
|
|
36
40
|
"ImageFileInfo",
|
|
37
41
|
"Manifest",
|
|
38
42
|
"ParseResult",
|
|
43
|
+
"ProcessingCost",
|
|
44
|
+
"ProcessingMetadata",
|
|
45
|
+
"ProcessingTiming",
|
|
46
|
+
"SlimChunk",
|
|
39
47
|
"Statistics",
|
|
40
48
|
"TableChunk",
|
|
41
49
|
"TableFileInfo",
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import os
|
|
6
|
+
import json
|
|
6
7
|
import re
|
|
7
8
|
from pathlib import Path
|
|
8
9
|
from typing import Any, Dict, List, Optional, Union
|
|
@@ -92,12 +93,39 @@ class FileIndex(BaseModel):
|
|
|
92
93
|
|
|
93
94
|
chunks: Optional[str] = None
|
|
94
95
|
markdown: Optional[str] = None
|
|
96
|
+
chunks_slim: Optional[str] = None
|
|
95
97
|
kb_csv: Optional[str] = None
|
|
96
98
|
hierarchy: Optional[str] = None
|
|
99
|
+
toc_hierarchies: Optional[str] = None
|
|
100
|
+
hierarchy_view_html: Optional[str] = None
|
|
97
101
|
images: List[ImageFileInfo] = Field(default_factory=list)
|
|
98
102
|
tables: List[TableFileInfo] = Field(default_factory=list)
|
|
99
103
|
|
|
100
104
|
|
|
105
|
+
class ProcessingCost(BaseModel):
|
|
106
|
+
"""Billing details emitted by manifest v2."""
|
|
107
|
+
|
|
108
|
+
micro_dollars: Optional[int] = None
|
|
109
|
+
credits: Optional[float] = None
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
class ProcessingTiming(BaseModel):
|
|
113
|
+
"""Timing details emitted by manifest v2."""
|
|
114
|
+
|
|
115
|
+
started_at: Optional[str] = None
|
|
116
|
+
completed_at: Optional[str] = None
|
|
117
|
+
duration_ms: Optional[int] = None
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
class ProcessingMetadata(BaseModel):
|
|
121
|
+
"""Worker-side processing metadata emitted by manifest v2."""
|
|
122
|
+
|
|
123
|
+
page_count: Optional[int] = None
|
|
124
|
+
billing_status: Optional[str] = None
|
|
125
|
+
cost: Optional[ProcessingCost] = None
|
|
126
|
+
timing: Optional[ProcessingTiming] = None
|
|
127
|
+
|
|
128
|
+
|
|
101
129
|
class Manifest(BaseModel):
|
|
102
130
|
"""Top-level manifest describing the result ZIP contents."""
|
|
103
131
|
|
|
@@ -106,6 +134,7 @@ class Manifest(BaseModel):
|
|
|
106
134
|
data_id: Optional[str] = None
|
|
107
135
|
source_file_name: Optional[str] = None
|
|
108
136
|
processing_date: Optional[str] = None
|
|
137
|
+
processing: Optional[ProcessingMetadata] = None
|
|
109
138
|
checksum: Optional[Checksum] = None
|
|
110
139
|
statistics: Optional[Statistics] = None
|
|
111
140
|
files: Optional[FileIndex] = None
|
|
@@ -123,6 +152,7 @@ class BaseChunk(BaseModel):
|
|
|
123
152
|
type: str
|
|
124
153
|
content: str = ""
|
|
125
154
|
path: Optional[str] = None
|
|
155
|
+
page_nums: Optional[List[int]] = None
|
|
126
156
|
|
|
127
157
|
|
|
128
158
|
TextChunkTokens: TypeAlias = List[str]
|
|
@@ -136,6 +166,7 @@ class TextChunk(BaseChunk):
|
|
|
136
166
|
tokens: Optional[TextChunkTokens] = None
|
|
137
167
|
keywords: Optional[List[str]] = None
|
|
138
168
|
summary: Optional[str] = None
|
|
169
|
+
connect_to: Optional[List[Dict[str, Any]]] = None
|
|
139
170
|
relationships: Optional[List[Union[Dict[str, Any], str]]] = None
|
|
140
171
|
|
|
141
172
|
|
|
@@ -210,6 +241,15 @@ class TableChunk(BaseChunk):
|
|
|
210
241
|
Chunk = Union[TextChunk, ImageChunk, TableChunk]
|
|
211
242
|
|
|
212
243
|
|
|
244
|
+
class SlimChunk(BaseModel):
|
|
245
|
+
"""Minimal chunk entry emitted in chunks_slim.json."""
|
|
246
|
+
|
|
247
|
+
type: str
|
|
248
|
+
path: Optional[str] = None
|
|
249
|
+
content: str = ""
|
|
250
|
+
summary: Optional[str] = None
|
|
251
|
+
|
|
252
|
+
|
|
213
253
|
# ---------------------------------------------------------------------------
|
|
214
254
|
# ParseResult — the top-level object returned to the user
|
|
215
255
|
# ---------------------------------------------------------------------------
|
|
@@ -225,8 +265,12 @@ class ParseResult:
|
|
|
225
265
|
|
|
226
266
|
manifest: Manifest
|
|
227
267
|
chunks: List[Chunk]
|
|
268
|
+
chunks_slim: Optional[List[SlimChunk]]
|
|
228
269
|
full_markdown: str
|
|
229
270
|
hierarchy: Optional[Any]
|
|
271
|
+
toc_hierarchies: Optional[Any]
|
|
272
|
+
kb_csv: Optional[str]
|
|
273
|
+
hierarchy_view_html: Optional[str]
|
|
230
274
|
raw_zip: bytes
|
|
231
275
|
|
|
232
276
|
def __init__(
|
|
@@ -234,14 +278,22 @@ class ParseResult:
|
|
|
234
278
|
*,
|
|
235
279
|
manifest: Manifest,
|
|
236
280
|
chunks: List[Chunk],
|
|
281
|
+
chunks_slim: Optional[List[SlimChunk]],
|
|
237
282
|
full_markdown: str,
|
|
238
283
|
hierarchy: Optional[Any],
|
|
284
|
+
toc_hierarchies: Optional[Any],
|
|
285
|
+
kb_csv: Optional[str],
|
|
286
|
+
hierarchy_view_html: Optional[str],
|
|
239
287
|
raw_zip: bytes,
|
|
240
288
|
) -> None:
|
|
241
289
|
self.manifest = manifest
|
|
242
290
|
self.chunks = chunks
|
|
291
|
+
self.chunks_slim = chunks_slim
|
|
243
292
|
self.full_markdown = full_markdown
|
|
244
293
|
self.hierarchy = hierarchy
|
|
294
|
+
self.toc_hierarchies = toc_hierarchies
|
|
295
|
+
self.kb_csv = kb_csv
|
|
296
|
+
self.hierarchy_view_html = hierarchy_view_html
|
|
245
297
|
self.raw_zip = raw_zip
|
|
246
298
|
|
|
247
299
|
# -- convenience properties --
|
|
@@ -296,10 +348,58 @@ class ParseResult:
|
|
|
296
348
|
dir_path: Path = Path(directory)
|
|
297
349
|
dir_path.mkdir(parents=True, exist_ok=True)
|
|
298
350
|
|
|
351
|
+
# Manifest / chunks
|
|
352
|
+
manifest_path: Path = dir_path / "manifest.json"
|
|
353
|
+
manifest_path.write_text(
|
|
354
|
+
self.manifest.model_dump_json(indent=2),
|
|
355
|
+
encoding="utf-8",
|
|
356
|
+
)
|
|
357
|
+
|
|
358
|
+
chunks_path: Path = dir_path / "chunks.json"
|
|
359
|
+
chunks_path.write_text(
|
|
360
|
+
json.dumps([chunk.model_dump() for chunk in self.chunks], indent=2),
|
|
361
|
+
encoding="utf-8",
|
|
362
|
+
)
|
|
363
|
+
|
|
364
|
+
if self.chunks_slim is not None:
|
|
365
|
+
chunks_slim_path: Path = dir_path / "chunks_slim.json"
|
|
366
|
+
chunks_slim_path.write_text(
|
|
367
|
+
json.dumps(
|
|
368
|
+
{"chunks": [chunk.model_dump() for chunk in self.chunks_slim]},
|
|
369
|
+
indent=2,
|
|
370
|
+
),
|
|
371
|
+
encoding="utf-8",
|
|
372
|
+
)
|
|
373
|
+
|
|
299
374
|
# Full markdown
|
|
300
375
|
md_path: Path = dir_path / "full.md"
|
|
301
376
|
md_path.write_text(self.full_markdown, encoding="utf-8")
|
|
302
377
|
|
|
378
|
+
if self.hierarchy is not None:
|
|
379
|
+
hierarchy_path: Path = dir_path / "hierarchy.json"
|
|
380
|
+
hierarchy_path.write_text(
|
|
381
|
+
json.dumps(self.hierarchy, indent=2),
|
|
382
|
+
encoding="utf-8",
|
|
383
|
+
)
|
|
384
|
+
|
|
385
|
+
if self.toc_hierarchies is not None:
|
|
386
|
+
toc_hierarchies_path: Path = dir_path / "toc_hierarchies.json"
|
|
387
|
+
toc_hierarchies_path.write_text(
|
|
388
|
+
json.dumps(self.toc_hierarchies, indent=2),
|
|
389
|
+
encoding="utf-8",
|
|
390
|
+
)
|
|
391
|
+
|
|
392
|
+
if self.kb_csv is not None:
|
|
393
|
+
kb_csv_path: Path = dir_path / "kb.csv"
|
|
394
|
+
kb_csv_path.write_text(self.kb_csv, encoding="utf-8")
|
|
395
|
+
|
|
396
|
+
if self.hierarchy_view_html is not None:
|
|
397
|
+
hierarchy_view_path: Path = dir_path / "hierarchy_view.html"
|
|
398
|
+
hierarchy_view_path.write_text(
|
|
399
|
+
self.hierarchy_view_html,
|
|
400
|
+
encoding="utf-8",
|
|
401
|
+
)
|
|
402
|
+
|
|
303
403
|
# Images
|
|
304
404
|
if self.image_chunks:
|
|
305
405
|
images_dir: Path = dir_path / "images"
|
|
@@ -4,8 +4,6 @@ from __future__ import annotations
|
|
|
4
4
|
|
|
5
5
|
from typing import Any, Dict, List, Optional
|
|
6
6
|
|
|
7
|
-
import pytest
|
|
8
|
-
|
|
9
7
|
from knowhere.types.job import Job, JobError, JobResult
|
|
10
8
|
from knowhere.types.result import (
|
|
11
9
|
BaseChunk,
|
|
@@ -16,6 +14,10 @@ from knowhere.types.result import (
|
|
|
16
14
|
ImageFileInfo,
|
|
17
15
|
Manifest,
|
|
18
16
|
ParseResult,
|
|
17
|
+
ProcessingCost,
|
|
18
|
+
ProcessingMetadata,
|
|
19
|
+
ProcessingTiming,
|
|
20
|
+
SlimChunk,
|
|
19
21
|
Statistics,
|
|
20
22
|
TableChunk,
|
|
21
23
|
TableFileInfo,
|
|
@@ -269,6 +271,27 @@ class TestManifestModel:
|
|
|
269
271
|
assert manifest.statistics is None
|
|
270
272
|
assert manifest.files is None
|
|
271
273
|
|
|
274
|
+
def test_processing_metadata(self) -> None:
|
|
275
|
+
manifest: Manifest = Manifest(
|
|
276
|
+
version="2.0",
|
|
277
|
+
processing=ProcessingMetadata(
|
|
278
|
+
page_count=12,
|
|
279
|
+
billing_status="charged",
|
|
280
|
+
cost=ProcessingCost(micro_dollars=60000, credits=0.06),
|
|
281
|
+
timing=ProcessingTiming(
|
|
282
|
+
started_at="2026-04-09T08:20:56.634Z",
|
|
283
|
+
completed_at="2026-04-09T08:21:12.288Z",
|
|
284
|
+
duration_ms=15653,
|
|
285
|
+
),
|
|
286
|
+
),
|
|
287
|
+
)
|
|
288
|
+
assert manifest.processing is not None
|
|
289
|
+
assert manifest.processing.page_count == 12
|
|
290
|
+
assert manifest.processing.cost is not None
|
|
291
|
+
assert manifest.processing.cost.micro_dollars == 60000
|
|
292
|
+
assert manifest.processing.timing is not None
|
|
293
|
+
assert manifest.processing.timing.duration_ms == 15653
|
|
294
|
+
|
|
272
295
|
|
|
273
296
|
# ---------------------------------------------------------------------------
|
|
274
297
|
# Statistics model
|
|
@@ -375,6 +398,13 @@ class TestBaseChunkModel:
|
|
|
375
398
|
chunk: BaseChunk = BaseChunk(chunk_id="chunk_2", type="text")
|
|
376
399
|
assert chunk.content == ""
|
|
377
400
|
assert chunk.path is None
|
|
401
|
+
assert chunk.page_nums is None
|
|
402
|
+
|
|
403
|
+
def test_page_nums_supported(self) -> None:
|
|
404
|
+
chunk: BaseChunk = BaseChunk(
|
|
405
|
+
chunk_id="chunk_3", type="text", page_nums=[1, 2]
|
|
406
|
+
)
|
|
407
|
+
assert chunk.page_nums == [1, 2]
|
|
378
408
|
|
|
379
409
|
|
|
380
410
|
# ---------------------------------------------------------------------------
|
|
@@ -391,18 +421,23 @@ class TestTextChunkModel:
|
|
|
391
421
|
content="Some text content",
|
|
392
422
|
path="doc/section1",
|
|
393
423
|
length=17,
|
|
424
|
+
page_nums=[1, 2],
|
|
394
425
|
tokens=["Some", "text", "content"],
|
|
395
426
|
keywords=["text", "content"],
|
|
396
427
|
summary="A text chunk",
|
|
428
|
+
connect_to=[{"target": "img_1", "relation": "embeds"}],
|
|
397
429
|
relationships=[{"target": "text_2", "type": "follows"}],
|
|
398
430
|
)
|
|
399
431
|
assert chunk.chunk_id == "text_1"
|
|
400
432
|
assert chunk.type == "text"
|
|
401
433
|
assert chunk.content == "Some text content"
|
|
402
434
|
assert chunk.length == 17
|
|
435
|
+
assert chunk.page_nums == [1, 2]
|
|
403
436
|
assert chunk.tokens == ["Some", "text", "content"]
|
|
404
437
|
assert chunk.keywords == ["text", "content"]
|
|
405
438
|
assert chunk.summary == "A text chunk"
|
|
439
|
+
assert chunk.connect_to is not None
|
|
440
|
+
assert len(chunk.connect_to) == 1
|
|
406
441
|
assert chunk.relationships is not None
|
|
407
442
|
assert len(chunk.relationships) == 1
|
|
408
443
|
|
|
@@ -413,6 +448,7 @@ class TestTextChunkModel:
|
|
|
413
448
|
assert chunk.tokens is None
|
|
414
449
|
assert chunk.keywords is None
|
|
415
450
|
assert chunk.summary is None
|
|
451
|
+
assert chunk.connect_to is None
|
|
416
452
|
assert chunk.relationships is None
|
|
417
453
|
|
|
418
454
|
def test_is_instance_of_base_chunk(self) -> None:
|
|
@@ -567,8 +603,19 @@ def _build_parse_result(
|
|
|
567
603
|
return ParseResult(
|
|
568
604
|
manifest=manifest,
|
|
569
605
|
chunks=chunks if chunks is not None else default_chunks,
|
|
606
|
+
chunks_slim=[
|
|
607
|
+
SlimChunk(
|
|
608
|
+
type="text",
|
|
609
|
+
path="doc/section1",
|
|
610
|
+
content="Hello world",
|
|
611
|
+
summary="Greeting",
|
|
612
|
+
)
|
|
613
|
+
],
|
|
570
614
|
full_markdown="# Test\n\nHello world",
|
|
571
615
|
hierarchy=None,
|
|
616
|
+
toc_hierarchies=[{"toc_range": [1, 3]}],
|
|
617
|
+
kb_csv="chunk_id,type\ntext_1,text\n",
|
|
618
|
+
hierarchy_view_html="<html><body>Hierarchy</body></html>",
|
|
572
619
|
raw_zip=b"fake zip bytes",
|
|
573
620
|
)
|
|
574
621
|
|
|
@@ -657,3 +704,11 @@ class TestParseResult:
|
|
|
657
704
|
def test_raw_zip_accessible(self) -> None:
|
|
658
705
|
result: ParseResult = _build_parse_result()
|
|
659
706
|
assert result.raw_zip == b"fake zip bytes"
|
|
707
|
+
|
|
708
|
+
def test_optimized_result_fields_accessible(self) -> None:
|
|
709
|
+
result: ParseResult = _build_parse_result()
|
|
710
|
+
assert result.chunks_slim is not None
|
|
711
|
+
assert result.chunks_slim[0].path == "doc/section1"
|
|
712
|
+
assert result.toc_hierarchies == [{"toc_range": [1, 3]}]
|
|
713
|
+
assert result.kb_csv == "chunk_id,type\ntext_1,text\n"
|
|
714
|
+
assert result.hierarchy_view_html == "<html><body>Hierarchy</body></html>"
|
|
@@ -56,6 +56,7 @@ TEXT_TOKENS_LIST: List[str] = ["Ashish", "Vaswani", "attention", "transformer"]
|
|
|
56
56
|
|
|
57
57
|
MARKDOWN: str = "# Test\n\nHello world"
|
|
58
58
|
IMAGE_BYTES: bytes = b"\xff\xd8\xff\xe0"
|
|
59
|
+
TABLE_HTML: str = "<table><tr><td>Optimized</td></tr></table>"
|
|
59
60
|
|
|
60
61
|
|
|
61
62
|
def _build_zip(
|
|
@@ -120,6 +121,91 @@ def _make_manifest(checksum_value: str = "") -> Dict[str, Any]:
|
|
|
120
121
|
}
|
|
121
122
|
|
|
122
123
|
|
|
124
|
+
def _make_optimized_manifest() -> Dict[str, Any]:
|
|
125
|
+
"""Build a manifest dict matching the current optimized API payload."""
|
|
126
|
+
return {
|
|
127
|
+
"version": "2.0",
|
|
128
|
+
"job_id": "job_optimized123",
|
|
129
|
+
"data_id": None,
|
|
130
|
+
"source_file_name": "optimized.pdf",
|
|
131
|
+
"processing_date": "2026-04-09T08:21:12.294Z",
|
|
132
|
+
"processing": {
|
|
133
|
+
"page_count": 12,
|
|
134
|
+
"billing_status": "charged",
|
|
135
|
+
"cost": {
|
|
136
|
+
"micro_dollars": 60000,
|
|
137
|
+
"credits": 0.06,
|
|
138
|
+
},
|
|
139
|
+
"timing": {
|
|
140
|
+
"started_at": "2026-04-09T08:20:56.634Z",
|
|
141
|
+
"completed_at": "2026-04-09T08:21:12.288Z",
|
|
142
|
+
"duration_ms": 15653,
|
|
143
|
+
},
|
|
144
|
+
},
|
|
145
|
+
"statistics": {
|
|
146
|
+
"total_chunks": 3,
|
|
147
|
+
"text_chunks": 1,
|
|
148
|
+
"image_chunks": 1,
|
|
149
|
+
"table_chunks": 1,
|
|
150
|
+
"total_pages": None,
|
|
151
|
+
},
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _make_optimized_chunks() -> List[Dict[str, Any]]:
|
|
156
|
+
"""Build chunks matching the current optimized API payload."""
|
|
157
|
+
return [
|
|
158
|
+
{
|
|
159
|
+
"chunk_id": "text_chunk_optimized",
|
|
160
|
+
"type": "text",
|
|
161
|
+
"content": "Text chunk with embedded resources.",
|
|
162
|
+
"path": "Default_Root/optimized.pdf-->Section 1",
|
|
163
|
+
"metadata": {
|
|
164
|
+
"length": 35,
|
|
165
|
+
"summary": "",
|
|
166
|
+
"page_nums": [1, 2],
|
|
167
|
+
"tokens": ["Text", "chunk"],
|
|
168
|
+
"keywords": ["optimized"],
|
|
169
|
+
"connect_to": [
|
|
170
|
+
{
|
|
171
|
+
"target": "image_chunk_optimized",
|
|
172
|
+
"relation": "embeds",
|
|
173
|
+
"ref": "[images/IMAGE_test1.jpg]",
|
|
174
|
+
}
|
|
175
|
+
],
|
|
176
|
+
},
|
|
177
|
+
},
|
|
178
|
+
{
|
|
179
|
+
"chunk_id": "image_chunk_optimized",
|
|
180
|
+
"type": "image",
|
|
181
|
+
"content": "[images/IMAGE_test1.jpg]",
|
|
182
|
+
"path": "images/IMAGE_test1.jpg",
|
|
183
|
+
"metadata": {
|
|
184
|
+
"length": 1,
|
|
185
|
+
"summary": "Optimized image chunk",
|
|
186
|
+
"page_nums": [2],
|
|
187
|
+
"file_path": "images/IMAGE_test1.jpg",
|
|
188
|
+
"keywords": [],
|
|
189
|
+
"tokens": [],
|
|
190
|
+
},
|
|
191
|
+
},
|
|
192
|
+
{
|
|
193
|
+
"chunk_id": "table_chunk_optimized",
|
|
194
|
+
"type": "table",
|
|
195
|
+
"content": TABLE_HTML,
|
|
196
|
+
"path": "tables/table-optimized.html",
|
|
197
|
+
"metadata": {
|
|
198
|
+
"length": 1,
|
|
199
|
+
"summary": "Optimized table chunk",
|
|
200
|
+
"page_nums": [3],
|
|
201
|
+
"file_path": "tables/table-optimized.html",
|
|
202
|
+
"keywords": ["optimized"],
|
|
203
|
+
"tokens": [],
|
|
204
|
+
},
|
|
205
|
+
},
|
|
206
|
+
]
|
|
207
|
+
|
|
208
|
+
|
|
123
209
|
# ---------------------------------------------------------------------------
|
|
124
210
|
# Valid ZIP parsing
|
|
125
211
|
# ---------------------------------------------------------------------------
|
|
@@ -254,6 +340,114 @@ class TestParseValidZip:
|
|
|
254
340
|
|
|
255
341
|
assert result.getChunk("nonexistent") is None
|
|
256
342
|
|
|
343
|
+
def test_exposes_optimized_payload_metadata_and_sidecar_assets(self) -> None:
|
|
344
|
+
manifest: Dict[str, Any] = _make_optimized_manifest()
|
|
345
|
+
chunks: List[Dict[str, Any]] = _make_optimized_chunks()
|
|
346
|
+
zip_bytes: bytes = _build_zip(
|
|
347
|
+
manifest,
|
|
348
|
+
chunks=chunks,
|
|
349
|
+
markdown="# Optimized Result\n\nBody",
|
|
350
|
+
extra_entries={
|
|
351
|
+
"chunks_slim.json": json.dumps(
|
|
352
|
+
{
|
|
353
|
+
"chunks": [
|
|
354
|
+
{
|
|
355
|
+
"type": "text",
|
|
356
|
+
"path": "Default_Root/optimized.pdf-->Section 1",
|
|
357
|
+
"content": "Text chunk with embedded resources.",
|
|
358
|
+
"summary": "",
|
|
359
|
+
}
|
|
360
|
+
]
|
|
361
|
+
}
|
|
362
|
+
).encode("utf-8"),
|
|
363
|
+
"kb.csv": b"chunk_id,type\ntext_chunk_optimized,text\n",
|
|
364
|
+
"hierarchy.json": json.dumps(
|
|
365
|
+
{"Default_Root": {"optimized.pdf": {}}}
|
|
366
|
+
).encode("utf-8"),
|
|
367
|
+
"toc_hierarchies.json": json.dumps(
|
|
368
|
+
[{"toc_range": [1, 3], "scan_range": [1, 10]}]
|
|
369
|
+
).encode("utf-8"),
|
|
370
|
+
"hierarchy_view.html": b"<html><body>Optimized hierarchy view</body></html>",
|
|
371
|
+
"tables/table-optimized.html": TABLE_HTML.encode("utf-8"),
|
|
372
|
+
},
|
|
373
|
+
)
|
|
374
|
+
|
|
375
|
+
result: ParseResult = parseResultZip(zip_bytes, verify_checksum=False)
|
|
376
|
+
|
|
377
|
+
assert result.manifest.version == "2.0"
|
|
378
|
+
assert result.manifest.files is None
|
|
379
|
+
assert result.manifest.processing is not None
|
|
380
|
+
assert result.manifest.processing.page_count == 12
|
|
381
|
+
assert result.manifest.processing.billing_status == "charged"
|
|
382
|
+
assert result.manifest.processing.cost is not None
|
|
383
|
+
assert result.manifest.processing.cost.micro_dollars == 60000
|
|
384
|
+
assert result.text_chunks[0].page_nums == [1, 2]
|
|
385
|
+
assert result.image_chunks[0].page_nums == [2]
|
|
386
|
+
assert result.table_chunks[0].page_nums == [3]
|
|
387
|
+
assert result.text_chunks[0].connect_to == [
|
|
388
|
+
{
|
|
389
|
+
"target": "image_chunk_optimized",
|
|
390
|
+
"relation": "embeds",
|
|
391
|
+
"ref": "[images/IMAGE_test1.jpg]",
|
|
392
|
+
}
|
|
393
|
+
]
|
|
394
|
+
assert result.chunks_slim is not None
|
|
395
|
+
assert len(result.chunks_slim) == 1
|
|
396
|
+
assert result.kb_csv == "chunk_id,type\ntext_chunk_optimized,text\n"
|
|
397
|
+
assert result.toc_hierarchies == [{"toc_range": [1, 3], "scan_range": [1, 10]}]
|
|
398
|
+
assert result.hierarchy_view_html == "<html><body>Optimized hierarchy view</body></html>"
|
|
399
|
+
assert result.hierarchy == {"Default_Root": {"optimized.pdf": {}}}
|
|
400
|
+
|
|
401
|
+
def test_save_preserves_optimized_sidecar_files(self, tmp_path: Path) -> None:
|
|
402
|
+
manifest: Dict[str, Any] = _make_optimized_manifest()
|
|
403
|
+
chunks: List[Dict[str, Any]] = _make_optimized_chunks()
|
|
404
|
+
zip_bytes: bytes = _build_zip(
|
|
405
|
+
manifest,
|
|
406
|
+
chunks=chunks,
|
|
407
|
+
markdown="# Optimized Result\n\nBody",
|
|
408
|
+
extra_entries={
|
|
409
|
+
"chunks_slim.json": json.dumps(
|
|
410
|
+
{
|
|
411
|
+
"chunks": [
|
|
412
|
+
{
|
|
413
|
+
"type": "text",
|
|
414
|
+
"path": "Default_Root/optimized.pdf-->Section 1",
|
|
415
|
+
"content": "Text chunk with embedded resources.",
|
|
416
|
+
"summary": "",
|
|
417
|
+
}
|
|
418
|
+
]
|
|
419
|
+
}
|
|
420
|
+
).encode("utf-8"),
|
|
421
|
+
"kb.csv": b"chunk_id,type\ntext_chunk_optimized,text\n",
|
|
422
|
+
"hierarchy.json": json.dumps(
|
|
423
|
+
{"Default_Root": {"optimized.pdf": {}}}
|
|
424
|
+
).encode("utf-8"),
|
|
425
|
+
"toc_hierarchies.json": json.dumps(
|
|
426
|
+
[{"toc_range": [1, 3], "scan_range": [1, 10]}]
|
|
427
|
+
).encode("utf-8"),
|
|
428
|
+
"hierarchy_view.html": b"<html><body>Optimized hierarchy view</body></html>",
|
|
429
|
+
"tables/table-optimized.html": TABLE_HTML.encode("utf-8"),
|
|
430
|
+
},
|
|
431
|
+
)
|
|
432
|
+
|
|
433
|
+
result: ParseResult = parseResultZip(zip_bytes, verify_checksum=False)
|
|
434
|
+
output_dir: Path = tmp_path / "optimized-result"
|
|
435
|
+
|
|
436
|
+
saved_path: Path = result.save(output_dir)
|
|
437
|
+
|
|
438
|
+
assert saved_path == output_dir.resolve()
|
|
439
|
+
assert (output_dir / "manifest.json").exists()
|
|
440
|
+
assert (output_dir / "chunks.json").exists()
|
|
441
|
+
assert (output_dir / "full.md").exists()
|
|
442
|
+
assert (output_dir / "hierarchy.json").exists()
|
|
443
|
+
assert (output_dir / "chunks_slim.json").exists()
|
|
444
|
+
assert (output_dir / "kb.csv").exists()
|
|
445
|
+
assert (output_dir / "toc_hierarchies.json").exists()
|
|
446
|
+
assert (output_dir / "hierarchy_view.html").exists()
|
|
447
|
+
assert (output_dir / "images" / "IMAGE_test1.jpg").exists()
|
|
448
|
+
assert (output_dir / "tables" / "table-optimized.html").exists()
|
|
449
|
+
assert (output_dir / "result.zip").exists()
|
|
450
|
+
|
|
257
451
|
|
|
258
452
|
# ---------------------------------------------------------------------------
|
|
259
453
|
# Checksum verification
|
|
@@ -334,6 +528,7 @@ class TestMissingRequiredFiles:
|
|
|
334
528
|
result: ParseResult = parseResultZip(
|
|
335
529
|
zip_bytes, verify_checksum=False
|
|
336
530
|
)
|
|
531
|
+
assert result.chunks == []
|
|
337
532
|
|
|
338
533
|
|
|
339
534
|
# ---------------------------------------------------------------------------
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "0.2.0" # x-release-please-version
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|