knowhere-python-sdk 0.2.0__tar.gz → 0.2.1__tar.gz

This diff shows the differences between the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (53)
  1. knowhere_python_sdk-0.2.1/.release-please-manifest.json +3 -0
  2. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/CHANGELOG.md +10 -0
  3. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/PKG-INFO +1 -1
  4. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/pyproject.toml +1 -1
  5. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/__init__.py +8 -0
  6. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_exceptions.py +21 -3
  7. knowhere_python_sdk-0.2.1/src/knowhere/_version.py +1 -0
  8. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/lib/result_parser.py +32 -0
  9. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/types/__init__.py +8 -0
  10. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/types/result.py +100 -0
  11. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_models.py +57 -2
  12. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_result_parser.py +195 -0
  13. knowhere_python_sdk-0.2.0/.release-please-manifest.json +0 -3
  14. knowhere_python_sdk-0.2.0/src/knowhere/_version.py +0 -1
  15. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/.github/workflows/ci.yml +0 -0
  16. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/.github/workflows/publish-pypi.yml +0 -0
  17. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/.github/workflows/publish.yml +0 -0
  18. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/.gitignore +0 -0
  19. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/README.md +0 -0
  20. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/docs/usage.md +0 -0
  21. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/examples/async_usage.py +0 -0
  22. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/examples/error_handling.py +0 -0
  23. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/examples/parse_file.py +0 -0
  24. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/examples/parse_url.py +0 -0
  25. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/examples/step_by_step.py +0 -0
  26. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/release-please-config.json +0 -0
  27. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_base_client.py +0 -0
  28. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_client.py +0 -0
  29. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_constants.py +0 -0
  30. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_logging.py +0 -0
  31. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_response.py +0 -0
  32. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/_types.py +0 -0
  33. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/lib/__init__.py +0 -0
  34. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/lib/polling.py +0 -0
  35. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/lib/upload.py +0 -0
  36. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/py.typed +0 -0
  37. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/resources/__init__.py +0 -0
  38. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/resources/_base.py +0 -0
  39. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/resources/jobs.py +0 -0
  40. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/types/job.py +0 -0
  41. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/types/params.py +0 -0
  42. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/src/knowhere/types/shared.py +0 -0
  43. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/__init__.py +0 -0
  44. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/conftest.py +0 -0
  45. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/fixtures/real_result.zip +0 -0
  46. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_client.py +0 -0
  47. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_exceptions.py +0 -0
  48. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_jobs.py +0 -0
  49. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_logging.py +0 -0
  50. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_parse.py +0 -0
  51. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_polling.py +0 -0
  52. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_retry.py +0 -0
  53. {knowhere_python_sdk-0.2.0 → knowhere_python_sdk-0.2.1}/tests/test_upload.py +0 -0
@@ -0,0 +1,3 @@
1
+ {
2
+ ".": "0.2.1"
3
+ }
@@ -1,5 +1,15 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.2.1](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.2.0...v0.2.1) (2026-04-09)
4
+
5
+
6
+ ### Bug Fixes
7
+
8
+ * narrow status error constructors ([c8fc035](https://github.com/Ontos-AI/knowhere-python-sdk/commit/c8fc035dade768c5364e50de890bde0fb280586e))
9
+ * remove stale mypy ignore ([150336a](https://github.com/Ontos-AI/knowhere-python-sdk/commit/150336a5dc0497b287437dffa6e1506f4bcf8fbf))
10
+ * sync optimized parse result payload ([a7903ad](https://github.com/Ontos-AI/knowhere-python-sdk/commit/a7903ad53fb5ab142c5835134c9a942eb5cdfe21))
11
+ * sync parse result payload with current API schema ([430b067](https://github.com/Ontos-AI/knowhere-python-sdk/commit/430b067b37ce0b2eb8bd3c81cfca56b1df657376))
12
+
3
13
  ## [0.2.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.1.0...v0.2.0) (2026-03-18)
4
14
 
5
15
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: knowhere-python-sdk
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Official Python SDK for the Knowhere document parsing API
5
5
  Project-URL: Homepage, https://knowhereto.ai
6
6
  Project-URL: Documentation, https://docs.knowhereto.ai
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "knowhere-python-sdk"
7
- version = "0.2.0"
7
+ version = "0.2.1"
8
8
  description = "Official Python SDK for the Knowhere document parsing API"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -46,6 +46,10 @@ from knowhere.types.result import (
46
46
  ImageFileInfo,
47
47
  Manifest,
48
48
  ParseResult,
49
+ ProcessingCost,
50
+ ProcessingMetadata,
51
+ ProcessingTiming,
52
+ SlimChunk,
49
53
  Statistics,
50
54
  TableChunk,
51
55
  TableFileInfo,
@@ -91,6 +95,10 @@ __all__: list[str] = [
91
95
  "FileIndex",
92
96
  "ImageFileInfo",
93
97
  "TableFileInfo",
98
+ "ProcessingCost",
99
+ "ProcessingMetadata",
100
+ "ProcessingTiming",
101
+ "SlimChunk",
94
102
  "BaseChunk",
95
103
  "TextChunk",
96
104
  "ImageChunk",
@@ -387,11 +387,29 @@ def makeStatusError(
387
387
  response=response,
388
388
  )
389
389
 
390
- if exception_class in (RateLimitError, ServiceUnavailableError, GatewayTimeoutError):
391
- return exception_class(
390
+ if exception_class is RateLimitError:
391
+ return RateLimitError(
392
392
  status_code,
393
393
  **common_kwargs,
394
- retry_after=retry_after, # type: ignore[call-arg]
394
+ retry_after=retry_after,
395
+ limit=limit,
396
+ period=period,
397
+ )
398
+
399
+ if exception_class is ServiceUnavailableError:
400
+ return ServiceUnavailableError(
401
+ status_code,
402
+ **common_kwargs,
403
+ retry_after=retry_after,
404
+ limit=limit,
405
+ period=period,
406
+ )
407
+
408
+ if exception_class is GatewayTimeoutError:
409
+ return GatewayTimeoutError(
410
+ status_code,
411
+ **common_kwargs,
412
+ retry_after=retry_after,
395
413
  limit=limit,
396
414
  period=period,
397
415
  )
@@ -0,0 +1 @@
1
+ __version__ = "0.2.1" # x-release-please-version
@@ -16,6 +16,7 @@ from knowhere.types.result import (
16
16
  ImageChunk,
17
17
  Manifest,
18
18
  ParseResult,
19
+ SlimChunk,
19
20
  TableChunk,
20
21
  TextChunk,
21
22
  TextChunkTokens,
@@ -134,6 +135,7 @@ def _buildChunks(
134
135
  type="image",
135
136
  content=raw.get("content", ""),
136
137
  path=raw.get("path"),
138
+ page_nums=metadata.get("page_nums", raw.get("page_nums")),
137
139
  length=metadata.get("length", raw.get("length", 0)),
138
140
  file_path=file_path,
139
141
  original_name=metadata.get("original_name", raw.get("original_name")),
@@ -151,6 +153,7 @@ def _buildChunks(
151
153
  type="table",
152
154
  content=raw.get("content", ""),
153
155
  path=raw.get("path"),
156
+ page_nums=metadata.get("page_nums", raw.get("page_nums")),
154
157
  length=metadata.get("length", raw.get("length", 0)),
155
158
  file_path=file_path,
156
159
  original_name=metadata.get("original_name", raw.get("original_name")),
@@ -167,10 +170,12 @@ def _buildChunks(
167
170
  type="text",
168
171
  content=raw.get("content", ""),
169
172
  path=raw.get("path"),
173
+ page_nums=metadata.get("page_nums", raw.get("page_nums")),
170
174
  length=metadata.get("length", raw.get("length", 0)),
171
175
  tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
172
176
  keywords=metadata.get("keywords", raw.get("keywords")),
173
177
  summary=metadata.get("summary", raw.get("summary")),
178
+ connect_to=metadata.get("connect_to", raw.get("connect_to")),
174
179
  relationships=metadata.get("relationships", raw.get("relationships")),
175
180
  )
176
181
 
@@ -230,12 +235,39 @@ def parseResultZip(
230
235
  json.loads(hierarchy_text) if hierarchy_text else None
231
236
  )
232
237
 
238
+ # -- Optimized sidecar files --
239
+ chunks_slim_text: Optional[str] = _readZipText(zf, "chunks_slim.json")
240
+ parsed_chunks_slim: Any = json.loads(chunks_slim_text) if chunks_slim_text else None
241
+ if isinstance(parsed_chunks_slim, dict) and "chunks" in parsed_chunks_slim:
242
+ raw_chunks_slim: List[Dict[str, Any]] = parsed_chunks_slim["chunks"]
243
+ elif isinstance(parsed_chunks_slim, list):
244
+ raw_chunks_slim = parsed_chunks_slim
245
+ else:
246
+ raw_chunks_slim = []
247
+ chunks_slim: Optional[List[SlimChunk]] = (
248
+ [SlimChunk.model_validate(chunk) for chunk in raw_chunks_slim]
249
+ if chunks_slim_text is not None
250
+ else None
251
+ )
252
+
253
+ toc_hierarchies_text: Optional[str] = _readZipText(zf, "toc_hierarchies.json")
254
+ toc_hierarchies: Optional[Any] = (
255
+ json.loads(toc_hierarchies_text) if toc_hierarchies_text else None
256
+ )
257
+
258
+ kb_csv: Optional[str] = _readZipText(zf, "kb.csv")
259
+ hierarchy_view_html: Optional[str] = _readZipText(zf, "hierarchy_view.html")
260
+
233
261
  zf.close()
234
262
 
235
263
  return ParseResult(
236
264
  manifest=manifest,
237
265
  chunks=chunks,
266
+ chunks_slim=chunks_slim,
238
267
  full_markdown=full_markdown,
239
268
  hierarchy=hierarchy,
269
+ toc_hierarchies=toc_hierarchies,
270
+ kb_csv=kb_csv,
271
+ hierarchy_view_html=hierarchy_view_html,
240
272
  raw_zip=zip_bytes,
241
273
  )
@@ -13,6 +13,10 @@ from knowhere.types.result import (
13
13
  ImageFileInfo,
14
14
  Manifest,
15
15
  ParseResult,
16
+ ProcessingCost,
17
+ ProcessingMetadata,
18
+ ProcessingTiming,
19
+ SlimChunk,
16
20
  Statistics,
17
21
  TableChunk,
18
22
  TableFileInfo,
@@ -36,6 +40,10 @@ __all__: list[str] = [
36
40
  "ImageFileInfo",
37
41
  "Manifest",
38
42
  "ParseResult",
43
+ "ProcessingCost",
44
+ "ProcessingMetadata",
45
+ "ProcessingTiming",
46
+ "SlimChunk",
39
47
  "Statistics",
40
48
  "TableChunk",
41
49
  "TableFileInfo",
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import os
6
+ import json
6
7
  import re
7
8
  from pathlib import Path
8
9
  from typing import Any, Dict, List, Optional, Union
@@ -92,12 +93,39 @@ class FileIndex(BaseModel):
92
93
 
93
94
  chunks: Optional[str] = None
94
95
  markdown: Optional[str] = None
96
+ chunks_slim: Optional[str] = None
95
97
  kb_csv: Optional[str] = None
96
98
  hierarchy: Optional[str] = None
99
+ toc_hierarchies: Optional[str] = None
100
+ hierarchy_view_html: Optional[str] = None
97
101
  images: List[ImageFileInfo] = Field(default_factory=list)
98
102
  tables: List[TableFileInfo] = Field(default_factory=list)
99
103
 
100
104
 
105
+ class ProcessingCost(BaseModel):
106
+ """Billing details emitted by manifest v2."""
107
+
108
+ micro_dollars: Optional[int] = None
109
+ credits: Optional[float] = None
110
+
111
+
112
+ class ProcessingTiming(BaseModel):
113
+ """Timing details emitted by manifest v2."""
114
+
115
+ started_at: Optional[str] = None
116
+ completed_at: Optional[str] = None
117
+ duration_ms: Optional[int] = None
118
+
119
+
120
+ class ProcessingMetadata(BaseModel):
121
+ """Worker-side processing metadata emitted by manifest v2."""
122
+
123
+ page_count: Optional[int] = None
124
+ billing_status: Optional[str] = None
125
+ cost: Optional[ProcessingCost] = None
126
+ timing: Optional[ProcessingTiming] = None
127
+
128
+
101
129
  class Manifest(BaseModel):
102
130
  """Top-level manifest describing the result ZIP contents."""
103
131
 
@@ -106,6 +134,7 @@ class Manifest(BaseModel):
106
134
  data_id: Optional[str] = None
107
135
  source_file_name: Optional[str] = None
108
136
  processing_date: Optional[str] = None
137
+ processing: Optional[ProcessingMetadata] = None
109
138
  checksum: Optional[Checksum] = None
110
139
  statistics: Optional[Statistics] = None
111
140
  files: Optional[FileIndex] = None
@@ -123,6 +152,7 @@ class BaseChunk(BaseModel):
123
152
  type: str
124
153
  content: str = ""
125
154
  path: Optional[str] = None
155
+ page_nums: Optional[List[int]] = None
126
156
 
127
157
 
128
158
  TextChunkTokens: TypeAlias = List[str]
@@ -136,6 +166,7 @@ class TextChunk(BaseChunk):
136
166
  tokens: Optional[TextChunkTokens] = None
137
167
  keywords: Optional[List[str]] = None
138
168
  summary: Optional[str] = None
169
+ connect_to: Optional[List[Dict[str, Any]]] = None
139
170
  relationships: Optional[List[Union[Dict[str, Any], str]]] = None
140
171
 
141
172
 
@@ -210,6 +241,15 @@ class TableChunk(BaseChunk):
210
241
  Chunk = Union[TextChunk, ImageChunk, TableChunk]
211
242
 
212
243
 
244
+ class SlimChunk(BaseModel):
245
+ """Minimal chunk entry emitted in chunks_slim.json."""
246
+
247
+ type: str
248
+ path: Optional[str] = None
249
+ content: str = ""
250
+ summary: Optional[str] = None
251
+
252
+
213
253
  # ---------------------------------------------------------------------------
214
254
  # ParseResult — the top-level object returned to the user
215
255
  # ---------------------------------------------------------------------------
@@ -225,8 +265,12 @@ class ParseResult:
225
265
 
226
266
  manifest: Manifest
227
267
  chunks: List[Chunk]
268
+ chunks_slim: Optional[List[SlimChunk]]
228
269
  full_markdown: str
229
270
  hierarchy: Optional[Any]
271
+ toc_hierarchies: Optional[Any]
272
+ kb_csv: Optional[str]
273
+ hierarchy_view_html: Optional[str]
230
274
  raw_zip: bytes
231
275
 
232
276
  def __init__(
@@ -234,14 +278,22 @@ class ParseResult:
234
278
  *,
235
279
  manifest: Manifest,
236
280
  chunks: List[Chunk],
281
+ chunks_slim: Optional[List[SlimChunk]],
237
282
  full_markdown: str,
238
283
  hierarchy: Optional[Any],
284
+ toc_hierarchies: Optional[Any],
285
+ kb_csv: Optional[str],
286
+ hierarchy_view_html: Optional[str],
239
287
  raw_zip: bytes,
240
288
  ) -> None:
241
289
  self.manifest = manifest
242
290
  self.chunks = chunks
291
+ self.chunks_slim = chunks_slim
243
292
  self.full_markdown = full_markdown
244
293
  self.hierarchy = hierarchy
294
+ self.toc_hierarchies = toc_hierarchies
295
+ self.kb_csv = kb_csv
296
+ self.hierarchy_view_html = hierarchy_view_html
245
297
  self.raw_zip = raw_zip
246
298
 
247
299
  # -- convenience properties --
@@ -296,10 +348,58 @@ class ParseResult:
296
348
  dir_path: Path = Path(directory)
297
349
  dir_path.mkdir(parents=True, exist_ok=True)
298
350
 
351
+ # Manifest / chunks
352
+ manifest_path: Path = dir_path / "manifest.json"
353
+ manifest_path.write_text(
354
+ self.manifest.model_dump_json(indent=2),
355
+ encoding="utf-8",
356
+ )
357
+
358
+ chunks_path: Path = dir_path / "chunks.json"
359
+ chunks_path.write_text(
360
+ json.dumps([chunk.model_dump() for chunk in self.chunks], indent=2),
361
+ encoding="utf-8",
362
+ )
363
+
364
+ if self.chunks_slim is not None:
365
+ chunks_slim_path: Path = dir_path / "chunks_slim.json"
366
+ chunks_slim_path.write_text(
367
+ json.dumps(
368
+ {"chunks": [chunk.model_dump() for chunk in self.chunks_slim]},
369
+ indent=2,
370
+ ),
371
+ encoding="utf-8",
372
+ )
373
+
299
374
  # Full markdown
300
375
  md_path: Path = dir_path / "full.md"
301
376
  md_path.write_text(self.full_markdown, encoding="utf-8")
302
377
 
378
+ if self.hierarchy is not None:
379
+ hierarchy_path: Path = dir_path / "hierarchy.json"
380
+ hierarchy_path.write_text(
381
+ json.dumps(self.hierarchy, indent=2),
382
+ encoding="utf-8",
383
+ )
384
+
385
+ if self.toc_hierarchies is not None:
386
+ toc_hierarchies_path: Path = dir_path / "toc_hierarchies.json"
387
+ toc_hierarchies_path.write_text(
388
+ json.dumps(self.toc_hierarchies, indent=2),
389
+ encoding="utf-8",
390
+ )
391
+
392
+ if self.kb_csv is not None:
393
+ kb_csv_path: Path = dir_path / "kb.csv"
394
+ kb_csv_path.write_text(self.kb_csv, encoding="utf-8")
395
+
396
+ if self.hierarchy_view_html is not None:
397
+ hierarchy_view_path: Path = dir_path / "hierarchy_view.html"
398
+ hierarchy_view_path.write_text(
399
+ self.hierarchy_view_html,
400
+ encoding="utf-8",
401
+ )
402
+
303
403
  # Images
304
404
  if self.image_chunks:
305
405
  images_dir: Path = dir_path / "images"
@@ -4,8 +4,6 @@ from __future__ import annotations
4
4
 
5
5
  from typing import Any, Dict, List, Optional
6
6
 
7
- import pytest
8
-
9
7
  from knowhere.types.job import Job, JobError, JobResult
10
8
  from knowhere.types.result import (
11
9
  BaseChunk,
@@ -16,6 +14,10 @@ from knowhere.types.result import (
16
14
  ImageFileInfo,
17
15
  Manifest,
18
16
  ParseResult,
17
+ ProcessingCost,
18
+ ProcessingMetadata,
19
+ ProcessingTiming,
20
+ SlimChunk,
19
21
  Statistics,
20
22
  TableChunk,
21
23
  TableFileInfo,
@@ -269,6 +271,27 @@ class TestManifestModel:
269
271
  assert manifest.statistics is None
270
272
  assert manifest.files is None
271
273
 
274
+ def test_processing_metadata(self) -> None:
275
+ manifest: Manifest = Manifest(
276
+ version="2.0",
277
+ processing=ProcessingMetadata(
278
+ page_count=12,
279
+ billing_status="charged",
280
+ cost=ProcessingCost(micro_dollars=60000, credits=0.06),
281
+ timing=ProcessingTiming(
282
+ started_at="2026-04-09T08:20:56.634Z",
283
+ completed_at="2026-04-09T08:21:12.288Z",
284
+ duration_ms=15653,
285
+ ),
286
+ ),
287
+ )
288
+ assert manifest.processing is not None
289
+ assert manifest.processing.page_count == 12
290
+ assert manifest.processing.cost is not None
291
+ assert manifest.processing.cost.micro_dollars == 60000
292
+ assert manifest.processing.timing is not None
293
+ assert manifest.processing.timing.duration_ms == 15653
294
+
272
295
 
273
296
  # ---------------------------------------------------------------------------
274
297
  # Statistics model
@@ -375,6 +398,13 @@ class TestBaseChunkModel:
375
398
  chunk: BaseChunk = BaseChunk(chunk_id="chunk_2", type="text")
376
399
  assert chunk.content == ""
377
400
  assert chunk.path is None
401
+ assert chunk.page_nums is None
402
+
403
+ def test_page_nums_supported(self) -> None:
404
+ chunk: BaseChunk = BaseChunk(
405
+ chunk_id="chunk_3", type="text", page_nums=[1, 2]
406
+ )
407
+ assert chunk.page_nums == [1, 2]
378
408
 
379
409
 
380
410
  # ---------------------------------------------------------------------------
@@ -391,18 +421,23 @@ class TestTextChunkModel:
391
421
  content="Some text content",
392
422
  path="doc/section1",
393
423
  length=17,
424
+ page_nums=[1, 2],
394
425
  tokens=["Some", "text", "content"],
395
426
  keywords=["text", "content"],
396
427
  summary="A text chunk",
428
+ connect_to=[{"target": "img_1", "relation": "embeds"}],
397
429
  relationships=[{"target": "text_2", "type": "follows"}],
398
430
  )
399
431
  assert chunk.chunk_id == "text_1"
400
432
  assert chunk.type == "text"
401
433
  assert chunk.content == "Some text content"
402
434
  assert chunk.length == 17
435
+ assert chunk.page_nums == [1, 2]
403
436
  assert chunk.tokens == ["Some", "text", "content"]
404
437
  assert chunk.keywords == ["text", "content"]
405
438
  assert chunk.summary == "A text chunk"
439
+ assert chunk.connect_to is not None
440
+ assert len(chunk.connect_to) == 1
406
441
  assert chunk.relationships is not None
407
442
  assert len(chunk.relationships) == 1
408
443
 
@@ -413,6 +448,7 @@ class TestTextChunkModel:
413
448
  assert chunk.tokens is None
414
449
  assert chunk.keywords is None
415
450
  assert chunk.summary is None
451
+ assert chunk.connect_to is None
416
452
  assert chunk.relationships is None
417
453
 
418
454
  def test_is_instance_of_base_chunk(self) -> None:
@@ -567,8 +603,19 @@ def _build_parse_result(
567
603
  return ParseResult(
568
604
  manifest=manifest,
569
605
  chunks=chunks if chunks is not None else default_chunks,
606
+ chunks_slim=[
607
+ SlimChunk(
608
+ type="text",
609
+ path="doc/section1",
610
+ content="Hello world",
611
+ summary="Greeting",
612
+ )
613
+ ],
570
614
  full_markdown="# Test\n\nHello world",
571
615
  hierarchy=None,
616
+ toc_hierarchies=[{"toc_range": [1, 3]}],
617
+ kb_csv="chunk_id,type\ntext_1,text\n",
618
+ hierarchy_view_html="<html><body>Hierarchy</body></html>",
572
619
  raw_zip=b"fake zip bytes",
573
620
  )
574
621
 
@@ -657,3 +704,11 @@ class TestParseResult:
657
704
  def test_raw_zip_accessible(self) -> None:
658
705
  result: ParseResult = _build_parse_result()
659
706
  assert result.raw_zip == b"fake zip bytes"
707
+
708
+ def test_optimized_result_fields_accessible(self) -> None:
709
+ result: ParseResult = _build_parse_result()
710
+ assert result.chunks_slim is not None
711
+ assert result.chunks_slim[0].path == "doc/section1"
712
+ assert result.toc_hierarchies == [{"toc_range": [1, 3]}]
713
+ assert result.kb_csv == "chunk_id,type\ntext_1,text\n"
714
+ assert result.hierarchy_view_html == "<html><body>Hierarchy</body></html>"
@@ -56,6 +56,7 @@ TEXT_TOKENS_LIST: List[str] = ["Ashish", "Vaswani", "attention", "transformer"]
56
56
 
57
57
  MARKDOWN: str = "# Test\n\nHello world"
58
58
  IMAGE_BYTES: bytes = b"\xff\xd8\xff\xe0"
59
+ TABLE_HTML: str = "<table><tr><td>Optimized</td></tr></table>"
59
60
 
60
61
 
61
62
  def _build_zip(
@@ -120,6 +121,91 @@ def _make_manifest(checksum_value: str = "") -> Dict[str, Any]:
120
121
  }
121
122
 
122
123
 
124
+ def _make_optimized_manifest() -> Dict[str, Any]:
125
+ """Build a manifest dict matching the current optimized API payload."""
126
+ return {
127
+ "version": "2.0",
128
+ "job_id": "job_optimized123",
129
+ "data_id": None,
130
+ "source_file_name": "optimized.pdf",
131
+ "processing_date": "2026-04-09T08:21:12.294Z",
132
+ "processing": {
133
+ "page_count": 12,
134
+ "billing_status": "charged",
135
+ "cost": {
136
+ "micro_dollars": 60000,
137
+ "credits": 0.06,
138
+ },
139
+ "timing": {
140
+ "started_at": "2026-04-09T08:20:56.634Z",
141
+ "completed_at": "2026-04-09T08:21:12.288Z",
142
+ "duration_ms": 15653,
143
+ },
144
+ },
145
+ "statistics": {
146
+ "total_chunks": 3,
147
+ "text_chunks": 1,
148
+ "image_chunks": 1,
149
+ "table_chunks": 1,
150
+ "total_pages": None,
151
+ },
152
+ }
153
+
154
+
155
+ def _make_optimized_chunks() -> List[Dict[str, Any]]:
156
+ """Build chunks matching the current optimized API payload."""
157
+ return [
158
+ {
159
+ "chunk_id": "text_chunk_optimized",
160
+ "type": "text",
161
+ "content": "Text chunk with embedded resources.",
162
+ "path": "Default_Root/optimized.pdf-->Section 1",
163
+ "metadata": {
164
+ "length": 35,
165
+ "summary": "",
166
+ "page_nums": [1, 2],
167
+ "tokens": ["Text", "chunk"],
168
+ "keywords": ["optimized"],
169
+ "connect_to": [
170
+ {
171
+ "target": "image_chunk_optimized",
172
+ "relation": "embeds",
173
+ "ref": "[images/IMAGE_test1.jpg]",
174
+ }
175
+ ],
176
+ },
177
+ },
178
+ {
179
+ "chunk_id": "image_chunk_optimized",
180
+ "type": "image",
181
+ "content": "[images/IMAGE_test1.jpg]",
182
+ "path": "images/IMAGE_test1.jpg",
183
+ "metadata": {
184
+ "length": 1,
185
+ "summary": "Optimized image chunk",
186
+ "page_nums": [2],
187
+ "file_path": "images/IMAGE_test1.jpg",
188
+ "keywords": [],
189
+ "tokens": [],
190
+ },
191
+ },
192
+ {
193
+ "chunk_id": "table_chunk_optimized",
194
+ "type": "table",
195
+ "content": TABLE_HTML,
196
+ "path": "tables/table-optimized.html",
197
+ "metadata": {
198
+ "length": 1,
199
+ "summary": "Optimized table chunk",
200
+ "page_nums": [3],
201
+ "file_path": "tables/table-optimized.html",
202
+ "keywords": ["optimized"],
203
+ "tokens": [],
204
+ },
205
+ },
206
+ ]
207
+
208
+
123
209
  # ---------------------------------------------------------------------------
124
210
  # Valid ZIP parsing
125
211
  # ---------------------------------------------------------------------------
@@ -254,6 +340,114 @@ class TestParseValidZip:
254
340
 
255
341
  assert result.getChunk("nonexistent") is None
256
342
 
343
+ def test_exposes_optimized_payload_metadata_and_sidecar_assets(self) -> None:
344
+ manifest: Dict[str, Any] = _make_optimized_manifest()
345
+ chunks: List[Dict[str, Any]] = _make_optimized_chunks()
346
+ zip_bytes: bytes = _build_zip(
347
+ manifest,
348
+ chunks=chunks,
349
+ markdown="# Optimized Result\n\nBody",
350
+ extra_entries={
351
+ "chunks_slim.json": json.dumps(
352
+ {
353
+ "chunks": [
354
+ {
355
+ "type": "text",
356
+ "path": "Default_Root/optimized.pdf-->Section 1",
357
+ "content": "Text chunk with embedded resources.",
358
+ "summary": "",
359
+ }
360
+ ]
361
+ }
362
+ ).encode("utf-8"),
363
+ "kb.csv": b"chunk_id,type\ntext_chunk_optimized,text\n",
364
+ "hierarchy.json": json.dumps(
365
+ {"Default_Root": {"optimized.pdf": {}}}
366
+ ).encode("utf-8"),
367
+ "toc_hierarchies.json": json.dumps(
368
+ [{"toc_range": [1, 3], "scan_range": [1, 10]}]
369
+ ).encode("utf-8"),
370
+ "hierarchy_view.html": b"<html><body>Optimized hierarchy view</body></html>",
371
+ "tables/table-optimized.html": TABLE_HTML.encode("utf-8"),
372
+ },
373
+ )
374
+
375
+ result: ParseResult = parseResultZip(zip_bytes, verify_checksum=False)
376
+
377
+ assert result.manifest.version == "2.0"
378
+ assert result.manifest.files is None
379
+ assert result.manifest.processing is not None
380
+ assert result.manifest.processing.page_count == 12
381
+ assert result.manifest.processing.billing_status == "charged"
382
+ assert result.manifest.processing.cost is not None
383
+ assert result.manifest.processing.cost.micro_dollars == 60000
384
+ assert result.text_chunks[0].page_nums == [1, 2]
385
+ assert result.image_chunks[0].page_nums == [2]
386
+ assert result.table_chunks[0].page_nums == [3]
387
+ assert result.text_chunks[0].connect_to == [
388
+ {
389
+ "target": "image_chunk_optimized",
390
+ "relation": "embeds",
391
+ "ref": "[images/IMAGE_test1.jpg]",
392
+ }
393
+ ]
394
+ assert result.chunks_slim is not None
395
+ assert len(result.chunks_slim) == 1
396
+ assert result.kb_csv == "chunk_id,type\ntext_chunk_optimized,text\n"
397
+ assert result.toc_hierarchies == [{"toc_range": [1, 3], "scan_range": [1, 10]}]
398
+ assert result.hierarchy_view_html == "<html><body>Optimized hierarchy view</body></html>"
399
+ assert result.hierarchy == {"Default_Root": {"optimized.pdf": {}}}
400
+
401
+ def test_save_preserves_optimized_sidecar_files(self, tmp_path: Path) -> None:
402
+ manifest: Dict[str, Any] = _make_optimized_manifest()
403
+ chunks: List[Dict[str, Any]] = _make_optimized_chunks()
404
+ zip_bytes: bytes = _build_zip(
405
+ manifest,
406
+ chunks=chunks,
407
+ markdown="# Optimized Result\n\nBody",
408
+ extra_entries={
409
+ "chunks_slim.json": json.dumps(
410
+ {
411
+ "chunks": [
412
+ {
413
+ "type": "text",
414
+ "path": "Default_Root/optimized.pdf-->Section 1",
415
+ "content": "Text chunk with embedded resources.",
416
+ "summary": "",
417
+ }
418
+ ]
419
+ }
420
+ ).encode("utf-8"),
421
+ "kb.csv": b"chunk_id,type\ntext_chunk_optimized,text\n",
422
+ "hierarchy.json": json.dumps(
423
+ {"Default_Root": {"optimized.pdf": {}}}
424
+ ).encode("utf-8"),
425
+ "toc_hierarchies.json": json.dumps(
426
+ [{"toc_range": [1, 3], "scan_range": [1, 10]}]
427
+ ).encode("utf-8"),
428
+ "hierarchy_view.html": b"<html><body>Optimized hierarchy view</body></html>",
429
+ "tables/table-optimized.html": TABLE_HTML.encode("utf-8"),
430
+ },
431
+ )
432
+
433
+ result: ParseResult = parseResultZip(zip_bytes, verify_checksum=False)
434
+ output_dir: Path = tmp_path / "optimized-result"
435
+
436
+ saved_path: Path = result.save(output_dir)
437
+
438
+ assert saved_path == output_dir.resolve()
439
+ assert (output_dir / "manifest.json").exists()
440
+ assert (output_dir / "chunks.json").exists()
441
+ assert (output_dir / "full.md").exists()
442
+ assert (output_dir / "hierarchy.json").exists()
443
+ assert (output_dir / "chunks_slim.json").exists()
444
+ assert (output_dir / "kb.csv").exists()
445
+ assert (output_dir / "toc_hierarchies.json").exists()
446
+ assert (output_dir / "hierarchy_view.html").exists()
447
+ assert (output_dir / "images" / "IMAGE_test1.jpg").exists()
448
+ assert (output_dir / "tables" / "table-optimized.html").exists()
449
+ assert (output_dir / "result.zip").exists()
450
+
257
451
 
258
452
  # ---------------------------------------------------------------------------
259
453
  # Checksum verification
@@ -334,6 +528,7 @@ class TestMissingRequiredFiles:
334
528
  result: ParseResult = parseResultZip(
335
529
  zip_bytes, verify_checksum=False
336
530
  )
531
+ assert result.chunks == []
337
532
 
338
533
 
339
534
  # ---------------------------------------------------------------------------
@@ -1,3 +0,0 @@
1
- {
2
- ".": "0.2.0"
3
- }
@@ -1 +0,0 @@
1
- __version__ = "0.2.0" # x-release-please-version