knowhere-python-sdk 0.3.2__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. knowhere_python_sdk-0.5.0/.release-please-manifest.json +3 -0
  2. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/CHANGELOG.md +14 -0
  3. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/PKG-INFO +16 -1
  4. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/README.md +15 -0
  5. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/docs/usage.md +64 -29
  6. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/pyproject.toml +1 -1
  7. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/__init__.py +14 -1
  8. knowhere_python_sdk-0.5.0/src/knowhere/_version.py +1 -0
  9. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/result_parser.py +18 -59
  10. knowhere_python_sdk-0.5.0/src/knowhere/resources/documents.py +191 -0
  11. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/retrieval.py +6 -0
  12. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/types/__init__.py +14 -1
  13. knowhere_python_sdk-0.5.0/src/knowhere/types/document.py +78 -0
  14. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/types/result.py +105 -40
  15. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/types/retrieval.py +10 -3
  16. knowhere_python_sdk-0.5.0/tests/test_documents.py +219 -0
  17. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_models.py +10 -47
  18. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_result_parser.py +200 -105
  19. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_retrieval.py +93 -0
  20. knowhere_python_sdk-0.3.2/.release-please-manifest.json +0 -3
  21. knowhere_python_sdk-0.3.2/src/knowhere/_version.py +0 -1
  22. knowhere_python_sdk-0.3.2/src/knowhere/resources/documents.py +0 -74
  23. knowhere_python_sdk-0.3.2/src/knowhere/types/document.py +0 -28
  24. knowhere_python_sdk-0.3.2/tests/test_documents.py +0 -106
  25. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/ISSUE_TEMPLATE/bug-report.yml +0 -0
  26. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/ISSUE_TEMPLATE/config.yml +0 -0
  27. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/ISSUE_TEMPLATE/feature-request.yml +0 -0
  28. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/pull_request_template.md +0 -0
  29. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/workflows/ci.yml +0 -0
  30. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/workflows/publish-pypi.yml +0 -0
  31. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.github/workflows/publish.yml +0 -0
  32. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/.gitignore +0 -0
  33. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/CODE_OF_CONDUCT.md +0 -0
  34. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/CONTRIBUTING.md +0 -0
  35. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/LICENSE +0 -0
  36. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/SECURITY.md +0 -0
  37. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/examples/async_usage.py +0 -0
  38. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/examples/error_handling.py +0 -0
  39. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/examples/parse_file.py +0 -0
  40. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/examples/parse_url.py +0 -0
  41. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/examples/step_by_step.py +0 -0
  42. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/release-please-config.json +0 -0
  43. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_base_client.py +0 -0
  44. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_client.py +0 -0
  45. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_constants.py +0 -0
  46. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_exceptions.py +0 -0
  47. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_logging.py +0 -0
  48. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_response.py +0 -0
  49. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/_types.py +0 -0
  50. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/__init__.py +0 -0
  51. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/polling.py +0 -0
  52. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/lib/upload.py +0 -0
  53. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/py.typed +0 -0
  54. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/__init__.py +0 -0
  55. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/_base.py +0 -0
  56. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/resources/jobs.py +0 -0
  57. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/types/job.py +0 -0
  58. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/types/params.py +0 -0
  59. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/src/knowhere/types/shared.py +0 -0
  60. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/__init__.py +0 -0
  61. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/conftest.py +0 -0
  62. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/fixtures/real_result.zip +0 -0
  63. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_client.py +0 -0
  64. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_exceptions.py +0 -0
  65. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_jobs.py +0 -0
  66. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_logging.py +0 -0
  67. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_parse.py +0 -0
  68. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_polling.py +0 -0
  69. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_retry.py +0 -0
  70. {knowhere_python_sdk-0.3.2 → knowhere_python_sdk-0.5.0}/tests/test_upload.py +0 -0
@@ -0,0 +1,3 @@
1
+ {
2
+ ".": "0.5.0"
3
+ }
@@ -1,5 +1,19 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.5.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.4.0...v0.5.0) (2026-05-15)
4
+
5
+
6
+ ### Features
7
+
8
+ * sync SDK with current worker ZIP contract and agentic retrieval API ([ad8db2e](https://github.com/Ontos-AI/knowhere-python-sdk/commit/ad8db2e87c77978928d046c95565e9e60c1b1f4e))
9
+
10
+ ## [0.4.0](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.3.2...v0.4.0) (2026-04-27)
11
+
12
+
13
+ ### Features
14
+
15
+ * add document chunks resource methods ([73094d4](https://github.com/Ontos-AI/knowhere-python-sdk/commit/73094d4f95ef693785fa3965f6f2a223dfd2a350))
16
+
3
17
  ## [0.3.2](https://github.com/Ontos-AI/knowhere-python-sdk/compare/v0.3.1...v0.3.2) (2026-04-23)
4
18
 
5
19
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: knowhere-python-sdk
3
- Version: 0.3.2
3
+ Version: 0.5.0
4
4
  Summary: Official Python SDK for the Knowhere document parsing API
5
5
  Project-URL: Homepage, https://knowhereto.ai
6
6
  Project-URL: Documentation, https://docs.knowhereto.ai
@@ -118,6 +118,21 @@ update_job = client.jobs.create(
118
118
  document = client.documents.get(document_id)
119
119
  print(document.status)
120
120
 
121
+ chunks = client.documents.list_chunks(
122
+ document_id,
123
+ page=1,
124
+ page_size=50,
125
+ chunk_type="text",
126
+ )
127
+ print(chunks.pagination.total)
128
+ if chunks.chunks:
129
+ chunk = client.documents.get_chunk(
130
+ document_id,
131
+ chunks.chunks[0].id,
132
+ include_asset_urls=True,
133
+ )
134
+ print(chunk.chunk.content)
135
+
121
136
  client.documents.archive(document_id)
122
137
  ```
123
138
 
@@ -85,6 +85,21 @@ update_job = client.jobs.create(
85
85
  document = client.documents.get(document_id)
86
86
  print(document.status)
87
87
 
88
+ chunks = client.documents.list_chunks(
89
+ document_id,
90
+ page=1,
91
+ page_size=50,
92
+ chunk_type="text",
93
+ )
94
+ print(chunks.pagination.total)
95
+ if chunks.chunks:
96
+ chunk = client.documents.get_chunk(
97
+ document_id,
98
+ chunks.chunks[0].id,
99
+ include_asset_urls=True,
100
+ )
101
+ print(chunk.chunk.content)
102
+
88
103
  client.documents.archive(document_id)
89
104
  ```
90
105
 
@@ -1,5 +1,9 @@
1
1
  # Knowhere Python SDK — Usage Guide
2
2
 
3
+ > **Recent changes:** Chunk metadata fields (`tokens`, `keywords`, `summary`,
4
+ > `length`, etc.) are no longer flattened to the chunk surface. Access them
5
+ > through `chunk.metadata` instead. See [Chunk Types](#chunk-types).
6
+
3
7
  Comprehensive reference for every feature, parameter, and pattern in the SDK.
4
8
 
5
9
  ## Table of Contents
@@ -219,8 +223,13 @@ result.table_chunks # List[TableChunk]
219
223
  # Lookup by ID
220
224
  chunk = result.getChunk("chunk_42")
221
225
 
222
- # Hierarchy data (document structure tree, if available)
223
- result.hierarchy
226
+ # Document navigation tree (from doc_nav.json, current worker output)
227
+ result.doc_nav # DocNav | None
228
+ result.doc_nav.sections # List[DocNavSection] — tree of titles/paths/levels
229
+ result.doc_nav.resources # DocNavResources — image/table resource summaries
230
+
231
+ # Legacy hierarchy (from hierarchy.json, older worker output)
232
+ result.hierarchy # Any | None
224
233
 
225
234
  # Raw ZIP bytes (for archival)
226
235
  result.raw_zip
@@ -239,49 +248,48 @@ result.save("./output/report/")
239
248
 
240
249
  ## Chunk Types
241
250
 
242
- Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`). Each type adds its own fields.
251
+ Every chunk shares a base set of fields (`chunk_id`, `type`, `content`, `path`,
252
+ `metadata`). Worker metadata is kept in the `metadata` dict — it is **not**
253
+ flattened to top-level chunk properties.
243
254
 
244
- ### TextChunk
255
+ ### Base fields (all chunk types)
245
256
 
246
257
  | Field | Type | Description |
247
258
  |-------|------|-------------|
248
259
  | `chunk_id` | `str` | Unique identifier |
249
- | `type` | `str` | Always `"text"` |
250
- | `content` | `str` | The text content |
251
- | `path` | `str \| None` | Document structure path (e.g. `"Section 1 > Subsection 2"`) |
252
- | `length` | `int` | Character count |
253
- | `tokens` | `List[str] \| None` | Tokenized words returned by the parser pipeline |
254
- | `keywords` | `List[str] \| None` | Extracted keywords (requires `summary_txt: True`) |
255
- | `summary` | `str \| None` | AI-generated summary (requires `summary_txt: True`) |
256
- | `relationships` | `List \| None` | Relationships to other chunks |
260
+ | `type` | `str` | `"text"`, `"image"`, or `"table"` |
261
+ | `content` | `str` | Text content or placeholder |
262
+ | `path` | `str \| None` | Document structure path |
263
+ | `metadata` | `dict` | Raw worker metadata (tokens, keywords, summary, length, page_nums, etc.) |
264
+
265
+ ### TextChunk
257
266
 
258
267
  ```python
259
268
  for chunk in result.text_chunks:
260
269
  print(f"[{chunk.chunk_id}] {chunk.content[:60]}...")
261
- if chunk.keywords:
262
- print(f" Keywords: {', '.join(chunk.keywords)}")
263
- if chunk.summary:
264
- print(f" Summary: {chunk.summary}")
270
+ # Metadata is in chunk.metadata, not flattened:
271
+ keywords = chunk.metadata.get("keywords", [])
272
+ summary = chunk.metadata.get("summary")
273
+ if keywords:
274
+ print(f" Keywords: {', '.join(keywords)}")
275
+ if summary:
276
+ print(f" Summary: {summary}")
265
277
  ```
266
278
 
267
279
  ### ImageChunk
268
280
 
269
281
  | Field | Type | Description |
270
282
  |-------|------|-------------|
271
- | `chunk_id` | `str` | Unique identifier |
272
- | `type` | `str` | Always `"image"` |
273
- | `content` | `str` | Text content associated with the image |
274
283
  | `file_path` | `str \| None` | Path within the ZIP |
275
- | `original_name` | `str \| None` | Original filename |
276
- | `summary` | `str \| None` | AI-generated image description (requires `summary_image: True`) |
277
284
  | `data` | `bytes` | Raw image bytes (loaded from ZIP) |
278
285
  | `format` | `str \| None` | Image format inferred from extension (property) |
279
286
 
280
287
  ```python
281
288
  for img in result.image_chunks:
282
289
  print(f"{img.file_path} ({len(img.data)} bytes, {img.format})")
283
- if img.summary:
284
- print(f" Description: {img.summary}")
290
+ summary = img.metadata.get("summary")
291
+ if summary:
292
+ print(f" Description: {summary}")
285
293
  img.save("./output/images/") # writes to disk
286
294
  ```
287
295
 
@@ -289,13 +297,7 @@ for img in result.image_chunks:
289
297
 
290
298
  | Field | Type | Description |
291
299
  |-------|------|-------------|
292
- | `chunk_id` | `str` | Unique identifier |
293
- | `type` | `str` | Always `"table"` |
294
- | `content` | `str` | Text representation of the table |
295
300
  | `file_path` | `str \| None` | Path within the ZIP |
296
- | `original_name` | `str \| None` | Original filename |
297
- | `table_type` | `str \| None` | Table classification |
298
- | `summary` | `str \| None` | AI-generated table summary (requires `summary_table: True`) |
299
301
  | `html` | `str` | Full HTML of the table (loaded from ZIP) |
300
302
 
301
303
  ```python
@@ -471,6 +473,19 @@ response = client.retrieval.query(
471
473
  top_k=5,
472
474
  )
473
475
 
476
+ # Agentic mode (LLM navigation + answer synthesis)
477
+ response = client.retrieval.query(
478
+ namespace="support-center",
479
+ query="How do I pair a Bluetooth headset?",
480
+ use_agentic=True,
481
+ top_k=5,
482
+ )
483
+ print(response.answer_text) # LLM-generated natural-language answer
484
+ print(response.router_used) # "workflow_single_step", "small_kb_all", etc.
485
+ for ref in response.referenced_chunks:
486
+ print(ref.get("chunk_id"), ref.get("asset_url"))
487
+
488
+ # Legacy results are always available
474
489
  for result in response.results:
475
490
  print(result.content)
476
491
  print(result.score)
@@ -479,6 +494,10 @@ for result in response.results:
479
494
  print(result.source.section_path)
480
495
  ```
481
496
 
497
+ | Parameter | Type | Default | Description |
498
+ |-----------|------|---------|-------------|
499
+ | `use_agentic` | `bool \| None` | `None` | Force agentic (`True`) or legacy (`False`) retrieval. `None` uses server default. |
500
+
482
501
  Retrieval results expose `content`, not the older parse-result `text` field.
483
502
  Media results may include `asset_url` when the server can sign the referenced
484
503
  artifact.
@@ -521,6 +540,22 @@ for document in document_list.documents:
521
540
  document = client.documents.get("doc_123")
522
541
  print(document.current_job_result_id)
523
542
 
543
+ chunks = client.documents.list_chunks(
544
+ "doc_123",
545
+ page=1,
546
+ page_size=50,
547
+ chunk_type="text",
548
+ )
549
+ for chunk in chunks.chunks:
550
+ print(chunk.id, chunk.content)
551
+
552
+ image_chunk = client.documents.get_chunk(
553
+ "doc_123",
554
+ "dchk_123",
555
+ include_asset_urls=True,
556
+ )
557
+ print(image_chunk.chunk.asset_url)
558
+
524
559
  archived = client.documents.archive("doc_123")
525
560
  print(archived.status) # "archived"
526
561
  ```
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "knowhere-python-sdk"
7
- version = "0.3.2"
7
+ version = "0.5.0"
8
8
  description = "Official Python SDK for the Knowhere document parsing API"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -35,7 +35,15 @@ from knowhere._exceptions import (
35
35
  )
36
36
  from knowhere._types import PollProgressCallback, UploadProgressCallback
37
37
  from knowhere._version import __version__
38
- from knowhere.types.document import Document, DocumentListResponse
38
+ from knowhere.types.document import (
39
+ Document,
40
+ DocumentChunk,
41
+ DocumentChunkListResponse,
42
+ DocumentChunkPagination,
43
+ DocumentChunkResponse,
44
+ DocumentChunkType,
45
+ DocumentListResponse,
46
+ )
39
47
  from knowhere.types.job import Job, JobError, JobProgress, JobResult
40
48
  from knowhere.types.params import ParsingParams, WebhookConfig
41
49
  from knowhere.types.retrieval import (
@@ -98,6 +106,11 @@ __all__: list[str] = [
98
106
  "JobResult",
99
107
  # Document types
100
108
  "Document",
109
+ "DocumentChunk",
110
+ "DocumentChunkListResponse",
111
+ "DocumentChunkPagination",
112
+ "DocumentChunkResponse",
113
+ "DocumentChunkType",
101
114
  "DocumentListResponse",
102
115
  # Retrieval types
103
116
  "RetrievalChannel",
@@ -0,0 +1 @@
1
+ __version__ = "0.5.0" # x-release-please-version
@@ -13,13 +13,13 @@ from knowhere._exceptions import ChecksumError, KnowhereError
13
13
  from knowhere._logging import getLogger
14
14
  from knowhere.types.result import (
15
15
  Chunk,
16
+ DocNav,
16
17
  ImageChunk,
17
18
  Manifest,
18
19
  ParseResult,
19
20
  SlimChunk,
20
21
  TableChunk,
21
22
  TextChunk,
22
- TextChunkTokens,
23
23
  )
24
24
 
25
25
  _logger = getLogger()
@@ -81,38 +81,6 @@ def _extractFilePath(raw: Dict[str, Any]) -> Optional[str]:
81
81
  return fallback
82
82
 
83
83
 
84
- def _normalizeTokenList(raw_tokens: List[Any]) -> List[str]:
85
- """Return a string-only token list with empty values removed."""
86
- normalized_tokens: List[str] = []
87
- for raw_token in raw_tokens:
88
- token_text: str = str(raw_token).strip()
89
- if token_text:
90
- normalized_tokens.append(token_text)
91
- return normalized_tokens
92
-
93
-
94
- def _parseTextChunkTokens(
95
- raw_tokens: Any,
96
- *,
97
- chunk_id: str,
98
- ) -> Optional[TextChunkTokens]:
99
- """Normalize text chunk tokens from the current backend payload."""
100
- if raw_tokens is None:
101
- return None
102
- if isinstance(raw_tokens, bool):
103
- raise KnowhereError(
104
- f"Invalid tokens payload for text chunk '{chunk_id}': expected list[str], got bool."
105
- )
106
- if isinstance(raw_tokens, list):
107
- return _normalizeTokenList(raw_tokens)
108
-
109
- raise KnowhereError(
110
- "Invalid tokens payload for text chunk "
111
- f"'{chunk_id}': expected list[str], "
112
- f"got {type(raw_tokens).__name__}."
113
- )
114
-
115
-
116
84
  def _buildChunks(
117
85
  raw_chunks: List[Dict[str, Any]],
118
86
  zf: zipfile.ZipFile,
@@ -125,58 +93,39 @@ def _buildChunks(
125
93
 
126
94
  if chunk_type == "image":
127
95
  image_data: bytes = b""
128
- # file_path may be at top level, inside metadata, or use path as fallback
129
96
  file_path: Optional[str] = _extractFilePath(raw)
130
97
  if file_path:
131
98
  image_data = _readZipBytes(zf, file_path) or b""
132
- metadata: Dict[str, Any] = raw.get("metadata", {})
133
99
  chunk: Chunk = ImageChunk(
134
100
  chunk_id=raw.get("chunk_id", ""),
135
101
  type="image",
136
102
  content=raw.get("content", ""),
137
103
  path=raw.get("path"),
138
- page_nums=metadata.get("page_nums", raw.get("page_nums")),
139
- length=metadata.get("length", raw.get("length", 0)),
140
104
  file_path=file_path,
141
- original_name=metadata.get("original_name", raw.get("original_name")),
142
- summary=metadata.get("summary", raw.get("summary")),
143
105
  data=image_data,
106
+ metadata=raw.get("metadata", {}),
144
107
  )
145
108
  elif chunk_type == "table":
146
109
  table_html: str = ""
147
110
  file_path = _extractFilePath(raw)
148
111
  if file_path:
149
112
  table_html = _readZipText(zf, file_path) or ""
150
- metadata = raw.get("metadata", {})
151
113
  chunk = TableChunk(
152
114
  chunk_id=raw.get("chunk_id", ""),
153
115
  type="table",
154
116
  content=raw.get("content", ""),
155
117
  path=raw.get("path"),
156
- page_nums=metadata.get("page_nums", raw.get("page_nums")),
157
- length=metadata.get("length", raw.get("length", 0)),
158
118
  file_path=file_path,
159
- original_name=metadata.get("original_name", raw.get("original_name")),
160
- table_type=metadata.get("table_type", raw.get("table_type")),
161
- summary=metadata.get("summary", raw.get("summary")),
162
119
  html=table_html,
120
+ metadata=raw.get("metadata", {}),
163
121
  )
164
122
  else:
165
- metadata = raw.get("metadata", {})
166
- chunk_id: str = raw.get("chunk_id", "")
167
- raw_tokens: Any = metadata.get("tokens", raw.get("tokens"))
168
123
  chunk = TextChunk(
169
- chunk_id=chunk_id,
124
+ chunk_id=raw.get("chunk_id", ""),
170
125
  type="text",
171
126
  content=raw.get("content", ""),
172
127
  path=raw.get("path"),
173
- page_nums=metadata.get("page_nums", raw.get("page_nums")),
174
- length=metadata.get("length", raw.get("length", 0)),
175
- tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
176
- keywords=metadata.get("keywords", raw.get("keywords")),
177
- summary=metadata.get("summary", raw.get("summary")),
178
- connect_to=metadata.get("connect_to", raw.get("connect_to")),
179
- relationships=metadata.get("relationships", raw.get("relationships")),
128
+ metadata=raw.get("metadata", {}),
180
129
  )
181
130
 
182
131
  chunks.append(chunk)
@@ -229,7 +178,15 @@ def parseResultZip(
229
178
  # -- Full markdown --
230
179
  full_markdown: str = _readZipText(zf, "full.md") or ""
231
180
 
232
- # -- Hierarchy --
181
+ # -- DocNav (current worker output) --
182
+ doc_nav_text: Optional[str] = _readZipText(zf, "doc_nav.json")
183
+ doc_nav: Optional[DocNav] = (
184
+ DocNav.model_validate(json.loads(doc_nav_text))
185
+ if doc_nav_text
186
+ else None
187
+ )
188
+
189
+ # -- Hierarchy (legacy — current worker no longer emits this) --
233
190
  hierarchy_text: Optional[str] = _readZipText(zf, "hierarchy.json")
234
191
  hierarchy: Optional[Any] = (
235
192
  json.loads(hierarchy_text) if hierarchy_text else None
@@ -263,11 +220,13 @@ def parseResultZip(
263
220
  return ParseResult(
264
221
  manifest=manifest,
265
222
  chunks=chunks,
266
- chunks_slim=chunks_slim,
267
223
  full_markdown=full_markdown,
224
+ raw_zip=zip_bytes,
225
+ doc_nav=doc_nav,
226
+ # Legacy — the current worker no longer emits these files
227
+ chunks_slim=chunks_slim,
268
228
  hierarchy=hierarchy,
269
229
  toc_hierarchies=toc_hierarchies,
270
230
  kb_csv=kb_csv,
271
231
  hierarchy_view_html=hierarchy_view_html,
272
- raw_zip=zip_bytes,
273
232
  )
@@ -0,0 +1,191 @@
1
+ """Documents resource for canonical document lifecycle operations."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, Optional
6
+
7
+ from knowhere.resources._base import AsyncAPIResource, SyncAPIResource
8
+ from knowhere.types.document import (
9
+ Document,
10
+ DocumentChunkListResponse,
11
+ DocumentChunkResponse,
12
+ DocumentChunkType,
13
+ DocumentListResponse,
14
+ )
15
+
16
+
17
class Documents(SyncAPIResource):
    """Blocking client for the ``/v1/documents`` endpoint family."""

    def list(self, *, namespace: Optional[str] = None) -> DocumentListResponse:
        """Fetch the canonical documents, optionally filtered by *namespace*.

        The ``namespace`` query parameter is only sent when the caller
        supplied a value; otherwise no query string is attached at all.
        """
        query: Dict[str, Any] = (
            {} if namespace is None else {"namespace": namespace}
        )
        # An empty mapping is collapsed to ``None`` so no params are passed.
        return self._request(
            "GET",
            "v1/documents",
            params=query or None,
            cast_to=DocumentListResponse,
        )

    def get(self, document_id: str) -> Document:
        """Fetch a single canonical document identified by *document_id*."""
        endpoint: str = f"v1/documents/{document_id}"
        return self._request("GET", endpoint, cast_to=Document)

    def list_chunks(
        self,
        document_id: str,
        *,
        page: int = 1,
        page_size: int = 50,
        chunk_type: Optional[DocumentChunkType] = None,
        include_asset_urls: bool = False,
    ) -> DocumentChunkListResponse:
        """Fetch one page of current-revision chunks for a canonical document.

        Args:
            document_id: Identifier interpolated into the request path.
            page: Page number to request.
            page_size: Number of chunks per page.
            chunk_type: Optional chunk-type filter.
            include_asset_urls: Whether to ask for asset URLs in the response.

        Returns:
            The response parsed as ``DocumentChunkListResponse``.
        """
        endpoint: str = f"v1/documents/{document_id}/chunks"
        query: Dict[str, Any] = _build_chunk_list_params(
            page=page,
            page_size=page_size,
            chunk_type=chunk_type,
            include_asset_urls=include_asset_urls,
        )
        # The helper may return an empty dict; pass ``None`` in that case.
        return self._request(
            "GET",
            endpoint,
            params=query or None,
            cast_to=DocumentChunkListResponse,
        )

    def get_chunk(
        self,
        document_id: str,
        document_chunk_id: str,
        *,
        include_asset_urls: bool = False,
    ) -> DocumentChunkResponse:
        """Fetch a single current-revision chunk of a canonical document.

        Args:
            document_id: Identifier of the owning document.
            document_chunk_id: Identifier of the chunk to fetch.
            include_asset_urls: Whether to ask for asset URLs in the response.

        Returns:
            The response parsed as ``DocumentChunkResponse``.
        """
        endpoint: str = f"v1/documents/{document_id}/chunks/{document_chunk_id}"
        query: Dict[str, Any] = _build_chunk_get_params(
            include_asset_urls=include_asset_urls,
        )
        return self._request(
            "GET",
            endpoint,
            params=query or None,
            cast_to=DocumentChunkResponse,
        )

    def archive(self, document_id: str) -> Document:
        """Archive the canonical document identified by *document_id*."""
        endpoint: str = f"v1/documents/{document_id}/archive"
        return self._request("POST", endpoint, cast_to=Document)
91
+
92
+
93
class AsyncDocuments(AsyncAPIResource):
    """Awaitable client for the ``/v1/documents`` endpoint family."""

    async def list(self, *, namespace: Optional[str] = None) -> DocumentListResponse:
        """Fetch the canonical documents, optionally filtered by *namespace*.

        The ``namespace`` query parameter is only sent when the caller
        supplied a value; otherwise no query string is attached at all.
        """
        query: Dict[str, Any] = (
            {} if namespace is None else {"namespace": namespace}
        )
        # An empty mapping is collapsed to ``None`` so no params are passed.
        return await self._request(
            "GET",
            "v1/documents",
            params=query or None,
            cast_to=DocumentListResponse,
        )

    async def get(self, document_id: str) -> Document:
        """Fetch a single canonical document identified by *document_id*."""
        endpoint: str = f"v1/documents/{document_id}"
        return await self._request("GET", endpoint, cast_to=Document)

    async def list_chunks(
        self,
        document_id: str,
        *,
        page: int = 1,
        page_size: int = 50,
        chunk_type: Optional[DocumentChunkType] = None,
        include_asset_urls: bool = False,
    ) -> DocumentChunkListResponse:
        """Fetch one page of current-revision chunks for a canonical document.

        Args:
            document_id: Identifier interpolated into the request path.
            page: Page number to request.
            page_size: Number of chunks per page.
            chunk_type: Optional chunk-type filter.
            include_asset_urls: Whether to ask for asset URLs in the response.

        Returns:
            The response parsed as ``DocumentChunkListResponse``.
        """
        endpoint: str = f"v1/documents/{document_id}/chunks"
        query: Dict[str, Any] = _build_chunk_list_params(
            page=page,
            page_size=page_size,
            chunk_type=chunk_type,
            include_asset_urls=include_asset_urls,
        )
        # The helper may return an empty dict; pass ``None`` in that case.
        return await self._request(
            "GET",
            endpoint,
            params=query or None,
            cast_to=DocumentChunkListResponse,
        )

    async def get_chunk(
        self,
        document_id: str,
        document_chunk_id: str,
        *,
        include_asset_urls: bool = False,
    ) -> DocumentChunkResponse:
        """Fetch a single current-revision chunk of a canonical document.

        Args:
            document_id: Identifier of the owning document.
            document_chunk_id: Identifier of the chunk to fetch.
            include_asset_urls: Whether to ask for asset URLs in the response.

        Returns:
            The response parsed as ``DocumentChunkResponse``.
        """
        endpoint: str = f"v1/documents/{document_id}/chunks/{document_chunk_id}"
        query: Dict[str, Any] = _build_chunk_get_params(
            include_asset_urls=include_asset_urls,
        )
        return await self._request(
            "GET",
            endpoint,
            params=query or None,
            cast_to=DocumentChunkResponse,
        )

    async def archive(self, document_id: str) -> Document:
        """Archive the canonical document identified by *document_id*."""
        endpoint: str = f"v1/documents/{document_id}/archive"
        return await self._request("POST", endpoint, cast_to=Document)
167
+
168
+
169
+ def _build_chunk_list_params(
170
+ *,
171
+ page: int,
172
+ page_size: int,
173
+ chunk_type: Optional[DocumentChunkType],
174
+ include_asset_urls: bool,
175
+ ) -> Dict[str, Any]:
176
+ params: Dict[str, Any] = {}
177
+ if page != 1:
178
+ params["page"] = page
179
+ if page_size != 50:
180
+ params["page_size"] = page_size
181
+ if chunk_type is not None:
182
+ params["chunk_type"] = chunk_type
183
+ if include_asset_urls:
184
+ params["include_asset_urls"] = True
185
+ return params
186
+
187
+
188
+ def _build_chunk_get_params(*, include_asset_urls: bool) -> Dict[str, Any]:
189
+ if not include_asset_urls:
190
+ return {}
191
+ return {"include_asset_urls": True}
@@ -22,6 +22,7 @@ class Retrieval(SyncAPIResource):
22
22
  query: str,
23
23
  namespace: Optional[str] = None,
24
24
  top_k: Optional[int] = None,
25
+ use_agentic: Optional[bool] = None,
25
26
  data_type: Optional[int] = None,
26
27
  signal_paths: Optional[list[str]] = None,
27
28
  filter_mode: Optional[RetrievalFilterMode] = None,
@@ -39,6 +40,8 @@ class Retrieval(SyncAPIResource):
39
40
  body["namespace"] = namespace
40
41
  if top_k is not None:
41
42
  body["top_k"] = top_k
43
+ if use_agentic is not None:
44
+ body["use_agentic"] = use_agentic
42
45
  if data_type is not None:
43
46
  body["data_type"] = data_type
44
47
  if signal_paths is not None:
@@ -77,6 +80,7 @@ class AsyncRetrieval(AsyncAPIResource):
77
80
  query: str,
78
81
  namespace: Optional[str] = None,
79
82
  top_k: Optional[int] = None,
83
+ use_agentic: Optional[bool] = None,
80
84
  data_type: Optional[int] = None,
81
85
  signal_paths: Optional[list[str]] = None,
82
86
  filter_mode: Optional[RetrievalFilterMode] = None,
@@ -94,6 +98,8 @@ class AsyncRetrieval(AsyncAPIResource):
94
98
  body["namespace"] = namespace
95
99
  if top_k is not None:
96
100
  body["top_k"] = top_k
101
+ if use_agentic is not None:
102
+ body["use_agentic"] = use_agentic
97
103
  if data_type is not None:
98
104
  body["data_type"] = data_type
99
105
  if signal_paths is not None:
@@ -2,7 +2,15 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from knowhere.types.document import Document, DocumentListResponse
5
+ from knowhere.types.document import (
6
+ Document,
7
+ DocumentChunk,
8
+ DocumentChunkListResponse,
9
+ DocumentChunkPagination,
10
+ DocumentChunkResponse,
11
+ DocumentChunkType,
12
+ DocumentListResponse,
13
+ )
6
14
  from knowhere.types.job import Job, JobError, JobResult
7
15
  from knowhere.types.params import ParsingParams, WebhookConfig
8
16
  from knowhere.types.retrieval import (
@@ -39,6 +47,11 @@ __all__: list[str] = [
39
47
  "JobResult",
40
48
  # document
41
49
  "Document",
50
+ "DocumentChunk",
51
+ "DocumentChunkListResponse",
52
+ "DocumentChunkPagination",
53
+ "DocumentChunkResponse",
54
+ "DocumentChunkType",
42
55
  "DocumentListResponse",
43
56
  # retrieval
44
57
  "RetrievalChannel",