knowhere-python-sdk 0.2.0__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
knowhere/__init__.py CHANGED
@@ -46,6 +46,10 @@ from knowhere.types.result import (
46
46
  ImageFileInfo,
47
47
  Manifest,
48
48
  ParseResult,
49
+ ProcessingCost,
50
+ ProcessingMetadata,
51
+ ProcessingTiming,
52
+ SlimChunk,
49
53
  Statistics,
50
54
  TableChunk,
51
55
  TableFileInfo,
@@ -91,6 +95,10 @@ __all__: list[str] = [
91
95
  "FileIndex",
92
96
  "ImageFileInfo",
93
97
  "TableFileInfo",
98
+ "ProcessingCost",
99
+ "ProcessingMetadata",
100
+ "ProcessingTiming",
101
+ "SlimChunk",
94
102
  "BaseChunk",
95
103
  "TextChunk",
96
104
  "ImageChunk",
knowhere/_exceptions.py CHANGED
@@ -387,11 +387,29 @@ def makeStatusError(
387
387
  response=response,
388
388
  )
389
389
 
390
- if exception_class in (RateLimitError, ServiceUnavailableError, GatewayTimeoutError):
391
- return exception_class(
390
+ if exception_class is RateLimitError:
391
+ return RateLimitError(
392
392
  status_code,
393
393
  **common_kwargs,
394
- retry_after=retry_after, # type: ignore[call-arg]
394
+ retry_after=retry_after,
395
+ limit=limit,
396
+ period=period,
397
+ )
398
+
399
+ if exception_class is ServiceUnavailableError:
400
+ return ServiceUnavailableError(
401
+ status_code,
402
+ **common_kwargs,
403
+ retry_after=retry_after,
404
+ limit=limit,
405
+ period=period,
406
+ )
407
+
408
+ if exception_class is GatewayTimeoutError:
409
+ return GatewayTimeoutError(
410
+ status_code,
411
+ **common_kwargs,
412
+ retry_after=retry_after,
395
413
  limit=limit,
396
414
  period=period,
397
415
  )
knowhere/_version.py CHANGED
@@ -1 +1 @@
1
- __version__ = "0.2.0" # x-release-please-version
1
+ __version__ = "0.2.1" # x-release-please-version
@@ -16,6 +16,7 @@ from knowhere.types.result import (
16
16
  ImageChunk,
17
17
  Manifest,
18
18
  ParseResult,
19
+ SlimChunk,
19
20
  TableChunk,
20
21
  TextChunk,
21
22
  TextChunkTokens,
@@ -134,6 +135,7 @@ def _buildChunks(
134
135
  type="image",
135
136
  content=raw.get("content", ""),
136
137
  path=raw.get("path"),
138
+ page_nums=metadata.get("page_nums", raw.get("page_nums")),
137
139
  length=metadata.get("length", raw.get("length", 0)),
138
140
  file_path=file_path,
139
141
  original_name=metadata.get("original_name", raw.get("original_name")),
@@ -151,6 +153,7 @@ def _buildChunks(
151
153
  type="table",
152
154
  content=raw.get("content", ""),
153
155
  path=raw.get("path"),
156
+ page_nums=metadata.get("page_nums", raw.get("page_nums")),
154
157
  length=metadata.get("length", raw.get("length", 0)),
155
158
  file_path=file_path,
156
159
  original_name=metadata.get("original_name", raw.get("original_name")),
@@ -167,10 +170,12 @@ def _buildChunks(
167
170
  type="text",
168
171
  content=raw.get("content", ""),
169
172
  path=raw.get("path"),
173
+ page_nums=metadata.get("page_nums", raw.get("page_nums")),
170
174
  length=metadata.get("length", raw.get("length", 0)),
171
175
  tokens=_parseTextChunkTokens(raw_tokens, chunk_id=chunk_id),
172
176
  keywords=metadata.get("keywords", raw.get("keywords")),
173
177
  summary=metadata.get("summary", raw.get("summary")),
178
+ connect_to=metadata.get("connect_to", raw.get("connect_to")),
174
179
  relationships=metadata.get("relationships", raw.get("relationships")),
175
180
  )
176
181
 
@@ -230,12 +235,39 @@ def parseResultZip(
230
235
  json.loads(hierarchy_text) if hierarchy_text else None
231
236
  )
232
237
 
238
+ # -- Optimized sidecar files --
239
+ chunks_slim_text: Optional[str] = _readZipText(zf, "chunks_slim.json")
240
+ parsed_chunks_slim: Any = json.loads(chunks_slim_text) if chunks_slim_text else None
241
+ if isinstance(parsed_chunks_slim, dict) and "chunks" in parsed_chunks_slim:
242
+ raw_chunks_slim: List[Dict[str, Any]] = parsed_chunks_slim["chunks"]
243
+ elif isinstance(parsed_chunks_slim, list):
244
+ raw_chunks_slim = parsed_chunks_slim
245
+ else:
246
+ raw_chunks_slim = []
247
+ chunks_slim: Optional[List[SlimChunk]] = (
248
+ [SlimChunk.model_validate(chunk) for chunk in raw_chunks_slim]
249
+ if chunks_slim_text is not None
250
+ else None
251
+ )
252
+
253
+ toc_hierarchies_text: Optional[str] = _readZipText(zf, "toc_hierarchies.json")
254
+ toc_hierarchies: Optional[Any] = (
255
+ json.loads(toc_hierarchies_text) if toc_hierarchies_text else None
256
+ )
257
+
258
+ kb_csv: Optional[str] = _readZipText(zf, "kb.csv")
259
+ hierarchy_view_html: Optional[str] = _readZipText(zf, "hierarchy_view.html")
260
+
233
261
  zf.close()
234
262
 
235
263
  return ParseResult(
236
264
  manifest=manifest,
237
265
  chunks=chunks,
266
+ chunks_slim=chunks_slim,
238
267
  full_markdown=full_markdown,
239
268
  hierarchy=hierarchy,
269
+ toc_hierarchies=toc_hierarchies,
270
+ kb_csv=kb_csv,
271
+ hierarchy_view_html=hierarchy_view_html,
240
272
  raw_zip=zip_bytes,
241
273
  )
@@ -13,6 +13,10 @@ from knowhere.types.result import (
13
13
  ImageFileInfo,
14
14
  Manifest,
15
15
  ParseResult,
16
+ ProcessingCost,
17
+ ProcessingMetadata,
18
+ ProcessingTiming,
19
+ SlimChunk,
16
20
  Statistics,
17
21
  TableChunk,
18
22
  TableFileInfo,
@@ -36,6 +40,10 @@ __all__: list[str] = [
36
40
  "ImageFileInfo",
37
41
  "Manifest",
38
42
  "ParseResult",
43
+ "ProcessingCost",
44
+ "ProcessingMetadata",
45
+ "ProcessingTiming",
46
+ "SlimChunk",
39
47
  "Statistics",
40
48
  "TableChunk",
41
49
  "TableFileInfo",
knowhere/types/result.py CHANGED
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import os
6
+ import json
6
7
  import re
7
8
  from pathlib import Path
8
9
  from typing import Any, Dict, List, Optional, Union
@@ -92,12 +93,39 @@ class FileIndex(BaseModel):
92
93
 
93
94
  chunks: Optional[str] = None
94
95
  markdown: Optional[str] = None
96
+ chunks_slim: Optional[str] = None
95
97
  kb_csv: Optional[str] = None
96
98
  hierarchy: Optional[str] = None
99
+ toc_hierarchies: Optional[str] = None
100
+ hierarchy_view_html: Optional[str] = None
97
101
  images: List[ImageFileInfo] = Field(default_factory=list)
98
102
  tables: List[TableFileInfo] = Field(default_factory=list)
99
103
 
100
104
 
105
+ class ProcessingCost(BaseModel):
106
+ """Billing details emitted by manifest v2."""
107
+
108
+ micro_dollars: Optional[int] = None
109
+ credits: Optional[float] = None
110
+
111
+
112
+ class ProcessingTiming(BaseModel):
113
+ """Timing details emitted by manifest v2."""
114
+
115
+ started_at: Optional[str] = None
116
+ completed_at: Optional[str] = None
117
+ duration_ms: Optional[int] = None
118
+
119
+
120
+ class ProcessingMetadata(BaseModel):
121
+ """Worker-side processing metadata emitted by manifest v2."""
122
+
123
+ page_count: Optional[int] = None
124
+ billing_status: Optional[str] = None
125
+ cost: Optional[ProcessingCost] = None
126
+ timing: Optional[ProcessingTiming] = None
127
+
128
+
101
129
  class Manifest(BaseModel):
102
130
  """Top-level manifest describing the result ZIP contents."""
103
131
 
@@ -106,6 +134,7 @@ class Manifest(BaseModel):
106
134
  data_id: Optional[str] = None
107
135
  source_file_name: Optional[str] = None
108
136
  processing_date: Optional[str] = None
137
+ processing: Optional[ProcessingMetadata] = None
109
138
  checksum: Optional[Checksum] = None
110
139
  statistics: Optional[Statistics] = None
111
140
  files: Optional[FileIndex] = None
@@ -123,6 +152,7 @@ class BaseChunk(BaseModel):
123
152
  type: str
124
153
  content: str = ""
125
154
  path: Optional[str] = None
155
+ page_nums: Optional[List[int]] = None
126
156
 
127
157
 
128
158
  TextChunkTokens: TypeAlias = List[str]
@@ -136,6 +166,7 @@ class TextChunk(BaseChunk):
136
166
  tokens: Optional[TextChunkTokens] = None
137
167
  keywords: Optional[List[str]] = None
138
168
  summary: Optional[str] = None
169
+ connect_to: Optional[List[Dict[str, Any]]] = None
139
170
  relationships: Optional[List[Union[Dict[str, Any], str]]] = None
140
171
 
141
172
 
@@ -210,6 +241,15 @@ class TableChunk(BaseChunk):
210
241
  Chunk = Union[TextChunk, ImageChunk, TableChunk]
211
242
 
212
243
 
244
+ class SlimChunk(BaseModel):
245
+ """Minimal chunk entry emitted in chunks_slim.json."""
246
+
247
+ type: str
248
+ path: Optional[str] = None
249
+ content: str = ""
250
+ summary: Optional[str] = None
251
+
252
+
213
253
  # ---------------------------------------------------------------------------
214
254
  # ParseResult — the top-level object returned to the user
215
255
  # ---------------------------------------------------------------------------
@@ -225,8 +265,12 @@ class ParseResult:
225
265
 
226
266
  manifest: Manifest
227
267
  chunks: List[Chunk]
268
+ chunks_slim: Optional[List[SlimChunk]]
228
269
  full_markdown: str
229
270
  hierarchy: Optional[Any]
271
+ toc_hierarchies: Optional[Any]
272
+ kb_csv: Optional[str]
273
+ hierarchy_view_html: Optional[str]
230
274
  raw_zip: bytes
231
275
 
232
276
  def __init__(
@@ -234,14 +278,22 @@ class ParseResult:
234
278
  *,
235
279
  manifest: Manifest,
236
280
  chunks: List[Chunk],
281
+ chunks_slim: Optional[List[SlimChunk]],
237
282
  full_markdown: str,
238
283
  hierarchy: Optional[Any],
284
+ toc_hierarchies: Optional[Any],
285
+ kb_csv: Optional[str],
286
+ hierarchy_view_html: Optional[str],
239
287
  raw_zip: bytes,
240
288
  ) -> None:
241
289
  self.manifest = manifest
242
290
  self.chunks = chunks
291
+ self.chunks_slim = chunks_slim
243
292
  self.full_markdown = full_markdown
244
293
  self.hierarchy = hierarchy
294
+ self.toc_hierarchies = toc_hierarchies
295
+ self.kb_csv = kb_csv
296
+ self.hierarchy_view_html = hierarchy_view_html
245
297
  self.raw_zip = raw_zip
246
298
 
247
299
  # -- convenience properties --
@@ -296,10 +348,58 @@ class ParseResult:
296
348
  dir_path: Path = Path(directory)
297
349
  dir_path.mkdir(parents=True, exist_ok=True)
298
350
 
351
+ # Manifest / chunks
352
+ manifest_path: Path = dir_path / "manifest.json"
353
+ manifest_path.write_text(
354
+ self.manifest.model_dump_json(indent=2),
355
+ encoding="utf-8",
356
+ )
357
+
358
+ chunks_path: Path = dir_path / "chunks.json"
359
+ chunks_path.write_text(
360
+ json.dumps([chunk.model_dump() for chunk in self.chunks], indent=2),
361
+ encoding="utf-8",
362
+ )
363
+
364
+ if self.chunks_slim is not None:
365
+ chunks_slim_path: Path = dir_path / "chunks_slim.json"
366
+ chunks_slim_path.write_text(
367
+ json.dumps(
368
+ {"chunks": [chunk.model_dump() for chunk in self.chunks_slim]},
369
+ indent=2,
370
+ ),
371
+ encoding="utf-8",
372
+ )
373
+
299
374
  # Full markdown
300
375
  md_path: Path = dir_path / "full.md"
301
376
  md_path.write_text(self.full_markdown, encoding="utf-8")
302
377
 
378
+ if self.hierarchy is not None:
379
+ hierarchy_path: Path = dir_path / "hierarchy.json"
380
+ hierarchy_path.write_text(
381
+ json.dumps(self.hierarchy, indent=2),
382
+ encoding="utf-8",
383
+ )
384
+
385
+ if self.toc_hierarchies is not None:
386
+ toc_hierarchies_path: Path = dir_path / "toc_hierarchies.json"
387
+ toc_hierarchies_path.write_text(
388
+ json.dumps(self.toc_hierarchies, indent=2),
389
+ encoding="utf-8",
390
+ )
391
+
392
+ if self.kb_csv is not None:
393
+ kb_csv_path: Path = dir_path / "kb.csv"
394
+ kb_csv_path.write_text(self.kb_csv, encoding="utf-8")
395
+
396
+ if self.hierarchy_view_html is not None:
397
+ hierarchy_view_path: Path = dir_path / "hierarchy_view.html"
398
+ hierarchy_view_path.write_text(
399
+ self.hierarchy_view_html,
400
+ encoding="utf-8",
401
+ )
402
+
303
403
  # Images
304
404
  if self.image_chunks:
305
405
  images_dir: Path = dir_path / "images"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: knowhere-python-sdk
3
- Version: 0.2.0
3
+ Version: 0.2.1
4
4
  Summary: Official Python SDK for the Knowhere document parsing API
5
5
  Project-URL: Homepage, https://knowhereto.ai
6
6
  Project-URL: Documentation, https://docs.knowhereto.ai
@@ -1,25 +1,25 @@
1
- knowhere/__init__.py,sha256=EuIpP3FtDeszonVAXMxZimjRd9iUcQ8wA53h1f27S3k,2343
1
+ knowhere/__init__.py,sha256=NFNOUllG-7TZ-NVx7_g1vUPv15zQp1lvAXjb0BQotB4,2513
2
2
  knowhere/_base_client.py,sha256=ddeRR1lWLhes5ipvYX6-TMEecjjiEBGfQdPw_vnSNqA,17978
3
3
  knowhere/_client.py,sha256=MGU1QsyjKrzTiitm891wgNCq6JLf3DR7y7zhkil_p2E,8027
4
4
  knowhere/_constants.py,sha256=ZNCFQC00NpUZIyc_XZ0uemjJE-E8uKAbv3BDa3po9cg,885
5
- knowhere/_exceptions.py,sha256=yg-4pK7AP6uUPxxyggxf8spQeXgFTpKRwELsHjCQycg,11489
5
+ knowhere/_exceptions.py,sha256=NflH7phh_bNFOJmQ758V4mZCAFQskpGXACMz2JIfFNU,11896
6
6
  knowhere/_logging.py,sha256=tNqEA1dLv-adTT6qRq5RBeO35FoWrnS3gwt7gKChLTA,1376
7
7
  knowhere/_response.py,sha256=EsrM794qxCykvl82UkszeqjJzm9_OSq7nsyzaSCnx0I,1415
8
8
  knowhere/_types.py,sha256=8-JFaRcxgBJbw2mV9BwnmCktFVph41a1mduwtXlYidI,1775
9
- knowhere/_version.py,sha256=piZV5NEcs0VIotCxwaWvzWE2ASUv5tox5ye8ogIRiIk,50
9
+ knowhere/_version.py,sha256=5IhDnbb-SxjydsfhOSqft_BBCgSQNKdMjw7ElLASiGo,50
10
10
  knowhere/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
11
  knowhere/lib/__init__.py,sha256=e953V5ny3VmDtCw7y_4uPwdTkwwNpe_Y6o4AEgz3ujw,50
12
12
  knowhere/lib/polling.py,sha256=s0EPHozAvNhXLqr5uwU8YXkkwAdF0ji_nIN0QfR6avY,4500
13
- knowhere/lib/result_parser.py,sha256=U-DK3SDKrbUY0g_-ad04bsbra1mhYy9FJ2opa1n2bTU,8406
13
+ knowhere/lib/result_parser.py,sha256=dR3knoMq-AFMAe0M3l0YgOM-OrtSmofSLaKZO0tgYao,9882
14
14
  knowhere/lib/upload.py,sha256=eT-O9_wB2WkWUAsUd7VzaKY6DVfNeA6WMHRdwm0HM0o,7849
15
15
  knowhere/resources/__init__.py,sha256=_x391t8qxwkGbOmbkzcp7rR10Q8uoDLQaAkZxCq_oM8,170
16
16
  knowhere/resources/_base.py,sha256=tgKphNTsgMhktWp6_rhyVOZyee4CYlDmD5O1_jWVvYo,1829
17
17
  knowhere/resources/jobs.py,sha256=45P4rZ9HMnTdgcso2AwQ6lDA9U80HGsgOU0jZLBIMFU,8460
18
- knowhere/types/__init__.py,sha256=OwTxpa9uo0GOEJ6Ds6rqEmXl86O49ByS6M7cscMwQo8,791
18
+ knowhere/types/__init__.py,sha256=2Qp2bIY7CyVieBdSfQnowyKG-ErMI3wF37-neBdwTBU,961
19
19
  knowhere/types/job.py,sha256=8shCqvgzKKkEPOpEHdk7CnDbPQiDzy3wEd5Jngw94ZM,2362
20
20
  knowhere/types/params.py,sha256=7DyBd4xMxtLPch-A1130-gI0ajKOv2G5tbSMkE8n6-E,543
21
- knowhere/types/result.py,sha256=Lmtaa0wQymBzAm6hXoZZr6dlfwf0WCMEda6Gd8nDIdw,9628
21
+ knowhere/types/result.py,sha256=UmoxaFmxt2bhrP-2O6jYL89C2WuwZh2xcyyHl46Q1_Y,12925
22
22
  knowhere/types/shared.py,sha256=K5ezX212othxgCviiE2WnwWFY2MS08pXKJ8Km1ZWmjc,104
23
- knowhere_python_sdk-0.2.0.dist-info/METADATA,sha256=10dnumfebnQ3VmPHmYuDexWTCdqdFLi-eAaF8FwcNpc,6115
24
- knowhere_python_sdk-0.2.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
25
- knowhere_python_sdk-0.2.0.dist-info/RECORD,,
23
+ knowhere_python_sdk-0.2.1.dist-info/METADATA,sha256=xf35vXtOtg7ubZWh4QNrqcjTpERpJO8kYuPXKmcmz_w,6115
24
+ knowhere_python_sdk-0.2.1.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
25
+ knowhere_python_sdk-0.2.1.dist-info/RECORD,,