kreuzberg 4.0.6__cp310-abi3-macosx_14_0_arm64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of kreuzberg might be problematic. Click here for more details.

kreuzberg/types.py ADDED
@@ -0,0 +1,509 @@
1
+ """Type definitions for Kreuzberg extraction results.
2
+
3
+ These TypedDicts mirror the strongly-typed Rust metadata structures,
4
+ providing type hints for Python users while the actual data comes from
5
+ the Rust core via PyO3 bindings.
6
+ """
7
+
8
+ # ruff: noqa: A005
9
+ from __future__ import annotations
10
+
11
+ from typing import Any, Literal, TypedDict
12
+
13
+
class ExcelMetadata(TypedDict, total=False):
    """Excel/spreadsheet metadata.

    Declared with ``total=False``, so every key is optional.
    """

    sheet_count: int
    sheet_names: list[str]
19
+
20
+
21
+ class EmailMetadata(TypedDict, total=False):
22
+ """Email metadata."""
23
+
24
+ from_email: str | None
25
+ from_name: str | None
26
+ to_emails: list[str]
27
+ cc_emails: list[str]
28
+ bcc_emails: list[str]
29
+ message_id: str | None
30
+ attachments: list[str]
31
+
32
+
33
+ class ArchiveMetadata(TypedDict, total=False):
34
+ """Archive (ZIP/TAR/7Z) metadata."""
35
+
36
+ format: str
37
+ file_count: int
38
+ file_list: list[str]
39
+ total_size: int
40
+ compressed_size: int | None
41
+
42
+
class ImageMetadata(TypedDict, total=False):
    """Image metadata.

    Declared with ``total=False``, so every key is optional.
    """

    width: int
    height: int
    format: str
    exif: dict[str, str]
50
+
51
+
class XmlMetadata(TypedDict, total=False):
    """XML metadata.

    Declared with ``total=False``, so every key is optional.
    """

    element_count: int
    unique_elements: list[str]
57
+
58
+
59
+ class TextMetadata(TypedDict, total=False):
60
+ """Text/Markdown metadata."""
61
+
62
+ line_count: int
63
+ word_count: int
64
+ character_count: int
65
+ headers: list[str] | None
66
+ links: list[tuple[str, str]] | None
67
+ code_blocks: list[tuple[str, str]] | None
68
+
69
+
70
+ class PdfMetadata(TypedDict, total=False):
71
+ """PDF metadata."""
72
+
73
+ title: str | None
74
+ author: str | None
75
+ subject: str | None
76
+ keywords: str | None
77
+ creator: str | None
78
+ producer: str | None
79
+ creation_date: str | None
80
+ modification_date: str | None
81
+ page_count: int
82
+
83
+
84
+ class HeaderMetadata(TypedDict):
85
+ """HTML header/heading metadata."""
86
+
87
+ level: int
88
+ text: str
89
+ id: str | None
90
+ depth: int
91
+ html_offset: int
92
+
93
+
94
+ class LinkMetadata(TypedDict):
95
+ """HTML link metadata."""
96
+
97
+ href: str
98
+ text: str
99
+ title: str | None
100
+ link_type: Literal["anchor", "internal", "external", "email", "phone", "other"]
101
+ rel: list[str]
102
+ attributes: dict[str, str]
103
+
104
+
105
+ class HtmlImageMetadata(TypedDict):
106
+ """HTML image metadata."""
107
+
108
+ src: str
109
+ alt: str | None
110
+ title: str | None
111
+ dimensions: tuple[int, int] | None
112
+ image_type: Literal["data_uri", "inline_svg", "external", "relative"]
113
+ attributes: dict[str, str]
114
+
115
+
116
+ class StructuredData(TypedDict):
117
+ """Structured data (JSON-LD, microdata, RDFa) metadata."""
118
+
119
+ data_type: Literal["json_ld", "microdata", "rdfa"]
120
+ raw_json: str
121
+ schema_type: str | None
122
+
123
+
class HtmlMetadata(TypedDict, total=False):
    """HTML metadata.

    Declared with ``total=False``, so every key is optional. The nested
    ``headers``, ``links``, ``images`` and ``structured_data`` entries use the
    dedicated HTML TypedDicts defined in this module.
    """

    title: str | None
    description: str | None
    keywords: list[str]
    author: str | None
    canonical_url: str | None
    base_href: str | None
    language: str | None
    text_direction: Literal["ltr", "rtl", "auto"] | None
    open_graph: dict[str, str]
    twitter_card: dict[str, str]
    meta_tags: dict[str, str]
    headers: list[HeaderMetadata]
    links: list[LinkMetadata]
    images: list[HtmlImageMetadata]
    structured_data: list[StructuredData]
142
+
143
+
144
+ class PptxMetadata(TypedDict, total=False):
145
+ """PowerPoint metadata."""
146
+
147
+ title: str | None
148
+ author: str | None
149
+ description: str | None
150
+ summary: str | None
151
+ fonts: list[str]
152
+
153
+
154
+ class OcrMetadata(TypedDict, total=False):
155
+ """OCR processing metadata."""
156
+
157
+ language: str
158
+ psm: int
159
+ output_format: str
160
+ table_count: int
161
+ table_rows: int | None
162
+ table_cols: int | None
163
+
164
+
165
+ class ImagePreprocessingMetadata(TypedDict, total=False):
166
+ """Image preprocessing metadata."""
167
+
168
+ original_dimensions: tuple[int, int]
169
+ original_dpi: tuple[float, float]
170
+ target_dpi: int
171
+ scale_factor: float
172
+ auto_adjusted: bool
173
+ final_dpi: int
174
+ new_dimensions: tuple[int, int] | None
175
+ resample_method: str
176
+ dimension_clamped: bool
177
+ calculated_dpi: int | None
178
+ skipped_resize: bool
179
+ resize_error: str | None
180
+
181
+
class ErrorMetadata(TypedDict, total=False):
    """Error metadata for batch operations.

    Declared with ``total=False``, so every key is optional.
    """

    error_type: str
    message: str
187
+
188
+
189
+ class ChunkMetadata(TypedDict):
190
+ """Chunk metadata describing offsets within the original document."""
191
+
192
+ byte_start: int
193
+ byte_end: int
194
+ token_count: int | None
195
+ chunk_index: int
196
+ total_chunks: int
197
+ first_page: int | None
198
+ last_page: int | None
199
+
200
+
class PageBoundary(TypedDict):
    """Page boundaries in the document content.

    A total ``TypedDict``: every key is required.
    """

    byte_start: int
    byte_end: int
    page_number: int
207
+
208
+
class PageConfig(TypedDict, total=False):
    """Page extraction configuration.

    Declared with ``total=False``, so every key is optional.
    """

    extract_pages: bool
    insert_page_markers: bool
    marker_format: str
215
+
216
+
217
+ class PageInfo(TypedDict, total=False):
218
+ """Metadata for an individual page/slide/sheet.
219
+
220
+ Captures per-page information including dimensions, content counts,
221
+ and visibility state (for presentations).
222
+ """
223
+
224
+ number: int
225
+ title: str | None
226
+ dimensions: tuple[float, float] | None
227
+ image_count: int | None
228
+ table_count: int | None
229
+ hidden: bool | None
230
+
231
+
PageUnitType = Literal["page", "slide", "sheet"]
"""Type of paginated unit in a document.

Distinguishes between different types of "pages":
    - "page": Standard document pages (PDF, DOCX, images)
    - "slide": Presentation slides (PPTX, ODP)
    - "sheet": Spreadsheet sheets (XLSX, ODS)
"""
240
+
241
+
class PageStructure(TypedDict, total=False):
    """Page structure metadata.

    Contains information about pages/slides/sheets in a document, including
    boundaries for mapping chunks to pages and detailed per-page metadata.

    Declared with ``total=False``, so every key is optional; ``boundaries``
    and ``pages`` may additionally be ``None`` when present.
    """

    total_count: int
    unit_type: PageUnitType
    boundaries: list[PageBoundary] | None
    pages: list[PageInfo] | None
253
+
254
+
class PageContent(TypedDict):
    """Content for a single page/slide.

    When page extraction is enabled, documents are split into per-page content
    with associated tables and images mapped to each page.

    A total ``TypedDict``: every key is required.
    """

    page_number: int
    content: str
    tables: list[Table]
    images: list[ExtractedImage]
266
+
267
+
class Chunk(TypedDict, total=False):
    """Text chunk with optional embedding vector.

    Declared with ``total=False``, so every key is optional; ``embedding``
    may additionally be ``None`` when present.
    """

    content: str
    embedding: list[float] | None
    metadata: ChunkMetadata
274
+
275
+
class ExtractedImage(TypedDict, total=False):
    """Image artifact extracted from a document page.

    Declared with ``total=False``, so every key is optional. ``ocr_result``
    nests a full ``ExtractionResult`` (or ``None``), which makes this type
    mutually recursive with ``ExtractionResult``.
    """

    data: bytes
    format: str
    image_index: int
    page_number: int | None
    width: int | None
    height: int | None
    colorspace: str | None
    bits_per_component: int | None
    is_mask: bool
    description: str | None
    ocr_result: ExtractionResult | None
290
+
291
+
class Metadata(TypedDict, total=False):
    """Strongly-typed metadata for extraction results.

    This TypedDict mirrors the Rust Metadata struct, providing type hints
    for the most common metadata fields. The actual data comes from the
    Rust core and may include additional custom fields from postprocessors.

    All fields are optional (total=False) since they depend on:
        - File format being extracted
        - Feature flags (e.g., PDF support)
        - Postprocessors enabled
        - Extraction configuration

    Format-specific fields are flattened at the root level. Use the format_type
    discriminator to determine which fields are present.

    Common fields:
        language: Document language (ISO 639-1 code)
        date: Document date (ISO 8601 format)
        subject: Document subject

    Discriminator:
        format_type: Format discriminator ("pdf", "excel", "email", etc.)

    Format-specific fields (flattened at root level):
        PDF fields (when format_type == "pdf"):
            title, authors, keywords, created_at, modified_at, created_by,
            producer, page_count, pdf_version, is_encrypted, width, height, summary

        Excel fields (when format_type == "excel"):
            sheet_count, sheet_names

        Email fields (when format_type == "email"):
            from_email, from_name, to_emails, cc_emails, bcc_emails,
            message_id, attachments

        PowerPoint fields (when format_type == "pptx"):
            author, description, fonts

        Archive fields (when format_type == "archive"):
            format, file_count, file_list, total_size, compressed_size

        Image fields (when format_type == "image"):
            exif

        XML fields (when format_type == "xml"):
            element_count, unique_elements

        Text/Markdown fields (when format_type == "text"):
            line_count, word_count, character_count, headers, links, code_blocks

        HTML fields (when format_type == "html"):
            canonical_url, base_href, language, text_direction, open_graph,
            twitter_card, meta_tags, html_headers, html_links, html_images,
            structured_data

        OCR fields (when format_type == "ocr"):
            psm, output_format, table_count, table_rows, table_cols

    Processing metadata:
        image_preprocessing: Image preprocessing metadata dict

    Structured data:
        json_schema: JSON schema dict for structured extraction

    Error handling:
        error: Error metadata dict for batch operations

    Custom fields:
        Any additional fields added by Python postprocessors (entity extraction,
        keyword extraction, etc.) will appear as top-level keys in the dict.

    Example:
        >>> result = extract_file("document.xml")
        >>> metadata: Metadata = result["metadata"]
        >>> if metadata.get("format_type") == "xml":
        ...     element_count = metadata["element_count"]
        ...     print(f"Elements: {element_count}")
        >>> if "entities" in metadata:  # Custom field from postprocessor
        ...     entities = metadata["entities"]
    """

    # Common fields.
    date: str
    subject: str

    # Discriminator selecting which format-specific group below applies.
    format_type: Literal["pdf", "excel", "email", "pptx", "archive", "image", "xml", "text", "html", "ocr"]

    # PDF fields.
    title: str
    authors: list[str]
    keywords: list[str]
    created_at: str
    modified_at: str
    created_by: str
    producer: str
    page_count: int
    pdf_version: str
    is_encrypted: bool
    width: int
    height: int
    summary: str

    # Excel fields.
    sheet_count: int
    sheet_names: list[str]

    # Email fields.
    from_email: str
    from_name: str
    to_emails: list[str]
    cc_emails: list[str]
    bcc_emails: list[str]
    message_id: str
    attachments: list[str]

    # PowerPoint fields.
    author: str
    description: str
    fonts: list[str]

    # Archive fields.
    format: str
    file_count: int
    file_list: list[str]
    total_size: int
    compressed_size: int

    # Image fields.
    exif: dict[str, str]

    # XML fields.
    element_count: int
    unique_elements: list[str]

    # Text/Markdown fields.
    line_count: int
    word_count: int
    character_count: int
    headers: list[str]
    links: list[tuple[str, str]]
    code_blocks: list[tuple[str, str]]

    # HTML fields. The nested entries use the HTML-specific TypedDicts so they
    # agree with HtmlMetadata; in particular html_images holds HtmlImageMetadata
    # (src/alt/...), not the raster ImageMetadata (width/height/format/exif).
    canonical_url: str
    base_href: str
    language: str
    text_direction: Literal["ltr", "rtl", "auto"]
    open_graph: dict[str, str]
    twitter_card: dict[str, str]
    meta_tags: dict[str, str]
    html_headers: list[HeaderMetadata]
    html_links: list[LinkMetadata]
    html_images: list[HtmlImageMetadata]
    structured_data: list[StructuredData]

    # OCR fields.
    psm: int
    output_format: str
    table_count: int
    table_rows: int
    table_cols: int

    # Processing, structured-extraction and batch-error metadata.
    image_preprocessing: ImagePreprocessingMetadata
    json_schema: dict[str, Any]
    error: ErrorMetadata
447
+
448
+
class Table(TypedDict):
    """Extracted table structure.

    A total ``TypedDict``: every key is required. ``cells`` is a list of
    lists of cell strings and ``markdown`` is a string rendering of the table.
    """

    cells: list[list[str]]
    markdown: str
    page_number: int
455
+
456
+
class ExtractionResult(TypedDict):
    """Extraction result returned by all extraction functions.

    A total ``TypedDict``: every key is required, but the last four may be
    ``None``.

    Attributes:
        content: Extracted text content
        mime_type: MIME type of the processed document
        metadata: Strongly-typed metadata (see Metadata TypedDict)
        tables: List of extracted tables
        detected_languages: List of detected language codes (ISO 639-1)
        chunks: Optional list of text chunks with embeddings and metadata
        images: Optional list of extracted images (with nested OCR results)
        pages: Optional list of per-page content when page extraction is enabled
    """

    content: str
    mime_type: str
    metadata: Metadata
    tables: list[Table]
    detected_languages: list[str] | None
    chunks: list[Chunk] | None
    images: list[ExtractedImage] | None
    pages: list[PageContent] | None
479
+
480
+
# Public API of this module, kept in alphabetical order.
# HtmlImageMetadata is included because it is a public class defined in this
# module and referenced by HtmlMetadata.images.
__all__ = [
    "ArchiveMetadata",
    "Chunk",
    "ChunkMetadata",
    "EmailMetadata",
    "ErrorMetadata",
    "ExcelMetadata",
    "ExtractedImage",
    "ExtractionResult",
    "HeaderMetadata",
    "HtmlImageMetadata",
    "HtmlMetadata",
    "ImageMetadata",
    "ImagePreprocessingMetadata",
    "LinkMetadata",
    "Metadata",
    "OcrMetadata",
    "PageBoundary",
    "PageConfig",
    "PageContent",
    "PageInfo",
    "PageStructure",
    "PageUnitType",
    "PdfMetadata",
    "PptxMetadata",
    "StructuredData",
    "Table",
    "TextMetadata",
    "XmlMetadata",
]