kreuzberg 4.0.6__cp310-abi3-macosx_14_0_arm64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of kreuzberg might be problematic. Click here for more details.
- kreuzberg/__init__.py +931 -0
- kreuzberg/__main__.py +160 -0
- kreuzberg/_internal_bindings.abi3.so +0 -0
- kreuzberg/_setup_lib_path.py +143 -0
- kreuzberg/exceptions.py +254 -0
- kreuzberg/ocr/__init__.py +25 -0
- kreuzberg/ocr/easyocr.py +371 -0
- kreuzberg/ocr/paddleocr.py +284 -0
- kreuzberg/ocr/protocol.py +150 -0
- kreuzberg/postprocessors/__init__.py +61 -0
- kreuzberg/postprocessors/protocol.py +83 -0
- kreuzberg/py.typed +0 -0
- kreuzberg/types.py +509 -0
- kreuzberg-4.0.6.dist-info/METADATA +470 -0
- kreuzberg-4.0.6.dist-info/RECORD +17 -0
- kreuzberg-4.0.6.dist-info/WHEEL +4 -0
- kreuzberg-4.0.6.dist-info/entry_points.txt +2 -0
kreuzberg/types.py
ADDED
|
@@ -0,0 +1,509 @@
|
|
|
1
|
+
"""Type definitions for Kreuzberg extraction results.
|
|
2
|
+
|
|
3
|
+
These TypedDicts mirror the strongly-typed Rust metadata structures,
|
|
4
|
+
providing type hints for Python users while the actual data comes from
|
|
5
|
+
the Rust core via PyO3 bindings.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
# ruff: noqa: A005
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Any, Literal, TypedDict
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class ExcelMetadata(TypedDict, total=False):
    """Metadata keys for Excel/spreadsheet documents.

    Declared with ``total=False``: every key may be absent.
    """

    # Sheet summary of the workbook.
    sheet_count: int
    sheet_names: list[str]
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class EmailMetadata(TypedDict, total=False):
    """Metadata keys for email messages.

    Declared with ``total=False``: every key may be absent.
    """

    # Sender; both values may be None.
    from_email: str | None
    from_name: str | None

    # Recipient address lists.
    to_emails: list[str]
    cc_emails: list[str]
    bcc_emails: list[str]

    # Message identifier (may be None) and attachment entries.
    message_id: str | None
    attachments: list[str]
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class ArchiveMetadata(TypedDict, total=False):
    """Metadata keys for archives (ZIP/TAR/7Z).

    Declared with ``total=False``: every key may be absent.
    """

    # Archive format name.
    format: str

    # Contained files.
    file_count: int
    file_list: list[str]

    # Size figures; the compressed size may be None.
    total_size: int
    compressed_size: int | None
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class ImageMetadata(TypedDict, total=False):
    """Metadata keys for image documents.

    Declared with ``total=False``: every key may be absent.
    """

    # Image dimensions.
    width: int
    height: int

    # Image format name.
    format: str

    # EXIF entries as a string-to-string mapping.
    exif: dict[str, str]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class XmlMetadata(TypedDict, total=False):
    """Metadata keys for XML documents.

    Declared with ``total=False``: every key may be absent.
    """

    # Element statistics.
    element_count: int
    unique_elements: list[str]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class TextMetadata(TypedDict, total=False):
    """Metadata keys for plain-text and Markdown documents.

    Declared with ``total=False``: every key may be absent.
    """

    # Size counters.
    line_count: int
    word_count: int
    character_count: int

    # Structural listings; each may be None.
    # NOTE(review): the meaning of the two strings in each ``links`` /
    # ``code_blocks`` tuple is not defined in this module -- confirm against
    # the producer before relying on their order.
    headers: list[str] | None
    links: list[tuple[str, str]] | None
    code_blocks: list[tuple[str, str]] | None
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class PdfMetadata(TypedDict, total=False):
    """Metadata keys for PDF documents.

    Declared with ``total=False``: every key may be absent. All string-valued
    keys may additionally be None.
    """

    # Descriptive properties.
    title: str | None
    author: str | None
    subject: str | None
    keywords: str | None

    # Originating software.
    creator: str | None
    producer: str | None

    # Dates, carried as strings.
    creation_date: str | None
    modification_date: str | None

    # Number of pages.
    page_count: int
|
|
82
|
+
|
|
83
|
+
|
|
84
|
+
class HeaderMetadata(TypedDict):
    """Description of one HTML header/heading.

    A total TypedDict: every key is required.
    """

    # Heading level and its text.
    level: int
    text: str

    # Identifier of the heading; may be None.
    id: str | None

    # Depth and offset values reported for the heading.
    depth: int
    html_offset: int
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class LinkMetadata(TypedDict):
    """Description of one HTML link.

    A total TypedDict: every key is required.
    """

    # Link target and its text.
    href: str
    text: str

    # Title of the link; may be None.
    title: str | None

    # Link classification, restricted to the literal values below.
    link_type: Literal["anchor", "internal", "external", "email", "phone", "other"]

    # ``rel`` values and remaining attributes.
    rel: list[str]
    attributes: dict[str, str]
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
class HtmlImageMetadata(TypedDict):
    """Description of one image referenced in an HTML document.

    A total TypedDict: every key is required.
    """

    # Image source.
    src: str

    # Alternative text and title; either may be None.
    alt: str | None
    title: str | None

    # Pair of integers, or None.
    # NOTE(review): presumably (width, height) -- confirm against the producer.
    dimensions: tuple[int, int] | None

    # Image classification, restricted to the literal values below.
    image_type: Literal["data_uri", "inline_svg", "external", "relative"]

    # Remaining attributes as a string-to-string mapping.
    attributes: dict[str, str]
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class StructuredData(TypedDict):
    """One structured-data item (JSON-LD, microdata, or RDFa).

    A total TypedDict: every key is required.
    """

    # Which structured-data flavour this item is.
    data_type: Literal["json_ld", "microdata", "rdfa"]

    # The item carried as a JSON string.
    raw_json: str

    # Schema type name; may be None.
    schema_type: str | None
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
class HtmlMetadata(TypedDict, total=False):
    """Metadata keys for HTML documents.

    Declared with ``total=False``: every key may be absent.
    """

    # Document-level descriptive values.
    title: str | None
    description: str | None
    keywords: list[str]
    author: str | None

    # URL-related values.
    canonical_url: str | None
    base_href: str | None

    # Language and text direction.
    language: str | None
    text_direction: Literal["ltr", "rtl", "auto"] | None

    # Tag collections as string-to-string mappings.
    open_graph: dict[str, str]
    twitter_card: dict[str, str]
    meta_tags: dict[str, str]

    # Structured listings of the document's parts.
    headers: list[HeaderMetadata]
    links: list[LinkMetadata]
    images: list[HtmlImageMetadata]
    structured_data: list[StructuredData]
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class PptxMetadata(TypedDict, total=False):
    """Metadata keys for PowerPoint documents.

    Declared with ``total=False``: every key may be absent.
    """

    # Descriptive values; each may be None.
    title: str | None
    author: str | None
    description: str | None
    summary: str | None

    # Font names.
    fonts: list[str]
|
|
152
|
+
|
|
153
|
+
|
|
154
|
+
class OcrMetadata(TypedDict, total=False):
    """Metadata keys describing an OCR processing run.

    Declared with ``total=False``: every key may be absent.
    """

    # OCR settings.
    language: str
    psm: int
    output_format: str

    # Table statistics; row and column counts may be None.
    table_count: int
    table_rows: int | None
    table_cols: int | None
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
class ImagePreprocessingMetadata(TypedDict, total=False):
    """Metadata keys describing image preprocessing.

    Declared with ``total=False``: every key may be absent.
    """

    # Properties of the input image.
    original_dimensions: tuple[int, int]
    original_dpi: tuple[float, float]

    # DPI and scaling values.
    target_dpi: int
    scale_factor: float
    auto_adjusted: bool
    final_dpi: int

    # Resize details; the new dimensions may be None.
    new_dimensions: tuple[int, int] | None
    resample_method: str
    dimension_clamped: bool
    calculated_dpi: int | None

    # Resize status; the error text may be None.
    skipped_resize: bool
    resize_error: str | None
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
class ErrorMetadata(TypedDict, total=False):
    """Error details recorded for batch operations.

    Declared with ``total=False``: every key may be absent.
    """

    # Error category and its message text.
    error_type: str
    message: str
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
class ChunkMetadata(TypedDict):
    """Offsets and position of a chunk within the original document.

    A total TypedDict: every key is required.
    """

    # Byte offsets of the chunk.
    byte_start: int
    byte_end: int

    # Token count; may be None.
    token_count: int | None

    # Position of this chunk among all chunks.
    chunk_index: int
    total_chunks: int

    # Page span of the chunk; either value may be None.
    first_page: int | None
    last_page: int | None
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
class PageBoundary(TypedDict):
    """Boundaries of one page within the document content.

    A total TypedDict: every key is required.
    """

    # Byte offsets of the page.
    byte_start: int
    byte_end: int

    # Number of the page these offsets belong to.
    page_number: int
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
class PageConfig(TypedDict, total=False):
    """Configuration keys for page extraction.

    Declared with ``total=False``: every key may be absent.
    """

    # Feature switches.
    extract_pages: bool
    insert_page_markers: bool

    # Format string for page markers.
    marker_format: str
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
class PageInfo(TypedDict, total=False):
    """Per-unit metadata for a single page, slide, or sheet.

    Holds dimensions, content counts, and the visibility state (for
    presentations). Declared with ``total=False``: every key may be absent.
    """

    # Number of the page/slide/sheet.
    number: int

    # Title; may be None.
    title: str | None

    # Pair of floats, or None.
    # NOTE(review): presumably (width, height) -- confirm against the producer.
    dimensions: tuple[float, float] | None

    # Content counts; each may be None.
    image_count: int | None
    table_count: int | None

    # Visibility state; may be None.
    hidden: bool | None
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
PageUnitType = Literal["page", "slide", "sheet"]
"""Kind of paginated unit a document is divided into.

The three values separate the different notions of a "page":

- "page": Standard document pages (PDF, DOCX, images)
- "slide": Presentation slides (PPTX, ODP)
- "sheet": Spreadsheet sheets (XLSX, ODS)
"""
|
|
240
|
+
|
|
241
|
+
|
|
242
|
+
class PageStructure(TypedDict, total=False):
    """Page structure of a document.

    Describes the pages/slides/sheets of a document: the boundaries used to
    map chunks to pages, plus detailed per-page metadata. Declared with
    ``total=False``: every key may be absent.
    """

    # Number of units and which kind of unit they are.
    total_count: int
    unit_type: PageUnitType

    # Boundaries and per-page details; each list may be None.
    boundaries: list[PageBoundary] | None
    pages: list[PageInfo] | None
|
|
253
|
+
|
|
254
|
+
|
|
255
|
+
class PageContent(TypedDict):
    """Extracted content of one page/slide.

    With page extraction enabled, a document is split into per-page content,
    and the tables and images belonging to each page are attached to it.
    A total TypedDict: every key is required.
    """

    # Which page this is, and its text.
    page_number: int
    content: str

    # Tables and images mapped to this page.
    tables: list[Table]
    images: list[ExtractedImage]
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
class Chunk(TypedDict, total=False):
    """A chunk of text, optionally carrying an embedding vector.

    Declared with ``total=False``: every key may be absent.
    """

    # Text of the chunk.
    content: str

    # Embedding vector; may be None.
    embedding: list[float] | None

    # Offsets and position of the chunk.
    metadata: ChunkMetadata
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
class ExtractedImage(TypedDict, total=False):
    """An image artifact extracted from a document page.

    Declared with ``total=False``: every key may be absent.
    """

    # Raw image bytes and their format name.
    data: bytes
    format: str

    # Index of the image and the page it came from (page may be None).
    image_index: int
    page_number: int | None

    # Pixel properties; each may be None.
    width: int | None
    height: int | None
    colorspace: str | None
    bits_per_component: int | None

    # Mask flag.
    is_mask: bool

    # Description text; may be None.
    description: str | None

    # Nested extraction result from OCR on this image; may be None.
    ocr_result: ExtractionResult | None
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
class Metadata(TypedDict, total=False):
    """Strongly-typed metadata for extraction results.

    This TypedDict mirrors the Rust Metadata struct, providing type hints
    for the most common metadata fields. The actual data comes from the
    Rust core and may include additional custom fields from postprocessors.

    All fields are optional (total=False) since they depend on:
        - File format being extracted
        - Feature flags (e.g., PDF support)
        - Postprocessors enabled
        - Extraction configuration

    Format-specific fields are flattened at the root level. Use the format_type
    discriminator to determine which fields are present.

    Common fields:
        language: Document language (ISO 639-1 code)
        date: Document date (ISO 8601 format)
        subject: Document subject

    Discriminator:
        format_type: Format discriminator ("pdf", "excel", "email", etc.)

    Format-specific fields (flattened at root level):
        PDF fields (when format_type == "pdf"):
            title, authors, keywords, created_at, modified_at, created_by,
            producer, page_count, pdf_version, is_encrypted, width, height, summary

        Excel fields (when format_type == "excel"):
            sheet_count, sheet_names

        Email fields (when format_type == "email"):
            from_email, from_name, to_emails, cc_emails, bcc_emails,
            message_id, attachments

        PowerPoint fields (when format_type == "pptx"):
            author, description, fonts

        Archive fields (when format_type == "archive"):
            format, file_count, file_list, total_size, compressed_size

        Image fields (when format_type == "image"):
            exif

        XML fields (when format_type == "xml"):
            element_count, unique_elements

        Text/Markdown fields (when format_type == "text"):
            line_count, word_count, character_count, headers, links, code_blocks

        HTML fields (when format_type == "html"):
            canonical_url, base_href, language, text_direction, open_graph,
            twitter_card, meta_tags, html_headers, html_links, html_images,
            structured_data

        OCR fields (when format_type == "ocr"):
            psm, output_format, table_count, table_rows, table_cols

    Processing metadata:
        image_preprocessing: Image preprocessing metadata dict

    Structured data:
        json_schema: JSON schema dict for structured extraction

    Error handling:
        error: Error metadata dict for batch operations

    Custom fields:
        Any additional fields added by Python postprocessors (entity extraction,
        keyword extraction, etc.) will appear as top-level keys in the dict.

    Example:
        >>> result = extract_file("document.xml")
        >>> metadata: Metadata = result["metadata"]
        >>> if metadata.get("format_type") == "xml":
        ...     element_count = metadata["element_count"]
        ...     print(f"Elements: {element_count}")
        >>> if "entities" in metadata:  # Custom field from postprocessor
        ...     entities = metadata["entities"]
    """

    # Common fields.
    date: str
    subject: str

    # Discriminator telling which format-specific keys to expect.
    format_type: Literal["pdf", "excel", "email", "pptx", "archive", "image", "xml", "text", "html", "ocr"]

    # PDF fields.
    title: str
    authors: list[str]
    keywords: list[str]
    created_at: str
    modified_at: str
    created_by: str
    producer: str
    page_count: int
    pdf_version: str
    is_encrypted: bool
    width: int
    height: int
    summary: str

    # Excel fields.
    sheet_count: int
    sheet_names: list[str]

    # Email fields.
    from_email: str
    from_name: str
    to_emails: list[str]
    cc_emails: list[str]
    bcc_emails: list[str]
    message_id: str
    attachments: list[str]

    # PowerPoint fields.
    author: str
    description: str
    fonts: list[str]

    # Archive fields.
    format: str
    file_count: int
    file_list: list[str]
    total_size: int
    compressed_size: int

    # Image fields.
    exif: dict[str, str]

    # XML fields.
    element_count: int
    unique_elements: list[str]

    # Text/Markdown fields.
    line_count: int
    word_count: int
    character_count: int
    headers: list[str]
    links: list[tuple[str, str]]
    code_blocks: list[tuple[str, str]]

    # HTML fields.
    canonical_url: str
    base_href: str
    language: str
    text_direction: str
    open_graph: dict[str, str]
    twitter_card: dict[str, str]
    meta_tags: dict[str, str]
    html_headers: list[HeaderMetadata]
    html_links: list[LinkMetadata]
    # Per-image HTML entries use HtmlImageMetadata, the same element type as
    # HtmlMetadata.images -- not ImageMetadata, which describes an image file
    # (width/height/format/exif).
    html_images: list[HtmlImageMetadata]
    structured_data: list[StructuredData]

    # OCR fields.
    psm: int
    output_format: str
    table_count: int
    table_rows: int
    table_cols: int

    # Processing metadata, structured-extraction schema, and batch error info.
    image_preprocessing: ImagePreprocessingMetadata
    json_schema: dict[str, Any]
    error: ErrorMetadata
|
|
447
|
+
|
|
448
|
+
|
|
449
|
+
class Table(TypedDict):
    """Structure of one extracted table.

    A total TypedDict: every key is required.
    """

    # Cell text as a list of lists of strings.
    cells: list[list[str]]

    # The table rendered as Markdown.
    markdown: str

    # Page number associated with the table.
    page_number: int
|
|
455
|
+
|
|
456
|
+
|
|
457
|
+
class ExtractionResult(TypedDict):
    """Result returned by every extraction function.

    A total TypedDict: every key is required, though several values may be
    None.

    Attributes:
        content: Extracted text content
        mime_type: MIME type of the processed document
        metadata: Strongly-typed metadata (see Metadata TypedDict)
        tables: List of extracted tables
        detected_languages: List of detected language codes (ISO 639-1)
        chunks: Optional list of text chunks with embeddings and metadata
        images: Optional list of extracted images (with nested OCR results)
        pages: Optional list of per-page content when page extraction is enabled
    """

    # Core payload.
    content: str
    mime_type: str
    metadata: Metadata
    tables: list[Table]

    # Optional-valued parts; each may be None.
    detected_languages: list[str] | None
    chunks: list[Chunk] | None
    images: list[ExtractedImage] | None
    pages: list[PageContent] | None
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
# Public names of this module, kept in alphabetical order.
# HtmlImageMetadata is listed because the exported HtmlMetadata refers to it
# in its ``images`` field.
__all__ = [
    "ArchiveMetadata",
    "Chunk",
    "ChunkMetadata",
    "EmailMetadata",
    "ErrorMetadata",
    "ExcelMetadata",
    "ExtractedImage",
    "ExtractionResult",
    "HeaderMetadata",
    "HtmlImageMetadata",
    "HtmlMetadata",
    "ImageMetadata",
    "ImagePreprocessingMetadata",
    "LinkMetadata",
    "Metadata",
    "OcrMetadata",
    "PageBoundary",
    "PageConfig",
    "PageContent",
    "PageInfo",
    "PageStructure",
    "PageUnitType",
    "PdfMetadata",
    "PptxMetadata",
    "StructuredData",
    "Table",
    "TextMetadata",
    "XmlMetadata",
]
|