kreuzberg 2.1.2__py3-none-any.whl → 3.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/extraction.py CHANGED
@@ -1,198 +1,140 @@
1
- """This module provides functions to extract textual content from files.
2
-
3
- It includes vendored code:
4
-
5
- - The extract PPTX logic is based on code vendored from `markitdown` to extract text from PPTX files.
6
- See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
7
- Refer to the markitdown repository for it's license (MIT).
8
- """
9
-
10
1
  from __future__ import annotations
11
2
 
12
- from functools import partial
13
- from io import BytesIO
14
3
  from pathlib import Path
15
- from typing import TYPE_CHECKING, cast
4
+ from typing import TYPE_CHECKING, Final, cast
16
5
 
17
6
  import anyio
18
- from anyio import Path as AsyncPath
19
- from PIL.Image import open as open_image
20
7
 
21
8
  from kreuzberg import ExtractionResult
22
- from kreuzberg._constants import DEFAULT_MAX_PROCESSES
23
- from kreuzberg._html import extract_html_string
9
+ from kreuzberg._chunker import get_chunker
24
10
  from kreuzberg._mime_types import (
25
- EXCEL_MIME_TYPE,
26
- HTML_MIME_TYPE,
27
- IMAGE_MIME_TYPES,
28
- PANDOC_SUPPORTED_MIME_TYPES,
29
- PDF_MIME_TYPE,
30
- POWER_POINT_MIME_TYPE,
31
- SUPPORTED_MIME_TYPES,
32
11
  validate_mime_type,
33
12
  )
34
- from kreuzberg._pandoc import process_content_with_pandoc, process_file_with_pandoc
35
- from kreuzberg._pdf import (
36
- extract_pdf_content,
37
- extract_pdf_file,
38
- )
39
- from kreuzberg._pptx import extract_pptx_file_content
40
- from kreuzberg._string import safe_decode
41
- from kreuzberg._tesseract import PSMMode, process_image_with_tesseract
42
- from kreuzberg._xlsx import extract_xlsx_content, extract_xlsx_file
43
- from kreuzberg.exceptions import ValidationError
13
+ from kreuzberg._registry import ExtractorRegistry
14
+ from kreuzberg._types import ExtractionConfig
15
+ from kreuzberg._utils._string import safe_decode
16
+ from kreuzberg._utils._sync import run_maybe_async, run_maybe_sync
44
17
 
45
18
  if TYPE_CHECKING:
46
19
  from collections.abc import Sequence
47
20
  from os import PathLike
48
21
 
49
22
 
50
- async def extract_bytes(
51
- content: bytes,
52
- mime_type: str,
53
- *,
54
- force_ocr: bool = False,
55
- language: str = "eng",
56
- max_processes: int = DEFAULT_MAX_PROCESSES,
57
- psm: PSMMode = PSMMode.AUTO,
58
- ) -> ExtractionResult:
59
- """Extract the textual content from a given byte string representing a file's contents.
23
+ DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()
60
24
 
61
- Args:
62
- content: The content to extract.
63
- mime_type: The mime type of the content.
64
- force_ocr: Whether to force OCR on PDF files that have a text layer.
65
- language: The language code for OCR. Defaults to "eng".
66
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
67
- psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
68
25
 
69
- Raises:
70
- ValidationError: If the mime type is not supported.
26
+ async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
27
+ for validator in config.validators or []:
28
+ await run_maybe_sync(validator, result)
71
29
 
72
- Returns:
73
- The extracted content and the mime type of the content.
74
- """
75
- if mime_type not in SUPPORTED_MIME_TYPES or not any(mime_type.startswith(value) for value in SUPPORTED_MIME_TYPES):
76
- raise ValidationError(
77
- f"Unsupported mime type: {mime_type}",
78
- context={"mime_type": mime_type, "supported_mimetypes": ",".join(sorted(SUPPORTED_MIME_TYPES))},
30
+ if config.chunk_content:
31
+ result.chunks = _handle_chunk_content(
32
+ mime_type=result.mime_type,
33
+ config=config,
34
+ content=result.content,
79
35
  )
80
36
 
81
- if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
82
- return await extract_pdf_content(
83
- content, force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
84
- )
37
+ for post_processor in config.post_processing_hooks or []:
38
+ result = await run_maybe_sync(post_processor, result)
85
39
 
86
- if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
87
- return await extract_xlsx_content(content)
40
+ return result
88
41
 
89
- if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
90
- return await process_image_with_tesseract(open_image(BytesIO(content)), psm=psm, language=language)
91
42
 
92
- if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
93
- mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
94
- ):
95
- return await process_content_with_pandoc(content=content, mime_type=mime_type)
43
+ def _validate_and_post_process_sync(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
44
+ for validator in config.validators or []:
45
+ run_maybe_async(validator, result)
96
46
 
97
- if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
98
- return await extract_pptx_file_content(content)
47
+ if config.chunk_content:
48
+ result.chunks = _handle_chunk_content(
49
+ mime_type=result.mime_type,
50
+ config=config,
51
+ content=result.content,
52
+ )
99
53
 
100
- if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
101
- return await extract_html_string(content)
54
+ for post_processor in config.post_processing_hooks or []:
55
+ result = run_maybe_async(post_processor, result)
102
56
 
103
- return ExtractionResult(
104
- content=safe_decode(content),
105
- mime_type=mime_type,
106
- metadata={},
107
- )
57
+ return result
108
58
 
109
59
 
110
- async def extract_file(
111
- file_path: PathLike[str] | str,
112
- mime_type: str | None = None,
113
- *,
114
- force_ocr: bool = False,
115
- language: str = "eng",
116
- max_processes: int = DEFAULT_MAX_PROCESSES,
117
- psm: PSMMode = PSMMode.AUTO,
118
- ) -> ExtractionResult:
119
- """Extract the textual content from a given file.
60
+ def _handle_chunk_content(
61
+ mime_type: str,
62
+ config: ExtractionConfig,
63
+ content: str,
64
+ ) -> list[str]:
65
+ chunker = get_chunker(mime_type=mime_type, max_characters=config.max_chars, overlap_characters=config.max_overlap)
66
+ return chunker.chunks(content)
67
+
68
+
69
+ async def extract_bytes(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
70
+ """Extract the textual content from a given byte string representing a file's contents.
120
71
 
121
72
  Args:
122
- file_path: The path to the file.
73
+ content: The content to extract.
123
74
  mime_type: The mime type of the content.
124
- force_ocr: Whether to force OCR on PDF files that have a text layer.
125
- language: The language code for OCR. Defaults to "eng".
126
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
127
- psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
75
+ config: Extraction options object, defaults to the default object.
128
76
 
129
- Raises:
130
- ValidationError: If the mime type is not supported.
131
77
 
132
78
  Returns:
133
79
  The extracted content and the mime type of the content.
134
80
  """
135
- input_file = await AsyncPath(file_path).resolve()
136
-
137
- mime_type = validate_mime_type(input_file, mime_type)
138
-
139
- if not await input_file.exists():
140
- raise ValidationError("The file does not exist.", context={"input_file": str(input_file)})
141
-
142
- if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
143
- return await extract_pdf_file(
144
- Path(input_file), force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
81
+ mime_type = validate_mime_type(mime_type=mime_type)
82
+ if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
83
+ result = await extractor.extract_bytes_async(content)
84
+ else:
85
+ result = ExtractionResult(
86
+ content=safe_decode(content),
87
+ chunks=[],
88
+ mime_type=mime_type,
89
+ metadata={},
145
90
  )
146
91
 
147
- if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
148
- return await extract_xlsx_file(Path(input_file))
92
+ return await _validate_and_post_process_async(result=result, config=config)
149
93
 
150
- if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
151
- return await process_image_with_tesseract(input_file, psm=psm, language=language)
152
94
 
153
- if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
154
- mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
155
- ):
156
- return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type)
95
+ async def extract_file(
96
+ file_path: PathLike[str] | str, mime_type: str | None = None, config: ExtractionConfig = DEFAULT_CONFIG
97
+ ) -> ExtractionResult:
98
+ """Extract the textual content from a given file.
157
99
 
158
- if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
159
- return await extract_pptx_file_content(Path(input_file))
100
+ Args:
101
+ file_path: The path to the file.
102
+ mime_type: The mime type of the content.
103
+ config: Extraction options object, defaults to the default object.
160
104
 
161
- if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
162
- return await extract_html_string(Path(input_file))
105
+ Returns:
106
+ The extracted content and the mime type of the content.
107
+ """
108
+ mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
109
+ if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
110
+ result = await extractor.extract_path_async(Path(file_path))
111
+ else:
112
+ result = ExtractionResult(
113
+ content=safe_decode(await anyio.Path(file_path).read_bytes()), chunks=[], mime_type=mime_type, metadata={}
114
+ )
163
115
 
164
- return ExtractionResult(content=safe_decode(await input_file.read_bytes()), mime_type=mime_type, metadata={})
116
+ return await _validate_and_post_process_async(result=result, config=config)
165
117
 
166
118
 
167
119
  async def batch_extract_file(
168
- file_paths: Sequence[PathLike[str] | str],
169
- *,
170
- force_ocr: bool = False,
171
- language: str = "eng",
172
- max_processes: int = DEFAULT_MAX_PROCESSES,
173
- psm: PSMMode = PSMMode.AUTO,
120
+ file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
174
121
  ) -> list[ExtractionResult]:
175
122
  """Extract text from multiple files concurrently.
176
123
 
177
124
  Args:
178
125
  file_paths: A sequence of paths to files to extract text from.
179
- force_ocr: Whether to force OCR on PDF files that have a text layer.
180
- language: The language code for OCR. Defaults to "eng".
181
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
182
- psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
126
+ config: Extraction options object, defaults to the default object.
183
127
 
184
128
  Returns:
185
129
  A list of extraction results in the same order as the input paths.
186
130
  """
187
- results = cast(list[ExtractionResult], ([None] * len(file_paths)))
131
+ results = cast("list[ExtractionResult]", ([None] * len(file_paths)))
188
132
 
189
133
  async def _extract_file(path: PathLike[str] | str, index: int) -> None:
190
134
  result = await extract_file(
191
135
  path,
192
- force_ocr=force_ocr,
193
- max_processes=max_processes,
194
- psm=psm,
195
- language=language,
136
+ None,
137
+ config,
196
138
  )
197
139
  results[index] = result
198
140
 
@@ -204,36 +146,21 @@ async def batch_extract_file(
204
146
 
205
147
 
206
148
  async def batch_extract_bytes(
207
- contents: Sequence[tuple[bytes, str]],
208
- *,
209
- force_ocr: bool = False,
210
- language: str = "eng",
211
- max_processes: int = DEFAULT_MAX_PROCESSES,
212
- psm: PSMMode = PSMMode.AUTO,
149
+ contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
213
150
  ) -> list[ExtractionResult]:
214
151
  """Extract text from multiple byte contents concurrently.
215
152
 
216
153
  Args:
217
154
  contents: A sequence of tuples containing (content, mime_type) pairs.
218
- force_ocr: Whether to force OCR on PDF files that have a text layer.
219
- language: The language code for OCR. Defaults to "eng".
220
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
221
- psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
155
+ config: Extraction options object, defaults to the default object.
222
156
 
223
157
  Returns:
224
158
  A list of extraction results in the same order as the input contents.
225
159
  """
226
- results = cast(list[ExtractionResult], [None] * len(contents))
160
+ results = cast("list[ExtractionResult]", [None] * len(contents))
227
161
 
228
162
  async def _extract_bytes(content: bytes, mime_type: str, index: int) -> None:
229
- result = await extract_bytes(
230
- content,
231
- mime_type,
232
- force_ocr=force_ocr,
233
- max_processes=max_processes,
234
- psm=psm,
235
- language=language,
236
- )
163
+ result = await extract_bytes(content, mime_type, config)
237
164
  results[index] = result
238
165
 
239
166
  async with anyio.create_task_group() as tg:
@@ -243,122 +170,82 @@ async def batch_extract_bytes(
243
170
  return results
244
171
 
245
172
 
246
- ### Sync proxies
247
-
248
-
249
- def extract_bytes_sync(
250
- content: bytes,
251
- mime_type: str,
252
- *,
253
- force_ocr: bool = False,
254
- language: str = "eng",
255
- max_processes: int = DEFAULT_MAX_PROCESSES,
256
- psm: PSMMode = PSMMode.AUTO,
257
- ) -> ExtractionResult:
173
+ def extract_bytes_sync(content: bytes, mime_type: str, config: ExtractionConfig = DEFAULT_CONFIG) -> ExtractionResult:
258
174
  """Synchronous version of extract_bytes.
259
175
 
260
176
  Args:
261
177
  content: The content to extract.
262
178
  mime_type: The mime type of the content.
263
- force_ocr: Whether to force OCR on PDF files that have a text layer.
264
- language: The language code for OCR. Defaults to "eng".
265
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
266
- psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
179
+ config: Extraction options object, defaults to the default object.
267
180
 
268
181
  Returns:
269
182
  The extracted content and the mime type of the content.
270
183
  """
271
- handler = partial(
272
- extract_bytes, content, mime_type, max_processes=max_processes, force_ocr=force_ocr, language=language, psm=psm
273
- )
274
- return anyio.run(handler)
184
+ mime_type = validate_mime_type(mime_type=mime_type)
185
+ if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
186
+ result = extractor.extract_bytes_sync(content)
187
+ else:
188
+ result = ExtractionResult(
189
+ content=safe_decode(content),
190
+ chunks=[],
191
+ mime_type=mime_type,
192
+ metadata={},
193
+ )
194
+
195
+ return _validate_and_post_process_sync(result=result, config=config)
275
196
 
276
197
 
277
198
  def extract_file_sync(
278
- file_path: Path | str,
279
- mime_type: str | None = None,
280
- *,
281
- force_ocr: bool = False,
282
- language: str = "eng",
283
- max_processes: int = DEFAULT_MAX_PROCESSES,
284
- psm: PSMMode = PSMMode.AUTO,
199
+ file_path: Path | str, mime_type: str | None = None, config: ExtractionConfig = DEFAULT_CONFIG
285
200
  ) -> ExtractionResult:
286
201
  """Synchronous version of extract_file.
287
202
 
288
203
  Args:
289
204
  file_path: The path to the file.
290
205
  mime_type: The mime type of the content.
291
- force_ocr: Whether to force OCR on PDF files that have a text layer.
292
- language: The language code for OCR. Defaults to "eng".
293
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
294
- psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
206
+ config: Extraction options object, defaults to the default object.
295
207
 
296
208
  Returns:
297
209
  The extracted content and the mime type of the content.
298
210
  """
299
- handler = partial(
300
- extract_file, file_path, mime_type, max_processes=max_processes, force_ocr=force_ocr, language=language, psm=psm
301
- )
302
- return anyio.run(handler)
211
+ mime_type = validate_mime_type(file_path=file_path, mime_type=mime_type)
212
+ if extractor := ExtractorRegistry.get_extractor(mime_type=mime_type, config=config):
213
+ result = extractor.extract_path_sync(Path(file_path))
214
+ else:
215
+ result = ExtractionResult(
216
+ content=Path(file_path).read_text(),
217
+ chunks=[],
218
+ mime_type=mime_type,
219
+ metadata={},
220
+ )
221
+ return _validate_and_post_process_sync(result=result, config=config)
303
222
 
304
223
 
305
224
  def batch_extract_file_sync(
306
- file_paths: Sequence[PathLike[str] | str],
307
- *,
308
- force_ocr: bool = False,
309
- language: str = "eng",
310
- max_processes: int = DEFAULT_MAX_PROCESSES,
311
- psm: PSMMode = PSMMode.AUTO,
225
+ file_paths: Sequence[PathLike[str] | str], config: ExtractionConfig = DEFAULT_CONFIG
312
226
  ) -> list[ExtractionResult]:
313
227
  """Synchronous version of batch_extract_file.
314
228
 
315
229
  Args:
316
230
  file_paths: A sequence of paths to files to extract text from.
317
- force_ocr: Whether to force OCR on PDF files that have a text layer.
318
- language: The language code for OCR. Defaults to "eng".
319
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
320
- psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
231
+ config: Extraction options object, defaults to the default object.
321
232
 
322
233
  Returns:
323
234
  A list of extraction results in the same order as the input paths.
324
235
  """
325
- handler = partial(
326
- batch_extract_file,
327
- file_paths,
328
- force_ocr=force_ocr,
329
- max_processes=max_processes,
330
- language=language,
331
- psm=psm,
332
- )
333
- return anyio.run(handler)
236
+ return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
334
237
 
335
238
 
336
239
  def batch_extract_bytes_sync(
337
- contents: Sequence[tuple[bytes, str]],
338
- *,
339
- force_ocr: bool = False,
340
- language: str = "eng",
341
- max_processes: int = DEFAULT_MAX_PROCESSES,
342
- psm: PSMMode = PSMMode.AUTO,
240
+ contents: Sequence[tuple[bytes, str]], config: ExtractionConfig = DEFAULT_CONFIG
343
241
  ) -> list[ExtractionResult]:
344
242
  """Synchronous version of batch_extract_bytes.
345
243
 
346
244
  Args:
347
245
  contents: A sequence of tuples containing (content, mime_type) pairs.
348
- force_ocr: Whether to force OCR on PDF files that have a text layer.
349
- language: The language code for OCR. Defaults to "eng".
350
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
351
- psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
246
+ config: Extraction options object, defaults to the default object.
352
247
 
353
248
  Returns:
354
249
  A list of extraction results in the same order as the input contents.
355
250
  """
356
- handler = partial(
357
- batch_extract_bytes,
358
- contents,
359
- force_ocr=force_ocr,
360
- max_processes=max_processes,
361
- language=language,
362
- psm=psm,
363
- )
364
- return anyio.run(handler)
251
+ return [extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents]
@@ -0,0 +1,178 @@
1
+ Metadata-Version: 2.4
2
+ Name: kreuzberg
3
+ Version: 3.0.0
4
+ Summary: A text extraction library supporting PDFs, images, office documents and more
5
+ Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
+ License: MIT
7
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
8
+ Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3 :: Only
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Text Processing :: General
22
+ Classifier: Topic :: Utilities
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.9
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: anyio>=4.9.0
28
+ Requires-Dist: charset-normalizer>=3.4.1
29
+ Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
30
+ Requires-Dist: html-to-markdown>=1.2.0
31
+ Requires-Dist: playa-pdf>=0.4.1
32
+ Requires-Dist: pypdfium2==4.30.0
33
+ Requires-Dist: python-calamine>=0.3.1
34
+ Requires-Dist: python-pptx>=1.0.2
35
+ Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
36
+ Provides-Extra: all
37
+ Requires-Dist: easyocr>=1.7.2; extra == "all"
38
+ Requires-Dist: numpy>=2.0.2; extra == "all"
39
+ Requires-Dist: paddleocr>=2.10.0; extra == "all"
40
+ Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "all"
41
+ Requires-Dist: semantic-text-splitter>=0.24.1; extra == "all"
42
+ Requires-Dist: setuptools>=76.0.0; extra == "all"
43
+ Provides-Extra: chunking
44
+ Requires-Dist: semantic-text-splitter>=0.24.1; extra == "chunking"
45
+ Provides-Extra: easyocr
46
+ Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
47
+ Provides-Extra: paddleocr
48
+ Requires-Dist: numpy>=2.0.2; extra == "paddleocr"
49
+ Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
50
+ Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "paddleocr"
51
+ Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
52
+ Dynamic: license-file
53
+
54
+ # Kreuzberg
55
+
56
+ [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
57
+ [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
58
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
59
+
60
+ Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
61
+
62
+ ## Why Kreuzberg?
63
+
64
+ - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
65
+ - **Local Processing**: No external API calls or cloud dependencies required
66
+ - **Resource Efficient**: Lightweight processing without GPU requirements
67
+ - **Format Support**: Comprehensive support for documents, images, and text formats
68
+ - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
69
+ - **Modern Python**: Built with async/await, type hints, and a functional-first approach
70
+ - **Permissive OSS**: MIT licensed with permissively licensed dependencies
71
+
72
+ ## Quick Start
73
+
74
+ ```bash
75
+ pip install kreuzberg
76
+ ```
77
+
78
+ Install the system dependencies (Tesseract OCR and Pandoc):
79
+
80
+ ```bash
81
+ # Ubuntu/Debian
82
+ sudo apt-get install tesseract-ocr pandoc
83
+
84
+ # macOS
85
+ brew install tesseract pandoc
86
+
87
+ # Windows
88
+ choco install -y tesseract pandoc
89
+ ```
90
+
91
+ The tesseract OCR engine is the default OCR engine. You can decide not to use it - and then either use one of the two alternative OCR engines, or have no OCR at all.
92
+
93
+ ### Alternative OCR engines
94
+
95
+ ```bash
96
+ # Install with EasyOCR support
97
+ pip install "kreuzberg[easyocr]"
98
+
99
+ # Install with PaddleOCR support
100
+ pip install "kreuzberg[paddleocr]"
101
+ ```
102
+
103
+ ## Quick Example
104
+
105
+ ```python
106
+ import asyncio
107
+ from kreuzberg import extract_file
108
+
109
+ async def main():
110
+ # Extract text from a PDF
111
+ result = await extract_file("document.pdf")
112
+ print(result.content)
113
+
114
+ # Extract text from an image
115
+ result = await extract_file("scan.jpg")
116
+ print(result.content)
117
+
118
+ # Extract text from a Word document
119
+ result = await extract_file("report.docx")
120
+ print(result.content)
121
+
122
+ asyncio.run(main())
123
+ ```
124
+
125
+ ## Documentation
126
+
127
+ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
128
+
129
+ - [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
130
+ - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
131
+ - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
132
+ - [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
133
+ - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
134
+ - [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) - Choose the right OCR engine
135
+
136
+ ## Supported Formats
137
+
138
+ Kreuzberg supports a wide range of document formats:
139
+
140
+ - **Documents**: PDF, DOCX, DOC, RTF, TXT, EPUB, etc.
141
+ - **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
142
+ - **Spreadsheets**: XLSX, XLS, CSV, etc.
143
+ - **Presentations**: PPTX, PPT, etc.
144
+ - **Web Content**: HTML, XML, etc.
145
+
146
+ ## OCR Engines
147
+
148
+ Kreuzberg supports multiple OCR engines:
149
+
150
+ - **Tesseract** (Default): Lightweight, fast startup, requires system installation
151
+ - **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
152
+ - **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
153
+
154
+ For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
155
+
156
+ ## Contribution
157
+
158
+ This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before submitting PRs to avoid disappointment.
159
+
160
+ ### Local Development
161
+
162
+ 1. Clone the repo
163
+
164
+ 1. Install the system dependencies
165
+
166
+ 1. Install the full dependencies with `uv sync`
167
+
168
+ 1. Install the pre-commit hooks with:
169
+
170
+ ```shell
171
+ pre-commit install && pre-commit install --hook-type commit-msg
172
+ ```
173
+
174
+ 1. Make your changes and submit a PR
175
+
176
+ ## License
177
+
178
+ This library is released under the MIT license.
@@ -0,0 +1,15 @@
1
+ kreuzberg/__init__.py,sha256=KZ_y21m64cafWL7goGeG3EIDutM184st28n4UGajADs,1131
2
+ kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
3
+ kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
4
+ kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
5
+ kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
6
+ kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
7
+ kreuzberg/_types.py,sha256=sZMxjRZQ1c_MzxdumhYSWghW6yXBwohTUIBa5eR-FKA,6582
8
+ kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
9
+ kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
10
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ kreuzberg-3.0.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
12
+ kreuzberg-3.0.0.dist-info/METADATA,sha256=wlO9VCvZQy_gJJTmhGzH9j8BlPQPFQdmMZQxJOcQAUg,6515
13
+ kreuzberg-3.0.0.dist-info/WHEEL,sha256=1tXe9gY0PYatrMPMDd6jXqjfpz_B-Wqm32CPfRC58XU,91
14
+ kreuzberg-3.0.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
15
+ kreuzberg-3.0.0.dist-info/RECORD,,