kreuzberg 2.0.0__tar.gz → 2.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/PKG-INFO +48 -20
  2. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/README.md +47 -19
  3. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/__init__.py +14 -1
  4. kreuzberg-2.1.0/kreuzberg/_constants.py +8 -0
  5. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_html.py +1 -2
  6. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_pandoc.py +37 -73
  7. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_pdf.py +5 -6
  8. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_string.py +1 -1
  9. kreuzberg-2.1.0/kreuzberg/_sync.py +74 -0
  10. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_tesseract.py +55 -176
  11. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_xlsx.py +34 -36
  12. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/exceptions.py +20 -1
  13. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/extraction.py +13 -15
  14. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg.egg-info/PKG-INFO +48 -20
  15. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/pyproject.toml +4 -5
  16. kreuzberg-2.0.0/kreuzberg/_constants.py +0 -6
  17. kreuzberg-2.0.0/kreuzberg/_sync.py +0 -33
  18. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/LICENSE +0 -0
  19. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_mime_types.py +0 -0
  20. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_pptx.py +0 -0
  21. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_tmp.py +0 -0
  22. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_types.py +0 -0
  23. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/py.typed +0 -0
  24. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg.egg-info/SOURCES.txt +0 -0
  25. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg.egg-info/dependency_links.txt +0 -0
  26. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg.egg-info/requires.txt +0 -0
  27. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg.egg-info/top_level.txt +0 -0
  28. {kreuzberg-2.0.0 → kreuzberg-2.1.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 2.0.0
3
+ Version: 2.1.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -42,7 +42,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
42
42
  - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
43
43
  - **Local Processing**: No external API calls or cloud dependencies required
44
44
  - **Resource Efficient**: Lightweight processing without GPU requirements
45
- - **Lightweight**: Has few curated dependencies and a minimal footprint
45
+ - **Small Package Size**: Has few curated dependencies and a minimal footprint
46
46
  - **Format Support**: Comprehensive support for documents, images, and text formats
47
47
  - **Modern Python**: Built with async/await, type hints, and functional first approach
48
48
  - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
@@ -61,10 +61,34 @@ pip install kreuzberg
61
61
 
62
62
  Kreuzberg requires two system level dependencies:
63
63
 
64
- - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
65
- - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
64
+ - [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
65
+ - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 5.
66
66
 
67
- Please install these using their respective installation guides.
67
+ You can install these with:
68
+
69
+ #### Linux (Ubuntu)
70
+
71
+ ```shell
72
+ sudo apt-get install pandoc tesseract-ocr
73
+ ```
74
+
75
+ #### MacOS
76
+
77
+ ```shell
78
+ #
79
+ brew install tesseract pandoc
80
+ ```
81
+
82
+ #### Windows
83
+
84
+ ```shell
85
+ choco install -y tesseract pandoc
86
+ ```
87
+
88
+ Notes:
89
+
90
+ - in most distributions the tesseract-ocr package is split into multiple packages; language models other than English may need to be installed separately.
91
+ - please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.
68
92
 
69
93
  ## Architecture
70
94
 
@@ -152,26 +176,30 @@ All extraction functions accept the following optional parameters for configurin
152
176
 
153
177
  #### OCR Configuration
154
178
 
155
- - `language` (default: "eng"): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for non-English documents. Examples:
156
- - "eng" for English
157
- - "deu" for German
158
- - "fra" for French
179
+ - `force_ocr` (default: `False`): Forces OCR processing even for searchable PDFs.
180
+ - `language` (default: `eng`): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for documents in different languages. Examples:
181
+
182
+ - `eng` for English
183
+ - `deu` for German
184
+ - `eng+deu` for English and German
159
185
 
160
- Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information.
186
+ Note: the order of languages affects processing time; the first language is the primary language, the second language is the secondary language, and so on.
161
187
 
162
- - `psm` (Page Segmentation Mode, default: PSM.AUTO): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
188
+ - `psm` (Page Segmentation Mode, default: `PSM.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
163
189
 
164
- #### Performance Configuration
190
+ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information on both options.
165
191
 
166
- - `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc. Higher values can lead to performance improvements, but may cause resource exhaustion and deadlocks (especially for tesseract).
192
+ #### Processing Configuration
193
+
194
+ - `max_processes` (default: CPU count): Maximum number of concurrent processes for Tesseract.
167
195
 
168
196
  ### Quick Start
169
197
 
170
198
  ```python
171
199
  from pathlib import Path
172
200
  from kreuzberg import extract_file
173
- from kreuzberg.extraction import ExtractionResult
174
- from kreuzberg._tesseract import PSMMode, SupportedLanguage
201
+ from kreuzberg import ExtractionResult
202
+ from kreuzberg import PSMMode
175
203
 
176
204
 
177
205
  # Basic file extraction
@@ -193,14 +221,14 @@ async def extract_document():
193
221
  docx_result = await extract_file(Path("document.docx"))
194
222
  if docx_result.metadata:
195
223
  print(f"Title: {docx_result.metadata.get('title')}")
196
- print(f"Author: {docx_result.metadata.get('author')}")
224
+ print(f"Author: {docx_result.metadata.get('creator')}")
197
225
  ```
198
226
 
199
227
  ### Extracting Bytes
200
228
 
201
229
  ```python
202
230
  from kreuzberg import extract_bytes
203
- from kreuzberg.extraction import ExtractionResult
231
+ from kreuzberg import ExtractionResult
204
232
 
205
233
 
206
234
  async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
@@ -236,7 +264,7 @@ Kreuzberg supports efficient batch processing of multiple files or byte contents
236
264
 
237
265
  ```python
238
266
  from pathlib import Path
239
- from kreuzberg import batch_extract_file, batch_extract_bytes
267
+ from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync
240
268
 
241
269
 
242
270
  # Process multiple files concurrently
@@ -346,8 +374,8 @@ async def process_document(path: str) -> tuple[str, str, Metadata]:
346
374
  Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
347
375
 
348
376
  ```python
349
- from kreuzberg import extract_file
350
- from kreuzberg.exceptions import (
377
+ from kreuzberg import (
378
+ extract_file,
351
379
  ValidationError,
352
380
  ParsingError,
353
381
  OCRError,
@@ -7,7 +7,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
7
7
  - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
8
8
  - **Local Processing**: No external API calls or cloud dependencies required
9
9
  - **Resource Efficient**: Lightweight processing without GPU requirements
10
- - **Lightweight**: Has few curated dependencies and a minimal footprint
10
+ - **Small Package Size**: Has few curated dependencies and a minimal footprint
11
11
  - **Format Support**: Comprehensive support for documents, images, and text formats
12
12
  - **Modern Python**: Built with async/await, type hints, and functional first approach
13
13
  - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
@@ -26,10 +26,34 @@ pip install kreuzberg
26
26
 
27
27
  Kreuzberg requires two system level dependencies:
28
28
 
29
- - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
30
- - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
29
+ - [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
30
+ - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 5.
31
31
 
32
- Please install these using their respective installation guides.
32
+ You can install these with:
33
+
34
+ #### Linux (Ubuntu)
35
+
36
+ ```shell
37
+ sudo apt-get install pandoc tesseract-ocr
38
+ ```
39
+
40
+ #### MacOS
41
+
42
+ ```shell
43
+ #
44
+ brew install tesseract pandoc
45
+ ```
46
+
47
+ #### Windows
48
+
49
+ ```shell
50
+ choco install -y tesseract pandoc
51
+ ```
52
+
53
+ Notes:
54
+
55
+ - in most distributions the tesseract-ocr package is split into multiple packages; language models other than English may need to be installed separately.
56
+ - please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.
33
57
 
34
58
  ## Architecture
35
59
 
@@ -117,26 +141,30 @@ All extraction functions accept the following optional parameters for configurin
117
141
 
118
142
  #### OCR Configuration
119
143
 
120
- - `language` (default: "eng"): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for non-English documents. Examples:
121
- - "eng" for English
122
- - "deu" for German
123
- - "fra" for French
144
+ - `force_ocr` (default: `False`): Forces OCR processing even for searchable PDFs.
145
+ - `language` (default: `eng`): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for documents in different languages. Examples:
146
+
147
+ - `eng` for English
148
+ - `deu` for German
149
+ - `eng+deu` for English and German
124
150
 
125
- Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information.
151
+ Note: the order of languages affects processing time; the first language is the primary language, the second language is the secondary language, and so on.
126
152
 
127
- - `psm` (Page Segmentation Mode, default: PSM.AUTO): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
153
+ - `psm` (Page Segmentation Mode, default: `PSM.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
128
154
 
129
- #### Performance Configuration
155
+ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information on both options.
130
156
 
131
- - `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc. Higher values can lead to performance improvements, but may cause resource exhaustion and deadlocks (especially for tesseract).
157
+ #### Processing Configuration
158
+
159
+ - `max_processes` (default: CPU count): Maximum number of concurrent processes for Tesseract.
132
160
 
133
161
  ### Quick Start
134
162
 
135
163
  ```python
136
164
  from pathlib import Path
137
165
  from kreuzberg import extract_file
138
- from kreuzberg.extraction import ExtractionResult
139
- from kreuzberg._tesseract import PSMMode, SupportedLanguage
166
+ from kreuzberg import ExtractionResult
167
+ from kreuzberg import PSMMode
140
168
 
141
169
 
142
170
  # Basic file extraction
@@ -158,14 +186,14 @@ async def extract_document():
158
186
  docx_result = await extract_file(Path("document.docx"))
159
187
  if docx_result.metadata:
160
188
  print(f"Title: {docx_result.metadata.get('title')}")
161
- print(f"Author: {docx_result.metadata.get('author')}")
189
+ print(f"Author: {docx_result.metadata.get('creator')}")
162
190
  ```
163
191
 
164
192
  ### Extracting Bytes
165
193
 
166
194
  ```python
167
195
  from kreuzberg import extract_bytes
168
- from kreuzberg.extraction import ExtractionResult
196
+ from kreuzberg import ExtractionResult
169
197
 
170
198
 
171
199
  async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
@@ -201,7 +229,7 @@ Kreuzberg supports efficient batch processing of multiple files or byte contents
201
229
 
202
230
  ```python
203
231
  from pathlib import Path
204
- from kreuzberg import batch_extract_file, batch_extract_bytes
232
+ from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync
205
233
 
206
234
 
207
235
  # Process multiple files concurrently
@@ -311,8 +339,8 @@ async def process_document(path: str) -> tuple[str, str, Metadata]:
311
339
  Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
312
340
 
313
341
  ```python
314
- from kreuzberg import extract_file
315
- from kreuzberg.exceptions import (
342
+ from kreuzberg import (
343
+ extract_file,
316
344
  ValidationError,
317
345
  ParsingError,
318
346
  OCRError,
@@ -1,6 +1,14 @@
1
+ from ._tesseract import PSMMode
1
2
  from ._types import ExtractionResult, Metadata
2
3
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
3
- from .extraction import extract_bytes, extract_file
4
+ from .extraction import (
5
+ batch_extract_bytes,
6
+ batch_extract_bytes_sync,
7
+ batch_extract_file,
8
+ batch_extract_file_sync,
9
+ extract_bytes,
10
+ extract_file,
11
+ )
4
12
 
5
13
  __all__ = [
6
14
  "ExtractionResult",
@@ -8,8 +16,13 @@ __all__ = [
8
16
  "Metadata",
9
17
  "MissingDependencyError",
10
18
  "OCRError",
19
+ "PSMMode",
11
20
  "ParsingError",
12
21
  "ValidationError",
22
+ "batch_extract_bytes",
23
+ "batch_extract_bytes_sync",
24
+ "batch_extract_file",
25
+ "batch_extract_file_sync",
13
26
  "extract_bytes",
14
27
  "extract_file",
15
28
  ]
@@ -0,0 +1,8 @@
1
+ from __future__ import annotations
2
+
3
+ from multiprocessing import cpu_count
4
+ from typing import Final
5
+
6
+ DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
7
+ MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
8
+ MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2
@@ -8,7 +8,6 @@ from anyio import Path as AsyncPath
8
8
  from kreuzberg import ExtractionResult
9
9
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
10
10
  from kreuzberg._string import normalize_spaces, safe_decode
11
- from kreuzberg._sync import run_sync
12
11
 
13
12
  if TYPE_CHECKING:
14
13
  from pathlib import Path
@@ -28,5 +27,5 @@ async def extract_html_string(file_path_or_contents: Path | bytes) -> Extraction
28
27
  if isinstance(file_path_or_contents, bytes)
29
28
  else await AsyncPath(file_path_or_contents).read_text()
30
29
  )
31
- result = await run_sync(html_to_markdown.convert_to_markdown, content)
30
+ result = html_to_markdown.convert_to_markdown(content)
32
31
  return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
@@ -1,21 +1,22 @@
1
1
  from __future__ import annotations
2
2
 
3
- import subprocess
3
+ import re
4
4
  import sys
5
5
  from functools import partial
6
6
  from json import JSONDecodeError, loads
7
7
  from typing import TYPE_CHECKING, Any, Final, Literal, cast
8
8
 
9
- from anyio import CapacityLimiter, create_task_group, to_process
10
9
  from anyio import Path as AsyncPath
10
+ from anyio import run_process
11
11
 
12
- from kreuzberg._constants import DEFAULT_MAX_PROCESSES
12
+ from kreuzberg import ValidationError
13
+ from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
13
14
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
14
15
  from kreuzberg._string import normalize_spaces
15
- from kreuzberg._sync import run_sync
16
+ from kreuzberg._sync import run_taskgroup
16
17
  from kreuzberg._tmp import create_temp_file
17
18
  from kreuzberg._types import ExtractionResult, Metadata
18
- from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
19
+ from kreuzberg.exceptions import MissingDependencyError, ParsingError
19
20
 
20
21
  if TYPE_CHECKING: # pragma: no cover
21
22
  from collections.abc import Mapping
@@ -24,10 +25,8 @@ if TYPE_CHECKING: # pragma: no cover
24
25
  if sys.version_info < (3, 11): # pragma: no cover
25
26
  from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
26
27
 
27
-
28
28
  version_ref: Final[dict[str, bool]] = {"checked": False}
29
29
 
30
-
31
30
  # Block-level node types in Pandoc AST
32
31
  BLOCK_HEADER: Final = "Header" # Header with level, attributes and inline content
33
32
  BLOCK_PARA: Final = "Para" # Paragraph containing inline content
@@ -229,20 +228,15 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
229
228
 
230
229
 
231
230
  def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
232
- if mime_type not in MIMETYPE_TO_PANDOC_TYPE_MAPPING or not any(
233
- mime_type.startswith(value) for value in MIMETYPE_TO_PANDOC_TYPE_MAPPING
234
- ):
235
- raise ValidationError(
236
- f"Unsupported mime type: {mime_type}",
237
- context={
238
- "mime_type": mime_type,
239
- "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
240
- },
231
+ if pandoc_type := (MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
232
+ return pandoc_type
233
+
234
+ if any(k.startswith(mime_type) for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING):
235
+ return next(
236
+ MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
241
237
  )
242
238
 
243
- return MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type) or next(
244
- MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
245
- )
239
+ raise ValidationError(f"Unsupported mime type: {mime_type}")
246
240
 
247
241
 
248
242
  async def _validate_pandoc_version() -> None:
@@ -251,20 +245,19 @@ async def _validate_pandoc_version() -> None:
251
245
  return
252
246
 
253
247
  command = ["pandoc", "--version"]
254
- result = await run_sync(subprocess.run, command, capture_output=True)
255
- version = result.stdout.decode().split("\n")[0].split()[1]
256
- if not version.startswith("3."):
257
- raise MissingDependencyError("Pandoc version 3 or above is required.")
248
+ result = await run_process(command)
249
+
250
+ version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
251
+ if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
252
+ raise MissingDependencyError("Pandoc version 2 or above is required")
258
253
 
259
254
  version_ref["checked"] = True
260
255
 
261
256
  except FileNotFoundError as e:
262
- raise MissingDependencyError("Pandoc is not installed.") from e
257
+ raise MissingDependencyError("Pandoc is not installed") from e
263
258
 
264
259
 
265
- async def _handle_extract_metadata(
266
- input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
267
- ) -> Metadata:
260
+ async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
268
261
  pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
269
262
  metadata_file, unlink = await create_temp_file(".json")
270
263
  try:
@@ -276,15 +269,10 @@ async def _handle_extract_metadata(
276
269
  "--standalone",
277
270
  "--quiet",
278
271
  "--output",
279
- metadata_file,
272
+ str(metadata_file),
280
273
  ]
281
274
 
282
- result = await to_process.run_sync(
283
- partial(subprocess.run, capture_output=True),
284
- command,
285
- cancellable=True,
286
- limiter=CapacityLimiter(max_processes),
287
- )
275
+ result = await run_process(command)
288
276
 
289
277
  if result.returncode != 0:
290
278
  raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -297,9 +285,7 @@ async def _handle_extract_metadata(
297
285
  await unlink()
298
286
 
299
287
 
300
- async def _handle_extract_file(
301
- input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
302
- ) -> str:
288
+ async def _handle_extract_file(input_file: str | PathLike[str], *, mime_type: str) -> str:
303
289
  pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
304
290
  output_path, unlink = await create_temp_file(".md")
305
291
  try:
@@ -315,12 +301,7 @@ async def _handle_extract_file(
315
301
 
316
302
  command.extend(["--output", str(output_path)])
317
303
 
318
- result = await to_process.run_sync(
319
- partial(subprocess.run, capture_output=True),
320
- command,
321
- cancellable=True,
322
- limiter=CapacityLimiter(max_processes),
323
- )
304
+ result = await run_process(command)
324
305
 
325
306
  if result.returncode != 0:
326
307
  raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -334,15 +315,12 @@ async def _handle_extract_file(
334
315
  await unlink()
335
316
 
336
317
 
337
- async def process_file_with_pandoc(
338
- input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
339
- ) -> ExtractionResult:
318
+ async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type: str) -> ExtractionResult:
340
319
  """Process a single file using Pandoc and convert to markdown.
341
320
 
342
321
  Args:
343
322
  input_file: The path to the file to process.
344
323
  mime_type: The mime type of the file.
345
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
346
324
 
347
325
  Raises:
348
326
  ParsingError: If the file data could not be extracted.
@@ -354,41 +332,27 @@ async def process_file_with_pandoc(
354
332
 
355
333
  _get_pandoc_type_from_mime_type(mime_type)
356
334
 
357
- metadata: Metadata = {}
358
- content: str = ""
359
-
360
335
  try:
361
- async with create_task_group() as tg:
362
-
363
- async def _get_metadata() -> None:
364
- nonlocal metadata
365
- metadata = await _handle_extract_metadata(input_file, mime_type=mime_type, max_processes=max_processes)
366
-
367
- async def _get_content() -> None:
368
- nonlocal content
369
- content = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)
336
+ metadata, content = await run_taskgroup(
337
+ partial(_handle_extract_metadata, input_file, mime_type=mime_type),
338
+ partial(_handle_extract_file, input_file, mime_type=mime_type),
339
+ )
370
340
 
371
- tg.start_soon(_get_metadata)
372
- tg.start_soon(_get_content)
341
+ return ExtractionResult(
342
+ content=normalize_spaces(cast(str, content)),
343
+ metadata=cast(Metadata, metadata),
344
+ mime_type=MARKDOWN_MIME_TYPE,
345
+ )
373
346
  except ExceptionGroup as eg:
374
- raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from eg.exceptions[0]
375
-
376
- return ExtractionResult(
377
- content=normalize_spaces(content),
378
- metadata=metadata,
379
- mime_type=MARKDOWN_MIME_TYPE,
380
- )
347
+ raise ParsingError("Failed to process file", context={"file": str(input_file), "errors": eg.exceptions}) from eg
381
348
 
382
349
 
383
- async def process_content_with_pandoc(
384
- content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
385
- ) -> ExtractionResult:
350
+ async def process_content_with_pandoc(content: bytes, *, mime_type: str) -> ExtractionResult:
386
351
  """Process content using Pandoc and convert to markdown.
387
352
 
388
353
  Args:
389
354
  content: The content to process.
390
355
  mime_type: The mime type of the content.
391
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
392
356
 
393
357
  Returns:
394
358
  ExtractionResult
@@ -397,7 +361,7 @@ async def process_content_with_pandoc(
397
361
  input_file, unlink = await create_temp_file(f".{extension}")
398
362
 
399
363
  await AsyncPath(input_file).write_bytes(content)
400
- result = await process_file_with_pandoc(input_file, mime_type=mime_type, max_processes=max_processes)
364
+ result = await process_file_with_pandoc(input_file, mime_type=mime_type)
401
365
 
402
366
  await unlink()
403
367
  return result
@@ -11,7 +11,7 @@ from kreuzberg import ExtractionResult
11
11
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
12
12
  from kreuzberg._string import normalize_spaces
13
13
  from kreuzberg._sync import run_sync
14
- from kreuzberg._tesseract import PSMMode, SupportedLanguage, batch_process_images
14
+ from kreuzberg._tesseract import PSMMode, batch_process_images
15
15
  from kreuzberg.exceptions import ParsingError
16
16
 
17
17
  if TYPE_CHECKING: # pragma: no cover
@@ -67,7 +67,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
67
67
  document: pypdfium2.PdfDocument | None = None
68
68
  try:
69
69
  document = await run_sync(pypdfium2.PdfDocument, str(input_file))
70
- return [page.render(scale=2.0).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
70
+ return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
71
71
  except pypdfium2.PdfiumError as e:
72
72
  raise ParsingError(
73
73
  "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
@@ -80,7 +80,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
80
80
  async def _extract_pdf_text_with_ocr(
81
81
  input_file: Path,
82
82
  *,
83
- language: SupportedLanguage = "eng",
83
+ language: str = "eng",
84
84
  max_processes: int,
85
85
  psm: PSMMode = PSMMode.AUTO,
86
86
  ) -> ExtractionResult:
@@ -132,7 +132,7 @@ async def extract_pdf_file(
132
132
  input_file: Path,
133
133
  *,
134
134
  force_ocr: bool,
135
- language: SupportedLanguage = "eng",
135
+ language: str = "eng",
136
136
  max_processes: int,
137
137
  psm: PSMMode = PSMMode.AUTO,
138
138
  ) -> ExtractionResult:
@@ -154,7 +154,6 @@ async def extract_pdf_file(
154
154
  and _validate_extracted_text(content)
155
155
  ):
156
156
  return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
157
-
158
157
  return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
159
158
 
160
159
 
@@ -162,7 +161,7 @@ async def extract_pdf_content(
162
161
  content: bytes,
163
162
  *,
164
163
  force_ocr: bool,
165
- language: SupportedLanguage = "eng",
164
+ language: str = "eng",
166
165
  max_processes: int,
167
166
  psm: PSMMode = PSMMode.AUTO,
168
167
  ) -> ExtractionResult:
@@ -22,7 +22,7 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
22
22
  encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
23
23
 
24
24
  for enc in [e for e in encodings if e]: # pragma: no cover
25
- with suppress(UnicodeDecodeError):
25
+ with suppress(UnicodeDecodeError, LookupError):
26
26
  return byte_data.decode(enc)
27
27
 
28
28
  # If all encodings fail, fall back to latin-1 which can handle any byte
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from functools import partial
5
+ from typing import TYPE_CHECKING, TypeVar, cast
6
+
7
+ from anyio import create_task_group
8
+ from anyio.to_thread import run_sync as any_io_run_sync
9
+
10
+ if TYPE_CHECKING: # pragma: no cover
11
+ from collections.abc import Callable, Coroutine
12
+
13
+ if sys.version_info >= (3, 10):
14
+ from typing import ParamSpec
15
+ else: # pragma: no cover
16
+ from typing_extensions import ParamSpec
17
+
18
+ T = TypeVar("T")
19
+ P = ParamSpec("P")
20
+
21
+
22
+ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
23
+ """Run a synchronous function in an asynchronous context.
24
+
25
+ Args:
26
+ sync_fn: The synchronous function to run.
27
+ *args: The positional arguments to pass to the function.
28
+ **kwargs: The keyword arguments to pass to the function.
29
+
30
+ Returns:
31
+ The result of the synchronous function.
32
+ """
33
+ handler = partial(sync_fn, **kwargs)
34
+ return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
35
+
36
+
37
+ async def run_taskgroup(*async_tasks: Callable[[], Coroutine[None, None, T]]) -> list[T]:
38
+ """Run a list of coroutines concurrently.
39
+
40
+ Args:
41
+ *async_tasks: The list of coroutines to run.
42
+
43
+ Returns:
44
+ The results of the coroutines.
45
+ """
46
+ results = cast(list[T], [None] * len(async_tasks))
47
+
48
+ async def run_task(index: int, task: Callable[[], Coroutine[None, None, T]]) -> None:
49
+ results[index] = await task()
50
+
51
+ async with create_task_group() as tg:
52
+ for i, t in enumerate(async_tasks):
53
+ tg.start_soon(run_task, i, t)
54
+
55
+ return results
56
+
57
+
58
+ async def run_taskgroup_batched(*async_tasks: Callable[[], Coroutine[None, None, T]], batch_size: int) -> list[T]:
59
+ """Run a list of coroutines concurrently in batches.
60
+
61
+ Args:
62
+ *async_tasks: The list of coroutines to run.
63
+ batch_size: The size of each batch.
64
+
65
+ Returns:
66
+ The results of the coroutines.
67
+ """
68
+ results: list[T] = []
69
+
70
+ for i in range(0, len(async_tasks), batch_size):
71
+ batch = async_tasks[i : i + batch_size]
72
+ results.extend(await run_taskgroup(*batch))
73
+
74
+ return results