kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +3 -0
- kreuzberg/__main__.py +8 -0
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_cli_config.py +175 -0
- kreuzberg/_extractors/_image.py +39 -4
- kreuzberg/_extractors/_pandoc.py +158 -18
- kreuzberg/_extractors/_pdf.py +199 -19
- kreuzberg/_extractors/_presentation.py +1 -1
- kreuzberg/_extractors/_spread_sheet.py +65 -7
- kreuzberg/_gmft.py +222 -16
- kreuzberg/_mime_types.py +62 -16
- kreuzberg/_multiprocessing/__init__.py +6 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
- kreuzberg/_multiprocessing/process_manager.py +188 -0
- kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
- kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
- kreuzberg/_ocr/_easyocr.py +6 -12
- kreuzberg/_ocr/_paddleocr.py +15 -13
- kreuzberg/_ocr/_tesseract.py +136 -46
- kreuzberg/_playa.py +43 -0
- kreuzberg/_types.py +4 -0
- kreuzberg/_utils/_cache.py +372 -0
- kreuzberg/_utils/_device.py +10 -27
- kreuzberg/_utils/_document_cache.py +220 -0
- kreuzberg/_utils/_errors.py +232 -0
- kreuzberg/_utils/_pdf_lock.py +72 -0
- kreuzberg/_utils/_process_pool.py +100 -0
- kreuzberg/_utils/_serialization.py +82 -0
- kreuzberg/_utils/_string.py +1 -1
- kreuzberg/_utils/_sync.py +21 -0
- kreuzberg/cli.py +338 -0
- kreuzberg/extraction.py +247 -36
- kreuzberg-3.4.0.dist-info/METADATA +290 -0
- kreuzberg-3.4.0.dist-info/RECORD +50 -0
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
- kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
- kreuzberg-3.2.0.dist-info/METADATA +0 -166
- kreuzberg-3.2.0.dist-info/RECORD +0 -34
- kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_string.py
CHANGED
@@ -20,7 +20,7 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
|
|
20
20
|
|
21
21
|
encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
|
22
22
|
|
23
|
-
for enc in [e for e in encodings if e]:
|
23
|
+
for enc in [e for e in encodings if e]:
|
24
24
|
with suppress(UnicodeDecodeError, LookupError):
|
25
25
|
return byte_data.decode(enc)
|
26
26
|
|
kreuzberg/_utils/_sync.py
CHANGED
@@ -119,3 +119,24 @@ def run_maybe_async(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs:
|
|
119
119
|
T: The return value of the executed function, resolved if asynchronous.
|
120
120
|
"""
|
121
121
|
return cast("T", fn(*args, **kwargs) if not iscoroutinefunction(fn) else anyio.run(partial(fn, **kwargs), *args))
|
122
|
+
|
123
|
+
|
124
|
+
def run_sync_only(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
    """Run a synchronous callable and return its result; reject coroutine functions.

    This is used for pure sync code paths where we cannot handle async functions.

    Args:
        fn: The function to be executed, must be synchronous.
        *args: Positional arguments to be passed to the function.
        **kwargs: Keyword arguments to be passed to the function.

    Returns:
        T: The return value of the executed function.

    Raises:
        RuntimeError: If the function is asynchronous.
    """
    if iscoroutinefunction(fn):
        # Not every callable carries ``__name__`` (e.g. a ``functools.partial`` wrapping a
        # coroutine function), so fall back to ``repr`` instead of letting an AttributeError
        # mask the intended RuntimeError.
        fn_name = getattr(fn, "__name__", None) or repr(fn)
        raise RuntimeError(f"Cannot run async function {fn_name} in sync-only context")
    return cast("T", fn(*args, **kwargs))
|
kreuzberg/cli.py
ADDED
@@ -0,0 +1,338 @@
|
|
1
|
+
"""Command-line interface for kreuzberg."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import json
|
6
|
+
import sys
|
7
|
+
from pathlib import Path
|
8
|
+
from typing import TYPE_CHECKING, Any
|
9
|
+
|
10
|
+
try:
|
11
|
+
import click
|
12
|
+
from rich.console import Console
|
13
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn
|
14
|
+
except ImportError as e:
|
15
|
+
raise ImportError(
|
16
|
+
"CLI dependencies are not installed. Please install kreuzberg with the 'cli' extra: pip install kreuzberg[cli]"
|
17
|
+
) from e
|
18
|
+
|
19
|
+
from kreuzberg import __version__, extract_bytes_sync, extract_file_sync
|
20
|
+
from kreuzberg._cli_config import build_extraction_config, find_default_config, load_config_from_file
|
21
|
+
from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
|
22
|
+
|
23
|
+
DEFAULT_MAX_CHARACTERS = 4000
|
24
|
+
DEFAULT_MAX_OVERLAP = 200
|
25
|
+
|
26
|
+
if TYPE_CHECKING:
|
27
|
+
from kreuzberg._types import ExtractionConfig, ExtractionResult
|
28
|
+
|
29
|
+
console = Console(stderr=True)
|
30
|
+
|
31
|
+
|
32
|
+
class OcrBackendParamType(click.ParamType):
    """Click parameter type that validates and normalises an OCR backend name."""

    name = "ocr_backend"

    def convert(self, value: Any, param: click.Parameter | None, ctx: click.Context | None) -> str | None:
        """Return the lower-cased backend name, failing the parameter when it is not recognised.

        ``None`` is passed through unchanged. Matching is case-insensitive.
        """
        if value is None:
            return None

        normalized = value.lower()
        valid_backends = ["tesseract", "easyocr", "paddleocr", "none"]
        if normalized not in valid_backends:
            # The message echoes the value exactly as the user typed it.
            self.fail(f"Invalid OCR backend '{value}'. Choose from: {', '.join(valid_backends)}", param, ctx)
        return normalized  # type: ignore[no-any-return]
|
47
|
+
|
48
|
+
|
49
|
+
def format_extraction_result(result: ExtractionResult, show_metadata: bool, output_format: str) -> str:
    """Render an extraction result as plain text or as a JSON document.

    Args:
        result: Extraction result to render.
        show_metadata: When true, the result's metadata is included in the output.
        output_format: ``"json"`` selects JSON output; any other value selects text.

    Returns:
        The rendered string.
    """

    def as_json(value: Any) -> str:
        return json.dumps(value, indent=2, ensure_ascii=False)

    if output_format != "json":
        # Text layout: content first, then optional sections joined by newlines.
        sections = [result.content]
        if show_metadata:
            sections += ["\n\n--- METADATA ---", as_json(result.metadata)]
        if result.tables:
            sections.append("\n\n--- TABLES ---")
            for number, table in enumerate(result.tables, start=1):
                sections += [f"\nTable {number}:", as_json(table)]
        return "\n".join(sections)

    payload: dict[str, Any] = {"content": result.content, "mime_type": result.mime_type}
    if show_metadata:
        payload["metadata"] = result.metadata
    # Tables and chunks are only emitted when they are non-empty.
    if result.tables:
        payload["tables"] = result.tables
    if result.chunks:
        payload["chunks"] = result.chunks
    return as_json(payload)
|
86
|
+
|
87
|
+
|
88
|
+
def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
    """Load configuration from an explicit file, or fall back to a discovered default.

    Args:
        config: Configuration file chosen by the caller, or ``None`` to look for a default one.
        verbose: When true, report which default file was used, or why it was skipped.

    Returns:
        The loaded configuration; empty when no file was found or the default file failed to load.
    """
    if config:
        # An explicitly requested file must load; any error propagates to the caller.
        return load_config_from_file(config)

    default_config = find_default_config()
    if not default_config:
        return {}

    try:
        file_config = load_config_from_file(default_config)
    except Exception as exc:  # noqa: BLE001
        # Best effort: a broken default config must not abort the run, but it should not vanish
        # without a trace either. markup=False keeps brackets in the error text from being
        # parsed as console markup.
        if verbose:
            console.print(
                f"Ignoring configuration from {default_config}: {type(exc).__name__}: {exc}",
                style="yellow",
                markup=False,
            )
        return {}

    if verbose:
        console.print(f"[dim]Using configuration from: {default_config}[/dim]")
    return file_config
|
103
|
+
|
104
|
+
|
105
|
+
def _build_cli_args(
    force_ocr: bool,
    chunk_content: bool,
    extract_tables: bool,
    max_chars: int,
    max_overlap: int,
    ocr_backend: str | None,
    tesseract_lang: str | None,
    tesseract_psm: int | None,
    easyocr_languages: str | None,
    paddleocr_languages: str | None,
) -> dict[str, Any]:
    """Build the dictionary of options supplied on the command line.

    Flags that were not set, and chunk sizes equal to the CLI defaults, are stored as ``None``
    (presumably so the config-file value can win when merged; confirm against
    ``build_extraction_config``). Backend-specific options are only added for the backend named
    by ``ocr_backend``.

    Args:
        force_ocr: Value of the ``--force-ocr`` flag.
        chunk_content: Value of the ``--chunk-content`` flag.
        extract_tables: Value of the ``--extract-tables`` flag.
        max_chars: Maximum characters per chunk.
        max_overlap: Maximum overlap between chunks.
        ocr_backend: Selected OCR backend name, if any.
        tesseract_lang: Tesseract language string, used only with the tesseract backend.
        tesseract_psm: Tesseract PSM mode, used only with the tesseract backend.
        easyocr_languages: Comma-separated language codes, used only with the easyocr backend.
        paddleocr_languages: Comma-separated language codes, used only with the paddleocr backend.

    Returns:
        Mapping of option names to values, plus at most one ``*_config`` sub-dictionary.
    """

    def split_languages(raw: str) -> list[str]:
        # Tolerate "en, de" and trailing commas: strip each code and drop empty entries.
        return [code for code in (part.strip() for part in raw.split(",")) if code]

    cli_args: dict[str, Any] = {
        "force_ocr": force_ocr or None,
        "chunk_content": chunk_content or None,
        "extract_tables": extract_tables or None,
        "max_chars": max_chars if max_chars != DEFAULT_MAX_CHARACTERS else None,
        "max_overlap": max_overlap if max_overlap != DEFAULT_MAX_OVERLAP else None,
        "ocr_backend": ocr_backend,
    }

    if ocr_backend == "tesseract" and (tesseract_lang or tesseract_psm is not None):
        tesseract_config: dict[str, Any] = {}
        if tesseract_lang:
            tesseract_config["language"] = tesseract_lang
        if tesseract_psm is not None:
            # PSM 0 is a legitimate value, hence the explicit ``is not None`` test.
            tesseract_config["psm"] = tesseract_psm
        cli_args["tesseract_config"] = tesseract_config
    elif ocr_backend == "easyocr" and easyocr_languages:
        languages = split_languages(easyocr_languages)
        if languages:
            cli_args["easyocr_config"] = {"languages": languages}
    elif ocr_backend == "paddleocr" and paddleocr_languages:
        languages = split_languages(paddleocr_languages)
        if languages:
            cli_args["paddleocr_config"] = {"languages": languages}

    return cli_args
|
140
|
+
|
141
|
+
|
142
|
+
def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig, verbose: bool) -> ExtractionResult:
    """Extract text from the given file, or from stdin when no file (or ``-``) is given.

    Args:
        file: Document path; ``None`` or a path whose name is ``-`` selects stdin.
        extraction_config: Configuration forwarded to the extraction call.
        verbose: When true, announce that stdin is being read.

    Returns:
        The extraction result.
    """
    spinner_columns = (SpinnerColumn(), TextColumn("[progress.description]{task.description}"))

    if file is not None and not (isinstance(file, Path) and file.name == "-"):
        with Progress(*spinner_columns, console=console, transient=True) as progress:
            progress.add_task(f"Extracting text from {file.name}...", total=None)
            return extract_file_sync(str(file), config=extraction_config)

    if verbose:
        console.print("[dim]Reading from stdin...[/dim]")
    try:
        input_bytes = sys.stdin.buffer.read()
    except Exception:  # noqa: BLE001
        # Fall back to the text interface and re-encode when the binary buffer cannot be read.
        input_bytes = sys.stdin.read().encode("utf-8")

    with Progress(*spinner_columns, console=console, transient=True) as progress:
        progress.add_task("Extracting text...", total=None)

        try:
            import magic  # type: ignore[import-not-found]

            mime_type = magic.from_buffer(input_bytes, mime=True)
        except ImportError:
            # Without python-magic, sniff for HTML markers and otherwise assume plain text.
            lowered = input_bytes.decode("utf-8", errors="ignore").lower()
            looks_like_html = "<html" in lowered or "<body" in lowered
            mime_type = "text/html" if looks_like_html else "text/plain"

        return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
|
179
|
+
|
180
|
+
|
181
|
+
def _write_output(
    result: ExtractionResult, output: Path | None, show_metadata: bool, output_format: str, verbose: bool
) -> None:
    """Render the extraction result and send it to a file or to stdout.

    Args:
        result: Extraction result to render.
        output: Destination file; when falsy the rendered text is echoed instead.
        show_metadata: Forwarded to the formatter.
        output_format: Forwarded to the formatter.
        verbose: When true, confirm on the console where the file was written.
    """
    rendered = format_extraction_result(result, show_metadata, output_format)

    if not output:
        click.echo(rendered)
        return

    output.write_text(rendered, encoding="utf-8")
    if verbose:
        console.print(f"[green]✓[/green] Output written to: {output}")
|
193
|
+
|
194
|
+
|
195
|
+
def handle_error(error: Exception, verbose: bool) -> None:
    """Report an error on the console and terminate the process.

    Exits with status 2 for a missing dependency and status 1 for every other error.

    Args:
        error: The exception to report.
        verbose: Whether to print extra detail (error context or a traceback).
    """
    exit_code = 1

    if isinstance(error, MissingDependencyError):
        console.print(f"[red]Missing dependency:[/red] {error}", style="bold")
        exit_code = 2
    elif isinstance(error, KreuzbergError):
        console.print(f"[red]Error:[/red] {error}", style="bold")
        if verbose and error.context:
            console.print("\n[dim]Context:[/dim]")
            console.print(json.dumps(error.context, indent=2))
    else:
        console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
        if verbose:
            import traceback

            console.print("\n[dim]Traceback:[/dim]")
            # Prints the exception currently being handled by the caller's ``except`` block.
            traceback.print_exc()

    sys.exit(exit_code)
|
219
|
+
|
220
|
+
|
221
|
+
@click.group(invoke_without_command=True)
@click.version_option(version=__version__, prog_name="kreuzberg")
@click.pass_context
def cli(ctx: click.Context) -> None:
    """Kreuzberg - Text extraction from documents.

    Extract text from PDFs, images, Office documents, and more.
    """
    # The group runs even without a subcommand (invoke_without_command=True);
    # in that case show the help text.
    if ctx.invoked_subcommand is not None:
        return
    click.echo(ctx.get_help())
|
231
|
+
|
232
|
+
|
233
|
+
@cli.command()
# allow_dash=True lets the documented '-' placeholder through; without it Click applies the
# exists=True check to '-' itself and rejects it before stdin can ever be read.
@click.argument("file", type=click.Path(exists=True, allow_dash=True, path_type=Path), required=False)
@click.option("-o", "--output", type=click.Path(path_type=Path), help="Output file path (default: stdout)")
@click.option("--force-ocr", is_flag=True, help="Force OCR processing")
@click.option("--chunk-content", is_flag=True, help="Enable content chunking")
@click.option("--extract-tables", is_flag=True, help="Enable table extraction")
@click.option(
    "--max-chars",
    type=int,
    default=DEFAULT_MAX_CHARACTERS,
    help=f"Maximum characters per chunk (default: {DEFAULT_MAX_CHARACTERS})",
)
@click.option(
    "--max-overlap",
    type=int,
    default=DEFAULT_MAX_OVERLAP,
    help=f"Maximum overlap between chunks (default: {DEFAULT_MAX_OVERLAP})",
)
@click.option(
    "--ocr-backend", type=OcrBackendParamType(), help="OCR backend to use (tesseract, easyocr, paddleocr, none)"
)
@click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
@click.option("--show-metadata", is_flag=True, help="Include metadata in output")
@click.option("--output-format", type=click.Choice(["text", "json"]), default="text", help="Output format")
@click.option("-v", "--verbose", is_flag=True, help="Verbose output for debugging")
@click.option("--tesseract-lang", help="Tesseract language(s) (e.g., 'eng+deu')")
@click.option("--tesseract-psm", type=int, help="Tesseract PSM mode (0-13)")
@click.option("--easyocr-languages", help="EasyOCR language codes (comma-separated, e.g., 'en,de')")
@click.option("--paddleocr-languages", help="PaddleOCR language codes (comma-separated, e.g., 'en,german')")
@click.pass_context
def extract(  # noqa: PLR0913
    ctx: click.Context,  # noqa: ARG001
    file: Path | None,
    output: Path | None,
    force_ocr: bool,
    chunk_content: bool,
    extract_tables: bool,
    max_chars: int,
    max_overlap: int,
    ocr_backend: str | None,
    config: Path | None,
    show_metadata: bool,
    output_format: str,
    verbose: bool,
    tesseract_lang: str | None,
    tesseract_psm: int | None,
    easyocr_languages: str | None,
    paddleocr_languages: str | None,
) -> None:
    """Extract text from a document.

    FILE can be a path to a document or '-' to read from stdin.
    If FILE is omitted, reads from stdin.
    """
    # Pipeline: load file config, collect CLI overrides, merge them, extract, write the output.
    # Every failure is routed through handle_error, which reports it and exits.
    try:
        file_config = _load_config(config, verbose)

        cli_args = _build_cli_args(
            force_ocr,
            chunk_content,
            extract_tables,
            max_chars,
            max_overlap,
            ocr_backend,
            tesseract_lang,
            tesseract_psm,
            easyocr_languages,
            paddleocr_languages,
        )

        extraction_config = build_extraction_config(file_config, cli_args)

        result = _perform_extraction(file, extraction_config, verbose)

        _write_output(result, output, show_metadata, output_format, verbose)

    except Exception as e:  # noqa: BLE001
        handle_error(e, verbose)
|
311
|
+
|
312
|
+
|
313
|
+
@cli.command()
@click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
def config(config: Path | None) -> None:
    """Show current configuration."""
    # Prefer the explicitly given file, otherwise whatever default file can be discovered.
    # With neither available, list the built-in default values instead.
    try:
        config_path = config or find_default_config()

        if not config_path:
            fallback_lines = (
                "[yellow]No configuration file found.[/yellow]",
                "\nDefault configuration values:",
                " force_ocr: False",
                " chunk_content: False",
                " extract_tables: False",
                f" max_chars: {DEFAULT_MAX_CHARACTERS}",
                f" max_overlap: {DEFAULT_MAX_OVERLAP}",
                " ocr_backend: tesseract",
            )
            for line in fallback_lines:
                console.print(line)
            return

        file_config = load_config_from_file(config_path)
        console.print(f"[bold]Configuration from:[/bold] {config_path}")
        console.print(json.dumps(file_config, indent=2))
    except Exception as e:  # noqa: BLE001
        handle_error(e, verbose=False)
|
335
|
+
|
336
|
+
|
337
|
+
if __name__ == "__main__":
|
338
|
+
cli()
|