kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. kreuzberg/__init__.py +3 -0
  2. kreuzberg/__main__.py +8 -0
  3. kreuzberg/_api/__init__.py +0 -0
  4. kreuzberg/_api/main.py +87 -0
  5. kreuzberg/_cli_config.py +175 -0
  6. kreuzberg/_extractors/_image.py +39 -4
  7. kreuzberg/_extractors/_pandoc.py +158 -18
  8. kreuzberg/_extractors/_pdf.py +199 -19
  9. kreuzberg/_extractors/_presentation.py +1 -1
  10. kreuzberg/_extractors/_spread_sheet.py +65 -7
  11. kreuzberg/_gmft.py +222 -16
  12. kreuzberg/_mime_types.py +62 -16
  13. kreuzberg/_multiprocessing/__init__.py +6 -0
  14. kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  15. kreuzberg/_multiprocessing/process_manager.py +188 -0
  16. kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  17. kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  18. kreuzberg/_ocr/_easyocr.py +6 -12
  19. kreuzberg/_ocr/_paddleocr.py +15 -13
  20. kreuzberg/_ocr/_tesseract.py +136 -46
  21. kreuzberg/_playa.py +43 -0
  22. kreuzberg/_types.py +4 -0
  23. kreuzberg/_utils/_cache.py +372 -0
  24. kreuzberg/_utils/_device.py +10 -27
  25. kreuzberg/_utils/_document_cache.py +220 -0
  26. kreuzberg/_utils/_errors.py +232 -0
  27. kreuzberg/_utils/_pdf_lock.py +72 -0
  28. kreuzberg/_utils/_process_pool.py +100 -0
  29. kreuzberg/_utils/_serialization.py +82 -0
  30. kreuzberg/_utils/_string.py +1 -1
  31. kreuzberg/_utils/_sync.py +21 -0
  32. kreuzberg/cli.py +338 -0
  33. kreuzberg/extraction.py +247 -36
  34. kreuzberg-3.4.0.dist-info/METADATA +290 -0
  35. kreuzberg-3.4.0.dist-info/RECORD +50 -0
  36. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
  37. kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
  38. kreuzberg-3.2.0.dist-info/METADATA +0 -166
  39. kreuzberg-3.2.0.dist-info/RECORD +0 -34
  40. kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
  41. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -20,7 +20,7 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
20
20
 
21
21
  encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
22
22
 
23
- for enc in [e for e in encodings if e]: # pragma: no cover
23
+ for enc in [e for e in encodings if e]:
24
24
  with suppress(UnicodeDecodeError, LookupError):
25
25
  return byte_data.decode(enc)
26
26
 
kreuzberg/_utils/_sync.py CHANGED
@@ -119,3 +119,24 @@ def run_maybe_async(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs:
119
119
  T: The return value of the executed function, resolved if asynchronous.
120
120
  """
121
121
  return cast("T", fn(*args, **kwargs) if not iscoroutinefunction(fn) else anyio.run(partial(fn, **kwargs), *args))
122
+
123
+
124
def run_sync_only(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
    """Run a synchronous callable and return its result; reject coroutine functions.

    This is used for pure sync code paths where we cannot handle async functions.

    Args:
        fn: The function to be executed, must be synchronous.
        *args: Positional arguments to be passed to the function.
        **kwargs: Keyword arguments to be passed to the function.

    Returns:
        T: The return value of the executed function.

    Raises:
        RuntimeError: If the function is asynchronous.
    """
    if iscoroutinefunction(fn):
        # Not every callable has ``__name__`` (e.g. ``functools.partial`` objects or
        # callable instances). Fall back to ``repr`` so the documented RuntimeError is
        # raised instead of an unrelated AttributeError.
        fn_name = getattr(fn, "__name__", None) or repr(fn)
        raise RuntimeError(f"Cannot run async function {fn_name} in sync-only context")
    return cast("T", fn(*args, **kwargs))
kreuzberg/cli.py ADDED
@@ -0,0 +1,338 @@
1
+ """Command-line interface for kreuzberg."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import TYPE_CHECKING, Any
9
+
10
+ try:
11
+ import click
12
+ from rich.console import Console
13
+ from rich.progress import Progress, SpinnerColumn, TextColumn
14
+ except ImportError as e:
15
+ raise ImportError(
16
+ "CLI dependencies are not installed. Please install kreuzberg with the 'cli' extra: pip install kreuzberg[cli]"
17
+ ) from e
18
+
19
+ from kreuzberg import __version__, extract_bytes_sync, extract_file_sync
20
+ from kreuzberg._cli_config import build_extraction_config, find_default_config, load_config_from_file
21
+ from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
22
+
23
# Defaults for the ``--max-chars`` / ``--max-overlap`` options of the ``extract`` command.
# ``_build_cli_args`` also compares against them to decide whether the option was left
# at its default (in which case it forwards ``None``).
DEFAULT_MAX_CHARACTERS = 4000
DEFAULT_MAX_OVERLAP = 200

if TYPE_CHECKING:
    from kreuzberg._types import ExtractionConfig, ExtractionResult

# Shared Rich console for status, progress and error messages; it writes to stderr.
console = Console(stderr=True)
30
+
31
+
32
class OcrBackendParamType(click.ParamType):
    """Click parameter type that validates and lower-cases an OCR backend name."""

    name = "ocr_backend"

    def convert(self, value: Any, param: click.Parameter | None, ctx: click.Context | None) -> str | None:
        """Return the lower-cased backend name, or ``None`` when no value was given.

        Args:
            value: Raw option value supplied by click.
            param: The click parameter being converted, if any.
            ctx: The active click context, if any.

        Returns:
            ``None`` for a missing value, otherwise one of ``tesseract``, ``easyocr``,
            ``paddleocr`` or ``none``.
        """
        if value is None:
            return None

        normalized = value.lower()
        if normalized == "none":
            return "none"

        valid_backends = ["tesseract", "easyocr", "paddleocr", "none"]
        if normalized not in valid_backends:
            # ``fail`` reports the problem through click's usage-error mechanism.
            self.fail(f"Invalid OCR backend '{value}'. Choose from: {', '.join(valid_backends)}", param, ctx)
        return normalized  # type: ignore[no-any-return]
47
+
48
+
49
def format_extraction_result(result: ExtractionResult, show_metadata: bool, output_format: str) -> str:
    """Render an extraction result as plain text or as a JSON document.

    Args:
        result: Extraction result to format.
        show_metadata: Whether to include the result's metadata.
        output_format: ``"json"`` for a JSON document; any other value yields text.

    Returns:
        The formatted string. JSON output contains ``content`` and ``mime_type`` plus,
        when present/requested, ``metadata``, ``tables`` and ``chunks``. Text output is
        the content followed by optional metadata and table sections.
    """
    if output_format != "json":
        sections = [result.content]

        if show_metadata:
            sections += [
                "\n\n--- METADATA ---",
                json.dumps(result.metadata, indent=2, ensure_ascii=False),
            ]

        if result.tables:
            sections.append("\n\n--- TABLES ---")
            for number, table in enumerate(result.tables, start=1):
                sections.append(f"\nTable {number}:")
                sections.append(json.dumps(table, indent=2, ensure_ascii=False))

        return "\n".join(sections)

    payload: dict[str, Any] = {"content": result.content, "mime_type": result.mime_type}
    if show_metadata:
        payload["metadata"] = result.metadata
    # Tables and chunks are only emitted when non-empty.
    if result.tables:
        payload["tables"] = result.tables
    if result.chunks:
        payload["chunks"] = result.chunks
    return json.dumps(payload, indent=2, ensure_ascii=False)
86
+
87
+
88
def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
    """Load configuration from an explicit file, or fall back to a discovered default.

    Args:
        config: Configuration file given on the command line, or ``None`` to look for
            a default file via ``find_default_config``.
        verbose: Whether to report which default configuration file was used.

    Returns:
        The loaded configuration mapping. Empty when no file was given or found, or
        when the discovered default file could not be loaded.

    Raises:
        Exception: Whatever ``load_config_from_file`` raises for an explicitly
            requested ``config`` file; only failures of the default file are tolerated.
    """
    if config:
        return load_config_from_file(config)

    default_config = find_default_config()
    if not default_config:
        return {}

    try:
        file_config = load_config_from_file(default_config)
    except Exception as exc:  # noqa: BLE001
        # Stay best-effort (an implicit config file must not break the CLI), but tell
        # the user that their file was skipped instead of silently using defaults.
        console.print("[yellow]Warning:[/yellow] Ignoring default configuration file (failed to load):")
        # markup=False: the path / error text may contain brackets (e.g. TOML section
        # names) that Rich would otherwise try to interpret as markup tags.
        console.print(f"  {default_config}: {exc}", markup=False)
        return {}

    if verbose:
        console.print(f"[dim]Using configuration from: {default_config}[/dim]")
    return file_config
103
+
104
+
105
def _build_cli_args(
    force_ocr: bool,
    chunk_content: bool,
    extract_tables: bool,
    max_chars: int,
    max_overlap: int,
    ocr_backend: str | None,
    tesseract_lang: str | None,
    tesseract_psm: int | None,
    easyocr_languages: str | None,
    paddleocr_languages: str | None,
) -> dict[str, Any]:
    """Build the dictionary of command-line overrides.

    Flags that were not set, and ``max_chars`` / ``max_overlap`` values equal to the
    module defaults, are stored as ``None``. Backend-specific options are only added
    for the backend selected via ``ocr_backend``.

    Args:
        force_ocr: Value of ``--force-ocr``.
        chunk_content: Value of ``--chunk-content``.
        extract_tables: Value of ``--extract-tables``.
        max_chars: Value of ``--max-chars``.
        max_overlap: Value of ``--max-overlap``.
        ocr_backend: Selected OCR backend name, or ``None``.
        tesseract_lang: Tesseract language string (e.g. ``"eng+deu"``), or ``None``.
        tesseract_psm: Tesseract PSM mode, or ``None``.
        easyocr_languages: Comma-separated EasyOCR language codes, or ``None``.
        paddleocr_languages: Comma-separated PaddleOCR language codes, or ``None``.

    Returns:
        Mapping of option names to values, plus at most one of ``tesseract_config``,
        ``easyocr_config`` or ``paddleocr_config``.
    """

    def split_languages(raw: str) -> list[str]:
        # Tolerate "en, de" and stray/trailing commas: surrounding whitespace is
        # stripped and empty entries are dropped so no blank language code is passed on.
        return [code for code in (part.strip() for part in raw.split(",")) if code]

    cli_args: dict[str, Any] = {
        "force_ocr": force_ocr or None,
        "chunk_content": chunk_content or None,
        "extract_tables": extract_tables or None,
        "max_chars": max_chars if max_chars != DEFAULT_MAX_CHARACTERS else None,
        "max_overlap": max_overlap if max_overlap != DEFAULT_MAX_OVERLAP else None,
        "ocr_backend": ocr_backend,
    }

    if ocr_backend == "tesseract" and (tesseract_lang or tesseract_psm is not None):
        tesseract_config: dict[str, Any] = {}
        if tesseract_lang:
            tesseract_config["language"] = tesseract_lang
        if tesseract_psm is not None:
            # ``is not None`` on purpose: PSM mode 0 is a valid value.
            tesseract_config["psm"] = tesseract_psm
        cli_args["tesseract_config"] = tesseract_config
    elif ocr_backend == "easyocr" and easyocr_languages:
        languages = split_languages(easyocr_languages)
        if languages:
            cli_args["easyocr_config"] = {"languages": languages}
    elif ocr_backend == "paddleocr" and paddleocr_languages:
        languages = split_languages(paddleocr_languages)
        if languages:
            cli_args["paddleocr_config"] = {"languages": languages}

    return cli_args
140
+
141
+
142
def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig, verbose: bool) -> ExtractionResult:
    """Extract text from ``file``, or from stdin when no real path is given.

    Stdin is used when ``file`` is ``None`` or is a ``Path`` named ``-``. A spinner is
    shown on the shared console while extraction runs.

    Args:
        file: Path of the document to extract, ``None``, or a path named ``-``.
        extraction_config: Configuration forwarded to the extraction functions.
        verbose: Whether to announce that input is being read from stdin.

    Returns:
        The result returned by ``extract_file_sync`` or ``extract_bytes_sync``.
    """
    if file is not None and not (isinstance(file, Path) and file.name == "-"):
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            console=console,
            transient=True,
        ) as spinner:
            spinner.add_task(f"Extracting text from {file.name}...", total=None)
            return extract_file_sync(str(file), config=extraction_config)

    if verbose:
        console.print("[dim]Reading from stdin...[/dim]")

    try:
        input_bytes = sys.stdin.buffer.read()
    except Exception:  # noqa: BLE001
        # Fall back to the text interface when the binary buffer cannot be read.
        input_bytes = sys.stdin.read().encode("utf-8")

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        console=console,
        transient=True,
    ) as spinner:
        spinner.add_task("Extracting text...", total=None)

        try:
            import magic  # type: ignore[import-not-found]

            mime_type = magic.from_buffer(input_bytes, mime=True)
        except ImportError:
            # Without ``magic``, sniff for HTML markers and otherwise assume plain text.
            lowered = input_bytes.decode("utf-8", errors="ignore").lower()
            looks_like_html = "<html" in lowered or "<body" in lowered
            mime_type = "text/html" if looks_like_html else "text/plain"

        return extract_bytes_sync(input_bytes, mime_type, config=extraction_config)
179
+
180
+
181
def _write_output(
    result: ExtractionResult, output: Path | None, show_metadata: bool, output_format: str, verbose: bool
) -> None:
    """Format the extraction result and write it to a file or to stdout.

    Args:
        result: Extraction result to emit.
        output: Destination file; when falsy the text is echoed via ``click.echo``.
        show_metadata: Forwarded to ``format_extraction_result``.
        output_format: Forwarded to ``format_extraction_result``.
        verbose: Whether to confirm on the console that the file was written.
    """
    rendered = format_extraction_result(result, show_metadata, output_format)

    if not output:
        click.echo(rendered)
        return

    output.write_text(rendered, encoding="utf-8")
    if verbose:
        console.print(f"[green]✓[/green] Output written to: {output}")
193
+
194
+
195
def handle_error(error: Exception, verbose: bool) -> None:
    """Report an error on the console and terminate the process.

    Exits with status 2 for ``MissingDependencyError`` and status 1 for every other
    exception.

    Args:
        error: The exception to handle.
        verbose: Whether to print extra detail: the error context for
            ``KreuzbergError``, or a traceback for unexpected exceptions.
    """
    exit_code = 1

    if isinstance(error, MissingDependencyError):
        console.print(f"[red]Missing dependency:[/red] {error}", style="bold")
        exit_code = 2
    elif isinstance(error, KreuzbergError):
        console.print(f"[red]Error:[/red] {error}", style="bold")
        if verbose and error.context:
            console.print("\n[dim]Context:[/dim]")
            console.print(json.dumps(error.context, indent=2))
    else:
        console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
        if verbose:
            import traceback

            console.print("\n[dim]Traceback:[/dim]")
            # Prints the exception currently being handled by the caller.
            traceback.print_exc()

    sys.exit(exit_code)
219
+
220
+
221
@click.group(invoke_without_command=True)
@click.version_option(version=__version__, prog_name="kreuzberg")
@click.pass_context
def cli(ctx: click.Context) -> None:
    """Kreuzberg - Text extraction from documents.

    Extract text from PDFs, images, Office documents, and more.
    """
    # The docstring above is the user-facing help text, so it is kept verbatim.
    # When a subcommand was given there is nothing to do here.
    if ctx.invoked_subcommand is not None:
        return
    # Invoked bare: show the group help instead of doing nothing.
    click.echo(ctx.get_help())
231
+
232
+
233
@cli.command()
# allow_dash=True lets click accept the literal "-" documented in the help text below;
# without it, ``exists=True`` rejects "-" as a non-existent path and the stdin branch
# for "-" in ``_perform_extraction`` can never be reached.
@click.argument("file", type=click.Path(exists=True, allow_dash=True, path_type=Path), required=False)
@click.option("-o", "--output", type=click.Path(path_type=Path), help="Output file path (default: stdout)")
@click.option("--force-ocr", is_flag=True, help="Force OCR processing")
@click.option("--chunk-content", is_flag=True, help="Enable content chunking")
@click.option("--extract-tables", is_flag=True, help="Enable table extraction")
@click.option(
    "--max-chars",
    type=int,
    default=DEFAULT_MAX_CHARACTERS,
    help=f"Maximum characters per chunk (default: {DEFAULT_MAX_CHARACTERS})",
)
@click.option(
    "--max-overlap",
    type=int,
    default=DEFAULT_MAX_OVERLAP,
    help=f"Maximum overlap between chunks (default: {DEFAULT_MAX_OVERLAP})",
)
@click.option(
    "--ocr-backend", type=OcrBackendParamType(), help="OCR backend to use (tesseract, easyocr, paddleocr, none)"
)
@click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
@click.option("--show-metadata", is_flag=True, help="Include metadata in output")
@click.option("--output-format", type=click.Choice(["text", "json"]), default="text", help="Output format")
@click.option("-v", "--verbose", is_flag=True, help="Verbose output for debugging")
@click.option("--tesseract-lang", help="Tesseract language(s) (e.g., 'eng+deu')")
@click.option("--tesseract-psm", type=int, help="Tesseract PSM mode (0-13)")
@click.option("--easyocr-languages", help="EasyOCR language codes (comma-separated, e.g., 'en,de')")
@click.option("--paddleocr-languages", help="PaddleOCR language codes (comma-separated, e.g., 'en,german')")
@click.pass_context
def extract(  # noqa: PLR0913
    ctx: click.Context,  # noqa: ARG001
    file: Path | None,
    output: Path | None,
    force_ocr: bool,
    chunk_content: bool,
    extract_tables: bool,
    max_chars: int,
    max_overlap: int,
    ocr_backend: str | None,
    config: Path | None,
    show_metadata: bool,
    output_format: str,
    verbose: bool,
    tesseract_lang: str | None,
    tesseract_psm: int | None,
    easyocr_languages: str | None,
    paddleocr_languages: str | None,
) -> None:
    """Extract text from a document.

    FILE can be a path to a document or '-' to read from stdin.
    If FILE is omitted, reads from stdin.
    """
    # Pipeline: load file config -> collect CLI overrides -> build the extraction
    # config -> extract -> write. Any failure is routed through ``handle_error``,
    # which prints a message and exits with a non-zero status.
    try:
        file_config = _load_config(config, verbose)

        cli_args = _build_cli_args(
            force_ocr,
            chunk_content,
            extract_tables,
            max_chars,
            max_overlap,
            ocr_backend,
            tesseract_lang,
            tesseract_psm,
            easyocr_languages,
            paddleocr_languages,
        )

        extraction_config = build_extraction_config(file_config, cli_args)

        result = _perform_extraction(file, extraction_config, verbose)

        _write_output(result, output, show_metadata, output_format, verbose)

    except Exception as e:  # noqa: BLE001
        handle_error(e, verbose)
311
+
312
+
313
@cli.command()
@click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
def config(config: Path | None) -> None:
    """Show current configuration."""
    # The docstring above is the user-facing help text, so it is kept verbatim.
    try:
        # Prefer the explicitly given file; otherwise look for a default one.
        config_path = config or find_default_config()

        if not config_path:
            console.print("[yellow]No configuration file found.[/yellow]")
            console.print("\nDefault configuration values:")
            for line in (
                " force_ocr: False",
                " chunk_content: False",
                " extract_tables: False",
                f" max_chars: {DEFAULT_MAX_CHARACTERS}",
                f" max_overlap: {DEFAULT_MAX_OVERLAP}",
                " ocr_backend: tesseract",
            ):
                console.print(line)
            return

        file_config = load_config_from_file(config_path)
        console.print(f"[bold]Configuration from:[/bold] {config_path}")
        console.print(json.dumps(file_config, indent=2))
    except Exception as e:  # noqa: BLE001
        handle_error(e, verbose=False)
335
+
336
+
337
# Run the click command group when this module is executed directly as a script.
if __name__ == "__main__":
    cli()