kreuzberg 3.7.0__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. kreuzberg/_extractors/_base.py +40 -0
  2. kreuzberg/_extractors/_email.py +149 -0
  3. kreuzberg/_extractors/_html.py +15 -3
  4. kreuzberg/_extractors/_image.py +17 -18
  5. kreuzberg/_extractors/_pdf.py +68 -14
  6. kreuzberg/_extractors/_presentation.py +62 -10
  7. kreuzberg/_extractors/_spread_sheet.py +179 -4
  8. kreuzberg/_extractors/_structured.py +148 -0
  9. kreuzberg/_gmft.py +2 -2
  10. kreuzberg/_mime_types.py +27 -1
  11. kreuzberg/_multiprocessing/__init__.py +2 -3
  12. kreuzberg/_ocr/__init__.py +30 -0
  13. kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
  14. kreuzberg/_ocr/_sync.py +566 -0
  15. kreuzberg/_ocr/_tesseract.py +6 -2
  16. kreuzberg/_registry.py +4 -0
  17. kreuzberg/_types.py +131 -0
  18. kreuzberg/_utils/_cache.py +17 -2
  19. kreuzberg/_utils/_process_pool.py +178 -1
  20. kreuzberg/_utils/_quality.py +237 -0
  21. kreuzberg/_utils/_serialization.py +4 -2
  22. kreuzberg/_utils/_string.py +153 -10
  23. kreuzberg/_utils/_sync.py +5 -2
  24. kreuzberg/_utils/_table.py +261 -0
  25. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +66 -50
  26. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/RECORD +29 -28
  27. kreuzberg/_multiprocessing/process_manager.py +0 -189
  28. kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  29. kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  30. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  31. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
  32. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +0 -0
  33. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,566 @@
1
+ """Synchronous OCR implementations for all backends."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import tempfile
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+ from PIL import Image
10
+
11
+ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
12
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
13
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
14
+ from kreuzberg._ocr._tesseract import TesseractConfig
15
+ from kreuzberg._types import ExtractionResult
16
+ from kreuzberg._utils._string import normalize_spaces
17
+ from kreuzberg.exceptions import MissingDependencyError, OCRError
18
+
19
+
20
def _get_easyocr_instance(config: EasyOCRConfig) -> Any:
    """Build an ``easyocr.Reader`` from the given configuration.

    Args:
        config: EasyOCR settings. Attributes are read defensively, so missing
            optional attributes are simply not forwarded to the reader.

    Returns:
        A new ``easyocr.Reader`` instance.

    Raises:
        MissingDependencyError: If the ``easyocr`` package cannot be imported.
    """
    try:
        import easyocr
    except ImportError as e:
        raise MissingDependencyError("EasyOCR is not installed. Install it with: pip install easyocr") from e

    # ``device`` takes precedence over ``use_gpu``: any non-empty device other than "cpu" enables the GPU.
    if hasattr(config, "device"):
        use_gpu = bool(config.device) and config.device.lower() != "cpu"
    elif hasattr(config, "use_gpu"):
        use_gpu = config.use_gpu
    else:
        use_gpu = False

    # Languages may be given as a comma-separated string or as an iterable of codes.
    raw_languages = getattr(config, "language", "en")
    if isinstance(raw_languages, str):
        languages = [code.strip().lower() for code in raw_languages.split(",")]
    else:
        languages = [code.lower() for code in raw_languages]

    options: dict[str, Any] = {"lang_list": languages, "gpu": use_gpu}
    for attr in (
        "model_storage_directory",
        "user_network_directory",
        "recog_network",
        "detector",
        "recognizer",
    ):
        options[attr] = getattr(config, attr, None)
    options["verbose"] = False
    for attr in ("quantize", "cudnn_benchmark"):
        options[attr] = getattr(config, attr, None)

    # Unset (None) options are omitted from the call entirely.
    return easyocr.Reader(**{key: value for key, value in options.items() if value is not None})
56
+
57
+
58
def process_image_easyocr_sync(
    image_path: str | Path,
    config: EasyOCRConfig | None = None,
) -> ExtractionResult:
    """Run EasyOCR synchronously on a single image file.

    Args:
        image_path: Location of the image; handed to the reader as a string.
        config: EasyOCR settings. A default ``EasyOCRConfig`` is used when not provided.

    Returns:
        A plain-text extraction result. Recognised fragments are joined with
        newlines and passed through ``normalize_spaces``; ``metadata`` carries the
        mean confidence under ``"confidence"`` when at least one fragment was kept.

    Raises:
        OCRError: If creating the reader or reading the image fails for any reason.
    """
    cfg = config or EasyOCRConfig()

    try:
        reader = _get_easyocr_instance(cfg)

        options = {
            "decoder": cfg.decoder,
            "beamWidth": cfg.beam_width,
            "batch_size": getattr(cfg, "batch_size", 1),
            "workers": getattr(cfg, "workers", 0),
            "allowlist": getattr(cfg, "allowlist", None),
            "blocklist": getattr(cfg, "blocklist", None),
            "detail": getattr(cfg, "detail", 1),
            "rotation_info": cfg.rotation_info,
            "paragraph": getattr(cfg, "paragraph", False),
            "min_size": cfg.min_size,
            "text_threshold": cfg.text_threshold,
            "low_text": cfg.low_text,
            "link_threshold": cfg.link_threshold,
            "canvas_size": cfg.canvas_size,
            "mag_ratio": cfg.mag_ratio,
            "slope_ths": cfg.slope_ths,
            "ycenter_ths": cfg.ycenter_ths,
            "height_ths": cfg.height_ths,
            "width_ths": cfg.width_ths,
            "add_margin": cfg.add_margin,
            "x_ths": cfg.x_ths,
            "y_ths": cfg.y_ths,
        }

        # Options left as None are not passed to readtext at all.
        detections = reader.readtext(
            str(image_path),
            **{key: value for key, value in options.items() if value is not None},
        )

        if not detections:
            return ExtractionResult(
                content="",
                mime_type=PLAIN_TEXT_MIME_TYPE,
                metadata={},
                chunks=[],
            )

        if getattr(cfg, "detail", 1):
            # Detailed entries are indexed as (bbox, text[, confidence]); entries with
            # fewer than two items are skipped and a missing confidence counts as 1.0.
            usable = [entry for entry in detections if len(entry) >= 2]
            lines = [entry[1] for entry in usable]
            scores = [entry[2] if len(entry) > 2 else 1.0 for entry in usable]
        else:
            # With detail disabled the results are used directly as the text lines.
            lines = detections
            scores = [1.0] * len(lines)

        metadata = {"confidence": sum(scores) / len(scores)} if scores else {}

        return ExtractionResult(
            content=normalize_spaces("\n".join(lines)),
            mime_type=PLAIN_TEXT_MIME_TYPE,
            metadata=metadata,  # type: ignore[arg-type]
            chunks=[],
        )

    except Exception as e:
        raise OCRError(f"EasyOCR processing failed: {e}") from e
148
+
149
+
150
def process_image_bytes_easyocr_sync(
    image_bytes: bytes,
    config: EasyOCRConfig | None = None,
) -> ExtractionResult:
    """Process image bytes with EasyOCR using pure sync implementation.

    The bytes are decoded with Pillow, re-encoded as a temporary PNG file and
    handed to ``process_image_easyocr_sync``. The temporary file is removed
    afterwards, including when decoding, saving or OCR fails.

    Args:
        image_bytes: Image data as bytes.
        config: EasyOCR configuration.

    Returns:
        Extraction result.
    """
    import io

    # Only reserve a unique path here; the handle is closed before the PNG is written.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
        image_path = Path(tmp_image.name)

    try:
        # Decoding/saving happens inside the try so a bad image cannot leak the temp file.
        with Image.open(io.BytesIO(image_bytes)) as image:
            image.save(image_path, format="PNG")
        return process_image_easyocr_sync(image_path, config)
    finally:
        image_path.unlink(missing_ok=True)
176
+
177
+
178
def _get_paddleocr_instance(config: PaddleOCRConfig) -> Any:
    """Build a ``paddleocr.PaddleOCR`` instance from the given configuration.

    Args:
        config: PaddleOCR settings. ``language`` and ``use_angle_cls`` are always
            forwarded; the detection/recognition thresholds are forwarded only
            when the config object defines them.

    Returns:
        A new ``paddleocr.PaddleOCR`` instance.

    Raises:
        MissingDependencyError: If the ``paddleocr`` package cannot be imported.
    """
    try:
        import paddleocr
    except ImportError as e:
        raise MissingDependencyError("PaddleOCR is not installed. Install it with: pip install paddleocr") from e

    # NOTE(review): ``config.device`` / ``config.use_gpu`` are not forwarded to PaddleOCR.
    # The previous code inspected them in branches that only executed ``pass``; those
    # no-op branches were removed. Device selection still needs a real mapping here.

    kwargs: dict[str, Any] = {
        "lang": config.language,
        "use_textline_orientation": config.use_angle_cls,
    }

    # Config attribute name -> PaddleOCR constructor keyword.
    optional_keywords = {
        "det_db_thresh": "text_det_thresh",
        "det_db_box_thresh": "text_det_box_thresh",
        "det_db_unclip_ratio": "text_det_unclip_ratio",
        "det_max_side_len": "text_det_limit_side_len",
        "drop_score": "text_rec_score_thresh",
    }
    for attr, keyword in optional_keywords.items():
        if hasattr(config, attr):
            kwargs[keyword] = getattr(config, attr)

    return paddleocr.PaddleOCR(**kwargs)
208
+
209
+
210
def process_image_paddleocr_sync(
    image_path: str | Path,
    config: PaddleOCRConfig | None = None,
) -> ExtractionResult:
    """Run PaddleOCR synchronously on a single image file.

    Args:
        image_path: Location of the image; handed to PaddleOCR as a string.
        config: PaddleOCR settings. A default ``PaddleOCRConfig`` is used when not provided.

    Returns:
        A plain-text extraction result. Recognised lines are joined with newlines
        and passed through ``normalize_spaces``; ``metadata`` carries the mean score
        under ``"confidence"`` when scores are present.

    Raises:
        OCRError: If creating the engine or running OCR fails for any reason.
    """
    cfg = config or PaddleOCRConfig()

    try:
        engine = _get_paddleocr_instance(cfg)
        pages = engine.ocr(str(image_path))

        # Only the first entry of the OCR output is used.
        first_page = pages[0] if pages else None
        if not first_page:
            return ExtractionResult(
                content="",
                mime_type=PLAIN_TEXT_MIME_TYPE,
                metadata={},
                chunks=[],
            )

        payload = first_page.json["res"]
        lines = payload.get("rec_texts", [])
        confidences = payload.get("rec_scores", [])

        if not lines:
            return ExtractionResult(
                content="",
                mime_type=PLAIN_TEXT_MIME_TYPE,
                metadata={},
                chunks=[],
            )

        metadata = {"confidence": sum(confidences) / len(confidences)} if confidences else {}

        return ExtractionResult(
            content=normalize_spaces("\n".join(lines)),
            mime_type=PLAIN_TEXT_MIME_TYPE,
            metadata=metadata,  # type: ignore[arg-type]
            chunks=[],
        )

    except Exception as e:
        raise OCRError(f"PaddleOCR processing failed: {e}") from e
270
+
271
+
272
def process_image_bytes_paddleocr_sync(
    image_bytes: bytes,
    config: PaddleOCRConfig | None = None,
) -> ExtractionResult:
    """Process image bytes with PaddleOCR using pure sync implementation.

    The bytes are decoded with Pillow, re-encoded as a temporary PNG file and
    handed to ``process_image_paddleocr_sync``. The temporary file is removed
    afterwards, including when decoding, saving or OCR fails.

    Args:
        image_bytes: Image data as bytes.
        config: PaddleOCR configuration.

    Returns:
        Extraction result.
    """
    import io

    # Only reserve a unique path here; the handle is closed before the PNG is written.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
        image_path = Path(tmp_image.name)

    try:
        # Decoding/saving happens inside the try so a bad image cannot leak the temp file.
        with Image.open(io.BytesIO(image_bytes)) as image:
            image.save(image_path, format="PNG")
        return process_image_paddleocr_sync(image_path, config)
    finally:
        image_path.unlink(missing_ok=True)
298
+
299
+
300
def process_image_tesseract_sync(
    image_path: str | Path,
    config: TesseractConfig | None = None,
) -> ExtractionResult:
    """Process an image with Tesseract using pure sync implementation.

    Invokes the ``tesseract`` executable in a subprocess, lets it write its text
    output to a temporary ``.txt`` file, reads that file back and removes it.

    Args:
        image_path: Path to the image file.
        config: Tesseract configuration. A default ``TesseractConfig`` is used when not provided.

    Returns:
        Extraction result whose content is the recognised text after ``normalize_spaces``.

    Raises:
        OCRError: If Tesseract exits with a non-zero return code.
    """
    import os
    import subprocess

    cfg = config or TesseractConfig()

    # Reserve a unique "<base>.txt" path. The command receives "<base>" and the
    # result is read back from "<base>.txt".
    with tempfile.NamedTemporaryFile(suffix=".txt", delete=False) as tmp_file:
        output_path = Path(tmp_file.name)
    # Strip only the trailing suffix; a plain str.replace would also mangle any
    # ".txt" occurring earlier in the path (e.g. inside the temp directory name).
    output_base = str(output_path.with_suffix(""))

    try:
        command = [
            "tesseract",
            str(image_path),
            output_base,
            "-l",
            cfg.language,
            "--psm",
            str(cfg.psm.value if hasattr(cfg.psm, "value") else cfg.psm),
            "--oem",
            "1",
            "--loglevel",
            "OFF",
        ]

        boolean_fields = [
            "classify_use_pre_adapted_templates",
            "language_model_ngram_on",
            "tessedit_dont_blkrej_good_wds",
            "tessedit_dont_rowrej_good_wds",
            "tessedit_enable_dict_correction",
            "tessedit_use_primary_params_model",
            "textord_space_size_is_variable",
            "thresholding_method",
        ]

        # Each flag present on the config is passed as "-c name=1" or "-c name=0".
        for field in boolean_fields:
            if hasattr(cfg, field):
                value = 1 if getattr(cfg, field) else 0
                command.extend(["-c", f"{field}={value}"])

        env = os.environ.copy()
        env["OMP_THREAD_LIMIT"] = "1"

        result = subprocess.run(
            command,
            check=False,
            env=env,
            capture_output=True,
            text=True,
            timeout=30,
        )

        if result.returncode != 0:
            raise OCRError(f"Tesseract failed with return code {result.returncode}: {result.stderr}")

        text = normalize_spaces(output_path.read_text(encoding="utf-8"))

        return ExtractionResult(
            content=text,
            mime_type=PLAIN_TEXT_MIME_TYPE,
            metadata={},
            chunks=[],
        )

    finally:
        output_path.unlink(missing_ok=True)
388
+
389
+
390
def process_image_bytes_tesseract_sync(
    image_bytes: bytes,
    config: TesseractConfig | None = None,
) -> ExtractionResult:
    """Process image bytes with Tesseract using pure sync implementation.

    The bytes are decoded with Pillow, re-encoded as a temporary PNG file and
    handed to ``process_image_tesseract_sync``. The temporary file is removed
    afterwards, including when decoding, saving or OCR fails.

    Args:
        image_bytes: Image data as bytes.
        config: Tesseract configuration.

    Returns:
        Extraction result.
    """
    import io

    # Only reserve a unique path here; the handle is closed before the PNG is written.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
        image_path = Path(tmp_image.name)

    try:
        # Decoding/saving happens inside the try so a bad image cannot leak the temp file.
        with Image.open(io.BytesIO(image_bytes)) as image:
            image.save(image_path, format="PNG")
        return process_image_tesseract_sync(image_path, config)
    finally:
        image_path.unlink(missing_ok=True)
416
+
417
+
418
def process_batch_images_sync(
    image_paths: list[str | Path],
    config: EasyOCRConfig | PaddleOCRConfig | TesseractConfig | None = None,
    backend: str = "tesseract",
) -> list[ExtractionResult]:
    """Run OCR over several images one after another.

    Args:
        image_paths: Image files to process, in order.
        config: Configuration passed unchanged to the selected backend function.
        backend: ``"easyocr"`` or ``"paddleocr"``; any other value selects Tesseract.

    Returns:
        One extraction result per input path, in input order.
    """

    def _ocr_one(path: str | Path) -> ExtractionResult:
        # The backend is resolved per image; unknown names fall through to Tesseract.
        if backend == "easyocr":
            return process_image_easyocr_sync(path, config)  # type: ignore[arg-type]
        if backend == "paddleocr":
            return process_image_paddleocr_sync(path, config)  # type: ignore[arg-type]
        return process_image_tesseract_sync(path, config)  # type: ignore[arg-type]

    return [_ocr_one(path) for path in image_paths]
443
+
444
+
445
def process_batch_images_threaded(
    image_paths: list[str | Path],
    config: EasyOCRConfig | PaddleOCRConfig | TesseractConfig | None = None,
    backend: str = "tesseract",
    max_workers: int | None = None,
) -> list[ExtractionResult]:
    """Process a batch of images using threading.

    A failure on one image does not abort the batch: the exception is captured
    and reported as an ``ExtractionResult`` whose content starts with ``"Error: "``
    and whose metadata holds the message under ``"error"``.

    Args:
        image_paths: List of image file paths.
        config: OCR configuration, passed unchanged to the selected backend function.
        backend: ``"easyocr"`` or ``"paddleocr"``; any other value selects Tesseract.
        max_workers: Maximum number of threads. Defaults to the smaller of the
            number of images and the CPU count.

    Returns:
        List of extraction results in same order as input. Empty for an empty input.
    """
    import multiprocessing as mp
    from concurrent.futures import ThreadPoolExecutor, as_completed

    # An empty batch would otherwise yield max_workers == 0, which the executor rejects.
    if not image_paths:
        return []

    if max_workers is None:
        max_workers = min(len(image_paths), mp.cpu_count())

    # Pick the backend function once instead of repeating the submit logic per backend.
    worker: Any
    if backend == "easyocr":
        worker = process_image_easyocr_sync
    elif backend == "paddleocr":
        worker = process_image_paddleocr_sync
    else:
        worker = process_image_tesseract_sync

    results: list[ExtractionResult] = [None] * len(image_paths)  # type: ignore[list-item]

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_index = {executor.submit(worker, path, config): i for i, path in enumerate(image_paths)}

        # Futures complete in arbitrary order; the stored index restores input order.
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                results[index] = future.result()
            except Exception as e:  # noqa: BLE001
                results[index] = ExtractionResult(
                    content=f"Error: {e}",
                    mime_type=PLAIN_TEXT_MIME_TYPE,
                    metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
                    chunks=[],
                )

    return results
499
+
500
+
501
def process_batch_images_process_pool(
    image_paths: list[str | Path],
    config: TesseractConfig | None = None,
    max_workers: int | None = None,
) -> list[ExtractionResult]:
    """Process a batch of images using process pool.

    A failure on one image does not abort the batch: it is reported as an
    ``ExtractionResult`` whose content starts with ``"Error: "`` and whose
    metadata holds the message under ``"error"``.

    Args:
        image_paths: List of image file paths.
        config: Tesseract configuration. A default ``TesseractConfig`` is used when not provided.
        max_workers: Maximum number of processes. Defaults to the smaller of the
            number of images and the CPU count.

    Returns:
        List of extraction results in same order as input. Empty for an empty input.
    """
    import multiprocessing as mp
    from concurrent.futures import ProcessPoolExecutor, as_completed

    # An empty batch would otherwise yield max_workers == 0, which the executor rejects.
    if not image_paths:
        return []

    if max_workers is None:
        max_workers = min(len(image_paths), mp.cpu_count())

    # Flatten the config into a plain dict for the worker processes; values that
    # expose a ``.value`` attribute are replaced by that value.
    cfg = config or TesseractConfig()
    config_dict = {}
    for field_name in cfg.__dataclass_fields__:
        value = getattr(cfg, field_name)
        config_dict[field_name] = value.value if hasattr(value, "value") else value

    results: list[ExtractionResult] = [None] * len(image_paths)  # type: ignore[list-item]

    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        from kreuzberg._ocr._pool import _process_image_with_tesseract

        future_to_index = {
            executor.submit(_process_image_with_tesseract, str(path), config_dict): i
            for i, path in enumerate(image_paths)
        }

        # Futures complete in arbitrary order; the stored index restores input order.
        for future in as_completed(future_to_index):
            index = future_to_index[future]
            try:
                result_dict = future.result()
                if result_dict["success"]:
                    results[index] = ExtractionResult(
                        content=result_dict["text"],
                        mime_type=PLAIN_TEXT_MIME_TYPE,
                        metadata={},
                        chunks=[],
                    )
                else:
                    results[index] = ExtractionResult(
                        content=f"Error: {result_dict['error']}",
                        mime_type=PLAIN_TEXT_MIME_TYPE,
                        metadata={"error": result_dict["error"]},  # type: ignore[typeddict-unknown-key]
                        chunks=[],
                    )
            except Exception as e:  # noqa: BLE001
                results[index] = ExtractionResult(
                    content=f"Error: {e}",
                    mime_type=PLAIN_TEXT_MIME_TYPE,
                    metadata={"error": str(e)},  # type: ignore[typeddict-unknown-key]
                    chunks=[],
                )

    return results
@@ -206,7 +206,7 @@ class TesseractConfig:
206
206
  """Enable or disable the use of n-gram-based language models for improved text recognition.
207
207
 
208
208
  Default is False for optimal performance on modern documents. Enable for degraded or historical text."""
209
- psm: PSMMode = PSMMode.AUTO_ONLY
209
+ psm: PSMMode = PSMMode.AUTO
210
210
  """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
211
211
  tessedit_dont_blkrej_good_wds: bool = True
212
212
  """If True, prevents block rejection of words identified as good, improving text output quality."""
@@ -345,7 +345,11 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
345
345
  "OFF",
346
346
  ]
347
347
  for kwarg, value in kwargs.items():
348
- command.extend(["-c", f"{kwarg}={1 if value else 0}"])
348
+ if isinstance(value, bool):
349
+ command.extend(["-c", f"{kwarg}={1 if value else 0}"])
350
+ else:
351
+ # Handle string parameters (like tessedit_char_whitelist)
352
+ command.extend(["-c", f"{kwarg}={value}"])
349
353
 
350
354
  env: dict[str, Any] | None = None
351
355
  if sys.platform.startswith("linux"):
kreuzberg/_registry.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
3
3
  from functools import lru_cache
4
4
  from typing import TYPE_CHECKING, ClassVar
5
5
 
6
+ from kreuzberg._extractors._email import EmailExtractor
6
7
  from kreuzberg._extractors._html import HTMLExtractor
7
8
  from kreuzberg._extractors._image import ImageExtractor
8
9
  from kreuzberg._extractors._pandoc import (
@@ -19,6 +20,7 @@ from kreuzberg._extractors._pandoc import (
19
20
  from kreuzberg._extractors._pdf import PDFExtractor
20
21
  from kreuzberg._extractors._presentation import PresentationExtractor
21
22
  from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor
23
+ from kreuzberg._extractors._structured import StructuredDataExtractor
22
24
 
23
25
  if TYPE_CHECKING:
24
26
  from kreuzberg._extractors._base import Extractor
@@ -40,6 +42,8 @@ class ExtractorRegistry:
40
42
  PresentationExtractor,
41
43
  SpreadSheetExtractor,
42
44
  HTMLExtractor,
45
+ EmailExtractor,
46
+ StructuredDataExtractor,
43
47
  MarkdownExtractor,
44
48
  ImageExtractor,
45
49
  BibliographyExtractor,