kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. kreuzberg/__init__.py +9 -2
  2. kreuzberg/_api/__init__.py +0 -0
  3. kreuzberg/_api/main.py +87 -0
  4. kreuzberg/_entity_extraction.py +238 -0
  5. kreuzberg/_extractors/_base.py +39 -1
  6. kreuzberg/_extractors/_email.py +149 -0
  7. kreuzberg/_extractors/_html.py +15 -3
  8. kreuzberg/_extractors/_image.py +27 -22
  9. kreuzberg/_extractors/_pandoc.py +3 -14
  10. kreuzberg/_extractors/_pdf.py +97 -34
  11. kreuzberg/_extractors/_presentation.py +62 -10
  12. kreuzberg/_extractors/_spread_sheet.py +181 -6
  13. kreuzberg/_extractors/_structured.py +148 -0
  14. kreuzberg/_gmft.py +318 -11
  15. kreuzberg/_language_detection.py +95 -0
  16. kreuzberg/_mcp/__init__.py +5 -0
  17. kreuzberg/_mcp/server.py +227 -0
  18. kreuzberg/_mime_types.py +27 -1
  19. kreuzberg/_ocr/__init__.py +10 -1
  20. kreuzberg/_ocr/_base.py +59 -0
  21. kreuzberg/_ocr/_easyocr.py +92 -1
  22. kreuzberg/_ocr/_paddleocr.py +89 -0
  23. kreuzberg/_ocr/_tesseract.py +569 -5
  24. kreuzberg/_registry.py +4 -0
  25. kreuzberg/_types.py +181 -4
  26. kreuzberg/_utils/_cache.py +52 -4
  27. kreuzberg/_utils/_device.py +2 -2
  28. kreuzberg/_utils/_errors.py +3 -7
  29. kreuzberg/_utils/_process_pool.py +182 -9
  30. kreuzberg/_utils/_quality.py +237 -0
  31. kreuzberg/_utils/_serialization.py +4 -2
  32. kreuzberg/_utils/_string.py +153 -10
  33. kreuzberg/_utils/_sync.py +6 -7
  34. kreuzberg/_utils/_table.py +261 -0
  35. kreuzberg/_utils/_tmp.py +2 -2
  36. kreuzberg/cli.py +1 -2
  37. kreuzberg/extraction.py +43 -34
  38. kreuzberg-3.8.1.dist-info/METADATA +301 -0
  39. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  40. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
  41. kreuzberg/_multiprocessing/__init__.py +0 -6
  42. kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
  43. kreuzberg/_multiprocessing/process_manager.py +0 -188
  44. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  45. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  46. kreuzberg-3.3.0.dist-info/METADATA +0 -235
  47. kreuzberg-3.3.0.dist-info/RECORD +0 -48
  48. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  49. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_extractors/_structured.py ADDED
@@ -0,0 +1,148 @@
+ from __future__ import annotations
+
+ import json
+ from typing import TYPE_CHECKING, Any, ClassVar
+
+ from anyio import Path as AsyncPath
+
+ from kreuzberg._extractors._base import Extractor
+ from kreuzberg._mime_types import JSON_MIME_TYPE, PLAIN_TEXT_MIME_TYPE, TOML_MIME_TYPE, YAML_MIME_TYPE
+ from kreuzberg._types import ExtractionResult, normalize_metadata
+ from kreuzberg._utils._string import normalize_spaces, safe_decode
+ from kreuzberg._utils._sync import run_sync
+
+ if TYPE_CHECKING:
+     from pathlib import Path
+
+
+ class StructuredDataExtractor(Extractor):
+     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
+         JSON_MIME_TYPE,
+         "text/json",
+         YAML_MIME_TYPE,
+         "text/yaml",
+         "text/x-yaml",
+         "application/yaml",
+         TOML_MIME_TYPE,
+         "text/toml",
+     }
+
+     async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
+         return await run_sync(self.extract_bytes_sync, content)
+
+     async def extract_path_async(self, path: Path) -> ExtractionResult:
+         content = await AsyncPath(path).read_bytes()
+         return await self.extract_bytes_async(content)
+
+     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
+         text_content = safe_decode(content)
+
+         try:
+             if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
+                 data = json.loads(text_content)
+             elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
+                 try:
+                     import tomllib  # type: ignore[import-not-found]
+                 except ImportError:
+                     try:
+                         import tomli as tomllib  # type: ignore[import-not-found]
+                     except ImportError:
+                         return ExtractionResult(
+                             content=normalize_spaces(text_content),
+                             mime_type=PLAIN_TEXT_MIME_TYPE,
+                             metadata={"warning": "tomllib/tomli not available, returning raw text"},
+                             chunks=[],
+                         )
+                 data = tomllib.loads(text_content)
+             else:
+                 try:
+                     import yaml
+
+                     data = yaml.safe_load(text_content)
+                 except ImportError:
+                     return ExtractionResult(
+                         content=normalize_spaces(text_content),
+                         mime_type=PLAIN_TEXT_MIME_TYPE,
+                         metadata={"warning": "PyYAML not available, returning raw text"},
+                         chunks=[],
+                     )
+
+             text_parts: list[str] = []
+             metadata: dict[str, Any] = {}
+
+             if isinstance(data, dict):
+                 text_parts.extend(self._extract_from_dict(data, metadata))
+             elif isinstance(data, list):
+                 text_parts.extend(self._extract_from_list(data, metadata))
+             else:
+                 text_parts.append(str(data))
+
+             combined_text = "\n".join(text_parts) if text_parts else text_content
+
+             return ExtractionResult(
+                 content=normalize_spaces(combined_text),
+                 mime_type=PLAIN_TEXT_MIME_TYPE,
+                 metadata=normalize_metadata(metadata),
+                 chunks=[],
+             )
+
+         except (ValueError, TypeError, KeyError, AttributeError, UnicodeDecodeError) as e:
+             return ExtractionResult(
+                 content=normalize_spaces(text_content),
+                 mime_type=PLAIN_TEXT_MIME_TYPE,
+                 metadata={"parse_error": str(e)},
+                 chunks=[],
+             )
+
+     def extract_path_sync(self, path: Path) -> ExtractionResult:
+         content = path.read_bytes()
+         return self.extract_bytes_sync(content)
+
+     def _extract_from_dict(self, data: dict[str, Any], metadata: dict[str, Any], prefix: str = "") -> list[str]:
+         text_parts = []
+
+         for key, value in data.items():
+             full_key = f"{prefix}.{key}" if prefix else key
+
+             if isinstance(value, str) and value.strip():
+                 text_parts.append(f"{full_key}: {value}")
+
+                 if any(
+                     text_field in key.lower()
+                     for text_field in ["title", "name", "subject", "description", "content", "body", "text", "message"]
+                 ):
+                     metadata[full_key] = value
+
+             elif isinstance(value, (int, float, bool)):
+                 text_parts.append(f"{full_key}: {value}")
+
+             elif isinstance(value, dict):
+                 text_parts.extend(self._extract_from_dict(value, metadata, full_key))
+
+             elif isinstance(value, list):
+                 text_parts.extend(self._extract_from_list(value, metadata, full_key))
+
+             elif value is not None:
+                 text_parts.append(f"{full_key}: {value!s}")
+
+         return text_parts
+
+     def _extract_from_list(self, data: list[Any], metadata: dict[str, Any], prefix: str = "") -> list[str]:
+         text_parts = []
+
+         for i, item in enumerate(data):
+             item_key = f"{prefix}[{i}]" if prefix else f"item_{i}"
+
+             if isinstance(item, str) and item.strip():
+                 text_parts.append(f"{item_key}: {item}")
+
+             elif isinstance(item, dict):
+                 text_parts.extend(self._extract_from_dict(item, metadata, item_key))
+
+             elif isinstance(item, list):
+                 text_parts.extend(self._extract_from_list(item, metadata, item_key))
+
+             elif item is not None:
+                 text_parts.append(f"{item_key}: {item!s}")
+
+         return text_parts
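
The extractor above walks parsed JSON/YAML/TOML recursively, flattening keys into `key.subkey` / `key[i]` paths and promoting text-like fields (title, name, subject, ...) into metadata. A minimal usage sketch follows; the constructor arguments and the sample payload are assumptions, not part of this diff:

    from kreuzberg import ExtractionConfig  # public re-export assumed
    from kreuzberg._extractors._structured import StructuredDataExtractor

    # Constructor signature assumed: the base Extractor is taken to store the
    # MIME type and config, since extract_bytes_sync dispatches on self.mime_type.
    extractor = StructuredDataExtractor(mime_type="application/json", config=ExtractionConfig())
    result = extractor.extract_bytes_sync(b'{"title": "Q3 Report", "pages": 12}')
    print(result.content)   # roughly: "title: Q3 Report\npages: 12"
    print(result.metadata)  # "title" is promoted because the key matches a text-like field
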
kreuzberg/_gmft.py CHANGED
@@ -1,12 +1,17 @@
  from __future__ import annotations

+ import multiprocessing as mp
  import os
+ import queue
+ import signal
+ import traceback
  from dataclasses import dataclass, field
+ from io import StringIO
  from typing import TYPE_CHECKING, Any, Literal

  from kreuzberg._types import TableData
  from kreuzberg._utils._sync import run_sync
- from kreuzberg.exceptions import MissingDependencyError
+ from kreuzberg.exceptions import MissingDependencyError, ParsingError

  if TYPE_CHECKING:
      from os import PathLike
@@ -196,9 +201,7 @@ async def extract_tables(  # noqa: PLR0915

      try:
          if use_isolated_process:
-             from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated_async
-
-             result = await extract_tables_isolated_async(file_path, config)
+             result = await _extract_tables_isolated_async(file_path, config)

              await table_cache.aset(result, **cache_kwargs)

@@ -210,7 +213,7 @@ async def extract_tables(  # noqa: PLR0915
          from gmft.formatters.tatr import TATRFormatConfig
          from gmft.pdf_bindings.pdfium import PyPDFium2Document

-         formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
+         formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]  # type: ignore[no-untyped-call]
              config=TATRFormatConfig(
                  verbosity=config.verbosity,
                  formatter_base_threshold=config.formatter_base_threshold,
@@ -226,7 +229,7 @@ async def extract_tables(  # noqa: PLR0915
                  force_large_table_assumption=config.force_large_table_assumption,
              )
          )
-         detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
+         detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]  # type: ignore[no-untyped-call]
              config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
          )
          doc = await run_sync(PyPDFium2Document, str(file_path))
@@ -247,7 +250,7 @@ async def extract_tables(  # noqa: PLR0915
                      text=data_frame.to_markdown(),
                      df=data_frame,
                  )
-                 for data_frame, cropped_table in zip(dataframes, cropped_tables)
+                 for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
              ]

              await table_cache.aset(result, **cache_kwargs)
@@ -314,9 +317,7 @@ def extract_tables_sync(
          return cached_result  # type: ignore[no-any-return]

      if use_isolated_process:
-         from kreuzberg._multiprocessing.gmft_isolated import extract_tables_isolated
-
-         result = extract_tables_isolated(file_path, config)
+         result = _extract_tables_isolated(file_path, config)

          table_cache.set(result, **cache_kwargs)

@@ -365,7 +366,7 @@ def extract_tables_sync(
                  text=data_frame.to_markdown(),
                  df=data_frame,
              )
-             for data_frame, cropped_table in zip(dataframes, cropped_tables)
+             for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
          ]

          table_cache.set(result, **cache_kwargs)
@@ -378,3 +379,309 @@ def extract_tables_sync(
          raise MissingDependencyError.create_for_package(
              dependency_group="gmft", functionality="table extraction", package_name="gmft"
          ) from e
+
+
+ def _extract_tables_in_process(
+     file_path: str | PathLike[str],
+     config_dict: dict[str, Any],
+     result_queue: queue.Queue[tuple[bool, Any]],
+ ) -> None:
+     """Extract tables in an isolated process to handle potential segfaults.
+
+     Args:
+         file_path: Path to the PDF file.
+         config_dict: Serialized GMFTConfig as a dict.
+         result_queue: Queue to put results or errors.
+     """
+     signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+     try:
+         from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
+         from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
+         from gmft.formatters.tatr import TATRFormatConfig
+         from gmft.pdf_bindings.pdfium import PyPDFium2Document
+
+         config = GMFTConfig(**config_dict)
+
+         formatter = AutoTableFormatter(  # type: ignore[no-untyped-call]
+             config=TATRFormatConfig(
+                 verbosity=config.verbosity,
+                 formatter_base_threshold=config.formatter_base_threshold,
+                 cell_required_confidence=config.cell_required_confidence,
+                 remove_null_rows=config.remove_null_rows,
+                 enable_multi_header=config.enable_multi_header,
+                 semantic_spanning_cells=config.semantic_spanning_cells,
+                 semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
+                 large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
+                 large_table_threshold=config.large_table_threshold,
+                 large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
+                 large_table_maximum_rows=config.large_table_maximum_rows,
+                 force_large_table_assumption=config.force_large_table_assumption,
+             )
+         )
+         detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))  # type: ignore[no-untyped-call]
+
+         doc = PyPDFium2Document(str(file_path))
+         cropped_tables = []
+         dataframes = []
+
+         try:
+             for page in doc:
+                 cropped_tables.extend(detector.extract(page))  # type: ignore[attr-defined]
+
+             for cropped_table in cropped_tables:
+                 formatted_table = formatter.extract(cropped_table)  # type: ignore[attr-defined]
+                 dataframes.append(formatted_table.df())
+
+             results = []
+             for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False):
+                 import io
+
+                 img_bytes = io.BytesIO()
+                 cropped_image = cropped_table.image()
+                 cropped_image.save(img_bytes, format="PNG")
+                 img_bytes.seek(0)
+
+                 results.append(
+                     {
+                         "cropped_image_bytes": img_bytes.getvalue(),
+                         "page_number": cropped_table.page.page_number,
+                         "text": data_frame.to_markdown(),
+                         "df_csv": data_frame.to_csv(index=False),
+                     }
+                 )
+
+             result_queue.put((True, results))
+
+         finally:
+             doc.close()  # type: ignore[no-untyped-call]
+
+     except Exception as e:  # noqa: BLE001
+         error_info = {"error": str(e), "type": type(e).__name__, "traceback": traceback.format_exc()}
+         result_queue.put((False, error_info))
+
+
+ def _extract_tables_isolated(
+     file_path: str | PathLike[str],
+     config: GMFTConfig | None = None,
+     timeout: float = 300.0,
+ ) -> list[TableData]:
+     """Extract tables using an isolated process to handle segfaults.
+
+     Args:
+         file_path: Path to the PDF file.
+         config: GMFT configuration.
+         timeout: Maximum time to wait for extraction.
+
+     Returns:
+         List of extracted tables.
+
+     Raises:
+         ParsingError: If extraction fails or times out.
+     """
+     config = config or GMFTConfig()
+     config_dict = config.__dict__.copy()
+
+     ctx = mp.get_context("spawn")
+     result_queue = ctx.Queue()
+
+     process = ctx.Process(
+         target=_extract_tables_in_process,
+         args=(str(file_path), config_dict, result_queue),
+     )
+
+     process.start()
+
+     try:
+         # Wait for result with timeout, checking for process death  # ~keep
+         import time
+
+         start_time = time.time()
+         while True:
+             try:
+                 success, result = result_queue.get_nowait()
+                 break
+             except queue.Empty:
+                 if time.time() - start_time > timeout:
+                     raise
+
+                 if not process.is_alive():
+                     # Process died without putting result  # ~keep
+                     if process.exitcode == -signal.SIGSEGV:
+                         raise ParsingError(
+                             "GMFT process crashed with segmentation fault",
+                             context={
+                                 "file_path": str(file_path),
+                                 "exit_code": process.exitcode,
+                             },
+                         ) from None
+                     raise ParsingError(
+                         f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                         context={
+                             "file_path": str(file_path),
+                             "exit_code": process.exitcode,
+                         },
+                     ) from None
+
+                 time.sleep(0.1)
+
+         if success:
+             tables = []
+             for table_dict in result:
+                 import io
+
+                 from PIL import Image
+
+                 img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                 import pandas as pd
+
+                 df = pd.read_csv(StringIO(table_dict["df_csv"]))
+
+                 tables.append(
+                     TableData(
+                         cropped_image=img,
+                         page_number=table_dict["page_number"],
+                         text=table_dict["text"],
+                         df=df,
+                     )
+                 )
+
+             return tables
+
+         error_info = result
+         raise ParsingError(
+             f"GMFT table extraction failed: {error_info['error']}",
+             context={
+                 "file_path": str(file_path),
+                 "error_type": error_info["type"],
+                 "traceback": error_info["traceback"],
+             },
+         )
+
+     except queue.Empty as e:
+         raise ParsingError(
+             "GMFT table extraction timed out",
+             context={
+                 "file_path": str(file_path),
+                 "timeout": timeout,
+             },
+         ) from e
+     finally:
+         if process.is_alive():
+             process.terminate()
+             process.join(timeout=5)
+             if process.is_alive():
+                 process.kill()
+                 process.join()
+
+
+ async def _extract_tables_isolated_async(
+     file_path: str | PathLike[str],
+     config: GMFTConfig | None = None,
+     timeout: float = 300.0,
+ ) -> list[TableData]:
+     """Async version of _extract_tables_isolated using anyio.
+
+     Args:
+         file_path: Path to the PDF file.
+         config: GMFT configuration.
+         timeout: Maximum time to wait for extraction.
+
+     Returns:
+         List of extracted tables.
+
+     Raises:
+         ParsingError: If extraction fails or times out.
+     """
+     import anyio
+
+     config = config or GMFTConfig()
+     config_dict = config.__dict__.copy()
+
+     ctx = mp.get_context("spawn")
+     result_queue = ctx.Queue()
+
+     process = ctx.Process(
+         target=_extract_tables_in_process,
+         args=(str(file_path), config_dict, result_queue),
+     )
+
+     process.start()
+
+     try:
+
+         async def wait_for_result() -> tuple[bool, Any]:
+             while True:
+                 try:
+                     return result_queue.get_nowait()  # type: ignore[no-any-return]
+                 except queue.Empty:  # noqa: PERF203
+                     await anyio.sleep(0.1)
+                     if not process.is_alive():
+                         # Process died without putting result  # ~keep
+                         if process.exitcode == -signal.SIGSEGV:
+                             raise ParsingError(
+                                 "GMFT process crashed with segmentation fault",
+                                 context={
+                                     "file_path": str(file_path),
+                                     "exit_code": process.exitcode,
+                                 },
+                             ) from None
+                         raise ParsingError(
+                             f"GMFT process died unexpectedly with exit code {process.exitcode}",
+                             context={
+                                 "file_path": str(file_path),
+                                 "exit_code": process.exitcode,
+                             },
+                         ) from None
+
+         with anyio.fail_after(timeout):
+             success, result = await wait_for_result()
+
+         if success:
+             tables = []
+             for table_dict in result:
+                 import io
+
+                 from PIL import Image
+
+                 img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
+                 import pandas as pd
+
+                 df = pd.read_csv(StringIO(table_dict["df_csv"]))
+
+                 tables.append(
+                     TableData(
+                         cropped_image=img,
+                         page_number=table_dict["page_number"],
+                         text=table_dict["text"],
+                         df=df,
+                     )
+                 )
+
+             return tables
+
+         error_info = result
+         raise ParsingError(
+             f"GMFT table extraction failed: {error_info['error']}",
+             context={
+                 "file_path": str(file_path),
+                 "error_type": error_info["type"],
+                 "traceback": error_info["traceback"],
+             },
+         )
+
+     except TimeoutError as e:
+         raise ParsingError(
+             "GMFT table extraction timed out",
+             context={
+                 "file_path": str(file_path),
+                 "timeout": timeout,
+             },
+         ) from e
+     finally:
+         if process.is_alive():
+             process.terminate()
+             await anyio.to_thread.run_sync(lambda: process.join(timeout=5))
+             if process.is_alive():
+                 process.kill()
+                 await anyio.to_thread.run_sync(process.join)
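
Both public entry points now route `use_isolated_process=True` through these private helpers, which spawn a fresh interpreter, ship results back over a multiprocessing queue as PNG bytes plus CSV, and surface a child segfault as a `ParsingError` instead of killing the caller. A hedged calling sketch (the file name is illustrative, and `TableData` is treated as the TypedDict its keyword construction above suggests):

    from kreuzberg._gmft import GMFTConfig, _extract_tables_isolated

    # A crash inside the TATR models takes down only the spawned child; the
    # parent sees ParsingError with the exit code in its context.
    tables = _extract_tables_isolated("scanned-report.pdf", GMFTConfig(), timeout=120.0)
    for table in tables:
        print(table["page_number"], table["text"][:80])
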
kreuzberg/_language_detection.py ADDED
@@ -0,0 +1,95 @@
+ from __future__ import annotations
+
+ from dataclasses import dataclass
+ from functools import lru_cache
+ from typing import TYPE_CHECKING, Any
+
+ from kreuzberg.exceptions import MissingDependencyError
+
+ if TYPE_CHECKING:
+     from fast_langdetect import LangDetectConfig as FastLangDetectConfig
+
+ try:
+     from fast_langdetect import LangDetectConfig as FastLangDetectConfig
+     from fast_langdetect import detect, detect_multilingual
+
+     HAS_FAST_LANGDETECT = True
+ except ImportError:
+     HAS_FAST_LANGDETECT = False
+     detect = None
+     detect_multilingual = None
+     FastLangDetectConfig = None
+
+ _CACHE_SIZE = 128
+
+
+ @dataclass(frozen=True)
+ class LanguageDetectionConfig:
+     """Configuration for language detection.
+
+     Attributes:
+         low_memory: If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
+             Defaults to True for better memory efficiency.
+         top_k: Maximum number of languages to return for multilingual detection. Defaults to 3.
+         multilingual: If True, uses multilingual detection to handle mixed-language text.
+             If False, uses single language detection. Defaults to False.
+         cache_dir: Custom directory for model cache. If None, uses system default.
+         allow_fallback: If True, falls back to small model if large model fails. Defaults to True.
+     """
+
+     low_memory: bool = True
+     top_k: int = 3
+     multilingual: bool = False
+     cache_dir: str | None = None
+     allow_fallback: bool = True
+
+
+ def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
+     """Create FastLangDetectConfig from our config."""
+     if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
+         return None
+
+     kwargs: dict[str, Any] = {
+         "allow_fallback": config.allow_fallback,
+     }
+     if config.cache_dir is not None:
+         kwargs["cache_dir"] = config.cache_dir
+
+     return FastLangDetectConfig(**kwargs)
+
+
+ @lru_cache(maxsize=_CACHE_SIZE)
+ def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
+     """Detect the most probable languages in the given text using fast-langdetect.
+
+     Args:
+         text: The text to analyze.
+         config: Configuration for language detection. If None, uses defaults.
+
+     Returns:
+         A list of detected language codes in lowercase (e.g., ['en', 'de', 'fr']),
+         or None if detection fails.
+
+     Raises:
+         MissingDependencyError: If fast-langdetect is not installed.
+     """
+     if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
+         raise MissingDependencyError.create_for_package(
+             dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"
+         )
+
+     if config is None:
+         config = LanguageDetectionConfig()
+
+     try:
+         if config.multilingual:
+             results = detect_multilingual(text, low_memory=config.low_memory, k=config.top_k)
+
+             return [result["lang"].lower() for result in results if result.get("lang")]
+
+         result = detect(text, low_memory=config.low_memory)
+         if result and result.get("lang"):
+             return [result["lang"].lower()]
+         return None
+     except Exception:  # noqa: BLE001
+         return None
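
Because `detect_languages` is wrapped in `lru_cache`, the frozen (and therefore hashable) config dataclass is what makes repeated calls memoizable. A short usage sketch (the sample strings and expected codes are illustrative, not from the diff):

    from kreuzberg._language_detection import LanguageDetectionConfig, detect_languages

    # Single-language detection with the small model (the default).
    print(detect_languages("Das ist ein deutscher Satz."))  # e.g. ['de']

    # Mixed-language text: multilingual mode returns up to top_k candidates.
    config = LanguageDetectionConfig(multilingual=True, top_k=2)
    print(detect_languages("Hello world. Bonjour le monde.", config))  # e.g. ['en', 'fr']
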
kreuzberg/_mcp/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """MCP server for Kreuzberg text extraction."""
+
+ from .server import mcp
+
+ __all__ = ["mcp"]
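
The server module itself (kreuzberg/_mcp/server.py, +227 lines) is not shown in this section. As a hedged sketch only: if `mcp` is a FastMCP-style server instance, which is the common pattern for Python MCP servers, it could be launched like this:

    # Hypothetical launcher; assumes `mcp` exposes a FastMCP-style run().
    from kreuzberg._mcp import mcp

    if __name__ == "__main__":
        mcp.run()  # MCP servers typically default to stdio transport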