kreuzberg 3.8.2__py3-none-any.whl → 3.9.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_chunker.py CHANGED
@@ -2,9 +2,9 @@ from __future__ import annotations
  
  from typing import TYPE_CHECKING
  
- from kreuzberg import MissingDependencyError
  from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
+ from kreuzberg.exceptions import MissingDependencyError
  
  if TYPE_CHECKING:
      from semantic_text_splitter import MarkdownSplitter, TextSplitter
@@ -36,11 +36,11 @@ def get_chunker(
      if key not in _chunkers:
          try:
              if mime_type == MARKDOWN_MIME_TYPE:
-                 from semantic_text_splitter import MarkdownSplitter
+                 from semantic_text_splitter import MarkdownSplitter  # noqa: PLC0415
  
                  _chunkers[key] = MarkdownSplitter(max_characters, overlap_characters)
              else:
-                 from semantic_text_splitter import TextSplitter
+                 from semantic_text_splitter import TextSplitter  # noqa: PLC0415
  
                  _chunkers[key] = TextSplitter(max_characters, overlap_characters)
          except ImportError as e:
kreuzberg/_config.py CHANGED
@@ -95,7 +95,7 @@ def parse_ocr_backend_config(
      # Convert psm integer to PSMMode enum if needed
      processed_config = backend_config.copy()
      if "psm" in processed_config and isinstance(processed_config["psm"], int):
-         from kreuzberg._ocr._tesseract import PSMMode
+         from kreuzberg._ocr._tesseract import PSMMode  # noqa: PLC0415
  
          processed_config["psm"] = PSMMode(processed_config["psm"])
      return TesseractConfig(**processed_config)
kreuzberg/_document_classification.py ADDED
@@ -0,0 +1,156 @@
+ from __future__ import annotations
+ 
+ import re
+ from typing import TYPE_CHECKING
+ 
+ from kreuzberg._ocr import get_ocr_backend
+ from kreuzberg.exceptions import MissingDependencyError
+ 
+ if TYPE_CHECKING:
+     from pathlib import Path
+ 
+     from kreuzberg._types import ExtractionConfig, ExtractionResult
+ 
+ 
+ DOCUMENT_CLASSIFIERS = {
+     "invoice": [
+         r"invoice",
+         r"bill to",
+         r"invoice number",
+         r"total amount",
+         r"tax id",
+     ],
+     "receipt": [
+         r"receipt",
+         r"cash receipt",
+         r"payment",
+         r"subtotal",
+         r"total due",
+     ],
+     "contract": [
+         r"agreement",
+         r"contract",
+         r"party a",
+         r"party b",
+         r"terms and conditions",
+         r"signature",
+     ],
+     "report": [r"report", r"summary", r"analysis", r"findings", r"conclusion"],
+     "form": [r"form", r"fill out", r"signature", r"date", r"submit"],
+ }
+ 
+ 
+ def _get_translated_text(result: ExtractionResult) -> str:
+     """Translate extracted text to English using Google Translate API.
+ 
+     Args:
+         result: ExtractionResult containing the text to be translated
+ 
+     Returns:
+         str: The translated text in lowercase English
+ 
+     Raises:
+         MissingDependencyError: If the deep-translator package is not installed
+     """
+     try:
+         from deep_translator import GoogleTranslator  # noqa: PLC0415
+     except ImportError as e:
+         raise MissingDependencyError(
+             "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
+         ) from e
+ 
+     return str(GoogleTranslator(source="auto", target="en").translate(result.content).lower())
+ 
+ 
+ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
+     """Classifies the document type based on keywords and patterns.
+ 
+     Args:
+         result: The extraction result containing the content.
+         config: The extraction configuration.
+ 
+     Returns:
+         A tuple containing the detected document type and the confidence score,
+         or (None, None) if no type is detected with sufficient confidence.
+     """
+     translated_text = _get_translated_text(result)
+     scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
+ 
+     for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
+         for pattern in patterns:
+             if re.search(pattern, translated_text):
+                 scores[doc_type] += 1
+ 
+     total_score = sum(scores.values())
+     if total_score == 0:
+         return None, None
+ 
+     confidences = {doc_type: score / total_score for doc_type, score in scores.items()}
+ 
+     best_type, best_confidence = max(confidences.items(), key=lambda item: item[1])
+ 
+     if best_confidence >= config.document_type_confidence_threshold:
+         return best_type, best_confidence
+ 
+     return None, None
+ 
+ 
+ def classify_document_from_layout(
+     result: ExtractionResult, config: ExtractionConfig
+ ) -> tuple[str | None, float | None]:
+     """Classifies the document type based on layout information from OCR.
+ 
+     Args:
+         result: The extraction result containing the layout data.
+         config: The extraction configuration.
+ 
+     Returns:
+         A tuple containing the detected document type and the confidence score,
+         or (None, None) if no type is detected with sufficient confidence.
+     """
+     translated_text = _get_translated_text(result)
+ 
+     if result.layout is None or result.layout.empty:
+         return None, None
+ 
+     layout_df = result.layout
+     if not all(col in layout_df.columns for col in ["text", "top", "height"]):
+         return None, None
+ 
+     layout_df["translated_text"] = translated_text
+ 
+     page_height = layout_df["top"].max() + layout_df["height"].max()
+     scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0.0)
+ 
+     for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
+         for pattern in patterns:
+             found_words = layout_df[layout_df["translated_text"].str.contains(pattern, case=False, na=False)]
+             if not found_words.empty:
+                 scores[doc_type] += 1.0
+                 word_top = found_words.iloc[0]["top"]
+                 if word_top < page_height * 0.3:
+                     scores[doc_type] += 0.5
+ 
+     total_score = sum(scores.values())
+     if total_score == 0:
+         return None, None
+ 
+     confidences = {doc_type: score / total_score for doc_type, score in scores.items()}
+ 
+     best_type, best_confidence = max(confidences.items(), key=lambda item: item[1])
+ 
+     if best_confidence >= config.document_type_confidence_threshold:
+         return best_type, best_confidence
+ 
+     return None, None
+ 
+ 
+ def auto_detect_document_type(
+     result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
+ ) -> ExtractionResult:
+     if config.document_classification_mode == "vision" and file_path:
+         layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
+         result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
+     else:
+         result.document_type, result.document_type_confidence = classify_document(result, config)
+     return result
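
The classifier above is deliberately simple: count regex hits per document type, normalize the winning count into a confidence, and keep the best type only if it clears the configured threshold. A minimal standalone sketch of that scoring logic (pattern table abridged; the deep-translator translation step is omitted here):

```python
import re

# Abridged from the DOCUMENT_CLASSIFIERS table above.
CLASSIFIERS = {
    "invoice": [r"invoice", r"bill to", r"total amount"],
    "receipt": [r"receipt", r"subtotal", r"total due"],
}


def classify(text: str, threshold: float = 0.7) -> tuple[str | None, float | None]:
    """Count pattern hits per type, normalize to a confidence, apply the threshold."""
    lowered = text.lower()
    scores = {
        doc_type: sum(bool(re.search(p, lowered)) for p in patterns)
        for doc_type, patterns in CLASSIFIERS.items()
    }
    total = sum(scores.values())
    if total == 0:
        return None, None
    best_type, best_score = max(scores.items(), key=lambda item: item[1])
    confidence = best_score / total
    return (best_type, confidence) if confidence >= threshold else (None, None)


print(classify("INVOICE\nBill to: ACME Corp\nTotal amount: $120.00"))  # ('invoice', 1.0)
```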
kreuzberg/_entity_extraction.py CHANGED
@@ -138,7 +138,7 @@ def extract_entities(
          spacy_config = SpacyEntityExtractionConfig()
  
      try:
-         import spacy  # noqa: F401
+         import spacy  # noqa: F401, PLC0415
      except ImportError as e:
          raise MissingDependencyError.create_for_package(
              package_name="spacy",
@@ -179,7 +179,7 @@ def extract_entities(
  def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
      """Load a spaCy model with caching."""
      try:
-         import spacy
+         import spacy  # noqa: PLC0415
  
          if spacy_config.model_cache_dir:
              os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
@@ -223,7 +223,7 @@ def extract_keywords(
          MissingDependencyError: If `keybert` is not installed.
      """
      try:
-         from keybert import KeyBERT
+         from keybert import KeyBERT  # noqa: PLC0415
  
          kw_model = KeyBERT()
          keywords = kw_model.extract_keywords(text, top_n=keyword_count)
kreuzberg/_extractors/_image.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
  
  import contextlib
  import os
  import tempfile
+ from dataclasses import asdict
  from pathlib import Path
  from typing import TYPE_CHECKING, ClassVar
  
@@ -88,17 +89,17 @@ class ImageExtractor(Extractor):
              config = (
                  self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
              )
-             result = backend.process_file_sync(path, **config.__dict__)
+             result = backend.process_file_sync(path, **asdict(config))
          elif self.config.ocr_backend == "paddleocr":
              paddle_config = (
                  self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
              )
-             result = backend.process_file_sync(path, **paddle_config.__dict__)
+             result = backend.process_file_sync(path, **asdict(paddle_config))
          elif self.config.ocr_backend == "easyocr":
              easy_config = (
                  self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
              )
-             result = backend.process_file_sync(path, **easy_config.__dict__)
+             result = backend.process_file_sync(path, **asdict(easy_config))
          else:
              raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
          return self._apply_quality_processing(result)
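
The switch from `config.__dict__` to `dataclasses.asdict(config)` is more than style: `asdict` recursively converts nested dataclasses into plain dicts, while `__dict__` leaves nested instances as-is. A self-contained illustration (the `Inner`/`Outer` classes are invented for the demo):

```python
from dataclasses import asdict, dataclass, field


@dataclass
class Inner:
    x: int = 1


@dataclass
class Outer:
    name: str = "demo"
    inner: Inner = field(default_factory=Inner)


print(Outer().__dict__)  # {'name': 'demo', 'inner': Inner(x=1)}  - nested dataclass left intact
print(asdict(Outer()))   # {'name': 'demo', 'inner': {'x': 1}}   - recursively converted to dicts
```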
kreuzberg/_extractors/_pdf.py CHANGED
@@ -3,6 +3,7 @@ from __future__ import annotations
  
  import contextlib
  import os
  import tempfile
+ from dataclasses import asdict
  from multiprocessing import cpu_count
  from pathlib import Path
  from re import Pattern
@@ -58,9 +59,13 @@ class PDFExtractor(Extractor):
          result: ExtractionResult | None = None
  
          if not self.config.force_ocr:
-             content = await self._extract_pdf_searchable_text(path)
-             if self._validate_extracted_text(content):
-                 result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
+             try:
+                 content = await self._extract_pdf_searchable_text(path)
+                 if self._validate_extracted_text(content):
+                     result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
+             except ParsingError:
+                 # If searchable text extraction fails, continue to OCR or empty result
+                 pass
  
          if not result and self.config.ocr_backend is not None:
              result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)
@@ -73,7 +78,7 @@ class PDFExtractor(Extractor):
          if self.config.extract_tables:
              # GMFT is optional dependency
              try:
-                 from kreuzberg._gmft import extract_tables
+                 from kreuzberg._gmft import extract_tables  # noqa: PLC0415
  
                  result.tables = await extract_tables(path, self.config.gmft_config)
              except ImportError:
@@ -112,16 +117,19 @@ class PDFExtractor(Extractor):
  
      def extract_path_sync(self, path: Path) -> ExtractionResult:
          """Pure sync implementation of PDF extraction from path."""
-         text = self._extract_pdf_searchable_text_sync(path)
+         try:
+             text = self._extract_pdf_searchable_text_sync(path)
+         except ParsingError:
+             text = ""
  
-         if self.config.force_ocr or not self._validate_extracted_text(text):
+         if (self.config.force_ocr or not self._validate_extracted_text(text)) and self.config.ocr_backend is not None:
              text = self._extract_pdf_with_ocr_sync(path)
  
          tables = []
          if self.config.extract_tables:
              # GMFT is optional dependency
              try:
-                 from kreuzberg._gmft import extract_tables_sync
+                 from kreuzberg._gmft import extract_tables_sync  # noqa: PLC0415
  
                  tables = extract_tables_sync(path)
              except ImportError:
@@ -381,17 +389,17 @@ class PDFExtractor(Extractor):
              config = (
                  self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
              )
-             results = backend.process_batch_sync(paths, **config.__dict__)
+             results = backend.process_batch_sync(paths, **asdict(config))
          elif self.config.ocr_backend == "paddleocr":
              paddle_config = (
                  self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
              )
-             results = backend.process_batch_sync(paths, **paddle_config.__dict__)
+             results = backend.process_batch_sync(paths, **asdict(paddle_config))
          elif self.config.ocr_backend == "easyocr":
              easy_config = (
                  self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
              )
-             results = backend.process_batch_sync(paths, **easy_config.__dict__)
+             results = backend.process_batch_sync(paths, **asdict(easy_config))
          else:
              raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
  
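The net effect of the `_pdf.py` changes: a PDF whose text layer cannot be parsed no longer raises out of the sync path, and OCR is only attempted when a backend is actually configured. A hedged sketch of the resulting degradation chain, with stub helpers standing in for the real extractor methods:

```python
from __future__ import annotations


class ParsingError(Exception):
    """Stand-in for kreuzberg.exceptions.ParsingError."""


def extract_searchable_text(path: str) -> str:
    raise ParsingError("corrupt xref table")  # simulate an unparseable PDF


def run_ocr(path: str) -> str:
    return "text recovered via OCR"


def extract_text(path: str, force_ocr: bool = False, ocr_backend: str | None = "tesseract") -> str:
    # Same degradation chain as extract_path_sync above:
    # searchable text -> OCR (only if a backend is configured) -> empty string.
    try:
        text = extract_searchable_text(path)
    except ParsingError:
        text = ""
    if (force_ocr or not text.strip()) and ocr_backend is not None:
        text = run_ocr(path)
    return text


print(extract_text("broken.pdf"))  # -> 'text recovered via OCR'
```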
kreuzberg/_extractors/_spread_sheet.py CHANGED
@@ -11,6 +11,7 @@ from pathlib import Path
  from typing import Any
  
  from anyio import Path as AsyncPath
+ from PIL import Image
  from python_calamine import CalamineWorkbook
  
  from kreuzberg._extractors._base import Extractor
@@ -197,9 +198,9 @@ class SpreadSheetExtractor(Extractor):
          """Enhanced sheet processing with better table structure preservation."""
          try:
              # pandas is optional dependency
-             import pandas as pd
+             import pandas as pd  # noqa: PLC0415
  
-             from kreuzberg._utils._table import enhance_table_markdown
+             from kreuzberg._utils._table import enhance_table_markdown  # noqa: PLC0415
  
              sheet = workbook.get_sheet_by_name(sheet_name)
              data = sheet.to_python()
@@ -217,9 +218,7 @@ class SpreadSheetExtractor(Extractor):
                  return f"## {sheet_name}\n\n*No data*"
  
              # Create a mock TableData for enhanced formatting
-             from PIL import Image
- 
-             from kreuzberg._types import TableData
+             from kreuzberg._types import TableData  # noqa: PLC0415
  
              # Create a 1x1 transparent image as placeholder
              placeholder_image = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
kreuzberg/_extractors/_structured.py CHANGED
@@ -1,8 +1,22 @@
  from __future__ import annotations
  
  import json
+ import sys
  from typing import TYPE_CHECKING, Any, ClassVar
  
+ if sys.version_info >= (3, 11):
+     import tomllib
+ else:
+     try:
+         import tomli as tomllib  # type: ignore[import-not-found]
+     except ImportError:
+         tomllib = None
+ 
+ try:
+     import yaml
+ except ImportError:
+     yaml = None
+ 
  from anyio import Path as AsyncPath
  
  from kreuzberg._extractors._base import Extractor
@@ -44,31 +58,23 @@ class StructuredDataExtractor(Extractor):
          if self.mime_type in {JSON_MIME_TYPE, "text/json"}:
              data = json.loads(text_content)
          elif self.mime_type in {TOML_MIME_TYPE, "text/toml"}:
-             try:
-                 import tomllib  # type: ignore[import-not-found]
-             except ImportError:
-                 try:
-                     import tomli as tomllib  # type: ignore[import-not-found]
-                 except ImportError:
-                     return ExtractionResult(
-                         content=normalize_spaces(text_content),
-                         mime_type=PLAIN_TEXT_MIME_TYPE,
-                         metadata={"warning": "tomllib/tomli not available, returning raw text"},
-                         chunks=[],
-                     )
+             if tomllib is None:
+                 return ExtractionResult(
+                     content=normalize_spaces(text_content),
+                     mime_type=PLAIN_TEXT_MIME_TYPE,
+                     metadata={"warning": "tomllib/tomli not available, returning raw text"},
+                     chunks=[],
+                 )
              data = tomllib.loads(text_content)
          else:
-             try:
-                 import yaml
- 
-                 data = yaml.safe_load(text_content)
-             except ImportError:
+             if yaml is None:
                  return ExtractionResult(
                      content=normalize_spaces(text_content),
                      mime_type=PLAIN_TEXT_MIME_TYPE,
                      metadata={"warning": "PyYAML not available, returning raw text"},
                      chunks=[],
                  )
+             data = yaml.safe_load(text_content)
  
          text_parts: list[str] = []
          metadata: dict[str, Any] = {}
@@ -90,7 +96,7 @@ class StructuredDataExtractor(Extractor):
                  chunks=[],
              )
  
-         except (ValueError, TypeError, KeyError, AttributeError, UnicodeDecodeError) as e:
+         except (json.JSONDecodeError, ValueError, TypeError) as e:
              return ExtractionResult(
                  content=normalize_spaces(text_content),
                  mime_type=PLAIN_TEXT_MIME_TYPE,
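
Hoisting the optional imports to module level moves the availability probe to import time and leaves the hot path with a simple `None` check. The same pattern in isolation (guarding only PyYAML; runnable with or without it installed):

```python
from __future__ import annotations

from typing import Any

try:
    import yaml  # optional dependency; probed once at import time
except ImportError:
    yaml = None


def load_config(text: str) -> Any:
    """Parse YAML when PyYAML is installed; fall back to the raw text otherwise."""
    if yaml is None:
        return text  # graceful degradation instead of an ImportError at call time
    return yaml.safe_load(text)


print(load_config("a: 1"))  # {'a': 1} with PyYAML installed, 'a: 1' without
```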
kreuzberg/_gmft.py CHANGED
@@ -5,12 +5,16 @@ import multiprocessing as mp
  import os
  import queue
  import signal
+ import time
  import traceback
  from dataclasses import dataclass, field
  from io import StringIO
+ from pathlib import Path
  from typing import TYPE_CHECKING, Any, Literal
  
+ import anyio
  import msgspec
+ from PIL import Image
  
  from kreuzberg._types import TableData
  from kreuzberg._utils._sync import run_sync
@@ -134,7 +138,7 @@ class GMFTConfig:
      """
  
  
- async def extract_tables(  # noqa: PLR0915
+ async def extract_tables(
      file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
  ) -> list[TableData]:
      """Extracts tables from a PDF file.
@@ -154,9 +158,7 @@ async def extract_tables(  # noqa: PLR0915
      Returns:
          A list of table data dictionaries.
      """
-     from pathlib import Path
- 
-     from kreuzberg._utils._cache import get_table_cache
+     from kreuzberg._utils._cache import get_table_cache  # noqa: PLC0415
  
      # Determine if we should use isolated process  # ~keep
      if use_isolated_process is None:
@@ -190,8 +192,6 @@
          return cached_result  # type: ignore[no-any-return]
  
      if table_cache.is_processing(**cache_kwargs):
-         import anyio
- 
          event = table_cache.mark_processing(**cache_kwargs)
          await anyio.to_thread.run_sync(event.wait)
  
@@ -211,10 +211,13 @@
          return result
  
      try:
-         from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
-         from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
-         from gmft.formatters.tatr import TATRFormatConfig
-         from gmft.pdf_bindings.pdfium import PyPDFium2Document
+         from gmft.auto import (  # type: ignore[attr-defined]  # noqa: PLC0415
+             AutoTableDetector,
+             AutoTableFormatter,
+         )
+         from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]  # noqa: PLC0415
+         from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
+         from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
  
          formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
              config=TATRFormatConfig(
@@ -284,9 +287,7 @@ def extract_tables_sync(
      Returns:
          A list of table data dictionaries.
      """
-     from pathlib import Path
- 
-     from kreuzberg._utils._cache import get_table_cache
+     from kreuzberg._utils._cache import get_table_cache  # noqa: PLC0415
  
      # Determine if we should use isolated process  # ~keep
      if use_isolated_process is None:
@@ -327,10 +328,10 @@
          return result
  
      try:
-         from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
-         from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
-         from gmft.formatters.tatr import TATRFormatConfig
-         from gmft.pdf_bindings.pdfium import PyPDFium2Document
+         from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]  # noqa: PLC0415
+         from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]  # noqa: PLC0415
+         from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
+         from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
  
          formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
              config=TATRFormatConfig(
@@ -399,10 +400,10 @@ def _extract_tables_in_process(
      signal.signal(signal.SIGINT, signal.SIG_IGN)
  
      try:
-         from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]
-         from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]
-         from gmft.formatters.tatr import TATRFormatConfig
-         from gmft.pdf_bindings.pdfium import PyPDFium2Document
+         from gmft.auto import AutoTableDetector, AutoTableFormatter  # type: ignore[attr-defined]  # noqa: PLC0415
+         from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined]  # noqa: PLC0415
+         from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
+         from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
  
          config = GMFTConfig(**config_dict)
  
@@ -495,7 +496,6 @@ def _extract_tables_isolated(
  
      try:
          # Wait for result with timeout, checking for process death  # ~keep
-         import time
  
          start_time = time.time()
          while True:
@@ -529,10 +529,8 @@
      if success:
          tables = []
          for table_dict in result:
-             from PIL import Image
- 
              img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-             import pandas as pd
+             import pandas as pd  # noqa: PLC0415
  
              df = pd.read_csv(StringIO(table_dict["df_csv"]))
  
@@ -577,7 +575,7 @@ def _extract_tables_isolated(
  async def _extract_tables_isolated_async(
      file_path: str | PathLike[str],
      config: GMFTConfig | None = None,
-     timeout: float = 300.0,
+     timeout: float = 300.0,  # noqa: ASYNC109
  ) -> list[TableData]:
      """Async version of extract_tables_isolated using asyncio.
  
@@ -592,8 +590,6 @@ async def _extract_tables_isolated_async(
      Raises:
          RuntimeError: If extraction fails or times out
      """
-     import anyio
- 
      config = config or GMFTConfig()
      config_dict = msgspec.to_builtins(config)
  
@@ -639,10 +635,8 @@
      if success:
          tables = []
          for table_dict in result:
-             from PIL import Image
- 
              img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-             import pandas as pd
+             import pandas as pd  # noqa: PLC0415
  
              df = pd.read_csv(StringIO(table_dict["df_csv"]))
  
kreuzberg/_mime_types.py CHANGED
@@ -191,7 +191,7 @@ def validate_mime_type(
          return _validate_explicit_mime_type(mime_type)
  
      if file_path:
-         from kreuzberg._utils._cache import get_mime_cache
+         from kreuzberg._utils._cache import get_mime_cache  # noqa: PLC0415
  
          path = Path(file_path)
  
kreuzberg/_ocr/_base.py CHANGED
@@ -103,7 +103,7 @@ class OCRBackend(ABC, Generic[T]):
          Returns:
              List of extraction result objects in the same order as input paths
          """
-         from kreuzberg._utils._sync import run_taskgroup
+         from kreuzberg._utils._sync import run_taskgroup  # noqa: PLC0415
  
          tasks = [self.process_file(path, **kwargs) for path in paths]
          return await run_taskgroup(*tasks)
kreuzberg/_ocr/_easyocr.py CHANGED
@@ -180,7 +180,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
          Raises:
              OCRError: If OCR processing fails.
          """
-         import numpy as np
+         import numpy as np  # noqa: PLC0415
  
          await self._init_easyocr(**kwargs)
  
@@ -318,7 +318,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
              bool: True if GPU support is available.
          """
          try:
-             import torch
+             import torch  # noqa: PLC0415
  
              return bool(torch.cuda.is_available())
          except ImportError:
@@ -339,7 +339,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
              return
  
          try:
-             import easyocr
+             import easyocr  # noqa: PLC0415
          except ImportError as e:
              raise MissingDependencyError.create_for_package(
                  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
@@ -507,7 +507,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
              return
  
          try:
-             import easyocr
+             import easyocr  # noqa: PLC0415
          except ImportError as e:
              raise MissingDependencyError.create_for_package(
                  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
kreuzberg/_ocr/_paddleocr.py CHANGED
@@ -124,7 +124,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
          Raises:
              OCRError: If OCR processing fails.
          """
-         import numpy as np
+         import numpy as np  # noqa: PLC0415
  
          await self._init_paddle_ocr(**kwargs)
  
@@ -260,7 +260,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
              return
  
          try:
-             from paddleocr import PaddleOCR
+             from paddleocr import PaddleOCR  # noqa: PLC0415
          except ImportError as e:
              raise MissingDependencyError.create_for_package(
                  dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
@@ -427,7 +427,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
              return
  
          try:
-             from paddleocr import PaddleOCR
+             from paddleocr import PaddleOCR  # noqa: PLC0415
          except ImportError as e:
              raise MissingDependencyError.create_for_package(
                  dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
kreuzberg/_ocr/_tesseract.py CHANGED
@@ -12,8 +12,10 @@ from enum import Enum
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, ClassVar, Final
  
+ import anyio
  from anyio import Path as AsyncPath
  from anyio import run_process
+ from PIL import Image
  from typing_extensions import Self
  
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
@@ -25,7 +27,7 @@ from kreuzberg._utils._tmp import create_temp_file
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
  
  if TYPE_CHECKING:
-     from PIL.Image import Image
+     from PIL.Image import Image as PILImage
  
  try:  # pragma: no cover
      from typing import Unpack  # type: ignore[attr-defined]
@@ -233,10 +235,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  
      async def process_image(
          self,
-         image: Image,
+         image: PILImage,
          **kwargs: Unpack[TesseractConfig],
      ) -> ExtractionResult:
-         from kreuzberg._utils._cache import get_ocr_cache
+         from kreuzberg._utils._cache import get_ocr_cache  # noqa: PLC0415
  
          image_buffer = io.BytesIO()
          await run_sync(image.save, image_buffer, format="PNG")
@@ -254,8 +256,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
              return cached_result
  
          if ocr_cache.is_processing(**cache_kwargs):
-             import anyio
- 
              event = ocr_cache.mark_processing(**cache_kwargs)
              await anyio.to_thread.run_sync(event.wait)
  
@@ -286,7 +286,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
          path: Path,
          **kwargs: Unpack[TesseractConfig],
      ) -> ExtractionResult:
-         from kreuzberg._utils._cache import get_ocr_cache
+         from kreuzberg._utils._cache import get_ocr_cache  # noqa: PLC0415
  
          try:
              stat = path.stat()
@@ -314,8 +314,6 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
              return cached_result
  
          if ocr_cache.is_processing(**cache_kwargs):
-             import anyio
- 
              event = ocr_cache.mark_processing(**cache_kwargs)
              await anyio.to_thread.run_sync(event.wait)
  
@@ -411,7 +409,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
  
      def process_image_sync(
          self,
-         image: Image,
+         image: PILImage,
          **kwargs: Unpack[TesseractConfig],
      ) -> ExtractionResult:
          """Synchronously process an image and extract its text and metadata.
@@ -423,7 +421,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
          Returns:
              The extraction result object
          """
-         from kreuzberg._utils._cache import get_ocr_cache
+         from kreuzberg._utils._cache import get_ocr_cache  # noqa: PLC0415
  
          image_buffer = io.BytesIO()
          image.save(image_buffer, format="PNG")
@@ -482,7 +480,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
          Returns:
              The extraction result object
          """
-         from kreuzberg._utils._cache import get_ocr_cache
+         from kreuzberg._utils._cache import get_ocr_cache  # noqa: PLC0415
  
          file_info = self._get_file_info(path)
  
@@ -771,8 +769,6 @@ def _process_image_bytes_with_tesseract(
          OCR result as dictionary.
      """
      try:
-         from PIL import Image
- 
          with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
              with Image.open(io.BytesIO(image_bytes)) as image:
                  image.save(tmp_image.name, format="PNG")
@@ -810,7 +806,7 @@ class TesseractProcessPool:
              max_processes: Maximum number of processes.
              memory_limit_gb: Memory limit in GB.
          """
-         from kreuzberg._utils._process_pool import ProcessPoolManager
+         from kreuzberg._utils._process_pool import ProcessPoolManager  # noqa: PLC0415
  
          self.config = config or TesseractConfig()
          self.process_manager = ProcessPoolManager(
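
A detail worth calling out in `_tesseract.py`: the new runtime import `from PIL import Image` binds the `PIL.Image` module, which would shadow the type-only `from PIL.Image import Image` class import, hence the `PILImage` alias. The pattern in miniature (requires Pillow; `invert` is a toy function for the demo):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

from PIL import Image, ImageOps  # runtime: PIL.Image is a *module* here

if TYPE_CHECKING:
    from PIL.Image import Image as PILImage  # type checking only: the Image *class*


def invert(image: PILImage) -> PILImage:
    """Annotate with the class alias while calling module-level helpers."""
    return ImageOps.invert(image.convert("RGB"))


img = Image.new("RGB", (2, 2), "white")
print(invert(img).getpixel((0, 0)))  # (0, 0, 0)
```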
kreuzberg/_types.py CHANGED
@@ -8,7 +8,11 @@ from typing import TYPE_CHECKING, Any, Literal, TypedDict
  import msgspec
  
  from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
- from kreuzberg._utils._table import export_table_to_csv, export_table_to_tsv, extract_table_structure_info
+ from kreuzberg._utils._table import (
+     export_table_to_csv,
+     export_table_to_tsv,
+     extract_table_structure_info,
+ )
  from kreuzberg.exceptions import ValidationError
  
  if sys.version_info < (3, 11):  # pragma: no cover
@@ -228,6 +232,12 @@ class ExtractionResult:
      """Extracted keywords and their scores, if keyword extraction is enabled."""
      detected_languages: list[str] | None = None
      """Languages detected in the extracted content, if language detection is enabled."""
+     document_type: str | None = None
+     """Detected document type, if document type detection is enabled."""
+     document_type_confidence: float | None = None
+     """Confidence of the detected document type."""
+     layout: DataFrame | None = field(default=None, repr=False, hash=False)
+     """Internal layout data from OCR, not for public use."""
  
      def to_dict(self, include_none: bool = False) -> dict[str, Any]:
          """Converts the ExtractionResult to a dictionary.
@@ -339,6 +349,12 @@ class ExtractionConfig:
      """Configuration for language detection. If None, uses default settings."""
      spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
      """Configuration for spaCy entity extraction. If None, uses default settings."""
+     auto_detect_document_type: bool = False
+     """Whether to automatically detect the document type."""
+     document_type_confidence_threshold: float = 0.7
+     """Confidence threshold for document type detection."""
+     document_classification_mode: Literal["text", "vision"] = "text"
+     """The mode to use for document classification."""
      enable_quality_processing: bool = True
      """Whether to apply quality post-processing to improve extraction results."""
  
@@ -349,9 +365,9 @@ class ExtractionConfig:
              object.__setattr__(self, "post_processing_hooks", tuple(self.post_processing_hooks))
          if self.validators is not None and isinstance(self.validators, list):
              object.__setattr__(self, "validators", tuple(self.validators))
-         from kreuzberg._ocr._easyocr import EasyOCRConfig
-         from kreuzberg._ocr._paddleocr import PaddleOCRConfig
-         from kreuzberg._ocr._tesseract import TesseractConfig
+         from kreuzberg._ocr._easyocr import EasyOCRConfig  # noqa: PLC0415
+         from kreuzberg._ocr._paddleocr import PaddleOCRConfig  # noqa: PLC0415
+         from kreuzberg._ocr._tesseract import TesseractConfig  # noqa: PLC0415
  
          if self.ocr_backend is None and self.ocr_config is not None:
              raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")
@@ -381,14 +397,14 @@ class ExtractionConfig:
  
          # Lazy load and cache default configs instead of creating new instances
          if self.ocr_backend == "tesseract":
-             from kreuzberg._ocr._tesseract import TesseractConfig
+             from kreuzberg._ocr._tesseract import TesseractConfig  # noqa: PLC0415
  
              return asdict(TesseractConfig())
          if self.ocr_backend == "easyocr":
-             from kreuzberg._ocr._easyocr import EasyOCRConfig
+             from kreuzberg._ocr._easyocr import EasyOCRConfig  # noqa: PLC0415
  
              return asdict(EasyOCRConfig())
          # paddleocr
-         from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+         from kreuzberg._ocr._paddleocr import PaddleOCRConfig  # noqa: PLC0415
  
          return asdict(PaddleOCRConfig())
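
The three new `ExtractionConfig` fields are the public switchboard for the classifier. A usage sketch (assumes kreuzberg 3.9.0 with the `auto-classify-document-type` extra installed and the usual top-level re-exports; `invoice.pdf` is a placeholder path):

```python
from kreuzberg import ExtractionConfig, extract_file_sync

config = ExtractionConfig(
    auto_detect_document_type=True,          # opt in; default is False
    document_type_confidence_threshold=0.7,  # default; raise for stricter matches
    document_classification_mode="text",     # or "vision" to score OCR layout positions
)

result = extract_file_sync("invoice.pdf", config=config)
print(result.document_type, result.document_type_confidence)  # e.g. 'invoice' 0.8
```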
kreuzberg/_utils/_cache.py CHANGED
@@ -7,6 +7,7 @@ import os
  import threading
  import time
  from contextlib import suppress
+ from io import StringIO
  from pathlib import Path
  from typing import Any, Generic, TypeVar
  
@@ -126,9 +127,7 @@ class KreuzbergCache(Generic[T]):
          data = cached_data["data"]
  
          if cached_data.get("type") == "TableDataList" and isinstance(data, list):
-             from io import StringIO
- 
-             import pandas as pd
+             import pandas as pd  # noqa: PLC0415
  
              deserialized_data = []
              for item in data:
kreuzberg/_utils/_device.py CHANGED
@@ -141,7 +141,7 @@ def get_device_memory_info(device: DeviceInfo) -> tuple[float | None, float | No
  def _is_cuda_available() -> bool:
      """Check if CUDA is available."""
      try:
-         import torch  # type: ignore[import-not-found,unused-ignore]
+         import torch  # type: ignore[import-not-found,unused-ignore]  # noqa: PLC0415
  
          return bool(torch.cuda.is_available())
      except ImportError:
@@ -151,7 +151,7 @@ def _is_cuda_available() -> bool:
  def _is_mps_available() -> bool:
      """Check if MPS (Apple Silicon) is available."""
      try:
-         import torch  # type: ignore[import-not-found,unused-ignore]
+         import torch  # type: ignore[import-not-found,unused-ignore]  # noqa: PLC0415
  
          return bool(torch.backends.mps.is_available())
      except ImportError:
@@ -163,7 +163,7 @@ def _get_cuda_devices() -> list[DeviceInfo]:
      devices: list[DeviceInfo] = []
  
      try:
-         import torch
+         import torch  # noqa: PLC0415
  
          if not torch.cuda.is_available():
              return devices
@@ -199,7 +199,7 @@ def _get_cuda_devices() -> list[DeviceInfo]:
  def _get_mps_device() -> DeviceInfo | None:
      """Get information about the MPS device."""
      try:
-         import torch
+         import torch  # noqa: PLC0415
  
          if not torch.backends.mps.is_available():
              return None
@@ -216,7 +216,7 @@ def _get_mps_device() -> DeviceInfo | None:
  def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
      """Get CUDA memory information for a specific device."""
      try:
-         import torch
+         import torch  # noqa: PLC0415
  
          if not torch.cuda.is_available():
              return None, None
@@ -329,7 +329,7 @@ def cleanup_device_memory(device: DeviceInfo) -> None:
      """
      if device.device_type == "cuda":
          try:
-             import torch
+             import torch  # noqa: PLC0415
  
              if torch.cuda.is_available():
                  torch.cuda.empty_cache()
@@ -338,7 +338,7 @@ def cleanup_device_memory(device: DeviceInfo) -> None:
  
      elif device.device_type == "mps":
          try:
-             import torch
+             import torch  # noqa: PLC0415
  
              if torch.backends.mps.is_available():
                  torch.mps.empty_cache()
kreuzberg/cli.py CHANGED
@@ -160,7 +160,7 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
          progress.add_task("Extracting text...", total=None)
  
          try:
-             import magic  # type: ignore[import-not-found]
+             import magic  # type: ignore[import-not-found]  # noqa: PLC0415
  
              mime_type = magic.from_buffer(input_bytes, mime=True)
          except ImportError:
@@ -260,7 +260,7 @@ def cli(ctx: click.Context) -> None:
  @click.option("--paddleocr-languages", help="PaddleOCR language codes (comma-separated, e.g., 'en,german')")
  @click.pass_context
  def extract(  # noqa: PLR0913
-     ctx: click.Context,  # noqa: ARG001
+     _: click.Context,
      file: Path | None,
      output: Path | None,
      force_ocr: bool,
kreuzberg/extraction.py CHANGED
@@ -7,15 +7,15 @@ from typing import TYPE_CHECKING, Any, Final, cast
  
  import anyio
  
- from kreuzberg import ExtractionResult
  from kreuzberg._chunker import get_chunker
+ from kreuzberg._document_classification import auto_detect_document_type
  from kreuzberg._entity_extraction import extract_entities, extract_keywords
  from kreuzberg._language_detection import detect_languages
  from kreuzberg._mime_types import (
      validate_mime_type,
  )
  from kreuzberg._registry import ExtractorRegistry
- from kreuzberg._types import ExtractionConfig
+ from kreuzberg._types import ExtractionConfig, ExtractionResult
  from kreuzberg._utils._document_cache import get_document_cache
  from kreuzberg._utils._errors import create_error_context
  from kreuzberg._utils._string import safe_decode
@@ -30,7 +30,9 @@ if TYPE_CHECKING:
  DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()
  
  
- def _validate_and_post_process_helper(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
+ def _validate_and_post_process_helper(
+     result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
+ ) -> ExtractionResult:
      if config.chunk_content:
          result.chunks = _handle_chunk_content(
              mime_type=result.mime_type,
@@ -62,14 +64,19 @@ def _validate_and_post_process_helper(result: ExtractionResult, config: Extracti
              config=config.language_detection_config,
          )
  
+     if config.auto_detect_document_type:
+         result = auto_detect_document_type(result, config, file_path=file_path)
+ 
      return result
  
  
- async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
+ async def _validate_and_post_process_async(
+     result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
+ ) -> ExtractionResult:
      for validator in config.validators or []:
          await run_maybe_sync(validator, result)
  
-     result = _validate_and_post_process_helper(result, config)
+     result = _validate_and_post_process_helper(result, config, file_path)
  
      for post_processor in config.post_processing_hooks or []:
          result = await run_maybe_sync(post_processor, result)
@@ -77,11 +84,13 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
      return result
  
  
- def _validate_and_post_process_sync(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
+ def _validate_and_post_process_sync(
+     result: ExtractionResult, config: ExtractionConfig, file_path: Path | None = None
+ ) -> ExtractionResult:
      for validator in config.validators or []:
          run_sync_only(validator, result)
  
-     result = _validate_and_post_process_helper(result, config)
+     result = _validate_and_post_process_helper(result, config, file_path)
  
      for post_processor in config.post_processing_hooks or []:
          result = run_sync_only(post_processor, result)
@@ -172,7 +181,7 @@ async def extract_file(
          metadata={},
      )
  
-     result = await _validate_and_post_process_async(result=result, config=config)
+     result = await _validate_and_post_process_async(result=result, config=config, file_path=path)
  
      cache.set(path, config, result)
  
@@ -357,7 +366,7 @@ def extract_file_sync(
          metadata={},
      )
  
-     result = _validate_and_post_process_sync(result=result, config=config)
+     result = _validate_and_post_process_sync(result=result, config=config, file_path=path)
  
      cache.set(path, config, result)
  
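The `file_path` threading shown above is what lets "vision" mode re-run Tesseract on the original file during post-processing. The async equivalent of the previous usage sketch (same assumptions; `contract.pdf` is a placeholder path):

```python
import asyncio

from kreuzberg import ExtractionConfig, extract_file


async def main() -> None:
    config = ExtractionConfig(
        auto_detect_document_type=True,
        document_classification_mode="vision",  # classify from OCR layout; needs the file path
    )
    result = await extract_file("contract.pdf", config=config)
    print(result.document_type, result.document_type_confidence)


asyncio.run(main())
```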
kreuzberg-3.8.2.dist-info/METADATA → kreuzberg-3.9.0.dist-info/METADATA RENAMED
@@ -1,13 +1,13 @@
  Metadata-Version: 2.4
  Name: kreuzberg
- Version: 3.8.2
+ Version: 3.9.0
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
  Project-URL: documentation, https://kreuzberg.dev
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
  License: MIT
  License-File: LICENSE
- Keywords: async,document-analysis,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
+ Keywords: async,document-analysis,document-classification,document-intelligence,document-processing,extensible,information-extraction,mcp,metadata-extraction,model-context-protocol,ocr,pandoc,pdf-extraction,pdfium,plugin-architecture,rag,retrieval-augmented-generation,structured-data,table-extraction,tesseract,text-extraction
  Classifier: Development Status :: 5 - Production/Stable
  Classifier: Intended Audience :: Developers
  Classifier: Intended Audience :: Information Technology
@@ -60,6 +60,9 @@ Requires-Dist: spacy>=3.8.7; extra == 'all'
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
  Provides-Extra: api
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
+ Provides-Extra: auto-classify-document-type
+ Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
+ Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
  Provides-Extra: chunking
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
  Provides-Extra: cli
@@ -88,7 +91,7 @@ Description-Content-Type: text/markdown
  [![Documentation](https://img.shields.io/badge/docs-kreuzberg.dev-blue)](https://kreuzberg.dev/)
  [![Benchmarks](https://img.shields.io/badge/benchmarks-fastest%20CPU-orange)](https://benchmarks.kreuzberg.dev/)
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
- [![Test Coverage](https://img.shields.io/badge/coverage-95%25-green)](https://github.com/Goldziher/kreuzberg)
+ [![DeepSource](https://app.deepsource.com/gh/Goldziher/kreuzberg.svg/?label=code+coverage&show_trend=true&token=U8AW1VWWSLwVhrbtL8LmLBDN)](https://app.deepsource.com/gh/Goldziher/kreuzberg/)
  
  **A document intelligence framework for Python.** Extract text, metadata, and structured information from diverse document formats through a unified, extensible API. Built on established open source foundations including Pandoc, PDFium, and Tesseract.
  
@@ -103,6 +106,7 @@ Description-Content-Type: text/markdown
  - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
  - **OCR Integration**: Multiple OCR engines (Tesseract, EasyOCR, PaddleOCR) with automatic fallback
  - **Table Detection**: Structured table extraction with cell-level precision via GMFT integration
+ - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)
  
  ### Technical Architecture
  
kreuzberg-3.8.2.dist-info/RECORD → kreuzberg-3.9.0.dist-info/RECORD RENAMED
@@ -1,18 +1,19 @@
  kreuzberg/__init__.py,sha256=0OJ_jNKbS6GxzWC5-EfRCiE80as_ya0-wwyNsTYbxzY,1721
  kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
- kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
- kreuzberg/_config.py,sha256=_9JU88ChId8dWUjZ13ueo9_JoFekkyzuv7rZpFkrPZk,12966
+ kreuzberg/_chunker.py,sha256=QmYbPHPE36ztMT70xPwg_Y4NIftCDl0wyufg5X9lmTo,1932
+ kreuzberg/_config.py,sha256=EvrBFAawjfKgXu49tACi4CuMmmoIRt_EzbHayZqM_jU,12983
  kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
- kreuzberg/_entity_extraction.py,sha256=woNxARG27Z3T_l6w6N-dbt1PPe1IHptFMOZY_6etv54,7819
- kreuzberg/_gmft.py,sha256=Q46CyBxRxY_oDGpSuXMOJ7qfR9LwuCKXnrl60wcPvU4,25286
+ kreuzberg/_document_classification.py,sha256=8XVTKh8ohsb4mbKw2gPFr5OB6v4dWuzXhFE_63vHLrw,5189
+ kreuzberg/_entity_extraction.py,sha256=43VxtCPbuSrfi-XGSop-VRoZadJcNe02qRGwjGSaR0c,7862
+ kreuzberg/_gmft.py,sha256=JWDeBYWSDkh3tLUhwTmW5CbIBEig2A3o712BLshk7hE,25533
  kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
- kreuzberg/_mime_types.py,sha256=OhJ6gEyyLHjyvRtkk37zyLFBsRcSd_QybBaV8TxinIg,8471
+ kreuzberg/_mime_types.py,sha256=2warRVqfBUNIg8JBg8yP4pRqaMPvwINosHMkJwtH_Fc,8488
  kreuzberg/_playa.py,sha256=9z4If0WHxbYQxfb8xT7T96L9Du2Fj3Ar5-rF0OHHiMM,11877
  kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
- kreuzberg/_types.py,sha256=GisvL0ps2LCc0heKopFwSyrEbzH3WpDxaeev4vn59X4,14257
- kreuzberg/cli.py,sha256=vTGS2TJlFTNMWp5LwZd3G2SS8u0m6bhQkH9n6a1oOoM,12439
+ kreuzberg/_types.py,sha256=Si-Kb58HgE4ckGyZnJFqbWRbCNbdyC_Y0-p75aQP838,15065
+ kreuzberg/cli.py,sha256=lmthZa0x8pz7SQfCoPPdTaUF9aQZ8W4w5FlnPcsGr9k,12438
  kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
- kreuzberg/extraction.py,sha256=UmeEVN-eSile4HMxP0iqG9092BrsH5_zSZNVHhwy0ko,16993
+ kreuzberg/extraction.py,sha256=Kt1mOxdlOb35yVOdpdhiRPuTgA9BW_TTG9qwCkSxSkc,17332
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  kreuzberg/_api/main.py,sha256=g3kqXUfSie2pcw3-EWOM4TAoJUqM7yj2e-cBQJ_bmYc,3253
@@ -20,22 +21,22 @@ kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3h
  kreuzberg/_extractors/_base.py,sha256=yNVQSECFad-8_MjqpQZ4q0jQoNdzP6-tqw6l3TfgsMc,4418
  kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
  kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
- kreuzberg/_extractors/_image.py,sha256=eZ7mR4F-mTwYwUzd70xrY7SZYZrNiDxnP5bYDY5P75U,4455
+ kreuzberg/_extractors/_image.py,sha256=OmkqR5Cd6bTM_qHEdNHeXa5eK-3KvtLgHX-JE5oZWec,4483
  kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
- kreuzberg/_extractors/_pdf.py,sha256=d-hG_mhAMj22bQ35YuP2nq017z27_2Pp08r1qyHxlYI,16676
+ kreuzberg/_extractors/_pdf.py,sha256=UlliWggWHuVwwJE-bRa7H9-_cieSa8kdrQP3x_GOxxY,17018
  kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
- kreuzberg/_extractors/_spread_sheet.py,sha256=vPxEDAyH-gDoVXSg-A0guOjOfaWIuRI3i2NU8xPwhK8,13695
- kreuzberg/_extractors/_structured.py,sha256=d0x6EyRimr8eWmr1qPb7HRWnrbKBuD-GpIrZd8XJp0o,5824
+ kreuzberg/_extractors/_spread_sheet.py,sha256=iagiyJsnl-89OP1eqmEv8jWl7gZBJm2x0YOyqBgLasA,13733
+ kreuzberg/_extractors/_structured.py,sha256=J7op5ZZ663a_CkE2SB5eKjPQPXKHdqPst-GFCJMNGqw,5727
  kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
  kreuzberg/_mcp/server.py,sha256=Ab0w7kR3m7_L1cfhYHiC8HqDL282vt4uBYwYc9w9E08,8703
  kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
- kreuzberg/_ocr/_base.py,sha256=CUzYMsJjCqCmHzWckmDeIB2L5hd261xrPrK8Ql-Gdm0,3876
- kreuzberg/_ocr/_easyocr.py,sha256=c2ndpDlIHvAI2WyvQUXLQ1hb6XynKeKARsXQcQ3ntJ0,17110
- kreuzberg/_ocr/_paddleocr.py,sha256=fab8a-3cvDgnt97qF-Km9ZfmkacFeKD_g15O8HXYRVc,17492
- kreuzberg/_ocr/_tesseract.py,sha256=r1g_PCAXgJbZ0RPGn4aSxctZ0F9lLvI3zLGLEPAnviI,31455
+ kreuzberg/_ocr/_base.py,sha256=urvsLRgOmVYHjxil_IsSL69FmMnboklC4CHAjdBQLKQ,3893
+ kreuzberg/_ocr/_easyocr.py,sha256=pw2uDmULuMQ9T1Gl4axP_ev7-qwjLt1mJHHyZ34P_FI,17178
+ kreuzberg/_ocr/_paddleocr.py,sha256=s75aQJILXm1ZbacyZiLPXh6jEAg9tk2NYnwPnfSDrRU,17543
+ kreuzberg/_ocr/_tesseract.py,sha256=teLMH1pBhpcmEXDcyZlv56hYINLGMuaKZ0CQtcu_czQ,31510
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- kreuzberg/_utils/_cache.py,sha256=H2d6JOiTTAoJx5HPJoToCk4ik-ztTRNEJRrHgcSUTLs,15249
- kreuzberg/_utils/_device.py,sha256=PC8YUPE95pzOyU7sU_icqNZpSfi6HZlEFfmWcV1Uees,10226
+ kreuzberg/_utils/_cache.py,sha256=hYd_a5Ni5VJBE1XU_eN9gvQ5gg0FRsdbRgmJe-OIJHM,15253
+ kreuzberg/_utils/_device.py,sha256=arVrJOSp_2LbbN6lu_rMEUOezzRogdWdkF8d5q5Bg8U,10345
  kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
  kreuzberg/_utils/_errors.py,sha256=UsktQ_p7eOj9crPsFDg8HgRSE5-IpuFC7y1e6dDI_fY,6503
  kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
@@ -46,8 +47,8 @@ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6
  kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
  kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
  kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
- kreuzberg-3.8.2.dist-info/METADATA,sha256=RiP64og5wOaf9gPZ7CwOsNYYx9GBnVMg8orgqZdncKA,11466
- kreuzberg-3.8.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
- kreuzberg-3.8.2.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
- kreuzberg-3.8.2.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
- kreuzberg-3.8.2.dist-info/RECORD,,
+ kreuzberg-3.9.0.dist-info/METADATA,sha256=C83JYzqxhGHhrqWDUmo0eJwK_2szx9ZQt3cnkocgwBY,11876
+ kreuzberg-3.9.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+ kreuzberg-3.9.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
+ kreuzberg-3.9.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+ kreuzberg-3.9.0.dist-info/RECORD,,