kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Files changed (46)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_config.py +248 -204
  5. kreuzberg/_document_classification.py +0 -8
  6. kreuzberg/_entity_extraction.py +1 -93
  7. kreuzberg/_extractors/_base.py +0 -5
  8. kreuzberg/_extractors/_email.py +1 -11
  9. kreuzberg/_extractors/_html.py +9 -12
  10. kreuzberg/_extractors/_image.py +1 -23
  11. kreuzberg/_extractors/_pandoc.py +10 -89
  12. kreuzberg/_extractors/_pdf.py +39 -92
  13. kreuzberg/_extractors/_presentation.py +0 -17
  14. kreuzberg/_extractors/_spread_sheet.py +13 -53
  15. kreuzberg/_extractors/_structured.py +1 -4
  16. kreuzberg/_gmft.py +14 -138
  17. kreuzberg/_language_detection.py +1 -22
  18. kreuzberg/_mcp/__init__.py +0 -2
  19. kreuzberg/_mcp/server.py +3 -10
  20. kreuzberg/_mime_types.py +1 -2
  21. kreuzberg/_ocr/_easyocr.py +21 -108
  22. kreuzberg/_ocr/_paddleocr.py +16 -94
  23. kreuzberg/_ocr/_table_extractor.py +260 -0
  24. kreuzberg/_ocr/_tesseract.py +906 -264
  25. kreuzberg/_playa.py +5 -4
  26. kreuzberg/_types.py +638 -40
  27. kreuzberg/_utils/_cache.py +88 -90
  28. kreuzberg/_utils/_device.py +0 -18
  29. kreuzberg/_utils/_document_cache.py +0 -2
  30. kreuzberg/_utils/_errors.py +0 -3
  31. kreuzberg/_utils/_pdf_lock.py +0 -2
  32. kreuzberg/_utils/_process_pool.py +19 -19
  33. kreuzberg/_utils/_quality.py +0 -43
  34. kreuzberg/_utils/_ref.py +48 -0
  35. kreuzberg/_utils/_serialization.py +0 -5
  36. kreuzberg/_utils/_string.py +9 -39
  37. kreuzberg/_utils/_sync.py +0 -1
  38. kreuzberg/_utils/_table.py +50 -57
  39. kreuzberg/cli.py +54 -74
  40. kreuzberg/extraction.py +39 -32
  41. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
  42. kreuzberg-3.13.0.dist-info/RECORD +56 -0
  43. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  44. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
  45. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py CHANGED
@@ -1,16 +1,20 @@
1
1
  from importlib.metadata import version
2
2
 
3
- from kreuzberg._config import discover_and_load_config, load_config_from_path, try_discover_config
4
- from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
5
- from kreuzberg._gmft import GMFTConfig
6
- from kreuzberg._language_detection import LanguageDetectionConfig
7
- from kreuzberg._ocr._easyocr import EasyOCRConfig
8
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
9
- from kreuzberg._ocr._tesseract import TesseractConfig
10
-
11
- from ._ocr._tesseract import PSMMode
12
3
  from ._registry import ExtractorRegistry
13
- from ._types import Entity, ExtractionConfig, ExtractionResult, Metadata, TableData
4
+ from ._types import (
5
+ EasyOCRConfig,
6
+ Entity,
7
+ ExtractionConfig,
8
+ ExtractionResult,
9
+ GMFTConfig,
10
+ LanguageDetectionConfig,
11
+ Metadata,
12
+ PaddleOCRConfig,
13
+ PSMMode,
14
+ SpacyEntityExtractionConfig,
15
+ TableData,
16
+ TesseractConfig,
17
+ )
14
18
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
15
19
  from .extraction import (
16
20
  batch_extract_bytes,
@@ -49,11 +53,8 @@ __all__ = [
49
53
  "batch_extract_bytes_sync",
50
54
  "batch_extract_file",
51
55
  "batch_extract_file_sync",
52
- "discover_and_load_config",
53
56
  "extract_bytes",
54
57
  "extract_bytes_sync",
55
58
  "extract_file",
56
59
  "extract_file_sync",
57
- "load_config_from_path",
58
- "try_discover_config",
59
60
  ]
kreuzberg/__main__.py CHANGED
@@ -1,5 +1,3 @@
1
- """Entry point for running kreuzberg as a module with python -m kreuzberg."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
  from kreuzberg.cli import cli
kreuzberg/_api/main.py CHANGED
@@ -1,20 +1,24 @@
1
1
  from __future__ import annotations
2
2
 
3
- from json import dumps
4
- from typing import TYPE_CHECKING, Annotated, Any
3
+ from functools import lru_cache
4
+ from json import dumps, loads
5
+ from typing import TYPE_CHECKING, Annotated, Any, Literal
5
6
 
6
7
  import msgspec
7
8
 
8
9
  from kreuzberg import (
10
+ EasyOCRConfig,
9
11
  ExtractionConfig,
10
12
  ExtractionResult,
11
13
  KreuzbergError,
12
14
  MissingDependencyError,
15
+ PaddleOCRConfig,
13
16
  ParsingError,
17
+ TesseractConfig,
14
18
  ValidationError,
15
19
  batch_extract_bytes,
16
20
  )
17
- from kreuzberg._config import try_discover_config
21
+ from kreuzberg._config import discover_config
18
22
 
19
23
  if TYPE_CHECKING:
20
24
  from litestar.datastructures import UploadFile
@@ -65,17 +69,123 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
65
69
  )
66
70
 
67
71
 
72
+ def _convert_value_type(current_value: Any, new_value: Any) -> Any:
73
+ if isinstance(current_value, bool):
74
+ if isinstance(new_value, str):
75
+ return str(new_value).lower() in ("true", "1", "yes", "on")
76
+ return bool(new_value)
77
+ if isinstance(current_value, int) and not isinstance(new_value, bool):
78
+ return int(new_value) if new_value is not None else current_value
79
+ if isinstance(current_value, float):
80
+ return float(new_value) if new_value is not None else current_value
81
+ return new_value
82
+
83
+
84
+ def _create_ocr_config(
85
+ ocr_backend: Literal["tesseract", "easyocr", "paddleocr"] | None, config_dict: dict[str, Any]
86
+ ) -> Any:
87
+ if ocr_backend == "tesseract":
88
+ return TesseractConfig(**config_dict)
89
+ if ocr_backend == "easyocr":
90
+ return EasyOCRConfig(**config_dict)
91
+ if ocr_backend == "paddleocr":
92
+ return PaddleOCRConfig(**config_dict)
93
+ return config_dict
94
+
95
+
96
+ @lru_cache(maxsize=128)
97
+ def _merge_configs_cached(
98
+ static_config: ExtractionConfig | None,
99
+ query_params: tuple[tuple[str, Any], ...],
100
+ header_config: tuple[tuple[str, Any], ...] | None,
101
+ ) -> ExtractionConfig:
102
+ """Cached implementation of merge_configs with hashable parameters."""
103
+ base_config = static_config or ExtractionConfig()
104
+ config_dict = base_config.to_dict()
105
+
106
+ query_dict = dict(query_params) if query_params else {}
107
+ for key, value in query_dict.items():
108
+ if value is not None and key in config_dict:
109
+ config_dict[key] = _convert_value_type(config_dict[key], value)
110
+
111
+ if header_config:
112
+ header_dict = dict(header_config)
113
+ for key, value in header_dict.items():
114
+ if key in config_dict:
115
+ config_dict[key] = value
116
+
117
+ if "ocr_config" in config_dict and isinstance(config_dict["ocr_config"], dict):
118
+ ocr_backend = config_dict.get("ocr_backend")
119
+ config_dict["ocr_config"] = _create_ocr_config(ocr_backend, config_dict["ocr_config"])
120
+
121
+ return ExtractionConfig(**config_dict)
122
+
123
+
124
+ def merge_configs(
125
+ static_config: ExtractionConfig | None,
126
+ query_params: dict[str, Any],
127
+ header_config: dict[str, Any] | None,
128
+ ) -> ExtractionConfig:
129
+ """Merge configurations with precedence: header > query > static > default."""
130
+ query_tuple = tuple(sorted(query_params.items())) if query_params else ()
131
+ header_tuple = tuple(sorted(header_config.items())) if header_config else None
132
+
133
+ return _merge_configs_cached(static_config, query_tuple, header_tuple)
134
+
135
+
68
136
  @post("/extract", operation_id="ExtractFiles")
69
- async def handle_files_upload(
137
+ async def handle_files_upload( # noqa: PLR0913
138
+ request: Request[Any, Any, Any],
70
139
  data: Annotated[list[UploadFile], Body(media_type=RequestEncodingType.MULTI_PART)],
140
+ chunk_content: str | bool | None = None,
141
+ max_chars: int | None = None,
142
+ max_overlap: int | None = None,
143
+ extract_tables: str | bool | None = None,
144
+ extract_entities: str | bool | None = None,
145
+ extract_keywords: str | bool | None = None,
146
+ keyword_count: int | None = None,
147
+ force_ocr: str | bool | None = None,
148
+ ocr_backend: Literal["tesseract", "easyocr", "paddleocr"] | None = None,
149
+ auto_detect_language: str | bool | None = None,
150
+ pdf_password: str | None = None,
71
151
  ) -> list[ExtractionResult]:
72
- """Extracts text content from an uploaded file."""
73
- # Try to discover configuration from files
74
- config = try_discover_config()
152
+ """Extracts text content from uploaded files with optional runtime configuration.
153
+
154
+ Configuration can be provided via:
155
+ 1. Query parameters for common settings
156
+ 2. X-Extraction-Config header for complex nested configurations (JSON format)
157
+ 3. Static configuration file (kreuzberg.toml or pyproject.toml)
158
+
159
+ Precedence: Header config > Query params > Static config > Defaults
160
+ """
161
+ static_config = discover_config()
162
+
163
+ query_params = {
164
+ "chunk_content": chunk_content,
165
+ "max_chars": max_chars,
166
+ "max_overlap": max_overlap,
167
+ "extract_tables": extract_tables,
168
+ "extract_entities": extract_entities,
169
+ "extract_keywords": extract_keywords,
170
+ "keyword_count": keyword_count,
171
+ "force_ocr": force_ocr,
172
+ "ocr_backend": ocr_backend,
173
+ "auto_detect_language": auto_detect_language,
174
+ "pdf_password": pdf_password,
175
+ }
176
+
177
+ header_config = None
178
+ if config_header := request.headers.get("X-Extraction-Config"):
179
+ try:
180
+ header_config = loads(config_header)
181
+ except Exception as e:
182
+ raise ValidationError(f"Invalid JSON in X-Extraction-Config header: {e}", context={"error": str(e)}) from e
183
+
184
+ final_config = merge_configs(static_config, query_params, header_config)
75
185
 
76
186
  return await batch_extract_bytes(
77
187
  [(await file.read(), file.content_type) for file in data],
78
- config=config or ExtractionConfig(),
188
+ config=final_config,
79
189
  )
80
190
 
81
191
 
@@ -88,7 +198,7 @@ async def health_check() -> dict[str, str]:
88
198
  @get("/config", operation_id="GetConfiguration")
89
199
  async def get_configuration() -> dict[str, Any]:
90
200
  """Get the current configuration."""
91
- config = try_discover_config()
201
+ config = discover_config()
92
202
  if config is None:
93
203
  return {"message": "No configuration file found", "config": None}
94
204