kreuzberg 3.13.3__py3-none-any.whl → 3.14.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_api/main.py CHANGED
@@ -6,6 +6,7 @@ from json import dumps, loads
6
6
  from typing import TYPE_CHECKING, Annotated, Any, Literal
7
7
 
8
8
  import msgspec
9
+ from typing_extensions import TypedDict
9
10
 
10
11
  from kreuzberg import (
11
12
  EasyOCRConfig,
@@ -24,11 +25,28 @@ from kreuzberg._config import discover_config
24
25
  if TYPE_CHECKING:
25
26
  from litestar.datastructures import UploadFile
26
27
 
28
+
29
+ class HealthResponse(TypedDict):
30
+ """Response model for health check endpoint."""
31
+
32
+ status: str
33
+
34
+
35
+ class ConfigurationResponse(TypedDict):
36
+ """Response model for configuration endpoint."""
37
+
38
+ message: str
39
+ config: dict[str, Any] | None
40
+
41
+
27
42
  try:
28
43
  from litestar import Litestar, Request, Response, get, post
29
44
  from litestar.contrib.opentelemetry import OpenTelemetryConfig, OpenTelemetryPlugin
30
45
  from litestar.enums import RequestEncodingType
31
46
  from litestar.logging import StructLoggingConfig
47
+ from litestar.openapi.config import OpenAPIConfig
48
+ from litestar.openapi.spec.contact import Contact
49
+ from litestar.openapi.spec.license import License
32
50
  from litestar.params import Body
33
51
  from litestar.status_codes import (
34
52
  HTTP_400_BAD_REQUEST,
@@ -71,7 +89,6 @@ def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError
71
89
 
72
90
 
73
91
  def general_exception_handler(request: Request[Any, Any, Any], exception: Exception) -> Response[Any]:
74
- """Temporary handler to catch ALL exceptions for debugging."""
75
92
  error_type = type(exception).__name__
76
93
  error_message = str(exception)
77
94
  traceback_str = traceback.format_exc()
@@ -127,7 +144,6 @@ def _merge_configs_cached(
127
144
  query_params: tuple[tuple[str, Any], ...],
128
145
  header_config: tuple[tuple[str, Any], ...] | None,
129
146
  ) -> ExtractionConfig:
130
- """Cached implementation of merge_configs with hashable parameters."""
131
147
  base_config = static_config or ExtractionConfig()
132
148
  config_dict = base_config.to_dict()
133
149
 
@@ -150,7 +166,6 @@ def _merge_configs_cached(
150
166
 
151
167
 
152
168
  def _make_hashable(obj: Any) -> Any:
153
- """Convert nested dicts/lists to hashable tuples."""
154
169
  if isinstance(obj, dict):
155
170
  return tuple(sorted((k, _make_hashable(v)) for k, v in obj.items()))
156
171
  if isinstance(obj, list):
@@ -163,7 +178,6 @@ def merge_configs(
163
178
  query_params: dict[str, Any],
164
179
  header_config: dict[str, Any] | None,
165
180
  ) -> ExtractionConfig:
166
- """Merge configurations with precedence: header > query > static > default."""
167
181
  query_tuple = tuple(sorted(query_params.items())) if query_params else ()
168
182
  header_tuple = _make_hashable(header_config) if header_config else None
169
183
 
@@ -186,14 +200,35 @@ async def handle_files_upload( # noqa: PLR0913
186
200
  auto_detect_language: str | bool | None = None,
187
201
  pdf_password: str | None = None,
188
202
  ) -> list[ExtractionResult]:
189
- """Extracts text content from uploaded files with optional runtime configuration.
190
-
191
- Configuration can be provided via:
192
- 1. Query parameters for common settings
193
- 2. X-Extraction-Config header for complex nested configurations (JSON format)
194
- 3. Static configuration file (kreuzberg.toml or pyproject.toml)
195
-
196
- Precedence: Header config > Query params > Static config > Defaults
203
+ """Extract text, metadata, and structured data from uploaded documents.
204
+
205
+ This endpoint processes multiple file uploads and extracts comprehensive information including:
206
+ - Text content with metadata
207
+ - Tables (if enabled)
208
+ - Named entities (if enabled)
209
+ - Keywords (if enabled)
210
+ - Language detection (if enabled)
211
+
212
+ Supports various file formats including PDF, Office documents, images, and more.
213
+ Maximum file size: 1GB per file.
214
+
215
+ Args:
216
+ request: The HTTP request object
217
+ data: List of files to process (multipart form data)
218
+ chunk_content: Enable text chunking for large documents
219
+ max_chars: Maximum characters per chunk (default: 1000)
220
+ max_overlap: Character overlap between chunks (default: 200)
221
+ extract_tables: Extract tables from documents
222
+ extract_entities: Extract named entities from text
223
+ extract_keywords: Extract keywords from text
224
+ keyword_count: Number of keywords to extract (default: 10)
225
+ force_ocr: Force OCR processing even for text-based documents
226
+ ocr_backend: OCR engine to use (tesseract, easyocr, paddleocr)
227
+ auto_detect_language: Enable automatic language detection
228
+ pdf_password: Password for encrypted PDF files
229
+
230
+ Returns:
231
+ List of extraction results, one per uploaded file
197
232
  """
198
233
  static_config = discover_config()
199
234
 
@@ -227,14 +262,25 @@ async def handle_files_upload( # noqa: PLR0913
227
262
 
228
263
 
229
264
  @get("/health", operation_id="HealthCheck")
230
- async def health_check() -> dict[str, str]:
231
- """A simple health check endpoint."""
265
+ async def health_check() -> HealthResponse:
266
+ """Check the health status of the API.
267
+
268
+ Returns:
269
+ Simple status response indicating the API is operational
270
+ """
232
271
  return {"status": "ok"}
233
272
 
234
273
 
235
274
  @get("/config", operation_id="GetConfiguration")
236
- async def get_configuration() -> dict[str, Any]:
237
- """Get the current configuration."""
275
+ async def get_configuration() -> ConfigurationResponse:
276
+ """Get the current extraction configuration.
277
+
278
+ Returns the loaded configuration from kreuzberg.toml file if available,
279
+ or indicates that no configuration file was found.
280
+
281
+ Returns:
282
+ Configuration data with status message
283
+ """
238
284
  config = discover_config()
239
285
  if config is None:
240
286
  return {"message": "No configuration file found", "config": None}
@@ -245,12 +291,30 @@ async def get_configuration() -> dict[str, Any]:
245
291
  }
246
292
 
247
293
 
294
+ openapi_config = OpenAPIConfig(
295
+ title="Kreuzberg API",
296
+ version="3.14.0",
297
+ description="Document intelligence framework API for extracting text, metadata, and structured data from diverse file formats",
298
+ contact=Contact(
299
+ name="Kreuzberg",
300
+ url="https://github.com/Goldziher/kreuzberg",
301
+ ),
302
+ license=License(
303
+ name="MIT",
304
+ identifier="MIT",
305
+ ),
306
+ use_handler_docstrings=True,
307
+ create_examples=True,
308
+ )
309
+
248
310
  app = Litestar(
249
311
  route_handlers=[handle_files_upload, health_check, get_configuration],
250
312
  plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
251
- logging_config=StructLoggingConfig(), # Use default config
313
+ logging_config=StructLoggingConfig(),
314
+ openapi_config=openapi_config,
252
315
  exception_handlers={
253
316
  KreuzbergError: exception_handler,
254
- Exception: general_exception_handler, # Catch all exceptions for debugging
317
+ Exception: general_exception_handler,
255
318
  },
319
+ request_max_body_size=1024 * 1024 * 1024, # 1GB limit for large file uploads
256
320
  )
kreuzberg/_config.py CHANGED
@@ -162,7 +162,6 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
162
162
  if config_path.name == "pyproject.toml":
163
163
  return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
164
164
 
165
- # For any other TOML file, check if it has [tool.kreuzberg] section
166
165
  if "tool" in data and "kreuzberg" in data["tool"]:
167
166
  return data["tool"]["kreuzberg"] # type: ignore[no-any-return]
168
167
 
kreuzberg/_extractors/_image.py CHANGED
@@ -7,10 +7,13 @@ from pathlib import Path
7
7
  from typing import TYPE_CHECKING, ClassVar
8
8
 
9
9
  from anyio import Path as AsyncPath
10
+ from PIL import Image
10
11
 
11
12
  from kreuzberg._extractors._base import Extractor
12
13
  from kreuzberg._mime_types import IMAGE_MIME_TYPES
13
14
  from kreuzberg._ocr import get_ocr_backend
15
+ from kreuzberg._utils._image_preprocessing import normalize_image_dpi
16
+ from kreuzberg._utils._sync import run_sync
14
17
  from kreuzberg._utils._tmp import create_temp_file
15
18
  from kreuzberg.exceptions import ValidationError
16
19
 
@@ -57,7 +60,15 @@ class ImageExtractor(Extractor):
57
60
  if self.config.ocr_backend is None:
58
61
  raise ValidationError("ocr_backend is None, cannot perform OCR")
59
62
 
60
- result = await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
63
+ image = await run_sync(Image.open, str(path))
64
+ normalized_image, preprocessing_metadata = normalize_image_dpi(image, self.config)
65
+
66
+ backend = get_ocr_backend(self.config.ocr_backend)
67
+ result = await backend.process_image(normalized_image, **self.config.get_config_dict())
68
+
69
+ if preprocessing_metadata:
70
+ result.metadata["image_preprocessing"] = preprocessing_metadata
71
+
61
72
  return self._apply_quality_processing(result)
62
73
 
63
74
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
@@ -77,8 +88,15 @@ class ImageExtractor(Extractor):
77
88
  if self.config.ocr_backend is None:
78
89
  raise ValidationError("ocr_backend is None, cannot perform OCR")
79
90
 
91
+ image = Image.open(str(path))
92
+ normalized_image, preprocessing_metadata = normalize_image_dpi(image, self.config)
93
+
80
94
  backend = get_ocr_backend(self.config.ocr_backend)
81
- result = backend.process_file_sync(path, **self.config.get_config_dict())
95
+ result = backend.process_image_sync(normalized_image, **self.config.get_config_dict())
96
+
97
+ if preprocessing_metadata:
98
+ result.metadata["image_preprocessing"] = preprocessing_metadata
99
+
82
100
  return self._apply_quality_processing(result)
83
101
 
84
102
  def _get_extension_from_mime_type(self, mime_type: str) -> str:
kreuzberg/_extractors/_pdf.py CHANGED
@@ -21,6 +21,7 @@ from kreuzberg._ocr import get_ocr_backend
21
21
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
22
22
  from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata, OcrBackendType, PaddleOCRConfig, TesseractConfig
23
23
  from kreuzberg._utils._errors import create_error_context, should_retry
24
+ from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi
24
25
  from kreuzberg._utils._pdf_lock import pypdfium_file_lock
25
26
  from kreuzberg._utils._string import normalize_spaces
26
27
  from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
@@ -170,7 +171,26 @@ class PDFExtractor(Extractor):
170
171
  try:
171
172
  with pypdfium_file_lock(input_file):
172
173
  document = await run_sync(pypdfium2.PdfDocument, str(input_file))
173
- return [page.render(scale=200 / 72).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
174
+ images = []
175
+ for page in cast("pypdfium2.PdfDocument", document):
176
+ width, height = page.get_size()
177
+
178
+ if self.config.auto_adjust_dpi:
179
+ optimal_dpi = calculate_optimal_dpi(
180
+ page_width=width,
181
+ page_height=height,
182
+ target_dpi=self.config.target_dpi,
183
+ max_dimension=self.config.max_image_dimension,
184
+ min_dpi=self.config.min_dpi,
185
+ max_dpi=self.config.max_dpi,
186
+ )
187
+ else:
188
+ optimal_dpi = self.config.target_dpi
189
+
190
+ scale = optimal_dpi / 72.0
191
+
192
+ images.append(page.render(scale=scale).to_pil())
193
+ return images
174
194
  except pypdfium2.PdfiumError as e: # noqa: PERF203
175
195
  last_error = e
176
196
  if not should_retry(e, attempt + 1):
kreuzberg/_extractors/_spread_sheet.py CHANGED
@@ -35,7 +35,6 @@ class SpreadSheetExtractor(Extractor):
35
35
  SUPPORTED_MIME_TYPES = SPREADSHEET_MIME_TYPES
36
36
 
37
37
  def _get_file_extension(self) -> str:
38
- """Get the appropriate file extension based on MIME type."""
39
38
  mime_to_ext = {
40
39
  "application/vnd.ms-excel": ".xls",
41
40
  "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet": ".xlsx",
kreuzberg/_gmft.py CHANGED
@@ -9,11 +9,11 @@ import time
9
9
  import traceback
10
10
  from io import StringIO
11
11
  from pathlib import Path
12
- from typing import TYPE_CHECKING, Any
12
+ from typing import TYPE_CHECKING, Any, cast
13
13
 
14
14
  import anyio
15
15
  import msgspec
16
- import pandas as pd
16
+ import polars as pl
17
17
  from PIL import Image
18
18
 
19
19
  from kreuzberg._types import GMFTConfig, TableData
@@ -25,7 +25,63 @@ if TYPE_CHECKING:
25
25
  from os import PathLike
26
26
 
27
27
  from gmft.detectors.base import CroppedTable
28
- from pandas import DataFrame
28
+
29
+
30
+ def _pandas_to_polars(pandas_df: Any) -> pl.DataFrame:
31
+ if pandas_df is None:
32
+ return pl.DataFrame()
33
+
34
+ try:
35
+ return pl.from_pandas(pandas_df)
36
+ except (TypeError, ValueError, AttributeError):
37
+ if hasattr(pandas_df, "columns") and hasattr(pandas_df.columns, "duplicated"):
38
+ mask = ~pandas_df.columns.duplicated()
39
+ pandas_df = pandas_df.loc[:, mask]
40
+ return pl.from_pandas(pandas_df)
41
+ return pl.DataFrame()
42
+
43
+
44
+ def _dataframe_to_markdown(df: Any) -> str:
45
+ if df is None:
46
+ return ""
47
+
48
+ if isinstance(df, pl.DataFrame):
49
+ if df.is_empty():
50
+ return ""
51
+ return str(df)
52
+
53
+ if hasattr(df, "to_markdown"):
54
+ return cast("str", df.to_markdown())
55
+
56
+ return str(df)
57
+
58
+
59
+ def _dataframe_to_csv(df: Any) -> str:
60
+ if df is None:
61
+ return ""
62
+
63
+ if isinstance(df, pl.DataFrame):
64
+ if df.is_empty():
65
+ return ""
66
+ return df.write_csv()
67
+
68
+ if hasattr(df, "to_csv"):
69
+ return cast("str", df.to_csv(index=False))
70
+
71
+ return ""
72
+
73
+
74
+ def _is_dataframe_empty(df: Any) -> bool:
75
+ if df is None:
76
+ return True
77
+
78
+ if isinstance(df, pl.DataFrame):
79
+ return df.is_empty()
80
+
81
+ if hasattr(df, "empty"):
82
+ return cast("bool", df.empty)
83
+
84
+ return True
29
85
 
30
86
 
31
87
  async def extract_tables(
@@ -111,7 +167,7 @@ async def extract_tables(
111
167
  )
112
168
  doc = await run_sync(PyPDFium2Document, str(file_path))
113
169
  cropped_tables: list[CroppedTable] = []
114
- dataframes: list[DataFrame] = []
170
+ dataframes: list[pl.DataFrame] = []
115
171
  try:
116
172
  for page in doc:
117
173
  cropped_tables.extend(await run_sync(detector.extract, page))
@@ -124,8 +180,8 @@ async def extract_tables(
124
180
  TableData(
125
181
  cropped_image=cropped_table.image(),
126
182
  page_number=cropped_table.page.page_number,
127
- text=data_frame.to_markdown(),
128
- df=data_frame,
183
+ text=_dataframe_to_markdown(data_frame),
184
+ df=_pandas_to_polars(data_frame),
129
185
  )
130
186
  for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
131
187
  ]
@@ -225,7 +281,7 @@ def extract_tables_sync(
225
281
  TableData(
226
282
  cropped_image=cropped_table.image(),
227
283
  page_number=cropped_table.page.page_number,
228
- text=data_frame.to_markdown(),
284
+ text=_dataframe_to_markdown(data_frame),
229
285
  df=data_frame,
230
286
  )
231
287
  for data_frame, cropped_table in zip(dataframes, cropped_tables, strict=False)
@@ -295,26 +351,16 @@ def _extract_tables_in_process(
295
351
  cropped_image.save(img_bytes, format="PNG")
296
352
  img_bytes.seek(0)
297
353
 
298
- if data_frame.empty:
299
- results.append(
300
- {
301
- "cropped_image_bytes": img_bytes.getvalue(),
302
- "page_number": cropped_table.page.page_number,
303
- "text": data_frame.to_markdown(),
304
- "df_columns": data_frame.columns.tolist(),
305
- "df_csv": None,
306
- }
307
- )
308
- else:
309
- results.append(
310
- {
311
- "cropped_image_bytes": img_bytes.getvalue(),
312
- "page_number": cropped_table.page.page_number,
313
- "text": data_frame.to_markdown(),
314
- "df_columns": None,
315
- "df_csv": data_frame.to_csv(index=False),
316
- }
317
- )
354
+ csv_data = _dataframe_to_csv(data_frame) if not _is_dataframe_empty(data_frame) else ""
355
+ results.append(
356
+ {
357
+ "cropped_image_bytes": img_bytes.getvalue(),
358
+ "page_number": cropped_table.page.page_number,
359
+ "text": _dataframe_to_markdown(data_frame),
360
+ "df_columns": data_frame.columns,
361
+ "df_csv": csv_data if csv_data else None,
362
+ }
363
+ )
318
364
 
319
365
  result_queue.put((True, results))
320
366
 
@@ -381,10 +427,10 @@ def _extract_tables_isolated(
381
427
  for table_dict in result:
382
428
  img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
383
429
 
384
- if table_dict["df_csv"] is None:
385
- df = pd.DataFrame(columns=table_dict["df_columns"])
430
+ if table_dict["df_csv"] is None or table_dict["df_csv"] == "":
431
+ df = pl.DataFrame()
386
432
  else:
387
- df = pd.read_csv(StringIO(table_dict["df_csv"]))
433
+ df = pl.read_csv(StringIO(table_dict["df_csv"]), truncate_ragged_lines=True)
388
434
 
389
435
  tables.append(
390
436
  TableData(
@@ -468,10 +514,10 @@ async def _extract_tables_isolated_async(
468
514
  for table_dict in result:
469
515
  img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
470
516
 
471
- if table_dict["df_csv"] is None:
472
- df = pd.DataFrame(columns=table_dict["df_columns"])
517
+ if table_dict["df_csv"] is None or table_dict["df_csv"] == "":
518
+ df = pl.DataFrame()
473
519
  else:
474
- df = pd.read_csv(StringIO(table_dict["df_csv"]))
520
+ df = pl.read_csv(StringIO(table_dict["df_csv"]), truncate_ragged_lines=True)
475
521
 
476
522
  tables.append(
477
523
  TableData(
kreuzberg/_mcp/server.py CHANGED
@@ -16,14 +16,6 @@ mcp = FastMCP("Kreuzberg Text Extraction")
16
16
 
17
17
 
18
18
  def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
19
- """Create ExtractionConfig with discovered config as base and tool parameters as overrides.
20
-
21
- Args:
22
- **kwargs: Tool parameters to override defaults/discovered config.
23
-
24
- Returns:
25
- ExtractionConfig instance.
26
- """
27
19
  base_config = discover_config()
28
20
 
29
21
  if base_config is None:
@@ -64,25 +56,6 @@ def extract_document( # noqa: PLR0913
64
56
  keyword_count: int = 10,
65
57
  auto_detect_language: bool = False,
66
58
  ) -> dict[str, Any]:
67
- """Extract text content from a document file.
68
-
69
- Args:
70
- file_path: Path to the document file
71
- mime_type: MIME type of the document (auto-detected if not provided)
72
- force_ocr: Force OCR even for text-based documents
73
- chunk_content: Split content into chunks
74
- extract_tables: Extract tables from the document
75
- extract_entities: Extract named entities
76
- extract_keywords: Extract keywords
77
- ocr_backend: OCR backend to use (tesseract, easyocr, paddleocr)
78
- max_chars: Maximum characters per chunk
79
- max_overlap: Character overlap between chunks
80
- keyword_count: Number of keywords to extract
81
- auto_detect_language: Auto-detect document language
82
-
83
- Returns:
84
- Extracted content with metadata, tables, chunks, entities, and keywords
85
- """
86
59
  config = _create_config_with_overrides(
87
60
  force_ocr=force_ocr,
88
61
  chunk_content=chunk_content,
@@ -115,25 +88,6 @@ def extract_bytes( # noqa: PLR0913
115
88
  keyword_count: int = 10,
116
89
  auto_detect_language: bool = False,
117
90
  ) -> dict[str, Any]:
118
- """Extract text content from document bytes.
119
-
120
- Args:
121
- content_base64: Base64-encoded document content
122
- mime_type: MIME type of the document
123
- force_ocr: Force OCR even for text-based documents
124
- chunk_content: Split content into chunks
125
- extract_tables: Extract tables from the document
126
- extract_entities: Extract named entities
127
- extract_keywords: Extract keywords
128
- ocr_backend: OCR backend to use (tesseract, easyocr, paddleocr)
129
- max_chars: Maximum characters per chunk
130
- max_overlap: Character overlap between chunks
131
- keyword_count: Number of keywords to extract
132
- auto_detect_language: Auto-detect document language
133
-
134
- Returns:
135
- Extracted content with metadata, tables, chunks, entities, and keywords
136
- """
137
91
  content_bytes = base64.b64decode(content_base64)
138
92
 
139
93
  config = _create_config_with_overrides(
@@ -158,15 +112,6 @@ def extract_simple(
158
112
  file_path: str,
159
113
  mime_type: str | None = None,
160
114
  ) -> str:
161
- """Simple text extraction from a document file.
162
-
163
- Args:
164
- file_path: Path to the document file
165
- mime_type: MIME type of the document (auto-detected if not provided)
166
-
167
- Returns:
168
- Extracted text content as a string
169
- """
170
115
  config = _create_config_with_overrides()
171
116
  result = extract_file_sync(file_path, mime_type, config)
172
117
  return result.content
@@ -174,14 +119,12 @@ def extract_simple(
174
119
 
175
120
  @mcp.resource("config://default")
176
121
  def get_default_config() -> str:
177
- """Get the default extraction configuration."""
178
122
  config = ExtractionConfig()
179
123
  return json.dumps(msgspec.to_builtins(config, order="deterministic"), indent=2)
180
124
 
181
125
 
182
126
  @mcp.resource("config://discovered")
183
127
  def get_discovered_config() -> str:
184
- """Get the discovered configuration from config files."""
185
128
  config = discover_config()
186
129
  if config is None:
187
130
  return "No configuration file found"
@@ -190,13 +133,11 @@ def get_discovered_config() -> str:
190
133
 
191
134
  @mcp.resource("config://available-backends")
192
135
  def get_available_backends() -> str:
193
- """Get available OCR backends."""
194
136
  return "tesseract, easyocr, paddleocr"
195
137
 
196
138
 
197
139
  @mcp.resource("extractors://supported-formats")
198
140
  def get_supported_formats() -> str:
199
- """Get supported document formats."""
200
141
  return """
201
142
  Supported formats:
202
143
  - PDF documents
@@ -210,14 +151,6 @@ def get_supported_formats() -> str:
210
151
 
211
152
  @mcp.prompt()
212
153
  def extract_and_summarize(file_path: str) -> list[TextContent]:
213
- """Extract text from a document and provide a summary prompt.
214
-
215
- Args:
216
- file_path: Path to the document file
217
-
218
- Returns:
219
- Extracted content with summarization prompt
220
- """
221
154
  result = extract_file_sync(file_path, None, _create_config_with_overrides())
222
155
 
223
156
  return [
@@ -230,14 +163,6 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
230
163
 
231
164
  @mcp.prompt()
232
165
  def extract_structured(file_path: str) -> list[TextContent]:
233
- """Extract text with structured analysis prompt.
234
-
235
- Args:
236
- file_path: Path to the document file
237
-
238
- Returns:
239
- Extracted content with structured analysis prompt
240
- """
241
166
  config = _create_config_with_overrides(
242
167
  extract_entities=True,
243
168
  extract_keywords=True,
@@ -262,7 +187,6 @@ def extract_structured(file_path: str) -> list[TextContent]:
262
187
 
263
188
 
264
189
  def main() -> None: # pragma: no cover
265
- """Main entry point for the MCP server."""
266
190
  mcp.run()
267
191
 
268
192
 
kreuzberg/_ocr/_base.py CHANGED
@@ -5,6 +5,7 @@ from typing import Generic, TypeVar
5
5
  from PIL.Image import Image
6
6
 
7
7
  from kreuzberg._types import ExtractionResult
8
+ from kreuzberg._utils._sync import run_taskgroup
8
9
 
9
10
  try: # pragma: no cover
10
11
  from typing import Unpack # type: ignore[attr-defined]
@@ -32,8 +33,6 @@ class OCRBackend(ABC, Generic[T]):
32
33
  return [self.process_file_sync(path, **kwargs) for path in paths] # pragma: no cover
33
34
 
34
35
  async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
35
- from kreuzberg._utils._sync import run_taskgroup # noqa: PLC0415
36
-
37
36
  tasks = [self.process_file(path, **kwargs) for path in paths]
38
37
  return await run_taskgroup(*tasks) # pragma: no cover
39
38