kreuzberg 3.13.3__py3-none-any.whl → 3.14.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,346 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, Any
4
+
5
+ from PIL import Image
6
+
7
+ from kreuzberg._types import ExtractionConfig, ImagePreprocessingMetadata
8
+
9
+ if TYPE_CHECKING:
10
+ from PIL.Image import Image as PILImage
11
+
12
+
13
def calculate_optimal_dpi(
    page_width: float,
    page_height: float,
    target_dpi: int,
    max_dimension: int,
    min_dpi: int = 72,
    max_dpi: int = 600,
) -> int:
    """Pick the DPI to render a page at without exceeding a pixel budget.

    Args:
        page_width: Page width in points (1/72 inch)
        page_height: Page height in points (1/72 inch)
        target_dpi: Desired target DPI
        max_dimension: Maximum allowed pixel dimension for the longer side
        min_dpi: Lower bound applied to the returned DPI
        max_dpi: Upper bound applied to the returned DPI

    Returns:
        ``target_dpi`` when the page fits within ``max_dimension`` at that DPI,
        otherwise the largest whole DPI that fits; in both cases clamped to
        ``[min_dpi, max_dpi]``.
    """

    def _clamp(dpi: int) -> int:
        # The lower bound wins if min_dpi > max_dpi, exactly like max(min, min(x, max)).
        return max(min_dpi, min(dpi, max_dpi))

    # 72 points make one inch.
    inches_wide = page_width / 72.0
    inches_high = page_height / 72.0

    # Longest side, in pixels, if the page were rendered at the requested DPI.
    longest_side_px = max(int(inches_wide * target_dpi), int(inches_high * target_dpi))
    if longest_side_px <= max_dimension:
        return _clamp(target_dpi)

    # Per-axis DPI ceilings; a degenerate (zero-sized) axis imposes no constraint
    # beyond max_dpi.
    width_limit = max_dimension / inches_wide if inches_wide > 0 else max_dpi
    height_limit = max_dimension / inches_high if inches_high > 0 else max_dpi

    return _clamp(int(min(width_limit, height_limit)))
56
+
57
+
58
+ def _extract_image_dpi(image: PILImage) -> tuple[tuple[float, float], float]:
59
+ """Extract DPI information from image."""
60
+ current_dpi_info = image.info.get("dpi", (72.0, 72.0))
61
+ if isinstance(current_dpi_info, (list, tuple)):
62
+ original_dpi = (float(current_dpi_info[0]), float(current_dpi_info[1]))
63
+ current_dpi = float(current_dpi_info[0]) # Use horizontal DPI
64
+ else:
65
+ current_dpi = float(current_dpi_info)
66
+ original_dpi = (current_dpi, current_dpi)
67
+ return original_dpi, current_dpi
68
+
69
+
70
+ def _should_skip_processing(
71
+ original_width: int,
72
+ original_height: int,
73
+ current_dpi: float,
74
+ config: ExtractionConfig,
75
+ ) -> bool:
76
+ """Check if processing should be skipped."""
77
+ max_current_dimension = max(original_width, original_height)
78
+ current_matches_target = abs(current_dpi - config.target_dpi) < 1.0
79
+ return not config.auto_adjust_dpi and current_matches_target and max_current_dimension <= config.max_image_dimension
80
+
81
+
82
+ def _calculate_target_dpi(
83
+ original_width: int,
84
+ original_height: int,
85
+ current_dpi: float,
86
+ config: ExtractionConfig,
87
+ ) -> tuple[int, bool, int | None]:
88
+ """Calculate target DPI and whether it was auto-adjusted."""
89
+ calculated_dpi = None
90
+ if config.auto_adjust_dpi:
91
+ # Convert pixel dimensions to approximate point dimensions
92
+ # This is an approximation since we don't know the actual physical size
93
+ approx_width_points = original_width * 72.0 / current_dpi
94
+ approx_height_points = original_height * 72.0 / current_dpi
95
+
96
+ optimal_dpi = calculate_optimal_dpi(
97
+ approx_width_points,
98
+ approx_height_points,
99
+ config.target_dpi,
100
+ config.max_image_dimension,
101
+ config.min_dpi,
102
+ config.max_dpi,
103
+ )
104
+ calculated_dpi = optimal_dpi
105
+ auto_adjusted = optimal_dpi != config.target_dpi
106
+ target_dpi = optimal_dpi
107
+ else:
108
+ auto_adjusted = False
109
+ target_dpi = config.target_dpi
110
+
111
+ return target_dpi, auto_adjusted, calculated_dpi
112
+
113
+
114
def normalize_image_dpi(
    image: PILImage,
    config: ExtractionConfig,
) -> tuple[PILImage, ImagePreprocessingMetadata]:
    """Normalize image DPI and dimensions for optimal OCR processing.

    Args:
        image: PIL Image to normalize
        config: ExtractionConfig containing DPI settings (``target_dpi``,
            ``auto_adjust_dpi``, ``max_image_dimension``, ``min_dpi``, ``max_dpi``)

    Returns:
        Tuple of (normalized_image, ImagePreprocessingMetadata). The original
        image object is returned unchanged when resizing is skipped or when
        the resize fails with ``OSError`` (the error text is then recorded in
        ``resize_error``).

    Note:
        If auto_adjust_dpi is False, uses target_dpi directly.
        If True, calculates optimal DPI based on image dimensions and constraints.
    """
    original_width, original_height = image.size
    original_dpi, current_dpi = _extract_image_dpi(image)

    # No auto-adjustment, DPI already on target and within size limits: nothing to do.
    if _should_skip_processing(original_width, original_height, current_dpi, config):
        return image, ImagePreprocessingMetadata(
            original_dimensions=(original_width, original_height),
            original_dpi=original_dpi,
            target_dpi=config.target_dpi,
            scale_factor=1.0,
            auto_adjusted=False,
            final_dpi=config.target_dpi,
            skipped_resize=True,
        )

    target_dpi, auto_adjusted, calculated_dpi = _calculate_target_dpi(
        original_width, original_height, current_dpi, config
    )

    # Scale factor is the ratio between the DPI we want and the DPI we have.
    scale_factor = target_dpi / current_dpi

    # A near-1.0 scale factor is only a reason to skip when the image already
    # respects max_image_dimension; otherwise an oversized image whose DPI
    # happens to match the target would bypass the size limit entirely.
    fits_size_limit = max(original_width, original_height) <= config.max_image_dimension
    if fits_size_limit and abs(scale_factor - 1.0) < 0.05:
        return image, ImagePreprocessingMetadata(
            original_dimensions=(original_width, original_height),
            original_dpi=original_dpi,
            target_dpi=config.target_dpi,
            scale_factor=scale_factor,
            auto_adjusted=auto_adjusted,
            final_dpi=target_dpi,
            calculated_dpi=calculated_dpi,
            skipped_resize=True,
        )

    # int() truncates, so a small side combined with downscaling could reach 0,
    # which resize() rejects; keep every side at least one pixel.
    new_width = max(1, int(original_width * scale_factor))
    new_height = max(1, int(original_height * scale_factor))

    # Enforce max_image_dimension on the scaled size.
    dimension_clamped = False
    max_new_dimension = max(new_width, new_height)
    if max_new_dimension > config.max_image_dimension:
        dimension_scale = config.max_image_dimension / max_new_dimension
        new_width = max(1, int(new_width * dimension_scale))
        new_height = max(1, int(new_height * dimension_scale))
        scale_factor *= dimension_scale
        dimension_clamped = True

    # LANCZOS for downscaling, BICUBIC for upscaling.
    resample_name = "LANCZOS" if scale_factor < 1.0 else "BICUBIC"
    # Newer Pillow exposes the filters on Image.Resampling; older releases only
    # have module-level constants (integer values 1 and 3 as a last resort).
    resampling_namespace = getattr(Image, "Resampling", Image)
    resample_method = getattr(resampling_namespace, resample_name, 1 if resample_name == "LANCZOS" else 3)

    try:
        normalized_image = image.resize((new_width, new_height), resample_method)
    except OSError as e:
        # Resizing failed: hand back the original image and record the error.
        return image, ImagePreprocessingMetadata(
            original_dimensions=(original_width, original_height),
            original_dpi=original_dpi,
            target_dpi=config.target_dpi,
            scale_factor=scale_factor,
            auto_adjusted=auto_adjusted,
            final_dpi=target_dpi,
            calculated_dpi=calculated_dpi,
            resize_error=str(e),
        )

    # Record the DPI the resized image now represents.
    normalized_image.info["dpi"] = (target_dpi, target_dpi)

    return normalized_image, ImagePreprocessingMetadata(
        original_dimensions=(original_width, original_height),
        original_dpi=original_dpi,
        target_dpi=config.target_dpi,
        scale_factor=scale_factor,
        auto_adjusted=auto_adjusted,
        final_dpi=target_dpi,
        new_dimensions=(new_width, new_height),
        resample_method=resample_name,
        dimension_clamped=dimension_clamped,
        calculated_dpi=calculated_dpi,
    )
232
+
233
+
234
def get_dpi_adjustment_heuristics(
    width: float,
    height: float,
    current_dpi: int,
    target_dpi: int,
    max_dimension: int,
    content_type: str = "document",
) -> dict[str, Any]:
    """Get DPI adjustment recommendations from simple size/shape heuristics.

    Args:
        width: Image width in pixels
        height: Image height in pixels
        current_dpi: Current DPI setting
        target_dpi: Desired target DPI
        max_dimension: Maximum allowed dimension
        content_type: Type of content; only ``"document"`` triggers the
            document-specific rules, any other value skips them

    Returns:
        Dictionary with keys ``recommended_dpi`` (always an ``int``),
        ``content_analysis``, ``performance_impact``, ``quality_impact`` and
        ``recommendations`` (list of human-readable notes).
    """
    recommendations: list[str] = []

    # Shape and size analysis; a non-positive height is treated as square.
    aspect_ratio = width / height if height > 0 else 1.0
    total_pixels = width * height
    megapixels = total_pixels / 1_000_000

    heuristics: dict[str, Any] = {
        "recommended_dpi": target_dpi,
        "content_analysis": {
            "aspect_ratio": aspect_ratio,
            "megapixels": megapixels,
            "is_portrait": aspect_ratio < 0.8,
            "is_landscape": aspect_ratio > 1.2,
            "is_large": max(width, height) > max_dimension * 0.8,
        },
        "performance_impact": "medium",
        "quality_impact": "medium",
        "recommendations": recommendations,
    }

    # Document-specific heuristics
    if content_type == "document":
        if aspect_ratio > 2.0 or aspect_ratio < 0.5:
            # Very wide or very tall documents (like forms, receipts)
            recommendations.append("Consider higher DPI for narrow documents")
            if target_dpi < 200:
                # Rounded so the recommendation stays an integer DPI like every
                # other DPI value in this module.
                heuristics["recommended_dpi"] = round(min(200, target_dpi * 1.3))

        if megapixels > 50:  # Very large document
            recommendations.append("Large document detected - consider DPI reduction")
            heuristics["performance_impact"] = "high"
            if target_dpi > 150:
                # Deliberately overrides the narrow-document bump above: size wins.
                heuristics["recommended_dpi"] = round(max(120, target_dpi * 0.8))

    # Memory usage estimation, assuming 3 bytes per pixel (RGB)
    estimated_memory_mb = (width * height * 3) / (1024 * 1024)
    if estimated_memory_mb > 200:
        heuristics["performance_impact"] = "high"
        recommendations.append(f"High memory usage expected (~{estimated_memory_mb:.0f}MB)")

    # Quality vs performance tradeoffs; a non-positive current DPI means "no scaling"
    scale_factor = target_dpi / current_dpi if current_dpi > 0 else 1.0
    if scale_factor < 0.7:
        heuristics["quality_impact"] = "high"
        recommendations.append("Significant downscaling may reduce OCR accuracy")
    elif scale_factor > 1.5:
        heuristics["performance_impact"] = "high"
        recommendations.append("Upscaling will increase processing time")

    return heuristics
307
+
308
+
309
def estimate_processing_time(
    width: int,
    height: int,
    ocr_backend: str = "tesseract",
) -> dict[str, float | str]:
    """Give a rough OCR duration estimate from the pixel count and backend.

    Args:
        width: Image width in pixels
        height: Image height in pixels
        ocr_backend: OCR backend name; unrecognized names use a generic rate

    Returns:
        Dictionary with ``estimated_seconds``, ``megapixels``, ``backend`` and
        the ``scaling_factor`` that was applied.
    """
    megapixels = (width * height) / 1_000_000

    # Rough seconds-per-megapixel figures per backend; 3.0 for anything else.
    if ocr_backend == "tesseract":
        seconds_per_megapixel = 2.5
    elif ocr_backend == "easyocr":
        seconds_per_megapixel = 4.0  # slower due to deep learning
    elif ocr_backend == "paddleocr":
        seconds_per_megapixel = 3.5  # moderate speed
    else:
        seconds_per_megapixel = 3.0

    # Beyond 10 megapixels the cost grows faster than linearly: each extra
    # megapixel adds 10% to the multiplier.
    scaling_factor = 1.0
    if megapixels > 10:
        scaling_factor = 1.0 + (megapixels - 10) * 0.1

    return {
        "estimated_seconds": seconds_per_megapixel * megapixels * scaling_factor,
        "megapixels": megapixels,
        "backend": ocr_backend,
        "scaling_factor": scaling_factor,
    }
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import hashlib
4
4
  import io
5
+ from pathlib import Path
5
6
  from typing import TYPE_CHECKING, Any
6
7
 
7
8
  import anyio
@@ -9,17 +10,13 @@ import anyio
9
10
  from kreuzberg._utils._cache import get_ocr_cache
10
11
 
11
12
  if TYPE_CHECKING:
12
- from pathlib import Path
13
-
14
13
  from PIL.Image import Image as PILImage
15
14
 
16
15
  from kreuzberg._types import ExtractionResult
17
16
 
18
17
 
19
18
  def get_file_info(path: Path) -> dict[str, Any]:
20
- from pathlib import Path as PathType # noqa: PLC0415
21
-
22
- path_obj = PathType(path) if not isinstance(path, PathType) else path
19
+ path_obj = path if isinstance(path, Path) else Path(path)
23
20
 
24
21
  try:
25
22
  stat = path_obj.stat()
@@ -4,7 +4,7 @@ import io
4
4
  import multiprocessing as mp
5
5
  from concurrent.futures import ProcessPoolExecutor
6
6
  from contextlib import contextmanager
7
- from typing import TYPE_CHECKING, Any, TypeVar
7
+ from typing import TYPE_CHECKING, Any, TypeVar, cast
8
8
 
9
9
  import anyio
10
10
  import psutil
@@ -173,7 +173,7 @@ class ProcessPoolManager:
173
173
  self._active_tasks -= 1
174
174
 
175
175
  async with anyio.create_task_group() as tg:
176
- results: list[T] = [None] * len(arg_batches) # type: ignore[list-item]
176
+ results: list[T | None] = [None] * len(arg_batches)
177
177
 
178
178
  async def run_task(idx: int, args: tuple[Any, ...]) -> None:
179
179
  results[idx] = await submit_single(args)
@@ -181,7 +181,7 @@ class ProcessPoolManager:
181
181
  for idx, args in enumerate(arg_batches):
182
182
  tg.start_soon(run_task, idx, args)
183
183
 
184
- return results
184
+ return cast("list[T]", results)
185
185
 
186
186
  def get_system_info(self) -> dict[str, Any]:
187
187
  memory = psutil.virtual_memory()
@@ -89,6 +89,8 @@ def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -
89
89
  formatted_row.append(str(int(value)))
90
90
  else:
91
91
  formatted_row.append(f"{value:.2f}")
92
+ elif isinstance(value, bool):
93
+ formatted_row.append(str(value).lower())
92
94
  else:
93
95
  clean_value = str(value).strip().replace("|", "\\|")
94
96
  formatted_row.append(clean_value)
@@ -201,7 +203,8 @@ def extract_table_structure_info(table: TableData) -> dict[str, Any]:
201
203
 
202
204
  total_cells = df.height * df.width
203
205
  if total_cells > 0:
204
- empty_cells = df.null_count().sum().item()
206
+ null_counts = df.null_count()
207
+ empty_cells = sum(null_counts.row(0))
205
208
  info["empty_cells"] = empty_cells
206
209
  info["data_density"] = (total_cells - empty_cells) / total_cells
207
210
 
kreuzberg/cli.py CHANGED
@@ -62,7 +62,20 @@ def format_extraction_result(result: ExtractionResult, show_metadata: bool, outp
62
62
  if show_metadata:
63
63
  output_data["metadata"] = result.metadata
64
64
  if result.tables:
65
- output_data["tables"] = result.tables
65
+ json_tables = []
66
+ for table in result.tables:
67
+ json_table = {
68
+ "page_number": table.get("page_number"),
69
+ "text": table.get("text"),
70
+ }
71
+ if "df" in table and table["df"] is not None:
72
+ df = table["df"]
73
+ if hasattr(df, "write_csv"):
74
+ json_table["data_csv"] = df.write_csv()
75
+ elif hasattr(df, "to_csv"):
76
+ json_table["data_csv"] = df.to_csv(index=False)
77
+ json_tables.append(json_table)
78
+ output_data["tables"] = json_tables
66
79
  if result.chunks:
67
80
  output_data["chunks"] = result.chunks
68
81
  return json.dumps(output_data, indent=2, ensure_ascii=False)
@@ -77,7 +90,11 @@ def format_extraction_result(result: ExtractionResult, show_metadata: bool, outp
77
90
  output_parts.append("\n\n--- TABLES ---")
78
91
  for i, table in enumerate(result.tables):
79
92
  output_parts.append(f"\nTable {i + 1}:")
80
- output_parts.append(json.dumps(table, indent=2, ensure_ascii=False))
93
+ json_table = {
94
+ "page_number": table.get("page_number"),
95
+ "text": table.get("text"),
96
+ }
97
+ output_parts.append(json.dumps(json_table, indent=2, ensure_ascii=False))
81
98
 
82
99
  return "\n".join(output_parts)
83
100
 
kreuzberg/extraction.py CHANGED
@@ -426,12 +426,12 @@ def batch_extract_file_sync(
426
426
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
427
427
  future_to_index = {executor.submit(extract_single, fp): i for i, fp in enumerate(file_paths)}
428
428
 
429
- results: list[ExtractionResult] = [None] * len(file_paths) # type: ignore[list-item]
429
+ results: list[ExtractionResult | None] = [None] * len(file_paths)
430
430
  for future in as_completed(future_to_index):
431
431
  index, result = future.result()
432
432
  results[index] = result
433
433
 
434
- return results
434
+ return cast("list[ExtractionResult]", results)
435
435
 
436
436
 
437
437
  def batch_extract_bytes_sync(
@@ -479,9 +479,9 @@ def batch_extract_bytes_sync(
479
479
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
480
480
  future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
481
481
 
482
- results: list[ExtractionResult] = [None] * len(contents) # type: ignore[list-item]
482
+ results: list[ExtractionResult | None] = [None] * len(contents)
483
483
  for future in as_completed(future_to_index):
484
484
  index, result = future.result()
485
485
  results[index] = result
486
486
 
487
- return results
487
+ return cast("list[ExtractionResult]", results)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.13.3
3
+ Version: 3.14.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,10 +31,10 @@ Requires-Python: >=3.10
31
31
  Requires-Dist: anyio>=4.10.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
- Requires-Dist: html-to-markdown[lxml]>=1.10.0
35
- Requires-Dist: mcp>=1.13.0
34
+ Requires-Dist: html-to-markdown[lxml]>=1.11.0
35
+ Requires-Dist: mcp>=1.14.0
36
36
  Requires-Dist: msgspec>=0.18.0
37
- Requires-Dist: numpy>=1.24.0
37
+ Requires-Dist: numpy>=2.0.0
38
38
  Requires-Dist: playa-pdf>=0.7.0
39
39
  Requires-Dist: polars>=1.33.1
40
40
  Requires-Dist: psutil>=7.0.0
@@ -1,57 +1,58 @@
1
1
  kreuzberg/__init__.py,sha256=Oh_NTp8wf0BlvD8CSBad2A493nEWH4jTE0x8v7v1Y9w,1341
2
2
  kreuzberg/__main__.py,sha256=3cIDdzTggj2kj8uKx4WShWHmCWqdZazdM3BxUGbAuSI,104
3
3
  kreuzberg/_chunker.py,sha256=tr9_KUYTSLauFois3MsB-A-0hGcTT8hTQFrqNRTii-I,1373
4
- kreuzberg/_config.py,sha256=T6ASb3N8nPQ4g5B2FxfgK82uE4pesGllezqrmZ0gSdM,12457
4
+ kreuzberg/_config.py,sha256=2LI5z9gXniqO4afrMmbZfMdhlT2701O5OlGKkrMo-bM,12385
5
5
  kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
6
  kreuzberg/_document_classification.py,sha256=Mz_s2GJGsEl7MQ-67BPoGYCZibTy9Sw0PScUZKBjKOA,5736
7
7
  kreuzberg/_entity_extraction.py,sha256=5YpPnqoJ5aiHd_sy4bN4-Ngiq79RhCV6yaUQE8joGXo,3503
8
- kreuzberg/_gmft.py,sha256=jKbD7V_KP9XTLjT9SBgSgE3CyDjqbRDm9BAiWV2sAC0,19542
8
+ kreuzberg/_gmft.py,sha256=a7KDXbZM0PxyFpAIjM0xMRvxzoMo4fTQuGlFNa8uXBU,20502
9
9
  kreuzberg/_language_detection.py,sha256=T9p6aimB7QFXAQiEntIMZeH_Z62E52E8fBQ43hWuyhs,1960
10
10
  kreuzberg/_mime_types.py,sha256=kGBDSMO4XPgzUKC7iaBeChCtRQXZ9_zXq6eJydejX_k,7739
11
11
  kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
12
- kreuzberg/_registry.py,sha256=8cPpz3oZVnMwWDT2v_Q7wf-GHd5YuHmc-nkLtvPfE1I,2433
13
- kreuzberg/_types.py,sha256=D-2d_WG8HyByA163izGhjk7t-e4FL_N-_6UzlVso8Dg,36020
14
- kreuzberg/cli.py,sha256=nPH4FDW6WkoF4gtH0s4RWmxjAveJ_-Unb6fev6x0Sko,12752
12
+ kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
13
+ kreuzberg/_types.py,sha256=BEMTnA8fvHL0dDCnjq7g9Jjd2Ze8NFq988YkMH4zQ9g,39163
14
+ kreuzberg/cli.py,sha256=Ob0IfqWcaiM09pFdC6wTpdSeql0SGZDxBxfrEhJAGmo,13501
15
15
  kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
16
- kreuzberg/extraction.py,sha256=jiMKiDyTf3sHyk76sMffHR-eH-_yg-DFRMuXEKufRYI,17649
16
+ kreuzberg/extraction.py,sha256=qT-Ziw5FmMqcPT88VrglikL1RASSJCf5W7xP6L9Vi5s,17673
17
17
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- kreuzberg/_api/main.py,sha256=q0ygmdAUfTkjlqAa1RdW1KxxzxQ6IX80__UTpoXipp8,8859
19
+ kreuzberg/_api/main.py,sha256=8g_8j8Dp2e70_yYYUUrJNC5Ku9fuyNgyjUuIgJTRUW8,12500
20
20
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  kreuzberg/_extractors/_base.py,sha256=i2FvAhRnamEtBb4a-C7pfcdWIXnkEBw0saMQu7h1_RQ,2069
22
22
  kreuzberg/_extractors/_email.py,sha256=jn_8J4BASKJ7zFHBG0PgxNe3OT4pjmEM2tTKX8y_0AE,5887
23
23
  kreuzberg/_extractors/_html.py,sha256=NyQKChNLvaSUC_5x1qTYlIQGwL4lEbgUF7BgH9ejEVY,1583
24
- kreuzberg/_extractors/_image.py,sha256=UqPoYfvDRX6Rd1yPhcLHJLDw6d2cUzgkqOGjh2eleJM,3301
24
+ kreuzberg/_extractors/_image.py,sha256=lFPoxAf7_Zbx-1t8W4vU2bhHauiNGOAFbZxr_2gNUsw,3991
25
25
  kreuzberg/_extractors/_pandoc.py,sha256=-Ai4S1cXs7F6yeonb_7Y7_ZoWHn29E2oP1WlPtM-4HM,22505
26
- kreuzberg/_extractors/_pdf.py,sha256=Yv_c3xYzrGAjgTbwCGqbiQTDLjIUP_Pu7Z3GmMOqgqg,17865
26
+ kreuzberg/_extractors/_pdf.py,sha256=naJ_AgtAgtGIjAqiU4_G7lgftKWhUjZDLVILSG2AyVc,18757
27
27
  kreuzberg/_extractors/_presentation.py,sha256=ULGkt7dzeA9sYSEhpAucKZmkdv9EubzeZtOjoLP3Z2E,6994
28
- kreuzberg/_extractors/_spread_sheet.py,sha256=UgjkLBATirc5FXUFtRN1ArLfOYhLDJxH2wFb1s9E5vA,12784
28
+ kreuzberg/_extractors/_spread_sheet.py,sha256=eBAx_OwoyRqMzmD4Z07UlOBwcXckymgvj_0o7di6thA,12715
29
29
  kreuzberg/_extractors/_structured.py,sha256=PpefI_GDrdLyUgnElrbdB-MeTMKVWium4Ckxm5Zg100,5536
30
30
  kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
31
- kreuzberg/_mcp/server.py,sha256=iYJG6g0u7I6mWtC4R1XlxydBrPpgnp5dGJzpm9QAZig,8438
31
+ kreuzberg/_mcp/server.py,sha256=YPMJp6xnZ3DC32NEdX5Gqf3vwxsHZxXxUxZ6jghpv6I,5688
32
32
  kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
33
- kreuzberg/_ocr/_base.py,sha256=5ef2g8JuSaZF2sDiAmoaODHbeG4MT0LtNzbtW0n9BnU,1445
33
+ kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
34
34
  kreuzberg/_ocr/_easyocr.py,sha256=XbgpGt5tkE4xHleIGvV1cHlpOQTp43rSXBO1CyIyKTg,14599
35
- kreuzberg/_ocr/_paddleocr.py,sha256=58sKOHfKCHGFJNlRLrJwey8G_7xbsAAPBXB4n3hKc7k,14052
35
+ kreuzberg/_ocr/_paddleocr.py,sha256=hfc6Zi2eSUYTVVF9y9D1P2_pLiLXPfFRoJ6QDJ6oZag,15017
36
36
  kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
37
- kreuzberg/_ocr/_tesseract.py,sha256=H2T_iuXwa0FGCSQ_ZfXvmvqksxoOdOFAfv3uQA8E4-M,49160
37
+ kreuzberg/_ocr/_tesseract.py,sha256=QEKK_PDZnNiZRgpklOgMXB-cObJy6C-HuxL6Gza5Z3c,49136
38
38
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
- kreuzberg/_utils/_cache.py,sha256=S6Oc4TJamiuuWeJ2ABxDFbbQh4o8w38AUyZeBEc1NN8,12767
39
+ kreuzberg/_utils/_cache.py,sha256=qeyI6rJOQlKtdHjJeOjUxx31eItak_drrNn8Cf8HbN8,13956
40
40
  kreuzberg/_utils/_device.py,sha256=UxGkSTN3Up-Zn43CSyvf8CozW2xAF05Cm01LWA2FZmg,8263
41
41
  kreuzberg/_utils/_document_cache.py,sha256=tfk9_Yc1cQkT5_uM5R1uaI4w-2SjNn7QyAd6AmWkSz8,4851
42
42
  kreuzberg/_utils/_errors.py,sha256=aQYEnp8oJ-WJVmCNo7YY-25y1KZZFEwjAmxVRfw4a_M,4920
43
- kreuzberg/_utils/_ocr_cache.py,sha256=8_-qmPlK2adQKsH4OO4Mlk8wmqBMl3XxkcV_NsXVyFs,3501
43
+ kreuzberg/_utils/_image_preprocessing.py,sha256=2u0A28M07F9XlYebTG5salOUVEE3YT3m8fiR8Z2ZM8E,12326
44
+ kreuzberg/_utils/_ocr_cache.py,sha256=uCCZfdY7EiqMhCnhNwqirFOr-Wfaobd2Ntc-F07TKec,3425
44
45
  kreuzberg/_utils/_pdf_lock.py,sha256=Ytvds30aZf3yXeZFo27ZenrhUoU-GZlR2rKEkhJ_wlk,1349
45
- kreuzberg/_utils/_process_pool.py,sha256=9dPMD_gBocQ5VaeCIrlSJfPXKyXNuyKaATmqOPExxiE,6723
46
+ kreuzberg/_utils/_process_pool.py,sha256=7p8Co1w-Tvh2MUdxMcPMpvOikumrb0nN2ApQVytV-_c,6726
46
47
  kreuzberg/_utils/_quality.py,sha256=f7NbyZysyJQD8jKCNWhogvluU9A7GdEYhMsDBeMbGAA,5412
47
48
  kreuzberg/_utils/_ref.py,sha256=iOflvjTUc_F0XaL28Bd6fpvL6qkeoURGA4B77Nqky7I,840
48
49
  kreuzberg/_utils/_serialization.py,sha256=97iIgdcxdbym-BEvy0J6HAduBCUXyCGwhuEHCT_l7I4,1513
49
50
  kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4366
50
51
  kreuzberg/_utils/_sync.py,sha256=OWiciXPTGHIxgiGoHI2AglZ1siTNT-nU_JCgHPNzzHk,2196
51
- kreuzberg/_utils/_table.py,sha256=R-6owHjvcvHGhem_vDsFH7S2yMHGoUUO2PFcj-Idptk,6361
52
+ kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
52
53
  kreuzberg/_utils/_tmp.py,sha256=wnOInBkcuQoxI1vBLvNv9NqbRCEu9Y03qfOjqQuAk3s,841
53
- kreuzberg-3.13.3.dist-info/METADATA,sha256=ey7kAlKK8eTER87IiGZZpIPnYoSLwLPX2AGdOPTjj2M,12128
54
- kreuzberg-3.13.3.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
55
- kreuzberg-3.13.3.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
56
- kreuzberg-3.13.3.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
57
- kreuzberg-3.13.3.dist-info/RECORD,,
54
+ kreuzberg-3.14.1.dist-info/METADATA,sha256=4sG9L9AtvBHFxjv84obrcaYNToc_sO0-AHnnpo1-ZGY,12127
55
+ kreuzberg-3.14.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
56
+ kreuzberg-3.14.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
57
+ kreuzberg-3.14.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
58
+ kreuzberg-3.14.1.dist-info/RECORD,,