kreuzberg-3.13.2-py3-none-any.whl → kreuzberg-3.14.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_api/main.py +117 -15
- kreuzberg/_config.py +3 -0
- kreuzberg/_extractors/_image.py +20 -2
- kreuzberg/_extractors/_pdf.py +21 -1
- kreuzberg/_extractors/_spread_sheet.py +16 -2
- kreuzberg/_gmft.py +79 -33
- kreuzberg/_mcp/server.py +0 -76
- kreuzberg/_ocr/_base.py +1 -2
- kreuzberg/_ocr/_paddleocr.py +39 -13
- kreuzberg/_ocr/_tesseract.py +16 -6
- kreuzberg/_registry.py +26 -0
- kreuzberg/_types.py +64 -1
- kreuzberg/_utils/_cache.py +34 -12
- kreuzberg/_utils/_image_preprocessing.py +346 -0
- kreuzberg/_utils/_ocr_cache.py +2 -5
- kreuzberg/_utils/_process_pool.py +3 -3
- kreuzberg/_utils/_table.py +4 -1
- kreuzberg/cli.py +19 -2
- kreuzberg/extraction.py +4 -4
- {kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/METADATA +10 -10
- {kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/RECORD +24 -23
- {kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/licenses/LICENSE +0 -0
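The largest addition is the new `kreuzberg/_utils/_image_preprocessing.py` module, reproduced in full below. As orientation, here is a minimal sketch of how its public `normalize_image_dpi` helper might be wired up; the `ExtractionConfig` keyword arguments and the metadata attribute names are assumptions inferred from the fields the new module reads and writes, not a documented API.

```python
from PIL import Image

from kreuzberg._types import ExtractionConfig
from kreuzberg._utils._image_preprocessing import normalize_image_dpi

# Assumed constructor kwargs: the diff only shows these as attributes on the config.
config = ExtractionConfig(target_dpi=150, auto_adjust_dpi=True, max_image_dimension=4096)

with Image.open("scan.png") as image:  # any local raster image
    normalized, meta = normalize_image_dpi(image, config)
    # The metadata fields mirror the keyword arguments used to build
    # ImagePreprocessingMetadata in the new module.
    print(meta.final_dpi, meta.scale_factor, meta.new_dimensions)
```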
kreuzberg/_utils/_image_preprocessing.py
ADDED
@@ -0,0 +1,346 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any
+
+from PIL import Image
+
+from kreuzberg._types import ExtractionConfig, ImagePreprocessingMetadata
+
+if TYPE_CHECKING:
+    from PIL.Image import Image as PILImage
+
+
+def calculate_optimal_dpi(
+    page_width: float,
+    page_height: float,
+    target_dpi: int,
+    max_dimension: int,
+    min_dpi: int = 72,
+    max_dpi: int = 600,
+) -> int:
+    """Calculate optimal DPI based on page dimensions and constraints.
+
+    Args:
+        page_width: Page width in points (1/72 inch)
+        page_height: Page height in points (1/72 inch)
+        target_dpi: Desired target DPI
+        max_dimension: Maximum allowed pixel dimension
+        min_dpi: Minimum DPI threshold
+        max_dpi: Maximum DPI threshold
+
+    Returns:
+        Optimal DPI value that keeps image within max_dimension
+    """
+    # Convert points to inches (72 points = 1 inch)
+    width_inches = page_width / 72.0
+    height_inches = page_height / 72.0
+
+    # Calculate pixel dimensions at target DPI
+    target_width_pixels = int(width_inches * target_dpi)
+    target_height_pixels = int(height_inches * target_dpi)
+
+    # Check if target DPI results in oversized image
+    max_pixel_dimension = max(target_width_pixels, target_height_pixels)
+
+    if max_pixel_dimension <= max_dimension:
+        # Target DPI is fine, clamp to min/max bounds
+        return max(min_dpi, min(target_dpi, max_dpi))
+
+    # Calculate maximum DPI that keeps within dimension constraints
+    max_dpi_for_width = max_dimension / width_inches if width_inches > 0 else max_dpi
+    max_dpi_for_height = max_dimension / height_inches if height_inches > 0 else max_dpi
+    constrained_dpi = int(min(max_dpi_for_width, max_dpi_for_height))
+
+    # Clamp to min/max bounds
+    return max(min_dpi, min(constrained_dpi, max_dpi))
+
+
+def _extract_image_dpi(image: PILImage) -> tuple[tuple[float, float], float]:
+    """Extract DPI information from image."""
+    current_dpi_info = image.info.get("dpi", (72.0, 72.0))
+    if isinstance(current_dpi_info, (list, tuple)):
+        original_dpi = (float(current_dpi_info[0]), float(current_dpi_info[1]))
+        current_dpi = float(current_dpi_info[0])  # Use horizontal DPI
+    else:
+        current_dpi = float(current_dpi_info)
+        original_dpi = (current_dpi, current_dpi)
+    return original_dpi, current_dpi
+
+
+def _should_skip_processing(
+    original_width: int,
+    original_height: int,
+    current_dpi: float,
+    config: ExtractionConfig,
+) -> bool:
+    """Check if processing should be skipped."""
+    max_current_dimension = max(original_width, original_height)
+    current_matches_target = abs(current_dpi - config.target_dpi) < 1.0
+    return not config.auto_adjust_dpi and current_matches_target and max_current_dimension <= config.max_image_dimension
+
+
+def _calculate_target_dpi(
+    original_width: int,
+    original_height: int,
+    current_dpi: float,
+    config: ExtractionConfig,
+) -> tuple[int, bool, int | None]:
+    """Calculate target DPI and whether it was auto-adjusted."""
+    calculated_dpi = None
+    if config.auto_adjust_dpi:
+        # Convert pixel dimensions to approximate point dimensions
+        # This is an approximation since we don't know the actual physical size
+        approx_width_points = original_width * 72.0 / current_dpi
+        approx_height_points = original_height * 72.0 / current_dpi
+
+        optimal_dpi = calculate_optimal_dpi(
+            approx_width_points,
+            approx_height_points,
+            config.target_dpi,
+            config.max_image_dimension,
+            config.min_dpi,
+            config.max_dpi,
+        )
+        calculated_dpi = optimal_dpi
+        auto_adjusted = optimal_dpi != config.target_dpi
+        target_dpi = optimal_dpi
+    else:
+        auto_adjusted = False
+        target_dpi = config.target_dpi
+
+    return target_dpi, auto_adjusted, calculated_dpi
+
+
+def normalize_image_dpi(
+    image: PILImage,
+    config: ExtractionConfig,
+) -> tuple[PILImage, ImagePreprocessingMetadata]:
+    """Normalize image DPI and dimensions for optimal OCR processing.
+
+    Args:
+        image: PIL Image to normalize
+        config: ExtractionConfig containing DPI settings
+
+    Returns:
+        Tuple of (normalized_image, ImagePreprocessingMetadata)
+
+    Note:
+        If auto_adjust_dpi is False, uses target_dpi directly.
+        If True, calculates optimal DPI based on image dimensions and constraints.
+    """
+    original_width, original_height = image.size
+    original_dpi, current_dpi = _extract_image_dpi(image)
+
+    # If no auto-adjustment and current DPI matches target and within limits, skip processing
+    if _should_skip_processing(original_width, original_height, current_dpi, config):
+        return image, ImagePreprocessingMetadata(
+            original_dimensions=(original_width, original_height),
+            original_dpi=original_dpi,
+            target_dpi=config.target_dpi,
+            scale_factor=1.0,
+            auto_adjusted=False,
+            final_dpi=config.target_dpi,
+            skipped_resize=True,
+        )
+
+    # Calculate target DPI
+    target_dpi, auto_adjusted, calculated_dpi = _calculate_target_dpi(
+        original_width, original_height, current_dpi, config
+    )
+
+    # Calculate scale factor based on DPI ratio
+    scale_factor = target_dpi / current_dpi
+
+    # If scale factor is very close to 1.0, skip resizing
+    if abs(scale_factor - 1.0) < 0.05:
+        return image, ImagePreprocessingMetadata(
+            original_dimensions=(original_width, original_height),
+            original_dpi=original_dpi,
+            target_dpi=config.target_dpi,
+            scale_factor=scale_factor,
+            auto_adjusted=auto_adjusted,
+            final_dpi=target_dpi,
+            calculated_dpi=calculated_dpi,
+            skipped_resize=True,
+        )
+
+    # Calculate new dimensions
+    new_width = int(original_width * scale_factor)
+    new_height = int(original_height * scale_factor)
+
+    # Ensure we don't exceed max_dimension (safety check)
+    dimension_clamped = False
+    max_new_dimension = max(new_width, new_height)
+    if max_new_dimension > config.max_image_dimension:
+        dimension_scale = config.max_image_dimension / max_new_dimension
+        new_width = int(new_width * dimension_scale)
+        new_height = int(new_height * dimension_scale)
+        scale_factor *= dimension_scale
+        dimension_clamped = True
+
+    # Resize image
+    try:
+        # Use LANCZOS for high-quality downscaling, BICUBIC for upscaling
+        # Handle different PIL versions
+        try:
+            # Modern PIL version
+            if scale_factor < 1.0:
+                resample_method = Image.Resampling.LANCZOS
+                resample_name = "LANCZOS"
+            else:
+                resample_method = Image.Resampling.BICUBIC
+                resample_name = "BICUBIC"
+        except AttributeError:
+            # Older PIL version
+            if scale_factor < 1.0:
+                resample_method = getattr(Image, "LANCZOS", 1)  # type: ignore[arg-type]
+                resample_name = "LANCZOS"
+            else:
+                resample_method = getattr(Image, "BICUBIC", 3)  # type: ignore[arg-type]
+                resample_name = "BICUBIC"
+
+        normalized_image = image.resize((new_width, new_height), resample_method)
+
+        # Update DPI info in the new image
+        normalized_image.info["dpi"] = (target_dpi, target_dpi)
+
+        return normalized_image, ImagePreprocessingMetadata(
+            original_dimensions=(original_width, original_height),
+            original_dpi=original_dpi,
+            target_dpi=config.target_dpi,
+            scale_factor=scale_factor,
+            auto_adjusted=auto_adjusted,
+            final_dpi=target_dpi,
+            new_dimensions=(new_width, new_height),
+            resample_method=resample_name,
+            dimension_clamped=dimension_clamped,
+            calculated_dpi=calculated_dpi,
+        )
+
+    except OSError as e:
+        # If resizing fails, return original image with error info
+        return image, ImagePreprocessingMetadata(
+            original_dimensions=(original_width, original_height),
+            original_dpi=original_dpi,
+            target_dpi=config.target_dpi,
+            scale_factor=scale_factor,
+            auto_adjusted=auto_adjusted,
+            final_dpi=target_dpi,
+            calculated_dpi=calculated_dpi,
+            resize_error=str(e),
+        )
+
+
+def get_dpi_adjustment_heuristics(
+    width: float,
+    height: float,
+    current_dpi: int,
+    target_dpi: int,
+    max_dimension: int,
+    content_type: str = "document",
+) -> dict[str, Any]:
+    """Get smart DPI adjustment recommendations based on content analysis.
+
+    Args:
+        width: Image width in pixels
+        height: Image height in pixels
+        current_dpi: Current DPI setting
+        target_dpi: Desired target DPI
+        max_dimension: Maximum allowed dimension
+        content_type: Type of content ("document", "photo", "mixed")
+
+    Returns:
+        Dictionary with adjustment recommendations and rationale
+    """
+    recommendations: list[str] = []
+    heuristics = {
+        "recommended_dpi": target_dpi,
+        "content_analysis": {},
+        "performance_impact": "medium",
+        "quality_impact": "medium",
+        "recommendations": recommendations,
+    }
+
+    # Calculate aspect ratio and size analysis
+    aspect_ratio = width / height if height > 0 else 1.0
+    total_pixels = width * height
+    megapixels = total_pixels / 1_000_000
+
+    heuristics["content_analysis"] = {
+        "aspect_ratio": aspect_ratio,
+        "megapixels": megapixels,
+        "is_portrait": aspect_ratio < 0.8,
+        "is_landscape": aspect_ratio > 1.2,
+        "is_large": max(width, height) > max_dimension * 0.8,
+    }
+
+    # Document-specific heuristics
+    if content_type == "document":
+        if aspect_ratio > 2.0 or aspect_ratio < 0.5:
+            # Very wide or very tall documents (like forms, receipts)
+            recommendations.append("Consider higher DPI for narrow documents")
+            if target_dpi < 200:
+                heuristics["recommended_dpi"] = min(200, target_dpi * 1.3)
+
+        if megapixels > 50:  # Very large document
+            recommendations.append("Large document detected - consider DPI reduction")
+            heuristics["performance_impact"] = "high"
+            if target_dpi > 150:
+                heuristics["recommended_dpi"] = max(120, target_dpi * 0.8)
+
+    # Memory usage estimation
+    estimated_memory_mb = (width * height * 3) / (1024 * 1024)  # RGB bytes
+    if estimated_memory_mb > 200:
+        heuristics["performance_impact"] = "high"
+        recommendations.append(f"High memory usage expected (~{estimated_memory_mb:.0f}MB)")
+
+    # Quality vs performance tradeoffs
+    scale_factor = target_dpi / current_dpi if current_dpi > 0 else 1.0
+    if scale_factor < 0.7:
+        heuristics["quality_impact"] = "high"
+        recommendations.append("Significant downscaling may reduce OCR accuracy")
+    elif scale_factor > 1.5:
+        heuristics["performance_impact"] = "high"
+        recommendations.append("Upscaling will increase processing time")
+
+    return heuristics
+
+
+def estimate_processing_time(
+    width: int,
+    height: int,
+    ocr_backend: str = "tesseract",
+) -> dict[str, float | str]:
+    """Estimate processing time based on image dimensions and OCR backend.
+
+    Args:
+        width: Image width in pixels
+        height: Image height in pixels
+        ocr_backend: OCR backend name
+
+    Returns:
+        Dictionary with time estimates in seconds
+    """
+    total_pixels = width * height
+    megapixels = total_pixels / 1_000_000
+
+    # Base processing times per megapixel (rough estimates)
+    base_times = {
+        "tesseract": 2.5,  # seconds per megapixel
+        "easyocr": 4.0,  # slower due to deep learning
+        "paddleocr": 3.5,  # moderate speed
+    }
+
+    base_time = base_times.get(ocr_backend, 3.0)
+
+    # Non-linear scaling for very large images
+    scaling_factor = 1.0 + (megapixels - 10) * 0.1 if megapixels > 10 else 1.0
+
+    estimated_time = base_time * megapixels * scaling_factor
+
+    return {
+        "estimated_seconds": estimated_time,
+        "megapixels": megapixels,
+        "backend": ocr_backend,
+        "scaling_factor": scaling_factor,
+    }
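For orientation, a small worked example of the clamping implemented by `calculate_optimal_dpi` above (an A4 page is 595 × 842 points; the numbers follow the function's own arithmetic):

```python
from kreuzberg._utils._image_preprocessing import calculate_optimal_dpi

# A4 page (595 x 842 pt) at a requested 300 DPI with a 4000 px cap:
# 842 pt / 72 = 11.69 in; 11.69 in * 300 DPI ≈ 3508 px <= 4000, so 300 is kept.
print(calculate_optimal_dpi(595, 842, target_dpi=300, max_dimension=4000))  # 300

# Same page with a 2000 px cap: 2000 px / 11.69 in ≈ 171 DPI, clamped to [72, 600].
print(calculate_optimal_dpi(595, 842, target_dpi=300, max_dimension=2000))  # 171
```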
kreuzberg/_utils/_ocr_cache.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import hashlib
 import io
+from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 import anyio
@@ -9,17 +10,13 @@ import anyio
 from kreuzberg._utils._cache import get_ocr_cache
 
 if TYPE_CHECKING:
-    from pathlib import Path
-
     from PIL.Image import Image as PILImage
 
     from kreuzberg._types import ExtractionResult
 
 
 def get_file_info(path: Path) -> dict[str, Any]:
-
-
-    path_obj = PathType(path) if not isinstance(path, PathType) else path
+    path_obj = path if isinstance(path, Path) else Path(path)
 
     try:
         stat = path_obj.stat()

kreuzberg/_utils/_process_pool.py
CHANGED
@@ -4,7 +4,7 @@ import io
 import multiprocessing as mp
 from concurrent.futures import ProcessPoolExecutor
 from contextlib import contextmanager
-from typing import TYPE_CHECKING, Any, TypeVar
+from typing import TYPE_CHECKING, Any, TypeVar, cast
 
 import anyio
 import psutil
@@ -173,7 +173,7 @@ class ProcessPoolManager:
                self._active_tasks -= 1
 
        async with anyio.create_task_group() as tg:
-            results: list[T] = [None] * len(arg_batches)
+            results: list[T | None] = [None] * len(arg_batches)
 
            async def run_task(idx: int, args: tuple[Any, ...]) -> None:
                results[idx] = await submit_single(args)
@@ -181,7 +181,7 @@ class ProcessPoolManager:
            for idx, args in enumerate(arg_batches):
                tg.start_soon(run_task, idx, args)
 
-        return cast("list[T]", results) if False else None  # placeholder
+        return cast("list[T]", results)
 
    def get_system_info(self) -> dict[str, Any]:
        memory = psutil.virtual_memory()
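The two typed-list changes above (and the matching ones in `extraction.py` further down) follow the same idiom: pre-allocate a None-filled list so results can be written back in their original order, then `cast` once every slot is guaranteed to be filled. A generic sketch of the pattern outside kreuzberg:

```python
from typing import TypeVar, cast

T = TypeVar("T")


def fill_in_order(values: list[T]) -> list[T]:
    # Placeholders keep the annotation honest (list[T | None]) while slots are empty;
    # the cast tells the type checker the Nones are gone once every index is written.
    results: list[T | None] = [None] * len(values)
    for i, v in enumerate(values):
        results[i] = v
    return cast("list[T]", results)


print(fill_in_order(["a", "b", "c"]))  # ['a', 'b', 'c']
```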
kreuzberg/_utils/_table.py
CHANGED
@@ -89,6 +89,8 @@ def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -
                 formatted_row.append(str(int(value)))
             else:
                 formatted_row.append(f"{value:.2f}")
+        elif isinstance(value, bool):
+            formatted_row.append(str(value).lower())
         else:
             clean_value = str(value).strip().replace("|", "\\|")
             formatted_row.append(clean_value)
@@ -201,7 +203,8 @@ def extract_table_structure_info(table: TableData) -> dict[str, Any]:
 
     total_cells = df.height * df.width
     if total_cells > 0:
-
+        null_counts = df.null_count()
+        empty_cells = sum(null_counts.row(0))
         info["empty_cells"] = empty_cells
         info["data_density"] = (total_cells - empty_cells) / total_cells
 
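The new empty-cell computation relies on polars (a pinned dependency of this release): `DataFrame.null_count()` returns a one-row frame of per-column null counts, and `row(0)` flattens it into a tuple that can be summed. A small self-contained check of that behaviour:

```python
import polars as pl

df = pl.DataFrame({"a": [1, None, 3], "b": [None, None, "x"]})

null_counts = df.null_count()          # one-row DataFrame: per-column null counts
empty_cells = sum(null_counts.row(0))  # flatten to a tuple and total them
total_cells = df.height * df.width

print(empty_cells, total_cells, (total_cells - empty_cells) / total_cells)  # 3 6 0.5
```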
kreuzberg/cli.py
CHANGED
@@ -62,7 +62,20 @@ def format_extraction_result(result: ExtractionResult, show_metadata: bool, outp
         if show_metadata:
             output_data["metadata"] = result.metadata
         if result.tables:
-
+            json_tables = []
+            for table in result.tables:
+                json_table = {
+                    "page_number": table.get("page_number"),
+                    "text": table.get("text"),
+                }
+                if "df" in table and table["df"] is not None:
+                    df = table["df"]
+                    if hasattr(df, "write_csv"):
+                        json_table["data_csv"] = df.write_csv()
+                    elif hasattr(df, "to_csv"):
+                        json_table["data_csv"] = df.to_csv(index=False)
+                json_tables.append(json_table)
+            output_data["tables"] = json_tables
         if result.chunks:
             output_data["chunks"] = result.chunks
         return json.dumps(output_data, indent=2, ensure_ascii=False)
@@ -77,7 +90,11 @@ def format_extraction_result(result: ExtractionResult, show_metadata: bool, outp
         output_parts.append("\n\n--- TABLES ---")
         for i, table in enumerate(result.tables):
             output_parts.append(f"\nTable {i + 1}:")
-
+            json_table = {
+                "page_number": table.get("page_number"),
+                "text": table.get("text"),
+            }
+            output_parts.append(json.dumps(json_table, indent=2, ensure_ascii=False))
 
     return "\n".join(output_parts)
 
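The CLI change serializes each table's dataframe to CSV through duck typing, so it works whether the `df` slot holds a polars frame (`write_csv`) or a pandas frame (`to_csv`). The same dispatch in isolation, as a sketch with a hypothetical helper name:

```python
from typing import Any

import polars as pl


def dataframe_to_csv(df: Any) -> str | None:
    """Return a CSV string for a polars or pandas dataframe, else None."""
    if hasattr(df, "write_csv"):
        return df.write_csv()          # polars: returns CSV text when no path is given
    if hasattr(df, "to_csv"):
        return df.to_csv(index=False)  # pandas: returns CSV text when no buffer is given
    return None


print(dataframe_to_csv(pl.DataFrame({"a": [1, 2], "b": ["x", "y"]})))
```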
kreuzberg/extraction.py
CHANGED
@@ -426,12 +426,12 @@ def batch_extract_file_sync(
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_index = {executor.submit(extract_single, fp): i for i, fp in enumerate(file_paths)}
 
-        results: list[ExtractionResult] = [None] * len(file_paths)
+        results: list[ExtractionResult | None] = [None] * len(file_paths)
         for future in as_completed(future_to_index):
             index, result = future.result()
             results[index] = result
 
-        return results
+        return cast("list[ExtractionResult]", results)
 
 
 def batch_extract_bytes_sync(
@@ -479,9 +479,9 @@ def batch_extract_bytes_sync(
     with ThreadPoolExecutor(max_workers=max_workers) as executor:
         future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
 
-        results: list[ExtractionResult] = [None] * len(contents)
+        results: list[ExtractionResult | None] = [None] * len(contents)
         for future in as_completed(future_to_index):
             index, result = future.result()
             results[index] = result
 
-        return results
+        return cast("list[ExtractionResult]", results)

{kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.13.2
+Version: 3.14.0
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,15 +31,15 @@ Requires-Python: >=3.10
 Requires-Dist: anyio>=4.10.0
 Requires-Dist: chardetng-py>=0.3.5
 Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
-Requires-Dist: html-to-markdown[lxml]>=1.
-Requires-Dist: mcp>=1.
+Requires-Dist: html-to-markdown[lxml]>=1.11.0
+Requires-Dist: mcp>=1.14.0
 Requires-Dist: msgspec>=0.18.0
-Requires-Dist: numpy>=
+Requires-Dist: numpy>=2.0.0
 Requires-Dist: playa-pdf>=0.7.0
-Requires-Dist: polars>=1.33.
+Requires-Dist: polars>=1.33.1
 Requires-Dist: psutil>=7.0.0
 Requires-Dist: pypdfium2==4.30.0
-Requires-Dist: python-calamine>=0.5.
+Requires-Dist: python-calamine>=0.5.3
 Requires-Dist: python-pptx>=1.0.2
 Requires-Dist: typing-extensions>=4.15.0; python_version < '3.12'
 Provides-Extra: additional-extensions
@@ -55,17 +55,17 @@ Requires-Dist: keybert>=0.9.0; extra == 'all'
 Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
 Requires-Dist: mailparse>=1.0.15; extra == 'all'
 Requires-Dist: paddleocr>=3.2.0; extra == 'all'
-Requires-Dist: paddlepaddle>=3.
+Requires-Dist: paddlepaddle>=3.2.0; extra == 'all'
 Requires-Dist: playa-pdf[crypto]>=0.7.0; extra == 'all'
 Requires-Dist: rich>=14.1.0; extra == 'all'
-Requires-Dist: semantic-text-splitter>=0.
+Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
 Requires-Dist: spacy>=3.8.7; extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
 Provides-Extra: api
 Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'api'
 Provides-Extra: chunking
-Requires-Dist: semantic-text-splitter>=0.
+Requires-Dist: semantic-text-splitter>=0.28.0; extra == 'chunking'
 Provides-Extra: cli
 Requires-Dist: click>=8.2.1; extra == 'cli'
 Requires-Dist: rich>=14.1.0; extra == 'cli'
@@ -85,7 +85,7 @@ Provides-Extra: langdetect
 Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
 Provides-Extra: paddleocr
 Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
-Requires-Dist: paddlepaddle>=3.
+Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
 Requires-Dist: setuptools>=80.9.0; extra == 'paddleocr'
 Description-Content-Type: text/markdown
 

{kreuzberg-3.13.2.dist-info → kreuzberg-3.14.0.dist-info}/RECORD
CHANGED
@@ -1,57 +1,58 @@
 kreuzberg/__init__.py,sha256=Oh_NTp8wf0BlvD8CSBad2A493nEWH4jTE0x8v7v1Y9w,1341
 kreuzberg/__main__.py,sha256=3cIDdzTggj2kj8uKx4WShWHmCWqdZazdM3BxUGbAuSI,104
 kreuzberg/_chunker.py,sha256=tr9_KUYTSLauFois3MsB-A-0hGcTT8hTQFrqNRTii-I,1373
-kreuzberg/_config.py,sha256=
+kreuzberg/_config.py,sha256=2LI5z9gXniqO4afrMmbZfMdhlT2701O5OlGKkrMo-bM,12385
 kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
 kreuzberg/_document_classification.py,sha256=Mz_s2GJGsEl7MQ-67BPoGYCZibTy9Sw0PScUZKBjKOA,5736
 kreuzberg/_entity_extraction.py,sha256=5YpPnqoJ5aiHd_sy4bN4-Ngiq79RhCV6yaUQE8joGXo,3503
-kreuzberg/_gmft.py,sha256=
+kreuzberg/_gmft.py,sha256=a7KDXbZM0PxyFpAIjM0xMRvxzoMo4fTQuGlFNa8uXBU,20502
 kreuzberg/_language_detection.py,sha256=T9p6aimB7QFXAQiEntIMZeH_Z62E52E8fBQ43hWuyhs,1960
 kreuzberg/_mime_types.py,sha256=kGBDSMO4XPgzUKC7iaBeChCtRQXZ9_zXq6eJydejX_k,7739
 kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
-kreuzberg/_registry.py,sha256=
-kreuzberg/_types.py,sha256=
-kreuzberg/cli.py,sha256=
+kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
+kreuzberg/_types.py,sha256=yw8ZzCgwp8T4byh00gdSlABDtRwro6H1pemQsO5IZMQ,39132
+kreuzberg/cli.py,sha256=Ob0IfqWcaiM09pFdC6wTpdSeql0SGZDxBxfrEhJAGmo,13501
 kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
-kreuzberg/extraction.py,sha256=
+kreuzberg/extraction.py,sha256=qT-Ziw5FmMqcPT88VrglikL1RASSJCf5W7xP6L9Vi5s,17673
 kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg/_api/main.py,sha256=
+kreuzberg/_api/main.py,sha256=bZLaQpW8eoTFGvCGJgFodALy4rDfe9kuY1oj9OKPQpU,10792
 kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_extractors/_base.py,sha256=i2FvAhRnamEtBb4a-C7pfcdWIXnkEBw0saMQu7h1_RQ,2069
 kreuzberg/_extractors/_email.py,sha256=jn_8J4BASKJ7zFHBG0PgxNe3OT4pjmEM2tTKX8y_0AE,5887
 kreuzberg/_extractors/_html.py,sha256=NyQKChNLvaSUC_5x1qTYlIQGwL4lEbgUF7BgH9ejEVY,1583
-kreuzberg/_extractors/_image.py,sha256=
+kreuzberg/_extractors/_image.py,sha256=lFPoxAf7_Zbx-1t8W4vU2bhHauiNGOAFbZxr_2gNUsw,3991
 kreuzberg/_extractors/_pandoc.py,sha256=-Ai4S1cXs7F6yeonb_7Y7_ZoWHn29E2oP1WlPtM-4HM,22505
-kreuzberg/_extractors/_pdf.py,sha256=
+kreuzberg/_extractors/_pdf.py,sha256=naJ_AgtAgtGIjAqiU4_G7lgftKWhUjZDLVILSG2AyVc,18757
 kreuzberg/_extractors/_presentation.py,sha256=ULGkt7dzeA9sYSEhpAucKZmkdv9EubzeZtOjoLP3Z2E,6994
-kreuzberg/_extractors/_spread_sheet.py,sha256=
+kreuzberg/_extractors/_spread_sheet.py,sha256=eBAx_OwoyRqMzmD4Z07UlOBwcXckymgvj_0o7di6thA,12715
 kreuzberg/_extractors/_structured.py,sha256=PpefI_GDrdLyUgnElrbdB-MeTMKVWium4Ckxm5Zg100,5536
 kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
-kreuzberg/_mcp/server.py,sha256=
+kreuzberg/_mcp/server.py,sha256=YPMJp6xnZ3DC32NEdX5Gqf3vwxsHZxXxUxZ6jghpv6I,5688
 kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
-kreuzberg/_ocr/_base.py,sha256=
+kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
 kreuzberg/_ocr/_easyocr.py,sha256=XbgpGt5tkE4xHleIGvV1cHlpOQTp43rSXBO1CyIyKTg,14599
-kreuzberg/_ocr/_paddleocr.py,sha256=
+kreuzberg/_ocr/_paddleocr.py,sha256=hfc6Zi2eSUYTVVF9y9D1P2_pLiLXPfFRoJ6QDJ6oZag,15017
 kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
-kreuzberg/_ocr/_tesseract.py,sha256=
+kreuzberg/_ocr/_tesseract.py,sha256=QEKK_PDZnNiZRgpklOgMXB-cObJy6C-HuxL6Gza5Z3c,49136
 kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg/_utils/_cache.py,sha256=
+kreuzberg/_utils/_cache.py,sha256=qeyI6rJOQlKtdHjJeOjUxx31eItak_drrNn8Cf8HbN8,13956
 kreuzberg/_utils/_device.py,sha256=UxGkSTN3Up-Zn43CSyvf8CozW2xAF05Cm01LWA2FZmg,8263
 kreuzberg/_utils/_document_cache.py,sha256=tfk9_Yc1cQkT5_uM5R1uaI4w-2SjNn7QyAd6AmWkSz8,4851
 kreuzberg/_utils/_errors.py,sha256=aQYEnp8oJ-WJVmCNo7YY-25y1KZZFEwjAmxVRfw4a_M,4920
-kreuzberg/_utils/
+kreuzberg/_utils/_image_preprocessing.py,sha256=2u0A28M07F9XlYebTG5salOUVEE3YT3m8fiR8Z2ZM8E,12326
+kreuzberg/_utils/_ocr_cache.py,sha256=uCCZfdY7EiqMhCnhNwqirFOr-Wfaobd2Ntc-F07TKec,3425
 kreuzberg/_utils/_pdf_lock.py,sha256=Ytvds30aZf3yXeZFo27ZenrhUoU-GZlR2rKEkhJ_wlk,1349
-kreuzberg/_utils/_process_pool.py,sha256=
+kreuzberg/_utils/_process_pool.py,sha256=7p8Co1w-Tvh2MUdxMcPMpvOikumrb0nN2ApQVytV-_c,6726
 kreuzberg/_utils/_quality.py,sha256=f7NbyZysyJQD8jKCNWhogvluU9A7GdEYhMsDBeMbGAA,5412
 kreuzberg/_utils/_ref.py,sha256=iOflvjTUc_F0XaL28Bd6fpvL6qkeoURGA4B77Nqky7I,840
 kreuzberg/_utils/_serialization.py,sha256=97iIgdcxdbym-BEvy0J6HAduBCUXyCGwhuEHCT_l7I4,1513
 kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4366
 kreuzberg/_utils/_sync.py,sha256=OWiciXPTGHIxgiGoHI2AglZ1siTNT-nU_JCgHPNzzHk,2196
-kreuzberg/_utils/_table.py,sha256=
+kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
 kreuzberg/_utils/_tmp.py,sha256=wnOInBkcuQoxI1vBLvNv9NqbRCEu9Y03qfOjqQuAk3s,841
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
+kreuzberg-3.14.0.dist-info/METADATA,sha256=68rRivXnf8n_F9lqekOydDOd8sehWpHpbbKzRup7XDc,12127
+kreuzberg-3.14.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kreuzberg-3.14.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
+kreuzberg-3.14.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.14.0.dist-info/RECORD,,
|