kreuzberg 3.8.0__py3-none-any.whl → 3.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +4 -0
- kreuzberg/_api/main.py +22 -1
- kreuzberg/_config.py +404 -0
- kreuzberg/_entity_extraction.py +4 -5
- kreuzberg/_extractors/_base.py +3 -5
- kreuzberg/_extractors/_image.py +18 -32
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +39 -57
- kreuzberg/_extractors/_spread_sheet.py +2 -3
- kreuzberg/_extractors/_structured.py +10 -7
- kreuzberg/_gmft.py +314 -10
- kreuzberg/_language_detection.py +1 -1
- kreuzberg/_mcp/server.py +58 -8
- kreuzberg/_ocr/__init__.py +1 -22
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +92 -1
- kreuzberg/_ocr/_paddleocr.py +90 -1
- kreuzberg/_ocr/_tesseract.py +556 -5
- kreuzberg/_playa.py +2 -3
- kreuzberg/_types.py +46 -24
- kreuzberg/_utils/_cache.py +35 -4
- kreuzberg/_utils/_device.py +10 -20
- kreuzberg/_utils/_errors.py +44 -45
- kreuzberg/_utils/_process_pool.py +2 -6
- kreuzberg/_utils/_quality.py +7 -11
- kreuzberg/_utils/_serialization.py +21 -16
- kreuzberg/_utils/_string.py +22 -12
- kreuzberg/_utils/_table.py +3 -4
- kreuzberg/cli.py +4 -5
- kreuzberg/exceptions.py +10 -0
- kreuzberg/extraction.py +6 -24
- kreuzberg-3.8.2.dist-info/METADATA +265 -0
- kreuzberg-3.8.2.dist-info/RECORD +53 -0
- kreuzberg/_cli_config.py +0 -175
- kreuzberg/_multiprocessing/__init__.py +0 -5
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
- kreuzberg/_ocr/_pool.py +0 -357
- kreuzberg/_ocr/_sync.py +0 -566
- kreuzberg-3.8.0.dist-info/METADATA +0 -313
- kreuzberg-3.8.0.dist-info/RECORD +0 -57
- {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
- {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_types.py
CHANGED
@@ -5,7 +5,10 @@ from collections.abc import Awaitable, Callable
 from dataclasses import asdict, dataclass, field
 from typing import TYPE_CHECKING, Any, Literal, TypedDict
 
+import msgspec
+
 from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
+from kreuzberg._utils._table import export_table_to_csv, export_table_to_tsv, extract_table_structure_info
 from kreuzberg.exceptions import ValidationError
 
 if sys.version_info < (3, 11):  # pragma: no cover
@@ -191,7 +194,7 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
     return normalized
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, slots=True)
 class Entity:
     """Represents an extracted entity with type, text, and position."""
 
@@ -205,7 +208,7 @@ class Entity:
     """End character offset in the content"""
 
 
-@dataclass
+@dataclass(slots=True)
 class ExtractionResult:
     """The result of a file extraction."""
 
@@ -226,9 +229,29 @@ class ExtractionResult:
     detected_languages: list[str] | None = None
     """Languages detected in the extracted content, if language detection is enabled."""
 
-    def to_dict(self) -> dict[str, Any]:
-        """Converts the ExtractionResult to a dictionary.
-
+    def to_dict(self, include_none: bool = False) -> dict[str, Any]:
+        """Converts the ExtractionResult to a dictionary.
+
+        Args:
+            include_none: If True, include fields with None values.
+                If False (default), exclude None values.
+
+        Returns:
+            Dictionary representation of the ExtractionResult.
+        """
+        # Use msgspec.to_builtins for efficient conversion
+        # The builtin_types parameter allows DataFrames to pass through
+        result = msgspec.to_builtins(
+            self,
+            builtin_types=(type(None),),  # Allow None to pass through
+            order="deterministic",  # Ensure consistent output
+        )
+
+        if include_none:
+            return result  # type: ignore[no-any-return]
+
+        # Remove None values to match expected behavior
+        return {k: v for k, v in result.items() if v is not None}
 
     def export_tables_to_csv(self) -> list[str]:
         """Export all tables to CSV format.
@@ -239,8 +262,6 @@ class ExtractionResult:
         if not self.tables:
             return []
 
-        from kreuzberg._utils._table import export_table_to_csv
-
         return [export_table_to_csv(table) for table in self.tables]
 
     def export_tables_to_tsv(self) -> list[str]:
@@ -252,8 +273,6 @@ class ExtractionResult:
         if not self.tables:
             return []
 
-        from kreuzberg._utils._table import export_table_to_tsv
-
         return [export_table_to_tsv(table) for table in self.tables]
 
     def get_table_summaries(self) -> list[dict[str, Any]]:
@@ -265,8 +284,6 @@ class ExtractionResult:
         if not self.tables:
            return []
 
-        from kreuzberg._utils._table import extract_table_structure_info
-
         return [extract_table_structure_info(table) for table in self.tables]
 
 
@@ -274,7 +291,7 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[E
 ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
 
 
-@dataclass(unsafe_hash=True)
+@dataclass(unsafe_hash=True, slots=True)
 class ExtractionConfig:
     """Represents configuration settings for an extraction process.
 
@@ -355,18 +372,23 @@
        Returns:
            A dict of the OCR configuration or an empty dict if no backend is provided.
        """
-        if self.ocr_backend is
-
-            return asdict(self.ocr_config)
-        if self.ocr_backend == "tesseract":
-            from kreuzberg._ocr._tesseract import TesseractConfig
+        if self.ocr_backend is None:
+            return {}
 
-
-
-
+        if self.ocr_config is not None:
+            # Use asdict for OCR configs to preserve enum objects correctly
+            return asdict(self.ocr_config)
 
-
-
+        # Lazy load and cache default configs instead of creating new instances
+        if self.ocr_backend == "tesseract":
+            from kreuzberg._ocr._tesseract import TesseractConfig
 
-            return asdict(
-
+            return asdict(TesseractConfig())
+        if self.ocr_backend == "easyocr":
+            from kreuzberg._ocr._easyocr import EasyOCRConfig
+
+            return asdict(EasyOCRConfig())
+        # paddleocr
+        from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+
+        return asdict(PaddleOCRConfig())
kreuzberg/_utils/_cache.py
CHANGED
@@ -64,11 +64,10 @@ class KreuzbergCache(Generic[T]):
        Returns:
            Unique cache key string
        """
-        # Use more efficient string building for cache key
        if not kwargs:
            return "empty"
 
-        # Build key
+        # Build cache key using list + join (faster than StringIO)
        parts = []
        for key in sorted(kwargs):
            value = kwargs[key]
@@ -81,6 +80,7 @@ class KreuzbergCache(Generic[T]):
                parts.append(f"{key}={type(value).__name__}:{value!s}")
 
        cache_str = "&".join(parts)
+        # SHA256 is secure and fast enough for cache keys
        return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
 
    def _get_cache_path(self, cache_key: str) -> Path:
@@ -102,15 +102,46 @@ class KreuzbergCache(Generic[T]):
 
    def _serialize_result(self, result: T) -> dict[str, Any]:
        """Serialize result for caching with metadata."""
+        # Handle TableData objects that contain DataFrames
+        if isinstance(result, list) and result and isinstance(result[0], dict) and "df" in result[0]:
+            serialized_data = []
+            for item in result:
+                if isinstance(item, dict) and "df" in item:
+                    # Build new dict without unnecessary copy
+                    serialized_item = {k: v for k, v in item.items() if k != "df"}
+                    if hasattr(item["df"], "to_csv"):
+                        serialized_item["df_csv"] = item["df"].to_csv(index=False)
+                    else:
+                        # Fallback for non-DataFrame objects
+                        serialized_item["df_csv"] = str(item["df"])
+                    serialized_data.append(serialized_item)
+                else:
+                    serialized_data.append(item)
+            return {"type": "TableDataList", "data": serialized_data, "cached_at": time.time()}
+
        return {"type": type(result).__name__, "data": result, "cached_at": time.time()}
 
    def _deserialize_result(self, cached_data: dict[str, Any]) -> T:
        """Deserialize cached result."""
        data = cached_data["data"]
 
-        if cached_data.get("type") == "
-            from
+        if cached_data.get("type") == "TableDataList" and isinstance(data, list):
+            from io import StringIO
+
+            import pandas as pd
 
+            deserialized_data = []
+            for item in data:
+                if isinstance(item, dict) and "df_csv" in item:
+                    # Build new dict without unnecessary copy
+                    deserialized_item = {k: v for k, v in item.items() if k != "df_csv"}
+                    deserialized_item["df"] = pd.read_csv(StringIO(item["df_csv"]))
+                    deserialized_data.append(deserialized_item)
+                else:
+                    deserialized_data.append(item)
+            return deserialized_data  # type: ignore[return-value]
+
+        if cached_data.get("type") == "ExtractionResult" and isinstance(data, dict):
            return ExtractionResult(**data)  # type: ignore[return-value]
 
        return data  # type: ignore[no-any-return]
kreuzberg/_utils/_device.py
CHANGED
@@ -5,6 +5,7 @@ from __future__ import annotations
 
 import warnings
 from dataclasses import dataclass
+from itertools import chain
 from typing import Literal
 
 from kreuzberg.exceptions import ValidationError
@@ -12,7 +13,7 @@ from kreuzberg.exceptions import ValidationError
 DeviceType = Literal["cpu", "cuda", "mps", "auto"]
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, slots=True)
 class DeviceInfo:
     """Information about a compute device."""
 
@@ -34,28 +35,17 @@ def detect_available_devices() -> list[DeviceInfo]:
     Returns:
         List of available devices, with the most preferred device first.
     """
-
-
-    devices.append(
-        DeviceInfo(
-            device_type="cpu",
-            name="CPU",
-        )
-    )
-
-    if _is_cuda_available():
-        cuda_devices = _get_cuda_devices()
-        devices.extend(cuda_devices)
+    # Build device lists efficiently using generators
+    cpu_device = DeviceInfo(device_type="cpu", name="CPU")
 
-    if
-        mps_device = _get_mps_device()
-        if mps_device:
-            devices.append(mps_device)
+    cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
 
-
-
+    mps_device = _get_mps_device() if _is_mps_available() else None
+    mps_devices = [mps_device] if mps_device else []
 
-
+    # Return GPU devices first, then CPU using itertools.chain
+    gpu_devices = list(chain(cuda_devices, mps_devices))
+    return [*gpu_devices, cpu_device]
 
 
 def get_optimal_device() -> DeviceInfo:
kreuzberg/_utils/_errors.py
CHANGED
@@ -5,12 +5,48 @@ from __future__ import annotations
 import platform
 import traceback
 from datetime import datetime, timezone
-from
+from pathlib import Path
+from typing import Any
 
 import psutil
 
-
-
+from kreuzberg.exceptions import ValidationError
+
+# Define error keywords as frozensets for O(1) membership testing
+_SYSTEM_ERROR_KEYWORDS = frozenset({"memory", "resource", "process", "thread"})
+_TRANSIENT_ERROR_PATTERNS = frozenset(
+    {
+        "temporary",
+        "locked",
+        "in use",
+        "access denied",
+        "permission",
+        "timeout",
+        "connection",
+        "network",
+        "too many open files",
+        "cannot allocate memory",
+        "resource temporarily unavailable",
+        "broken pipe",
+        "subprocess",
+        "signal",
+    }
+)
+_RESOURCE_ERROR_PATTERNS = frozenset(
+    {
+        "memory",
+        "out of memory",
+        "cannot allocate",
+        "too many open files",
+        "file descriptor",
+        "resource",
+        "exhausted",
+        "limit",
+        "cpu",
+        "thread",
+        "process",
+    }
+)
 
 
 def create_error_context(
@@ -37,8 +73,6 @@ def create_error_context(
     }
 
     if file_path:
-        from pathlib import Path
-
         path = Path(file_path) if isinstance(file_path, str) else file_path
         context["file"] = {
             "path": str(path),
@@ -54,11 +88,7 @@ def create_error_context(
         "traceback": traceback.format_exception_only(type(error), error),
     }
 
-    if (
-        any(keyword in str(error).lower() for keyword in ["memory", "resource", "process", "thread"])
-        if error
-        else False
-    ):
+    if error and any(keyword in str(error).lower() for keyword in _SYSTEM_ERROR_KEYWORDS):
        try:
            mem = psutil.virtual_memory()
            context["system"] = {
@@ -96,25 +126,8 @@ def is_transient_error(error: Exception) -> bool:
     if isinstance(error, transient_types):
         return True
 
-    transient_patterns = [
-        "temporary",
-        "locked",
-        "in use",
-        "access denied",
-        "permission",
-        "timeout",
-        "connection",
-        "network",
-        "too many open files",
-        "cannot allocate memory",
-        "resource temporarily unavailable",
-        "broken pipe",
-        "subprocess",
-        "signal",
-    ]
-
     error_str = str(error).lower()
-    return any(pattern in error_str for pattern in
+    return any(pattern in error_str for pattern in _TRANSIENT_ERROR_PATTERNS)
 
 
 def is_resource_error(error: Exception) -> bool:
@@ -126,22 +139,8 @@ def is_resource_error(error: Exception) -> bool:
     Returns:
         True if the error is resource-related
     """
-    resource_patterns = [
-        "memory",
-        "out of memory",
-        "cannot allocate",
-        "too many open files",
-        "file descriptor",
-        "resource",
-        "exhausted",
-        "limit",
-        "cpu",
-        "thread",
-        "process",
-    ]
-
     error_str = str(error).lower()
-    return any(pattern in error_str for pattern in
+    return any(pattern in error_str for pattern in _RESOURCE_ERROR_PATTERNS)
 
 
 def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
@@ -158,8 +157,6 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
     if attempt >= max_attempts:
         return False
 
-    from kreuzberg.exceptions import ValidationError
-
     if isinstance(error, ValidationError):
         return False
 
@@ -169,6 +166,8 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
 class BatchExtractionResult:
     """Result container for batch operations with partial success support."""
 
+    __slots__ = ("failed", "successful", "total_count")
+
     def __init__(self) -> None:
         """Initialize batch result container."""
         self.successful: list[tuple[int, Any]] = []
kreuzberg/_utils/_process_pool.py
CHANGED
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import io
 import multiprocessing as mp
 from concurrent.futures import ProcessPoolExecutor
 from contextlib import contextmanager
@@ -9,6 +10,7 @@ from typing import TYPE_CHECKING, Any, TypeVar
 
 import anyio
 import psutil
+import pypdfium2
 from typing_extensions import Self
 
 if TYPE_CHECKING:
@@ -59,8 +61,6 @@ def shutdown_process_pool() -> None:
 
 def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
     """Worker function for extracting PDF text in a separate process."""
-    import pypdfium2
-
     pdf = None
     try:
         pdf = pypdfium2.PdfDocument(pdf_path)
@@ -81,10 +81,6 @@ def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
 
 def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
     """Worker function for converting PDF to images in a separate process."""
-    import io
-
-    import pypdfium2
-
     pdf = None
     try:
         pdf = pypdfium2.PdfDocument(pdf_path)
kreuzberg/_utils/_quality.py
CHANGED
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import re
+from functools import reduce
 from typing import Any
 
 # Pre-compiled patterns for performance
@@ -102,9 +103,8 @@ def clean_extracted_text(text: str) -> str:
     if not text:
         return text
 
-    # Remove script and style content
-
-        text = pattern.sub(" ", text)
+    # Remove script and style content using functools.reduce for single pass
+    text = reduce(lambda t, pattern: pattern.sub(" ", t), _SCRIPT_PATTERNS.values(), text)
 
     # Clean OCR artifacts
     text = _clean_ocr_artifacts(text)
@@ -134,10 +134,8 @@ def _calculate_script_penalty(text: str, total_chars: int) -> float:
     if total_chars == 0:
         return 0.0
 
-
-    for pattern in _SCRIPT_PATTERNS.values()
-        matches = pattern.findall(text)
-        script_chars += sum(len(match) for match in matches)
+    # Use sum with generator expression for single-pass calculation
+    script_chars = sum(len(match) for pattern in _SCRIPT_PATTERNS.values() for match in pattern.findall(text))
 
     return min(1.0, script_chars / total_chars)
 
@@ -147,10 +145,8 @@ def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
     if total_chars == 0:
         return 0.0
 
-
-    for pattern in _NAVIGATION_PATTERNS.values()
-        matches = pattern.findall(text)
-        nav_chars += sum(len(match) for match in matches)
+    # Use sum with generator expression for single-pass calculation
+    nav_chars = sum(len(match) for pattern in _NAVIGATION_PATTERNS.values() for match in pattern.findall(text))
 
     return min(1.0, nav_chars / total_chars)
 
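
Note on the _quality.py changes above: the explicit loops were folded into functools.reduce and generator-expression sums over the module's pre-compiled patterns. A self-contained sketch of the reduce step with simplified stand-in patterns (not the module's actual regexes):

import re
from functools import reduce

_SCRIPT_PATTERNS = {  # abbreviated stand-ins for illustration
    "script": re.compile(r"<script.*?</script>", re.DOTALL | re.IGNORECASE),
    "style": re.compile(r"<style.*?</style>", re.DOTALL | re.IGNORECASE),
}

text = "<script>x()</script>Hello <style>p{}</style>world"
# fold each pattern's substitution over the text, left to right
cleaned = reduce(lambda t, pattern: pattern.sub(" ", t), _SCRIPT_PATTERNS.values(), text)
print(cleaned)  # " Hello  world"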
kreuzberg/_utils/_serialization.py
CHANGED
@@ -2,16 +2,28 @@
 
 from __future__ import annotations
 
-from dataclasses import
-from enum import Enum
+from dataclasses import is_dataclass
 from typing import Any, TypeVar, cast
 
+import msgspec
 from msgspec import MsgspecError
 from msgspec.msgpack import decode, encode
 
 T = TypeVar("T")
 
 
+# Define dict method names in priority order
+_DICT_METHOD_NAMES = (
+    "to_dict",
+    "as_dict",
+    "dict",
+    "model_dump",
+    "json",
+    "to_list",
+    "tolist",
+)
+
+
 def encode_hook(obj: Any) -> Any:
     """Custom encoder for complex objects."""
     if callable(obj):
@@ -20,22 +32,15 @@ def encode_hook(obj: Any) -> Any:
     if isinstance(obj, Exception):
         return {"message": str(obj), "type": type(obj).__name__}
 
-    for
-
-
-
-
-        "json",
-        "to_list",
-        "tolist",
-    ):
-        if hasattr(obj, key):
-            method = getattr(obj, key)  # Cache the attribute lookup
-            if callable(method):
-                return method()
+    # Check for dict-like methods more efficiently using any() with generator
+    for attr_name in _DICT_METHOD_NAMES:
+        method = getattr(obj, attr_name, None)
+        if method is not None and callable(method):
+            return method()
 
     if is_dataclass(obj) and not isinstance(obj, type):
-
+        # Use msgspec.to_builtins for more efficient conversion
+        return msgspec.to_builtins(obj)
 
     if hasattr(obj, "save") and hasattr(obj, "format"):
         return None
kreuzberg/_utils/_string.py
CHANGED
@@ -28,6 +28,7 @@ _encoding_cache: dict[str, str] = {}
 @lru_cache(maxsize=128)
 def _get_encoding_cache_key(data_hash: str, size: int) -> str:
     """Generate cache key for encoding detection."""
+    # Use string interpolation which is faster than format strings for simple cases
     return f"{data_hash}:{size}"
 
 
@@ -104,25 +105,29 @@ def _calculate_text_confidence(text: str) -> float:
     if not text:
         return 0.0
 
-    # Check for common encoding problems
-    replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
-    control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
     total_chars = len(text)
-
     if total_chars == 0:
         return 0.0
 
+    # Check for common encoding problems - compile patterns once
+    replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
+    control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
+
     # Penalize replacement and control characters
     penalty = (replacement_count + control_count * 2) / total_chars
 
-    # Bonus for readable character ranges
+    # Bonus for readable character ranges - more efficient counting
+    # Use generator expression with early termination
     readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
     readability_score = readable_chars / total_chars
 
     # Check for suspicious Cyrillic that might be misencoded Hebrew
     cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
-    if cyrillic_matches
-
+    if cyrillic_matches:
+        # Calculate total length more efficiently
+        cyrillic_length = sum(len(match) for match in cyrillic_matches)
+        if cyrillic_length > total_chars * 0.1:
+            penalty += 0.3  # Heavy penalty for likely mojibake
 
     return max(0.0, min(1.0, readability_score - penalty))
 
@@ -164,7 +169,8 @@ def normalize_spaces(text: str) -> str:
 
     # Split by double newlines to preserve paragraph breaks
     paragraphs = text.split("\n\n")
-
+
+    result_paragraphs = []
 
     for paragraph in paragraphs:
         # Use pre-compiled patterns for better performance
@@ -173,10 +179,14 @@ def normalize_spaces(text: str) -> str:
         # Clean up multiple newlines within paragraph (keep single newlines)
         cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
 
-        #
-        lines = [
+        # Process lines efficiently - manual loop avoids double strip() calls
+        lines = []
+        for line in cleaned.split("\n"):
+            stripped_line = line.strip()
+            if stripped_line:
+                lines.append(stripped_line)
 
         if lines:
-
+            result_paragraphs.append("\n".join(lines))
 
-    return "\n\n".join(
+    return "\n\n".join(result_paragraphs)
kreuzberg/_utils/_table.py
CHANGED
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import csv
-from io import StringIO
 from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
@@ -23,9 +22,9 @@ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
     if "df" not in table or table["df"] is None:
         return ""
 
-
-    table["df"].to_csv(
-    return
+    # Use pandas to_csv() direct string return instead of StringIO
+    csv_output = table["df"].to_csv(sep=separator, index=False, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
+    return str(csv_output).strip()
 
 
 def export_table_to_tsv(table: TableData) -> str:
kreuzberg/cli.py
CHANGED
@@ -4,6 +4,7 @@ from __future__ import annotations
 
 import json
 import sys
+import traceback
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
@@ -17,7 +18,7 @@ except ImportError as e:
     ) from e
 
 from kreuzberg import __version__, extract_bytes_sync, extract_file_sync
-from kreuzberg.
+from kreuzberg._config import build_extraction_config, find_config_file, load_config_from_file
 from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
 
 DEFAULT_MAX_CHARACTERS = 4000
@@ -91,7 +92,7 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
     if config:
         file_config = load_config_from_file(config)
     else:
-        default_config =
+        default_config = find_config_file()
         if default_config:
             try:
                 file_config = load_config_from_file(default_config)
@@ -211,8 +212,6 @@ def handle_error(error: Exception, verbose: bool) -> None:
     else:
         console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
         if verbose:
-            import traceback
-
            console.print("\n[dim]Traceback:[/dim]")
            traceback.print_exc()
     sys.exit(1)
@@ -315,7 +314,7 @@ def extract(  # noqa: PLR0913
 def config(config: Path | None) -> None:
     """Show current configuration."""
     try:
-        config_path = config or
+        config_path = config or find_config_file()
 
         if config_path:
             file_config = load_config_from_file(config_path)