kreuzberg 3.8.0__py3-none-any.whl → 3.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. kreuzberg/__init__.py +4 -0
  2. kreuzberg/_api/main.py +22 -1
  3. kreuzberg/_config.py +404 -0
  4. kreuzberg/_entity_extraction.py +4 -5
  5. kreuzberg/_extractors/_base.py +3 -5
  6. kreuzberg/_extractors/_image.py +18 -32
  7. kreuzberg/_extractors/_pandoc.py +3 -14
  8. kreuzberg/_extractors/_pdf.py +39 -57
  9. kreuzberg/_extractors/_spread_sheet.py +2 -3
  10. kreuzberg/_extractors/_structured.py +10 -7
  11. kreuzberg/_gmft.py +314 -10
  12. kreuzberg/_language_detection.py +1 -1
  13. kreuzberg/_mcp/server.py +58 -8
  14. kreuzberg/_ocr/__init__.py +1 -22
  15. kreuzberg/_ocr/_base.py +59 -0
  16. kreuzberg/_ocr/_easyocr.py +92 -1
  17. kreuzberg/_ocr/_paddleocr.py +90 -1
  18. kreuzberg/_ocr/_tesseract.py +556 -5
  19. kreuzberg/_playa.py +2 -3
  20. kreuzberg/_types.py +46 -24
  21. kreuzberg/_utils/_cache.py +35 -4
  22. kreuzberg/_utils/_device.py +10 -20
  23. kreuzberg/_utils/_errors.py +44 -45
  24. kreuzberg/_utils/_process_pool.py +2 -6
  25. kreuzberg/_utils/_quality.py +7 -11
  26. kreuzberg/_utils/_serialization.py +21 -16
  27. kreuzberg/_utils/_string.py +22 -12
  28. kreuzberg/_utils/_table.py +3 -4
  29. kreuzberg/cli.py +4 -5
  30. kreuzberg/exceptions.py +10 -0
  31. kreuzberg/extraction.py +6 -24
  32. kreuzberg-3.8.2.dist-info/METADATA +265 -0
  33. kreuzberg-3.8.2.dist-info/RECORD +53 -0
  34. kreuzberg/_cli_config.py +0 -175
  35. kreuzberg/_multiprocessing/__init__.py +0 -5
  36. kreuzberg/_multiprocessing/gmft_isolated.py +0 -330
  37. kreuzberg/_ocr/_pool.py +0 -357
  38. kreuzberg/_ocr/_sync.py +0 -566
  39. kreuzberg-3.8.0.dist-info/METADATA +0 -313
  40. kreuzberg-3.8.0.dist-info/RECORD +0 -57
  41. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/WHEEL +0 -0
  42. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/entry_points.txt +0 -0
  43. {kreuzberg-3.8.0.dist-info → kreuzberg-3.8.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_types.py CHANGED
@@ -5,7 +5,10 @@ from collections.abc import Awaitable, Callable
5
5
  from dataclasses import asdict, dataclass, field
6
6
  from typing import TYPE_CHECKING, Any, Literal, TypedDict
7
7
 
8
+ import msgspec
9
+
8
10
  from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
11
+ from kreuzberg._utils._table import export_table_to_csv, export_table_to_tsv, extract_table_structure_info
9
12
  from kreuzberg.exceptions import ValidationError
10
13
 
11
14
  if sys.version_info < (3, 11): # pragma: no cover
@@ -191,7 +194,7 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
191
194
  return normalized
192
195
 
193
196
 
194
- @dataclass(frozen=True)
197
+ @dataclass(frozen=True, slots=True)
195
198
  class Entity:
196
199
  """Represents an extracted entity with type, text, and position."""
197
200
 
@@ -205,7 +208,7 @@ class Entity:
205
208
  """End character offset in the content"""
206
209
 
207
210
 
208
- @dataclass
211
+ @dataclass(slots=True)
209
212
  class ExtractionResult:
210
213
  """The result of a file extraction."""
211
214
 
@@ -226,9 +229,29 @@ class ExtractionResult:
226
229
  detected_languages: list[str] | None = None
227
230
  """Languages detected in the extracted content, if language detection is enabled."""
228
231
 
229
- def to_dict(self) -> dict[str, Any]:
230
- """Converts the ExtractionResult to a dictionary."""
231
- return asdict(self)
232
+ def to_dict(self, include_none: bool = False) -> dict[str, Any]:
233
+ """Converts the ExtractionResult to a dictionary.
234
+
235
+ Args:
236
+ include_none: If True, include fields with None values.
237
+ If False (default), exclude None values.
238
+
239
+ Returns:
240
+ Dictionary representation of the ExtractionResult.
241
+ """
242
+ # Use msgspec.to_builtins for efficient conversion
243
+ # The builtin_types parameter allows DataFrames to pass through
244
+ result = msgspec.to_builtins(
245
+ self,
246
+ builtin_types=(type(None),), # Allow None to pass through
247
+ order="deterministic", # Ensure consistent output
248
+ )
249
+
250
+ if include_none:
251
+ return result # type: ignore[no-any-return]
252
+
253
+ # Remove None values to match expected behavior
254
+ return {k: v for k, v in result.items() if v is not None}
232
255
 
233
256
  def export_tables_to_csv(self) -> list[str]:
234
257
  """Export all tables to CSV format.
@@ -239,8 +262,6 @@ class ExtractionResult:
239
262
  if not self.tables:
240
263
  return []
241
264
 
242
- from kreuzberg._utils._table import export_table_to_csv
243
-
244
265
  return [export_table_to_csv(table) for table in self.tables]
245
266
 
246
267
  def export_tables_to_tsv(self) -> list[str]:
@@ -252,8 +273,6 @@ class ExtractionResult:
252
273
  if not self.tables:
253
274
  return []
254
275
 
255
- from kreuzberg._utils._table import export_table_to_tsv
256
-
257
276
  return [export_table_to_tsv(table) for table in self.tables]
258
277
 
259
278
  def get_table_summaries(self) -> list[dict[str, Any]]:
@@ -265,8 +284,6 @@ class ExtractionResult:
265
284
  if not self.tables:
266
285
  return []
267
286
 
268
- from kreuzberg._utils._table import extract_table_structure_info
269
-
270
287
  return [extract_table_structure_info(table) for table in self.tables]
271
288
 
272
289
 
@@ -274,7 +291,7 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[E
274
291
  ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
275
292
 
276
293
 
277
- @dataclass(unsafe_hash=True)
294
+ @dataclass(unsafe_hash=True, slots=True)
278
295
  class ExtractionConfig:
279
296
  """Represents configuration settings for an extraction process.
280
297
 
@@ -355,18 +372,23 @@ class ExtractionConfig:
355
372
  Returns:
356
373
  A dict of the OCR configuration or an empty dict if no backend is provided.
357
374
  """
358
- if self.ocr_backend is not None:
359
- if self.ocr_config is not None:
360
- return asdict(self.ocr_config)
361
- if self.ocr_backend == "tesseract":
362
- from kreuzberg._ocr._tesseract import TesseractConfig
375
+ if self.ocr_backend is None:
376
+ return {}
363
377
 
364
- return asdict(TesseractConfig())
365
- if self.ocr_backend == "easyocr":
366
- from kreuzberg._ocr._easyocr import EasyOCRConfig
378
+ if self.ocr_config is not None:
379
+ # Use asdict for OCR configs to preserve enum objects correctly
380
+ return asdict(self.ocr_config)
367
381
 
368
- return asdict(EasyOCRConfig())
369
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig
382
+ # Lazy load and cache default configs instead of creating new instances
383
+ if self.ocr_backend == "tesseract":
384
+ from kreuzberg._ocr._tesseract import TesseractConfig
370
385
 
371
- return asdict(PaddleOCRConfig())
372
- return {}
386
+ return asdict(TesseractConfig())
387
+ if self.ocr_backend == "easyocr":
388
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
389
+
390
+ return asdict(EasyOCRConfig())
391
+ # paddleocr
392
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
393
+
394
+ return asdict(PaddleOCRConfig())
kreuzberg/_utils/_cache.py CHANGED
@@ -64,11 +64,10 @@ class KreuzbergCache(Generic[T]):
64
64
  Returns:
65
65
  Unique cache key string
66
66
  """
67
- # Use more efficient string building for cache key
68
67
  if not kwargs:
69
68
  return "empty"
70
69
 
71
- # Build key string efficiently
70
+ # Build cache key using list + join (faster than StringIO)
72
71
  parts = []
73
72
  for key in sorted(kwargs):
74
73
  value = kwargs[key]
@@ -81,6 +80,7 @@ class KreuzbergCache(Generic[T]):
81
80
  parts.append(f"{key}={type(value).__name__}:{value!s}")
82
81
 
83
82
  cache_str = "&".join(parts)
83
+ # SHA256 is secure and fast enough for cache keys
84
84
  return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
85
85
 
86
86
  def _get_cache_path(self, cache_key: str) -> Path:
@@ -102,15 +102,46 @@ class KreuzbergCache(Generic[T]):
102
102
 
103
103
  def _serialize_result(self, result: T) -> dict[str, Any]:
104
104
  """Serialize result for caching with metadata."""
105
+ # Handle TableData objects that contain DataFrames
106
+ if isinstance(result, list) and result and isinstance(result[0], dict) and "df" in result[0]:
107
+ serialized_data = []
108
+ for item in result:
109
+ if isinstance(item, dict) and "df" in item:
110
+ # Build new dict without unnecessary copy
111
+ serialized_item = {k: v for k, v in item.items() if k != "df"}
112
+ if hasattr(item["df"], "to_csv"):
113
+ serialized_item["df_csv"] = item["df"].to_csv(index=False)
114
+ else:
115
+ # Fallback for non-DataFrame objects
116
+ serialized_item["df_csv"] = str(item["df"])
117
+ serialized_data.append(serialized_item)
118
+ else:
119
+ serialized_data.append(item)
120
+ return {"type": "TableDataList", "data": serialized_data, "cached_at": time.time()}
121
+
105
122
  return {"type": type(result).__name__, "data": result, "cached_at": time.time()}
106
123
 
107
124
  def _deserialize_result(self, cached_data: dict[str, Any]) -> T:
108
125
  """Deserialize cached result."""
109
126
  data = cached_data["data"]
110
127
 
111
- if cached_data.get("type") == "ExtractionResult" and isinstance(data, dict):
112
- from kreuzberg._types import ExtractionResult
128
+ if cached_data.get("type") == "TableDataList" and isinstance(data, list):
129
+ from io import StringIO
130
+
131
+ import pandas as pd
113
132
 
133
+ deserialized_data = []
134
+ for item in data:
135
+ if isinstance(item, dict) and "df_csv" in item:
136
+ # Build new dict without unnecessary copy
137
+ deserialized_item = {k: v for k, v in item.items() if k != "df_csv"}
138
+ deserialized_item["df"] = pd.read_csv(StringIO(item["df_csv"]))
139
+ deserialized_data.append(deserialized_item)
140
+ else:
141
+ deserialized_data.append(item)
142
+ return deserialized_data # type: ignore[return-value]
143
+
144
+ if cached_data.get("type") == "ExtractionResult" and isinstance(data, dict):
114
145
  return ExtractionResult(**data) # type: ignore[return-value]
115
146
 
116
147
  return data # type: ignore[no-any-return]
kreuzberg/_utils/_device.py CHANGED
@@ -5,6 +5,7 @@ from __future__ import annotations
5
5
 
6
6
  import warnings
7
7
  from dataclasses import dataclass
8
+ from itertools import chain
8
9
  from typing import Literal
9
10
 
10
11
  from kreuzberg.exceptions import ValidationError
@@ -12,7 +13,7 @@ from kreuzberg.exceptions import ValidationError
12
13
  DeviceType = Literal["cpu", "cuda", "mps", "auto"]
13
14
 
14
15
 
15
- @dataclass(frozen=True)
16
+ @dataclass(frozen=True, slots=True)
16
17
  class DeviceInfo:
17
18
  """Information about a compute device."""
18
19
 
@@ -34,28 +35,17 @@ def detect_available_devices() -> list[DeviceInfo]:
34
35
  Returns:
35
36
  List of available devices, with the most preferred device first.
36
37
  """
37
- devices: list[DeviceInfo] = []
38
-
39
- devices.append(
40
- DeviceInfo(
41
- device_type="cpu",
42
- name="CPU",
43
- )
44
- )
45
-
46
- if _is_cuda_available():
47
- cuda_devices = _get_cuda_devices()
48
- devices.extend(cuda_devices)
38
+ # Build device lists efficiently using generators
39
+ cpu_device = DeviceInfo(device_type="cpu", name="CPU")
49
40
 
50
- if _is_mps_available():
51
- mps_device = _get_mps_device()
52
- if mps_device:
53
- devices.append(mps_device)
41
+ cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
54
42
 
55
- gpu_devices = [d for d in devices if d.device_type != "cpu"]
56
- cpu_devices = [d for d in devices if d.device_type == "cpu"]
43
+ mps_device = _get_mps_device() if _is_mps_available() else None
44
+ mps_devices = [mps_device] if mps_device else []
57
45
 
58
- return gpu_devices + cpu_devices
46
+ # Return GPU devices first, then CPU using itertools.chain
47
+ gpu_devices = list(chain(cuda_devices, mps_devices))
48
+ return [*gpu_devices, cpu_device]
59
49
 
60
50
 
61
51
  def get_optimal_device() -> DeviceInfo:
kreuzberg/_utils/_errors.py CHANGED
@@ -5,12 +5,48 @@ from __future__ import annotations
5
5
  import platform
6
6
  import traceback
7
7
  from datetime import datetime, timezone
8
- from typing import TYPE_CHECKING, Any
8
+ from pathlib import Path
9
+ from typing import Any
9
10
 
10
11
  import psutil
11
12
 
12
- if TYPE_CHECKING:
13
- from pathlib import Path
13
+ from kreuzberg.exceptions import ValidationError
14
+
15
+ # Define error keywords as frozensets for O(1) membership testing
16
+ _SYSTEM_ERROR_KEYWORDS = frozenset({"memory", "resource", "process", "thread"})
17
+ _TRANSIENT_ERROR_PATTERNS = frozenset(
18
+ {
19
+ "temporary",
20
+ "locked",
21
+ "in use",
22
+ "access denied",
23
+ "permission",
24
+ "timeout",
25
+ "connection",
26
+ "network",
27
+ "too many open files",
28
+ "cannot allocate memory",
29
+ "resource temporarily unavailable",
30
+ "broken pipe",
31
+ "subprocess",
32
+ "signal",
33
+ }
34
+ )
35
+ _RESOURCE_ERROR_PATTERNS = frozenset(
36
+ {
37
+ "memory",
38
+ "out of memory",
39
+ "cannot allocate",
40
+ "too many open files",
41
+ "file descriptor",
42
+ "resource",
43
+ "exhausted",
44
+ "limit",
45
+ "cpu",
46
+ "thread",
47
+ "process",
48
+ }
49
+ )
14
50
 
15
51
 
16
52
  def create_error_context(
@@ -37,8 +73,6 @@ def create_error_context(
37
73
  }
38
74
 
39
75
  if file_path:
40
- from pathlib import Path
41
-
42
76
  path = Path(file_path) if isinstance(file_path, str) else file_path
43
77
  context["file"] = {
44
78
  "path": str(path),
@@ -54,11 +88,7 @@ def create_error_context(
54
88
  "traceback": traceback.format_exception_only(type(error), error),
55
89
  }
56
90
 
57
- if (
58
- any(keyword in str(error).lower() for keyword in ["memory", "resource", "process", "thread"])
59
- if error
60
- else False
61
- ):
91
+ if error and any(keyword in str(error).lower() for keyword in _SYSTEM_ERROR_KEYWORDS):
62
92
  try:
63
93
  mem = psutil.virtual_memory()
64
94
  context["system"] = {
@@ -96,25 +126,8 @@ def is_transient_error(error: Exception) -> bool:
96
126
  if isinstance(error, transient_types):
97
127
  return True
98
128
 
99
- transient_patterns = [
100
- "temporary",
101
- "locked",
102
- "in use",
103
- "access denied",
104
- "permission",
105
- "timeout",
106
- "connection",
107
- "network",
108
- "too many open files",
109
- "cannot allocate memory",
110
- "resource temporarily unavailable",
111
- "broken pipe",
112
- "subprocess",
113
- "signal",
114
- ]
115
-
116
129
  error_str = str(error).lower()
117
- return any(pattern in error_str for pattern in transient_patterns)
130
+ return any(pattern in error_str for pattern in _TRANSIENT_ERROR_PATTERNS)
118
131
 
119
132
 
120
133
  def is_resource_error(error: Exception) -> bool:
@@ -126,22 +139,8 @@ def is_resource_error(error: Exception) -> bool:
126
139
  Returns:
127
140
  True if the error is resource-related
128
141
  """
129
- resource_patterns = [
130
- "memory",
131
- "out of memory",
132
- "cannot allocate",
133
- "too many open files",
134
- "file descriptor",
135
- "resource",
136
- "exhausted",
137
- "limit",
138
- "cpu",
139
- "thread",
140
- "process",
141
- ]
142
-
143
142
  error_str = str(error).lower()
144
- return any(pattern in error_str for pattern in resource_patterns)
143
+ return any(pattern in error_str for pattern in _RESOURCE_ERROR_PATTERNS)
145
144
 
146
145
 
147
146
  def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
@@ -158,8 +157,6 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
158
157
  if attempt >= max_attempts:
159
158
  return False
160
159
 
161
- from kreuzberg.exceptions import ValidationError
162
-
163
160
  if isinstance(error, ValidationError):
164
161
  return False
165
162
 
@@ -169,6 +166,8 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
169
166
  class BatchExtractionResult:
170
167
  """Result container for batch operations with partial success support."""
171
168
 
169
+ __slots__ = ("failed", "successful", "total_count")
170
+
172
171
  def __init__(self) -> None:
173
172
  """Initialize batch result container."""
174
173
  self.successful: list[tuple[int, Any]] = []
kreuzberg/_utils/_process_pool.py CHANGED
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ import io
5
6
  import multiprocessing as mp
6
7
  from concurrent.futures import ProcessPoolExecutor
7
8
  from contextlib import contextmanager
@@ -9,6 +10,7 @@ from typing import TYPE_CHECKING, Any, TypeVar
9
10
 
10
11
  import anyio
11
12
  import psutil
13
+ import pypdfium2
12
14
  from typing_extensions import Self
13
15
 
14
16
  if TYPE_CHECKING:
@@ -59,8 +61,6 @@ def shutdown_process_pool() -> None:
59
61
 
60
62
  def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
61
63
  """Worker function for extracting PDF text in a separate process."""
62
- import pypdfium2
63
-
64
64
  pdf = None
65
65
  try:
66
66
  pdf = pypdfium2.PdfDocument(pdf_path)
@@ -81,10 +81,6 @@ def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
81
81
 
82
82
  def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
83
83
  """Worker function for converting PDF to images in a separate process."""
84
- import io
85
-
86
- import pypdfium2
87
-
88
84
  pdf = None
89
85
  try:
90
86
  pdf = pypdfium2.PdfDocument(pdf_path)
kreuzberg/_utils/_quality.py CHANGED
@@ -3,6 +3,7 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import re
6
+ from functools import reduce
6
7
  from typing import Any
7
8
 
8
9
  # Pre-compiled patterns for performance
@@ -102,9 +103,8 @@ def clean_extracted_text(text: str) -> str:
102
103
  if not text:
103
104
  return text
104
105
 
105
- # Remove script and style content
106
- for pattern in _SCRIPT_PATTERNS.values():
107
- text = pattern.sub(" ", text)
106
+ # Remove script and style content using functools.reduce for single pass
107
+ text = reduce(lambda t, pattern: pattern.sub(" ", t), _SCRIPT_PATTERNS.values(), text)
108
108
 
109
109
  # Clean OCR artifacts
110
110
  text = _clean_ocr_artifacts(text)
@@ -134,10 +134,8 @@ def _calculate_script_penalty(text: str, total_chars: int) -> float:
134
134
  if total_chars == 0:
135
135
  return 0.0
136
136
 
137
- script_chars = 0
138
- for pattern in _SCRIPT_PATTERNS.values():
139
- matches = pattern.findall(text)
140
- script_chars += sum(len(match) for match in matches)
137
+ # Use sum with generator expression for single-pass calculation
138
+ script_chars = sum(len(match) for pattern in _SCRIPT_PATTERNS.values() for match in pattern.findall(text))
141
139
 
142
140
  return min(1.0, script_chars / total_chars)
143
141
 
@@ -147,10 +145,8 @@ def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
147
145
  if total_chars == 0:
148
146
  return 0.0
149
147
 
150
- nav_chars = 0
151
- for pattern in _NAVIGATION_PATTERNS.values():
152
- matches = pattern.findall(text)
153
- nav_chars += sum(len(match) for match in matches)
148
+ # Use sum with generator expression for single-pass calculation
149
+ nav_chars = sum(len(match) for pattern in _NAVIGATION_PATTERNS.values() for match in pattern.findall(text))
154
150
 
155
151
  return min(1.0, nav_chars / total_chars)
156
152
 
kreuzberg/_utils/_serialization.py CHANGED
@@ -2,16 +2,28 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
- from dataclasses import asdict, is_dataclass
6
- from enum import Enum
5
+ from dataclasses import is_dataclass
7
6
  from typing import Any, TypeVar, cast
8
7
 
8
+ import msgspec
9
9
  from msgspec import MsgspecError
10
10
  from msgspec.msgpack import decode, encode
11
11
 
12
12
  T = TypeVar("T")
13
13
 
14
14
 
15
+ # Define dict method names in priority order
16
+ _DICT_METHOD_NAMES = (
17
+ "to_dict",
18
+ "as_dict",
19
+ "dict",
20
+ "model_dump",
21
+ "json",
22
+ "to_list",
23
+ "tolist",
24
+ )
25
+
26
+
15
27
  def encode_hook(obj: Any) -> Any:
16
28
  """Custom encoder for complex objects."""
17
29
  if callable(obj):
@@ -20,22 +32,15 @@ def encode_hook(obj: Any) -> Any:
20
32
  if isinstance(obj, Exception):
21
33
  return {"message": str(obj), "type": type(obj).__name__}
22
34
 
23
- for key in (
24
- "to_dict",
25
- "as_dict",
26
- "dict",
27
- "model_dump",
28
- "json",
29
- "to_list",
30
- "tolist",
31
- ):
32
- if hasattr(obj, key):
33
- method = getattr(obj, key) # Cache the attribute lookup
34
- if callable(method):
35
- return method()
35
+ # Check for dict-like methods more efficiently using any() with generator
36
+ for attr_name in _DICT_METHOD_NAMES:
37
+ method = getattr(obj, attr_name, None)
38
+ if method is not None and callable(method):
39
+ return method()
36
40
 
37
41
  if is_dataclass(obj) and not isinstance(obj, type):
38
- return {k: v if not isinstance(v, Enum) else v.value for (k, v) in asdict(obj).items()}
42
+ # Use msgspec.to_builtins for more efficient conversion
43
+ return msgspec.to_builtins(obj)
39
44
 
40
45
  if hasattr(obj, "save") and hasattr(obj, "format"):
41
46
  return None
kreuzberg/_utils/_string.py CHANGED
@@ -28,6 +28,7 @@ _encoding_cache: dict[str, str] = {}
28
28
  @lru_cache(maxsize=128)
29
29
  def _get_encoding_cache_key(data_hash: str, size: int) -> str:
30
30
  """Generate cache key for encoding detection."""
31
+ # Use string interpolation which is faster than format strings for simple cases
31
32
  return f"{data_hash}:{size}"
32
33
 
33
34
 
@@ -104,25 +105,29 @@ def _calculate_text_confidence(text: str) -> float:
104
105
  if not text:
105
106
  return 0.0
106
107
 
107
- # Check for common encoding problems
108
- replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
109
- control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
110
108
  total_chars = len(text)
111
-
112
109
  if total_chars == 0:
113
110
  return 0.0
114
111
 
112
+ # Check for common encoding problems - compile patterns once
113
+ replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
114
+ control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
115
+
115
116
  # Penalize replacement and control characters
116
117
  penalty = (replacement_count + control_count * 2) / total_chars
117
118
 
118
- # Bonus for readable character ranges
119
+ # Bonus for readable character ranges - more efficient counting
120
+ # Use generator expression with early termination
119
121
  readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
120
122
  readability_score = readable_chars / total_chars
121
123
 
122
124
  # Check for suspicious Cyrillic that might be misencoded Hebrew
123
125
  cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
124
- if cyrillic_matches and len("".join(cyrillic_matches)) > total_chars * 0.1:
125
- penalty += 0.3 # Heavy penalty for likely mojibake
126
+ if cyrillic_matches:
127
+ # Calculate total length more efficiently
128
+ cyrillic_length = sum(len(match) for match in cyrillic_matches)
129
+ if cyrillic_length > total_chars * 0.1:
130
+ penalty += 0.3 # Heavy penalty for likely mojibake
126
131
 
127
132
  return max(0.0, min(1.0, readability_score - penalty))
128
133
 
@@ -164,7 +169,8 @@ def normalize_spaces(text: str) -> str:
164
169
 
165
170
  # Split by double newlines to preserve paragraph breaks
166
171
  paragraphs = text.split("\n\n")
167
- normalized_paragraphs = []
172
+
173
+ result_paragraphs = []
168
174
 
169
175
  for paragraph in paragraphs:
170
176
  # Use pre-compiled patterns for better performance
@@ -173,10 +179,14 @@ def normalize_spaces(text: str) -> str:
173
179
  # Clean up multiple newlines within paragraph (keep single newlines)
174
180
  cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
175
181
 
176
- # Strip and filter empty lines efficiently
177
- lines = [line.strip() for line in cleaned.split("\n") if line.strip()]
182
+ # Process lines efficiently - manual loop avoids double strip() calls
183
+ lines = []
184
+ for line in cleaned.split("\n"):
185
+ stripped_line = line.strip()
186
+ if stripped_line:
187
+ lines.append(stripped_line)
178
188
 
179
189
  if lines:
180
- normalized_paragraphs.append("\n".join(lines))
190
+ result_paragraphs.append("\n".join(lines))
181
191
 
182
- return "\n\n".join(normalized_paragraphs)
192
+ return "\n\n".join(result_paragraphs)
kreuzberg/_utils/_table.py CHANGED
@@ -3,7 +3,6 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import csv
6
- from io import StringIO
7
6
  from typing import TYPE_CHECKING, Any
8
7
 
9
8
  if TYPE_CHECKING:
@@ -23,9 +22,9 @@ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
23
22
  if "df" not in table or table["df"] is None:
24
23
  return ""
25
24
 
26
- output = StringIO()
27
- table["df"].to_csv(output, sep=separator, index=False, quoting=csv.QUOTE_MINIMAL)
28
- return output.getvalue().strip()
25
+ # Use pandas to_csv() direct string return instead of StringIO
26
+ csv_output = table["df"].to_csv(sep=separator, index=False, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
27
+ return str(csv_output).strip()
29
28
 
30
29
 
31
30
  def export_table_to_tsv(table: TableData) -> str:
kreuzberg/cli.py CHANGED
@@ -4,6 +4,7 @@ from __future__ import annotations
4
4
 
5
5
  import json
6
6
  import sys
7
+ import traceback
7
8
  from pathlib import Path
8
9
  from typing import TYPE_CHECKING, Any
9
10
 
@@ -17,7 +18,7 @@ except ImportError as e:
17
18
  ) from e
18
19
 
19
20
  from kreuzberg import __version__, extract_bytes_sync, extract_file_sync
20
- from kreuzberg._cli_config import build_extraction_config, find_default_config, load_config_from_file
21
+ from kreuzberg._config import build_extraction_config, find_config_file, load_config_from_file
21
22
  from kreuzberg.exceptions import KreuzbergError, MissingDependencyError
22
23
 
23
24
  DEFAULT_MAX_CHARACTERS = 4000
@@ -91,7 +92,7 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
91
92
  if config:
92
93
  file_config = load_config_from_file(config)
93
94
  else:
94
- default_config = find_default_config()
95
+ default_config = find_config_file()
95
96
  if default_config:
96
97
  try:
97
98
  file_config = load_config_from_file(default_config)
@@ -211,8 +212,6 @@ def handle_error(error: Exception, verbose: bool) -> None:
211
212
  else:
212
213
  console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
213
214
  if verbose:
214
- import traceback
215
-
216
215
  console.print("\n[dim]Traceback:[/dim]")
217
216
  traceback.print_exc()
218
217
  sys.exit(1)
@@ -315,7 +314,7 @@ def extract( # noqa: PLR0913
315
314
  def config(config: Path | None) -> None:
316
315
  """Show current configuration."""
317
316
  try:
318
- config_path = config or find_default_config()
317
+ config_path = config or find_config_file()
319
318
 
320
319
  if config_path:
321
320
  file_config = load_config_from_file(config_path)