kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_ref.py
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar, cast
|
4
|
+
|
5
|
+
if TYPE_CHECKING:
|
6
|
+
from collections.abc import Callable
|
7
|
+
|
8
|
+
T = TypeVar("T")
|
9
|
+
|
10
|
+
|
11
|
+
class Ref(Generic[T]):
|
12
|
+
"""A reference container that manages singleton instances without global variables.
|
13
|
+
|
14
|
+
This provides a clean alternative to global variables by using a registry pattern
|
15
|
+
with type safety.
|
16
|
+
"""
|
17
|
+
|
18
|
+
_instances: ClassVar[dict[str, Any]] = {}
|
19
|
+
|
20
|
+
def __init__(self, name: str, factory: Callable[[], T]) -> None:
|
21
|
+
"""Initialize a reference container.
|
22
|
+
|
23
|
+
Args:
|
24
|
+
name: Unique name for this reference
|
25
|
+
factory: Factory function to create the instance when needed
|
26
|
+
"""
|
27
|
+
self.name = name
|
28
|
+
self.factory = factory
|
29
|
+
|
30
|
+
def get(self) -> T:
|
31
|
+
"""Get the singleton instance, creating it if it doesn't exist."""
|
32
|
+
if self.name not in self._instances:
|
33
|
+
self._instances[self.name] = self.factory()
|
34
|
+
return cast("T", self._instances[self.name])
|
35
|
+
|
36
|
+
def clear(self) -> None:
|
37
|
+
"""Clear the singleton instance."""
|
38
|
+
if self.name in self._instances:
|
39
|
+
del self._instances[self.name]
|
40
|
+
|
41
|
+
def is_initialized(self) -> bool:
|
42
|
+
"""Check if the singleton instance exists."""
|
43
|
+
return self.name in self._instances
|
44
|
+
|
45
|
+
@classmethod
|
46
|
+
def clear_all(cls) -> None:
|
47
|
+
"""Clear all singleton instances."""
|
48
|
+
cls._instances.clear()
|
@@ -1,5 +1,3 @@
|
|
1
|
-
"""Fast serialization utilities using msgspec."""
|
2
|
-
|
3
1
|
from __future__ import annotations
|
4
2
|
|
5
3
|
from dataclasses import is_dataclass
|
@@ -12,7 +10,6 @@ from msgspec.msgpack import decode, encode
|
|
12
10
|
T = TypeVar("T")
|
13
11
|
|
14
12
|
|
15
|
-
# Define dict method names in priority order
|
16
13
|
_DICT_METHOD_NAMES = (
|
17
14
|
"to_dict",
|
18
15
|
"as_dict",
|
@@ -32,14 +29,12 @@ def encode_hook(obj: Any) -> Any:
|
|
32
29
|
if isinstance(obj, Exception):
|
33
30
|
return {"message": str(obj), "type": type(obj).__name__}
|
34
31
|
|
35
|
-
# Check for dict-like methods more efficiently using any() with generator
|
36
32
|
for attr_name in _DICT_METHOD_NAMES:
|
37
33
|
method = getattr(obj, attr_name, None)
|
38
34
|
if method is not None and callable(method):
|
39
35
|
return method()
|
40
36
|
|
41
37
|
if is_dataclass(obj) and not isinstance(obj, type):
|
42
|
-
# Use msgspec.to_builtins for more efficient conversion
|
43
38
|
return msgspec.to_builtins(obj)
|
44
39
|
|
45
40
|
if hasattr(obj, "save") and hasattr(obj, "format"):
|
kreuzberg/_utils/_string.py
CHANGED
@@ -7,28 +7,21 @@ from functools import lru_cache
|
|
7
7
|
|
8
8
|
import chardetng_py
|
9
9
|
|
10
|
-
# Compile regex patterns once at module level for performance
|
11
10
|
_WHITESPACE_PATTERN = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
|
12
11
|
_NEWLINES_PATTERN = re.compile(r"\n+")
|
13
12
|
_MOJIBAKE_PATTERNS = {
|
14
|
-
# Hebrew as Cyrillic patterns
|
15
13
|
"hebrew_as_cyrillic": re.compile(r"[\u0400-\u04FF]{3,}"),
|
16
|
-
# Control characters that shouldn't appear in text
|
17
14
|
"control_chars": re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]"),
|
18
|
-
# Unicode replacement characters
|
19
15
|
"replacement_chars": re.compile(r"\uFFFD+"),
|
20
|
-
# Isolated combining marks (likely encoding issues)
|
21
16
|
"isolated_combining": re.compile(r"[\u0300-\u036F](?![^\u0300-\u036F])"),
|
22
17
|
}
|
23
18
|
|
24
|
-
# Simple cache for encoding detection (in-memory, session-scoped)
|
25
19
|
_encoding_cache: dict[str, str] = {}
|
26
20
|
|
27
21
|
|
28
22
|
@lru_cache(maxsize=128)
|
29
23
|
def _get_encoding_cache_key(data_hash: str, size: int) -> str:
|
30
24
|
"""Generate cache key for encoding detection."""
|
31
|
-
# Use string interpolation which is faster than format strings for simple cases
|
32
25
|
return f"{data_hash}:{size}"
|
33
26
|
|
34
27
|
|
@@ -45,14 +38,12 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
|
|
45
38
|
if not byte_data:
|
46
39
|
return ""
|
47
40
|
|
48
|
-
# Try provided encoding first (fastest path)
|
49
41
|
if encoding:
|
50
42
|
with suppress(UnicodeDecodeError, LookupError):
|
51
43
|
decoded = byte_data.decode(encoding)
|
52
44
|
return _fix_mojibake(decoded)
|
53
45
|
|
54
|
-
|
55
|
-
data_hash = hashlib.sha256(byte_data[:1024]).hexdigest()[:16] # Hash first 1KB
|
46
|
+
data_hash = hashlib.sha256(byte_data[:1024]).hexdigest()[:16]
|
56
47
|
cache_key = _get_encoding_cache_key(data_hash, len(byte_data))
|
57
48
|
|
58
49
|
if cache_key in _encoding_cache:
|
@@ -61,25 +52,22 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
|
|
61
52
|
decoded = byte_data.decode(cached_encoding)
|
62
53
|
return _fix_mojibake(decoded)
|
63
54
|
|
64
|
-
# Use chardetng for better performance than charset-normalizer
|
65
55
|
detected_encoding = chardetng_py.detect(byte_data)
|
66
56
|
if detected_encoding:
|
67
57
|
with suppress(UnicodeDecodeError, LookupError):
|
68
58
|
decoded = byte_data.decode(detected_encoding)
|
69
|
-
#
|
70
|
-
if len(_encoding_cache) < 1000: # Prevent unlimited growth
|
59
|
+
if len(_encoding_cache) < 1000: # Prevent unlimited growth ~keep
|
71
60
|
_encoding_cache[cache_key] = detected_encoding
|
72
61
|
return _fix_mojibake(decoded)
|
73
62
|
|
74
|
-
# Try multiple encodings with confidence scoring
|
75
63
|
encodings_to_try = [
|
76
64
|
"utf-8",
|
77
|
-
"windows-1255", # Hebrew
|
78
|
-
"iso-8859-8", # Hebrew
|
79
|
-
"windows-1256", # Arabic
|
80
|
-
"iso-8859-6", # Arabic
|
81
|
-
"windows-1252", # Western European
|
82
|
-
"cp1251", # Cyrillic
|
65
|
+
"windows-1255", # Hebrew ~keep
|
66
|
+
"iso-8859-8", # Hebrew ~keep
|
67
|
+
"windows-1256", # Arabic ~keep
|
68
|
+
"iso-8859-6", # Arabic ~keep
|
69
|
+
"windows-1252", # Western European ~keep
|
70
|
+
"cp1251", # Cyrillic ~keep
|
83
71
|
]
|
84
72
|
|
85
73
|
best_result = None
|
@@ -96,7 +84,6 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
|
|
96
84
|
if best_result and best_confidence > 0.5:
|
97
85
|
return _fix_mojibake(best_result)
|
98
86
|
|
99
|
-
# Final fallback
|
100
87
|
return byte_data.decode("latin-1", errors="replace")
|
101
88
|
|
102
89
|
|
@@ -109,25 +96,19 @@ def _calculate_text_confidence(text: str) -> float:
|
|
109
96
|
if total_chars == 0:
|
110
97
|
return 0.0
|
111
98
|
|
112
|
-
# Check for common encoding problems - compile patterns once
|
113
99
|
replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
|
114
100
|
control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
|
115
101
|
|
116
|
-
# Penalize replacement and control characters
|
117
102
|
penalty = (replacement_count + control_count * 2) / total_chars
|
118
103
|
|
119
|
-
# Bonus for readable character ranges - more efficient counting
|
120
|
-
# Use generator expression with early termination
|
121
104
|
readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
|
122
105
|
readability_score = readable_chars / total_chars
|
123
106
|
|
124
|
-
# Check for suspicious Cyrillic that might be misencoded Hebrew
|
125
107
|
cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
|
126
108
|
if cyrillic_matches:
|
127
|
-
# Calculate total length more efficiently
|
128
109
|
cyrillic_length = sum(len(match) for match in cyrillic_matches)
|
129
110
|
if cyrillic_length > total_chars * 0.1:
|
130
|
-
penalty += 0.3
|
111
|
+
penalty += 0.3
|
131
112
|
|
132
113
|
return max(0.0, min(1.0, readability_score - penalty))
|
133
114
|
|
@@ -137,19 +118,13 @@ def _fix_mojibake(text: str) -> str:
|
|
137
118
|
if not text:
|
138
119
|
return text
|
139
120
|
|
140
|
-
# Remove control characters
|
141
121
|
text = _MOJIBAKE_PATTERNS["control_chars"].sub("", text)
|
142
122
|
|
143
|
-
# Remove replacement characters
|
144
123
|
text = _MOJIBAKE_PATTERNS["replacement_chars"].sub("", text)
|
145
124
|
|
146
|
-
# Remove isolated combining marks
|
147
125
|
text = _MOJIBAKE_PATTERNS["isolated_combining"].sub("", text)
|
148
126
|
|
149
|
-
# Try to fix Hebrew encoded as Cyrillic (common Windows-1255 -> CP1251 confusion)
|
150
127
|
if _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].search(text):
|
151
|
-
# This is a heuristic fix - in practice, you'd need actual character mapping
|
152
|
-
# For now, we flag it for manual review by keeping the text but adding a marker
|
153
128
|
pass
|
154
129
|
|
155
130
|
return text
|
@@ -167,19 +142,14 @@ def normalize_spaces(text: str) -> str:
|
|
167
142
|
if not text or not text.strip():
|
168
143
|
return ""
|
169
144
|
|
170
|
-
# Split by double newlines to preserve paragraph breaks
|
171
145
|
paragraphs = text.split("\n\n")
|
172
146
|
|
173
147
|
result_paragraphs = []
|
174
148
|
|
175
149
|
for paragraph in paragraphs:
|
176
|
-
# Use pre-compiled patterns for better performance
|
177
|
-
# Replace multiple whitespace (except newlines) with single space
|
178
150
|
cleaned = _WHITESPACE_PATTERN.sub(" ", paragraph)
|
179
|
-
# Clean up multiple newlines within paragraph (keep single newlines)
|
180
151
|
cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
|
181
152
|
|
182
|
-
# Process lines efficiently - manual loop avoids double strip() calls
|
183
153
|
lines = []
|
184
154
|
for line in cleaned.split("\n"):
|
185
155
|
stripped_line = line.strip()
|
kreuzberg/_utils/_sync.py
CHANGED
@@ -28,7 +28,6 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
|
|
28
28
|
Returns:
|
29
29
|
The result of the synchronous function.
|
30
30
|
"""
|
31
|
-
# Optimize: only create partial if we have kwargs
|
32
31
|
if kwargs:
|
33
32
|
handler = partial(sync_fn, **kwargs)
|
34
33
|
return cast("T", await any_io_run_sync(handler, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
|
kreuzberg/_utils/_table.py
CHANGED
@@ -1,8 +1,6 @@
|
|
1
|
-
"""Table processing and export utilities."""
|
2
|
-
|
3
1
|
from __future__ import annotations
|
4
2
|
|
5
|
-
import
|
3
|
+
import io
|
6
4
|
from typing import TYPE_CHECKING, Any
|
7
5
|
|
8
6
|
if TYPE_CHECKING:
|
@@ -22,9 +20,10 @@ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
|
|
22
20
|
if "df" not in table or table["df"] is None:
|
23
21
|
return ""
|
24
22
|
|
25
|
-
|
26
|
-
|
27
|
-
|
23
|
+
buffer = io.StringIO()
|
24
|
+
df = table["df"]
|
25
|
+
df.write_csv(buffer, separator=separator, include_header=True)
|
26
|
+
return buffer.getvalue().strip()
|
28
27
|
|
29
28
|
|
30
29
|
def export_table_to_tsv(table: TableData) -> str:
|
@@ -53,24 +52,19 @@ def enhance_table_markdown(table: TableData) -> str:
|
|
53
52
|
|
54
53
|
df = table["df"]
|
55
54
|
|
56
|
-
if df.
|
55
|
+
if df.is_empty():
|
57
56
|
return table.get("text", "")
|
58
57
|
|
59
|
-
# Create enhanced markdown with proper alignment
|
60
58
|
lines = []
|
61
59
|
|
62
|
-
# Header row
|
63
60
|
headers = [str(col).strip() for col in df.columns]
|
64
61
|
lines.append("| " + " | ".join(headers) + " |")
|
65
62
|
|
66
|
-
# Separator row with alignment hints
|
67
63
|
lines.append(_generate_separator_row(df))
|
68
64
|
|
69
|
-
# Analyze float columns to determine formatting strategy
|
70
65
|
float_col_formatting = _analyze_float_columns(df)
|
71
66
|
|
72
|
-
|
73
|
-
for _, row in df.iterrows():
|
67
|
+
for row in df.iter_rows(named=True):
|
74
68
|
formatted_row = _format_table_row(row, df, float_col_formatting)
|
75
69
|
lines.append("| " + " | ".join(formatted_row) + " |")
|
76
70
|
|
@@ -81,11 +75,11 @@ def _generate_separator_row(df: Any) -> str:
|
|
81
75
|
"""Generate separator row with proper alignment hints."""
|
82
76
|
separators = []
|
83
77
|
for col in df.columns:
|
84
|
-
|
85
|
-
if
|
86
|
-
separators.append("---:")
|
78
|
+
dtype_str = str(df[col].dtype)
|
79
|
+
if dtype_str in ["Int64", "Float64", "Int32", "Float32"] or _is_numeric_column(df[col]):
|
80
|
+
separators.append("---:")
|
87
81
|
else:
|
88
|
-
separators.append("---")
|
82
|
+
separators.append("---")
|
89
83
|
return "| " + " | ".join(separators) + " |"
|
90
84
|
|
91
85
|
|
@@ -93,12 +87,16 @@ def _analyze_float_columns(df: Any) -> dict[str, str]:
|
|
93
87
|
"""Analyze float columns to determine formatting strategy."""
|
94
88
|
float_col_formatting = {}
|
95
89
|
for col in df.columns:
|
96
|
-
|
97
|
-
|
90
|
+
dtype_str = str(df[col].dtype)
|
91
|
+
if dtype_str in ["Float64", "Float32"]:
|
92
|
+
non_null_values = df[col].drop_nulls()
|
98
93
|
if len(non_null_values) > 0:
|
99
|
-
|
100
|
-
|
101
|
-
|
94
|
+
try:
|
95
|
+
values_list = non_null_values.to_list()
|
96
|
+
all_integers = all(float(val).is_integer() for val in values_list if val is not None)
|
97
|
+
float_col_formatting[col] = "int" if all_integers else "float"
|
98
|
+
except (ValueError, AttributeError):
|
99
|
+
float_col_formatting[col] = "float"
|
102
100
|
else:
|
103
101
|
float_col_formatting[col] = "int"
|
104
102
|
return float_col_formatting
|
@@ -108,49 +106,47 @@ def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -
|
|
108
106
|
"""Format a single table row with proper value formatting."""
|
109
107
|
formatted_row = []
|
110
108
|
for col_name, value in row.items():
|
111
|
-
if value is None
|
109
|
+
if value is None:
|
112
110
|
formatted_row.append("")
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
elif isinstance(value, float):
|
117
|
-
# For float columns, use the determined formatting strategy
|
118
|
-
if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
|
111
|
+
else:
|
112
|
+
dtype_str = str(df[col_name].dtype)
|
113
|
+
if dtype_str in ["Int64", "Int32"]:
|
119
114
|
formatted_row.append(str(int(value)))
|
115
|
+
elif isinstance(value, float):
|
116
|
+
if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
|
117
|
+
formatted_row.append(str(int(value)))
|
118
|
+
else:
|
119
|
+
formatted_row.append(f"{value:.2f}")
|
120
120
|
else:
|
121
|
-
|
122
|
-
|
123
|
-
# Clean up text values
|
124
|
-
clean_value = str(value).strip().replace("|", "\\|") # Escape pipes
|
125
|
-
formatted_row.append(clean_value)
|
121
|
+
clean_value = str(value).strip().replace("|", "\\|")
|
122
|
+
formatted_row.append(clean_value)
|
126
123
|
return formatted_row
|
127
124
|
|
128
125
|
|
129
126
|
def _is_numeric_column(series: Any) -> bool:
|
130
|
-
"""Check if a
|
127
|
+
"""Check if a polars Series contains mostly numeric values."""
|
131
128
|
if len(series) == 0:
|
132
129
|
return False
|
133
130
|
|
134
131
|
try:
|
135
|
-
|
136
|
-
if
|
132
|
+
dtype_str = str(series.dtype)
|
133
|
+
if dtype_str in {"Int64", "Float64", "Int32", "Float32"}:
|
137
134
|
return True
|
138
135
|
|
139
|
-
# Sample-based approach for large series (>1000 rows)
|
140
136
|
sample_size = min(100, len(series))
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
137
|
+
series_no_nulls = series.drop_nulls()
|
138
|
+
|
139
|
+
if len(series_no_nulls) == 0:
|
140
|
+
return False
|
141
|
+
|
142
|
+
sample_series = series_no_nulls.slice(0, sample_size) if len(series_no_nulls) > 1000 else series_no_nulls
|
145
143
|
|
146
144
|
if len(sample_series) == 0:
|
147
145
|
return False
|
148
146
|
|
149
|
-
# Optimized numeric conversion - avoid exception overhead
|
150
147
|
numeric_count = 0
|
151
|
-
for val in sample_series:
|
148
|
+
for val in sample_series.to_list():
|
152
149
|
val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
|
153
|
-
# Quick check: if it contains only digits, decimal point, minus, plus, or e
|
154
150
|
if val_str and all(c in "0123456789.-+eE" for c in val_str):
|
155
151
|
try:
|
156
152
|
float(val_str)
|
@@ -158,7 +154,6 @@ def _is_numeric_column(series: Any) -> bool:
|
|
158
154
|
except (ValueError, TypeError):
|
159
155
|
pass
|
160
156
|
|
161
|
-
# Consider numeric if >70% of sampled values are numeric
|
162
157
|
return (numeric_count / len(sample_series)) > 0.7
|
163
158
|
|
164
159
|
except (ValueError, TypeError, ZeroDivisionError):
|
@@ -190,8 +185,8 @@ def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
|
|
190
185
|
for table in tables:
|
191
186
|
if "df" in table and table["df"] is not None:
|
192
187
|
df = table["df"]
|
193
|
-
total_rows +=
|
194
|
-
total_columns +=
|
188
|
+
total_rows += df.height
|
189
|
+
total_columns += df.width
|
195
190
|
|
196
191
|
if "page_number" in table:
|
197
192
|
page_num = table["page_number"]
|
@@ -236,25 +231,23 @@ def extract_table_structure_info(table: TableData) -> dict[str, Any]:
|
|
236
231
|
|
237
232
|
df = table["df"]
|
238
233
|
|
239
|
-
if df.
|
234
|
+
if df.is_empty():
|
240
235
|
return info
|
241
236
|
|
242
|
-
info["row_count"] =
|
243
|
-
info["column_count"] =
|
244
|
-
info["has_headers"] =
|
237
|
+
info["row_count"] = df.height
|
238
|
+
info["column_count"] = df.width
|
239
|
+
info["has_headers"] = df.width > 0
|
245
240
|
|
246
|
-
# Analyze column types
|
247
241
|
for col in df.columns:
|
248
242
|
if _is_numeric_column(df[col]):
|
249
243
|
info["numeric_columns"] += 1
|
250
244
|
else:
|
251
245
|
info["text_columns"] += 1
|
252
246
|
|
253
|
-
|
254
|
-
total_cells = len(df) * len(df.columns)
|
247
|
+
total_cells = df.height * df.width
|
255
248
|
if total_cells > 0:
|
256
|
-
empty_cells = df.
|
257
|
-
info["empty_cells"] =
|
249
|
+
empty_cells = df.null_count().sum().item()
|
250
|
+
info["empty_cells"] = empty_cells
|
258
251
|
info["data_density"] = (total_cells - empty_cells) / total_cells
|
259
252
|
|
260
253
|
return info
|
kreuzberg/cli.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
"""Command-line interface for kreuzberg."""
|
2
|
-
|
3
1
|
from __future__ import annotations
|
4
2
|
|
5
3
|
import json
|
@@ -84,11 +82,11 @@ def format_extraction_result(result: ExtractionResult, show_metadata: bool, outp
|
|
84
82
|
return "\n".join(output_parts)
|
85
83
|
|
86
84
|
|
87
|
-
def _load_config(
|
85
|
+
def _load_config(config_path: Path | None, verbose: bool) -> dict[str, Any]:
|
88
86
|
"""Load configuration from file or find default."""
|
89
87
|
file_config = {}
|
90
|
-
if
|
91
|
-
file_config = load_config_from_file(
|
88
|
+
if config_path:
|
89
|
+
file_config = load_config_from_file(config_path)
|
92
90
|
else:
|
93
91
|
default_config = find_config_file()
|
94
92
|
if default_config:
|
@@ -101,39 +99,38 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
|
|
101
99
|
return file_config
|
102
100
|
|
103
101
|
|
104
|
-
def _build_cli_args(
|
105
|
-
force_ocr: bool,
|
106
|
-
chunk_content: bool,
|
107
|
-
extract_tables: bool,
|
108
|
-
max_chars: int,
|
109
|
-
max_overlap: int,
|
110
|
-
ocr_backend: str | None,
|
111
|
-
tesseract_lang: str | None,
|
112
|
-
tesseract_psm: int | None,
|
113
|
-
easyocr_languages: str | None,
|
114
|
-
paddleocr_languages: str | None,
|
115
|
-
) -> dict[str, Any]:
|
102
|
+
def _build_cli_args(params: dict[str, Any]) -> dict[str, Any]:
|
116
103
|
"""Build CLI arguments dictionary."""
|
117
104
|
cli_args: dict[str, Any] = {
|
118
|
-
"force_ocr": force_ocr if force_ocr else None,
|
119
|
-
"chunk_content": chunk_content if chunk_content else None,
|
120
|
-
"extract_tables": extract_tables if extract_tables else None,
|
121
|
-
"max_chars": max_chars if max_chars != DEFAULT_MAX_CHARACTERS else None,
|
122
|
-
"max_overlap": max_overlap if max_overlap != DEFAULT_MAX_OVERLAP else None,
|
123
|
-
"ocr_backend": ocr_backend,
|
105
|
+
"force_ocr": params["force_ocr"] if params["force_ocr"] else None,
|
106
|
+
"chunk_content": params["chunk_content"] if params["chunk_content"] else None,
|
107
|
+
"extract_tables": params["extract_tables"] if params["extract_tables"] else None,
|
108
|
+
"max_chars": params["max_chars"] if params["max_chars"] != DEFAULT_MAX_CHARACTERS else None,
|
109
|
+
"max_overlap": params["max_overlap"] if params["max_overlap"] != DEFAULT_MAX_OVERLAP else None,
|
110
|
+
"ocr_backend": params["ocr_backend"],
|
124
111
|
}
|
125
112
|
|
126
|
-
|
113
|
+
ocr_backend = params["ocr_backend"]
|
114
|
+
if ocr_backend == "tesseract" and (
|
115
|
+
params["tesseract_lang"]
|
116
|
+
or params["tesseract_psm"] is not None
|
117
|
+
or params["tesseract_output_format"]
|
118
|
+
or params["enable_table_detection"]
|
119
|
+
):
|
127
120
|
tesseract_config = {}
|
128
|
-
if tesseract_lang:
|
129
|
-
tesseract_config["language"] = tesseract_lang
|
130
|
-
if tesseract_psm is not None:
|
131
|
-
tesseract_config["psm"] = tesseract_psm
|
121
|
+
if params["tesseract_lang"]:
|
122
|
+
tesseract_config["language"] = params["tesseract_lang"]
|
123
|
+
if params["tesseract_psm"] is not None:
|
124
|
+
tesseract_config["psm"] = params["tesseract_psm"]
|
125
|
+
if params["tesseract_output_format"]:
|
126
|
+
tesseract_config["output_format"] = params["tesseract_output_format"]
|
127
|
+
if params["enable_table_detection"]:
|
128
|
+
tesseract_config["enable_table_detection"] = True
|
132
129
|
cli_args["tesseract_config"] = tesseract_config
|
133
|
-
elif ocr_backend == "easyocr" and easyocr_languages:
|
134
|
-
cli_args["easyocr_config"] = {"languages": easyocr_languages.split(",")}
|
135
|
-
elif ocr_backend == "paddleocr" and paddleocr_languages:
|
136
|
-
cli_args["paddleocr_config"] = {"languages": paddleocr_languages.split(",")}
|
130
|
+
elif ocr_backend == "easyocr" and params["easyocr_languages"]:
|
131
|
+
cli_args["easyocr_config"] = {"languages": params["easyocr_languages"].split(",")}
|
132
|
+
elif ocr_backend == "paddleocr" and params["paddleocr_languages"]:
|
133
|
+
cli_args["paddleocr_config"] = {"languages": params["paddleocr_languages"].split(",")}
|
137
134
|
|
138
135
|
return cli_args
|
139
136
|
|
@@ -158,7 +155,7 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
|
|
158
155
|
progress.add_task("Extracting text...", total=None)
|
159
156
|
|
160
157
|
try:
|
161
|
-
import magic # type: ignore[import-not-found]
|
158
|
+
import magic # type: ignore[import-not-found] # noqa: PLC0415
|
162
159
|
|
163
160
|
mime_type = magic.from_buffer(input_bytes, mime=True)
|
164
161
|
except ImportError: # pragma: no cover
|
@@ -188,7 +185,10 @@ def _write_output(
|
|
188
185
|
if verbose:
|
189
186
|
console.print(f"[green]✓[/green] Output written to: {output}")
|
190
187
|
else:
|
191
|
-
|
188
|
+
try:
|
189
|
+
click.echo(formatted_output)
|
190
|
+
except UnicodeEncodeError:
|
191
|
+
sys.stdout.buffer.write(formatted_output.encode("utf-8"))
|
192
192
|
|
193
193
|
|
194
194
|
def handle_error(error: Exception, verbose: bool) -> None: # pragma: no cover
|
@@ -248,71 +248,51 @@ def cli(ctx: click.Context) -> None:
|
|
248
248
|
@click.option(
|
249
249
|
"--ocr-backend", type=OcrBackendParamType(), help="OCR backend to use (tesseract, easyocr, paddleocr, none)"
|
250
250
|
)
|
251
|
-
@click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
|
251
|
+
@click.option("--config", "config_file", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
|
252
252
|
@click.option("--show-metadata", is_flag=True, help="Include metadata in output")
|
253
253
|
@click.option("--output-format", type=click.Choice(["text", "json"]), default="text", help="Output format")
|
254
254
|
@click.option("-v", "--verbose", is_flag=True, help="Verbose output for debugging")
|
255
255
|
@click.option("--tesseract-lang", help="Tesseract language(s) (e.g., 'eng+deu')")
|
256
256
|
@click.option("--tesseract-psm", type=int, help="Tesseract PSM mode (0-13)")
|
257
|
+
@click.option(
|
258
|
+
"--tesseract-output-format",
|
259
|
+
type=click.Choice(["text", "markdown", "tsv", "hocr"]),
|
260
|
+
help="Tesseract OCR output format (default: markdown)",
|
261
|
+
)
|
262
|
+
@click.option(
|
263
|
+
"--enable-table-detection", is_flag=True, help="Enable table extraction from scanned documents (with TSV format)"
|
264
|
+
)
|
257
265
|
@click.option("--easyocr-languages", help="EasyOCR language codes (comma-separated, e.g., 'en,de')")
|
258
266
|
@click.option("--paddleocr-languages", help="PaddleOCR language codes (comma-separated, e.g., 'en,german')")
|
259
267
|
@click.pass_context
|
260
|
-
def extract(
|
261
|
-
_: click.Context,
|
262
|
-
file: Path | None,
|
263
|
-
output: Path | None,
|
264
|
-
force_ocr: bool,
|
265
|
-
chunk_content: bool,
|
266
|
-
extract_tables: bool,
|
267
|
-
max_chars: int,
|
268
|
-
max_overlap: int,
|
269
|
-
ocr_backend: str | None,
|
270
|
-
config: Path | None,
|
271
|
-
show_metadata: bool,
|
272
|
-
output_format: str,
|
273
|
-
verbose: bool,
|
274
|
-
tesseract_lang: str | None,
|
275
|
-
tesseract_psm: int | None,
|
276
|
-
easyocr_languages: str | None,
|
277
|
-
paddleocr_languages: str | None,
|
278
|
-
) -> None:
|
268
|
+
def extract(ctx: click.Context) -> None:
|
279
269
|
"""Extract text from a document.
|
280
270
|
|
281
271
|
FILE can be a path to a document or '-' to read from stdin.
|
282
272
|
If FILE is omitted, reads from stdin.
|
283
273
|
"""
|
274
|
+
params = ctx.params
|
284
275
|
try:
|
285
|
-
file_config = _load_config(
|
286
|
-
|
287
|
-
cli_args = _build_cli_args(
|
288
|
-
force_ocr,
|
289
|
-
chunk_content,
|
290
|
-
extract_tables,
|
291
|
-
max_chars,
|
292
|
-
max_overlap,
|
293
|
-
ocr_backend,
|
294
|
-
tesseract_lang,
|
295
|
-
tesseract_psm,
|
296
|
-
easyocr_languages,
|
297
|
-
paddleocr_languages,
|
298
|
-
)
|
276
|
+
file_config = _load_config(params["config_file"], params["verbose"])
|
277
|
+
|
278
|
+
cli_args = _build_cli_args(params)
|
299
279
|
|
300
280
|
extraction_config = build_extraction_config(file_config, cli_args)
|
301
281
|
|
302
|
-
result = _perform_extraction(file, extraction_config, verbose)
|
282
|
+
result = _perform_extraction(params["file"], extraction_config, params["verbose"])
|
303
283
|
|
304
|
-
_write_output(result, output, show_metadata, output_format, verbose)
|
284
|
+
_write_output(result, params["output"], params["show_metadata"], params["output_format"], params["verbose"])
|
305
285
|
|
306
286
|
except Exception as e: # noqa: BLE001
|
307
|
-
handle_error(e, verbose)
|
287
|
+
handle_error(e, params["verbose"])
|
308
288
|
|
309
289
|
|
310
290
|
@cli.command()
|
311
|
-
@click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
|
312
|
-
def config(
|
291
|
+
@click.option("--config", "config_file", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
|
292
|
+
def config(config_file: Path | None) -> None:
|
313
293
|
"""Show current configuration."""
|
314
294
|
try:
|
315
|
-
config_path =
|
295
|
+
config_path = config_file or find_config_file()
|
316
296
|
|
317
297
|
if config_path:
|
318
298
|
file_config = load_config_from_file(config_path)
|