kreuzberg 3.6.2__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. kreuzberg/_extractors/_base.py +40 -0
  2. kreuzberg/_extractors/_email.py +149 -0
  3. kreuzberg/_extractors/_html.py +15 -3
  4. kreuzberg/_extractors/_image.py +17 -18
  5. kreuzberg/_extractors/_pdf.py +68 -14
  6. kreuzberg/_extractors/_presentation.py +62 -10
  7. kreuzberg/_extractors/_spread_sheet.py +179 -4
  8. kreuzberg/_extractors/_structured.py +148 -0
  9. kreuzberg/_gmft.py +2 -2
  10. kreuzberg/_mcp/__init__.py +5 -0
  11. kreuzberg/_mcp/server.py +227 -0
  12. kreuzberg/_mime_types.py +27 -1
  13. kreuzberg/_multiprocessing/__init__.py +2 -3
  14. kreuzberg/_ocr/__init__.py +30 -0
  15. kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
  16. kreuzberg/_ocr/_sync.py +566 -0
  17. kreuzberg/_ocr/_tesseract.py +6 -2
  18. kreuzberg/_registry.py +4 -0
  19. kreuzberg/_types.py +131 -0
  20. kreuzberg/_utils/_cache.py +17 -2
  21. kreuzberg/_utils/_process_pool.py +178 -1
  22. kreuzberg/_utils/_quality.py +237 -0
  23. kreuzberg/_utils/_serialization.py +4 -2
  24. kreuzberg/_utils/_string.py +153 -10
  25. kreuzberg/_utils/_sync.py +5 -2
  26. kreuzberg/_utils/_table.py +261 -0
  27. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +116 -48
  28. kreuzberg-3.8.0.dist-info/RECORD +57 -0
  29. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +1 -0
  30. kreuzberg/_multiprocessing/process_manager.py +0 -189
  31. kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  32. kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  33. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  34. kreuzberg-3.6.2.dist-info/RECORD +0 -54
  35. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
  36. {kreuzberg-3.6.2.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_string.py CHANGED
@@ -1,39 +1,182 @@
  from __future__ import annotations
 
+ import hashlib
+ import re
  from contextlib import suppress
+ from functools import lru_cache
 
- from charset_normalizer import detect
+ import chardetng_py
+
+ # Compile regex patterns once at module level for performance
+ _WHITESPACE_PATTERN = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
+ _NEWLINES_PATTERN = re.compile(r"\n+")
+ _MOJIBAKE_PATTERNS = {
+     # Hebrew as Cyrillic patterns
+     "hebrew_as_cyrillic": re.compile(r"[\u0400-\u04FF]{3,}"),
+     # Control characters that shouldn't appear in text
+     "control_chars": re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]"),
+     # Unicode replacement characters
+     "replacement_chars": re.compile(r"\uFFFD+"),
+     # Isolated combining marks (likely encoding issues)
+     "isolated_combining": re.compile(r"[\u0300-\u036F](?![^\u0300-\u036F])"),
+ }
+
+ # Simple cache for encoding detection (in-memory, session-scoped)
+ _encoding_cache: dict[str, str] = {}
+
+
+ @lru_cache(maxsize=128)
+ def _get_encoding_cache_key(data_hash: str, size: int) -> str:
+     """Generate cache key for encoding detection."""
+     return f"{data_hash}:{size}"
 
 
  def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
-     """Decode a byte string safely, removing invalid sequences.
+     """Decode a byte string safely with mojibake detection and correction.
 
      Args:
          byte_data: The byte string to decode.
          encoding: The encoding to use when decoding the byte string.
 
      Returns:
-         The decoded string.
+         The decoded string with mojibake detection and correction.
      """
      if not byte_data:
          return ""
 
-     encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
+     # Try provided encoding first (fastest path)
+     if encoding:
+         with suppress(UnicodeDecodeError, LookupError):
+             decoded = byte_data.decode(encoding)
+             return _fix_mojibake(decoded)
 
-     for enc in [e for e in encodings if e]:
+     # Check cache for similar content (performance optimization)
+     data_hash = hashlib.sha256(byte_data[:1024]).hexdigest()[:16]  # Hash the first 1KB
+     cache_key = _get_encoding_cache_key(data_hash, len(byte_data))
+
+     if cache_key in _encoding_cache:
+         cached_encoding = _encoding_cache[cache_key]
+         with suppress(UnicodeDecodeError, LookupError):
+             decoded = byte_data.decode(cached_encoding)
+             return _fix_mojibake(decoded)
+
+     # Use chardetng for better performance than charset-normalizer
+     detected_encoding = chardetng_py.detect(byte_data)
+     if detected_encoding:
          with suppress(UnicodeDecodeError, LookupError):
-             return byte_data.decode(enc)
+             decoded = byte_data.decode(detected_encoding)
+             # Cache successful encoding detection
+             if len(_encoding_cache) < 1000:  # Prevent unlimited growth
+                 _encoding_cache[cache_key] = detected_encoding
+             return _fix_mojibake(decoded)
+
+     # Try multiple encodings with confidence scoring
+     encodings_to_try = [
+         "utf-8",
+         "windows-1255",  # Hebrew
+         "iso-8859-8",  # Hebrew
+         "windows-1256",  # Arabic
+         "iso-8859-6",  # Arabic
+         "windows-1252",  # Western European
+         "cp1251",  # Cyrillic
+     ]
 
+     best_result = None
+     best_confidence = 0.0
+
+     for enc in encodings_to_try:
+         with suppress(UnicodeDecodeError, LookupError):
+             decoded = byte_data.decode(enc)
+             confidence = _calculate_text_confidence(decoded)
+             if confidence > best_confidence:
+                 best_confidence = confidence
+                 best_result = decoded
+
+     if best_result and best_confidence > 0.5:
+         return _fix_mojibake(best_result)
+
+     # Final fallback
      return byte_data.decode("latin-1", errors="replace")
 
 
+ def _calculate_text_confidence(text: str) -> float:
+     """Calculate confidence score for decoded text quality."""
+     if not text:
+         return 0.0
+
+     # Check for common encoding problems
+     replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
+     control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))
+     total_chars = len(text)
+
+     if total_chars == 0:
+         return 0.0
+
+     # Penalize replacement and control characters
+     penalty = (replacement_count + control_count * 2) / total_chars
+
+     # Bonus for readable character ranges
+     readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
+     readability_score = readable_chars / total_chars
+
+     # Check for suspicious Cyrillic that might be misencoded Hebrew
+     cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
+     if cyrillic_matches and len("".join(cyrillic_matches)) > total_chars * 0.1:
+         penalty += 0.3  # Heavy penalty for likely mojibake
+
+     return max(0.0, min(1.0, readability_score - penalty))
+
+
+ def _fix_mojibake(text: str) -> str:
+     """Attempt to fix common mojibake patterns."""
+     if not text:
+         return text
+
+     # Remove control characters
+     text = _MOJIBAKE_PATTERNS["control_chars"].sub("", text)
+
+     # Remove replacement characters
+     text = _MOJIBAKE_PATTERNS["replacement_chars"].sub("", text)
+
+     # Remove isolated combining marks
+     text = _MOJIBAKE_PATTERNS["isolated_combining"].sub("", text)
+
+     # Hebrew encoded as Cyrillic (a common windows-1255 -> cp1251 confusion) is
+     # detected but left unchanged: a real fix would need an actual character
+     # mapping, so the text is kept as-is for manual review.
+     if _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].search(text):
+         pass
+
+     return text
+
+
  def normalize_spaces(text: str) -> str:
-     """Normalize the spaces in a string.
+     """Normalize spaces while preserving line breaks and paragraph structure.
 
      Args:
-         text: The text to sanitize.
+         text: The text to normalize.
 
      Returns:
-         The sanitized text.
+         The normalized text with proper spacing.
      """
-     return " ".join(text.strip().split())
+     if not text or not text.strip():
+         return ""
+
+     # Split on double newlines to preserve paragraph breaks
+     paragraphs = text.split("\n\n")
+     normalized_paragraphs = []
+
+     for paragraph in paragraphs:
+         # Use the pre-compiled patterns: collapse runs of whitespace (except
+         # newlines) to a single space, then collapse repeated newlines.
+         cleaned = _WHITESPACE_PATTERN.sub(" ", paragraph)
+         cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)
+
+         # Strip and drop empty lines
+         lines = [line.strip() for line in cleaned.split("\n") if line.strip()]
+
+         if lines:
+             normalized_paragraphs.append("\n".join(lines))
+
+     return "\n\n".join(normalized_paragraphs)
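
Taken together, the new safe_decode is a staged pipeline: caller-supplied encoding, then the session cache, then chardetng detection, then a confidence-scored candidate list, and finally a latin-1 fallback, with _fix_mojibake applied to every successful decode. A minimal usage sketch of the two public helpers (the sample strings are illustrative, not taken from the package's tests):

    from kreuzberg._utils._string import normalize_spaces, safe_decode

    # An explicit encoding takes the fastest path: decode, then mojibake cleanup.
    raw = "שלום עולם".encode("windows-1255")  # illustrative Hebrew sample
    print(safe_decode(raw, encoding="windows-1255"))

    # Without a hint, the cache / chardetng / candidate-list stages run instead.
    print(safe_decode(raw))

    # normalize_spaces now keeps paragraph structure instead of flattening it.
    messy = "first  line\u00a0here\n\n\n\nsecond   paragraph"
    assert normalize_spaces(messy) == "first line here\n\nsecond paragraph"
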
kreuzberg/_utils/_sync.py CHANGED
@@ -28,8 +28,11 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
      Returns:
          The result of the synchronous function.
      """
-     handler = partial(sync_fn, **kwargs)
-     return cast("T", await any_io_run_sync(handler, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
+     # Optimize: only create partial if we have kwargs
+     if kwargs:
+         handler = partial(sync_fn, **kwargs)
+         return cast("T", await any_io_run_sync(handler, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
+     return cast("T", await any_io_run_sync(sync_fn, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
 
 
  async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
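
The run_sync change is a small hot-path optimization: the anyio-backed call (aliased any_io_run_sync) forwards positional arguments natively, so the functools.partial wrapper is now only allocated when keyword arguments actually need binding. A short sketch of both call shapes (parse_bytes is a hypothetical worker function, not part of kreuzberg):

    import anyio

    from kreuzberg._utils._sync import run_sync

    def parse_bytes(data: bytes, strict: bool = False) -> str:  # hypothetical worker
        return data.decode("utf-8", errors="strict" if strict else "replace")

    async def main() -> None:
        # Positional-only call: sync_fn goes straight to the anyio backend,
        # with no partial allocated.
        print(await run_sync(parse_bytes, b"hello"))
        # Keyword arguments still work; a partial is built just for this case.
        print(await run_sync(parse_bytes, b"hello", strict=True))

    anyio.run(main)
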
kreuzberg/_utils/_table.py ADDED
@@ -0,0 +1,261 @@
+ """Table processing and export utilities."""
+
+ from __future__ import annotations
+
+ import csv
+ from io import StringIO
+ from typing import TYPE_CHECKING, Any
+
+ if TYPE_CHECKING:
+     from kreuzberg._types import TableData
+
+
+ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
+     r"""Export a TableData object to CSV/TSV format.
+
+     Args:
+         table: TableData object containing DataFrame
+         separator: Field separator ("," for CSV, "\t" for TSV)
+
+     Returns:
+         String representation in CSV/TSV format
+     """
+     if "df" not in table or table["df"] is None:
+         return ""
+
+     output = StringIO()
+     table["df"].to_csv(output, sep=separator, index=False, quoting=csv.QUOTE_MINIMAL)
+     return output.getvalue().strip()
+
+
+ def export_table_to_tsv(table: TableData) -> str:
+     """Export a TableData object to TSV format.
+
+     Args:
+         table: TableData object containing DataFrame
+
+     Returns:
+         String representation in TSV format
+     """
+     return export_table_to_csv(table, separator="\t")
+
+
+ def enhance_table_markdown(table: TableData) -> str:
+     """Generate enhanced markdown table with better formatting.
+
+     Args:
+         table: TableData object
+
+     Returns:
+         Enhanced markdown table string
+     """
+     if "df" not in table or table["df"] is None:
+         return table.get("text", "")
+
+     df = table["df"]
+
+     if df.empty:
+         return table.get("text", "")
+
+     # Create enhanced markdown with proper alignment
+     lines = []
+
+     # Header row
+     headers = [str(col).strip() for col in df.columns]
+     lines.append("| " + " | ".join(headers) + " |")
+
+     # Separator row with alignment hints
+     lines.append(_generate_separator_row(df))
+
+     # Analyze float columns to determine formatting strategy
+     float_col_formatting = _analyze_float_columns(df)
+
+     # Data rows with proper formatting
+     for _, row in df.iterrows():
+         formatted_row = _format_table_row(row, df, float_col_formatting)
+         lines.append("| " + " | ".join(formatted_row) + " |")
+
+     return "\n".join(lines)
+
+
+ def _generate_separator_row(df: Any) -> str:
+     """Generate separator row with proper alignment hints."""
+     separators = []
+     for col in df.columns:
+         # Check if column contains mostly numbers for right alignment
+         if df[col].dtype in ["int64", "float64"] or _is_numeric_column(df[col]):
+             separators.append("---:")  # Right align numbers
+         else:
+             separators.append("---")  # Left align text
+     return "| " + " | ".join(separators) + " |"
+
+
+ def _analyze_float_columns(df: Any) -> dict[str, str]:
+     """Analyze float columns to determine formatting strategy."""
+     float_col_formatting = {}
+     for col in df.columns:
+         if str(df[col].dtype) == "float64":
+             non_null_values = df[col].dropna()
+             if len(non_null_values) > 0:
+                 # If all non-null values are whole numbers, format as integers
+                 all_integers = all(val.is_integer() for val in non_null_values)
+                 float_col_formatting[col] = "int" if all_integers else "float"
+             else:
+                 float_col_formatting[col] = "int"
+     return float_col_formatting
+
+
+ def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -> list[str]:
+     """Format a single table row with proper value formatting."""
+     formatted_row = []
+     for col_name, value in row.items():
+         if value is None or (isinstance(value, float) and str(value) == "nan"):
+             formatted_row.append("")
+         elif str(df[col_name].dtype) in ["int64", "int32"]:
+             # For integer columns, format as integers
+             formatted_row.append(str(int(value)))
+         elif isinstance(value, float):
+             # For float columns, use the determined formatting strategy
+             if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
+                 formatted_row.append(str(int(value)))
+             else:
+                 formatted_row.append(f"{value:.2f}")
+         else:
+             # Clean up text values
+             clean_value = str(value).strip().replace("|", "\\|")  # Escape pipes
+             formatted_row.append(clean_value)
+     return formatted_row
+
+
+ def _is_numeric_column(series: Any) -> bool:
+     """Check if a pandas Series contains mostly numeric values."""
+     if len(series) == 0:
+         return False
+
+     try:
+         # Check if already numeric dtype first (fastest path)
+         if str(series.dtype) in {"int64", "float64", "int32", "float32"}:
+             return True
+
+         # Sample-based approach for large series (>1000 rows)
+         sample_size = min(100, len(series))
+         if len(series) > 1000:
+             sample_series = series.dropna().sample(n=sample_size, random_state=42)
+         else:
+             sample_series = series.dropna()
+
+         if len(sample_series) == 0:
+             return False
+
+         # Optimized numeric conversion - avoid exception overhead
+         numeric_count = 0
+         for val in sample_series:
+             val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
+             # Quick check: only digits, decimal point, sign, or exponent characters
+             if val_str and all(c in "0123456789.-+eE" for c in val_str):
+                 try:
+                     float(val_str)
+                     numeric_count += 1
+                 except (ValueError, TypeError):
+                     pass
+
+         # Consider numeric if >70% of sampled values are numeric
+         return (numeric_count / len(sample_series)) > 0.7
+
+     except (ValueError, TypeError, ZeroDivisionError):
+         return False
+
+
+ def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
+     """Generate summary statistics for extracted tables.
+
+     Args:
+         tables: List of TableData objects
+
+     Returns:
+         Dictionary with table statistics
+     """
+     if not tables:
+         return {
+             "table_count": 0,
+             "total_rows": 0,
+             "total_columns": 0,
+             "pages_with_tables": 0,
+         }
+
+     total_rows = 0
+     total_columns = 0
+     pages_with_tables = set()
+     tables_by_page = {}
+
+     for table in tables:
+         if "df" in table and table["df"] is not None:
+             df = table["df"]
+             total_rows += len(df)
+             total_columns += len(df.columns)
+
+         if "page_number" in table:
+             page_num = table["page_number"]
+             pages_with_tables.add(page_num)
+
+             if page_num not in tables_by_page:
+                 tables_by_page[page_num] = 0
+             tables_by_page[page_num] += 1
+
+     return {
+         "table_count": len(tables),
+         "total_rows": total_rows,
+         "total_columns": total_columns,
+         "pages_with_tables": len(pages_with_tables),
+         "avg_rows_per_table": total_rows / len(tables) if tables else 0,
+         "avg_columns_per_table": total_columns / len(tables) if tables else 0,
+         "tables_by_page": dict(tables_by_page),
+     }
+
+
+ def extract_table_structure_info(table: TableData) -> dict[str, Any]:
+     """Extract structural information from a table.
+
+     Args:
+         table: TableData object
+
+     Returns:
+         Dictionary with structural information
+     """
+     info = {
+         "has_headers": False,
+         "row_count": 0,
+         "column_count": 0,
+         "numeric_columns": 0,
+         "text_columns": 0,
+         "empty_cells": 0,
+         "data_density": 0.0,
+     }
+
+     if "df" not in table or table["df"] is None:
+         return info
+
+     df = table["df"]
+
+     if df.empty:
+         return info
+
+     info["row_count"] = len(df)
+     info["column_count"] = len(df.columns)
+     info["has_headers"] = len(df.columns) > 0
+
+     # Analyze column types
+     for col in df.columns:
+         if _is_numeric_column(df[col]):
+             info["numeric_columns"] += 1
+         else:
+             info["text_columns"] += 1
+
+     # Calculate data density
+     total_cells = len(df) * len(df.columns)
+     if total_cells > 0:
+         empty_cells = df.isnull().sum().sum()
+         info["empty_cells"] = int(empty_cells)
+         info["data_density"] = (total_cells - empty_cells) / total_cells
+
+     return info
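
The new module rounds out table handling: export_table_to_csv and export_table_to_tsv delegate to pandas to_csv, enhance_table_markdown right-aligns numeric columns and drops the trailing .0 from whole-number float columns, and generate_table_summary aggregates counts per page. A usage sketch, assuming only the dict-style TableData keys the functions above index into ("df", "text", "page_number"):

    import pandas as pd

    from kreuzberg._utils._table import (
        enhance_table_markdown,
        export_table_to_tsv,
        generate_table_summary,
    )

    # A plain dict with TableData's keys is enough for this sketch.
    table = {
        "df": pd.DataFrame({"Item": ["Widget", "Gadget"], "Price": [9.5, 12.0]}),
        "text": "Item Price Widget 9.5 Gadget 12.0",
        "page_number": 1,
    }

    print(enhance_table_markdown(table))
    # | Item | Price |
    # | --- | ---: |
    # | Widget | 9.50 |
    # | Gadget | 12.00 |

    print(export_table_to_tsv(table))       # pandas-backed TSV export
    print(generate_table_summary([table]))  # table_count, totals, pages_with_tables
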