kreuzberg-3.3.0-py3-none-any.whl → kreuzberg-3.8.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. kreuzberg/__init__.py +9 -2
  2. kreuzberg/_api/__init__.py +0 -0
  3. kreuzberg/_api/main.py +87 -0
  4. kreuzberg/_entity_extraction.py +238 -0
  5. kreuzberg/_extractors/_base.py +39 -1
  6. kreuzberg/_extractors/_email.py +149 -0
  7. kreuzberg/_extractors/_html.py +15 -3
  8. kreuzberg/_extractors/_image.py +27 -22
  9. kreuzberg/_extractors/_pandoc.py +3 -14
  10. kreuzberg/_extractors/_pdf.py +97 -34
  11. kreuzberg/_extractors/_presentation.py +62 -10
  12. kreuzberg/_extractors/_spread_sheet.py +181 -6
  13. kreuzberg/_extractors/_structured.py +148 -0
  14. kreuzberg/_gmft.py +318 -11
  15. kreuzberg/_language_detection.py +95 -0
  16. kreuzberg/_mcp/__init__.py +5 -0
  17. kreuzberg/_mcp/server.py +227 -0
  18. kreuzberg/_mime_types.py +27 -1
  19. kreuzberg/_ocr/__init__.py +10 -1
  20. kreuzberg/_ocr/_base.py +59 -0
  21. kreuzberg/_ocr/_easyocr.py +92 -1
  22. kreuzberg/_ocr/_paddleocr.py +89 -0
  23. kreuzberg/_ocr/_tesseract.py +569 -5
  24. kreuzberg/_registry.py +4 -0
  25. kreuzberg/_types.py +181 -4
  26. kreuzberg/_utils/_cache.py +52 -4
  27. kreuzberg/_utils/_device.py +2 -2
  28. kreuzberg/_utils/_errors.py +3 -7
  29. kreuzberg/_utils/_process_pool.py +182 -9
  30. kreuzberg/_utils/_quality.py +237 -0
  31. kreuzberg/_utils/_serialization.py +4 -2
  32. kreuzberg/_utils/_string.py +153 -10
  33. kreuzberg/_utils/_sync.py +6 -7
  34. kreuzberg/_utils/_table.py +261 -0
  35. kreuzberg/_utils/_tmp.py +2 -2
  36. kreuzberg/cli.py +1 -2
  37. kreuzberg/extraction.py +43 -34
  38. kreuzberg-3.8.1.dist-info/METADATA +301 -0
  39. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  40. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
  41. kreuzberg/_multiprocessing/__init__.py +0 -6
  42. kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
  43. kreuzberg/_multiprocessing/process_manager.py +0 -188
  44. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  45. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  46. kreuzberg-3.3.0.dist-info/METADATA +0 -235
  47. kreuzberg-3.3.0.dist-info/RECORD +0 -48
  48. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  49. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_table.py ADDED
@@ -0,0 +1,261 @@
+ """Table processing and export utilities."""
+
+ from __future__ import annotations
+
+ import csv
+ from io import StringIO
+ from typing import TYPE_CHECKING, Any
+
+ if TYPE_CHECKING:
+     from kreuzberg._types import TableData
+
+
+ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
+     r"""Export a TableData object to CSV/TSV format.
+
+     Args:
+         table: TableData object containing DataFrame
+         separator: Field separator ("," for CSV, "\t" for TSV)
+
+     Returns:
+         String representation in CSV/TSV format
+     """
+     if "df" not in table or table["df"] is None:
+         return ""
+
+     output = StringIO()
+     table["df"].to_csv(output, sep=separator, index=False, quoting=csv.QUOTE_MINIMAL)
+     return output.getvalue().strip()
+
+
+ def export_table_to_tsv(table: TableData) -> str:
+     """Export a TableData object to TSV format.
+
+     Args:
+         table: TableData object containing DataFrame
+
+     Returns:
+         String representation in TSV format
+     """
+     return export_table_to_csv(table, separator="\t")
+
+
+ def enhance_table_markdown(table: TableData) -> str:
+     """Generate enhanced markdown table with better formatting.
+
+     Args:
+         table: TableData object
+
+     Returns:
+         Enhanced markdown table string
+     """
+     if "df" not in table or table["df"] is None:
+         return table.get("text", "")
+
+     df = table["df"]
+
+     if df.empty:
+         return table.get("text", "")
+
+     # Create enhanced markdown with proper alignment
+     lines = []
+
+     # Header row
+     headers = [str(col).strip() for col in df.columns]
+     lines.append("| " + " | ".join(headers) + " |")
+
+     # Separator row with alignment hints
+     lines.append(_generate_separator_row(df))
+
+     # Analyze float columns to determine formatting strategy
+     float_col_formatting = _analyze_float_columns(df)
+
+     # Data rows with proper formatting
+     for _, row in df.iterrows():
+         formatted_row = _format_table_row(row, df, float_col_formatting)
+         lines.append("| " + " | ".join(formatted_row) + " |")
+
+     return "\n".join(lines)
+
+
+ def _generate_separator_row(df: Any) -> str:
+     """Generate separator row with proper alignment hints."""
+     separators = []
+     for col in df.columns:
+         # Check if column contains mostly numbers for right alignment
+         if df[col].dtype in ["int64", "float64"] or _is_numeric_column(df[col]):
+             separators.append("---:")  # Right align numbers
+         else:
+             separators.append("---")  # Left align text
+     return "| " + " | ".join(separators) + " |"
+
+
+ def _analyze_float_columns(df: Any) -> dict[str, str]:
+     """Analyze float columns to determine formatting strategy."""
+     float_col_formatting = {}
+     for col in df.columns:
+         if str(df[col].dtype) == "float64":
+             non_null_values = df[col].dropna()
+             if len(non_null_values) > 0:
+                 # If all non-null values are whole numbers, format as integers
+                 all_integers = all(val.is_integer() for val in non_null_values)
+                 float_col_formatting[col] = "int" if all_integers else "float"
+             else:
+                 float_col_formatting[col] = "int"
+     return float_col_formatting
+
+
+ def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -> list[str]:
+     """Format a single table row with proper value formatting."""
+     formatted_row = []
+     for col_name, value in row.items():
+         if value is None or (isinstance(value, float) and str(value) == "nan"):
+             formatted_row.append("")
+         elif str(df[col_name].dtype) in ["int64", "int32"]:
+             # For integer columns, format as integers
+             formatted_row.append(str(int(value)))
+         elif isinstance(value, float):
+             # For float columns, use the determined formatting strategy
+             if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
+                 formatted_row.append(str(int(value)))
+             else:
+                 formatted_row.append(f"{value:.2f}")
+         else:
+             # Clean up text values
+             clean_value = str(value).strip().replace("|", "\\|")  # Escape pipes
+             formatted_row.append(clean_value)
+     return formatted_row
+
+
+ def _is_numeric_column(series: Any) -> bool:
+     """Check if a pandas Series contains mostly numeric values."""
+     if len(series) == 0:
+         return False
+
+     try:
+         # Check if already numeric dtype first (fastest path)
+         if str(series.dtype) in {"int64", "float64", "int32", "float32"}:
+             return True
+
+         # Sample-based approach for large series (>1000 rows)
+         sample_size = min(100, len(series))
+         if len(series) > 1000:
+             sample_series = series.dropna().sample(n=sample_size, random_state=42)
+         else:
+             sample_series = series.dropna()
+
+         if len(sample_series) == 0:
+             return False
+
+         # Optimized numeric conversion - avoid exception overhead
+         numeric_count = 0
+         for val in sample_series:
+             val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
+             # Quick check: if it contains only digits, decimal point, minus, plus, or e
+             if val_str and all(c in "0123456789.-+eE" for c in val_str):
+                 try:
+                     float(val_str)
+                     numeric_count += 1
+                 except (ValueError, TypeError):
+                     pass
+
+         # Consider numeric if >70% of sampled values are numeric
+         return (numeric_count / len(sample_series)) > 0.7
+
+     except (ValueError, TypeError, ZeroDivisionError):
+         return False
+
+
+ def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
+     """Generate summary statistics for extracted tables.
+
+     Args:
+         tables: List of TableData objects
+
+     Returns:
+         Dictionary with table statistics
+     """
+     if not tables:
+         return {
+             "table_count": 0,
+             "total_rows": 0,
+             "total_columns": 0,
+             "pages_with_tables": 0,
+         }
+
+     total_rows = 0
+     total_columns = 0
+     pages_with_tables = set()
+     tables_by_page = {}
+
+     for table in tables:
+         if "df" in table and table["df"] is not None:
+             df = table["df"]
+             total_rows += len(df)
+             total_columns += len(df.columns)
+
+         if "page_number" in table:
+             page_num = table["page_number"]
+             pages_with_tables.add(page_num)
+
+             if page_num not in tables_by_page:
+                 tables_by_page[page_num] = 0
+             tables_by_page[page_num] += 1
+
+     return {
+         "table_count": len(tables),
+         "total_rows": total_rows,
+         "total_columns": total_columns,
+         "pages_with_tables": len(pages_with_tables),
+         "avg_rows_per_table": total_rows / len(tables) if tables else 0,
+         "avg_columns_per_table": total_columns / len(tables) if tables else 0,
+         "tables_by_page": dict(tables_by_page),
+     }
+
+
+ def extract_table_structure_info(table: TableData) -> dict[str, Any]:
+     """Extract structural information from a table.
+
+     Args:
+         table: TableData object
+
+     Returns:
+         Dictionary with structural information
+     """
+     info = {
+         "has_headers": False,
+         "row_count": 0,
+         "column_count": 0,
+         "numeric_columns": 0,
+         "text_columns": 0,
+         "empty_cells": 0,
+         "data_density": 0.0,
+     }
+
+     if "df" not in table or table["df"] is None:
+         return info
+
+     df = table["df"]
+
+     if df.empty:
+         return info
+
+     info["row_count"] = len(df)
+     info["column_count"] = len(df.columns)
+     info["has_headers"] = len(df.columns) > 0
+
+     # Analyze column types
+     for col in df.columns:
+         if _is_numeric_column(df[col]):
+             info["numeric_columns"] += 1
+         else:
+             info["text_columns"] += 1
+
+     # Calculate data density
+     total_cells = len(df) * len(df.columns)
+     if total_cells > 0:
+         empty_cells = df.isnull().sum().sum()
+         info["empty_cells"] = int(empty_cells)
+         info["data_density"] = (total_cells - empty_cells) / total_cells
+
+     return info
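
For orientation, a minimal usage sketch of the new module. It assumes pandas is installed and that TableData is a dict-like mapping with "df", "text", and "page_number" keys, which is how the helpers above read it; the sample DataFrame is illustrative, not from the package.

import pandas as pd

from kreuzberg._utils._table import (
    enhance_table_markdown,
    export_table_to_csv,
    generate_table_summary,
)

# Build a TableData-shaped mapping the way the helpers above consume it.
df = pd.DataFrame({"item": ["widget", "gadget"], "price": [9.99, 19.0]})
table = {"df": df, "text": df.to_string(), "page_number": 1}

print(export_table_to_csv(table))       # minimal-quoting CSV
print(enhance_table_markdown(table))    # markdown; numeric columns right-aligned
print(generate_table_summary([table]))  # {"table_count": 1, "total_rows": 2, ...}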
kreuzberg/_utils/_tmp.py CHANGED
@@ -3,14 +3,14 @@ from __future__ import annotations
  from contextlib import suppress
  from pathlib import Path
  from tempfile import NamedTemporaryFile
- from typing import TYPE_CHECKING, Callable
+ from typing import TYPE_CHECKING

  from anyio import Path as AsyncPath

  from kreuzberg._utils._sync import run_sync

  if TYPE_CHECKING:  # pragma: no cover
-     from collections.abc import Coroutine
+     from collections.abc import Callable, Coroutine


  async def create_temp_file(
kreuzberg/cli.py CHANGED
@@ -4,6 +4,7 @@ from __future__ import annotations

  import json
  import sys
+ import traceback

  from pathlib import Path
  from typing import TYPE_CHECKING, Any

@@ -211,8 +212,6 @@ def handle_error(error: Exception, verbose: bool) -> None:
      else:
          console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
      if verbose:
-         import traceback
-
          console.print("\n[dim]Traceback:[/dim]")
          traceback.print_exc()
      sys.exit(1)
kreuzberg/extraction.py CHANGED
@@ -1,17 +1,23 @@
  from __future__ import annotations

+ import multiprocessing as mp
+ from concurrent.futures import ThreadPoolExecutor, as_completed
  from pathlib import Path
- from typing import TYPE_CHECKING, Final, cast
+ from typing import TYPE_CHECKING, Any, Final, cast

  import anyio

  from kreuzberg import ExtractionResult
  from kreuzberg._chunker import get_chunker
+ from kreuzberg._entity_extraction import extract_entities, extract_keywords
+ from kreuzberg._language_detection import detect_languages
  from kreuzberg._mime_types import (
      validate_mime_type,
  )
  from kreuzberg._registry import ExtractorRegistry
  from kreuzberg._types import ExtractionConfig
+ from kreuzberg._utils._document_cache import get_document_cache
+ from kreuzberg._utils._errors import create_error_context
  from kreuzberg._utils._string import safe_decode
  from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
  from kreuzberg.exceptions import ValidationError
@@ -24,10 +30,7 @@ if TYPE_CHECKING:
  DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()


- async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
-     for validator in config.validators or []:
-         await run_maybe_sync(validator, result)
-
+ def _validate_and_post_process_helper(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
      if config.chunk_content:
          result.chunks = _handle_chunk_content(
              mime_type=result.mime_type,
@@ -35,6 +38,39 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
              content=result.content,
          )

+     if config.extract_entities:
+         try:
+             result.entities = extract_entities(
+                 result.content,
+                 custom_patterns=config.custom_entity_patterns,
+             )
+         except RuntimeError:
+             result.entities = None
+
+     if config.extract_keywords:
+         try:
+             result.keywords = extract_keywords(
+                 result.content,
+                 keyword_count=config.keyword_count,
+             )
+         except RuntimeError:
+             result.keywords = None
+
+     if config.auto_detect_language:
+         result.detected_languages = detect_languages(
+             result.content,
+             config=config.language_detection_config,
+         )
+
+     return result
+
+
+ async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
+     for validator in config.validators or []:
+         await run_maybe_sync(validator, result)
+
+     result = _validate_and_post_process_helper(result, config)
+
      for post_processor in config.post_processing_hooks or []:
          result = await run_maybe_sync(post_processor, result)

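The net effect of this refactor is that the async and sync paths now share one post-processing body, driven by ExtractionConfig flags. A hedged sketch of opting in (the field names come from the hunk above; their defaults and exact types live in kreuzberg._types, and the file path is illustrative):

from kreuzberg import ExtractionConfig
from kreuzberg.extraction import extract_file_sync

config = ExtractionConfig(
    chunk_content=True,         # fills result.chunks
    extract_entities=True,      # fills result.entities; set to None on RuntimeError
    extract_keywords=True,      # fills result.keywords; keyword_count bounds the list
    keyword_count=10,
    auto_detect_language=True,  # fills result.detected_languages
)
result = extract_file_sync(file_path="report.pdf", mime_type=None, config=config)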
@@ -45,12 +81,7 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
      for validator in config.validators or []:
          run_sync_only(validator, result)

-     if config.chunk_content:
-         result.chunks = _handle_chunk_content(
-             mime_type=result.mime_type,
-             config=config,
-             content=result.content,
-         )
+     result = _validate_and_post_process_helper(result, config)

      for post_processor in config.post_processing_hooks or []:
          result = run_sync_only(post_processor, result)
@@ -62,7 +93,7 @@ def _handle_chunk_content(
      mime_type: str,
      config: ExtractionConfig,
      content: str,
- ) -> list[str]:
+ ) -> Any:
      chunker = get_chunker(mime_type=mime_type, max_characters=config.max_chars, overlap_characters=config.max_overlap)
      return chunker.chunks(content)

@@ -109,8 +140,6 @@ async def extract_file(
      Raises:
          ValidationError: If the file path or configuration is invalid.
      """
-     from kreuzberg._utils._document_cache import get_document_cache
-
      cache = get_document_cache()
      path = Path(file_path)
      cached_result = cache.get(path, config)
@@ -167,8 +196,6 @@ async def batch_extract_file(
      if not file_paths:
          return []

-     import multiprocessing as mp
-
      max_concurrency = min(len(file_paths), mp.cpu_count() * 2)
      semaphore = anyio.Semaphore(max_concurrency)

@@ -184,8 +211,6 @@
                  )
                  results[index] = result
              except Exception as e:  # noqa: BLE001
-                 from kreuzberg._utils._errors import create_error_context
-
                  error_result = ExtractionResult(
                      content=f"Error: {type(e).__name__}: {e!s}",
                      mime_type="text/plain",
@@ -224,8 +249,6 @@ async def batch_extract_bytes(
      if not contents:
          return []

-     import multiprocessing as mp
-
      max_concurrency = min(len(contents), mp.cpu_count() * 2)
      semaphore = anyio.Semaphore(max_concurrency)

@@ -237,8 +260,6 @@
                  result = await extract_bytes(content, mime_type, config)
                  results[index] = result
              except Exception as e:  # noqa: BLE001
-                 from kreuzberg._utils._errors import create_error_context
-
                  error_result = ExtractionResult(
                      content=f"Error: {type(e).__name__}: {e!s}",
                      mime_type="text/plain",
@@ -304,8 +325,6 @@ def extract_file_sync(
      Raises:
          ValidationError: If the file path or configuration is invalid.
      """
-     from kreuzberg._utils._document_cache import get_document_cache
-
      cache = get_document_cache()
      path = Path(file_path)
      cached_result = cache.get(path, config)
@@ -362,9 +381,6 @@ def batch_extract_file_sync(
      if len(file_paths) <= 1:
          return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]

-     import multiprocessing as mp
-     from concurrent.futures import ThreadPoolExecutor, as_completed
-
      max_workers = min(len(file_paths), mp.cpu_count())

      def extract_single(file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
@@ -375,8 +391,6 @@
                  extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
              )
          except Exception as e:  # noqa: BLE001
-             from kreuzberg._utils._errors import create_error_context
-
              error_result = ExtractionResult(
                  content=f"Error: {type(e).__name__}: {e!s}",
                  mime_type="text/plain",
@@ -420,9 +434,6 @@ def batch_extract_bytes_sync(
          extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents
      ]

-     import multiprocessing as mp
-     from concurrent.futures import ThreadPoolExecutor, as_completed
-
      max_workers = min(len(contents), mp.cpu_count())

      def extract_single(index_and_content: tuple[int, tuple[bytes, str]]) -> tuple[int, ExtractionResult]:
@@ -431,8 +442,6 @@
          try:
              return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
          except Exception as e:  # noqa: BLE001
-             from kreuzberg._utils._errors import create_error_context
-
              error_result = ExtractionResult(
                  content=f"Error: {type(e).__name__}: {e!s}",
                  mime_type="text/plain",
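
Taken together, these hunks hoist the multiprocessing and ThreadPoolExecutor/as_completed imports to module level, cap concurrency at mp.cpu_count() (doubled for the async paths), and convert per-item failures into error-carrying ExtractionResult objects instead of raising. A brief sketch of the sync batch path under those semantics; the paths are illustrative and the config keyword is inferred from the inner calls shown above:

from kreuzberg import ExtractionConfig
from kreuzberg.extraction import batch_extract_file_sync

# Results come back in input order; a failed file yields a text/plain
# ExtractionResult whose content starts with "Error: ...".
results = batch_extract_file_sync(["a.pdf", "b.docx"], config=ExtractionConfig())
for result in results:
    print(result.mime_type, result.content[:80])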