kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +9 -2
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_entity_extraction.py +238 -0
- kreuzberg/_extractors/_base.py +39 -1
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +27 -22
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +97 -34
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +181 -6
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +318 -11
- kreuzberg/_language_detection.py +95 -0
- kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg/_mcp/server.py +227 -0
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_ocr/__init__.py +10 -1
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +92 -1
- kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg/_ocr/_tesseract.py +569 -5
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +181 -4
- kreuzberg/_utils/_cache.py +52 -4
- kreuzberg/_utils/_device.py +2 -2
- kreuzberg/_utils/_errors.py +3 -7
- kreuzberg/_utils/_process_pool.py +182 -9
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +6 -7
- kreuzberg/_utils/_table.py +261 -0
- kreuzberg/_utils/_tmp.py +2 -2
- kreuzberg/cli.py +1 -2
- kreuzberg/extraction.py +43 -34
- kreuzberg-3.8.1.dist-info/METADATA +301 -0
- kreuzberg-3.8.1.dist-info/RECORD +53 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
- kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
- kreuzberg/_multiprocessing/process_manager.py +0 -188
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
- kreuzberg-3.3.0.dist-info/METADATA +0 -235
- kreuzberg-3.3.0.dist-info/RECORD +0 -48
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,261 @@
|
|
1
|
+
"""Table processing and export utilities."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import csv
|
6
|
+
from io import StringIO
|
7
|
+
from typing import TYPE_CHECKING, Any
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from kreuzberg._types import TableData
|
11
|
+
|
12
|
+
|
13
|
+
def export_table_to_csv(table: TableData, separator: str = ",") -> str:
    r"""Export a TableData object to CSV/TSV format.

    Args:
        table: TableData object containing DataFrame
        separator: Field separator ("," for CSV, "\t" for TSV)

    Returns:
        String representation in CSV/TSV format without the trailing line
        terminator, or an empty string when the table carries no DataFrame.
    """
    df = table.get("df")
    if df is None:
        return ""

    output = StringIO()
    df.to_csv(output, sep=separator, index=False, quoting=csv.QUOTE_MINIMAL)
    # Remove only the trailing line terminator. A bare ``strip()`` would also
    # eat a leading/trailing separator when it is whitespace (e.g. "\t"),
    # silently dropping an empty first or last cell from the output.
    return output.getvalue().rstrip("\r\n")
|
29
|
+
|
30
|
+
|
31
|
+
def export_table_to_tsv(table: TableData) -> str:
    """Serialize a table as tab-separated values.

    Convenience wrapper that delegates to ``export_table_to_csv`` with a tab
    character as the field separator.

    Args:
        table: TableData object containing DataFrame

    Returns:
        String representation in TSV format
    """
    tab = "\t"
    return export_table_to_csv(table, tab)
|
41
|
+
|
42
|
+
|
43
|
+
def enhance_table_markdown(table: TableData) -> str:
    """Generate enhanced markdown table with better formatting.

    The output consists of a header row, a separator row carrying alignment
    hints and one row per DataFrame record.

    Args:
        table: TableData object

    Returns:
        Enhanced markdown table string. When the table has no DataFrame, or
        the DataFrame is empty, the table's "text" entry is returned instead
        (an empty string if that is missing as well).
    """
    df = table.get("df")
    if df is None or df.empty:
        return table.get("text", "")

    # Column names receive the same pipe escaping that _format_table_row
    # applies to cell text; an unescaped "|" in a header would otherwise be
    # read as an extra column boundary.
    headers = [str(col).strip().replace("|", "\\|") for col in df.columns]

    # Decide once per float column whether it renders as integers or decimals.
    float_col_formatting = _analyze_float_columns(df)

    lines = [
        "| " + " | ".join(headers) + " |",
        _generate_separator_row(df),
    ]
    lines.extend(
        "| " + " | ".join(_format_table_row(row, df, float_col_formatting)) + " |"
        for _, row in df.iterrows()
    )

    return "\n".join(lines)
|
79
|
+
|
80
|
+
|
81
|
+
def _generate_separator_row(df: Any) -> str:
|
82
|
+
"""Generate separator row with proper alignment hints."""
|
83
|
+
separators = []
|
84
|
+
for col in df.columns:
|
85
|
+
# Check if column contains mostly numbers for right alignment
|
86
|
+
if df[col].dtype in ["int64", "float64"] or _is_numeric_column(df[col]):
|
87
|
+
separators.append("---:") # Right align numbers
|
88
|
+
else:
|
89
|
+
separators.append("---") # Left align text
|
90
|
+
return "| " + " | ".join(separators) + " |"
|
91
|
+
|
92
|
+
|
93
|
+
def _analyze_float_columns(df: Any) -> dict[str, str]:
|
94
|
+
"""Analyze float columns to determine formatting strategy."""
|
95
|
+
float_col_formatting = {}
|
96
|
+
for col in df.columns:
|
97
|
+
if str(df[col].dtype) == "float64":
|
98
|
+
non_null_values = df[col].dropna()
|
99
|
+
if len(non_null_values) > 0:
|
100
|
+
# If all non-null values are whole numbers, format as integers
|
101
|
+
all_integers = all(val.is_integer() for val in non_null_values)
|
102
|
+
float_col_formatting[col] = "int" if all_integers else "float"
|
103
|
+
else:
|
104
|
+
float_col_formatting[col] = "int"
|
105
|
+
return float_col_formatting
|
106
|
+
|
107
|
+
|
108
|
+
def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -> list[str]:
|
109
|
+
"""Format a single table row with proper value formatting."""
|
110
|
+
formatted_row = []
|
111
|
+
for col_name, value in row.items():
|
112
|
+
if value is None or (isinstance(value, float) and str(value) == "nan"):
|
113
|
+
formatted_row.append("")
|
114
|
+
elif str(df[col_name].dtype) in ["int64", "int32"]:
|
115
|
+
# For integer columns, format as integers
|
116
|
+
formatted_row.append(str(int(value)))
|
117
|
+
elif isinstance(value, float):
|
118
|
+
# For float columns, use the determined formatting strategy
|
119
|
+
if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
|
120
|
+
formatted_row.append(str(int(value)))
|
121
|
+
else:
|
122
|
+
formatted_row.append(f"{value:.2f}")
|
123
|
+
else:
|
124
|
+
# Clean up text values
|
125
|
+
clean_value = str(value).strip().replace("|", "\\|") # Escape pipes
|
126
|
+
formatted_row.append(clean_value)
|
127
|
+
return formatted_row
|
128
|
+
|
129
|
+
|
130
|
+
def _is_numeric_column(series: Any) -> bool:
|
131
|
+
"""Check if a pandas Series contains mostly numeric values."""
|
132
|
+
if len(series) == 0:
|
133
|
+
return False
|
134
|
+
|
135
|
+
try:
|
136
|
+
# Check if already numeric dtype first (fastest path)
|
137
|
+
if str(series.dtype) in {"int64", "float64", "int32", "float32"}:
|
138
|
+
return True
|
139
|
+
|
140
|
+
# Sample-based approach for large series (>1000 rows)
|
141
|
+
sample_size = min(100, len(series))
|
142
|
+
if len(series) > 1000:
|
143
|
+
sample_series = series.dropna().sample(n=sample_size, random_state=42)
|
144
|
+
else:
|
145
|
+
sample_series = series.dropna()
|
146
|
+
|
147
|
+
if len(sample_series) == 0:
|
148
|
+
return False
|
149
|
+
|
150
|
+
# Optimized numeric conversion - avoid exception overhead
|
151
|
+
numeric_count = 0
|
152
|
+
for val in sample_series:
|
153
|
+
val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
|
154
|
+
# Quick check: if it contains only digits, decimal point, minus, plus, or e
|
155
|
+
if val_str and all(c in "0123456789.-+eE" for c in val_str):
|
156
|
+
try:
|
157
|
+
float(val_str)
|
158
|
+
numeric_count += 1
|
159
|
+
except (ValueError, TypeError):
|
160
|
+
pass
|
161
|
+
|
162
|
+
# Consider numeric if >70% of sampled values are numeric
|
163
|
+
return (numeric_count / len(sample_series)) > 0.7
|
164
|
+
|
165
|
+
except (ValueError, TypeError, ZeroDivisionError):
|
166
|
+
return False
|
167
|
+
|
168
|
+
|
169
|
+
def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
    """Summarize a collection of extracted tables.

    Rows and columns are counted only for tables that carry a non-None "df";
    page statistics only consider tables that have a "page_number" entry.

    Args:
        tables: List of TableData objects

    Returns:
        Dictionary with table statistics. For an empty input only the four
        count keys are present, all set to zero.
    """
    if not tables:
        return {
            "table_count": 0,
            "total_rows": 0,
            "total_columns": 0,
            "pages_with_tables": 0,
        }

    table_count = len(tables)

    frames = [entry["df"] for entry in tables if "df" in entry and entry["df"] is not None]
    row_total = sum(len(frame) for frame in frames)
    column_total = sum(len(frame.columns) for frame in frames)

    # One counter per page; its keys double as the set of pages with tables.
    per_page = {}
    for entry in tables:
        if "page_number" in entry:
            page = entry["page_number"]
            per_page[page] = per_page.get(page, 0) + 1

    return {
        "table_count": table_count,
        "total_rows": row_total,
        "total_columns": column_total,
        "pages_with_tables": len(per_page),
        "avg_rows_per_table": row_total / table_count,
        "avg_columns_per_table": column_total / table_count,
        "tables_by_page": per_page,
    }
|
214
|
+
|
215
|
+
|
216
|
+
def extract_table_structure_info(table: TableData) -> dict[str, Any]:
    """Describe the shape and content mix of a single table.

    Args:
        table: TableData object

    Returns:
        Dictionary with structural information: header flag, row and column
        counts, numeric/text column counts, number of empty cells and the
        share of non-empty cells. All values stay at their zero defaults when
        the table has no DataFrame or the DataFrame is empty.
    """
    info = {
        "has_headers": False,
        "row_count": 0,
        "column_count": 0,
        "numeric_columns": 0,
        "text_columns": 0,
        "empty_cells": 0,
        "data_density": 0.0,
    }

    df = table["df"] if "df" in table else None
    if df is None or df.empty:
        return info

    row_count = len(df)
    column_count = len(df.columns)
    info["row_count"] = row_count
    info["column_count"] = column_count
    info["has_headers"] = column_count > 0

    # Every column is either numeric or text, so the text count is the rest.
    numeric = sum(1 for name in df.columns if _is_numeric_column(df[name]))
    info["numeric_columns"] = numeric
    info["text_columns"] = column_count - numeric

    cell_total = row_count * column_count
    if cell_total > 0:
        missing = df.isnull().sum().sum()
        info["empty_cells"] = int(missing)
        info["data_density"] = (cell_total - missing) / cell_total

    return info
|
kreuzberg/_utils/_tmp.py
CHANGED
@@ -3,14 +3,14 @@ from __future__ import annotations
|
|
3
3
|
from contextlib import suppress
|
4
4
|
from pathlib import Path
|
5
5
|
from tempfile import NamedTemporaryFile
|
6
|
-
from typing import TYPE_CHECKING
|
6
|
+
from typing import TYPE_CHECKING
|
7
7
|
|
8
8
|
from anyio import Path as AsyncPath
|
9
9
|
|
10
10
|
from kreuzberg._utils._sync import run_sync
|
11
11
|
|
12
12
|
if TYPE_CHECKING: # pragma: no cover
|
13
|
-
from collections.abc import Coroutine
|
13
|
+
from collections.abc import Callable, Coroutine
|
14
14
|
|
15
15
|
|
16
16
|
async def create_temp_file(
|
kreuzberg/cli.py
CHANGED
@@ -4,6 +4,7 @@ from __future__ import annotations
|
|
4
4
|
|
5
5
|
import json
|
6
6
|
import sys
|
7
|
+
import traceback
|
7
8
|
from pathlib import Path
|
8
9
|
from typing import TYPE_CHECKING, Any
|
9
10
|
|
@@ -211,8 +212,6 @@ def handle_error(error: Exception, verbose: bool) -> None:
|
|
211
212
|
else:
|
212
213
|
console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
|
213
214
|
if verbose:
|
214
|
-
import traceback
|
215
|
-
|
216
215
|
console.print("\n[dim]Traceback:[/dim]")
|
217
216
|
traceback.print_exc()
|
218
217
|
sys.exit(1)
|
kreuzberg/extraction.py
CHANGED
@@ -1,17 +1,23 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import multiprocessing as mp
|
4
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
3
5
|
from pathlib import Path
|
4
|
-
from typing import TYPE_CHECKING, Final, cast
|
6
|
+
from typing import TYPE_CHECKING, Any, Final, cast
|
5
7
|
|
6
8
|
import anyio
|
7
9
|
|
8
10
|
from kreuzberg import ExtractionResult
|
9
11
|
from kreuzberg._chunker import get_chunker
|
12
|
+
from kreuzberg._entity_extraction import extract_entities, extract_keywords
|
13
|
+
from kreuzberg._language_detection import detect_languages
|
10
14
|
from kreuzberg._mime_types import (
|
11
15
|
validate_mime_type,
|
12
16
|
)
|
13
17
|
from kreuzberg._registry import ExtractorRegistry
|
14
18
|
from kreuzberg._types import ExtractionConfig
|
19
|
+
from kreuzberg._utils._document_cache import get_document_cache
|
20
|
+
from kreuzberg._utils._errors import create_error_context
|
15
21
|
from kreuzberg._utils._string import safe_decode
|
16
22
|
from kreuzberg._utils._sync import run_maybe_sync, run_sync_only
|
17
23
|
from kreuzberg.exceptions import ValidationError
|
@@ -24,10 +30,7 @@ if TYPE_CHECKING:
|
|
24
30
|
DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()
|
25
31
|
|
26
32
|
|
27
|
-
|
28
|
-
for validator in config.validators or []:
|
29
|
-
await run_maybe_sync(validator, result)
|
30
|
-
|
33
|
+
def _validate_and_post_process_helper(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
|
31
34
|
if config.chunk_content:
|
32
35
|
result.chunks = _handle_chunk_content(
|
33
36
|
mime_type=result.mime_type,
|
@@ -35,6 +38,39 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
|
|
35
38
|
content=result.content,
|
36
39
|
)
|
37
40
|
|
41
|
+
if config.extract_entities:
|
42
|
+
try:
|
43
|
+
result.entities = extract_entities(
|
44
|
+
result.content,
|
45
|
+
custom_patterns=config.custom_entity_patterns,
|
46
|
+
)
|
47
|
+
except RuntimeError:
|
48
|
+
result.entities = None
|
49
|
+
|
50
|
+
if config.extract_keywords:
|
51
|
+
try:
|
52
|
+
result.keywords = extract_keywords(
|
53
|
+
result.content,
|
54
|
+
keyword_count=config.keyword_count,
|
55
|
+
)
|
56
|
+
except RuntimeError:
|
57
|
+
result.keywords = None
|
58
|
+
|
59
|
+
if config.auto_detect_language:
|
60
|
+
result.detected_languages = detect_languages(
|
61
|
+
result.content,
|
62
|
+
config=config.language_detection_config,
|
63
|
+
)
|
64
|
+
|
65
|
+
return result
|
66
|
+
|
67
|
+
|
68
|
+
async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
|
69
|
+
for validator in config.validators or []:
|
70
|
+
await run_maybe_sync(validator, result)
|
71
|
+
|
72
|
+
result = _validate_and_post_process_helper(result, config)
|
73
|
+
|
38
74
|
for post_processor in config.post_processing_hooks or []:
|
39
75
|
result = await run_maybe_sync(post_processor, result)
|
40
76
|
|
@@ -45,12 +81,7 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
|
|
45
81
|
for validator in config.validators or []:
|
46
82
|
run_sync_only(validator, result)
|
47
83
|
|
48
|
-
|
49
|
-
result.chunks = _handle_chunk_content(
|
50
|
-
mime_type=result.mime_type,
|
51
|
-
config=config,
|
52
|
-
content=result.content,
|
53
|
-
)
|
84
|
+
result = _validate_and_post_process_helper(result, config)
|
54
85
|
|
55
86
|
for post_processor in config.post_processing_hooks or []:
|
56
87
|
result = run_sync_only(post_processor, result)
|
@@ -62,7 +93,7 @@ def _handle_chunk_content(
|
|
62
93
|
mime_type: str,
|
63
94
|
config: ExtractionConfig,
|
64
95
|
content: str,
|
65
|
-
) ->
|
96
|
+
) -> Any:
|
66
97
|
chunker = get_chunker(mime_type=mime_type, max_characters=config.max_chars, overlap_characters=config.max_overlap)
|
67
98
|
return chunker.chunks(content)
|
68
99
|
|
@@ -109,8 +140,6 @@ async def extract_file(
|
|
109
140
|
Raises:
|
110
141
|
ValidationError: If the file path or configuration is invalid.
|
111
142
|
"""
|
112
|
-
from kreuzberg._utils._document_cache import get_document_cache
|
113
|
-
|
114
143
|
cache = get_document_cache()
|
115
144
|
path = Path(file_path)
|
116
145
|
cached_result = cache.get(path, config)
|
@@ -167,8 +196,6 @@ async def batch_extract_file(
|
|
167
196
|
if not file_paths:
|
168
197
|
return []
|
169
198
|
|
170
|
-
import multiprocessing as mp
|
171
|
-
|
172
199
|
max_concurrency = min(len(file_paths), mp.cpu_count() * 2)
|
173
200
|
semaphore = anyio.Semaphore(max_concurrency)
|
174
201
|
|
@@ -184,8 +211,6 @@ async def batch_extract_file(
|
|
184
211
|
)
|
185
212
|
results[index] = result
|
186
213
|
except Exception as e: # noqa: BLE001
|
187
|
-
from kreuzberg._utils._errors import create_error_context
|
188
|
-
|
189
214
|
error_result = ExtractionResult(
|
190
215
|
content=f"Error: {type(e).__name__}: {e!s}",
|
191
216
|
mime_type="text/plain",
|
@@ -224,8 +249,6 @@ async def batch_extract_bytes(
|
|
224
249
|
if not contents:
|
225
250
|
return []
|
226
251
|
|
227
|
-
import multiprocessing as mp
|
228
|
-
|
229
252
|
max_concurrency = min(len(contents), mp.cpu_count() * 2)
|
230
253
|
semaphore = anyio.Semaphore(max_concurrency)
|
231
254
|
|
@@ -237,8 +260,6 @@ async def batch_extract_bytes(
|
|
237
260
|
result = await extract_bytes(content, mime_type, config)
|
238
261
|
results[index] = result
|
239
262
|
except Exception as e: # noqa: BLE001
|
240
|
-
from kreuzberg._utils._errors import create_error_context
|
241
|
-
|
242
263
|
error_result = ExtractionResult(
|
243
264
|
content=f"Error: {type(e).__name__}: {e!s}",
|
244
265
|
mime_type="text/plain",
|
@@ -304,8 +325,6 @@ def extract_file_sync(
|
|
304
325
|
Raises:
|
305
326
|
ValidationError: If the file path or configuration is invalid.
|
306
327
|
"""
|
307
|
-
from kreuzberg._utils._document_cache import get_document_cache
|
308
|
-
|
309
328
|
cache = get_document_cache()
|
310
329
|
path = Path(file_path)
|
311
330
|
cached_result = cache.get(path, config)
|
@@ -362,9 +381,6 @@ def batch_extract_file_sync(
|
|
362
381
|
if len(file_paths) <= 1:
|
363
382
|
return [extract_file_sync(file_path=Path(file_path), mime_type=None, config=config) for file_path in file_paths]
|
364
383
|
|
365
|
-
import multiprocessing as mp
|
366
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
367
|
-
|
368
384
|
max_workers = min(len(file_paths), mp.cpu_count())
|
369
385
|
|
370
386
|
def extract_single(file_path: PathLike[str] | str) -> tuple[int, ExtractionResult]:
|
@@ -375,8 +391,6 @@ def batch_extract_file_sync(
|
|
375
391
|
extract_file_sync(file_path=Path(file_path), mime_type=None, config=config),
|
376
392
|
)
|
377
393
|
except Exception as e: # noqa: BLE001
|
378
|
-
from kreuzberg._utils._errors import create_error_context
|
379
|
-
|
380
394
|
error_result = ExtractionResult(
|
381
395
|
content=f"Error: {type(e).__name__}: {e!s}",
|
382
396
|
mime_type="text/plain",
|
@@ -420,9 +434,6 @@ def batch_extract_bytes_sync(
|
|
420
434
|
extract_bytes_sync(content=content, mime_type=mime_type, config=config) for content, mime_type in contents
|
421
435
|
]
|
422
436
|
|
423
|
-
import multiprocessing as mp
|
424
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
425
|
-
|
426
437
|
max_workers = min(len(contents), mp.cpu_count())
|
427
438
|
|
428
439
|
def extract_single(index_and_content: tuple[int, tuple[bytes, str]]) -> tuple[int, ExtractionResult]:
|
@@ -431,8 +442,6 @@ def batch_extract_bytes_sync(
|
|
431
442
|
try:
|
432
443
|
return (index, extract_bytes_sync(content=content, mime_type=mime_type, config=config))
|
433
444
|
except Exception as e: # noqa: BLE001
|
434
|
-
from kreuzberg._utils._errors import create_error_context
|
435
|
-
|
436
445
|
error_result = ExtractionResult(
|
437
446
|
content=f"Error: {type(e).__name__}: {e!s}",
|
438
447
|
mime_type="text/plain",
|