kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_table.py
CHANGED
@@ -1,8 +1,6 @@
|
|
1
|
-
"""Table processing and export utilities."""
|
2
|
-
|
3
1
|
from __future__ import annotations
|
4
2
|
|
5
|
-
import
|
3
|
+
import io
|
6
4
|
from typing import TYPE_CHECKING, Any
|
7
5
|
|
8
6
|
if TYPE_CHECKING:
|
@@ -10,67 +8,38 @@ if TYPE_CHECKING:
|
|
10
8
|
|
11
9
|
|
12
10
|
def export_table_to_csv(table: TableData, separator: str = ",") -> str:
|
13
|
-
r"""Export a TableData object to CSV/TSV format.
|
14
|
-
|
15
|
-
Args:
|
16
|
-
table: TableData object containing DataFrame
|
17
|
-
separator: Field separator ("," for CSV, "\t" for TSV)
|
18
|
-
|
19
|
-
Returns:
|
20
|
-
String representation in CSV/TSV format
|
21
|
-
"""
|
22
11
|
if "df" not in table or table["df"] is None:
|
23
12
|
return ""
|
24
13
|
|
25
|
-
|
26
|
-
|
27
|
-
|
14
|
+
buffer = io.StringIO()
|
15
|
+
df = table["df"]
|
16
|
+
df.write_csv(buffer, separator=separator, include_header=True)
|
17
|
+
return buffer.getvalue().strip()
|
28
18
|
|
29
19
|
|
30
20
|
def export_table_to_tsv(table: TableData) -> str:
|
31
|
-
"""Export a TableData object to TSV format.
|
32
|
-
|
33
|
-
Args:
|
34
|
-
table: TableData object containing DataFrame
|
35
|
-
|
36
|
-
Returns:
|
37
|
-
String representation in TSV format
|
38
|
-
"""
|
39
21
|
return export_table_to_csv(table, separator="\t")
|
40
22
|
|
41
23
|
|
42
24
|
def enhance_table_markdown(table: TableData) -> str:
|
43
|
-
"""Generate enhanced markdown table with better formatting.
|
44
|
-
|
45
|
-
Args:
|
46
|
-
table: TableData object
|
47
|
-
|
48
|
-
Returns:
|
49
|
-
Enhanced markdown table string
|
50
|
-
"""
|
51
25
|
if "df" not in table or table["df"] is None:
|
52
26
|
return table.get("text", "")
|
53
27
|
|
54
28
|
df = table["df"]
|
55
29
|
|
56
|
-
if df.
|
30
|
+
if df.is_empty():
|
57
31
|
return table.get("text", "")
|
58
32
|
|
59
|
-
# Create enhanced markdown with proper alignment
|
60
33
|
lines = []
|
61
34
|
|
62
|
-
# Header row
|
63
35
|
headers = [str(col).strip() for col in df.columns]
|
64
36
|
lines.append("| " + " | ".join(headers) + " |")
|
65
37
|
|
66
|
-
# Separator row with alignment hints
|
67
38
|
lines.append(_generate_separator_row(df))
|
68
39
|
|
69
|
-
# Analyze float columns to determine formatting strategy
|
70
40
|
float_col_formatting = _analyze_float_columns(df)
|
71
41
|
|
72
|
-
|
73
|
-
for _, row in df.iterrows():
|
42
|
+
for row in df.iter_rows(named=True):
|
74
43
|
formatted_row = _format_table_row(row, df, float_col_formatting)
|
75
44
|
lines.append("| " + " | ".join(formatted_row) + " |")
|
76
45
|
|
@@ -78,79 +47,77 @@ def enhance_table_markdown(table: TableData) -> str:
|
|
78
47
|
|
79
48
|
|
80
49
|
def _generate_separator_row(df: Any) -> str:
|
81
|
-
"""Generate separator row with proper alignment hints."""
|
82
50
|
separators = []
|
83
51
|
for col in df.columns:
|
84
|
-
|
85
|
-
if
|
86
|
-
separators.append("---:")
|
52
|
+
dtype_str = str(df[col].dtype)
|
53
|
+
if dtype_str in ["Int64", "Float64", "Int32", "Float32"] or _is_numeric_column(df[col]):
|
54
|
+
separators.append("---:")
|
87
55
|
else:
|
88
|
-
separators.append("---")
|
56
|
+
separators.append("---")
|
89
57
|
return "| " + " | ".join(separators) + " |"
|
90
58
|
|
91
59
|
|
92
60
|
def _analyze_float_columns(df: Any) -> dict[str, str]:
|
93
|
-
"""Analyze float columns to determine formatting strategy."""
|
94
61
|
float_col_formatting = {}
|
95
62
|
for col in df.columns:
|
96
|
-
|
97
|
-
|
63
|
+
dtype_str = str(df[col].dtype)
|
64
|
+
if dtype_str in ["Float64", "Float32"]:
|
65
|
+
non_null_values = df[col].drop_nulls()
|
98
66
|
if len(non_null_values) > 0:
|
99
|
-
|
100
|
-
|
101
|
-
|
67
|
+
try:
|
68
|
+
values_list = non_null_values.to_list()
|
69
|
+
all_integers = all(float(val).is_integer() for val in values_list if val is not None)
|
70
|
+
float_col_formatting[col] = "int" if all_integers else "float"
|
71
|
+
except (ValueError, AttributeError):
|
72
|
+
float_col_formatting[col] = "float"
|
102
73
|
else:
|
103
74
|
float_col_formatting[col] = "int"
|
104
75
|
return float_col_formatting
|
105
76
|
|
106
77
|
|
107
78
|
def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -> list[str]:
|
108
|
-
"""Format a single table row with proper value formatting."""
|
109
79
|
formatted_row = []
|
110
80
|
for col_name, value in row.items():
|
111
|
-
if value is None
|
81
|
+
if value is None:
|
112
82
|
formatted_row.append("")
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
elif isinstance(value, float):
|
117
|
-
# For float columns, use the determined formatting strategy
|
118
|
-
if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
|
83
|
+
else:
|
84
|
+
dtype_str = str(df[col_name].dtype)
|
85
|
+
if dtype_str in ["Int64", "Int32"]:
|
119
86
|
formatted_row.append(str(int(value)))
|
87
|
+
elif isinstance(value, float):
|
88
|
+
if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
|
89
|
+
formatted_row.append(str(int(value)))
|
90
|
+
else:
|
91
|
+
formatted_row.append(f"{value:.2f}")
|
120
92
|
else:
|
121
|
-
|
122
|
-
|
123
|
-
# Clean up text values
|
124
|
-
clean_value = str(value).strip().replace("|", "\\|") # Escape pipes
|
125
|
-
formatted_row.append(clean_value)
|
93
|
+
clean_value = str(value).strip().replace("|", "\\|")
|
94
|
+
formatted_row.append(clean_value)
|
126
95
|
return formatted_row
|
127
96
|
|
128
97
|
|
129
98
|
def _is_numeric_column(series: Any) -> bool:
|
130
|
-
"""Check if a pandas Series contains mostly numeric values."""
|
131
99
|
if len(series) == 0:
|
132
100
|
return False
|
133
101
|
|
134
102
|
try:
|
135
|
-
|
136
|
-
if
|
103
|
+
dtype_str = str(series.dtype)
|
104
|
+
if dtype_str in {"Int64", "Float64", "Int32", "Float32"}:
|
137
105
|
return True
|
138
106
|
|
139
|
-
# Sample-based approach for large series (>1000 rows)
|
140
107
|
sample_size = min(100, len(series))
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
108
|
+
series_no_nulls = series.drop_nulls()
|
109
|
+
|
110
|
+
if len(series_no_nulls) == 0:
|
111
|
+
return False
|
112
|
+
|
113
|
+
sample_series = series_no_nulls.slice(0, sample_size) if len(series_no_nulls) > 1000 else series_no_nulls
|
145
114
|
|
146
115
|
if len(sample_series) == 0:
|
147
116
|
return False
|
148
117
|
|
149
|
-
# Optimized numeric conversion - avoid exception overhead
|
150
118
|
numeric_count = 0
|
151
|
-
for val in sample_series:
|
119
|
+
for val in sample_series.to_list():
|
152
120
|
val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
|
153
|
-
# Quick check: if it contains only digits, decimal point, minus, plus, or e
|
154
121
|
if val_str and all(c in "0123456789.-+eE" for c in val_str):
|
155
122
|
try:
|
156
123
|
float(val_str)
|
@@ -158,7 +125,6 @@ def _is_numeric_column(series: Any) -> bool:
|
|
158
125
|
except (ValueError, TypeError):
|
159
126
|
pass
|
160
127
|
|
161
|
-
# Consider numeric if >70% of sampled values are numeric
|
162
128
|
return (numeric_count / len(sample_series)) > 0.7
|
163
129
|
|
164
130
|
except (ValueError, TypeError, ZeroDivisionError):
|
@@ -166,14 +132,6 @@ def _is_numeric_column(series: Any) -> bool:
|
|
166
132
|
|
167
133
|
|
168
134
|
def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
|
169
|
-
"""Generate summary statistics for extracted tables.
|
170
|
-
|
171
|
-
Args:
|
172
|
-
tables: List of TableData objects
|
173
|
-
|
174
|
-
Returns:
|
175
|
-
Dictionary with table statistics
|
176
|
-
"""
|
177
135
|
if not tables:
|
178
136
|
return {
|
179
137
|
"table_count": 0,
|
@@ -190,8 +148,8 @@ def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
|
|
190
148
|
for table in tables:
|
191
149
|
if "df" in table and table["df"] is not None:
|
192
150
|
df = table["df"]
|
193
|
-
total_rows +=
|
194
|
-
total_columns +=
|
151
|
+
total_rows += df.height
|
152
|
+
total_columns += df.width
|
195
153
|
|
196
154
|
if "page_number" in table:
|
197
155
|
page_num = table["page_number"]
|
@@ -213,14 +171,6 @@ def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
|
|
213
171
|
|
214
172
|
|
215
173
|
def extract_table_structure_info(table: TableData) -> dict[str, Any]:
|
216
|
-
"""Extract structural information from a table.
|
217
|
-
|
218
|
-
Args:
|
219
|
-
table: TableData object
|
220
|
-
|
221
|
-
Returns:
|
222
|
-
Dictionary with structural information
|
223
|
-
"""
|
224
174
|
info = {
|
225
175
|
"has_headers": False,
|
226
176
|
"row_count": 0,
|
@@ -236,25 +186,23 @@ def extract_table_structure_info(table: TableData) -> dict[str, Any]:
|
|
236
186
|
|
237
187
|
df = table["df"]
|
238
188
|
|
239
|
-
if df.
|
189
|
+
if df.is_empty():
|
240
190
|
return info
|
241
191
|
|
242
|
-
info["row_count"] =
|
243
|
-
info["column_count"] =
|
244
|
-
info["has_headers"] =
|
192
|
+
info["row_count"] = df.height
|
193
|
+
info["column_count"] = df.width
|
194
|
+
info["has_headers"] = df.width > 0
|
245
195
|
|
246
|
-
# Analyze column types
|
247
196
|
for col in df.columns:
|
248
197
|
if _is_numeric_column(df[col]):
|
249
198
|
info["numeric_columns"] += 1
|
250
199
|
else:
|
251
200
|
info["text_columns"] += 1
|
252
201
|
|
253
|
-
|
254
|
-
total_cells = len(df) * len(df.columns)
|
202
|
+
total_cells = df.height * df.width
|
255
203
|
if total_cells > 0:
|
256
|
-
empty_cells = df.
|
257
|
-
info["empty_cells"] =
|
204
|
+
empty_cells = df.null_count().sum().item()
|
205
|
+
info["empty_cells"] = empty_cells
|
258
206
|
info["data_density"] = (total_cells - empty_cells) / total_cells
|
259
207
|
|
260
208
|
return info
|
kreuzberg/_utils/_tmp.py
CHANGED
@@ -16,15 +16,6 @@ if TYPE_CHECKING: # pragma: no cover
|
|
16
16
|
async def create_temp_file(
|
17
17
|
extension: str, content: bytes | None = None
|
18
18
|
) -> tuple[Path, Callable[[], Coroutine[None, None, None]]]:
|
19
|
-
"""Create a temporary file that is closed.
|
20
|
-
|
21
|
-
Args:
|
22
|
-
extension: The file extension.
|
23
|
-
content: The content to write to the file.
|
24
|
-
|
25
|
-
Returns:
|
26
|
-
The temporary file path.
|
27
|
-
"""
|
28
19
|
file = await run_sync(NamedTemporaryFile, suffix=extension, delete=False)
|
29
20
|
if content:
|
30
21
|
await AsyncPath(file.name).write_bytes(content)
|
kreuzberg/cli.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
"""Command-line interface for kreuzberg."""
|
2
|
-
|
3
1
|
from __future__ import annotations
|
4
2
|
|
5
3
|
import json
|
@@ -84,11 +82,11 @@ def format_extraction_result(result: ExtractionResult, show_metadata: bool, outp
|
|
84
82
|
return "\n".join(output_parts)
|
85
83
|
|
86
84
|
|
87
|
-
def _load_config(
|
85
|
+
def _load_config(config_path: Path | None, verbose: bool) -> dict[str, Any]:
|
88
86
|
"""Load configuration from file or find default."""
|
89
87
|
file_config = {}
|
90
|
-
if
|
91
|
-
file_config = load_config_from_file(
|
88
|
+
if config_path:
|
89
|
+
file_config = load_config_from_file(config_path)
|
92
90
|
else:
|
93
91
|
default_config = find_config_file()
|
94
92
|
if default_config:
|
@@ -101,39 +99,38 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
|
|
101
99
|
return file_config
|
102
100
|
|
103
101
|
|
104
|
-
def _build_cli_args(
|
105
|
-
force_ocr: bool,
|
106
|
-
chunk_content: bool,
|
107
|
-
extract_tables: bool,
|
108
|
-
max_chars: int,
|
109
|
-
max_overlap: int,
|
110
|
-
ocr_backend: str | None,
|
111
|
-
tesseract_lang: str | None,
|
112
|
-
tesseract_psm: int | None,
|
113
|
-
easyocr_languages: str | None,
|
114
|
-
paddleocr_languages: str | None,
|
115
|
-
) -> dict[str, Any]:
|
102
|
+
def _build_cli_args(params: dict[str, Any]) -> dict[str, Any]:
|
116
103
|
"""Build CLI arguments dictionary."""
|
117
104
|
cli_args: dict[str, Any] = {
|
118
|
-
"force_ocr": force_ocr if force_ocr else None,
|
119
|
-
"chunk_content": chunk_content if chunk_content else None,
|
120
|
-
"extract_tables": extract_tables if extract_tables else None,
|
121
|
-
"max_chars": max_chars if max_chars != DEFAULT_MAX_CHARACTERS else None,
|
122
|
-
"max_overlap": max_overlap if max_overlap != DEFAULT_MAX_OVERLAP else None,
|
123
|
-
"ocr_backend": ocr_backend,
|
105
|
+
"force_ocr": params["force_ocr"] if params["force_ocr"] else None,
|
106
|
+
"chunk_content": params["chunk_content"] if params["chunk_content"] else None,
|
107
|
+
"extract_tables": params["extract_tables"] if params["extract_tables"] else None,
|
108
|
+
"max_chars": params["max_chars"] if params["max_chars"] != DEFAULT_MAX_CHARACTERS else None,
|
109
|
+
"max_overlap": params["max_overlap"] if params["max_overlap"] != DEFAULT_MAX_OVERLAP else None,
|
110
|
+
"ocr_backend": params["ocr_backend"],
|
124
111
|
}
|
125
112
|
|
126
|
-
|
113
|
+
ocr_backend = params["ocr_backend"]
|
114
|
+
if ocr_backend == "tesseract" and (
|
115
|
+
params["tesseract_lang"]
|
116
|
+
or params["tesseract_psm"] is not None
|
117
|
+
or params["tesseract_output_format"]
|
118
|
+
or params["enable_table_detection"]
|
119
|
+
):
|
127
120
|
tesseract_config = {}
|
128
|
-
if tesseract_lang:
|
129
|
-
tesseract_config["language"] = tesseract_lang
|
130
|
-
if tesseract_psm is not None:
|
131
|
-
tesseract_config["psm"] = tesseract_psm
|
121
|
+
if params["tesseract_lang"]:
|
122
|
+
tesseract_config["language"] = params["tesseract_lang"]
|
123
|
+
if params["tesseract_psm"] is not None:
|
124
|
+
tesseract_config["psm"] = params["tesseract_psm"]
|
125
|
+
if params["tesseract_output_format"]:
|
126
|
+
tesseract_config["output_format"] = params["tesseract_output_format"]
|
127
|
+
if params["enable_table_detection"]:
|
128
|
+
tesseract_config["enable_table_detection"] = True
|
132
129
|
cli_args["tesseract_config"] = tesseract_config
|
133
|
-
elif ocr_backend == "easyocr" and easyocr_languages:
|
134
|
-
cli_args["easyocr_config"] = {"languages": easyocr_languages.split(",")}
|
135
|
-
elif ocr_backend == "paddleocr" and paddleocr_languages:
|
136
|
-
cli_args["paddleocr_config"] = {"languages": paddleocr_languages.split(",")}
|
130
|
+
elif ocr_backend == "easyocr" and params["easyocr_languages"]:
|
131
|
+
cli_args["easyocr_config"] = {"languages": params["easyocr_languages"].split(",")}
|
132
|
+
elif ocr_backend == "paddleocr" and params["paddleocr_languages"]:
|
133
|
+
cli_args["paddleocr_config"] = {"languages": params["paddleocr_languages"].split(",")}
|
137
134
|
|
138
135
|
return cli_args
|
139
136
|
|
@@ -158,7 +155,7 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
|
|
158
155
|
progress.add_task("Extracting text...", total=None)
|
159
156
|
|
160
157
|
try:
|
161
|
-
import magic # type: ignore[import-not-found]
|
158
|
+
import magic # type: ignore[import-not-found] # noqa: PLC0415
|
162
159
|
|
163
160
|
mime_type = magic.from_buffer(input_bytes, mime=True)
|
164
161
|
except ImportError: # pragma: no cover
|
@@ -188,7 +185,10 @@ def _write_output(
|
|
188
185
|
if verbose:
|
189
186
|
console.print(f"[green]✓[/green] Output written to: {output}")
|
190
187
|
else:
|
191
|
-
|
188
|
+
try:
|
189
|
+
click.echo(formatted_output)
|
190
|
+
except UnicodeEncodeError:
|
191
|
+
sys.stdout.buffer.write(formatted_output.encode("utf-8"))
|
192
192
|
|
193
193
|
|
194
194
|
def handle_error(error: Exception, verbose: bool) -> None: # pragma: no cover
|
@@ -248,71 +248,51 @@ def cli(ctx: click.Context) -> None:
|
|
248
248
|
@click.option(
|
249
249
|
"--ocr-backend", type=OcrBackendParamType(), help="OCR backend to use (tesseract, easyocr, paddleocr, none)"
|
250
250
|
)
|
251
|
-
@click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
|
251
|
+
@click.option("--config", "config_file", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
|
252
252
|
@click.option("--show-metadata", is_flag=True, help="Include metadata in output")
|
253
253
|
@click.option("--output-format", type=click.Choice(["text", "json"]), default="text", help="Output format")
|
254
254
|
@click.option("-v", "--verbose", is_flag=True, help="Verbose output for debugging")
|
255
255
|
@click.option("--tesseract-lang", help="Tesseract language(s) (e.g., 'eng+deu')")
|
256
256
|
@click.option("--tesseract-psm", type=int, help="Tesseract PSM mode (0-13)")
|
257
|
+
@click.option(
|
258
|
+
"--tesseract-output-format",
|
259
|
+
type=click.Choice(["text", "markdown", "tsv", "hocr"]),
|
260
|
+
help="Tesseract OCR output format (default: markdown)",
|
261
|
+
)
|
262
|
+
@click.option(
|
263
|
+
"--enable-table-detection", is_flag=True, help="Enable table extraction from scanned documents (with TSV format)"
|
264
|
+
)
|
257
265
|
@click.option("--easyocr-languages", help="EasyOCR language codes (comma-separated, e.g., 'en,de')")
|
258
266
|
@click.option("--paddleocr-languages", help="PaddleOCR language codes (comma-separated, e.g., 'en,german')")
|
259
267
|
@click.pass_context
|
260
|
-
def extract(
|
261
|
-
_: click.Context,
|
262
|
-
file: Path | None,
|
263
|
-
output: Path | None,
|
264
|
-
force_ocr: bool,
|
265
|
-
chunk_content: bool,
|
266
|
-
extract_tables: bool,
|
267
|
-
max_chars: int,
|
268
|
-
max_overlap: int,
|
269
|
-
ocr_backend: str | None,
|
270
|
-
config: Path | None,
|
271
|
-
show_metadata: bool,
|
272
|
-
output_format: str,
|
273
|
-
verbose: bool,
|
274
|
-
tesseract_lang: str | None,
|
275
|
-
tesseract_psm: int | None,
|
276
|
-
easyocr_languages: str | None,
|
277
|
-
paddleocr_languages: str | None,
|
278
|
-
) -> None:
|
268
|
+
def extract(ctx: click.Context) -> None:
|
279
269
|
"""Extract text from a document.
|
280
270
|
|
281
271
|
FILE can be a path to a document or '-' to read from stdin.
|
282
272
|
If FILE is omitted, reads from stdin.
|
283
273
|
"""
|
274
|
+
params = ctx.params
|
284
275
|
try:
|
285
|
-
file_config = _load_config(
|
286
|
-
|
287
|
-
cli_args = _build_cli_args(
|
288
|
-
force_ocr,
|
289
|
-
chunk_content,
|
290
|
-
extract_tables,
|
291
|
-
max_chars,
|
292
|
-
max_overlap,
|
293
|
-
ocr_backend,
|
294
|
-
tesseract_lang,
|
295
|
-
tesseract_psm,
|
296
|
-
easyocr_languages,
|
297
|
-
paddleocr_languages,
|
298
|
-
)
|
276
|
+
file_config = _load_config(params["config_file"], params["verbose"])
|
277
|
+
|
278
|
+
cli_args = _build_cli_args(params)
|
299
279
|
|
300
280
|
extraction_config = build_extraction_config(file_config, cli_args)
|
301
281
|
|
302
|
-
result = _perform_extraction(file, extraction_config, verbose)
|
282
|
+
result = _perform_extraction(params["file"], extraction_config, params["verbose"])
|
303
283
|
|
304
|
-
_write_output(result, output, show_metadata, output_format, verbose)
|
284
|
+
_write_output(result, params["output"], params["show_metadata"], params["output_format"], params["verbose"])
|
305
285
|
|
306
286
|
except Exception as e: # noqa: BLE001
|
307
|
-
handle_error(e, verbose)
|
287
|
+
handle_error(e, params["verbose"])
|
308
288
|
|
309
289
|
|
310
290
|
@cli.command()
|
311
|
-
@click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
|
312
|
-
def config(
|
291
|
+
@click.option("--config", "config_file", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
|
292
|
+
def config(config_file: Path | None) -> None:
|
313
293
|
"""Show current configuration."""
|
314
294
|
try:
|
315
|
-
config_path =
|
295
|
+
config_path = config_file or find_config_file()
|
316
296
|
|
317
297
|
if config_path:
|
318
298
|
file_config = load_config_from_file(config_path)
|
kreuzberg/extraction.py
CHANGED
@@ -151,20 +151,22 @@ async def extract_file(
|
|
151
151
|
"""
|
152
152
|
cache = get_document_cache()
|
153
153
|
path = Path(file_path)
|
154
|
-
cached_result = cache.get(path, config)
|
155
|
-
if cached_result is not None:
|
156
|
-
return cached_result
|
157
154
|
|
158
|
-
if
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
# Try cache again after waiting for other process to complete # ~keep
|
163
|
-
cached_result = cache.get(path, config) # pragma: no cover
|
164
|
-
if cached_result is not None: # pragma: no cover
|
155
|
+
if config.use_cache:
|
156
|
+
cached_result = cache.get(path, config)
|
157
|
+
if cached_result is not None:
|
165
158
|
return cached_result
|
166
159
|
|
167
|
-
|
160
|
+
if cache.is_processing(path, config):
|
161
|
+
event = cache.mark_processing(path, config)
|
162
|
+
await anyio.to_thread.run_sync(event.wait) # pragma: no cover
|
163
|
+
|
164
|
+
# Try cache again after waiting for other process to complete # ~keep
|
165
|
+
cached_result = cache.get(path, config) # pragma: no cover
|
166
|
+
if cached_result is not None: # pragma: no cover
|
167
|
+
return cached_result
|
168
|
+
|
169
|
+
cache.mark_processing(path, config)
|
168
170
|
|
169
171
|
try:
|
170
172
|
if not path.exists():
|
@@ -183,11 +185,13 @@ async def extract_file(
|
|
183
185
|
|
184
186
|
result = await _validate_and_post_process_async(result=result, config=config, file_path=path)
|
185
187
|
|
186
|
-
|
188
|
+
if config.use_cache:
|
189
|
+
cache.set(path, config, result)
|
187
190
|
|
188
191
|
return result
|
189
192
|
finally:
|
190
|
-
|
193
|
+
if config.use_cache:
|
194
|
+
cache.mark_complete(path, config)
|
191
195
|
|
192
196
|
|
193
197
|
async def batch_extract_file(
|
@@ -224,7 +228,7 @@ async def batch_extract_file(
|
|
224
228
|
content=f"Error: {type(e).__name__}: {e!s}",
|
225
229
|
mime_type="text/plain",
|
226
230
|
metadata={ # type: ignore[typeddict-unknown-key]
|
227
|
-
"error":
|
231
|
+
"error": f"{type(e).__name__}: {e!s}",
|
228
232
|
"error_context": create_error_context(
|
229
233
|
operation="batch_extract_file",
|
230
234
|
file_path=path,
|
@@ -273,7 +277,7 @@ async def batch_extract_bytes(
|
|
273
277
|
content=f"Error: {type(e).__name__}: {e!s}",
|
274
278
|
mime_type="text/plain",
|
275
279
|
metadata={ # type: ignore[typeddict-unknown-key]
|
276
|
-
"error":
|
280
|
+
"error": f"{type(e).__name__}: {e!s}",
|
277
281
|
"error_context": create_error_context(
|
278
282
|
operation="batch_extract_bytes",
|
279
283
|
error=e,
|
@@ -336,20 +340,22 @@ def extract_file_sync(
|
|
336
340
|
"""
|
337
341
|
cache = get_document_cache()
|
338
342
|
path = Path(file_path)
|
339
|
-
cached_result = cache.get(path, config)
|
340
|
-
if cached_result is not None:
|
341
|
-
return cached_result
|
342
343
|
|
343
|
-
if
|
344
|
-
|
345
|
-
|
346
|
-
|
347
|
-
# Try cache again after waiting for other process to complete # ~keep
|
348
|
-
cached_result = cache.get(path, config) # pragma: no cover
|
349
|
-
if cached_result is not None: # pragma: no cover
|
344
|
+
if config.use_cache:
|
345
|
+
cached_result = cache.get(path, config)
|
346
|
+
if cached_result is not None:
|
350
347
|
return cached_result
|
351
348
|
|
352
|
-
|
349
|
+
if cache.is_processing(path, config):
|
350
|
+
event = cache.mark_processing(path, config)
|
351
|
+
event.wait() # pragma: no cover
|
352
|
+
|
353
|
+
# Try cache again after waiting for other process to complete # ~keep
|
354
|
+
cached_result = cache.get(path, config) # pragma: no cover
|
355
|
+
if cached_result is not None: # pragma: no cover
|
356
|
+
return cached_result
|
357
|
+
|
358
|
+
cache.mark_processing(path, config)
|
353
359
|
|
354
360
|
try:
|
355
361
|
if not path.exists():
|
@@ -360,7 +366,7 @@ def extract_file_sync(
|
|
360
366
|
result = extractor.extract_path_sync(Path(file_path))
|
361
367
|
else:
|
362
368
|
result = ExtractionResult(
|
363
|
-
content=Path(file_path).read_text(),
|
369
|
+
content=Path(file_path).read_text(encoding="utf-8"),
|
364
370
|
chunks=[],
|
365
371
|
mime_type=mime_type,
|
366
372
|
metadata={},
|
@@ -368,11 +374,13 @@ def extract_file_sync(
|
|
368
374
|
|
369
375
|
result = _validate_and_post_process_sync(result=result, config=config, file_path=path)
|
370
376
|
|
371
|
-
|
377
|
+
if config.use_cache:
|
378
|
+
cache.set(path, config, result)
|
372
379
|
|
373
380
|
return result
|
374
381
|
finally:
|
375
|
-
|
382
|
+
if config.use_cache:
|
383
|
+
cache.mark_complete(path, config)
|
376
384
|
|
377
385
|
|
378
386
|
def batch_extract_file_sync(
|
@@ -404,7 +412,7 @@ def batch_extract_file_sync(
|
|
404
412
|
content=f"Error: {type(e).__name__}: {e!s}",
|
405
413
|
mime_type="text/plain",
|
406
414
|
metadata={ # type: ignore[typeddict-unknown-key]
|
407
|
-
"error":
|
415
|
+
"error": f"{type(e).__name__}: {e!s}",
|
408
416
|
"error_context": create_error_context(
|
409
417
|
operation="batch_extract_file_sync",
|
410
418
|
file_path=file_path,
|
@@ -455,7 +463,7 @@ def batch_extract_bytes_sync(
|
|
455
463
|
content=f"Error: {type(e).__name__}: {e!s}",
|
456
464
|
mime_type="text/plain",
|
457
465
|
metadata={ # type: ignore[typeddict-unknown-key]
|
458
|
-
"error":
|
466
|
+
"error": f"{type(e).__name__}: {e!s}",
|
459
467
|
"error_context": create_error_context(
|
460
468
|
operation="batch_extract_bytes_sync",
|
461
469
|
error=e,
|
@@ -469,7 +477,6 @@ def batch_extract_bytes_sync(
|
|
469
477
|
return (index, error_result)
|
470
478
|
|
471
479
|
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
472
|
-
# Avoid creating intermediate list, use enumerate directly
|
473
480
|
future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
|
474
481
|
|
475
482
|
results: list[ExtractionResult] = [None] * len(contents) # type: ignore[list-item]
|