kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_chunker.py +0 -15
  5. kreuzberg/_config.py +212 -292
  6. kreuzberg/_document_classification.py +20 -47
  7. kreuzberg/_entity_extraction.py +1 -122
  8. kreuzberg/_extractors/_base.py +4 -71
  9. kreuzberg/_extractors/_email.py +1 -15
  10. kreuzberg/_extractors/_html.py +9 -12
  11. kreuzberg/_extractors/_image.py +1 -25
  12. kreuzberg/_extractors/_pandoc.py +10 -147
  13. kreuzberg/_extractors/_pdf.py +38 -94
  14. kreuzberg/_extractors/_presentation.py +0 -99
  15. kreuzberg/_extractors/_spread_sheet.py +13 -55
  16. kreuzberg/_extractors/_structured.py +1 -4
  17. kreuzberg/_gmft.py +14 -199
  18. kreuzberg/_language_detection.py +1 -36
  19. kreuzberg/_mcp/__init__.py +0 -2
  20. kreuzberg/_mcp/server.py +3 -10
  21. kreuzberg/_mime_types.py +1 -19
  22. kreuzberg/_ocr/_base.py +4 -76
  23. kreuzberg/_ocr/_easyocr.py +124 -186
  24. kreuzberg/_ocr/_paddleocr.py +154 -224
  25. kreuzberg/_ocr/_table_extractor.py +184 -0
  26. kreuzberg/_ocr/_tesseract.py +797 -361
  27. kreuzberg/_playa.py +5 -31
  28. kreuzberg/_registry.py +0 -36
  29. kreuzberg/_types.py +588 -93
  30. kreuzberg/_utils/_cache.py +84 -138
  31. kreuzberg/_utils/_device.py +0 -74
  32. kreuzberg/_utils/_document_cache.py +0 -75
  33. kreuzberg/_utils/_errors.py +0 -50
  34. kreuzberg/_utils/_ocr_cache.py +136 -0
  35. kreuzberg/_utils/_pdf_lock.py +0 -16
  36. kreuzberg/_utils/_process_pool.py +17 -64
  37. kreuzberg/_utils/_quality.py +0 -60
  38. kreuzberg/_utils/_ref.py +32 -0
  39. kreuzberg/_utils/_serialization.py +0 -30
  40. kreuzberg/_utils/_string.py +9 -59
  41. kreuzberg/_utils/_sync.py +0 -77
  42. kreuzberg/_utils/_table.py +49 -101
  43. kreuzberg/_utils/_tmp.py +0 -9
  44. kreuzberg/cli.py +54 -74
  45. kreuzberg/extraction.py +39 -32
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
  47. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  48. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  49. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  50. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  51. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,8 +1,6 @@
1
- """Table processing and export utilities."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
- import csv
3
+ import io
6
4
  from typing import TYPE_CHECKING, Any
7
5
 
8
6
  if TYPE_CHECKING:
@@ -10,67 +8,38 @@ if TYPE_CHECKING:
10
8
 
11
9
 
12
10
  def export_table_to_csv(table: TableData, separator: str = ",") -> str:
13
- r"""Export a TableData object to CSV/TSV format.
14
-
15
- Args:
16
- table: TableData object containing DataFrame
17
- separator: Field separator ("," for CSV, "\t" for TSV)
18
-
19
- Returns:
20
- String representation in CSV/TSV format
21
- """
22
11
  if "df" not in table or table["df"] is None:
23
12
  return ""
24
13
 
25
- # Use pandas to_csv() direct string return instead of StringIO
26
- csv_output = table["df"].to_csv(sep=separator, index=False, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
27
- return str(csv_output).strip()
14
+ buffer = io.StringIO()
15
+ df = table["df"]
16
+ df.write_csv(buffer, separator=separator, include_header=True)
17
+ return buffer.getvalue().strip()
28
18
 
29
19
 
30
20
  def export_table_to_tsv(table: TableData) -> str:
31
- """Export a TableData object to TSV format.
32
-
33
- Args:
34
- table: TableData object containing DataFrame
35
-
36
- Returns:
37
- String representation in TSV format
38
- """
39
21
  return export_table_to_csv(table, separator="\t")
40
22
 
41
23
 
42
24
  def enhance_table_markdown(table: TableData) -> str:
43
- """Generate enhanced markdown table with better formatting.
44
-
45
- Args:
46
- table: TableData object
47
-
48
- Returns:
49
- Enhanced markdown table string
50
- """
51
25
  if "df" not in table or table["df"] is None:
52
26
  return table.get("text", "")
53
27
 
54
28
  df = table["df"]
55
29
 
56
- if df.empty:
30
+ if df.is_empty():
57
31
  return table.get("text", "")
58
32
 
59
- # Create enhanced markdown with proper alignment
60
33
  lines = []
61
34
 
62
- # Header row
63
35
  headers = [str(col).strip() for col in df.columns]
64
36
  lines.append("| " + " | ".join(headers) + " |")
65
37
 
66
- # Separator row with alignment hints
67
38
  lines.append(_generate_separator_row(df))
68
39
 
69
- # Analyze float columns to determine formatting strategy
70
40
  float_col_formatting = _analyze_float_columns(df)
71
41
 
72
- # Data rows with proper formatting
73
- for _, row in df.iterrows():
42
+ for row in df.iter_rows(named=True):
74
43
  formatted_row = _format_table_row(row, df, float_col_formatting)
75
44
  lines.append("| " + " | ".join(formatted_row) + " |")
76
45
 
@@ -78,79 +47,77 @@ def enhance_table_markdown(table: TableData) -> str:
78
47
 
79
48
 
80
49
  def _generate_separator_row(df: Any) -> str:
81
- """Generate separator row with proper alignment hints."""
82
50
  separators = []
83
51
  for col in df.columns:
84
- # Check if column contains mostly numbers for right alignment
85
- if df[col].dtype in ["int64", "float64"] or _is_numeric_column(df[col]):
86
- separators.append("---:") # Right align numbers
52
+ dtype_str = str(df[col].dtype)
53
+ if dtype_str in ["Int64", "Float64", "Int32", "Float32"] or _is_numeric_column(df[col]):
54
+ separators.append("---:")
87
55
  else:
88
- separators.append("---") # Left align text
56
+ separators.append("---")
89
57
  return "| " + " | ".join(separators) + " |"
90
58
 
91
59
 
92
60
  def _analyze_float_columns(df: Any) -> dict[str, str]:
93
- """Analyze float columns to determine formatting strategy."""
94
61
  float_col_formatting = {}
95
62
  for col in df.columns:
96
- if str(df[col].dtype) == "float64":
97
- non_null_values = df[col].dropna()
63
+ dtype_str = str(df[col].dtype)
64
+ if dtype_str in ["Float64", "Float32"]:
65
+ non_null_values = df[col].drop_nulls()
98
66
  if len(non_null_values) > 0:
99
- # If all non-null values are whole numbers, format as integers
100
- all_integers = all(val.is_integer() for val in non_null_values)
101
- float_col_formatting[col] = "int" if all_integers else "float"
67
+ try:
68
+ values_list = non_null_values.to_list()
69
+ all_integers = all(float(val).is_integer() for val in values_list if val is not None)
70
+ float_col_formatting[col] = "int" if all_integers else "float"
71
+ except (ValueError, AttributeError):
72
+ float_col_formatting[col] = "float"
102
73
  else:
103
74
  float_col_formatting[col] = "int"
104
75
  return float_col_formatting
105
76
 
106
77
 
107
78
  def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -> list[str]:
108
- """Format a single table row with proper value formatting."""
109
79
  formatted_row = []
110
80
  for col_name, value in row.items():
111
- if value is None or (isinstance(value, float) and str(value) == "nan"):
81
+ if value is None:
112
82
  formatted_row.append("")
113
- elif str(df[col_name].dtype) in ["int64", "int32"]:
114
- # For integer columns, format as integers
115
- formatted_row.append(str(int(value)))
116
- elif isinstance(value, float):
117
- # For float columns, use the determined formatting strategy
118
- if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
83
+ else:
84
+ dtype_str = str(df[col_name].dtype)
85
+ if dtype_str in ["Int64", "Int32"]:
119
86
  formatted_row.append(str(int(value)))
87
+ elif isinstance(value, float):
88
+ if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
89
+ formatted_row.append(str(int(value)))
90
+ else:
91
+ formatted_row.append(f"{value:.2f}")
120
92
  else:
121
- formatted_row.append(f"{value:.2f}")
122
- else:
123
- # Clean up text values
124
- clean_value = str(value).strip().replace("|", "\\|") # Escape pipes
125
- formatted_row.append(clean_value)
93
+ clean_value = str(value).strip().replace("|", "\\|")
94
+ formatted_row.append(clean_value)
126
95
  return formatted_row
127
96
 
128
97
 
129
98
  def _is_numeric_column(series: Any) -> bool:
130
- """Check if a pandas Series contains mostly numeric values."""
131
99
  if len(series) == 0:
132
100
  return False
133
101
 
134
102
  try:
135
- # Check if already numeric dtype first (fastest path)
136
- if str(series.dtype) in {"int64", "float64", "int32", "float32"}:
103
+ dtype_str = str(series.dtype)
104
+ if dtype_str in {"Int64", "Float64", "Int32", "Float32"}:
137
105
  return True
138
106
 
139
- # Sample-based approach for large series (>1000 rows)
140
107
  sample_size = min(100, len(series))
141
- if len(series) > 1000:
142
- sample_series = series.dropna().sample(n=sample_size, random_state=42)
143
- else:
144
- sample_series = series.dropna()
108
+ series_no_nulls = series.drop_nulls()
109
+
110
+ if len(series_no_nulls) == 0:
111
+ return False
112
+
113
+ sample_series = series_no_nulls.slice(0, sample_size) if len(series_no_nulls) > 1000 else series_no_nulls
145
114
 
146
115
  if len(sample_series) == 0:
147
116
  return False
148
117
 
149
- # Optimized numeric conversion - avoid exception overhead
150
118
  numeric_count = 0
151
- for val in sample_series:
119
+ for val in sample_series.to_list():
152
120
  val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
153
- # Quick check: if it contains only digits, decimal point, minus, plus, or e
154
121
  if val_str and all(c in "0123456789.-+eE" for c in val_str):
155
122
  try:
156
123
  float(val_str)
@@ -158,7 +125,6 @@ def _is_numeric_column(series: Any) -> bool:
158
125
  except (ValueError, TypeError):
159
126
  pass
160
127
 
161
- # Consider numeric if >70% of sampled values are numeric
162
128
  return (numeric_count / len(sample_series)) > 0.7
163
129
 
164
130
  except (ValueError, TypeError, ZeroDivisionError):
@@ -166,14 +132,6 @@ def _is_numeric_column(series: Any) -> bool:
166
132
 
167
133
 
168
134
  def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
169
- """Generate summary statistics for extracted tables.
170
-
171
- Args:
172
- tables: List of TableData objects
173
-
174
- Returns:
175
- Dictionary with table statistics
176
- """
177
135
  if not tables:
178
136
  return {
179
137
  "table_count": 0,
@@ -190,8 +148,8 @@ def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
190
148
  for table in tables:
191
149
  if "df" in table and table["df"] is not None:
192
150
  df = table["df"]
193
- total_rows += len(df)
194
- total_columns += len(df.columns)
151
+ total_rows += df.height
152
+ total_columns += df.width
195
153
 
196
154
  if "page_number" in table:
197
155
  page_num = table["page_number"]
@@ -213,14 +171,6 @@ def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
213
171
 
214
172
 
215
173
  def extract_table_structure_info(table: TableData) -> dict[str, Any]:
216
- """Extract structural information from a table.
217
-
218
- Args:
219
- table: TableData object
220
-
221
- Returns:
222
- Dictionary with structural information
223
- """
224
174
  info = {
225
175
  "has_headers": False,
226
176
  "row_count": 0,
@@ -236,25 +186,23 @@ def extract_table_structure_info(table: TableData) -> dict[str, Any]:
236
186
 
237
187
  df = table["df"]
238
188
 
239
- if df.empty:
189
+ if df.is_empty():
240
190
  return info
241
191
 
242
- info["row_count"] = len(df)
243
- info["column_count"] = len(df.columns)
244
- info["has_headers"] = len(df.columns) > 0
192
+ info["row_count"] = df.height
193
+ info["column_count"] = df.width
194
+ info["has_headers"] = df.width > 0
245
195
 
246
- # Analyze column types
247
196
  for col in df.columns:
248
197
  if _is_numeric_column(df[col]):
249
198
  info["numeric_columns"] += 1
250
199
  else:
251
200
  info["text_columns"] += 1
252
201
 
253
- # Calculate data density
254
- total_cells = len(df) * len(df.columns)
202
+ total_cells = df.height * df.width
255
203
  if total_cells > 0:
256
- empty_cells = df.isnull().sum().sum()
257
- info["empty_cells"] = int(empty_cells)
204
+ empty_cells = df.null_count().sum().item()
205
+ info["empty_cells"] = empty_cells
258
206
  info["data_density"] = (total_cells - empty_cells) / total_cells
259
207
 
260
208
  return info
kreuzberg/_utils/_tmp.py CHANGED
@@ -16,15 +16,6 @@ if TYPE_CHECKING: # pragma: no cover
16
16
  async def create_temp_file(
17
17
  extension: str, content: bytes | None = None
18
18
  ) -> tuple[Path, Callable[[], Coroutine[None, None, None]]]:
19
- """Create a temporary file that is closed.
20
-
21
- Args:
22
- extension: The file extension.
23
- content: The content to write to the file.
24
-
25
- Returns:
26
- The temporary file path.
27
- """
28
19
  file = await run_sync(NamedTemporaryFile, suffix=extension, delete=False)
29
20
  if content:
30
21
  await AsyncPath(file.name).write_bytes(content)
kreuzberg/cli.py CHANGED
@@ -1,5 +1,3 @@
1
- """Command-line interface for kreuzberg."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
  import json
@@ -84,11 +82,11 @@ def format_extraction_result(result: ExtractionResult, show_metadata: bool, outp
84
82
  return "\n".join(output_parts)
85
83
 
86
84
 
87
- def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
85
+ def _load_config(config_path: Path | None, verbose: bool) -> dict[str, Any]:
88
86
  """Load configuration from file or find default."""
89
87
  file_config = {}
90
- if config:
91
- file_config = load_config_from_file(config)
88
+ if config_path:
89
+ file_config = load_config_from_file(config_path)
92
90
  else:
93
91
  default_config = find_config_file()
94
92
  if default_config:
@@ -101,39 +99,38 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
101
99
  return file_config
102
100
 
103
101
 
104
- def _build_cli_args(
105
- force_ocr: bool,
106
- chunk_content: bool,
107
- extract_tables: bool,
108
- max_chars: int,
109
- max_overlap: int,
110
- ocr_backend: str | None,
111
- tesseract_lang: str | None,
112
- tesseract_psm: int | None,
113
- easyocr_languages: str | None,
114
- paddleocr_languages: str | None,
115
- ) -> dict[str, Any]:
102
+ def _build_cli_args(params: dict[str, Any]) -> dict[str, Any]:
116
103
  """Build CLI arguments dictionary."""
117
104
  cli_args: dict[str, Any] = {
118
- "force_ocr": force_ocr if force_ocr else None,
119
- "chunk_content": chunk_content if chunk_content else None,
120
- "extract_tables": extract_tables if extract_tables else None,
121
- "max_chars": max_chars if max_chars != DEFAULT_MAX_CHARACTERS else None,
122
- "max_overlap": max_overlap if max_overlap != DEFAULT_MAX_OVERLAP else None,
123
- "ocr_backend": ocr_backend,
105
+ "force_ocr": params["force_ocr"] if params["force_ocr"] else None,
106
+ "chunk_content": params["chunk_content"] if params["chunk_content"] else None,
107
+ "extract_tables": params["extract_tables"] if params["extract_tables"] else None,
108
+ "max_chars": params["max_chars"] if params["max_chars"] != DEFAULT_MAX_CHARACTERS else None,
109
+ "max_overlap": params["max_overlap"] if params["max_overlap"] != DEFAULT_MAX_OVERLAP else None,
110
+ "ocr_backend": params["ocr_backend"],
124
111
  }
125
112
 
126
- if ocr_backend == "tesseract" and (tesseract_lang or tesseract_psm is not None):
113
+ ocr_backend = params["ocr_backend"]
114
+ if ocr_backend == "tesseract" and (
115
+ params["tesseract_lang"]
116
+ or params["tesseract_psm"] is not None
117
+ or params["tesseract_output_format"]
118
+ or params["enable_table_detection"]
119
+ ):
127
120
  tesseract_config = {}
128
- if tesseract_lang:
129
- tesseract_config["language"] = tesseract_lang
130
- if tesseract_psm is not None:
131
- tesseract_config["psm"] = tesseract_psm # type: ignore[assignment]
121
+ if params["tesseract_lang"]:
122
+ tesseract_config["language"] = params["tesseract_lang"]
123
+ if params["tesseract_psm"] is not None:
124
+ tesseract_config["psm"] = params["tesseract_psm"]
125
+ if params["tesseract_output_format"]:
126
+ tesseract_config["output_format"] = params["tesseract_output_format"]
127
+ if params["enable_table_detection"]:
128
+ tesseract_config["enable_table_detection"] = True
132
129
  cli_args["tesseract_config"] = tesseract_config
133
- elif ocr_backend == "easyocr" and easyocr_languages:
134
- cli_args["easyocr_config"] = {"languages": easyocr_languages.split(",")}
135
- elif ocr_backend == "paddleocr" and paddleocr_languages:
136
- cli_args["paddleocr_config"] = {"languages": paddleocr_languages.split(",")}
130
+ elif ocr_backend == "easyocr" and params["easyocr_languages"]:
131
+ cli_args["easyocr_config"] = {"languages": params["easyocr_languages"].split(",")}
132
+ elif ocr_backend == "paddleocr" and params["paddleocr_languages"]:
133
+ cli_args["paddleocr_config"] = {"languages": params["paddleocr_languages"].split(",")}
137
134
 
138
135
  return cli_args
139
136
 
@@ -158,7 +155,7 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
158
155
  progress.add_task("Extracting text...", total=None)
159
156
 
160
157
  try:
161
- import magic # type: ignore[import-not-found] # noqa: PLC0415
158
+ import magic # type: ignore[import-not-found] # noqa: PLC0415
162
159
 
163
160
  mime_type = magic.from_buffer(input_bytes, mime=True)
164
161
  except ImportError: # pragma: no cover
@@ -188,7 +185,10 @@ def _write_output(
188
185
  if verbose:
189
186
  console.print(f"[green]✓[/green] Output written to: {output}")
190
187
  else:
191
- click.echo(formatted_output)
188
+ try:
189
+ click.echo(formatted_output)
190
+ except UnicodeEncodeError:
191
+ sys.stdout.buffer.write(formatted_output.encode("utf-8"))
192
192
 
193
193
 
194
194
  def handle_error(error: Exception, verbose: bool) -> None: # pragma: no cover
@@ -248,71 +248,51 @@ def cli(ctx: click.Context) -> None:
248
248
  @click.option(
249
249
  "--ocr-backend", type=OcrBackendParamType(), help="OCR backend to use (tesseract, easyocr, paddleocr, none)"
250
250
  )
251
- @click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
251
+ @click.option("--config", "config_file", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
252
252
  @click.option("--show-metadata", is_flag=True, help="Include metadata in output")
253
253
  @click.option("--output-format", type=click.Choice(["text", "json"]), default="text", help="Output format")
254
254
  @click.option("-v", "--verbose", is_flag=True, help="Verbose output for debugging")
255
255
  @click.option("--tesseract-lang", help="Tesseract language(s) (e.g., 'eng+deu')")
256
256
  @click.option("--tesseract-psm", type=int, help="Tesseract PSM mode (0-13)")
257
+ @click.option(
258
+ "--tesseract-output-format",
259
+ type=click.Choice(["text", "markdown", "tsv", "hocr"]),
260
+ help="Tesseract OCR output format (default: markdown)",
261
+ )
262
+ @click.option(
263
+ "--enable-table-detection", is_flag=True, help="Enable table extraction from scanned documents (with TSV format)"
264
+ )
257
265
  @click.option("--easyocr-languages", help="EasyOCR language codes (comma-separated, e.g., 'en,de')")
258
266
  @click.option("--paddleocr-languages", help="PaddleOCR language codes (comma-separated, e.g., 'en,german')")
259
267
  @click.pass_context
260
- def extract( # noqa: PLR0913
261
- _: click.Context,
262
- file: Path | None,
263
- output: Path | None,
264
- force_ocr: bool,
265
- chunk_content: bool,
266
- extract_tables: bool,
267
- max_chars: int,
268
- max_overlap: int,
269
- ocr_backend: str | None,
270
- config: Path | None,
271
- show_metadata: bool,
272
- output_format: str,
273
- verbose: bool,
274
- tesseract_lang: str | None,
275
- tesseract_psm: int | None,
276
- easyocr_languages: str | None,
277
- paddleocr_languages: str | None,
278
- ) -> None:
268
+ def extract(ctx: click.Context) -> None:
279
269
  """Extract text from a document.
280
270
 
281
271
  FILE can be a path to a document or '-' to read from stdin.
282
272
  If FILE is omitted, reads from stdin.
283
273
  """
274
+ params = ctx.params
284
275
  try:
285
- file_config = _load_config(config, verbose)
286
-
287
- cli_args = _build_cli_args(
288
- force_ocr,
289
- chunk_content,
290
- extract_tables,
291
- max_chars,
292
- max_overlap,
293
- ocr_backend,
294
- tesseract_lang,
295
- tesseract_psm,
296
- easyocr_languages,
297
- paddleocr_languages,
298
- )
276
+ file_config = _load_config(params["config_file"], params["verbose"])
277
+
278
+ cli_args = _build_cli_args(params)
299
279
 
300
280
  extraction_config = build_extraction_config(file_config, cli_args)
301
281
 
302
- result = _perform_extraction(file, extraction_config, verbose)
282
+ result = _perform_extraction(params["file"], extraction_config, params["verbose"])
303
283
 
304
- _write_output(result, output, show_metadata, output_format, verbose)
284
+ _write_output(result, params["output"], params["show_metadata"], params["output_format"], params["verbose"])
305
285
 
306
286
  except Exception as e: # noqa: BLE001
307
- handle_error(e, verbose)
287
+ handle_error(e, params["verbose"])
308
288
 
309
289
 
310
290
  @cli.command()
311
- @click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
312
- def config(config: Path | None) -> None:
291
+ @click.option("--config", "config_file", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
292
+ def config(config_file: Path | None) -> None:
313
293
  """Show current configuration."""
314
294
  try:
315
- config_path = config or find_config_file()
295
+ config_path = config_file or find_config_file()
316
296
 
317
297
  if config_path:
318
298
  file_config = load_config_from_file(config_path)
kreuzberg/extraction.py CHANGED
@@ -151,20 +151,22 @@ async def extract_file(
151
151
  """
152
152
  cache = get_document_cache()
153
153
  path = Path(file_path)
154
- cached_result = cache.get(path, config)
155
- if cached_result is not None:
156
- return cached_result
157
154
 
158
- if cache.is_processing(path, config):
159
- event = cache.mark_processing(path, config)
160
- await anyio.to_thread.run_sync(event.wait) # pragma: no cover
161
-
162
- # Try cache again after waiting for other process to complete # ~keep
163
- cached_result = cache.get(path, config) # pragma: no cover
164
- if cached_result is not None: # pragma: no cover
155
+ if config.use_cache:
156
+ cached_result = cache.get(path, config)
157
+ if cached_result is not None:
165
158
  return cached_result
166
159
 
167
- cache.mark_processing(path, config)
160
+ if cache.is_processing(path, config):
161
+ event = cache.mark_processing(path, config)
162
+ await anyio.to_thread.run_sync(event.wait) # pragma: no cover
163
+
164
+ # Try cache again after waiting for other process to complete # ~keep
165
+ cached_result = cache.get(path, config) # pragma: no cover
166
+ if cached_result is not None: # pragma: no cover
167
+ return cached_result
168
+
169
+ cache.mark_processing(path, config)
168
170
 
169
171
  try:
170
172
  if not path.exists():
@@ -183,11 +185,13 @@ async def extract_file(
183
185
 
184
186
  result = await _validate_and_post_process_async(result=result, config=config, file_path=path)
185
187
 
186
- cache.set(path, config, result)
188
+ if config.use_cache:
189
+ cache.set(path, config, result)
187
190
 
188
191
  return result
189
192
  finally:
190
- cache.mark_complete(path, config)
193
+ if config.use_cache:
194
+ cache.mark_complete(path, config)
191
195
 
192
196
 
193
197
  async def batch_extract_file(
@@ -224,7 +228,7 @@ async def batch_extract_file(
224
228
  content=f"Error: {type(e).__name__}: {e!s}",
225
229
  mime_type="text/plain",
226
230
  metadata={ # type: ignore[typeddict-unknown-key]
227
- "error": True,
231
+ "error": f"{type(e).__name__}: {e!s}",
228
232
  "error_context": create_error_context(
229
233
  operation="batch_extract_file",
230
234
  file_path=path,
@@ -273,7 +277,7 @@ async def batch_extract_bytes(
273
277
  content=f"Error: {type(e).__name__}: {e!s}",
274
278
  mime_type="text/plain",
275
279
  metadata={ # type: ignore[typeddict-unknown-key]
276
- "error": True,
280
+ "error": f"{type(e).__name__}: {e!s}",
277
281
  "error_context": create_error_context(
278
282
  operation="batch_extract_bytes",
279
283
  error=e,
@@ -336,20 +340,22 @@ def extract_file_sync(
336
340
  """
337
341
  cache = get_document_cache()
338
342
  path = Path(file_path)
339
- cached_result = cache.get(path, config)
340
- if cached_result is not None:
341
- return cached_result
342
343
 
343
- if cache.is_processing(path, config):
344
- event = cache.mark_processing(path, config)
345
- event.wait() # pragma: no cover
346
-
347
- # Try cache again after waiting for other process to complete # ~keep
348
- cached_result = cache.get(path, config) # pragma: no cover
349
- if cached_result is not None: # pragma: no cover
344
+ if config.use_cache:
345
+ cached_result = cache.get(path, config)
346
+ if cached_result is not None:
350
347
  return cached_result
351
348
 
352
- cache.mark_processing(path, config)
349
+ if cache.is_processing(path, config):
350
+ event = cache.mark_processing(path, config)
351
+ event.wait() # pragma: no cover
352
+
353
+ # Try cache again after waiting for other process to complete # ~keep
354
+ cached_result = cache.get(path, config) # pragma: no cover
355
+ if cached_result is not None: # pragma: no cover
356
+ return cached_result
357
+
358
+ cache.mark_processing(path, config)
353
359
 
354
360
  try:
355
361
  if not path.exists():
@@ -360,7 +366,7 @@ def extract_file_sync(
360
366
  result = extractor.extract_path_sync(Path(file_path))
361
367
  else:
362
368
  result = ExtractionResult(
363
- content=Path(file_path).read_text(),
369
+ content=Path(file_path).read_text(encoding="utf-8"),
364
370
  chunks=[],
365
371
  mime_type=mime_type,
366
372
  metadata={},
@@ -368,11 +374,13 @@ def extract_file_sync(
368
374
 
369
375
  result = _validate_and_post_process_sync(result=result, config=config, file_path=path)
370
376
 
371
- cache.set(path, config, result)
377
+ if config.use_cache:
378
+ cache.set(path, config, result)
372
379
 
373
380
  return result
374
381
  finally:
375
- cache.mark_complete(path, config)
382
+ if config.use_cache:
383
+ cache.mark_complete(path, config)
376
384
 
377
385
 
378
386
  def batch_extract_file_sync(
@@ -404,7 +412,7 @@ def batch_extract_file_sync(
404
412
  content=f"Error: {type(e).__name__}: {e!s}",
405
413
  mime_type="text/plain",
406
414
  metadata={ # type: ignore[typeddict-unknown-key]
407
- "error": True,
415
+ "error": f"{type(e).__name__}: {e!s}",
408
416
  "error_context": create_error_context(
409
417
  operation="batch_extract_file_sync",
410
418
  file_path=file_path,
@@ -455,7 +463,7 @@ def batch_extract_bytes_sync(
455
463
  content=f"Error: {type(e).__name__}: {e!s}",
456
464
  mime_type="text/plain",
457
465
  metadata={ # type: ignore[typeddict-unknown-key]
458
- "error": True,
466
+ "error": f"{type(e).__name__}: {e!s}",
459
467
  "error_context": create_error_context(
460
468
  operation="batch_extract_bytes_sync",
461
469
  error=e,
@@ -469,7 +477,6 @@ def batch_extract_bytes_sync(
469
477
  return (index, error_result)
470
478
 
471
479
  with ThreadPoolExecutor(max_workers=max_workers) as executor:
472
- # Avoid creating intermediate list, use enumerate directly
473
480
  future_to_index = {executor.submit(extract_single, (i, content)): i for i, content in enumerate(contents)}
474
481
 
475
482
  results: list[ExtractionResult] = [None] * len(contents) # type: ignore[list-item]