kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in their public registries.
Files changed (46)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_config.py +248 -204
  5. kreuzberg/_document_classification.py +0 -8
  6. kreuzberg/_entity_extraction.py +1 -93
  7. kreuzberg/_extractors/_base.py +0 -5
  8. kreuzberg/_extractors/_email.py +1 -11
  9. kreuzberg/_extractors/_html.py +9 -12
  10. kreuzberg/_extractors/_image.py +1 -23
  11. kreuzberg/_extractors/_pandoc.py +10 -89
  12. kreuzberg/_extractors/_pdf.py +39 -92
  13. kreuzberg/_extractors/_presentation.py +0 -17
  14. kreuzberg/_extractors/_spread_sheet.py +13 -53
  15. kreuzberg/_extractors/_structured.py +1 -4
  16. kreuzberg/_gmft.py +14 -138
  17. kreuzberg/_language_detection.py +1 -22
  18. kreuzberg/_mcp/__init__.py +0 -2
  19. kreuzberg/_mcp/server.py +3 -10
  20. kreuzberg/_mime_types.py +1 -2
  21. kreuzberg/_ocr/_easyocr.py +21 -108
  22. kreuzberg/_ocr/_paddleocr.py +16 -94
  23. kreuzberg/_ocr/_table_extractor.py +260 -0
  24. kreuzberg/_ocr/_tesseract.py +906 -264
  25. kreuzberg/_playa.py +5 -4
  26. kreuzberg/_types.py +638 -40
  27. kreuzberg/_utils/_cache.py +88 -90
  28. kreuzberg/_utils/_device.py +0 -18
  29. kreuzberg/_utils/_document_cache.py +0 -2
  30. kreuzberg/_utils/_errors.py +0 -3
  31. kreuzberg/_utils/_pdf_lock.py +0 -2
  32. kreuzberg/_utils/_process_pool.py +19 -19
  33. kreuzberg/_utils/_quality.py +0 -43
  34. kreuzberg/_utils/_ref.py +48 -0
  35. kreuzberg/_utils/_serialization.py +0 -5
  36. kreuzberg/_utils/_string.py +9 -39
  37. kreuzberg/_utils/_sync.py +0 -1
  38. kreuzberg/_utils/_table.py +50 -57
  39. kreuzberg/cli.py +54 -74
  40. kreuzberg/extraction.py +39 -32
  41. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
  42. kreuzberg-3.13.0.dist-info/RECORD +56 -0
  43. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  44. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
  45. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_ref.py ADDED
@@ -0,0 +1,48 @@
+ from __future__ import annotations
+
+ from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar, cast
+
+ if TYPE_CHECKING:
+     from collections.abc import Callable
+
+ T = TypeVar("T")
+
+
+ class Ref(Generic[T]):
+     """A reference container that manages singleton instances without global variables.
+
+     This provides a clean alternative to global variables by using a registry pattern
+     with type safety.
+     """
+
+     _instances: ClassVar[dict[str, Any]] = {}
+
+     def __init__(self, name: str, factory: Callable[[], T]) -> None:
+         """Initialize a reference container.
+
+         Args:
+             name: Unique name for this reference
+             factory: Factory function to create the instance when needed
+         """
+         self.name = name
+         self.factory = factory
+
+     def get(self) -> T:
+         """Get the singleton instance, creating it if it doesn't exist."""
+         if self.name not in self._instances:
+             self._instances[self.name] = self.factory()
+         return cast("T", self._instances[self.name])
+
+     def clear(self) -> None:
+         """Clear the singleton instance."""
+         if self.name in self._instances:
+             del self._instances[self.name]
+
+     def is_initialized(self) -> bool:
+         """Check if the singleton instance exists."""
+         return self.name in self._instances
+
+     @classmethod
+     def clear_all(cls) -> None:
+         """Clear all singleton instances."""
+         cls._instances.clear()
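For orientation, a minimal usage sketch of the new `Ref` container (the `_make_session` factory and its return value are made up for illustration):

```python
from kreuzberg._utils._ref import Ref


def _make_session() -> dict[str, str]:
    # Stand-in for an expensive resource (OCR engine, HTTP client, ...).
    return {"backend": "tesseract"}


_session_ref = Ref("ocr_session", _make_session)

session = _session_ref.get()  # factory runs on first access
assert _session_ref.is_initialized()
assert _session_ref.get() is session  # later calls return the same instance
_session_ref.clear()  # next get() re-creates the instance
```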
kreuzberg/_utils/_serialization.py CHANGED
@@ -1,5 +1,3 @@
- """Fast serialization utilities using msgspec."""
-
  from __future__ import annotations

  from dataclasses import is_dataclass
@@ -12,7 +10,6 @@ from msgspec.msgpack import decode, encode
  T = TypeVar("T")


- # Define dict method names in priority order
  _DICT_METHOD_NAMES = (
      "to_dict",
      "as_dict",
@@ -32,14 +29,12 @@ def encode_hook(obj: Any) -> Any:
      if isinstance(obj, Exception):
          return {"message": str(obj), "type": type(obj).__name__}

-     # Check for dict-like methods more efficiently using any() with generator
      for attr_name in _DICT_METHOD_NAMES:
          method = getattr(obj, attr_name, None)
          if method is not None and callable(method):
              return method()

      if is_dataclass(obj) and not isinstance(obj, type):
-         # Use msgspec.to_builtins for more efficient conversion
          return msgspec.to_builtins(obj)

      if hasattr(obj, "save") and hasattr(obj, "format"):
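`encode_hook` is the fallback msgspec consults for types it cannot serialize natively; it probes for `to_dict`-style methods before the dataclass path. A small sketch (the `PageResult` class is invented for the example):

```python
import msgspec

from kreuzberg._utils._serialization import encode_hook


class PageResult:
    """Hypothetical object exposing a to_dict() method."""

    def to_dict(self) -> dict[str, int]:
        return {"pages": 3}


# enc_hook is called only for objects msgspec does not support natively.
payload = msgspec.msgpack.encode(PageResult(), enc_hook=encode_hook)
print(msgspec.msgpack.decode(payload))  # {'pages': 3}
```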
kreuzberg/_utils/_string.py CHANGED
@@ -7,28 +7,21 @@ from functools import lru_cache
  import chardetng_py

- # Compile regex patterns once at module level for performance
  _WHITESPACE_PATTERN = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
  _NEWLINES_PATTERN = re.compile(r"\n+")
  _MOJIBAKE_PATTERNS = {
-     # Hebrew as Cyrillic patterns
      "hebrew_as_cyrillic": re.compile(r"[\u0400-\u04FF]{3,}"),
-     # Control characters that shouldn't appear in text
      "control_chars": re.compile(r"[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F-\x9F]"),
-     # Unicode replacement characters
      "replacement_chars": re.compile(r"\uFFFD+"),
-     # Isolated combining marks (likely encoding issues)
      "isolated_combining": re.compile(r"[\u0300-\u036F](?![^\u0300-\u036F])"),
  }

- # Simple cache for encoding detection (in-memory, session-scoped)
  _encoding_cache: dict[str, str] = {}


  @lru_cache(maxsize=128)
  def _get_encoding_cache_key(data_hash: str, size: int) -> str:
      """Generate cache key for encoding detection."""
-     # Use string interpolation which is faster than format strings for simple cases
      return f"{data_hash}:{size}"


@@ -45,14 +38,12 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
      if not byte_data:
          return ""

-     # Try provided encoding first (fastest path)
      if encoding:
          with suppress(UnicodeDecodeError, LookupError):
              decoded = byte_data.decode(encoding)
              return _fix_mojibake(decoded)

-     # Check cache for similar content (performance optimization)
-     data_hash = hashlib.sha256(byte_data[:1024]).hexdigest()[:16]  # Hash first 1KB
+     data_hash = hashlib.sha256(byte_data[:1024]).hexdigest()[:16]
      cache_key = _get_encoding_cache_key(data_hash, len(byte_data))

      if cache_key in _encoding_cache:
@@ -61,25 +52,22 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
              decoded = byte_data.decode(cached_encoding)
              return _fix_mojibake(decoded)

-     # Use chardetng for better performance than charset-normalizer
      detected_encoding = chardetng_py.detect(byte_data)
      if detected_encoding:
          with suppress(UnicodeDecodeError, LookupError):
              decoded = byte_data.decode(detected_encoding)
-             # Cache successful encoding detection
-             if len(_encoding_cache) < 1000:  # Prevent unlimited growth
+             if len(_encoding_cache) < 1000:  # Prevent unlimited growth ~keep
                  _encoding_cache[cache_key] = detected_encoding
              return _fix_mojibake(decoded)

-     # Try multiple encodings with confidence scoring
      encodings_to_try = [
          "utf-8",
-         "windows-1255",  # Hebrew
-         "iso-8859-8",  # Hebrew
-         "windows-1256",  # Arabic
-         "iso-8859-6",  # Arabic
-         "windows-1252",  # Western European
-         "cp1251",  # Cyrillic
+         "windows-1255",  # Hebrew ~keep
+         "iso-8859-8",  # Hebrew ~keep
+         "windows-1256",  # Arabic ~keep
+         "iso-8859-6",  # Arabic ~keep
+         "windows-1252",  # Western European ~keep
+         "cp1251",  # Cyrillic ~keep
      ]

      best_result = None
@@ -96,7 +84,6 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
      if best_result and best_confidence > 0.5:
          return _fix_mojibake(best_result)

-     # Final fallback
      return byte_data.decode("latin-1", errors="replace")


@@ -109,25 +96,19 @@ def _calculate_text_confidence(text: str) -> float:
      if total_chars == 0:
          return 0.0

-     # Check for common encoding problems - compile patterns once
      replacement_count = len(_MOJIBAKE_PATTERNS["replacement_chars"].findall(text))
      control_count = len(_MOJIBAKE_PATTERNS["control_chars"].findall(text))

-     # Penalize replacement and control characters
      penalty = (replacement_count + control_count * 2) / total_chars

-     # Bonus for readable character ranges - more efficient counting
-     # Use generator expression with early termination
      readable_chars = sum(1 for c in text if c.isprintable() or c.isspace())
      readability_score = readable_chars / total_chars

-     # Check for suspicious Cyrillic that might be misencoded Hebrew
      cyrillic_matches = _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].findall(text)
      if cyrillic_matches:
-         # Calculate total length more efficiently
          cyrillic_length = sum(len(match) for match in cyrillic_matches)
          if cyrillic_length > total_chars * 0.1:
-             penalty += 0.3  # Heavy penalty for likely mojibake
+             penalty += 0.3

      return max(0.0, min(1.0, readability_score - penalty))

@@ -137,19 +118,13 @@ def _fix_mojibake(text: str) -> str:
      if not text:
          return text

-     # Remove control characters
      text = _MOJIBAKE_PATTERNS["control_chars"].sub("", text)

-     # Remove replacement characters
      text = _MOJIBAKE_PATTERNS["replacement_chars"].sub("", text)

-     # Remove isolated combining marks
      text = _MOJIBAKE_PATTERNS["isolated_combining"].sub("", text)

-     # Try to fix Hebrew encoded as Cyrillic (common Windows-1255 -> CP1251 confusion)
      if _MOJIBAKE_PATTERNS["hebrew_as_cyrillic"].search(text):
-         # This is a heuristic fix - in practice, you'd need actual character mapping
-         # For now, we flag it for manual review by keeping the text but adding a marker
          pass

      return text
@@ -167,19 +142,14 @@ def normalize_spaces(text: str) -> str:
      if not text or not text.strip():
          return ""

-     # Split by double newlines to preserve paragraph breaks
      paragraphs = text.split("\n\n")

      result_paragraphs = []

      for paragraph in paragraphs:
-         # Use pre-compiled patterns for better performance
-         # Replace multiple whitespace (except newlines) with single space
          cleaned = _WHITESPACE_PATTERN.sub(" ", paragraph)
-         # Clean up multiple newlines within paragraph (keep single newlines)
          cleaned = _NEWLINES_PATTERN.sub("\n", cleaned)

-         # Process lines efficiently - manual loop avoids double strip() calls
          lines = []
          for line in cleaned.split("\n"):
              stripped_line = line.strip()
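Taken together, `safe_decode` tries the caller's encoding, then the cache, then chardetng, then a fixed candidate list scored by `_calculate_text_confidence`. A sketch of the public entry points (the outputs depend on detection, so the comments are only indicative):

```python
from kreuzberg._utils._string import normalize_spaces, safe_decode

# An explicit encoding short-circuits detection entirely.
print(safe_decode(b"caf\xe9", encoding="windows-1252"))  # "café"

# Without a hint, detection plus confidence scoring decides;
# Hebrew bytes in windows-1255 should survive the round trip.
raw = "שלום עולם".encode("windows-1255")
print(safe_decode(raw))

# normalize_spaces collapses runs of spaces/tabs within each paragraph
# while preserving blank-line paragraph breaks.
print(normalize_spaces("a  \t b\n\n\nc"))
```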
kreuzberg/_utils/_sync.py CHANGED
@@ -28,7 +28,6 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
      Returns:
          The result of the synchronous function.
      """
-     # Optimize: only create partial if we have kwargs
      if kwargs:
          handler = partial(sync_fn, **kwargs)
          return cast("T", await any_io_run_sync(handler, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
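`run_sync` only wraps the callable in `functools.partial` when keyword arguments are present; positional arguments pass straight through to AnyIO's thread offloading. A sketch under that signature:

```python
import anyio

from kreuzberg._utils._sync import run_sync


def slow_add(a: int, b: int = 0) -> int:
    return a + b  # stands in for any blocking call


async def main() -> None:
    print(await run_sync(slow_add, 1, b=2))  # kwargs -> partial wrapper
    print(await run_sync(slow_add, 1))  # positional only -> no wrapper


anyio.run(main)
```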
kreuzberg/_utils/_table.py CHANGED
@@ -1,8 +1,6 @@
- """Table processing and export utilities."""
-
  from __future__ import annotations

- import csv
+ import io
  from typing import TYPE_CHECKING, Any

  if TYPE_CHECKING:
@@ -22,9 +20,10 @@ def export_table_to_csv(table: TableData, separator: str = ",") -> str:
      if "df" not in table or table["df"] is None:
          return ""

-     # Use pandas to_csv() direct string return instead of StringIO
-     csv_output = table["df"].to_csv(sep=separator, index=False, quoting=csv.QUOTE_MINIMAL, lineterminator="\n")
-     return str(csv_output).strip()
+     buffer = io.StringIO()
+     df = table["df"]
+     df.write_csv(buffer, separator=separator, include_header=True)
+     return buffer.getvalue().strip()


  def export_table_to_tsv(table: TableData) -> str:
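This release swaps pandas for polars in the table utilities. polars' `write_csv` writes into a buffer (or path) rather than returning a string the way pandas' `to_csv` does, hence the `StringIO` indirection above. A standalone sketch with a toy frame:

```python
import io

import polars as pl

df = pl.DataFrame({"name": ["a", "b"], "score": [1.5, 2.0]})

buffer = io.StringIO()
df.write_csv(buffer, separator=",", include_header=True)
print(buffer.getvalue().strip())
# name,score
# a,1.5
# b,2.0
```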
@@ -53,24 +52,19 @@ def enhance_table_markdown(table: TableData) -> str:

      df = table["df"]

-     if df.empty:
+     if df.is_empty():
          return table.get("text", "")

-     # Create enhanced markdown with proper alignment
      lines = []

-     # Header row
      headers = [str(col).strip() for col in df.columns]
      lines.append("| " + " | ".join(headers) + " |")

-     # Separator row with alignment hints
      lines.append(_generate_separator_row(df))

-     # Analyze float columns to determine formatting strategy
      float_col_formatting = _analyze_float_columns(df)

-     # Data rows with proper formatting
-     for _, row in df.iterrows():
+     for row in df.iter_rows(named=True):
          formatted_row = _format_table_row(row, df, float_col_formatting)
          lines.append("| " + " | ".join(formatted_row) + " |")

@@ -81,11 +75,11 @@ def _generate_separator_row(df: Any) -> str:
      """Generate separator row with proper alignment hints."""
      separators = []
      for col in df.columns:
-         # Check if column contains mostly numbers for right alignment
-         if df[col].dtype in ["int64", "float64"] or _is_numeric_column(df[col]):
-             separators.append("---:")  # Right align numbers
+         dtype_str = str(df[col].dtype)
+         if dtype_str in ["Int64", "Float64", "Int32", "Float32"] or _is_numeric_column(df[col]):
+             separators.append("---:")
          else:
-             separators.append("---")  # Left align text
+             separators.append("---")
      return "| " + " | ".join(separators) + " |"


@@ -93,12 +87,16 @@ def _analyze_float_columns(df: Any) -> dict[str, str]:
      """Analyze float columns to determine formatting strategy."""
      float_col_formatting = {}
      for col in df.columns:
-         if str(df[col].dtype) == "float64":
-             non_null_values = df[col].dropna()
+         dtype_str = str(df[col].dtype)
+         if dtype_str in ["Float64", "Float32"]:
+             non_null_values = df[col].drop_nulls()
              if len(non_null_values) > 0:
-                 # If all non-null values are whole numbers, format as integers
-                 all_integers = all(val.is_integer() for val in non_null_values)
-                 float_col_formatting[col] = "int" if all_integers else "float"
+                 try:
+                     values_list = non_null_values.to_list()
+                     all_integers = all(float(val).is_integer() for val in values_list if val is not None)
+                     float_col_formatting[col] = "int" if all_integers else "float"
+                 except (ValueError, AttributeError):
+                     float_col_formatting[col] = "float"
              else:
                  float_col_formatting[col] = "int"
      return float_col_formatting
@@ -108,49 +106,47 @@ def _format_table_row(row: Any, df: Any, float_col_formatting: dict[str, str]) -
      """Format a single table row with proper value formatting."""
      formatted_row = []
      for col_name, value in row.items():
-         if value is None or (isinstance(value, float) and str(value) == "nan"):
+         if value is None:
              formatted_row.append("")
-         elif str(df[col_name].dtype) in ["int64", "int32"]:
-             # For integer columns, format as integers
-             formatted_row.append(str(int(value)))
-         elif isinstance(value, float):
-             # For float columns, use the determined formatting strategy
-             if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
+         else:
+             dtype_str = str(df[col_name].dtype)
+             if dtype_str in ["Int64", "Int32"]:
                  formatted_row.append(str(int(value)))
+             elif isinstance(value, float):
+                 if col_name in float_col_formatting and float_col_formatting[col_name] == "int":
+                     formatted_row.append(str(int(value)))
+                 else:
+                     formatted_row.append(f"{value:.2f}")
              else:
-                 formatted_row.append(f"{value:.2f}")
-         else:
-             # Clean up text values
-             clean_value = str(value).strip().replace("|", "\\|")  # Escape pipes
-             formatted_row.append(clean_value)
+                 clean_value = str(value).strip().replace("|", "\\|")
+                 formatted_row.append(clean_value)
      return formatted_row


  def _is_numeric_column(series: Any) -> bool:
-     """Check if a pandas Series contains mostly numeric values."""
+     """Check if a polars Series contains mostly numeric values."""
      if len(series) == 0:
          return False

      try:
-         # Check if already numeric dtype first (fastest path)
-         if str(series.dtype) in {"int64", "float64", "int32", "float32"}:
+         dtype_str = str(series.dtype)
+         if dtype_str in {"Int64", "Float64", "Int32", "Float32"}:
              return True

-         # Sample-based approach for large series (>1000 rows)
          sample_size = min(100, len(series))
-         if len(series) > 1000:
-             sample_series = series.dropna().sample(n=sample_size, random_state=42)
-         else:
-             sample_series = series.dropna()
+         series_no_nulls = series.drop_nulls()
+
+         if len(series_no_nulls) == 0:
+             return False
+
+         sample_series = series_no_nulls.slice(0, sample_size) if len(series_no_nulls) > 1000 else series_no_nulls

          if len(sample_series) == 0:
              return False

-         # Optimized numeric conversion - avoid exception overhead
          numeric_count = 0
-         for val in sample_series:
+         for val in sample_series.to_list():
              val_str = str(val).replace(",", "").replace("$", "").replace("%", "")
-             # Quick check: if it contains only digits, decimal point, minus, plus, or e
              if val_str and all(c in "0123456789.-+eE" for c in val_str):
                  try:
                      float(val_str)
@@ -158,7 +154,6 @@ def _is_numeric_column(series: Any) -> bool:
                  except (ValueError, TypeError):
                      pass

-         # Consider numeric if >70% of sampled values are numeric
          return (numeric_count / len(sample_series)) > 0.7

      except (ValueError, TypeError, ZeroDivisionError):
@@ -190,8 +185,8 @@ def generate_table_summary(tables: list[TableData]) -> dict[str, Any]:
      for table in tables:
          if "df" in table and table["df"] is not None:
              df = table["df"]
-             total_rows += len(df)
-             total_columns += len(df.columns)
+             total_rows += df.height
+             total_columns += df.width

              if "page_number" in table:
                  page_num = table["page_number"]
@@ -236,25 +231,23 @@ def extract_table_structure_info(table: TableData) -> dict[str, Any]:

      df = table["df"]

-     if df.empty:
+     if df.is_empty():
          return info

-     info["row_count"] = len(df)
-     info["column_count"] = len(df.columns)
-     info["has_headers"] = len(df.columns) > 0
+     info["row_count"] = df.height
+     info["column_count"] = df.width
+     info["has_headers"] = df.width > 0

-     # Analyze column types
      for col in df.columns:
          if _is_numeric_column(df[col]):
              info["numeric_columns"] += 1
          else:
              info["text_columns"] += 1

-     # Calculate data density
-     total_cells = len(df) * len(df.columns)
+     total_cells = df.height * df.width
      if total_cells > 0:
-         empty_cells = df.isnull().sum().sum()
-         info["empty_cells"] = int(empty_cells)
+         empty_cells = df.null_count().sum().item()
+         info["empty_cells"] = empty_cells
          info["data_density"] = (total_cells - empty_cells) / total_cells

      return info
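The polars calls the rewritten helpers lean on, shown in isolation (toy data again):

```python
import polars as pl

df = pl.DataFrame({"item": ["apple", None], "price": [1.0, 2.5]})

assert not df.is_empty()
assert (df.height, df.width) == (2, 2)  # replaces len(df) / len(df.columns)

# iter_rows(named=True) yields a dict per row, replacing df.iterrows().
for row in df.iter_rows(named=True):
    print(row)  # {'item': 'apple', 'price': 1.0} ...

# drop_nulls() on a Series replaces pandas' dropna().
print(df["item"].drop_nulls().to_list())  # ['apple']
```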
kreuzberg/cli.py CHANGED
@@ -1,5 +1,3 @@
- """Command-line interface for kreuzberg."""
-
  from __future__ import annotations

  import json
@@ -84,11 +82,11 @@ def format_extraction_result(result: ExtractionResult, show_metadata: bool, outp
      return "\n".join(output_parts)


- def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
+ def _load_config(config_path: Path | None, verbose: bool) -> dict[str, Any]:
      """Load configuration from file or find default."""
      file_config = {}
-     if config:
-         file_config = load_config_from_file(config)
+     if config_path:
+         file_config = load_config_from_file(config_path)
      else:
          default_config = find_config_file()
          if default_config:
@@ -101,39 +99,38 @@ def _load_config(config: Path | None, verbose: bool) -> dict[str, Any]:
      return file_config


- def _build_cli_args(
-     force_ocr: bool,
-     chunk_content: bool,
-     extract_tables: bool,
-     max_chars: int,
-     max_overlap: int,
-     ocr_backend: str | None,
-     tesseract_lang: str | None,
-     tesseract_psm: int | None,
-     easyocr_languages: str | None,
-     paddleocr_languages: str | None,
- ) -> dict[str, Any]:
+ def _build_cli_args(params: dict[str, Any]) -> dict[str, Any]:
      """Build CLI arguments dictionary."""
      cli_args: dict[str, Any] = {
-         "force_ocr": force_ocr if force_ocr else None,
-         "chunk_content": chunk_content if chunk_content else None,
-         "extract_tables": extract_tables if extract_tables else None,
-         "max_chars": max_chars if max_chars != DEFAULT_MAX_CHARACTERS else None,
-         "max_overlap": max_overlap if max_overlap != DEFAULT_MAX_OVERLAP else None,
-         "ocr_backend": ocr_backend,
+         "force_ocr": params["force_ocr"] if params["force_ocr"] else None,
+         "chunk_content": params["chunk_content"] if params["chunk_content"] else None,
+         "extract_tables": params["extract_tables"] if params["extract_tables"] else None,
+         "max_chars": params["max_chars"] if params["max_chars"] != DEFAULT_MAX_CHARACTERS else None,
+         "max_overlap": params["max_overlap"] if params["max_overlap"] != DEFAULT_MAX_OVERLAP else None,
+         "ocr_backend": params["ocr_backend"],
      }

-     if ocr_backend == "tesseract" and (tesseract_lang or tesseract_psm is not None):
+     ocr_backend = params["ocr_backend"]
+     if ocr_backend == "tesseract" and (
+         params["tesseract_lang"]
+         or params["tesseract_psm"] is not None
+         or params["tesseract_output_format"]
+         or params["enable_table_detection"]
+     ):
          tesseract_config = {}
-         if tesseract_lang:
-             tesseract_config["language"] = tesseract_lang
-         if tesseract_psm is not None:
-             tesseract_config["psm"] = tesseract_psm  # type: ignore[assignment]
+         if params["tesseract_lang"]:
+             tesseract_config["language"] = params["tesseract_lang"]
+         if params["tesseract_psm"] is not None:
+             tesseract_config["psm"] = params["tesseract_psm"]
+         if params["tesseract_output_format"]:
+             tesseract_config["output_format"] = params["tesseract_output_format"]
+         if params["enable_table_detection"]:
+             tesseract_config["enable_table_detection"] = True
          cli_args["tesseract_config"] = tesseract_config
-     elif ocr_backend == "easyocr" and easyocr_languages:
-         cli_args["easyocr_config"] = {"languages": easyocr_languages.split(",")}
-     elif ocr_backend == "paddleocr" and paddleocr_languages:
-         cli_args["paddleocr_config"] = {"languages": paddleocr_languages.split(",")}
+     elif ocr_backend == "easyocr" and params["easyocr_languages"]:
+         cli_args["easyocr_config"] = {"languages": params["easyocr_languages"].split(",")}
+     elif ocr_backend == "paddleocr" and params["paddleocr_languages"]:
+         cli_args["paddleocr_config"] = {"languages": params["paddleocr_languages"].split(",")}

      return cli_args

@@ -158,7 +155,7 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
          progress.add_task("Extracting text...", total=None)

          try:
-             import magic  # type: ignore[import-not-found]  # noqa: PLC0415
+             import magic  # type: ignore[import-not-found] # noqa: PLC0415

              mime_type = magic.from_buffer(input_bytes, mime=True)
          except ImportError:  # pragma: no cover
@@ -188,7 +185,10 @@ def _write_output(
      if verbose:
          console.print(f"[green]✓[/green] Output written to: {output}")
      else:
-         click.echo(formatted_output)
+         try:
+             click.echo(formatted_output)
+         except UnicodeEncodeError:
+             sys.stdout.buffer.write(formatted_output.encode("utf-8"))


  def handle_error(error: Exception, verbose: bool) -> None:  # pragma: no cover
@@ -248,71 +248,51 @@ def cli(ctx: click.Context) -> None:
  @click.option(
      "--ocr-backend", type=OcrBackendParamType(), help="OCR backend to use (tesseract, easyocr, paddleocr, none)"
  )
- @click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
+ @click.option("--config", "config_file", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
  @click.option("--show-metadata", is_flag=True, help="Include metadata in output")
  @click.option("--output-format", type=click.Choice(["text", "json"]), default="text", help="Output format")
  @click.option("-v", "--verbose", is_flag=True, help="Verbose output for debugging")
  @click.option("--tesseract-lang", help="Tesseract language(s) (e.g., 'eng+deu')")
  @click.option("--tesseract-psm", type=int, help="Tesseract PSM mode (0-13)")
+ @click.option(
+     "--tesseract-output-format",
+     type=click.Choice(["text", "markdown", "tsv", "hocr"]),
+     help="Tesseract OCR output format (default: markdown)",
+ )
+ @click.option(
+     "--enable-table-detection", is_flag=True, help="Enable table extraction from scanned documents (with TSV format)"
+ )
  @click.option("--easyocr-languages", help="EasyOCR language codes (comma-separated, e.g., 'en,de')")
  @click.option("--paddleocr-languages", help="PaddleOCR language codes (comma-separated, e.g., 'en,german')")
  @click.pass_context
- def extract(  # noqa: PLR0913
-     _: click.Context,
-     file: Path | None,
-     output: Path | None,
-     force_ocr: bool,
-     chunk_content: bool,
-     extract_tables: bool,
-     max_chars: int,
-     max_overlap: int,
-     ocr_backend: str | None,
-     config: Path | None,
-     show_metadata: bool,
-     output_format: str,
-     verbose: bool,
-     tesseract_lang: str | None,
-     tesseract_psm: int | None,
-     easyocr_languages: str | None,
-     paddleocr_languages: str | None,
- ) -> None:
+ def extract(ctx: click.Context) -> None:
      """Extract text from a document.

      FILE can be a path to a document or '-' to read from stdin.
      If FILE is omitted, reads from stdin.
      """
+     params = ctx.params
      try:
-         file_config = _load_config(config, verbose)
-
-         cli_args = _build_cli_args(
-             force_ocr,
-             chunk_content,
-             extract_tables,
-             max_chars,
-             max_overlap,
-             ocr_backend,
-             tesseract_lang,
-             tesseract_psm,
-             easyocr_languages,
-             paddleocr_languages,
-         )
+         file_config = _load_config(params["config_file"], params["verbose"])
+
+         cli_args = _build_cli_args(params)

          extraction_config = build_extraction_config(file_config, cli_args)

-         result = _perform_extraction(file, extraction_config, verbose)
+         result = _perform_extraction(params["file"], extraction_config, params["verbose"])

-         _write_output(result, output, show_metadata, output_format, verbose)
+         _write_output(result, params["output"], params["show_metadata"], params["output_format"], params["verbose"])

      except Exception as e:  # noqa: BLE001
-         handle_error(e, verbose)
+         handle_error(e, params["verbose"])


  @cli.command()
- @click.option("--config", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
- def config(config: Path | None) -> None:
+ @click.option("--config", "config_file", type=click.Path(exists=True, path_type=Path), help="Configuration file path")
+ def config(config_file: Path | None) -> None:
      """Show current configuration."""
      try:
-         config_path = config or find_config_file()
+         config_path = config_file or find_config_file()

          if config_path:
              file_config = load_config_from_file(config_path)
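The `extract` command now reads everything from `ctx.params`, and two Tesseract options are new: `--tesseract-output-format` and `--enable-table-detection`. A hedged invocation sketch via click's test runner (assuming the group is importable as `kreuzberg.cli:cli`; `scan.pdf` is a placeholder path):

```python
from click.testing import CliRunner

from kreuzberg.cli import cli

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "extract",
        "scan.pdf",
        "--ocr-backend",
        "tesseract",
        "--tesseract-output-format",
        "tsv",
        "--enable-table-detection",
    ],
)
print(result.output)
```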