kreuzberg 3.14.0__py3-none-any.whl → 3.15.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. kreuzberg/__init__.py +6 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +156 -30
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_constants.py +2 -0
  6. kreuzberg/_document_classification.py +4 -6
  7. kreuzberg/_entity_extraction.py +9 -4
  8. kreuzberg/_extractors/_base.py +269 -3
  9. kreuzberg/_extractors/_email.py +95 -27
  10. kreuzberg/_extractors/_html.py +85 -7
  11. kreuzberg/_extractors/_image.py +23 -22
  12. kreuzberg/_extractors/_pandoc.py +106 -75
  13. kreuzberg/_extractors/_pdf.py +209 -99
  14. kreuzberg/_extractors/_presentation.py +72 -8
  15. kreuzberg/_extractors/_spread_sheet.py +25 -30
  16. kreuzberg/_mcp/server.py +345 -25
  17. kreuzberg/_mime_types.py +42 -0
  18. kreuzberg/_ocr/_easyocr.py +2 -2
  19. kreuzberg/_ocr/_paddleocr.py +1 -1
  20. kreuzberg/_ocr/_tesseract.py +74 -34
  21. kreuzberg/_types.py +182 -23
  22. kreuzberg/_utils/_cache.py +10 -4
  23. kreuzberg/_utils/_device.py +2 -4
  24. kreuzberg/_utils/_image_preprocessing.py +12 -39
  25. kreuzberg/_utils/_process_pool.py +29 -8
  26. kreuzberg/_utils/_quality.py +7 -2
  27. kreuzberg/_utils/_resource_managers.py +65 -0
  28. kreuzberg/_utils/_sync.py +36 -6
  29. kreuzberg/_utils/_tmp.py +37 -1
  30. kreuzberg/cli.py +34 -20
  31. kreuzberg/extraction.py +43 -27
  32. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/METADATA +2 -1
  33. kreuzberg-3.15.0.dist-info/RECORD +60 -0
  34. kreuzberg-3.14.0.dist-info/RECORD +0 -58
  35. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/WHEEL +0 -0
  36. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/entry_points.txt +0 -0
  37. {kreuzberg-3.14.0.dist-info → kreuzberg-3.15.0.dist-info}/licenses/LICENSE +0 -0
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Any
4
4
 
5
5
  from PIL import Image
6
6
 
7
+ from kreuzberg._constants import PDF_POINTS_PER_INCH
7
8
  from kreuzberg._types import ExtractionConfig, ImagePreprocessingMetadata
8
9
 
9
10
  if TYPE_CHECKING:
@@ -31,36 +32,30 @@ def calculate_optimal_dpi(
31
32
  Returns:
32
33
  Optimal DPI value that keeps image within max_dimension
33
34
  """
34
- # Convert points to inches (72 points = 1 inch)
35
- width_inches = page_width / 72.0
36
- height_inches = page_height / 72.0
35
+ width_inches = page_width / PDF_POINTS_PER_INCH
36
+ height_inches = page_height / PDF_POINTS_PER_INCH
37
37
 
38
- # Calculate pixel dimensions at target DPI
39
38
  target_width_pixels = int(width_inches * target_dpi)
40
39
  target_height_pixels = int(height_inches * target_dpi)
41
40
 
42
- # Check if target DPI results in oversized image
43
41
  max_pixel_dimension = max(target_width_pixels, target_height_pixels)
44
42
 
45
43
  if max_pixel_dimension <= max_dimension:
46
- # Target DPI is fine, clamp to min/max bounds
47
44
  return max(min_dpi, min(target_dpi, max_dpi))
48
45
 
49
- # Calculate maximum DPI that keeps within dimension constraints
50
46
  max_dpi_for_width = max_dimension / width_inches if width_inches > 0 else max_dpi
51
47
  max_dpi_for_height = max_dimension / height_inches if height_inches > 0 else max_dpi
52
48
  constrained_dpi = int(min(max_dpi_for_width, max_dpi_for_height))
53
49
 
54
- # Clamp to min/max bounds
55
50
  return max(min_dpi, min(constrained_dpi, max_dpi))
56
51
 
57
52
 
58
53
  def _extract_image_dpi(image: PILImage) -> tuple[tuple[float, float], float]:
59
54
  """Extract DPI information from image."""
60
- current_dpi_info = image.info.get("dpi", (72.0, 72.0))
55
+ current_dpi_info = image.info.get("dpi", (PDF_POINTS_PER_INCH, PDF_POINTS_PER_INCH))
61
56
  if isinstance(current_dpi_info, (list, tuple)):
62
57
  original_dpi = (float(current_dpi_info[0]), float(current_dpi_info[1]))
63
- current_dpi = float(current_dpi_info[0]) # Use horizontal DPI
58
+ current_dpi = float(current_dpi_info[0])
64
59
  else:
65
60
  current_dpi = float(current_dpi_info)
66
61
  original_dpi = (current_dpi, current_dpi)
@@ -88,10 +83,8 @@ def _calculate_target_dpi(
88
83
  """Calculate target DPI and whether it was auto-adjusted."""
89
84
  calculated_dpi = None
90
85
  if config.auto_adjust_dpi:
91
- # Convert pixel dimensions to approximate point dimensions
92
- # This is an approximation since we don't know the actual physical size
93
- approx_width_points = original_width * 72.0 / current_dpi
94
- approx_height_points = original_height * 72.0 / current_dpi
86
+ approx_width_points = original_width * PDF_POINTS_PER_INCH / current_dpi
87
+ approx_height_points = original_height * PDF_POINTS_PER_INCH / current_dpi
95
88
 
96
89
  optimal_dpi = calculate_optimal_dpi(
97
90
  approx_width_points,
@@ -131,7 +124,6 @@ def normalize_image_dpi(
131
124
  original_width, original_height = image.size
132
125
  original_dpi, current_dpi = _extract_image_dpi(image)
133
126
 
134
- # If no auto-adjustment and current DPI matches target and within limits, skip processing
135
127
  if _should_skip_processing(original_width, original_height, current_dpi, config):
136
128
  return image, ImagePreprocessingMetadata(
137
129
  original_dimensions=(original_width, original_height),
@@ -143,15 +135,12 @@ def normalize_image_dpi(
143
135
  skipped_resize=True,
144
136
  )
145
137
 
146
- # Calculate target DPI
147
138
  target_dpi, auto_adjusted, calculated_dpi = _calculate_target_dpi(
148
139
  original_width, original_height, current_dpi, config
149
140
  )
150
141
 
151
- # Calculate scale factor based on DPI ratio
152
142
  scale_factor = target_dpi / current_dpi
153
143
 
154
- # If scale factor is very close to 1.0, skip resizing
155
144
  if abs(scale_factor - 1.0) < 0.05:
156
145
  return image, ImagePreprocessingMetadata(
157
146
  original_dimensions=(original_width, original_height),
@@ -164,11 +153,9 @@ def normalize_image_dpi(
164
153
  skipped_resize=True,
165
154
  )
166
155
 
167
- # Calculate new dimensions
168
156
  new_width = int(original_width * scale_factor)
169
157
  new_height = int(original_height * scale_factor)
170
158
 
171
- # Ensure we don't exceed max_dimension (safety check)
172
159
  dimension_clamped = False
173
160
  max_new_dimension = max(new_width, new_height)
174
161
  if max_new_dimension > config.max_image_dimension:
@@ -178,12 +165,8 @@ def normalize_image_dpi(
178
165
  scale_factor *= dimension_scale
179
166
  dimension_clamped = True
180
167
 
181
- # Resize image
182
168
  try:
183
- # Use LANCZOS for high-quality downscaling, BICUBIC for upscaling
184
- # Handle different PIL versions
185
169
  try:
186
- # Modern PIL version
187
170
  if scale_factor < 1.0:
188
171
  resample_method = Image.Resampling.LANCZOS
189
172
  resample_name = "LANCZOS"
@@ -191,7 +174,6 @@ def normalize_image_dpi(
191
174
  resample_method = Image.Resampling.BICUBIC
192
175
  resample_name = "BICUBIC"
193
176
  except AttributeError:
194
- # Older PIL version
195
177
  if scale_factor < 1.0:
196
178
  resample_method = getattr(Image, "LANCZOS", 1) # type: ignore[arg-type]
197
179
  resample_name = "LANCZOS"
@@ -201,7 +183,6 @@ def normalize_image_dpi(
201
183
 
202
184
  normalized_image = image.resize((new_width, new_height), resample_method)
203
185
 
204
- # Update DPI info in the new image
205
186
  normalized_image.info["dpi"] = (target_dpi, target_dpi)
206
187
 
207
188
  return normalized_image, ImagePreprocessingMetadata(
@@ -218,7 +199,6 @@ def normalize_image_dpi(
218
199
  )
219
200
 
220
201
  except OSError as e:
221
- # If resizing fails, return original image with error info
222
202
  return image, ImagePreprocessingMetadata(
223
203
  original_dimensions=(original_width, original_height),
224
204
  original_dpi=original_dpi,
@@ -261,7 +241,6 @@ def get_dpi_adjustment_heuristics(
261
241
  "recommendations": recommendations,
262
242
  }
263
243
 
264
- # Calculate aspect ratio and size analysis
265
244
  aspect_ratio = width / height if height > 0 else 1.0
266
245
  total_pixels = width * height
267
246
  megapixels = total_pixels / 1_000_000
@@ -274,27 +253,23 @@ def get_dpi_adjustment_heuristics(
274
253
  "is_large": max(width, height) > max_dimension * 0.8,
275
254
  }
276
255
 
277
- # Document-specific heuristics
278
256
  if content_type == "document":
279
257
  if aspect_ratio > 2.0 or aspect_ratio < 0.5:
280
- # Very wide or very tall documents (like forms, receipts)
281
258
  recommendations.append("Consider higher DPI for narrow documents")
282
259
  if target_dpi < 200:
283
260
  heuristics["recommended_dpi"] = min(200, target_dpi * 1.3)
284
261
 
285
- if megapixels > 50: # Very large document
262
+ if megapixels > 50:
286
263
  recommendations.append("Large document detected - consider DPI reduction")
287
264
  heuristics["performance_impact"] = "high"
288
265
  if target_dpi > 150:
289
266
  heuristics["recommended_dpi"] = max(120, target_dpi * 0.8)
290
267
 
291
- # Memory usage estimation
292
- estimated_memory_mb = (width * height * 3) / (1024 * 1024) # RGB bytes
268
+ estimated_memory_mb = (width * height * 3) / (1024 * 1024)
293
269
  if estimated_memory_mb > 200:
294
270
  heuristics["performance_impact"] = "high"
295
271
  recommendations.append(f"High memory usage expected (~{estimated_memory_mb:.0f}MB)")
296
272
 
297
- # Quality vs performance tradeoffs
298
273
  scale_factor = target_dpi / current_dpi if current_dpi > 0 else 1.0
299
274
  if scale_factor < 0.7:
300
275
  heuristics["quality_impact"] = "high"
@@ -324,16 +299,14 @@ def estimate_processing_time(
324
299
  total_pixels = width * height
325
300
  megapixels = total_pixels / 1_000_000
326
301
 
327
- # Base processing times per megapixel (rough estimates)
328
302
  base_times = {
329
- "tesseract": 2.5, # seconds per megapixel
330
- "easyocr": 4.0, # slower due to deep learning
331
- "paddleocr": 3.5, # moderate speed
303
+ "tesseract": 2.5,
304
+ "easyocr": 4.0,
305
+ "paddleocr": 3.5,
332
306
  }
333
307
 
334
308
  base_time = base_times.get(ocr_backend, 3.0)
335
309
 
336
- # Non-linear scaling for very large images
337
310
  scaling_factor = 1.0 + (megapixels - 10) * 0.1 if megapixels > 10 else 1.0
338
311
 
339
312
  estimated_time = base_time * megapixels * scaling_factor
@@ -19,15 +19,9 @@ if TYPE_CHECKING:
19
19
 
20
20
  T = TypeVar("T")
21
21
 
22
+ _POOL_SIZE = mp.cpu_count()
22
23
 
23
- _POOL_SIZE = max(1, mp.cpu_count() - 1)
24
-
25
-
26
- def _create_process_pool() -> ProcessPoolExecutor:
27
- return ProcessPoolExecutor(max_workers=_POOL_SIZE)
28
-
29
-
30
- _process_pool_ref = Ref("process_pool", _create_process_pool)
24
+ _process_pool_ref = Ref("process_pool", lambda: ProcessPoolExecutor(max_workers=_POOL_SIZE))
31
25
 
32
26
 
33
27
  def _get_process_pool() -> ProcessPoolExecutor:
@@ -51,6 +45,33 @@ def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) ->
51
45
  return future.result()
52
46
 
53
47
 
48
def get_optimal_worker_count(num_tasks: int, cpu_intensive: bool = True) -> int:
    """Calculate how many workers to use for a batch of tasks.

    Rules (described by the original author as derived from benchmarking):
    - 1 task or fewer: 1 worker (avoids pool overhead).
    - 2-3 tasks: one worker per task, capped at the CPU count.
    - 4+ tasks, CPU-intensive: one worker per CPU core.
    - 4+ tasks, not CPU-intensive: up to twice the CPU count, never more
      workers than ``max(cpu_count, num_tasks)``.

    Args:
        num_tasks: Number of tasks that will be submitted.
        cpu_intensive: Whether the tasks are CPU-bound.

    Returns:
        The worker count to use; always at least 1.
    """
    cpu_count = mp.cpu_count()

    # Zero or negative task counts used to fall into the ``<= 3`` branch and
    # yield 0 or a negative number, which executors reject as ``max_workers``.
    if num_tasks <= 1:
        return 1
    if num_tasks <= 3:
        return min(num_tasks, cpu_count)
    if cpu_intensive:
        return cpu_count
    return min(cpu_count * 2, max(cpu_count, num_tasks))
65
+
66
+
67
def _warmup_noop() -> None:
    """Do nothing; gives the warm-up a picklable, importable callable to submit."""


def warmup_process_pool() -> None:
    """Warm up the process pool to reduce initialization overhead.

    Submits one no-op task per configured pool slot (``_POOL_SIZE``) and waits
    for every one of them to complete.
    """
    with process_pool() as pool:
        # Work sent to a process pool must be picklable. A lambda is not, so a
        # module-level function is submitted instead.
        futures = [pool.submit(_warmup_noop) for _ in range(_POOL_SIZE)]
        for future in futures:
            future.result()
73
+
74
+
54
75
  def shutdown_process_pool() -> None:
55
76
  if _process_pool_ref.is_initialized():
56
77
  pool = _process_pool_ref.get()
@@ -2,6 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  import re
4
4
  from functools import reduce
5
+ from itertools import chain
5
6
  from typing import Any
6
7
 
7
8
  _OCR_ARTIFACTS = {
@@ -97,7 +98,9 @@ def _calculate_script_penalty(text: str, total_chars: int) -> float:
97
98
  if total_chars == 0:
98
99
  return 0.0
99
100
 
100
- script_chars = sum(len(match) for pattern in _SCRIPT_PATTERNS.values() for match in pattern.findall(text))
101
+ script_chars = sum(
102
+ len(match) for match in chain.from_iterable(pattern.findall(text) for pattern in _SCRIPT_PATTERNS.values())
103
+ )
101
104
 
102
105
  return min(1.0, script_chars / total_chars)
103
106
 
@@ -106,7 +109,9 @@ def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
106
109
  if total_chars == 0:
107
110
  return 0.0
108
111
 
109
- nav_chars = sum(len(match) for pattern in _NAVIGATION_PATTERNS.values() for match in pattern.findall(text))
112
+ nav_chars = sum(
113
+ len(match) for match in chain.from_iterable(pattern.findall(text) for pattern in _NAVIGATION_PATTERNS.values())
114
+ )
110
115
 
111
116
  return min(1.0, nav_chars / total_chars)
112
117
 
@@ -0,0 +1,65 @@
1
+ from __future__ import annotations
2
+
3
+ import contextlib
4
+ from typing import TYPE_CHECKING
5
+
6
+ import pypdfium2
7
+
8
+ from kreuzberg._utils._pdf_lock import pypdfium_file_lock
9
+ from kreuzberg._utils._sync import run_sync
10
+
11
+ if TYPE_CHECKING: # pragma: no cover
12
+ from collections.abc import AsyncGenerator, Generator
13
+ from pathlib import Path
14
+
15
+
16
@contextlib.asynccontextmanager
async def pdf_document(file_path: Path) -> AsyncGenerator[pypdfium2.PdfDocument, None]:
    """Async context manager that opens a PyPDFium document and closes it on exit.

    The document is opened via ``run_sync`` while holding ``pypdfium_file_lock``
    for ``file_path``. On exit the lock is taken again and the document is
    closed; errors raised while closing are suppressed.

    Args:
        file_path: Path of the PDF file to open.

    Yields:
        The opened ``pypdfium2.PdfDocument``.
    """
    document = None
    try:
        with pypdfium_file_lock(file_path):
            document = await run_sync(pypdfium2.PdfDocument, str(file_path))
            yield document
    finally:
        # Compare against None rather than relying on truthiness: PdfDocument
        # defines __len__ (page count), so a zero-page document is falsy and
        # would otherwise never be closed.
        if document is not None:
            with pypdfium_file_lock(file_path), contextlib.suppress(Exception):
                await run_sync(document.close)
28
+
29
+
30
@contextlib.contextmanager
def pdf_document_sync(file_path: Path) -> Generator[pypdfium2.PdfDocument, None, None]:
    """Sync context manager that opens a PyPDFium document and closes it on exit.

    The document is opened while holding ``pypdfium_file_lock`` for
    ``file_path``. On exit the lock is taken again and the document is closed;
    errors raised while closing are suppressed.

    Args:
        file_path: Path of the PDF file to open.

    Yields:
        The opened ``pypdfium2.PdfDocument``.
    """
    document = None
    try:
        with pypdfium_file_lock(file_path):
            document = pypdfium2.PdfDocument(str(file_path))
            yield document
    finally:
        # Compare against None rather than relying on truthiness: PdfDocument
        # defines __len__ (page count), so a zero-page document is falsy and
        # would otherwise never be closed.
        if document is not None:
            with pypdfium_file_lock(file_path), contextlib.suppress(Exception):
                document.close()
42
+
43
+
44
@contextlib.contextmanager
def pdf_resources_sync(*resources: object) -> Generator[None, None, None]:
    """Close every given PDF resource (pages, textpages, bitmaps) on exit.

    Resources are closed in the order they were passed. Objects without a
    ``close`` attribute are skipped, and any exception raised while closing one
    resource is swallowed so the remaining ones are still closed.
    """
    try:
        yield
    finally:
        for handle in resources:
            try:
                closer = getattr(handle, "close", None)
                if closer is not None:
                    closer()
            except Exception:  # noqa: BLE001, S110 - cleanup is best-effort
                pass
54
+
55
+
56
@contextlib.contextmanager
def image_resources(*images: object) -> Generator[None, None, None]:
    """Close every given PIL image on exit.

    Images are closed in the order they were passed. Objects without a
    ``close`` attribute are skipped, and any exception raised while closing one
    image is suppressed so the remaining ones are still closed.
    """
    try:
        yield
    finally:
        for picture in images:
            with contextlib.suppress(Exception):
                release = getattr(picture, "close", None)
                if release is not None:
                    release()
kreuzberg/_utils/_sync.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import asyncio
3
4
  from functools import partial
4
5
  from inspect import isawaitable, iscoroutinefunction
5
6
  from typing import TYPE_CHECKING, Any, TypeVar, cast
@@ -37,14 +38,43 @@ async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
37
38
  return results
38
39
 
39
40
 
40
async def run_taskgroup_batched(
    *async_tasks: Awaitable[Any],
    batch_size: int,
    use_semaphore: bool = True,
) -> list[Any]:
    """Run async tasks with controlled concurrency.

    Args:
        async_tasks: Tasks to execute.
        batch_size: Maximum number of tasks allowed to run concurrently.
        use_semaphore: Use a semaphore for concurrency control instead of
            running sequential batches.

    Returns:
        List of results in the same order as the input tasks.

    Raises:
        ValueError: If tasks are given and ``batch_size`` is less than 1.
    """
    if not async_tasks:
        return []

    # A zero batch size would create a semaphore that can never be acquired,
    # so every task would wait forever; fail fast instead.
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    if len(async_tasks) <= batch_size or not use_semaphore:
        results: list[Any] = []
        for i in range(0, len(async_tasks), batch_size):
            batch = async_tasks[i : i + batch_size]
            results.extend(await run_taskgroup(*batch))
        return results

    semaphore = asyncio.Semaphore(batch_size)

    async def run_with_semaphore(task: Awaitable[Any]) -> Any:
        async with semaphore:
            return await task

    # asyncio.gather returns results in the order the awaitables were passed,
    # so no index bookkeeping or sorting is required.
    return list(await asyncio.gather(*(run_with_semaphore(task) for task in async_tasks)))
48
78
 
49
79
 
50
80
  async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
kreuzberg/_utils/_tmp.py CHANGED
@@ -1,5 +1,8 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import contextlib
4
+ import os
5
+ import tempfile
3
6
  from contextlib import suppress
4
7
  from pathlib import Path
5
8
  from tempfile import NamedTemporaryFile
@@ -10,7 +13,7 @@ from anyio import Path as AsyncPath
10
13
  from kreuzberg._utils._sync import run_sync
11
14
 
12
15
  if TYPE_CHECKING: # pragma: no cover
13
- from collections.abc import Callable, Coroutine
16
+ from collections.abc import AsyncGenerator, Callable, Coroutine, Generator
14
17
 
15
18
 
16
19
  async def create_temp_file(
@@ -26,3 +29,36 @@ async def create_temp_file(
26
29
  await AsyncPath(file.name).unlink(missing_ok=True)
27
30
 
28
31
  return Path(file.name), unlink
32
+
33
+
34
@contextlib.asynccontextmanager
async def temporary_file(extension: str, content: bytes | None = None) -> AsyncGenerator[Path, None]:
    """Yield the path of a temporary file and remove the file on exit.

    The file is created through ``create_temp_file`` with the given extension
    and optional initial content; the cleanup callback it returns is awaited
    when the context exits, whether or not the body raised.
    """
    created = await create_temp_file(extension, content)
    path, remove = created
    try:
        yield path
    finally:
        await remove()
42
+
43
+
44
@contextlib.contextmanager
def temporary_file_sync(extension: str, content: bytes | None = None) -> Generator[Path, None, None]:
    """Yield the path of a temporary file and delete the file on exit.

    The file is created with ``tempfile.mkstemp`` using ``extension`` as the
    suffix. Non-empty ``content`` is written to it; otherwise the descriptor is
    simply closed, leaving an empty file. Errors while deleting are suppressed.
    """
    handle, raw_path = tempfile.mkstemp(suffix=extension)
    target = Path(raw_path)
    try:
        if not content:
            os.close(handle)
        else:
            with os.fdopen(handle, "wb") as stream:
                stream.write(content)
        yield target
    finally:
        with suppress(OSError, PermissionError):
            target.unlink()
58
+
59
+
60
@contextlib.contextmanager
def temporary_directory() -> Generator[Path, None, None]:
    """Yield a fresh temporary directory as a ``Path``; it is removed on exit."""
    scratch = tempfile.TemporaryDirectory()
    with scratch as dir_name:
        yield Path(dir_name)
kreuzberg/cli.py CHANGED
@@ -122,32 +122,37 @@ def _build_cli_args(params: dict[str, Any]) -> dict[str, Any]:
122
122
  "force_ocr": params["force_ocr"] if params["force_ocr"] else None,
123
123
  "chunk_content": params["chunk_content"] if params["chunk_content"] else None,
124
124
  "extract_tables": params["extract_tables"] if params["extract_tables"] else None,
125
+ "extract_entities": params["extract_entities"] if params["extract_entities"] else None,
126
+ "extract_keywords": params["extract_keywords"] if params["extract_keywords"] else None,
127
+ "auto_detect_language": params["auto_detect_language"] if params["auto_detect_language"] else None,
128
+ "keyword_count": params["keyword_count"] if params["keyword_count"] != 10 else None,
125
129
  "max_chars": params["max_chars"] if params["max_chars"] != DEFAULT_MAX_CHARACTERS else None,
126
130
  "max_overlap": params["max_overlap"] if params["max_overlap"] != DEFAULT_MAX_OVERLAP else None,
127
131
  "ocr_backend": params["ocr_backend"],
128
132
  }
129
133
 
130
134
  ocr_backend = params["ocr_backend"]
131
- if ocr_backend == "tesseract" and (
132
- params["tesseract_lang"]
133
- or params["tesseract_psm"] is not None
134
- or params["tesseract_output_format"]
135
- or params["enable_table_detection"]
136
- ):
137
- tesseract_config = {}
138
- if params["tesseract_lang"]:
139
- tesseract_config["language"] = params["tesseract_lang"]
140
- if params["tesseract_psm"] is not None:
141
- tesseract_config["psm"] = params["tesseract_psm"]
142
- if params["tesseract_output_format"]:
143
- tesseract_config["output_format"] = params["tesseract_output_format"]
144
- if params["enable_table_detection"]:
145
- tesseract_config["enable_table_detection"] = True
146
- cli_args["tesseract_config"] = tesseract_config
147
- elif ocr_backend == "easyocr" and params["easyocr_languages"]:
148
- cli_args["easyocr_config"] = {"languages": params["easyocr_languages"].split(",")}
149
- elif ocr_backend == "paddleocr" and params["paddleocr_languages"]:
150
- cli_args["paddleocr_config"] = {"languages": params["paddleocr_languages"].split(",")}
135
+ match ocr_backend:
136
+ case "tesseract" if (
137
+ params["tesseract_lang"]
138
+ or params["tesseract_psm"] is not None
139
+ or params["tesseract_output_format"]
140
+ or params["enable_table_detection"]
141
+ ):
142
+ tesseract_config = {}
143
+ if params["tesseract_lang"]:
144
+ tesseract_config["language"] = params["tesseract_lang"]
145
+ if params["tesseract_psm"] is not None:
146
+ tesseract_config["psm"] = params["tesseract_psm"]
147
+ if params["tesseract_output_format"]:
148
+ tesseract_config["output_format"] = params["tesseract_output_format"]
149
+ if params["enable_table_detection"]:
150
+ tesseract_config["enable_table_detection"] = True
151
+ cli_args["tesseract_config"] = tesseract_config
152
+ case "easyocr" if params["easyocr_languages"]:
153
+ cli_args["easyocr_config"] = {"languages": params["easyocr_languages"].split(",")}
154
+ case "paddleocr" if params["paddleocr_languages"]:
155
+ cli_args["paddleocr_config"] = {"languages": params["paddleocr_languages"].split(",")}
151
156
 
152
157
  return cli_args
153
158
 
@@ -250,6 +255,9 @@ def cli(ctx: click.Context) -> None:
250
255
  @click.option("--force-ocr", is_flag=True, help="Force OCR processing")
251
256
  @click.option("--chunk-content", is_flag=True, help="Enable content chunking")
252
257
  @click.option("--extract-tables", is_flag=True, help="Enable table extraction")
258
+ @click.option("--extract-entities", is_flag=True, help="Enable entity extraction")
259
+ @click.option("--extract-keywords", is_flag=True, help="Enable keyword extraction")
260
+ @click.option("--auto-detect-language", is_flag=True, help="Enable automatic language detection")
253
261
  @click.option(
254
262
  "--max-chars",
255
263
  type=int,
@@ -262,6 +270,12 @@ def cli(ctx: click.Context) -> None:
262
270
  default=DEFAULT_MAX_OVERLAP,
263
271
  help=f"Maximum overlap between chunks (default: {DEFAULT_MAX_OVERLAP})",
264
272
  )
273
+ @click.option(
274
+ "--keyword-count",
275
+ type=int,
276
+ default=10,
277
+ help="Number of keywords to extract (default: 10)",
278
+ )
265
279
  @click.option(
266
280
  "--ocr-backend", type=OcrBackendParamType(), help="OCR backend to use (tesseract, easyocr, paddleocr, none)"
267
281
  )