kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_chunker.py +0 -15
  5. kreuzberg/_config.py +212 -292
  6. kreuzberg/_document_classification.py +20 -47
  7. kreuzberg/_entity_extraction.py +1 -122
  8. kreuzberg/_extractors/_base.py +4 -71
  9. kreuzberg/_extractors/_email.py +1 -15
  10. kreuzberg/_extractors/_html.py +9 -12
  11. kreuzberg/_extractors/_image.py +1 -25
  12. kreuzberg/_extractors/_pandoc.py +10 -147
  13. kreuzberg/_extractors/_pdf.py +38 -94
  14. kreuzberg/_extractors/_presentation.py +0 -99
  15. kreuzberg/_extractors/_spread_sheet.py +13 -55
  16. kreuzberg/_extractors/_structured.py +1 -4
  17. kreuzberg/_gmft.py +14 -199
  18. kreuzberg/_language_detection.py +1 -36
  19. kreuzberg/_mcp/__init__.py +0 -2
  20. kreuzberg/_mcp/server.py +3 -10
  21. kreuzberg/_mime_types.py +1 -19
  22. kreuzberg/_ocr/_base.py +4 -76
  23. kreuzberg/_ocr/_easyocr.py +124 -186
  24. kreuzberg/_ocr/_paddleocr.py +154 -224
  25. kreuzberg/_ocr/_table_extractor.py +184 -0
  26. kreuzberg/_ocr/_tesseract.py +797 -361
  27. kreuzberg/_playa.py +5 -31
  28. kreuzberg/_registry.py +0 -36
  29. kreuzberg/_types.py +588 -93
  30. kreuzberg/_utils/_cache.py +84 -138
  31. kreuzberg/_utils/_device.py +0 -74
  32. kreuzberg/_utils/_document_cache.py +0 -75
  33. kreuzberg/_utils/_errors.py +0 -50
  34. kreuzberg/_utils/_ocr_cache.py +136 -0
  35. kreuzberg/_utils/_pdf_lock.py +0 -16
  36. kreuzberg/_utils/_process_pool.py +17 -64
  37. kreuzberg/_utils/_quality.py +0 -60
  38. kreuzberg/_utils/_ref.py +32 -0
  39. kreuzberg/_utils/_serialization.py +0 -30
  40. kreuzberg/_utils/_string.py +9 -59
  41. kreuzberg/_utils/_sync.py +0 -77
  42. kreuzberg/_utils/_table.py +49 -101
  43. kreuzberg/_utils/_tmp.py +0 -9
  44. kreuzberg/cli.py +54 -74
  45. kreuzberg/extraction.py +39 -32
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
  47. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  48. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  49. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  50. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  51. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_document_cache.py
@@ -1,5 +1,3 @@
-"""Document-level caching to prevent pypdfium2 issues with duplicate processing."""
-
 from __future__ import annotations
 
 import hashlib
@@ -13,14 +11,7 @@ if TYPE_CHECKING:
 
 
 class DocumentCache:
-    """Session-scoped cache for document extraction results.
-
-    Ensures each unique document is processed only once per session,
-    preventing pypdfium2 state corruption issues with repeated processing.
-    """
-
     def __init__(self) -> None:
-        """Initialize document cache."""
         self._cache: dict[str, ExtractionResult] = {}
         self._processing: dict[str, threading.Event] = {}
         self._lock = threading.Lock()
@@ -28,15 +19,6 @@ class DocumentCache:
         self._file_metadata: dict[str, dict[str, Any]] = {}
 
     def _get_cache_key(self, file_path: Path | str, config: ExtractionConfig | None = None) -> str:
-        """Generate cache key for a file and config combination.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Unique cache key string
-        """
         path = Path(file_path).resolve()
 
         try:
@@ -67,15 +49,6 @@ class DocumentCache:
         return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
 
     def _is_cache_valid(self, cache_key: str, file_path: Path | str) -> bool:
-        """Check if cached result is still valid.
-
-        Args:
-            cache_key: The cache key to validate
-            file_path: Path to the file
-
-        Returns:
-            True if cache is valid, False if invalidated
-        """
         if cache_key not in self._file_metadata:
             return False
 
@@ -91,15 +64,6 @@ class DocumentCache:
         return False
 
     def get(self, file_path: Path | str, config: ExtractionConfig | None = None) -> ExtractionResult | None:
-        """Get cached extraction result if available and valid.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Cached result if available, None otherwise
-        """
        cache_key = self._get_cache_key(file_path, config)
 
        with self._lock:
@@ -113,13 +77,6 @@ class DocumentCache:
        return None
 
    def set(self, file_path: Path | str, config: ExtractionConfig | None, result: ExtractionResult) -> None:
-        """Cache extraction result.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-            result: Extraction result to cache
-        """
        cache_key = self._get_cache_key(file_path, config)
        path = Path(file_path)
 
@@ -142,29 +99,11 @@ class DocumentCache:
            self._file_metadata[cache_key] = file_metadata
 
    def is_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> bool:
-        """Check if file is currently being processed.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            True if file is currently being processed
-        """
        cache_key = self._get_cache_key(file_path, config)
        with self._lock:
            return cache_key in self._processing
 
    def mark_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> threading.Event:
-        """Mark file as being processed and return event to wait on.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Event that will be set when processing completes
-        """
        cache_key = self._get_cache_key(file_path, config)
 
        with self._lock:
@@ -173,12 +112,6 @@ class DocumentCache:
            return self._processing[cache_key]
 
    def mark_complete(self, file_path: Path | str, config: ExtractionConfig | None = None) -> None:
-        """Mark file processing as complete.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-        """
        cache_key = self._get_cache_key(file_path, config)
 
        with self._lock:
@@ -187,17 +120,11 @@ class DocumentCache:
            event.set()
 
    def clear(self) -> None:
-        """Clear all cached results."""
        with self._lock:
            self._cache.clear()
            self._file_metadata.clear()
 
    def get_stats(self) -> dict[str, Any]:
-        """Get cache statistics.
-
-        Returns:
-            Dictionary with cache statistics
-        """
        with self._lock:
            return {
                "cached_documents": len(self._cache),
@@ -212,10 +139,8 @@ _document_cache = DocumentCache()
 
 
 def get_document_cache() -> DocumentCache:
-    """Get the global document cache instance."""
    return _document_cache
 
 
 def clear_document_cache() -> None:
-    """Clear the global document cache."""
    _document_cache.clear()
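
The docstrings removed above described DocumentCache as a session-scoped, thread-safe cache whose per-key events let concurrent callers wait for a first extraction instead of repeating it. A minimal sketch of that protocol from the caller's side, using only signatures visible in this diff; extract_file_uncached is a hypothetical stand-in for the real extraction path:

    from kreuzberg._utils._document_cache import get_document_cache

    def extract_once(file_path, config):
        cache = get_document_cache()

        result = cache.get(file_path, config)  # valid cached result, or None
        if result is not None:
            return result

        if cache.is_processing(file_path, config):
            # Another thread is already extracting this file: wait on its
            # event, then re-check the cache before falling through.
            cache.mark_processing(file_path, config).wait()
            result = cache.get(file_path, config)
            if result is not None:
                return result

        cache.mark_processing(file_path, config)
        try:
            result = extract_file_uncached(file_path, config)  # hypothetical
            cache.set(file_path, config, result)
            return result
        finally:
            cache.mark_complete(file_path, config)  # wakes any waiters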
kreuzberg/_utils/_errors.py
@@ -1,5 +1,3 @@
-"""Enhanced error handling utilities."""
-
 from __future__ import annotations
 
 import platform
@@ -12,7 +10,6 @@ import psutil
 
 from kreuzberg.exceptions import ValidationError
 
-# Define error keywords as frozensets for O(1) membership testing
 _SYSTEM_ERROR_KEYWORDS = frozenset({"memory", "resource", "process", "thread"})
 _TRANSIENT_ERROR_PATTERNS = frozenset(
     {
@@ -56,17 +53,6 @@ def create_error_context(
     error: Exception | None = None,
     **extra: Any,
 ) -> dict[str, Any]:
-    """Create comprehensive error context.
-
-    Args:
-        operation: The operation being performed (e.g., "extract_file", "convert_pdf_to_images")
-        file_path: The file being processed, if applicable
-        error: The original exception, if any
-        **extra: Additional context fields
-
-    Returns:
-        Dictionary with error context including system info
-    """
    context: dict[str, Any] = {
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "operation": operation,
@@ -107,14 +93,6 @@ def create_error_context(
 
 
 def is_transient_error(error: Exception) -> bool:
-    """Check if an error is likely transient and worth retrying.
-
-    Args:
-        error: The exception to check
-
-    Returns:
-        True if the error is likely transient
-    """
    transient_types = (
        OSError,
        PermissionError,
@@ -131,29 +109,11 @@ def is_transient_error(error: Exception) -> bool:
 
 
 def is_resource_error(error: Exception) -> bool:
-    """Check if an error is related to system resources.
-
-    Args:
-        error: The exception to check
-
-    Returns:
-        True if the error is resource-related
-    """
    error_str = str(error).lower()
    return any(pattern in error_str for pattern in _RESOURCE_ERROR_PATTERNS)
 
 
 def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
-    """Determine if an operation should be retried.
-
-    Args:
-        error: The exception that occurred
-        attempt: Current attempt number (1-based)
-        max_attempts: Maximum number of attempts
-
-    Returns:
-        True if the operation should be retried
-    """
    if attempt >= max_attempts:
        return False
 
@@ -164,22 +124,17 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
 
 
 class BatchExtractionResult:
-    """Result container for batch operations with partial success support."""
-
    __slots__ = ("failed", "successful", "total_count")
 
    def __init__(self) -> None:
-        """Initialize batch result container."""
        self.successful: list[tuple[int, Any]] = []
        self.failed: list[tuple[int, dict[str, Any]]] = []
        self.total_count: int = 0
 
    def add_success(self, index: int, result: Any) -> None:
-        """Add a successful result."""
        self.successful.append((index, result))
 
    def add_failure(self, index: int, error: Exception, context: dict[str, Any]) -> None:
-        """Add a failed result with context."""
        error_info = {
            "error": {
                "type": type(error).__name__,
@@ -191,30 +146,25 @@ class BatchExtractionResult:
 
    @property
    def success_count(self) -> int:
-        """Number of successful operations."""
        return len(self.successful)
 
    @property
    def failure_count(self) -> int:
-        """Number of failed operations."""
        return len(self.failed)
 
    @property
    def success_rate(self) -> float:
-        """Success rate as a percentage."""
        if self.total_count == 0:
            return 0.0
        return (self.success_count / self.total_count) * 100
 
    def get_ordered_results(self) -> list[Any | None]:
-        """Get results in original order with None for failures."""
        results = [None] * self.total_count
        for index, result in self.successful:
            results[index] = result
        return results
 
    def get_summary(self) -> dict[str, Any]:
-        """Get summary of batch operation."""
        return {
            "total": self.total_count,
            "successful": self.success_count,
kreuzberg/_utils/_ocr_cache.py (new file)
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+import hashlib
+import io
+from typing import TYPE_CHECKING, Any
+
+import anyio
+
+from kreuzberg._utils._cache import get_ocr_cache
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from PIL.Image import Image as PILImage
+
+    from kreuzberg._types import ExtractionResult
+
+
+def get_file_info(path: Path) -> dict[str, Any]:
+    from pathlib import Path as PathType  # noqa: PLC0415
+
+    path_obj = PathType(path) if not isinstance(path, PathType) else path
+
+    try:
+        stat = path_obj.stat()
+        return {
+            "path": str(path_obj.resolve()),
+            "size": stat.st_size,
+            "mtime": stat.st_mtime,
+        }
+    except OSError:
+        return {
+            "path": str(path_obj),
+            "size": 0,
+            "mtime": 0,
+        }
+
+
+def generate_image_hash(image: PILImage) -> str:
+    save_image = image
+    if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
+        save_image = image.convert("RGB")
+
+    image_buffer = io.BytesIO()
+    save_image.save(image_buffer, format="PNG")
+    image_content = image_buffer.getvalue()
+
+    return hashlib.sha256(image_content).hexdigest()[:16]
+
+
+def build_cache_kwargs(
+    backend_name: str,
+    config_dict: dict[str, Any],
+    image_hash: str | None = None,
+    file_info: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    cache_kwargs = {
+        "ocr_backend": backend_name,
+        "ocr_config": str(sorted(config_dict.items())),
+    }
+
+    if image_hash:
+        cache_kwargs["image_hash"] = image_hash
+    if file_info:
+        cache_kwargs["file_info"] = str(sorted(file_info.items()))
+
+    return cache_kwargs
+
+
+async def handle_cache_lookup_async(cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+    ocr_cache = get_ocr_cache()
+
+    cached_result = await ocr_cache.aget(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result
+
+    if ocr_cache.is_processing(**cache_kwargs):
+        event = ocr_cache.mark_processing(**cache_kwargs)
+        await anyio.to_thread.run_sync(event.wait)
+
+        cached_result = await ocr_cache.aget(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+    ocr_cache.mark_processing(**cache_kwargs)
+    return None
+
+
+def handle_cache_lookup_sync(cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+    ocr_cache = get_ocr_cache()
+
+    cached_result = ocr_cache.get(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result
+
+    if ocr_cache.is_processing(**cache_kwargs):
+        event = ocr_cache.mark_processing(**cache_kwargs)
+        event.wait()
+
+        cached_result = ocr_cache.get(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+    ocr_cache.mark_processing(**cache_kwargs)
+    return None
+
+
+async def cache_and_complete_async(
+    result: ExtractionResult,
+    cache_kwargs: dict[str, Any],
+    use_cache: bool,
+) -> None:
+    ocr_cache = get_ocr_cache()
+
+    if use_cache:
+        await ocr_cache.aset(result, **cache_kwargs)
+
+    ocr_cache.mark_complete(**cache_kwargs)
+
+
+def cache_and_complete_sync(
+    result: ExtractionResult,
+    cache_kwargs: dict[str, Any],
+    use_cache: bool,
+) -> None:
+    ocr_cache = get_ocr_cache()
+
+    if use_cache:
+        ocr_cache.set(result, **cache_kwargs)
+
+    ocr_cache.mark_complete(**cache_kwargs)
+
+
+def mark_processing_complete(cache_kwargs: dict[str, Any]) -> None:
+    ocr_cache = get_ocr_cache()
+    ocr_cache.mark_complete(**cache_kwargs)
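
This new module centralizes cache bookkeeping that the OCR backends (_easyocr.py, _paddleocr.py, _tesseract.py above) previously handled inline. A sketch of the sync path, assuming a hypothetical run_ocr callable that returns an ExtractionResult:

    from PIL import Image

    from kreuzberg._utils._ocr_cache import (
        build_cache_kwargs,
        cache_and_complete_sync,
        generate_image_hash,
        handle_cache_lookup_sync,
        mark_processing_complete,
    )

    def ocr_image_cached(image: Image.Image, backend_name: str, config: dict, run_ocr):
        cache_kwargs = build_cache_kwargs(
            backend_name, config, image_hash=generate_image_hash(image)
        )

        cached = handle_cache_lookup_sync(cache_kwargs)
        if cached is not None:
            return cached  # a hit, or another thread finished while we waited

        # Miss: this thread is now marked as the processing owner for the key.
        try:
            result = run_ocr(image)  # hypothetical backend call
        except Exception:
            mark_processing_complete(cache_kwargs)  # release any waiters
            raise

        cache_and_complete_sync(result, cache_kwargs, use_cache=True)
        return result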
kreuzberg/_utils/_pdf_lock.py
@@ -1,5 +1,3 @@
-"""PDF processing lock utilities for thread-safe pypdfium2 operations."""
-
 from __future__ import annotations
 
 import hashlib
@@ -21,13 +19,11 @@ _FILE_LOCKS_LOCK = threading.Lock()
 
 
 def _get_file_key(file_path: Path | str) -> str:
-    """Get a consistent key for a file path."""
    path_str = str(Path(file_path).resolve())
    return hashlib.md5(path_str.encode()).hexdigest()  # noqa: S324
 
 
 def _get_file_lock(file_path: Path | str) -> threading.RLock:
-    """Get or create a lock for a specific file."""
    file_key = _get_file_key(file_path)
 
    with _FILE_LOCKS_LOCK:
@@ -41,30 +37,18 @@ def _get_file_lock(file_path: Path | str) -> threading.RLock:
 
 @contextmanager
 def pypdfium_lock() -> Generator[None, None, None]:
-    """Context manager for thread-safe pypdfium2 operations.
-
-    This prevents segmentation faults on macOS where pypdfium2
-    is not fork-safe when used concurrently.
-    """
    with _PYPDFIUM_LOCK:
        yield
 
 
 @contextmanager
 def pypdfium_file_lock(file_path: Path | str) -> Generator[None, None, None]:
-    """Context manager for per-file pypdfium2 operations.
-
-    This allows concurrent processing of different files while
-    preventing segfaults. Document caching handles same-file issues.
-    """
    lock = _get_file_lock(file_path)
    with lock:
        yield
 
 
 def with_pypdfium_lock(func: Any) -> Any:
-    """Decorator to wrap functions with pypdfium2 lock."""
-
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        with pypdfium_lock():
            return func(*args, **kwargs)
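
The removed docstrings carried the rationale: a single global lock serializes pypdfium2 calls (which can segfault under concurrency on macOS), while the per-file variant lets different files proceed in parallel. A brief usage sketch; page_count is an illustrative helper, not part of the package:

    import pypdfium2

    from kreuzberg._utils._pdf_lock import pypdfium_file_lock, pypdfium_lock

    def page_count(pdf_path: str) -> int:
        # Per-file lock: other files can still be processed concurrently.
        with pypdfium_file_lock(pdf_path):
            pdf = pypdfium2.PdfDocument(pdf_path)
            try:
                return len(pdf)
            finally:
                pdf.close()

    # Global lock: serializes every pypdfium2 call across threads.
    with pypdfium_lock():
        ...  # any operation that must not run concurrently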
kreuzberg/_utils/_process_pool.py
@@ -1,5 +1,3 @@
-"""Process pool utilities for CPU-intensive operations."""
-
 from __future__ import annotations
 
 import io
@@ -13,6 +11,8 @@ import psutil
 import pypdfium2
 from typing_extensions import Self
 
+from kreuzberg._utils._ref import Ref
+
 if TYPE_CHECKING:
     import types
     from collections.abc import Callable, Generator
@@ -20,47 +20,45 @@ if TYPE_CHECKING:
 T = TypeVar("T")
 
 
-_PROCESS_POOL: ProcessPoolExecutor | None = None
 _POOL_SIZE = max(1, mp.cpu_count() - 1)
 
 
-def _init_process_pool() -> ProcessPoolExecutor:
-    """Initialize the global process pool."""
-    global _PROCESS_POOL
-    if _PROCESS_POOL is None:
-        _PROCESS_POOL = ProcessPoolExecutor(max_workers=_POOL_SIZE)
-    return _PROCESS_POOL
+def _create_process_pool() -> ProcessPoolExecutor:
+    return ProcessPoolExecutor(max_workers=_POOL_SIZE)
+
+
+_process_pool_ref = Ref("process_pool", _create_process_pool)
+
+
+def _get_process_pool() -> ProcessPoolExecutor:
+    return _process_pool_ref.get()
 
 
 @contextmanager
 def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
-    """Get the global process pool."""
-    pool = _init_process_pool()
+    pool = _get_process_pool()
     try:
         yield pool
     except Exception:  # noqa: BLE001
         shutdown_process_pool()
-        pool = _init_process_pool()
+        pool = _get_process_pool()
         yield pool
 
 
 def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
-    """Submit a function to the process pool and wait for result."""
    with process_pool() as pool:
        future = pool.submit(func, *args, **kwargs)
        return future.result()
 
 
 def shutdown_process_pool() -> None:
-    """Shutdown the global process pool."""
-    global _PROCESS_POOL
-    if _PROCESS_POOL is not None:
-        _PROCESS_POOL.shutdown(wait=True)
-        _PROCESS_POOL = None
+    if _process_pool_ref.is_initialized():
+        pool = _process_pool_ref.get()
+        pool.shutdown(wait=True)
+        _process_pool_ref.clear()
 
 
 def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
-    """Worker function for extracting PDF text in a separate process."""
    pdf = None
    try:
        pdf = pypdfium2.PdfDocument(pdf_path)
@@ -80,7 +78,6 @@ def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
 
 
 def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
-    """Worker function for converting PDF to images in a separate process."""
    pdf = None
    try:
        pdf = pypdfium2.PdfDocument(pdf_path)
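
Because ProcessPoolExecutor pickles work into child processes, anything passed to submit_to_process_pool must be a module-level callable, which is why the two PDF workers above are defined at module scope. A minimal sketch:

    from kreuzberg._utils._process_pool import submit_to_process_pool

    def _square(n: int) -> int:  # top-level, so it can be pickled to a worker
        return n * n

    result = submit_to_process_pool(_square, 12)  # blocks until the worker returns 144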
@@ -102,19 +99,11 @@ def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str,
 
 
 class ProcessPoolManager:
-    """Resource-aware process pool manager for CPU-intensive tasks."""
-
     def __init__(
         self,
         max_processes: int | None = None,
         memory_limit_gb: float | None = None,
     ) -> None:
-        """Initialize the process pool manager.
-
-        Args:
-            max_processes: Maximum number of processes. Defaults to CPU count.
-            memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
-        """
        self.max_processes = max_processes or mp.cpu_count()
 
        if memory_limit_gb is None:
@@ -127,21 +116,12 @@ class ProcessPoolManager:
        self._active_tasks = 0
 
    def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
-        """Calculate optimal number of workers based on memory constraints.
-
-        Args:
-            task_memory_mb: Estimated memory usage per task in MB.
-
-        Returns:
-            Optimal number of workers.
-        """
        task_memory_bytes = task_memory_mb * 1024**2
        memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
 
        return min(self.max_processes, memory_based_limit)
 
    def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
-        """Ensure process pool executor is initialized."""
        if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
            if self._executor is not None:
                self._executor.shutdown(wait=False)
@@ -157,16 +137,6 @@ class ProcessPoolManager:
        *args: Any,
        task_memory_mb: float = 100,
    ) -> T:
-        """Submit a task to the process pool.
-
-        Args:
-            func: Function to execute.
-            *args: Positional arguments for the function.
-            task_memory_mb: Estimated memory usage in MB.
-
-        Returns:
-            Result of the function execution.
-        """
        workers = self.get_optimal_workers(task_memory_mb)
        self._ensure_executor(workers)
 
@@ -184,17 +154,6 @@ class ProcessPoolManager:
        task_memory_mb: float = 100,
        max_concurrent: int | None = None,
    ) -> list[T]:
-        """Submit a batch of tasks to the process pool.
-
-        Args:
-            func: Function to execute.
-            arg_batches: List of argument tuples for each task.
-            task_memory_mb: Estimated memory usage per task in MB.
-            max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
-
-        Returns:
-            List of results in the same order as input.
-        """
        if not arg_batches:
            return []
 
@@ -225,7 +184,6 @@ class ProcessPoolManager:
        return results
 
    def get_system_info(self) -> dict[str, Any]:
-        """Get current system resource information."""
        memory = psutil.virtual_memory()
        cpu_percent = psutil.cpu_percent(interval=1)
 
@@ -241,13 +199,11 @@ class ProcessPoolManager:
        }
 
    def shutdown(self, wait: bool = True) -> None:
-        """Shutdown the process pool."""
        if self._executor is not None:
            self._executor.shutdown(wait=wait)
            self._executor = None
 
    def __enter__(self) -> Self:
-        """Context manager entry."""
        return self
 
    def __exit__(
@@ -256,11 +212,9 @@ class ProcessPoolManager:
        exc_val: BaseException | None,
        exc_tb: types.TracebackType | None,
    ) -> None:
-        """Context manager exit."""
        self.shutdown()
 
    async def __aenter__(self) -> Self:
-        """Async context manager entry."""
        return self
 
    async def __aexit__(
@@ -269,5 +223,4 @@ class ProcessPoolManager:
        exc_val: BaseException | None,
        exc_tb: types.TracebackType | None,
    ) -> None:
-        """Async context manager exit."""
        self.shutdown()
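
The refactor above replaces the global _PROCESS_POOL singleton with a Ref from the new kreuzberg/_utils/_ref.py (+32 lines in this release; its source is not shown in this diff). From the call sites alone, Ref behaves like a named lazy-singleton holder; a plausible reconstruction, assumptions and all (the real class may add locking or a name-keyed registry):

    from __future__ import annotations

    from collections.abc import Callable
    from typing import Generic, TypeVar

    T = TypeVar("T")

    class Ref(Generic[T]):
        # Assumed shape, inferred from usage in _process_pool.py:
        # Ref(name, factory), .get(), .is_initialized(), .clear().
        def __init__(self, name: str, factory: Callable[[], T]) -> None:
            self._name = name
            self._factory = factory
            self._value: T | None = None

        def get(self) -> T:
            if self._value is None:  # create lazily on first access
                self._value = self._factory()
            return self._value

        def is_initialized(self) -> bool:
            return self._value is not None

        def clear(self) -> None:
            self._value = None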