kreuzberg 3.3.0__py3-none-any.whl → 3.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. kreuzberg/__init__.py +9 -2
  2. kreuzberg/_api/__init__.py +0 -0
  3. kreuzberg/_api/main.py +87 -0
  4. kreuzberg/_entity_extraction.py +238 -0
  5. kreuzberg/_extractors/_base.py +39 -1
  6. kreuzberg/_extractors/_email.py +149 -0
  7. kreuzberg/_extractors/_html.py +15 -3
  8. kreuzberg/_extractors/_image.py +27 -22
  9. kreuzberg/_extractors/_pandoc.py +3 -14
  10. kreuzberg/_extractors/_pdf.py +97 -34
  11. kreuzberg/_extractors/_presentation.py +62 -10
  12. kreuzberg/_extractors/_spread_sheet.py +181 -6
  13. kreuzberg/_extractors/_structured.py +148 -0
  14. kreuzberg/_gmft.py +318 -11
  15. kreuzberg/_language_detection.py +95 -0
  16. kreuzberg/_mcp/__init__.py +5 -0
  17. kreuzberg/_mcp/server.py +227 -0
  18. kreuzberg/_mime_types.py +27 -1
  19. kreuzberg/_ocr/__init__.py +10 -1
  20. kreuzberg/_ocr/_base.py +59 -0
  21. kreuzberg/_ocr/_easyocr.py +92 -1
  22. kreuzberg/_ocr/_paddleocr.py +89 -0
  23. kreuzberg/_ocr/_tesseract.py +569 -5
  24. kreuzberg/_registry.py +4 -0
  25. kreuzberg/_types.py +181 -4
  26. kreuzberg/_utils/_cache.py +52 -4
  27. kreuzberg/_utils/_device.py +2 -2
  28. kreuzberg/_utils/_errors.py +3 -7
  29. kreuzberg/_utils/_process_pool.py +182 -9
  30. kreuzberg/_utils/_quality.py +237 -0
  31. kreuzberg/_utils/_serialization.py +4 -2
  32. kreuzberg/_utils/_string.py +153 -10
  33. kreuzberg/_utils/_sync.py +6 -7
  34. kreuzberg/_utils/_table.py +261 -0
  35. kreuzberg/_utils/_tmp.py +2 -2
  36. kreuzberg/cli.py +1 -2
  37. kreuzberg/extraction.py +43 -34
  38. kreuzberg-3.8.1.dist-info/METADATA +301 -0
  39. kreuzberg-3.8.1.dist-info/RECORD +53 -0
  40. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
  41. kreuzberg/_multiprocessing/__init__.py +0 -6
  42. kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
  43. kreuzberg/_multiprocessing/process_manager.py +0 -188
  44. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  45. kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
  46. kreuzberg-3.3.0.dist-info/METADATA +0 -235
  47. kreuzberg-3.3.0.dist-info/RECORD +0 -48
  48. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
  49. {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_types.py CHANGED
@@ -1,9 +1,9 @@
  from __future__ import annotations

  import sys
- from collections.abc import Awaitable
+ from collections.abc import Awaitable, Callable
  from dataclasses import asdict, dataclass, field
- from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
+ from typing import TYPE_CHECKING, Any, Literal, TypedDict

  from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
  from kreuzberg.exceptions import ValidationError
@@ -17,7 +17,9 @@ if TYPE_CHECKING:
      from pandas import DataFrame
      from PIL.Image import Image

+     from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
      from kreuzberg._gmft import GMFTConfig
+     from kreuzberg._language_detection import LanguageDetectionConfig
      from kreuzberg._ocr._easyocr import EasyOCRConfig
      from kreuzberg._ocr._paddleocr import PaddleOCRConfig
      from kreuzberg._ocr._tesseract import TesseractConfig
@@ -98,6 +100,110 @@ class Metadata(TypedDict, total=False):
      width: NotRequired[int]
      """Width of the document page/slide/image, if applicable."""

+     # Email-specific fields
+     email_from: NotRequired[str]
+     """Email sender (from field)."""
+     email_to: NotRequired[str]
+     """Email recipient (to field)."""
+     email_cc: NotRequired[str]
+     """Email carbon copy recipients."""
+     email_bcc: NotRequired[str]
+     """Email blind carbon copy recipients."""
+     date: NotRequired[str]
+     """Email date or document date."""
+     attachments: NotRequired[list[str]]
+     """List of attachment names."""
+
+     # Additional metadata fields for various extractors
+     content: NotRequired[str]
+     """Content metadata field."""
+     parse_error: NotRequired[str]
+     """Parse error information."""
+     warning: NotRequired[str]
+     """Warning messages."""
+
+     # Table extraction metadata
+     table_count: NotRequired[int]
+     """Number of tables extracted from the document."""
+     tables_summary: NotRequired[str]
+     """Summary of table extraction results."""
+     quality_score: NotRequired[float]
+     """Quality score for extracted content (0.0-1.0)."""
+
+
+ # Cache valid metadata keys at module level for performance
+ _VALID_METADATA_KEYS = {
+     "authors",
+     "categories",
+     "citations",
+     "comments",
+     "content",
+     "copyright",
+     "created_at",
+     "created_by",
+     "description",
+     "fonts",
+     "height",
+     "identifier",
+     "keywords",
+     "languages",
+     "license",
+     "modified_at",
+     "modified_by",
+     "organization",
+     "parse_error",
+     "publisher",
+     "references",
+     "status",
+     "subject",
+     "subtitle",
+     "summary",
+     "title",
+     "version",
+     "warning",
+     "width",
+     "email_from",
+     "email_to",
+     "email_cc",
+     "email_bcc",
+     "date",
+     "attachments",
+     "table_count",
+     "tables_summary",
+     "quality_score",
+ }
+
+
+ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
+     """Normalize any dict to proper Metadata TypedDict.
+
+     Filters out invalid keys and ensures type safety.
+     """
+     if not data:
+         return {}
+
+     # Filter and return only valid metadata
+     normalized: Metadata = {}
+     for key, value in data.items():
+         if key in _VALID_METADATA_KEYS and value is not None:
+             normalized[key] = value  # type: ignore[literal-required]
+
+     return normalized
+
+
+ @dataclass(frozen=True)
+ class Entity:
+     """Represents an extracted entity with type, text, and position."""
+
+     type: str
+     """e.g., PERSON, ORGANIZATION, LOCATION, DATE, EMAIL, PHONE, or custom"""
+     text: str
+     """Extracted text"""
+     start: int
+     """Start character offset in the content"""
+     end: int
+     """End character offset in the content"""
+

  @dataclass
  class ExtractionResult:
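A minimal sketch of the new helpers, assuming the 3.8.1 module layout shown above; normalize_metadata drops unknown keys and None values:

    from kreuzberg._types import Entity, normalize_metadata

    raw = {"title": "Report", "x-vendor-junk": 1, "quality_score": 0.92, "subject": None}
    meta = normalize_metadata(raw)
    # {'title': 'Report', 'quality_score': 0.92}

    entity = Entity(type="EMAIL", text="jane@example.com", start=10, end=26)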
@@ -113,10 +219,59 @@ class ExtractionResult:
      """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
      chunks: list[str] = field(default_factory=list)
      """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
+     entities: list[Entity] | None = None
+     """Extracted entities, if entity extraction is enabled."""
+     keywords: list[tuple[str, float]] | None = None
+     """Extracted keywords and their scores, if keyword extraction is enabled."""
+     detected_languages: list[str] | None = None
+     """Languages detected in the extracted content, if language detection is enabled."""
+
+     def to_dict(self) -> dict[str, Any]:
+         """Converts the ExtractionResult to a dictionary."""
+         return asdict(self)
+
+     def export_tables_to_csv(self) -> list[str]:
+         """Export all tables to CSV format.
+
+         Returns:
+             List of CSV strings, one per table
+         """
+         if not self.tables:
+             return []
+
+         from kreuzberg._utils._table import export_table_to_csv
+
+         return [export_table_to_csv(table) for table in self.tables]
+
+     def export_tables_to_tsv(self) -> list[str]:
+         """Export all tables to TSV format.
+
+         Returns:
+             List of TSV strings, one per table
+         """
+         if not self.tables:
+             return []
+
+         from kreuzberg._utils._table import export_table_to_tsv
+
+         return [export_table_to_tsv(table) for table in self.tables]
+
+     def get_table_summaries(self) -> list[dict[str, Any]]:
+         """Get structural information for all tables.
+
+         Returns:
+             List of table structure dictionaries
+         """
+         if not self.tables:
+             return []
+
+         from kreuzberg._utils._table import extract_table_structure_info
+
+         return [extract_table_structure_info(table) for table in self.tables]


- PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
- ValidationHook = Callable[[ExtractionResult], Union[None, Awaitable[None]]]
+ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[ExtractionResult]]
+ ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]


  @dataclass(unsafe_hash=True)
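Both hook aliases now use PEP 604 unions and accept sync or async callables. A minimal sketch of a synchronous post-processing hook plus the new table exporters, assuming the top-level ExtractionResult re-export:

    from kreuzberg import ExtractionResult

    def strip_whitespace(result: ExtractionResult) -> ExtractionResult:
        result.content = result.content.strip()
        return result

    # After an extraction call:
    # csv_per_table = result.export_tables_to_csv()
    # summaries = result.get_table_summaries()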
@@ -153,8 +308,30 @@ class ExtractionConfig:
      """Post processing hooks to call after processing is done and before the final result is returned."""
      validators: list[ValidationHook] | None = None
      """Validation hooks to call after processing is done and before post-processing and result return."""
+     extract_entities: bool = False
+     """Whether to extract named entities from the content."""
+     extract_keywords: bool = False
+     """Whether to extract keywords from the content."""
+     keyword_count: int = 10
+     """Number of keywords to extract if extract_keywords is True."""
+     custom_entity_patterns: frozenset[tuple[str, str]] | None = None
+     """Custom entity patterns as a frozenset of (entity_type, regex_pattern) tuples."""
+     auto_detect_language: bool = False
+     """Whether to automatically detect language and configure OCR accordingly."""
+     language_detection_config: LanguageDetectionConfig | None = None
+     """Configuration for language detection. If None, uses default settings."""
+     spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
+     """Configuration for spaCy entity extraction. If None, uses default settings."""
+     enable_quality_processing: bool = True
+     """Whether to apply quality post-processing to improve extraction results."""

      def __post_init__(self) -> None:
+         if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
+             object.__setattr__(self, "custom_entity_patterns", frozenset(self.custom_entity_patterns.items()))
+         if self.post_processing_hooks is not None and isinstance(self.post_processing_hooks, list):
+             object.__setattr__(self, "post_processing_hooks", tuple(self.post_processing_hooks))
+         if self.validators is not None and isinstance(self.validators, list):
+             object.__setattr__(self, "validators", tuple(self.validators))
          from kreuzberg._ocr._easyocr import EasyOCRConfig
          from kreuzberg._ocr._paddleocr import PaddleOCRConfig
          from kreuzberg._ocr._tesseract import TesseractConfig
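A sketch of the expanded 3.8.1 config surface, assuming the top-level ExtractionConfig re-export; note that __post_init__ above coerces a plain dict of patterns into the hashable frozenset form:

    from kreuzberg import ExtractionConfig

    config = ExtractionConfig(
        extract_entities=True,
        extract_keywords=True,
        keyword_count=5,
        custom_entity_patterns={"INVOICE_ID": r"INV-\d{6}"},  # coerced to frozenset of pairs
        auto_detect_language=True,
    )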
kreuzberg/_utils/_cache.py CHANGED
@@ -64,8 +64,23 @@ class KreuzbergCache(Generic[T]):
          Returns:
              Unique cache key string
          """
-         # Sort for consistent hashing  # ~keep
-         cache_str = str(sorted(kwargs.items()))
+         # Use more efficient string building for cache key
+         if not kwargs:
+             return "empty"
+
+         # Build key string efficiently
+         parts = []
+         for key in sorted(kwargs):
+             value = kwargs[key]
+             # Convert common types efficiently
+             if isinstance(value, (str, int, float, bool)):
+                 parts.append(f"{key}={value}")
+             elif isinstance(value, bytes):
+                 parts.append(f"{key}=bytes:{len(value)}")
+             else:
+                 parts.append(f"{key}={type(value).__name__}:{value!s}")
+
+         cache_str = "&".join(parts)
          return hashlib.sha256(cache_str.encode()).hexdigest()[:16]

      def _get_cache_path(self, cache_key: str) -> Path:
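The key scheme above is reproducible outside the class: sorted "key=value" parts joined with "&", SHA-256 hashed, truncated to 16 hex characters. A sketch with hypothetical kwargs:

    import hashlib

    parts = ["mime_type=application/pdf", "ocr_backend=tesseract"]  # already sorted
    cache_key = hashlib.sha256("&".join(parts).encode()).hexdigest()[:16]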
@@ -87,15 +102,48 @@ class KreuzbergCache(Generic[T]):

      def _serialize_result(self, result: T) -> dict[str, Any]:
          """Serialize result for caching with metadata."""
+         # Handle TableData objects that contain DataFrames
+         if isinstance(result, list) and result and isinstance(result[0], dict) and "df" in result[0]:
+             serialized_data = []
+             for item in result:
+                 if isinstance(item, dict) and "df" in item:
+                     # Create a copy and serialize the DataFrame as CSV
+                     item_copy = item.copy()
+                     if hasattr(item["df"], "to_csv"):
+                         item_copy["df_csv"] = item["df"].to_csv(index=False)
+                     else:
+                         # Fallback for non-DataFrame objects
+                         item_copy["df_csv"] = str(item["df"])
+                     del item_copy["df"]
+                     serialized_data.append(item_copy)
+                 else:
+                     serialized_data.append(item)
+             return {"type": "TableDataList", "data": serialized_data, "cached_at": time.time()}
+
          return {"type": type(result).__name__, "data": result, "cached_at": time.time()}

      def _deserialize_result(self, cached_data: dict[str, Any]) -> T:
          """Deserialize cached result."""
          data = cached_data["data"]

-         if cached_data.get("type") == "ExtractionResult" and isinstance(data, dict):
-             from kreuzberg._types import ExtractionResult
+         if cached_data.get("type") == "TableDataList" and isinstance(data, list):
+             deserialized_data = []
+             for item in data:
+                 if isinstance(item, dict) and "df_csv" in item:
+                     # Restore the DataFrame from CSV
+                     item_copy = item.copy()
+                     from io import StringIO
+
+                     import pandas as pd

+                     item_copy["df"] = pd.read_csv(StringIO(item["df_csv"]))
+                     del item_copy["df_csv"]
+                     deserialized_data.append(item_copy)
+                 else:
+                     deserialized_data.append(item)
+             return deserialized_data  # type: ignore[return-value]
+
+         if cached_data.get("type") == "ExtractionResult" and isinstance(data, dict):
              return ExtractionResult(**data)  # type: ignore[return-value]

          return data  # type: ignore[no-any-return]
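The DataFrame round-trip this relies on is plain pandas; a minimal standalone sketch:

    from io import StringIO

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": ["x", "y"]})
    csv_text = df.to_csv(index=False)            # stored under "df_csv"
    restored = pd.read_csv(StringIO(csv_text))   # rebuilt under "df" on read
    assert restored.equals(df)

Dtypes that CSV cannot represent faithfully (datetimes, categoricals) come back as inferred plain types, a trade-off the CSV encoding accepts for a portable on-disk cache.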
kreuzberg/_utils/_device.py CHANGED
@@ -153,7 +153,7 @@ def _is_cuda_available() -> bool:
      try:
          import torch  # type: ignore[import-not-found,unused-ignore]

-         return torch.cuda.is_available()
+         return bool(torch.cuda.is_available())
      except ImportError:
          return False

@@ -163,7 +163,7 @@ def _is_mps_available() -> bool:
      try:
          import torch  # type: ignore[import-not-found,unused-ignore]

-         return torch.backends.mps.is_available()
+         return bool(torch.backends.mps.is_available())
      except ImportError:
          return False

kreuzberg/_utils/_errors.py CHANGED
@@ -5,12 +5,12 @@ from __future__ import annotations
  import platform
  import traceback
  from datetime import datetime, timezone
- from typing import TYPE_CHECKING, Any
+ from pathlib import Path
+ from typing import Any

  import psutil

- if TYPE_CHECKING:
-     from pathlib import Path
+ from kreuzberg.exceptions import ValidationError


  def create_error_context(
@@ -37,8 +37,6 @@ def create_error_context(
      }

      if file_path:
-         from pathlib import Path
-
          path = Path(file_path) if isinstance(file_path, str) else file_path
          context["file"] = {
              "path": str(path),
@@ -158,8 +156,6 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
      if attempt >= max_attempts:
          return False

-     from kreuzberg.exceptions import ValidationError
-
      if isinstance(error, ValidationError):
          return False

kreuzberg/_utils/_process_pool.py CHANGED
@@ -2,13 +2,20 @@

  from __future__ import annotations

+ import io
  import multiprocessing as mp
  from concurrent.futures import ProcessPoolExecutor
  from contextlib import contextmanager
- from typing import TYPE_CHECKING, Any, Callable, TypeVar
+ from typing import TYPE_CHECKING, Any, TypeVar
+
+ import anyio
+ import psutil
+ import pypdfium2
+ from typing_extensions import Self

  if TYPE_CHECKING:
-     from collections.abc import Generator
+     import types
+     from collections.abc import Callable, Generator

  T = TypeVar("T")

@@ -54,15 +61,13 @@ def shutdown_process_pool() -> None:

  def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
      """Worker function for extracting PDF text in a separate process."""
-     import pypdfium2
-
      pdf = None
      try:
          pdf = pypdfium2.PdfDocument(pdf_path)
          text_parts = []
          for page in pdf:
              text_page = page.get_textpage()
-             text = text_page.get_text_range()
+             text = text_page.get_text_bounded()
              text_parts.append(text)
              text_page.close()
              page.close()
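The one-line change in this hunk appears to track pypdfium2's text API: get_text_bounded() called with no bounds returns the whole page's text, and recent pypdfium2 releases steer whole-page callers toward it rather than get_text_range(). A minimal sketch of the same loop outside the worker (file name hypothetical):

    import pypdfium2

    pdf = pypdfium2.PdfDocument("sample.pdf")  # hypothetical input
    try:
        page = pdf[0]
        text_page = page.get_textpage()
        full_text = text_page.get_text_bounded()  # no bounds -> whole page
        text_page.close()
        page.close()
    finally:
        pdf.close()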
@@ -76,10 +81,6 @@ def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:

  def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
      """Worker function for converting PDF to images in a separate process."""
-     import io
-
-     import pypdfium2
-
      pdf = None
      try:
          pdf = pypdfium2.PdfDocument(pdf_path)
@@ -98,3 +99,175 @@ def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str,
      finally:
          if pdf:
              pdf.close()
+
+
+ class ProcessPoolManager:
+     """Resource-aware process pool manager for CPU-intensive tasks."""
+
+     def __init__(
+         self,
+         max_processes: int | None = None,
+         memory_limit_gb: float | None = None,
+     ) -> None:
+         """Initialize the process pool manager.
+
+         Args:
+             max_processes: Maximum number of processes. Defaults to CPU count.
+             memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
+         """
+         self.max_processes = max_processes or mp.cpu_count()
+
+         if memory_limit_gb is None:
+             available_memory = psutil.virtual_memory().available
+             self.memory_limit_bytes = int(available_memory * 0.75)  # Use 75% of available  # ~keep
+         else:
+             self.memory_limit_bytes = int(memory_limit_gb * 1024**3)
+
+         self._executor: ProcessPoolExecutor | None = None
+         self._active_tasks = 0
+
+     def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
+         """Calculate optimal number of workers based on memory constraints.
+
+         Args:
+             task_memory_mb: Estimated memory usage per task in MB.
+
+         Returns:
+             Optimal number of workers.
+         """
+         task_memory_bytes = task_memory_mb * 1024**2
+         memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
+
+         return min(self.max_processes, memory_based_limit)
+
+     def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
+         """Ensure process pool executor is initialized."""
+         if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
+             if self._executor is not None:
+                 self._executor.shutdown(wait=False)
+
+             workers = max_workers or self.max_processes
+             self._executor = ProcessPoolExecutor(max_workers=workers)
+
+         return self._executor
+
+     async def submit_task(
+         self,
+         func: Callable[..., T],
+         *args: Any,
+         task_memory_mb: float = 100,
+     ) -> T:
+         """Submit a task to the process pool.
+
+         Args:
+             func: Function to execute.
+             *args: Positional arguments for the function.
+             task_memory_mb: Estimated memory usage in MB.
+
+         Returns:
+             Result of the function execution.
+         """
+         workers = self.get_optimal_workers(task_memory_mb)
+         self._ensure_executor(workers)
+
+         self._active_tasks += 1
+
+         try:
+             return await anyio.to_thread.run_sync(func, *args)
+         finally:
+             self._active_tasks -= 1
+
+     async def submit_batch(
+         self,
+         func: Callable[..., T],
+         arg_batches: list[tuple[Any, ...]],
+         task_memory_mb: float = 100,
+         max_concurrent: int | None = None,
+     ) -> list[T]:
+         """Submit a batch of tasks to the process pool.
+
+         Args:
+             func: Function to execute.
+             arg_batches: List of argument tuples for each task.
+             task_memory_mb: Estimated memory usage per task in MB.
+             max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
+
+         Returns:
+             List of results in the same order as input.
+         """
+         if not arg_batches:
+             return []
+
+         workers = self.get_optimal_workers(task_memory_mb)
+         max_concurrent = max_concurrent or workers
+
+         self._ensure_executor(workers)
+
+         semaphore = anyio.CapacityLimiter(max_concurrent)
+
+         async def submit_single(args: tuple[Any, ...]) -> T:
+             async with semaphore:
+                 self._active_tasks += 1
+                 try:
+                     return await anyio.to_thread.run_sync(func, *args)
+                 finally:
+                     self._active_tasks -= 1
+
+         async with anyio.create_task_group() as tg:
+             results: list[T] = [None] * len(arg_batches)  # type: ignore[list-item]
+
+             async def run_task(idx: int, args: tuple[Any, ...]) -> None:
+                 results[idx] = await submit_single(args)
+
+             for idx, args in enumerate(arg_batches):
+                 tg.start_soon(run_task, idx, args)
+
+         return results
+
+     def get_system_info(self) -> dict[str, Any]:
+         """Get current system resource information."""
+         memory = psutil.virtual_memory()
+         cpu_percent = psutil.cpu_percent(interval=1)
+
+         return {
+             "cpu_count": mp.cpu_count(),
+             "cpu_percent": cpu_percent,
+             "memory_total": memory.total,
+             "memory_available": memory.available,
+             "memory_percent": memory.percent,
+             "active_tasks": self._active_tasks,
+             "max_processes": self.max_processes,
+             "memory_limit": self.memory_limit_bytes,
+         }
+
+     def shutdown(self, wait: bool = True) -> None:
+         """Shutdown the process pool."""
+         if self._executor is not None:
+             self._executor.shutdown(wait=wait)
+             self._executor = None
+
+     def __enter__(self) -> Self:
+         """Context manager entry."""
+         return self
+
+     def __exit__(
+         self,
+         exc_type: type[BaseException] | None,
+         exc_val: BaseException | None,
+         exc_tb: types.TracebackType | None,
+     ) -> None:
+         """Context manager exit."""
+         self.shutdown()
+
+     async def __aenter__(self) -> Self:
+         """Async context manager entry."""
+         return self
+
+     async def __aexit__(
+         self,
+         exc_type: type[BaseException] | None,
+         exc_val: BaseException | None,
+         exc_tb: types.TracebackType | None,
+     ) -> None:
+         """Async context manager exit."""
+         self.shutdown()
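A usage sketch for the new manager, assuming the module path above; per its signature, _extract_pdf_text_worker returns a (path, text) tuple, so batch results unpack directly (file names hypothetical):

    import anyio

    from kreuzberg._utils._process_pool import ProcessPoolManager, _extract_pdf_text_worker

    async def main() -> None:
        async with ProcessPoolManager(memory_limit_gb=2.0) as pool:
            batches = [("a.pdf",), ("b.pdf",)]  # one args tuple per task
            results = await pool.submit_batch(_extract_pdf_text_worker, batches, task_memory_mb=200)
            for path, text in results:
                print(path, len(text))

    anyio.run(main)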