additory 0.1.0a3__py3-none-any.whl → 0.1.1a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- additory/__init__.py +58 -14
- additory/common/__init__.py +31 -147
- additory/common/column_selector.py +255 -0
- additory/common/distributions.py +286 -613
- additory/common/extractors.py +313 -0
- additory/common/knn_imputation.py +332 -0
- additory/common/result.py +380 -0
- additory/common/strategy_parser.py +243 -0
- additory/common/unit_conversions.py +338 -0
- additory/common/validation.py +283 -103
- additory/core/__init__.py +34 -22
- additory/core/backend.py +258 -0
- additory/core/config.py +177 -305
- additory/core/logging.py +230 -24
- additory/core/memory_manager.py +157 -495
- additory/expressions/__init__.py +2 -23
- additory/expressions/compiler.py +457 -0
- additory/expressions/engine.py +264 -487
- additory/expressions/integrity.py +179 -0
- additory/expressions/loader.py +263 -0
- additory/expressions/parser.py +363 -167
- additory/expressions/resolver.py +274 -0
- additory/functions/__init__.py +1 -0
- additory/functions/analyze/__init__.py +144 -0
- additory/functions/analyze/cardinality.py +58 -0
- additory/functions/analyze/correlations.py +66 -0
- additory/functions/analyze/distributions.py +53 -0
- additory/functions/analyze/duplicates.py +49 -0
- additory/functions/analyze/features.py +61 -0
- additory/functions/analyze/imputation.py +66 -0
- additory/functions/analyze/outliers.py +65 -0
- additory/functions/analyze/patterns.py +65 -0
- additory/functions/analyze/presets.py +72 -0
- additory/functions/analyze/quality.py +59 -0
- additory/functions/analyze/timeseries.py +53 -0
- additory/functions/analyze/types.py +45 -0
- additory/functions/expressions/__init__.py +161 -0
- additory/functions/snapshot/__init__.py +82 -0
- additory/functions/snapshot/filter.py +119 -0
- additory/functions/synthetic/__init__.py +113 -0
- additory/functions/synthetic/mode_detector.py +47 -0
- additory/functions/synthetic/strategies/__init__.py +1 -0
- additory/functions/synthetic/strategies/advanced.py +35 -0
- additory/functions/synthetic/strategies/augmentative.py +160 -0
- additory/functions/synthetic/strategies/generative.py +168 -0
- additory/functions/synthetic/strategies/presets.py +116 -0
- additory/functions/to/__init__.py +188 -0
- additory/functions/to/lookup.py +351 -0
- additory/functions/to/merge.py +189 -0
- additory/functions/to/sort.py +91 -0
- additory/functions/to/summarize.py +170 -0
- additory/functions/transform/__init__.py +140 -0
- additory/functions/transform/datetime.py +79 -0
- additory/functions/transform/extract.py +85 -0
- additory/functions/transform/harmonize.py +105 -0
- additory/functions/transform/knn.py +62 -0
- additory/functions/transform/onehotencoding.py +68 -0
- additory/functions/transform/transpose.py +42 -0
- additory-0.1.1a1.dist-info/METADATA +83 -0
- additory-0.1.1a1.dist-info/RECORD +62 -0
- additory/analysis/__init__.py +0 -48
- additory/analysis/cardinality.py +0 -126
- additory/analysis/correlations.py +0 -124
- additory/analysis/distributions.py +0 -376
- additory/analysis/quality.py +0 -158
- additory/analysis/scan.py +0 -400
- additory/common/backend.py +0 -371
- additory/common/column_utils.py +0 -191
- additory/common/exceptions.py +0 -62
- additory/common/lists.py +0 -229
- additory/common/patterns.py +0 -240
- additory/common/resolver.py +0 -567
- additory/common/sample_data.py +0 -182
- additory/core/ast_builder.py +0 -165
- additory/core/backends/__init__.py +0 -23
- additory/core/backends/arrow_bridge.py +0 -483
- additory/core/backends/cudf_bridge.py +0 -355
- additory/core/column_positioning.py +0 -358
- additory/core/compiler_polars.py +0 -166
- additory/core/enhanced_cache_manager.py +0 -1119
- additory/core/enhanced_matchers.py +0 -473
- additory/core/enhanced_version_manager.py +0 -325
- additory/core/executor.py +0 -59
- additory/core/integrity_manager.py +0 -477
- additory/core/loader.py +0 -190
- additory/core/namespace_manager.py +0 -657
- additory/core/parser.py +0 -176
- additory/core/polars_expression_engine.py +0 -601
- additory/core/registry.py +0 -176
- additory/core/sample_data_manager.py +0 -492
- additory/core/user_namespace.py +0 -751
- additory/core/validator.py +0 -27
- additory/dynamic_api.py +0 -304
- additory/expressions/proxy.py +0 -549
- additory/expressions/registry.py +0 -313
- additory/expressions/samples.py +0 -492
- additory/synthetic/__init__.py +0 -13
- additory/synthetic/column_name_resolver.py +0 -149
- additory/synthetic/distributions.py +0 -22
- additory/synthetic/forecast.py +0 -1132
- additory/synthetic/linked_list_parser.py +0 -415
- additory/synthetic/namespace_lookup.py +0 -129
- additory/synthetic/smote.py +0 -320
- additory/synthetic/strategies.py +0 -850
- additory/synthetic/synthesizer.py +0 -713
- additory/utilities/__init__.py +0 -53
- additory/utilities/encoding.py +0 -600
- additory/utilities/games.py +0 -300
- additory/utilities/keys.py +0 -8
- additory/utilities/lookup.py +0 -103
- additory/utilities/matchers.py +0 -216
- additory/utilities/resolvers.py +0 -286
- additory/utilities/settings.py +0 -167
- additory/utilities/units.py +0 -749
- additory/utilities/validators.py +0 -153
- additory-0.1.0a3.dist-info/METADATA +0 -288
- additory-0.1.0a3.dist-info/RECORD +0 -71
- additory-0.1.0a3.dist-info/licenses/LICENSE +0 -21
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/WHEEL +0 -0
- {additory-0.1.0a3.dist-info → additory-0.1.1a1.dist-info}/top_level.txt +0 -0
additory/core/memory_manager.py
CHANGED
|
@@ -1,547 +1,209 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
1
|
+
"""
|
|
2
|
+
Memory management for Additory operations.
|
|
3
3
|
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
from typing import Dict, List, Optional, Callable, Any
|
|
10
|
-
from dataclasses import dataclass, field
|
|
11
|
-
from datetime import datetime, timedelta
|
|
12
|
-
from contextlib import contextmanager
|
|
13
|
-
|
|
14
|
-
from .logging import log_info, log_warning
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
@dataclass
|
|
18
|
-
class MemorySnapshot:
|
|
19
|
-
"""Memory usage snapshot at a point in time"""
|
|
20
|
-
timestamp: datetime
|
|
21
|
-
process_memory_mb: float
|
|
22
|
-
arrow_allocated_bytes: int
|
|
23
|
-
python_objects_count: int
|
|
24
|
-
gc_collections: Dict[int, int]
|
|
25
|
-
custom_metrics: Dict[str, Any] = field(default_factory=dict)
|
|
4
|
+
Provides automatic cleanup of intermediate objects to free memory:
|
|
5
|
+
- Tracks temporary DataFrames
|
|
6
|
+
- Cleans up after operations
|
|
7
|
+
- Can be enabled/disabled for debugging
|
|
8
|
+
"""
|
|
26
9
|
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
process_memory_mb: float = 500.0 # Process memory threshold
|
|
32
|
-
arrow_memory_bytes: int = 100 * 1024 * 1024 # 100MB Arrow memory
|
|
33
|
-
python_objects_count: int = 100000 # Python object count
|
|
34
|
-
cleanup_interval_seconds: float = 30.0 # Periodic cleanup interval
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
class MemoryLeakDetector:
|
|
38
|
-
"""Detects potential memory leaks by tracking memory growth patterns"""
|
|
39
|
-
|
|
40
|
-
def __init__(self, window_size: int = 10, growth_threshold: float = 1.5):
|
|
41
|
-
self.window_size = window_size
|
|
42
|
-
self.growth_threshold = growth_threshold
|
|
43
|
-
self.snapshots: List[MemorySnapshot] = []
|
|
44
|
-
self.leak_warnings = 0
|
|
45
|
-
|
|
46
|
-
def add_snapshot(self, snapshot: MemorySnapshot):
|
|
47
|
-
"""Add a memory snapshot for leak detection"""
|
|
48
|
-
self.snapshots.append(snapshot)
|
|
49
|
-
|
|
50
|
-
# Keep only the last window_size snapshots
|
|
51
|
-
if len(self.snapshots) > self.window_size:
|
|
52
|
-
self.snapshots.pop(0)
|
|
53
|
-
|
|
54
|
-
# Check for potential leaks
|
|
55
|
-
if len(self.snapshots) >= self.window_size:
|
|
56
|
-
self._check_for_leaks()
|
|
57
|
-
|
|
58
|
-
def _check_for_leaks(self):
|
|
59
|
-
"""Check if memory usage shows signs of leaking"""
|
|
60
|
-
if len(self.snapshots) < 2:
|
|
61
|
-
return
|
|
62
|
-
|
|
63
|
-
# Calculate memory growth rate
|
|
64
|
-
first_snapshot = self.snapshots[0]
|
|
65
|
-
last_snapshot = self.snapshots[-1]
|
|
66
|
-
|
|
67
|
-
memory_growth = last_snapshot.process_memory_mb / max(first_snapshot.process_memory_mb, 1.0) # Avoid division by zero
|
|
68
|
-
time_span = (last_snapshot.timestamp - first_snapshot.timestamp).total_seconds()
|
|
69
|
-
|
|
70
|
-
# Check for sustained memory growth (relaxed time threshold for testing)
|
|
71
|
-
if memory_growth > self.growth_threshold and time_span > 10: # 10 seconds minimum
|
|
72
|
-
self.leak_warnings += 1
|
|
73
|
-
log_warning(f"[memory_manager] Potential memory leak detected: "
|
|
74
|
-
f"{memory_growth:.2f}x growth over {time_span:.1f}s")
|
|
75
|
-
|
|
76
|
-
def get_leak_status(self) -> Dict[str, Any]:
|
|
77
|
-
"""Get current leak detection status"""
|
|
78
|
-
return {
|
|
79
|
-
"leak_warnings": self.leak_warnings,
|
|
80
|
-
"snapshots_count": len(self.snapshots),
|
|
81
|
-
"monitoring_window": self.window_size,
|
|
82
|
-
"growth_threshold": self.growth_threshold
|
|
83
|
-
}
|
|
10
|
+
import gc
|
|
11
|
+
import sys
|
|
12
|
+
from typing import Any, Callable, List, Optional, Tuple
|
|
13
|
+
from functools import wraps
|
|
84
14
|
|
|
85
15
|
|
|
86
16
|
class MemoryManager:
|
|
87
|
-
"""
|
|
17
|
+
"""
|
|
18
|
+
Manages memory cleanup after operations.
|
|
19
|
+
|
|
20
|
+
Tracks intermediate objects created during processing and
|
|
21
|
+
cleans them up automatically to free memory.
|
|
22
|
+
"""
|
|
88
23
|
|
|
89
24
|
def __init__(self):
|
|
90
|
-
|
|
91
|
-
self.
|
|
92
|
-
self.
|
|
93
|
-
self.auto_cleanup_enabled = True
|
|
94
|
-
|
|
95
|
-
# State tracking
|
|
96
|
-
self.snapshots: List[MemorySnapshot] = []
|
|
97
|
-
self.cleanup_callbacks: List[Callable[[], None]] = []
|
|
98
|
-
self.leak_detector = MemoryLeakDetector()
|
|
99
|
-
|
|
100
|
-
# Statistics
|
|
101
|
-
self.stats = {
|
|
102
|
-
"total_cleanups": 0,
|
|
103
|
-
"forced_cleanups": 0,
|
|
104
|
-
"auto_cleanups": 0,
|
|
105
|
-
"memory_freed_mb": 0.0,
|
|
106
|
-
"last_cleanup": None
|
|
107
|
-
}
|
|
108
|
-
|
|
109
|
-
# Background monitoring
|
|
110
|
-
self._monitoring_thread = None
|
|
111
|
-
self._stop_monitoring = threading.Event()
|
|
112
|
-
|
|
113
|
-
# Process reference for memory monitoring
|
|
114
|
-
try:
|
|
115
|
-
self.process = psutil.Process(os.getpid())
|
|
116
|
-
except Exception as e:
|
|
117
|
-
log_warning(f"[memory_manager] Failed to initialize process monitor: {e}")
|
|
118
|
-
self.process = None
|
|
119
|
-
|
|
120
|
-
log_info("[memory_manager] Memory Manager initialized")
|
|
25
|
+
"""Initialize memory manager."""
|
|
26
|
+
self.tracked_objects: List[Tuple[Any, Optional[str]]] = []
|
|
27
|
+
self.cleanup_enabled: bool = True
|
|
121
28
|
|
|
122
|
-
def
|
|
123
|
-
"""
|
|
124
|
-
|
|
125
|
-
log_warning("[memory_manager] Monitoring already running")
|
|
126
|
-
return
|
|
127
|
-
|
|
128
|
-
self.thresholds.cleanup_interval_seconds = interval_seconds
|
|
129
|
-
self._stop_monitoring.clear()
|
|
130
|
-
|
|
131
|
-
self._monitoring_thread = threading.Thread(
|
|
132
|
-
target=self._monitoring_loop,
|
|
133
|
-
daemon=True,
|
|
134
|
-
name="MemoryMonitor"
|
|
135
|
-
)
|
|
136
|
-
self._monitoring_thread.start()
|
|
29
|
+
def track(self, obj: Any, name: Optional[str] = None) -> None:
|
|
30
|
+
"""
|
|
31
|
+
Track object for cleanup.
|
|
137
32
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
"""Stop background memory monitoring"""
|
|
142
|
-
if self._monitoring_thread and self._monitoring_thread.is_alive():
|
|
143
|
-
self._stop_monitoring.set()
|
|
144
|
-
self._monitoring_thread.join(timeout=5.0)
|
|
145
|
-
log_info("[memory_manager] Stopped background monitoring")
|
|
146
|
-
|
|
147
|
-
def _monitoring_loop(self):
|
|
148
|
-
"""Background monitoring loop"""
|
|
149
|
-
while not self._stop_monitoring.is_set():
|
|
150
|
-
try:
|
|
151
|
-
if self.monitoring_enabled:
|
|
152
|
-
# Take memory snapshot
|
|
153
|
-
snapshot = self.take_snapshot()
|
|
154
|
-
|
|
155
|
-
# Check for cleanup needs
|
|
156
|
-
if self.auto_cleanup_enabled and self._should_cleanup(snapshot):
|
|
157
|
-
self.cleanup_if_needed()
|
|
158
|
-
|
|
159
|
-
# Wait for next interval
|
|
160
|
-
self._stop_monitoring.wait(self.thresholds.cleanup_interval_seconds)
|
|
161
|
-
|
|
162
|
-
except Exception as e:
|
|
163
|
-
log_warning(f"[memory_manager] Monitoring loop error: {e}")
|
|
164
|
-
time.sleep(5) # Brief pause before retrying
|
|
165
|
-
|
|
166
|
-
def register_cleanup_callback(self, callback: Callable[[], None]):
|
|
167
|
-
"""Register a callback function for memory cleanup"""
|
|
168
|
-
self.cleanup_callbacks.append(callback)
|
|
169
|
-
log_info(f"[memory_manager] Registered cleanup callback: {callback.__name__}")
|
|
170
|
-
|
|
171
|
-
def unregister_cleanup_callback(self, callback: Callable[[], None]):
|
|
172
|
-
"""Unregister a cleanup callback"""
|
|
173
|
-
if callback in self.cleanup_callbacks:
|
|
174
|
-
self.cleanup_callbacks.remove(callback)
|
|
175
|
-
log_info(f"[memory_manager] Unregistered cleanup callback: {callback.__name__}")
|
|
176
|
-
|
|
177
|
-
def take_snapshot(self) -> MemorySnapshot:
|
|
178
|
-
"""Take a snapshot of current memory usage"""
|
|
179
|
-
try:
|
|
180
|
-
# Process memory
|
|
181
|
-
process_memory_mb = 0.0
|
|
182
|
-
if self.process:
|
|
183
|
-
memory_info = self.process.memory_info()
|
|
184
|
-
process_memory_mb = memory_info.rss / 1024 / 1024
|
|
185
|
-
|
|
186
|
-
# Arrow memory (if available)
|
|
187
|
-
arrow_allocated_bytes = 0
|
|
188
|
-
try:
|
|
189
|
-
import pyarrow as pa
|
|
190
|
-
arrow_allocated_bytes = pa.default_memory_pool().bytes_allocated()
|
|
191
|
-
except ImportError:
|
|
192
|
-
pass
|
|
193
|
-
|
|
194
|
-
# Python objects
|
|
195
|
-
python_objects_count = len(gc.get_objects())
|
|
196
|
-
|
|
197
|
-
# GC statistics
|
|
198
|
-
gc_collections = {i: gc.get_count()[i] for i in range(3)}
|
|
199
|
-
|
|
200
|
-
snapshot = MemorySnapshot(
|
|
201
|
-
timestamp=datetime.now(),
|
|
202
|
-
process_memory_mb=process_memory_mb,
|
|
203
|
-
arrow_allocated_bytes=arrow_allocated_bytes,
|
|
204
|
-
python_objects_count=python_objects_count,
|
|
205
|
-
gc_collections=gc_collections
|
|
206
|
-
)
|
|
207
|
-
|
|
208
|
-
# Store snapshot
|
|
209
|
-
self.snapshots.append(snapshot)
|
|
210
|
-
|
|
211
|
-
# Keep only recent snapshots (last 100)
|
|
212
|
-
if len(self.snapshots) > 100:
|
|
213
|
-
self.snapshots.pop(0)
|
|
214
|
-
|
|
215
|
-
# Add to leak detector
|
|
216
|
-
self.leak_detector.add_snapshot(snapshot)
|
|
33
|
+
Args:
|
|
34
|
+
obj: Object to track (typically DataFrame or Series)
|
|
35
|
+
name: Optional name for debugging
|
|
217
36
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
timestamp=datetime.now(),
|
|
224
|
-
process_memory_mb=0.0,
|
|
225
|
-
arrow_allocated_bytes=0,
|
|
226
|
-
python_objects_count=0,
|
|
227
|
-
gc_collections={}
|
|
228
|
-
)
|
|
229
|
-
|
|
230
|
-
def _should_cleanup(self, snapshot: MemorySnapshot) -> bool:
|
|
231
|
-
"""Check if cleanup is needed based on thresholds"""
|
|
232
|
-
return (
|
|
233
|
-
snapshot.process_memory_mb > self.thresholds.process_memory_mb or
|
|
234
|
-
snapshot.arrow_allocated_bytes > self.thresholds.arrow_memory_bytes or
|
|
235
|
-
snapshot.python_objects_count > self.thresholds.python_objects_count
|
|
236
|
-
)
|
|
37
|
+
Example:
|
|
38
|
+
memory_manager.track(temp_df, 'intermediate_result')
|
|
39
|
+
"""
|
|
40
|
+
if self.cleanup_enabled:
|
|
41
|
+
self.tracked_objects.append((obj, name))
|
|
237
42
|
|
|
238
|
-
def
|
|
239
|
-
"""
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
if not self._should_cleanup(snapshot):
|
|
243
|
-
return False
|
|
244
|
-
|
|
245
|
-
log_info(f"[memory_manager] Auto cleanup triggered - "
|
|
246
|
-
f"Memory: {snapshot.process_memory_mb:.1f}MB, "
|
|
247
|
-
f"Arrow: {snapshot.arrow_allocated_bytes} bytes, "
|
|
248
|
-
f"Objects: {snapshot.python_objects_count}")
|
|
249
|
-
|
|
250
|
-
memory_before = snapshot.process_memory_mb
|
|
251
|
-
self._perform_cleanup()
|
|
252
|
-
|
|
253
|
-
# Take another snapshot to measure cleanup effectiveness
|
|
254
|
-
after_snapshot = self.take_snapshot()
|
|
255
|
-
memory_freed = max(0, memory_before - after_snapshot.process_memory_mb)
|
|
256
|
-
|
|
257
|
-
self.stats["auto_cleanups"] += 1
|
|
258
|
-
self.stats["memory_freed_mb"] += memory_freed
|
|
259
|
-
self.stats["last_cleanup"] = datetime.now()
|
|
260
|
-
|
|
261
|
-
log_info(f"[memory_manager] Auto cleanup completed - "
|
|
262
|
-
f"Freed: {memory_freed:.1f}MB")
|
|
263
|
-
|
|
264
|
-
return True
|
|
265
|
-
|
|
266
|
-
def force_cleanup(self) -> Dict[str, float]:
|
|
267
|
-
"""Force immediate memory cleanup"""
|
|
268
|
-
log_info("[memory_manager] Forcing memory cleanup")
|
|
269
|
-
|
|
270
|
-
before_snapshot = self.take_snapshot()
|
|
271
|
-
memory_before = before_snapshot.process_memory_mb
|
|
272
|
-
|
|
273
|
-
self._perform_cleanup()
|
|
274
|
-
|
|
275
|
-
after_snapshot = self.take_snapshot()
|
|
276
|
-
memory_after = after_snapshot.process_memory_mb
|
|
277
|
-
memory_freed = max(0, memory_before - memory_after)
|
|
278
|
-
|
|
279
|
-
self.stats["forced_cleanups"] += 1
|
|
280
|
-
self.stats["memory_freed_mb"] += memory_freed
|
|
281
|
-
self.stats["last_cleanup"] = datetime.now()
|
|
282
|
-
|
|
283
|
-
cleanup_stats = {
|
|
284
|
-
"memory_before_mb": memory_before,
|
|
285
|
-
"memory_after_mb": memory_after,
|
|
286
|
-
"memory_freed_mb": memory_freed,
|
|
287
|
-
"arrow_freed_bytes": before_snapshot.arrow_allocated_bytes - after_snapshot.arrow_allocated_bytes
|
|
288
|
-
}
|
|
43
|
+
def cleanup(self) -> None:
|
|
44
|
+
"""
|
|
45
|
+
Clean up all tracked objects.
|
|
289
46
|
|
|
290
|
-
|
|
291
|
-
|
|
47
|
+
Deletes references to tracked objects and runs garbage collection.
|
|
48
|
+
Called automatically at the end of every operation.
|
|
292
49
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
def _perform_cleanup(self):
|
|
296
|
-
"""Perform the actual cleanup operations"""
|
|
297
|
-
# Call registered cleanup callbacks
|
|
298
|
-
for callback in self.cleanup_callbacks:
|
|
50
|
+
Example:
|
|
299
51
|
try:
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
# Additional cleanup for specific libraries
|
|
308
|
-
self._cleanup_arrow_memory()
|
|
309
|
-
self._cleanup_polars_memory()
|
|
52
|
+
result = process_data(df)
|
|
53
|
+
return result
|
|
54
|
+
finally:
|
|
55
|
+
memory_manager.cleanup()
|
|
56
|
+
"""
|
|
57
|
+
if not self.cleanup_enabled:
|
|
58
|
+
return
|
|
310
59
|
|
|
311
|
-
|
|
60
|
+
# Count objects before cleanup
|
|
61
|
+
count = len(self.tracked_objects)
|
|
312
62
|
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
allocated_before = pool.bytes_allocated()
|
|
323
|
-
|
|
324
|
-
# Trigger garbage collection to free Arrow objects
|
|
325
|
-
gc.collect()
|
|
63
|
+
if count > 0:
|
|
64
|
+
# Estimate memory before cleanup (optional, for logging)
|
|
65
|
+
total_memory = 0
|
|
66
|
+
for obj, name in self.tracked_objects:
|
|
67
|
+
try:
|
|
68
|
+
memory = estimate_memory_usage(obj)
|
|
69
|
+
total_memory += memory
|
|
70
|
+
except:
|
|
71
|
+
pass # Ignore errors in memory estimation
|
|
326
72
|
|
|
327
|
-
|
|
328
|
-
|
|
73
|
+
# Clear references
|
|
74
|
+
self.tracked_objects.clear()
|
|
329
75
|
|
|
330
|
-
|
|
331
|
-
log_info(f"[memory_manager] Arrow cleanup freed {freed} bytes")
|
|
332
|
-
|
|
333
|
-
except ImportError:
|
|
334
|
-
pass
|
|
335
|
-
except Exception as e:
|
|
336
|
-
log_warning(f"[memory_manager] Arrow cleanup failed: {e}")
|
|
337
|
-
|
|
338
|
-
def _cleanup_polars_memory(self):
|
|
339
|
-
"""Cleanup Polars-specific memory"""
|
|
340
|
-
try:
|
|
341
|
-
import polars as pl
|
|
342
|
-
# Polars cleanup is mostly handled by Rust's memory management
|
|
343
|
-
# But we can clear any cached data
|
|
76
|
+
# Force garbage collection
|
|
344
77
|
gc.collect()
|
|
345
78
|
|
|
346
|
-
|
|
347
|
-
pass
|
|
348
|
-
except Exception as e:
|
|
349
|
-
log_warning(f"[memory_manager] Polars cleanup failed: {e}")
|
|
350
|
-
|
|
351
|
-
def get_memory_stats(self) -> Dict[str, Any]:
|
|
352
|
-
"""Get comprehensive memory statistics"""
|
|
353
|
-
current_snapshot = self.take_snapshot()
|
|
354
|
-
|
|
355
|
-
# Calculate memory trends
|
|
356
|
-
memory_trend = "stable"
|
|
357
|
-
if len(self.snapshots) >= 10:
|
|
358
|
-
recent_memory = sum(s.process_memory_mb for s in self.snapshots[-5:]) / min(5, len(self.snapshots))
|
|
359
|
-
older_memory = sum(s.process_memory_mb for s in self.snapshots[-10:-5]) / max(1, min(5, len(self.snapshots) - 5))
|
|
360
|
-
|
|
361
|
-
if recent_memory > older_memory * 1.1:
|
|
362
|
-
memory_trend = "increasing"
|
|
363
|
-
elif recent_memory < older_memory * 0.9:
|
|
364
|
-
memory_trend = "decreasing"
|
|
365
|
-
elif len(self.snapshots) >= 2:
|
|
366
|
-
# Simple trend for fewer snapshots
|
|
367
|
-
if self.snapshots[-1].process_memory_mb > self.snapshots[0].process_memory_mb * 1.1:
|
|
368
|
-
memory_trend = "increasing"
|
|
369
|
-
elif self.snapshots[-1].process_memory_mb < self.snapshots[0].process_memory_mb * 0.9:
|
|
370
|
-
memory_trend = "decreasing"
|
|
371
|
-
|
|
372
|
-
return {
|
|
373
|
-
"current": {
|
|
374
|
-
"process_memory_mb": current_snapshot.process_memory_mb,
|
|
375
|
-
"arrow_allocated_bytes": current_snapshot.arrow_allocated_bytes,
|
|
376
|
-
"python_objects_count": current_snapshot.python_objects_count,
|
|
377
|
-
"gc_collections": current_snapshot.gc_collections
|
|
378
|
-
},
|
|
379
|
-
"thresholds": {
|
|
380
|
-
"process_memory_mb": self.thresholds.process_memory_mb,
|
|
381
|
-
"arrow_memory_bytes": self.thresholds.arrow_memory_bytes,
|
|
382
|
-
"python_objects_count": self.thresholds.python_objects_count
|
|
383
|
-
},
|
|
384
|
-
"trends": {
|
|
385
|
-
"memory_trend": memory_trend,
|
|
386
|
-
"snapshots_count": len(self.snapshots),
|
|
387
|
-
"monitoring_enabled": self.monitoring_enabled
|
|
388
|
-
},
|
|
389
|
-
"cleanup_stats": self.stats.copy(),
|
|
390
|
-
"leak_detection": self.leak_detector.get_leak_status()
|
|
391
|
-
}
|
|
392
|
-
|
|
393
|
-
def get_memory_usage_mb(self) -> float:
|
|
394
|
-
"""Get current memory usage in MB"""
|
|
395
|
-
if self.process:
|
|
79
|
+
# Log cleanup (if logger is available)
|
|
396
80
|
try:
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
setattr(self.thresholds, key, value)
|
|
407
|
-
log_info(f"[memory_manager] Updated threshold {key} = {value}")
|
|
408
|
-
else:
|
|
409
|
-
log_warning(f"[memory_manager] Unknown threshold: {key}")
|
|
410
|
-
|
|
411
|
-
def enable_monitoring(self):
|
|
412
|
-
"""Enable memory monitoring"""
|
|
413
|
-
self.monitoring_enabled = True
|
|
414
|
-
log_info("[memory_manager] Memory monitoring enabled")
|
|
81
|
+
from .logging import get_logger
|
|
82
|
+
logger = get_logger()
|
|
83
|
+
if total_memory > 0:
|
|
84
|
+
memory_mb = total_memory / (1024 * 1024)
|
|
85
|
+
logger.info(
|
|
86
|
+
f"Cleaned up {count} intermediate objects (~{memory_mb:.2f} MB)"
|
|
87
|
+
)
|
|
88
|
+
except:
|
|
89
|
+
pass # Ignore if logger not available
|
|
415
90
|
|
|
416
|
-
def
|
|
417
|
-
"""
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
self.auto_cleanup_enabled = True
|
|
424
|
-
log_info("[memory_manager] Auto cleanup enabled")
|
|
425
|
-
|
|
426
|
-
def disable_auto_cleanup(self):
|
|
427
|
-
"""Disable automatic cleanup"""
|
|
428
|
-
self.auto_cleanup_enabled = False
|
|
429
|
-
log_info("[memory_manager] Auto cleanup disabled")
|
|
91
|
+
def enable(self) -> None:
|
|
92
|
+
"""
|
|
93
|
+
Enable automatic cleanup.
|
|
94
|
+
|
|
95
|
+
Called by user code via add.enable_memory_cleanup()
|
|
96
|
+
"""
|
|
97
|
+
self.cleanup_enabled = True
|
|
430
98
|
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
before_snapshot = self.take_snapshot()
|
|
435
|
-
start_time = datetime.now()
|
|
99
|
+
def disable(self) -> None:
|
|
100
|
+
"""
|
|
101
|
+
Disable automatic cleanup.
|
|
436
102
|
|
|
437
|
-
|
|
438
|
-
|
|
103
|
+
Useful for debugging to inspect intermediate objects.
|
|
104
|
+
Called by user code via add.disable_memory_cleanup()
|
|
105
|
+
"""
|
|
106
|
+
self.cleanup_enabled = False
|
|
107
|
+
|
|
108
|
+
def get_tracked_count(self) -> int:
|
|
109
|
+
"""
|
|
110
|
+
Get number of tracked objects.
|
|
439
111
|
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
finally:
|
|
443
|
-
after_snapshot = self.take_snapshot()
|
|
444
|
-
duration = (datetime.now() - start_time).total_seconds()
|
|
445
|
-
memory_delta = after_snapshot.process_memory_mb - before_snapshot.process_memory_mb
|
|
112
|
+
Returns:
|
|
113
|
+
Number of tracked objects
|
|
446
114
|
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
# Auto cleanup if needed
|
|
452
|
-
if self.auto_cleanup_enabled and memory_delta > 50: # 50MB increase
|
|
453
|
-
self.cleanup_if_needed()
|
|
454
|
-
|
|
455
|
-
def reset_stats(self):
|
|
456
|
-
"""Reset memory management statistics"""
|
|
457
|
-
self.stats = {
|
|
458
|
-
"total_cleanups": 0,
|
|
459
|
-
"forced_cleanups": 0,
|
|
460
|
-
"auto_cleanups": 0,
|
|
461
|
-
"memory_freed_mb": 0.0,
|
|
462
|
-
"last_cleanup": None
|
|
463
|
-
}
|
|
464
|
-
self.leak_detector = MemoryLeakDetector()
|
|
465
|
-
log_info("[memory_manager] Statistics reset")
|
|
115
|
+
Used for testing and debugging.
|
|
116
|
+
"""
|
|
117
|
+
return len(self.tracked_objects)
|
|
466
118
|
|
|
467
|
-
def
|
|
468
|
-
"""
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
119
|
+
def clear_tracking(self) -> None:
|
|
120
|
+
"""
|
|
121
|
+
Clear tracking list without cleanup.
|
|
122
|
+
|
|
123
|
+
Called after successful cleanup or when resetting state.
|
|
124
|
+
"""
|
|
125
|
+
self.tracked_objects.clear()
|
|
473
126
|
|
|
474
127
|
|
|
475
128
|
# Global memory manager instance
|
|
476
|
-
_global_memory_manager = None
|
|
129
|
+
_global_memory_manager: Optional[MemoryManager] = None
|
|
477
130
|
|
|
478
131
|
|
|
479
132
|
def get_memory_manager() -> MemoryManager:
|
|
480
|
-
"""
|
|
133
|
+
"""
|
|
134
|
+
Get the global memory manager instance.
|
|
135
|
+
|
|
136
|
+
Returns:
|
|
137
|
+
Global MemoryManager instance
|
|
138
|
+
|
|
139
|
+
Example:
|
|
140
|
+
memory_manager = get_memory_manager()
|
|
141
|
+
memory_manager.track(temp_df)
|
|
142
|
+
"""
|
|
481
143
|
global _global_memory_manager
|
|
482
144
|
if _global_memory_manager is None:
|
|
483
145
|
_global_memory_manager = MemoryManager()
|
|
484
146
|
return _global_memory_manager
|
|
485
147
|
|
|
486
148
|
|
|
487
|
-
def
|
|
488
|
-
"""Convenience function for forcing memory cleanup"""
|
|
489
|
-
return get_memory_manager().force_cleanup()
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
def get_memory_stats():
|
|
493
|
-
"""Convenience function for getting memory statistics"""
|
|
494
|
-
return get_memory_manager().get_memory_stats()
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
def memory_context(description: str = "operation"):
|
|
498
|
-
"""Convenience function for memory monitoring context"""
|
|
499
|
-
return get_memory_manager().memory_context(description)
|
|
500
|
-
|
|
501
|
-
|
|
502
|
-
def memory_profile(description: str = None):
|
|
149
|
+
def cleanup_after_operation(func: Callable) -> Callable:
|
|
503
150
|
"""
|
|
504
|
-
Decorator
|
|
151
|
+
Decorator to ensure cleanup after operation.
|
|
152
|
+
|
|
153
|
+
Wraps a function to automatically clean up tracked objects
|
|
154
|
+
after execution, even if an exception occurs.
|
|
505
155
|
|
|
506
156
|
Args:
|
|
507
|
-
|
|
157
|
+
func: Function to wrap
|
|
158
|
+
|
|
159
|
+
Returns:
|
|
160
|
+
Wrapped function with automatic cleanup
|
|
508
161
|
|
|
509
162
|
Example:
|
|
510
|
-
@
|
|
511
|
-
def
|
|
512
|
-
|
|
513
|
-
"""
|
|
514
|
-
def decorator(func):
|
|
515
|
-
def wrapper(*args, **kwargs):
|
|
516
|
-
func_description = description or f"{func.__module__}.{func.__name__}"
|
|
517
|
-
with memory_context(func_description):
|
|
518
|
-
return func(*args, **kwargs)
|
|
519
|
-
return wrapper
|
|
520
|
-
return decorator
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
def track_memory_usage(func):
|
|
524
|
-
"""
|
|
525
|
-
Simple decorator to track memory usage of a function.
|
|
526
|
-
|
|
527
|
-
Example:
|
|
528
|
-
@track_memory_usage
|
|
529
|
-
def expensive_operation():
|
|
530
|
-
# ... do work
|
|
163
|
+
@cleanup_after_operation
|
|
164
|
+
def to(df, ...):
|
|
165
|
+
# Function implementation
|
|
531
166
|
pass
|
|
532
167
|
"""
|
|
168
|
+
@wraps(func)
|
|
533
169
|
def wrapper(*args, **kwargs):
|
|
534
|
-
|
|
535
|
-
before = manager.get_memory_usage_mb()
|
|
536
|
-
|
|
170
|
+
memory_manager = get_memory_manager()
|
|
537
171
|
try:
|
|
538
172
|
result = func(*args, **kwargs)
|
|
539
173
|
return result
|
|
540
174
|
finally:
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
175
|
+
memory_manager.cleanup()
|
|
176
|
+
|
|
177
|
+
return wrapper
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def estimate_memory_usage(obj: Any) -> int:
|
|
181
|
+
"""
|
|
182
|
+
Estimate memory usage of an object.
|
|
546
183
|
|
|
547
|
-
|
|
184
|
+
Args:
|
|
185
|
+
obj: Object to estimate (typically DataFrame or Series)
|
|
186
|
+
|
|
187
|
+
Returns:
|
|
188
|
+
Estimated memory usage in bytes
|
|
189
|
+
|
|
190
|
+
Note:
|
|
191
|
+
This is an approximation. Actual memory usage may vary.
|
|
192
|
+
"""
|
|
193
|
+
try:
|
|
194
|
+
# Try to get memory usage from object (Polars/pandas)
|
|
195
|
+
if hasattr(obj, 'estimated_size'):
|
|
196
|
+
# Polars DataFrame
|
|
197
|
+
return obj.estimated_size()
|
|
198
|
+
elif hasattr(obj, 'memory_usage'):
|
|
199
|
+
# pandas DataFrame or Series
|
|
200
|
+
memory_usage = obj.memory_usage(deep=True)
|
|
201
|
+
if hasattr(memory_usage, 'sum'):
|
|
202
|
+
return int(memory_usage.sum())
|
|
203
|
+
return int(memory_usage)
|
|
204
|
+
else:
|
|
205
|
+
# Fallback to sys.getsizeof
|
|
206
|
+
return sys.getsizeof(obj)
|
|
207
|
+
except:
|
|
208
|
+
# If all else fails, return 0
|
|
209
|
+
return 0
|