fustor-source-fs 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,409 @@
+ """
+ Fustor Agent source driver for the file system.
+
+ This driver implements a 'Smart Dynamic Monitoring' strategy to efficiently
+ monitor large directory structures without exhausting system resources.
+ """
+ import os
+ import queue
+ import time
+ import logging
+ import uuid
+ import getpass
+ import fnmatch
+ import threading
+ from typing import Any, Dict, Iterator, List, Tuple
+ from fustor_core.drivers import SourceDriver
+ from fustor_core.models.config import SourceConfig
+ from fustor_event_model.models import EventBase, UpdateEvent, DeleteEvent
+
+ from .components import _WatchManager, safe_path_handling
+ from .event_handler import OptimizedWatchEventHandler, get_file_metadata
+
+ logger = logging.getLogger("fustor_agent.driver.fs")
+
+
+ class FSDriver(SourceDriver):
+     _instances: Dict[str, 'FSDriver'] = {}
+     _lock = threading.Lock()
+
+     @property
+     def is_transient(self) -> bool:
+         """
+         FS driver is transient - events will be lost if not processed immediately.
+         """
+         return True
+
+     def __new__(cls, id: str, config: SourceConfig):
+         # Generate a unique signature based on URI and credentials to ensure permission isolation
+         signature = f"{config.uri}#{hash(str(config.credential))}"
+
+         with FSDriver._lock:
+             if signature not in FSDriver._instances:
+                 # Create new instance
+                 instance = super().__new__(cls)
+                 FSDriver._instances[signature] = instance
+             return FSDriver._instances[signature]
+
+     def __init__(self, id: str, config: SourceConfig):
+         # Prevent re-initialization of shared instances
+         if hasattr(self, '_initialized'):
+             return
+
+         super().__init__(id, config)
+         self.uri = self.config.uri
+         self.event_queue: queue.Queue[EventBase] = queue.Queue()
+         self.clock_offset = 0.0  # Updated by the pre-scan to normalize server mtimes
+         self._stop_driver_event = threading.Event()
+         min_monitoring_window_days = self.config.driver_params.get("min_monitoring_window_days", 30.0)
+         self.watch_manager = _WatchManager(self.uri, event_handler=None, min_monitoring_window_days=min_monitoring_window_days, stop_driver_event=self._stop_driver_event)
+         self.event_handler = OptimizedWatchEventHandler(self.event_queue, self.watch_manager)
+         self.watch_manager.event_handler = self.event_handler
+         self._pre_scan_completed = False
+         self._pre_scan_lock = threading.Lock()
+
+         self._initialized = True
+
+     def _perform_pre_scan_and_schedule(self):
+         """
+         Performs a one-time scan of the directory to populate the watch manager
+         with a capacity-aware, hierarchy-complete set of the most active directories.
+         It uses a delta to normalize server mtimes to the client's time domain.
+         """
+         with self._pre_scan_lock:
+             if self._pre_scan_completed:
+                 return
+
+             logger.info(f"[fs] Performing initial directory scan to build hot-directory map for: {self.uri}")
+
+             mtime_map: Dict[str, float] = {}
+
+             # Track statistics
+             error_count = 0
+             total_entries = 0  # Total number of entries (directories and files) processed
+
+             def handle_walk_error(e: OSError):
+                 nonlocal error_count
+                 error_count += 1
+                 logger.debug(f"[fs] Error during pre-scan walk, skipping path: {e.filename} - {e.strerror}")
+
+             # Step 1: Walk the entire tree to build the mtime_map with server times
+             for root, dirs, files in os.walk(self.uri, topdown=False, onerror=handle_walk_error):
+                 try:
+                     latest_mtime = os.path.getmtime(root)
+                 except OSError:
+                     continue
+
+                 for filename in files:
+                     file_path = os.path.join(root, filename)
+                     try:
+                         stat_info = os.stat(file_path)
+                         latest_mtime = max(latest_mtime, stat_info.st_mtime)
+                     except OSError as e:
+                         error_count += 1
+                         logger.debug(f"[fs] Error stating file during pre-scan, skipping: {e.filename} - {e.strerror}")
+
+                     # Count each file as an entry
+                     total_entries += 1
+
+                 for dirname in dirs:
+                     dirpath = os.path.join(root, dirname)
+                     latest_mtime = max(latest_mtime, mtime_map.get(dirpath, 0))
+                     # Count each dir as an entry
+                     total_entries += 1
+
+                 # Count the current directory
+                 mtime_map[root] = latest_mtime
+                 total_entries += 1  # Increment for each directory processed
+
+                 # Log progress every 10000 entries
+                 if total_entries % 10000 == 0:
+                     # Find the newest and oldest directories so far
+                     if mtime_map:
+                         newest_dir = max(mtime_map.items(), key=lambda x: x[1])
+                         oldest_dir = min(mtime_map.items(), key=lambda x: x[1])
+                         newest_age = time.time() - newest_dir[1]  # Difference in seconds
+                         oldest_age = time.time() - oldest_dir[1]  # Difference in seconds
+                         logger.info(
+                             f"[fs] Pre-scan progress: processed {total_entries} entries, "
+                             f"errors: {error_count}, newest_dir: {safe_path_handling(newest_dir[0])} (age: {newest_age/86400:.2f} days), "
+                             f"oldest_dir: {safe_path_handling(oldest_dir[0])} (age: {oldest_age/86400:.2f} days)"
+                         )
+
+             # Step 2: Calculate baseline delta using the true recursive mtime of the root.
+             try:
+                 root_recursive_mtime = mtime_map.get(self.uri, os.path.getmtime(self.uri))
+                 self.clock_offset = time.time() - root_recursive_mtime
+                 logger.info(f"[fs] Calculated client-server time delta: {self.clock_offset:.2f} seconds.")
+             except OSError as e:
+                 logger.warning(f"[fs] Could not stat root directory to calculate time delta: {e}. Proceeding without normalization.")
+
+             # Log final statistics before sorting
+             if mtime_map:
+                 newest_dir = max(mtime_map.items(), key=lambda x: x[1])
+                 oldest_dir = min(mtime_map.items(), key=lambda x: x[1])
+                 newest_age = time.time() - newest_dir[1]  # Difference in seconds
+                 oldest_age = time.time() - oldest_dir[1]  # Difference in seconds
+                 logger.info(
+                     f"[fs] Pre-scan completed: processed {total_entries} entries, "
+                     f"errors: {error_count}, newest_dir: {safe_path_handling(newest_dir[0])} (age: {newest_age/86400:.2f} days), "
+                     f"oldest_dir: {safe_path_handling(oldest_dir[0])} (age: {oldest_age/86400:.2f} days)"
+                 )
+
+             logger.info(f"[fs] Found {len(mtime_map)} total directories. Building capacity-aware, hierarchy-complete watch set...")
+             sorted_dirs = sorted(mtime_map.items(), key=lambda item: item[1], reverse=True)[:self.watch_manager.watch_limit]
+             old_limit = self.watch_manager.watch_limit
+             for path, _ in sorted_dirs:
+                 server_mtime = mtime_map.get(path)
+                 if server_mtime:
+                     # Normalize to client time domain while preserving relative differences
+                     lru_timestamp = server_mtime + self.clock_offset
+                 else:
+                     # Fallback for parents that might not have been in mtime_map (though they should be)
+                     lru_timestamp = time.time()
+                 self.watch_manager.schedule(path, lru_timestamp)
+                 if self.watch_manager.watch_limit < old_limit:
+                     break  # Stop if we hit the system watch limit during scheduling
+             logger.info(f"[fs] Final watch set constructed. Total paths to watch: {len(self.watch_manager.lru_cache)}.")
+             self._pre_scan_completed = True
+
+
+     def get_snapshot_iterator(self, **kwargs) -> Iterator[EventBase]:
+         stream_id = f"snapshot-fs-{uuid.uuid4().hex[:6]}"
+         logger.info(f"[{stream_id}] Starting snapshot scan phase for path: {self.uri}")
+
+         driver_params = self.config.driver_params
+         if driver_params.get("startup_mode") == "message-only":
+             logger.info(f"[{stream_id}] Skipping snapshot due to 'message-only' mode.")
+             return
+
+         file_pattern = driver_params.get("file_pattern", "*")
+         batch_size = kwargs.get("batch_size", 100)
+
+         logger.info(f"[{stream_id}] Scan parameters: file_pattern='{file_pattern}'")
+
+         try:
+             batch: List[Dict[str, Any]] = []
+             files_processed_count = 0
+             error_count = 0
+             snapshot_time = int(time.time() * 1000)
+
+             def handle_walk_error(e: OSError):
+                 nonlocal error_count
+                 error_count += 1
+                 logger.debug(f"[{stream_id}] Error during snapshot walk, skipping path: {safe_path_handling(e.filename)} - {e.strerror}")
+
+             temp_mtime_map: Dict[str, float] = {}
+
+             for root, dirs, files in os.walk(self.uri, topdown=False, onerror=handle_walk_error):
+                 try:
+                     dir_stat_info = os.stat(root)
+                     latest_mtime_in_subtree = dir_stat_info.st_mtime
+                 except OSError:
+                     dir_stat_info = None
+                     latest_mtime_in_subtree = 0.0
+
+                 for filename in files:
+                     file_path = os.path.join(root, filename)
+                     try:
+                         stat_info = os.stat(file_path)
+                         latest_mtime_in_subtree = max(latest_mtime_in_subtree, stat_info.st_mtime)
+                         if fnmatch.fnmatch(filename, file_pattern):
+                             metadata = get_file_metadata(file_path, stat_info=stat_info)
+                             if metadata:
+                                 batch.append(metadata)
+                                 files_processed_count += 1
+                                 if len(batch) >= batch_size:
+                                     # Extract fields from the first row if batch is not empty
+                                     fields = list(batch[0].keys()) if batch else []
+                                     yield UpdateEvent(event_schema=self.uri, table="files", rows=batch, index=snapshot_time, fields=fields)
+                                     batch = []
+                     except OSError as e:
+                         error_count += 1
+                         logger.debug(f"[{stream_id}] Error processing file during snapshot: {safe_path_handling(file_path)} - {str(e)}")
+
+                 for dirname in dirs:
+                     dirpath = os.path.join(root, dirname)
+                     latest_mtime_in_subtree = max(latest_mtime_in_subtree, temp_mtime_map.get(dirpath, 0.0))
+
+                 temp_mtime_map[root] = latest_mtime_in_subtree
+                 aligned_lru_timestamp = latest_mtime_in_subtree + self.clock_offset
+                 self.watch_manager.touch(root, aligned_lru_timestamp, is_recursive_upward=False)
+
+                 if dir_stat_info:
+                     dir_metadata = get_file_metadata(root, stat_info=dir_stat_info)
+                     if dir_metadata:
+                         batch.append(dir_metadata)
+                         files_processed_count += 1
+
+                 if len(batch) >= batch_size:
+                     # Extract fields from the first row if batch is not empty
+                     fields = list(batch[0].keys()) if batch else []
+                     yield UpdateEvent(event_schema=self.uri, table="files", rows=batch, index=snapshot_time, fields=fields)
+                     batch = []
+
+             if batch:
+                 fields = list(batch[0].keys())
+                 yield UpdateEvent(event_schema=self.uri, table="files", rows=batch, index=snapshot_time, fields=fields)
+
+             if error_count > 0:
+                 logger.warning(f"[{stream_id}] Skipped {error_count} paths in total due to permission or other errors.")
+
+             logger.info(f"[{stream_id}] Full scan complete. Processed {files_processed_count} files and directories.")
+
+         except Exception as e:
+             logger.error(f"[{stream_id}] Snapshot phase for fs failed: {e}", exc_info=True)
+
+     def get_message_iterator(self, start_position: int = -1, **kwargs) -> Iterator[EventBase]:
+
+         # Perform pre-scan to populate watches before starting the observer.
+         # This is essential for the message-first architecture and must block
+         # until completion to prevent race conditions downstream.
+         self._perform_pre_scan_and_schedule()
+
+         def _iterator_func() -> Iterator[EventBase]:
+             # After the pre-scan completes, any new events are considered "starting from now".
+             # If start_position is provided, use it; otherwise, start from the current time.
+
+             stream_id = f"message-fs-{uuid.uuid4().hex[:6]}"
+
+             stop_event = kwargs.get("stop_event")
+             self.watch_manager.start()
+             logger.info(f"[{stream_id}] WatchManager started.")
+
+             try:
+                 # Process events normally, but use the effective start position.
+                 while not (stop_event and stop_event.is_set()) and not self._stop_driver_event.is_set():
+                     try:
+                         max_sync_delay_seconds = self.config.driver_params.get("max_sync_delay_seconds", 1.0)
+                         event = self.event_queue.get(timeout=max_sync_delay_seconds)
+
+                         if start_position != -1 and event.index < start_position:
+                             logger.debug(f"[{stream_id}] Skipping old event: {event.event_type} index={event.index} < start_position={start_position}")
+                             continue
+
+                         yield event
+
+                     except queue.Empty:
+                         continue
+             finally:
+                 self.watch_manager.stop()
+                 logger.info(f"[{stream_id}] Stopped real-time monitoring for: {self.uri}")
+
+         return _iterator_func()
+
+     @classmethod
+     async def get_available_fields(cls, **kwargs) -> Dict[str, Any]:
+         return {"properties": {
+             "file_path": {"type": "string", "description": "The full, absolute path to the file.", "column_index": 0},
+             "size": {"type": "integer", "description": "The size of the file in bytes.", "column_index": 1},
+             "modified_time": {"type": "number", "description": "The last modification time as a Unix timestamp (float).", "column_index": 2},
+             "created_time": {"type": "number", "description": "The creation time as a Unix timestamp (float).", "column_index": 3},
+         }}
+
+     @classmethod
+     async def test_connection(cls, **kwargs) -> Tuple[bool, str]:
+         path = kwargs.get("uri")
+         if not path or not isinstance(path, str):
+             return (False, "Path not provided or not a valid string.")
+         if not os.path.exists(path):
+             return (False, f"Path does not exist: {path}")
+         if not os.path.isdir(path):
+             return (False, f"Path is not a directory: {path}")
+         if not os.access(path, os.R_OK):
+             return (False, f"No read permission for: {path}")
+         return (True, "Connection successful; the path is valid and readable.")
+
+     @classmethod
+     async def check_privileges(cls, **kwargs) -> Tuple[bool, str]:
+         path = kwargs.get("uri")
+         if not path:
+             return (False, "Path not provided in arguments.")
+
+         try:
+             user = getpass.getuser()
+         except Exception:
+             user = "unknown"
+
+         logger.info(f"[fs] Checking permissions for user '{user}' on path: {safe_path_handling(path)}")
+
+         if not os.path.exists(path):
+             return (False, f"Path does not exist: {path}")
+         if not os.path.isdir(path):
+             return (False, f"Path is not a directory: {path}")
+
+         can_read = os.access(path, os.R_OK)
+         can_execute = os.access(path, os.X_OK)
+
+         if can_read and can_execute:
+             return (True, f"Sufficient permissions: user '{user}' can monitor this directory.")
+
+         missing_perms = []
+         if not can_read:
+             missing_perms.append("read")
+         if not can_execute:
+             missing_perms.append("execute (traverse)")
+
+         return (False, f"Insufficient permissions: user '{user}' lacks {' and '.join(missing_perms)} permission.")
+
+     async def close(self):
+         """
+         Close the file system watcher and stop monitoring.
+         """
+         logger.info(f"[fs] Closing file system watcher for {self.uri}")
+
+         # Stop the watch manager if it's running
+         if hasattr(self, 'watch_manager') and self.watch_manager:
+             self.watch_manager.stop()
+
+         # Set the stop event to ensure any active monitoring stops
+         if hasattr(self, '_stop_driver_event') and self._stop_driver_event:
+             self._stop_driver_event.set()
+
+         logger.info(f"[fs] Closed file system watcher for {self.uri}")
+
+     @classmethod
+     async def get_wizard_steps(cls) -> Dict[str, Any]:
+         return {
+             "steps": [
+                 {
+                     "step_id": "path_setup",
+                     "title": "Directory and Permissions",
+                     "schema": {
+                         "type": "object",
+                         "properties": {
+                             "uri": {
+                                 "type": "string",
+                                 "title": "Monitored directory path",
+                                 "description": "Enter the absolute path of the folder to monitor."
+                             },
+                             "driver_params": {
+                                 "type": "object",
+                                 "title": "Driver parameters",
+                                 "properties": {
+                                     "aged_interval": {
+                                         "type": "number",
+                                         "title": "Age (days) beyond which stale folders are excluded from monitoring",
+                                         "default": 0.5
+                                     },
+                                     "max_sync_delay_seconds": {
+                                         "type": "number",
+                                         "title": "Maximum sync delay (seconds)",
+                                         "description": "Maximum latency for real-time pushes. If no event arrives within this window, a push is forced.",
+                                         "default": 1.0
+                                     },
+                                     "min_monitoring_window_days": {
+                                         "type": "number",
+                                         "title": "Minimum monitoring window (days)",
+                                         "description": "When watched directories must be evicted, ensure the evicted directory is at least N days older than the newest file in the monitored scope. This prevents evicting directories that are still recently active. For example, a value of 30 means only directories more than 30 days older than the newest file may be evicted.",
+                                         "default": 30.0
+                                     }
+                                 }
+                             }
+                         },
+                         "required": ["uri"],
+                     },
+                     "validations": ["test_connection", "check_privileges"]
+                 }
+             ]
+         }
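
Taken together, FSDriver exposes a two-phase source contract: a snapshot iterator that walks the tree once, then a message iterator that replays live inotify events. A minimal consumption sketch follows; it assumes only that SourceConfig accepts the uri and driver_params fields the driver reads above (the exact constructor signature is not shown in this diff, so treat the wiring as hypothetical):

    import threading
    from fustor_core.models.config import SourceConfig
    from fustor_source_fs import FSDriver

    # Hypothetical wiring: uri and driver_params are the only fields FSDriver reads.
    config = SourceConfig(uri="/data/projects", driver_params={"min_monitoring_window_days": 30.0})
    driver = FSDriver("fs-source-1", config)

    # Phase 1: snapshot scan, batched UpdateEvents over the current tree.
    for event in driver.get_snapshot_iterator(batch_size=200):
        print(event.table, len(event.rows))

    # Phase 2: live monitoring; a shared Event lets the host stop the loop.
    stop = threading.Event()
    for event in driver.get_message_iterator(start_position=-1, stop_event=stop):
        print(event.index, event.rows)
        stop.set()  # stop after the first live event in this sketch

Because __new__ keys instances on uri plus credential hash, constructing a second FSDriver with the same signature returns the already-initialized shared instance.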
@@ -0,0 +1,354 @@
+ import logging
+ import os
+ import threading
+ import time
+ import dataclasses
+ import heapq
+ from typing import Dict, List, Optional, Set, Tuple
+ from fustor_core.exceptions import DriverError
+
+ # Use the low-level inotify wrapper and high-level event types
+ from watchdog.observers.inotify_c import Inotify
+ from watchdog.events import (
+     FileClosedEvent,
+     FileCreatedEvent,
+     FileDeletedEvent,
+     FileModifiedEvent,
+     FileMovedEvent,
+     DirCreatedEvent,
+     DirDeletedEvent,
+     DirModifiedEvent,
+     DirMovedEvent,
+ )
+
+ logger = logging.getLogger("fustor_agent.driver.fs")
+
+ def contains_surrogate_characters(path: str) -> bool:
+     """Checks if a string contains surrogate characters."""
+     try:
+         path.encode('utf-8')
+         return False
+     except UnicodeEncodeError:
+         return True
+
+ def safe_path_encode(path: str) -> bytes:
+     """Safely encodes a path to bytes, handling surrogate characters using the filesystem encoding."""
+     try:
+         return os.fsencode(path)
+     except Exception:
+         # Fallback for extreme cases or non-string inputs, though fsencode is robust
+         return path.encode('utf-8', errors='replace')
+
+ def safe_path_handling(path: str) -> str:
+     """Safely handles path strings, normalizing surrogate characters if present."""
+     if contains_surrogate_characters(path):
+         # Replace surrogate characters with replacement characters
+         # by encoding with replacement and decoding back
+         return path.encode('utf-8', errors='replace').decode('utf-8')
+     return path
+
+ @dataclasses.dataclass(frozen=True)
+ class WatchEntry:
+     """Simplified entry for the LRU cache, just holds the timestamp."""
+     timestamp: float
+
+ class _LRUCache:
+     """A custom cache that evicts the item with the oldest timestamp (smallest value)."""
+     def __init__(self, capacity: int):
+         self.capacity = capacity
+         self.cache: Dict[str, WatchEntry] = {}  # path -> WatchEntry
+         self.min_heap: List[Tuple[float, str]] = []  # (timestamp, path)
+         self.removed_from_heap: Set[str] = set()
+
+     def _clean_heap(self):
+         """Removes stale entries from the top of the min_heap."""
+         while self.min_heap:
+             timestamp, path = self.min_heap[0]
+             if path in self.removed_from_heap:
+                 heapq.heappop(self.min_heap)
+                 self.removed_from_heap.remove(path)
+             elif path not in self.cache:
+                 heapq.heappop(self.min_heap)
+             else:
+                 break
+
+     def get(self, key: str) -> Optional[WatchEntry]:
+         return self.cache.get(key)
+
+     def put(self, key: str, value: WatchEntry):
+         if key in self.cache:
+             self.removed_from_heap.add(key)
+         self.cache[key] = value
+         heapq.heappush(self.min_heap, (value.timestamp, key))
+
+     def evict(self) -> Optional[Tuple[str, WatchEntry]]:
+         """Removes and returns the item with the oldest timestamp."""
+         if not self.cache:
+             return None
+         self._clean_heap()
+         if not self.min_heap:
+             return None
+         _timestamp, oldest_key = heapq.heappop(self.min_heap)
+         self.removed_from_heap.discard(oldest_key)
+         oldest_entry = self.cache.pop(oldest_key)
+         return oldest_key, oldest_entry
+
+     def get_oldest(self) -> Optional[Tuple[str, WatchEntry]]:
+         """Returns the item with the oldest timestamp without removing it."""
+         if not self.cache:
+             return None
+         self._clean_heap()
+         if not self.min_heap:
+             return None
+         _timestamp, oldest_key = self.min_heap[0]
+         return oldest_key, self.cache[oldest_key]
+
+     def remove(self, key: str):
+         if key in self.cache:
+             del self.cache[key]
+             self.removed_from_heap.add(key)
+
+     def __contains__(self, key: str) -> bool:
+         return key in self.cache
+
+     def __len__(self) -> int:
+         return len(self.cache)
+
+
+ class _WatchManager:
+     """
+     Manages a single inotify instance and its watches, including LRU pruning.
+     This is a more resource-efficient implementation.
+     """
+     def __init__(self, root_path: str, event_handler, min_monitoring_window_days: float = 30.0, stop_driver_event: Optional[threading.Event] = None):
+         logger.info(f"Creating a new Inotify instance for root path {root_path}.")
+         self.watch_limit = 10000000  # This now only limits watches, not instances.
+         self.lru_cache = _LRUCache(self.watch_limit)
+         self.event_handler = event_handler
+         self.root_path = root_path
+         self._lock = threading.RLock()
+         self.min_monitoring_window_days = min_monitoring_window_days
+         self.stop_driver_event = stop_driver_event
+
+         # Directly use the low-level Inotify class.
+         # We watch the root path non-recursively just to initialize the instance.
+         # All other watches are added dynamically.
+         # Use safe_path_encode to handle potential surrogate characters in root_path.
+         self.inotify = Inotify(safe_path_encode(root_path), recursive=False)
+
+         self._stop_event = threading.Event()
+         self.inotify_thread = threading.Thread(target=self._event_processing_loop, daemon=True)
+
+     def _event_processing_loop(self):
+         """
+         The core event loop that reads from inotify and dispatches events.
+         """
+         while not self._stop_event.is_set():
+             try:
+                 raw_events = self.inotify.read_events()
+
+                 # Pre-process to identify paired moves and avoid duplicate events.
+                 paired_move_from_paths = set()
+                 for event in raw_events:
+                     if event.is_moved_to:
+                         src_path_from = self.inotify.source_for_move(event)
+                         if src_path_from:
+                             paired_move_from_paths.add(os.fsdecode(src_path_from))
+
+                 for event in raw_events:
+                     # Use fsdecode to safely decode bytes to str, it handles surrogates correctly
+                     src_path_str = os.fsdecode(event.src_path)
+
+                     # Handle paired moves (MOVED_FROM + MOVED_TO)
+                     if event.is_moved_to:
+                         src_path_from = self.inotify.source_for_move(event)
+                         if src_path_from:
+                             src_path_from_str = os.fsdecode(src_path_from)
+                             if event.is_directory:
+                                 self.event_handler.on_moved(DirMovedEvent(src_path_from_str, src_path_str))
+                             else:
+                                 self.event_handler.on_moved(FileMovedEvent(src_path_from_str, src_path_str))
+                         else:
+                             # Unmatched MOVED_TO: treat as creation
+                             if event.is_directory:
+                                 self.event_handler.on_created(DirCreatedEvent(src_path_str))
+                             else:
+                                 self.event_handler.on_created(FileCreatedEvent(src_path_str))
+
+                     # Handle unmatched MOVED_FROM (treat as deletion)
+                     elif event.is_moved_from:
+                         if src_path_str in paired_move_from_paths:
+                             continue  # Already processed as part of a move
+
+                         if event.is_directory:
+                             self.event_handler.on_deleted(DirDeletedEvent(src_path_str))
+                         else:
+                             self.event_handler.on_deleted(FileDeletedEvent(src_path_str))
+
+                     # Handle creation events
+                     elif event.is_create:
+                         if event.is_directory:
+                             self.event_handler.on_created(DirCreatedEvent(src_path_str))
+                         else:
+                             self.event_handler.on_created(FileCreatedEvent(src_path_str))
+
+                     # Handle deletion events
+                     elif event.is_delete:
+                         if event.is_directory:
+                             self.event_handler.on_deleted(DirDeletedEvent(src_path_str))
+                         else:
+                             self.event_handler.on_deleted(FileDeletedEvent(src_path_str))
+
+                     # Handle modification events (attrib or modify)
+                     elif event.is_attrib or event.is_modify:
+                         if event.is_directory:
+                             self.event_handler.on_modified(DirModifiedEvent(src_path_str))
+                         else:
+                             self.event_handler.on_modified(FileModifiedEvent(src_path_str))
+
+                     # Handle file closed after write (definitive modification)
+                     elif event.is_close_write:
+                         if event.is_directory:
+                             self.event_handler.on_closed(DirModifiedEvent(src_path_str))
+                         else:
+                             self.event_handler.on_closed(FileClosedEvent(src_path_str))
+
+                     # Handle ignored events (watch removed)
+                     elif event.is_ignored:
+                         with self._lock:
+                             if src_path_str in self.lru_cache:
+                                 self.lru_cache.remove(src_path_str)
+                                 logger.debug(f"Removed watch for '{safe_path_handling(src_path_str)}' from LRU cache due to IGNORED event.")
+
+             except KeyError as e:
+                 logger.debug(f"Ignoring event for untracked watch descriptor: {str(e)}")
+
+     def schedule(self, path: str, timestamp: Optional[float] = None):
+         with self._lock:
+             timestamp_to_use = timestamp if timestamp is not None else time.time()
+             if path in self.lru_cache:
+                 existing_entry = self.lru_cache.get(path)
+                 if existing_entry and existing_entry.timestamp < timestamp_to_use:
+                     self.lru_cache.put(path, WatchEntry(timestamp_to_use))
+                 return
+
+             is_eviction_needed = len(self.lru_cache) >= self.watch_limit
+
+             oldest = self.lru_cache.get_oldest()
+             if oldest and oldest[1].timestamp >= timestamp_to_use and is_eviction_needed:
+                 logger.debug(f"New watch for {safe_path_handling(path)} (ts {timestamp_to_use:.2f}) is older than oldest in cache (ts {oldest[1].timestamp:.2f}). Skipping.")
+                 return
+
+             if is_eviction_needed:
+                 evicted_item = self.lru_cache.evict()
+                 if evicted_item:
+                     evicted_path, evicted_entry = evicted_item
+                     relative_age_days = (time.time() - evicted_entry.timestamp) / 86400
+
+                     if relative_age_days < self.min_monitoring_window_days:
+                         error_msg = (
+                             f"Watch limit reached and an active watch for {evicted_path} "
+                             f"(relative age: {relative_age_days:.2f} days) is about to be evicted. "
+                             f"This is below the configured min_monitoring_window_days ({self.min_monitoring_window_days} days). "
+                             f"Stopping driver to prevent data loss."
+                         )
+                         logger.error(error_msg)
+                         if self.stop_driver_event:
+                             self.stop_driver_event.set()
+                         raise DriverError(error_msg)
+
+                     logger.info(f"Watch limit reached. Evicting watch for {safe_path_handling(evicted_path)} (relative age: {relative_age_days:.2f} days).")
+                     try:
+                         self.inotify.remove_watch(safe_path_encode(evicted_path))
+                     except (KeyError, OSError) as e:
+                         logger.warning(f"Error removing evicted watch for {safe_path_handling(evicted_path)}: {e}")
+                     self.unschedule_recursive(evicted_path)
+                 else:
+                     logger.warning(f"Watch limit of {self.watch_limit} reached, but LRU cache is empty. Cannot schedule new watch for {safe_path_handling(path)}.")
+                     return
+
+             try:
+                 self.inotify.add_watch(safe_path_encode(path))
+                 self.lru_cache.put(path, WatchEntry(timestamp_to_use))
+             except OSError as e:
+                 # ENOENT (2)   - file not found, likely deleted before we could watch it
+                 # ENOTDIR (20) - not a directory (can happen if a file replaced a dir)
+                 # EACCES (13)  - permission denied
+                 # EINVAL (22)  - invalid argument, e.g. special filesystems or very long paths
+                 if e.errno == 2:  # ENOENT
+                     if os.path.exists(path):
+                         # Path exists, but inotify reports ENOENT. This is problematic for inotify.
+                         logger.warning(f"[fs] Could not schedule watch for {safe_path_handling(path)} (errno={e.errno}); the path exists but inotify rejected it. (Consider renaming if possible.)")
+                     else:
+                         # Path truly does not exist.
+                         logger.warning(f"[fs] Could not schedule watch for {safe_path_handling(path)} (errno={e.errno}); it may no longer exist or be inaccessible.")
+                     return
+                 if e.errno in (20, 13):  # ENOTDIR, EACCES
+                     logger.warning(f"[fs] Could not schedule watch for {safe_path_handling(path)} (errno={e.errno}); it may no longer exist or be inaccessible.")
+                     return
+                 if e.errno == 22:  # EINVAL - invalid argument
+                     logger.warning(f"[fs] Could not schedule watch for {safe_path_handling(path)} (errno={e.errno}); invalid argument. This can happen with special filesystems, bind mounts, or unusual path characters.")
+                     return
+
+                 if e.errno == 28:  # ENOSPC - system inotify watch limit reached
+                     new_limit = len(self.lru_cache)
+                     relative_age_days = (time.time() - timestamp_to_use) / 86400
+                     if relative_age_days < self.min_monitoring_window_days:
+                         error_msg = (
+                             f"System inotify watch limit hit. The new watch for {path} "
+                             f"(relative age: {relative_age_days:.2f} days) is about to be skipped. "
+                             f"This is below the configured min_monitoring_window_days ({self.min_monitoring_window_days} days). "
+                             f"Stopping driver to prevent data loss."
+                         )
+                         logger.error(error_msg)
+                         if self.stop_driver_event:
+                             self.stop_driver_event.set()
+                         raise DriverError(error_msg)
+
+                     logger.warning(
+                         f"System inotify watch limit hit. Dynamically adjusting watch_limit from "
+                         f"{self.watch_limit} to {new_limit}; the new watch for {path} (relative age: {relative_age_days:.2f} days) will be retried against the adjusted limit. "
+                         f"Consider increasing 'fs.inotify.max_user_watches'."
+                     )
+                     self.watch_limit = new_limit
+                     return self.schedule(path, timestamp_to_use)  # Retry the schedule call after adjusting the limit
+                 else:
+                     logger.error(f"OS error scheduling watch for {safe_path_handling(path)}: {e}", exc_info=True)
+                     raise
+
+     def unschedule_recursive(self, path: str):
+         with self._lock:
+             paths_to_remove_from_lru = [p for p in list(self.lru_cache.cache.keys()) if p == path or p.startswith(path + os.sep)]
+             for p in paths_to_remove_from_lru:
+                 try:
+                     self.inotify.remove_watch(safe_path_encode(p))
+                 except (KeyError, OSError) as e:
+                     logger.warning(f"Error removing watch during recursive unschedule for {p}: {e}")
+                 self.lru_cache.remove(p)
+
+     def touch(self, path: str, timestamp: Optional[float] = None, is_recursive_upward: bool = True):
+         with self._lock:
+             current_path = path
+             while True:
+                 try:
+                     if os.path.isdir(current_path):
+                         self.schedule(current_path, timestamp)
+                 except OSError as e:
+                     logger.warning(f"[fs] Error accessing path during touch: {safe_path_handling(current_path)} - {str(e)}")
+
+                 if not is_recursive_upward or len(current_path) <= len(self.root_path):
+                     break
+
+                 current_path = os.path.dirname(current_path)
+
+     def start(self):
+         logger.info("WatchManager: Starting inotify event thread.")
+         self.inotify_thread.start()
+
+     def stop(self):
+         logger.info("WatchManager: Stopping inotify event thread.")
+         self._stop_event.set()
+         self.inotify.close()  # This will interrupt the blocking read_events() call
+         if self.inotify_thread.is_alive():
+             self.inotify_thread.join(timeout=5.0)
+         logger.info("WatchManager: Inotify event thread stopped.")
@@ -0,0 +1,241 @@
+ import os
+ import queue
+ import time
+ import logging
+ import stat
+ from typing import Any, Dict, Optional
+
+ from watchdog.events import FileSystemEventHandler, FileSystemEvent
+ from fustor_event_model.models import UpdateEvent, DeleteEvent
+
+ from .components import _WatchManager
+
+ logger = logging.getLogger("fustor_agent.driver.fs")
+
+ def get_file_metadata(path: str, stat_info: Optional[os.stat_result] = None) -> Optional[Dict[str, Any]]:
+     """Get file metadata, returning mtime and ctime as float timestamps."""
+     try:
+         if stat_info is None:
+             stat_info = os.stat(path)
+
+         is_dir = stat.S_ISDIR(stat_info.st_mode)
+
+         return {
+             "file_path": path,
+             "size": stat_info.st_size,
+             "modified_time": stat_info.st_mtime,
+             "created_time": stat_info.st_ctime,
+             "is_dir": is_dir
+         }
+     except OSError:
+         logger.warning(f"[fs] Could not stat file, it may have been deleted or become inaccessible before processing: {path}")
+         return None
+
+
+ class OptimizedWatchEventHandler(FileSystemEventHandler):
+     """
+     Event handler that processes watchdog events immediately using dedicated
+     on_* methods, which is the idiomatic way to use watchdog.
+     """
+     def __init__(self, event_queue: queue.Queue, watch_manager: _WatchManager):
+         super().__init__()
+         self.event_queue = event_queue
+         self.watch_manager = watch_manager
+
+     def _touch_recursive_bottom_up(self, path: str):
+         """Recursively touches a directory and its subdirectories from the bottom up."""
+         if not os.path.exists(path): return
+
+         # First, touch all subdirectories
+         for dirpath, dirnames, _ in os.walk(path, topdown=False):
+             for dirname in dirnames:
+                 subdir_path = os.path.join(dirpath, dirname)
+                 self.watch_manager.touch(subdir_path, is_recursive_upward=False)
+
+         # Finally, touch the root of the path itself
+         self.watch_manager.touch(path, is_recursive_upward=False)
+
+     def _generate_move_events_recursive(self, from_path: str, to_path: str):
+         """Generates DeleteEvents for inferred old paths and UpdateEvents for new paths within a moved subtree."""
+         if not os.path.exists(to_path): return
+
+         for dirpath, dirnames, filenames in os.walk(to_path, topdown=False):
+             for filename in filenames:
+                 add_path = os.path.join(dirpath, filename)
+                 del_path = add_path.replace(to_path, from_path, 1)
+
+                 # Generate DeleteEvent for the old path
+                 row = {"file_path": del_path}
+                 delete_event = DeleteEvent(
+                     schema=self.watch_manager.root_path,
+                     event_schema=self.watch_manager.root_path,
+                     table="files",
+                     rows=[row],
+                     fields=list(row.keys()),
+                     index=int(time.time() * 1000)
+                 )
+                 self.event_queue.put(delete_event)
+
+                 # Generate UpdateEvent for the new path
+                 metadata = get_file_metadata(add_path)
+                 if metadata:
+                     update_event = UpdateEvent(
+                         schema=self.watch_manager.root_path,
+                         event_schema=self.watch_manager.root_path,
+                         table="files",
+                         rows=[metadata],
+                         fields=list(metadata.keys()),
+                         index=int(time.time() * 1000)
+                     )
+                     self.event_queue.put(update_event)
+
+             for dirname in dirnames:
+                 subdir_add_path = os.path.join(dirpath, dirname)
+                 subdir_del_path = subdir_add_path.replace(to_path, from_path, 1)
+
+                 # Generate DeleteEvent for the old directory path
+                 row = {"file_path": subdir_del_path}
+                 delete_event = DeleteEvent(
+                     schema=self.watch_manager.root_path,
+                     event_schema=self.watch_manager.root_path,
+                     table="files",
+                     rows=[row],
+                     fields=list(row.keys()),
+                     index=int(time.time() * 1000)
+                 )
+                 self.event_queue.put(delete_event)
+                 # Also generate an UpdateEvent for the new directory path;
+                 # the touch pass after the move refreshes its LRU/watch status.
+                 metadata = get_file_metadata(subdir_add_path)
+                 if metadata:
+                     update_event = UpdateEvent(
+                         schema=self.watch_manager.root_path,
+                         event_schema=self.watch_manager.root_path,
+                         table="files",
+                         rows=[metadata],
+                         fields=list(metadata.keys()),
+                         index=int(time.time() * 1000)
+                     )
+                     self.event_queue.put(update_event)
+
+     def on_created(self, event: FileSystemEvent):
+         """Called when a file or directory is created."""
+         try:
+             if event.is_directory:
+                 metadata = get_file_metadata(event.src_path)
+                 if metadata:
+                     update_event = UpdateEvent(
+                         schema=self.watch_manager.root_path,
+                         event_schema=self.watch_manager.root_path,
+                         table="files",
+                         rows=[metadata],
+                         fields=list(metadata.keys()),
+                         index=int(time.time() * 1000)
+                     )
+                     self.event_queue.put(update_event)
+             self.watch_manager.touch(event.src_path)
+         except Exception as e:
+             logger.warning(f"[fs] Error processing file creation event for {event.src_path}: {str(e)}")
+
+     def on_deleted(self, event: FileSystemEvent):
+         """Called when a file or directory is deleted."""
+         try:
+             # For a deleted path, we should not attempt to touch/schedule a watch.
+             # Instead, we unschedule and generate a delete event.
+
+             if event.is_directory:
+                 self.watch_manager.unschedule_recursive(event.src_path)
+             row = {"file_path": event.src_path}
+             delete_event = DeleteEvent(
+                 schema=self.watch_manager.root_path,
+                 event_schema=self.watch_manager.root_path,
+                 table="files",
+                 rows=[row],
+                 fields=list(row.keys()),
+                 index=int(time.time() * 1000)
+             )
+             self.event_queue.put(delete_event)
+
+             # A deletion is an activity, touch the parent path to update its timestamp.
+             # We assume the parent is always a directory.
+             self.watch_manager.touch(os.path.dirname(event.src_path))
+         except Exception as e:
+             logger.warning(f"[fs] Error processing file deletion event for {event.src_path}: {str(e)}")
+
+     def on_moved(self, event: FileSystemEvent):
+         """Called when a file or a directory is moved or renamed."""
+         try:
+             # Touch the parent of the source path to update its timestamp (something disappeared).
+             self.watch_manager.touch(os.path.dirname(event.src_path))
+             # Touch the parent of the destination path to update its timestamp (something appeared).
+             self.watch_manager.touch(os.path.dirname(event.dest_path))
+
+             # Create and queue the delete event for the old location
+             delete_row = {"file_path": event.src_path}
+             delete_event = DeleteEvent(
+                 schema=self.watch_manager.root_path,
+                 event_schema=self.watch_manager.root_path,
+                 table="files",
+                 rows=[delete_row],
+                 fields=list(delete_row.keys()),
+                 index=int(time.time() * 1000)
+             )
+             self.event_queue.put(delete_event)
+
+             # Handle the creation/update event for the new location
+             if event.is_directory:
+                 # For directories, process recursively
+                 self._generate_move_events_recursive(event.src_path, event.dest_path)
+                 # Recursively touch all contents at the new destination to ensure watches are updated/scheduled.
+                 self._touch_recursive_bottom_up(event.dest_path)
+                 # Unschedule the old path recursively
+                 self.watch_manager.unschedule_recursive(event.src_path)
+             else:
+                 # For files, create an update event for the new location
+                 metadata = get_file_metadata(event.dest_path)
+                 if metadata:
+                     update_event = UpdateEvent(
+                         schema=self.watch_manager.root_path,
+                         event_schema=self.watch_manager.root_path,
+                         table="files",
+                         rows=[metadata],
+                         fields=list(metadata.keys()),
+                         index=int(time.time() * 1000)
+                     )
+                     self.event_queue.put(update_event)
+                 # Touch the file itself at its new destination
+                 self.watch_manager.touch(event.dest_path)
+         except Exception as e:
+             logger.warning(f"[fs] Error processing file move event for {event.src_path} -> {event.dest_path}: {str(e)}")
+             # Note: if we get here, the delete_event may already be in the queue.
+             # This is an inherent issue with partial failure in distributed systems,
+             # but we prevent the entire system from crashing.
+
+     def on_modified(self, event: FileSystemEvent):
+         """
+         Called when a file or directory is modified.
+         This is intentionally ignored to wait for the 'closed' event,
+         ensuring the file is fully written.
+         """
+         pass
+
+     def on_closed(self, event: FileSystemEvent):
+         """
+         Called when a file opened for writing is closed.
+         """
+         try:
+             self.watch_manager.touch(event.src_path)
+             if not event.is_directory:
+                 metadata = get_file_metadata(event.src_path)
+                 if metadata:
+                     update_event = UpdateEvent(
+                         schema=self.watch_manager.root_path,
+                         event_schema=self.watch_manager.root_path,
+                         table="files",
+                         rows=[metadata],
+                         fields=list(metadata.keys()),
+                         index=int(time.time() * 1000)
+                     )
+                     self.event_queue.put(update_event)
+         except Exception as e:
+             logger.warning(f"[fs] Error processing file closed event for {event.src_path}: {str(e)}")
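
Each handler method above reduces a filesystem event to row-shaped UpdateEvent/DeleteEvent payloads on the driver's queue. A sketch of that flow with a mocked _WatchManager (unittest.mock stands in for a live inotify setup; the path is illustrative):

    import queue
    from unittest.mock import MagicMock
    from watchdog.events import FileDeletedEvent
    from fustor_source_fs.event_handler import OptimizedWatchEventHandler

    q: queue.Queue = queue.Queue()
    manager = MagicMock()
    manager.root_path = "/data/projects"

    handler = OptimizedWatchEventHandler(q, manager)
    handler.on_deleted(FileDeletedEvent("/data/projects/old.txt"))

    event = q.get_nowait()          # a DeleteEvent carrying a single row
    print(event.table, event.rows)  # files [{'file_path': '/data/projects/old.txt'}]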
File without changes
@@ -0,0 +1,8 @@
+ Metadata-Version: 2.4
+ Name: fustor-source-fs
+ Version: 0.1.4
+ Summary: Fustor Agent FS Source Driver
+ License-Expression: MIT
+ Requires-Dist: fustor-core
+ Requires-Dist: fustor-event-model
+ Requires-Dist: watchdog
@@ -0,0 +1,9 @@
+ fustor_source_fs/__init__.py,sha256=nl3IL57-yK_j0wHvR-tXL3VqUJETgtbM5cxIXMWHD6M,20143
+ fustor_source_fs/components.py,sha256=2fK7p7UWDaPh7wwKbM8YOKNrcn9Jw9PN3bMAPHgCBiI,17589
+ fustor_source_fs/event_handler.py,sha256=yxLqlppaeVP6ZnJ8SeMck1LO7Ps_6flRqeFlcOfEwvI,10862
+ fustor_source_fs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ fustor_source_fs-0.1.4.dist-info/METADATA,sha256=KE9hfxMojQ2WJhmn2uw2WDgLQA1oYHqG3lqgkDwuai0,208
+ fustor_source_fs-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ fustor_source_fs-0.1.4.dist-info/entry_points.txt,sha256=ruXHaAbRgq5l2QL5tf2QojUQzAxmE4m5hjYvLID5SPI,62
+ fustor_source_fs-0.1.4.dist-info/top_level.txt,sha256=SfFvq54lUQVNr5GjMXj1qtUegMNYULlHrNt5KbNfHdI,17
+ fustor_source_fs-0.1.4.dist-info/RECORD,,
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
@@ -0,0 +1,2 @@
+ [fustor_agent.drivers.sources]
+ fs = fustor_source_fs:FSDriver
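
This entry point is how a host process can locate the driver without hard-coding the import; a sketch using the standard importlib.metadata API (Python 3.10+ keyword form):

    from importlib.metadata import entry_points

    for ep in entry_points(group="fustor_agent.drivers.sources"):
        if ep.name == "fs":
            driver_cls = ep.load()  # resolves to fustor_source_fs.FSDriver
            print(driver_cls.__name__)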
@@ -0,0 +1 @@
+ fustor_source_fs