fustor-source-fs 0.1.4 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fustor_source_fs/__init__.py +412 -0
- fustor_source_fs/components.py +356 -0
- fustor_source_fs/event_handler.py +241 -0
- fustor_source_fs/py.typed +0 -0
- fustor_source_fs-0.1.4.dist-info/METADATA +8 -0
- fustor_source_fs-0.1.4.dist-info/RECORD +9 -0
- fustor_source_fs-0.1.4.dist-info/WHEEL +5 -0
- fustor_source_fs-0.1.4.dist-info/entry_points.txt +2 -0
- fustor_source_fs-0.1.4.dist-info/top_level.txt +1 -0
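
For orientation, the sketch below shows how the `FSDriver` defined in `fustor_source_fs/__init__.py` might be driven. It is illustrative only and not part of the released package: the exact `SourceConfig` constructor (assumed here to accept `uri`, `credential`, and `driver_params` keyword arguments) and the source id string are assumptions.

```python
# Illustrative usage sketch -- not part of the released package.
# Assumes SourceConfig(uri=..., credential=..., driver_params=...) is a valid
# constructor and that "fs-source-1" is an arbitrary source id.
import threading

from fustor_core.models.config import SourceConfig
from fustor_source_fs import FSDriver

config = SourceConfig(
    uri="/data/projects",          # directory tree to monitor
    credential=None,               # only feeds the shared-instance signature
    driver_params={
        "file_pattern": "*.log",               # glob used by the snapshot scan
        "max_sync_delay_seconds": 1.0,         # queue poll timeout
        "min_monitoring_window_days": 30.0,    # eviction safety window
    },
)

driver = FSDriver("fs-source-1", config)

# Snapshot phase: walks the tree once and yields UpdateEvent batches.
for event in driver.get_snapshot_iterator(batch_size=100):
    print(event.table, len(event.rows))

# Message phase: a pre-scan builds the hot-directory watch set, then inotify
# activity is translated into Update/Delete events until the stop event is set.
stop = threading.Event()
for event in driver.get_message_iterator(stop_event=stop):
    print(type(event).__name__, event.index)
```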
fustor_source_fs/__init__.py
@@ -0,0 +1,412 @@
+"""
+Fuagent source driver for the file system.
+
+This driver implements a 'Smart Dynamic Monitoring' strategy to efficiently
+monitor large directory structures without exhausting system resources.
+"""
+import os
+import queue
+import time
+import datetime
+import logging
+import uuid
+import getpass
+import fnmatch
+import threading
+from typing import Any, Dict, Iterator, List, Tuple
+from fustor_core.drivers import SourceDriver
+from fustor_core.models.config import SourceConfig
+from fustor_event_model.models import EventBase, UpdateEvent, DeleteEvent
+
+from .components import _WatchManager, safe_path_handling
+from .event_handler import OptimizedWatchEventHandler, get_file_metadata
+
+logger = logging.getLogger("fustor_agent.driver.fs")
+
+import threading
+
+class FSDriver(SourceDriver):
+    _instances: Dict[str, 'FSDriver'] = {}
+    _lock = threading.Lock()
+
+    @property
+    def is_transient(self) -> bool:
+        """
+        FS driver is transient - events will be lost if not processed immediately.
+        """
+        return True
+
+    def __new__(cls, id: str, config: SourceConfig):
+        # Generate unique signature based on URI and credentials to ensure permission isolation
+        signature = f"{config.uri}#{hash(str(config.credential))}"
+
+        with FSDriver._lock:
+            if signature not in FSDriver._instances:
+                # Create new instance
+                instance = super().__new__(cls)
+                FSDriver._instances[signature] = instance
+            return FSDriver._instances[signature]
+
+    def __init__(self, id: str, config: SourceConfig):
+        # Prevent re-initialization of shared instances
+        if hasattr(self, '_initialized'):
+            return
+
+        super().__init__(id, config)
+        self.uri = self.config.uri
+        self.event_queue: queue.Queue[EventBase] = queue.Queue()
+        self.clock_offset = 0.0  # Placeholder for potential future use
+        self._stop_driver_event = threading.Event()  # NEW
+        min_monitoring_window_days = self.config.driver_params.get("min_monitoring_window_days", 30.0)
+        self.watch_manager = _WatchManager(self.uri, event_handler=None, min_monitoring_window_days=min_monitoring_window_days, stop_driver_event=self._stop_driver_event)
+        self.event_handler = OptimizedWatchEventHandler(self.event_queue, self.watch_manager)
+        self.watch_manager.event_handler = self.event_handler
+        self._pre_scan_completed = False
+        self._pre_scan_lock = threading.Lock()
+        self._stop_driver_event = threading.Event()  # NEW
+
+        self._initialized = True
+
+    def _perform_pre_scan_and_schedule(self):
+        """
+        Performs a one-time scan of the directory to populate the watch manager
+        with a capacity-aware, hierarchy-complete set of the most active directories.
+        It uses a delta to normalize server mtimes to the client's time domain.
+        """
+        with self._pre_scan_lock:
+            if self._pre_scan_completed:
+                return
+
+            logger.info(f"[fs] Performing initial directory scan to build hot-directory map for: {self.uri}")
+
+            mtime_map: Dict[str, float] = {}
+
+            # Track statistics
+            error_count = 0
+            total_entries = 0  # Total number of entries (directories and files) processed
+
+            def handle_walk_error(e: OSError):
+                nonlocal error_count
+                error_count += 1
+                logger.debug(f"[fs] Error during pre-scan walk, skipping path: {e.filename} - {e.strerror}")
+
+            # Step 1: Walk the entire tree to build the mtime_map with server times
+            for root, dirs, files in os.walk(self.uri, topdown=False, onerror=handle_walk_error):
+                try:
+                    latest_mtime = os.path.getmtime(root)
+                except OSError:
+                    continue
+
+                for filename in files:
+                    file_path = os.path.join(root, filename)
+                    try:
+                        stat_info = os.stat(file_path)
+                        latest_mtime = max(latest_mtime, stat_info.st_mtime)
+                    except (FileNotFoundError, PermissionError, OSError) as e:
+                        error_count += 1
+                        logger.debug(f"[fs] Error during pre-scan walk, skipping path: {e.filename} - {e.strerror}")
+
+                    # Count each file as an entry
+                    total_entries += 1
+
+                for dirname in dirs:
+                    dirpath = os.path.join(root, dirname)
+                    latest_mtime = max(latest_mtime, mtime_map.get(dirpath, 0))
+                    # Count each dir as an entry
+                    total_entries += 1
+
+                # Count the current directory
+                mtime_map[root] = latest_mtime
+                total_entries += 1  # Increment for each directory processed
+
+                # Log statistics every 1000 entries (using a reasonable batch size)
+                if total_entries % 10000 == 0:
+                    # Find the newest and oldest directories so far
+                    if mtime_map:
+                        newest_dir = max(mtime_map.items(), key=lambda x: x[1])
+                        oldest_dir = min(mtime_map.items(), key=lambda x: x[1])
+                        newest_age = time.time() - newest_dir[1]  # Difference in seconds
+                        oldest_age = time.time() - oldest_dir[1]  # Difference in seconds
+                        logger.info(
+                            f"[fs] Pre-scan progress: processed {total_entries} entries, "
+                            f"errors: {error_count}, newest_dir: {newest_dir[0]} (age: {newest_age/86400:.2f} days), "
+                            f"oldest_dir: {oldest_dir[0]} (age: {oldest_age/86400:.2f} days)"
+                        )
+
+            # Step 2: Calculate baseline delta using the true recursive mtime of the root.
+            try:
+                root_recursive_mtime = mtime_map.get(self.uri, os.path.getmtime(self.uri))
+                self.clock_offset = time.time() - root_recursive_mtime
+                logger.info(f"[fs] Calculated client-server time delta: {self.clock_offset:.2f} seconds.")
+            except OSError as e:
+                logger.warning(f"[fs] Could not stat root directory to calculate time delta: {e}. Proceeding without normalization.")
+
+            # Log final statistics before sorting
+            if mtime_map:
+                newest_dir = max(mtime_map.items(), key=lambda x: x[1])
+                oldest_dir = min(mtime_map.items(), key=lambda x: x[1])
+                newest_age = time.time() - newest_dir[1]  # Difference in seconds
+                oldest_age = time.time() - oldest_dir[1]  # Difference in seconds
+                logger.info(
+                    f"[fs] Pre-scan completed: processed {total_entries} entries, "
+                    f"errors: {error_count}, newest_dir: {safe_path_handling(newest_dir[0])} (age: {newest_age/86400:.2f} days), "
+                    f"oldest_dir: {safe_path_handling(oldest_dir[0])} (age: {oldest_age/86400:.2f} days)"
+                )
+
+            logger.info(f"[fs] Found {len(mtime_map)} total directories. Building capacity-aware, hierarchy-complete watch set...")
+            sorted_dirs = sorted(mtime_map.items(), key=lambda item: item[1], reverse=True)[:self.watch_manager.watch_limit]
+            old_limit = self.watch_manager.watch_limit
+            for path, _ in sorted_dirs:
+                server_mtime = mtime_map.get(path)
+                if server_mtime:
+                    # Normalize to client time domain while preserving relative differences
+                    lru_timestamp = server_mtime + self.clock_offset
+                else:
+                    # Fallback for parents that might not have been in mtime_map (though they should be)
+                    lru_timestamp = time.time()
+                self.watch_manager.schedule(path, lru_timestamp)
+                if self.watch_manager.watch_limit < old_limit:
+                    break  # Stop if we hit the limit during scheduling
+            logger.info(f"[fs] Final watch set constructed. Total paths to watch: {len(self.watch_manager.lru_cache)}.")
+            self._pre_scan_completed = True
+
+
+    def get_snapshot_iterator(self, **kwargs) -> Iterator[EventBase]:
+        stream_id = f"snapshot-fs-{uuid.uuid4().hex[:6]}"
+        logger.info(f"[{stream_id}] Starting Snapshot Scan Phase: for path: {self.uri}")
+
+        driver_params = self.config.driver_params
+        if driver_params.get("startup_mode") == "message-only":
+            logger.info(f"[{stream_id}] Skipping snapshot due to 'message-only' mode.")
+            return
+
+        file_pattern = driver_params.get("file_pattern", "*")
+        batch_size = kwargs.get("batch_size", 100)
+
+        logger.info(f"[{stream_id}] Scan parameters: file_pattern='{file_pattern}'")
+
+        try:
+            batch: List[Dict[str, Any]] = []
+            files_processed_count = 0
+            error_count = 0
+            snapshot_time = int(time.time() * 1000)
+
+            def handle_walk_error(e: OSError):
+                nonlocal error_count
+                error_count += 1
+                logger.debug(f"[{stream_id}] Error during snapshot walk, skipping path: {safe_path_handling(e.filename)} - {e.strerror}")
+
+            temp_mtime_map: Dict[str, float] = {}
+
+            for root, dirs, files in os.walk(self.uri, topdown=False, onerror=handle_walk_error):
+                try:
+                    dir_stat_info = os.stat(root)
+                    latest_mtime_in_subtree = dir_stat_info.st_mtime
+                except OSError:
+                    dir_stat_info = None
+                    latest_mtime_in_subtree = 0.0
+
+                for filename in files:
+                    file_path = os.path.join(root, filename)
+                    try:
+                        stat_info = os.stat(file_path)
+                        latest_mtime_in_subtree = max(latest_mtime_in_subtree, stat_info.st_mtime)
+                        if fnmatch.fnmatch(filename, file_pattern):
+                            metadata = get_file_metadata(file_path, stat_info=stat_info)
+                            if metadata:
+                                batch.append(metadata)
+                                files_processed_count += 1
+                                if len(batch) >= batch_size:
+                                    # Extract fields from the first row if batch is not empty
+                                    fields = list(batch[0].keys()) if batch else []
+                                    yield UpdateEvent(event_schema=self.uri, table="files", rows=batch, index=snapshot_time, fields=fields)
+                                    batch = []
+                    except (FileNotFoundError, PermissionError, OSError) as e:
+                        error_count += 1
+                        logger.debug(f"[fs] Error processing file during snapshot: {safe_path_handling(file_path)} - {str(e)}")
+
+                for dirname in dirs:
+                    dirpath = os.path.join(root, dirname)
+                    latest_mtime_in_subtree = max(latest_mtime_in_subtree, temp_mtime_map.get(dirpath, 0.0))
+
+                temp_mtime_map[root] = latest_mtime_in_subtree
+                aligned_lru_timestamp = latest_mtime_in_subtree + self.clock_offset
+                self.watch_manager.touch(root, aligned_lru_timestamp, is_recursive_upward=False)
+
+                if dir_stat_info:
+                    dir_metadata = get_file_metadata(root, stat_info=dir_stat_info)
+                    if dir_metadata:
+                        batch.append(dir_metadata)
+                        files_processed_count += 1
+
+                if len(batch) >= batch_size:
+                    # Extract fields from the first row if batch is not empty
+                    fields = list(batch[0].keys()) if batch else []
+                    yield UpdateEvent(event_schema=self.uri, table="files", rows=batch, index=snapshot_time, fields=fields)
+                    batch = []
+
+            if batch:
+                fields = list(batch[0].keys()) if batch else []
+                yield UpdateEvent(event_schema=self.uri, table="files", rows=batch, index=snapshot_time, fields=fields)
+
+            if error_count > 0:
+                logger.warning(f"[{stream_id}] Skipped {error_count} paths in total due to permission or other errors.")
+
+            logger.info(f"[{stream_id}] Full scan complete. Processed {files_processed_count} files and directories.")
+
+        except Exception as e:
+            logger.error(f"[{stream_id}] Snapshot phase for fs failed: {e}", exc_info=True)
+
+    def get_message_iterator(self, start_position: int=-1, **kwargs) -> Iterator[EventBase]:
+
+        # Perform pre-scan to populate watches before starting the observer.
+        # This is essential for the message-first architecture and must block
+        # until completion to prevent race conditions downstream.
+        self._perform_pre_scan_and_schedule()
+
+        def _iterator_func() -> Iterator[EventBase]:
+            # After pre-scan is complete, any new events should be considered "starting from now"
+            # If start_position is provided, use it; otherwise, start from current time
+
+            stream_id = f"message-fs-{uuid.uuid4().hex[:6]}"
+
+            stop_event = kwargs.get("stop_event")
+            self.watch_manager.start()
+            logger.info(f"[{stream_id}] WatchManager started.")
+
+            try:
+                # Process events normally, but use the effective start position
+                while not (stop_event and stop_event.is_set()):
+                    try:
+                        max_sync_delay_seconds = self.config.driver_params.get("max_sync_delay_seconds", 1.0)
+                        event = self.event_queue.get(timeout=max_sync_delay_seconds)
+
+                        if start_position!=-1 and event.index < start_position:
+                            logger.debug(f"[{stream_id}] Skipping old event: {event.event_type} index={event.index} < start_position={start_position}")
+                            continue
+
+                        yield event
+
+                    except queue.Empty:
+                        continue
+            finally:
+                self.watch_manager.stop()
+                logger.info(f"[{stream_id}] Stopped real-time monitoring for: {self.uri}")
+
+        return _iterator_func()
+
+    @classmethod
+    async def get_available_fields(cls, **kwargs) -> Dict[str, Any]:
+        return {"properties": {
+            "file_path": {"type": "string", "description": "The full, absolute path to the file.", "column_index": 0},
+            "size": {"type": "integer", "description": "The size of the file in bytes.", "column_index": 1},
+            "modified_time": {"type": "number", "description": "The last modification time as a Unix timestamp (float).", "column_index": 2},
+            "created_time": {"type": "number", "description": "The creation time as a Unix timestamp (float).", "column_index": 3},
+        }}
+
+    @classmethod
+    async def test_connection(cls, **kwargs) -> Tuple[bool, str]:
+        path = kwargs.get("uri")
+        if not path or not isinstance(path, str):
+            return (False, "路径未提供或格式不正确。")
+        if not os.path.exists(path):
+            return (False, f"路径不存在: {path}")
+        if not os.path.isdir(path):
+            return (False, f"路径不是一个目录: {path}")
+        if not os.access(path, os.R_OK):
+            return (False, f"没有读取权限: {path}")
+        return (True, "连接成功,路径有效且可读。")
+
+    @classmethod
+    async def check_privileges(cls, **kwargs) -> Tuple[bool, str]:
+        path = kwargs.get("uri")
+        if not path:
+            return (False, "Path not provided in arguments.")
+
+        try:
+            user = getpass.getuser()
+        except Exception:
+            user = "unknown"
+
+        logger.info(f"[fs] Checking permissions for user '{user}' on path: {safe_path_handling(path)}")
+
+        if not os.path.exists(path):
+            return (False, f"路径不存在: {path}")
+        if not os.path.isdir(path):
+            return (False, f"路径不是一个目录: {path}")
+
+        can_read = os.access(path, os.R_OK)
+        can_execute = os.access(path, os.X_OK)
+
+        if can_read and can_execute:
+            return (True, f"权限充足:当前用户 '{user}' 可以监控该目录。")
+
+        missing_perms = []
+        if not can_read:
+            missing_perms.append("读取")
+        if not can_execute:
+            missing_perms.append("执行(进入)")
+
+        return (False, f"权限不足:当前用户 '{user}' 缺少 {' 和 '.join(missing_perms)} 权限。")
+
+    async def close(self):
+        """
+        Close the file system watcher and stop monitoring.
+        """
+        logger.info(f"[fs] Closing file system watcher for {self.uri}")
+
+        # Stop the watch manager if it's running
+        if hasattr(self, 'watch_manager') and self.watch_manager:
+            self.watch_manager.stop()
+
+        # Set the stop event to ensure any active monitoring stops
+        if hasattr(self, '_stop_driver_event') and self._stop_driver_event:
+            self._stop_driver_event.set()
+
+        logger.info(f"[fs] Closed file system watcher for {self.uri}")
+
+    @classmethod
+    async def get_wizard_steps(cls) -> Dict[str, Any]:
+        return {
+            "steps": [
+                {
+                    "step_id": "path_setup",
+                    "title": "目录与权限",
+                    "schema": {
+                        "type": "object",
+                        "properties": {
+                            "uri": {
+                                "type": "string",
+                                "title": "监控目录路径",
+                                "description": "请输入要监控的文件夹的绝对路径。"
+                            },
+                            "driver_params": {
+                                "type": "object",
+                                "title": "驱动参数",
+                                "properties": {
+                                    "aged_interval": {
+                                        "type": "number",
+                                        "title": "被忽略监控的陈旧文件夹的年龄 (days)",
+                                        "default": 0.5
+                                    },
+                                    "max_sync_delay_seconds": {
+                                        "type": "number",
+                                        "title": "最大同步延迟 (秒)",
+                                        "description": "实时推送的最大延迟时间。如果超过此时间没有事件,将强制推送一次。",
+                                        "default": 1.0
+                                    },
+                                    "min_monitoring_window_days": {
+                                        "type": "number",
+                                        "title": "最小监控窗口 (天)",
+                                        "description": "当需要淘汰监控目录时,确保被淘汰的目录比整个监控范围内最新的文件至少旧N天。这可以防止淘汰近期仍在活跃范围内的目录。例如,设置为30,则表示只有比最新文件早30天以上的目录才允许被淘汰。",
+                                        "default": 30.0
+                                    }
+                                }
+                            }
+                        },
+                        "required": ["uri"],
+                    },
+                    "validations": ["test_connection", "check_privileges"]
+                }
+            ]
+        }
fustor_source_fs/components.py
@@ -0,0 +1,356 @@
+import collections
+import logging
+import os
+import threading
+import time
+import dataclasses
+import heapq
+from typing import Dict, List, Optional, Set, Tuple
+from fustor_core.exceptions import DriverError  # NEW IMPORT
+
+# Use the low-level inotify wrapper and high-level event types
+from watchdog.observers.inotify_c import Inotify
+from watchdog.events import (
+    FileClosedEvent,
+    FileCreatedEvent,
+    FileDeletedEvent,
+    FileModifiedEvent,
+    FileMovedEvent,
+    DirCreatedEvent,
+    DirDeletedEvent,
+    DirModifiedEvent,
+    DirMovedEvent,
+)
+
+logger = logging.getLogger("fustor_agent.driver.fs")
+
+def contains_surrogate_characters(path: str) -> bool:
+    """Checks if a string contains surrogate characters."""
+    try:
+        path.encode('utf-8')
+        return False
+    except UnicodeEncodeError:
+        return True
+
+def safe_path_encode(path: str) -> bytes:
+    """Safely encodes a path to bytes, handling surrogate characters using filesystem encoding."""
+    try:
+        return os.fsencode(path)
+    except Exception:
+        # Fallback for extreme cases or non-string inputs, though fsencode is robust
+        return path.encode('utf-8', errors='replace')
+
+def safe_path_handling(path: str) -> str:
+    """Safely handles path strings, normalizing surrogate characters if present."""
+    if contains_surrogate_characters(path):
+        # Replace surrogate characters with underscores or question marks
+        # by encoding with replacement and decoding back
+        return path.encode('utf-8', errors='replace').decode('utf-8')
+    return path
+
+@dataclasses.dataclass(frozen=True)
+class WatchEntry:
+    """Simplified entry for the LRU cache, just holds the timestamp."""
+    timestamp: float
+
+class _LRUCache:
+    """A custom cache that evicts the item with the oldest timestamp (smallest value)."""
+    def __init__(self, capacity: int):
+        self.capacity = capacity
+        self.cache: Dict[str, WatchEntry] = {}  # path -> WatchEntry
+        self.min_heap: List[Tuple[float, str]] = []  # (timestamp, path)
+        self.removed_from_heap: Set[str] = set()
+
+    def _clean_heap(self):
+        """Removes stale entries from the top of the min_heap."""
+        while self.min_heap:
+            timestamp, path = self.min_heap[0]
+            if path in self.removed_from_heap:
+                heapq.heappop(self.min_heap)
+                self.removed_from_heap.remove(path)
+            elif path not in self.cache:
+                heapq.heappop(self.min_heap)
+            else:
+                break
+
+    def get(self, key: str) -> Optional[WatchEntry]:
+        return self.cache.get(key)
+
+    def put(self, key: str, value: WatchEntry):
+        if key in self.cache:
+            self.removed_from_heap.add(key)
+        self.cache[key] = value
+        heapq.heappush(self.min_heap, (value.timestamp, key))
+
+    def evict(self) -> Optional[Tuple[str, WatchEntry]]:
+        """Removes and returns the item with the oldest timestamp."""
+        if not self.cache:
+            return None
+        self._clean_heap()
+        if not self.min_heap:
+            return None
+        _timestamp, oldest_key = heapq.heappop(self.min_heap)
+        self.removed_from_heap.discard(oldest_key)
+        oldest_entry = self.cache.pop(oldest_key)
+        return oldest_key, oldest_entry
+
+    def get_oldest(self) -> Optional[Tuple[str, WatchEntry]]:
+        """Returns the item with the oldest timestamp without removing it."""
+        if not self.cache:
+            return None
+        self._clean_heap()
+        if not self.min_heap:
+            return None
+        _timestamp, oldest_key = self.min_heap[0]
+        return oldest_key, self.cache[oldest_key]
+
+    def remove(self, key: str):
+        if key in self.cache:
+            del self.cache[key]
+            self.removed_from_heap.add(key)
+
+    def __contains__(self, key: str) -> bool:
+        return key in self.cache
+
+    def __len__(self) -> int:
+        return len(self.cache)
+
+
+class _WatchManager:
+    """
+    Manages a single inotify instance and its watches, including LRU pruning.
+    This is a more resource-efficient implementation.
+    """
+    def __init__(self, root_path: str, event_handler, min_monitoring_window_days: float = 30.0, stop_driver_event: threading.Event = None):
+        logger.info(f"Creating a new Inotify instance for root path {root_path}.")
+        self.watch_limit = 10000000  # This now only limits watches, not instances.
+        self.lru_cache = _LRUCache(self.watch_limit)
+        self.event_handler = event_handler
+        self.root_path = root_path
+        self._lock = threading.RLock()
+        self.min_monitoring_window_days = min_monitoring_window_days
+        self.stop_driver_event = stop_driver_event  # NEW
+
+        # Directly use the low-level Inotify class
+        # We watch the root path non-recursively just to initialize the instance.
+        # All other watches are added dynamically.
+        # Use safe_path_encode to handle potential surrogate characters in root_path
+        self.inotify = Inotify(safe_path_encode(root_path), recursive=False)
+
+        self._stop_event = threading.Event()
+        self.inotify_thread = threading.Thread(target=self._event_processing_loop, daemon=True)
+
+    def _event_processing_loop(self):
+        """
+        The core event loop that reads from inotify and dispatches events.
+        """
+        while not self._stop_event.is_set():
+            try:
+                raw_events = self.inotify.read_events()
+
+                # Pre-process to identify paired moves and avoid duplicate events.
+                paired_move_from_paths = set()
+                for event in raw_events:
+                    if event.is_moved_to:
+                        src_path_from = self.inotify.source_for_move(event)
+                        if src_path_from:
+                            paired_move_from_paths.add(os.fsdecode(src_path_from))
+
+                for event in raw_events:
+                    # Use fsdecode to safely decode bytes to str, it handles surrogates correctly
+                    src_path_str = os.fsdecode(event.src_path)
+
+                    # Handle paired moves (MOVED_FROM + MOVED_TO)
+                    if event.is_moved_to:
+                        src_path_from = self.inotify.source_for_move(event)
+                        if src_path_from:
+                            src_path_from_str = os.fsdecode(src_path_from)
+                            if event.is_directory:
+                                self.event_handler.on_moved(DirMovedEvent(src_path_from_str, src_path_str))
+                            else:
+                                self.event_handler.on_moved(FileMovedEvent(src_path_from_str, src_path_str))
+                        else:
+                            # Unmatched MOVED_TO: treat as creation
+                            if event.is_directory:
+                                self.event_handler.on_created(DirCreatedEvent(src_path_str))
+                            else:
+                                self.event_handler.on_created(FileCreatedEvent(src_path_str))
+
+                    # Handle unmatched MOVED_FROM (treat as deletion)
+                    elif event.is_moved_from:
+                        if src_path_str in paired_move_from_paths:
+                            continue  # Already processed as part of a move
+
+                        if event.is_directory:
+                            self.event_handler.on_deleted(DirDeletedEvent(src_path_str))
+                        else:
+                            self.event_handler.on_deleted(FileDeletedEvent(src_path_str))
+
+                    # Handle creation events
+                    elif event.is_create:
+                        if event.is_directory:
+                            self.event_handler.on_created(DirCreatedEvent(src_path_str))
+                        else:
+                            self.event_handler.on_created(FileCreatedEvent(src_path_str))
+
+                    # Handle deletion events
+                    elif event.is_delete:
+                        if event.is_directory:
+                            self.event_handler.on_deleted(DirDeletedEvent(src_path_str))
+                        else:
+                            self.event_handler.on_deleted(FileDeletedEvent(src_path_str))
+
+                    # Handle modification events (attrib or modify)
+                    elif event.is_attrib or event.is_modify:
+                        if event.is_directory:
+                            self.event_handler.on_modified(DirModifiedEvent(src_path_str))
+                        else:
+                            self.event_handler.on_modified(FileModifiedEvent(src_path_str))
+
+                    # Handle file closed after write (definitive modification)
+                    elif event.is_close_write:
+                        if event.is_directory:
+                            self.event_handler.on_closed(DirModifiedEvent(src_path_str))
+                        else:
+                            self.event_handler.on_closed(FileClosedEvent(src_path_str))
+
+                    # Handle ignored events (watch removed)
+                    elif event.is_ignored:
+                        with self._lock:
+                            if src_path_str in self.lru_cache:
+                                self.lru_cache.remove(src_path_str)
+                                logger.debug(f"Removed watch for '{safe_path_handling(src_path_str)}' from LRU cache due to IGNORED event.")
+
+            except KeyError as e:
+                logger.debug(f"Ignoring event for untracked watch descriptor: {str(e)}")
+
+    def schedule(self, path: str, timestamp: Optional[float] = None):
+        with self._lock:
+            timestamp_to_use = timestamp if timestamp is not None else time.time()
+            if path in self.lru_cache:
+                existing_entry = self.lru_cache.get(path)
+                if existing_entry and existing_entry.timestamp < timestamp_to_use:
+                    self.lru_cache.put(path, WatchEntry(timestamp_to_use))
+                return
+
+            is_eviction_needed = len(self.lru_cache) >= self.watch_limit
+
+            oldest = self.lru_cache.get_oldest()
+            if oldest and oldest[1].timestamp >= timestamp_to_use and is_eviction_needed:
+                logger.debug(f"New watch for {safe_path_handling(path)} (ts {timestamp_to_use:.2f}) is older than oldest in cache (ts {oldest[1].timestamp:.2f}). Skipping.")
+                return
+
+            if is_eviction_needed:
+                evicted_item = self.lru_cache.evict()
+                if evicted_item:
+                    evicted_path, evicted_entry = evicted_item
+                    relative_age_days = (time.time() - evicted_entry.timestamp) / 86400
+
+                    if relative_age_days < self.min_monitoring_window_days:
+                        error_msg = (
+                            f"Watch limit reached and an active watch for {evicted_path} "
+                            f"(relative age: {relative_age_days:.2f} days) is about to be evicted. "
+                            f"This is below the configured min_monitoring_window_days ({self.min_monitoring_window_days} days). "
+                            f"Stopping driver to prevent data loss."
+                        )
+                        logger.error(error_msg)
+                        if self.stop_driver_event:
+                            self.stop_driver_event.set()
+                        raise DriverError(error_msg)
+
+                    logger.info(f"Watch limit reached. Evicting watch for {safe_path_handling(evicted_path)} (relative age: {relative_age_days:.2f} days).")
+                    try:
+                        self.inotify.remove_watch(safe_path_encode(evicted_path))
+                    except (KeyError, OSError) as e:
+                        logger.warning(f"Error removing evicted watch for {safe_path_handling(evicted_path)}: {e}")
+                    self.unschedule_recursive(evicted_path)
+                else:
+                    logger.warning(f"Watch limit of {self.watch_limit} reached, but LRU cache is empty. Cannot schedule new watch for {safe_path_handling(path)}.")
+                    return
+
+            try:
+                self.inotify.add_watch(safe_path_encode(path))
+                self.lru_cache.put(path, WatchEntry(timestamp_to_use))
+            except OSError as e:
+                # Catch ENOENT (2) - File not found, likely deleted before we could watch it
+                # Catch ENOTDIR (20) - Not a directory (can happen if a file replaced a dir)
+                # Catch EACCES (13) - Permission denied
+                # Catch EINVAL (22) - Invalid argument, which can happen with special filesystems or very long paths
+                if e.errno == 2:  # ENOENT
+                    if os.path.exists(path):
+                        # Path exists, but inotify reports ENOENT. This is problematic for inotify.
+                        logger.warning(f"[fs] Could not schedule watch for {safe_path_handling(path)} (errno={e.errno}), path exists but inotify rejected it. (Consider renaming if possible).")
+                    else:
+                        # Path truly does not exist.
+                        logger.warning(f"[fs] Could not schedule watch for {safe_path_handling(path)} (errno={e.errno}), it may strictly no longer exist or be inaccessible.")
+                    return
+                if e.errno in (20, 13):  # ENOTDIR, EACCES
+                    logger.warning(f"[fs] Could not schedule watch for {safe_path_handling(path)} (errno={e.errno}), it may strictly no longer exist or be inaccessible.")
+                    return
+                if e.errno == 22:  # EINVAL - Invalid argument
+                    logger.warning(f"[fs] Could not schedule watch for {safe_path_handling(path)} (errno={e.errno}), invalid argument. This can happen with special filesystems, bind mounts, or unusual path characters.")
+                    return
+
+                if e.errno == 28:
+                    new_limit = len(self.lru_cache)
+                    relative_age_days = (time.time() - timestamp_to_use) / 86400
+                    if relative_age_days < self.min_monitoring_window_days:
+                        error_msg = (
+                            f"System inotify watch limit hit. The new watch for {path} "
+                            f"(relative age: {relative_age_days:.2f} days) is about to be skipped. "
+                            f"This is below the configured min_monitoring_window_days ({self.min_monitoring_window_days} days). "
+                            f"Stopping driver to prevent data loss."
+                        )
+                        logger.error(error_msg)
+                        if self.stop_driver_event:
+                            self.stop_driver_event.set()
+                        raise DriverError(error_msg)
+
+                    logger.warning(
+                        f"System inotify watch limit hit. Dynamically adjusting watch_limit from "
+                        f"{self.watch_limit} to {new_limit}. The new watch for {path} (relative age: {relative_age_days:.2f} days) will be skipped. "
+
+                        f"Consider increasing 'fs.inotify.max_user_watchs'."
+                    )
+                    self.watch_limit = new_limit
+                    return self.schedule(path, timestamp_to_use)  # Retry the schedule call after adjusting the limit
+                else:
+                    logger.error(f"OS error scheduling watch for {safe_path_handling(path)}: {e}", exc_info=True)
+                    raise
+
+    def unschedule_recursive(self, path: str):
+        with self._lock:
+            paths_to_remove_from_lru = [p for p in list(self.lru_cache.cache.keys()) if p == path or p.startswith(path + os.sep)]
+            for p in paths_to_remove_from_lru:
+                try:
+                    self.inotify.remove_watch(safe_path_encode(p))
+                except (KeyError, OSError) as e:
+                    logger.warning(f"Error removing watch during recursive unschedule for {p}: {e}")
+                self.lru_cache.remove(p)
+
+    def touch(self, path: str, timestamp: Optional[float] = None, is_recursive_upward: bool = True):
+        with self._lock:
+            current_path = path
+            while True:
+                try:
+                    if os.path.isdir(current_path):
+                        self.schedule(current_path, timestamp)
+                except (OSError, PermissionError) as e:
+                    logger.warning(f"[fs] Error accessing path during touch: {safe_path_handling(current_path)} - {str(e)}")
+
+                if not is_recursive_upward or len(current_path) <= len(self.root_path):
+                    break
+
+                current_path = os.path.dirname(current_path)
+
+    def start(self):
+        logger.info("WatchManager: Starting inotify event thread.")
+        self.inotify_thread.start()
+
+    def stop(self):
+        logger.info("WatchManager: Stopping inotify event thread.")
+        self._stop_event.set()
+        self.inotify.close()  # This will interrupt the blocking read_events() call
+        if self.inotify_thread.is_alive():
+            self.inotify_thread.join(timeout=5.0)
+        logger.info("WatchManager: Inotify event thread stopped.")
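
Aside (not part of the package): the `_LRUCache` in components.py orders eviction by oldest timestamp using a min-heap with lazy deletion, that is, superseded heap entries are left in place and skipped when they reach the top. A minimal standalone sketch of the same pattern is shown below; it checks staleness by value comparison rather than the `removed_from_heap` set used above.

```python
# Illustrative only: the lazy-deletion min-heap idea behind _LRUCache.
import heapq

heap = []                      # (timestamp, path) pairs, oldest on top
live = {"/a": 5.0, "/b": 2.0}  # authoritative mapping path -> latest timestamp

for path, ts in live.items():
    heapq.heappush(heap, (ts, path))

# "/b" is touched with a newer timestamp: push a fresh entry, keep the stale one.
live["/b"] = 9.0
heapq.heappush(heap, (9.0, "/b"))

def pop_oldest():
    """Pop heap entries until one matches the authoritative mapping."""
    while heap:
        ts, path = heapq.heappop(heap)
        if live.get(path) == ts:   # stale entries are silently discarded
            del live[path]
            return path, ts
    return None

print(pop_oldest())  # ('/a', 5.0): the stale (2.0, '/b') entry was skipped
```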
fustor_source_fs/event_handler.py
@@ -0,0 +1,241 @@
+import os
+import queue
+import time
+import logging
+import stat
+from typing import Any, Dict, Optional
+
+from watchdog.events import FileSystemEventHandler, FileSystemEvent
+from fustor_event_model.models import UpdateEvent, DeleteEvent
+
+from .components import _WatchManager
+
+logger = logging.getLogger("fustor_agent.driver.fs")
+
+def get_file_metadata(path: str, stat_info: Optional[os.stat_result] = None) -> Optional[Dict[str, Any]]:
+    """Get file metadata, returning mtime and ctime as float timestamps."""
+    try:
+        if stat_info is None:
+            stat_info = os.stat(path)
+
+        is_dir = stat.S_ISDIR(stat_info.st_mode)
+
+        return {
+            "file_path": path,
+            "size": stat_info.st_size,
+            "modified_time": stat_info.st_mtime,
+            "created_time": stat_info.st_ctime,
+            "is_dir": is_dir
+        }
+    except FileNotFoundError:
+        logger.warning(f"[fs] Could not stat file, it may have been deleted before processing: {path}")
+        return None
+
+
+class OptimizedWatchEventHandler(FileSystemEventHandler):
+    """
+    Event handler that processes watchdog events immediately using dedicated
+    on_* methods, which is the idiomatic way to use watchdog.
+    """
+    def __init__(self, event_queue: queue.Queue, watch_manager: _WatchManager):
+        super().__init__()
+        self.event_queue = event_queue
+        self.watch_manager = watch_manager
+
+    def _touch_recursive_bottom_up(self, path: str):
+        """Recursively touches a directory and its contents from bottom-up."""
+        if not os.path.exists(path): return
+
+        # First, touch all files and subdirectories
+        for dirpath, dirnames, _ in os.walk(path, topdown=False):
+            for dirname in dirnames:
+                subdir_path = os.path.join(dirpath, dirname)
+                self.watch_manager.touch(subdir_path, is_recursive_upward=False)
+
+        # Finally, touch the root of the path itself
+        self.watch_manager.touch(path, is_recursive_upward=False)
+
+    def _generate_move_events_recursive(self, from_path: str, to_path: str):
+        """Generates DeleteEvents for inferred old paths and UpdateEvents for new paths within a moved subtree."""
+        if not os.path.exists(to_path): return
+
+        for dirpath, dirnames, filenames in os.walk(to_path, topdown=False):
+            for filename in filenames:
+                add_path = os.path.join(dirpath, filename)
+                del_path = add_path.replace(to_path, from_path, 1)
+
+                # Generate DeleteEvent for the old path
+                row = {"file_path": del_path}
+                delete_event = DeleteEvent(
+                    schema=self.watch_manager.root_path,
+                    event_schema=self.watch_manager.root_path,
+                    table="files",
+                    rows=[row],
+                    fields=list(row.keys()),
+                    index=int(time.time() * 1000)
+                )
+                self.event_queue.put(delete_event)
+
+                # Generate UpdateEvent for the new path
+                metadata = get_file_metadata(add_path)
+                if metadata:
+                    update_event = UpdateEvent(
+                        schema=self.watch_manager.root_path,
+                        event_schema=self.watch_manager.root_path,
+                        table="files",
+                        rows=[metadata],
+                        fields=list(metadata.keys()),
+                        index=int(time.time() * 1000)
+                    )
+                    self.event_queue.put(update_event)
+
+            for dirname in dirnames:
+                subdir_add_path = os.path.join(dirpath, dirname)
+                subdir_del_path = subdir_add_path.replace(to_path, from_path, 1)
+
+                # Generate DeleteEvent for the old directory path
+                row = {"file_path": subdir_del_path}
+                delete_event = DeleteEvent(
+                    schema=self.watch_manager.root_path,
+                    event_schema=self.watch_manager.root_path,
+                    table="files",
+                    rows=[row],
+                    fields=list(row.keys()),
+                    index=int(time.time() * 1000)
+                )
+                self.event_queue.put(delete_event)
+                # No UpdateEvent for directories, touch handles their LRU/watch status
+                # Generate UpdateEvent for the new path
+                metadata = get_file_metadata(subdir_add_path)
+                if metadata:
+                    update_event = UpdateEvent(
+                        schema=self.watch_manager.root_path,
+                        event_schema=self.watch_manager.root_path,
+                        table="files",
+                        rows=[metadata],
+                        fields=list(metadata.keys()),
+                        index=int(time.time() * 1000)
+                    )
+                    self.event_queue.put(update_event)
+
+    def on_created(self, event: FileSystemEvent):
+        """Called when a file or directory is created."""
+        try:
+            if event.is_directory:
+                metadata = get_file_metadata(event.src_path)
+                if metadata:
+                    update_event = UpdateEvent(
+                        schema=self.watch_manager.root_path,
+                        event_schema=self.watch_manager.root_path,
+                        table="files",
+                        rows=[metadata],
+                        fields=list(metadata.keys()),
+                        index=int(time.time() * 1000)
+                    )
+                    self.event_queue.put(update_event)
+            self.watch_manager.touch(event.src_path)
+        except Exception as e:
+            logger.warning(f"[fs] Error processing file creation event for {event.src_path}: {str(e)}")
+
+    def on_deleted(self, event: FileSystemEvent):
+        """Called when a file or directory is deleted."""
+        try:
+            # For a deleted path, we should not attempt to touch/schedule a watch.
+            # Instead, we unschedule and generate a delete event.
+
+            if event.is_directory:
+                self.watch_manager.unschedule_recursive(event.src_path)
+            row = {"file_path": event.src_path}
+            delete_event = DeleteEvent(
+                schema=self.watch_manager.root_path,
+                event_schema=self.watch_manager.root_path,
+                table="files",
+                rows=[row],
+                fields=list(row.keys()),
+                index=int(time.time() * 1000)
+            )
+            self.event_queue.put(delete_event)
+
+            # A deletion is an activity, touch the parent path to update its timestamp.
+            # We assume the parent is always a directory.
+            self.watch_manager.touch(os.path.dirname(event.src_path))
+        except Exception as e:
+            logger.warning(f"[fs] Error processing file deletion event for {event.src_path}: {str(e)}")
+
+    def on_moved(self, event: FileSystemEvent):
+        """Called when a file or a directory is moved or renamed."""
+        try:
+            # Touch the parent of the source path to update its timestamp (something disappeared).
+            self.watch_manager.touch(os.path.dirname(event.src_path))
+            # Touch the parent of the destination path to update its timestamp (something appeared).
+            self.watch_manager.touch(os.path.dirname(event.dest_path))
+
+            # Create and queue the delete event for the old location
+            delete_row = {"file_path": event.src_path}
+            delete_event = DeleteEvent(
+                schema=self.watch_manager.root_path,
+                event_schema=self.watch_manager.root_path,
+                table="files",
+                rows=[delete_row],
+                fields=list(delete_row.keys()),
+                index=int(time.time() * 1000)
+            )
+            self.event_queue.put(delete_event)
+
+            # Handle the creation/update event for the new location
+            if event.is_directory:
+                # For directories, process recursively
+                self._generate_move_events_recursive(event.src_path, event.dest_path)
+                # Recursively touch all contents at the new destination to ensure watches are updated/scheduled.
+                self._touch_recursive_bottom_up(event.dest_path)
+                # Unschedule the old path recursively
+                self.watch_manager.unschedule_recursive(event.src_path)
+            else:
+                # For files, create update event for new location
+                metadata = get_file_metadata(event.dest_path)
+                if metadata:
+                    update_event = UpdateEvent(
+                        schema=self.watch_manager.root_path,
+                        event_schema=self.watch_manager.root_path,
+                        table="files",
+                        rows=[metadata],
+                        fields=list(metadata.keys()),
+                        index=int(time.time() * 1000)
+                    )
+                    self.event_queue.put(update_event)
+                # Touch the file itself at its new destination
+                self.watch_manager.touch(event.dest_path)
+        except Exception as e:
+            logger.warning(f"[fs] Error processing file move event for {event.src_path} -> {event.dest_path}: {str(e)}")
+            # Note: If we get here, the delete_event may already be in the queue
+            # This is an inherent issue with partial failure in distributed systems,
+            # but we prevent the entire system from crashing
+
+    def on_modified(self, event: FileSystemEvent):
+        """
+        Called when a file or directory is modified.
+        This is intentionally ignored to wait for the 'closed' event,
+        ensuring the file is fully written.
+        """
+        pass
+
+    def on_closed(self, event: FileSystemEvent):
+        """
+        Called when a file opened for writing is closed.
+        """
+        try:
+            self.watch_manager.touch(event.src_path)
+            if not event.is_directory:
+                metadata = get_file_metadata(event.src_path)
+                if metadata:
+                    update_event = UpdateEvent(
+                        schema=self.watch_manager.root_path,
+                        event_schema=self.watch_manager.root_path,
+                        table="files",
+                        rows=[metadata],
+                        fields=list(metadata.keys()),
+                        index=int(time.time() * 1000)
+                    )
+                    self.event_queue.put(update_event)
+        except Exception as e:
+            logger.warning(f"[fs] Error processing file closed event for {event.src_path}: {str(e)}")
fustor_source_fs/py.typed
File without changes (empty py.typed marker).
fustor_source_fs-0.1.4.dist-info/RECORD
@@ -0,0 +1,9 @@
+fustor_source_fs/__init__.py,sha256=nl3IL57-yK_j0wHvR-tXL3VqUJETgtbM5cxIXMWHD6M,20143
+fustor_source_fs/components.py,sha256=2fK7p7UWDaPh7wwKbM8YOKNrcn9Jw9PN3bMAPHgCBiI,17589
+fustor_source_fs/event_handler.py,sha256=yxLqlppaeVP6ZnJ8SeMck1LO7Ps_6flRqeFlcOfEwvI,10862
+fustor_source_fs/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+fustor_source_fs-0.1.4.dist-info/METADATA,sha256=KE9hfxMojQ2WJhmn2uw2WDgLQA1oYHqG3lqgkDwuai0,208
+fustor_source_fs-0.1.4.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+fustor_source_fs-0.1.4.dist-info/entry_points.txt,sha256=ruXHaAbRgq5l2QL5tf2QojUQzAxmE4m5hjYvLID5SPI,62
+fustor_source_fs-0.1.4.dist-info/top_level.txt,sha256=SfFvq54lUQVNr5GjMXj1qtUegMNYULlHrNt5KbNfHdI,17
+fustor_source_fs-0.1.4.dist-info/RECORD,,
fustor_source_fs-0.1.4.dist-info/top_level.txt
@@ -0,0 +1 @@
+fustor_source_fs