fustor-source-fs 0.1.2.post3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fustor_source_fs-0.1.2.post3/PKG-INFO +8 -0
- fustor_source_fs-0.1.2.post3/README.md +44 -0
- fustor_source_fs-0.1.2.post3/flow.md +21 -0
- fustor_source_fs-0.1.2.post3/pyproject.toml +22 -0
- fustor_source_fs-0.1.2.post3/setup.cfg +4 -0
- fustor_source_fs-0.1.2.post3/src/fustor_source_fs/__init__.py +412 -0
- fustor_source_fs-0.1.2.post3/src/fustor_source_fs/components.py +356 -0
- fustor_source_fs-0.1.2.post3/src/fustor_source_fs/event_handler.py +241 -0
- fustor_source_fs-0.1.2.post3/src/fustor_source_fs/py.typed +0 -0
- fustor_source_fs-0.1.2.post3/src/fustor_source_fs.egg-info/PKG-INFO +8 -0
- fustor_source_fs-0.1.2.post3/src/fustor_source_fs.egg-info/SOURCES.txt +27 -0
- fustor_source_fs-0.1.2.post3/src/fustor_source_fs.egg-info/dependency_links.txt +1 -0
- fustor_source_fs-0.1.2.post3/src/fustor_source_fs.egg-info/entry_points.txt +2 -0
- fustor_source_fs-0.1.2.post3/src/fustor_source_fs.egg-info/requires.txt +3 -0
- fustor_source_fs-0.1.2.post3/src/fustor_source_fs.egg-info/top_level.txt +1 -0
- fustor_source_fs-0.1.2.post3/tests/test_buffer_full_agent_only.py +107 -0
- fustor_source_fs-0.1.2.post3/tests/test_continuous_write.py +145 -0
- fustor_source_fs-0.1.2.post3/tests/test_crash_write.py +117 -0
- fustor_source_fs-0.1.2.post3/tests/test_dynamic_monitoring.py +278 -0
- fustor_source_fs-0.1.2.post3/tests/test_enoent_handling.py +45 -0
- fustor_source_fs-0.1.2.post3/tests/test_errno_22_handling.py +100 -0
- fustor_source_fs-0.1.2.post3/tests/test_event_handler_move.py +165 -0
- fustor_source_fs-0.1.2.post3/tests/test_ftp_scenario.py +167 -0
- fustor_source_fs-0.1.2.post3/tests/test_invalid_args.py +119 -0
- fustor_source_fs-0.1.2.post3/tests/test_message_iterator.py +139 -0
- fustor_source_fs-0.1.2.post3/tests/test_snapshot.py +118 -0
- fustor_source_fs-0.1.2.post3/tests/test_surrogate_bug.py +59 -0
- fustor_source_fs-0.1.2.post3/tests/test_touch_exceptions.py +283 -0
- fustor_source_fs-0.1.2.post3/tests/test_unicode_logging.py +74 -0
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# fustor-source-fs
|
|
2
|
+
|
|
3
|
+
This package provides a `SourceDriver` implementation for the Fustor Agent service, enabling it to monitor and extract data from local file systems. It employs a "Smart Dynamic Monitoring" strategy to efficiently handle large directory structures, supporting both snapshot (initial scan) and real-time (event-driven) synchronization of file changes.
|
|
4
|
+
|
|
5
|
+
## Features
|
|
6
|
+
|
|
7
|
+
* **File System Monitoring**: Utilizes `watchdog` to detect file creation, modification, deletion, and movement events in real-time.
|
|
8
|
+
* **Smart Dynamic Monitoring**: Implements a sophisticated strategy to monitor large directory trees efficiently, including:
|
|
9
|
+
* **Capacity-aware scheduling**: Prioritizes monitoring of "hot" (recently modified) directories.
|
|
10
|
+
* **LRU eviction**: Automatically evicts least recently used directory watches to stay within system limits.
|
|
11
|
+
* **Adaptive limits**: Adjusts monitoring limits dynamically based on system feedback.
|
|
12
|
+
* **Snapshot Synchronization**: Performs an initial scan of the configured directory to capture existing files as a snapshot.
|
|
13
|
+
* **Real-time Message Synchronization**: Delivers file system events (create, update, delete) as they occur.
|
|
14
|
+
* **Shared Instance Model**: Optimizes resource usage by sharing `_WatchManager` instances for identical configurations.
|
|
15
|
+
* **Connection & Privilege Checking**: Verifies path existence, readability, and execution permissions.
|
|
16
|
+
* **Wizard Definition**: Provides a configuration wizard for UI integration, guiding users through path setup and monitoring parameters.
|
|
17
|
+
* **Transient Source**: Identified as a transient source, meaning events are lost if not processed immediately, leading to specific back-pressure handling.
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
This package is part of the Fustor monorepo and is typically installed in editable mode within the monorepo's development environment using `uv sync`. It is registered as a `fustor_agent.drivers.sources` entry point.
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
To use the `fustor-source-fs` driver, configure a Source in your Fustor Agent setup with the driver type `fs`. You will need to provide the absolute path to the directory you wish to monitor.
|
|
26
|
+
|
|
27
|
+
Example (conceptual configuration in Fustor Agent):
|
|
28
|
+
|
|
29
|
+
```yaml
|
|
30
|
+
# ~/.fustor/config.yaml
|
|
31
|
+
sources:
|
|
32
|
+
my-fs-source:
|
|
33
|
+
driver_type: fs
|
|
34
|
+
uri: /path/to/your/monitored/directory
|
|
35
|
+
driver_params:
|
|
36
|
+
min_monitoring_window_days: 7 # Ensure directories are monitored for at least 7 days
|
|
37
|
+
max_sync_delay_seconds: 0.5 # Max delay for real-time events
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Dependencies
|
|
41
|
+
|
|
42
|
+
* `watchdog`: Python library to monitor file system events.
|
|
43
|
+
* `fustor-core`: Provides the `SourceDriver` abstract base class and other core components.
|
|
44
|
+
* `fustor-event-model`: Provides `EventBase` for event data structures.
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
# NFS监控事件流处理流程图
|
|
2
|
+
|
|
3
|
+
```mermaid
|
|
4
|
+
flowchart TD
|
|
5
|
+
A["NFS Server<br/>(10M files)"] --> B{File System<br/>Change?}
|
|
6
|
+
B -->|Yes| C[Server A<br/>fuagent-1]
|
|
7
|
+
B -->|Yes| D[Server B<br/>fuagent-2]
|
|
8
|
+
C --> E[FSDriver<br/>detects change]
|
|
9
|
+
D --> F[FSDriver<br/>detects change]
|
|
10
|
+
E --> G["Convert to Event<br/>(path, type, metadata)"]
|
|
11
|
+
F --> H["Convert to Event<br/>(path, type, metadata)"]
|
|
12
|
+
G --> I[Send Event Stream<br/>Server A events]
|
|
13
|
+
H --> J[Send Event Stream<br/>Server B events]
|
|
14
|
+
I --> K[Consumer<br/>Aggregator]
|
|
15
|
+
J --> K
|
|
16
|
+
K --> L[Event Deduplication<br/>by ID/Timestamp]
|
|
17
|
+
L --> M[Merge events<br/>from both sources]
|
|
18
|
+
M --> N[Update Directory<br/>Tree View]
|
|
19
|
+
N --> O[Verify<br/>Consistency]
|
|
20
|
+
O --> P[Provides consistent<br/>directory view API]
|
|
21
|
+
```
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "fustor-source-fs"
|
|
3
|
+
dynamic = ["version"]
|
|
4
|
+
description = "Fustor Agent FS Source Driver"
|
|
5
|
+
license = "MIT"
|
|
6
|
+
dependencies = [ "fustor-core", "fustor-event-model", "watchdog",]
|
|
7
|
+
|
|
8
|
+
[build-system]
|
|
9
|
+
requires = [ "setuptools>=61.0", "setuptools-scm>=8.0"]
|
|
10
|
+
build-backend = "setuptools.build_meta"
|
|
11
|
+
|
|
12
|
+
[tool.setuptools_scm]
|
|
13
|
+
root = "../.."
|
|
14
|
+
version_scheme = "post-release"
|
|
15
|
+
local_scheme = "dirty-tag"
|
|
16
|
+
|
|
17
|
+
["project.urls"]
|
|
18
|
+
Homepage = "https://github.com/excelwang/fustor/tree/master/packages/source_fs"
|
|
19
|
+
"Bug Tracker" = "https://github.com/excelwang/fustor/issues"
|
|
20
|
+
|
|
21
|
+
[project.entry-points."fustor_agent.drivers.sources"]
|
|
22
|
+
fs = "fustor_source_fs:FSDriver"
|
|
@@ -0,0 +1,412 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Fuagent source driver for the file system.
|
|
3
|
+
|
|
4
|
+
This driver implements a 'Smart Dynamic Monitoring' strategy to efficiently
|
|
5
|
+
monitor large directory structures without exhausting system resources.
|
|
6
|
+
"""
|
|
7
|
+
import os
|
|
8
|
+
import queue
|
|
9
|
+
import time
|
|
10
|
+
import datetime
|
|
11
|
+
import logging
|
|
12
|
+
import uuid
|
|
13
|
+
import getpass
|
|
14
|
+
import fnmatch
|
|
15
|
+
import threading
|
|
16
|
+
from typing import Any, Dict, Iterator, List, Tuple
|
|
17
|
+
from fustor_core.drivers import SourceDriver
|
|
18
|
+
from fustor_core.models.config import SourceConfig
|
|
19
|
+
from fustor_event_model.models import EventBase, UpdateEvent, DeleteEvent
|
|
20
|
+
|
|
21
|
+
from .components import _WatchManager, safe_path_handling
|
|
22
|
+
from .event_handler import OptimizedWatchEventHandler, get_file_metadata
|
|
23
|
+
|
|
24
|
+
logger = logging.getLogger("fustor_agent.driver.fs")
|
|
25
|
+
|
|
26
|
+
import threading
|
|
27
|
+
|
|
28
|
+
class FSDriver(SourceDriver):
    """File-system SourceDriver implementing the 'Smart Dynamic Monitoring' strategy."""
    # Registry of shared driver instances, keyed by a "uri#credential-hash"
    # signature so identical configurations reuse one driver (see __new__).
    _instances: Dict[str, 'FSDriver'] = {}
    # Guards _instances against concurrent construction in __new__.
    _lock = threading.Lock()
|
|
32
|
+
@property
|
|
33
|
+
def is_transient(self) -> bool:
|
|
34
|
+
"""
|
|
35
|
+
FS driver is transient - events will be lost if not processed immediately.
|
|
36
|
+
"""
|
|
37
|
+
return True
|
|
38
|
+
|
|
39
|
+
def __new__(cls, id: str, config: SourceConfig):
|
|
40
|
+
# Generate unique signature based on URI and credentials to ensure permission isolation
|
|
41
|
+
signature = f"{config.uri}#{hash(str(config.credential))}"
|
|
42
|
+
|
|
43
|
+
with FSDriver._lock:
|
|
44
|
+
if signature not in FSDriver._instances:
|
|
45
|
+
# Create new instance
|
|
46
|
+
instance = super().__new__(cls)
|
|
47
|
+
FSDriver._instances[signature] = instance
|
|
48
|
+
return FSDriver._instances[signature]
|
|
49
|
+
|
|
50
|
+
def __init__(self, id: str, config: SourceConfig):
|
|
51
|
+
# Prevent re-initialization of shared instances
|
|
52
|
+
if hasattr(self, '_initialized'):
|
|
53
|
+
return
|
|
54
|
+
|
|
55
|
+
super().__init__(id, config)
|
|
56
|
+
self.uri = self.config.uri
|
|
57
|
+
self.event_queue: queue.Queue[EventBase] = queue.Queue()
|
|
58
|
+
self.clock_offset = 0.0 # Placeholder for potential future use
|
|
59
|
+
self._stop_driver_event = threading.Event() # NEW
|
|
60
|
+
min_monitoring_window_days = self.config.driver_params.get("min_monitoring_window_days", 30.0)
|
|
61
|
+
self.watch_manager = _WatchManager(self.uri, event_handler=None, min_monitoring_window_days=min_monitoring_window_days, stop_driver_event=self._stop_driver_event)
|
|
62
|
+
self.event_handler = OptimizedWatchEventHandler(self.event_queue, self.watch_manager)
|
|
63
|
+
self.watch_manager.event_handler = self.event_handler
|
|
64
|
+
self._pre_scan_completed = False
|
|
65
|
+
self._pre_scan_lock = threading.Lock()
|
|
66
|
+
self._stop_driver_event = threading.Event() # NEW
|
|
67
|
+
|
|
68
|
+
self._initialized = True
|
|
69
|
+
|
|
70
|
+
def _perform_pre_scan_and_schedule(self):
|
|
71
|
+
"""
|
|
72
|
+
Performs a one-time scan of the directory to populate the watch manager
|
|
73
|
+
with a capacity-aware, hierarchy-complete set of the most active directories.
|
|
74
|
+
It uses a delta to normalize server mtimes to the client's time domain.
|
|
75
|
+
"""
|
|
76
|
+
with self._pre_scan_lock:
|
|
77
|
+
if self._pre_scan_completed:
|
|
78
|
+
return
|
|
79
|
+
|
|
80
|
+
logger.info(f"[fs] Performing initial directory scan to build hot-directory map for: {self.uri}")
|
|
81
|
+
|
|
82
|
+
mtime_map: Dict[str, float] = {}
|
|
83
|
+
|
|
84
|
+
# Track statistics
|
|
85
|
+
error_count = 0
|
|
86
|
+
total_entries = 0 # Total number of entries (directories and files) processed
|
|
87
|
+
|
|
88
|
+
def handle_walk_error(e: OSError):
|
|
89
|
+
nonlocal error_count
|
|
90
|
+
error_count += 1
|
|
91
|
+
logger.debug(f"[fs] Error during pre-scan walk, skipping path: {e.filename} - {e.strerror}")
|
|
92
|
+
|
|
93
|
+
# Step 1: Walk the entire tree to build the mtime_map with server times
|
|
94
|
+
for root, dirs, files in os.walk(self.uri, topdown=False, onerror=handle_walk_error):
|
|
95
|
+
try:
|
|
96
|
+
latest_mtime = os.path.getmtime(root)
|
|
97
|
+
except OSError:
|
|
98
|
+
continue
|
|
99
|
+
|
|
100
|
+
for filename in files:
|
|
101
|
+
file_path = os.path.join(root, filename)
|
|
102
|
+
try:
|
|
103
|
+
stat_info = os.stat(file_path)
|
|
104
|
+
latest_mtime = max(latest_mtime, stat_info.st_mtime)
|
|
105
|
+
except (FileNotFoundError, PermissionError, OSError) as e:
|
|
106
|
+
error_count += 1
|
|
107
|
+
logger.debug(f"[fs] Error during pre-scan walk, skipping path: {e.filename} - {e.strerror}")
|
|
108
|
+
|
|
109
|
+
# Count each file as an entry
|
|
110
|
+
total_entries += 1
|
|
111
|
+
|
|
112
|
+
for dirname in dirs:
|
|
113
|
+
dirpath = os.path.join(root, dirname)
|
|
114
|
+
latest_mtime = max(latest_mtime, mtime_map.get(dirpath, 0))
|
|
115
|
+
# Count each dir as an entry
|
|
116
|
+
total_entries += 1
|
|
117
|
+
|
|
118
|
+
# Count the current directory
|
|
119
|
+
mtime_map[root] = latest_mtime
|
|
120
|
+
total_entries += 1 # Increment for each directory processed
|
|
121
|
+
|
|
122
|
+
# Log statistics every 1000 entries (using a reasonable batch size)
|
|
123
|
+
if total_entries % 10000 == 0:
|
|
124
|
+
# Find the newest and oldest directories so far
|
|
125
|
+
if mtime_map:
|
|
126
|
+
newest_dir = max(mtime_map.items(), key=lambda x: x[1])
|
|
127
|
+
oldest_dir = min(mtime_map.items(), key=lambda x: x[1])
|
|
128
|
+
newest_age = time.time() - newest_dir[1] # Difference in seconds
|
|
129
|
+
oldest_age = time.time() - oldest_dir[1] # Difference in seconds
|
|
130
|
+
logger.info(
|
|
131
|
+
f"[fs] Pre-scan progress: processed {total_entries} entries, "
|
|
132
|
+
f"errors: {error_count}, newest_dir: {newest_dir[0]} (age: {newest_age/86400:.2f} days), "
|
|
133
|
+
f"oldest_dir: {oldest_dir[0]} (age: {oldest_age/86400:.2f} days)"
|
|
134
|
+
)
|
|
135
|
+
|
|
136
|
+
# Step 2: Calculate baseline delta using the true recursive mtime of the root.
|
|
137
|
+
try:
|
|
138
|
+
root_recursive_mtime = mtime_map.get(self.uri, os.path.getmtime(self.uri))
|
|
139
|
+
self.clock_offset = time.time() - root_recursive_mtime
|
|
140
|
+
logger.info(f"[fs] Calculated client-server time delta: {self.clock_offset:.2f} seconds.")
|
|
141
|
+
except OSError as e:
|
|
142
|
+
logger.warning(f"[fs] Could not stat root directory to calculate time delta: {e}. Proceeding without normalization.")
|
|
143
|
+
|
|
144
|
+
# Log final statistics before sorting
|
|
145
|
+
if mtime_map:
|
|
146
|
+
newest_dir = max(mtime_map.items(), key=lambda x: x[1])
|
|
147
|
+
oldest_dir = min(mtime_map.items(), key=lambda x: x[1])
|
|
148
|
+
newest_age = time.time() - newest_dir[1] # Difference in seconds
|
|
149
|
+
oldest_age = time.time() - oldest_dir[1] # Difference in seconds
|
|
150
|
+
logger.info(
|
|
151
|
+
f"[fs] Pre-scan completed: processed {total_entries} entries, "
|
|
152
|
+
f"errors: {error_count}, newest_dir: {safe_path_handling(newest_dir[0])} (age: {newest_age/86400:.2f} days), "
|
|
153
|
+
f"oldest_dir: {safe_path_handling(oldest_dir[0])} (age: {oldest_age/86400:.2f} days)"
|
|
154
|
+
)
|
|
155
|
+
|
|
156
|
+
logger.info(f"[fs] Found {len(mtime_map)} total directories. Building capacity-aware, hierarchy-complete watch set...")
|
|
157
|
+
sorted_dirs = sorted(mtime_map.items(), key=lambda item: item[1], reverse=True)[:self.watch_manager.watch_limit]
|
|
158
|
+
old_limit = self.watch_manager.watch_limit
|
|
159
|
+
for path, _ in sorted_dirs:
|
|
160
|
+
server_mtime = mtime_map.get(path)
|
|
161
|
+
if server_mtime:
|
|
162
|
+
# Normalize to client time domain while preserving relative differences
|
|
163
|
+
lru_timestamp = server_mtime + self.clock_offset
|
|
164
|
+
else:
|
|
165
|
+
# Fallback for parents that might not have been in mtime_map (though they should be)
|
|
166
|
+
lru_timestamp = time.time()
|
|
167
|
+
self.watch_manager.schedule(path, lru_timestamp)
|
|
168
|
+
if self.watch_manager.watch_limit < old_limit:
|
|
169
|
+
break # Stop if we hit the limit during scheduling
|
|
170
|
+
logger.info(f"[fs] Final watch set constructed. Total paths to watch: {len(self.watch_manager.lru_cache)}.")
|
|
171
|
+
self._pre_scan_completed = True
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
    def get_snapshot_iterator(self, **kwargs) -> Iterator[EventBase]:
        """
        Yield UpdateEvent batches describing the current state of the tree.

        Walks ``self.uri`` bottom-up, emitting metadata rows for files that
        match ``file_pattern`` (driver param, default ``"*"``) and for every
        readable directory, in batches of ``batch_size`` (kwarg, default 100).
        As a side effect, it touches the watch manager with each directory's
        normalized recursive mtime so watch priorities reflect the scan.

        Skipped entirely when driver param ``startup_mode`` is "message-only".
        """
        stream_id = f"snapshot-fs-{uuid.uuid4().hex[:6]}"
        logger.info(f"[{stream_id}] Starting Snapshot Scan Phase: for path: {self.uri}")

        driver_params = self.config.driver_params
        if driver_params.get("startup_mode") == "message-only":
            logger.info(f"[{stream_id}] Skipping snapshot due to 'message-only' mode.")
            return

        file_pattern = driver_params.get("file_pattern", "*")
        batch_size = kwargs.get("batch_size", 100)

        logger.info(f"[{stream_id}] Scan parameters: file_pattern='{file_pattern}'")

        try:
            batch: List[Dict[str, Any]] = []
            files_processed_count = 0
            error_count = 0
            # One shared index (ms since epoch) stamps every event of this snapshot.
            snapshot_time = int(time.time() * 1000)

            def handle_walk_error(e: OSError):
                # os.walk onerror callback: count and skip unreadable paths.
                nonlocal error_count
                error_count += 1
                logger.debug(f"[{stream_id}] Error during snapshot walk, skipping path: {safe_path_handling(e.filename)} - {e.strerror}")

            # dir path -> newest mtime observed anywhere in that directory's subtree
            temp_mtime_map: Dict[str, float] = {}

            # Bottom-up walk: children are visited before their parent, so
            # temp_mtime_map already holds child subtree mtimes at each step.
            for root, dirs, files in os.walk(self.uri, topdown=False, onerror=handle_walk_error):
                try:
                    dir_stat_info = os.stat(root)
                    latest_mtime_in_subtree = dir_stat_info.st_mtime
                except OSError:
                    # Directory itself unreadable; still process what we can.
                    dir_stat_info = None
                    latest_mtime_in_subtree = 0.0

                for filename in files:
                    file_path = os.path.join(root, filename)
                    try:
                        stat_info = os.stat(file_path)
                        latest_mtime_in_subtree = max(latest_mtime_in_subtree, stat_info.st_mtime)
                        if fnmatch.fnmatch(filename, file_pattern):
                            metadata = get_file_metadata(file_path, stat_info=stat_info)
                            if metadata:
                                batch.append(metadata)
                                files_processed_count += 1
                                if len(batch) >= batch_size:
                                    # Extract fields from the first row if batch is not empty
                                    fields = list(batch[0].keys()) if batch else []
                                    yield UpdateEvent(event_schema=self.uri, table="files", rows=batch, index=snapshot_time, fields=fields)
                                    batch = []
                    except (FileNotFoundError, PermissionError, OSError) as e:
                        error_count += 1
                        logger.debug(f"[fs] Error processing file during snapshot: {safe_path_handling(file_path)} - {str(e)}")

                for dirname in dirs:
                    dirpath = os.path.join(root, dirname)
                    latest_mtime_in_subtree = max(latest_mtime_in_subtree, temp_mtime_map.get(dirpath, 0.0))

                temp_mtime_map[root] = latest_mtime_in_subtree
                # Normalize to client time before touching the watch manager.
                aligned_lru_timestamp = latest_mtime_in_subtree + self.clock_offset
                self.watch_manager.touch(root, aligned_lru_timestamp, is_recursive_upward=False)

                # Emit a row for the directory itself (when it was statable).
                if dir_stat_info:
                    dir_metadata = get_file_metadata(root, stat_info=dir_stat_info)
                    if dir_metadata:
                        batch.append(dir_metadata)
                        files_processed_count += 1

                        if len(batch) >= batch_size:
                            # Extract fields from the first row if batch is not empty
                            fields = list(batch[0].keys()) if batch else []
                            yield UpdateEvent(event_schema=self.uri, table="files", rows=batch, index=snapshot_time, fields=fields)
                            batch = []

            # Flush the final partial batch.
            if batch:
                fields = list(batch[0].keys()) if batch else []
                yield UpdateEvent(event_schema=self.uri, table="files", rows=batch, index=snapshot_time, fields=fields)

            if error_count > 0:
                logger.warning(f"[{stream_id}] Skipped {error_count} paths in total due to permission or other errors.")

            logger.info(f"[{stream_id}] Full scan complete. Processed {files_processed_count} files and directories.")

        except Exception as e:
            # Best-effort snapshot: log and end the stream rather than crash the agent.
            logger.error(f"[{stream_id}] Snapshot phase for fs failed: {e}", exc_info=True)
|
|
259
|
+
|
|
260
|
+
def get_message_iterator(self, start_position: int=-1, **kwargs) -> Iterator[EventBase]:
|
|
261
|
+
|
|
262
|
+
# Perform pre-scan to populate watches before starting the observer.
|
|
263
|
+
# This is essential for the message-first architecture and must block
|
|
264
|
+
# until completion to prevent race conditions downstream.
|
|
265
|
+
self._perform_pre_scan_and_schedule()
|
|
266
|
+
|
|
267
|
+
def _iterator_func() -> Iterator[EventBase]:
|
|
268
|
+
# After pre-scan is complete, any new events should be considered "starting from now"
|
|
269
|
+
# If start_position is provided, use it; otherwise, start from current time
|
|
270
|
+
|
|
271
|
+
stream_id = f"message-fs-{uuid.uuid4().hex[:6]}"
|
|
272
|
+
|
|
273
|
+
stop_event = kwargs.get("stop_event")
|
|
274
|
+
self.watch_manager.start()
|
|
275
|
+
logger.info(f"[{stream_id}] WatchManager started.")
|
|
276
|
+
|
|
277
|
+
try:
|
|
278
|
+
# Process events normally, but use the effective start position
|
|
279
|
+
while not (stop_event and stop_event.is_set()):
|
|
280
|
+
try:
|
|
281
|
+
max_sync_delay_seconds = self.config.driver_params.get("max_sync_delay_seconds", 1.0)
|
|
282
|
+
event = self.event_queue.get(timeout=max_sync_delay_seconds)
|
|
283
|
+
|
|
284
|
+
if start_position!=-1 and event.index < start_position:
|
|
285
|
+
logger.debug(f"[{stream_id}] Skipping old event: {event.event_type} index={event.index} < start_position={start_position}")
|
|
286
|
+
continue
|
|
287
|
+
|
|
288
|
+
yield event
|
|
289
|
+
|
|
290
|
+
except queue.Empty:
|
|
291
|
+
continue
|
|
292
|
+
finally:
|
|
293
|
+
self.watch_manager.stop()
|
|
294
|
+
logger.info(f"[{stream_id}] Stopped real-time monitoring for: {self.uri}")
|
|
295
|
+
|
|
296
|
+
return _iterator_func()
|
|
297
|
+
|
|
298
|
+
@classmethod
|
|
299
|
+
async def get_available_fields(cls, **kwargs) -> Dict[str, Any]:
|
|
300
|
+
return {"properties": {
|
|
301
|
+
"file_path": {"type": "string", "description": "The full, absolute path to the file.", "column_index": 0},
|
|
302
|
+
"size": {"type": "integer", "description": "The size of the file in bytes.", "column_index": 1},
|
|
303
|
+
"modified_time": {"type": "number", "description": "The last modification time as a Unix timestamp (float).", "column_index": 2},
|
|
304
|
+
"created_time": {"type": "number", "description": "The creation time as a Unix timestamp (float).", "column_index": 3},
|
|
305
|
+
}}
|
|
306
|
+
|
|
307
|
+
@classmethod
|
|
308
|
+
async def test_connection(cls, **kwargs) -> Tuple[bool, str]:
|
|
309
|
+
path = kwargs.get("uri")
|
|
310
|
+
if not path or not isinstance(path, str):
|
|
311
|
+
return (False, "路径未提供或格式不正确。")
|
|
312
|
+
if not os.path.exists(path):
|
|
313
|
+
return (False, f"路径不存在: {path}")
|
|
314
|
+
if not os.path.isdir(path):
|
|
315
|
+
return (False, f"路径不是一个目录: {path}")
|
|
316
|
+
if not os.access(path, os.R_OK):
|
|
317
|
+
return (False, f"没有读取权限: {path}")
|
|
318
|
+
return (True, "连接成功,路径有效且可读。")
|
|
319
|
+
|
|
320
|
+
@classmethod
|
|
321
|
+
async def check_privileges(cls, **kwargs) -> Tuple[bool, str]:
|
|
322
|
+
path = kwargs.get("uri")
|
|
323
|
+
if not path:
|
|
324
|
+
return (False, "Path not provided in arguments.")
|
|
325
|
+
|
|
326
|
+
try:
|
|
327
|
+
user = getpass.getuser()
|
|
328
|
+
except Exception:
|
|
329
|
+
user = "unknown"
|
|
330
|
+
|
|
331
|
+
logger.info(f"[fs] Checking permissions for user '{user}' on path: {safe_path_handling(path)}")
|
|
332
|
+
|
|
333
|
+
if not os.path.exists(path):
|
|
334
|
+
return (False, f"路径不存在: {path}")
|
|
335
|
+
if not os.path.isdir(path):
|
|
336
|
+
return (False, f"路径不是一个目录: {path}")
|
|
337
|
+
|
|
338
|
+
can_read = os.access(path, os.R_OK)
|
|
339
|
+
can_execute = os.access(path, os.X_OK)
|
|
340
|
+
|
|
341
|
+
if can_read and can_execute:
|
|
342
|
+
return (True, f"权限充足:当前用户 '{user}' 可以监控该目录。")
|
|
343
|
+
|
|
344
|
+
missing_perms = []
|
|
345
|
+
if not can_read:
|
|
346
|
+
missing_perms.append("读取")
|
|
347
|
+
if not can_execute:
|
|
348
|
+
missing_perms.append("执行(进入)")
|
|
349
|
+
|
|
350
|
+
return (False, f"权限不足:当前用户 '{user}' 缺少 {' 和 '.join(missing_perms)} 权限。")
|
|
351
|
+
|
|
352
|
+
async def close(self):
|
|
353
|
+
"""
|
|
354
|
+
Close the file system watcher and stop monitoring.
|
|
355
|
+
"""
|
|
356
|
+
logger.info(f"[fs] Closing file system watcher for {self.uri}")
|
|
357
|
+
|
|
358
|
+
# Stop the watch manager if it's running
|
|
359
|
+
if hasattr(self, 'watch_manager') and self.watch_manager:
|
|
360
|
+
self.watch_manager.stop()
|
|
361
|
+
|
|
362
|
+
# Set the stop event to ensure any active monitoring stops
|
|
363
|
+
if hasattr(self, '_stop_driver_event') and self._stop_driver_event:
|
|
364
|
+
self._stop_driver_event.set()
|
|
365
|
+
|
|
366
|
+
logger.info(f"[fs] Closed file system watcher for {self.uri}")
|
|
367
|
+
|
|
368
|
+
    @classmethod
    async def get_wizard_steps(cls) -> Dict[str, Any]:
        """
        Return the configuration-wizard definition for UI integration.

        A single step collects the monitored path (required) plus optional
        driver parameters; the listed validations name the classmethods the
        wizard host runs against the entered values.
        """
        return {
            "steps": [
                {
                    "step_id": "path_setup",
                    "title": "目录与权限",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "uri": {
                                "type": "string",
                                "title": "监控目录路径",
                                "description": "请输入要监控的文件夹的绝对路径。"
                            },
                            "driver_params": {
                                "type": "object",
                                "title": "驱动参数",
                                "properties": {
                                    # Age (days) beyond which stale folders are ignored by monitoring.
                                    "aged_interval": {
                                        "type": "number",
                                        "title": "被忽略监控的陈旧文件夹的年龄 (days)",
                                        "default": 0.5
                                    },
                                    # Maximum real-time push delay in seconds.
                                    "max_sync_delay_seconds": {
                                        "type": "number",
                                        "title": "最大同步延迟 (秒)",
                                        "description": "实时推送的最大延迟时间。如果超过此时间没有事件,将强制推送一次。",
                                        "default": 1.0
                                    },
                                    # Minimum age gap (days) a directory must have versus the
                                    # newest file before its watch may be evicted.
                                    "min_monitoring_window_days": {
                                        "type": "number",
                                        "title": "最小监控窗口 (天)",
                                        "description": "当需要淘汰监控目录时,确保被淘汰的目录比整个监控范围内最新的文件至少旧N天。这可以防止淘汰近期仍在活跃范围内的目录。例如,设置为30,则表示只有比最新文件早30天以上的目录才允许被淘汰。",
                                        "default": 30.0
                                    }
                                }
                            }
                        },
                        "required": ["uri"],
                    },
                    "validations": ["test_connection", "check_privileges"]
                }
            ]
        }
|