fustor-source-mysql 0.1.9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: fustor-source-mysql
3
+ Version: 0.1.9
4
+ Summary: A MySQL source for Fustor Agent
5
+ Author-email: Huajin Wang <wanghuajin999@163.com>
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: fustor-core
8
+ Requires-Dist: aiomysql>=0.2.0
9
+ Requires-Dist: mysql-replication>=1.0.9
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
12
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
13
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
14
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
@@ -0,0 +1,47 @@
1
+ # fustor-source-mysql
2
+
3
+ This package provides a `SourceDriver` implementation for the Fustor Agent service, enabling it to extract data from MySQL databases. It supports both consistent snapshot (historical) and real-time change data capture (CDC) via MySQL's binary log.
4
+
5
+ ## Features
6
+
7
+ * **Consistent Snapshot Synchronization**: Performs a consistent snapshot of tables using `START TRANSACTION WITH CONSISTENT SNAPSHOT` to capture historical data.
8
+ * **Real-time Change Data Capture (CDC)**: Streams real-time data changes (INSERT, UPDATE, DELETE) from MySQL's binary log using `pymysqlreplication`.
9
+ * **Binlog Position Tracking**: Manages and checks binlog positions for resuming streams and determining data availability.
10
+ * **Connection Management**: Handles connection to MySQL using username/password credentials.
11
+ * **Runtime Parameter Validation**: Checks essential MySQL global variables like `log_bin` and `binlog_format` to ensure proper CDC setup.
12
+ * **Agent User Management**: Provides functionality to create a dedicated agent user with necessary replication and select privileges.
13
+ * **Privilege Checking**: Verifies that the agent user has the required permissions.
14
+ * **Field Discovery**: Dynamically discovers available fields (columns) from MySQL schemas.
15
+ * **Shared Instance Model**: Optimizes resource usage by sharing MySQL client instances for identical configurations.
16
+ * **Wizard Definition**: Provides a comprehensive configuration wizard for UI integration, guiding users through connection, runtime checks, and agent user setup.
17
+
18
+ ## Installation
19
+
20
+ This package is part of the Fustor monorepo and is typically installed in editable mode within the monorepo's development environment using `uv sync`. It is registered as a `fustor_agent.drivers.sources` entry point.
21
+
22
+ ## Usage
23
+
24
+ To use the `fustor-source-mysql` driver, configure a Source in your Fustor Agent setup with the driver type `mysql`. You will need to provide the MySQL URI (host:port) and credentials for both an administrative user (for setup and checks) and a dedicated agent user (for data extraction).
25
+
26
+ Example (conceptual configuration in Fustor Agent):
27
+
28
+ ```yaml
29
+ # ~/.fustor/config.yaml
30
+ sources:
31
+ my-mysql-source:
32
+ driver_type: mysql
33
+ uri: localhost:3306
34
+ admin_creds: # Used for initial setup and checks, not saved
35
+ user: admin_user
36
+ passwd: admin_password
37
+ credential: # Dedicated agent user for data extraction, saved
38
+ user: fustor_agent_user
39
+ passwd: agent_password
40
+ ```
41
+
42
+ ## Dependencies
43
+
44
+ * `aiomysql`: Asynchronous MySQL client for Python.
45
+ * `mysql-replication`: Library for reading MySQL binary logs.
46
+ * `fustor-core`: Provides the `SourceDriver` abstract base class and other core components.
47
+ * `fustor-event-model`: Provides `EventBase` for event data structures.
@@ -0,0 +1,33 @@
1
[project]
name = "fustor-source-mysql"
dynamic = ["version"]
description = "A MySQL source for Fustor Agent"
requires-python = ">=3.11"
# FIX: `license` was previously stranded inside a mis-quoted ["project.urls"]
# table and therefore never reached the [project] metadata.
license = "MIT"
dependencies = [ "fustor-core", "aiomysql>=0.2.0", "mysql-replication>=1.0.9",]

[[project.authors]]
name = "Huajin Wang"
email = "wanghuajin999@163.com"

[build-system]
requires = [ "setuptools>=61.0", "setuptools-scm>=8.0"]
build-backend = "setuptools.build_meta"

[tool.setuptools_scm]
root = "../.."
version_scheme = "post-release"
local_scheme = "dirty-tag"

# FIX: was ["project.urls"] -- the quotes make TOML create a *top-level* table
# literally named "project.urls" instead of the PEP 621 `project.urls` table,
# so the URLs never appeared in the built package metadata.
[project.urls]
Homepage = "https://github.com/excelwang/fustor/tree/master/packages/source_mysql"
"Bug Tracker" = "https://github.com/excelwang/fustor/issues"

[project.optional-dependencies]
dev = [ "pytest>=8.0.0", "ruff>=0.1.0", "mypy>=1.0.0", "pytest-asyncio>=0.23.0",]

[project.entry-points."fustor_agent.drivers.sources"]
mysql = "fustor_source_mysql:MysqlDriver"

[tool.setuptools.packages.find]
where = [ "src",]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,547 @@
1
+ "Fuagent source driver for MySQL."
2
+ import time
3
+ import pymysql
4
+ from pymysql.cursors import SSCursor
5
+ import uuid
6
+ from typing import Iterator, Optional, Dict, Any, Tuple, List, Set
7
+ from decimal import Decimal
8
+ from datetime import datetime, date, timedelta
9
+ from contextlib import contextmanager
10
+ from pymysqlreplication import BinLogStreamReader
11
+ from pymysqlreplication.row_event import DeleteRowsEvent, UpdateRowsEvent, WriteRowsEvent
12
+ import logging
13
+ import aiomysql
14
+ import threading
15
+ import json
16
+ import os
17
+
18
+ from fustor_core.drivers import SourceDriver
19
+ from fustor_core.models.config import SourceConfig, PasswdCredential
20
+ from fustor_core.exceptions import DriverError
21
+ from fustor_event_model.models import EventBase, InsertEvent, UpdateEvent, DeleteEvent
22
+
23
+ logger = logging.getLogger("fustor_agent.driver.mysql")
24
+
25
+ class MysqlDriver(SourceDriver):
26
+ _instances: Dict[str, 'MysqlDriver'] = {}
27
+ _lock = threading.Lock()
28
+
29
+ def __new__(cls, id: str, config: SourceConfig):
30
+ # Generate unique signature: URI + credential to ensure permission isolation
31
+ signature = f"{config.uri}#{hash(str(config.credential))}"
32
+
33
+ with MysqlDriver._lock:
34
+ if signature not in MysqlDriver._instances:
35
+ instance = super().__new__(cls)
36
+ MysqlDriver._instances[signature] = instance
37
+ return MysqlDriver._instances[signature]
38
+
39
+ def __init__(self, id: str, config: SourceConfig):
40
+ # Prevent re-initialization of shared instances
41
+ if hasattr(self, '_initialized'):
42
+ return
43
+
44
+ super().__init__(id, config)
45
+ self.uri = self.config.uri
46
+ self.credential: PasswdCredential = self.config.credential
47
+ self.column_maps: Dict[str, Dict[int, str]] = {}
48
+ self._load_schema_and_build_map()
49
+
50
+ self._initialized = True
51
+
52
+ def _load_schema_and_build_map(self):
53
+ schema_file_path = os.path.join('.conf', 'schemas', f'source_{self.id}.schema.json')
54
+ if not os.path.exists(schema_file_path):
55
+ logger.warning(f"Schema file not found for source '{self.id}' at '{schema_file_path}'. Binlog events will use placeholder column names.")
56
+ return
57
+
58
+ try:
59
+ with open(schema_file_path, 'r', encoding='utf-8') as f:
60
+ schema_data = json.load(f)
61
+
62
+ for table_key, table_schema in schema_data.get("properties", {}).items():
63
+ column_map: Dict[int, str] = {}
64
+ for col_name, col_props in table_schema.get("properties", {}).items():
65
+ col_index = col_props.get("column_index")
66
+ if col_index is not None:
67
+ column_map[col_index] = col_name
68
+ self.column_maps[table_key] = column_map
69
+ logger.info(f"Successfully loaded schema and built column maps for source '{self.id}'.")
70
+ except (json.JSONDecodeError, IOError) as e:
71
+ logger.error(f"Failed to load or parse schema file '{schema_file_path}': {e}", exc_info=True)
72
+
73
+ def _get_row_with_column_names(self, table_key: str, values: List[Any]) -> Dict[str, Any]:
74
+ column_map = self.column_maps.get(table_key)
75
+ if not column_map:
76
+ return {f"UNKNOWN_COL{i}": val for i, val in enumerate(values)}
77
+
78
+ row_dict = {}
79
+ for i, val in enumerate(values):
80
+ col_name = column_map.get(i, f"UNKNOWN_COL{i}")
81
+ row_dict[col_name] = val
82
+ return row_dict
83
+
84
+ def get_snapshot_iterator(self, **kwargs) -> Iterator[EventBase]:
85
+ stream_id = f"snapshot-{uuid.uuid4().hex[:6]}"
86
+ logger.info(f"[{stream_id}] Starting Consistent Snapshot.")
87
+
88
+ snapshot_conn = None
89
+ try:
90
+ host, port_str = self.uri.split(':')
91
+ snapshot_conn = pymysql.connect(
92
+ host=host, port=int(port_str), user=self.credential.user, passwd=self.credential.passwd or ''
93
+ )
94
+
95
+ with snapshot_conn.cursor(SSCursor) as cursor:
96
+ cursor.execute("START TRANSACTION WITH CONSISTENT SNAPSHOT")
97
+ logger.info(f"[{stream_id}] Transaction started for consistent snapshot.")
98
+
99
+ cursor.execute("SHOW MASTER STATUS")
100
+ status = cursor.fetchone()
101
+ if not status:
102
+ raise DriverError("Could not get master status to determine snapshot position.")
103
+
104
+ binlog_start_pos_int = _generate_event_index(status[0], status[1])
105
+ logger.info(f"[{stream_id}] Consistent snapshot locked at position: {binlog_start_pos_int} ({status[0]}:{status[1]})")
106
+
107
+ required_fields = kwargs.get("required_fields_tracker").get_fields() if kwargs.get("required_fields_tracker") else set()
108
+ table_columns: Dict[Tuple[str, str], List[str]] = {}
109
+ for full_field_name in required_fields:
110
+ field_parts = full_field_name.split('.')
111
+ if len(field_parts) < 3: continue
112
+ schema, table_name, column_name = field_parts[0], field_parts[1], field_parts[2]
113
+ key = (schema, table_name)
114
+ if key not in table_columns:
115
+ table_columns[key] = []
116
+ table_columns[key].append(column_name)
117
+
118
+ for (schema, table_name), columns in table_columns.items():
119
+ if not columns: continue
120
+ columns_csv = ', '.join([f"`{col}`" for col in columns])
121
+ query = f"SELECT {columns_csv} FROM `{schema}`.`{table_name}`"
122
+
123
+ logger.debug(f"[{stream_id}] Executing snapshot query: {query}")
124
+ cursor.execute(query)
125
+
126
+ batch_size = kwargs.get("batch_size", 100)
127
+ while True:
128
+ batch = cursor.fetchmany(batch_size)
129
+ if not batch:
130
+ break
131
+
132
+ rows = [{columns[i]: _normalize_row(col) for i, col in enumerate(row)} for row in batch]
133
+ if rows:
134
+ event = InsertEvent(event_schema, table_name, rows, index=binlog_start_pos_int)
135
+ yield event
136
+
137
+ snapshot_conn.commit()
138
+ logger.info(f"[{stream_id}] Snapshot transaction committed.")
139
+
140
+ except Exception as e:
141
+ if snapshot_conn:
142
+ snapshot_conn.rollback()
143
+ logger.error(f"[{stream_id}] Snapshot phase failed, transaction rolled back: {e}", exc_info=True)
144
+ raise
145
+ finally:
146
+ if snapshot_conn:
147
+ snapshot_conn.close()
148
+
149
+ def is_position_available(self, position: int) -> bool:
150
+ """
151
+ Checks if the MySQL binlog position is available for resuming.
152
+ """
153
+ if position <= 0: #means from the latest snapshot
154
+ return False
155
+
156
+ try:
157
+ logger.debug(f"Checking availability of binlog position {position}")
158
+ with _create_binlog_streamer(self.uri, self.credential, position, "pos-check", None, connect_timeout=5) as checker:
159
+ pass # If context manager succeeds, position is valid
160
+ logger.debug(f"Binlog position {position} is available.")
161
+ return True
162
+ except Exception as e:
163
+ # Broad exception to catch various pymysqlreplication errors for lost logs
164
+ logger.warning(f"Binlog position {position} is not available (Reason: {e}).")
165
+ return False
166
+
167
    def get_message_iterator(self, start_position: int=-1, **kwargs) -> Iterator[EventBase]:
        """
        Performs incremental data capture (CDC).
        """
        # Returns a generator that tails the MySQL binlog from
        # `start_position` (an encoded file:pos index, see
        # _generate_event_index; -1 means "earliest available") and converts
        # row events into Insert/Update/DeleteEvent objects. Transient stream
        # errors are retried up to config.max_retries, sleeping
        # config.retry_delay_sec between attempts, resuming from the last
        # successfully processed index.
        #
        # kwargs:
        #   stop_event: optional threading.Event for cooperative shutdown.
        #   required_fields_tracker: optional tracker used both to notice
        #       field-requirement changes and to filter each event's rows.

        def _iterator_func() -> Iterator[EventBase]:
            stream_id = f"message-stream-{uuid.uuid4().hex[:6]}"

            stop_event = kwargs.get("stop_event")
            required_fields_tracker = kwargs.get("required_fields_tracker")
            max_retries = self.config.max_retries
            retry_delay_sec = self.config.retry_delay_sec

            # -1 maps to 0, the "start of available binlog" sentinel.
            event_id_from = start_position if start_position != -1 else 0
            attempt = 0
            while attempt < max_retries:
                if stop_event and stop_event.is_set(): break
                try:
                    with _create_binlog_streamer(self.uri, self.credential, event_id_from, stream_id, stop_event) as streamer:
                        for binlog_event in streamer:
                            if stop_event and stop_event.is_set(): break

                            # Acknowledge (and clear) any pending change to the
                            # required-field set before processing this event.
                            if required_fields_tracker and required_fields_tracker.wait_for_change(timeout=0.1):
                                required_fields_tracker.clear_event()

                            if streamer.log_file is None or streamer.log_pos is None: continue
                            event_index = _generate_event_index(streamer.log_file, streamer.log_pos)
                            event = None
                            if hasattr(binlog_event, 'rows') and binlog_event.rows:
                                table_key = f"{binlog_event.event_schema}.{binlog_event.table}"
                                if isinstance(binlog_event, WriteRowsEvent):
                                    rows = [_normalize_row(self._get_row_with_column_names(table_key, row['values'])) for row in binlog_event.rows]
                                    event = InsertEvent(binlog_event.event_schema, binlog_event.table, rows, index=event_index)
                                elif isinstance(binlog_event, UpdateRowsEvent):
                                    # Only the post-update image of each row is propagated.
                                    rows = [_normalize_row(self._get_row_with_column_names(table_key, row['after_values'])) for row in binlog_event.rows]
                                    event = UpdateEvent(binlog_event.event_schema, binlog_event.table, rows, index=event_index)
                                elif isinstance(binlog_event, DeleteRowsEvent):
                                    rows = [_normalize_row(self._get_row_with_column_names(table_key, row['values'])) for row in binlog_event.rows]
                                    event = DeleteEvent(binlog_event.event_schema, binlog_event.table, rows, index=event_index)

                            if event:
                                filtered_event = _filter_event_rows(event, required_fields_tracker.get_fields() if required_fields_tracker else set())
                                if filtered_event:
                                    yield filtered_event

                            # Record progress so a retry resumes here, not at the start.
                            event_id_from = event_index

                        if stop_event and stop_event.is_set(): break
                    # Clean exit from the stream: leave the retry loop.
                    break
                except Exception as e:
                    attempt += 1
                    if attempt < max_retries:
                        logger.warning(f"[{stream_id}] Transient error in binlog stream (attempt {attempt}/{max_retries}): {e}")
                        time.sleep(retry_delay_sec)
                    else:
                        logger.error(f"[{stream_id}] Failed after {max_retries} retries in binlog stream: {e}", exc_info=True)
                        raise DriverError(f"Binlog streaming failed after {max_retries} retries: {e}")

            logger.info(f"[{stream_id}] Message iterator finished.")

        return _iterator_func()
228
+
229
+ @classmethod
230
+ async def test_connection(cls, **kwargs) -> Tuple[bool, str]:
231
+ uri = kwargs.get("uri")
232
+ admin_creds_dict = kwargs.get("admin_creds", {})
233
+ if not uri or not admin_creds_dict:
234
+ return (False, "缺少 'uri' 或 'admin_creds' 参数")
235
+ creds = PasswdCredential(**admin_creds_dict)
236
+
237
+ conn = None
238
+ try:
239
+ conn = await _get_connection(uri, creds)
240
+ logger.info(f"Successfully tested connection to {uri}")
241
+ return True, "数据库连接成功。"
242
+ except Exception as e:
243
+ logger.error(f"MySQL async test_connection failed: {e}", exc_info=True)
244
+ return False, f"数据库连接失败: {e}"
245
+ finally:
246
+ if conn is not None:
247
+ close = getattr(conn, "close", None)
248
+ if callable(close):
249
+ close()
250
+
251
+ @classmethod
252
+ async def check_runtime_params(cls, **kwargs) -> Tuple[bool, str]:
253
+ uri = kwargs.get("uri")
254
+ admin_creds_dict = kwargs.get("admin_creds", {})
255
+ if not uri or not admin_creds_dict:
256
+ return (False, "缺少 'uri' 或 'admin_creds' 参数")
257
+ admin_creds = PasswdCredential(**admin_creds_dict)
258
+
259
+ conn = None
260
+ try:
261
+ conn = await _get_connection(uri, admin_creds)
262
+ async with conn.cursor() as cursor:
263
+ await cursor.execute("SHOW GLOBAL VARIABLES LIKE 'log_bin'")
264
+ log_bin = await cursor.fetchone()
265
+ if not log_bin or log_bin[1] != 'ON':
266
+ return (False, "配置检查失败: 全局变量 'log_bin' 必须为 'ON'")
267
+
268
+ await cursor.execute("SHOW GLOBAL VARIABLES LIKE 'binlog_format'")
269
+ binlog_format = await cursor.fetchone()
270
+ if not binlog_format or binlog_format[1] != 'ROW':
271
+ return (False, "配置检查失败: 全局变量 'binlog_format' 必须为 'ROW'")
272
+ logger.info("Runtime parameters check passed")
273
+ return True, "运行时参数有效。"
274
+ except Exception as e:
275
+ logger.error(f"MySQL check_runtime_params failed: {e}", exc_info=True)
276
+ return False, f"检查运行时参数失败: {e}"
277
+ finally:
278
+ if conn is not None:
279
+ close = getattr(conn, "close", None)
280
+ if callable(close):
281
+ close()
282
+
283
+ @classmethod
284
+ async def create_agent_user(cls, **kwargs) -> Tuple[bool, str]:
285
+ uri = kwargs.get("uri")
286
+ admin_creds_dict = kwargs.get("admin_creds", {})
287
+ agent_user_dict = kwargs.get("credential", {})
288
+ if not uri or not admin_creds_dict or not agent_user_dict:
289
+ return (False, "缺少 'uri', 'admin_creds', 或 'credential' 参数")
290
+ admin_creds = PasswdCredential(**admin_creds_dict)
291
+ agent_user = PasswdCredential(**agent_user_dict)
292
+
293
+ conn = None
294
+ try:
295
+ conn = await _get_connection(uri, admin_creds)
296
+ async with conn.cursor() as cursor:
297
+ await cursor.execute(
298
+ "CREATE USER IF NOT EXISTS %s@%s IDENTIFIED BY %s",
299
+ (agent_user.user, '%', agent_user.passwd or '')
300
+ )
301
+ await cursor.execute(
302
+ "GRANT REPLICATION SLAVE, REPLICATION CLIENT, SELECT ON *.* TO %s@%s",
303
+ (agent_user.user, '%')
304
+ )
305
+ await cursor.execute("FLUSH PRIVILEGES")
306
+ logger.info(f"User '{agent_user.user}' is ready for replication.")
307
+ return True, f"用户 '{agent_user.user}' 已成功创建或验证。"
308
+ except Exception as e:
309
+ logger.error(f"Failed to create or grant privileges to user '{agent_user.user}': {e}", exc_info=True)
310
+ return False, f"创建或授权用户 '{agent_user.user}' 失败: {e}"
311
+ finally:
312
+ if conn is not None:
313
+ close = getattr(conn, "close", None)
314
+ if callable(close):
315
+ close()
316
+
317
+ @classmethod
318
+ async def check_privileges(cls, **kwargs) -> Tuple[bool, str]:
319
+ uri = kwargs.get("uri")
320
+ admin_creds_dict = kwargs.get("admin_creds", {})
321
+ agent_user_dict = kwargs.get("credential", {})
322
+ if not uri or not admin_creds_dict or not agent_user_dict:
323
+ return (False, "缺少 'uri', 'admin_creds', 或 'credential' 参数")
324
+ admin_creds = PasswdCredential(**admin_creds_dict)
325
+ agent_user = PasswdCredential(**agent_user_dict)
326
+
327
+ conn = None
328
+ try:
329
+ conn = await _get_connection(uri, admin_creds)
330
+ async with conn.cursor() as cursor:
331
+ await cursor.execute(
332
+ "SELECT Repl_slave_priv, Repl_client_priv, Select_priv FROM mysql.user WHERE User = %s AND Host = %s",
333
+ (agent_user.user, '%')
334
+ )
335
+ result = await cursor.fetchone()
336
+ if not result or result[0] != 'Y' or result[1] != 'Y' or result[2] != 'Y':
337
+ msg = f"用户 '{agent_user.user}' 缺少必要的权限 (REPLICATION SLAVE, REPLICATION CLIENT, SELECT)。"
338
+ logger.error(msg)
339
+ return False, msg
340
+
341
+ logger.info(f"User '{agent_user.user}' privileges verified")
342
+ return True, f"用户 '{agent_user.user}' 权限充足。"
343
+ except Exception as e:
344
+ logger.error(f"MySQL check_user_privileges failed for user '{agent_user.user}': {e}", exc_info=True)
345
+ return False, f"检查用户 '{agent_user.user}' 权限失败: {e}"
346
+ finally:
347
+ if conn is not None:
348
+ close = getattr(conn, "close", None)
349
+ if callable(close):
350
+ close()
351
+
352
+ @classmethod
353
+ async def get_available_fields(cls, **kwargs) -> Dict[str, Any]:
354
+ uri = kwargs.get("uri")
355
+ admin_creds_dict = kwargs.get("admin_creds")
356
+ if not uri or not admin_creds_dict:
357
+ raise DriverError("get_available_fields requires 'uri' and 'admin_creds'.")
358
+
359
+ creds = PasswdCredential(**admin_creds_dict)
360
+
361
+ conn = None
362
+ try:
363
+ conn = await _get_connection(uri, creds)
364
+ available_fields = {}
365
+ system_schemas = ('information_schema', 'mysql', 'performance_schema', 'sys')
366
+
367
+ async with conn.cursor(aiomysql.DictCursor) as cursor:
368
+ await cursor.execute("SELECT TABLE_SCHEMA, TABLE_NAME, COLUMN_NAME, ORDINAL_POSITION FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_SCHEMA NOT IN %s ORDER BY TABLE_SCHEMA, TABLE_NAME, ORDINAL_POSITION", (system_schemas,))
369
+ rows = await cursor.fetchall()
370
+ for row in rows:
371
+ composite_key = f"{row['TABLE_SCHEMA']}.{row['TABLE_NAME']}.{row['COLUMN_NAME']}"
372
+ available_fields[composite_key] = {"type": "string", "column_index": row['ORDINAL_POSITION'] - 1}
373
+
374
+ logger.info(f"Successfully retrieved {len(available_fields)} available fields from {uri}.")
375
+ return {"properties": available_fields}
376
+ except pymysql.err.OperationalError as e:
377
+ error_message = f"连接到 MySQL 失败: 访问被拒绝。请检查用户名、密码和主机权限。"
378
+ logger.debug(f"Original MySQL connection error in mysql driver: {e}", exc_info=True)
379
+ raise DriverError(error_message) from e
380
+ except Exception as e:
381
+ logger.error(f"Error getting available fields: {e}", exc_info=True)
382
+ raise
383
+ finally:
384
+ if conn is not None:
385
+ conn.close()
386
+
387
    @classmethod
    async def get_wizard_steps(cls) -> Dict[str, Any]:
        """Return the two-step configuration-wizard definition for the UI.

        Step 1 ("connection") collects the server URI plus one-off admin
        credentials and runs connectivity / runtime-parameter / field
        discovery checks. Step 2 ("agent_setup") collects the persistent
        agent credential and runs user-creation and privilege checks. The
        shared PasswdCredential schema is referenced via JSON-Schema $ref
        from the "components" section.
        """
        return {
            "steps": [
                {
                    "step_id": "connection",
                    "title": "连接与发现",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "uri": {
                                "type": "string",
                                "title": "URI",
                                "description": "MySQL服务器地址 (例如, localhost:3306)",
                                # NOTE(review): the double-escaped backslashes put a
                                # literal backslash-backslash-d into the JSON value;
                                # confirm the UI actually expects the regex digit
                                # class "\d" here.
                                "pattern": "^[a-zA-Z0-9._-]+:\\\\d+$"
                            },
                            "admin_creds": {
                                "$ref": "#/components/schemas/PasswdCredential",
                                "title": "管理员凭证",
                                "description": "用于执行连接测试、环境检查和创建代理用户的一次性管理员凭证。此凭证不会被保存。"
                            }
                        },
                        "required": ["uri", "admin_creds"]
                    },
                    # Validation hooks the wizard runs after this step's form.
                    "validations": ["test_connection", "check_params", "discover_fields_no_cache"]
                },
                {
                    "step_id": "agent_setup",
                    "title": "代理用户与参数",
                    "schema": {
                        "type": "object",
                        "properties": {
                            "credential": {
                                "$ref": "#/components/schemas/PasswdCredential",
                                "title": "代理用户凭证",
                                "description": "为FuAgent创建一个专用的、权限受限的用户,用于日常的数据拉取。此凭证将被保存。"
                            }
                        },
                        "required": ["credential"]
                    },
                    "validations": ["create_agent_user", "check_privileges"]
                }
            ],
            "components": {
                "schemas": {
                    "PasswdCredential": {
                        "type": "object",
                        "title": "用户名/密码凭证",
                        "properties": {
                            "user": { "type": "string", "title": "用户名" },
                            "passwd": { "type": "string", "title": "密码", "format": "password" }
                        },
                        "required": ["user"]
                    }
                }
            }
        }
444
+
445
+ # --- Module-level helper functions and classes ---
446
+
447
@contextmanager
def _create_binlog_streamer(
    uri: str, user_creds: PasswdCredential, event_id_from: int, stream_id: str, stop_event: Optional[threading.Event] = None, connect_timeout: int = 30
) -> Iterator[BinLogStreamReader]:
    """Context manager yielding a configured BinLogStreamReader.

    Resumes from the (file, pos) encoded in `event_id_from` (0 = earliest
    available) and always closes the stream on exit.

    Args:
        uri: "host:port" of the MySQL server.
        user_creds: credentials with REPLICATION SLAVE/CLIENT privileges.
        event_id_from: encoded binlog index (see _generate_event_index).
        stream_id: label used in log messages.
        stop_event: optional cooperative-shutdown flag; if already set, no
            stream is opened and DriverError is raised.
        connect_timeout: connection timeout in seconds, forwarded to the
            underlying connection.
    """
    streamer = None
    try:
        host, port_str = uri.split(':')
        mysql_settings = {
            "host": host,
            "port": int(port_str),
            "user": user_creds.user,
            "passwd": user_creds.passwd or '',
            # BUG FIX: connect_timeout was accepted but never used, so the
            # 5-second probe in is_position_available silently waited the
            # driver's default. Forward it to the connection settings.
            "connect_timeout": connect_timeout,
        }

        log_file, log_pos = _parse_event_index(event_id_from)

        if stop_event and stop_event.is_set():
            # BUG FIX: the original returned before yielding, which makes
            # @contextmanager raise RuntimeError("generator didn't yield")
            # in the caller's `with`. Fail fast with a clear error instead
            # (callers already treat any exception here as "stream did not
            # start").
            logger.info(f"Stream {stream_id}: Stop event already set, not starting binlog stream.")
            raise DriverError(f"Stream {stream_id}: stop requested before binlog stream was started.")

        # Randomized within a small band to avoid server_id collisions
        # between concurrent streams against the same server.
        server_id = 10086 + int(uuid.uuid4().hex[:8], 16) % 1000
        streamer = BinLogStreamReader(
            connection_settings=mysql_settings,
            server_id=server_id,
            resume_stream=True,
            log_file=log_file,
            log_pos=log_pos,
            blocking=True,
            only_events=[DeleteRowsEvent, WriteRowsEvent, UpdateRowsEvent]
        )
        logger.info(f"Stream {stream_id}: Started MySQL binlog monitoring from {log_file}:{log_pos} with server_id {server_id}")
        # NOTE: exceptions raised in the caller's with-body also propagate
        # through this yield and are logged by the handler below.
        yield streamer
    except Exception as e:
        logger.error(f"Stream {stream_id}: Failed to create BinLogStreamReader: {e}", exc_info=True)
        raise
    finally:
        if streamer:
            streamer.close()
            logger.info(f"Stream {stream_id}: MySQL binlog stream closed.")
485
+
486
async def _get_connection(uri: str, creds: PasswdCredential) -> aiomysql.Connection:
    """Open an autocommit aiomysql connection to `uri` ("host:port")."""
    host, port_str = uri.split(':')
    return await aiomysql.connect(
        host=host,
        port=int(port_str),
        user=creds.user,
        password=creds.passwd or '',
        autocommit=True,
    )
492
+
493
+ def _generate_event_index(log_file: str, log_pos: int) -> int:
494
+ if not log_file:
495
+ return 0
496
+ try:
497
+ return (int(log_file.split('.')[-1]) << 32) | log_pos
498
+ except (ValueError, IndexError):
499
+ logger.warning(f"Invalid log_file format: {log_file}, returning default index 0")
500
+ return 0
501
+
502
+ def _parse_event_index(index: int) -> Tuple[Optional[str], int]:
503
+ if index == 0:
504
+ return None, 4
505
+ try:
506
+ return f"mysql-bin.{index >> 32:06d}", index & 0xFFFFFFFF
507
+ except Exception as e:
508
+ logger.error(f"Failed to parse event index {index}: {e}", exc_info=True)
509
+ return None, 4
510
+
511
+ def _normalize_row(data):
512
+ if isinstance(data, dict):
513
+ return {k: _normalize_row(v) for k, v in data.items()}
514
+ if isinstance(data, list):
515
+ return [_normalize_row(item) for item in data]
516
+ if isinstance(data, (datetime, date, timedelta)):
517
+ return str(data)
518
+ if isinstance(data, Decimal):
519
+ return float(data)
520
+ return data
521
+
522
def _filter_event_rows(event: EventBase, required_fields: Set[str]) -> Optional[EventBase]:
    """Project an event's rows down to the required fields.

    `required_fields` holds fully-qualified "schema.table.column" names; an
    empty set means "no filtering". Returns a new event of the same type
    containing only the requested columns, or None when nothing in this
    event's table is required (or every row filtered down to empty).
    """
    if not required_fields:
        return event

    prefix = f"{event.event_schema}.{event.table}."
    if not any(name.startswith(prefix) for name in required_fields):
        return None

    projected = []
    for row in event.rows:
        kept = {
            column: value
            for column, value in row.items()
            if f"{prefix}{column}" in required_fields
        }
        if kept:
            projected.append(kept)

    if not projected:
        return None

    trimmed = type(event)(event.event_schema, event.table, projected)
    # The advertised field list reflects the first surviving row; the index
    # is carried over so checkpointing still works on the filtered event.
    trimmed.fields = list(projected[0].keys())
    trimmed.index = event.index
    return trimmed
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.4
2
+ Name: fustor-source-mysql
3
+ Version: 0.1.9
4
+ Summary: A MySQL source for Fustor Agent
5
+ Author-email: Huajin Wang <wanghuajin999@163.com>
6
+ Requires-Python: >=3.11
7
+ Requires-Dist: fustor-core
8
+ Requires-Dist: aiomysql>=0.2.0
9
+ Requires-Dist: mysql-replication>=1.0.9
10
+ Provides-Extra: dev
11
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
12
+ Requires-Dist: ruff>=0.1.0; extra == "dev"
13
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
14
+ Requires-Dist: pytest-asyncio>=0.23.0; extra == "dev"
@@ -0,0 +1,13 @@
1
+ README.md
2
+ pyproject.toml
3
+ src/fustor_source_mysql/__init__.py
4
+ src/fustor_source_mysql/py.typed
5
+ src/fustor_source_mysql.egg-info/PKG-INFO
6
+ src/fustor_source_mysql.egg-info/SOURCES.txt
7
+ src/fustor_source_mysql.egg-info/dependency_links.txt
8
+ src/fustor_source_mysql.egg-info/entry_points.txt
9
+ src/fustor_source_mysql.egg-info/requires.txt
10
+ src/fustor_source_mysql.egg-info/top_level.txt
11
+ tests/conftest.py
12
+ tests/docker-mysql.yml
13
+ tests/mysql-init.sql
@@ -0,0 +1,2 @@
1
+ [fustor_agent.drivers.sources]
2
+ mysql = fustor_source_mysql:MysqlDriver
@@ -0,0 +1,9 @@
1
+ fustor-core
2
+ aiomysql>=0.2.0
3
+ mysql-replication>=1.0.9
4
+
5
+ [dev]
6
+ pytest>=8.0.0
7
+ ruff>=0.1.0
8
+ mypy>=1.0.0
9
+ pytest-asyncio>=0.23.0
@@ -0,0 +1 @@
1
+ fustor_source_mysql
@@ -0,0 +1,52 @@
1
+ import pytest
2
+ import pytest_asyncio
3
+ import os
4
+ import pymysql
5
+
6
+ from fustor_agent.app import App
7
+
8
@pytest_asyncio.fixture(scope="function")
async def test_db_setup(test_app_instance: App):
    """Create (and afterwards drop) a small snapshot test table in `testdb`.

    Connects as root using MYSQL_ROOT_PASSWORD (the test is skipped when the
    variable is unset), seeds three rows, yields the table name to the test,
    then drops the table again on teardown.
    """
    # ... (this fixture is unchanged)
    source_config = test_app_instance.source_config_service.get_config('test-test')
    if not source_config:
        pytest.fail("Source config 'test-test' not found. Ensure config.yaml is correctly set up for tests.")

    mysql_root_password = os.getenv("MYSQL_ROOT_PASSWORD", "")
    if not mysql_root_password:
        pytest.skip("MYSQL_ROOT_PASSWORD environment variable not set, skipping integration test.")

    conn = pymysql.connect(
        host=source_config.uri.split(':')[0],
        port=int(source_config.uri.split(':')[1]),
        user="root",
        password=mysql_root_password,
        database="testdb"
    )
    table_name = "test_snapshot_table"
    with conn.cursor() as cursor:
        cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
        cursor.execute(f"""
        CREATE TABLE {table_name} (
            id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255) NOT NULL,
            value INT
        );
        """)
        cursor.executemany(f"INSERT INTO {table_name} (name, value) VALUES (%s, %s)", [('record_1', 100), ('record_2', 200), ('record_3', 300)])
    conn.commit()
    conn.close()

    # Hand the table name to the test; everything below runs as teardown.
    yield table_name

    conn = pymysql.connect(
        host=source_config.uri.split(':')[0],
        port=int(source_config.uri.split(':')[1]),
        user="root",
        password=mysql_root_password,
        database="testdb"
    )
    with conn.cursor() as cursor:
        cursor.execute(f"DROP TABLE IF EXISTS {table_name}")
    conn.commit()
    conn.close()
@@ -0,0 +1,25 @@
1
services:
  mysql-test:
    # MySQL 8 test server configured for row-based CDC: binary logging
    # enabled, ROW format, replica updates logged, and binlog expiry
    # disabled so recorded positions stay valid for the whole test run.
    image: mysql:8.2.0
    command:
      --upgrade=FORCE
      --log-bin=mysql-bin
      --server-id=1
      --binlog-format=ROW
      --log_replica_updates=ON
      --binlog_expire_logs_seconds=0
    container_name: fuagent-mysql-test
    environment:
      MYSQL_ROOT_PASSWORD: testroot
      MYSQL_DATABASE: testdb
      MYSQL_USER: testuser
      MYSQL_PASSWORD: testpass
    ports:
      # Host 3307 -> container 3306, avoiding a clash with any local MySQL.
      - "3307:3306"
    healthcheck:
      test: ["CMD", "sh", "-c", "mysqladmin ping -uroot -ptestroot --protocol=tcp"]
      interval: 5s
      timeout: 10s
      retries: 60
    volumes:
      # Schema + seed data executed on first container start.
      - ./mysql-init.sql:/docker-entrypoint-initdb.d/init.sql
@@ -0,0 +1,90 @@
1
-- Create the test tables.
-- =================================================================
-- Table structure for user (user account table)
-- =================================================================
CREATE TABLE `user` (
    `user_id` INT NOT NULL AUTO_INCREMENT COMMENT '用户主键ID',
    `email` VARCHAR(128) NULL,
    `openid_user_id` VARCHAR(50) NULL,
    PRIMARY KEY (`user_id`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- =================================================================
-- Table structure for dataset (dataset metadata table)
-- =================================================================
CREATE TABLE `dataset` (
    `dataset_id` INT NOT NULL AUTO_INCREMENT COMMENT '数据集主键ID',
    `user_id` INT NULL COMMENT '外键,关联到用户表',
    `status` TINYINT(1) NULL,
    `create_time` DATETIME NULL,
    `title` VARCHAR(300) NULL,
    `description` MEDIUMTEXT NULL,
    `is_deleted` TINYINT(1) DEFAULT 0,
    `is_checked` INT NULL,
    `is_review` INT NULL,
    `backup_status` INT NULL,
    `path` VARCHAR(60) NULL,
    PRIMARY KEY (`dataset_id`),
    CONSTRAINT `fk_dataset_user` FOREIGN KEY (`user_id`) REFERENCES `user` (`user_id`) ON DELETE SET NULL ON UPDATE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- =================================================================
-- Table structure for file (file metadata table)
-- =================================================================
CREATE TABLE `file` (
    `file_id` INT NOT NULL AUTO_INCREMENT COMMENT '文件主键ID',
    `dataset_id` INT NULL COMMENT '外键,关联到数据信息表(dataset)',
    `file_name` VARCHAR(300) NULL,
    `is_deleted` TINYINT(1) DEFAULT 0,
    `relative_path` VARCHAR(500) NULL,
    `file_size` VARCHAR(100) NULL,
    `file_suffix` VARCHAR(300) NULL,
    `file_code` VARCHAR(300) NULL,
    `md5` VARCHAR(300) NULL,
    `status` VARCHAR(30) NULL,
    `create_time` DATETIME NULL,
    PRIMARY KEY (`file_id`),
    CONSTRAINT `fk_file_dataset` FOREIGN KEY (`dataset_id`) REFERENCES `dataset` (`dataset_id`) ON DELETE CASCADE ON UPDATE CASCADE
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

-- Force row-format binlog (also relies on the container's startup flags
-- to enable binary logging in the first place).
SET GLOBAL binlog_format = ROW;

-- Grant the privileges the agent needs (test environment only).
GRANT REPLICATION SLAVE, REPLICATION CLIENT ON *.* TO 'testuser'@'%';
GRANT SELECT, INSERT, UPDATE, DELETE ON testdb.* TO 'testuser'@'%';
FLUSH PRIVILEGES;

-- =================================================================
-- Seed data (5 rows per table)
-- =================================================================

-- user rows
INSERT INTO `user` (`email`, `openid_user_id`) VALUES
('user1@example.com', 'openid1'),
('user2@example.com', 'openid2'),
('user3@example.com', 'openid3'),
('user4@example.com', 'openid4'),
('user5@example.com', 'openid5');

-- dataset rows
INSERT INTO `dataset` (
    `user_id`, `status`, `create_time`, `title`,
    `description`, `is_checked`, `is_review`, `backup_status`, `path`
) VALUES
(1, 1, NOW(), 'Dataset 1', 'Description 1', 1, 0, 0, '/datasets/1'),
(2, 1, NOW(), 'Dataset 2', 'Description 2', 1, 0, 0, '/datasets/2'),
(3, 1, NOW(), 'Dataset 3', 'Description 3', 1, 0, 0, '/datasets/3'),
(4, 1, NOW(), 'Dataset 4', 'Description 4', 1, 0, 0, '/datasets/4'),
(5, 1, NOW(), 'Dataset 5', 'Description 5', 1, 0, 0, '/datasets/5');

-- file rows
INSERT INTO `file` (
    `dataset_id`, `file_name`, `relative_path`, `file_size`,
    `file_suffix`, `file_code`, `md5`, `status`, `create_time`
) VALUES
(1, 'file1.txt', 'files/1', '1024', 'txt', 'F001', 'md51', 'active', NOW()),
(2, 'file2.jpg', 'files/2', '2048', 'jpg', 'F002', 'md52', 'active', NOW()),
(3, 'file3.pdf', 'files/3', '3072', 'pdf', 'F003', 'md53', 'active', NOW()),
(4, 'file4.png', 'files/4', '4096', 'png', 'F004', 'md54', 'active', NOW()),
(5, 'file5.doc', 'files/5', '5120', 'doc', 'F005', 'md55', 'active', NOW());