core-cdc 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
core-cdc-1.0.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2022 Alejandro Cora González.
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,60 @@
1
+ Metadata-Version: 2.1
2
+ Name: core-cdc
3
+ Version: 1.0.0
4
+ Summary: This project/library contains the core mechanism and resources for Change Data Capture services...
5
+ Author-email: Alejandro Cora González <alek.cora.glez@gmail.com>
6
+ Maintainer: Alejandro Cora González
7
+ License: MIT
8
+ Project-URL: Homepage, https://gitlab.com/bytecode-solutions/core/core-cdc
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Topic :: Utilities
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: core-mixins
20
+ Requires-Dist: core-tests
21
+ Requires-Dist: core-aws
22
+ Requires-Dist: core-db
23
+ Requires-Dist: mysql-replication==0.31
24
+ Requires-Dist: pymongo==4.6.0
25
+ Provides-Extra: test
26
+
27
+ # core-cdc (CDC a.k.a Change Data Capture)
28
+ _______________________________________________________________________________
29
+
30
+ It provides the core mechanism and required resources to
31
+ implement "Change Data Capture" services...
32
+
33
+ ## Execution Environment
34
+
35
+ ### Install libraries
36
+ ```commandline
37
+ pip install --upgrade pip
38
+ pip install virtualenv
39
+ ```
40
+
41
+ ### Create the Python Virtual Environment.
42
+ ```commandline
43
+ virtualenv --python=python3.11 .venv
44
+ ```
45
+
46
+ ### Activate the Virtual Environment.
47
+ ```commandline
48
+ source .venv/bin/activate
49
+ ```
50
+
51
+ ### Install required libraries.
52
+ ```commandline
53
+ pip install .
54
+ ```
55
+
56
+ ### Check tests and coverage...
57
+ ```commandline
58
+ python manager.py run-test
59
+ python manager.py run-coverage
60
+ ```
@@ -0,0 +1,34 @@
1
+ # core-cdc (CDC a.k.a Change Data Capture)
2
+ _______________________________________________________________________________
3
+
4
+ It provides the core mechanism and required resources to
5
+ implement "Change Data Capture" services...
6
+
7
+ ## Execution Environment
8
+
9
+ ### Install libraries
10
+ ```commandline
11
+ pip install --upgrade pip
12
+ pip install virtualenv
13
+ ```
14
+
15
+ ### Create the Python Virtual Environment.
16
+ ```commandline
17
+ virtualenv --python=python3.11 .venv
18
+ ```
19
+
20
+ ### Activate the Virtual Environment.
21
+ ```commandline
22
+ source .venv/bin/activate
23
+ ```
24
+
25
+ ### Install required libraries.
26
+ ```commandline
27
+ pip install .
28
+ ```
29
+
30
+ ### Check tests and coverage...
31
+ ```commandline
32
+ python manager.py run-test
33
+ python manager.py run-coverage
34
+ ```
@@ -0,0 +1,3 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from . import targets
@@ -0,0 +1,111 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from datetime import date, datetime
7
+ from enum import StrEnum
8
+ from typing import Dict, Tuple, Optional
9
+
10
+
11
+ class EventType(StrEnum):
12
+ GLOBAL = "GLOBAL" # Event related to the processor itself.
13
+ DDL_STATEMENT = "QUERY" # Event for DDL statement (like create table, etc.).
14
+ INSERT = "INSERT" # Event for DML INSERT operation.
15
+ UPDATE = "UPDATE" # Event for DML UPDATE operation.
16
+ DELETE = "DELETE" # Event for DML DELETE operation.
17
+
18
+
19
class Record:
    """
    It provides a wrapper or common object useful for integration
    across services that needs to handle data replication
    via Change Data Capture producers and consumers...
    """

    def __init__(
            self, event_timestamp: int, event_type: EventType, record: Dict, service: str,
            source: str, position: int, primary_key: Tuple | str, schema_name: str, table_name: str,
            global_id: Optional[str], transaction_id: Optional[str]) -> None:

        """
        It creates a new record with all the attributes required for streaming and
        replication using a Data Change Capture service...

        :param event_timestamp: Timestamp when the record was generated.
        :param event_type: It specifies the operation that generated the event.
        :param record: The record (like a record/row in a database).

        :param service: The service from where the record came.
        :param source: The source from where the record was retrieved (like binlog file name).
        :param position: Record position in the source.

        :param primary_key: Attributes to use to identify a record as unique.
        :param schema_name: Schema from where the record was retrieved.
        :param table_name: Table from where the record was retrieved.

        :param global_id: Unique identifier (like the Global Transaction Identifier (gtid) in MySQL).
        :param transaction_id: Identifier for the transaction (like xid in MySQL).
        """

        self.global_id = global_id
        self.transaction_id = transaction_id

        self.event_timestamp = event_timestamp
        self.event_type = event_type
        self.record = record

        self.service = service
        self.source = source
        self.position = position

        self.primary_key = primary_key
        self.schema_name = schema_name
        self.table_name = table_name

    def __str__(self):
        return json.dumps(self.to_json())

    def to_json(self):
        """
        It returns the JSON version of the record required to be streamed...

        :return: A dictionary that follows the below structure:
            {
                "global_id": "",
                "transaction_id": "",
                "event_timestamp": 1653685384,
                "event_type": "INSERT | UPDATE | DELETE",
                "service": "service-name",
                "source": "binlog.000001 | FileName | Something",
                "position": 8077,
                "primary_key": "(str | tuple)",
                "schema_name": "schema_name",
                "table_name": "table_name",
                "record": {
                    ...
                    // This is an example...
                    "id": "000-1",
                    "category": "Marketing",
                    "price": 100.0,
                    ...
                }
            }
        """

        return {
            "global_id": self.global_id,
            "transaction_id": self.transaction_id,
            "event_timestamp": self.event_timestamp,
            "event_type": self.event_type.value,
            "service": self.service,
            "source": self.source,
            "position": self.position,
            "primary_key": self.primary_key,
            "schema_name": self.schema_name,
            "table_name": self.table_name,
            "record": {
                # isinstance() instead of the previous exact type() check:
                # it also serializes subclasses of date/datetime (e.g. values
                # returned by DB drivers), which type(...) in (...) missed.
                key: value.isoformat() if isinstance(value, (datetime, date)) else value
                for key, value in self.record.items()
            }
        }
File without changes
@@ -0,0 +1,110 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from abc import abstractmethod
4
+ from logging import Logger
5
+ from typing import Any, Dict, List, Iterator, Iterable, Type, Self
6
+
7
+ from core_mixins.interfaces.factory import IFactory
8
+
9
+ from core_cdc.base import EventType, Record
10
+ from core_cdc.targets.base import Target
11
+
12
+
13
class IProcessor(IFactory):
    """ Interface for all "Change Data Capture" implementations """

    # Registered implementations (factory registry)...
    _impls: Dict[str, Type[Self]] = {}

    def __init__(
            self, service: str, targets: List[Target], logger: Logger,
            events_to_stream: Iterable[EventType] = (EventType.INSERT, EventType.UPDATE, EventType.DELETE),
            add_event_timestamp: bool = False) -> None:

        """
        :param service: The name of the service that is doing the CDC process.
        :param targets: Targets the captured records will be sent to.
        :param events_to_stream: By default, insert, update and, delete operations will be streamed.
        :param logger: Logger used.

        :param add_event_timestamp:
            If True, the column event_timestamp will be added when a table is created. The column
            could be useful for UPSERT or MERGE operations.
        """

        self.service = service
        self.targets = targets
        self.events_to_stream = events_to_stream
        self.add_event_timestamp = add_event_timestamp
        self.logger = logger

    def execute(self):
        """ Main loop: it reads events from the stream and dispatches them... """

        self.logger.info("Reading events from the stream...")

        for event in self.get_events():
            event_type = self.get_event_type(event)

            if event_type == EventType.DDL_STATEMENT:
                self._process_ddl_event(event)

            elif event_type in self.events_to_stream:
                self._stream_dml_event(event, event_type)

            else:
                try:
                    self.process_event(event)

                except Exception as error:
                    self.logger.error(f"Error: {error}")

    def _process_ddl_event(self, event: Any) -> None:
        """ It applies a DDL statement on every target configured with execute_ddl. """

        for target in self.targets:
            if target.execute_ddl:
                try:
                    query = target.get_ddl_query(event)
                    if query:
                        target.execute(query)
                        self.logger.info(f"The below query was executed in: {target.register_name()}.")
                        self.logger.info(query)

                        if self.add_event_timestamp and "create table" in query.lower():
                            schema, table = Target.get_schema_table_from_query(query)

                            # The extra column helps consumers run UPSERT/MERGE...
                            target.execute(
                                target.get_add_column_ddl(
                                    schema=schema, table=table,
                                    column="event_timestamp",
                                    type_="bigint"
                                )
                            )

                except Exception as error:
                    self.logger.error(f"[{target.register_name()}] -- {error}")

    def _stream_dml_event(self, event: Any, event_type: EventType) -> None:
        """ It translates a DML event into records once and sends them to every target. """

        try:
            # Translate the event only once: the previous implementation called
            # process_dml_event() inside the per-target loop, repeating the
            # same work for every target.
            records = self.process_dml_event(event)

        except Exception as error:
            self.logger.error(f"Error: {error}")
            return

        if not records:
            # Nothing to stream; previously records[0] raised IndexError here...
            return

        for target in self.targets:
            try:
                target.save(records)

                self.logger.info({
                    "target": target.register_name(),
                    "number_of_records": len(records),
                    "event_type": event_type,
                    "schema": records[0].schema_name,
                    "table": records[0].table_name
                })

            except Exception as error:
                self.logger.error(f"[{target.register_name()}] -- {error}")

    @abstractmethod
    def get_events(self) -> Iterator[Any]:
        """ It returns an iterator with the events to process """

    @abstractmethod
    def get_event_type(self, event: Any) -> EventType:
        """ It returns the event type """

    @abstractmethod
    def process_dml_event(self, event: Any, **kwargs) -> List[Record]:
        """ It processes the event and return the records to stream """

    def process_event(self, event: Any):
        """
        It should be implemented if another event (apart from DDL
        and DML) must be processed...
        """
@@ -0,0 +1,105 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from typing import Any, List, Iterator, Dict
4
+
5
+ from pymongo.change_stream import ChangeStream
6
+
7
+ from core_cdc.base import Record, EventType
8
+ from .base import IProcessor
9
+
10
+
11
class MongoDbStreamProcessor(IProcessor):
    """
    It processes the events from the MongoDB Stream.

    A change stream is a real-time stream of database changes that flows from your
    database to your application. With change streams, your applications can react—in real
    time—to data changes in a single collection, a database, or even an entire
    deployment. For apps that rely on notifications of changing data, change
    streams are critical.

    More information:
    https://www.mongodb.com/basics/change-streams
    """

    def __init__(self, stream: ChangeStream, save_full_event: bool = True, **kwargs):
        """
        :param stream: DatabaseChangeStream object.
        :param save_full_event: If True, all the event will be streamed, otherwise only fullDocument.

        To create a stream you can use:
          * db.collection.watch()
          * db.watch()

        Example:
            pipeline = [{'$match': {'operationType': 'insert'}}, {'$match': {'operationType': 'replace'}}]
            MongoDbStreamProcessor(
                stream = MongoClient(...)["database"].<collection>.watch(pipeline)
            )

        More information...
          * https://www.mongodb.com/basics/change-streams
          * https://www.mongodb.com/docs/manual/changeStreams/#open-a-change-stream
          * https://www.mongodb.com/docs/manual/reference/method/db.watch/#db.watch--
        """

        super(MongoDbStreamProcessor, self).__init__(**kwargs)
        self.save_full_event = save_full_event
        self.stream = stream

    def get_events(self) -> Iterator[Any]:
        # Iterating the pymongo ChangeStream blocks until a new event arrives.
        for event in self.stream:
            self.logger.info(
                f"Received event: {event.get('operationType')} "
                f"for document: {event.get('documentKey', {}).get('_id')}."
            )

            # Replace the BSON Timestamp object with its integer seconds part
            # so the event can be JSON-serialized downstream.
            event["clusterTime"] = event["clusterTime"].time if event.get("clusterTime") else None
            yield event

            # To store the token to resume execution... runs after the consumer
            # finished handling the yielded event, so the token is only saved
            # once the event was fully processed.
            self.save_resume_token(event["_id"])

    def get_event_type(self, event: Dict) -> EventType:
        # Maps MongoDB "operationType" values onto the generic EventType enum;
        # "replace" is treated as an UPDATE. Anything unknown becomes GLOBAL.
        opt_type = event.get("operationType")
        if opt_type == "insert":
            return EventType.INSERT

        elif opt_type in ("replace", "update"):
            return EventType.UPDATE

        elif opt_type == "delete":
            return EventType.DELETE

        return EventType.GLOBAL

    def process_dml_event(self, event: Any, **kwargs) -> List[Record]:
        # NOTE(review): assumes the event already went through get_events(),
        # so "clusterTime" is an int (or None) — confirm if calling directly.
        metadata = {
            "global_id": None,
            "transaction_id": None,
            "event_timestamp": event["clusterTime"],
            "event_type": self.get_event_type(event),
            "service": self.service,
            "source": None,
            "position": 0,
            "primary_key": "_id",
            "schema_name": event["ns"]["db"],
            "table_name": event["ns"]["coll"]
        }

        # ObjectId instances are not JSON-serializable; stringify them in place.
        event["documentKey"]["_id"] = str(event["documentKey"]["_id"])
        if event.get("fullDocument"):
            event["fullDocument"]["_id"] = str(event["fullDocument"]["_id"])

        # Each change-stream event maps to exactly one Record.
        return [
            Record(
                record=event if self.save_full_event else event.get("fullDocument", {}),
                **metadata
            )
        ]

    def save_resume_token(self, token):
        """
        It stores the token that can be used to resume the
        process in a certain point...

        No-op by default; subclasses should persist the token (e.g. to a
        file or database) to support resuming after a restart.
        """
@@ -0,0 +1,62 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from core_cdc.processors.base import IProcessor
4
+
5
+
6
class MsSqlProcessor(IProcessor):
    """
    Placeholder for a Microsoft SQL Server "Change Data Capture" processor
    (no behavior implemented yet; the abstract IProcessor methods remain).

    Change Data Capture (CDC) is a feature provided by Microsoft SQL Server to
    capture and track changes made to tables at the database level. It is designed
    to efficiently identify and record changes to data, making it a powerful tool
    for applications that require real-time or near-real-time data integration,
    synchronization, and auditing. Key points:

    1. **Purpose:**
       - **Data Integration:** CDC facilitates the integration of data between
         different systems and databases by capturing changes at the source.
       - **Synchronization:** It allows keeping multiple databases in sync by
         identifying and applying changes made to one database to another.
       - **Auditing:** CDC provides a reliable and efficient way to audit changes
         to data, tracking who made the change and when.

    2. **How it Works:**
       - CDC works by enabling change tracking on selected tables. When enabled,
         SQL Server maintains change tables to store the details of changes made
         to the tracked tables.
       - The change tables store information about INSERTs, UPDATEs, and DELETEs,
         including the columns affected and the corresponding values before and
         after the change.

    3. **Configuration:**
       - To use CDC, you need to enable it at the database level and then enable
         it for specific tables.
       - Enabling CDC involves creating the required system tables and stored
         procedures for change tracking; SQL Server then automatically populates
         the change tables.

    4. **Querying Changes:**
       - After CDC is enabled, you can query the change tables to retrieve
         information about changes made to tracked tables.
       - The `cdc.fn_cdc_get_all_changes_<capture_instance>` function is commonly
         used to retrieve changes for a specific capture instance.

    5. **Retention Period:**
       - CDC allows you to specify a retention period for change data. After this
         period, older change data is automatically purged.

    6. **Integration with ETL and Replication:**
       - CDC is often used in conjunction with Extract, Transform, Load (ETL)
         processes to capture changes at the source and propagate them to a data
         warehouse or another database.
       - It is also integrated with SQL Server Replication to capture and
         replicate changes to other SQL Server instances.

    7. **Security Considerations:**
       - CDC respects SQL Server security settings, ensuring that only authorized
         users can access the change data.

    Here's a basic example of enabling CDC for a table:

    ```sql
    -- Enable CDC at the database level
    EXEC sys.sp_cdc_enable_db;

    -- Enable CDC for a specific table
    EXEC sys.sp_cdc_enable_table
        @source_schema = 'dbo',
        @source_name = 'YourTableName',
        @role_name = 'cdc_admin';
    ```

    Once CDC is enabled, you can query the change tables to retrieve
    information about the changes made to the tracked table. Keep in mind that
    using CDC introduces some overhead, and you should carefully consider the
    impact on performance and storage when enabling it.
    """
@@ -0,0 +1,120 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from typing import Any, Iterator, List
4
+
5
+ from pymysqlreplication import BinLogStreamReader
6
+ from pymysqlreplication.event import GtidEvent
7
+ from pymysqlreplication.event import QueryEvent
8
+ from pymysqlreplication.event import RotateEvent
9
+ from pymysqlreplication.event import XidEvent
10
+ from pymysqlreplication.row_event import DeleteRowsEvent
11
+ from pymysqlreplication.row_event import UpdateRowsEvent
12
+ from pymysqlreplication.row_event import WriteRowsEvent
13
+
14
+ from core_cdc.base import Record, EventType
15
+ from .base import IProcessor
16
+
17
+
18
+ class MySqlBinlogProcessor(IProcessor):
19
+ """
20
+ It processes the events from the BinLog files.
21
+
22
+ The binary log contains “events” that describe database changes such as table creation
23
+ operations or changes to table data. It also contains events for statements that potentially
24
+ could have made changes (for example, a DELETE which matched no rows), unless row-based
25
+ logging is used. The binary log also contains information about how long each
26
+ statement took that updated data.
27
+
28
+ More information:
29
+ https://dev.mysql.com/doc/refman/8.0/en/binary-log.html
30
+ """
31
+
32
+ def __init__(self, stream: BinLogStreamReader, **kwargs) -> None:
33
+ """
34
+ https://python-mysql-replication.readthedocs.io/en/stable/binlogstream.html
35
+ :param stream: BinLogStreamReader object.
36
+ """
37
+
38
+ super(MySqlBinlogProcessor, self).__init__(**kwargs)
39
+ self.stream = stream
40
+
41
+ # To keep the tracking of the processed elements...
42
+ self.log_file = None
43
+ self.log_pos = None
44
+ self.gtid = None
45
+ self.xid = None
46
+
47
+ def get_events(self) -> Iterator[Any]:
48
+ for event in self.stream:
49
+ self.logger.info(f"Received event: {event.__class__.__name__}.")
50
+ self.log_file, self.log_pos = self.stream.log_file, self.stream.log_pos
51
+ self.logger.info(f"File: {self.log_file}, Position: {self.log_pos}.")
52
+
53
+ yield event
54
+ self._update_log_pos(self.log_pos)
55
+
56
+ def get_event_type(self, event: Any) -> EventType:
57
+ if isinstance(event, QueryEvent):
58
+ return EventType.DDL_STATEMENT
59
+
60
+ elif isinstance(event, WriteRowsEvent):
61
+ return EventType.INSERT
62
+
63
+ elif isinstance(event, UpdateRowsEvent):
64
+ return EventType.UPDATE
65
+
66
+ elif isinstance(event, DeleteRowsEvent):
67
+ return EventType.DELETE
68
+
69
+ return EventType.GLOBAL
70
+
71
+ def process_dml_event(self, event: Any, **kwargs) -> List[Record]:
72
+ metadata = {
73
+ "global_id": self.gtid,
74
+ "transaction_id": self.xid,
75
+ "event_timestamp": event.timestamp,
76
+ "event_type": "",
77
+ "service": self.service,
78
+ "source": self.log_file,
79
+ "position": self.log_pos,
80
+ "primary_key": event.primary_key,
81
+ "schema_name": event.schema,
82
+ "table_name": event.table
83
+ }
84
+
85
+ if isinstance(event, WriteRowsEvent):
86
+ metadata["event_type"] = EventType.INSERT
87
+
88
+ if isinstance(event, DeleteRowsEvent):
89
+ metadata["event_type"] = EventType.DELETE
90
+
91
+ if isinstance(event, UpdateRowsEvent):
92
+ metadata["event_type"] = EventType.UPDATE
93
+
94
+ return [
95
+ Record(record=row.get("after_values", {}), **metadata)
96
+ for row in event.rows
97
+ ]
98
+
99
+ return [
100
+ Record(record=row.get("values", {}), **metadata)
101
+ for row in event.rows
102
+ ]
103
+
104
+ def process_event(self, event: Any, **kwargs):
105
+ if isinstance(event, GtidEvent):
106
+ self.gtid = event.gtid
107
+
108
+ elif isinstance(event, XidEvent):
109
+ self.xid = event.xid
110
+
111
+ elif isinstance(event, RotateEvent):
112
+ self.logger.info(f"NEXT FILE: {event.next_binlog}. POSITION: {event.position}.")
113
+ self._update_log_file(event.next_binlog)
114
+ self._update_log_pos(event.position)
115
+
116
+ def _update_log_file(self, log_file_name: str):
117
+ """ It updates the log_file name """
118
+
119
+ def _update_log_pos(self, position: int):
120
+ """ It updates the log_file position """
@@ -0,0 +1,59 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from core_cdc.processors.base import IProcessor
4
+
5
+
6
class OracleProcessor(IProcessor):
    """
    Placeholder for an Oracle "Change Data Capture" processor
    (no behavior implemented yet; the abstract IProcessor methods remain).

    Oracle Database does not provide a dedicated Python library similar to
    the SQL Server Change Data Capture (CDC) feature. However, you can implement change
    data capture in Oracle using a combination of Oracle features and Oracle Database
    connectors for Python.

    Here's a general approach:

    1. **Oracle Flashback Technology:**
       - Oracle offers Flashback Technology, including Flashback Query and Flashback
         Version Query, which allows you to query data as it appeared at a previous
         point in time.
       - You can leverage these features to implement a form of change data capture
         by periodically querying the database for changes within a specified time range.

    2. **Database Triggers:**
       - Oracle supports triggers, which are procedures that are automatically executed
         in response to specific events on a particular table or view.
       - You can create triggers to capture information about changes (e.g., inserts,
         updates, deletes) into separate audit or change tables.

    3. **LogMiner:**
       - Oracle LogMiner is a utility that allows you to mine redo log files and obtain
         information about changes made to the database.
       - You can use Oracle's LogMiner to track changes and update a change tracking table.

    4. **Oracle Database Connectors for Python:**
       - You can use Python libraries such as `cx_Oracle` to connect to Oracle Database
         and execute queries or procedures that implement your change data capture logic.
       - Install `cx_Oracle` using: `pip install cx_Oracle`.

    Here's a simplified example using Python and `cx_Oracle`:

    ```python
    import cx_Oracle

    # Connect to Oracle
    connection = cx_Oracle.connect("username/password@localhost/orcl")

    # Create a cursor
    cursor = connection.cursor()

    # Execute a query using Flashback Query
    query = "SELECT * FROM your_table AS OF TIMESTAMP (SYSTIMESTAMP - INTERVAL '1' DAY)"
    cursor.execute(query)

    # Fetch the results
    for row in cursor.fetchall():
        print(row)

    # Close the cursor and connection
    cursor.close()
    connection.close()
    ```

    Remember that Oracle Database may have specific requirements and considerations
    for implementing change data capture based on your use case. You might also want to
    explore Oracle GoldenGate, a comprehensive solution for real-time data integration
    and replication that goes beyond basic change tracking.
    """
@@ -0,0 +1,6 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from .kinesis import KinesisDataStreamTarget
4
+ from .mysql import MySQLTarget
5
+ from .sns import SnsTarget
6
+ from .sqs import SqsTarget
@@ -0,0 +1,117 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from __future__ import annotations
4
+
5
+ from abc import abstractmethod
6
+ from logging import Logger
7
+ from re import compile
8
+ from typing import Dict, List, Tuple, Type, Self
9
+
10
+ from core_mixins.interfaces.factory import IFactory
11
+ from pymysqlreplication.event import QueryEvent
12
+
13
+ from core_cdc.base import Record
14
+
15
+
16
class Target(IFactory):
    """
    This is the base class for the specific implementations of the targets the data will be sent
    or replicated. A target could be a database, queue, topic, data warehouse, etc...
    """

    # Registered targets...
    _impls: Dict[str, Type[Self]] = {}

    def __init__(
            self, logger: Logger, execute_ddl: bool = False, send_data: bool = False,
            **kwargs) -> None:

        """
        :param logger: Logger used.
        :param execute_ddl: If True, DDL statements will be executed in this target.
        :param send_data: If True, DML records will be sent to this target.
        """

        self.execute_ddl = execute_ddl
        self.send_data = send_data
        self.logger = logger
        self.client = None

        # Any extra keyword argument becomes an attribute so subclasses can
        # receive engine-specific settings transparently...
        for attr in kwargs:
            setattr(self, attr, kwargs[attr])

    def init_client(self, **kwargs) -> None:
        """ The target's implementations must implement this method """

    def connect(self):
        if self.client:
            self.client.connect()

    def get_ddl_query(self, event: QueryEvent) -> str | None:
        """
        Each engine could use a different query for DDL operations...
        :return: The query or None if not supported.
        """

        # sql_statement = sub(r"/\*.*?\*/", "", event.query.lower()).strip()
        sql_statement = event.query.lower()

        if "create schema" in sql_statement or "create database" in sql_statement:
            return self.get_create_schema_statement(event)

        elif "drop schema" in sql_statement or "drop database" in sql_statement:
            return self.get_drop_schema_statement(event)

        elif "create table" in sql_statement:
            return self.get_create_table_statement(event)

        elif "alter table" in sql_statement:
            return self.get_alter_table_statement(event)

        elif "drop table" in sql_statement:
            return self.get_drop_table_statement(event)

    @staticmethod
    def get_add_column_ddl(schema: str, table: str, column: str, type_: str) -> str:
        """ Returns the DDL to add a new column """

        return f"ALTER TABLE `{schema}`.`{table}` ADD COLUMN `{column}` {type_};"

    @staticmethod
    def get_create_schema_statement(event: QueryEvent) -> str:
        return event.query

    @staticmethod
    def get_drop_schema_statement(event: QueryEvent) -> str:
        return event.query

    @staticmethod
    def get_create_table_statement(event: QueryEvent) -> str:
        return event.query

    @staticmethod
    def get_alter_table_statement(event: QueryEvent) -> str:
        return event.query

    @staticmethod
    def get_schema_table_from_query(query: str) -> Tuple[str, str]:
        """
        Returns (schema, table) extracted from a backquoted query using Regex.

        Fixes over the original: the dot is escaped (previously it matched any
        character) and digits are accepted in identifiers.

        :raises ValueError: when the query contains no `schema`.`table` pair
            (e.g. identifiers written without backquotes).
        """

        match = compile(r"`[a-z0-9_]+`\.`[a-z0-9_]+`").search(query)
        if match is None:
            # Explicit error instead of the opaque AttributeError the original
            # raised on .group() of None (see old TODO about missing "`").
            raise ValueError(f"Could not extract `schema`.`table` from query: {query!r}")

        schema, table = match.group(0).replace("`", "").split(".")
        return schema, table

    @staticmethod
    def get_drop_table_statement(event: QueryEvent) -> str:
        return event.query

    def execute(self, query: str):
        if self.client:
            self.client.execute(query)

    def save(self, records: List[Record], **kwargs):
        if self.send_data:
            self._save(records, **kwargs)
            self.logger.info(f"{len(records)} records were sent to: {self.register_name()}!")

    @abstractmethod
    def _save(self, records: List[Record], **kwargs):
        """ Specific implementation to store the data into the Engine """

    def close(self):
        """ Implement it if is required to release or close resources """
@@ -0,0 +1,27 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from typing import List
4
+
5
+ from core_aws.services.kinesis.client import KinesisClient
6
+
7
+ from core_cdc.base import Record
8
+ from .base import Target
9
+
10
+
11
class KinesisDataStreamTarget(Target):
    """ Target that publishes CDC records to an AWS Kinesis Data Stream. """

    def __init__(self, aws_region: str, stream_name: str, partition_key: str, **kwargs):
        """
        :param aws_region: AWS region hosting the stream.
        :param stream_name: Name of the Kinesis Data Stream.
        :param partition_key: Partition key used when sending records.
        """

        super(KinesisDataStreamTarget, self).__init__(**kwargs)

        self.aws_region = aws_region
        self.client = KinesisClient(region=self.aws_region)
        self.partition_key = partition_key
        self.stream_name = stream_name

    def _save(self, records: List[Record], **kwargs):
        # Serialize every record before handing the batch to the client...
        payload = [record.to_json() for record in records]

        return self.client.send_records(
            records=payload,
            stream_name=self.stream_name,
            partition_key=self.partition_key,
            **kwargs)
@@ -0,0 +1,36 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from typing import List
4
+
5
+ from core_db.engines.mysql import MySQLClient
6
+
7
+ from core_cdc.base import Record
8
+ from .base import Target
9
+
10
+
11
class MySQLTarget(Target):
    """
    Target used to synchronize with another MySQL instance. Even when the row
    replication itself happens outside this service, this target is still
    needed so DDL statements keep the destination structure up to date...
    """

    def init_client(self, **kwargs) -> None:
        """
        It builds the underlying MySQL client.

        Expecting at least: host, user, password, database
        More information: https://pymysql.readthedocs.io/en/latest/user/index.html
        """

        self.client = MySQLClient(**kwargs)

    def _save(self, records: List[Record], **kwargs):
        """
        Intentionally a no-op: persisting directly into the database is not
        implemented because decoupling (sending the data to an SQS queue or
        SNS topic) is considered a better solution. You can modify this
        behavior or inherit the class...

        Possible solutions:
          * self.client.get_insert_dml(...)
          * self.client.get_merge_dml(...)
        """
@@ -0,0 +1,80 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import re
4
+ from typing import List
5
+
6
+ from core_db.engines.snowflake import SnowflakeClient
7
+ from pymysqlreplication.event import QueryEvent
8
+
9
+ from core_cdc.base import Record
10
+ from .base import Target
11
+
12
+
13
class SnowflakeTarget(Target):
    """ Processor for synchronization with Snowflake """

    def init_client(self, **kwargs) -> None:
        """
        Create the underlying Snowflake client.

        Expecting: host, user, password, account, database, warehouse, schema, role.
        https://docs.snowflake.com/en/user-guide/python-connector-example.html
        """

        self.client = SnowflakeClient(**kwargs)

    def _save(self, records: List[Record], **kwargs):
        """
        We are not implementing the mechanism to persist the data directly
        into the data warehouse because we believe decoupling (by sending the
        data to SQS queue or SNS topic) is a better solution. But you
        could modify the behavior or inherit the class...
        """

    @staticmethod
    def get_create_schema_statement(event: QueryEvent) -> str:
        """
        Build the DDL statement that creates the schema referenced by the
        replication event. Uses the valid Snowflake form
        "CREATE SCHEMA IF NOT EXISTS <name>;" (the previous
        "CREATE IF NOT EXIST SCHEMA" ordering is not valid SQL).
        """

        return f"CREATE SCHEMA IF NOT EXISTS {event.schema.decode()};"

    def get_create_table_statement(self, event: QueryEvent) -> str:
        """ Translate the MySQL CREATE TABLE captured in the event into Snowflake SQL. """
        return self.transform_create_table_query(event.query.lower())

    @staticmethod
    def transform_create_table_query(query: str) -> str:
        """
        Transform a MySQL CREATE TABLE query into a Snowflake-compatible one
        by applying an ordered sequence of regex substitutions. The caller is
        expected to pass the query already lower-cased.
        """

        mapper = [
            (r"`", ""),
            (r"create table", "create table if not exists"),
            (r"unique index \w+ \(+\w+ \w+\) \w+", ""),

            # "not null" must be removed before "null" so the leading
            # "not" does not survive the second substitution.
            (r"not null", ""),
            (r"null", ""),

            # Not necessary because it's a replication...
            (r" generated always as[.]*(.*?)\)", ""),
            (r"auto_increment", ""),
            (r"unsigned", ""),
            (r"zerofill", ""),

            # Updating types...
            (r" blob", " binary"),
            (r" varbinary[.]*(.*?)\)", " binary"),
            (r" year[.]*(.*?)\)", " varchar(4)"),
            (r" json", " object"),
            (r" enum[.]*(.*?)\)", " varchar"),
            (r" set[.]*(.*?)\)", " varchar"),

            # Removing unnecessary blank spaces...
            # NOTE(review): this deletes runs of 2+ spaces entirely rather
            # than collapsing them to a single space — confirm intended.
            (r"[ ]{2,}", ""),

            # Removing \n...
            (r"[\n]*,", ","),

            # Removing unnecessary commas...
            (r",,", ","),
            (r",[\n]*\)", ")")
        ]

        for pattern, replacement in mapper:
            regex = re.compile(pattern, re.MULTILINE)
            query = re.sub(regex, replacement, query)

        return query
@@ -0,0 +1,30 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ from typing import List
4
+
5
+ from core_aws.services.sns.client import SnsClient
6
+ from core_aws.services.sns.client import SnsMessage
7
+
8
+ from core_cdc.base import Record
9
+ from core_cdc.targets.base import Target
10
+
11
+
12
class SnsTarget(Target):
    """ Publishes the captured records to an AWS SNS topic. """

    def __init__(self, aws_region: str, topic_arn: str, batch_size: int = 10, **kwargs):
        """
        :param aws_region: AWS region hosting the topic.
        :param topic_arn: ARN of the destination SNS topic.
        :param batch_size: Number of messages per batch sent by the client.
        """

        super(SnsTarget, self).__init__(**kwargs)

        self.aws_region = aws_region
        self.client = SnsClient(region=aws_region, batch_size=batch_size)
        self.topic_arn = topic_arn

        # DDL statements are not replayed against a topic; records only.
        self.execute_ddl = False

    def _save(self, records: List[Record], **kwargs):
        # Each message gets a batch-unique Id built from the record's table
        # name and its position within the batch.
        self.client.publish_batch(
            topic_arn=self.topic_arn,
            messages=[
                SnsMessage(Id=f"{rec.table_name}-{x}", Message=rec.to_json())
                for x, rec in enumerate(records)
            ]
        )
@@ -0,0 +1,44 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ import json
4
+ from typing import List
5
+ from uuid import uuid4
6
+
7
+ from core_aws.services.sqs.client import SqsClient
8
+ from core_mixins.utils import get_batches
9
+
10
+ from core_cdc.base import Record
11
+ from core_cdc.targets.base import Target
12
+
13
+
14
class SqsTarget(Target):
    """ Sends the captured records to an AWS SQS queue. """

    def __init__(self, aws_region: str, queue_name: str, **kwargs):
        """
        :param aws_region: AWS region hosting the queue.
        :param queue_name: Name of the destination queue; resolved to a URL
            by init_client().
        """

        super(SqsTarget, self).__init__(**kwargs)

        self.aws_region = aws_region
        self.queue_name = queue_name
        self.client = SqsClient(region=aws_region)

        # DDL statements are not replayed against a queue; records only.
        self.execute_ddl = False

        # Resolved by init_client(); assigned exactly once here (the
        # duplicate initialization was removed).
        self.queue_url = None

    def init_client(self, **kwargs) -> None:
        """ Resolve the queue URL from its name. """
        self.queue_url = self.client.get_queue_by_name(self.queue_name).url

    def _save(self, records: List[Record], **kwargs):
        # SQS SendMessageBatch accepts at most 10 entries per request,
        # hence the batching.
        for batch in get_batches([x.to_json() for x in records], 10):
            self.client.send_message_batch(
                queue_url=self.queue_url,
                entries=[
                    {
                        "Id": str(uuid4()),
                        "MessageBody": json.dumps(entry),
                        "DelaySeconds": 0,
                        # "MessageDeduplicationId": "",
                        # "MessageGroupId": ""
                    } for entry in batch
                ]
            )
@@ -0,0 +1,60 @@
1
+ Metadata-Version: 2.1
2
+ Name: core-cdc
3
+ Version: 1.0.0
4
+ Summary: This project/library contains the core mechanism and resources for Change Data Capture services...
5
+ Author-email: Alejandro Cora González <alek.cora.glez@gmail.com>
6
+ Maintainer: Alejandro Cora González
7
+ License: MIT
8
+ Project-URL: Homepage, https://gitlab.com/bytecode-solutions/core/core-cdc
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.9
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: Topic :: Utilities
16
+ Requires-Python: >=3.9
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: core-mixins
20
+ Requires-Dist: core-tests
21
+ Requires-Dist: core-aws
22
+ Requires-Dist: core-db
23
+ Requires-Dist: mysql-replication==0.31
24
+ Requires-Dist: pymongo==4.6.0
25
+ Provides-Extra: test
26
+
27
+ # core-cdc (CDC a.k.a Change Data Capture)
28
+ _______________________________________________________________________________
29
+
30
+ It provides the core mechanism and required resources to
31
+ implement "Change Data Capture" services...
32
+
33
+ ## Execution Environment
34
+
35
+ ### Install libraries
36
+ ```commandline
37
+ pip install --upgrade pip
38
+ pip install virtualenv
39
+ ```
40
+
41
+ ### Create the Python Virtual Environment.
42
+ ```commandline
43
+ virtualenv --python=python3.11 .venv
44
+ ```
45
+
46
+ ### Activate the Virtual Environment.
47
+ ```commandline
48
+ source .venv/bin/activate
49
+ ```
50
+
51
+ ### Install required libraries.
52
+ ```commandline
53
+ pip install .
54
+ ```
55
+
56
+ ### Check tests and coverage...
57
+ ```commandline
58
+ python manager.py run-test
59
+ python manager.py run-coverage
60
+ ```
@@ -0,0 +1,24 @@
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ core_cdc/__init__.py
6
+ core_cdc/base.py
7
+ core_cdc.egg-info/PKG-INFO
8
+ core_cdc.egg-info/SOURCES.txt
9
+ core_cdc.egg-info/dependency_links.txt
10
+ core_cdc.egg-info/requires.txt
11
+ core_cdc.egg-info/top_level.txt
12
+ core_cdc/processors/__init__.py
13
+ core_cdc/processors/base.py
14
+ core_cdc/processors/mongo_stream.py
15
+ core_cdc/processors/mssql.py
16
+ core_cdc/processors/mysql_binlog.py
17
+ core_cdc/processors/oracle.py
18
+ core_cdc/targets/__init__.py
19
+ core_cdc/targets/base.py
20
+ core_cdc/targets/kinesis.py
21
+ core_cdc/targets/mysql.py
22
+ core_cdc/targets/snowflake.py
23
+ core_cdc/targets/sns.py
24
+ core_cdc/targets/sqs.py
@@ -0,0 +1,8 @@
1
+ core-mixins
2
+ core-tests
3
+ core-aws
4
+ core-db
5
+ mysql-replication==0.31
6
+ pymongo==4.6.0
7
+
8
+ [test]
@@ -0,0 +1 @@
1
+ core_cdc
@@ -0,0 +1,50 @@
1
+ # Other project metadata fields as specified in:
2
+ # https://packaging.python.org/en/latest/specifications/declaring-project-metadata/
3
+ # https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html
4
+
5
+ [build-system]
6
+ requires = ["setuptools>=61", "wheel"]
7
+ build-backend = "setuptools.build_meta"
8
+
9
+ [project]
10
+ name = "core-cdc"
11
+ description = "This project/library contains the core mechanism and resources for Change Data Capture services..."
12
+ version = "1.0.0"
13
+
14
+ authors = [
15
+ {name = "Alejandro Cora González", email = "alek.cora.glez@gmail.com"}
16
+ ]
17
+
18
+ maintainers = [
19
+ {name = "Alejandro Cora González"}
20
+ ]
21
+
22
+ requires-python = ">=3.9"
23
+ license = {text = "MIT"}
24
+ readme = "README.md"
25
+
26
+ classifiers = [
27
+ # Classifiers -> https://pypi.org/classifiers/
28
+ "License :: OSI Approved :: MIT License",
29
+ "Topic :: Software Development :: Libraries :: Python Modules",
30
+ "Programming Language :: Python :: 3",
31
+ "Programming Language :: Python :: 3.9",
32
+ "Programming Language :: Python :: 3.11",
33
+ "Intended Audience :: Developers",
34
+ "Topic :: Utilities"
35
+ ]
36
+
37
+ dependencies = [
38
+ "core-mixins",
39
+ "core-tests",
40
+ "core-aws",
41
+ "core-db",
42
+ "mysql-replication==0.31",
43
+ "pymongo==4.6.0"
44
+ ]
45
+
46
+ [project.urls]
47
+ Homepage = "https://gitlab.com/bytecode-solutions/core/core-cdc"
48
+
49
+ [project.optional-dependencies]
50
+ test = []
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,6 @@
1
# -*- coding: utf-8 -*-

# Minimal setup.py shim: all project metadata is declared in pyproject.toml,
# so setup() is intentionally called with no arguments.
from setuptools import setup


setup()