core-cdc 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- core-cdc-1.0.0/LICENSE +21 -0
- core-cdc-1.0.0/PKG-INFO +60 -0
- core-cdc-1.0.0/README.md +34 -0
- core-cdc-1.0.0/core_cdc/__init__.py +3 -0
- core-cdc-1.0.0/core_cdc/base.py +111 -0
- core-cdc-1.0.0/core_cdc/processors/__init__.py +0 -0
- core-cdc-1.0.0/core_cdc/processors/base.py +110 -0
- core-cdc-1.0.0/core_cdc/processors/mongo_stream.py +105 -0
- core-cdc-1.0.0/core_cdc/processors/mssql.py +62 -0
- core-cdc-1.0.0/core_cdc/processors/mysql_binlog.py +120 -0
- core-cdc-1.0.0/core_cdc/processors/oracle.py +59 -0
- core-cdc-1.0.0/core_cdc/targets/__init__.py +6 -0
- core-cdc-1.0.0/core_cdc/targets/base.py +117 -0
- core-cdc-1.0.0/core_cdc/targets/kinesis.py +27 -0
- core-cdc-1.0.0/core_cdc/targets/mysql.py +36 -0
- core-cdc-1.0.0/core_cdc/targets/snowflake.py +80 -0
- core-cdc-1.0.0/core_cdc/targets/sns.py +30 -0
- core-cdc-1.0.0/core_cdc/targets/sqs.py +44 -0
- core-cdc-1.0.0/core_cdc.egg-info/PKG-INFO +60 -0
- core-cdc-1.0.0/core_cdc.egg-info/SOURCES.txt +24 -0
- core-cdc-1.0.0/core_cdc.egg-info/dependency_links.txt +1 -0
- core-cdc-1.0.0/core_cdc.egg-info/requires.txt +8 -0
- core-cdc-1.0.0/core_cdc.egg-info/top_level.txt +1 -0
- core-cdc-1.0.0/pyproject.toml +50 -0
- core-cdc-1.0.0/setup.cfg +4 -0
- core-cdc-1.0.0/setup.py +6 -0
core-cdc-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2022 Alejandro Cora González.
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
core-cdc-1.0.0/PKG-INFO
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: core-cdc
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: This project/library contains the core mechanism and resources for Change Data Capture services...
|
|
5
|
+
Author-email: Alejandro Cora González <alek.cora.glez@gmail.com>
|
|
6
|
+
Maintainer: Alejandro Cora González
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://gitlab.com/bytecode-solutions/core/core-cdc
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Topic :: Utilities
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: core-mixins
|
|
20
|
+
Requires-Dist: core-tests
|
|
21
|
+
Requires-Dist: core-aws
|
|
22
|
+
Requires-Dist: core-db
|
|
23
|
+
Requires-Dist: mysql-replication==0.31
|
|
24
|
+
Requires-Dist: pymongo==4.6.0
|
|
25
|
+
Provides-Extra: test
|
|
26
|
+
|
|
27
|
+
# core-cdc (CDC a.k.a Change Data Capture)
|
|
28
|
+
_______________________________________________________________________________
|
|
29
|
+
|
|
30
|
+
It provides the core mechanism and required resources to
|
|
31
|
+
implement "Change Data Capture" services...
|
|
32
|
+
|
|
33
|
+
## Execution Environment
|
|
34
|
+
|
|
35
|
+
### Install libraries
|
|
36
|
+
```commandline
|
|
37
|
+
pip install --upgrade pip
|
|
38
|
+
pip install virtualenv
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Create the Python Virtual Environment.
|
|
42
|
+
```commandline
|
|
43
|
+
virtualenv --python=python3.11 .venv
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Activate the Virtual Environment.
|
|
47
|
+
```commandline
|
|
48
|
+
source .venv/bin/activate
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Install required libraries.
|
|
52
|
+
```commandline
|
|
53
|
+
pip install .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Check tests and coverage...
|
|
57
|
+
```commandline
|
|
58
|
+
python manager.py run-test
|
|
59
|
+
python manager.py run-coverage
|
|
60
|
+
```
|
core-cdc-1.0.0/README.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# core-cdc (CDC a.k.a Change Data Capture)
|
|
2
|
+
_______________________________________________________________________________
|
|
3
|
+
|
|
4
|
+
It provides the core mechanism and required resources to
|
|
5
|
+
implement "Change Data Capture" services...
|
|
6
|
+
|
|
7
|
+
## Execution Environment
|
|
8
|
+
|
|
9
|
+
### Install libraries
|
|
10
|
+
```commandline
|
|
11
|
+
pip install --upgrade pip
|
|
12
|
+
pip install virtualenv
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
### Create the Python Virtual Environment.
|
|
16
|
+
```commandline
|
|
17
|
+
virtualenv --python=python3.11 .venv
|
|
18
|
+
```
|
|
19
|
+
|
|
20
|
+
### Activate the Virtual Environment.
|
|
21
|
+
```commandline
|
|
22
|
+
source .venv/bin/activate
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### Install required libraries.
|
|
26
|
+
```commandline
|
|
27
|
+
pip install .
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Check tests and coverage...
|
|
31
|
+
```commandline
|
|
32
|
+
python manager.py run-test
|
|
33
|
+
python manager.py run-coverage
|
|
34
|
+
```
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from datetime import date, datetime
|
|
7
|
+
from enum import StrEnum
|
|
8
|
+
from typing import Dict, Tuple, Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class EventType(StrEnum):
|
|
12
|
+
GLOBAL = "GLOBAL" # Event related to the processor itself.
|
|
13
|
+
DDL_STATEMENT = "QUERY" # Event for DDL statement (like create table, etc.).
|
|
14
|
+
INSERT = "INSERT" # Event for DML INSERT operation.
|
|
15
|
+
UPDATE = "UPDATE" # Event for DML UPDATE operation.
|
|
16
|
+
DELETE = "DELETE" # Event for DML DELETE operation.
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Record:
    """
    It provides a wrapper or common object useful for integration
    across services that need to handle data replication
    via Change Data Capture producers and consumers...
    """

    def __init__(
            self, event_timestamp: int, event_type: EventType, record: Dict, service: str,
            source: str, position: int, primary_key: Tuple | str, schema_name: str, table_name: str,
            global_id: Optional[str], transaction_id: Optional[str]) -> None:

        """
        It creates a new record with all the attributes required for streaming and
        replication using a Data Change Capture service...

        :param global_id: Unique identifier. (like: Global Transaction Identifier (gtid) in MySQL engine).
        :param transaction_id: Identifier for the transaction. (like: xid in MySQL engine).

        :param event_timestamp: Timestamp when the record was generated.
        :param event_type: It specifies the operation that generated the event.
        :param record: The record (like a record/row in a database).

        :param service: The service from where the record came.
        :param source: The source from where the record was retrieved (like binlog file name).
        :param position: Record position in the source.

        :param primary_key: Attributes to use to identify a record as unique.
        :param schema_name: Schema from where the record was retrieved.
        :param table_name: Table from where the record was retrieved.
        """

        self.global_id = global_id
        self.transaction_id = transaction_id

        self.event_timestamp = event_timestamp
        self.event_type = event_type
        self.record = record

        self.service = service
        self.source = source
        self.position = position

        self.primary_key = primary_key
        self.schema_name = schema_name
        self.table_name = table_name

    def __str__(self):
        """It returns the JSON-serialized form of :meth:`to_json`."""
        return json.dumps(self.to_json())

    def to_json(self):
        """
        It returns the JSON version of the record required to be streamed...

        :return: A dictionary that follows the below structure:
            {
                "global_id": "",
                "transaction_id": "",
                "event_timestamp": 1653685384,
                "event_type": "INSERT | UPDATE | DELETE",
                "service": "service-name",
                "source": "binlog.000001 | FileName | Something",
                "position": 8077,
                "primary_key": "(str | tuple)",
                "schema_name": "schema_name",
                "table_name": "table_name",
                "record": {
                    ...
                    // This is an example...
                    "id": "000-1",
                    "category": "Marketing",
                    "price": 100.0,
                    ...
                }
            }
        """

        return {
            "global_id": self.global_id,
            "transaction_id": self.transaction_id,
            "event_timestamp": self.event_timestamp,
            "event_type": self.event_type.value,
            "service": self.service,
            "source": self.source,
            "position": self.position,
            "primary_key": self.primary_key,
            "schema_name": self.schema_name,
            "table_name": self.table_name,
            "record": {
                # Fix: use isinstance() instead of "type(value) in (datetime, date)"
                # so subclasses of datetime/date are serialized too.
                key: value.isoformat() if isinstance(value, (datetime, date)) else value
                for key, value in self.record.items()
            }
        }
|
|
File without changes
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from abc import abstractmethod
|
|
4
|
+
from logging import Logger
|
|
5
|
+
from typing import Any, Dict, List, Iterator, Iterable, Type, Self
|
|
6
|
+
|
|
7
|
+
from core_mixins.interfaces.factory import IFactory
|
|
8
|
+
|
|
9
|
+
from core_cdc.base import EventType, Record
|
|
10
|
+
from core_cdc.targets.base import Target
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class IProcessor(IFactory):
    """ Interface for all "Change Data Capture" implementations """

    # Registered processor implementations (populated via the IFactory machinery)...
    _impls: Dict[str, Type[Self]] = {}

    def __init__(
            self, service: str, targets: List[Target], logger: Logger,
            events_to_stream: Iterable[EventType] = (EventType.INSERT, EventType.UPDATE, EventType.DELETE),
            add_event_timestamp: bool = False) -> None:

        """
        :param service: The name of the service that is doing the CDC process.
        :param targets: Targets the captured records are sent/replicated to.
        :param events_to_stream: By default, insert, update and, delete operations will be streamed.
        :param logger: Logger used.

        :param add_event_timestamp:
            If True, the column event_timestamp will be added when a table is created. The column
            could be useful for UPSERT or MERGE operations.
        """

        self.service = service
        self.targets = targets
        self.events_to_stream = events_to_stream
        self.add_event_timestamp = add_event_timestamp
        self.logger = logger

    def execute(self):
        """
        It consumes the event stream and dispatches every event by type:
        DDL statements are replicated, DML events are converted to records
        and saved into the targets, anything else goes to process_event().
        """

        self.logger.info("Reading events from the stream...")

        for event in self.get_events():
            event_type = self.get_event_type(event)

            if event_type == EventType.DDL_STATEMENT:
                self._replicate_ddl_event(event)

            elif event_type in self.events_to_stream:
                self._replicate_dml_event(event, event_type)

            else:
                try:
                    self.process_event(event)

                except Exception as error:
                    self.logger.error(f"Error: {error}")

    def _replicate_ddl_event(self, event: Any) -> None:
        """ It executes the DDL statement on every target configured to accept DDL. """

        for target in self.targets:
            if not target.execute_ddl:
                continue

            try:
                query = target.get_ddl_query(event)
                if query:
                    target.execute(query)
                    self.logger.info(f"The below query was executed in: {target.register_name()}.")
                    self.logger.info(query)

                    # For CREATE TABLE statements optionally add an "event_timestamp"
                    # column, useful later for UPSERT / MERGE operations...
                    if self.add_event_timestamp and query.lower().count("create table"):
                        schema, table = Target.get_schema_table_from_query(query)

                        target.execute(
                            target.get_add_column_ddl(
                                schema=schema, table=table,
                                column="event_timestamp",
                                type_="bigint"
                            )
                        )

            except Exception as error:
                self.logger.error(f"[{target.register_name()}] -- {error}")

    def _replicate_dml_event(self, event: Any, event_type: EventType) -> None:
        """
        It converts the event into records once and saves them into every target.

        Fix: the original implementation called process_dml_event() once per
        target inside the loop, re-processing (and potentially re-mutating)
        the same event for each target; it also crashed the logging statement
        with an IndexError when the record list was empty.
        """

        try:
            records = self.process_dml_event(event)

        except Exception as error:
            self.logger.error(f"Error: {error}")
            return

        for target in self.targets:
            try:
                target.save(records)

                self.logger.info({
                    "target": target.register_name(),
                    "number_of_records": len(records),
                    "event_type": event_type,
                    "schema": records[0].schema_name if records else None,
                    "table": records[0].table_name if records else None
                })

            except Exception as error:
                self.logger.error(f"[{target.register_name()}] -- {error}")

    @abstractmethod
    def get_events(self) -> Iterator[Any]:
        """ It returns an iterator with the events to process """

    @abstractmethod
    def get_event_type(self, event: Any) -> EventType:
        """ It returns the event type """

    @abstractmethod
    def process_dml_event(self, event: Any, **kwargs) -> List[Record]:
        """ It processes the event and return the records to stream """

    def process_event(self, event: Any):
        """
        It should be implemented if another event (apart from DDL
        and DML) must be processed...
        """
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from typing import Any, List, Iterator, Dict
|
|
4
|
+
|
|
5
|
+
from pymongo.change_stream import ChangeStream
|
|
6
|
+
|
|
7
|
+
from core_cdc.base import Record, EventType
|
|
8
|
+
from .base import IProcessor
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MongoDbStreamProcessor(IProcessor):
    """
    It processes the events from the MongoDB Stream.

    A change stream is a real-time stream of database changes that flows from your
    database to your application, letting applications react -- in real time -- to
    data changes in a single collection, a database, or an entire deployment.

    More information:
        https://www.mongodb.com/basics/change-streams
    """

    def __init__(self, stream: ChangeStream, save_full_event: bool = True, **kwargs):
        """
        :param stream: DatabaseChangeStream object.
        :param save_full_event: If True, all the event will be streamed, otherwise only fullDocument.

        A stream can be obtained with db.collection.watch() or db.watch(), e.g.:

            pipeline = [{'$match': {'operationType': 'insert'}}, {'$match': {'operationType': 'replace'}}]
            MongoDbStreamProcessor(
                stream = MongoClient(...)["database"].<collection>.watch(pipeline)
            )

        More information...
          * https://www.mongodb.com/basics/change-streams
          * https://www.mongodb.com/docs/manual/changeStreams/#open-a-change-stream
          * https://www.mongodb.com/docs/manual/reference/method/db.watch/#db.watch--
        """

        super().__init__(**kwargs)
        self.save_full_event = save_full_event
        self.stream = stream

    def get_events(self) -> Iterator[Any]:
        """ It yields change-stream events, then persists the resume token after each one. """

        for event in self.stream:
            self.logger.info(
                f"Received event: {event.get('operationType')} "
                f"for document: {event.get('documentKey', {}).get('_id')}."
            )

            # Normalize clusterTime to its plain timestamp (or None if absent)...
            cluster_time = event.get("clusterTime")
            event["clusterTime"] = cluster_time.time if cluster_time else None
            yield event

            # To store the token to resume execution...
            self.save_resume_token(event["_id"])

    def get_event_type(self, event: Dict) -> EventType:
        """ It maps the Mongo operationType onto the generic EventType enum. """

        dispatch = {
            "insert": EventType.INSERT,
            "replace": EventType.UPDATE,
            "update": EventType.UPDATE,
            "delete": EventType.DELETE,
        }

        return dispatch.get(event.get("operationType"), EventType.GLOBAL)

    def process_dml_event(self, event: Any, **kwargs) -> List[Record]:
        """ It wraps the change-stream event into a single Record. """

        # ObjectId values are not JSON-serializable; stringify them first...
        document_key = event["documentKey"]
        document_key["_id"] = str(document_key["_id"])

        full_document = event.get("fullDocument")
        if full_document:
            full_document["_id"] = str(full_document["_id"])

        payload = event if self.save_full_event else event.get("fullDocument", {})

        return [
            Record(
                global_id=None,
                transaction_id=None,
                event_timestamp=event["clusterTime"],
                event_type=self.get_event_type(event),
                service=self.service,
                source=None,
                position=0,
                primary_key="_id",
                schema_name=event["ns"]["db"],
                table_name=event["ns"]["coll"],
                record=payload
            )
        ]

    def save_resume_token(self, token):
        """
        It stores the token that can be used to resume the
        process in a certain point...
        """
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from core_cdc.processors.base import IProcessor
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class MsSqlProcessor(IProcessor):
    # NOTE(review): documentation-only placeholder — this class defines no methods,
    # so IProcessor's abstract get_events/get_event_type/process_dml_event are not
    # implemented here. Confirm whether a concrete implementation is still pending.
    """
    Change Data Capture (CDC) is a feature provided by Microsoft SQL Server to capture
    and track changes made to tables at the database level. It is designed to efficiently
    identify and record changes to data, making it a powerful tool for applications
    that require real-time or near-real-time data integration, synchronization,
    and auditing. Here are some key points
    about SQL Server Change Data Capture:

    1. **Purpose:**
       - **Data Integration:** CDC facilitates the integration of data between different systems and databases by capturing changes at the source.
       - **Synchronization:** It allows keeping multiple databases in sync by identifying and applying changes made to one database to another.
       - **Auditing:** CDC provides a reliable and efficient way to audit changes to data, tracking who made the change and when.

    2. **How it Works:**
       - CDC works by enabling change tracking on selected tables. When enabled, SQL Server maintains change tables to store the details of changes made to the tracked tables.
       - The change tables store information about INSERTs, UPDATEs, and DELETEs, including the columns affected and the corresponding values before and after the change.

    3. **Configuration:**
       - To use CDC, you need to enable it at the database level and then enable it for specific tables.
       - Enabling CDC involves creating the required system tables and stored procedures for change tracking.
       - Once enabled, SQL Server automatically populates change tables with the details of changes made to the tracked tables.

    4. **Querying Changes:**
       - After CDC is enabled, you can query the change tables to retrieve information about changes made to tracked tables.
       - The `cdc.fn_cdc_get_all_changes_<capture_instance>` function is commonly used to retrieve changes for a specific capture instance.

    5. **Retention Period:**
       - CDC allows you to specify a retention period for change data. After this period, older change data is automatically purged.

    6. **Integration with ETL and Replication:**
       - CDC is often used in conjunction with Extract, Transform, Load (ETL) processes to capture changes at the source and propagate them to a data warehouse or another database.
       - It is also integrated with SQL Server Replication to capture and replicate changes to other SQL Server instances.

    7. **Security Considerations:**
       - CDC respects SQL Server security settings, ensuring that only authorized users can access the change data.

    Here's a basic example of enabling CDC for a table:

    ```sql
    -- Enable CDC at the database level
    EXEC sys.sp_cdc_enable_db;

    -- Enable CDC for a specific table
    EXEC sys.sp_cdc_enable_table
        @source_schema = 'dbo',
        @source_name = 'YourTableName',
        @role_name = 'cdc_admin';
    ```

    Once CDC is enabled, you can query the change tables to retrieve
    information about the changes made to the tracked table. Keep in mind that
    using CDC introduces some overhead, and you should carefully consider the
    impact on performance and storage when enabling it. It's a powerful
    feature for scenarios requiring change tracking, but it may not be
    necessary for every application.
    """
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from typing import Any, Iterator, List
|
|
4
|
+
|
|
5
|
+
from pymysqlreplication import BinLogStreamReader
|
|
6
|
+
from pymysqlreplication.event import GtidEvent
|
|
7
|
+
from pymysqlreplication.event import QueryEvent
|
|
8
|
+
from pymysqlreplication.event import RotateEvent
|
|
9
|
+
from pymysqlreplication.event import XidEvent
|
|
10
|
+
from pymysqlreplication.row_event import DeleteRowsEvent
|
|
11
|
+
from pymysqlreplication.row_event import UpdateRowsEvent
|
|
12
|
+
from pymysqlreplication.row_event import WriteRowsEvent
|
|
13
|
+
|
|
14
|
+
from core_cdc.base import Record, EventType
|
|
15
|
+
from .base import IProcessor
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MySqlBinlogProcessor(IProcessor):
    """
    It processes the events from the BinLog files.

    The binary log contains "events" that describe database changes such as table creation
    operations or changes to table data. It also contains events for statements that potentially
    could have made changes (for example, a DELETE which matched no rows), unless row-based
    logging is used. The binary log also contains information about how long each
    statement took that updated data.

    More information:
        https://dev.mysql.com/doc/refman/8.0/en/binary-log.html
    """

    def __init__(self, stream: BinLogStreamReader, **kwargs) -> None:
        """
        https://python-mysql-replication.readthedocs.io/en/stable/binlogstream.html
        :param stream: BinLogStreamReader object.
        """

        super(MySqlBinlogProcessor, self).__init__(**kwargs)
        self.stream = stream

        # To keep the tracking of the processed elements...
        self.log_file = None
        self.log_pos = None
        self.gtid = None
        self.xid = None

    def get_events(self) -> Iterator[Any]:
        """ It yields binlog events, tracking file/position and persisting the position after each one. """

        for event in self.stream:
            self.logger.info(f"Received event: {event.__class__.__name__}.")
            self.log_file, self.log_pos = self.stream.log_file, self.stream.log_pos
            self.logger.info(f"File: {self.log_file}, Position: {self.log_pos}.")

            yield event
            self._update_log_pos(self.log_pos)

    def get_event_type(self, event: Any) -> EventType:
        """ It maps a pymysqlreplication event class onto the generic EventType enum. """

        if isinstance(event, QueryEvent):
            return EventType.DDL_STATEMENT

        elif isinstance(event, WriteRowsEvent):
            return EventType.INSERT

        elif isinstance(event, UpdateRowsEvent):
            return EventType.UPDATE

        elif isinstance(event, DeleteRowsEvent):
            return EventType.DELETE

        return EventType.GLOBAL

    def process_dml_event(self, event: Any, **kwargs) -> List[Record]:
        """
        It converts the rows carried by a DML event into Record objects.

        Fix: the original implementation returned the "after_values" payload
        unconditionally, which made the second return statement (the one
        reading "values") unreachable. Only UPDATE rows carry "after_values";
        INSERT (WriteRowsEvent) and DELETE (DeleteRowsEvent) rows expose their
        data under "values", so those records were emitted with empty payloads.
        """

        metadata = {
            "global_id": self.gtid,
            "transaction_id": self.xid,
            "event_timestamp": event.timestamp,
            "event_type": self.get_event_type(event),
            "service": self.service,
            "source": self.log_file,
            "position": self.log_pos,
            "primary_key": event.primary_key,
            "schema_name": event.schema,
            "table_name": event.table
        }

        # UPDATE rows hold "before_values"/"after_values"; INSERT and DELETE
        # rows hold a single "values" mapping...
        values_key = "after_values" if isinstance(event, UpdateRowsEvent) else "values"

        return [
            Record(record=row.get(values_key, {}), **metadata)
            for row in event.rows
        ]

    def process_event(self, event: Any, **kwargs):
        """ It tracks transaction markers (GTID/XID) and binlog rotations. """

        if isinstance(event, GtidEvent):
            self.gtid = event.gtid

        elif isinstance(event, XidEvent):
            self.xid = event.xid

        elif isinstance(event, RotateEvent):
            self.logger.info(f"NEXT FILE: {event.next_binlog}. POSITION: {event.position}.")
            self._update_log_file(event.next_binlog)
            self._update_log_pos(event.position)

    def _update_log_file(self, log_file_name: str):
        """ It updates the log_file name """

    def _update_log_pos(self, position: int):
        """ It updates the log_file position """
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from core_cdc.processors.base import IProcessor
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class OracleProcessor(IProcessor):
    # NOTE(review): documentation-only placeholder — this class defines no methods,
    # so IProcessor's abstract get_events/get_event_type/process_dml_event are not
    # implemented here. Confirm whether a concrete implementation is still pending.
    """
    Oracle Database does not provide a dedicated Python library similar to
    the SQL Server Change Data Capture (CDC) feature. However, you can implement change
    data capture in Oracle using a combination of Oracle features and Oracle Database
    connectors for Python.

    Here's a general approach:

    1. **Oracle Flashback Technology:**
       - Oracle offers Flashback Technology, including Flashback Query and Flashback Version Query, which allows you to query data as it appeared at a previous point in time.
       - You can leverage these features to implement a form of change data capture by periodically querying the database for changes within a specified time range.

    2. **Database Triggers:**
       - Oracle supports triggers, which are procedures that are automatically executed in response to specific events on a particular table or view.
       - You can create triggers to capture information about changes (e.g., inserts, updates, deletes) into separate audit or change tables.

    3. **LogMiner:**
       - Oracle LogMiner is a utility that allows you to mine redo log files and obtain information about changes made to the database.
       - You can use Oracle's LogMiner to track changes and update a change tracking table.

    4. **Oracle Database Connectors for Python:**
       - You can use Python libraries such as `cx_Oracle` to connect to Oracle Database and execute queries or procedures that implement your change data capture logic.
       - Install `cx_Oracle` using: `pip install cx_Oracle`.

    Here's a simplified example using Python and `cx_Oracle`:

    ```python
    import cx_Oracle

    # Connect to Oracle
    connection = cx_Oracle.connect("username/password@localhost/orcl")

    # Create a cursor
    cursor = connection.cursor()

    # Execute a query using Flashback Query
    query = "SELECT * FROM your_table AS OF TIMESTAMP (SYSTIMESTAMP - INTERVAL '1' DAY)"
    cursor.execute(query)

    # Fetch the results
    for row in cursor.fetchall():
        print(row)

    # Close the cursor and connection
    cursor.close()
    connection.close()
    ```

    Remember that Oracle Database may have specific requirements and considerations
    for implementing change data capture based on your use case. You might also want to
    explore Oracle GoldenGate, a comprehensive solution for real-time data integration
    and replication that goes beyond basic change tracking.
    """
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from abc import abstractmethod
|
|
6
|
+
from logging import Logger
|
|
7
|
+
from re import compile
|
|
8
|
+
from typing import Dict, List, Tuple, Type, Self
|
|
9
|
+
|
|
10
|
+
from core_mixins.interfaces.factory import IFactory
|
|
11
|
+
from pymysqlreplication.event import QueryEvent
|
|
12
|
+
|
|
13
|
+
from core_cdc.base import Record
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class Target(IFactory):
    """
    This is the base class for the specific implementations of the targets the data will be sent
    or replicated. A target could be a database, queue, topic, data warehouse, etc...
    """

    # Registered target implementations, keyed by register name...
    _impls: Dict[str, Type[Self]] = {}

    def __init__(
            self, logger: Logger, execute_ddl: bool = False, send_data: bool = False,
            **kwargs) -> None:
        """
        :param logger: Logger used to report the target's activity.
        :param execute_ddl: When True, DDL statements will be executed on the target.
        :param send_data: When True, records will be forwarded to the target.
        :param kwargs: Every extra keyword is attached to the instance as an attribute.
        """

        self.execute_ddl = execute_ddl
        self.send_data = send_data
        self.logger = logger
        self.client = None

        # Attach any extra configuration as instance attributes...
        for attr, value in kwargs.items():
            setattr(self, attr, value)

    def init_client(self, **kwargs) -> None:
        """ The target's implementations must implement this method """

    def connect(self):
        """ Connects the underlying client, when one has been initialized """

        if self.client:
            self.client.connect()

    def get_ddl_query(self, event: QueryEvent) -> str | None:
        """
        Each engine could use a different query for DDL operations...
        :return: The query or None if not supported.
        """

        # sql_statement = sub(r"/\*.*?\*/", "", event.query.lower()).strip()
        sql_statement = event.query.lower()

        # Membership tests ("in") are clearer and cheaper than counting occurrences...
        if "create schema" in sql_statement or "create database" in sql_statement:
            return self.get_create_schema_statement(event)

        if "drop schema" in sql_statement or "drop database" in sql_statement:
            return self.get_drop_schema_statement(event)

        if "create table" in sql_statement:
            return self.get_create_table_statement(event)

        if "alter table" in sql_statement:
            return self.get_alter_table_statement(event)

        if "drop table" in sql_statement:
            return self.get_drop_table_statement(event)

        # Not a DDL statement this target knows how to handle...
        return None

    @staticmethod
    def get_add_column_ddl(schema: str, table: str, column: str, type_: str) -> str:
        """
        Returns the DDL to add a new column.
        NOTE: uses MySQL-style back-quoted identifiers; override for other engines.
        """

        return f"ALTER TABLE `{schema}`.`{table}` ADD COLUMN `{column}` {type_};"

    @staticmethod
    def get_create_schema_statement(event: QueryEvent) -> str:
        """ Returns the CREATE SCHEMA statement; by default the source query as-is """
        return event.query

    @staticmethod
    def get_drop_schema_statement(event: QueryEvent) -> str:
        """ Returns the DROP SCHEMA statement; by default the source query as-is """
        return event.query

    @staticmethod
    def get_create_table_statement(event: QueryEvent) -> str:
        """ Returns the CREATE TABLE statement; by default the source query as-is """
        return event.query

    @staticmethod
    def get_alter_table_statement(event: QueryEvent) -> str:
        """ Returns the ALTER TABLE statement; by default the source query as-is """
        return event.query

    @staticmethod
    def get_schema_table_from_query(query: str) -> Tuple[str, str]:
        """
        Returns (schema, table) from the query using Regex.

        NOTE: only matches back-quoted, lowercase identifiers, e.g. `schema`.`table`.
        TODO: extend the pattern to support unquoted identifiers.

        :raises ValueError: If no back-quoted schema/table pair is found
            (previously this crashed with an accidental AttributeError).
        """

        match = compile(r"`[a-z_]+`.`[a-z_]+`").search(query)
        if match is None:
            raise ValueError(f"Could not extract `schema`.`table` from query: {query}")

        schema, table = match.group(0).replace("`", "").split(".")
        return schema, table

    @staticmethod
    def get_drop_table_statement(event: QueryEvent) -> str:
        """ Returns the DROP TABLE statement; by default the source query as-is """
        return event.query

    def execute(self, query: str):
        """ Executes the query through the underlying client, when one exists """

        if self.client:
            self.client.execute(query)

    def save(self, records: List[Record], **kwargs):
        """ Forwards the records to the target when data forwarding is enabled """

        if self.send_data:
            self._save(records, **kwargs)
            self.logger.info(f"{len(records)} records were sent to: {self.register_name()}!")

    @abstractmethod
    def _save(self, records: List[Record], **kwargs):
        """ Specific implementation to store the data into the Engine """

    def close(self):
        """ Implement it if is required to release or close resources """
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from core_aws.services.kinesis.client import KinesisClient
|
|
6
|
+
|
|
7
|
+
from core_cdc.base import Record
|
|
8
|
+
from .base import Target
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class KinesisDataStreamTarget(Target):
    """ Sends the records to an AWS Kinesis Data Stream """

    def __init__(self, aws_region: str, stream_name: str, partition_key: str, **kwargs):
        """
        :param aws_region: AWS region where the stream lives.
        :param stream_name: Name of the destination Kinesis Data Stream.
        :param partition_key: Partition key used when putting the records.
        """

        super().__init__(**kwargs)

        self.aws_region = aws_region
        self.client = KinesisClient(region=self.aws_region)
        self.partition_key = partition_key
        self.stream_name = stream_name

    def _save(self, records: List[Record], **kwargs):
        """ Puts the serialized records into the stream """

        payload = [record.to_json() for record in records]

        return self.client.send_records(
            records=payload,
            stream_name=self.stream_name,
            partition_key=self.partition_key,
            **kwargs)
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from core_db.engines.mysql import MySQLClient
|
|
6
|
+
|
|
7
|
+
from core_cdc.base import Record
|
|
8
|
+
from .base import Target
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class MySQLTarget(Target):
    """
    Processor for synchronization with another MySQL instance. Even when the actual
    data replication happens outside this service, the DDL statements still need to
    be applied so the destination structure stays up to date...
    """

    def init_client(self, **kwargs) -> None:
        """
        Expecting at least: host, user, password, database
        More information: https://pymysql.readthedocs.io/en/latest/user/index.html
        """

        self.client = MySQLClient(**kwargs)

    def _save(self, records: List[Record], **kwargs):
        """
        Intentionally a no-op. Persisting the data directly into the database is
        not implemented because decoupling (by sending the data to a SQS queue or
        SNS topic) is considered a better solution. Override this method or
        inherit the class to change that behavior...

        Possible solutions:
            * self.client.get_insert_dml(...)
            * self.client.get_merge_dml(...)
        """
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
import re
|
|
4
|
+
from typing import List
|
|
5
|
+
|
|
6
|
+
from core_db.engines.snowflake import SnowflakeClient
|
|
7
|
+
from pymysqlreplication.event import QueryEvent
|
|
8
|
+
|
|
9
|
+
from core_cdc.base import Record
|
|
10
|
+
from .base import Target
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class SnowflakeTarget(Target):
    """ Processor for synchronization with Snowflake """

    def init_client(self, **kwargs) -> None:
        """
        Expecting: host, user, password, account, database, warehouse, schema, role.
        https://docs.snowflake.com/en/user-guide/python-connector-example.html
        """

        self.client = SnowflakeClient(**kwargs)

    def _save(self, records: List[Record], **kwargs):
        """
        We are not implementing the mechanism to persist the data directly
        into the data warehouse because we believe decoupling (by sending the
        data to SQS queue or SNS topic) is a better solution. But you
        could modify the behavior or inherit the class...
        """

    @staticmethod
    def get_create_schema_statement(event: QueryEvent):
        """ Returns the Snowflake DDL that creates the schema when it does not exist """

        # Bug fix: Snowflake's syntax is "CREATE SCHEMA IF NOT EXISTS <name>";
        # the previous "CREATE IF NOT EXIST SCHEMA <name>" was invalid SQL...
        return f"CREATE SCHEMA IF NOT EXISTS {event.schema.decode()};"

    def get_create_table_statement(self, event: QueryEvent):
        """ Converts the MySQL CREATE TABLE statement into a Snowflake one """

        return self.transform_create_table_query(event.query.lower())

    @staticmethod
    def transform_create_table_query(query: str) -> str:
        """
        Transform a MySQL query to a Snowflake one.

        :param query: Lower-cased MySQL CREATE TABLE statement.
        :return: The transformed statement.
        """

        # Applied strictly in order; each entry is (pattern, replacement).
        # NOTE(review): the "[.]*(.*?)\)" patterns are kept as-is on purpose,
        # the exact substitution order and shape are load-bearing...
        mapper = [
            (r"`", ""),
            (r"create table", "create table if not exists"),
            (r"unique index \w+ \(+\w+ \w+\) \w+", ""),

            (r"not null", ""),
            (r"null", ""),

            # Not necessary because it's a replication...
            (r" generated always as[.]*(.*?)\)", ""),
            (r"auto_increment", ""),
            (r"unsigned", ""),
            (r"zerofill", ""),

            # Updating types...
            (r" blob", " binary"),
            (r" varbinary[.]*(.*?)\)", " binary"),
            (r" year[.]*(.*?)\)", " varchar(4)"),
            (r" json", " object"),
            (r" enum[.]*(.*?)\)", " varchar"),
            (r" set[.]*(.*?)\)", " varchar"),

            # Removing unnecessary blank spaces...
            (r"[ ]{2,}", ""),

            # Removing \n...
            (r"[\n]*,", ","),

            # Removing unnecessary commas...
            (r",,", ","),
            (r",[\n]*\)", ")")
        ]

        for pattern, replacement in mapper:
            regex = re.compile(pattern, re.MULTILINE)
            query = re.sub(regex, replacement, query)

        return query
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
from typing import List
|
|
4
|
+
|
|
5
|
+
from core_aws.services.sns.client import SnsClient
|
|
6
|
+
from core_aws.services.sns.client import SnsMessage
|
|
7
|
+
|
|
8
|
+
from core_cdc.base import Record
|
|
9
|
+
from core_cdc.targets.base import Target
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SnsTarget(Target):
    """ Sends the records to an AWS SNS topic """
    # Doc fix: the previous docstring wrongly said "SQS queue"...

    def __init__(self, aws_region: str, topic_arn: str, batch_size: int = 10, **kwargs):
        """
        :param aws_region: AWS region where the topic lives.
        :param topic_arn: ARN of the destination SNS topic.
        :param batch_size: Number of messages sent per publish_batch call.
        """

        super().__init__(**kwargs)

        self.aws_region = aws_region
        self.client = SnsClient(region=aws_region, batch_size=batch_size)
        self.topic_arn = topic_arn

        # A topic is a plain message channel: it cannot execute DDL statements,
        # so the flag is forced off regardless of what the caller passed...
        self.execute_ddl = False

    def _save(self, records: List[Record], **kwargs):
        """ Publishes the serialized records to the topic in batches """

        self.client.publish_batch(
            topic_arn=self.topic_arn,
            messages=[
                SnsMessage(Id=f"{rec.table_name}-{x}", Message=rec.to_json())
                for x, rec in enumerate(records)
            ]
        )
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# -*- coding: utf-8 -*-
|
|
2
|
+
|
|
3
|
+
import json
|
|
4
|
+
from typing import List
|
|
5
|
+
from uuid import uuid4
|
|
6
|
+
|
|
7
|
+
from core_aws.services.sqs.client import SqsClient
|
|
8
|
+
from core_mixins.utils import get_batches
|
|
9
|
+
|
|
10
|
+
from core_cdc.base import Record
|
|
11
|
+
from core_cdc.targets.base import Target
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class SqsTarget(Target):
    """ Sends the records to an AWS SQS queue """

    def __init__(self, aws_region: str, queue_name: str, **kwargs):
        """
        :param aws_region: AWS region where the queue lives.
        :param queue_name: Name of the destination queue; its URL is
            resolved later by init_client().
        """

        super().__init__(**kwargs)

        self.aws_region = aws_region

        # A queue is a plain message channel: it cannot execute DDL statements...
        self.execute_ddl = False

        self.queue_name = queue_name
        self.client = SqsClient(region=aws_region)

        # Resolved by init_client; removed the duplicated assignment
        # (it was previously set to None twice)...
        self.queue_url = None

    def init_client(self, **kwargs) -> None:
        """ Resolves the queue URL from its name """

        self.queue_url = self.client.get_queue_by_name(self.queue_name).url

    def _save(self, records: List[Record], **kwargs):
        """ Sends the records to the queue in batches of 10 (the SQS batch limit) """

        # NOTE(review): if Record.to_json() already returns a JSON string,
        # json.dumps() below double-encodes it -- TODO confirm against Record...
        for batch in get_batches([x.to_json() for x in records], 10):
            self.client.send_message_batch(
                queue_url=self.queue_url,
                entries=[
                    {
                        "Id": str(uuid4()),
                        "MessageBody": json.dumps(entry),
                        "DelaySeconds": 0,
                        # "MessageDeduplicationId": "",
                        # "MessageGroupId": ""
                    } for entry in batch
                ]
            )
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: core-cdc
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: This project/library contains the core mechanism and resources for Change Data Capture services...
|
|
5
|
+
Author-email: Alejandro Cora González <alek.cora.glez@gmail.com>
|
|
6
|
+
Maintainer: Alejandro Cora González
|
|
7
|
+
License: MIT
|
|
8
|
+
Project-URL: Homepage, https://gitlab.com/bytecode-solutions/core/core-cdc
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: Topic :: Utilities
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: core-mixins
|
|
20
|
+
Requires-Dist: core-tests
|
|
21
|
+
Requires-Dist: core-aws
|
|
22
|
+
Requires-Dist: core-db
|
|
23
|
+
Requires-Dist: mysql-replication==0.31
|
|
24
|
+
Requires-Dist: pymongo==4.6.0
|
|
25
|
+
Provides-Extra: test
|
|
26
|
+
|
|
27
|
+
# core-cdc (CDC, a.k.a. Change Data Capture)
|
|
28
|
+
_______________________________________________________________________________
|
|
29
|
+
|
|
30
|
+
It provides the core mechanism and required resources to
|
|
31
|
+
implement "Change Data Capture" services...
|
|
32
|
+
|
|
33
|
+
## Execution Environment
|
|
34
|
+
|
|
35
|
+
### Install libraries
|
|
36
|
+
```commandline
|
|
37
|
+
pip install --upgrade pip
|
|
38
|
+
pip install virtualenv
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
### Create the Python Virtual Environment.
|
|
42
|
+
```commandline
|
|
43
|
+
virtualenv --python=python3.11 .venv
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
### Activate the Virtual Environment.
|
|
47
|
+
```commandline
|
|
48
|
+
source .venv/bin/activate
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
### Install required libraries.
|
|
52
|
+
```commandline
|
|
53
|
+
pip install .
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
### Check tests and coverage...
|
|
57
|
+
```commandline
|
|
58
|
+
python manager.py run-test
|
|
59
|
+
python manager.py run-coverage
|
|
60
|
+
```
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
core_cdc/__init__.py
|
|
6
|
+
core_cdc/base.py
|
|
7
|
+
core_cdc.egg-info/PKG-INFO
|
|
8
|
+
core_cdc.egg-info/SOURCES.txt
|
|
9
|
+
core_cdc.egg-info/dependency_links.txt
|
|
10
|
+
core_cdc.egg-info/requires.txt
|
|
11
|
+
core_cdc.egg-info/top_level.txt
|
|
12
|
+
core_cdc/processors/__init__.py
|
|
13
|
+
core_cdc/processors/base.py
|
|
14
|
+
core_cdc/processors/mongo_stream.py
|
|
15
|
+
core_cdc/processors/mssql.py
|
|
16
|
+
core_cdc/processors/mysql_binlog.py
|
|
17
|
+
core_cdc/processors/oracle.py
|
|
18
|
+
core_cdc/targets/__init__.py
|
|
19
|
+
core_cdc/targets/base.py
|
|
20
|
+
core_cdc/targets/kinesis.py
|
|
21
|
+
core_cdc/targets/mysql.py
|
|
22
|
+
core_cdc/targets/snowflake.py
|
|
23
|
+
core_cdc/targets/sns.py
|
|
24
|
+
core_cdc/targets/sqs.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
core_cdc
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
# Other project metadata fields as specified in:
|
|
2
|
+
# https://packaging.python.org/en/latest/specifications/declaring-project-metadata/
|
|
3
|
+
# https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html
|
|
4
|
+
|
|
5
|
+
[build-system]
|
|
6
|
+
requires = ["setuptools>=61", "wheel"]
|
|
7
|
+
build-backend = "setuptools.build_meta"
|
|
8
|
+
|
|
9
|
+
[project]
|
|
10
|
+
name = "core-cdc"
|
|
11
|
+
description = "This project/library contains the core mechanism and resources for Change Data Capture services..."
|
|
12
|
+
version = "1.0.0"
|
|
13
|
+
|
|
14
|
+
authors = [
|
|
15
|
+
{name = "Alejandro Cora González", email = "alek.cora.glez@gmail.com"}
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
maintainers = [
|
|
19
|
+
{name = "Alejandro Cora González"}
|
|
20
|
+
]
|
|
21
|
+
|
|
22
|
+
requires-python = ">=3.9"
|
|
23
|
+
license = {text = "MIT"}
|
|
24
|
+
readme = "README.md"
|
|
25
|
+
|
|
26
|
+
classifiers = [
|
|
27
|
+
# Classifiers -> https://pypi.org/classifiers/
|
|
28
|
+
"License :: OSI Approved :: MIT License",
|
|
29
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
30
|
+
"Programming Language :: Python :: 3",
|
|
31
|
+
"Programming Language :: Python :: 3.9",
|
|
32
|
+
"Programming Language :: Python :: 3.11",
|
|
33
|
+
"Intended Audience :: Developers",
|
|
34
|
+
"Topic :: Utilities"
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
dependencies = [
|
|
38
|
+
"core-mixins",
|
|
39
|
+
"core-tests",
|
|
40
|
+
"core-aws",
|
|
41
|
+
"core-db",
|
|
42
|
+
"mysql-replication==0.31",
|
|
43
|
+
"pymongo==4.6.0"
|
|
44
|
+
]
|
|
45
|
+
|
|
46
|
+
[project.urls]
|
|
47
|
+
Homepage = "https://gitlab.com/bytecode-solutions/core/core-cdc"
|
|
48
|
+
|
|
49
|
+
[project.optional-dependencies]
|
|
50
|
+
test = []
|
core-cdc-1.0.0/setup.cfg
ADDED