db-condenser 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- db_condenser-1.0.0/LICENSE +9 -0
- db_condenser-1.0.0/PKG-INFO +119 -0
- db_condenser-1.0.0/README.md +94 -0
- db_condenser-1.0.0/pyproject.toml +54 -0
- db_condenser-1.0.0/src/db_condenser/__init__.py +0 -0
- db_condenser-1.0.0/src/db_condenser/config_reader.py +191 -0
- db_condenser-1.0.0/src/db_condenser/data_masking.py +52 -0
- db_condenser-1.0.0/src/db_condenser/database_helper.py +13 -0
- db_condenser-1.0.0/src/db_condenser/db_connect.py +126 -0
- db_condenser-1.0.0/src/db_condenser/direct_subset.py +128 -0
- db_condenser-1.0.0/src/db_condenser/mysql_database_creator.py +128 -0
- db_condenser-1.0.0/src/db_condenser/mysql_database_helper.py +302 -0
- db_condenser-1.0.0/src/db_condenser/psql_database_creator.py +243 -0
- db_condenser-1.0.0/src/db_condenser/psql_database_helper.py +428 -0
- db_condenser-1.0.0/src/db_condenser/result_tabulator.py +49 -0
- db_condenser-1.0.0/src/db_condenser/subset.py +590 -0
- db_condenser-1.0.0/src/db_condenser/subset_utils.py +226 -0
- db_condenser-1.0.0/src/db_condenser/topo_orderer.py +46 -0
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
Copyright 2019, Tonic AI
|
|
2
|
+
|
|
3
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
4
|
+
|
|
5
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
6
|
+
|
|
7
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
8
|
+
|
|
9
|
+
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: db-condenser
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Config-driven database subsetting tool that creates representative samples while preserving referential integrity. Modernized fork of Tonic's Condenser with performance and feature improvements.
|
|
5
|
+
Keywords: database,subset,postgresql,sampling,data,migration,testing
|
|
6
|
+
Author: Thomas Khuu
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
License-File: LICENSE
|
|
9
|
+
Classifier: Intended Audience :: Developers
|
|
10
|
+
Classifier: Environment :: Console
|
|
11
|
+
Classifier: Operating System :: OS Independent
|
|
12
|
+
Classifier: Programming Language :: Python
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
18
|
+
Requires-Dist: faker>=38.2.0,<39
|
|
19
|
+
Requires-Dist: mysql-connector-python>=9.5.0,<10
|
|
20
|
+
Requires-Dist: psycopg[binary]>=3.2,<4
|
|
21
|
+
Requires-Dist: toposort==1.10
|
|
22
|
+
Requires-Python: >=3.10
|
|
23
|
+
Project-URL: Repository, https://github.com/tkhuu01/db-condenser
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
|
|
26
|
+
# Database Condenser
|
|
27
|
+
|
|
28
|
+
A config-driven database subsetting tool for PostgreSQL and MySQL, forked from
|
|
29
|
+
Tonic's Condenser.
|
|
30
|
+
|
|
31
|
+
Some changes from the original Condenser:
|
|
32
|
+
|
|
33
|
+
* Concurrent thread pool usage to speed up subsetting
|
|
34
|
+
* Optional temp-table strategy for memory-efficient ID batching
|
|
35
|
+
* Optional Postgres COPY protocol for faster transfer
|
|
36
|
+
* Automatic sequence reset after subsetting
|
|
37
|
+
* Built on psycopg3 and managed with astral's uv
|
|
38
|
+
|
|
39
|
+
Subsetting data is the process of taking a representative sample of your data
|
|
40
|
+
in a manner that preserves the integrity of your database, e.g., give me 5% of
|
|
41
|
+
my users. If you do this naively, e.g., just grab 5% of all the tables in your
|
|
42
|
+
database, most likely, your database will break foreign key constraints. At
|
|
43
|
+
best, you’ll end up with a statistically non-representative data sample.
|
|
44
|
+
|
|
45
|
+
One common use-case is to scale down a production database to a more reasonable
|
|
46
|
+
size so that it can be used in staging, test, and development environments. This
|
|
47
|
+
can be done to save costs and, when used in tandem with PII removal, can be
|
|
48
|
+
quite powerful as a productivity enhancer. Another example is copying specific
|
|
49
|
+
rows from one database and placing them into another while maintaining referential
|
|
50
|
+
integrity.
|
|
51
|
+
|
|
52
|
+
You can find more details about the original Condenser
|
|
53
|
+
[here](https://www.tonic.ai/blog/condenser-a-database-subsetting-tool) and
|
|
54
|
+
[here](https://www.tonic.ai/blog/condenser-v2/).
|
|
55
|
+
|
|
56
|
+
## Installation
|
|
57
|
+
|
|
58
|
+
Six steps to set up from source, assuming Python 3.10+:
|
|
59
|
+
|
|
60
|
+
1. Install [astral-uv](https://docs.astral.sh/uv/getting-started/installation/)
|
|
61
|
+
|
|
62
|
+
2. Install Postgres and/or MySQL database tools. For Postgres we need `pg_dump`
|
|
63
|
+
and `psql` tools; they need to be on your `$PATH` or point to them with
|
|
64
|
+
`$POSTGRES_PATH`. For MySQL we need `mysqldump` and `mysql`; they can be on your
|
|
65
|
+
`$PATH` or point to them with `$MYSQL_PATH`.
|
|
66
|
+
|
|
67
|
+
3. Clone this project locally.
|
|
68
|
+
|
|
69
|
+
4. Install the project with `uv sync --frozen`
|
|
70
|
+
|
|
71
|
+
5. Set up your configuration and save it in `config.json`. The provided
|
|
72
|
+
`config.json.example` has the skeleton of what you need to provide: source and
|
|
73
|
+
destination database connection details, as well as subsetting goals in
|
|
74
|
+
`initial_targets`. Here's an example that will collect 10% of a table
|
|
75
|
+
named `public.target_table`.
|
|
76
|
+
|
|
77
|
+
```
|
|
78
|
+
"initial_targets": [
|
|
79
|
+
{
|
|
80
|
+
"table": "public.target_table",
|
|
81
|
+
"percent": 10
|
|
82
|
+
}
|
|
83
|
+
]
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
There may be more required configuration depending on your database, but
|
|
87
|
+
simple databases should be easy. See the CONFIG.md for more details,
|
|
88
|
+
and `config.json.example_all` for all of the options in a single config file.
|
|
89
|
+
|
|
90
|
+
6. Run! `$ uv run subset`
|
|
91
|
+
|
|
92
|
+
Or, from PyPI:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
pip install db-condenser
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
pipx install db-condenser
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
## Running
|
|
103
|
+
|
|
104
|
+
Almost all the configuration is in the `config.json` file, so running it is as simple as
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
uv run subset
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
Three command-line arguments are supported:
|
|
111
|
+
|
|
112
|
+
`-v`: Verbose output. Useful for performance debugging. Lists almost every
|
|
113
|
+
query made, and its speed.
|
|
114
|
+
|
|
115
|
+
`--no-constraints`: For Postgres this will not add constraints found in the source
|
|
116
|
+
database to the destination database. This option has no effect for MySQL.
|
|
117
|
+
|
|
118
|
+
`-y`: Skip confirmation on subsetting to destination host if it isn't localhost
|
|
119
|
+
or 127.0.0.1
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
# Database Condenser
|
|
2
|
+
|
|
3
|
+
A config-driven database subsetting tool for PostgreSQL and MySQL, forked from
|
|
4
|
+
Tonic's Condenser.
|
|
5
|
+
|
|
6
|
+
Some changes from the original Condenser:
|
|
7
|
+
|
|
8
|
+
* Concurrent thread pool usage to speed up subsetting
|
|
9
|
+
* Optional temp-table strategy for memory-efficient ID batching
|
|
10
|
+
* Optional Postgres COPY protocol for faster transfer
|
|
11
|
+
* Automatic sequence reset after subsetting
|
|
12
|
+
* Built on psycopg3 and managed with astral's uv
|
|
13
|
+
|
|
14
|
+
Subsetting data is the process of taking a representative sample of your data
|
|
15
|
+
in a manner that preserves the integrity of your database, e.g., give me 5% of
|
|
16
|
+
my users. If you do this naively, e.g., just grab 5% of all the tables in your
|
|
17
|
+
database, most likely, your database will break foreign key constraints. At
|
|
18
|
+
best, you’ll end up with a statistically non-representative data sample.
|
|
19
|
+
|
|
20
|
+
One common use-case is to scale down a production database to a more reasonable
|
|
21
|
+
size so that it can be used in staging, test, and development environments. This
|
|
22
|
+
can be done to save costs and, when used in tandem with PII removal, can be
|
|
23
|
+
quite powerful as a productivity enhancer. Another example is copying specific
|
|
24
|
+
rows from one database and placing them into another while maintaining referential
|
|
25
|
+
integrity.
|
|
26
|
+
|
|
27
|
+
You can find more details about the original Condenser
|
|
28
|
+
[here](https://www.tonic.ai/blog/condenser-a-database-subsetting-tool) and
|
|
29
|
+
[here](https://www.tonic.ai/blog/condenser-v2/).
|
|
30
|
+
|
|
31
|
+
## Installation
|
|
32
|
+
|
|
33
|
+
Six steps to set up from source, assuming Python 3.10+:
|
|
34
|
+
|
|
35
|
+
1. Install [astral-uv](https://docs.astral.sh/uv/getting-started/installation/)
|
|
36
|
+
|
|
37
|
+
2. Install Postgres and/or MySQL database tools. For Postgres we need `pg_dump`
|
|
38
|
+
and `psql` tools; they need to be on your `$PATH` or point to them with
|
|
39
|
+
`$POSTGRES_PATH`. For MySQL we need `mysqldump` and `mysql`; they can be on your
|
|
40
|
+
`$PATH` or point to them with `$MYSQL_PATH`.
|
|
41
|
+
|
|
42
|
+
3. Clone this project locally.
|
|
43
|
+
|
|
44
|
+
4. Install the project with `uv sync --frozen`
|
|
45
|
+
|
|
46
|
+
5. Set up your configuration and save it in `config.json`. The provided
|
|
47
|
+
`config.json.example` has the skeleton of what you need to provide: source and
|
|
48
|
+
destination database connection details, as well as subsetting goals in
|
|
49
|
+
`initial_targets`. Here's an example that will collect 10% of a table
|
|
50
|
+
named `public.target_table`.
|
|
51
|
+
|
|
52
|
+
```
|
|
53
|
+
"initial_targets": [
|
|
54
|
+
{
|
|
55
|
+
"table": "public.target_table",
|
|
56
|
+
"percent": 10
|
|
57
|
+
}
|
|
58
|
+
]
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
There may be more required configuration depending on your database, but
|
|
62
|
+
simple databases should be easy. See the CONFIG.md for more details,
|
|
63
|
+
and `config.json.example_all` for all of the options in a single config file.
|
|
64
|
+
|
|
65
|
+
6. Run! `$ uv run subset`
|
|
66
|
+
|
|
67
|
+
Or, from PyPI:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
pip install db-condenser
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
pipx install db-condenser
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
## Running
|
|
78
|
+
|
|
79
|
+
Almost all the configuration is in the `config.json` file, so running it is as simple as
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
uv run subset
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
Three command-line arguments are supported:
|
|
86
|
+
|
|
87
|
+
`-v`: Verbose output. Useful for performance debugging. Lists almost every
|
|
88
|
+
query made, and its speed.
|
|
89
|
+
|
|
90
|
+
`--no-constraints`: For Postgres this will not add constraints found in the source
|
|
91
|
+
database to the destination database. This option has no effect for MySQL.
|
|
92
|
+
|
|
93
|
+
`-y`: Skip confirmation on subsetting to destination host if it isn't localhost
|
|
94
|
+
or 127.0.0.1
|
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "db-condenser"
|
|
3
|
+
description = "Config-driven database subsetting tool that creates representative samples while preserving referential integrity. Modernized fork of Tonic's Condenser with performance and feature improvements."
|
|
4
|
+
authors = [{ name = "Thomas Khuu" }]
|
|
5
|
+
license = "MIT"
|
|
6
|
+
license-files = ["LICENSE"]
|
|
7
|
+
version = "1.0.0"
|
|
8
|
+
readme = "README.md"
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Intended Audience :: Developers",
|
|
11
|
+
"Environment :: Console",
|
|
12
|
+
"Operating System :: OS Independent",
|
|
13
|
+
"Programming Language :: Python",
|
|
14
|
+
"Programming Language :: Python :: 3",
|
|
15
|
+
"Programming Language :: Python :: 3.10",
|
|
16
|
+
"Programming Language :: Python :: 3.11",
|
|
17
|
+
"Programming Language :: Python :: 3.12",
|
|
18
|
+
"Programming Language :: Python :: 3.13",
|
|
19
|
+
]
|
|
20
|
+
requires-python = ">=3.10"
|
|
21
|
+
keywords = [
|
|
22
|
+
"database",
|
|
23
|
+
"subset",
|
|
24
|
+
"postgresql",
|
|
25
|
+
"sampling",
|
|
26
|
+
"data",
|
|
27
|
+
"migration",
|
|
28
|
+
"testing",
|
|
29
|
+
]
|
|
30
|
+
dependencies = [
|
|
31
|
+
"faker>=38.2.0,<39",
|
|
32
|
+
"mysql-connector-python>=9.5.0,<10",
|
|
33
|
+
"psycopg[binary]>=3.2,<4",
|
|
34
|
+
"toposort==1.10",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.urls]
|
|
38
|
+
Repository = "https://github.com/tkhuu01/db-condenser"
|
|
39
|
+
|
|
40
|
+
[project.scripts]
|
|
41
|
+
subset = "db_condenser.direct_subset:main"
|
|
42
|
+
|
|
43
|
+
[dependency-groups]
|
|
44
|
+
dev = [
|
|
45
|
+
"pytest>=8",
|
|
46
|
+
"ruff==0.15.11",
|
|
47
|
+
]
|
|
48
|
+
|
|
49
|
+
[tool.ruff.lint]
|
|
50
|
+
extend-select = ["I"]
|
|
51
|
+
|
|
52
|
+
[build-system]
|
|
53
|
+
requires = ["uv_build>=0.11.0,<0.12.0"]
|
|
54
|
+
build-backend = "uv_build"
|
|
File without changes
|
|
@@ -0,0 +1,191 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import sys
|
|
3
|
+
from dataclasses import dataclass, field
|
|
4
|
+
from enum import Enum
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
@dataclass
|
|
9
|
+
class InitialTarget:
|
|
10
|
+
table: str
|
|
11
|
+
percent: float | None = None
|
|
12
|
+
where: str | None = None
|
|
13
|
+
|
|
14
|
+
def __post_init__(self):
|
|
15
|
+
# Exactly one of where/percent must be set
|
|
16
|
+
if (self.where is None) == (self.percent is None):
|
|
17
|
+
raise ValueError(
|
|
18
|
+
"Initial Target must specify exactly one of 'where' or 'percent'"
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class DbType(str, Enum):
    """Supported database engines.

    Members are also ``str`` instances, so they compare equal to their
    lowercase string values.
    """

    POSTGRES = "postgres"
    MYSQL = "mysql"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
|
|
28
|
+
class DbConnectInfo:
|
|
29
|
+
user_name: str
|
|
30
|
+
host: str
|
|
31
|
+
db_name: str
|
|
32
|
+
port: int
|
|
33
|
+
ssl_mode: str | None = None
|
|
34
|
+
# No password will prompt user
|
|
35
|
+
password: str | None = None
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
|
|
39
|
+
class UpstreamFilter:
|
|
40
|
+
condition: str
|
|
41
|
+
table: str | None = None
|
|
42
|
+
column: str | None = None
|
|
43
|
+
|
|
44
|
+
def __post_init__(self):
|
|
45
|
+
# Exactly one of table/column must be set
|
|
46
|
+
if (self.table is None) == (self.column is None):
|
|
47
|
+
raise ValueError(
|
|
48
|
+
"Upstream filters must specify exactly one of 'table' or 'column'"
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
@dataclass
class DependencyBreak:
    """A (foreign-key table, target table) relationship listed as a dependency break."""

    fk_table: str
    target_table: str
    # Off by default; must be enabled explicitly per relationship.
    preserve_fk_opportunistically: bool = False
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
@dataclass
class FkAugmentation:
    """A foreign-key relationship described in configuration.

    ``fk_columns`` and ``target_columns`` must contain the same number of
    entries; otherwise ``ValueError`` is raised at construction time.
    """

    fk_table: str
    fk_columns: list[str]
    target_table: str
    target_columns: list[str]

    def __post_init__(self):
        fk_count, target_count = len(self.fk_columns), len(self.target_columns)
        if fk_count == target_count:
            return
        raise ValueError("fk_columns and target_columns must be the same length")
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@dataclass
class Config:
    """Aggregated subsetting settings.

    Only the database type, the initial targets and the two connection
    descriptions are required; every other field has a default.
    """

    db_type: DbType
    initial_targets: list[InitialTarget]
    source_db_connection_info: DbConnectInfo
    destination_db_connection_info: DbConnectInfo
    keep_disconnected_tables: bool = False
    upstream_filters: list[UpstreamFilter] = field(default_factory=list)
    excluded_tables: list[str] = field(default_factory=list)
    passthrough_tables: list[str] = field(default_factory=list)
    dependency_breaks: list[DependencyBreak] = field(default_factory=list)
    fk_augmentation: list[FkAugmentation] = field(default_factory=list)
    max_rows_per_table: int | Literal["ALL"] | None = None
    use_temp_tables: bool = False
    use_copy_protocol: bool = False
    pre_constraint_sql: list[str] = field(default_factory=list)
    post_subset_sql: list[str] = field(default_factory=list)

    @property
    def dependency_break_set(self) -> set[tuple[str, str]]:
        """All configured breaks as ``(fk_table, target_table)`` pairs."""
        pairs: set[tuple[str, str]] = set()
        for brk in self.dependency_breaks:
            pairs.add((brk.fk_table, brk.target_table))
        return pairs

    @property
    def preserve_fk_opportunistically(self) -> set[tuple[str, str]]:
        """Pairs for the breaks whose ``preserve_fk_opportunistically`` flag is set."""
        flagged = (
            brk for brk in self.dependency_breaks if brk.preserve_fk_opportunistically
        )
        return set((brk.fk_table, brk.target_table) for brk in flagged)

    @property
    def initial_target_tables(self) -> list[str]:
        """Table names of the initial targets, in configuration order."""
        tables: list[str] = []
        for entry in self.initial_targets:
            tables.append(entry.table)
        return tables
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
# Module-level configuration singleton: set by initialize(), read through
# get_config(), and cleared by reset_config().
config: Config | None = None
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def _raw_dict_to_config(raw_config: dict) -> Config:
    """Build a ``Config`` from the dictionary parsed out of the JSON config.

    Args:
        raw_config: Parsed configuration. ``db_type``, ``initial_targets``,
            ``source_db_connection_info`` and ``destination_db_connection_info``
            are read unconditionally; every other key is optional.

    Returns:
        The populated ``Config``.

    Raises:
        KeyError: If one of the unconditionally-read keys is missing, or an
            ``fk_augmentation`` entry has ``fk_schema`` but lacks one of the
            other keys read alongside it.
        ValueError: If ``db_type`` is not a known ``DbType`` value, or one of
            the nested dataclasses rejects its arguments.
    """
    db_type = DbType(raw_config["db_type"].lower())

    initial_targets = [
        InitialTarget(**target) for target in raw_config["initial_targets"]
    ]

    source_db = DbConnectInfo(**raw_config["source_db_connection_info"])
    dest_db = DbConnectInfo(**raw_config["destination_db_connection_info"])

    upstream_filters = [
        UpstreamFilter(**entry) for entry in raw_config.get("upstream_filters", [])
    ]

    excluded_tables = list(raw_config.get("excluded_tables", []))
    # De-duplicate while keeping the order written in the config file. A plain
    # set() would reorder the names differently from run to run because string
    # hashing is randomised per process.
    passthrough_tables = list(dict.fromkeys(raw_config.get("passthrough_tables", [])))

    dependency_breaks = [
        DependencyBreak(**relation)
        for relation in raw_config.get("dependency_breaks", [])
    ]

    fk_augmentation = []
    for fka in raw_config.get("fk_augmentation", []):
        if "fk_schema" in fka:
            # Entries carrying separate schema keys are folded into
            # "schema.table" names before constructing the dataclass.
            fka = {
                "fk_table": fka["fk_schema"] + "." + fka["fk_table"],
                "fk_columns": fka["fk_columns"],
                "target_table": fka["target_schema"] + "." + fka["target_table"],
                "target_columns": fka["target_columns"],
            }
        fk_augmentation.append(FkAugmentation(**fka))

    return Config(
        db_type=db_type,
        initial_targets=initial_targets,
        source_db_connection_info=source_db,
        destination_db_connection_info=dest_db,
        keep_disconnected_tables=bool(
            raw_config.get("keep_disconnected_tables", False)
        ),
        upstream_filters=upstream_filters,
        excluded_tables=excluded_tables,
        passthrough_tables=passthrough_tables,
        dependency_breaks=dependency_breaks,
        fk_augmentation=fk_augmentation,
        max_rows_per_table=raw_config.get("max_rows_per_table", None),
        use_temp_tables=bool(raw_config.get("use_temp_tables", False)),
        use_copy_protocol=bool(raw_config.get("use_copy_protocol", False)),
        pre_constraint_sql=list(raw_config.get("pre_constraint_sql", [])),
        post_subset_sql=list(raw_config.get("post_subset_sql", [])),
    )
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
def initialize(file_like=None):
    """Load the JSON configuration and store it in the module-level ``config``.

    Args:
        file_like: Optional readable object handed to ``json.load``. When it is
            falsy, ``config.json`` in the current working directory is read
            instead.

    A warning is written to stderr when a configuration is already loaded; the
    new configuration still replaces the old one.
    """
    global config
    if config is not None:
        print("WARNING: Attempted to initialize configuration twice.", file=sys.stderr)

    if not file_like:
        # JSON text is UTF-8; do not depend on the platform's locale encoding.
        with open("config.json", "r", encoding="utf-8") as fp:
            raw_config = json.load(fp)
    else:
        raw_config = json.load(file_like)

    config = _raw_dict_to_config(raw_config)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def get_config() -> Config:
    """Return the loaded configuration.

    Raises:
        RuntimeError: If no configuration has been loaded yet.
    """
    current = config
    if current is not None:
        return current
    raise RuntimeError("Config not initialized — call initialize() first")
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def reset_config():
    """Clear the module-level ``config`` by setting it back to ``None``."""
    global config
    config = None
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
from typing import Any, Final
|
|
2
|
+
|
|
3
|
+
from faker import Faker
|
|
4
|
+
|
|
5
|
+
# Single Faker instance used by all DataMasking helpers below.
fake = Faker()
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class DataMasking:
    """Namespace of static helpers that replace a value with masked data."""

    @staticmethod
    def null_out(_: str):
        """Ignore the input and return ``None``."""
        return None

    @staticmethod
    def mask_numbers(value: Any) -> str | None:
        """Replace every digit character of ``value`` with a generated digit.

        ``value`` is converted with ``str()`` first; characters that are not
        digits are kept unchanged. ``None`` is passed through as ``None``.
        """
        if value is None:
            return None
        return "".join(
            str(fake.random_digit()) if ch.isdigit() else ch for ch in str(value)
        )

    @staticmethod
    def mask_characters(value: Any) -> str | None:
        """Replace every alphabetic character of ``value`` with a generated letter.

        ``value`` is converted with ``str()`` first; digits, whitespace and
        other non-alphabetic characters are kept unchanged. ``None`` is passed
        through as ``None``.
        """
        if value is None:
            return None
        return "".join(fake.random_letter() if ch.isalpha() else ch for ch in str(value))

    @staticmethod
    def mask_email(email: Any) -> str | None:
        """Replace the local part of ``email`` while keeping its domain.

        Returns ``None`` for ``None``. When the value has no ``@`` or nothing
        after the last ``@``, a completely generated address is returned.
        """
        if email is None:
            return None
        # The domain is whatever follows the LAST "@": splitting on the first
        # one would return the wrong domain for values with several "@".
        _, separator, domain = str(email).rpartition("@")
        if not separator or not domain:
            return fake.email()
        return f"{fake.user_name()}@{domain}"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
# Lookup from a masking strategy name to the DataMasking function implementing it.
DATA_MASKING_MAPPER: Final = {
    strategy: getattr(DataMasking, strategy)
    for strategy in ("null_out", "mask_numbers", "mask_characters", "mask_email")
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from db_condenser.config_reader import DbType, get_config
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
def get_specific_helper():
    """Return the helper module for the configured database type.

    The Postgres helper is returned when ``db_type`` is ``DbType.POSTGRES``;
    any other value selects the MySQL helper. The helper modules are imported
    lazily, so only the selected one is loaded.
    """
    if get_config().db_type != DbType.POSTGRES:
        from db_condenser import mysql_database_helper

        return mysql_database_helper

    from db_condenser import psql_database_helper

    return psql_database_helper
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
import getpass
|
|
2
|
+
import sys
|
|
3
|
+
import time
|
|
4
|
+
from datetime import datetime
|
|
5
|
+
|
|
6
|
+
import mysql.connector
|
|
7
|
+
import psycopg
|
|
8
|
+
|
|
9
|
+
from db_condenser.config_reader import DbConnectInfo, DbType
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class DbConnection:
    """Base wrapper that holds a driver connection and forwards commit/close."""

    def __init__(self, connection):
        self.connection = connection

    def commit(self):
        """Commit on the wrapped connection."""
        underlying = self.connection
        underlying.commit()

    def close(self):
        """Close the wrapped connection."""
        underlying = self.connection
        underlying.close()
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
class LoggingCursor:
    """Proxy around a database cursor that can log each executed query.

    ``execute`` is intercepted to print the query and its duration when
    ``verbose`` is true. Every attribute not defined on this class is looked
    up on the wrapped cursor.
    """

    def __init__(self, cursor, verbose=False):
        self.inner_cursor = cursor
        self._verbose = verbose

    def execute(self, query, params=None):
        """Run ``query`` on the wrapped cursor and return its result.

        In verbose mode the query text is printed before execution and the
        elapsed time afterwards; stdout is flushed after each message.
        """
        if not self._verbose:
            return self.inner_cursor.execute(query, params)

        print(f"Beginning query @ {datetime.now()}:\n\t{query}", flush=True)
        # perf_counter() is monotonic, so the reported duration cannot be
        # distorted by wall-clock adjustments while the query runs.
        started = time.perf_counter()
        retval = self.inner_cursor.execute(query, params)
        print(f"\tQuery completed in {time.perf_counter() - started}s", flush=True)
        return retval

    def __getattr__(self, name):
        # Only reached when normal lookup on the proxy fails. If inner_cursor
        # itself is missing (an instance created without __init__), looking it
        # up below would re-enter this method without end, so fail cleanly.
        if name == "inner_cursor":
            raise AttributeError(name)
        # getattr() performs the full lookup, including any __getattr__
        # fallback defined by the wrapped cursor.
        return getattr(self.inner_cursor, name)

    def __exit__(self, a, b, c):
        return self.inner_cursor.__exit__(a, b, c)

    def __enter__(self):
        # Wrap whatever the inner cursor's __enter__ returns so that queries
        # issued inside the ``with`` block are logged too.
        return LoggingCursor(self.inner_cursor.__enter__(), self._verbose)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
# small wrapper to the connection class that gives us a common interface to the cursor()
|
|
50
|
+
# method across MySQL and Postgres. This one is for Postgres
|
|
51
|
+
class PsqlConnection(DbConnection):
    """Postgres connection wrapper exposing the shared ``cursor()`` interface."""

    def __init__(self, connect, read_repeatable, verbose=False):
        """Open a psycopg connection from the attributes of ``connect``.

        ``sslmode`` is passed only when ``connect.ssl_mode`` is truthy. When
        ``read_repeatable`` is true the connection's isolation level is set to
        REPEATABLE READ.
        """
        connection_args = {
            "dbname": connect.db_name,
            "user": connect.user,
            "password": connect.password,
            "host": connect.host,
            "port": connect.port,
        }
        ssl_mode = connect.ssl_mode
        if ssl_mode:
            connection_args["sslmode"] = ssl_mode

        super().__init__(psycopg.connect(**connection_args))
        self._verbose = verbose
        if read_repeatable:
            self.connection.isolation_level = psycopg.IsolationLevel.REPEATABLE_READ

    def cursor(self, name=None, withhold=False):
        """Return a ``LoggingCursor`` around a new cursor of the connection."""
        raw_cursor = self.connection.cursor(name=name, withhold=withhold)
        return LoggingCursor(raw_cursor, self._verbose)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
# small wrapper to the connection class that gives us a common interface to the cursor()
|
|
76
|
+
# method across MySQL and Postgres. This one is for MySQL
|
|
77
|
+
class MySqlConnection(DbConnection):
    """MySQL connection wrapper exposing the shared ``cursor()`` interface."""

    def __init__(self, connect, read_repeatable, verbose=False):
        """Open a mysql.connector connection from the attributes of ``connect``.

        When ``read_repeatable`` is true a transaction is started with the
        REPEATABLE READ isolation level.
        """
        raw_connection = mysql.connector.connect(
            host=connect.host,
            port=connect.port,
            user=connect.user,
            password=connect.password,
            database=connect.db_name,
        )
        super().__init__(raw_connection)

        self.db_name = connect.db_name
        self._verbose = verbose

        if read_repeatable:
            self.connection.start_transaction(isolation_level="REPEATABLE READ")

    def cursor(self, name=None, withhold=False):
        """Return a ``LoggingCursor`` around a new cursor of the connection.

        ``name`` and ``withhold`` are accepted so the signature matches the
        Postgres wrapper; they are not used here.
        """
        raw_cursor = self.connection.cursor()
        return LoggingCursor(raw_cursor, self._verbose)
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
class DbConnect:
    """Connection settings plus a factory for the matching connection wrapper."""

    def __init__(self, db_type: DbType, connection_info: DbConnectInfo, verbose=False):
        """Copy the connection settings, prompting for the password if unset.

        When ``connection_info.password`` is ``None`` the user is prompted via
        ``getpass`` and the entered value is written back onto
        ``connection_info``.
        """
        if connection_info.password is None:
            connection_info.password = getpass.getpass(
                "Enter password for {0} on host {1}: ".format(
                    connection_info.user_name, connection_info.host
                )
            )

        self.user = connection_info.user_name
        self.password = connection_info.password
        self.host = connection_info.host
        self.port = connection_info.port
        self.db_name = connection_info.db_name
        self.ssl_mode = connection_info.ssl_mode
        self.__db_type = db_type
        self._verbose = verbose

    def get_db_connection(
        self, read_repeatable=False
    ) -> PsqlConnection | MySqlConnection:
        """Open and return a new connection wrapper for the configured type.

        Raises:
            ValueError: If the stored database type is neither Postgres nor MySQL.
        """
        if self.__db_type == DbType.POSTGRES:
            return PsqlConnection(self, read_repeatable, self._verbose)
        if self.__db_type == DbType.MYSQL:
            return MySqlConnection(self, read_repeatable, self._verbose)
        # Format rather than concatenate: "+" would raise TypeError for a
        # non-string db_type and hide the intended ValueError.
        raise ValueError(f"unknown db_type {self.__db_type}")
|