data-transfer-lib 0.1.1__py3-none-any.whl
- data_transfer_lib/__init__.py +11 -0
- data_transfer_lib/connections/__init__.py +4 -0
- data_transfer_lib/connections/base.py +39 -0
- data_transfer_lib/connections/clickhouse.py +67 -0
- data_transfer_lib/connections/postgres.py +82 -0
- data_transfer_lib/reader/__init__.py +3 -0
- data_transfer_lib/reader/base.py +25 -0
- data_transfer_lib/reader/reader.py +43 -0
- data_transfer_lib/schema/__init__.py +4 -0
- data_transfer_lib/schema/mapper.py +241 -0
- data_transfer_lib/schema/validator.py +107 -0
- data_transfer_lib/utils/__init__.py +13 -0
- data_transfer_lib/utils/exceptions.py +14 -0
- data_transfer_lib/writer/__init__.py +3 -0
- data_transfer_lib/writer/base.py +27 -0
- data_transfer_lib/writer/writer.py +73 -0
- data_transfer_lib-0.1.1.dist-info/METADATA +30 -0
- data_transfer_lib-0.1.1.dist-info/RECORD +20 -0
- data_transfer_lib-0.1.1.dist-info/WHEEL +5 -0
- data_transfer_lib-0.1.1.dist-info/top_level.txt +1 -0

data_transfer_lib/__init__.py
@@ -0,0 +1,11 @@
+"""
+Data Transfer Library for transferring data between databases using PySpark
+"""
+
+from data_transfer_lib.connections.postgres import Postgres
+from data_transfer_lib.connections.clickhouse import ClickHouse
+from data_transfer_lib.reader.reader import Reader
+from data_transfer_lib.writer.writer import Writer
+
+__version__ = "0.1.0"
+__all__ = ["Postgres", "ClickHouse", "Reader", "Writer"]
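
The __init__.py above is the package's entire public surface. A minimal end-to-end sketch of how these four exports appear intended to compose, based only on the signatures in this diff (hosts, credentials, and table names are hypothetical):

    from data_transfer_lib import Postgres, ClickHouse, Reader, Writer

    # Hypothetical connection parameters; only the keyword names come from this diff.
    source = Postgres(host="pg.example.internal", user="etl", password="secret", database="shop")
    target = ClickHouse(host="ch.example.internal", user="etl", password="secret", database="analytics")

    # Reader validates the source schema in _prepare(), then loads the table over JDBC.
    df = Reader(connection=source, db_name="public", table_name="orders").start()

    # Writer validates the DataFrame schema against the target table, then saves.
    Writer(connection=target, db_name="analytics", table_name="orders").start(
        df, mode="append", batch_size=10000, num_partitions=8
    )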

data_transfer_lib/connections/base.py
@@ -0,0 +1,39 @@
+from abc import ABC, abstractmethod
+from typing import Optional, Dict, Any
+from pyspark.sql import SparkSession
+
+class BaseConnection(ABC):
+    def __init__(
+        self,
+        host: str,
+        port: int,
+        user: str,
+        password: str,
+        database: Optional[str] = None,
+        spark: Optional[SparkSession] = None,
+    ):
+        self.host = host
+        self.port = port
+        self.user = user
+        self.password = password
+        self.database = database
+        self.spark = spark or self._get_or_create_spark()
+
+    @abstractmethod
+    def get_jdbc_url(self) -> str:
+        pass
+
+    @abstractmethod
+    def get_connection_properties(self) -> Dict[str, str]:
+        pass
+
+    @abstractmethod
+    def test_connection(self) -> bool:
+        pass
+
+    @abstractmethod
+    def get_table_schema(self, db_name: str, table_name: str) -> Dict[str, Any]:
+        pass
+
+    def _get_or_create_spark(self) -> SparkSession:
+        return SparkSession.builder.getOrCreate()
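
BaseConnection fixes the contract every backend must implement: a JDBC URL, JDBC connection properties, a connectivity check, and table-schema introspection. As a sketch, a hypothetical third backend that is not part of this package could look like the following (the driver class and port are the standard MySQL Connector/J values; everything else mirrors the two subclasses below):

    from typing import Dict, Any, Optional
    from pyspark.sql import SparkSession
    from data_transfer_lib.connections.base import BaseConnection

    class MySQL(BaseConnection):
        """Hypothetical example backend; not shipped in data-transfer-lib."""

        def __init__(self, host: str, user: str, password: str,
                     database: str = "mysql", port: int = 3306,
                     spark: Optional[SparkSession] = None):
            super().__init__(host=host, port=port, user=user,
                             password=password, database=database, spark=spark)

        def get_jdbc_url(self) -> str:
            return f"jdbc:mysql://{self.host}:{self.port}/{self.database}"

        def get_connection_properties(self) -> Dict[str, str]:
            return {"user": self.user, "password": self.password,
                    "driver": "com.mysql.cj.jdbc.Driver"}

        def test_connection(self) -> bool:
            return True

        def get_table_schema(self, db_name: str, table_name: str) -> Dict[str, Any]:
            # A real implementation would query information_schema.columns over JDBC,
            # as the Postgres subclass below does.
            raise NotImplementedError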

data_transfer_lib/connections/clickhouse.py
@@ -0,0 +1,67 @@
+from typing import Dict, Any, Optional
+from pyspark.sql import SparkSession
+from data_transfer_lib.connections.base import BaseConnection
+
+
+class ClickHouse(BaseConnection):
+    def __init__(
+        self,
+        host: str,
+        user: str,
+        password: str,
+        database: str = "default",
+        port: int = 8123,
+        spark: Optional[SparkSession] = None
+    ):
+        super().__init__(
+            host=host,
+            port=port,
+            user=user,
+            password=password,
+            database=database,
+            spark=spark,
+        )
+
+    def get_jdbc_url(self) -> str:
+        return f"jdbc:clickhouse://{self.host}:{self.port}/{self.database}"
+
+    def get_connection_properties(self) -> Dict[str, str]:
+        return {
+            "user": self.user,
+            "password": self.password,
+            "driver": "com.clickhouse.jdbc.ClickHouseDriver"
+        }
+
+    def test_connection(self) -> bool:
+        return True
+
+    def get_table_schema(self, db_name: str, table_name: str) -> Dict[str, Any]:
+        schema_query = f"""
+            SELECT
+                name,
+                type
+            FROM system.columns
+            WHERE database = '{db_name}'
+                AND table = '{table_name}'
+            ORDER BY position
+        """
+
+        # TODO: try -> except
+        schema_df = (
+            self.spark.read
+            .format("jdbc")
+            .option("url", self.get_jdbc_url())
+            .option("query", schema_query)
+            .option("user", self.user)
+            .option("password", self.password)
+            .option("driver", "com.clickhouse.jdbc.ClickHouseDriver")
+            .load()
+        )
+
+        schema_dict: Dict[str, str] = {}
+        for row in schema_df.collect():
+            column_name = row['name']
+            column_type = row['type']
+            schema_dict[column_name] = column_type
+
+        return schema_dict

data_transfer_lib/connections/postgres.py
@@ -0,0 +1,82 @@
+from typing import Dict, Any, Optional
+from pyspark.sql import SparkSession
+from data_transfer_lib.connections.base import BaseConnection
+
+
+class Postgres(BaseConnection):
+    def __init__(
+        self,
+        host: str,
+        user: str,
+        password: str,
+        database: str = "postgres",
+        port: int = 5432,
+        spark: Optional[SparkSession] = None,
+    ):
+        super().__init__(
+            host=host,
+            port=port,
+            user=user,
+            password=password,
+            database=database,
+            spark=spark,
+        )
+
+    def get_jdbc_url(self) -> str:
+        return f"jdbc:postgresql://{self.host}:{self.port}/{self.database}"
+
+    def get_connection_properties(self) -> Dict[str, str]:
+        return {
+            "user": self.user,
+            "password": self.password,
+            "driver": "org.postgresql.Driver"
+        }
+
+    def test_connection(self) -> bool:
+        print("Checking connection to PostgreSQL")
+        return True
+
+    def get_table_schema(self, db_name: str, table_name: str) -> Dict[str, Any]:
+
+        schema_query = f"""
+            SELECT
+                column_name,
+                data_type,
+                character_maximum_length,
+                numeric_precision,
+                numeric_scale
+            FROM information_schema.columns
+            WHERE table_schema = '{db_name}'
+                AND table_name = '{table_name}'
+            ORDER BY ordinal_position
+        """
+
+        # TODO: try -> except
+
+        schema_df = (
+            self.spark.read
+            .format("jdbc")
+            .option("url", self.get_jdbc_url())
+            .option("query", schema_query)
+            .option("user", self.user)
+            .option("password", self.password)
+            .option("driver", "org.postgresql.Driver")
+            .load()
+        )
+
+        schema_dict = {}
+        for row in schema_df.collect():
+            column_name = row['column_name']
+            data_type = row['data_type']
+
+            # Add precision and scale
+            if data_type in ('numeric', 'decimal') and row['numeric_precision']:
+                data_type = f"{data_type}({row['numeric_precision']},{row['numeric_scale']})"
+
+            # Add max_length
+            elif data_type in ('character varying', 'varchar') and row['character_maximum_length']:
+                data_type = f"varchar({row['character_maximum_length']})"
+
+            schema_dict[column_name] = data_type
+
+        return schema_dict
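
Note that both get_table_schema implementations interpolate db_name and table_name directly into SQL, so those values must come from a trusted caller. A defensive sketch (a hypothetical helper, not part of the package) that accepts only ordinary unquoted identifiers before interpolation:

    import re

    def safe_identifier(name: str) -> str:
        # Reject anything that is not a plain unquoted SQL identifier.
        if not re.fullmatch(r"[A-Za-z_][A-Za-z0-9_]*", name):
            raise ValueError(f"unsafe SQL identifier: {name!r}")
        return name

Each schema query could then use safe_identifier(db_name) and safe_identifier(table_name) in place of the raw arguments.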

data_transfer_lib/reader/base.py
@@ -0,0 +1,25 @@
+from abc import ABC, abstractmethod
+from typing import Optional, Any
+from pyspark.sql import DataFrame
+from data_transfer_lib.connections.base import BaseConnection
+
+
+class BaseReader(ABC):
+    def __init__(
+        self,
+        connection: BaseConnection,
+        db_name: str,
+        table_name: str,
+    ):
+        self.connection = connection
+        self.db_name = db_name
+        self.table_name = table_name
+        self._prepare()
+
+    @abstractmethod
+    def _prepare(self) -> None:
+        pass
+
+    @abstractmethod
+    def start(self) -> DataFrame:
+        pass

data_transfer_lib/reader/reader.py
@@ -0,0 +1,43 @@
+from pyspark.sql import DataFrame
+from data_transfer_lib.reader.base import BaseReader
+from data_transfer_lib.connections.base import BaseConnection
+from data_transfer_lib.schema.validator import SchemaValidator
+
+class Reader(BaseReader):
+    def __init__(
+        self,
+        connection: BaseConnection,
+        db_name: str,
+        table_name: str,
+    ):
+        self.source_schema = None
+        super().__init__(
+            connection=connection,
+            db_name=db_name,
+            table_name=table_name,
+        )
+
+    def _prepare(self) -> None:
+        self.source_schema = self.connection.get_table_schema(
+            self.db_name,
+            self.table_name
+        )
+
+        SchemaValidator.validate_source_to_spark(self.source_schema)
+
+    def start(self) -> DataFrame:
+
+        table_name = f"{self.db_name}.{self.table_name}"
+
+        reader = (
+            self.connection
+            .spark.read
+            .format("jdbc")
+            .option("url", self.connection.get_jdbc_url())
+            .option("dbtable", table_name)
+            .option("user", self.connection.user)
+            .option("password", self.connection.password)
+            .option("driver", self.connection.get_connection_properties()["driver"])
+        )
+        df = reader.load()
+        return df

data_transfer_lib/schema/mapper.py
@@ -0,0 +1,241 @@
+from typing import Dict, Any
+from pyspark.sql.types import DataType
+
+
+class TypeMapper:
+    POSTGRES_TO_SPARK = {
+        # Boolean
+        "boolean": "BooleanType",
+        "bool": "BooleanType",
+
+        # Integer types
+        "smallint": "ShortType",
+        "int2": "ShortType",
+        "smallserial": "ShortType",
+        "serial2": "ShortType",
+
+        "integer": "IntegerType",
+        "int": "IntegerType",
+        "int4": "IntegerType",
+        "serial": "IntegerType",
+        "serial4": "IntegerType",
+
+        "bigint": "LongType",
+        "int8": "LongType",
+        "bigserial": "LongType",
+        "serial8": "LongType",
+
+        # Floating point
+        "real": "FloatType",
+        "float4": "FloatType",
+        "double precision": "DoubleType",
+        "float8": "DoubleType",
+        # float and float(p) are handled separately
+
+        # Numeric/Decimal
+        "numeric": "DecimalType",
+        "decimal": "DecimalType",
+
+        # String types
+        "character varying": "StringType",  # varchar
+        "varchar": "StringType",
+        "character": "StringType",  # char
+        "char": "StringType",
+        "bpchar": "StringType",
+        "text": "StringType",
+
+        # Binary
+        "bytea": "BinaryType",
+
+        # Date/Time
+        "date": "DateType",
+        "timestamp": "TimestampType",
+        "timestamp without time zone": "TimestampType",
+        "timestamp with time zone": "TimestampType",
+        "timestamptz": "TimestampType",
+        "time": "TimestampType",
+        "time without time zone": "TimestampType",
+        "time with time zone": "TimestampType",
+        "timetz": "TimestampType",
+
+        # Interval
+        "interval": "StringType",
+
+        # Monetary
+        "money": "StringType",
+
+        # Network Address Types
+        "inet": "StringType",
+        "cidr": "StringType",
+        "macaddr": "StringType",
+        "macaddr8": "StringType",
+
+        # Geometric Types
+        "point": "StringType",
+        "line": "StringType",
+        "lseg": "StringType",
+        "box": "StringType",
+        "path": "StringType",
+        "polygon": "StringType",
+        "circle": "StringType",
+
+        # Log Sequence Number
+        "pg_lsn": "StringType",
+
+        # Bit types handled separately (bit(1) -> BooleanType, bit(>1) -> BinaryType)
+
+        # Text Search Types
+        "tsvector": "StringType",
+        "tsquery": "StringType",
+
+        # UUID
+        "uuid": "StringType",
+
+        # XML
+        "xml": "StringType",
+
+        # JSON
+        "json": "StringType",
+        "jsonb": "StringType",
+
+        # Range Types
+        "int4range": "StringType",
+        "int8range": "StringType",
+        "numrange": "StringType",
+        "tsrange": "StringType",
+        "tstzrange": "StringType",
+        "daterange": "StringType",
+
+        # Object Identifier Types
+        "oid": "DecimalType",
+        "regproc": "StringType",
+        "regprocedure": "StringType",
+        "regoper": "StringType",
+        "regoperator": "StringType",
+        "regclass": "StringType",
+        "regtype": "StringType",
+        "regrole": "StringType",
+        "regnamespace": "StringType",
+        "regconfig": "StringType",
+        "regdictionary": "StringType",
+
+        # Pseudo types
+        "void": "NullType",
+
+        # ENUM types map to StringType
+        # Array types map to ArrayType
+        # Composite types map to StringType
+    }
+
+    SPARK_TO_CLICKHOUSE = {
+        "IntegerType": "Int32",
+        "LongType": "Int64",
+        "ShortType": "Int16",
+        "DecimalType": "Decimal",
+        "FloatType": "Float32",
+        "DoubleType": "Float64",
+        "StringType": "String",
+        "TimestampType": "DateTime",
+        "DateType": "Date",
+        "BooleanType": "UInt8",
+    }
+
+
+    @classmethod
+    def postgres_to_spark(cls, pg_type: str) -> str:
+        base_type, params = cls._parse_pg_type(pg_type)
+
+        # float(p) - depends on precision
+        if base_type == "float":
+            if params and len(params) > 0:
+                p = int(params[0])
+                if 1 <= p <= 24:
+                    return "FloatType"
+                elif 25 <= p <= 53:
+                    return "DoubleType"
+            return "DoubleType"
+
+        # bit types
+        if base_type == "bit":
+            if params and len(params) > 0:
+                n = int(params[0])
+                if n == 1:
+                    return "BooleanType"
+                else:
+                    return "BinaryType"
+            else:
+                # bare "bit" defaults to bit(1)
+                return "BooleanType"
+
+        if base_type == "bit varying" or base_type == "varbit":
+            return "BinaryType"
+
+        if base_type in ("character varying", "varchar") and params:
+            return f"VarcharType({params[0]})"
+
+        if base_type in ("character", "char") and params:
+            return f"CharType({params[0]})"
+
+        # numeric(p, s) / decimal(p, s)
+        if base_type in ("numeric", "decimal"):
+            if params and len(params) >= 1:
+                p = int(params[0])
+                s = int(params[1]) if len(params) > 1 else 0
+
+                # PostgreSQL 15+ allows a negative scale
+                if s < 0:
+                    adjusted_p = min(p - s, 38)
+                    return f"DecimalType({adjusted_p}, 0)"
+
+                # Spark caps Decimal precision at 38, so clamp p
+                if p > 38:
+                    # TODO: if any value in this column has an actual precision greater than 38,
+                    # the transfer will fail with a NUMERIC_VALUE_OUT_OF_RANGE.WITHOUT_SUGGESTION error.
+                    return f"DecimalType(38, {min(s, 38)})"
+
+                return f"DecimalType({p}, {s})"
+            else:
+                return "DecimalType(38, 0)"
+
+        if base_type == "oid":
+            return "DecimalType(20, 0)"
+
+        if base_type.endswith("[]"):
+            element_type = base_type[:-2]
+            element_spark_type = cls.postgres_to_spark(element_type)
+            return f"ArrayType({element_spark_type})"
+
+        # Everything else: direct lookup in the mapping table
+        spark_type = cls.POSTGRES_TO_SPARK.get(base_type.lower())
+
+        if spark_type:
+            return spark_type
+
+        # TODO: raise an exception for unknown PostgreSQL types; for now fall back to StringType
+        return "StringType"
+
+    @staticmethod
+    def _parse_pg_type(pg_type: str) -> tuple:
+        """
+        Args:
+            pg_type: For example "varchar(100)", "numeric(10,2)", "bit(5)"
+        Returns:
+            (base_type, [params]) - For example ("varchar", ["100"]), ("numeric", ["10", "2"])
+        """
+        pg_type = pg_type.strip().lower()
+
+        if "(" in pg_type and ")" in pg_type:
+            base_type = pg_type[:pg_type.index("(")].strip()
+            params_str = pg_type[pg_type.index("(") + 1:pg_type.rindex(")")].strip()
+            params = [p.strip() for p in params_str.split(",")]
+            return base_type, params
+        else:
+            return pg_type, []
+
+    @classmethod
+    def spark_to_clickhouse(cls, spark_type: str) -> str:
+        return cls.SPARK_TO_CLICKHOUSE.get(spark_type, "String")
+
+    @classmethod
+    def validate_mapping(cls, source_type: str, target_type: str) -> bool:
+        return True
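
Tracing a few representative inputs through postgres_to_spark as written shows how the parameterized branches, the precision clamp, and the array recursion behave:

    from data_transfer_lib.schema.mapper import TypeMapper

    assert TypeMapper.postgres_to_spark("numeric(10,2)") == "DecimalType(10, 2)"
    assert TypeMapper.postgres_to_spark("numeric(50,4)") == "DecimalType(38, 4)"  # precision clamped to 38
    assert TypeMapper.postgres_to_spark("varchar(100)") == "VarcharType(100)"
    assert TypeMapper.postgres_to_spark("float(10)") == "FloatType"  # 1 <= p <= 24
    assert TypeMapper.postgres_to_spark("integer[]") == "ArrayType(IntegerType)"
    assert TypeMapper.postgres_to_spark("no_such_type") == "StringType"  # silent fallback, see the TODO above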

data_transfer_lib/schema/validator.py
@@ -0,0 +1,107 @@
+from typing import Dict, Any
+from pyspark.sql import DataFrame
+from data_transfer_lib.utils.exceptions import SchemaValidationException
+from data_transfer_lib.schema.mapper import TypeMapper
+
+
+class SchemaValidator:
+
+    @staticmethod
+    def validate_source_to_spark(source_schema: Dict[str, Any]) -> bool:
+
+        unsupported_types: list[str] = []
+
+        for column, pg_type in source_schema.items():
+            base_type = pg_type.split('(')[0].strip().lower()
+            spark_type = TypeMapper.postgres_to_spark(base_type)
+
+            # TODO: make this behavior configurable with a flag
+            if spark_type == "StringType" and base_type not in TypeMapper.POSTGRES_TO_SPARK:
+                unsupported_types.append(f"{column}: {pg_type}")
+
+        if unsupported_types:
+            error_msg = f"Error:\n Unsupported types detected: {', '.join(unsupported_types)}"
+            raise SchemaValidationException(error_msg)
+
+        return True
+
+    @staticmethod
+    def validate_spark_to_target(df_schema: Dict[str, Any], target_schema: Dict[str, Any]) -> bool:
+
+        errors: list[str] = []
+        warnings: list[str] = []
+
+        # Check for missing columns
+        df_columns = set(df_schema.keys())
+        target_columns = set(target_schema.keys())
+        missing_columns = df_columns - target_columns
+        if missing_columns:
+            errors.append(f"Missing columns: {missing_columns}")
+
+        # Check type compatibility for common columns
+        for column_name in df_columns & target_columns:
+            spark_type = df_schema[column_name]
+            target_type = target_schema[column_name]
+
+            is_compatible, message = SchemaValidator._check_type_compatibility(
+                column_name, spark_type, target_type
+            )
+
+            if not is_compatible:
+                errors.append(message)
+            elif message:
+                warnings.append(message)
+
+
+        for warning in warnings:
+            print(f"Warning: {warning}")
+
+        if errors:
+            error_msg = "Incompatible schemas:\n" + "\n".join(errors)
+            print(f"Error: {error_msg}")
+            raise SchemaValidationException(error_msg)
+
+        print("Validation succeeded")
+        return True
+
+    @staticmethod
+    def compare_schemas(source_schema: Dict[str, Any], target_schema: Dict[str, Any]) -> Dict[str, Any]:
+        return {}
+
+    # TODO: this is simplistic type checking and needs to be customized
+    @staticmethod
+    def _check_type_compatibility(column: str, spark_type: str, target_type: str) -> tuple[bool, str]:
+
+        # Check for numeric types compatibility
+        spark_numeric_types = {
+            "IntegerType", "LongType", "ShortType",
+            "DecimalType", "FloatType", "DoubleType"
+        }
+        is_spark_type_numeric = any(t in spark_type for t in spark_numeric_types)
+
+        target_numeric_types = {
+            "Int8", "Int16", "Int32", "Int64",
+            "UInt8", "UInt16", "UInt32", "UInt64",
+            "Float32", "Float64"
+        }
+        is_target_type_numeric = any(t in target_type for t in target_numeric_types)
+
+        if is_spark_type_numeric and is_target_type_numeric:
+            return (True, "")
+
+        # Check for string types compatibility
+        if "String" in spark_type and "String" in target_type:
+            return (True, "")
+
+        # Check for date types compatibility
+        if "Timestamp" in spark_type and "DateTime" in target_type:
+            return (True, "")
+        if "Date" in spark_type and "Date" in target_type:
+            return (True, "")
+
+        # Check for decimal types compatibility
+        if "Decimal" in spark_type and "Decimal" in target_type:
+            return (True, "")
+
+        # Types are not compatible
+        return (False, f"Column '{column}': incompatible types ({spark_type} -> {target_type})")
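
Because of that StringType fallback in the mapper, validate_source_to_spark flags a column only when its base type maps to StringType without appearing in POSTGRES_TO_SPARK, which is how unknown source types are caught:

    from data_transfer_lib.schema.validator import SchemaValidator

    # Passes: every base type has an explicit mapping.
    SchemaValidator.validate_source_to_spark({"id": "integer", "name": "varchar(50)"})

    # Raises SchemaValidationException: "geometry" (e.g. a PostGIS column) is not in
    # POSTGRES_TO_SPARK, so its StringType fallback marks it as unsupported.
    SchemaValidator.validate_source_to_spark({"geom": "geometry"})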

data_transfer_lib/utils/__init__.py
@@ -0,0 +1,13 @@
+from data_transfer_lib.utils.exceptions import (
+    DataTransferException,
+    ConnectionException,
+    SchemaValidationException,
+    TypeMappingException
+)
+
+__all__ = [
+    "DataTransferException",
+    "ConnectionException",
+    "SchemaValidationException",
+    "TypeMappingException"
+]

data_transfer_lib/writer/base.py
@@ -0,0 +1,27 @@
+from abc import ABC, abstractmethod
+from pyspark.sql import DataFrame
+from data_transfer_lib.connections.base import BaseConnection
+
+
+class BaseWriter(ABC):
+    def __init__(
+        self,
+        connection: BaseConnection,
+        db_name: str,
+        table_name: str,
+        if_exists: bool = True,
+    ):
+        self.connection = connection
+        self.db_name = db_name
+        self.table_name = table_name
+        self.if_exists = if_exists
+
+        self._prepare()
+
+    @abstractmethod
+    def _prepare(self) -> None:
+        pass
+
+    @abstractmethod
+    def start(self, df: DataFrame, **params) -> None:
+        pass

data_transfer_lib/writer/writer.py
@@ -0,0 +1,73 @@
+from pyspark.sql import DataFrame
+from data_transfer_lib.writer.base import BaseWriter
+from data_transfer_lib.connections.base import BaseConnection
+from data_transfer_lib.schema.validator import SchemaValidator
+from data_transfer_lib.utils.exceptions import DataTransferException, ConnectionException, SchemaValidationException
+
+
+class Writer(BaseWriter):
+    def __init__(
+        self,
+        connection: BaseConnection,
+        db_name: str,
+        table_name: str,
+        if_exists: bool = True,
+    ):
+        self.target_schema = None
+        super().__init__(
+            connection=connection,
+            db_name=db_name,
+            table_name=table_name,
+            if_exists=if_exists,
+        )
+
+    def _prepare(self) -> None:
+
+        if self.if_exists:
+            self.target_schema = self.connection.get_table_schema(
+                self.db_name,
+                self.table_name
+            )
+            print(self.target_schema)
+        else:
+            error_msg = f"{self.db_name}.{self.table_name} table doesn't exist"
+            raise DataTransferException(error_msg)
+
+    def start(self, df: DataFrame, **params) -> None:
+
+        df_schema = {}
+        for field in df.schema.fields:
+            df_schema[field.name] = str(field.dataType)
+
+        try:
+            SchemaValidator.validate_spark_to_target(
+                df_schema,
+                self.target_schema
+            )
+        except SchemaValidationException as e:
+            print(f"Error: {e}")
+            raise
+
+        try:
+            if num_partitions := params.get("num_partitions", None):
+                df = df.repartition(num_partitions)
+
+            full_table_name = f"{self.db_name}.{self.table_name}"
+
+            writer = (
+                df.write
+                .format("jdbc")
+                .option("url", self.connection.get_jdbc_url())
+                .option("dbtable", full_table_name)
+                .option("user", self.connection.user)
+                .option("password", self.connection.password)
+                .option("driver", self.connection.get_connection_properties()["driver"])
+                .option("batchsize", params.get("batch_size", 10000))
+                .mode(params.get("mode", "append"))
+            )
+
+            writer.save()
+
+        except Exception as e:
+            raise ConnectionException(f"Couldn't write into table: {e}")
+

data_transfer_lib-0.1.1.dist-info/METADATA
@@ -0,0 +1,30 @@
+Metadata-Version: 2.4
+Name: data-transfer-lib
+Version: 0.1.1
+Summary: Library for data transfer between databases using PySpark
+Home-page: https://github.com/llirkh/data_transfer_lib
+Author: llirikh
+Author-email: zhukov.kg@phystech.edu
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Topic :: Database
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+Requires-Dist: pyspark>=3.3.0
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: description
+Dynamic: description-content-type
+Dynamic: home-page
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
+
+# Project description
+
+Project description will be here

data_transfer_lib-0.1.1.dist-info/RECORD
@@ -0,0 +1,20 @@
+data_transfer_lib/__init__.py,sha256=n1XAEEQnXfoHFgAFXO_0dI7x0103G_ofnJhjrBqW4cM,431
+data_transfer_lib/connections/__init__.py,sha256=95_Il7sIDU7UIoRJ5mDLdjK1nyBkO-zva_Bfvj7AyPI,161
+data_transfer_lib/connections/base.py,sha256=j041jyzNv45WKZk5nFrJBeuJXo8aCZVwD3loZTq61XM,1016
+data_transfer_lib/connections/clickhouse.py,sha256=zL_KRUODF3FIjaZzxADWUXD_KSaxmAC41TicS_P6jAc,1941
+data_transfer_lib/connections/postgres.py,sha256=sPJM-Y_ujo5KJTDOJ6Y9ysK-J0p8CDU6k198-n8Fkxw,2570
+data_transfer_lib/reader/__init__.py,sha256=SA4HFp20RKcFy1idtPOE0a86V2wgcIP7ljzRYfnfVUc,72
+data_transfer_lib/reader/base.py,sha256=VNU4-4MoDvVui10fU6IJMI6xHl0YxXLuz5gHua82kKA,582
+data_transfer_lib/reader/reader.py,sha256=5s3PvUDakTZ6lTiUhEo8hp743ccUE3ovDHMeZj0znSo,1327
+data_transfer_lib/schema/__init__.py,sha256=kxdcCDDtDDB1YpymEzle4Yl2w3nVa6wLW7nAYGVYcpI,162
+data_transfer_lib/schema/mapper.py,sha256=lIXsa7vMP4cG5spruZNX_8sDh95h0ZO59w1j1fo0ne0,7549
+data_transfer_lib/schema/validator.py,sha256=jG4mVRHLaWEg4Ru2Gd_cT65lCRkmiRxF6OpwfF4qZek,3994
+data_transfer_lib/utils/__init__.py,sha256=B8zhVMM1pRM4oRJ1XoMgmE8I8K-0hmP0yu3ONQV1bvQ,289
+data_transfer_lib/utils/exceptions.py,sha256=zJ91uknxGHaxEMZPgjbiPoAw8Nhra3Y-r24iTe8S1J8,238
+data_transfer_lib/writer/__init__.py,sha256=OqrRHW216VAOtFzzOz7jL-H7kxYlfj--8IQs9rOyNgo,72
+data_transfer_lib/writer/base.py,sha256=0qCjWqNWluTCOWJM4xfwRy9TeFsq_LlQLjtkoeOC0Ok,645
+data_transfer_lib/writer/writer.py,sha256=o1fM2dJFExqrYI21hJUiyrHZD2ZOmhFc3kY0Q3gfh1E,2348
+data_transfer_lib-0.1.1.dist-info/METADATA,sha256=BP95JdnRPpKtFbuAg8KD6rQmKp5WLB8UFeA6KF-oYiA,900
+data_transfer_lib-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+data_transfer_lib-0.1.1.dist-info/top_level.txt,sha256=Yz0tNpzxGcQmSb1vBGHge5Ab5IoaBa_5pzSMlyxusII,18
+data_transfer_lib-0.1.1.dist-info/RECORD,,

data_transfer_lib-0.1.1.dist-info/top_level.txt
@@ -0,0 +1 @@
+data_transfer_lib