data-transfer-lib 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
+++ data_transfer_lib/__init__.py
@@ -0,0 +1,11 @@
+ """
+ Data Transfer Library for transferring data between databases using PySpark
+ """
+
+ from data_transfer_lib.connections.postgres import Postgres
+ from data_transfer_lib.connections.clickhouse import ClickHouse
+ from data_transfer_lib.reader.reader import Reader
+ from data_transfer_lib.writer.writer import Writer
+
+ __version__ = "0.1.1"
+ __all__ = ["Postgres", "ClickHouse", "Reader", "Writer"]
+++ data_transfer_lib/connections/__init__.py
@@ -0,0 +1,4 @@
+ from data_transfer_lib.connections.postgres import Postgres
+ from data_transfer_lib.connections.clickhouse import ClickHouse
+
+ __all__ = ["Postgres", "ClickHouse"]
+++ data_transfer_lib/connections/base.py
@@ -0,0 +1,39 @@
+ from abc import ABC, abstractmethod
+ from typing import Optional, Dict, Any
+ from pyspark.sql import SparkSession
+
+ class BaseConnection(ABC):
+     def __init__(
+         self,
+         host: str,
+         port: int,
+         user: str,
+         password: str,
+         database: Optional[str] = None,
+         spark: Optional[SparkSession] = None,
+     ):
+         self.host = host
+         self.port = port
+         self.user = user
+         self.password = password
+         self.database = database
+         self.spark = spark or self._get_or_create_spark()
+
+     @abstractmethod
+     def get_jdbc_url(self) -> str:
+         pass
+
+     @abstractmethod
+     def get_connection_properties(self) -> Dict[str, str]:
+         pass
+
+     @abstractmethod
+     def test_connection(self) -> bool:
+         pass
+
+     @abstractmethod
+     def get_table_schema(self, db_name: str, table_name: str) -> Dict[str, Any]:
+         pass
+
+     def _get_or_create_spark(self) -> SparkSession:
+         return SparkSession.builder.getOrCreate()
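
BaseConnection fixes the contract every backend implements: a JDBC URL, connection properties, a connectivity check, and schema introspection. As an illustration only (no such class ships with the package), a new backend would be added like this; the MySQL JDBC driver class and URL format are standard, everything else is a hypothetical sketch.

    from typing import Any, Dict
    from data_transfer_lib.connections.base import BaseConnection

    class MySQL(BaseConnection):  # hypothetical example backend
        def get_jdbc_url(self) -> str:
            return f"jdbc:mysql://{self.host}:{self.port}/{self.database}"

        def get_connection_properties(self) -> Dict[str, str]:
            return {
                "user": self.user,
                "password": self.password,
                "driver": "com.mysql.cj.jdbc.Driver",
            }

        def test_connection(self) -> bool:
            return True  # stubbed, mirroring the bundled backends

        def get_table_schema(self, db_name: str, table_name: str) -> Dict[str, Any]:
            raise NotImplementedError  # would query information_schema.columns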
+++ data_transfer_lib/connections/clickhouse.py
@@ -0,0 +1,67 @@
+ from typing import Dict, Any, Optional
+ from pyspark.sql import SparkSession
+ from data_transfer_lib.connections.base import BaseConnection
+
+
+ class ClickHouse(BaseConnection):
+     def __init__(
+         self,
+         host: str,
+         user: str,
+         password: str,
+         database: str = "default",
+         port: int = 8123,
+         spark: Optional[SparkSession] = None
+     ):
+         super().__init__(
+             host=host,
+             port=port,
+             user=user,
+             password=password,
+             database=database,
+             spark=spark,
+         )
+
+     def get_jdbc_url(self) -> str:
+         return f"jdbc:clickhouse://{self.host}:{self.port}/{self.database}"
+
+     def get_connection_properties(self) -> Dict[str, str]:
+         return {
+             "user": self.user,
+             "password": self.password,
+             "driver": "com.clickhouse.jdbc.ClickHouseDriver"
+         }
+
+     def test_connection(self) -> bool:
+         return True
+
+     def get_table_schema(self, db_name: str, table_name: str) -> Dict[str, Any]:
+         schema_query = f"""
+             SELECT
+                 name,
+                 type
+             FROM system.columns
+             WHERE database = '{db_name}'
+               AND table = '{table_name}'
+             ORDER BY position
+         """
+
+         # TODO: try -> except
+         schema_df = (
+             self.spark.read
+             .format("jdbc")
+             .option("url", self.get_jdbc_url())
+             .option("query", schema_query)
+             .option("user", self.user)
+             .option("password", self.password)
+             .option("driver", "com.clickhouse.jdbc.ClickHouseDriver")
+             .load()
+         )
+
+         schema_dict: Dict[str, str] = {}
+         for row in schema_df.collect():
+             column_name = row['name']
+             column_type = row['type']
+             schema_dict[column_name] = column_type
+
+         return schema_dict
+++ data_transfer_lib/connections/postgres.py
@@ -0,0 +1,82 @@
+ from typing import Dict, Any, Optional
+ from pyspark.sql import SparkSession
+ from data_transfer_lib.connections.base import BaseConnection
+
+
+ class Postgres(BaseConnection):
+     def __init__(
+         self,
+         host: str,
+         user: str,
+         password: str,
+         database: str = "postgres",
+         port: int = 5432,
+         spark: Optional[SparkSession] = None,
+     ):
+         super().__init__(
+             host=host,
+             port=port,
+             user=user,
+             password=password,
+             database=database,
+             spark=spark,
+         )
+
+     def get_jdbc_url(self) -> str:
+         return f"jdbc:postgresql://{self.host}:{self.port}/{self.database}"
+
+     def get_connection_properties(self) -> Dict[str, str]:
+         return {
+             "user": self.user,
+             "password": self.password,
+             "driver": "org.postgresql.Driver"
+         }
+
+     def test_connection(self) -> bool:
+         print("Testing PostgreSQL connection")
+         return True
+
+     def get_table_schema(self, db_name: str, table_name: str) -> Dict[str, Any]:
+
+         schema_query = f"""
+             SELECT
+                 column_name,
+                 data_type,
+                 character_maximum_length,
+                 numeric_precision,
+                 numeric_scale
+             FROM information_schema.columns
+             WHERE table_schema = '{db_name}'
+               AND table_name = '{table_name}'
+             ORDER BY ordinal_position
+         """
+
+         # TODO: try -> except
+
+         schema_df = (
+             self.spark.read
+             .format("jdbc")
+             .option("url", self.get_jdbc_url())
+             .option("query", schema_query)
+             .option("user", self.user)
+             .option("password", self.password)
+             .option("driver", "org.postgresql.Driver")
+             .load()
+         )
+
+         schema_dict = {}
+         for row in schema_df.collect():
+             column_name = row['column_name']
+             data_type = row['data_type']
+
+             # Add precision and scale
+             if data_type in ('numeric', 'decimal') and row['numeric_precision']:
+                 data_type = f"{data_type}({row['numeric_precision']},{row['numeric_scale']})"
+
+             # Add max_length
+             elif data_type in ('character varying', 'varchar') and row['character_maximum_length']:
+                 data_type = f"varchar({row['character_maximum_length']})"
+
+             schema_dict[column_name] = data_type
+
+         return schema_dict
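
The dict returned by Postgres.get_table_schema keys column names to PostgreSQL type strings, with precision/scale and varchar lengths folded in. A sketch of the expected shape, assuming a hypothetical products table with serial, varchar(100), and numeric(10,2) columns:

    pg = Postgres(host="pg.example.com", user="etl", password="secret", database="shop")
    schema = pg.get_table_schema("public", "products")
    # {'id': 'integer', 'name': 'varchar(100)', 'price': 'numeric(10,2)'}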
+++ data_transfer_lib/reader/__init__.py
@@ -0,0 +1,3 @@
+ from data_transfer_lib.reader.reader import Reader
+
+ __all__ = ["Reader"]
+++ data_transfer_lib/reader/base.py
@@ -0,0 +1,25 @@
+ from abc import ABC, abstractmethod
+ from typing import Optional, Any
+ from pyspark.sql import DataFrame
+ from data_transfer_lib.connections.base import BaseConnection
+
+
+ class BaseReader(ABC):
+     def __init__(
+         self,
+         connection: BaseConnection,
+         db_name: str,
+         table_name: str,
+     ):
+         self.connection = connection
+         self.db_name = db_name
+         self.table_name = table_name
+         self._prepare()
+
+     @abstractmethod
+     def _prepare(self) -> None:
+         pass
+
+     @abstractmethod
+     def start(self) -> DataFrame:
+         pass
+++ data_transfer_lib/reader/reader.py
@@ -0,0 +1,43 @@
+ from pyspark.sql import DataFrame
+ from data_transfer_lib.reader.base import BaseReader
+ from data_transfer_lib.connections.base import BaseConnection
+ from data_transfer_lib.schema.validator import SchemaValidator
+
+ class Reader(BaseReader):
+     def __init__(
+         self,
+         connection: BaseConnection,
+         db_name: str,
+         table_name: str,
+     ):
+         self.source_schema = None
+         super().__init__(
+             connection=connection,
+             db_name=db_name,
+             table_name=table_name,
+         )
+
+     def _prepare(self) -> None:
+         self.source_schema = self.connection.get_table_schema(
+             self.db_name,
+             self.table_name
+         )
+
+         SchemaValidator.validate_source_to_spark(self.source_schema)
+
+     def start(self) -> DataFrame:
+
+         table_name = f"{self.db_name}.{self.table_name}"
+
+         reader = (
+             self.connection
+             .spark.read
+             .format("jdbc")
+             .option("url", self.connection.get_jdbc_url())
+             .option("dbtable", table_name)
+             .option("user", self.connection.user)
+             .option("password", self.connection.password)
+             .option("driver", self.connection.get_connection_properties()["driver"])
+         )
+         df = reader.load()
+         return df
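
Note that BaseReader calls _prepare() from its constructor, so schema introspection and validation happen when the Reader is built, not when start() is called. A sketch of the failure path, assuming a hypothetical table with a column type missing from TypeMapper's map:

    from data_transfer_lib import Postgres, Reader
    from data_transfer_lib.utils.exceptions import SchemaValidationException

    source = Postgres(host="pg.example.com", user="etl", password="secret")
    try:
        reader = Reader(connection=source, db_name="public", table_name="events")
    except SchemaValidationException as e:
        print(f"Cannot read table: {e}")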
+++ data_transfer_lib/schema/__init__.py
@@ -0,0 +1,4 @@
+ from data_transfer_lib.schema.mapper import TypeMapper
+ from data_transfer_lib.schema.validator import SchemaValidator
+
+ __all__ = ["TypeMapper", "SchemaValidator"]
+++ data_transfer_lib/schema/mapper.py
@@ -0,0 +1,241 @@
+ from typing import Dict, Any
+ from pyspark.sql.types import DataType
+
+
+ class TypeMapper:
+     POSTGRES_TO_SPARK = {
+         # Boolean
+         "boolean": "BooleanType",
+         "bool": "BooleanType",
+
+         # Integer types
+         "smallint": "ShortType",
+         "int2": "ShortType",
+         "smallserial": "ShortType",
+         "serial2": "ShortType",
+
+         "integer": "IntegerType",
+         "int": "IntegerType",
+         "int4": "IntegerType",
+         "serial": "IntegerType",
+         "serial4": "IntegerType",
+
+         "bigint": "LongType",
+         "int8": "LongType",
+         "bigserial": "LongType",
+         "serial8": "LongType",
+
+         # Floating point
+         "real": "FloatType",
+         "float4": "FloatType",
+         "double precision": "DoubleType",
+         "float8": "DoubleType",
+         # float and float(p) are handled separately
+
+         # Numeric/Decimal
+         "numeric": "DecimalType",
+         "decimal": "DecimalType",
+
+         # String types
+         "character varying": "StringType",  # varchar
+         "varchar": "StringType",
+         "character": "StringType",  # char
+         "char": "StringType",
+         "bpchar": "StringType",
+         "text": "StringType",
+
+         # Binary
+         "bytea": "BinaryType",
+
+         # Date/Time
+         "date": "DateType",
+         "timestamp": "TimestampType",
+         "timestamp without time zone": "TimestampType",
+         "timestamp with time zone": "TimestampType",
+         "timestamptz": "TimestampType",
+         "time": "TimestampType",
+         "time without time zone": "TimestampType",
+         "time with time zone": "TimestampType",
+         "timetz": "TimestampType",
+
+         # Interval
+         "interval": "StringType",
+
+         # Monetary
+         "money": "StringType",
+
+         # Network Address Types
+         "inet": "StringType",
+         "cidr": "StringType",
+         "macaddr": "StringType",
+         "macaddr8": "StringType",
+
+         # Geometric Types
+         "point": "StringType",
+         "line": "StringType",
+         "lseg": "StringType",
+         "box": "StringType",
+         "path": "StringType",
+         "polygon": "StringType",
+         "circle": "StringType",
+
+         # Log Sequence Number
+         "pg_lsn": "StringType",
+
+         # Bit types handled separately (bit(1) -> BooleanType, bit(>1) -> BinaryType)
+
+         # Text Search Types
+         "tsvector": "StringType",
+         "tsquery": "StringType",
+
+         # UUID
+         "uuid": "StringType",
+
+         # XML
+         "xml": "StringType",
+
+         # JSON
+         "json": "StringType",
+         "jsonb": "StringType",
+
+         # Range Types
+         "int4range": "StringType",
+         "int8range": "StringType",
+         "numrange": "StringType",
+         "tsrange": "StringType",
+         "tstzrange": "StringType",
+         "daterange": "StringType",
+
+         # Object Identifier Types
+         "oid": "DecimalType",
+         "regproc": "StringType",
+         "regprocedure": "StringType",
+         "regoper": "StringType",
+         "regoperator": "StringType",
+         "regclass": "StringType",
+         "regtype": "StringType",
+         "regrole": "StringType",
+         "regnamespace": "StringType",
+         "regconfig": "StringType",
+         "regdictionary": "StringType",
+
+         # Pseudo types
+         "void": "NullType",
+
+         # ENUM types map to StringType
+         # Array types map to ArrayType
+         # Composite types map to StringType
+     }
+
+     SPARK_TO_CLICKHOUSE = {
+         "IntegerType": "Int32",
+         "LongType": "Int64",
+         "ShortType": "Int16",
+         "DecimalType": "Decimal",
+         "FloatType": "Float32",
+         "DoubleType": "Float64",
+         "StringType": "String",
+         "TimestampType": "DateTime",
+         "DateType": "Date",
+         "BooleanType": "UInt8",
+     }
+
+
+     @classmethod
+     def postgres_to_spark(cls, pg_type: str) -> str:
+         base_type, params = cls._parse_pg_type(pg_type)
+
+         # float(p) - depends on precision
+         if base_type == "float":
+             if params and len(params) > 0:
+                 p = int(params[0])
+                 if 1 <= p <= 24:
+                     return "FloatType"
+                 elif 25 <= p <= 53:
+                     return "DoubleType"
+             return "DoubleType"
+
+         # bit types
+         if base_type == "bit":
+             if params and len(params) > 0:
+                 n = int(params[0])
+                 if n == 1:
+                     return "BooleanType"
+                 else:
+                     return "BinaryType"
+             else:
+                 # bare "bit" defaults to bit(1)
+                 return "BooleanType"
+
+         if base_type == "bit varying" or base_type == "varbit":
+             return "BinaryType"
+
+         if base_type in ("character varying", "varchar") and params:
+             return f"VarcharType({params[0]})"
+
+         if base_type in ("character", "char") and params:
+             return f"CharType({params[0]})"
+
+         # numeric(p, s) / decimal(p, s)
+         if base_type in ("numeric", "decimal"):
+             if params and len(params) >= 1:
+                 p = int(params[0])
+                 s = int(params[1]) if len(params) > 1 else 0
+
+                 # PostgreSQL 15+ allows a negative scale
+                 if s < 0:
+                     adjusted_p = min(p - s, 38)
+                     return f"DecimalType({adjusted_p}, 0)"
+
+                 # If p > 38, cap at Spark's maximum decimal precision
+                 if p > 38:
+                     # TODO: if any value in this column has an actual precision greater than 38,
+                     # the write will fail with a NUMERIC_VALUE_OUT_OF_RANGE.WITHOUT_SUGGESTION error.
+                     return f"DecimalType(38, {min(s, 38)})"
+
+                 return f"DecimalType({p}, {s})"
+             else:
+                 return "DecimalType(38, 0)"
+
+         if base_type == "oid":
+             return "DecimalType(20, 0)"
+
+         if base_type.endswith("[]"):
+             element_type = base_type[:-2]
+             element_spark_type = cls.postgres_to_spark(element_type)
+             return f"ArrayType({element_spark_type})"
+
+         # Plain dictionary lookup for the remaining types
+         spark_type = cls.POSTGRES_TO_SPARK.get(base_type.lower())
+
+         if spark_type:
+             return spark_type
+
+         # TODO: raise an exception for unknown PostgreSQL types; for now fall back to StringType
+         return "StringType"
+
+     @staticmethod
+     def _parse_pg_type(pg_type: str) -> tuple:
+         """
+         Args:
+             pg_type: For example "varchar(100)", "numeric(10,2)", "bit(5)"
+         Returns:
+             (base_type, [params]) - For example ("varchar", ["100"]), ("numeric", ["10", "2"])
+         """
+         pg_type = pg_type.strip().lower()
+
+         if "(" in pg_type and ")" in pg_type:
+             base_type = pg_type[:pg_type.index("(")].strip()
+             params_str = pg_type[pg_type.index("(") + 1:pg_type.rindex(")")].strip()
+             params = [p.strip() for p in params_str.split(",")]
+             return base_type, params
+         else:
+             return pg_type, []
+
+     @classmethod
+     def spark_to_clickhouse(cls, spark_type: str) -> str:
+         return cls.SPARK_TO_CLICKHOUSE.get(spark_type, "String")
+
+     @classmethod
+     def validate_mapping(cls, source_type: str, target_type: str) -> bool:
+         return True
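
A few concrete mappings, as the rules in postgres_to_spark resolve them:

    from data_transfer_lib.schema.mapper import TypeMapper

    TypeMapper.postgres_to_spark("numeric(10,2)")    # 'DecimalType(10, 2)'
    TypeMapper.postgres_to_spark("numeric(50,4)")    # 'DecimalType(38, 4)' (capped at Spark's max precision)
    TypeMapper.postgres_to_spark("varchar(100)")     # 'VarcharType(100)'
    TypeMapper.postgres_to_spark("float(10)")        # 'FloatType'
    TypeMapper.postgres_to_spark("bit(1)")           # 'BooleanType'
    TypeMapper.spark_to_clickhouse("TimestampType")  # 'DateTime'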
+++ data_transfer_lib/schema/validator.py
@@ -0,0 +1,107 @@
+ from typing import Dict, Any, Tuple
+ from pyspark.sql import DataFrame
+ from data_transfer_lib.utils.exceptions import SchemaValidationException
+ from data_transfer_lib.schema.mapper import TypeMapper
+
+
+ class SchemaValidator:
+
+     @staticmethod
+     def validate_source_to_spark(source_schema: Dict[str, Any]) -> bool:
+
+         unsupported_types: list[str] = []
+
+         for column, pg_type in source_schema.items():
+             base_type = pg_type.split('(')[0].strip().lower()
+             spark_type = TypeMapper.postgres_to_spark(base_type)
+
+             # TODO: make this configurable via a flag
+             if spark_type == "StringType" and base_type not in TypeMapper.POSTGRES_TO_SPARK:
+                 unsupported_types.append(f"{column}: {pg_type}")
+
+         if unsupported_types:
+             error_msg = f"Error:\n Unsupported types detected: {', '.join(unsupported_types)}"
+             raise SchemaValidationException(error_msg)
+
+         return True
+
+     @staticmethod
+     def validate_spark_to_target(df_schema: Dict[str, Any], target_schema: Dict[str, Any]) -> bool:
+
+         errors: list[str] = []
+         warnings: list[str] = []
+
+         # Check for missing columns
+         df_columns = set(df_schema.keys())
+         target_columns = set(target_schema.keys())
+         missing_columns = df_columns - target_columns
+         if missing_columns:
+             errors.append(f"Missing columns: {missing_columns}")
+
+         # Check type compatibility for common columns
+         for column_name in df_columns & target_columns:
+             spark_type = df_schema[column_name]
+             target_type = target_schema[column_name]
+
+             is_compatible, message = SchemaValidator._check_type_compatibility(
+                 column_name, spark_type, target_type
+             )
+
+             if not is_compatible:
+                 errors.append(message)
+             elif message:
+                 warnings.append(message)
+
+
+         for warning in warnings:
+             print(f"Warning: {warning}")
+
+         if errors:
+             error_msg = "Incompatible schemas:\n" + "\n".join(errors)
+             print(f"Error: {error_msg}")
+             raise SchemaValidationException(error_msg)
+
+         print("Validation succeeded")
+         return True
+
+     @staticmethod
+     def compare_schemas(source_schema: Dict[str, Any], target_schema: Dict[str, Any]) -> Dict[str, Any]:
+         return {}
+
+     # TODO: this is a simple type check; it needs to be customized
+     @staticmethod
+     def _check_type_compatibility(column: str, spark_type: str, target_type: str) -> Tuple[bool, str]:
+
+         # Check for numeric types compatibility
+         spark_numeric_types = {
+             "IntegerType", "LongType", "ShortType",
+             "DecimalType", "FloatType", "DoubleType"
+         }
+         is_spark_type_numeric = any(t in spark_type for t in spark_numeric_types)
+
+         target_numeric_types = {
+             "Int8", "Int16", "Int32", "Int64",
+             "UInt8", "UInt16", "UInt32", "UInt64",
+             "Float32", "Float64"
+         }
+         is_target_type_numeric = any(t in target_type for t in target_numeric_types)
+
+         if is_spark_type_numeric and is_target_type_numeric:
+             return (True, "")
+
+         # Check for string types compatibility
+         if "String" in spark_type and "String" in target_type:
+             return (True, "")
+
+         # Check for date types compatibility
+         if "Timestamp" in spark_type and "DateTime" in target_type:
+             return (True, "")
+         if "Date" in spark_type and "Date" in target_type:
+             return (True, "")
+
+         # Check for decimal types compatibility
+         if "Decimal" in spark_type and "Decimal" in target_type:
+             return (True, "")
+
+         # Types are not compatible
+         return (False, f"Column '{column}': incompatible types ({spark_type} -> {target_type})")
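
A minimal sketch of how validate_spark_to_target judges compatibility, using hand-written schema dicts (Spark type names on the left, ClickHouse type names on the right):

    from data_transfer_lib.schema.validator import SchemaValidator

    df_schema     = {"id": "LongType", "name": "StringType", "ts": "TimestampType"}
    target_schema = {"id": "Int64",    "name": "String",     "ts": "DateTime"}

    SchemaValidator.validate_spark_to_target(df_schema, target_schema)  # returns True

    # A DataFrame column absent from the target schema, or a pairing like
    # "StringType" -> "Int32", would raise SchemaValidationException instead.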
+++ data_transfer_lib/utils/__init__.py
@@ -0,0 +1,13 @@
+ from data_transfer_lib.utils.exceptions import (
+     DataTransferException,
+     ConnectionException,
+     SchemaValidationException,
+     TypeMappingException
+ )
+
+ __all__ = [
+     "DataTransferException",
+     "ConnectionException",
+     "SchemaValidationException",
+     "TypeMappingException"
+ ]
+++ data_transfer_lib/utils/exceptions.py
@@ -0,0 +1,14 @@
+ class DataTransferException(Exception):
+     """Base exception for all data_transfer_lib errors."""
+
+
+ class ConnectionException(DataTransferException):
+     """Raised when a database connection or JDBC write fails."""
+
+
+ class SchemaValidationException(DataTransferException):
+     """Raised when source and target schemas are incompatible."""
+
+
+ class TypeMappingException(DataTransferException):
+     """Raised when a type cannot be mapped between systems."""
+++ data_transfer_lib/writer/__init__.py
@@ -0,0 +1,3 @@
+ from data_transfer_lib.writer.writer import Writer
+
+ __all__ = ["Writer"]
+++ data_transfer_lib/writer/base.py
@@ -0,0 +1,27 @@
+ from abc import ABC, abstractmethod
+ from pyspark.sql import DataFrame
+ from data_transfer_lib.connections.base import BaseConnection
+
+
+ class BaseWriter(ABC):
+     def __init__(
+         self,
+         connection: BaseConnection,
+         db_name: str,
+         table_name: str,
+         if_exists: bool = True,
+     ):
+         self.connection = connection
+         self.db_name = db_name
+         self.table_name = table_name
+         self.if_exists = if_exists
+
+         self._prepare()
+
+     @abstractmethod
+     def _prepare(self) -> None:
+         pass
+
+     @abstractmethod
+     def start(self, df: DataFrame, **params) -> None:
+         pass
+++ data_transfer_lib/writer/writer.py
@@ -0,0 +1,73 @@
+ from pyspark.sql import DataFrame
+ from data_transfer_lib.writer.base import BaseWriter
+ from data_transfer_lib.connections.base import BaseConnection
+ from data_transfer_lib.schema.validator import SchemaValidator
+ from data_transfer_lib.utils.exceptions import ConnectionException, DataTransferException, SchemaValidationException
+
+
+ class Writer(BaseWriter):
+     def __init__(
+         self,
+         connection: BaseConnection,
+         db_name: str,
+         table_name: str,
+         if_exists: bool = True,
+     ):
+         self.target_schema = None
+         super().__init__(
+             connection=connection,
+             db_name=db_name,
+             table_name=table_name,
+             if_exists=if_exists,
+         )
+
+     def _prepare(self) -> None:
+
+         if self.if_exists:
+             self.target_schema = self.connection.get_table_schema(
+                 self.db_name,
+                 self.table_name
+             )
+             print(self.target_schema)
+         else:
+             error_msg = f"{self.db_name}.{self.table_name} table doesn't exist"
+             raise DataTransferException(error_msg)
+
+     def start(self, df: DataFrame, **params) -> None:
+
+         df_schema = {}
+         for field in df.schema.fields:
+             df_schema[field.name] = str(field.dataType)
+
+         try:
+             SchemaValidator.validate_spark_to_target(
+                 df_schema,
+                 self.target_schema
+             )
+         except SchemaValidationException as e:
+             print(f"Error: {e}")
+             raise
+
+         try:
+             if num_partitions := params.get("num_partitions", None):
+                 df = df.repartition(num_partitions)
+
+             full_table_name = f"{self.db_name}.{self.table_name}"
+
+             writer = (
+                 df.write
+                 .format("jdbc")
+                 .option("url", self.connection.get_jdbc_url())
+                 .option("dbtable", full_table_name)
+                 .option("user", self.connection.user)
+                 .option("password", self.connection.password)
+                 .option("driver", self.connection.get_connection_properties()["driver"])
+                 .option("batchsize", params.get("batch_size", 10000))
+                 .mode(params.get("mode", "append"))
+             )
+
+             writer.save()
+
+         except Exception as e:
+             raise ConnectionException(f"Couldn't write into table: {e}")
+
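
start() forwards a handful of keyword params to the Spark JDBC writer. A sketch, reusing the hypothetical target connection and DataFrame from the earlier example:

    writer = Writer(connection=target, db_name="analytics", table_name="orders", if_exists=True)
    writer.start(
        df,
        mode="overwrite",      # Spark save mode; defaults to "append"
        batch_size=50000,      # JDBC batchsize option; defaults to 10000
        num_partitions=8,      # repartitions df before writing
    )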
+++ data_transfer_lib-0.1.1.dist-info/METADATA
@@ -0,0 +1,30 @@
+ Metadata-Version: 2.4
+ Name: data-transfer-lib
+ Version: 0.1.1
+ Summary: Library for data transfer between databases using PySpark
+ Home-page: https://github.com/llirkh/data_transfer_lib
+ Author: llirikh
+ Author-email: zhukov.kg@phystech.edu
+ Classifier: Development Status :: 3 - Alpha
+ Classifier: Intended Audience :: Developers
+ Classifier: Topic :: Database
+ Classifier: Programming Language :: Python :: 3
+ Classifier: Programming Language :: Python :: 3.8
+ Classifier: Programming Language :: Python :: 3.9
+ Classifier: Programming Language :: Python :: 3.10
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ Requires-Dist: pyspark>=3.3.0
+ Dynamic: author
+ Dynamic: author-email
+ Dynamic: classifier
+ Dynamic: description
+ Dynamic: description-content-type
+ Dynamic: home-page
+ Dynamic: requires-dist
+ Dynamic: requires-python
+ Dynamic: summary
+
+ # Project description
+
+ Project description will be here
+++ data_transfer_lib-0.1.1.dist-info/RECORD
@@ -0,0 +1,20 @@
+ data_transfer_lib/__init__.py,sha256=n1XAEEQnXfoHFgAFXO_0dI7x0103G_ofnJhjrBqW4cM,431
+ data_transfer_lib/connections/__init__.py,sha256=95_Il7sIDU7UIoRJ5mDLdjK1nyBkO-zva_Bfvj7AyPI,161
+ data_transfer_lib/connections/base.py,sha256=j041jyzNv45WKZk5nFrJBeuJXo8aCZVwD3loZTq61XM,1016
+ data_transfer_lib/connections/clickhouse.py,sha256=zL_KRUODF3FIjaZzxADWUXD_KSaxmAC41TicS_P6jAc,1941
+ data_transfer_lib/connections/postgres.py,sha256=sPJM-Y_ujo5KJTDOJ6Y9ysK-J0p8CDU6k198-n8Fkxw,2570
+ data_transfer_lib/reader/__init__.py,sha256=SA4HFp20RKcFy1idtPOE0a86V2wgcIP7ljzRYfnfVUc,72
+ data_transfer_lib/reader/base.py,sha256=VNU4-4MoDvVui10fU6IJMI6xHl0YxXLuz5gHua82kKA,582
+ data_transfer_lib/reader/reader.py,sha256=5s3PvUDakTZ6lTiUhEo8hp743ccUE3ovDHMeZj0znSo,1327
+ data_transfer_lib/schema/__init__.py,sha256=kxdcCDDtDDB1YpymEzle4Yl2w3nVa6wLW7nAYGVYcpI,162
+ data_transfer_lib/schema/mapper.py,sha256=lIXsa7vMP4cG5spruZNX_8sDh95h0ZO59w1j1fo0ne0,7549
+ data_transfer_lib/schema/validator.py,sha256=jG4mVRHLaWEg4Ru2Gd_cT65lCRkmiRxF6OpwfF4qZek,3994
+ data_transfer_lib/utils/__init__.py,sha256=B8zhVMM1pRM4oRJ1XoMgmE8I8K-0hmP0yu3ONQV1bvQ,289
+ data_transfer_lib/utils/exceptions.py,sha256=zJ91uknxGHaxEMZPgjbiPoAw8Nhra3Y-r24iTe8S1J8,238
+ data_transfer_lib/writer/__init__.py,sha256=OqrRHW216VAOtFzzOz7jL-H7kxYlfj--8IQs9rOyNgo,72
+ data_transfer_lib/writer/base.py,sha256=0qCjWqNWluTCOWJM4xfwRy9TeFsq_LlQLjtkoeOC0Ok,645
+ data_transfer_lib/writer/writer.py,sha256=o1fM2dJFExqrYI21hJUiyrHZD2ZOmhFc3kY0Q3gfh1E,2348
+ data_transfer_lib-0.1.1.dist-info/METADATA,sha256=BP95JdnRPpKtFbuAg8KD6rQmKp5WLB8UFeA6KF-oYiA,900
+ data_transfer_lib-0.1.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ data_transfer_lib-0.1.1.dist-info/top_level.txt,sha256=Yz0tNpzxGcQmSb1vBGHge5Ab5IoaBa_5pzSMlyxusII,18
+ data_transfer_lib-0.1.1.dist-info/RECORD,,
+++ data_transfer_lib-0.1.1.dist-info/WHEEL
@@ -0,0 +1,5 @@
+ Wheel-Version: 1.0
+ Generator: setuptools (80.9.0)
+ Root-Is-Purelib: true
+ Tag: py3-none-any
+
+++ data_transfer_lib-0.1.1.dist-info/top_level.txt
@@ -0,0 +1 @@
+ data_transfer_lib