data-syncmaster 0.1.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (110) hide show
  1. data_syncmaster-0.1.1.dist-info/LICENSE.txt +203 -0
  2. data_syncmaster-0.1.1.dist-info/METADATA +115 -0
  3. data_syncmaster-0.1.1.dist-info/RECORD +110 -0
  4. data_syncmaster-0.1.1.dist-info/WHEEL +4 -0
  5. syncmaster/__init__.py +6 -0
  6. syncmaster/backend/__init__.py +2 -0
  7. syncmaster/backend/api/__init__.py +2 -0
  8. syncmaster/backend/api/deps.py +20 -0
  9. syncmaster/backend/api/monitoring.py +10 -0
  10. syncmaster/backend/api/router.py +10 -0
  11. syncmaster/backend/api/v1/__init__.py +2 -0
  12. syncmaster/backend/api/v1/auth/__init__.py +2 -0
  13. syncmaster/backend/api/v1/auth/router.py +32 -0
  14. syncmaster/backend/api/v1/auth/utils.py +26 -0
  15. syncmaster/backend/api/v1/connections.py +300 -0
  16. syncmaster/backend/api/v1/groups.py +225 -0
  17. syncmaster/backend/api/v1/queue.py +148 -0
  18. syncmaster/backend/api/v1/router.py +18 -0
  19. syncmaster/backend/api/v1/transfers/__init__.py +2 -0
  20. syncmaster/backend/api/v1/transfers/router.py +469 -0
  21. syncmaster/backend/api/v1/transfers/utils.py +17 -0
  22. syncmaster/backend/api/v1/users.py +75 -0
  23. syncmaster/backend/export_openapi_schema.py +26 -0
  24. syncmaster/backend/handler.py +203 -0
  25. syncmaster/backend/logger.py +2 -0
  26. syncmaster/backend/main.py +63 -0
  27. syncmaster/backend/pre_start.py +94 -0
  28. syncmaster/backend/services/__init__.py +4 -0
  29. syncmaster/backend/services/auth.py +58 -0
  30. syncmaster/backend/services/unit_of_work.py +44 -0
  31. syncmaster/config.py +110 -0
  32. syncmaster/db/__init__.py +2 -0
  33. syncmaster/db/alembic.ini +41 -0
  34. syncmaster/db/base.py +28 -0
  35. syncmaster/db/factory.py +37 -0
  36. syncmaster/db/migrations/README +1 -0
  37. syncmaster/db/migrations/__init__.py +2 -0
  38. syncmaster/db/migrations/env.py +87 -0
  39. syncmaster/db/migrations/script.py.mako +24 -0
  40. syncmaster/db/migrations/versions/2023-11-23_478240cdad4b_init.py +242 -0
  41. syncmaster/db/migrations/versions/__init__.py +2 -0
  42. syncmaster/db/mixins.py +33 -0
  43. syncmaster/db/models.py +194 -0
  44. syncmaster/db/repositories/__init__.py +22 -0
  45. syncmaster/db/repositories/base.py +109 -0
  46. syncmaster/db/repositories/connection.py +138 -0
  47. syncmaster/db/repositories/credentials_repository.py +87 -0
  48. syncmaster/db/repositories/group.py +264 -0
  49. syncmaster/db/repositories/queue.py +195 -0
  50. syncmaster/db/repositories/repository_with_owner.py +115 -0
  51. syncmaster/db/repositories/run.py +78 -0
  52. syncmaster/db/repositories/transfer.py +202 -0
  53. syncmaster/db/repositories/user.py +72 -0
  54. syncmaster/db/repositories/utils.py +25 -0
  55. syncmaster/db/utils.py +31 -0
  56. syncmaster/dto/__init__.py +2 -0
  57. syncmaster/dto/connections.py +60 -0
  58. syncmaster/dto/transfers.py +46 -0
  59. syncmaster/exceptions/__init__.py +13 -0
  60. syncmaster/exceptions/base.py +12 -0
  61. syncmaster/exceptions/connection.py +28 -0
  62. syncmaster/exceptions/credentials.py +8 -0
  63. syncmaster/exceptions/group.py +27 -0
  64. syncmaster/exceptions/queue.py +16 -0
  65. syncmaster/exceptions/run.py +19 -0
  66. syncmaster/exceptions/transfer.py +39 -0
  67. syncmaster/exceptions/user.py +11 -0
  68. syncmaster/schemas/__init__.py +2 -0
  69. syncmaster/schemas/v1/__init__.py +54 -0
  70. syncmaster/schemas/v1/auth.py +12 -0
  71. syncmaster/schemas/v1/connection_types.py +9 -0
  72. syncmaster/schemas/v1/connections/__init__.py +2 -0
  73. syncmaster/schemas/v1/connections/connection.py +146 -0
  74. syncmaster/schemas/v1/connections/hdfs.py +40 -0
  75. syncmaster/schemas/v1/connections/hive.py +40 -0
  76. syncmaster/schemas/v1/connections/oracle.py +58 -0
  77. syncmaster/schemas/v1/connections/postgres.py +48 -0
  78. syncmaster/schemas/v1/connections/s3.py +66 -0
  79. syncmaster/schemas/v1/file_formats.py +7 -0
  80. syncmaster/schemas/v1/groups.py +39 -0
  81. syncmaster/schemas/v1/page.py +40 -0
  82. syncmaster/schemas/v1/queue.py +32 -0
  83. syncmaster/schemas/v1/status.py +16 -0
  84. syncmaster/schemas/v1/transfer_types.py +6 -0
  85. syncmaster/schemas/v1/transfers/__init__.py +172 -0
  86. syncmaster/schemas/v1/transfers/db.py +23 -0
  87. syncmaster/schemas/v1/transfers/file/__init__.py +2 -0
  88. syncmaster/schemas/v1/transfers/file/base.py +47 -0
  89. syncmaster/schemas/v1/transfers/file/hdfs.py +27 -0
  90. syncmaster/schemas/v1/transfers/file/s3.py +27 -0
  91. syncmaster/schemas/v1/transfers/file_format.py +29 -0
  92. syncmaster/schemas/v1/transfers/run.py +37 -0
  93. syncmaster/schemas/v1/transfers/strategy.py +15 -0
  94. syncmaster/schemas/v1/types.py +5 -0
  95. syncmaster/schemas/v1/users.py +83 -0
  96. syncmaster/worker/__init__.py +2 -0
  97. syncmaster/worker/base.py +14 -0
  98. syncmaster/worker/config.py +18 -0
  99. syncmaster/worker/controller.py +127 -0
  100. syncmaster/worker/handlers/__init__.py +2 -0
  101. syncmaster/worker/handlers/base.py +49 -0
  102. syncmaster/worker/handlers/file/__init__.py +2 -0
  103. syncmaster/worker/handlers/file/base.py +56 -0
  104. syncmaster/worker/handlers/file/hdfs.py +14 -0
  105. syncmaster/worker/handlers/file/s3.py +20 -0
  106. syncmaster/worker/handlers/hive.py +41 -0
  107. syncmaster/worker/handlers/oracle.py +48 -0
  108. syncmaster/worker/handlers/postgres.py +47 -0
  109. syncmaster/worker/spark.py +93 -0
  110. syncmaster/worker/transfer.py +85 -0
@@ -0,0 +1,15 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from __future__ import annotations
4
+
5
+ from pydantic import BaseModel
6
+
7
+ from syncmaster.schemas.v1.transfer_types import FULL_TYPE, INCREMENTAL_TYPE
8
+
9
+
10
# Strategy schema whose only field, ``type``, is annotated with FULL_TYPE
# (imported from syncmaster.schemas.v1.transfer_types).
class FullStrategy(BaseModel):
    type: FULL_TYPE
12
+
13
+
14
# Strategy schema whose only field, ``type``, is annotated with INCREMENTAL_TYPE
# (imported from syncmaster.schemas.v1.transfer_types).
class IncrementalStrategy(BaseModel):
    type: INCREMENTAL_TYPE
@@ -0,0 +1,5 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from pydantic import constr
4
+
5
# Constrained string type: a string that must contain at least one character.
NameConstr = constr(min_length=1)
@@ -0,0 +1,83 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from pydantic import BaseModel, constr
4
+
5
+ from syncmaster.db.models import GroupMemberRole
6
+ from syncmaster.db.utils import Pagination
7
+ from syncmaster.schemas.v1.page import MetaPageSchema, PageSchema
8
+
9
+
10
# Schema carrying a username to set on a user
# (presumably the request body of the update-user endpoint; confirm against the router).
class UpdateUserSchema(BaseModel):
    # Must match the pattern: one or more of `_`, `a`-`z`, `0`-`9`.
    username: constr(pattern=r"^[_a-z0-9]+$")  # noqa: F722
12
+
13
+
14
# Output schema describing a user together with a group-member role.
class ReadGroupMember(BaseModel):
    id: int
    username: str
    role: GroupMemberRole

    class Config:
        # Lets pydantic populate the model from object attributes, not only from dicts.
        from_attributes = True
21
+
22
+
23
# Output schema describing a user: id, username and the superuser flag.
class ReadUserSchema(BaseModel):
    id: int
    username: str
    is_superuser: bool

    class Config:
        # Lets pydantic populate the model from object attributes, not only from dicts.
        from_attributes = True
30
+
31
+
32
# Extends ReadGroupMember (id, username, role) with the ``is_active`` flag.
# NOTE(review): the base class is ReadGroupMember, so this schema carries ``role``
# and has no ``is_superuser``; confirm that ReadUserSchema was not the intended base.
class FullUserSchema(ReadGroupMember):
    is_active: bool

    class Config:
        # Same setting as the parent's Config; repeated here explicitly.
        from_attributes = True
37
+
38
+
39
# Page of group members: pagination metadata plus a list of ReadGroupMember items.
class UserPageSchemaAsGroupMember(PageSchema):
    items: list[ReadGroupMember]

    @classmethod
    def from_pagination(cls, pagination: Pagination):
        """Build the page from a ``Pagination`` whose items are ``(user, role)`` pairs.

        The pagination counters are copied into ``MetaPageSchema`` and every
        ``(user, role)`` pair becomes a ``ReadGroupMember``.
        """
        page_meta = MetaPageSchema(
            page=pagination.page,
            pages=pagination.pages,
            page_size=pagination.page_size,
            total=pagination.total,
            has_next=pagination.has_next,
            has_previous=pagination.has_previous,
            next_page=pagination.next_page,
            previous_page=pagination.previous_page,
        )

        members = []
        for user, role in pagination.items:
            member = ReadGroupMember(id=user.id, username=user.username, role=role)
            members.append(member)

        return cls(meta=page_meta, items=members)
64
+
65
+
66
# Page of users: pagination metadata plus a list of ReadUserSchema items.
class UserPageSchema(PageSchema):
    items: list[ReadUserSchema]

    @classmethod
    def from_pagination(cls, pagination: Pagination):
        """Build the page from a ``Pagination`` object.

        The pagination counters are copied into ``MetaPageSchema``;
        ``pagination.items`` is passed through as the ``items`` field.
        """
        page_meta = MetaPageSchema(
            page=pagination.page,
            pages=pagination.pages,
            page_size=pagination.page_size,
            total=pagination.total,
            has_next=pagination.has_next,
            has_previous=pagination.has_previous,
            next_page=pagination.next_page,
            previous_page=pagination.previous_page,
        )
        return cls(meta=page_meta, items=pagination.items)
@@ -0,0 +1,2 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,14 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from celery import Task
4
+ from sqlalchemy import create_engine
5
+
6
+ from syncmaster.config import Settings
7
+
8
+
9
class WorkerTask(Task):
    """Celery task class that owns application settings and a SQLAlchemy engine.

    On construction it instantiates ``Settings`` and creates an engine from the
    database URI built with the ``psycopg2`` driver.
    """

    def __init__(self) -> None:
        self.settings = Settings()
        db_uri = self.settings.build_db_connection_uri(driver="psycopg2")
        self.engine = create_engine(url=db_uri)
@@ -0,0 +1,18 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from celery import Celery
4
+
5
+ from syncmaster.config import Settings
6
+ from syncmaster.worker.base import WorkerTask
7
+
8
# Application settings, instantiated once when this module is imported.
settings = Settings()

# Celery application used by the worker.
celery = Celery(
    __name__,
    # Broker URL comes from the RabbitMQ connection URI built by the settings.
    broker=settings.build_rabbit_connection_uri(),
    # Result backend: the database connection URI (psycopg2 driver) prefixed with "db+".
    backend="db+" + settings.build_db_connection_uri(driver="psycopg2"),
    # Every task of this app is based on WorkerTask.
    task_cls=WorkerTask,
    # Modules passed to Celery's ``imports`` option.
    imports=[
        "syncmaster.worker.transfer",
    ],
)
@@ -0,0 +1,127 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ import logging
4
+ from typing import Any
5
+
6
+ from syncmaster.config import Settings
7
+ from syncmaster.db.models import Connection, Transfer
8
+ from syncmaster.dto.connections import (
9
+ HDFSConnectionDTO,
10
+ HiveConnectionDTO,
11
+ OracleConnectionDTO,
12
+ PostgresConnectionDTO,
13
+ S3ConnectionDTO,
14
+ )
15
+ from syncmaster.dto.transfers import (
16
+ HDFSTransferDTO,
17
+ HiveTransferDTO,
18
+ OracleTransferDTO,
19
+ PostgresTransferDTO,
20
+ S3TransferDTO,
21
+ )
22
+ from syncmaster.exceptions.connection import ConnectionTypeNotRecognizedError
23
+ from syncmaster.worker.handlers.base import Handler
24
+ from syncmaster.worker.handlers.file.hdfs import HDFSHandler
25
+ from syncmaster.worker.handlers.file.s3 import S3Handler
26
+ from syncmaster.worker.handlers.hive import HiveHandler
27
+ from syncmaster.worker.handlers.oracle import OracleHandler
28
+ from syncmaster.worker.handlers.postgres import PostgresHandler
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+
33
# Maps a connection "type" value to the triple
# (handler class, connection DTO class, transfer DTO class) used to serve it.
connection_handler_proxy = {
    "hive": (HiveHandler, HiveConnectionDTO, HiveTransferDTO),
    "oracle": (OracleHandler, OracleConnectionDTO, OracleTransferDTO),
    "postgres": (PostgresHandler, PostgresConnectionDTO, PostgresTransferDTO),
    "s3": (S3Handler, S3ConnectionDTO, S3TransferDTO),
    "hdfs": (HDFSHandler, HDFSConnectionDTO, HDFSTransferDTO),
}
60
+
61
+
62
class TransferController:
    """Build a source and a target handler for a transfer and run the copy.

    The constructor resolves one handler per side from ``connection_handler_proxy``,
    creates a Spark session through ``settings.CREATE_SPARK_SESSION_FUNCTION`` and
    hands it to both handlers. ``start_transfer`` then reads from the source and
    writes to the target.
    """

    source_handler: Handler
    target_handler: Handler

    def __init__(
        self,
        transfer: Transfer,
        source_connection: Connection,
        source_auth_data: dict,
        target_connection: Connection,
        target_auth_data: dict,
        settings: Settings,
    ):
        """Create both handlers and attach a shared Spark session to them.

        Args:
            transfer: Transfer whose ``source_params`` / ``target_params`` are
                turned into transfer DTOs.
            source_connection: Connection whose ``data`` describes the source.
            source_auth_data: Credentials merged on top of the source connection data.
            target_connection: Connection whose ``data`` describes the target.
            target_auth_data: Credentials merged on top of the target connection data.
            settings: Settings object; its ``CREATE_SPARK_SESSION_FUNCTION`` is
                called to obtain the Spark session.

        Raises:
            ConnectionTypeNotRecognizedError: If either connection type has no
                entry in ``connection_handler_proxy``.
        """
        self.source_handler = self.get_handler(
            connection_data=source_connection.data,
            transfer_params=transfer.source_params,
            connection_auth_data=source_auth_data,
        )
        self.target_handler = self.get_handler(
            connection_data=target_connection.data,
            transfer_params=transfer.target_params,
            connection_auth_data=target_auth_data,
        )
        spark = settings.CREATE_SPARK_SESSION_FUNCTION(
            settings,
            target=self.target_handler.connection_dto,
            source=self.source_handler.connection_dto,
        )

        self.source_handler.set_spark(spark)
        self.target_handler.set_spark(spark)
        logger.info("source connection = %s", self.source_handler)
        logger.info("target connection = %s", self.target_handler)

    def start_transfer(self) -> None:
        """Initialize both sides, read from the source and write to the target."""
        self.source_handler.init_connection()
        self.source_handler.init_reader()

        self.target_handler.init_connection()
        self.target_handler.init_writer()
        logger.info("Source and target were initialized")

        # Column names are normalized by the *target* handler before writing.
        df = self.target_handler.normalize_column_name(self.source_handler.read())
        logger.info("Data has been read")

        self.target_handler.write(df)
        logger.info("Data has been inserted")

    def get_handler(
        self,
        connection_data: dict[str, Any],
        connection_auth_data: dict,
        transfer_params: dict[str, Any],
    ) -> Handler:
        """Instantiate the handler matching ``connection_data["type"]``.

        Args:
            connection_data: Connection description; must contain a ``"type"`` key
                after merging with the auth data.
            connection_auth_data: Credentials; on key clashes they override
                ``connection_data``.
            transfer_params: Keyword arguments for the transfer DTO.

        Returns:
            A handler built from the connection DTO and the transfer DTO.

        Raises:
            ConnectionTypeNotRecognizedError: If the type has no registered handler.
        """
        # Merge into a new dict: the caller's ``connection_data`` (the data of a
        # Connection model) must not be modified, otherwise the credentials
        # would stay attached to that object.
        merged_data = {**connection_data, **connection_auth_data}
        handler_type = merged_data["type"]

        registered = connection_handler_proxy.get(handler_type)
        if registered is None:
            raise ConnectionTypeNotRecognizedError

        handler, connection_dto, transfer_dto = registered

        return handler(
            connection_dto=connection_dto(**merged_data),
            transfer_dto=transfer_dto(**transfer_params),
        )
@@ -0,0 +1,2 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,49 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from abc import ABC
4
+
5
+ from onetl.db import DBReader, DBWriter
6
+ from pyspark.sql import SparkSession
7
+ from pyspark.sql.dataframe import DataFrame
8
+
9
+ from syncmaster.dto.connections import ConnectionDTO
10
+ from syncmaster.dto.transfers import TransferDTO
11
+
12
+
13
class Handler(ABC):
    """Base class of the transfer handlers.

    Stores the connection and transfer DTOs, the Spark session and the
    reader/writer objects. Subclasses override ``init_connection``,
    ``init_reader``, ``init_writer`` and, when needed, ``normalize_column_name``.
    """

    def __init__(
        self,
        connection_dto: ConnectionDTO,
        transfer_dto: TransferDTO,
        spark: SparkSession | None = None,
    ) -> None:
        """Store the DTOs and the optional Spark session; reader and writer start unset."""
        self.spark = spark
        self.reader: DBReader | None = None
        self.writer: DBWriter | None = None
        self.connection_dto = connection_dto
        self.transfer_dto = transfer_dto

    def init_connection(self): ...

    def set_spark(self, spark: SparkSession):
        """Attach the Spark session to use."""
        self.spark = spark

    def _require_connection_dto(self) -> None:
        """Raise ``ValueError`` when ``connection_dto`` is ``None``."""
        # NOTE(review): the message talks about `init_connection`, while the
        # check itself looks at `connection_dto`; the check is kept as it was.
        if self.connection_dto is None:
            raise ValueError("At first you need to initialize connection. Run `init_connection`")

    def init_reader(self):
        """Precondition check run by subclasses before they create ``self.reader``.

        Raises:
            ValueError: If ``connection_dto`` is ``None``.
        """
        self._require_connection_dto()

    def init_writer(self):
        """Precondition check run by subclasses before they create ``self.writer``.

        Raises:
            ValueError: If ``connection_dto`` is ``None``.
        """
        self._require_connection_dto()

    def read(self) -> DataFrame:
        """Run the reader and return its result.

        Raises:
            ValueError: If the reader has not been initialized.
        """
        if self.reader is None:
            raise ValueError("Reader is not initialized")
        return self.reader.run()

    def write(self, df: DataFrame) -> None:
        """Run the writer on ``df``.

        Raises:
            ValueError: If the writer has not been initialized.
        """
        if self.writer is None:
            raise ValueError("Writer is not initialized")
        return self.writer.run(df=df)

    def normalize_column_name(self, df: DataFrame) -> DataFrame:
        """Return ``df`` with column names adapted to this handler.

        The base implementation returns ``df`` unchanged, so a subclass that
        does not override it still yields a frame instead of ``None``.
        """
        return df
@@ -0,0 +1,2 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
@@ -0,0 +1,56 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ import json
4
+
5
+ from onetl.base.base_file_df_connection import BaseFileDFConnection
6
+ from onetl.file import FileDFReader, FileDFWriter
7
+ from onetl.file.format import CSV, JSON, JSONLine
8
+ from pyspark.sql.dataframe import DataFrame
9
+ from pyspark.sql.types import StructType
10
+
11
+ from syncmaster.dto.connections import ConnectionDTO
12
+ from syncmaster.dto.transfers import TransferDTO
13
+ from syncmaster.worker.handlers.base import Handler
14
+
15
+
16
class FileHandler(Handler):
    """Handler base for file-based connections.

    Builds a ``FileDFReader`` / ``FileDFWriter`` from the transfer DTO; the
    concrete connection is created by subclasses in ``init_connection``.
    """

    connection: BaseFileDFConnection
    connection_dto: ConnectionDTO
    transfer_dto: TransferDTO

    def init_connection(self): ...

    def init_reader(self):
        """Create the file reader for the transfer's directory, format and schema."""
        super().init_reader()

        dto = self.transfer_dto
        # Keys are listed in the order the values must be evaluated.
        reader_kwargs = {
            "connection": self.connection,
            "format": self._get_format(),
            "source_path": dto.directory_path,
            # df_schema holds a JSON document that is turned into a Spark StructType.
            "df_schema": StructType.fromJson(json.loads(dto.df_schema)),
            "options": dto.options,
        }
        self.reader = FileDFReader(**reader_kwargs)

    def init_writer(self):
        """Create the file writer for the transfer's directory and format."""
        super().init_writer()

        dto = self.transfer_dto
        writer_kwargs = {
            "connection": self.connection,
            "format": self._get_format(),
            "target_path": dto.directory_path,
            "options": dto.options,
        }
        self.writer = FileDFWriter(**writer_kwargs)

    def normalize_column_name(self, df: DataFrame) -> DataFrame:
        """Return ``df`` unchanged: no column renaming is done for files."""
        return df

    def _get_format(self):
        """Parse ``transfer_dto.file_format`` into the matching onetl format object.

        Raises:
            ValueError: If the ``"type"`` entry is not ``csv``, ``jsonline`` or ``json``.
        """
        format_spec = self.transfer_dto.file_format
        requested = format_spec["type"]

        known_formats = (("csv", CSV), ("jsonline", JSONLine), ("json", JSON))
        for type_name, format_cls in known_formats:
            if requested == type_name:
                return format_cls.parse_obj(format_spec)

        raise ValueError("Unknown file type")
@@ -0,0 +1,14 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+
4
+ from onetl.connection import SparkHDFS
5
+
6
+ from syncmaster.worker.handlers.file.base import FileHandler
7
+
8
+
9
class HDFSHandler(FileHandler):
    """File handler that works through a ``SparkHDFS`` connection."""

    def init_connection(self):
        """Create the ``SparkHDFS`` connection for the DTO's cluster and check it."""
        hdfs = SparkHDFS(cluster=self.connection_dto.cluster, spark=self.spark)
        self.connection = hdfs.check()
@@ -0,0 +1,20 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from onetl.connection import SparkS3
4
+
5
+ from syncmaster.worker.handlers.file.base import FileHandler
6
+
7
+
8
class S3Handler(FileHandler):
    """File handler that works through a ``SparkS3`` connection."""

    def init_connection(self):
        """Create the ``SparkS3`` connection from the connection DTO and check it."""
        dto = self.connection_dto
        s3 = SparkS3(
            host=dto.host,
            port=dto.port,
            access_key=dto.access_key,
            secret_key=dto.secret_key,
            bucket=dto.bucket,
            protocol=dto.protocol,
            region=dto.region,
            extra=dto.additional_params,
            spark=self.spark,
        )
        self.connection = s3.check()
@@ -0,0 +1,41 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from onetl.connection import Hive
4
+ from onetl.db import DBReader, DBWriter
5
+ from pyspark.sql.dataframe import DataFrame
6
+
7
+ from syncmaster.dto.connections import HiveConnectionDTO
8
+ from syncmaster.dto.transfers import HiveTransferDTO
9
+ from syncmaster.worker.handlers.base import Handler
10
+
11
+
12
class HiveHandler(Handler):
    """Handler that reads from and writes to Hive tables."""

    connection: Hive
    connection_dto: HiveConnectionDTO
    transfer_dto: HiveTransferDTO

    def init_connection(self):
        """Create the ``Hive`` connection for the DTO's cluster and check it."""
        hive = Hive(cluster=self.connection_dto.cluster, spark=self.spark)
        self.connection = hive.check()

    def init_reader(self):
        """Refresh the table in the Spark catalog, then create a ``DBReader`` for it."""
        super().init_reader()
        self.spark.catalog.refreshTable(self.transfer_dto.table_name)
        reader = DBReader(
            connection=self.connection,
            table=self.transfer_dto.table_name,
        )
        self.reader = reader

    def init_writer(self):
        """Create a ``DBWriter`` for the transfer's table."""
        super().init_writer()
        writer = DBWriter(
            connection=self.connection,
            table=self.transfer_dto.table_name,
        )
        self.writer = writer

    def normalize_column_name(self, df: DataFrame) -> DataFrame:
        """Return ``df`` with every column renamed to its lower-case form."""
        renamed = df
        for original_name in df.columns:
            renamed = renamed.withColumnRenamed(original_name, original_name.lower())
        return renamed
@@ -0,0 +1,48 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from onetl.connection import Oracle
4
+ from onetl.db import DBReader, DBWriter
5
+ from pyspark.sql.dataframe import DataFrame
6
+
7
+ from syncmaster.dto.connections import OracleConnectionDTO
8
+ from syncmaster.dto.transfers import OracleTransferDTO
9
+ from syncmaster.worker.handlers.base import Handler
10
+
11
+
12
class OracleHandler(Handler):
    """Handler that reads from and writes to Oracle tables."""

    connection: Oracle
    connection_dto: OracleConnectionDTO
    transfer_dto: OracleTransferDTO

    def init_connection(self):
        """Create the ``Oracle`` connection from the connection DTO and check it."""
        dto = self.connection_dto
        oracle = Oracle(
            host=dto.host,
            port=dto.port,
            user=dto.user,
            password=dto.password,
            sid=dto.sid,
            service_name=dto.service_name,
            extra=dto.additional_params,
            spark=self.spark,
        )
        self.connection = oracle.check()

    def init_reader(self):
        """Create a ``DBReader`` that selects every table column by its quoted name."""
        super().init_reader()
        table_schema = self.connection.get_df_schema(self.transfer_dto.table_name)
        # Each field name is wrapped in double quotes before being passed as a column.
        quoted_columns = [f'"{field}"' for field in table_schema.fieldNames()]
        self.reader = DBReader(
            connection=self.connection,
            table=self.transfer_dto.table_name,
            columns=quoted_columns,
        )

    def init_writer(self):
        """Create a ``DBWriter`` for the transfer's table."""
        super().init_writer()
        writer = DBWriter(
            connection=self.connection,
            table=self.transfer_dto.table_name,
        )
        self.writer = writer

    def normalize_column_name(self, df: DataFrame) -> DataFrame:
        """Return ``df`` with every column renamed to its upper-case form."""
        renamed = df
        for original_name in df.columns:
            renamed = renamed.withColumnRenamed(original_name, original_name.upper())
        return renamed
@@ -0,0 +1,47 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ from onetl.connection import Postgres
4
+ from onetl.db import DBReader, DBWriter
5
+ from pyspark.sql.dataframe import DataFrame
6
+
7
+ from syncmaster.dto.connections import PostgresConnectionDTO
8
+ from syncmaster.dto.transfers import PostgresTransferDTO
9
+ from syncmaster.worker.handlers.base import Handler
10
+
11
+
12
class PostgresHandler(Handler):
    """Handler that reads from and writes to Postgres tables."""

    connection: Postgres
    connection_dto: PostgresConnectionDTO
    transfer_dto: PostgresTransferDTO

    def init_connection(self):
        """Create the ``Postgres`` connection from the connection DTO and check it."""
        dto = self.connection_dto
        postgres = Postgres(
            host=dto.host,
            user=dto.user,
            password=dto.password,
            port=dto.port,
            database=dto.database_name,
            extra=dto.additional_params,
            spark=self.spark,
        )
        self.connection = postgres.check()

    def init_reader(self):
        """Create a ``DBReader`` that selects every table column by its quoted name."""
        super().init_reader()
        table_schema = self.connection.get_df_schema(self.transfer_dto.table_name)
        # Each field name is wrapped in double quotes before being passed as a column.
        quoted_columns = [f'"{field}"' for field in table_schema.fieldNames()]
        self.reader = DBReader(
            connection=self.connection,
            table=self.transfer_dto.table_name,
            columns=quoted_columns,
        )

    def init_writer(self):
        """Create a ``DBWriter`` for the transfer's table."""
        super().init_writer()
        writer = DBWriter(
            connection=self.connection,
            table=self.transfer_dto.table_name,
        )
        self.writer = writer

    def normalize_column_name(self, df: DataFrame) -> DataFrame:
        """Return ``df`` with every column renamed to its lower-case form."""
        renamed = df
        for original_name in df.columns:
            renamed = renamed.withColumnRenamed(original_name, original_name.lower())
        return renamed
@@ -0,0 +1,93 @@
1
+ # SPDX-FileCopyrightText: 2023-2024 MTS (Mobile Telesystems)
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ import logging
4
+
5
+ import pyspark
6
+ from onetl.connection import Oracle, Postgres, SparkS3
7
+ from pyspark.sql import SparkSession
8
+
9
+ from syncmaster.config import Settings
10
+ from syncmaster.dto.connections import ConnectionDTO
11
+
12
+ log = logging.getLogger(__name__)
13
+
14
+
15
def get_worker_spark_session(
    settings: Settings,  # used in test spark session definition
    source: ConnectionDTO,
    target: ConnectionDTO,
) -> SparkSession:
    """Create (or reuse) the Spark session named ``celery_worker``.

    Every option returned by ``get_spark_session_conf(source, target)`` is
    applied to the builder; Hive support is enabled when either connection has
    the type ``"hive"``. ``source`` and ``target`` also give access to the
    connection credentials if a custom session needs them.
    """
    builder = SparkSession.builder.appName("celery_worker")

    session_conf = get_spark_session_conf(source, target)
    for option_name in session_conf:
        builder = builder.config(option_name, session_conf[option_name])

    if source.type != "hive" and target.type != "hive":  # type: ignore
        return builder.getOrCreate()

    log.debug("Enabling Hive support")
    return builder.enableHiveSupport().getOrCreate()
31
+
32
+
33
def get_packages(db_type: str) -> list[str]:
    """Return the packages reported by the onetl connection class for ``db_type``.

    ``"postgres"``, ``"oracle"`` and ``"s3"`` delegate to the ``get_packages``
    method of the matching connection class (for S3 the installed pyspark
    version is passed). Any other type needs no .jar packages and yields an
    empty list.
    """
    if db_type == "postgres":
        packages = Postgres.get_packages()
    elif db_type == "oracle":
        packages = Oracle.get_packages()
    elif db_type == "s3":
        packages = SparkS3.get_packages(spark_version=pyspark.__version__)
    else:
        # If the database type does not require downloading .jar packages
        packages = []
    return packages
44
+
45
+
46
def get_excluded_packages(db_type: str):
    """Return the package coordinates to exclude for ``db_type``.

    Only ``"s3"`` has exclusions; every other type yields an empty list.
    A new list is returned on every call.
    """
    if db_type != "s3":
        return []

    excluded = [
        "com.google.cloud.bigdataoss:gcs-connector",
        "org.apache.hadoop:hadoop-aliyun",
        "org.apache.hadoop:hadoop-azure-datalake",
        "org.apache.hadoop:hadoop-azure",
    ]
    return excluded
55
+
56
+
57
def get_spark_session_conf(
    source: ConnectionDTO,
    target: ConnectionDTO,
) -> dict:
    """Build the Spark configuration for a transfer between two connections.

    Args:
        source: Connection DTO of the side being read; only ``type`` is used.
        target: Connection DTO of the side being written; only ``type`` is used.

    Returns:
        A dict of Spark options: the Maven packages for both connection types,
        JVM stack traces enabled, the package exclusions when there are any,
        and the S3A committer options when either side is S3.
    """
    maven_packages: list[str] = []
    excluded_packages: list[str] = []

    for connection in (source, target):
        maven_packages.extend(get_packages(db_type=connection.type))  # type: ignore
        excluded_packages.extend(get_excluded_packages(db_type=connection.type))  # type: ignore

    # Source and target may share a type; keep each coordinate once, in first-seen order.
    maven_packages = list(dict.fromkeys(maven_packages))
    excluded_packages = list(dict.fromkeys(excluded_packages))

    log.debug("Passing Maven packages: %s", maven_packages)

    config = {
        "spark.jars.packages": ",".join(maven_packages),
        "spark.sql.pyspark.jvmStacktrace.enabled": "true",
    }

    if excluded_packages:
        config["spark.jars.excludes"] = ",".join(excluded_packages)

    # The committer options affect how Spark *writes* to S3, so they must be
    # present when the target is S3. The source case is kept so that
    # configurations produced before this change stay the same.
    if "s3" in (source.type, target.type):  # type: ignore
        config.update(
            {
                "spark.hadoop.fs.s3a.committer.magic.enabled": "true",
                "spark.hadoop.fs.s3a.committer.name": "magic",
                "spark.hadoop.mapreduce.outputcommitter.factory.scheme.s3a": (
                    "org.apache.hadoop.fs.s3a.commit.S3ACommitterFactory"
                ),
                "spark.sql.parquet.output.committer.class": (
                    "org.apache.spark.internal.io.cloud.BindingParquetOutputCommitter"
                ),
                "spark.sql.sources.commitProtocolClass": "org.apache.spark.internal.io.cloud.PathOutputCommitProtocol",
            },
        )

    return config