cosmotech-acceleration-library 1.1.0__py3-none-any.whl → 2.1.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cosmotech/coal/__init__.py +1 -1
- cosmotech/coal/aws/__init__.py +1 -9
- cosmotech/coal/aws/s3.py +181 -214
- cosmotech/coal/azure/__init__.py +5 -5
- cosmotech/coal/azure/adx/__init__.py +24 -10
- cosmotech/coal/azure/adx/auth.py +2 -2
- cosmotech/coal/azure/adx/ingestion.py +10 -14
- cosmotech/coal/azure/adx/query.py +1 -1
- cosmotech/coal/azure/adx/runner.py +13 -14
- cosmotech/coal/azure/adx/store.py +5 -86
- cosmotech/coal/azure/adx/tables.py +2 -2
- cosmotech/coal/azure/adx/utils.py +2 -2
- cosmotech/coal/azure/blob.py +20 -26
- cosmotech/coal/azure/storage.py +3 -3
- cosmotech/coal/cosmotech_api/__init__.py +0 -28
- cosmotech/coal/cosmotech_api/apis/__init__.py +14 -0
- cosmotech/coal/cosmotech_api/apis/dataset.py +222 -0
- cosmotech/coal/cosmotech_api/apis/meta.py +25 -0
- cosmotech/coal/cosmotech_api/apis/organization.py +24 -0
- cosmotech/coal/cosmotech_api/apis/run.py +38 -0
- cosmotech/coal/cosmotech_api/apis/runner.py +75 -0
- cosmotech/coal/cosmotech_api/apis/solution.py +23 -0
- cosmotech/coal/cosmotech_api/apis/workspace.py +108 -0
- cosmotech/coal/cosmotech_api/objects/__init__.py +9 -0
- cosmotech/coal/cosmotech_api/objects/connection.py +125 -0
- cosmotech/coal/cosmotech_api/objects/parameters.py +127 -0
- cosmotech/coal/postgresql/runner.py +58 -41
- cosmotech/coal/postgresql/store.py +56 -15
- cosmotech/coal/postgresql/utils.py +255 -0
- cosmotech/coal/singlestore/store.py +3 -2
- cosmotech/coal/store/__init__.py +16 -13
- cosmotech/coal/store/output/__init__.py +0 -0
- cosmotech/coal/store/output/aws_channel.py +74 -0
- cosmotech/coal/store/output/az_storage_channel.py +33 -0
- cosmotech/coal/store/output/channel_interface.py +38 -0
- cosmotech/coal/store/output/channel_spliter.py +61 -0
- cosmotech/coal/store/output/postgres_channel.py +37 -0
- cosmotech/coal/store/pandas.py +1 -1
- cosmotech/coal/store/pyarrow.py +2 -2
- cosmotech/coal/store/store.py +4 -7
- cosmotech/coal/utils/configuration.py +197 -0
- cosmotech/coal/utils/decorator.py +4 -7
- cosmotech/csm_data/commands/adx_send_data.py +1 -1
- cosmotech/csm_data/commands/adx_send_runnerdata.py +3 -2
- cosmotech/csm_data/commands/api/api.py +6 -19
- cosmotech/csm_data/commands/api/postgres_send_runner_metadata.py +20 -16
- cosmotech/csm_data/commands/api/run_load_data.py +15 -52
- cosmotech/csm_data/commands/api/wsf_load_file.py +13 -16
- cosmotech/csm_data/commands/api/wsf_send_file.py +11 -14
- cosmotech/csm_data/commands/az_storage_upload.py +3 -2
- cosmotech/csm_data/commands/s3_bucket_delete.py +16 -15
- cosmotech/csm_data/commands/s3_bucket_download.py +16 -16
- cosmotech/csm_data/commands/s3_bucket_upload.py +16 -14
- cosmotech/csm_data/commands/store/dump_to_azure.py +3 -2
- cosmotech/csm_data/commands/store/dump_to_postgresql.py +3 -2
- cosmotech/csm_data/commands/store/dump_to_s3.py +18 -16
- cosmotech/csm_data/commands/store/list_tables.py +3 -2
- cosmotech/csm_data/commands/store/load_csv_folder.py +10 -4
- cosmotech/csm_data/commands/store/load_from_singlestore.py +3 -2
- cosmotech/csm_data/commands/store/output.py +35 -0
- cosmotech/csm_data/commands/store/reset.py +8 -3
- cosmotech/csm_data/commands/store/store.py +3 -3
- cosmotech/csm_data/main.py +4 -4
- cosmotech/csm_data/utils/decorators.py +4 -3
- cosmotech/translation/coal/en-US/coal/cosmotech_api/initialization.yml +8 -0
- cosmotech/translation/coal/en-US/coal/services/dataset.yml +10 -14
- cosmotech/translation/coal/en-US/coal/store/output/data_interface.yml +1 -0
- cosmotech/translation/coal/en-US/coal/store/output/split.yml +6 -0
- cosmotech/translation/coal/en-US/coal/utils/configuration.yml +2 -0
- cosmotech/translation/csm_data/en-US/csm_data/commands/store/output.yml +7 -0
- {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.1.0rc1.dist-info}/METADATA +29 -33
- cosmotech_acceleration_library-2.1.0rc1.dist-info/RECORD +153 -0
- {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.1.0rc1.dist-info}/WHEEL +1 -1
- cosmotech/coal/azure/functions.py +0 -72
- cosmotech/coal/cosmotech_api/connection.py +0 -96
- cosmotech/coal/cosmotech_api/dataset/__init__.py +0 -26
- cosmotech/coal/cosmotech_api/dataset/converters.py +0 -164
- cosmotech/coal/cosmotech_api/dataset/download/__init__.py +0 -19
- cosmotech/coal/cosmotech_api/dataset/download/adt.py +0 -119
- cosmotech/coal/cosmotech_api/dataset/download/common.py +0 -140
- cosmotech/coal/cosmotech_api/dataset/download/file.py +0 -229
- cosmotech/coal/cosmotech_api/dataset/download/twingraph.py +0 -185
- cosmotech/coal/cosmotech_api/dataset/upload.py +0 -41
- cosmotech/coal/cosmotech_api/dataset/utils.py +0 -132
- cosmotech/coal/cosmotech_api/parameters.py +0 -48
- cosmotech/coal/cosmotech_api/run.py +0 -25
- cosmotech/coal/cosmotech_api/run_data.py +0 -173
- cosmotech/coal/cosmotech_api/run_template.py +0 -108
- cosmotech/coal/cosmotech_api/runner/__init__.py +0 -28
- cosmotech/coal/cosmotech_api/runner/data.py +0 -38
- cosmotech/coal/cosmotech_api/runner/datasets.py +0 -416
- cosmotech/coal/cosmotech_api/runner/download.py +0 -135
- cosmotech/coal/cosmotech_api/runner/metadata.py +0 -42
- cosmotech/coal/cosmotech_api/runner/parameters.py +0 -157
- cosmotech/coal/cosmotech_api/twin_data_layer.py +0 -512
- cosmotech/coal/cosmotech_api/workspace.py +0 -127
- cosmotech/coal/utils/postgresql.py +0 -236
- cosmotech/coal/utils/semver.py +0 -6
- cosmotech/csm_data/commands/api/rds_load_csv.py +0 -90
- cosmotech/csm_data/commands/api/rds_send_csv.py +0 -74
- cosmotech/csm_data/commands/api/rds_send_store.py +0 -74
- cosmotech/csm_data/commands/api/runtemplate_load_handler.py +0 -66
- cosmotech/csm_data/commands/api/tdl_load_files.py +0 -76
- cosmotech/csm_data/commands/api/tdl_send_files.py +0 -82
- cosmotech/orchestrator_plugins/csm-data/templates/api/rds_load_csv.json +0 -27
- cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_csv.json +0 -27
- cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_store.json +0 -27
- cosmotech/orchestrator_plugins/csm-data/templates/api/runtemplate_load_handler.json +0 -27
- cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_load_files.json +0 -32
- cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_send_files.json +0 -27
- cosmotech/translation/coal/en-US/coal/cosmotech_api/run_data.yml +0 -2
- cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_load_csv.yml +0 -13
- cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_csv.yml +0 -12
- cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_store.yml +0 -12
- cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_load_files.yml +0 -14
- cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_send_files.yml +0 -18
- cosmotech_acceleration_library-1.1.0.dist-info/RECORD +0 -171
- {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.1.0rc1.dist-info}/entry_points.txt +0 -0
- {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.1.0rc1.dist-info}/licenses/LICENSE +0 -0
- {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.1.0rc1.dist-info}/top_level.txt +0 -0
cosmotech/coal/postgresql/store.py CHANGED

@@ -13,12 +13,13 @@ for store operations.
 """
 
 from time import perf_counter
-import pyarrow
 
+from cosmotech.orchestrator.utils.translate import T
+
+from cosmotech.coal.postgresql.utils import PostgresUtils
 from cosmotech.coal.store.store import Store
+from cosmotech.coal.utils.configuration import Configuration
 from cosmotech.coal.utils.logger import LOGGER
-from cosmotech.coal.utils.postgresql import send_pyarrow_table_to_postgresql
-from cosmotech.orchestrator.utils.translate import T
 
 
 def dump_store_to_postgresql(
@@ -32,6 +33,8 @@ def dump_store_to_postgresql(
     table_prefix: str = "Cosmotech_",
     replace: bool = True,
     force_encode: bool = False,
+    selected_tables: list[str] = [],
+    fk_id: str = None,
 ) -> None:
     """
     Dump Store data to a PostgreSQL database.
@@ -46,36 +49,74 @@ def dump_store_to_postgresql(
         postgres_password: PostgreSQL password
         table_prefix: Table prefix
         replace: Whether to replace existing tables
-        force_encode: force password encoding
+        force_encode: force password encoding to percent encoding
+        selected_tables: list of tables to send
+        fk_id: foreign key id to add to all table on all rows
     """
-
+    _c = Configuration(
+        {
+            "coal": {"store": store_folder},
+            "postgres": {
+                "host": postgres_host,
+                "port": postgres_port,
+                "db_name": postgres_db,
+                "db_schema": postgres_schema,
+                "user_name": postgres_user,
+                "user_password": postgres_password,
+                "password_encoding": force_encode,
+                "table_prefix": table_prefix,
+            },
+        }
+    )
+
+    dump_store_to_postgresql_from_conf(configuration=_c, replace=replace, selected_tables=selected_tables, fk_id=fk_id)
+
+
+def dump_store_to_postgresql_from_conf(
+    configuration: Configuration,
+    replace: bool = True,
+    selected_tables: list[str] = [],
+    fk_id: str = None,
+) -> None:
+    """
+    Dump Store data to a PostgreSQL database.
+
+    Args:
+        configuration: coal Configuration
+        replace: Whether to replace existing tables
+        selected_tables: list of tables to send
+        fk_id: foreign key id to add to all table on all rows
+    """
+    _psql = PostgresUtils(configuration)
+    _s = Store(configuration=configuration)
 
     tables = list(_s.list_tables())
+    if selected_tables:
+        tables = [t for t in tables if t in selected_tables]
     if len(tables):
-        LOGGER.info(T("coal.services.database.sending_data").format(table=f"{postgres_db}.{postgres_schema}"))
+        LOGGER.info(T("coal.services.database.sending_data").format(table=f"{_psql.db_name}.{_psql.db_schema}"))
         total_rows = 0
         _process_start = perf_counter()
         for table_name in tables:
             _s_time = perf_counter()
-            target_table_name = f"{table_prefix}{table_name}"
+            target_table_name = f"{_psql.table_prefix}{table_name}"
             LOGGER.info(T("coal.services.database.table_entry").format(table=target_table_name))
             data = _s.get_table(table_name)
             if not len(data):
                 LOGGER.info(T("coal.services.database.no_rows"))
                 continue
+            if fk_id:
+                data = data.append_column("csm_run_id", [[fk_id] * data.num_rows])
             _dl_time = perf_counter()
-            rows = send_pyarrow_table_to_postgresql(
+            rows = _psql.send_pyarrow_table_to_postgresql(
                 data,
                 target_table_name,
-                postgres_host,
-                postgres_port,
-                postgres_db,
-                postgres_schema,
-                postgres_user,
-                postgres_password,
                 replace,
-                force_encode,
             )
+            if fk_id and _psql.is_metadata_exists():
+                metadata_table = f"{_psql.table_prefix}RunnerMetadata"
+                _psql.add_fk_constraint(table_name, "csm_run_id", metadata_table, "last_csm_run_id")
+
             total_rows += rows
             _up_time = perf_counter()
             LOGGER.info(T("coal.services.database.row_count").format(count=rows))
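
The refactor above turns the flat keyword arguments into a nested `Configuration` and delegates to the new `dump_store_to_postgresql_from_conf` entry point. A minimal sketch of calling that entry point directly; the store path, credentials, table name, and run id below are hypothetical:

```python
# Sketch of the new configuration-driven entry point; the nested dict shape
# mirrors the one built by dump_store_to_postgresql in the hunk above.
from cosmotech.coal.postgresql.store import dump_store_to_postgresql_from_conf
from cosmotech.coal.utils.configuration import Configuration

conf = Configuration(
    {
        "coal": {"store": "/tmp/store"},  # hypothetical store folder
        "postgres": {
            "host": "localhost",          # hypothetical credentials
            "port": 5432,
            "db_name": "cosmotech",
            "db_schema": "public",
            "user_name": "csm",
            "user_password": "secret",
            "password_encoding": False,
            "table_prefix": "Cosmotech_",
        },
    }
)

dump_store_to_postgresql_from_conf(
    configuration=conf,
    replace=False,                  # append instead of replacing tables
    selected_tables=["customers"],  # hypothetical table name
    fk_id="run-1234",               # hypothetical run id
)
```

With `fk_id` set, every exported row gains a `csm_run_id` column, and a foreign key toward the `RunnerMetadata` table is added when that table exists.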
cosmotech/coal/postgresql/utils.py ADDED

@@ -0,0 +1,255 @@
+# Copyright (C) - 2023 - 2025 - Cosmo Tech
+# This document and all information contained herein is the exclusive property -
+# including all intellectual property rights pertaining thereto - of Cosmo Tech.
+# Any use, reproduction, translation, broadcasting, transmission, distribution,
+# etc., to any person is prohibited unless it has been previously and
+# specifically authorized by written means by Cosmo Tech.
+
+from typing import Optional
+from urllib.parse import quote
+
+import adbc_driver_manager
+import pyarrow as pa
+from adbc_driver_postgresql import dbapi
+from cosmotech.orchestrator.utils.translate import T
+from pyarrow import Table
+
+from cosmotech.coal.utils.configuration import Configuration
+from cosmotech.coal.utils.logger import LOGGER
+
+
+class PostgresUtils:
+
+    def __init__(self, configuration: Configuration):
+        self._configuration = configuration.postgres
+
+    @property
+    def table_prefix(self):
+        if "table_prefix" in self._configuration:
+            return self._configuration.table_prefix
+        return "Cosmotech_"
+
+    @property
+    def db_name(self):
+        return self._configuration.db_name
+
+    @property
+    def db_schema(self):
+        return self._configuration.db_schema
+
+    @property
+    def host_uri(self):
+        return self._configuration.host
+
+    @property
+    def host_port(self):
+        return self._configuration.port
+
+    @property
+    def user_name(self):
+        return self._configuration.user_name
+
+    @property
+    def user_password(self):
+        return self._configuration.user_password
+
+    @property
+    def password_encoding(self):
+        if "password_encoding" in self._configuration:
+            return self._configuration.password_encoding
+        return False
+
+    @property
+    def full_uri(self) -> str:
+        # Check if password needs percent encoding (contains special characters)
+        # We don't log anything about the password for security
+        encoded_password = self.user_password
+        if self.password_encoding:
+            encoded_password = quote(self.user_password, safe="")
+
+        return (
+            "postgresql://" + f"{self.user_name}"
+            f":{encoded_password}"
+            f"@{self.host_uri}"
+            f":{self.host_port}"
+            f"/{self.db_name}"
+        )
+
+    @property
+    def metadata_table_name(self) -> str:
+        return f"{self.table_prefix}RunnerMetadata"
+
+    def get_postgresql_table_schema(self, target_table_name: str) -> Optional[pa.Schema]:
+        """
+        Get the schema of an existing PostgreSQL table using SQL queries.
+
+        Args:
+            target_table_name: Name of the table
+
+        Returns:
+            PyArrow Schema if table exists, None otherwise
+        """
+        LOGGER.debug(
+            T("coal.services.postgresql.getting_schema").format(
+                postgres_schema=self.db_schema, target_table_name=target_table_name
+            )
+        )
+
+        with dbapi.connect(self.full_uri) as conn:
+            try:
+                return conn.adbc_get_table_schema(
+                    target_table_name,
+                    db_schema_filter=self.db_schema,
+                )
+            except adbc_driver_manager.ProgrammingError:
+                LOGGER.warning(
+                    T("coal.services.postgresql.table_not_found").format(
+                        postgres_schema=self.db_schema, target_table_name=target_table_name
+                    )
+                )
+                return None
+
+    def send_pyarrow_table_to_postgresql(
+        self,
+        data: Table,
+        target_table_name: str,
+        replace: bool,
+    ) -> int:
+        LOGGER.debug(
+            T("coal.services.postgresql.preparing_send").format(
+                postgres_schema=self.db_schema, target_table_name=target_table_name
+            )
+        )
+        LOGGER.debug(T("coal.services.postgresql.input_rows").format(rows=len(data)))
+
+        # Get existing schema if table exists
+        existing_schema = self.get_postgresql_table_schema(target_table_name)
+
+        if existing_schema is not None:
+            LOGGER.debug(T("coal.services.postgresql.found_existing_table").format(schema=existing_schema))
+            if not replace:
+                LOGGER.debug(T("coal.services.postgresql.adapting_data"))
+                data = adapt_table_to_schema(data, existing_schema)
+            else:
+                LOGGER.debug(T("coal.services.postgresql.replace_mode"))
+        else:
+            LOGGER.debug(T("coal.services.postgresql.no_existing_table"))
+
+        # Proceed with ingestion
+        total = 0
+
+        LOGGER.debug(T("coal.services.postgresql.connecting"))
+        with dbapi.connect(self.full_uri, autocommit=True) as conn:
+            with conn.cursor() as curs:
+                mode = "replace" if replace else "create_append"
+                LOGGER.debug(T("coal.services.postgresql.ingesting_data").format(mode=mode))
+                total += curs.adbc_ingest(target_table_name, data, mode, db_schema_name=self.db_schema)
+
+        LOGGER.debug(T("coal.services.postgresql.ingestion_success").format(rows=total))
+        return total
+
+    def add_fk_constraint(
+        self,
+        from_table: str,
+        from_col: str,
+        to_table: str,
+        to_col: str,
+    ) -> None:
+        # Connect to PostgreSQL and remove runner metadata row
+        with dbapi.connect(self.full_uri, autocommit=True) as conn:
+            with conn.cursor() as curs:
+                sql_add_fk = f"""
+                ALTER TABLE {self.db_schema}.{from_table}
+                CONSTRAINT metadata FOREIGN KEY ({from_col}) REFERENCES {to_table}({to_col})
+                """
+                curs.execute(sql_add_fk)
+                conn.commit()
+
+    def is_metadata_exists(self) -> None:
+        with dbapi.connect(self.full_uri, autocommit=True) as conn:
+            try:
+                conn.adbc_get_table_schema(
+                    self.metadata_table_name,
+                    db_schema_filter=self.db_schema,
+                )
+                return True
+            except adbc_driver_manager.ProgrammingError:
+                return False
+
+
+def adapt_table_to_schema(data: pa.Table, target_schema: pa.Schema) -> pa.Table:
+    """
+    Adapt a PyArrow table to match a target schema with detailed logging.
+    """
+    LOGGER.debug(T("coal.services.postgresql.schema_adaptation_start").format(rows=len(data)))
+    LOGGER.debug(T("coal.services.postgresql.original_schema").format(schema=data.schema))
+    LOGGER.debug(T("coal.services.postgresql.target_schema").format(schema=target_schema))
+
+    target_fields = {field.name: field.type for field in target_schema}
+    new_columns = []
+
+    # Track adaptations for summary
+    added_columns = []
+    dropped_columns = []
+    type_conversions = []
+    failed_conversions = []
+
+    # Process each field in target schema
+    for field_name, target_type in target_fields.items():
+        if field_name in data.column_names:
+            # Column exists - try to cast to target type
+            col = data[field_name]
+            original_type = col.type
+
+            if original_type != target_type:
+                LOGGER.debug(
+                    T("coal.services.postgresql.casting_column").format(
+                        field_name=field_name,
+                        original_type=original_type,
+                        target_type=target_type,
+                    )
+                )
+                try:
+                    new_col = pa.compute.cast(col, target_type)
+                    new_columns.append(new_col)
+                    type_conversions.append(f"{field_name}: {original_type} -> {target_type}")
+                except pa.ArrowInvalid as e:
+                    LOGGER.warning(
+                        T("coal.services.postgresql.cast_failed").format(
+                            field_name=field_name,
+                            original_type=original_type,
+                            target_type=target_type,
+                            error=str(e),
+                        )
+                    )
+                    new_columns.append(pa.nulls(len(data), type=target_type))
+                    failed_conversions.append(f"{field_name}: {original_type} -> {target_type}")
+            else:
+                new_columns.append(col)
+        else:
+            # Column doesn't exist - add nulls
+            LOGGER.debug(T("coal.services.postgresql.adding_missing_column").format(field_name=field_name))
+            new_columns.append(pa.nulls(len(data), type=target_type))
+            added_columns.append(field_name)
+
+    # Log columns that will be dropped
+    dropped_columns = [name for name in data.column_names if name not in target_fields]
+    if dropped_columns:
+        LOGGER.debug(T("coal.services.postgresql.dropping_columns").format(columns=dropped_columns))
+
+    # Create new table
+    adapted_table = pa.Table.from_arrays(new_columns, schema=target_schema)
+
+    # Log summary of adaptations
+    LOGGER.debug(T("coal.services.postgresql.adaptation_summary"))
+    if added_columns:
+        LOGGER.debug(T("coal.services.postgresql.added_columns").format(columns=added_columns))
+    if dropped_columns:
+        LOGGER.debug(T("coal.services.postgresql.dropped_columns").format(columns=dropped_columns))
+    if type_conversions:
+        LOGGER.debug(T("coal.services.postgresql.successful_conversions").format(conversions=type_conversions))
+    if failed_conversions:
+        LOGGER.debug(T("coal.services.postgresql.failed_conversions").format(conversions=failed_conversions))
+
+    LOGGER.debug(T("coal.services.postgresql.final_schema").format(schema=adapted_table.schema))
+    return adapted_table
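
The new `PostgresUtils` class gathers the connection parameters that `send_pyarrow_table_to_postgresql` previously took one by one. A sketch of the URI handling, with hypothetical credentials, showing how `full_uri` percent-encodes the password when `password_encoding` is enabled:

```python
# Sketch of PostgresUtils URI construction; all credentials are hypothetical.
from cosmotech.coal.postgresql.utils import PostgresUtils
from cosmotech.coal.utils.configuration import Configuration

conf = Configuration(
    {
        "postgres": {
            "host": "localhost",
            "port": 5432,
            "db_name": "cosmotech",
            "db_schema": "public",
            "user_name": "csm",
            "user_password": "p@ss:word",
            "password_encoding": True,  # quote() the password before embedding it
        }
    }
)

utils = PostgresUtils(conf)
print(utils.full_uri)
# postgresql://csm:p%40ss%3Aword@localhost:5432/cosmotech
```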
cosmotech/coal/singlestore/store.py CHANGED

@@ -12,15 +12,16 @@ This module provides functions for interacting with SingleStore databases
 for store operations.
 """
 
+import csv
 import pathlib
 import time
-
+
 import singlestoredb as s2
+from cosmotech.orchestrator.utils.translate import T
 
 from cosmotech.coal.store.csv import store_csv_file
 from cosmotech.coal.store.store import Store
 from cosmotech.coal.utils.logger import LOGGER
-from cosmotech.orchestrator.utils.translate import T
 
 
 def _get_data(table_name: str, output_directory: str, cursor) -> None:
cosmotech/coal/store/__init__.py CHANGED

@@ -12,31 +12,34 @@ This module provides functions for working with the Store,
 including loading and converting data.
 """
 
-# Re-export the Store class
-from cosmotech.coal.store.store import Store
-
 # Re-export functions from the csv module
 from cosmotech.coal.store.csv import (
-    store_csv_file,
     convert_store_table_to_csv,
+    store_csv_file,
 )
 
 # Re-export functions from the native_python module
 from cosmotech.coal.store.native_python import (
-    store_pylist,
     convert_table_as_pylist,
+    store_pylist,
 )
-
-# Re-export functions from the pandas module (if available)
-
 from cosmotech.coal.store.pandas import (
-    store_dataframe,
     convert_store_table_to_dataframe as convert_store_table_to_pandas_dataframe,
 )
-
-
-
+from cosmotech.coal.store.pandas import (
+    store_dataframe,
+)
 from cosmotech.coal.store.pyarrow import (
-    store_table,
     convert_store_table_to_dataframe as convert_store_table_to_pyarrow_table,
 )
+from cosmotech.coal.store.pyarrow import (
+    store_table,
+)
+
+# Re-export the Store class
+from cosmotech.coal.store.store import Store
+
+# Re-export functions from the pandas module (if available)
+
+
+# Re-export functions from the pyarrow module (if available)
cosmotech/coal/store/output/__init__.py (file without changes)
cosmotech/coal/store/output/aws_channel.py ADDED

@@ -0,0 +1,74 @@
+from io import BytesIO
+from typing import Optional
+
+import pyarrow.csv as pc
+import pyarrow.parquet as pq
+from cosmotech.orchestrator.utils.translate import T
+
+from cosmotech.coal.aws import S3
+from cosmotech.coal.store.output.channel_interface import (
+    ChannelInterface,
+    MissingChannelConfigError,
+)
+from cosmotech.coal.store.store import Store
+from cosmotech.coal.utils.configuration import Configuration, Dotdict
+from cosmotech.coal.utils.logger import LOGGER
+
+
+class AwsChannel(ChannelInterface):
+    required_keys = {
+        "coal": ["store"],
+        "s3": ["access_key_id", "endpoint_url", "secret_access_key"],
+    }
+    requirement_string = required_keys
+
+    def __init__(self, dct: Dotdict = None):
+        super().__init__(dct)
+        self._s3 = S3(self.configuration)
+
+    def send(self, filter: Optional[list[str]] = None) -> bool:
+
+        _s = Store(configuration=self.configuration)
+
+        if self._s3.output_type not in ("sqlite", "csv", "parquet"):
+            LOGGER.error(T("coal.common.errors.data_invalid_output_type").format(output_type=self._s3.output_type))
+            raise ValueError(T("coal.common.errors.data_invalid_output_type").format(output_type=self._s3.output_type))
+
+        if self._s3.output_type == "sqlite":
+            _file_path = _s._database_path
+            _file_name = "db.sqlite"
+            _uploaded_file_name = self.configuration.s3.bucket_prefix + _file_name
+            LOGGER.info(
+                T("coal.common.data_transfer.file_sent").format(file_path=_file_path, uploaded_name=_uploaded_file_name)
+            )
+            self._s3.upload_file(_file_path, _uploaded_file_name)
+        else:
+            tables = list(_s.list_tables())
+            if filter:
+                tables = [t for t in tables if t in filter]
+
+            for table_name in tables:
+                _data_stream = BytesIO()
+                _file_name = None
+                _data = _s.get_table(table_name)
+                if not len(_data):
+                    LOGGER.info(T("coal.common.data_transfer.table_empty").format(table_name=table_name))
+                    continue
+                if self._s3.output_type == "csv":
+                    _file_name = table_name + ".csv"
+                    pc.write_csv(_data, _data_stream)
+                elif self._s3.output_type == "parquet":
+                    _file_name = table_name + ".parquet"
+                    pq.write_table(_data, _data_stream)
+                LOGGER.info(
+                    T("coal.common.data_transfer.sending_table").format(
+                        table_name=table_name, output_type=self._s3.output_type
+                    )
+                )
+                self._s3.upload_data_stream(
+                    data_stream=_data_stream,
+                    file_name=_file_name,
+                )
+
+    def delete(self):
+        self._s3.delete_objects()
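
A sketch of driving `AwsChannel` on its own, with hypothetical credentials; `required_keys` above is what `is_available()` checks at construction, and `bucket_prefix` is read from the `s3` section for the SQLite upload name. The diff does not show where the `S3` wrapper reads `output_type` from, so its placement below is an assumption:

```python
# Sketch with hypothetical S3 credentials; output_type placement is assumed.
from cosmotech.coal.store.output.aws_channel import AwsChannel

channel = AwsChannel(
    {
        "coal": {"store": "/tmp/store"},
        "s3": {
            "access_key_id": "AKIA...",
            "secret_access_key": "...",
            "endpoint_url": "https://s3.example.com",
            "output_type": "parquet",  # one of "sqlite", "csv", "parquet"
            "bucket_prefix": "my-run/",
        },
    }
)
channel.send(filter=["customers"])  # hypothetical table name
```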
cosmotech/coal/store/output/az_storage_channel.py ADDED

@@ -0,0 +1,33 @@
+from typing import Optional
+
+from cosmotech.coal.azure.blob import dump_store_to_azure
+from cosmotech.coal.store.output.channel_interface import (
+    ChannelInterface,
+    MissingChannelConfigError,
+)
+from cosmotech.coal.utils.configuration import Configuration, Dotdict
+
+
+class AzureStorageChannel(ChannelInterface):
+    required_keys = {
+        "coal": ["store"],
+        "azure": [
+            "account_name",
+            "container_name",
+            "tenant_id",
+            "client_id",
+            "client_secret",
+            "output_type",
+            "file_prefix",
+        ],
+    }
+    requirement_string = required_keys
+
+    def send(self, filter: Optional[list[str]] = None) -> bool:
+        dump_store_to_azure(
+            self.configuration,
+            selected_tables=filter,
+        )
+
+    def delete(self):
+        pass
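
`AzureStorageChannel.send` simply delegates to `dump_store_to_azure` with the optional table filter. A sketch of the configuration it validates, with hypothetical identifiers and secrets:

```python
# Sketch with hypothetical Azure identifiers and secrets.
from cosmotech.coal.store.output.az_storage_channel import AzureStorageChannel

channel = AzureStorageChannel(
    {
        "coal": {"store": "/tmp/store"},
        "azure": {
            "account_name": "myaccount",
            "container_name": "mycontainer",
            "tenant_id": "00000000-0000-0000-0000-000000000000",
            "client_id": "11111111-1111-1111-1111-111111111111",
            "client_secret": "...",
            "output_type": "csv",
            "file_prefix": "my-run/",
        },
    }
)
channel.send(filter=["customers"])  # hypothetical table name
```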
cosmotech/coal/store/output/channel_interface.py ADDED

@@ -0,0 +1,38 @@
+from typing import Optional
+
+from cosmotech.orchestrator.utils.translate import T
+
+from cosmotech.coal.utils.configuration import Configuration, Dotdict
+
+
+class ChannelInterface:
+    required_keys = {}
+    requirement_string: str = T("coal.store.output.data_interface.requirements")
+
+    def __init__(self, dct: Dotdict = None):
+        self.configuration = Configuration(dct)
+        if not self.is_available():
+            raise MissingChannelConfigError(self)
+
+    def send(self, filter: Optional[list[str]] = None) -> bool:
+        raise NotImplementedError()
+
+    def delete(self) -> bool:
+        raise NotImplementedError()
+
+    def is_available(self) -> bool:
+        try:
+            return all(
+                all(key in self.configuration[section] for key in self.required_keys[section])
+                for section in self.required_keys.keys()
+            )
+        except KeyError:
+            return False
+
+
+class MissingChannelConfigError(Exception):
+    def __init__(self, interface_class):
+        self.message = T("coal.store.output.split.requirements").format(
+            interface_name=interface_class.__class__.__name__, requirements=interface_class.requirement_string
+        )
+        super().__init__(self.message)
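
`ChannelInterface` checks `required_keys` during `__init__` and raises `MissingChannelConfigError` when a section or key is missing. A minimal sketch of a custom channel built on this contract, using a hypothetical `webhook` configuration section that is not part of coal:

```python
# Hypothetical example subclass; the "webhook" section and class are not
# part of the library, they only illustrate the required_keys contract.
from typing import Optional

from cosmotech.coal.store.output.channel_interface import ChannelInterface


class WebhookChannel(ChannelInterface):
    required_keys = {
        "coal": ["store"],
        "webhook": ["url", "token"],
    }
    requirement_string = required_keys

    def send(self, filter: Optional[list[str]] = None) -> bool:
        # POST the selected Store tables somewhere; body elided in this sketch
        return True

    def delete(self) -> bool:
        return True


# Construction fails with MissingChannelConfigError if any listed key is absent.
channel = WebhookChannel(
    {
        "coal": {"store": "/tmp/store"},
        "webhook": {"url": "https://example.com/hook", "token": "..."},
    }
)
```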
cosmotech/coal/store/output/channel_spliter.py ADDED

@@ -0,0 +1,61 @@
+from typing import Optional
+
+from cosmotech.orchestrator.utils.translate import T
+
+from cosmotech.coal.store.output.aws_channel import AwsChannel
+from cosmotech.coal.store.output.az_storage_channel import AzureStorageChannel
+from cosmotech.coal.store.output.channel_interface import ChannelInterface
+from cosmotech.coal.store.output.postgres_channel import PostgresChannel
+from cosmotech.coal.utils.configuration import Dotdict
+from cosmotech.coal.utils.logger import LOGGER
+
+
+class ChannelSpliter(ChannelInterface):
+    requirement_string: str = "(Requires any working interface)"
+    targets = list()
+    available_interfaces: dict[str, ChannelInterface] = {
+        "s3": AwsChannel,
+        "az_storage": AzureStorageChannel,
+        "postgres": PostgresChannel,
+    }
+
+    def __init__(self, dct: Dotdict = None):
+        super().__init__(dct)
+        self.targets = list()
+        if "outputs" not in self.configuration:
+            raise AttributeError(T("coal.store.output.split.no_targets"))
+        for output in self.configuration.outputs:
+            channel = self.available_interfaces[output.type]
+            _i = channel(output.conf)
+            if _i.is_available():
+                self.targets.append(_i)
+            else:
+                LOGGER.warning(
+                    T("coal.store.output.split.requirements").format(
+                        interface_name=channel.__class__.__name__, requirements=channel.requirement_string
+                    )
+                )
+        if not self.targets:
+            raise AttributeError(T("coal.store.output.split.no_targets"))
+
+    def send(self, filter: Optional[list[str]] = None) -> bool:
+        any_ok = False
+        for i in self.targets:
+            try:
+                any_ok = i.send(filter=filter) or any_ok
+            except Exception:
+                LOGGER.error(T("coal.store.output.split.send.error").format(interface_name=i.__class__.__name__))
+                if len(self.targets) < 2:
+                    raise
+        return any_ok
+
+    def delete(self, filter: Optional[list[str]] = None) -> bool:
+        any_ok = False
+        for i in self.targets:
+            try:
+                any_ok = i.delete() or any_ok
+            except Exception:
+                LOGGER.error(T("coal.store.output.split.delete.error").format(interface_name=i.__class__.__name__))
+                if len(self.targets) < 2:
+                    raise
+        return any_ok
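
`ChannelSpliter` fans a single `send()` or `delete()` out to every configured target, logging and tolerating per-target failures as long as more than one target is configured. A sketch of the `outputs` shape its constructor iterates, with hypothetical values; each entry picks a `type` from `available_interfaces` and carries that channel's own configuration under `conf`:

```python
# Sketch of a ChannelSpliter configuration; all identifiers are hypothetical.
from cosmotech.coal.store.output.channel_spliter import ChannelSpliter

spliter = ChannelSpliter(
    {
        "outputs": [
            {
                "type": "postgres",  # key into available_interfaces
                "conf": {
                    "coal": {"store": "/tmp/store"},
                    "cosmotech": {
                        "organization_id": "o-xxx",
                        "workspace_id": "w-xxx",
                        "runner_id": "r-xxx",
                    },
                    "postgres": {
                        "host": "localhost",
                        "port": 5432,
                        "db_name": "cosmotech",
                        "db_schema": "public",
                        "user_name": "csm",
                        "user_password": "secret",
                    },
                },
            },
        ]
    }
)
spliter.send()  # fan out to every target whose configuration is complete
```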
cosmotech/coal/store/output/postgres_channel.py ADDED

@@ -0,0 +1,37 @@
+from typing import Optional
+
+from cosmotech.coal.postgresql.runner import (
+    remove_runner_metadata_from_postgresql,
+    send_runner_metadata_to_postgresql,
+)
+from cosmotech.coal.postgresql.store import dump_store_to_postgresql_from_conf
+from cosmotech.coal.store.output.channel_interface import ChannelInterface
+
+
+class PostgresChannel(ChannelInterface):
+    required_keys = {
+        "coal": ["store"],
+        "cosmotech": ["organization_id", "workspace_id", "runner_id"],
+        "postgres": [
+            "host",
+            "port",
+            "db_name",
+            "db_schema",
+            "user_name",
+            "user_password",
+        ],
+    }
+    requirement_string = required_keys
+
+    def send(self, filter: Optional[list[str]] = None) -> bool:
+        run_id = send_runner_metadata_to_postgresql(self.configuration)
+        dump_store_to_postgresql_from_conf(
+            configuration=self.configuration,
+            selected_tables=filter,
+            fk_id=run_id,
+            replace=False,
+        )
+
+    def delete(self):
+        # removing metadata will trigger cascade delete on real data
+        remove_runner_metadata_from_postgresql(self.configuration)