cosmotech-acceleration-library 2.0.0__py3-none-any.whl → 2.1.0rc1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cosmotech/coal/__init__.py +1 -1
- cosmotech/coal/azure/__init__.py +5 -5
- cosmotech/coal/azure/adx/__init__.py +24 -10
- cosmotech/coal/azure/adx/ingestion.py +10 -14
- cosmotech/coal/azure/adx/query.py +1 -1
- cosmotech/coal/azure/adx/utils.py +2 -2
- cosmotech/coal/azure/blob.py +14 -20
- cosmotech/coal/cosmotech_api/apis/dataset.py +135 -16
- cosmotech/coal/cosmotech_api/apis/runner.py +23 -19
- cosmotech/coal/postgresql/runner.py +8 -11
- cosmotech/coal/postgresql/store.py +20 -25
- cosmotech/coal/postgresql/utils.py +2 -1
- cosmotech/coal/singlestore/store.py +3 -2
- cosmotech/coal/store/__init__.py +16 -13
- cosmotech/coal/store/output/aws_channel.py +12 -11
- cosmotech/coal/store/output/az_storage_channel.py +9 -18
- cosmotech/coal/store/output/channel_interface.py +15 -0
- cosmotech/coal/store/output/channel_spliter.py +11 -5
- cosmotech/coal/store/output/postgres_channel.py +7 -10
- cosmotech/coal/store/pandas.py +1 -1
- cosmotech/coal/store/pyarrow.py +2 -2
- cosmotech/coal/store/store.py +4 -7
- cosmotech/coal/utils/configuration.py +76 -48
- cosmotech/csm_data/commands/adx_send_data.py +1 -1
- cosmotech/csm_data/commands/adx_send_runnerdata.py +3 -2
- cosmotech/csm_data/commands/api/run_load_data.py +10 -8
- cosmotech/csm_data/commands/az_storage_upload.py +3 -2
- cosmotech/csm_data/commands/store/dump_to_azure.py +3 -2
- cosmotech/csm_data/commands/store/dump_to_postgresql.py +3 -2
- cosmotech/csm_data/commands/store/list_tables.py +3 -2
- cosmotech/csm_data/commands/store/load_csv_folder.py +10 -4
- cosmotech/csm_data/commands/store/load_from_singlestore.py +3 -2
- cosmotech/csm_data/commands/store/reset.py +8 -3
- cosmotech/csm_data/main.py +4 -4
- cosmotech/csm_data/utils/decorators.py +4 -3
- cosmotech/translation/coal/en-US/coal/services/dataset.yml +6 -0
- {cosmotech_acceleration_library-2.0.0.dist-info → cosmotech_acceleration_library-2.1.0rc1.dist-info}/METADATA +26 -27
- {cosmotech_acceleration_library-2.0.0.dist-info → cosmotech_acceleration_library-2.1.0rc1.dist-info}/RECORD +42 -42
- {cosmotech_acceleration_library-2.0.0.dist-info → cosmotech_acceleration_library-2.1.0rc1.dist-info}/WHEEL +1 -1
- {cosmotech_acceleration_library-2.0.0.dist-info → cosmotech_acceleration_library-2.1.0rc1.dist-info}/entry_points.txt +0 -0
- {cosmotech_acceleration_library-2.0.0.dist-info → cosmotech_acceleration_library-2.1.0rc1.dist-info}/licenses/LICENSE +0 -0
- {cosmotech_acceleration_library-2.0.0.dist-info → cosmotech_acceleration_library-2.1.0rc1.dist-info}/top_level.txt +0 -0
cosmotech/coal/__init__.py
CHANGED
cosmotech/coal/azure/__init__.py
CHANGED
@@ -11,13 +11,13 @@ Azure services integration module.
 This module provides functions for interacting with Azure services like Storage and ADX.
 """
 
+# Re-export blob functions for easier importing
+from cosmotech.coal.azure.blob import (
+    dump_store_to_azure,
+)
+
 # Re-export storage functions for easier importing
 from cosmotech.coal.azure.storage import (
     upload_file,
     upload_folder,
 )
-
-# Re-export blob functions for easier importing
-from cosmotech.coal.azure.blob import (
-    dump_store_to_azure,
-)

cosmotech/coal/azure/adx/__init__.py
CHANGED
@@ -5,22 +5,36 @@
 # etc., to any person is prohibited unless it has been previously and
 # specifically authorized by written means by Cosmo Tech.
 
-from cosmotech.coal.azure.adx.auth import
-
+from cosmotech.coal.azure.adx.auth import (
+    create_ingest_client,
+    create_kusto_client,
+    initialize_clients,
+)
 from cosmotech.coal.azure.adx.ingestion import (
-
-    send_to_adx,
+    IngestionStatus,
     check_ingestion_status,
-    monitor_ingestion,
     handle_failures,
-
+    ingest_dataframe,
+    monitor_ingestion,
+    send_to_adx,
 )
-from cosmotech.coal.azure.adx.
-from cosmotech.coal.azure.adx.utils import type_mapping, create_column_mapping
-from cosmotech.coal.azure.adx.store import send_pyarrow_table_to_adx, send_table_data, process_tables, send_store_to_adx
+from cosmotech.coal.azure.adx.query import run_command_query, run_query
 from cosmotech.coal.azure.adx.runner import (
-    prepare_csv_content,
     construct_create_query,
     insert_csv_files,
+    prepare_csv_content,
     send_runner_data,
 )
+from cosmotech.coal.azure.adx.store import (
+    process_tables,
+    send_pyarrow_table_to_adx,
+    send_store_to_adx,
+    send_table_data,
+)
+from cosmotech.coal.azure.adx.tables import (
+    _drop_by_tag,
+    check_and_create_table,
+    create_table,
+    table_exists,
+)
+from cosmotech.coal.azure.adx.utils import create_column_mapping, type_mapping
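
Note: the reorganized __init__ re-exports every ADX helper from the package root. A minimal sketch of the resulting import style, using only names re-exported above:

    from cosmotech.coal.azure.adx import (
        create_kusto_client,
        run_query,
        send_store_to_adx,
    )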

cosmotech/coal/azure/adx/ingestion.py
CHANGED
@@ -5,28 +5,24 @@
 # etc., to any person is prohibited unless it has been previously and
 # specifically authorized by written means by Cosmo Tech.
 
+import os
+import time
 from enum import Enum
-from typing import Dict
-from typing import Iterator
-from typing import List
-from typing import Optional
-from typing import Tuple
+from typing import Dict, Iterator, List, Optional, Tuple
 
-import os
 import pandas as pd
-import time
 import tqdm
 from azure.kusto.data import KustoClient
 from azure.kusto.data.data_format import DataFormat
-from azure.kusto.ingest import IngestionProperties
-from azure.kusto.ingest import
-
-
-
-
+from azure.kusto.ingest import IngestionProperties, QueuedIngestClient, ReportLevel
+from azure.kusto.ingest.status import (
+    FailureMessage,
+    KustoIngestStatusQueues,
+    SuccessMessage,
+)
 from cosmotech.orchestrator.utils.translate import T
 
-from cosmotech.coal.azure.adx.tables import
+from cosmotech.coal.azure.adx.tables import _drop_by_tag, create_table
 from cosmotech.coal.azure.adx.utils import type_mapping
 from cosmotech.coal.utils.logger import LOGGER
 

cosmotech/coal/azure/adx/query.py
CHANGED
@@ -7,9 +7,9 @@
 
 from azure.kusto.data import KustoClient
 from azure.kusto.data.response import KustoResponseDataSet
+from cosmotech.orchestrator.utils.translate import T
 
 from cosmotech.coal.utils.logger import LOGGER
-from cosmotech.orchestrator.utils.translate import T
 
 
 def run_query(client: KustoClient, database: str, query: str) -> KustoResponseDataSet:

cosmotech/coal/azure/adx/utils.py
CHANGED
@@ -5,13 +5,13 @@
 # etc., to any person is prohibited unless it has been previously and
 # specifically authorized by written means by Cosmo Tech.
 
-import dateutil.parser
 from typing import Any, Dict
 
+import dateutil.parser
 import pyarrow
+from cosmotech.orchestrator.utils.translate import T
 
 from cosmotech.coal.utils.logger import LOGGER
-from cosmotech.orchestrator.utils.translate import T
 
 
 def create_column_mapping(data: pyarrow.Table) -> Dict[str, str]:
cosmotech/coal/azure/blob.py
CHANGED
@@ -21,6 +21,7 @@ from azure.storage.blob import BlobServiceClient
 from cosmotech.orchestrator.utils.translate import T
 
 from cosmotech.coal.store.store import Store
+from cosmotech.coal.utils.configuration import Configuration
 from cosmotech.coal.utils.logger import LOGGER
 
 VALID_TYPES = (
@@ -31,42 +32,35 @@ VALID_TYPES = (
 
 
 def dump_store_to_azure(
-
-    account_name: str,
-    container_name: str,
-    tenant_id: str,
-    client_id: str,
-    client_secret: str,
-    output_type: str = "sqlite",
-    file_prefix: str = "",
+    configuration: Configuration = Configuration(),
     selected_tables: list[str] = [],
 ) -> None:
     """
     Dump Store data to Azure Blob Storage.
 
     Args:
-
-
-        container_name: Azure Storage container name
-        tenant_id: Azure tenant ID
-        client_id: Azure client ID
-        client_secret: Azure client secret
-        output_type: Output file type (sqlite, csv, or parquet)
-        file_prefix: Prefix for uploaded files
+        configuration: Configuration utils class
+        selected_tables: List of tables name
 
     Raises:
         ValueError: If the output type is invalid
     """
-    _s = Store(
+    _s = Store(configuration=configuration)
+    output_type = configuration.safe_get("azure.output_type", default="sqlite")
+    file_prefix = configuration.safe_get("azure.file_prefix", default="")
 
     if output_type not in VALID_TYPES:
         LOGGER.error(T("coal.common.validation.invalid_output_type").format(output_type=output_type))
         raise ValueError(T("coal.common.validation.invalid_output_type").format(output_type=output_type))
 
     container_client = BlobServiceClient(
-        account_url=f"https://{account_name}.blob.core.windows.net/",
-        credential=ClientSecretCredential(
-
+        account_url=f"https://{configuration.azure.account_name}.blob.core.windows.net/",
+        credential=ClientSecretCredential(
+            tenant_id=configuration.azure.tenant_id,
+            client_id=configuration.azure.client_id,
+            client_secret=configuration.azure.client_secret,
+        ),
+    ).get_container_client(configuration.azure.container_name)
 
     def data_upload(data_stream: BytesIO, file_name: str):
         uploaded_file_name = file_prefix + file_name
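
Note: where 2.0.0 took each Azure setting as its own argument, 2.1.0rc1 resolves them all from a Configuration object. A hedged sketch of the new call, assuming the default Configuration picks up the azure.* keys (account_name, tenant_id, client_id, client_secret, container_name, output_type, file_prefix) from its usual sources; how it gets populated is not shown in this diff:

    from cosmotech.coal.azure.blob import dump_store_to_azure
    from cosmotech.coal.utils.configuration import Configuration

    dump_store_to_azure(
        configuration=Configuration(),
        selected_tables=["my_table"],  # hypothetical table name; an empty list sends every table
    )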

cosmotech/coal/cosmotech_api/apis/dataset.py
CHANGED
@@ -35,30 +35,49 @@ class DatasetApi(BaseDatasetApi, Connection):
         LOGGER.debug(T("coal.cosmotech_api.initialization.dataset_api_initialized"))
 
     def download_dataset(self, dataset_id) -> Dataset:
+        LOGGER.debug(f"Downloading dataset {dataset_id}")
         dataset = self.get_dataset(
             organization_id=self.configuration.cosmotech.organization_id,
             workspace_id=self.configuration.cosmotech.workspace_id,
             dataset_id=dataset_id,
         )
+        # send dataset files under dataset id folder
+        destination = Path(self.configuration.cosmotech.dataset_absolute_path) / dataset_id
+        for part in dataset.parts:
+            self._download_part(dataset_id, part, destination)
+        return dataset
 
-
-
+    def download_parameter(self, dataset_id) -> Dataset:
+        LOGGER.debug(f"Downloading dataset {dataset_id}")
+        dataset = self.get_dataset(
+            organization_id=self.configuration.cosmotech.organization_id,
+            workspace_id=self.configuration.cosmotech.workspace_id,
+            dataset_id=dataset_id,
+        )
+        # send parameters file under parameters_name folder
+        destination = Path(self.configuration.cosmotech.parameters_absolute_path) / dataset_id
         for part in dataset.parts:
-
-
-            data_part = self.download_dataset_part(
-                organization_id=self.configuration.cosmotech.organization_id,
-                workspace_id=self.configuration.cosmotech.workspace_id,
-                dataset_id=dataset_id,
-                dataset_part_id=part.id,
-            )
-            with open(part_file_path, "wb") as binary_file:
-                binary_file.write(data_part)
-            LOGGER.debug(
-                T("coal.services.dataset.part_downloaded").format(part_name=part.source_name, file_path=part_file_path)
-            )
+            part_dst = destination / part.name
+            self._download_part(dataset_id, part, part_dst)
         return dataset
 
+    def _download_part(self, dataset_id, dataset_part, destination):
+        part_file_path = destination / dataset_part.source_name
+        part_file_path.parent.mkdir(parents=True, exist_ok=True)
+        data_part = self.download_dataset_part(
+            organization_id=self.configuration.cosmotech.organization_id,
+            workspace_id=self.configuration.cosmotech.workspace_id,
+            dataset_id=dataset_id,
+            dataset_part_id=dataset_part.id,
+        )
+        with open(part_file_path, "wb") as binary_file:
+            binary_file.write(data_part)
+        LOGGER.debug(
+            T("coal.services.dataset.part_downloaded").format(
+                part_name=dataset_part.source_name, file_path=part_file_path
+            )
+        )
+
     @staticmethod
     def path_to_parts(_path, part_type) -> list[tuple[str, Path, DatasetPartTypeEnum]]:
         if (_path := Path(_path)).is_dir():
@@ -70,7 +89,21 @@ class DatasetApi(BaseDatasetApi, Connection):
         dataset_name: str,
         as_files: Optional[list[Union[Path, str]]] = (),
         as_db: Optional[list[Union[Path, str]]] = (),
+        tags: Optional[list[str]] = None,
+        additional_data: Optional[dict] = None,
     ) -> Dataset:
+        """Upload a new dataset with optional tags and additional data.
+
+        Args:
+            dataset_name: The name of the dataset to create
+            as_files: List of file paths to upload as FILE type parts
+            as_db: List of file paths to upload as DB type parts
+            tags: Optional list of tags to associate with the dataset
+            additional_data: Optional dictionary of additional metadata
+
+        Returns:
+            The created Dataset object
+        """
         _parts = list()
 
         for _f in as_files:
@@ -81,6 +114,8 @@ class DatasetApi(BaseDatasetApi, Connection):
 
         d_request = DatasetCreateRequest(
             name=dataset_name,
+            tags=tags,
+            additional_data=additional_data,
             parts=list(
                 DatasetPartCreateRequest(
                     name=_p_name,
@@ -92,12 +127,96 @@ class DatasetApi(BaseDatasetApi, Connection):
             ),
         )
 
+        _files = []
+        for _p in _parts:
+            with _p[1].open("rb") as _p_file:
+                _files.append((_p[0], _p_file.read()))
+
         d_ret = self.create_dataset(
             self.configuration.cosmotech.organization_id,
             self.configuration.cosmotech.workspace_id,
             d_request,
-            files=
+            files=_files,
         )
 
         LOGGER.info(T("coal.services.dataset.dataset_created").format(dataset_id=d_ret.id))
         return d_ret
+
+    def upload_dataset_parts(
+        self,
+        dataset_id: str,
+        as_files: Optional[list[Union[Path, str]]] = (),
+        as_db: Optional[list[Union[Path, str]]] = (),
+        replace_existing: bool = False,
+    ) -> Dataset:
+        """Upload parts to an existing dataset.
+
+        Args:
+            dataset_id: The ID of the existing dataset
+            as_files: List of file paths to upload as FILE type parts
+            as_db: List of file paths to upload as DB type parts
+            replace_existing: If True, replace existing parts with same name
+
+        Returns:
+            The updated Dataset object
+        """
+        # Get current dataset to check existing parts
+        current_dataset = self.get_dataset(
+            organization_id=self.configuration.cosmotech.organization_id,
+            workspace_id=self.configuration.cosmotech.workspace_id,
+            dataset_id=dataset_id,
+        )
+
+        # Build set of existing part names and their IDs for quick lookup
+        existing_parts = {part.source_name: part.id for part in (current_dataset.parts or [])}
+
+        # Collect parts to upload
+        _parts = list()
+        for _f in as_files:
+            _parts.extend(self.path_to_parts(_f, DatasetPartTypeEnum.FILE))
+        for _db in as_db:
+            _parts.extend(self.path_to_parts(_db, DatasetPartTypeEnum.DB))
+
+        # Process each part
+        for _p_name, _p_path, _type in _parts:
+            if _p_name in existing_parts:
+                if replace_existing:
+                    # Delete existing part before creating new one
+                    self.delete_dataset_part(
+                        organization_id=self.configuration.cosmotech.organization_id,
+                        workspace_id=self.configuration.cosmotech.workspace_id,
+                        dataset_id=dataset_id,
+                        dataset_part_id=existing_parts[_p_name],
+                    )
+                    LOGGER.info(T("coal.services.dataset.part_replaced").format(part_name=_p_name))
+                else:
+                    LOGGER.warning(T("coal.services.dataset.part_skipped").format(part_name=_p_name))
+                    continue
+
+            # Create new part
+            part_request = DatasetPartCreateRequest(
+                name=_p_name,
+                description=_p_name,
+                sourceName=_p_name,
+                type=_type,
+            )
+
+            with _p_path.open("rb") as _p_file:
+                self.create_dataset_part(
+                    organization_id=self.configuration.cosmotech.organization_id,
+                    workspace_id=self.configuration.cosmotech.workspace_id,
+                    dataset_id=dataset_id,
+                    dataset_part_create_request=part_request,
+                    file=(_p_name, _p_file.read()),
+                )
+            LOGGER.debug(T("coal.services.dataset.part_uploaded").format(part_name=_p_name))
+
+        # Return updated dataset
+        updated_dataset = self.get_dataset(
+            organization_id=self.configuration.cosmotech.organization_id,
+            workspace_id=self.configuration.cosmotech.workspace_id,
+            dataset_id=dataset_id,
+        )
+
+        LOGGER.info(T("coal.services.dataset.parts_uploaded").format(dataset_id=dataset_id))
+        return updated_dataset
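
Note: upload_dataset_parts is new in 2.1.0rc1 and makes the replace-or-skip choice explicit: with replace_existing=True a part whose source name already exists is deleted and recreated, otherwise it is skipped with a warning. A usage sketch (the dataset ID and file path are hypothetical):

    from cosmotech.coal.cosmotech_api.apis.dataset import DatasetApi
    from cosmotech.coal.utils.configuration import Configuration

    ds_api = DatasetApi(Configuration())
    updated = ds_api.upload_dataset_parts(
        dataset_id="d-0123456789ab",      # hypothetical ID
        as_files=["data/customers.csv"],  # uploaded as FILE-type parts
        replace_existing=True,            # delete-then-recreate same-named parts
    )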

cosmotech/coal/cosmotech_api/apis/runner.py
CHANGED
@@ -29,43 +29,47 @@ class RunnerApi(BaseRunnerApi, Connection):
 
     def get_runner_metadata(
         self,
-
-        workspace_id: str,
-        runner_id: str,
+        runner_id: Optional[str] = None,
         include: Optional[list[str]] = None,
         exclude: Optional[list[str]] = None,
     ) -> dict[str, Any]:
-        runner = self.get_runner(
+        runner = self.get_runner(
+            self.configuration.cosmotech.organization_id,
+            self.configuration.cosmotech.workspace_id,
+            runner_id or self.configuration.cosmotech.runner_id,
+        )
 
         return runner.model_dump(by_alias=True, exclude_none=True, include=include, exclude=exclude, mode="json")
 
     def download_runner_data(
         self,
-
-        workspace_id: str,
-        runner_id: str,
-        parameter_folder: str,
-        dataset_folder: Optional[str] = None,
+        download_datasets: Optional[str] = None,
     ):
         LOGGER.info(T("coal.cosmotech_api.runner.starting_download"))
 
         # Get runner data
-
+        runner = self.get_runner(
+            self.configuration.cosmotech.organization_id,
+            self.configuration.cosmotech.workspace_id,
+            self.configuration.cosmotech.runner_id,
+        )
 
         # Skip if no parameters found
-        if not
+        if not runner.parameters_values:
             LOGGER.warning(T("coal.cosmotech_api.runner.no_parameters"))
         else:
             LOGGER.info(T("coal.cosmotech_api.runner.loaded_data"))
-            parameters = Parameters(
-            parameters.write_parameters_to_json(
+            parameters = Parameters(runner)
+            parameters.write_parameters_to_json(self.configuration.cosmotech.parameters_absolute_path)
 
-
-
-
+        if runner.datasets.parameter:
+            ds_api = DatasetApi(self.configuration)
+            ds_api.download_parameter(runner.datasets.parameter)
 
-
-
+        # Download datasets if requested
+        if download_datasets:
+            LOGGER.info(T("coal.cosmotech_api.runner.downloading_datasets").format(count=len(runner.datasets.bases)))
+            if runner.datasets.bases:
                 ds_api = DatasetApi(self.configuration)
-                for dataset_id in
+                for dataset_id in runner.datasets.bases:
                     ds_api.download_dataset(dataset_id)
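
Note: both RunnerApi methods now read organization, workspace, and runner IDs from the connection's configuration instead of positional arguments, and download destinations come from configured paths rather than the removed parameter_folder/dataset_folder arguments. A sketch under those assumptions:

    from cosmotech.coal.cosmotech_api.apis.runner import RunnerApi
    from cosmotech.coal.utils.configuration import Configuration

    runner_api = RunnerApi(Configuration())
    metadata = runner_api.get_runner_metadata()  # runner_id falls back to cosmotech.runner_id
    # download_datasets is typed Optional[str]; any truthy value enables the dataset branch
    runner_api.download_runner_data(download_datasets="true")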

cosmotech/coal/postgresql/runner.py
CHANGED
@@ -38,8 +38,6 @@ send_runner_metadata_to_postgresql(
     # Get runner metadata
     _runner_api = RunnerApi(configuration)
     runner = _runner_api.get_runner_metadata(
-        configuration.cosmotech.organization_id,
-        configuration.cosmotech.workspace_id,
         configuration.cosmotech.runner_id,
     )
 
@@ -51,7 +49,7 @@ send_runner_metadata_to_postgresql(
         CREATE TABLE IF NOT EXISTS {schema_table} (
             id varchar(32) PRIMARY KEY,
             name varchar(256),
-
+            last_csm_run_id varchar(32),
             run_template_id varchar(32)
         );
     """
@@ -60,24 +58,25 @@ send_runner_metadata_to_postgresql(
     conn.commit()
     LOGGER.info(T("coal.services.postgresql.metadata"))
     sql_upsert = f"""
-        INSERT INTO {schema_table} (id, name,
-        VALUES(
+        INSERT INTO {schema_table} (id, name, last_csm_run_id, run_template_id)
+        VALUES ($1, $2, $3, $4)
         ON CONFLICT (id)
         DO
-            UPDATE SET name = EXCLUDED.name,
+            UPDATE SET name = EXCLUDED.name, last_csm_run_id = EXCLUDED.last_csm_run_id;
     """
+    LOGGER.debug(runner)
     curs.execute(
         sql_upsert,
         (
             runner.get("id"),
             runner.get("name"),
-            runner.get("lastRunId"),
+            runner.get("lastRunInfo").get("lastRunId"),
             runner.get("runTemplateId"),
         ),
     )
     conn.commit()
     LOGGER.info(T("coal.services.postgresql.metadata_updated"))
-    return runner.get("lastRunId")
+    return runner.get("lastRunInfo").get("lastRunId")
 
 
 def remove_runner_metadata_from_postgresql(
@@ -97,8 +96,6 @@ remove_runner_metadata_from_postgresql(
     # Get runner metadata
     _runner_api = RunnerApi(configuration)
     runner = _runner_api.get_runner_metadata(
-        configuration.cosmotech.organization_id,
-        configuration.cosmotech.workspace_id,
         configuration.cosmotech.runner_id,
     )
 
@@ -108,7 +105,7 @@ remove_runner_metadata_from_postgresql(
     schema_table = f"{_psql.db_schema}.{_psql.table_prefix}RunnerMetadata"
     sql_delete_from_metatable = f"""
         DELETE FROM {schema_table}
-        WHERE
+        WHERE last_csm_run_id={runner.get("lastRunId")};
     """
     curs.execute(sql_delete_from_metatable)
     conn.commit()

cosmotech/coal/postgresql/store.py
CHANGED
@@ -53,24 +53,27 @@ def dump_store_to_postgresql(
         selected_tables: list of tables to send
         fk_id: foreign key id to add to all table on all rows
     """
-    _c = Configuration(
-
-
-
-
-
-
-
-
-
-
-
+    _c = Configuration(
+        {
+            "coal": {"store": store_folder},
+            "postgres": {
+                "host": postgres_host,
+                "port": postgres_port,
+                "db_name": postgres_db,
+                "db_schema": postgres_schema,
+                "user_name": postgres_user,
+                "user_password": postgres_password,
+                "password_encoding": force_encode,
+                "table_prefix": table_prefix,
+            },
+        }
     )
 
+    dump_store_to_postgresql_from_conf(configuration=_c, replace=replace, selected_tables=selected_tables, fk_id=fk_id)
+
 
 def dump_store_to_postgresql_from_conf(
     configuration: Configuration,
-    store_folder: str,
     replace: bool = True,
     selected_tables: list[str] = [],
     fk_id: str = None,
@@ -80,14 +83,12 @@ def dump_store_to_postgresql_from_conf(
 
     Args:
         configuration: coal Configuration
-        store_folder: Folder containing the Store
        replace: Whether to replace existing tables
        selected_tables: list of tables to send
        fk_id: foreign key id to add to all table on all rows
    """
    _psql = PostgresUtils(configuration)
-
-    _s = Store(store_location=store_folder)
+    _s = Store(configuration=configuration)
 
     tables = list(_s.list_tables())
     if selected_tables:
@@ -100,18 +101,12 @@ def dump_store_to_postgresql_from_conf(
         _s_time = perf_counter()
         target_table_name = f"{_psql.table_prefix}{table_name}"
         LOGGER.info(T("coal.services.database.table_entry").format(table=target_table_name))
-        if fk_id:
-            _s.execute_query(
-                f"""
-                ALTER TABLE {_psql.table_prefix}{table_name}
-                ADD run_id TEXT NOT NULL
-                DEFAULT ({fk_id})
-                """
-            )
         data = _s.get_table(table_name)
         if not len(data):
             LOGGER.info(T("coal.services.database.no_rows"))
             continue
+        if fk_id:
+            data = data.append_column("csm_run_id", [[fk_id] * data.num_rows])
         _dl_time = perf_counter()
         rows = _psql.send_pyarrow_table_to_postgresql(
             data,
@@ -120,7 +115,7 @@ def dump_store_to_postgresql_from_conf(
         )
         if fk_id and _psql.is_metadata_exists():
             metadata_table = f"{_psql.table_prefix}RunnerMetadata"
-            _psql.add_fk_constraint(table_name, "
+            _psql.add_fk_constraint(table_name, "csm_run_id", metadata_table, "last_csm_run_id")
 
         total_rows += rows
         _up_time = perf_counter()
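
Note: the fk_id handling moved from an ALTER TABLE on the local store to appending a csm_run_id column to the in-memory PyArrow table before upload. A standalone sketch of that append (table content is illustrative):

    import pyarrow as pa

    data = pa.table({"value": [1, 2, 3]})
    fk_id = "run-42"  # hypothetical run ID
    # one csm_run_id cell per existing row, passed as a single chunk,
    # mirroring the new dump logic above
    data = data.append_column("csm_run_id", [[fk_id] * data.num_rows])
    print(data.column_names)  # ['value', 'csm_run_id']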

cosmotech/coal/postgresql/utils.py
CHANGED
@@ -75,7 +75,8 @@ class PostgresUtils:
             f"/{self.db_name}"
         )
 
-
+    @property
+    def metadata_table_name(self) -> str:
         return f"{self.table_prefix}RunnerMetadata"
 
     def get_postgresql_table_schema(self, target_table_name: str) -> Optional[pa.Schema]:

cosmotech/coal/singlestore/store.py
CHANGED
@@ -12,15 +12,16 @@ This module provides functions for interacting with SingleStore databases
 for store operations.
 """
 
+import csv
 import pathlib
 import time
-
+
 import singlestoredb as s2
+from cosmotech.orchestrator.utils.translate import T
 
 from cosmotech.coal.store.csv import store_csv_file
 from cosmotech.coal.store.store import Store
 from cosmotech.coal.utils.logger import LOGGER
-from cosmotech.orchestrator.utils.translate import T
 
 
 def _get_data(table_name: str, output_directory: str, cursor) -> None: