cosmotech-acceleration-library 1.0.0 (cosmotech_acceleration_library-1.0.0-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cosmotech/coal/__init__.py +8 -0
- cosmotech/coal/aws/__init__.py +23 -0
- cosmotech/coal/aws/s3.py +235 -0
- cosmotech/coal/azure/__init__.py +23 -0
- cosmotech/coal/azure/adx/__init__.py +26 -0
- cosmotech/coal/azure/adx/auth.py +125 -0
- cosmotech/coal/azure/adx/ingestion.py +329 -0
- cosmotech/coal/azure/adx/query.py +56 -0
- cosmotech/coal/azure/adx/runner.py +217 -0
- cosmotech/coal/azure/adx/store.py +255 -0
- cosmotech/coal/azure/adx/tables.py +118 -0
- cosmotech/coal/azure/adx/utils.py +71 -0
- cosmotech/coal/azure/blob.py +109 -0
- cosmotech/coal/azure/functions.py +72 -0
- cosmotech/coal/azure/storage.py +74 -0
- cosmotech/coal/cosmotech_api/__init__.py +36 -0
- cosmotech/coal/cosmotech_api/connection.py +96 -0
- cosmotech/coal/cosmotech_api/dataset/__init__.py +26 -0
- cosmotech/coal/cosmotech_api/dataset/converters.py +164 -0
- cosmotech/coal/cosmotech_api/dataset/download/__init__.py +19 -0
- cosmotech/coal/cosmotech_api/dataset/download/adt.py +119 -0
- cosmotech/coal/cosmotech_api/dataset/download/common.py +140 -0
- cosmotech/coal/cosmotech_api/dataset/download/file.py +216 -0
- cosmotech/coal/cosmotech_api/dataset/download/twingraph.py +188 -0
- cosmotech/coal/cosmotech_api/dataset/utils.py +132 -0
- cosmotech/coal/cosmotech_api/parameters.py +48 -0
- cosmotech/coal/cosmotech_api/run.py +25 -0
- cosmotech/coal/cosmotech_api/run_data.py +173 -0
- cosmotech/coal/cosmotech_api/run_template.py +108 -0
- cosmotech/coal/cosmotech_api/runner/__init__.py +28 -0
- cosmotech/coal/cosmotech_api/runner/data.py +38 -0
- cosmotech/coal/cosmotech_api/runner/datasets.py +364 -0
- cosmotech/coal/cosmotech_api/runner/download.py +146 -0
- cosmotech/coal/cosmotech_api/runner/metadata.py +42 -0
- cosmotech/coal/cosmotech_api/runner/parameters.py +157 -0
- cosmotech/coal/cosmotech_api/twin_data_layer.py +512 -0
- cosmotech/coal/cosmotech_api/workspace.py +127 -0
- cosmotech/coal/csm/__init__.py +6 -0
- cosmotech/coal/csm/engine/__init__.py +47 -0
- cosmotech/coal/postgresql/__init__.py +22 -0
- cosmotech/coal/postgresql/runner.py +93 -0
- cosmotech/coal/postgresql/store.py +98 -0
- cosmotech/coal/singlestore/__init__.py +17 -0
- cosmotech/coal/singlestore/store.py +100 -0
- cosmotech/coal/store/__init__.py +42 -0
- cosmotech/coal/store/csv.py +44 -0
- cosmotech/coal/store/native_python.py +25 -0
- cosmotech/coal/store/pandas.py +26 -0
- cosmotech/coal/store/pyarrow.py +23 -0
- cosmotech/coal/store/store.py +79 -0
- cosmotech/coal/utils/__init__.py +18 -0
- cosmotech/coal/utils/api.py +68 -0
- cosmotech/coal/utils/logger.py +10 -0
- cosmotech/coal/utils/postgresql.py +236 -0
- cosmotech/csm_data/__init__.py +6 -0
- cosmotech/csm_data/commands/__init__.py +6 -0
- cosmotech/csm_data/commands/adx_send_data.py +92 -0
- cosmotech/csm_data/commands/adx_send_runnerdata.py +119 -0
- cosmotech/csm_data/commands/api/__init__.py +6 -0
- cosmotech/csm_data/commands/api/api.py +50 -0
- cosmotech/csm_data/commands/api/postgres_send_runner_metadata.py +119 -0
- cosmotech/csm_data/commands/api/rds_load_csv.py +90 -0
- cosmotech/csm_data/commands/api/rds_send_csv.py +74 -0
- cosmotech/csm_data/commands/api/rds_send_store.py +74 -0
- cosmotech/csm_data/commands/api/run_load_data.py +120 -0
- cosmotech/csm_data/commands/api/runtemplate_load_handler.py +66 -0
- cosmotech/csm_data/commands/api/tdl_load_files.py +76 -0
- cosmotech/csm_data/commands/api/tdl_send_files.py +82 -0
- cosmotech/csm_data/commands/api/wsf_load_file.py +66 -0
- cosmotech/csm_data/commands/api/wsf_send_file.py +68 -0
- cosmotech/csm_data/commands/az_storage_upload.py +76 -0
- cosmotech/csm_data/commands/s3_bucket_delete.py +107 -0
- cosmotech/csm_data/commands/s3_bucket_download.py +118 -0
- cosmotech/csm_data/commands/s3_bucket_upload.py +128 -0
- cosmotech/csm_data/commands/store/__init__.py +6 -0
- cosmotech/csm_data/commands/store/dump_to_azure.py +120 -0
- cosmotech/csm_data/commands/store/dump_to_postgresql.py +107 -0
- cosmotech/csm_data/commands/store/dump_to_s3.py +169 -0
- cosmotech/csm_data/commands/store/list_tables.py +48 -0
- cosmotech/csm_data/commands/store/load_csv_folder.py +43 -0
- cosmotech/csm_data/commands/store/load_from_singlestore.py +96 -0
- cosmotech/csm_data/commands/store/reset.py +31 -0
- cosmotech/csm_data/commands/store/store.py +37 -0
- cosmotech/csm_data/main.py +57 -0
- cosmotech/csm_data/utils/__init__.py +6 -0
- cosmotech/csm_data/utils/click.py +18 -0
- cosmotech/csm_data/utils/decorators.py +75 -0
- cosmotech/orchestrator_plugins/csm-data/__init__.py +11 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/postgres_send_runner_metadata.json +40 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/rds_load_csv.json +27 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_csv.json +27 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_store.json +27 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/run_load_data.json +30 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/runtemplate_load_handler.json +27 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_load_files.json +32 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_send_files.json +27 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/try_api_connection.json +9 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/wsf_load_file.json +36 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/wsf_send_file.json +36 -0
- cosmotech/orchestrator_plugins/csm-data/templates/main/adx_send_runnerdata.json +29 -0
- cosmotech/orchestrator_plugins/csm-data/templates/main/az_storage_upload.json +25 -0
- cosmotech/orchestrator_plugins/csm-data/templates/main/s3_bucket_delete.json +31 -0
- cosmotech/orchestrator_plugins/csm-data/templates/main/s3_bucket_download.json +34 -0
- cosmotech/orchestrator_plugins/csm-data/templates/main/s3_bucket_upload.json +35 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_dump_to_azure.json +35 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_dump_to_postgresql.json +34 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_dump_to_s3.json +36 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_list_tables.json +15 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_load_csv_folder.json +18 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_load_from_singlestore.json +34 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_reset.json +15 -0
- cosmotech/translation/coal/__init__.py +6 -0
- cosmotech/translation/coal/en-US/coal/common/data_transfer.yml +6 -0
- cosmotech/translation/coal/en-US/coal/common/errors.yml +9 -0
- cosmotech/translation/coal/en-US/coal/common/file_operations.yml +6 -0
- cosmotech/translation/coal/en-US/coal/common/progress.yml +6 -0
- cosmotech/translation/coal/en-US/coal/common/timing.yml +5 -0
- cosmotech/translation/coal/en-US/coal/common/validation.yml +8 -0
- cosmotech/translation/coal/en-US/coal/cosmotech_api/connection.yml +10 -0
- cosmotech/translation/coal/en-US/coal/cosmotech_api/run_data.yml +2 -0
- cosmotech/translation/coal/en-US/coal/cosmotech_api/run_template.yml +8 -0
- cosmotech/translation/coal/en-US/coal/cosmotech_api/runner.yml +16 -0
- cosmotech/translation/coal/en-US/coal/cosmotech_api/solution.yml +5 -0
- cosmotech/translation/coal/en-US/coal/cosmotech_api/workspace.yml +7 -0
- cosmotech/translation/coal/en-US/coal/services/adx.yml +59 -0
- cosmotech/translation/coal/en-US/coal/services/api.yml +8 -0
- cosmotech/translation/coal/en-US/coal/services/azure_storage.yml +14 -0
- cosmotech/translation/coal/en-US/coal/services/database.yml +19 -0
- cosmotech/translation/coal/en-US/coal/services/dataset.yml +68 -0
- cosmotech/translation/coal/en-US/coal/services/postgresql.yml +28 -0
- cosmotech/translation/coal/en-US/coal/services/s3.yml +9 -0
- cosmotech/translation/coal/en-US/coal/solution.yml +3 -0
- cosmotech/translation/coal/en-US/coal/web.yml +2 -0
- cosmotech/translation/csm_data/__init__.py +6 -0
- cosmotech/translation/csm_data/en-US/csm-data.yml +434 -0
- cosmotech_acceleration_library-1.0.0.dist-info/METADATA +255 -0
- cosmotech_acceleration_library-1.0.0.dist-info/RECORD +141 -0
- cosmotech_acceleration_library-1.0.0.dist-info/WHEEL +5 -0
- cosmotech_acceleration_library-1.0.0.dist-info/entry_points.txt +2 -0
- cosmotech_acceleration_library-1.0.0.dist-info/licenses/LICENSE +17 -0
- cosmotech_acceleration_library-1.0.0.dist-info/top_level.txt +1 -0
cosmotech/coal/azure/adx/ingestion.py (new file, 329 lines):

```python
# Copyright (C) - 2023 - 2025 - Cosmo Tech
# This document and all information contained herein is the exclusive property -
# including all intellectual property rights pertaining thereto - of Cosmo Tech.
# Any use, reproduction, translation, broadcasting, transmission, distribution,
# etc., to any person is prohibited unless it has been previously and
# specifically authorized by written means by Cosmo Tech.

from enum import Enum
from typing import Dict
from typing import Iterator
from typing import List
from typing import Optional
from typing import Tuple

import os
import pandas as pd
import time
import tqdm
from azure.kusto.data import KustoClient
from azure.kusto.data.data_format import DataFormat
from azure.kusto.ingest import IngestionProperties
from azure.kusto.ingest import QueuedIngestClient
from azure.kusto.ingest import ReportLevel
from azure.kusto.ingest.status import FailureMessage
from azure.kusto.ingest.status import KustoIngestStatusQueues
from azure.kusto.ingest.status import SuccessMessage
from cosmotech.orchestrator.utils.translate import T

from cosmotech.coal.azure.adx.tables import create_table, _drop_by_tag
from cosmotech.coal.azure.adx.utils import type_mapping
from cosmotech.coal.utils.logger import LOGGER


class IngestionStatus(Enum):
    QUEUED = "QUEUED"
    SUCCESS = "SUCCESS"
    FAILURE = "FAILURE"
    UNKNOWN = "UNKNOWN"
    TIMEOUT = "TIMED OUT"


# Global dictionaries to track ingestion status
_ingest_status: Dict[str, IngestionStatus] = {}
_ingest_times: Dict[str, float] = {}


def ingest_dataframe(
    client: QueuedIngestClient,
    database: str,
    table_name: str,
    dataframe: pd.DataFrame,
    drop_by_tag: Optional[str] = None,
):
    """
    Ingest a pandas DataFrame into an ADX table.

    Args:
        client: The QueuedIngestClient to use
        database: The name of the database
        table_name: The name of the table
        dataframe: The DataFrame to ingest
        drop_by_tag: Tag used for the drop-by capacity of the Cosmotech API

    Returns:
        The ingestion result with source_id for status tracking
    """
    LOGGER.debug(T("coal.services.adx.ingesting_dataframe").format(table_name=table_name, rows=len(dataframe)))

    drop_by_tags = [drop_by_tag] if (drop_by_tag is not None) else None

    properties = IngestionProperties(
        database=database,
        table=table_name,
        data_format=DataFormat.CSV,
        drop_by_tags=drop_by_tags,
        report_level=ReportLevel.FailuresAndSuccesses,
    )

    ingestion_result = client.ingest_from_dataframe(dataframe, ingestion_properties=properties)

    # Track the ingestion status
    source_id = str(ingestion_result.source_id)
    _ingest_status[source_id] = IngestionStatus.QUEUED
    _ingest_times[source_id] = time.time()

    LOGGER.debug(T("coal.services.adx.ingestion_queued").format(source_id=source_id))

    return ingestion_result


def send_to_adx(
    query_client: KustoClient,
    ingest_client: QueuedIngestClient,
    database: str,
    dict_list: List[Dict],
    table_name: str,
    ignore_table_creation: bool = True,
    drop_by_tag: Optional[str] = None,
):
    """
    Send a list of dictionaries to an ADX table.

    Args:
        query_client: The KustoClient for querying
        ingest_client: The QueuedIngestClient for ingestion
        database: The name of the database
        dict_list: The list of dictionaries to send
        table_name: The name of the table
        ignore_table_creation: If False, will create the table if it doesn't exist
        drop_by_tag: Tag used for the drop-by capacity of the Cosmotech API

    Returns:
        The ingestion result with source_id for status tracking
    """
    LOGGER.debug(T("coal.services.adx.sending_to_adx").format(table_name=table_name, items=len(dict_list)))

    if not dict_list:
        LOGGER.warning(T("coal.services.adx.empty_dict_list"))
        return None

    if not ignore_table_creation:
        # If the target table does not exist, create it.
        # First derive the column types needed for the table
        types = {k: type_mapping(k, dict_list[0][k]) for k in dict_list[0].keys()}

        # Then try to create the table
        if not create_table(query_client, database, table_name, types):
            LOGGER.error(T("coal.services.adx.table_creation_failed").format(table_name=table_name))
            return False

    # Create a dataframe with the data to write and send it to ADX
    df = pd.DataFrame(dict_list)
    return ingest_dataframe(ingest_client, database, table_name, df, drop_by_tag)


def check_ingestion_status(
    client: QueuedIngestClient,
    source_ids: List[str],
    timeout: Optional[int] = None,
) -> Iterator[Tuple[str, IngestionStatus]]:
    """
    Check the status of ingestion operations.

    Args:
        client: The QueuedIngestClient to use
        source_ids: List of source IDs to check
        timeout: Timeout in seconds (default: 900)

    Returns:
        Iterator of (source_id, status) tuples
    """
    default_timeout = 900
    remaining_ids = []

    # First yield any already known statuses
    for source_id in source_ids:
        if source_id not in _ingest_status:
            _ingest_status[source_id] = IngestionStatus.UNKNOWN
            _ingest_times[source_id] = time.time()

        if _ingest_status[source_id] not in [
            IngestionStatus.QUEUED,
            IngestionStatus.UNKNOWN,
        ]:
            yield source_id, _ingest_status[source_id]
        else:
            remaining_ids.append(source_id)

    if not remaining_ids:
        return

    LOGGER.debug(T("coal.services.adx.checking_status").format(count=len(remaining_ids)))

    # Get status queues
    qs = KustoIngestStatusQueues(client)

    def get_messages(queues):
        _r = []
        for q in queues:
            _r.extend(((q, m) for m in q.receive_messages(messages_per_page=32, visibility_timeout=1)))
        return _r

    successes = get_messages(qs.success._get_queues())
    failures = get_messages(qs.failure._get_queues())

    LOGGER.debug(T("coal.services.adx.status_messages").format(success=len(successes), failure=len(failures)))

    queued_ids = list(remaining_ids)
    # Process success and failure messages
    for messages, cast_func, status, log_function in [
        (successes, SuccessMessage, IngestionStatus.SUCCESS, LOGGER.debug),
        (failures, FailureMessage, IngestionStatus.FAILURE, LOGGER.error),
    ]:
        for _q, _m in messages:
            dm = cast_func(_m.content)
            to_check_ids = remaining_ids[:]

            for source_id in to_check_ids:
                if dm.IngestionSourceId == str(source_id):
                    _ingest_status[source_id] = status

                    log_function(T("coal.services.adx.status_found").format(source_id=source_id, status=status.value))

                    _q.delete_message(_m)
                    remaining_ids.remove(source_id)
                    break
            else:
                # The message did not correspond to a known ID
                continue

    # Check for timeouts
    actual_timeout = timeout if timeout is not None else default_timeout
    for source_id in remaining_ids:
        if time.time() - _ingest_times[source_id] > actual_timeout:
            _ingest_status[source_id] = IngestionStatus.TIMEOUT
            LOGGER.warning(T("coal.services.adx.ingestion_timeout").format(source_id=source_id))

    # Yield results for remaining IDs
    for source_id in queued_ids:
        yield source_id, _ingest_status[source_id]


def monitor_ingestion(
    ingest_client: QueuedIngestClient, source_ids: List[str], table_ingestion_id_mapping: Dict[str, str]
) -> bool:
    """
    Monitor the ingestion process with progress reporting.

    Args:
        ingest_client: The ingest client
        source_ids: List of source IDs to monitor
        table_ingestion_id_mapping: Mapping of source IDs to table names

    Returns:
        bool: True if any failures occurred, False otherwise
    """
    has_failures = False
    source_ids_copy = source_ids.copy()

    LOGGER.info(T("coal.services.adx.waiting_ingestion"))

    with tqdm.tqdm(desc="Ingestion status", total=len(source_ids_copy)) as pbar:
        while any(
            list(
                map(
                    lambda _status: _status[1] in (IngestionStatus.QUEUED, IngestionStatus.UNKNOWN),
                    results := list(check_ingestion_status(ingest_client, source_ids_copy)),
                )
            )
        ):
            # Check for failures
            for ingestion_id, ingestion_status in results:
                if ingestion_status == IngestionStatus.FAILURE:
                    LOGGER.error(
                        T("coal.services.adx.ingestion_failed").format(
                            ingestion_id=ingestion_id, table=table_ingestion_id_mapping.get(ingestion_id)
                        )
                    )
                    has_failures = True

            cleared_ids = list(
                result for result in results if result[1] not in (IngestionStatus.QUEUED, IngestionStatus.UNKNOWN)
            )

            for ingestion_id, ingestion_status in cleared_ids:
                pbar.update(1)
                source_ids_copy.remove(ingestion_id)

            time.sleep(1)
            if os.environ.get("CSM_USE_RICH", "False").lower() in ("true", "1", "yes", "t", "y"):
                pbar.refresh()
        else:
            for ingestion_id, ingestion_status in results:
                if ingestion_status == IngestionStatus.FAILURE:
                    LOGGER.error(
                        T("coal.services.adx.ingestion_failed").format(
                            ingestion_id=ingestion_id, table=table_ingestion_id_mapping.get(ingestion_id)
                        )
                    )
                    has_failures = True
            pbar.update(len(source_ids_copy))

    LOGGER.info(T("coal.services.adx.ingestion_completed"))
    return has_failures


def handle_failures(kusto_client: KustoClient, database: str, operation_tag: str, has_failures: bool) -> bool:
    """
    Handle any failures and perform rollbacks if needed.

    Args:
        kusto_client: The Kusto client
        database: The database name
        operation_tag: The operation tag for tracking
        has_failures: Whether any failures occurred

    Returns:
        bool: True if the process should abort, False otherwise
    """
    if has_failures:
        LOGGER.warning(T("coal.services.adx.failures_detected").format(operation_tag=operation_tag))
        _drop_by_tag(kusto_client, database, operation_tag)
        return True
    return False


def clear_ingestion_status_queues(client: QueuedIngestClient, confirmation: bool = False):
    """
    Clear all data in the ingestion status queues.
    DANGEROUS: This will clear all queues for the entire ADX cluster.

    Args:
        client: The QueuedIngestClient to use
        confirmation: Must be True to proceed with clearing
    """
    if not confirmation:
        LOGGER.warning(T("coal.services.adx.clear_queues_no_confirmation"))
        return

    LOGGER.warning(T("coal.services.adx.clearing_queues"))
    qs = KustoIngestStatusQueues(client)

    while not qs.success.is_empty():
        qs.success.pop(32)

    while not qs.failure.is_empty():
        qs.failure.pop(32)

    LOGGER.info(T("coal.services.adx.queues_cleared"))
```
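For orientation, here is a minimal usage sketch of this module. The cluster URIs, database, table name, and tag are placeholder values; `initialize_clients` is the helper from `cosmotech.coal.azure.adx.auth` that the runner module below also uses to build its client pair:

```python
from cosmotech.coal.azure.adx.auth import initialize_clients
from cosmotech.coal.azure.adx.ingestion import (
    handle_failures,
    monitor_ingestion,
    send_to_adx,
)

# Placeholder cluster URIs - substitute your own query and ingest endpoints.
kusto_client, ingest_client = initialize_clients(
    "https://mycluster.westeurope.kusto.windows.net",
    "https://ingest-mycluster.westeurope.kusto.windows.net",
)

rows = [{"id": "a", "value": 1}, {"id": "b", "value": 2}]
result = send_to_adx(
    kusto_client,
    ingest_client,
    database="MyDatabase",        # placeholder database name
    dict_list=rows,
    table_name="MyTable",         # placeholder table name
    ignore_table_creation=False,  # create the table from the first row's inferred types
    drop_by_tag="run-1234",       # tag that handle_failures can use to roll back
)

# send_to_adx returns None for an empty list and False if table creation failed.
if result not in (None, False):
    source_id = str(result.source_id)
    # Poll the status queues until the ingestion succeeds, fails, or times out,
    # then drop the tagged data if anything failed.
    failed = monitor_ingestion(ingest_client, [source_id], {source_id: "MyTable"})
    handle_failures(kusto_client, "MyDatabase", "run-1234", failed)
```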
cosmotech/coal/azure/adx/query.py (new file, 56 lines):

```python
# Copyright (C) - 2023 - 2025 - Cosmo Tech
# This document and all information contained herein is the exclusive property -
# including all intellectual property rights pertaining thereto - of Cosmo Tech.
# Any use, reproduction, translation, broadcasting, transmission, distribution,
# etc., to any person is prohibited unless it has been previously and
# specifically authorized by written means by Cosmo Tech.

from azure.kusto.data import KustoClient
from azure.kusto.data.response import KustoResponseDataSet

from cosmotech.coal.utils.logger import LOGGER
from cosmotech.orchestrator.utils.translate import T


def run_query(client: KustoClient, database: str, query: str) -> KustoResponseDataSet:
    """
    Execute a simple query on the database.

    Args:
        client: The KustoClient to use
        database: The name of the database
        query: The query to execute

    Returns:
        KustoResponseDataSet: The results of the query
    """
    LOGGER.debug(T("coal.services.adx.running_query").format(database=database, query=query))

    result = client.execute(database, query)
    LOGGER.debug(
        T("coal.services.adx.query_complete").format(
            rows=len(result.primary_results[0]) if result.primary_results else 0
        )
    )

    return result


def run_command_query(client: KustoClient, database: str, query: str) -> KustoResponseDataSet:
    """
    Execute a command query on the database.

    Args:
        client: The KustoClient to use
        database: The name of the database
        query: The query to execute

    Returns:
        KustoResponseDataSet: The results of the query
    """
    LOGGER.debug(T("coal.services.adx.running_command").format(database=database, query=query))

    result = client.execute_mgmt(database, query)
    LOGGER.debug(T("coal.services.adx.command_complete"))

    return result
```
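A short usage sketch, assuming `kusto_client` is built as in the example above and `MyDatabase`/`MyTable` are placeholders:

```python
# Data queries go through the query endpoint...
response = run_query(kusto_client, "MyDatabase", "MyTable | take 10")
for row in response.primary_results[0]:
    print(row)

# ...while dot-prefixed management commands go through execute_mgmt.
tables = run_command_query(kusto_client, "MyDatabase", ".show tables")
```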
cosmotech/coal/azure/adx/runner.py (new file, 217 lines):

```python
# Copyright (C) - 2023 - 2025 - Cosmo Tech
# This document and all information contained herein is the exclusive property -
# including all intellectual property rights pertaining thereto - of Cosmo Tech.
# Any use, reproduction, translation, broadcasting, transmission, distribution,
# etc., to any person is prohibited unless it has been previously and
# specifically authorized by written means by Cosmo Tech.

"""
ADX runner data ingestion module.

This module provides functions for ingesting runner data into Azure Data Explorer.
"""

import pathlib
import time
from collections import defaultdict
from typing import Dict, Any, List, Tuple, Optional

from azure.kusto.data.response import KustoResponseDataSet
from azure.kusto.ingest import ColumnMapping
from azure.kusto.ingest import FileDescriptor
from azure.kusto.ingest import IngestionMappingKind
from azure.kusto.ingest import IngestionProperties
from azure.kusto.ingest import IngestionResult
from azure.kusto.ingest import ReportLevel

from azure.kusto.data import KustoClient
from azure.kusto.ingest import QueuedIngestClient

from cosmotech.coal.azure.adx.auth import initialize_clients
from cosmotech.coal.azure.adx.query import run_query, run_command_query
from cosmotech.coal.azure.adx.ingestion import check_ingestion_status, IngestionStatus
from cosmotech.coal.utils.logger import LOGGER
from cosmotech.orchestrator.utils.translate import T


def prepare_csv_content(folder_path: str) -> Dict[str, Dict[str, Any]]:
    """
    Navigate through `folder_path` to generate CSV information for each CSV file in it.

    Args:
        folder_path: Path to the folder containing CSV files

    Returns:
        A map of filename to file_infos
        file_infos:
            dict:
                filename -> filename as a string without path & extension
                headers -> map of column_name -> column_type
    """
    content = dict()
    root = pathlib.Path(folder_path)
    for _file in root.rglob("*.csv"):
        with open(_file) as _csv_content:
            header = _csv_content.readline().replace("@", "").strip()
        headers = header.split(",") if header else list()
        cols = {k.strip(): "string" for k in headers}
        csv_datas = {"filename": _file.name.removesuffix(".csv"), "headers": cols}
        content[str(_file)] = csv_datas
    LOGGER.debug(T("coal.services.adx.content_debug").format(content=content))

    return content


def construct_create_query(files_data: Dict[str, Dict[str, Any]]) -> Dict[str, str]:
    """
    Construct ADX table creation queries for the given CSV files.

    Args:
        files_data: Map of filename to file_infos as returned by prepare_csv_content

    Returns:
        Map of table_name to creation query
    """
    queries = dict()
    for file_path, file_info in files_data.items():
        filename = file_info.get("filename")
        fields = file_info.get("headers")
        query = f".create-merge table {filename} ({','.join(':'.join((k, v)) for k, v in fields.items())})"
        queries[filename] = query
    return queries


def insert_csv_files(
    files_data: Dict[str, Dict[str, Any]],
    ingest_client: QueuedIngestClient,
    runner_id: str,
    database: str,
    wait: bool = False,
    wait_limit: int = 5,
    wait_duration: int = 8,
) -> None:
    """
    Insert CSV files into ADX tables.

    Args:
        files_data: Map of filename to file_infos as returned by prepare_csv_content
        ingest_client: The QueuedIngestClient for ingestion
        runner_id: Runner ID to use as a tag
        database: ADX database name
        wait: Whether to wait for ingestion to complete
        wait_limit: Number of retries while waiting
        wait_duration: Duration between each try while waiting
    """
    ingestion_ids = dict()
    for file_path, file_info in files_data.items():
        filename = file_info.get("filename")
        fields = file_info.get("headers")
        with open(file_path) as _f:
            file_size = sum(map(len, _f.readlines()))
        LOGGER.debug(T("coal.common.data_transfer.sending_data").format(size=file_size))
        fd = FileDescriptor(file_path, file_size)
        ord = 0
        mappings = list()
        for column, _type in fields.items():
            mapping = ColumnMapping(column_name=column, column_type=_type, ordinal=ord)
            ord += 1
            mappings.append(mapping)
        run_col = ColumnMapping(
            column_name="run",
            column_type="string",
            ordinal=ord,
            const_value=runner_id,
        )
        mappings.append(run_col)
        ingestion_properties = IngestionProperties(
            database=database,
            table=filename,
            column_mappings=mappings,
            ingestion_mapping_kind=IngestionMappingKind.CSV,
            drop_by_tags=[
                runner_id,
            ],
            report_level=ReportLevel.FailuresAndSuccesses,
            additional_properties={"ignoreFirstRecord": "true"},
        )
        LOGGER.info(T("coal.services.adx.ingesting").format(table=filename))
        results: IngestionResult = ingest_client.ingest_from_file(fd, ingestion_properties)
        ingestion_ids[str(results.source_id)] = filename
    if wait:
        count = 0
        while any(
            map(
                lambda s: s[1] in (IngestionStatus.QUEUED, IngestionStatus.UNKNOWN),
                check_ingestion_status(ingest_client, source_ids=list(ingestion_ids.keys())),
            )
        ):
            count += 1
            if count > wait_limit:
                LOGGER.warning(T("coal.services.adx.max_retry"))
                break
            LOGGER.info(
                T("coal.services.adx.waiting_results").format(duration=wait_duration, count=count, limit=wait_limit)
            )
            time.sleep(wait_duration)

        LOGGER.info(T("coal.services.adx.status"))
        for _id, status in check_ingestion_status(ingest_client, source_ids=list(ingestion_ids.keys())):
            color = (
                "red"
                if status == IngestionStatus.FAILURE
                else "green"
                if status == IngestionStatus.SUCCESS
                else "bright_black"
            )
            LOGGER.info(
                T("coal.services.adx.status_report").format(table=ingestion_ids[_id], status=status.name, color=color)
            )
    else:
        LOGGER.info(T("coal.services.adx.no_wait"))


def send_runner_data(
    dataset_absolute_path: str,
    parameters_absolute_path: str,
    runner_id: str,
    adx_uri: str,
    adx_ingest_uri: str,
    database_name: str,
    send_parameters: bool = False,
    send_datasets: bool = False,
    wait: bool = False,
) -> None:
    """
    Send runner data to ADX.

    Args:
        dataset_absolute_path: Path to the dataset folder
        parameters_absolute_path: Path to the parameters folder
        runner_id: Runner ID to use as a tag
        adx_uri: ADX cluster URI
        adx_ingest_uri: ADX ingestion URI
        database_name: ADX database name
        send_parameters: Whether to send parameters
        send_datasets: Whether to send datasets
        wait: Whether to wait for ingestion to complete
    """
    csv_data = dict()
    if send_parameters:
        csv_data.update(prepare_csv_content(parameters_absolute_path))
    if send_datasets:
        csv_data.update(prepare_csv_content(dataset_absolute_path))
    queries = construct_create_query(csv_data)
    kusto_client, ingest_client = initialize_clients(adx_uri, adx_ingest_uri)
    for k, v in queries.items():
        LOGGER.info(T("coal.services.adx.creating_table").format(table_name=k, database=database_name))
        r: KustoResponseDataSet = run_query(kusto_client, database_name, v)
        if r.errors_count == 0:
            LOGGER.info(T("coal.services.adx.table_created").format(table_name=k))
        else:
            LOGGER.error(T("coal.services.adx.table_creation_failed").format(table_name=k, database=database_name))
            LOGGER.error(T("coal.services.adx.exceptions").format(exceptions=r.get_exceptions()))
            raise RuntimeError(f"Failed to create table {k}")
    insert_csv_files(
        files_data=csv_data, ingest_client=ingest_client, runner_id=runner_id, database=database_name, wait=wait
    )
```
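`send_runner_data` ties the pieces together: it scans the parameters and/or dataset folders, issues one `.create-merge` per CSV file, then queues one ingestion per file with a constant `run` column set to the runner ID. A minimal sketch, where every path, URI, and name below is a placeholder:

```python
from cosmotech.coal.azure.adx.runner import send_runner_data

send_runner_data(
    dataset_absolute_path="/mnt/scenariorun-data",           # placeholder folder of CSV files
    parameters_absolute_path="/mnt/scenariorun-parameters",  # placeholder folder of CSV files
    runner_id="r-abc123",                                    # becomes the drop-by tag and `run` column value
    adx_uri="https://mycluster.westeurope.kusto.windows.net",
    adx_ingest_uri="https://ingest-mycluster.westeurope.kusto.windows.net",
    database_name="MyDatabase",
    send_datasets=True,
    wait=True,  # poll check_ingestion_status, up to wait_limit tries of wait_duration seconds each
)
```

Judging by the file listing above, this same flow backs the `adx_send_runnerdata` command of the `csm_data` CLI.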