cosmotech-acceleration-library 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (141) hide show
  1. cosmotech/coal/__init__.py +8 -0
  2. cosmotech/coal/aws/__init__.py +23 -0
  3. cosmotech/coal/aws/s3.py +235 -0
  4. cosmotech/coal/azure/__init__.py +23 -0
  5. cosmotech/coal/azure/adx/__init__.py +26 -0
  6. cosmotech/coal/azure/adx/auth.py +125 -0
  7. cosmotech/coal/azure/adx/ingestion.py +329 -0
  8. cosmotech/coal/azure/adx/query.py +56 -0
  9. cosmotech/coal/azure/adx/runner.py +217 -0
  10. cosmotech/coal/azure/adx/store.py +255 -0
  11. cosmotech/coal/azure/adx/tables.py +118 -0
  12. cosmotech/coal/azure/adx/utils.py +71 -0
  13. cosmotech/coal/azure/blob.py +109 -0
  14. cosmotech/coal/azure/functions.py +72 -0
  15. cosmotech/coal/azure/storage.py +74 -0
  16. cosmotech/coal/cosmotech_api/__init__.py +36 -0
  17. cosmotech/coal/cosmotech_api/connection.py +96 -0
  18. cosmotech/coal/cosmotech_api/dataset/__init__.py +26 -0
  19. cosmotech/coal/cosmotech_api/dataset/converters.py +164 -0
  20. cosmotech/coal/cosmotech_api/dataset/download/__init__.py +19 -0
  21. cosmotech/coal/cosmotech_api/dataset/download/adt.py +119 -0
  22. cosmotech/coal/cosmotech_api/dataset/download/common.py +140 -0
  23. cosmotech/coal/cosmotech_api/dataset/download/file.py +216 -0
  24. cosmotech/coal/cosmotech_api/dataset/download/twingraph.py +188 -0
  25. cosmotech/coal/cosmotech_api/dataset/utils.py +132 -0
  26. cosmotech/coal/cosmotech_api/parameters.py +48 -0
  27. cosmotech/coal/cosmotech_api/run.py +25 -0
  28. cosmotech/coal/cosmotech_api/run_data.py +173 -0
  29. cosmotech/coal/cosmotech_api/run_template.py +108 -0
  30. cosmotech/coal/cosmotech_api/runner/__init__.py +28 -0
  31. cosmotech/coal/cosmotech_api/runner/data.py +38 -0
  32. cosmotech/coal/cosmotech_api/runner/datasets.py +364 -0
  33. cosmotech/coal/cosmotech_api/runner/download.py +146 -0
  34. cosmotech/coal/cosmotech_api/runner/metadata.py +42 -0
  35. cosmotech/coal/cosmotech_api/runner/parameters.py +157 -0
  36. cosmotech/coal/cosmotech_api/twin_data_layer.py +512 -0
  37. cosmotech/coal/cosmotech_api/workspace.py +127 -0
  38. cosmotech/coal/csm/__init__.py +6 -0
  39. cosmotech/coal/csm/engine/__init__.py +47 -0
  40. cosmotech/coal/postgresql/__init__.py +22 -0
  41. cosmotech/coal/postgresql/runner.py +93 -0
  42. cosmotech/coal/postgresql/store.py +98 -0
  43. cosmotech/coal/singlestore/__init__.py +17 -0
  44. cosmotech/coal/singlestore/store.py +100 -0
  45. cosmotech/coal/store/__init__.py +42 -0
  46. cosmotech/coal/store/csv.py +44 -0
  47. cosmotech/coal/store/native_python.py +25 -0
  48. cosmotech/coal/store/pandas.py +26 -0
  49. cosmotech/coal/store/pyarrow.py +23 -0
  50. cosmotech/coal/store/store.py +79 -0
  51. cosmotech/coal/utils/__init__.py +18 -0
  52. cosmotech/coal/utils/api.py +68 -0
  53. cosmotech/coal/utils/logger.py +10 -0
  54. cosmotech/coal/utils/postgresql.py +236 -0
  55. cosmotech/csm_data/__init__.py +6 -0
  56. cosmotech/csm_data/commands/__init__.py +6 -0
  57. cosmotech/csm_data/commands/adx_send_data.py +92 -0
  58. cosmotech/csm_data/commands/adx_send_runnerdata.py +119 -0
  59. cosmotech/csm_data/commands/api/__init__.py +6 -0
  60. cosmotech/csm_data/commands/api/api.py +50 -0
  61. cosmotech/csm_data/commands/api/postgres_send_runner_metadata.py +119 -0
  62. cosmotech/csm_data/commands/api/rds_load_csv.py +90 -0
  63. cosmotech/csm_data/commands/api/rds_send_csv.py +74 -0
  64. cosmotech/csm_data/commands/api/rds_send_store.py +74 -0
  65. cosmotech/csm_data/commands/api/run_load_data.py +120 -0
  66. cosmotech/csm_data/commands/api/runtemplate_load_handler.py +66 -0
  67. cosmotech/csm_data/commands/api/tdl_load_files.py +76 -0
  68. cosmotech/csm_data/commands/api/tdl_send_files.py +82 -0
  69. cosmotech/csm_data/commands/api/wsf_load_file.py +66 -0
  70. cosmotech/csm_data/commands/api/wsf_send_file.py +68 -0
  71. cosmotech/csm_data/commands/az_storage_upload.py +76 -0
  72. cosmotech/csm_data/commands/s3_bucket_delete.py +107 -0
  73. cosmotech/csm_data/commands/s3_bucket_download.py +118 -0
  74. cosmotech/csm_data/commands/s3_bucket_upload.py +128 -0
  75. cosmotech/csm_data/commands/store/__init__.py +6 -0
  76. cosmotech/csm_data/commands/store/dump_to_azure.py +120 -0
  77. cosmotech/csm_data/commands/store/dump_to_postgresql.py +107 -0
  78. cosmotech/csm_data/commands/store/dump_to_s3.py +169 -0
  79. cosmotech/csm_data/commands/store/list_tables.py +48 -0
  80. cosmotech/csm_data/commands/store/load_csv_folder.py +43 -0
  81. cosmotech/csm_data/commands/store/load_from_singlestore.py +96 -0
  82. cosmotech/csm_data/commands/store/reset.py +31 -0
  83. cosmotech/csm_data/commands/store/store.py +37 -0
  84. cosmotech/csm_data/main.py +57 -0
  85. cosmotech/csm_data/utils/__init__.py +6 -0
  86. cosmotech/csm_data/utils/click.py +18 -0
  87. cosmotech/csm_data/utils/decorators.py +75 -0
  88. cosmotech/orchestrator_plugins/csm-data/__init__.py +11 -0
  89. cosmotech/orchestrator_plugins/csm-data/templates/api/postgres_send_runner_metadata.json +40 -0
  90. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_load_csv.json +27 -0
  91. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_csv.json +27 -0
  92. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_store.json +27 -0
  93. cosmotech/orchestrator_plugins/csm-data/templates/api/run_load_data.json +30 -0
  94. cosmotech/orchestrator_plugins/csm-data/templates/api/runtemplate_load_handler.json +27 -0
  95. cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_load_files.json +32 -0
  96. cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_send_files.json +27 -0
  97. cosmotech/orchestrator_plugins/csm-data/templates/api/try_api_connection.json +9 -0
  98. cosmotech/orchestrator_plugins/csm-data/templates/api/wsf_load_file.json +36 -0
  99. cosmotech/orchestrator_plugins/csm-data/templates/api/wsf_send_file.json +36 -0
  100. cosmotech/orchestrator_plugins/csm-data/templates/main/adx_send_runnerdata.json +29 -0
  101. cosmotech/orchestrator_plugins/csm-data/templates/main/az_storage_upload.json +25 -0
  102. cosmotech/orchestrator_plugins/csm-data/templates/main/s3_bucket_delete.json +31 -0
  103. cosmotech/orchestrator_plugins/csm-data/templates/main/s3_bucket_download.json +34 -0
  104. cosmotech/orchestrator_plugins/csm-data/templates/main/s3_bucket_upload.json +35 -0
  105. cosmotech/orchestrator_plugins/csm-data/templates/store/store_dump_to_azure.json +35 -0
  106. cosmotech/orchestrator_plugins/csm-data/templates/store/store_dump_to_postgresql.json +34 -0
  107. cosmotech/orchestrator_plugins/csm-data/templates/store/store_dump_to_s3.json +36 -0
  108. cosmotech/orchestrator_plugins/csm-data/templates/store/store_list_tables.json +15 -0
  109. cosmotech/orchestrator_plugins/csm-data/templates/store/store_load_csv_folder.json +18 -0
  110. cosmotech/orchestrator_plugins/csm-data/templates/store/store_load_from_singlestore.json +34 -0
  111. cosmotech/orchestrator_plugins/csm-data/templates/store/store_reset.json +15 -0
  112. cosmotech/translation/coal/__init__.py +6 -0
  113. cosmotech/translation/coal/en-US/coal/common/data_transfer.yml +6 -0
  114. cosmotech/translation/coal/en-US/coal/common/errors.yml +9 -0
  115. cosmotech/translation/coal/en-US/coal/common/file_operations.yml +6 -0
  116. cosmotech/translation/coal/en-US/coal/common/progress.yml +6 -0
  117. cosmotech/translation/coal/en-US/coal/common/timing.yml +5 -0
  118. cosmotech/translation/coal/en-US/coal/common/validation.yml +8 -0
  119. cosmotech/translation/coal/en-US/coal/cosmotech_api/connection.yml +10 -0
  120. cosmotech/translation/coal/en-US/coal/cosmotech_api/run_data.yml +2 -0
  121. cosmotech/translation/coal/en-US/coal/cosmotech_api/run_template.yml +8 -0
  122. cosmotech/translation/coal/en-US/coal/cosmotech_api/runner.yml +16 -0
  123. cosmotech/translation/coal/en-US/coal/cosmotech_api/solution.yml +5 -0
  124. cosmotech/translation/coal/en-US/coal/cosmotech_api/workspace.yml +7 -0
  125. cosmotech/translation/coal/en-US/coal/services/adx.yml +59 -0
  126. cosmotech/translation/coal/en-US/coal/services/api.yml +8 -0
  127. cosmotech/translation/coal/en-US/coal/services/azure_storage.yml +14 -0
  128. cosmotech/translation/coal/en-US/coal/services/database.yml +19 -0
  129. cosmotech/translation/coal/en-US/coal/services/dataset.yml +68 -0
  130. cosmotech/translation/coal/en-US/coal/services/postgresql.yml +28 -0
  131. cosmotech/translation/coal/en-US/coal/services/s3.yml +9 -0
  132. cosmotech/translation/coal/en-US/coal/solution.yml +3 -0
  133. cosmotech/translation/coal/en-US/coal/web.yml +2 -0
  134. cosmotech/translation/csm_data/__init__.py +6 -0
  135. cosmotech/translation/csm_data/en-US/csm-data.yml +434 -0
  136. cosmotech_acceleration_library-1.0.0.dist-info/METADATA +255 -0
  137. cosmotech_acceleration_library-1.0.0.dist-info/RECORD +141 -0
  138. cosmotech_acceleration_library-1.0.0.dist-info/WHEEL +5 -0
  139. cosmotech_acceleration_library-1.0.0.dist-info/entry_points.txt +2 -0
  140. cosmotech_acceleration_library-1.0.0.dist-info/licenses/LICENSE +17 -0
  141. cosmotech_acceleration_library-1.0.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,329 @@
1
+ # Copyright (C) - 2023 - 2025 - Cosmo Tech
2
+ # This document and all information contained herein is the exclusive property -
3
+ # including all intellectual property rights pertaining thereto - of Cosmo Tech.
4
+ # Any use, reproduction, translation, broadcasting, transmission, distribution,
5
+ # etc., to any person is prohibited unless it has been previously and
6
+ # specifically authorized by written means by Cosmo Tech.
7
+
8
+ from enum import Enum
9
+ from typing import Dict
10
+ from typing import Iterator
11
+ from typing import List
12
+ from typing import Optional
13
+ from typing import Tuple
14
+
15
+ import os
16
+ import pandas as pd
17
+ import time
18
+ import tqdm
19
+ from azure.kusto.data import KustoClient
20
+ from azure.kusto.data.data_format import DataFormat
21
+ from azure.kusto.ingest import IngestionProperties
22
+ from azure.kusto.ingest import QueuedIngestClient
23
+ from azure.kusto.ingest import ReportLevel
24
+ from azure.kusto.ingest.status import FailureMessage
25
+ from azure.kusto.ingest.status import KustoIngestStatusQueues
26
+ from azure.kusto.ingest.status import SuccessMessage
27
+ from cosmotech.orchestrator.utils.translate import T
28
+
29
+ from cosmotech.coal.azure.adx.tables import create_table, _drop_by_tag
30
+ from cosmotech.coal.azure.adx.utils import type_mapping
31
+ from cosmotech.coal.utils.logger import LOGGER
32
+
33
+
34
class IngestionStatus(Enum):
    """Lifecycle states of a queued ADX ingestion operation.

    QUEUED and UNKNOWN are non-terminal (the operation may still settle);
    SUCCESS, FAILURE and TIMEOUT are terminal. TIMEOUT is assigned locally
    by check_ingestion_status when no status message arrives in time.
    """

    QUEUED = "QUEUED"
    SUCCESS = "SUCCESS"
    FAILURE = "FAILURE"
    UNKNOWN = "UNKNOWN"
    TIMEOUT = "TIMED OUT"
40
+
41
+
42
# Global dictionaries to track ingestion status
# Shared, module-wide state (not thread-safe): entries are added by
# ingest_dataframe / check_ingestion_status and never pruned.
# source_id -> last known IngestionStatus for that ingestion operation
_ingest_status: Dict[str, IngestionStatus] = {}
# source_id -> time.time() when the operation was first seen; used by
# check_ingestion_status to decide when an operation has timed out
_ingest_times: Dict[str, float] = {}
45
+
46
+
47
def ingest_dataframe(
    client: QueuedIngestClient,
    database: str,
    table_name: str,
    dataframe: pd.DataFrame,
    drop_by_tag: Optional[str] = None,
):
    """Queue a pandas DataFrame for ingestion into an ADX table.

    The DataFrame is serialized as CSV and handed to the queued ingest
    client; the operation is registered in the module-level status maps so
    it can later be tracked with ``check_ingestion_status``.

    Args:
        client: The QueuedIngestClient to use
        database: The name of the database
        table_name: The name of the table
        dataframe: The DataFrame to ingest
        drop_by_tag: Tag used for the drop by capacity of the Cosmotech API

    Returns:
        The ingestion result with source_id for status tracking
    """
    LOGGER.debug(T("coal.services.adx.ingesting_dataframe").format(table_name=table_name, rows=len(dataframe)))

    tags = None
    if drop_by_tag is not None:
        tags = [drop_by_tag]

    ingestion_props = IngestionProperties(
        database=database,
        table=table_name,
        data_format=DataFormat.CSV,
        drop_by_tags=tags,
        report_level=ReportLevel.FailuresAndSuccesses,
    )

    result = client.ingest_from_dataframe(dataframe, ingestion_properties=ingestion_props)

    # Register the new operation so later status checks can resolve it.
    sid = str(result.source_id)
    _ingest_status[sid] = IngestionStatus.QUEUED
    _ingest_times[sid] = time.time()

    LOGGER.debug(T("coal.services.adx.ingestion_queued").format(source_id=sid))

    return result
89
+
90
+
91
def send_to_adx(
    query_client: KustoClient,
    ingest_client: QueuedIngestClient,
    database: str,
    dict_list: List[Dict],
    table_name: str,
    ignore_table_creation: bool = True,
    drop_by_tag: Optional[str] = None,
):
    """Send a list of dictionaries to an ADX table.

    Optionally creates the target table first (types inferred from the
    first record), then queues all records for ingestion as one DataFrame.

    Args:
        query_client: The KustoClient for querying
        ingest_client: The QueuedIngestClient for ingestion
        database: The name of the database
        dict_list: The list of dictionaries to send
        table_name: The name of the table
        ignore_table_creation: If False, will create the table if it doesn't exist
        drop_by_tag: Tag used for the drop by capacity of the Cosmotech API

    Returns:
        ``None`` when ``dict_list`` is empty, ``False`` when table creation
        was requested and failed, otherwise the ingestion result (with
        ``source_id`` for status tracking).
    """
    LOGGER.debug(T("coal.services.adx.sending_to_adx").format(table_name=table_name, items=len(dict_list)))

    if not dict_list:
        LOGGER.warning(T("coal.services.adx.empty_dict_list"))
        return None

    if not ignore_table_creation:
        # Infer an ADX column type for every key of the first record, then
        # try to create the table with that schema.
        first_record = dict_list[0]
        column_types = {key: type_mapping(key, value) for key, value in first_record.items()}
        if not create_table(query_client, database, table_name, column_types):
            LOGGER.error(T("coal.services.adx.table_creation_failed").format(table_name=table_name))
            return False

    # Hand the records to the queued ingestion pipeline as a DataFrame.
    return ingest_dataframe(ingest_client, database, table_name, pd.DataFrame(dict_list), drop_by_tag)
134
+
135
+
136
def check_ingestion_status(
    client: QueuedIngestClient,
    source_ids: List[str],
    timeout: Optional[int] = None,
) -> Iterator[Tuple[str, IngestionStatus]]:
    """Check the status of ingestion operations.

    Generator: terminal statuses already cached in the module-level
    ``_ingest_status`` map are yielded immediately; for the rest, the
    cluster's success/failure status queues are drained once and matched
    against the pending source ids, then a timeout check is applied.

    Args:
        client: The QueuedIngestClient to use
        source_ids: List of source IDs to check
        timeout: Timeout in seconds (default: 900)

    Returns:
        Iterator of (source_id, status) tuples

    NOTE(review): this reads the status queues of the whole cluster, so a
    matched message is deleted for everyone; unmatched messages are left in
    place (visibility_timeout=1 makes them reappear quickly for other
    readers) — confirm this sharing behavior is intended.
    """
    default_timeout = 900
    remaining_ids = []

    # First yield any already known statuses
    for source_id in source_ids:
        # Unknown ids are registered now so the timeout clock starts here.
        if source_id not in _ingest_status:
            _ingest_status[source_id] = IngestionStatus.UNKNOWN
            _ingest_times[source_id] = time.time()

        if _ingest_status[source_id] not in [
            IngestionStatus.QUEUED,
            IngestionStatus.UNKNOWN,
        ]:
            yield source_id, _ingest_status[source_id]
        else:
            remaining_ids.append(source_id)

    if not remaining_ids:
        return

    LOGGER.debug(T("coal.services.adx.checking_status").format(count=len(remaining_ids)))

    # Get status queues
    # NOTE(review): _get_queues() is a private azure-kusto-ingest API; it may
    # break on SDK upgrades — confirm against the pinned SDK version.
    qs = KustoIngestStatusQueues(client)

    def get_messages(queues):
        # Pull one page (up to 32 messages) from every underlying queue.
        _r = []
        for q in queues:
            _r.extend(((q, m) for m in q.receive_messages(messages_per_page=32, visibility_timeout=1)))
        return _r

    successes = get_messages(qs.success._get_queues())
    failures = get_messages(qs.failure._get_queues())

    LOGGER.debug(T("coal.services.adx.status_messages").format(success=len(successes), failure=len(failures)))

    # Snapshot the pending ids: results are yielded for all of them at the
    # end, even those resolved while draining the queues below.
    queued_ids = list(remaining_ids)
    # Process success and failure messages
    for messages, cast_func, status, log_function in [
        (successes, SuccessMessage, IngestionStatus.SUCCESS, LOGGER.debug),
        (failures, FailureMessage, IngestionStatus.FAILURE, LOGGER.error),
    ]:
        for _q, _m in messages:
            dm = cast_func(_m.content)
            # Iterate over a copy: remaining_ids is mutated on a match.
            to_check_ids = remaining_ids[:]

            for source_id in to_check_ids:
                if dm.IngestionSourceId == str(source_id):
                    _ingest_status[source_id] = status

                    log_function(T("coal.services.adx.status_found").format(source_id=source_id, status=status.value))

                    # Only messages matched to one of our ids are consumed.
                    _q.delete_message(_m)
                    remaining_ids.remove(source_id)
                    break
            else:
                # The message did not correspond to a known ID
                # (for/else: no break occurred; this continue is effectively
                # a no-op since the loop would advance anyway)
                continue

    # Check for timeouts
    actual_timeout = timeout if timeout is not None else default_timeout
    for source_id in remaining_ids:
        if time.time() - _ingest_times[source_id] > actual_timeout:
            _ingest_status[source_id] = IngestionStatus.TIMEOUT
            LOGGER.warning(T("coal.services.adx.ingestion_timeout").format(source_id=source_id))

    # Yield results for remaining IDs
    for source_id in queued_ids:
        yield source_id, _ingest_status[source_id]
221
+
222
+
223
def monitor_ingestion(
    ingest_client: QueuedIngestClient, source_ids: List[str], table_ingestion_id_mapping: Dict[str, str]
) -> bool:
    """Monitor the ingestion process with progress reporting.

    Polls ``check_ingestion_status`` once per second until no operation is
    left in a non-terminal state, logging each failure and advancing a tqdm
    progress bar as operations settle.

    Args:
        ingest_client: The ingest client
        source_ids: List of source IDs to monitor
        table_ingestion_id_mapping: Mapping of source IDs to table names

    Returns:
        bool: True if any failures occurred, False otherwise
    """
    has_failures = False
    pending_ids = source_ids.copy()
    non_terminal = (IngestionStatus.QUEUED, IngestionStatus.UNKNOWN)

    LOGGER.info(T("coal.services.adx.waiting_ingestion"))

    with tqdm.tqdm(desc="Ingestion status", total=len(pending_ids)) as pbar:
        while True:
            results = list(check_ingestion_status(ingest_client, pending_ids))
            still_pending = any(status in non_terminal for _, status in results)

            if not still_pending:
                # Final pass: log any failures from the last poll and close
                # out the progress bar for everything still being tracked.
                for ingestion_id, ingestion_status in results:
                    if ingestion_status == IngestionStatus.FAILURE:
                        LOGGER.error(
                            T("coal.services.adx.ingestion_failed").format(
                                ingestion_id=ingestion_id, table=table_ingestion_id_mapping.get(ingestion_id)
                            )
                        )
                        has_failures = True
                pbar.update(len(pending_ids))
                break

            # Report failures seen in this poll.
            for ingestion_id, ingestion_status in results:
                if ingestion_status == IngestionStatus.FAILURE:
                    LOGGER.error(
                        T("coal.services.adx.ingestion_failed").format(
                            ingestion_id=ingestion_id, table=table_ingestion_id_mapping.get(ingestion_id)
                        )
                    )
                    has_failures = True

            # Drop settled operations from the polling set and reflect them
            # on the progress bar.
            for ingestion_id, ingestion_status in results:
                if ingestion_status not in non_terminal:
                    pbar.update(1)
                    pending_ids.remove(ingestion_id)

            time.sleep(1)
            # Rich-based terminals need an explicit refresh to repaint.
            if os.environ.get("CSM_USE_RICH", "False").lower() in ("true", "1", "yes", "t", "y"):
                pbar.refresh()

    LOGGER.info(T("coal.services.adx.ingestion_completed"))
    return has_failures
285
+
286
+
287
+ def handle_failures(kusto_client: KustoClient, database: str, operation_tag: str, has_failures: bool) -> bool:
288
+ """
289
+ Handle any failures and perform rollbacks if needed.
290
+
291
+ Args:
292
+ kusto_client: The Kusto client
293
+ database: The database name
294
+ operation_tag: The operation tag for tracking
295
+ has_failures: Whether any failures occurred
296
+
297
+ Returns:
298
+ bool: True if the process should abort, False otherwise
299
+ """
300
+ if has_failures:
301
+ LOGGER.warning(T("coal.services.adx.failures_detected").format(operation_tag=operation_tag))
302
+ _drop_by_tag(kusto_client, database, operation_tag)
303
+ return True
304
+ return False
305
+
306
+
307
def clear_ingestion_status_queues(client: QueuedIngestClient, confirmation: bool = False):
    """Clear all data in the ingestion status queues.

    DANGEROUS: this drains the success and failure status queues for the
    entire ADX cluster, not only the operations started by this process.

    Args:
        client: The QueuedIngestClient to use
        confirmation: Must be True to proceed with clearing
    """
    if not confirmation:
        LOGGER.warning(T("coal.services.adx.clear_queues_no_confirmation"))
        return

    LOGGER.warning(T("coal.services.adx.clearing_queues"))
    status_queues = KustoIngestStatusQueues(client)

    # Pop messages in batches of 32 until each queue is empty.
    for queue in (status_queues.success, status_queues.failure):
        while not queue.is_empty():
            queue.pop(32)

    LOGGER.info(T("coal.services.adx.queues_cleared"))
@@ -0,0 +1,56 @@
1
+ # Copyright (C) - 2023 - 2025 - Cosmo Tech
2
+ # This document and all information contained herein is the exclusive property -
3
+ # including all intellectual property rights pertaining thereto - of Cosmo Tech.
4
+ # Any use, reproduction, translation, broadcasting, transmission, distribution,
5
+ # etc., to any person is prohibited unless it has been previously and
6
+ # specifically authorized by written means by Cosmo Tech.
7
+
8
+ from azure.kusto.data import KustoClient
9
+ from azure.kusto.data.response import KustoResponseDataSet
10
+
11
+ from cosmotech.coal.utils.logger import LOGGER
12
+ from cosmotech.orchestrator.utils.translate import T
13
+
14
+
15
def run_query(client: KustoClient, database: str, query: str) -> KustoResponseDataSet:
    """Execute a simple query on the database.

    Args:
        client: The KustoClient to use
        database: The name of the database
        query: The query to execute

    Returns:
        KustoResponseDataSet: The results of the query
    """
    LOGGER.debug(T("coal.services.adx.running_query").format(database=database, query=query))

    response = client.execute(database, query)

    # Report how many rows came back (0 when there is no primary result set).
    row_count = len(response.primary_results[0]) if response.primary_results else 0
    LOGGER.debug(T("coal.services.adx.query_complete").format(rows=row_count))

    return response
37
+
38
+
39
def run_command_query(client: KustoClient, database: str, query: str) -> KustoResponseDataSet:
    """Execute a command query on the database.

    Args:
        client: The KustoClient to use
        database: The name of the database
        query: The query to execute

    Returns:
        KustoResponseDataSet: The results of the query
    """
    LOGGER.debug(T("coal.services.adx.running_command").format(database=database, query=query))

    # Management commands (dot-prefixed queries) go through the mgmt endpoint.
    response = client.execute_mgmt(database, query)
    LOGGER.debug(T("coal.services.adx.command_complete"))

    return response
@@ -0,0 +1,217 @@
1
+ # Copyright (C) - 2023 - 2025 - Cosmo Tech
2
+ # This document and all information contained herein is the exclusive property -
3
+ # including all intellectual property rights pertaining thereto - of Cosmo Tech.
4
+ # Any use, reproduction, translation, broadcasting, transmission, distribution,
5
+ # etc., to any person is prohibited unless it has been previously and
6
+ # specifically authorized by written means by Cosmo Tech.
7
+
8
+ """
9
+ ADX runner data ingestion module.
10
+
11
+ This module provides functions for ingesting runner data into Azure Data Explorer.
12
+ """
13
+
14
+ import pathlib
15
+ import time
16
+ from collections import defaultdict
17
+ from typing import Dict, Any, List, Tuple, Optional
18
+
19
+ from azure.kusto.data.response import KustoResponseDataSet
20
+ from azure.kusto.ingest import ColumnMapping
21
+ from azure.kusto.ingest import FileDescriptor
22
+ from azure.kusto.ingest import IngestionMappingKind
23
+ from azure.kusto.ingest import IngestionProperties
24
+ from azure.kusto.ingest import IngestionResult
25
+ from azure.kusto.ingest import ReportLevel
26
+
27
+ from azure.kusto.data import KustoClient
28
+ from azure.kusto.ingest import QueuedIngestClient
29
+
30
+ from cosmotech.coal.azure.adx.auth import initialize_clients
31
+ from cosmotech.coal.azure.adx.query import run_query, run_command_query
32
+ from cosmotech.coal.azure.adx.ingestion import check_ingestion_status, IngestionStatus
33
+ from cosmotech.coal.utils.logger import LOGGER
34
+ from cosmotech.orchestrator.utils.translate import T
35
+
36
+
37
def prepare_csv_content(folder_path: str) -> Dict[str, Dict[str, Any]]:
    """Navigate through `folder_path` to generate csv information for each csv file in it.

    Only the header line of each file is read; every column is mapped to the
    ADX ``string`` type, and ``@`` markers are stripped from header names.

    Args:
        folder_path: Path to the folder containing CSV files

    Returns:
        A map of filename to file_infos
        file infos:
            dict:
                filename -> filename as a string without path & extension
                headers -> map of column_name -> column_type
    """
    content = {}
    for csv_file in pathlib.Path(folder_path).rglob("*.csv"):
        with open(csv_file) as handle:
            first_line = handle.readline().replace("@", "").strip()
        column_names = first_line.split(",") if first_line else []
        content[str(csv_file)] = {
            "filename": csv_file.name.removesuffix(".csv"),
            "headers": {name.strip(): "string" for name in column_names},
        }
    LOGGER.debug(T("coal.services.adx.content_debug").format(content=content))

    return content
63
+
64
+
65
def construct_create_query(files_data: Dict[str, Dict[str, Any]]) -> Dict[str, str]:
    """Construct ADX table creation queries for the given CSV files.

    Builds one ``.create-merge table`` command per file, targeting a table
    named after the file and declaring one ``column:type`` pair per CSV
    header.

    Args:
        files_data: Map of filename to file_infos as returned by prepare_csv_content

    Returns:
        Map of table_name to creation query
    """
    queries = dict()
    for file_path, file_info in files_data.items():
        filename = file_info.get("filename")
        fields = file_info.get("headers")
        # Bug fix: the table must be named after the file. The previous code
        # emitted a literal "(unknown)" placeholder instead of interpolating
        # the filename, so every command targeted the same bogus table.
        query = f".create-merge table {filename} ({','.join(':'.join((k, v)) for k, v in fields.items())})"
        queries[filename] = query
    return queries
82
+
83
+
84
def insert_csv_files(
    files_data: Dict[str, Dict[str, Any]],
    ingest_client: QueuedIngestClient,
    runner_id: str,
    database: str,
    wait: bool = False,
    wait_limit: int = 5,
    wait_duration: int = 8,
) -> None:
    """Insert CSV files into ADX tables.

    Each file is queued for ingestion into the table named after it, with
    an extra constant ``run`` column holding ``runner_id`` and a drop-by
    tag of ``runner_id`` for later rollback. Optionally polls until all
    ingestions settle.

    Args:
        files_data: Map of filename to file_infos as returned by prepare_csv_content
        ingest_client: The QueuedIngestClient for ingestion
        runner_id: Runner ID to use as a tag
        database: ADX database name
        wait: Whether to wait for ingestion to complete
        wait_limit: Number of retries while waiting
        wait_duration: Duration between each try while waiting
    """
    # source_id -> table name, used to label status reports below.
    ingestion_ids = dict()
    for file_path, file_info in files_data.items():
        filename = file_info.get("filename")
        fields = file_info.get("headers")
        # File size is required by FileDescriptor; computed by summing line
        # lengths (reads the whole file into memory).
        with open(file_path) as _f:
            file_size = sum(map(len, _f.readlines()))
        LOGGER.debug(T("coal.common.data_transfer.sending_data").format(size=file_size))
        fd = FileDescriptor(file_path, file_size)
        # NOTE(review): `ord` shadows the builtin of the same name inside
        # this loop; it is just the running column ordinal.
        ord = 0
        mappings = list()
        # One CSV column mapping per header, in file order.
        for column, _type in fields.items():
            mapping = ColumnMapping(column_name=column, column_type=_type, ordinal=ord)
            ord += 1
            mappings.append(mapping)
        # Extra constant column tagging every ingested row with the runner id.
        run_col = ColumnMapping(
            column_name="run",
            column_type="string",
            ordinal=ord,
            const_value=runner_id,
        )
        mappings.append(run_col)
        ingestion_properties = IngestionProperties(
            database=database,
            table=filename,
            column_mappings=mappings,
            ingestion_mapping_kind=IngestionMappingKind.CSV,
            # Extents tagged with the runner id can be dropped as a rollback.
            drop_by_tags=[
                runner_id,
            ],
            report_level=ReportLevel.FailuresAndSuccesses,
            # The CSV header row must not be ingested as data.
            additional_properties={"ignoreFirstRecord": "true"},
        )
        LOGGER.info(T("coal.services.adx.ingesting").format(table=filename))
        results: IngestionResult = ingest_client.ingest_from_file(fd, ingestion_properties)
        ingestion_ids[str(results.source_id)] = filename
    if wait:
        # Poll until nothing is left QUEUED/UNKNOWN, up to wait_limit tries.
        count = 0
        while any(
            map(
                lambda s: s[1] in (IngestionStatus.QUEUED, IngestionStatus.UNKNOWN),
                check_ingestion_status(ingest_client, source_ids=list(ingestion_ids.keys())),
            )
        ):
            count += 1
            if count > wait_limit:
                LOGGER.warning(T("coal.services.adx.max_retry"))
                break
            LOGGER.info(
                T("coal.services.adx.waiting_results").format(duration=wait_duration, count=count, limit=wait_limit)
            )
            time.sleep(wait_duration)

        # Final per-table status report (color is only a display hint).
        LOGGER.info(T("coal.services.adx.status"))
        for _id, status in check_ingestion_status(ingest_client, source_ids=list(ingestion_ids.keys())):
            color = (
                "red"
                if status == IngestionStatus.FAILURE
                else "green"
                if status == IngestionStatus.SUCCESS
                else "bright_black"
            )
            LOGGER.info(
                T("coal.services.adx.status_report").format(table=ingestion_ids[_id], status=status.name, color=color)
            )
    else:
        LOGGER.info(T("coal.services.adx.no_wait"))
173
+
174
def send_runner_data(
    dataset_absolute_path: str,
    parameters_absolute_path: str,
    runner_id: str,
    adx_uri: str,
    adx_ingest_uri: str,
    database_name: str,
    send_parameters: bool = False,
    send_datasets: bool = False,
    wait: bool = False,
) -> None:
    """Send runner data to ADX.

    Scans the selected folders for CSV files, creates one ADX table per
    file, then queues every file for ingestion tagged with the runner id.

    Args:
        dataset_absolute_path: Path to the dataset folder
        parameters_absolute_path: Path to the parameters folder
        runner_id: Runner ID to use as a tag
        adx_uri: ADX cluster URI
        adx_ingest_uri: ADX ingestion URI
        database_name: ADX database name
        send_parameters: Whether to send parameters
        send_datasets: Whether to send datasets
        wait: Whether to wait for ingestion to complete

    Raises:
        RuntimeError: If a table creation query reports errors.
    """
    csv_data = {}
    if send_parameters:
        csv_data.update(prepare_csv_content(parameters_absolute_path))
    if send_datasets:
        csv_data.update(prepare_csv_content(dataset_absolute_path))

    kusto_client, ingest_client = initialize_clients(adx_uri, adx_ingest_uri)

    # Create every target table before queuing any data.
    for table_name, create_query in construct_create_query(csv_data).items():
        LOGGER.info(T("coal.services.adx.creating_table").format(table_name=table_name, database=database_name))
        response: KustoResponseDataSet = run_query(kusto_client, database_name, create_query)
        if response.errors_count != 0:
            LOGGER.error(
                T("coal.services.adx.table_creation_failed").format(table_name=table_name, database=database_name)
            )
            LOGGER.error(T("coal.services.adx.exceptions").format(exceptions=response.get_exceptions()))
            raise RuntimeError(f"Failed to create table {table_name}")
        LOGGER.info(T("coal.services.adx.table_created").format(table_name=table_name))

    insert_csv_files(
        files_data=csv_data, ingest_client=ingest_client, runner_id=runner_id, database=database_name, wait=wait
    )