cosmotech-acceleration-library 1.0.0-py3-none-any.whl → 1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. cosmotech/coal/__init__.py +1 -1
  2. cosmotech/coal/azure/adx/runner.py +1 -3
  3. cosmotech/coal/cosmotech_api/__init__.py +10 -6
  4. cosmotech/coal/cosmotech_api/dataset/download/file.py +117 -104
  5. cosmotech/coal/cosmotech_api/dataset/download/twingraph.py +10 -13
  6. cosmotech/coal/cosmotech_api/dataset/upload.py +41 -0
  7. cosmotech/coal/cosmotech_api/runner/datasets.py +71 -19
  8. cosmotech/coal/cosmotech_api/runner/download.py +3 -14
  9. cosmotech/coal/postgresql/runner.py +3 -1
  10. cosmotech/coal/postgresql/store.py +3 -0
  11. cosmotech/coal/utils/decorator.py +25 -0
  12. cosmotech/coal/utils/semver.py +6 -0
  13. cosmotech/csm_data/commands/adx_send_data.py +7 -7
  14. cosmotech/csm_data/commands/adx_send_runnerdata.py +10 -10
  15. cosmotech/csm_data/commands/api/api.py +1 -1
  16. cosmotech/csm_data/commands/api/postgres_send_runner_metadata.py +23 -11
  17. cosmotech/csm_data/commands/api/rds_load_csv.py +8 -8
  18. cosmotech/csm_data/commands/api/rds_send_csv.py +6 -6
  19. cosmotech/csm_data/commands/api/rds_send_store.py +6 -6
  20. cosmotech/csm_data/commands/api/run_load_data.py +10 -10
  21. cosmotech/csm_data/commands/api/runtemplate_load_handler.py +5 -5
  22. cosmotech/csm_data/commands/api/tdl_load_files.py +6 -6
  23. cosmotech/csm_data/commands/api/tdl_send_files.py +7 -7
  24. cosmotech/csm_data/commands/api/wsf_load_file.py +10 -8
  25. cosmotech/csm_data/commands/api/wsf_send_file.py +10 -8
  26. cosmotech/csm_data/commands/az_storage_upload.py +6 -6
  27. cosmotech/csm_data/commands/s3_bucket_delete.py +8 -8
  28. cosmotech/csm_data/commands/s3_bucket_download.py +9 -9
  29. cosmotech/csm_data/commands/s3_bucket_upload.py +10 -10
  30. cosmotech/csm_data/commands/store/dump_to_azure.py +9 -9
  31. cosmotech/csm_data/commands/store/dump_to_postgresql.py +22 -10
  32. cosmotech/csm_data/commands/store/dump_to_s3.py +10 -10
  33. cosmotech/csm_data/commands/store/list_tables.py +3 -3
  34. cosmotech/csm_data/commands/store/load_csv_folder.py +3 -3
  35. cosmotech/csm_data/commands/store/load_from_singlestore.py +8 -8
  36. cosmotech/csm_data/commands/store/reset.py +2 -2
  37. cosmotech/csm_data/commands/store/store.py +1 -2
  38. cosmotech/csm_data/main.py +8 -6
  39. cosmotech/csm_data/utils/decorators.py +1 -1
  40. cosmotech/translation/csm_data/en-US/csm_data/commands/api/api.yml +8 -0
  41. cosmotech/translation/csm_data/en-US/csm_data/commands/api/postgres_send_runner_metadata.yml +17 -0
  42. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_load_csv.yml +13 -0
  43. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_csv.yml +12 -0
  44. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_store.yml +12 -0
  45. cosmotech/translation/csm_data/en-US/csm_data/commands/api/run_load_data.yml +15 -0
  46. cosmotech/translation/csm_data/en-US/csm_data/commands/api/runtemplate_load_handler.yml +7 -0
  47. cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_load_files.yml +14 -0
  48. cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_send_files.yml +18 -0
  49. cosmotech/translation/csm_data/en-US/csm_data/commands/api/wsf_load_file.yml +10 -0
  50. cosmotech/translation/csm_data/en-US/csm_data/commands/api/wsf_send_file.yml +12 -0
  51. cosmotech/translation/csm_data/en-US/csm_data/commands/main.yml +9 -0
  52. cosmotech/translation/csm_data/en-US/csm_data/commands/storage/adx_send_data.yml +8 -0
  53. cosmotech/translation/csm_data/en-US/csm_data/commands/storage/adx_send_runnerdata.yml +15 -0
  54. cosmotech/translation/csm_data/en-US/csm_data/commands/storage/az_storage_upload.yml +8 -0
  55. cosmotech/translation/csm_data/en-US/csm_data/commands/storage/s3_bucket_delete.yml +17 -0
  56. cosmotech/translation/csm_data/en-US/csm_data/commands/storage/s3_bucket_download.yml +18 -0
  57. cosmotech/translation/csm_data/en-US/csm_data/commands/storage/s3_bucket_upload.yml +21 -0
  58. cosmotech/translation/csm_data/en-US/csm_data/commands/storage/storage.yml +4 -0
  59. cosmotech/translation/csm_data/en-US/csm_data/commands/store/dump_to_azure.yml +23 -0
  60. cosmotech/translation/csm_data/en-US/csm_data/commands/store/dump_to_postgresql.yml +20 -0
  61. cosmotech/translation/csm_data/en-US/csm_data/commands/store/dump_to_s3.yml +26 -0
  62. cosmotech/translation/csm_data/en-US/csm_data/commands/store/list_tables.yml +5 -0
  63. cosmotech/translation/csm_data/en-US/csm_data/commands/store/load_csv_folder.yml +5 -0
  64. cosmotech/translation/csm_data/en-US/csm_data/commands/store/load_from_singlestore.yml +16 -0
  65. cosmotech/translation/csm_data/en-US/csm_data/commands/store/reset.yml +4 -0
  66. cosmotech/translation/csm_data/en-US/csm_data/commands/store/store.yml +4 -0
  67. cosmotech/translation/csm_data/en-US/csm_data/commons/decorators.yml +2 -0
  68. cosmotech/translation/csm_data/en-US/csm_data/commons/version.yml +4 -0
  69. {cosmotech_acceleration_library-1.0.0.dist-info → cosmotech_acceleration_library-1.1.0.dist-info}/METADATA +13 -14
  70. {cosmotech_acceleration_library-1.0.0.dist-info → cosmotech_acceleration_library-1.1.0.dist-info}/RECORD +74 -44
  71. {cosmotech_acceleration_library-1.0.0.dist-info → cosmotech_acceleration_library-1.1.0.dist-info}/WHEEL +1 -1
  72. cosmotech/coal/utils/api.py +0 -68
  73. cosmotech/translation/csm_data/en-US/csm-data.yml +0 -434
  74. {cosmotech_acceleration_library-1.0.0.dist-info → cosmotech_acceleration_library-1.1.0.dist-info}/entry_points.txt +0 -0
  75. {cosmotech_acceleration_library-1.0.0.dist-info → cosmotech_acceleration_library-1.1.0.dist-info}/licenses/LICENSE +0 -0
  76. {cosmotech_acceleration_library-1.0.0.dist-info → cosmotech_acceleration_library-1.1.0.dist-info}/top_level.txt +0 -0
@@ -5,4 +5,4 @@
  # etc., to any person is prohibited unless it has been previously and
  # specifically authorized by written means by Cosmo Tech.
 
- __version__ = "1.0.0"
+ __version__ = "1.1.0"
@@ -160,9 +160,7 @@ def insert_csv_files(
      color = (
          "red"
          if status == IngestionStatus.FAILURE
-         else "green"
-         if status == IngestionStatus.SUCCESS
-         else "bright_black"
+         else "green" if status == IngestionStatus.SUCCESS else "bright_black"
      )
      LOGGER.info(
          T("coal.services.adx.status_report").format(table=ingestion_ids[_id], status=status.name, color=color)
@@ -16,12 +16,16 @@ from cosmotech.coal.cosmotech_api.parameters import (
      write_parameters,
  )
 
- # Re-export functions from the twin_data_layer module
- from cosmotech.coal.cosmotech_api.twin_data_layer import (
-     get_dataset_id_from_runner,
-     send_files_to_tdl,
-     load_files_from_tdl,
- )
+ from cosmotech.coal.utils.semver import semver_of
+
+ csm_version = semver_of("cosmotech_api")
+ if csm_version.major < 5:
+     # Re-export functions from the twin_data_layer module
+     from cosmotech.coal.cosmotech_api.twin_data_layer import (
+         get_dataset_id_from_runner,
+         send_files_to_tdl,
+         load_files_from_tdl,
+     )
 
  # Re-export functions from the run_data module
  from cosmotech.coal.cosmotech_api.run_data import (
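Note: the `semver_of` helper comes from the new cosmotech/coal/utils/semver.py module (+6 lines in the file list), whose body is not shown in this diff. A minimal sketch of what it might look like, assuming it reads the installed distribution version via importlib.metadata and only needs the numeric components used above (`.major`):

    # Hypothetical sketch only -- the real semver.py is not included in this diff.
    from collections import namedtuple
    from importlib.metadata import version

    SemVer = namedtuple("SemVer", ["major", "minor", "patch"])

    def semver_of(package_name: str) -> SemVer:
        # Read the installed package version and keep up to three numeric components.
        parts = (version(package_name).split(".") + ["0", "0", "0"])[:3]
        return SemVer(*(int("".join(ch for ch in p if ch.isdigit()) or "0") for p in parts))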
@@ -12,16 +12,130 @@ import os
  import tempfile
  import time
  from pathlib import Path
- from typing import Dict, List, Any, Optional, Union, Tuple
+ from typing import Dict, Any, Optional, Union, Tuple
 
  from cosmotech_api import WorkspaceApi
  from openpyxl import load_workbook
 
+ from cosmotech.coal.utils.decorator import timed
  from cosmotech.coal.utils.logger import LOGGER
  from cosmotech.orchestrator.utils.translate import T
  from cosmotech.coal.cosmotech_api.connection import get_api_client
 
 
+ def process_xls(target_file) -> Dict[str, Any]:
+     content = {}
+
+     LOGGER.debug(T("coal.services.dataset.processing_excel").format(file_name=target_file))
+     wb = load_workbook(target_file, data_only=True)
+
+     for sheet_name in wb.sheetnames:
+         sheet = wb[sheet_name]
+         content[sheet_name] = list()
+         headers = next(sheet.iter_rows(max_row=1, values_only=True))
+
+         row_count = 0
+         for r in sheet.iter_rows(min_row=2, values_only=True):
+             row = {k: v for k, v in zip(headers, r)}
+             new_row = dict()
+
+             for key, value in row.items():
+                 try:
+                     converted_value = json.load(io.StringIO(value))
+                 except (json.decoder.JSONDecodeError, TypeError):
+                     converted_value = value
+
+                 if converted_value is not None:
+                     new_row[key] = converted_value
+
+             if new_row:
+                 content[sheet_name].append(new_row)
+                 row_count += 1
+
+         LOGGER.debug(T("coal.services.dataset.sheet_processed").format(sheet_name=sheet_name, rows=row_count))
+     return content
+
+
+ def process_csv(target_file) -> Dict[str, Any]:
+     content = {}
+
+     LOGGER.debug(T("coal.services.dataset.processing_csv").format(file_name=target_file))
+     with open(target_file, "r") as file:
+         current_filename = os.path.basename(target_file)[: -len(".csv")]
+         content[current_filename] = list()
+
+         row_count = 0
+         for csv_row in csv.DictReader(file):
+             csv_row: dict
+             new_row = dict()
+
+             for key, value in csv_row.items():
+                 try:
+                     # Try to convert any json row to dict object
+                     converted_value = json.load(io.StringIO(value))
+                 except json.decoder.JSONDecodeError:
+                     converted_value = value
+
+                 if converted_value == "":
+                     converted_value = None
+
+                 if converted_value is not None:
+                     new_row[key] = converted_value
+
+             content[current_filename].append(new_row)
+             row_count += 1
+
+         LOGGER.debug(T("coal.services.dataset.csv_processed").format(file_name=current_filename, rows=row_count))
+     return content
+
+
+ def process_json(target_file) -> Dict[str, Any]:
+     content = {}
+     LOGGER.debug(T("coal.services.dataset.processing_json").format(file_name=target_file))
+     with open(target_file, "r") as _file:
+         current_filename = os.path.basename(target_file)
+         content[current_filename] = json.load(_file)
+
+     if isinstance(content[current_filename], dict):
+         item_count = len(content[current_filename])
+     elif isinstance(content[current_filename], list):
+         item_count = len(content[current_filename])
+     else:
+         item_count = 1
+
+     LOGGER.debug(T("coal.services.dataset.json_processed").format(file_name=current_filename, items=item_count))
+     return content
+
+
+ def process_txt(target_file) -> Dict[str, Any]:
+     content = {}
+     LOGGER.debug(T("coal.services.dataset.processing_text").format(file_name=target_file))
+     with open(target_file, "r") as _file:
+         current_filename = os.path.basename(target_file)
+         content[current_filename] = _file.read()
+
+     line_count = content[current_filename].count("\n") + 1
+     LOGGER.debug(T("coal.services.dataset.text_processed").format(file_name=current_filename, lines=line_count))
+     return content
+
+
+ def read_file(file_name, file):
+     @timed(f"process {file_name}", debug=True)
+     def timed_read_file(file_name, file):
+         content = {}
+         if ".xls" in file_name:
+             content.update(process_xls(file))
+         elif ".csv" in file_name:
+             content.update(process_csv(file))
+         elif ".json" in file_name:
+             content.update(process_json(file))
+         else:
+             content.update(process_txt(file))
+         return content
+
+     return timed_read_file(file_name, file)
+
+
  def download_file_dataset(
      organization_id: str,
      workspace_id: str,
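Note: `read_file` wraps the actual work in a `timed` decorator imported from the new cosmotech/coal/utils/decorator.py module (+25 lines), which is not shown in this diff. Judging from the call site `@timed(f"process {file_name}", debug=True)` and the timing log it replaces (see the removed code in the next hunk), a plausible sketch is:

    # Hypothetical sketch only -- decorator.py is not included in this diff.
    import functools
    import time

    from cosmotech.coal.utils.logger import LOGGER
    from cosmotech.orchestrator.utils.translate import T

    def timed(operation: str, debug: bool = False):
        def wrapper(func):
            @functools.wraps(func)
            def inner(*args, **kwargs):
                start = time.time()
                try:
                    return func(*args, **kwargs)
                finally:
                    # Same translation key and format arguments as the timing log removed below.
                    elapsed = time.time() - start
                    log = LOGGER.debug if debug else LOGGER.info
                    log(T("coal.common.timing.operation_completed").format(operation=operation, time=elapsed))
            return inner
        return wrapper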
@@ -105,109 +219,8 @@ def download_file_dataset(
              )
          )
 
-         if not read_files:
-             continue
-
-         # Process file based on type
-         process_start = time.time()
-
-         if ".xls" in _file_name:
-             LOGGER.debug(T("coal.services.dataset.processing_excel").format(file_name=target_file))
-             wb = load_workbook(target_file, data_only=True)
-
-             for sheet_name in wb.sheetnames:
-                 sheet = wb[sheet_name]
-                 content[sheet_name] = list()
-                 headers = next(sheet.iter_rows(max_row=1, values_only=True))
-
-                 def item(_row: tuple) -> dict:
-                     return {k: v for k, v in zip(headers, _row)}
-
-                 row_count = 0
-                 for r in sheet.iter_rows(min_row=2, values_only=True):
-                     row = item(r)
-                     new_row = dict()
-
-                     for key, value in row.items():
-                         try:
-                             converted_value = json.load(io.StringIO(value))
-                         except (json.decoder.JSONDecodeError, TypeError):
-                             converted_value = value
-
-                         if converted_value is not None:
-                             new_row[key] = converted_value
-
-                     if new_row:
-                         content[sheet_name].append(new_row)
-                         row_count += 1
-
-                 LOGGER.debug(
-                     T("coal.services.dataset.sheet_processed").format(sheet_name=sheet_name, rows=row_count)
-                 )
-
-         elif ".csv" in _file_name:
-             LOGGER.debug(T("coal.services.dataset.processing_csv").format(file_name=target_file))
-             with open(target_file, "r") as file:
-                 current_filename = os.path.basename(target_file)[: -len(".csv")]
-                 content[current_filename] = list()
-
-                 row_count = 0
-                 for csv_row in csv.DictReader(file):
-                     csv_row: dict
-                     new_row = dict()
-
-                     for key, value in csv_row.items():
-                         try:
-                             # Try to convert any json row to dict object
-                             converted_value = json.load(io.StringIO(value))
-                         except json.decoder.JSONDecodeError:
-                             converted_value = value
-
-                         if converted_value == "":
-                             converted_value = None
-
-                         if converted_value is not None:
-                             new_row[key] = converted_value
-
-                     content[current_filename].append(new_row)
-                     row_count += 1
-
-                 LOGGER.debug(
-                     T("coal.services.dataset.csv_processed").format(file_name=current_filename, rows=row_count)
-                 )
-
-         elif ".json" in _file_name:
-             LOGGER.debug(T("coal.services.dataset.processing_json").format(file_name=target_file))
-             with open(target_file, "r") as _file:
-                 current_filename = os.path.basename(target_file)
-                 content[current_filename] = json.load(_file)
-
-             if isinstance(content[current_filename], dict):
-                 item_count = len(content[current_filename])
-             elif isinstance(content[current_filename], list):
-                 item_count = len(content[current_filename])
-             else:
-                 item_count = 1
-
-             LOGGER.debug(
-                 T("coal.services.dataset.json_processed").format(file_name=current_filename, items=item_count)
-             )
-
-         else:
-             LOGGER.debug(T("coal.services.dataset.processing_text").format(file_name=target_file))
-             with open(target_file, "r") as _file:
-                 current_filename = os.path.basename(target_file)
-                 content[current_filename] = "\n".join(line for line in _file)
-
-             line_count = content[current_filename].count("\n") + 1
-             LOGGER.debug(
-                 T("coal.services.dataset.text_processed").format(file_name=current_filename, lines=line_count)
-             )
-
-         process_time = time.time() - process_start
-         LOGGER.debug(
-             T("coal.common.timing.operation_completed").format(operation=f"process {_file_name}", time=process_time)
-         )
+         if read_files:
+             content.update(read_file(_file_name, target_file))
 
      elapsed_time = time.time() - start_time
      LOGGER.info(T("coal.common.timing.operation_completed").format(operation="File download", time=elapsed_time))
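Note: the per-extension branching removed here is now centralized in the module-level process_* helpers, and the download loop reduces to a single `read_file` call. For illustration (file names and paths below are made up), the keys of the returned dictionary depend on the file type, as defined by the helpers above:

    # Illustrative usage of the new dispatcher; paths are placeholders.
    content = {}
    content.update(read_file("measures.csv", "/tmp/ds/measures.csv"))   # {"measures": [row, ...]}
    content.update(read_file("params.json", "/tmp/ds/params.json"))     # {"params.json": <parsed JSON>}
    content.update(read_file("model.xlsx", "/tmp/ds/model.xlsx"))       # {"<sheet name>": [row, ...], ...}
    content.update(read_file("notes.txt", "/tmp/ds/notes.txt"))         # {"notes.txt": "<raw text>"}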
@@ -8,14 +8,9 @@
  import time
  import tempfile
  from pathlib import Path
- from typing import Dict, List, Any, Optional, Union, Tuple
+ from typing import Dict, Any, Optional, Union, Tuple
 
- from cosmotech_api import (
-     DatasetApi,
-     DatasetTwinGraphQuery,
-     TwinGraphQuery,
-     TwingraphApi,
- )
+ import cosmotech_api
 
  from cosmotech.coal.utils.logger import LOGGER
  from cosmotech.orchestrator.utils.translate import T
@@ -47,12 +42,12 @@ def download_twingraph_dataset(
      )
 
      with get_api_client()[0] as api_client:
-         dataset_api = DatasetApi(api_client)
+         dataset_api = cosmotech_api.DatasetApi(api_client)
 
          # Query nodes
          nodes_start = time.time()
         LOGGER.debug(T("coal.services.dataset.twingraph_querying_nodes").format(dataset_id=dataset_id))
-         nodes_query = DatasetTwinGraphQuery(query="MATCH(n) RETURN n")
+         nodes_query = cosmotech_api.DatasetTwinGraphQuery(query="MATCH(n) RETURN n")
 
          nodes = dataset_api.twingraph_query(
              organization_id=organization_id,
@@ -67,7 +62,9 @@
          # Query edges
          edges_start = time.time()
          LOGGER.debug(T("coal.services.dataset.twingraph_querying_edges").format(dataset_id=dataset_id))
-         edges_query = DatasetTwinGraphQuery(query="MATCH(n)-[r]->(m) RETURN n as src, r as rel, m as dest")
+         edges_query = cosmotech_api.DatasetTwinGraphQuery(
+             query="MATCH(n)-[r]->(m) RETURN n as src, r as rel, m as dest"
+         )
 
          edges = dataset_api.twingraph_query(
              organization_id=organization_id,
@@ -129,12 +126,12 @@ def download_legacy_twingraph_dataset(
      )
 
      with get_api_client()[0] as api_client:
-         api_instance = TwingraphApi(api_client)
+         api_instance = cosmotech_api.TwingraphApi(api_client)
 
          # Query nodes
          nodes_start = time.time()
          LOGGER.debug(T("coal.services.dataset.legacy_twingraph_querying_nodes").format(cache_name=cache_name))
-         _query_nodes = TwinGraphQuery(query="MATCH(n) RETURN n")
+         _query_nodes = cosmotech_api.TwinGraphQuery(query="MATCH(n) RETURN n")
 
          nodes = api_instance.query(
              organization_id=organization_id,
@@ -149,7 +146,7 @@
          # Query relationships
          rel_start = time.time()
          LOGGER.debug(T("coal.services.dataset.legacy_twingraph_querying_relations").format(cache_name=cache_name))
-         _query_rel = TwinGraphQuery(query="MATCH(n)-[r]->(m) RETURN n as src, r as rel, m as dest")
+         _query_rel = cosmotech_api.TwinGraphQuery(query="MATCH(n)-[r]->(m) RETURN n as src, r as rel, m as dest")
 
          rel = api_instance.query(
              organization_id=organization_id,
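Note: switching from `from cosmotech_api import DatasetApi, ...` to a plain `import cosmotech_api` defers name resolution from import time to call time, which is presumably what lets this module keep importing cleanly whether or not a v4-era class such as `TwingraphApi` still exists in the installed client; any missing class only surfaces if the corresponding code path actually runs. Illustrative check only:

    # Illustrative: probe the installed client rather than failing at import time.
    import cosmotech_api

    legacy_twingraph_supported = hasattr(cosmotech_api, "TwingraphApi")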
@@ -0,0 +1,41 @@
+ import pathlib
+
+ from cosmotech_api import Dataset
+ from cosmotech_api import DatasetPartTypeEnum
+ from cosmotech_api.api.dataset_api import DatasetApi
+ from cosmotech_api.api.dataset_api import DatasetCreateRequest
+ from cosmotech_api.api.dataset_api import DatasetPartCreateRequest
+ import pprint
+
+ from cosmotech.coal.cosmotech_api.connection import get_api_client
+ from cosmotech.coal.utils.logger import LOGGER
+
+ LOGGER.info("Generating dataset content")
+
+
+ def upload_dataset(organization_id, workspace_id, dataset_name, dataset_dir) -> Dataset:
+     dataset_path = pathlib.Path(dataset_dir)
+
+     with get_api_client()[0] as client:
+         d_api = DatasetApi(client)
+         _files = list(_p for _p in dataset_path.rglob("*") if _p.is_file())
+         d_request = DatasetCreateRequest(
+             name=dataset_name,
+             parts=list(
+                 DatasetPartCreateRequest(
+                     name=_p.name,
+                     description=str(_p.relative_to(dataset_path)),
+                     sourceName=str(_p.relative_to(dataset_path)),
+                     type=DatasetPartTypeEnum.FILE,
+                 )
+                 for _p in _files
+             ),
+         )
+         pprint.pprint(d_request.to_dict())
+         d_ret = d_api.create_dataset(
+             organization_id,
+             workspace_id,
+             d_request,
+             files=list((str(_p.relative_to(dataset_path)), _p.open("rb").read()) for _p in _files),
+         )
+         return d_ret
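A minimal usage sketch of the new `upload_dataset` helper, assuming a local folder of files to publish as dataset parts (the identifiers below are placeholders, not real IDs):

    # Placeholders for illustration only.
    dataset = upload_dataset(
        organization_id="o-myorg",
        workspace_id="w-myworkspace",
        dataset_name="demo-dataset",
        dataset_dir="./output/dataset",
    )
    print(dataset)

Note that each file is read fully into memory before the request is sent (`_p.open("rb").read()`), which is worth keeping in mind for large dataset parts.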
@@ -12,7 +12,7 @@ Dataset handling functions.
  import multiprocessing
  import tempfile
  from pathlib import Path
- from typing import Dict, List, Any, Optional, Union, Tuple
+ from typing import Dict, List, Any, Optional, Union
 
  from azure.identity import DefaultAzureCredential
  from cosmotech_api.api.dataset_api import DatasetApi
@@ -25,6 +25,7 @@ from cosmotech.coal.cosmotech_api.dataset import (
      download_legacy_twingraph_dataset,
      download_file_dataset,
  )
+ from cosmotech.coal.cosmotech_api.dataset.download import file
  from cosmotech.coal.utils.logger import LOGGER
  from cosmotech.orchestrator.utils.translate import T
 
@@ -54,7 +55,72 @@ def download_dataset(
      workspace_id: str,
      dataset_id: str,
      read_files: bool = True,
-     credentials: Optional[DefaultAzureCredential] = None,
+ ) -> Dict[str, Any]:
+     """
+     retro-compatibility to cosmo-api v4
+     """
+     from cosmotech.coal.utils.semver import semver_of
+
+     csm_version = semver_of("cosmotech_api")
+     if csm_version.major >= 5:
+         return download_dataset_v5(organization_id, workspace_id, dataset_id, read_files)
+     else:
+         return download_dataset_v4(organization_id, workspace_id, dataset_id, read_files)
+
+
+ def download_dataset_v5(
+     organization_id: str,
+     workspace_id: str,
+     dataset_id: str,
+     read_files: bool = True,
+ ) -> Dict[str, Any]:
+     """
+     Download a single dataset by ID.
+
+     Args:
+         organization_id: Organization ID
+         workspace_id: Workspace ID
+         dataset_id: Dataset ID
+         read_files: Whether to read file contents
+
+     Returns:
+         Dataset information dictionary
+     """
+
+     # Get dataset information
+     with get_api_client()[0] as api_client:
+         dataset_api_instance = DatasetApi(api_client)
+         dataset = dataset_api_instance.get_dataset(
+             organization_id=organization_id, workspace_id=workspace_id, dataset_id=dataset_id
+         )
+
+         content = dict()
+         tmp_dataset_dir = tempfile.mkdtemp()
+         tmp_dataset_dir_path = Path(tmp_dataset_dir)
+         for part in dataset.parts:
+             part_file_path = tmp_dataset_dir_path / part.source_name
+             part_file_path.parent.mkdir(parents=True, exist_ok=True)
+             data_part = dataset_api_instance.download_dataset_part(organization_id, workspace_id, dataset_id, part.id)
+             with open(part_file_path, "wb") as binary_file:
+                 binary_file.write(data_part)
+
+             if read_files:
+                 content.update(file.read_file(part.source_name, part_file_path))
+
+     return {
+         "type": "csm_dataset",
+         "content": content,
+         "name": dataset.name,
+         "folder_path": tmp_dataset_dir,
+         "dataset_id": dataset_id,
+     }
+
+
+ def download_dataset_v4(
+     organization_id: str,
+     workspace_id: str,
+     dataset_id: str,
+     read_files: bool = True,
  ) -> Dict[str, Any]:
      """
      Download a single dataset by ID.
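For callers the v4/v5 dispatch is transparent; against a v5 API the returned dictionary has the shape built in `download_dataset_v5` above (IDs below are illustrative placeholders):

    # Illustrative call; organization/workspace/dataset IDs are placeholders.
    info = download_dataset("o-myorg", "w-myworkspace", "d-mydataset")
    # v5 path returns:
    # {"type": "csm_dataset", "content": {...}, "name": "<dataset name>",
    #  "folder_path": "/tmp/<mkdtemp dir>", "dataset_id": "d-mydataset"}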
@@ -64,7 +130,6 @@ def download_dataset(
          workspace_id: Workspace ID
          dataset_id: Dataset ID
          read_files: Whether to read file contents
-         credentials: Azure credentials (if None, uses DefaultAzureCredential if needed)
 
      Returns:
          Dataset information dictionary
@@ -91,7 +156,7 @@
      if is_adt:
          content, folder_path = download_adt_dataset(
              adt_address=parameters["AZURE_DIGITAL_TWINS_URL"],
-             credentials=credentials,
+             credentials=DefaultAzureCredential(),
          )
          return {
              "type": "adt",
@@ -159,9 +224,7 @@
      }
 
 
- def download_dataset_process(
-     _dataset_id, organization_id, workspace_id, read_files, credentials, _return_dict, _error_dict
- ):
+ def download_dataset_process(_dataset_id, organization_id, workspace_id, read_files, _return_dict, _error_dict):
      """
      Process function for downloading a dataset in a separate process.
 
@@ -174,7 +237,6 @@ def download_dataset_process(
          organization_id: Organization ID
          workspace_id: Workspace ID
          read_files: Whether to read file contents
-         credentials: Azure credentials (if None, uses DefaultAzureCredential if needed)
          _return_dict: Shared dictionary to store successful download results
          _error_dict: Shared dictionary to store error messages
 
@@ -187,7 +249,6 @@
              workspace_id=workspace_id,
              dataset_id=_dataset_id,
              read_files=read_files,
-             credentials=credentials,
          )
          _return_dict[_dataset_id] = _c
      except Exception as e:
@@ -200,7 +261,6 @@ def download_datasets_parallel(
      workspace_id: str,
      dataset_ids: List[str],
      read_files: bool = True,
-     credentials: Optional[DefaultAzureCredential] = None,
  ) -> Dict[str, Dict[str, Any]]:
      """
      Download multiple datasets in parallel.
@@ -210,7 +270,6 @@
          workspace_id: Workspace ID
          dataset_ids: List of dataset IDs
          read_files: Whether to read file contents
-         credentials: Azure credentials (if None, uses DefaultAzureCredential if needed)
 
      Returns:
          Dictionary mapping dataset IDs to dataset information
@@ -225,7 +284,7 @@
              dataset_id,
              multiprocessing.Process(
                  target=download_dataset_process,
-                 args=(dataset_id, organization_id, workspace_id, read_files, credentials, return_dict, error_dict),
+                 args=(dataset_id, organization_id, workspace_id, read_files, return_dict, error_dict),
              ),
          )
          for dataset_id in dataset_ids
@@ -251,7 +310,6 @@ def download_datasets_sequential(
      workspace_id: str,
      dataset_ids: List[str],
      read_files: bool = True,
-     credentials: Optional[DefaultAzureCredential] = None,
  ) -> Dict[str, Dict[str, Any]]:
      """
      Download multiple datasets sequentially.
@@ -261,7 +319,6 @@
          workspace_id: Workspace ID
          dataset_ids: List of dataset IDs
          read_files: Whether to read file contents
-         credentials: Azure credentials (if None, uses DefaultAzureCredential if needed)
 
      Returns:
          Dictionary mapping dataset IDs to dataset information
@@ -279,7 +336,6 @@
                  workspace_id=workspace_id,
                  dataset_id=dataset_id,
                  read_files=read_files,
-                 credentials=credentials,
              )
          except Exception as e:
              error_dict[dataset_id] = f"{type(e).__name__}: {str(e)}"
@@ -294,7 +350,6 @@ def download_datasets(
      dataset_ids: List[str],
      read_files: bool = True,
      parallel: bool = True,
-     credentials: Optional[DefaultAzureCredential] = None,
  ) -> Dict[str, Dict[str, Any]]:
      """
      Download multiple datasets, either in parallel or sequentially.
@@ -305,7 +360,6 @@
          dataset_ids: List of dataset IDs
          read_files: Whether to read file contents
          parallel: Whether to download in parallel
-         credentials: Azure credentials (if None, uses DefaultAzureCredential if needed)
 
      Returns:
          Dictionary mapping dataset IDs to dataset information
@@ -319,7 +373,6 @@
          workspace_id=workspace_id,
          dataset_ids=dataset_ids,
          read_files=read_files,
-         credentials=credentials,
      )
  else:
      return download_datasets_sequential(
@@ -327,7 +380,6 @@
          workspace_id=workspace_id,
          dataset_ids=dataset_ids,
          read_files=read_files,
-         credentials=credentials,
      )
 
 
@@ -12,13 +12,8 @@ Orchestration functions for downloading runner and run data.
  import os
  import pathlib
  import shutil
- from typing import Dict, List, Any, Optional
+ from typing import Dict, Any, Optional
 
- from azure.identity import DefaultAzureCredential
- from cosmotech_api.api.runner_api import RunnerApi
- from cosmotech_api.exceptions import ApiException
-
- from cosmotech.coal.cosmotech_api.connection import get_api_client
  from cosmotech.coal.cosmotech_api.runner.data import get_runner_data
  from cosmotech.coal.cosmotech_api.runner.parameters import (
      format_parameters_list,
@@ -65,11 +60,6 @@ def download_runner_data(
      """
      LOGGER.info(T("coal.cosmotech_api.runner.starting_download"))
 
-     # Get credentials if needed
-     credentials = None
-     if get_api_client()[1] == "Azure Entra Connection":
-         credentials = DefaultAzureCredential()
-
      # Get runner data
      runner_data = get_runner_data(organization_id, workspace_id, runner_id)
 
@@ -100,19 +90,18 @@
          dataset_ids=dataset_ids,
          read_files=read_files,
          parallel=parallel,
-         credentials=credentials,
      )
 
      result["datasets"] = datasets
 
-     # Process datasets
+     # List datasets set as parameter
      datasets_parameters_ids = {
          param.value: param.parameter_id
          for param in runner_data.parameters_values
          if param.var_type == "%DATASETID%" and param.value
      }
 
-     # Save datasets to parameter folders
+     # Save parameter datasets to parameter folders
      for dataset_id, dataset_info in datasets.items():
          # If dataset is referenced by a parameter, save to parameter folder
          if dataset_id in datasets_parameters_ids:
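Note: across these hunks the `credentials` keyword is dropped from the entire download chain (download_dataset, download_dataset_process, download_datasets_parallel, download_datasets_sequential, download_datasets, download_runner_data), so 1.0.0 code that still passes it will now raise a TypeError. The 1.1.0 entry point reduces to the following call shape (IDs are placeholders):

    # Illustrative call against the 1.1.0 signature shown above.
    datasets = download_datasets(
        organization_id="o-myorg",
        workspace_id="w-myworkspace",
        dataset_ids=["d-one", "d-two"],
        read_files=True,
        parallel=True,
    )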
@@ -32,6 +32,7 @@ def send_runner_metadata_to_postgresql(
      postgres_user: str,
      postgres_password: str,
      table_prefix: str = "Cosmotech_",
+     force_encode: bool = False,
  ) -> None:
      """
      Send runner metadata to a PostgreSQL database.
@@ -47,6 +48,7 @@
          postgres_user: PostgreSQL username
          postgres_password: PostgreSQL password
          table_prefix: Table prefix
+         force_encode: force password encoding to percent encoding
      """
      # Get runner metadata
      with get_api_client()[0] as api_client:
@@ -54,7 +56,7 @@
 
      # Generate PostgreSQL URI
      postgresql_full_uri = generate_postgresql_full_uri(
-         postgres_host, str(postgres_port), postgres_db, postgres_user, postgres_password
+         postgres_host, str(postgres_port), postgres_db, postgres_user, postgres_password, force_encode=force_encode
      )
 
      # Connect to PostgreSQL and update runner metadata
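Note: `generate_postgresql_full_uri` itself is not part of this diff; the new `force_encode` flag presumably percent-encodes the password so that characters such as `@`, `:` or `/` cannot break the connection URI. A rough sketch of the idea, under a hypothetical name to avoid confusion with the real helper:

    # Hypothetical sketch only -- the real generate_postgresql_full_uri is not shown here.
    from urllib.parse import quote

    def build_postgresql_uri_sketch(host, port, db, user, password, force_encode=False):
        if force_encode:
            # Percent-encode every reserved character in the password.
            password = quote(password, safe="")
        return f"postgresql://{user}:{password}@{host}:{port}/{db}"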