cosmotech-acceleration-library 1.1.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. cosmotech/coal/__init__.py +1 -1
  2. cosmotech/coal/aws/__init__.py +1 -9
  3. cosmotech/coal/aws/s3.py +181 -214
  4. cosmotech/coal/azure/adx/auth.py +2 -2
  5. cosmotech/coal/azure/adx/runner.py +13 -14
  6. cosmotech/coal/azure/adx/store.py +5 -86
  7. cosmotech/coal/azure/adx/tables.py +2 -2
  8. cosmotech/coal/azure/blob.py +6 -6
  9. cosmotech/coal/azure/storage.py +3 -3
  10. cosmotech/coal/cosmotech_api/__init__.py +0 -28
  11. cosmotech/coal/cosmotech_api/apis/__init__.py +14 -0
  12. cosmotech/coal/cosmotech_api/apis/dataset.py +103 -0
  13. cosmotech/coal/cosmotech_api/apis/meta.py +25 -0
  14. cosmotech/coal/cosmotech_api/apis/organization.py +24 -0
  15. cosmotech/coal/cosmotech_api/apis/run.py +38 -0
  16. cosmotech/coal/cosmotech_api/apis/runner.py +71 -0
  17. cosmotech/coal/cosmotech_api/apis/solution.py +23 -0
  18. cosmotech/coal/cosmotech_api/apis/workspace.py +108 -0
  19. cosmotech/coal/cosmotech_api/objects/__init__.py +9 -0
  20. cosmotech/coal/cosmotech_api/objects/connection.py +125 -0
  21. cosmotech/coal/cosmotech_api/objects/parameters.py +127 -0
  22. cosmotech/coal/postgresql/runner.py +56 -36
  23. cosmotech/coal/postgresql/store.py +60 -14
  24. cosmotech/coal/postgresql/utils.py +254 -0
  25. cosmotech/coal/store/output/__init__.py +0 -0
  26. cosmotech/coal/store/output/aws_channel.py +73 -0
  27. cosmotech/coal/store/output/az_storage_channel.py +42 -0
  28. cosmotech/coal/store/output/channel_interface.py +23 -0
  29. cosmotech/coal/store/output/channel_spliter.py +55 -0
  30. cosmotech/coal/store/output/postgres_channel.py +40 -0
  31. cosmotech/coal/utils/configuration.py +169 -0
  32. cosmotech/coal/utils/decorator.py +4 -7
  33. cosmotech/csm_data/commands/api/api.py +6 -19
  34. cosmotech/csm_data/commands/api/postgres_send_runner_metadata.py +20 -16
  35. cosmotech/csm_data/commands/api/run_load_data.py +7 -46
  36. cosmotech/csm_data/commands/api/wsf_load_file.py +13 -16
  37. cosmotech/csm_data/commands/api/wsf_send_file.py +11 -14
  38. cosmotech/csm_data/commands/s3_bucket_delete.py +16 -15
  39. cosmotech/csm_data/commands/s3_bucket_download.py +16 -16
  40. cosmotech/csm_data/commands/s3_bucket_upload.py +16 -14
  41. cosmotech/csm_data/commands/store/dump_to_s3.py +18 -16
  42. cosmotech/csm_data/commands/store/output.py +35 -0
  43. cosmotech/csm_data/commands/store/store.py +3 -3
  44. cosmotech/translation/coal/en-US/coal/cosmotech_api/initialization.yml +8 -0
  45. cosmotech/translation/coal/en-US/coal/services/dataset.yml +4 -14
  46. cosmotech/translation/coal/en-US/coal/store/output/data_interface.yml +1 -0
  47. cosmotech/translation/coal/en-US/coal/store/output/split.yml +6 -0
  48. cosmotech/translation/coal/en-US/coal/utils/configuration.yml +2 -0
  49. cosmotech/translation/csm_data/en-US/csm_data/commands/store/output.yml +7 -0
  50. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/METADATA +5 -8
  51. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/RECORD +55 -73
  52. cosmotech/coal/azure/functions.py +0 -72
  53. cosmotech/coal/cosmotech_api/connection.py +0 -96
  54. cosmotech/coal/cosmotech_api/dataset/__init__.py +0 -26
  55. cosmotech/coal/cosmotech_api/dataset/converters.py +0 -164
  56. cosmotech/coal/cosmotech_api/dataset/download/__init__.py +0 -19
  57. cosmotech/coal/cosmotech_api/dataset/download/adt.py +0 -119
  58. cosmotech/coal/cosmotech_api/dataset/download/common.py +0 -140
  59. cosmotech/coal/cosmotech_api/dataset/download/file.py +0 -229
  60. cosmotech/coal/cosmotech_api/dataset/download/twingraph.py +0 -185
  61. cosmotech/coal/cosmotech_api/dataset/upload.py +0 -41
  62. cosmotech/coal/cosmotech_api/dataset/utils.py +0 -132
  63. cosmotech/coal/cosmotech_api/parameters.py +0 -48
  64. cosmotech/coal/cosmotech_api/run.py +0 -25
  65. cosmotech/coal/cosmotech_api/run_data.py +0 -173
  66. cosmotech/coal/cosmotech_api/run_template.py +0 -108
  67. cosmotech/coal/cosmotech_api/runner/__init__.py +0 -28
  68. cosmotech/coal/cosmotech_api/runner/data.py +0 -38
  69. cosmotech/coal/cosmotech_api/runner/datasets.py +0 -416
  70. cosmotech/coal/cosmotech_api/runner/download.py +0 -135
  71. cosmotech/coal/cosmotech_api/runner/metadata.py +0 -42
  72. cosmotech/coal/cosmotech_api/runner/parameters.py +0 -157
  73. cosmotech/coal/cosmotech_api/twin_data_layer.py +0 -512
  74. cosmotech/coal/cosmotech_api/workspace.py +0 -127
  75. cosmotech/coal/utils/postgresql.py +0 -236
  76. cosmotech/coal/utils/semver.py +0 -6
  77. cosmotech/csm_data/commands/api/rds_load_csv.py +0 -90
  78. cosmotech/csm_data/commands/api/rds_send_csv.py +0 -74
  79. cosmotech/csm_data/commands/api/rds_send_store.py +0 -74
  80. cosmotech/csm_data/commands/api/runtemplate_load_handler.py +0 -66
  81. cosmotech/csm_data/commands/api/tdl_load_files.py +0 -76
  82. cosmotech/csm_data/commands/api/tdl_send_files.py +0 -82
  83. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_load_csv.json +0 -27
  84. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_csv.json +0 -27
  85. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_store.json +0 -27
  86. cosmotech/orchestrator_plugins/csm-data/templates/api/runtemplate_load_handler.json +0 -27
  87. cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_load_files.json +0 -32
  88. cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_send_files.json +0 -27
  89. cosmotech/translation/coal/en-US/coal/cosmotech_api/run_data.yml +0 -2
  90. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_load_csv.yml +0 -13
  91. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_csv.yml +0 -12
  92. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_store.yml +0 -12
  93. cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_load_files.yml +0 -14
  94. cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_send_files.yml +0 -18
  95. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/WHEEL +0 -0
  96. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/entry_points.txt +0 -0
  97. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/licenses/LICENSE +0 -0
  98. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/top_level.txt +0 -0
--- a/cosmotech/coal/cosmotech_api/runner/parameters.py
+++ /dev/null
@@ -1,157 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- """
- Parameter handling functions.
- """
-
- import json
- import os
- import pathlib
- from csv import DictWriter
- from typing import List, Dict, Any
-
- from cosmotech.coal.utils.logger import LOGGER
- from cosmotech.orchestrator.utils.translate import T
-
-
- def get_runner_parameters(runner_data: Any) -> Dict[str, Any]:
-     """
-     Extract parameters from runner data.
-
-     Args:
-         runner_data: Runner data object
-
-     Returns:
-         Dictionary mapping parameter IDs to values
-     """
-     content = dict()
-     for parameter in runner_data.parameters_values:
-         content[parameter.parameter_id] = parameter.value
-     return content
-
-
- def format_parameters_list(runner_data: Any) -> List[Dict[str, Any]]:
-     """
-     Format parameters from runner data as a list of dictionaries.
-
-     Args:
-         runner_data: Runner data object
-
-     Returns:
-         List of parameter dictionaries
-     """
-     parameters = []
-
-     if not runner_data.parameters_values:
-         return parameters
-
-     max_name_size = max(map(lambda r: len(r.parameter_id), runner_data.parameters_values))
-     max_type_size = max(map(lambda r: len(r.var_type), runner_data.parameters_values))
-
-     for parameter_data in runner_data.parameters_values:
-         parameter_name = parameter_data.parameter_id
-         value = parameter_data.value
-         var_type = parameter_data.var_type
-         is_inherited = parameter_data.is_inherited
-
-         parameters.append(
-             {
-                 "parameterId": parameter_name,
-                 "value": value,
-                 "varType": var_type,
-                 "isInherited": is_inherited,
-             }
-         )
-
-         LOGGER.debug(
-             T("coal.cosmotech_api.runner.parameter_debug").format(
-                 param_id=parameter_name,
-                 max_name_size=max_name_size,
-                 var_type=var_type,
-                 max_type_size=max_type_size,
-                 value=value,
-                 inherited=" inherited" if is_inherited else "",
-             )
-         )
-
-     return parameters
-
-
- def write_parameters_to_json(parameter_folder: str, parameters: List[Dict[str, Any]]) -> str:
-     """
-     Write parameters to a JSON file.
-
-     Args:
-         parameter_folder: Folder to write the file to
-         parameters: List of parameter dictionaries
-
-     Returns:
-         Path to the created file
-     """
-     pathlib.Path(parameter_folder).mkdir(exist_ok=True, parents=True)
-     tmp_parameter_file = os.path.join(parameter_folder, "parameters.json")
-
-     LOGGER.info(T("coal.cosmotech_api.runner.generating_file").format(file=tmp_parameter_file))
-
-     with open(tmp_parameter_file, "w") as _file:
-         json.dump(parameters, _file, indent=2)
-
-     return tmp_parameter_file
-
-
- def write_parameters_to_csv(parameter_folder: str, parameters: List[Dict[str, Any]]) -> str:
-     """
-     Write parameters to a CSV file.
-
-     Args:
-         parameter_folder: Folder to write the file to
-         parameters: List of parameter dictionaries
-
-     Returns:
-         Path to the created file
-     """
-     pathlib.Path(parameter_folder).mkdir(exist_ok=True, parents=True)
-     tmp_parameter_file = os.path.join(parameter_folder, "parameters.csv")
-
-     LOGGER.info(T("coal.cosmotech_api.runner.generating_file").format(file=tmp_parameter_file))
-
-     with open(tmp_parameter_file, "w") as _file:
-         _w = DictWriter(_file, fieldnames=["parameterId", "value", "varType", "isInherited"])
-         _w.writeheader()
-         _w.writerows(parameters)
-
-     return tmp_parameter_file
-
-
- def write_parameters(
-     parameter_folder: str,
-     parameters: List[Dict[str, Any]],
-     write_csv: bool = True,
-     write_json: bool = False,
- ) -> Dict[str, str]:
-     """
-     Write parameters to files based on specified formats.
-
-     Args:
-         parameter_folder: Folder to write the files to
-         parameters: List of parameter dictionaries
-         write_csv: Whether to write a CSV file
-         write_json: Whether to write a JSON file
-
-     Returns:
-         Dictionary mapping file types to file paths
-     """
-     result = {}
-
-     if write_csv:
-         result["csv"] = write_parameters_to_csv(parameter_folder, parameters)
-
-     if write_json:
-         result["json"] = write_parameters_to_json(parameter_folder, parameters)
-
-     return result
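
Note: the module removed above, cosmotech/coal/cosmotech_api/runner/parameters.py (entry 72 in the file list), was the 1.x helper chain for dumping a runner's parameters to disk. A minimal sketch of how these helpers were typically combined, assuming a runner_data object already fetched from the Runner API (the folder path is illustrative, not part of the package):

    from cosmotech.coal.cosmotech_api.runner.parameters import (
        format_parameters_list,
        write_parameters,
    )

    # runner_data: a Runner object, e.g. previously returned by RunnerApi.get_runner(...)
    parameters = format_parameters_list(runner_data)

    # Writes parameters.csv and parameters.json under the given folder and
    # returns a mapping of format to file path, e.g. {"csv": "...", "json": "..."}
    files = write_parameters(
        parameter_folder="/tmp/parameters",  # illustrative path
        parameters=parameters,
        write_csv=True,
        write_json=True,
    )

In 2.0.0 this module is gone; judging only from the file list above, the nearest counterpart appears to be the new cosmotech/coal/cosmotech_api/objects/parameters.py (entry 21).
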
--- a/cosmotech/coal/cosmotech_api/twin_data_layer.py
+++ /dev/null
@@ -1,512 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- """
- Twin Data Layer operations module.
-
- This module provides functions for interacting with the Twin Data Layer,
- including sending and loading files.
- """
-
- import json
- import pathlib
- from csv import DictReader, DictWriter
- from io import StringIO
- from typing import Dict, List, Any, Optional, Set, Tuple
-
- import requests
- from cosmotech_api import DatasetApi, RunnerApi, DatasetTwinGraphQuery
-
- from cosmotech.coal.cosmotech_api.connection import get_api_client
- from cosmotech.coal.utils.logger import LOGGER
- from cosmotech.orchestrator.utils.translate import T
-
- ID_COLUMN = "id"
-
- SOURCE_COLUMN = "src"
-
- TARGET_COLUMN = "dest"
-
- BATCH_SIZE_LIMIT = 10000
-
-
- class CSVSourceFile:
-     def __init__(self, file_path: pathlib.Path):
-         self.file_path = file_path
-         if not file_path.name.endswith(".csv"):
-             raise ValueError(T("coal.common.validation.not_csv_file").format(file_path=file_path))
-         with open(file_path) as _file:
-             dr = DictReader(_file)
-             self.fields = list(dr.fieldnames)
-         self.object_type = file_path.name[:-4]
-
-         self.id_column = None
-         self.source_column = None
-         self.target_column = None
-
-         for _c in self.fields:
-             if _c.lower() == ID_COLUMN:
-                 self.id_column = _c
-             if _c.lower() == SOURCE_COLUMN:
-                 self.source_column = _c
-             if _c.lower() == TARGET_COLUMN:
-                 self.target_column = _c
-
-         has_id = self.id_column is not None
-         has_source = self.source_column is not None
-         has_target = self.target_column is not None
-
-         is_relation = all([has_source, has_target])
-
-         if not has_id and not is_relation:
-             LOGGER.error(T("coal.common.validation.invalid_nodes_relations").format(file_path=file_path))
-             LOGGER.error(T("coal.common.validation.node_requirements").format(id_column=ID_COLUMN))
-             LOGGER.error(
-                 T("coal.common.validation.relationship_requirements").format(
-                     id_column=ID_COLUMN,
-                     source_column=SOURCE_COLUMN,
-                     target_column=TARGET_COLUMN,
-                 )
-             )
-             raise ValueError(T("coal.common.validation.invalid_nodes_relations").format(file_path=file_path))
-
-         self.is_node = has_id and not is_relation
-
-         self.content_fields = {
-             _f: _f for _f in self.fields if _f not in [self.id_column, self.source_column, self.target_column]
-         }
-         if has_id:
-             self.content_fields[ID_COLUMN] = self.id_column
-         if is_relation:
-             self.content_fields[SOURCE_COLUMN] = self.source_column
-             self.content_fields[TARGET_COLUMN] = self.target_column
-
-     def reload(self, inplace: bool = False) -> "CSVSourceFile":
-         if inplace:
-             self.__init__(self.file_path)
-             return self
-         return CSVSourceFile(self.file_path)
-
-     def generate_query_insert(self) -> str:
-         """
-         Read a CSV file headers and generate a CREATE cypher query
-         :return: the Cypher query for CREATE
-         """
-
-         field_names = sorted(self.content_fields.keys(), key=len, reverse=True)
-
-         if self.is_node:
-             query = (
-                 "CREATE (:"
-                 + self.object_type
-                 + " {"
-                 + ", ".join(f"{property_name}: ${self.content_fields[property_name]}" for property_name in field_names)
-                 + "})"
-             )
-             # query = ("UNWIND $params AS params " +
-             #          f"MERGE (n:{self.object_type}) " +
-             #          "SET n += params")
-         else:
-             query = (
-                 "MATCH "
-                 + "(source {"
-                 + ID_COLUMN
-                 + ":$"
-                 + self.source_column
-                 + "}),\n"
-                 + "(target {"
-                 + ID_COLUMN
-                 + ":$"
-                 + self.target_column
-                 + "})\n"
-                 + "CREATE (source)-[rel:"
-                 + self.object_type
-                 + " {"
-                 + ", ".join(f"{property_name}: ${self.content_fields[property_name]}" for property_name in field_names)
-                 + "}"
-                 + "]->(target)\n"
-             )
-             # query = ("UNWIND $params AS params " +
-             #          "MATCH (source {" + ID_COLUMN + ":params." + self.source_column + "})\n" +
-             #          "MATCH (target {" + ID_COLUMN + ":params." + self.target_column + "})\n" +
-             #          f"CREATE (from) - [rel:{self.object_type}]->(to)" +
-             #          "SET rel += params")
-         return query
-
-
- def get_dataset_id_from_runner(organization_id: str, workspace_id: str, runner_id: str) -> str:
-     """
-     Get the dataset ID from a runner.
-
-     Args:
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         runner_id: Runner ID
-
-     Returns:
-         Dataset ID
-     """
-     api_client, _ = get_api_client()
-     api_runner = RunnerApi(api_client)
-
-     runner_info = api_runner.get_runner(
-         organization_id,
-         workspace_id,
-         runner_id,
-     )
-
-     if (datasets_len := len(runner_info.dataset_list)) != 1:
-         LOGGER.error(
-             T("coal.cosmotech_api.runner.not_single_dataset").format(runner_id=runner_info.id, count=datasets_len)
-         )
-         LOGGER.debug(T("coal.cosmotech_api.runner.runner_info").format(info=runner_info))
-         raise ValueError(f"Runner {runner_info.id} does not have exactly one dataset")
-
-     return runner_info.dataset_list[0]
-
-
- def send_files_to_tdl(
-     api_url: str,
-     organization_id: str,
-     workspace_id: str,
-     runner_id: str,
-     directory_path: str,
-     clear: bool = True,
- ) -> None:
-     """
-     Send CSV files to the Twin Data Layer.
-
-     Args:
-         api_url: API URL
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         runner_id: Runner ID
-         directory_path: Directory containing CSV files
-         clear: Whether to clear the dataset before sending files
-     """
-     api_client, _ = get_api_client()
-     api_ds = DatasetApi(api_client)
-
-     # Get dataset ID from runner
-     dataset_id = get_dataset_id_from_runner(organization_id, workspace_id, runner_id)
-
-     # Get dataset info
-     dataset_info = api_ds.find_dataset_by_id(organization_id, dataset_id)
-     dataset_info.ingestion_status = "SUCCESS"
-     api_ds.update_dataset(organization_id, dataset_id, dataset_info)
-
-     # Process CSV files
-     entities_queries = {}
-     relation_queries = {}
-
-     content_path = pathlib.Path(directory_path)
-     if not content_path.is_dir():
-         LOGGER.error(T("coal.common.file_operations.not_directory").format(target_dir=directory_path))
-         raise ValueError(f"{directory_path} is not a directory")
-
-     # Process CSV files
-     for file_path in content_path.glob("*.csv"):
-         _csv = CSVSourceFile(file_path)
-         if _csv.is_node:
-             LOGGER.info(T("coal.services.azure_storage.sending_content").format(file=file_path))
-             entities_queries[file_path] = _csv.generate_query_insert()
-         else:
-             LOGGER.info(T("coal.services.azure_storage.sending_content").format(file=file_path))
-             relation_queries[file_path] = _csv.generate_query_insert()
-
-     # Prepare headers
-     header = {
-         "Accept": "application/json",
-         "Content-Type": "text/csv",
-         "User-Agent": "OpenAPI-Generator/1.0.0/python",
-     }
-     header.update(api_client.default_headers)
-
-     for authtype, authinfo in api_ds.api_client.configuration.auth_settings().items():
-         api_ds.api_client._apply_auth_params(header, None, None, None, None, authinfo)
-
-     # Clear dataset if requested
-     if clear:
-         LOGGER.info(T("coal.services.azure_storage.clearing_content"))
-         clear_query = "MATCH (n) DETACH DELETE n"
-         api_ds.twingraph_query(organization_id, dataset_id, DatasetTwinGraphQuery(query=str(clear_query)))
-
-     # Send files
-     for query_dict in [entities_queries, relation_queries]:
-         for file_path, query in query_dict.items():
-             _process_csv_file(
-                 file_path=file_path,
-                 query=query,
-                 api_url=api_url,
-                 organization_id=organization_id,
-                 dataset_id=dataset_id,
-                 header=header,
-             )
-
-     LOGGER.info(T("coal.services.azure_storage.all_data_sent"))
-
-     # Update dataset status
-     dataset_info.ingestion_status = "SUCCESS"
-     dataset_info.twincache_status = "FULL"
-     api_ds.update_dataset(organization_id, dataset_id, dataset_info)
-
-
- def _process_csv_file(
-     file_path: pathlib.Path,
-     query: str,
-     api_url: str,
-     organization_id: str,
-     dataset_id: str,
-     header: Dict[str, str],
- ) -> None:
-     """
-     Process a CSV file and send it to the Twin Data Layer.
-
-     Args:
-         file_path: Path to the CSV file
-         query: Query to execute
-         api_url: API URL
-         organization_id: Organization ID
-         dataset_id: Dataset ID
-         header: HTTP headers
-     """
-     content = StringIO()
-     size = 0
-     batch = 1
-     errors = []
-     query_craft = api_url + f"/organizations/{organization_id}/datasets/{dataset_id}/batch?query={query}"
-     LOGGER.info(T("coal.services.azure_storage.sending_content").format(file=file_path))
-
-     with open(file_path, "r") as _f:
-         dr = DictReader(_f)
-         dw = DictWriter(content, fieldnames=sorted(dr.fieldnames, key=len, reverse=True))
-         dw.writeheader()
-         for row in dr:
-             dw.writerow(row)
-             size += 1
-             if size > BATCH_SIZE_LIMIT:
-                 LOGGER.info(T("coal.services.azure_storage.row_batch").format(count=batch * BATCH_SIZE_LIMIT))
-                 batch += 1
-                 content.seek(0)
-                 post = requests.post(query_craft, data=content.read(), headers=header)
-                 post.raise_for_status()
-                 errors.extend(json.loads(post.content)["errors"])
-                 content = StringIO()
-                 dw = DictWriter(
-                     content,
-                     fieldnames=sorted(dr.fieldnames, key=len, reverse=True),
-                 )
-                 dw.writeheader()
-                 size = 0
-
-     if size > 0:
-         content.seek(0)
-         post = requests.post(query_craft, data=content.read(), headers=header)
-         post.raise_for_status()
-         errors.extend(json.loads(post.content)["errors"])
-
-     if len(errors):
-         LOGGER.error(T("coal.services.azure_storage.import_errors").format(count=len(errors)))
-         for _err in errors:
-             LOGGER.error(T("coal.services.azure_storage.error_detail").format(error=str(_err)))
-         raise ValueError(f"Error importing data from {file_path}")
-
-
- def load_files_from_tdl(
-     organization_id: str,
-     workspace_id: str,
-     directory_path: str,
-     runner_id: str,
- ) -> None:
-     """
-     Load files from the Twin Data Layer.
-
-     Args:
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         directory_path: Directory to save files to
-         runner_id: Runner ID
-     """
-     api_client, _ = get_api_client()
-     api_ds = DatasetApi(api_client)
-
-     # Get dataset ID from runner
-     dataset_id = get_dataset_id_from_runner(organization_id, workspace_id, runner_id)
-
-     # Get dataset info
-     dataset_info = api_ds.find_dataset_by_id(organization_id, dataset_id)
-     if dataset_info.ingestion_status != "SUCCESS":
-         LOGGER.error(
-             T("coal.cosmotech_api.runner.dataset_state").format(
-                 dataset_id=dataset_id, status=dataset_info.ingestion_status
-             )
-         )
-         LOGGER.debug(T("coal.cosmotech_api.runner.dataset_info").format(info=dataset_info))
-         raise ValueError(f"Dataset {dataset_id} is not in SUCCESS state")
-
-     # Create directory
-     directory_path = pathlib.Path(directory_path)
-     if directory_path.is_file():
-         LOGGER.error(T("coal.common.file_operations.not_directory").format(target_dir=directory_path))
-         raise ValueError(f"{directory_path} is not a directory")
-
-     directory_path.mkdir(parents=True, exist_ok=True)
-
-     # Get node and relationship properties
-     item_queries = {}
-     properties_nodes = _get_node_properties(api_ds, organization_id, dataset_id)
-     properties_relationships = _get_relationship_properties(api_ds, organization_id, dataset_id)
-
-     # Create queries
-     for label, keys in properties_nodes.items():
-         node_query = f"MATCH (n:{label}) RETURN {', '.join(map(lambda k: f'n.`{k}` as `{k}`', keys))}"
-         item_queries[label] = node_query
-
-     for label, keys in properties_relationships.items():
-         rel_query = f"MATCH ()-[n:{label}]->() RETURN {', '.join(map(lambda k: f'n.`{k}` as `{k}`', keys))}"
-         item_queries[label] = rel_query
-
-     # Execute queries and write files
-     files_content, files_headers = _execute_queries(api_ds, organization_id, dataset_id, item_queries)
-     _write_files(directory_path, files_content, files_headers)
-
-     LOGGER.info(T("coal.services.azure_storage.all_csv_written"))
-
-
- def _get_node_properties(api_ds: DatasetApi, organization_id: str, dataset_id: str) -> Dict[str, Set[str]]:
-     """
-     Get node properties from the Twin Data Layer.
-
-     Args:
-         api_ds: Dataset API
-         organization_id: Organization ID
-         dataset_id: Dataset ID
-
-     Returns:
-         Dictionary of node labels to sets of property keys
-     """
-     get_node_properties_query = "MATCH (n) RETURN distinct labels(n)[0] as label, keys(n) as keys"
-     node_properties_results: List[Dict[str, Any]] = api_ds.twingraph_query(
-         organization_id,
-         dataset_id,
-         DatasetTwinGraphQuery(query=get_node_properties_query),
-     )
-
-     properties_nodes = {}
-     for _r in node_properties_results:
-         label = _r["label"]
-         keys = _r["keys"]
-         if label not in properties_nodes:
-             properties_nodes[label] = set()
-         properties_nodes[label].update(keys)
-
-     return properties_nodes
-
-
- def _get_relationship_properties(api_ds: DatasetApi, organization_id: str, dataset_id: str) -> Dict[str, Set[str]]:
-     """
-     Get relationship properties from the Twin Data Layer.
-
-     Args:
-         api_ds: Dataset API
-         organization_id: Organization ID
-         dataset_id: Dataset ID
-
-     Returns:
-         Dictionary of relationship types to sets of property keys
-     """
-     get_relationship_properties_query = "MATCH ()-[r]->() RETURN distinct type(r) as label, keys(r) as keys"
-     relationship_properties_results: List[Dict[str, Any]] = api_ds.twingraph_query(
-         organization_id,
-         dataset_id,
-         DatasetTwinGraphQuery(query=get_relationship_properties_query),
-     )
-
-     properties_relationships = {}
-     for _r in relationship_properties_results:
-         label = _r["label"]
-         keys = _r["keys"]
-         if label not in properties_relationships:
-             properties_relationships[label] = set()
-         properties_relationships[label].update(keys)
-
-     return properties_relationships
-
-
- def _execute_queries(
-     api_ds: DatasetApi, organization_id: str, dataset_id: str, item_queries: Dict[str, str]
- ) -> Tuple[Dict[str, List[Dict[str, Any]]], Dict[str, Set[str]]]:
-     """
-     Execute queries against the Twin Data Layer.
-
-     Args:
-         api_ds: Dataset API
-         organization_id: Organization ID
-         dataset_id: Dataset ID
-         item_queries: Dictionary of element types to queries
-
-     Returns:
-         Tuple of (files_content, files_headers)
-     """
-     files_content = {}
-     files_headers = {}
-
-     for element_type, query in item_queries.items():
-         element_query: List[Dict[str, Any]] = api_ds.twingraph_query(
-             organization_id, dataset_id, DatasetTwinGraphQuery(query=query)
-         )
-         for element in element_query:
-             if element_type not in files_content:
-                 files_content[element_type] = []
-                 files_headers[element_type] = set()
-             files_content[element_type].append(element)
-             files_headers[element_type].update(element.keys())
-
-     return files_content, files_headers
-
-
- def _write_files(
-     directory_path: pathlib.Path,
-     files_content: Dict[str, List[Dict[str, Any]]],
-     files_headers: Dict[str, Set[str]],
- ) -> None:
-     """
-     Write files to disk.
-
-     Args:
-         directory_path: Directory to write files to
-         files_content: Dictionary of file names to lists of rows
-         files_headers: Dictionary of file names to sets of headers
-     """
-     for file_name in files_content.keys():
-         file_path = directory_path / (file_name + ".csv")
-         LOGGER.info(
-             T("coal.services.azure_storage.writing_lines").format(count=len(files_content[file_name]), file=file_path)
-         )
-         with file_path.open("w") as _f:
-             headers = files_headers[file_name]
-             has_id = "id" in headers
-             is_relation = "src" in headers
-             new_headers = []
-             if has_id:
-                 headers.remove("id")
-                 new_headers.append("id")
-             if is_relation:
-                 headers.remove("src")
-                 headers.remove("dest")
-                 new_headers.append("src")
-                 new_headers.append("dest")
-             headers = new_headers + sorted(headers)
-
-             dw = DictWriter(_f, fieldnames=headers)
-             dw.writeheader()
-             for row in sorted(files_content[file_name], key=lambda r: r.get("id", "")):
-                 dw.writerow(
-                     {
-                         key: (json.dumps(value) if isinstance(value, (bool, dict, list)) else value)
-                         for key, value in row.items()
-                     }
-                 )
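
Note: the hunk above removes cosmotech/coal/cosmotech_api/twin_data_layer.py (entry 73 in the file list), the 1.x Twin Data Layer client. To make what it did concrete, here is a worked trace of CSVSourceFile.generate_query_insert on two invented inputs (file names and columns are examples, not from the package). A nodes file Customer.csv with header id,name produced:

    CREATE (:Customer {name: $name, id: $id})

A relationship file KNOWS.csv with header src,dest,since produced:

    MATCH (source {id:$src}),
    (target {id:$dest})
    CREATE (source)-[rel:KNOWS {since: $since, dest: $dest, src: $src}]->(target)

send_files_to_tdl then streamed each CSV to the dataset batch endpoint (/organizations/{organization_id}/datasets/{dataset_id}/batch?query=...) in chunks of BATCH_SIZE_LIMIT (10,000) rows, collecting per-row errors from each response. A minimal sketch of the removed 1.x call, assuming a runner that references exactly one dataset (URL and IDs below are illustrative):

    from cosmotech.coal.cosmotech_api.twin_data_layer import send_files_to_tdl

    send_files_to_tdl(
        api_url="https://api.example.com",  # illustrative endpoint
        organization_id="o-example",
        workspace_id="w-example",
        runner_id="r-example",
        directory_path="./csv_input",
        clear=True,  # first wipes the twingraph with "MATCH (n) DETACH DELETE n"
    )

None of these entry points survive in 2.0.0; judging from the file list, dataset access now goes through the new cosmotech/coal/cosmotech_api/apis/dataset.py (entry 12).
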