cosmotech_acceleration_library-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cosmotech/coal/__init__.py +8 -0
- cosmotech/coal/aws/__init__.py +23 -0
- cosmotech/coal/aws/s3.py +235 -0
- cosmotech/coal/azure/__init__.py +23 -0
- cosmotech/coal/azure/adx/__init__.py +26 -0
- cosmotech/coal/azure/adx/auth.py +125 -0
- cosmotech/coal/azure/adx/ingestion.py +329 -0
- cosmotech/coal/azure/adx/query.py +56 -0
- cosmotech/coal/azure/adx/runner.py +217 -0
- cosmotech/coal/azure/adx/store.py +255 -0
- cosmotech/coal/azure/adx/tables.py +118 -0
- cosmotech/coal/azure/adx/utils.py +71 -0
- cosmotech/coal/azure/blob.py +109 -0
- cosmotech/coal/azure/functions.py +72 -0
- cosmotech/coal/azure/storage.py +74 -0
- cosmotech/coal/cosmotech_api/__init__.py +36 -0
- cosmotech/coal/cosmotech_api/connection.py +96 -0
- cosmotech/coal/cosmotech_api/dataset/__init__.py +26 -0
- cosmotech/coal/cosmotech_api/dataset/converters.py +164 -0
- cosmotech/coal/cosmotech_api/dataset/download/__init__.py +19 -0
- cosmotech/coal/cosmotech_api/dataset/download/adt.py +119 -0
- cosmotech/coal/cosmotech_api/dataset/download/common.py +140 -0
- cosmotech/coal/cosmotech_api/dataset/download/file.py +216 -0
- cosmotech/coal/cosmotech_api/dataset/download/twingraph.py +188 -0
- cosmotech/coal/cosmotech_api/dataset/utils.py +132 -0
- cosmotech/coal/cosmotech_api/parameters.py +48 -0
- cosmotech/coal/cosmotech_api/run.py +25 -0
- cosmotech/coal/cosmotech_api/run_data.py +173 -0
- cosmotech/coal/cosmotech_api/run_template.py +108 -0
- cosmotech/coal/cosmotech_api/runner/__init__.py +28 -0
- cosmotech/coal/cosmotech_api/runner/data.py +38 -0
- cosmotech/coal/cosmotech_api/runner/datasets.py +364 -0
- cosmotech/coal/cosmotech_api/runner/download.py +146 -0
- cosmotech/coal/cosmotech_api/runner/metadata.py +42 -0
- cosmotech/coal/cosmotech_api/runner/parameters.py +157 -0
- cosmotech/coal/cosmotech_api/twin_data_layer.py +512 -0
- cosmotech/coal/cosmotech_api/workspace.py +127 -0
- cosmotech/coal/csm/__init__.py +6 -0
- cosmotech/coal/csm/engine/__init__.py +47 -0
- cosmotech/coal/postgresql/__init__.py +22 -0
- cosmotech/coal/postgresql/runner.py +93 -0
- cosmotech/coal/postgresql/store.py +98 -0
- cosmotech/coal/singlestore/__init__.py +17 -0
- cosmotech/coal/singlestore/store.py +100 -0
- cosmotech/coal/store/__init__.py +42 -0
- cosmotech/coal/store/csv.py +44 -0
- cosmotech/coal/store/native_python.py +25 -0
- cosmotech/coal/store/pandas.py +26 -0
- cosmotech/coal/store/pyarrow.py +23 -0
- cosmotech/coal/store/store.py +79 -0
- cosmotech/coal/utils/__init__.py +18 -0
- cosmotech/coal/utils/api.py +68 -0
- cosmotech/coal/utils/logger.py +10 -0
- cosmotech/coal/utils/postgresql.py +236 -0
- cosmotech/csm_data/__init__.py +6 -0
- cosmotech/csm_data/commands/__init__.py +6 -0
- cosmotech/csm_data/commands/adx_send_data.py +92 -0
- cosmotech/csm_data/commands/adx_send_runnerdata.py +119 -0
- cosmotech/csm_data/commands/api/__init__.py +6 -0
- cosmotech/csm_data/commands/api/api.py +50 -0
- cosmotech/csm_data/commands/api/postgres_send_runner_metadata.py +119 -0
- cosmotech/csm_data/commands/api/rds_load_csv.py +90 -0
- cosmotech/csm_data/commands/api/rds_send_csv.py +74 -0
- cosmotech/csm_data/commands/api/rds_send_store.py +74 -0
- cosmotech/csm_data/commands/api/run_load_data.py +120 -0
- cosmotech/csm_data/commands/api/runtemplate_load_handler.py +66 -0
- cosmotech/csm_data/commands/api/tdl_load_files.py +76 -0
- cosmotech/csm_data/commands/api/tdl_send_files.py +82 -0
- cosmotech/csm_data/commands/api/wsf_load_file.py +66 -0
- cosmotech/csm_data/commands/api/wsf_send_file.py +68 -0
- cosmotech/csm_data/commands/az_storage_upload.py +76 -0
- cosmotech/csm_data/commands/s3_bucket_delete.py +107 -0
- cosmotech/csm_data/commands/s3_bucket_download.py +118 -0
- cosmotech/csm_data/commands/s3_bucket_upload.py +128 -0
- cosmotech/csm_data/commands/store/__init__.py +6 -0
- cosmotech/csm_data/commands/store/dump_to_azure.py +120 -0
- cosmotech/csm_data/commands/store/dump_to_postgresql.py +107 -0
- cosmotech/csm_data/commands/store/dump_to_s3.py +169 -0
- cosmotech/csm_data/commands/store/list_tables.py +48 -0
- cosmotech/csm_data/commands/store/load_csv_folder.py +43 -0
- cosmotech/csm_data/commands/store/load_from_singlestore.py +96 -0
- cosmotech/csm_data/commands/store/reset.py +31 -0
- cosmotech/csm_data/commands/store/store.py +37 -0
- cosmotech/csm_data/main.py +57 -0
- cosmotech/csm_data/utils/__init__.py +6 -0
- cosmotech/csm_data/utils/click.py +18 -0
- cosmotech/csm_data/utils/decorators.py +75 -0
- cosmotech/orchestrator_plugins/csm-data/__init__.py +11 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/postgres_send_runner_metadata.json +40 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/rds_load_csv.json +27 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_csv.json +27 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_store.json +27 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/run_load_data.json +30 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/runtemplate_load_handler.json +27 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_load_files.json +32 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_send_files.json +27 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/try_api_connection.json +9 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/wsf_load_file.json +36 -0
- cosmotech/orchestrator_plugins/csm-data/templates/api/wsf_send_file.json +36 -0
- cosmotech/orchestrator_plugins/csm-data/templates/main/adx_send_runnerdata.json +29 -0
- cosmotech/orchestrator_plugins/csm-data/templates/main/az_storage_upload.json +25 -0
- cosmotech/orchestrator_plugins/csm-data/templates/main/s3_bucket_delete.json +31 -0
- cosmotech/orchestrator_plugins/csm-data/templates/main/s3_bucket_download.json +34 -0
- cosmotech/orchestrator_plugins/csm-data/templates/main/s3_bucket_upload.json +35 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_dump_to_azure.json +35 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_dump_to_postgresql.json +34 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_dump_to_s3.json +36 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_list_tables.json +15 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_load_csv_folder.json +18 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_load_from_singlestore.json +34 -0
- cosmotech/orchestrator_plugins/csm-data/templates/store/store_reset.json +15 -0
- cosmotech/translation/coal/__init__.py +6 -0
- cosmotech/translation/coal/en-US/coal/common/data_transfer.yml +6 -0
- cosmotech/translation/coal/en-US/coal/common/errors.yml +9 -0
- cosmotech/translation/coal/en-US/coal/common/file_operations.yml +6 -0
- cosmotech/translation/coal/en-US/coal/common/progress.yml +6 -0
- cosmotech/translation/coal/en-US/coal/common/timing.yml +5 -0
- cosmotech/translation/coal/en-US/coal/common/validation.yml +8 -0
- cosmotech/translation/coal/en-US/coal/cosmotech_api/connection.yml +10 -0
- cosmotech/translation/coal/en-US/coal/cosmotech_api/run_data.yml +2 -0
- cosmotech/translation/coal/en-US/coal/cosmotech_api/run_template.yml +8 -0
- cosmotech/translation/coal/en-US/coal/cosmotech_api/runner.yml +16 -0
- cosmotech/translation/coal/en-US/coal/cosmotech_api/solution.yml +5 -0
- cosmotech/translation/coal/en-US/coal/cosmotech_api/workspace.yml +7 -0
- cosmotech/translation/coal/en-US/coal/services/adx.yml +59 -0
- cosmotech/translation/coal/en-US/coal/services/api.yml +8 -0
- cosmotech/translation/coal/en-US/coal/services/azure_storage.yml +14 -0
- cosmotech/translation/coal/en-US/coal/services/database.yml +19 -0
- cosmotech/translation/coal/en-US/coal/services/dataset.yml +68 -0
- cosmotech/translation/coal/en-US/coal/services/postgresql.yml +28 -0
- cosmotech/translation/coal/en-US/coal/services/s3.yml +9 -0
- cosmotech/translation/coal/en-US/coal/solution.yml +3 -0
- cosmotech/translation/coal/en-US/coal/web.yml +2 -0
- cosmotech/translation/csm_data/__init__.py +6 -0
- cosmotech/translation/csm_data/en-US/csm-data.yml +434 -0
- cosmotech_acceleration_library-1.0.0.dist-info/METADATA +255 -0
- cosmotech_acceleration_library-1.0.0.dist-info/RECORD +141 -0
- cosmotech_acceleration_library-1.0.0.dist-info/WHEEL +5 -0
- cosmotech_acceleration_library-1.0.0.dist-info/entry_points.txt +2 -0
- cosmotech_acceleration_library-1.0.0.dist-info/licenses/LICENSE +17 -0
- cosmotech_acceleration_library-1.0.0.dist-info/top_level.txt +1 -0
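The paths above map directly onto import paths once the wheel is installed. A minimal sketch, assuming the wheel has been installed into the current environment (the two modules chosen here are purely illustrative):

# Minimal import sketch; assumes the wheel listed above is installed (e.g. with pip).
# File paths map to modules: cosmotech/coal/utils/logger.py -> cosmotech.coal.utils.logger
from cosmotech.coal.utils.logger import LOGGER
from cosmotech.coal.cosmotech_api.runner import parameters

LOGGER.info(f"Loaded CoAL module: {parameters.__name__}")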
cosmotech/coal/cosmotech_api/runner/parameters.py
@@ -0,0 +1,157 @@
+# Copyright (C) - 2023 - 2025 - Cosmo Tech
+# This document and all information contained herein is the exclusive property -
+# including all intellectual property rights pertaining thereto - of Cosmo Tech.
+# Any use, reproduction, translation, broadcasting, transmission, distribution,
+# etc., to any person is prohibited unless it has been previously and
+# specifically authorized by written means by Cosmo Tech.
+
+"""
+Parameter handling functions.
+"""
+
+import json
+import os
+import pathlib
+from csv import DictWriter
+from typing import List, Dict, Any
+
+from cosmotech.coal.utils.logger import LOGGER
+from cosmotech.orchestrator.utils.translate import T
+
+
+def get_runner_parameters(runner_data: Any) -> Dict[str, Any]:
+    """
+    Extract parameters from runner data.
+
+    Args:
+        runner_data: Runner data object
+
+    Returns:
+        Dictionary mapping parameter IDs to values
+    """
+    content = dict()
+    for parameter in runner_data.parameters_values:
+        content[parameter.parameter_id] = parameter.value
+    return content
+
+
+def format_parameters_list(runner_data: Any) -> List[Dict[str, Any]]:
+    """
+    Format parameters from runner data as a list of dictionaries.
+
+    Args:
+        runner_data: Runner data object
+
+    Returns:
+        List of parameter dictionaries
+    """
+    parameters = []
+
+    if not runner_data.parameters_values:
+        return parameters
+
+    max_name_size = max(map(lambda r: len(r.parameter_id), runner_data.parameters_values))
+    max_type_size = max(map(lambda r: len(r.var_type), runner_data.parameters_values))
+
+    for parameter_data in runner_data.parameters_values:
+        parameter_name = parameter_data.parameter_id
+        value = parameter_data.value
+        var_type = parameter_data.var_type
+        is_inherited = parameter_data.is_inherited
+
+        parameters.append(
+            {
+                "parameterId": parameter_name,
+                "value": value,
+                "varType": var_type,
+                "isInherited": is_inherited,
+            }
+        )
+
+        LOGGER.debug(
+            T("coal.cosmotech_api.runner.parameter_debug").format(
+                param_id=parameter_name,
+                max_name_size=max_name_size,
+                var_type=var_type,
+                max_type_size=max_type_size,
+                value=value,
+                inherited=" inherited" if is_inherited else "",
+            )
+        )
+
+    return parameters
+
+
+def write_parameters_to_json(parameter_folder: str, parameters: List[Dict[str, Any]]) -> str:
+    """
+    Write parameters to a JSON file.
+
+    Args:
+        parameter_folder: Folder to write the file to
+        parameters: List of parameter dictionaries
+
+    Returns:
+        Path to the created file
+    """
+    pathlib.Path(parameter_folder).mkdir(exist_ok=True, parents=True)
+    tmp_parameter_file = os.path.join(parameter_folder, "parameters.json")
+
+    LOGGER.info(T("coal.cosmotech_api.runner.generating_file").format(file=tmp_parameter_file))
+
+    with open(tmp_parameter_file, "w") as _file:
+        json.dump(parameters, _file, indent=2)
+
+    return tmp_parameter_file
+
+
+def write_parameters_to_csv(parameter_folder: str, parameters: List[Dict[str, Any]]) -> str:
+    """
+    Write parameters to a CSV file.
+
+    Args:
+        parameter_folder: Folder to write the file to
+        parameters: List of parameter dictionaries
+
+    Returns:
+        Path to the created file
+    """
+    pathlib.Path(parameter_folder).mkdir(exist_ok=True, parents=True)
+    tmp_parameter_file = os.path.join(parameter_folder, "parameters.csv")
+
+    LOGGER.info(T("coal.cosmotech_api.runner.generating_file").format(file=tmp_parameter_file))
+
+    with open(tmp_parameter_file, "w") as _file:
+        _w = DictWriter(_file, fieldnames=["parameterId", "value", "varType", "isInherited"])
+        _w.writeheader()
+        _w.writerows(parameters)
+
+    return tmp_parameter_file
+
+
+def write_parameters(
+    parameter_folder: str,
+    parameters: List[Dict[str, Any]],
+    write_csv: bool = True,
+    write_json: bool = False,
+) -> Dict[str, str]:
+    """
+    Write parameters to files based on specified formats.
+
+    Args:
+        parameter_folder: Folder to write the files to
+        parameters: List of parameter dictionaries
+        write_csv: Whether to write a CSV file
+        write_json: Whether to write a JSON file
+
+    Returns:
+        Dictionary mapping file types to file paths
+    """
+    result = {}
+
+    if write_csv:
+        result["csv"] = write_parameters_to_csv(parameter_folder, parameters)
+
+    if write_json:
+        result["json"] = write_parameters_to_json(parameter_folder, parameters)
+
+    return result
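Taken together, these helpers turn a runner's parameters_values into the parameters.csv / parameters.json files a run template consumes. A minimal sketch of that flow, using SimpleNamespace objects as a hypothetical stand-in for a runner returned by the Cosmo Tech API (the parameter names and values are illustrative only):

# Hypothetical stand-in for a runner fetched from the Cosmo Tech API; only the
# attributes read by the functions above (parameters_values and its fields) are set.
from types import SimpleNamespace

from cosmotech.coal.cosmotech_api.runner.parameters import (
    format_parameters_list,
    get_runner_parameters,
    write_parameters,
)

runner_data = SimpleNamespace(
    parameters_values=[
        SimpleNamespace(parameter_id="stock", value="6000", var_type="int", is_inherited=False),
        SimpleNamespace(parameter_id="restock_quantity", value="25", var_type="int", is_inherited=True),
    ]
)

print(get_runner_parameters(runner_data))  # {'stock': '6000', 'restock_quantity': '25'}
parameter_rows = format_parameters_list(runner_data)
written = write_parameters("./parameters", parameter_rows, write_csv=True, write_json=True)
print(written)  # {'csv': './parameters/parameters.csv', 'json': './parameters/parameters.json'}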
cosmotech/coal/cosmotech_api/twin_data_layer.py
@@ -0,0 +1,512 @@
+# Copyright (C) - 2023 - 2025 - Cosmo Tech
+# This document and all information contained herein is the exclusive property -
+# including all intellectual property rights pertaining thereto - of Cosmo Tech.
+# Any use, reproduction, translation, broadcasting, transmission, distribution,
+# etc., to any person is prohibited unless it has been previously and
+# specifically authorized by written means by Cosmo Tech.
+
+"""
+Twin Data Layer operations module.
+
+This module provides functions for interacting with the Twin Data Layer,
+including sending and loading files.
+"""
+
+import json
+import pathlib
+from csv import DictReader, DictWriter
+from io import StringIO
+from typing import Dict, List, Any, Optional, Set, Tuple
+
+import requests
+from cosmotech_api import DatasetApi, RunnerApi, DatasetTwinGraphQuery
+
+from cosmotech.coal.cosmotech_api.connection import get_api_client
+from cosmotech.coal.utils.logger import LOGGER
+from cosmotech.orchestrator.utils.translate import T
+
+ID_COLUMN = "id"
+
+SOURCE_COLUMN = "src"
+
+TARGET_COLUMN = "dest"
+
+BATCH_SIZE_LIMIT = 10000
+
+
+class CSVSourceFile:
+    def __init__(self, file_path: pathlib.Path):
+        self.file_path = file_path
+        if not file_path.name.endswith(".csv"):
+            raise ValueError(T("coal.common.validation.not_csv_file").format(file_path=file_path))
+        with open(file_path) as _file:
+            dr = DictReader(_file)
+            self.fields = list(dr.fieldnames)
+        self.object_type = file_path.name[:-4]
+
+        self.id_column = None
+        self.source_column = None
+        self.target_column = None
+
+        for _c in self.fields:
+            if _c.lower() == ID_COLUMN:
+                self.id_column = _c
+            if _c.lower() == SOURCE_COLUMN:
+                self.source_column = _c
+            if _c.lower() == TARGET_COLUMN:
+                self.target_column = _c
+
+        has_id = self.id_column is not None
+        has_source = self.source_column is not None
+        has_target = self.target_column is not None
+
+        is_relation = all([has_source, has_target])
+
+        if not has_id and not is_relation:
+            LOGGER.error(T("coal.common.validation.invalid_nodes_relations").format(file_path=file_path))
+            LOGGER.error(T("coal.common.validation.node_requirements").format(id_column=ID_COLUMN))
+            LOGGER.error(
+                T("coal.common.validation.relationship_requirements").format(
+                    id_column=ID_COLUMN,
+                    source_column=SOURCE_COLUMN,
+                    target_column=TARGET_COLUMN,
+                )
+            )
+            raise ValueError(T("coal.common.validation.invalid_nodes_relations").format(file_path=file_path))
+
+        self.is_node = has_id and not is_relation
+
+        self.content_fields = {
+            _f: _f for _f in self.fields if _f not in [self.id_column, self.source_column, self.target_column]
+        }
+        if has_id:
+            self.content_fields[ID_COLUMN] = self.id_column
+        if is_relation:
+            self.content_fields[SOURCE_COLUMN] = self.source_column
+            self.content_fields[TARGET_COLUMN] = self.target_column
+
+    def reload(self, inplace: bool = False) -> "CSVSourceFile":
+        if inplace:
+            self.__init__(self.file_path)
+            return self
+        return CSVSourceFile(self.file_path)
+
+    def generate_query_insert(self) -> str:
+        """
+        Read a CSV file headers and generate a CREATE cypher query
+        :return: the Cypher query for CREATE
+        """
+
+        field_names = sorted(self.content_fields.keys(), key=len, reverse=True)
+
+        if self.is_node:
+            query = (
+                "CREATE (:"
+                + self.object_type + " {"
+                + ", ".join(f"{property_name}: ${self.content_fields[property_name]}" for property_name in field_names)
+                + "})"
+            )
+            # query = ("UNWIND $params AS params " +
+            #          f"MERGE (n:{self.object_type}) " +
+            #          "SET n += params")
+        else:
+            query = (
+                "MATCH "
+                + "(source {"
+                + ID_COLUMN
+                + ":$"
+                + self.source_column
+                + "}),\n"
+                + "(target {"
+                + ID_COLUMN
+                + ":$"
+                + self.target_column
+                + "})\n"
+                + "CREATE (source)-[rel:"
+                + self.object_type
+                + " {"
+                + ", ".join(f"{property_name}: ${self.content_fields[property_name]}" for property_name in field_names)
+                + "}"
+                + "]->(target)\n"
+            )
+            # query = ("UNWIND $params AS params " +
+            #          "MATCH (source {" + ID_COLUMN + ":params." + self.source_column + "})\n" +
+            #          "MATCH (target {" + ID_COLUMN + ":params." + self.target_column + "})\n" +
+            #          f"CREATE (from) - [rel:{self.object_type}]->(to)" +
+            #          "SET rel += params")
+        return query
+
+
+def get_dataset_id_from_runner(organization_id: str, workspace_id: str, runner_id: str) -> str:
+    """
+    Get the dataset ID from a runner.
+
+    Args:
+        organization_id: Organization ID
+        workspace_id: Workspace ID
+        runner_id: Runner ID
+
+    Returns:
+        Dataset ID
+    """
+    api_client, _ = get_api_client()
+    api_runner = RunnerApi(api_client)
+
+    runner_info = api_runner.get_runner(
+        organization_id,
+        workspace_id,
+        runner_id,
+    )
+
+    if (datasets_len := len(runner_info.dataset_list)) != 1:
+        LOGGER.error(
+            T("coal.cosmotech_api.runner.not_single_dataset").format(runner_id=runner_info.id, count=datasets_len)
+        )
+        LOGGER.debug(T("coal.cosmotech_api.runner.runner_info").format(info=runner_info))
+        raise ValueError(f"Runner {runner_info.id} does not have exactly one dataset")
+
+    return runner_info.dataset_list[0]
+
+
+def send_files_to_tdl(
+    api_url: str,
+    organization_id: str,
+    workspace_id: str,
+    runner_id: str,
+    directory_path: str,
+    clear: bool = True,
+) -> None:
+    """
+    Send CSV files to the Twin Data Layer.
+
+    Args:
+        api_url: API URL
+        organization_id: Organization ID
+        workspace_id: Workspace ID
+        runner_id: Runner ID
+        directory_path: Directory containing CSV files
+        clear: Whether to clear the dataset before sending files
+    """
+    api_client, _ = get_api_client()
+    api_ds = DatasetApi(api_client)
+
+    # Get dataset ID from runner
+    dataset_id = get_dataset_id_from_runner(organization_id, workspace_id, runner_id)
+
+    # Get dataset info
+    dataset_info = api_ds.find_dataset_by_id(organization_id, dataset_id)
+    dataset_info.ingestion_status = "SUCCESS"
+    api_ds.update_dataset(organization_id, dataset_id, dataset_info)
+
+    # Process CSV files
+    entities_queries = {}
+    relation_queries = {}
+
+    content_path = pathlib.Path(directory_path)
+    if not content_path.is_dir():
+        LOGGER.error(T("coal.common.file_operations.not_directory").format(target_dir=directory_path))
+        raise ValueError(f"{directory_path} is not a directory")
+
+    # Process CSV files
+    for file_path in content_path.glob("*.csv"):
+        _csv = CSVSourceFile(file_path)
+        if _csv.is_node:
+            LOGGER.info(T("coal.services.azure_storage.sending_content").format(file=file_path))
+            entities_queries[file_path] = _csv.generate_query_insert()
+        else:
+            LOGGER.info(T("coal.services.azure_storage.sending_content").format(file=file_path))
+            relation_queries[file_path] = _csv.generate_query_insert()
+
+    # Prepare headers
+    header = {
+        "Accept": "application/json",
+        "Content-Type": "text/csv",
+        "User-Agent": "OpenAPI-Generator/1.0.0/python",
+    }
+    header.update(api_client.default_headers)
+
+    for authtype, authinfo in api_ds.api_client.configuration.auth_settings().items():
+        api_ds.api_client._apply_auth_params(header, None, None, None, None, authinfo)
+
+    # Clear dataset if requested
+    if clear:
+        LOGGER.info(T("coal.services.azure_storage.clearing_content"))
+        clear_query = "MATCH (n) DETACH DELETE n"
+        api_ds.twingraph_query(organization_id, dataset_id, DatasetTwinGraphQuery(query=str(clear_query)))
+
+    # Send files
+    for query_dict in [entities_queries, relation_queries]:
+        for file_path, query in query_dict.items():
+            _process_csv_file(
+                file_path=file_path,
+                query=query,
+                api_url=api_url,
+                organization_id=organization_id,
+                dataset_id=dataset_id,
+                header=header,
+            )
+
+    LOGGER.info(T("coal.services.azure_storage.all_data_sent"))
+
+    # Update dataset status
+    dataset_info.ingestion_status = "SUCCESS"
+    dataset_info.twincache_status = "FULL"
+    api_ds.update_dataset(organization_id, dataset_id, dataset_info)
+
+
+def _process_csv_file(
+    file_path: pathlib.Path,
+    query: str,
+    api_url: str,
+    organization_id: str,
+    dataset_id: str,
+    header: Dict[str, str],
+) -> None:
+    """
+    Process a CSV file and send it to the Twin Data Layer.
+
+    Args:
+        file_path: Path to the CSV file
+        query: Query to execute
+        api_url: API URL
+        organization_id: Organization ID
+        dataset_id: Dataset ID
+        header: HTTP headers
+    """
+    content = StringIO()
+    size = 0
+    batch = 1
+    errors = []
+    query_craft = api_url + f"/organizations/{organization_id}/datasets/{dataset_id}/batch?query={query}"
+    LOGGER.info(T("coal.services.azure_storage.sending_content").format(file=file_path))
+
+    with open(file_path, "r") as _f:
+        dr = DictReader(_f)
+        dw = DictWriter(content, fieldnames=sorted(dr.fieldnames, key=len, reverse=True))
+        dw.writeheader()
+        for row in dr:
+            dw.writerow(row)
+            size += 1
+            if size > BATCH_SIZE_LIMIT:
+                LOGGER.info(T("coal.services.azure_storage.row_batch").format(count=batch * BATCH_SIZE_LIMIT))
+                batch += 1
+                content.seek(0)
+                post = requests.post(query_craft, data=content.read(), headers=header)
+                post.raise_for_status()
+                errors.extend(json.loads(post.content)["errors"])
+                content = StringIO()
+                dw = DictWriter(
+                    content,
+                    fieldnames=sorted(dr.fieldnames, key=len, reverse=True),
+                )
+                dw.writeheader()
+                size = 0
+
+    if size > 0:
+        content.seek(0)
+        post = requests.post(query_craft, data=content.read(), headers=header)
+        post.raise_for_status()
+        errors.extend(json.loads(post.content)["errors"])
+
+    if len(errors):
+        LOGGER.error(T("coal.services.azure_storage.import_errors").format(count=len(errors)))
+        for _err in errors:
+            LOGGER.error(T("coal.services.azure_storage.error_detail").format(error=str(_err)))
+        raise ValueError(f"Error importing data from {file_path}")
+
+
+def load_files_from_tdl(
+    organization_id: str,
+    workspace_id: str,
+    directory_path: str,
+    runner_id: str,
+) -> None:
+    """
+    Load files from the Twin Data Layer.
+
+    Args:
+        organization_id: Organization ID
+        workspace_id: Workspace ID
+        directory_path: Directory to save files to
+        runner_id: Runner ID
+    """
+    api_client, _ = get_api_client()
+    api_ds = DatasetApi(api_client)
+
+    # Get dataset ID from runner
+    dataset_id = get_dataset_id_from_runner(organization_id, workspace_id, runner_id)
+
+    # Get dataset info
+    dataset_info = api_ds.find_dataset_by_id(organization_id, dataset_id)
+    if dataset_info.ingestion_status != "SUCCESS":
+        LOGGER.error(
+            T("coal.cosmotech_api.runner.dataset_state").format(
+                dataset_id=dataset_id, status=dataset_info.ingestion_status
+            )
+        )
+        LOGGER.debug(T("coal.cosmotech_api.runner.dataset_info").format(info=dataset_info))
+        raise ValueError(f"Dataset {dataset_id} is not in SUCCESS state")
+
+    # Create directory
+    directory_path = pathlib.Path(directory_path)
+    if directory_path.is_file():
+        LOGGER.error(T("coal.common.file_operations.not_directory").format(target_dir=directory_path))
+        raise ValueError(f"{directory_path} is not a directory")
+
+    directory_path.mkdir(parents=True, exist_ok=True)
+
+    # Get node and relationship properties
+    item_queries = {}
+    properties_nodes = _get_node_properties(api_ds, organization_id, dataset_id)
+    properties_relationships = _get_relationship_properties(api_ds, organization_id, dataset_id)
+
+    # Create queries
+    for label, keys in properties_nodes.items():
+        node_query = f"MATCH (n:{label}) RETURN {', '.join(map(lambda k: f'n.`{k}` as `{k}`', keys))}"
+        item_queries[label] = node_query
+
+    for label, keys in properties_relationships.items():
+        rel_query = f"MATCH ()-[n:{label}]->() RETURN {', '.join(map(lambda k: f'n.`{k}` as `{k}`', keys))}"
+        item_queries[label] = rel_query
+
+    # Execute queries and write files
+    files_content, files_headers = _execute_queries(api_ds, organization_id, dataset_id, item_queries)
+    _write_files(directory_path, files_content, files_headers)
+
+    LOGGER.info(T("coal.services.azure_storage.all_csv_written"))
+
+
+def _get_node_properties(api_ds: DatasetApi, organization_id: str, dataset_id: str) -> Dict[str, Set[str]]:
+    """
+    Get node properties from the Twin Data Layer.
+
+    Args:
+        api_ds: Dataset API
+        organization_id: Organization ID
+        dataset_id: Dataset ID
+
+    Returns:
+        Dictionary of node labels to sets of property keys
+    """
+    get_node_properties_query = "MATCH (n) RETURN distinct labels(n)[0] as label, keys(n) as keys"
+    node_properties_results: List[Dict[str, Any]] = api_ds.twingraph_query(
+        organization_id,
+        dataset_id,
+        DatasetTwinGraphQuery(query=get_node_properties_query),
+    )
+
+    properties_nodes = {}
+    for _r in node_properties_results:
+        label = _r["label"]
+        keys = _r["keys"]
+        if label not in properties_nodes:
+            properties_nodes[label] = set()
+        properties_nodes[label].update(keys)
+
+    return properties_nodes
+
+
+def _get_relationship_properties(api_ds: DatasetApi, organization_id: str, dataset_id: str) -> Dict[str, Set[str]]:
+    """
+    Get relationship properties from the Twin Data Layer.
+
+    Args:
+        api_ds: Dataset API
+        organization_id: Organization ID
+        dataset_id: Dataset ID
+
+    Returns:
+        Dictionary of relationship types to sets of property keys
+    """
+    get_relationship_properties_query = "MATCH ()-[r]->() RETURN distinct type(r) as label, keys(r) as keys"
+    relationship_properties_results: List[Dict[str, Any]] = api_ds.twingraph_query(
+        organization_id,
+        dataset_id,
+        DatasetTwinGraphQuery(query=get_relationship_properties_query),
+    )
+
+    properties_relationships = {}
+    for _r in relationship_properties_results:
+        label = _r["label"]
+        keys = _r["keys"]
+        if label not in properties_relationships:
+            properties_relationships[label] = set()
+        properties_relationships[label].update(keys)
+
+    return properties_relationships
+
+
+def _execute_queries(
+    api_ds: DatasetApi, organization_id: str, dataset_id: str, item_queries: Dict[str, str]
+) -> Tuple[Dict[str, List[Dict[str, Any]]], Dict[str, Set[str]]]:
+    """
+    Execute queries against the Twin Data Layer.
+
+    Args:
+        api_ds: Dataset API
+        organization_id: Organization ID
+        dataset_id: Dataset ID
+        item_queries: Dictionary of element types to queries
+
+    Returns:
+        Tuple of (files_content, files_headers)
+    """
+    files_content = {}
+    files_headers = {}
+
+    for element_type, query in item_queries.items():
+        element_query: List[Dict[str, Any]] = api_ds.twingraph_query(
+            organization_id, dataset_id, DatasetTwinGraphQuery(query=query)
+        )
+        for element in element_query:
+            if element_type not in files_content:
+                files_content[element_type] = []
+                files_headers[element_type] = set()
+            files_content[element_type].append(element)
+            files_headers[element_type].update(element.keys())
+
+    return files_content, files_headers
+
+
+def _write_files(
+    directory_path: pathlib.Path,
+    files_content: Dict[str, List[Dict[str, Any]]],
+    files_headers: Dict[str, Set[str]],
+) -> None:
+    """
+    Write files to disk.
+
+    Args:
+        directory_path: Directory to write files to
+        files_content: Dictionary of file names to lists of rows
+        files_headers: Dictionary of file names to sets of headers
+    """
+    for file_name in files_content.keys():
+        file_path = directory_path / (file_name + ".csv")
+        LOGGER.info(
+            T("coal.services.azure_storage.writing_lines").format(count=len(files_content[file_name]), file=file_path)
+        )
+        with file_path.open("w") as _f:
+            headers = files_headers[file_name]
+            has_id = "id" in headers
+            is_relation = "src" in headers
+            new_headers = []
+            if has_id:
+                headers.remove("id")
+                new_headers.append("id")
+            if is_relation:
+                headers.remove("src")
+                headers.remove("dest")
+                new_headers.append("src")
+                new_headers.append("dest")
+            headers = new_headers + sorted(headers)
+
+            dw = DictWriter(_f, fieldnames=headers)
+            dw.writeheader()
+            for row in sorted(files_content[file_name], key=lambda r: r.get("id", "")):
+                dw.writerow(
+                    {
+                        key: (json.dumps(value) if isinstance(value, (bool, dict, list)) else value)
+                        for key, value in row.items()
+                    }
+                )
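To make the CSVSourceFile conventions above concrete (an id column marks a node file, src/dest columns mark a relationship file), here is a minimal local sketch that prints the Cypher produced by generate_query_insert(). The file names and columns are illustrative only; actually pushing files with send_files_to_tdl would additionally require an API connection configured for get_api_client plus real organization, workspace and runner IDs.

# Minimal local sketch: build one node CSV and one relationship CSV in a temporary
# folder and inspect the Cypher CREATE queries derived from their headers.
# "Customer.csv" / "Knows.csv" and their columns are purely illustrative.
import pathlib
import tempfile

from cosmotech.coal.cosmotech_api.twin_data_layer import CSVSourceFile

with tempfile.TemporaryDirectory() as tmp:
    folder = pathlib.Path(tmp)

    node_file = folder / "Customer.csv"  # an "id" column makes this a node file
    node_file.write_text("id,name\nc1,Alice\nc2,Bob\n")

    relation_file = folder / "Knows.csv"  # "src"/"dest" columns make this a relationship file
    relation_file.write_text("src,dest,since\nc1,c2,2021\n")

    for csv_path in (node_file, relation_file):
        source = CSVSourceFile(csv_path)
        print(csv_path.name, "is_node =", source.is_node)
        print(source.generate_query_insert())

Because send_files_to_tdl submits entities_queries before relation_queries, node files are ingested first, so the src/dest identifiers in relationship files can be matched against the ids of already-created nodes.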