cosmotech-acceleration-library 1.0.1__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cosmotech/coal/__init__.py +1 -1
- cosmotech/coal/aws/__init__.py +1 -9
- cosmotech/coal/aws/s3.py +181 -214
- cosmotech/coal/azure/adx/auth.py +2 -2
- cosmotech/coal/azure/adx/runner.py +13 -14
- cosmotech/coal/azure/adx/store.py +5 -86
- cosmotech/coal/azure/adx/tables.py +2 -2
- cosmotech/coal/azure/blob.py +6 -6
- cosmotech/coal/azure/storage.py +3 -3
- cosmotech/coal/cosmotech_api/__init__.py +0 -24
- cosmotech/coal/cosmotech_api/apis/__init__.py +14 -0
- cosmotech/coal/cosmotech_api/apis/dataset.py +103 -0
- cosmotech/coal/cosmotech_api/apis/meta.py +25 -0
- cosmotech/coal/cosmotech_api/apis/organization.py +24 -0
- cosmotech/coal/cosmotech_api/apis/run.py +38 -0
- cosmotech/coal/cosmotech_api/apis/runner.py +71 -0
- cosmotech/coal/cosmotech_api/apis/solution.py +23 -0
- cosmotech/coal/cosmotech_api/apis/workspace.py +108 -0
- cosmotech/coal/cosmotech_api/objects/__init__.py +9 -0
- cosmotech/coal/cosmotech_api/objects/connection.py +125 -0
- cosmotech/coal/cosmotech_api/objects/parameters.py +127 -0
- cosmotech/coal/postgresql/runner.py +56 -36
- cosmotech/coal/postgresql/store.py +60 -14
- cosmotech/coal/postgresql/utils.py +254 -0
- cosmotech/coal/store/output/__init__.py +0 -0
- cosmotech/coal/store/output/aws_channel.py +73 -0
- cosmotech/coal/store/output/az_storage_channel.py +42 -0
- cosmotech/coal/store/output/channel_interface.py +23 -0
- cosmotech/coal/store/output/channel_spliter.py +55 -0
- cosmotech/coal/store/output/postgres_channel.py +40 -0
- cosmotech/coal/utils/configuration.py +169 -0
- cosmotech/coal/utils/decorator.py +22 -0
- cosmotech/csm_data/commands/api/api.py +6 -19
- cosmotech/csm_data/commands/api/postgres_send_runner_metadata.py +20 -16
- cosmotech/csm_data/commands/api/run_load_data.py +7 -46
- cosmotech/csm_data/commands/api/wsf_load_file.py +14 -15
- cosmotech/csm_data/commands/api/wsf_send_file.py +12 -13
- cosmotech/csm_data/commands/s3_bucket_delete.py +16 -15
- cosmotech/csm_data/commands/s3_bucket_download.py +16 -16
- cosmotech/csm_data/commands/s3_bucket_upload.py +16 -14
- cosmotech/csm_data/commands/store/dump_to_s3.py +18 -16
- cosmotech/csm_data/commands/store/output.py +35 -0
- cosmotech/csm_data/commands/store/store.py +3 -4
- cosmotech/translation/coal/en-US/coal/cosmotech_api/initialization.yml +8 -0
- cosmotech/translation/coal/en-US/coal/services/dataset.yml +4 -14
- cosmotech/translation/coal/en-US/coal/store/output/data_interface.yml +1 -0
- cosmotech/translation/coal/en-US/coal/store/output/split.yml +6 -0
- cosmotech/translation/coal/en-US/coal/utils/configuration.yml +2 -0
- cosmotech/translation/csm_data/en-US/csm_data/commands/store/output.yml +7 -0
- {cosmotech_acceleration_library-1.0.1.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/METADATA +8 -9
- {cosmotech_acceleration_library-1.0.1.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/RECORD +55 -71
- cosmotech/coal/azure/functions.py +0 -72
- cosmotech/coal/cosmotech_api/connection.py +0 -96
- cosmotech/coal/cosmotech_api/dataset/__init__.py +0 -26
- cosmotech/coal/cosmotech_api/dataset/converters.py +0 -164
- cosmotech/coal/cosmotech_api/dataset/download/__init__.py +0 -19
- cosmotech/coal/cosmotech_api/dataset/download/adt.py +0 -119
- cosmotech/coal/cosmotech_api/dataset/download/common.py +0 -140
- cosmotech/coal/cosmotech_api/dataset/download/file.py +0 -216
- cosmotech/coal/cosmotech_api/dataset/download/twingraph.py +0 -188
- cosmotech/coal/cosmotech_api/dataset/utils.py +0 -132
- cosmotech/coal/cosmotech_api/parameters.py +0 -48
- cosmotech/coal/cosmotech_api/run.py +0 -25
- cosmotech/coal/cosmotech_api/run_data.py +0 -173
- cosmotech/coal/cosmotech_api/run_template.py +0 -108
- cosmotech/coal/cosmotech_api/runner/__init__.py +0 -28
- cosmotech/coal/cosmotech_api/runner/data.py +0 -38
- cosmotech/coal/cosmotech_api/runner/datasets.py +0 -364
- cosmotech/coal/cosmotech_api/runner/download.py +0 -146
- cosmotech/coal/cosmotech_api/runner/metadata.py +0 -42
- cosmotech/coal/cosmotech_api/runner/parameters.py +0 -157
- cosmotech/coal/cosmotech_api/twin_data_layer.py +0 -512
- cosmotech/coal/cosmotech_api/workspace.py +0 -127
- cosmotech/coal/utils/api.py +0 -68
- cosmotech/coal/utils/postgresql.py +0 -236
- cosmotech/csm_data/commands/api/rds_load_csv.py +0 -90
- cosmotech/csm_data/commands/api/rds_send_csv.py +0 -74
- cosmotech/csm_data/commands/api/rds_send_store.py +0 -74
- cosmotech/csm_data/commands/api/runtemplate_load_handler.py +0 -66
- cosmotech/csm_data/commands/api/tdl_load_files.py +0 -76
- cosmotech/csm_data/commands/api/tdl_send_files.py +0 -82
- cosmotech/orchestrator_plugins/csm-data/templates/api/rds_load_csv.json +0 -27
- cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_csv.json +0 -27
- cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_store.json +0 -27
- cosmotech/orchestrator_plugins/csm-data/templates/api/runtemplate_load_handler.json +0 -27
- cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_load_files.json +0 -32
- cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_send_files.json +0 -27
- cosmotech/translation/coal/en-US/coal/cosmotech_api/run_data.yml +0 -2
- cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_load_csv.yml +0 -13
- cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_csv.yml +0 -12
- cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_store.yml +0 -12
- cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_load_files.yml +0 -14
- cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_send_files.yml +0 -18
- {cosmotech_acceleration_library-1.0.1.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/WHEEL +0 -0
- {cosmotech_acceleration_library-1.0.1.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/entry_points.txt +0 -0
- {cosmotech_acceleration_library-1.0.1.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/licenses/LICENSE +0 -0
- {cosmotech_acceleration_library-1.0.1.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/top_level.txt +0 -0
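In summary, as read from the file list above: 2.0.0 removes the 1.x dataset download package (cosmotech/coal/cosmotech_api/dataset/), the runner helpers, twin_data_layer.py, and the rds_*/tdl_* csm-data commands with their orchestrator templates. In their place it adds per-service API wrappers under cosmotech/coal/cosmotech_api/apis/, object helpers under cosmotech/coal/cosmotech_api/objects/, pluggable store output channels (S3, Azure Storage, PostgreSQL) under cosmotech/coal/store/output/, and new configuration utilities under cosmotech/coal/utils/. The full content of the four removed dataset download modules follows.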
--- cosmotech/coal/cosmotech_api/dataset/download/adt.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Copyright (C) - 2023 - 2025 - Cosmo Tech
-# This document and all information contained herein is the exclusive property -
-# including all intellectual property rights pertaining thereto - of Cosmo Tech.
-# Any use, reproduction, translation, broadcasting, transmission, distribution,
-# etc., to any person is prohibited unless it has been previously and
-# specifically authorized by written means by Cosmo Tech.
-
-import time
-import tempfile
-from pathlib import Path
-from typing import Dict, Any, Optional, Union, Tuple
-
-from azure.digitaltwins.core import DigitalTwinsClient
-from azure.identity import DefaultAzureCredential
-
-from cosmotech.coal.utils.logger import LOGGER
-from cosmotech.orchestrator.utils.translate import T
-from cosmotech.coal.cosmotech_api.connection import get_api_client
-from cosmotech.coal.cosmotech_api.dataset.converters import convert_dataset_to_files
-
-
-def download_adt_dataset(
-    adt_address: str,
-    target_folder: Optional[Union[str, Path]] = None,
-    credentials: Optional[DefaultAzureCredential] = None,
-) -> Tuple[Dict[str, Any], Path]:
-    """
-    Download dataset from Azure Digital Twins.
-
-    Args:
-        adt_address: The ADT instance address
-        target_folder: Optional folder to save files (if None, uses temp dir)
-        credentials: Optional Azure credentials (if None, uses DefaultAzureCredential)
-
-    Returns:
-        Tuple of (content dict, folder path)
-    """
-    start_time = time.time()
-    LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="ADT"))
-    LOGGER.debug(T("coal.services.dataset.adt_connecting").format(url=adt_address))
-
-    # Create credentials if not provided
-    if credentials is None:
-        if get_api_client()[1] == "Azure Entra Connection":
-            credentials = DefaultAzureCredential()
-        else:
-            LOGGER.error(T("coal.services.dataset.adt_no_credentials"))
-            raise ValueError("No credentials available for ADT connection")
-
-    # Create client and download data
-    client = DigitalTwinsClient(adt_address, credentials)
-
-    # Query twins
-    query_start = time.time()
-    LOGGER.debug(T("coal.services.dataset.adt_querying_twins"))
-    query_expression = "SELECT * FROM digitaltwins"
-    query_result = client.query_twins(query_expression)
-
-    json_content = dict()
-    twin_count = 0
-
-    for twin in query_result:
-        twin_count += 1
-        entity_type = twin.get("$metadata").get("$model").split(":")[-1].split(";")[0]
-        t_content = {k: v for k, v in twin.items()}
-        t_content["id"] = t_content["$dtId"]
-
-        # Remove system properties
-        for k in list(twin.keys()):
-            if k[0] == "$":
-                del t_content[k]
-
-        json_content.setdefault(entity_type, [])
-        json_content[entity_type].append(t_content)
-
-    query_time = time.time() - query_start
-    LOGGER.debug(T("coal.services.dataset.adt_twins_found").format(count=twin_count))
-    LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="twins query", time=query_time))
-
-    # Query relationships
-    rel_start = time.time()
-    LOGGER.debug(T("coal.services.dataset.adt_querying_relations"))
-    relations_query = "SELECT * FROM relationships"
-    query_result = client.query_twins(relations_query)
-
-    relation_count = 0
-    for relation in query_result:
-        relation_count += 1
-        tr = {"$relationshipId": "id", "$sourceId": "source", "$targetId": "target"}
-        r_content = {k: v for k, v in relation.items()}
-
-        # Map system properties to standard names
-        for k, v in tr.items():
-            r_content[v] = r_content[k]
-
-        # Remove system properties
-        for k in list(relation.keys()):
-            if k[0] == "$":
-                del r_content[k]
-
-        json_content.setdefault(relation["$relationshipName"], [])
-        json_content[relation["$relationshipName"]].append(r_content)
-
-    rel_time = time.time() - rel_start
-    LOGGER.debug(T("coal.services.dataset.adt_relations_found").format(count=relation_count))
-    LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="relations query", time=rel_time))
-
-    # Convert to files if target_folder is provided
-    if target_folder:
-        dataset_info = {"type": "adt", "content": json_content, "name": "ADT Dataset"}
-        target_folder = convert_dataset_to_files(dataset_info, target_folder)
-    else:
-        target_folder = tempfile.mkdtemp()
-
-    elapsed_time = time.time() - start_time
-    LOGGER.info(T("coal.common.timing.operation_completed").format(operation="ADT download", time=elapsed_time))
-    LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="ADT"))
-
-    return json_content, Path(target_folder)
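For reference, a minimal usage sketch of this removed 1.x helper, with the signature taken from the deleted module above; the ADT endpoint and target folder are placeholder values, and a reachable ADT instance with valid Azure credentials (or an Azure Entra API connection) is assumed:

from pathlib import Path

from cosmotech.coal.cosmotech_api.dataset.download.adt import download_adt_dataset

# Placeholder endpoint; requires a real ADT instance and credentials.
content, folder = download_adt_dataset(
    adt_address="https://example-adt.api.weu.digitaltwins.azure.net",
    target_folder=Path("./adt_dataset"),  # omit to get an empty temp directory instead
)
# content maps each model/relationship name to a list of dicts;
# folder points at the files written by convert_dataset_to_files.
print(sorted(content), folder)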
--- cosmotech/coal/cosmotech_api/dataset/download/common.py
+++ /dev/null
@@ -1,140 +0,0 @@
-# Copyright (C) - 2023 - 2025 - Cosmo Tech
-# This document and all information contained herein is the exclusive property -
-# including all intellectual property rights pertaining thereto - of Cosmo Tech.
-# Any use, reproduction, translation, broadcasting, transmission, distribution,
-# etc., to any person is prohibited unless it has been previously and
-# specifically authorized by written means by Cosmo Tech.
-
-import time
-from pathlib import Path
-from typing import Dict, Any, Optional, Union, Tuple
-
-from cosmotech_api import DatasetApi
-
-from cosmotech.coal.utils.logger import LOGGER
-from cosmotech.orchestrator.utils.translate import T
-from cosmotech.coal.cosmotech_api.connection import get_api_client
-
-# Import specific download functions
-# These imports are defined here to avoid circular imports
-# The functions are imported directly from their modules
-from cosmotech.coal.cosmotech_api.dataset.download.adt import download_adt_dataset
-from cosmotech.coal.cosmotech_api.dataset.download.twingraph import (
-    download_twingraph_dataset,
-    download_legacy_twingraph_dataset,
-)
-from cosmotech.coal.cosmotech_api.dataset.download.file import download_file_dataset
-
-
-def download_dataset_by_id(
-    organization_id: str,
-    workspace_id: str,
-    dataset_id: str,
-    target_folder: Optional[Union[str, Path]] = None,
-) -> Tuple[Dict[str, Any], Path]:
-    """
-    Download dataset by ID.
-
-    Args:
-        organization_id: Organization ID
-        workspace_id: Workspace ID
-        dataset_id: Dataset ID
-        target_folder: Optional folder to save files (if None, uses temp dir)
-
-    Returns:
-        Tuple of (dataset info dict, folder path)
-    """
-    start_time = time.time()
-    LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="Dataset"))
-    LOGGER.debug(
-        T("coal.services.dataset.dataset_downloading").format(organization_id=organization_id, dataset_id=dataset_id)
-    )
-
-    with get_api_client()[0] as api_client:
-        api_instance = DatasetApi(api_client)
-
-        # Get dataset info
-        info_start = time.time()
-        dataset = api_instance.find_dataset_by_id(organization_id=organization_id, dataset_id=dataset_id)
-        info_time = time.time() - info_start
-
-        LOGGER.debug(
-            T("coal.services.dataset.dataset_info_retrieved").format(dataset_name=dataset.name, dataset_id=dataset_id)
-        )
-        LOGGER.debug(
-            T("coal.common.timing.operation_completed").format(operation="dataset info retrieval", time=info_time)
-        )
-
-        # Determine dataset type and download
-        if dataset.connector is None:
-            parameters = []
-        else:
-            parameters = dataset.connector.parameters_values
-
-        is_adt = "AZURE_DIGITAL_TWINS_URL" in parameters
-        is_storage = "AZURE_STORAGE_CONTAINER_BLOB_PREFIX" in parameters
-        is_legacy_twin_cache = "TWIN_CACHE_NAME" in parameters and dataset.twingraph_id is None
-        is_in_workspace_file = (
-            False if dataset.tags is None else "workspaceFile" in dataset.tags or "dataset_part" in dataset.tags
-        )
-
-        download_start = time.time()
-
-        if is_adt:
-            LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="ADT"))
-            content, folder = download_adt_dataset(
-                adt_address=parameters["AZURE_DIGITAL_TWINS_URL"],
-                target_folder=target_folder,
-            )
-            dataset_type = "adt"
-
-        elif is_legacy_twin_cache:
-            LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="Legacy TwinGraph"))
-            twin_cache_name = parameters["TWIN_CACHE_NAME"]
-            content, folder = download_legacy_twingraph_dataset(
-                organization_id=organization_id,
-                cache_name=twin_cache_name,
-                target_folder=target_folder,
-            )
-            dataset_type = "twincache"
-
-        elif is_storage or is_in_workspace_file:
-            if is_storage:
-                LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="Storage"))
-                _file_name = parameters["AZURE_STORAGE_CONTAINER_BLOB_PREFIX"].replace("%WORKSPACE_FILE%/", "")
-            else:
-                LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="Workspace File"))
-                _file_name = dataset.source.location
-
-            content, folder = download_file_dataset(
-                organization_id=organization_id,
-                workspace_id=workspace_id,
-                file_name=_file_name,
-                target_folder=target_folder,
-            )
-            dataset_type = _file_name.split(".")[-1]
-
-        else:
-            LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="TwinGraph"))
-            content, folder = download_twingraph_dataset(
-                organization_id=organization_id,
-                dataset_id=dataset_id,
-                target_folder=target_folder,
-            )
-            dataset_type = "twincache"
-
-        download_time = time.time() - download_start
-        LOGGER.debug(
-            T("coal.common.timing.operation_completed").format(operation="content download", time=download_time)
-        )
-
-        # Prepare result
-        dataset_info = {"type": dataset_type, "content": content, "name": dataset.name}
-
-    elapsed_time = time.time() - start_time
-    LOGGER.info(
-        T("coal.common.timing.operation_completed").format(operation="total dataset download", time=elapsed_time)
-    )
-    LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="Dataset"))
-
-    return dataset_info, folder
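The dispatcher above infers the dataset flavor from the connector parameters (ADT URL, storage blob prefix, twin cache name, workspace-file tags) and delegates to the matching downloader. A minimal 1.x usage sketch, with the signature taken from the deleted module above and placeholder identifiers:

from cosmotech.coal.cosmotech_api.dataset.download.common import download_dataset_by_id

# Placeholder Cosmo Tech API identifiers.
dataset_info, folder = download_dataset_by_id(
    organization_id="o-example",
    workspace_id="w-example",
    dataset_id="d-example",
    target_folder="./dataset",
)
# dataset_info["type"] is "adt", "twincache", or the file extension for file datasets.
print(dataset_info["type"], dataset_info["name"], folder)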
--- cosmotech/coal/cosmotech_api/dataset/download/file.py
+++ /dev/null
@@ -1,216 +0,0 @@
-# Copyright (C) - 2023 - 2025 - Cosmo Tech
-# This document and all information contained herein is the exclusive property -
-# including all intellectual property rights pertaining thereto - of Cosmo Tech.
-# Any use, reproduction, translation, broadcasting, transmission, distribution,
-# etc., to any person is prohibited unless it has been previously and
-# specifically authorized by written means by Cosmo Tech.
-
-import csv
-import io
-import json
-import os
-import tempfile
-import time
-from pathlib import Path
-from typing import Dict, List, Any, Optional, Union, Tuple
-
-from cosmotech_api import WorkspaceApi
-from openpyxl import load_workbook
-
-from cosmotech.coal.utils.logger import LOGGER
-from cosmotech.orchestrator.utils.translate import T
-from cosmotech.coal.cosmotech_api.connection import get_api_client
-
-
-def download_file_dataset(
-    organization_id: str,
-    workspace_id: str,
-    file_name: str,
-    target_folder: Optional[Union[str, Path]] = None,
-    read_files: bool = True,
-) -> Tuple[Dict[str, Any], Path]:
-    """
-    Download file dataset.
-
-    Args:
-        organization_id: Organization ID
-        workspace_id: Workspace ID
-        file_name: File name to download
-        target_folder: Optional folder to save files (if None, uses temp dir)
-        read_files: Whether to read file contents
-
-    Returns:
-        Tuple of (content dict, folder path)
-    """
-    start_time = time.time()
-    LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="File"))
-    LOGGER.debug(
-        T("coal.services.dataset.file_downloading").format(
-            organization_id=organization_id,
-            workspace_id=workspace_id,
-            file_name=file_name,
-        )
-    )
-
-    # Create temp directory for downloaded files
-    if target_folder is None:
-        tmp_dataset_dir = tempfile.mkdtemp()
-    else:
-        tmp_dataset_dir = Path(target_folder)
-        tmp_dataset_dir.mkdir(parents=True, exist_ok=True)
-        tmp_dataset_dir = str(tmp_dataset_dir)
-
-    LOGGER.debug(T("coal.services.dataset.using_folder").format(folder=tmp_dataset_dir))
-
-    content = dict()
-
-    with get_api_client()[0] as api_client:
-        api_ws = WorkspaceApi(api_client)
-
-        # Find all files matching the pattern
-        list_start = time.time()
-        LOGGER.debug(T("coal.services.dataset.listing_workspace_files"))
-        all_api_files = api_ws.find_all_workspace_files(organization_id, workspace_id)
-
-        existing_files = list(_f.file_name for _f in all_api_files if _f.file_name.startswith(file_name))
-        list_time = time.time() - list_start
-
-        LOGGER.debug(T("coal.services.dataset.workspace_files_found").format(count=len(existing_files)))
-        LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="file listing", time=list_time))
-
-        if not existing_files:
-            LOGGER.warning(T("coal.services.dataset.no_files_found").format(file_name=file_name))
-            return content, Path(tmp_dataset_dir)
-
-        # Download and process each file
-        for _file_name in existing_files:
-            download_start = time.time()
-            LOGGER.debug(T("coal.services.dataset.downloading_file").format(file_name=_file_name))
-
-            dl_file = api_ws.download_workspace_file(
-                organization_id=organization_id,
-                workspace_id=workspace_id,
-                file_name=_file_name,
-            )
-
-            target_file = os.path.join(tmp_dataset_dir, _file_name.split("/")[-1])
-            with open(target_file, "wb") as tmp_file:
-                tmp_file.write(dl_file)
-
-            download_time = time.time() - download_start
-            LOGGER.debug(T("coal.services.dataset.file_downloaded").format(file_name=_file_name, path=target_file))
-            LOGGER.debug(
-                T("coal.common.timing.operation_completed").format(
-                    operation=f"download {_file_name}", time=download_time
-                )
-            )
-
-            if not read_files:
-                continue
-
-            # Process file based on type
-            process_start = time.time()
-
-            if ".xls" in _file_name:
-                LOGGER.debug(T("coal.services.dataset.processing_excel").format(file_name=target_file))
-                wb = load_workbook(target_file, data_only=True)
-
-                for sheet_name in wb.sheetnames:
-                    sheet = wb[sheet_name]
-                    content[sheet_name] = list()
-                    headers = next(sheet.iter_rows(max_row=1, values_only=True))
-
-                    def item(_row: tuple) -> dict:
-                        return {k: v for k, v in zip(headers, _row)}
-
-                    row_count = 0
-                    for r in sheet.iter_rows(min_row=2, values_only=True):
-                        row = item(r)
-                        new_row = dict()
-
-                        for key, value in row.items():
-                            try:
-                                converted_value = json.load(io.StringIO(value))
-                            except (json.decoder.JSONDecodeError, TypeError):
-                                converted_value = value
-
-                            if converted_value is not None:
-                                new_row[key] = converted_value
-
-                        if new_row:
-                            content[sheet_name].append(new_row)
-                            row_count += 1
-
-                    LOGGER.debug(
-                        T("coal.services.dataset.sheet_processed").format(sheet_name=sheet_name, rows=row_count)
-                    )
-
-            elif ".csv" in _file_name:
-                LOGGER.debug(T("coal.services.dataset.processing_csv").format(file_name=target_file))
-                with open(target_file, "r") as file:
-                    current_filename = os.path.basename(target_file)[: -len(".csv")]
-                    content[current_filename] = list()
-
-                    row_count = 0
-                    for csv_row in csv.DictReader(file):
-                        csv_row: dict
-                        new_row = dict()
-
-                        for key, value in csv_row.items():
-                            try:
-                                # Try to convert any json row to dict object
-                                converted_value = json.load(io.StringIO(value))
-                            except json.decoder.JSONDecodeError:
-                                converted_value = value
-
-                            if converted_value == "":
-                                converted_value = None
-
-                            if converted_value is not None:
-                                new_row[key] = converted_value
-
-                        content[current_filename].append(new_row)
-                        row_count += 1
-
-                    LOGGER.debug(
-                        T("coal.services.dataset.csv_processed").format(file_name=current_filename, rows=row_count)
-                    )
-
-            elif ".json" in _file_name:
-                LOGGER.debug(T("coal.services.dataset.processing_json").format(file_name=target_file))
-                with open(target_file, "r") as _file:
-                    current_filename = os.path.basename(target_file)
-                    content[current_filename] = json.load(_file)
-
-                if isinstance(content[current_filename], dict):
-                    item_count = len(content[current_filename])
-                elif isinstance(content[current_filename], list):
-                    item_count = len(content[current_filename])
-                else:
-                    item_count = 1
-
-                LOGGER.debug(
-                    T("coal.services.dataset.json_processed").format(file_name=current_filename, items=item_count)
-                )
-
-            else:
-                LOGGER.debug(T("coal.services.dataset.processing_text").format(file_name=target_file))
-                with open(target_file, "r") as _file:
-                    current_filename = os.path.basename(target_file)
-                    content[current_filename] = "\n".join(line for line in _file)
-
-                line_count = content[current_filename].count("\n") + 1
-                LOGGER.debug(
-                    T("coal.services.dataset.text_processed").format(file_name=current_filename, lines=line_count)
-                )
-
-            process_time = time.time() - process_start
-            LOGGER.debug(
-                T("coal.common.timing.operation_completed").format(operation=f"process {_file_name}", time=process_time)
-            )
-
-    elapsed_time = time.time() - start_time
-    LOGGER.info(T("coal.common.timing.operation_completed").format(operation="File download", time=elapsed_time))
-    LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="File"))
-
-    return content, Path(tmp_dataset_dir)
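The downloader above matches workspace files by prefix and, when read_files is true, parses each one by extension: .xls* via openpyxl, .csv via csv.DictReader with per-cell JSON coercion, .json via json.load, and anything else as plain text. A minimal 1.x usage sketch, with the signature taken from the deleted module above and placeholder identifiers and file name:

from cosmotech.coal.cosmotech_api.dataset.download.file import download_file_dataset

content, folder = download_file_dataset(
    organization_id="o-example",
    workspace_id="w-example",
    file_name="my_dataset.csv",  # prefix match: every workspace file starting with this is fetched
    target_folder="./files",
    read_files=True,  # set False to download without parsing
)
# CSV content is keyed by the file name without its extension.
print(content.get("my_dataset"), folder)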
--- cosmotech/coal/cosmotech_api/dataset/download/twingraph.py
+++ /dev/null
@@ -1,188 +0,0 @@
-# Copyright (C) - 2023 - 2025 - Cosmo Tech
-# This document and all information contained herein is the exclusive property -
-# including all intellectual property rights pertaining thereto - of Cosmo Tech.
-# Any use, reproduction, translation, broadcasting, transmission, distribution,
-# etc., to any person is prohibited unless it has been previously and
-# specifically authorized by written means by Cosmo Tech.
-
-import time
-import tempfile
-from pathlib import Path
-from typing import Dict, List, Any, Optional, Union, Tuple
-
-from cosmotech_api import (
-    DatasetApi,
-    DatasetTwinGraphQuery,
-    TwinGraphQuery,
-    TwingraphApi,
-)
-
-from cosmotech.coal.utils.logger import LOGGER
-from cosmotech.orchestrator.utils.translate import T
-from cosmotech.coal.cosmotech_api.connection import get_api_client
-from cosmotech.coal.cosmotech_api.dataset.utils import get_content_from_twin_graph_data
-from cosmotech.coal.cosmotech_api.dataset.converters import convert_dataset_to_files
-
-
-def download_twingraph_dataset(
-    organization_id: str,
-    dataset_id: str,
-    target_folder: Optional[Union[str, Path]] = None,
-) -> Tuple[Dict[str, Any], Path]:
-    """
-    Download dataset from TwinGraph.
-
-    Args:
-        organization_id: Organization ID
-        dataset_id: Dataset ID
-        target_folder: Optional folder to save files (if None, uses temp dir)
-
-    Returns:
-        Tuple of (content dict, folder path)
-    """
-    start_time = time.time()
-    LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="TwinGraph"))
-    LOGGER.debug(
-        T("coal.services.dataset.twingraph_downloading").format(organization_id=organization_id, dataset_id=dataset_id)
-    )
-
-    with get_api_client()[0] as api_client:
-        dataset_api = DatasetApi(api_client)
-
-        # Query nodes
-        nodes_start = time.time()
-        LOGGER.debug(T("coal.services.dataset.twingraph_querying_nodes").format(dataset_id=dataset_id))
-        nodes_query = DatasetTwinGraphQuery(query="MATCH(n) RETURN n")
-
-        nodes = dataset_api.twingraph_query(
-            organization_id=organization_id,
-            dataset_id=dataset_id,
-            dataset_twin_graph_query=nodes_query,
-        )
-
-        nodes_time = time.time() - nodes_start
-        LOGGER.debug(T("coal.services.dataset.twingraph_nodes_found").format(count=len(nodes)))
-        LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="nodes query", time=nodes_time))
-
-        # Query edges
-        edges_start = time.time()
-        LOGGER.debug(T("coal.services.dataset.twingraph_querying_edges").format(dataset_id=dataset_id))
-        edges_query = DatasetTwinGraphQuery(query="MATCH(n)-[r]->(m) RETURN n as src, r as rel, m as dest")
-
-        edges = dataset_api.twingraph_query(
-            organization_id=organization_id,
-            dataset_id=dataset_id,
-            dataset_twin_graph_query=edges_query,
-        )
-
-        edges_time = time.time() - edges_start
-        LOGGER.debug(T("coal.services.dataset.twingraph_edges_found").format(count=len(edges)))
-        LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="edges query", time=edges_time))
-
-    # Process results
-    process_start = time.time()
-    content = get_content_from_twin_graph_data(nodes, edges, True)
-    process_time = time.time() - process_start
-
-    LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="data processing", time=process_time))
-
-    # Convert to files if target_folder is provided
-    if target_folder:
-        dataset_info = {
-            "type": "twincache",
-            "content": content,
-            "name": f"TwinGraph Dataset {dataset_id}",
-        }
-        target_folder = convert_dataset_to_files(dataset_info, target_folder)
-    else:
-        target_folder = tempfile.mkdtemp()
-
-    elapsed_time = time.time() - start_time
-    LOGGER.info(T("coal.common.timing.operation_completed").format(operation="TwinGraph download", time=elapsed_time))
-    LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="TwinGraph"))
-
-    return content, Path(target_folder)
-
-
-def download_legacy_twingraph_dataset(
-    organization_id: str,
-    cache_name: str,
-    target_folder: Optional[Union[str, Path]] = None,
-) -> Tuple[Dict[str, Any], Path]:
-    """
-    Download dataset from legacy TwinGraph.
-
-    Args:
-        organization_id: Organization ID
-        cache_name: Twin cache name
-        target_folder: Optional folder to save files (if None, uses temp dir)
-
-    Returns:
-        Tuple of (content dict, folder path)
-    """
-    start_time = time.time()
-    LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="Legacy TwinGraph"))
-    LOGGER.debug(
-        T("coal.services.dataset.legacy_twingraph_downloading").format(
-            organization_id=organization_id, cache_name=cache_name
-        )
-    )
-
-    with get_api_client()[0] as api_client:
-        api_instance = TwingraphApi(api_client)
-
-        # Query nodes
-        nodes_start = time.time()
-        LOGGER.debug(T("coal.services.dataset.legacy_twingraph_querying_nodes").format(cache_name=cache_name))
-        _query_nodes = TwinGraphQuery(query="MATCH(n) RETURN n")
-
-        nodes = api_instance.query(
-            organization_id=organization_id,
-            graph_id=cache_name,
-            twin_graph_query=_query_nodes,
-        )
-
-        nodes_time = time.time() - nodes_start
-        LOGGER.debug(T("coal.services.dataset.legacy_twingraph_nodes_found").format(count=len(nodes)))
-        LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="nodes query", time=nodes_time))
-
-        # Query relationships
-        rel_start = time.time()
-        LOGGER.debug(T("coal.services.dataset.legacy_twingraph_querying_relations").format(cache_name=cache_name))
-        _query_rel = TwinGraphQuery(query="MATCH(n)-[r]->(m) RETURN n as src, r as rel, m as dest")
-
-        rel = api_instance.query(
-            organization_id=organization_id,
-            graph_id=cache_name,
-            twin_graph_query=_query_rel,
-        )
-
-        rel_time = time.time() - rel_start
-        LOGGER.debug(T("coal.services.dataset.legacy_twingraph_relations_found").format(count=len(rel)))
-        LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="relations query", time=rel_time))
-
-    # Process results
-    process_start = time.time()
-    content = get_content_from_twin_graph_data(nodes, rel, False)
-    process_time = time.time() - process_start
-
-    LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="data processing", time=process_time))
-
-    # Convert to files if target_folder is provided
-    if target_folder:
-        dataset_info = {
-            "type": "twincache",
-            "content": content,
-            "name": f"Legacy TwinGraph Dataset {cache_name}",
-        }
-        target_folder = convert_dataset_to_files(dataset_info, target_folder)
-    else:
-        target_folder = tempfile.mkdtemp()
-
-    elapsed_time = time.time() - start_time
-    LOGGER.info(
-        T("coal.common.timing.operation_completed").format(operation="Legacy TwinGraph download", time=elapsed_time)
-    )
-    LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="Legacy TwinGraph"))
-
-    return content, Path(target_folder)
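Both graph downloaders above run the same two queries (all nodes, then all src/rel/dest triples) and normalize the results through get_content_from_twin_graph_data; they differ only in the API used and in addressing by dataset id versus cache name. A minimal 1.x usage sketch, with the signatures taken from the deleted module above and placeholder identifiers:

from cosmotech.coal.cosmotech_api.dataset.download.twingraph import (
    download_legacy_twingraph_dataset,
    download_twingraph_dataset,
)

# Dataset-service twin graph, addressed by dataset id.
content, folder = download_twingraph_dataset(
    organization_id="o-example",
    dataset_id="d-example",
    target_folder="./graph",
)

# Legacy twin cache, addressed by cache name instead of dataset id.
legacy_content, legacy_folder = download_legacy_twingraph_dataset(
    organization_id="o-example",
    cache_name="example-twin-cache",
)
print(sorted(content), sorted(legacy_content))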