cosmotech-acceleration-library 1.1.0-py3-none-any.whl → 2.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (98)
  1. cosmotech/coal/__init__.py +1 -1
  2. cosmotech/coal/aws/__init__.py +1 -9
  3. cosmotech/coal/aws/s3.py +181 -214
  4. cosmotech/coal/azure/adx/auth.py +2 -2
  5. cosmotech/coal/azure/adx/runner.py +13 -14
  6. cosmotech/coal/azure/adx/store.py +5 -86
  7. cosmotech/coal/azure/adx/tables.py +2 -2
  8. cosmotech/coal/azure/blob.py +6 -6
  9. cosmotech/coal/azure/storage.py +3 -3
  10. cosmotech/coal/cosmotech_api/__init__.py +0 -28
  11. cosmotech/coal/cosmotech_api/apis/__init__.py +14 -0
  12. cosmotech/coal/cosmotech_api/apis/dataset.py +103 -0
  13. cosmotech/coal/cosmotech_api/apis/meta.py +25 -0
  14. cosmotech/coal/cosmotech_api/apis/organization.py +24 -0
  15. cosmotech/coal/cosmotech_api/apis/run.py +38 -0
  16. cosmotech/coal/cosmotech_api/apis/runner.py +71 -0
  17. cosmotech/coal/cosmotech_api/apis/solution.py +23 -0
  18. cosmotech/coal/cosmotech_api/apis/workspace.py +108 -0
  19. cosmotech/coal/cosmotech_api/objects/__init__.py +9 -0
  20. cosmotech/coal/cosmotech_api/objects/connection.py +125 -0
  21. cosmotech/coal/cosmotech_api/objects/parameters.py +127 -0
  22. cosmotech/coal/postgresql/runner.py +56 -36
  23. cosmotech/coal/postgresql/store.py +60 -14
  24. cosmotech/coal/postgresql/utils.py +254 -0
  25. cosmotech/coal/store/output/__init__.py +0 -0
  26. cosmotech/coal/store/output/aws_channel.py +73 -0
  27. cosmotech/coal/store/output/az_storage_channel.py +42 -0
  28. cosmotech/coal/store/output/channel_interface.py +23 -0
  29. cosmotech/coal/store/output/channel_spliter.py +55 -0
  30. cosmotech/coal/store/output/postgres_channel.py +40 -0
  31. cosmotech/coal/utils/configuration.py +169 -0
  32. cosmotech/coal/utils/decorator.py +4 -7
  33. cosmotech/csm_data/commands/api/api.py +6 -19
  34. cosmotech/csm_data/commands/api/postgres_send_runner_metadata.py +20 -16
  35. cosmotech/csm_data/commands/api/run_load_data.py +7 -46
  36. cosmotech/csm_data/commands/api/wsf_load_file.py +13 -16
  37. cosmotech/csm_data/commands/api/wsf_send_file.py +11 -14
  38. cosmotech/csm_data/commands/s3_bucket_delete.py +16 -15
  39. cosmotech/csm_data/commands/s3_bucket_download.py +16 -16
  40. cosmotech/csm_data/commands/s3_bucket_upload.py +16 -14
  41. cosmotech/csm_data/commands/store/dump_to_s3.py +18 -16
  42. cosmotech/csm_data/commands/store/output.py +35 -0
  43. cosmotech/csm_data/commands/store/store.py +3 -3
  44. cosmotech/translation/coal/en-US/coal/cosmotech_api/initialization.yml +8 -0
  45. cosmotech/translation/coal/en-US/coal/services/dataset.yml +4 -14
  46. cosmotech/translation/coal/en-US/coal/store/output/data_interface.yml +1 -0
  47. cosmotech/translation/coal/en-US/coal/store/output/split.yml +6 -0
  48. cosmotech/translation/coal/en-US/coal/utils/configuration.yml +2 -0
  49. cosmotech/translation/csm_data/en-US/csm_data/commands/store/output.yml +7 -0
  50. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/METADATA +5 -8
  51. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/RECORD +55 -73
  52. cosmotech/coal/azure/functions.py +0 -72
  53. cosmotech/coal/cosmotech_api/connection.py +0 -96
  54. cosmotech/coal/cosmotech_api/dataset/__init__.py +0 -26
  55. cosmotech/coal/cosmotech_api/dataset/converters.py +0 -164
  56. cosmotech/coal/cosmotech_api/dataset/download/__init__.py +0 -19
  57. cosmotech/coal/cosmotech_api/dataset/download/adt.py +0 -119
  58. cosmotech/coal/cosmotech_api/dataset/download/common.py +0 -140
  59. cosmotech/coal/cosmotech_api/dataset/download/file.py +0 -229
  60. cosmotech/coal/cosmotech_api/dataset/download/twingraph.py +0 -185
  61. cosmotech/coal/cosmotech_api/dataset/upload.py +0 -41
  62. cosmotech/coal/cosmotech_api/dataset/utils.py +0 -132
  63. cosmotech/coal/cosmotech_api/parameters.py +0 -48
  64. cosmotech/coal/cosmotech_api/run.py +0 -25
  65. cosmotech/coal/cosmotech_api/run_data.py +0 -173
  66. cosmotech/coal/cosmotech_api/run_template.py +0 -108
  67. cosmotech/coal/cosmotech_api/runner/__init__.py +0 -28
  68. cosmotech/coal/cosmotech_api/runner/data.py +0 -38
  69. cosmotech/coal/cosmotech_api/runner/datasets.py +0 -416
  70. cosmotech/coal/cosmotech_api/runner/download.py +0 -135
  71. cosmotech/coal/cosmotech_api/runner/metadata.py +0 -42
  72. cosmotech/coal/cosmotech_api/runner/parameters.py +0 -157
  73. cosmotech/coal/cosmotech_api/twin_data_layer.py +0 -512
  74. cosmotech/coal/cosmotech_api/workspace.py +0 -127
  75. cosmotech/coal/utils/postgresql.py +0 -236
  76. cosmotech/coal/utils/semver.py +0 -6
  77. cosmotech/csm_data/commands/api/rds_load_csv.py +0 -90
  78. cosmotech/csm_data/commands/api/rds_send_csv.py +0 -74
  79. cosmotech/csm_data/commands/api/rds_send_store.py +0 -74
  80. cosmotech/csm_data/commands/api/runtemplate_load_handler.py +0 -66
  81. cosmotech/csm_data/commands/api/tdl_load_files.py +0 -76
  82. cosmotech/csm_data/commands/api/tdl_send_files.py +0 -82
  83. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_load_csv.json +0 -27
  84. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_csv.json +0 -27
  85. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_store.json +0 -27
  86. cosmotech/orchestrator_plugins/csm-data/templates/api/runtemplate_load_handler.json +0 -27
  87. cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_load_files.json +0 -32
  88. cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_send_files.json +0 -27
  89. cosmotech/translation/coal/en-US/coal/cosmotech_api/run_data.yml +0 -2
  90. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_load_csv.yml +0 -13
  91. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_csv.yml +0 -12
  92. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_store.yml +0 -12
  93. cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_load_files.yml +0 -14
  94. cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_send_files.yml +0 -18
  95. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/WHEEL +0 -0
  96. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/entry_points.txt +0 -0
  97. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/licenses/LICENSE +0 -0
  98. {cosmotech_acceleration_library-1.1.0.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/top_level.txt +0 -0
cosmotech/coal/cosmotech_api/dataset/download/adt.py
@@ -1,119 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- import time
- import tempfile
- from pathlib import Path
- from typing import Dict, Any, Optional, Union, Tuple
-
- from azure.digitaltwins.core import DigitalTwinsClient
- from azure.identity import DefaultAzureCredential
-
- from cosmotech.coal.utils.logger import LOGGER
- from cosmotech.orchestrator.utils.translate import T
- from cosmotech.coal.cosmotech_api.connection import get_api_client
- from cosmotech.coal.cosmotech_api.dataset.converters import convert_dataset_to_files
-
-
- def download_adt_dataset(
-     adt_address: str,
-     target_folder: Optional[Union[str, Path]] = None,
-     credentials: Optional[DefaultAzureCredential] = None,
- ) -> Tuple[Dict[str, Any], Path]:
-     """
-     Download dataset from Azure Digital Twins.
-
-     Args:
-         adt_address: The ADT instance address
-         target_folder: Optional folder to save files (if None, uses temp dir)
-         credentials: Optional Azure credentials (if None, uses DefaultAzureCredential)
-
-     Returns:
-         Tuple of (content dict, folder path)
-     """
-     start_time = time.time()
-     LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="ADT"))
-     LOGGER.debug(T("coal.services.dataset.adt_connecting").format(url=adt_address))
-
-     # Create credentials if not provided
-     if credentials is None:
-         if get_api_client()[1] == "Azure Entra Connection":
-             credentials = DefaultAzureCredential()
-         else:
-             LOGGER.error(T("coal.services.dataset.adt_no_credentials"))
-             raise ValueError("No credentials available for ADT connection")
-
-     # Create client and download data
-     client = DigitalTwinsClient(adt_address, credentials)
-
-     # Query twins
-     query_start = time.time()
-     LOGGER.debug(T("coal.services.dataset.adt_querying_twins"))
-     query_expression = "SELECT * FROM digitaltwins"
-     query_result = client.query_twins(query_expression)
-
-     json_content = dict()
-     twin_count = 0
-
-     for twin in query_result:
-         twin_count += 1
-         entity_type = twin.get("$metadata").get("$model").split(":")[-1].split(";")[0]
-         t_content = {k: v for k, v in twin.items()}
-         t_content["id"] = t_content["$dtId"]
-
-         # Remove system properties
-         for k in list(twin.keys()):
-             if k[0] == "$":
-                 del t_content[k]
-
-         json_content.setdefault(entity_type, [])
-         json_content[entity_type].append(t_content)
-
-     query_time = time.time() - query_start
-     LOGGER.debug(T("coal.services.dataset.adt_twins_found").format(count=twin_count))
-     LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="twins query", time=query_time))
-
-     # Query relationships
-     rel_start = time.time()
-     LOGGER.debug(T("coal.services.dataset.adt_querying_relations"))
-     relations_query = "SELECT * FROM relationships"
-     query_result = client.query_twins(relations_query)
-
-     relation_count = 0
-     for relation in query_result:
-         relation_count += 1
-         tr = {"$relationshipId": "id", "$sourceId": "source", "$targetId": "target"}
-         r_content = {k: v for k, v in relation.items()}
-
-         # Map system properties to standard names
-         for k, v in tr.items():
-             r_content[v] = r_content[k]
-
-         # Remove system properties
-         for k in list(relation.keys()):
-             if k[0] == "$":
-                 del r_content[k]
-
-         json_content.setdefault(relation["$relationshipName"], [])
-         json_content[relation["$relationshipName"]].append(r_content)
-
-     rel_time = time.time() - rel_start
-     LOGGER.debug(T("coal.services.dataset.adt_relations_found").format(count=relation_count))
-     LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="relations query", time=rel_time))
-
-     # Convert to files if target_folder is provided
-     if target_folder:
-         dataset_info = {"type": "adt", "content": json_content, "name": "ADT Dataset"}
-         target_folder = convert_dataset_to_files(dataset_info, target_folder)
-     else:
-         target_folder = tempfile.mkdtemp()
-
-     elapsed_time = time.time() - start_time
-     LOGGER.info(T("coal.common.timing.operation_completed").format(operation="ADT download", time=elapsed_time))
-     LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="ADT"))
-
-     return json_content, Path(target_folder)
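
Note: with this module removed in 2.0.0, the twin/relationship export it performed can still be reproduced directly against the Azure SDK. A minimal standalone sketch, using only the `DigitalTwinsClient` calls visible in the deleted code; the endpoint URL is a placeholder and credential setup is simplified:

```python
from azure.digitaltwins.core import DigitalTwinsClient
from azure.identity import DefaultAzureCredential

ADT_URL = "https://my-instance.api.weu.digitaltwins.azure.net"  # placeholder

client = DigitalTwinsClient(ADT_URL, DefaultAzureCredential())

# Group twins by the short model name, as the removed helper did.
content: dict = {}
for twin in client.query_twins("SELECT * FROM digitaltwins"):
    entity_type = twin["$metadata"]["$model"].split(":")[-1].split(";")[0]
    row = {k: v for k, v in twin.items() if not k.startswith("$")}
    row["id"] = twin["$dtId"]
    content.setdefault(entity_type, []).append(row)

# Relationships come through the same query endpoint and are keyed by
# relationship name, with system properties mapped to id/source/target.
for rel in client.query_twins("SELECT * FROM relationships"):
    row = {k: v for k, v in rel.items() if not k.startswith("$")}
    row.update(id=rel["$relationshipId"], source=rel["$sourceId"], target=rel["$targetId"])
    content.setdefault(rel["$relationshipName"], []).append(row)
```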
cosmotech/coal/cosmotech_api/dataset/download/common.py
@@ -1,140 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- import time
- from pathlib import Path
- from typing import Dict, Any, Optional, Union, Tuple
-
- from cosmotech_api import DatasetApi
-
- from cosmotech.coal.utils.logger import LOGGER
- from cosmotech.orchestrator.utils.translate import T
- from cosmotech.coal.cosmotech_api.connection import get_api_client
-
- # Import specific download functions
- # These imports are defined here to avoid circular imports
- # The functions are imported directly from their modules
- from cosmotech.coal.cosmotech_api.dataset.download.adt import download_adt_dataset
- from cosmotech.coal.cosmotech_api.dataset.download.twingraph import (
-     download_twingraph_dataset,
-     download_legacy_twingraph_dataset,
- )
- from cosmotech.coal.cosmotech_api.dataset.download.file import download_file_dataset
-
-
- def download_dataset_by_id(
-     organization_id: str,
-     workspace_id: str,
-     dataset_id: str,
-     target_folder: Optional[Union[str, Path]] = None,
- ) -> Tuple[Dict[str, Any], Path]:
-     """
-     Download dataset by ID.
-
-     Args:
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         dataset_id: Dataset ID
-         target_folder: Optional folder to save files (if None, uses temp dir)
-
-     Returns:
-         Tuple of (dataset info dict, folder path)
-     """
-     start_time = time.time()
-     LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="Dataset"))
-     LOGGER.debug(
-         T("coal.services.dataset.dataset_downloading").format(organization_id=organization_id, dataset_id=dataset_id)
-     )
-
-     with get_api_client()[0] as api_client:
-         api_instance = DatasetApi(api_client)
-
-         # Get dataset info
-         info_start = time.time()
-         dataset = api_instance.find_dataset_by_id(organization_id=organization_id, dataset_id=dataset_id)
-         info_time = time.time() - info_start
-
-         LOGGER.debug(
-             T("coal.services.dataset.dataset_info_retrieved").format(dataset_name=dataset.name, dataset_id=dataset_id)
-         )
-         LOGGER.debug(
-             T("coal.common.timing.operation_completed").format(operation="dataset info retrieval", time=info_time)
-         )
-
-         # Determine dataset type and download
-         if dataset.connector is None:
-             parameters = []
-         else:
-             parameters = dataset.connector.parameters_values
-
-         is_adt = "AZURE_DIGITAL_TWINS_URL" in parameters
-         is_storage = "AZURE_STORAGE_CONTAINER_BLOB_PREFIX" in parameters
-         is_legacy_twin_cache = "TWIN_CACHE_NAME" in parameters and dataset.twingraph_id is None
-         is_in_workspace_file = (
-             False if dataset.tags is None else "workspaceFile" in dataset.tags or "dataset_part" in dataset.tags
-         )
-
-         download_start = time.time()
-
-         if is_adt:
-             LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="ADT"))
-             content, folder = download_adt_dataset(
-                 adt_address=parameters["AZURE_DIGITAL_TWINS_URL"],
-                 target_folder=target_folder,
-             )
-             dataset_type = "adt"
-
-         elif is_legacy_twin_cache:
-             LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="Legacy TwinGraph"))
-             twin_cache_name = parameters["TWIN_CACHE_NAME"]
-             content, folder = download_legacy_twingraph_dataset(
-                 organization_id=organization_id,
-                 cache_name=twin_cache_name,
-                 target_folder=target_folder,
-             )
-             dataset_type = "twincache"
-
-         elif is_storage or is_in_workspace_file:
-             if is_storage:
-                 LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="Storage"))
-                 _file_name = parameters["AZURE_STORAGE_CONTAINER_BLOB_PREFIX"].replace("%WORKSPACE_FILE%/", "")
-             else:
-                 LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="Workspace File"))
-                 _file_name = dataset.source.location
-
-             content, folder = download_file_dataset(
-                 organization_id=organization_id,
-                 workspace_id=workspace_id,
-                 file_name=_file_name,
-                 target_folder=target_folder,
-             )
-             dataset_type = _file_name.split(".")[-1]
-
-         else:
-             LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="TwinGraph"))
-             content, folder = download_twingraph_dataset(
-                 organization_id=organization_id,
-                 dataset_id=dataset_id,
-                 target_folder=target_folder,
-             )
-             dataset_type = "twincache"
-
-         download_time = time.time() - download_start
-         LOGGER.debug(
-             T("coal.common.timing.operation_completed").format(operation="content download", time=download_time)
-         )
-
-     # Prepare result
-     dataset_info = {"type": dataset_type, "content": content, "name": dataset.name}
-
-     elapsed_time = time.time() - start_time
-     LOGGER.info(
-         T("coal.common.timing.operation_completed").format(operation="total dataset download", time=elapsed_time)
-     )
-     LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="Dataset"))
-
-     return dataset_info, folder
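
The dispatch above keys entirely off connector parameter names and dataset tags. Condensed into one helper for readability (a sketch: the parameter names, tags, and attributes are exactly those used in the deleted code; the helper itself is hypothetical):

```python
def detect_dataset_type(dataset) -> str:
    # Connector parameter names and tags as used by the removed common.py.
    parameters = [] if dataset.connector is None else dataset.connector.parameters_values
    tags = dataset.tags or []
    if "AZURE_DIGITAL_TWINS_URL" in parameters:
        return "adt"
    if "TWIN_CACHE_NAME" in parameters and dataset.twingraph_id is None:
        return "legacy-twincache"
    if "AZURE_STORAGE_CONTAINER_BLOB_PREFIX" in parameters or "workspaceFile" in tags or "dataset_part" in tags:
        return "file"
    return "twincache"  # default path in the removed code
```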
cosmotech/coal/cosmotech_api/dataset/download/file.py
@@ -1,229 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- import csv
- import io
- import json
- import os
- import tempfile
- import time
- from pathlib import Path
- from typing import Dict, Any, Optional, Union, Tuple
-
- from cosmotech_api import WorkspaceApi
- from openpyxl import load_workbook
-
- from cosmotech.coal.utils.decorator import timed
- from cosmotech.coal.utils.logger import LOGGER
- from cosmotech.orchestrator.utils.translate import T
- from cosmotech.coal.cosmotech_api.connection import get_api_client
-
-
- def process_xls(target_file) -> Dict[str, Any]:
-     content = {}
-
-     LOGGER.debug(T("coal.services.dataset.processing_excel").format(file_name=target_file))
-     wb = load_workbook(target_file, data_only=True)
-
-     for sheet_name in wb.sheetnames:
-         sheet = wb[sheet_name]
-         content[sheet_name] = list()
-         headers = next(sheet.iter_rows(max_row=1, values_only=True))
-
-         row_count = 0
-         for r in sheet.iter_rows(min_row=2, values_only=True):
-             row = {k: v for k, v in zip(headers, r)}
-             new_row = dict()
-
-             for key, value in row.items():
-                 try:
-                     converted_value = json.load(io.StringIO(value))
-                 except (json.decoder.JSONDecodeError, TypeError):
-                     converted_value = value
-
-                 if converted_value is not None:
-                     new_row[key] = converted_value
-
-             if new_row:
-                 content[sheet_name].append(new_row)
-                 row_count += 1
-
-         LOGGER.debug(T("coal.services.dataset.sheet_processed").format(sheet_name=sheet_name, rows=row_count))
-     return content
-
-
- def process_csv(target_file) -> Dict[str, Any]:
-     content = {}
-
-     LOGGER.debug(T("coal.services.dataset.processing_csv").format(file_name=target_file))
-     with open(target_file, "r") as file:
-         current_filename = os.path.basename(target_file)[: -len(".csv")]
-         content[current_filename] = list()
-
-         row_count = 0
-         for csv_row in csv.DictReader(file):
-             csv_row: dict
-             new_row = dict()
-
-             for key, value in csv_row.items():
-                 try:
-                     # Try to convert any json row to dict object
-                     converted_value = json.load(io.StringIO(value))
-                 except json.decoder.JSONDecodeError:
-                     converted_value = value
-
-                 if converted_value == "":
-                     converted_value = None
-
-                 if converted_value is not None:
-                     new_row[key] = converted_value
-
-             content[current_filename].append(new_row)
-             row_count += 1
-
-         LOGGER.debug(T("coal.services.dataset.csv_processed").format(file_name=current_filename, rows=row_count))
-     return content
-
-
- def process_json(target_file) -> Dict[str, Any]:
-     content = {}
-     LOGGER.debug(T("coal.services.dataset.processing_json").format(file_name=target_file))
-     with open(target_file, "r") as _file:
-         current_filename = os.path.basename(target_file)
-         content[current_filename] = json.load(_file)
-
-     if isinstance(content[current_filename], dict):
-         item_count = len(content[current_filename])
-     elif isinstance(content[current_filename], list):
-         item_count = len(content[current_filename])
-     else:
-         item_count = 1
-
-     LOGGER.debug(T("coal.services.dataset.json_processed").format(file_name=current_filename, items=item_count))
-     return content
-
-
- def process_txt(target_file) -> Dict[str, Any]:
-     content = {}
-     LOGGER.debug(T("coal.services.dataset.processing_text").format(file_name=target_file))
-     with open(target_file, "r") as _file:
-         current_filename = os.path.basename(target_file)
-         content[current_filename] = _file.read()
-
-     line_count = content[current_filename].count("\n") + 1
-     LOGGER.debug(T("coal.services.dataset.text_processed").format(file_name=current_filename, lines=line_count))
-     return content
-
-
- def read_file(file_name, file):
-     @timed(f"process {file_name}", debug=True)
-     def timed_read_file(file_name, file):
-         content = {}
-         if ".xls" in file_name:
-             content.update(process_xls(file))
-         elif ".csv" in file_name:
-             content.update(process_csv(file))
-         elif ".json" in file_name:
-             content.update(process_json(file))
-         else:
-             content.update(process_txt(file))
-         return content
-
-     return timed_read_file(file_name, file)
-
-
- def download_file_dataset(
-     organization_id: str,
-     workspace_id: str,
-     file_name: str,
-     target_folder: Optional[Union[str, Path]] = None,
-     read_files: bool = True,
- ) -> Tuple[Dict[str, Any], Path]:
-     """
-     Download file dataset.
-
-     Args:
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         file_name: File name to download
-         target_folder: Optional folder to save files (if None, uses temp dir)
-         read_files: Whether to read file contents
-
-     Returns:
-         Tuple of (content dict, folder path)
-     """
-     start_time = time.time()
-     LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="File"))
-     LOGGER.debug(
-         T("coal.services.dataset.file_downloading").format(
-             organization_id=organization_id,
-             workspace_id=workspace_id,
-             file_name=file_name,
-         )
-     )
-
-     # Create temp directory for downloaded files
-     if target_folder is None:
-         tmp_dataset_dir = tempfile.mkdtemp()
-     else:
-         tmp_dataset_dir = Path(target_folder)
-         tmp_dataset_dir.mkdir(parents=True, exist_ok=True)
-         tmp_dataset_dir = str(tmp_dataset_dir)
-
-     LOGGER.debug(T("coal.services.dataset.using_folder").format(folder=tmp_dataset_dir))
-
-     content = dict()
-
-     with get_api_client()[0] as api_client:
-         api_ws = WorkspaceApi(api_client)
-
-         # Find all files matching the pattern
-         list_start = time.time()
-         LOGGER.debug(T("coal.services.dataset.listing_workspace_files"))
-         all_api_files = api_ws.find_all_workspace_files(organization_id, workspace_id)
-
-         existing_files = list(_f.file_name for _f in all_api_files if _f.file_name.startswith(file_name))
-         list_time = time.time() - list_start
-
-         LOGGER.debug(T("coal.services.dataset.workspace_files_found").format(count=len(existing_files)))
-         LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="file listing", time=list_time))
-
-         if not existing_files:
-             LOGGER.warning(T("coal.services.dataset.no_files_found").format(file_name=file_name))
-             return content, Path(tmp_dataset_dir)
-
-         # Download and process each file
-         for _file_name in existing_files:
-             download_start = time.time()
-             LOGGER.debug(T("coal.services.dataset.downloading_file").format(file_name=_file_name))
-
-             dl_file = api_ws.download_workspace_file(
-                 organization_id=organization_id,
-                 workspace_id=workspace_id,
-                 file_name=_file_name,
-             )
-
-             target_file = os.path.join(tmp_dataset_dir, _file_name.split("/")[-1])
-             with open(target_file, "wb") as tmp_file:
-                 tmp_file.write(dl_file)
-
-             download_time = time.time() - download_start
-             LOGGER.debug(T("coal.services.dataset.file_downloaded").format(file_name=_file_name, path=target_file))
-             LOGGER.debug(
-                 T("coal.common.timing.operation_completed").format(
-                     operation=f"download {_file_name}", time=download_time
-                 )
-             )
-
-             if read_files:
-                 content.update(read_file(_file_name, target_file))
-
-     elapsed_time = time.time() - start_time
-     LOGGER.info(T("coal.common.timing.operation_completed").format(operation="File download", time=elapsed_time))
-     LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="File"))
-
-     return content, Path(tmp_dataset_dir)
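
The workspace-file download loop above relied on the removed `get_api_client` helper for connection setup. A minimal standalone sketch of the same list/prefix-filter/download flow, using only the `WorkspaceApi` calls visible in the deleted code; the host, token, IDs, and prefix are placeholders:

```python
from pathlib import Path

from cosmotech_api import ApiClient, Configuration, WorkspaceApi

config = Configuration(host="https://api.example.com")  # placeholder host
config.access_token = "<token>"  # placeholder bearer token

with ApiClient(config) as api_client:
    api_ws = WorkspaceApi(api_client)
    out_dir = Path("downloaded")
    out_dir.mkdir(exist_ok=True)
    # List all workspace files, keep those matching the prefix, download each.
    for f in api_ws.find_all_workspace_files("o-myorg", "w-myworkspace"):
        if not f.file_name.startswith("my_dataset/"):  # placeholder prefix
            continue
        data = api_ws.download_workspace_file(
            organization_id="o-myorg",
            workspace_id="w-myworkspace",
            file_name=f.file_name,
        )
        (out_dir / f.file_name.split("/")[-1]).write_bytes(data)
```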
cosmotech/coal/cosmotech_api/dataset/download/twingraph.py
@@ -1,185 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- import time
- import tempfile
- from pathlib import Path
- from typing import Dict, Any, Optional, Union, Tuple
-
- import cosmotech_api
-
- from cosmotech.coal.utils.logger import LOGGER
- from cosmotech.orchestrator.utils.translate import T
- from cosmotech.coal.cosmotech_api.connection import get_api_client
- from cosmotech.coal.cosmotech_api.dataset.utils import get_content_from_twin_graph_data
- from cosmotech.coal.cosmotech_api.dataset.converters import convert_dataset_to_files
-
-
- def download_twingraph_dataset(
-     organization_id: str,
-     dataset_id: str,
-     target_folder: Optional[Union[str, Path]] = None,
- ) -> Tuple[Dict[str, Any], Path]:
-     """
-     Download dataset from TwinGraph.
-
-     Args:
-         organization_id: Organization ID
-         dataset_id: Dataset ID
-         target_folder: Optional folder to save files (if None, uses temp dir)
-
-     Returns:
-         Tuple of (content dict, folder path)
-     """
-     start_time = time.time()
-     LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="TwinGraph"))
-     LOGGER.debug(
-         T("coal.services.dataset.twingraph_downloading").format(organization_id=organization_id, dataset_id=dataset_id)
-     )
-
-     with get_api_client()[0] as api_client:
-         dataset_api = cosmotech_api.DatasetApi(api_client)
-
-         # Query nodes
-         nodes_start = time.time()
-         LOGGER.debug(T("coal.services.dataset.twingraph_querying_nodes").format(dataset_id=dataset_id))
-         nodes_query = cosmotech_api.DatasetTwinGraphQuery(query="MATCH(n) RETURN n")
-
-         nodes = dataset_api.twingraph_query(
-             organization_id=organization_id,
-             dataset_id=dataset_id,
-             dataset_twin_graph_query=nodes_query,
-         )
-
-         nodes_time = time.time() - nodes_start
-         LOGGER.debug(T("coal.services.dataset.twingraph_nodes_found").format(count=len(nodes)))
-         LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="nodes query", time=nodes_time))
-
-         # Query edges
-         edges_start = time.time()
-         LOGGER.debug(T("coal.services.dataset.twingraph_querying_edges").format(dataset_id=dataset_id))
-         edges_query = cosmotech_api.DatasetTwinGraphQuery(
-             query="MATCH(n)-[r]->(m) RETURN n as src, r as rel, m as dest"
-         )
-
-         edges = dataset_api.twingraph_query(
-             organization_id=organization_id,
-             dataset_id=dataset_id,
-             dataset_twin_graph_query=edges_query,
-         )
-
-         edges_time = time.time() - edges_start
-         LOGGER.debug(T("coal.services.dataset.twingraph_edges_found").format(count=len(edges)))
-         LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="edges query", time=edges_time))
-
-     # Process results
-     process_start = time.time()
-     content = get_content_from_twin_graph_data(nodes, edges, True)
-     process_time = time.time() - process_start
-
-     LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="data processing", time=process_time))
-
-     # Convert to files if target_folder is provided
-     if target_folder:
-         dataset_info = {
-             "type": "twincache",
-             "content": content,
-             "name": f"TwinGraph Dataset {dataset_id}",
-         }
-         target_folder = convert_dataset_to_files(dataset_info, target_folder)
-     else:
-         target_folder = tempfile.mkdtemp()
-
-     elapsed_time = time.time() - start_time
-     LOGGER.info(T("coal.common.timing.operation_completed").format(operation="TwinGraph download", time=elapsed_time))
-     LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="TwinGraph"))
-
-     return content, Path(target_folder)
-
-
- def download_legacy_twingraph_dataset(
-     organization_id: str,
-     cache_name: str,
-     target_folder: Optional[Union[str, Path]] = None,
- ) -> Tuple[Dict[str, Any], Path]:
-     """
-     Download dataset from legacy TwinGraph.
-
-     Args:
-         organization_id: Organization ID
-         cache_name: Twin cache name
-         target_folder: Optional folder to save files (if None, uses temp dir)
-
-     Returns:
-         Tuple of (content dict, folder path)
-     """
-     start_time = time.time()
-     LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="Legacy TwinGraph"))
-     LOGGER.debug(
-         T("coal.services.dataset.legacy_twingraph_downloading").format(
-             organization_id=organization_id, cache_name=cache_name
-         )
-     )
-
-     with get_api_client()[0] as api_client:
-         api_instance = cosmotech_api.TwingraphApi(api_client)
-
-         # Query nodes
-         nodes_start = time.time()
-         LOGGER.debug(T("coal.services.dataset.legacy_twingraph_querying_nodes").format(cache_name=cache_name))
-         _query_nodes = cosmotech_api.TwinGraphQuery(query="MATCH(n) RETURN n")
-
-         nodes = api_instance.query(
-             organization_id=organization_id,
-             graph_id=cache_name,
-             twin_graph_query=_query_nodes,
-         )
-
-         nodes_time = time.time() - nodes_start
-         LOGGER.debug(T("coal.services.dataset.legacy_twingraph_nodes_found").format(count=len(nodes)))
-         LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="nodes query", time=nodes_time))
-
-         # Query relationships
-         rel_start = time.time()
-         LOGGER.debug(T("coal.services.dataset.legacy_twingraph_querying_relations").format(cache_name=cache_name))
-         _query_rel = cosmotech_api.TwinGraphQuery(query="MATCH(n)-[r]->(m) RETURN n as src, r as rel, m as dest")
-
-         rel = api_instance.query(
-             organization_id=organization_id,
-             graph_id=cache_name,
-             twin_graph_query=_query_rel,
-         )
-
-         rel_time = time.time() - rel_start
-         LOGGER.debug(T("coal.services.dataset.legacy_twingraph_relations_found").format(count=len(rel)))
-         LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="relations query", time=rel_time))
-
-     # Process results
-     process_start = time.time()
-     content = get_content_from_twin_graph_data(nodes, rel, False)
-     process_time = time.time() - process_start
-
-     LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="data processing", time=process_time))
-
-     # Convert to files if target_folder is provided
-     if target_folder:
-         dataset_info = {
-             "type": "twincache",
-             "content": content,
-             "name": f"Legacy TwinGraph Dataset {cache_name}",
-         }
-         target_folder = convert_dataset_to_files(dataset_info, target_folder)
-     else:
-         target_folder = tempfile.mkdtemp()
-
-     elapsed_time = time.time() - start_time
-     LOGGER.info(
-         T("coal.common.timing.operation_completed").format(operation="Legacy TwinGraph download", time=elapsed_time)
-     )
-     LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="Legacy TwinGraph"))
-
-     return content, Path(target_folder)
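
Both functions follow the same node/edge query pattern; only the API surface differs (`DatasetApi.twingraph_query` for current datasets, `TwingraphApi.query` for the legacy twin cache). A compact sketch of the current path with a standalone client; host, token, and IDs are placeholders:

```python
import cosmotech_api

config = cosmotech_api.Configuration(host="https://api.example.com")  # placeholder
config.access_token = "<token>"  # placeholder bearer token

with cosmotech_api.ApiClient(config) as api_client:
    dataset_api = cosmotech_api.DatasetApi(api_client)
    # Same Cypher query the removed module used for nodes.
    nodes = dataset_api.twingraph_query(
        organization_id="o-myorg",
        dataset_id="d-mydataset",
        dataset_twin_graph_query=cosmotech_api.DatasetTwinGraphQuery(query="MATCH(n) RETURN n"),
    )
    print(f"{len(nodes)} nodes")
```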