cosmotech-acceleration-library 1.0.1__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97)
  1. cosmotech/coal/__init__.py +1 -1
  2. cosmotech/coal/aws/__init__.py +1 -9
  3. cosmotech/coal/aws/s3.py +181 -214
  4. cosmotech/coal/azure/adx/auth.py +2 -2
  5. cosmotech/coal/azure/adx/runner.py +13 -14
  6. cosmotech/coal/azure/adx/store.py +5 -86
  7. cosmotech/coal/azure/adx/tables.py +2 -2
  8. cosmotech/coal/azure/blob.py +6 -6
  9. cosmotech/coal/azure/storage.py +3 -3
  10. cosmotech/coal/cosmotech_api/__init__.py +0 -24
  11. cosmotech/coal/cosmotech_api/apis/__init__.py +14 -0
  12. cosmotech/coal/cosmotech_api/apis/dataset.py +103 -0
  13. cosmotech/coal/cosmotech_api/apis/meta.py +25 -0
  14. cosmotech/coal/cosmotech_api/apis/organization.py +24 -0
  15. cosmotech/coal/cosmotech_api/apis/run.py +38 -0
  16. cosmotech/coal/cosmotech_api/apis/runner.py +71 -0
  17. cosmotech/coal/cosmotech_api/apis/solution.py +23 -0
  18. cosmotech/coal/cosmotech_api/apis/workspace.py +108 -0
  19. cosmotech/coal/cosmotech_api/objects/__init__.py +9 -0
  20. cosmotech/coal/cosmotech_api/objects/connection.py +125 -0
  21. cosmotech/coal/cosmotech_api/objects/parameters.py +127 -0
  22. cosmotech/coal/postgresql/runner.py +56 -36
  23. cosmotech/coal/postgresql/store.py +60 -14
  24. cosmotech/coal/postgresql/utils.py +254 -0
  25. cosmotech/coal/store/output/__init__.py +0 -0
  26. cosmotech/coal/store/output/aws_channel.py +73 -0
  27. cosmotech/coal/store/output/az_storage_channel.py +42 -0
  28. cosmotech/coal/store/output/channel_interface.py +23 -0
  29. cosmotech/coal/store/output/channel_spliter.py +55 -0
  30. cosmotech/coal/store/output/postgres_channel.py +40 -0
  31. cosmotech/coal/utils/configuration.py +169 -0
  32. cosmotech/coal/utils/decorator.py +22 -0
  33. cosmotech/csm_data/commands/api/api.py +6 -19
  34. cosmotech/csm_data/commands/api/postgres_send_runner_metadata.py +20 -16
  35. cosmotech/csm_data/commands/api/run_load_data.py +7 -46
  36. cosmotech/csm_data/commands/api/wsf_load_file.py +14 -15
  37. cosmotech/csm_data/commands/api/wsf_send_file.py +12 -13
  38. cosmotech/csm_data/commands/s3_bucket_delete.py +16 -15
  39. cosmotech/csm_data/commands/s3_bucket_download.py +16 -16
  40. cosmotech/csm_data/commands/s3_bucket_upload.py +16 -14
  41. cosmotech/csm_data/commands/store/dump_to_s3.py +18 -16
  42. cosmotech/csm_data/commands/store/output.py +35 -0
  43. cosmotech/csm_data/commands/store/store.py +3 -4
  44. cosmotech/translation/coal/en-US/coal/cosmotech_api/initialization.yml +8 -0
  45. cosmotech/translation/coal/en-US/coal/services/dataset.yml +4 -14
  46. cosmotech/translation/coal/en-US/coal/store/output/data_interface.yml +1 -0
  47. cosmotech/translation/coal/en-US/coal/store/output/split.yml +6 -0
  48. cosmotech/translation/coal/en-US/coal/utils/configuration.yml +2 -0
  49. cosmotech/translation/csm_data/en-US/csm_data/commands/store/output.yml +7 -0
  50. {cosmotech_acceleration_library-1.0.1.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/METADATA +8 -9
  51. {cosmotech_acceleration_library-1.0.1.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/RECORD +55 -71
  52. cosmotech/coal/azure/functions.py +0 -72
  53. cosmotech/coal/cosmotech_api/connection.py +0 -96
  54. cosmotech/coal/cosmotech_api/dataset/__init__.py +0 -26
  55. cosmotech/coal/cosmotech_api/dataset/converters.py +0 -164
  56. cosmotech/coal/cosmotech_api/dataset/download/__init__.py +0 -19
  57. cosmotech/coal/cosmotech_api/dataset/download/adt.py +0 -119
  58. cosmotech/coal/cosmotech_api/dataset/download/common.py +0 -140
  59. cosmotech/coal/cosmotech_api/dataset/download/file.py +0 -216
  60. cosmotech/coal/cosmotech_api/dataset/download/twingraph.py +0 -188
  61. cosmotech/coal/cosmotech_api/dataset/utils.py +0 -132
  62. cosmotech/coal/cosmotech_api/parameters.py +0 -48
  63. cosmotech/coal/cosmotech_api/run.py +0 -25
  64. cosmotech/coal/cosmotech_api/run_data.py +0 -173
  65. cosmotech/coal/cosmotech_api/run_template.py +0 -108
  66. cosmotech/coal/cosmotech_api/runner/__init__.py +0 -28
  67. cosmotech/coal/cosmotech_api/runner/data.py +0 -38
  68. cosmotech/coal/cosmotech_api/runner/datasets.py +0 -364
  69. cosmotech/coal/cosmotech_api/runner/download.py +0 -146
  70. cosmotech/coal/cosmotech_api/runner/metadata.py +0 -42
  71. cosmotech/coal/cosmotech_api/runner/parameters.py +0 -157
  72. cosmotech/coal/cosmotech_api/twin_data_layer.py +0 -512
  73. cosmotech/coal/cosmotech_api/workspace.py +0 -127
  74. cosmotech/coal/utils/api.py +0 -68
  75. cosmotech/coal/utils/postgresql.py +0 -236
  76. cosmotech/csm_data/commands/api/rds_load_csv.py +0 -90
  77. cosmotech/csm_data/commands/api/rds_send_csv.py +0 -74
  78. cosmotech/csm_data/commands/api/rds_send_store.py +0 -74
  79. cosmotech/csm_data/commands/api/runtemplate_load_handler.py +0 -66
  80. cosmotech/csm_data/commands/api/tdl_load_files.py +0 -76
  81. cosmotech/csm_data/commands/api/tdl_send_files.py +0 -82
  82. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_load_csv.json +0 -27
  83. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_csv.json +0 -27
  84. cosmotech/orchestrator_plugins/csm-data/templates/api/rds_send_store.json +0 -27
  85. cosmotech/orchestrator_plugins/csm-data/templates/api/runtemplate_load_handler.json +0 -27
  86. cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_load_files.json +0 -32
  87. cosmotech/orchestrator_plugins/csm-data/templates/api/tdl_send_files.json +0 -27
  88. cosmotech/translation/coal/en-US/coal/cosmotech_api/run_data.yml +0 -2
  89. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_load_csv.yml +0 -13
  90. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_csv.yml +0 -12
  91. cosmotech/translation/csm_data/en-US/csm_data/commands/api/rds_send_store.yml +0 -12
  92. cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_load_files.yml +0 -14
  93. cosmotech/translation/csm_data/en-US/csm_data/commands/api/tdl_send_files.yml +0 -18
  94. {cosmotech_acceleration_library-1.0.1.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/WHEEL +0 -0
  95. {cosmotech_acceleration_library-1.0.1.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/entry_points.txt +0 -0
  96. {cosmotech_acceleration_library-1.0.1.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/licenses/LICENSE +0 -0
  97. {cosmotech_acceleration_library-1.0.1.dist-info → cosmotech_acceleration_library-2.0.0.dist-info}/top_level.txt +0 -0
cosmotech/coal/cosmotech_api/dataset/download/adt.py
@@ -1,119 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- import time
- import tempfile
- from pathlib import Path
- from typing import Dict, Any, Optional, Union, Tuple
-
- from azure.digitaltwins.core import DigitalTwinsClient
- from azure.identity import DefaultAzureCredential
-
- from cosmotech.coal.utils.logger import LOGGER
- from cosmotech.orchestrator.utils.translate import T
- from cosmotech.coal.cosmotech_api.connection import get_api_client
- from cosmotech.coal.cosmotech_api.dataset.converters import convert_dataset_to_files
-
-
- def download_adt_dataset(
-     adt_address: str,
-     target_folder: Optional[Union[str, Path]] = None,
-     credentials: Optional[DefaultAzureCredential] = None,
- ) -> Tuple[Dict[str, Any], Path]:
-     """
-     Download dataset from Azure Digital Twins.
-
-     Args:
-         adt_address: The ADT instance address
-         target_folder: Optional folder to save files (if None, uses temp dir)
-         credentials: Optional Azure credentials (if None, uses DefaultAzureCredential)
-
-     Returns:
-         Tuple of (content dict, folder path)
-     """
-     start_time = time.time()
-     LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="ADT"))
-     LOGGER.debug(T("coal.services.dataset.adt_connecting").format(url=adt_address))
-
-     # Create credentials if not provided
-     if credentials is None:
-         if get_api_client()[1] == "Azure Entra Connection":
-             credentials = DefaultAzureCredential()
-         else:
-             LOGGER.error(T("coal.services.dataset.adt_no_credentials"))
-             raise ValueError("No credentials available for ADT connection")
-
-     # Create client and download data
-     client = DigitalTwinsClient(adt_address, credentials)
-
-     # Query twins
-     query_start = time.time()
-     LOGGER.debug(T("coal.services.dataset.adt_querying_twins"))
-     query_expression = "SELECT * FROM digitaltwins"
-     query_result = client.query_twins(query_expression)
-
-     json_content = dict()
-     twin_count = 0
-
-     for twin in query_result:
-         twin_count += 1
-         entity_type = twin.get("$metadata").get("$model").split(":")[-1].split(";")[0]
-         t_content = {k: v for k, v in twin.items()}
-         t_content["id"] = t_content["$dtId"]
-
-         # Remove system properties
-         for k in list(twin.keys()):
-             if k[0] == "$":
-                 del t_content[k]
-
-         json_content.setdefault(entity_type, [])
-         json_content[entity_type].append(t_content)
-
-     query_time = time.time() - query_start
-     LOGGER.debug(T("coal.services.dataset.adt_twins_found").format(count=twin_count))
-     LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="twins query", time=query_time))
-
-     # Query relationships
-     rel_start = time.time()
-     LOGGER.debug(T("coal.services.dataset.adt_querying_relations"))
-     relations_query = "SELECT * FROM relationships"
-     query_result = client.query_twins(relations_query)
-
-     relation_count = 0
-     for relation in query_result:
-         relation_count += 1
-         tr = {"$relationshipId": "id", "$sourceId": "source", "$targetId": "target"}
-         r_content = {k: v for k, v in relation.items()}
-
-         # Map system properties to standard names
-         for k, v in tr.items():
-             r_content[v] = r_content[k]
-
-         # Remove system properties
-         for k in list(relation.keys()):
-             if k[0] == "$":
-                 del r_content[k]
-
-         json_content.setdefault(relation["$relationshipName"], [])
-         json_content[relation["$relationshipName"]].append(r_content)
-
-     rel_time = time.time() - rel_start
-     LOGGER.debug(T("coal.services.dataset.adt_relations_found").format(count=relation_count))
-     LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="relations query", time=rel_time))
-
-     # Convert to files if target_folder is provided
-     if target_folder:
-         dataset_info = {"type": "adt", "content": json_content, "name": "ADT Dataset"}
-         target_folder = convert_dataset_to_files(dataset_info, target_folder)
-     else:
-         target_folder = tempfile.mkdtemp()
-
-     elapsed_time = time.time() - start_time
-     LOGGER.info(T("coal.common.timing.operation_completed").format(operation="ADT download", time=elapsed_time))
-     LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="ADT"))
-
-     return json_content, Path(target_folder)
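
Note: the removed download_adt_dataset helper was self-contained, so prior to 2.0.0 it could be called directly. A minimal usage sketch under that assumption follows; the ADT endpoint and target folder are illustrative placeholders, not values taken from this package:

from pathlib import Path

from azure.identity import DefaultAzureCredential

from cosmotech.coal.cosmotech_api.dataset.download.adt import download_adt_dataset  # removed in 2.0.0

# Placeholder ADT endpoint, for illustration only
content, folder = download_adt_dataset(
    adt_address="https://example-instance.api.weu.digitaltwins.azure.net",
    target_folder=Path("./adt_dataset"),
    credentials=DefaultAzureCredential(),
)

# content maps each twin model name (and relationship name) to a list of plain dicts
for entity_type, items in content.items():
    print(entity_type, len(items))
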
cosmotech/coal/cosmotech_api/dataset/download/common.py
@@ -1,140 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- import time
- from pathlib import Path
- from typing import Dict, Any, Optional, Union, Tuple
-
- from cosmotech_api import DatasetApi
-
- from cosmotech.coal.utils.logger import LOGGER
- from cosmotech.orchestrator.utils.translate import T
- from cosmotech.coal.cosmotech_api.connection import get_api_client
-
- # Import specific download functions
- # These imports are defined here to avoid circular imports
- # The functions are imported directly from their modules
- from cosmotech.coal.cosmotech_api.dataset.download.adt import download_adt_dataset
- from cosmotech.coal.cosmotech_api.dataset.download.twingraph import (
-     download_twingraph_dataset,
-     download_legacy_twingraph_dataset,
- )
- from cosmotech.coal.cosmotech_api.dataset.download.file import download_file_dataset
-
-
- def download_dataset_by_id(
-     organization_id: str,
-     workspace_id: str,
-     dataset_id: str,
-     target_folder: Optional[Union[str, Path]] = None,
- ) -> Tuple[Dict[str, Any], Path]:
-     """
-     Download dataset by ID.
-
-     Args:
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         dataset_id: Dataset ID
-         target_folder: Optional folder to save files (if None, uses temp dir)
-
-     Returns:
-         Tuple of (dataset info dict, folder path)
-     """
-     start_time = time.time()
-     LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="Dataset"))
-     LOGGER.debug(
-         T("coal.services.dataset.dataset_downloading").format(organization_id=organization_id, dataset_id=dataset_id)
-     )
-
-     with get_api_client()[0] as api_client:
-         api_instance = DatasetApi(api_client)
-
-         # Get dataset info
-         info_start = time.time()
-         dataset = api_instance.find_dataset_by_id(organization_id=organization_id, dataset_id=dataset_id)
-         info_time = time.time() - info_start
-
-         LOGGER.debug(
-             T("coal.services.dataset.dataset_info_retrieved").format(dataset_name=dataset.name, dataset_id=dataset_id)
-         )
-         LOGGER.debug(
-             T("coal.common.timing.operation_completed").format(operation="dataset info retrieval", time=info_time)
-         )
-
-         # Determine dataset type and download
-         if dataset.connector is None:
-             parameters = []
-         else:
-             parameters = dataset.connector.parameters_values
-
-         is_adt = "AZURE_DIGITAL_TWINS_URL" in parameters
-         is_storage = "AZURE_STORAGE_CONTAINER_BLOB_PREFIX" in parameters
-         is_legacy_twin_cache = "TWIN_CACHE_NAME" in parameters and dataset.twingraph_id is None
-         is_in_workspace_file = (
-             False if dataset.tags is None else "workspaceFile" in dataset.tags or "dataset_part" in dataset.tags
-         )
-
-         download_start = time.time()
-
-         if is_adt:
-             LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="ADT"))
-             content, folder = download_adt_dataset(
-                 adt_address=parameters["AZURE_DIGITAL_TWINS_URL"],
-                 target_folder=target_folder,
-             )
-             dataset_type = "adt"
-
-         elif is_legacy_twin_cache:
-             LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="Legacy TwinGraph"))
-             twin_cache_name = parameters["TWIN_CACHE_NAME"]
-             content, folder = download_legacy_twingraph_dataset(
-                 organization_id=organization_id,
-                 cache_name=twin_cache_name,
-                 target_folder=target_folder,
-             )
-             dataset_type = "twincache"
-
-         elif is_storage or is_in_workspace_file:
-             if is_storage:
-                 LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="Storage"))
-                 _file_name = parameters["AZURE_STORAGE_CONTAINER_BLOB_PREFIX"].replace("%WORKSPACE_FILE%/", "")
-             else:
-                 LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="Workspace File"))
-                 _file_name = dataset.source.location
-
-             content, folder = download_file_dataset(
-                 organization_id=organization_id,
-                 workspace_id=workspace_id,
-                 file_name=_file_name,
-                 target_folder=target_folder,
-             )
-             dataset_type = _file_name.split(".")[-1]
-
-         else:
-             LOGGER.debug(T("coal.services.dataset.dataset_type_detected").format(type="TwinGraph"))
-             content, folder = download_twingraph_dataset(
-                 organization_id=organization_id,
-                 dataset_id=dataset_id,
-                 target_folder=target_folder,
-             )
-             dataset_type = "twincache"
-
-         download_time = time.time() - download_start
-         LOGGER.debug(
-             T("coal.common.timing.operation_completed").format(operation="content download", time=download_time)
-         )
-
-     # Prepare result
-     dataset_info = {"type": dataset_type, "content": content, "name": dataset.name}
-
-     elapsed_time = time.time() - start_time
-     LOGGER.info(
-         T("coal.common.timing.operation_completed").format(operation="total dataset download", time=elapsed_time)
-     )
-     LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="Dataset"))
-
-     return dataset_info, folder
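
Note: download_dataset_by_id was the dispatch point that inspected a dataset's connector parameters to choose between the ADT, legacy twin cache, workspace file, and TwinGraph download paths. A minimal sketch of how it was invoked before 2.0.0 (all IDs below are placeholders):

from cosmotech.coal.cosmotech_api.dataset.download.common import download_dataset_by_id  # removed in 2.0.0

dataset_info, folder = download_dataset_by_id(
    organization_id="o-example",  # placeholder organization ID
    workspace_id="w-example",     # placeholder workspace ID
    dataset_id="d-example",       # placeholder dataset ID
    target_folder="./dataset",
)
# dataset_info["type"] is "adt", "twincache", or the downloaded file's extension
print(dataset_info["name"], dataset_info["type"], folder)

Judging by the file list above, the replacement entry point in 2.0.0 appears to live in cosmotech/coal/cosmotech_api/apis/dataset.py.
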
cosmotech/coal/cosmotech_api/dataset/download/file.py
@@ -1,216 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- import csv
- import io
- import json
- import os
- import tempfile
- import time
- from pathlib import Path
- from typing import Dict, List, Any, Optional, Union, Tuple
-
- from cosmotech_api import WorkspaceApi
- from openpyxl import load_workbook
-
- from cosmotech.coal.utils.logger import LOGGER
- from cosmotech.orchestrator.utils.translate import T
- from cosmotech.coal.cosmotech_api.connection import get_api_client
-
-
- def download_file_dataset(
-     organization_id: str,
-     workspace_id: str,
-     file_name: str,
-     target_folder: Optional[Union[str, Path]] = None,
-     read_files: bool = True,
- ) -> Tuple[Dict[str, Any], Path]:
-     """
-     Download file dataset.
-
-     Args:
-         organization_id: Organization ID
-         workspace_id: Workspace ID
-         file_name: File name to download
-         target_folder: Optional folder to save files (if None, uses temp dir)
-         read_files: Whether to read file contents
-
-     Returns:
-         Tuple of (content dict, folder path)
-     """
-     start_time = time.time()
-     LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="File"))
-     LOGGER.debug(
-         T("coal.services.dataset.file_downloading").format(
-             organization_id=organization_id,
-             workspace_id=workspace_id,
-             file_name=file_name,
-         )
-     )
-
-     # Create temp directory for downloaded files
-     if target_folder is None:
-         tmp_dataset_dir = tempfile.mkdtemp()
-     else:
-         tmp_dataset_dir = Path(target_folder)
-         tmp_dataset_dir.mkdir(parents=True, exist_ok=True)
-         tmp_dataset_dir = str(tmp_dataset_dir)
-
-     LOGGER.debug(T("coal.services.dataset.using_folder").format(folder=tmp_dataset_dir))
-
-     content = dict()
-
-     with get_api_client()[0] as api_client:
-         api_ws = WorkspaceApi(api_client)
-
-         # Find all files matching the pattern
-         list_start = time.time()
-         LOGGER.debug(T("coal.services.dataset.listing_workspace_files"))
-         all_api_files = api_ws.find_all_workspace_files(organization_id, workspace_id)
-
-         existing_files = list(_f.file_name for _f in all_api_files if _f.file_name.startswith(file_name))
-         list_time = time.time() - list_start
-
-         LOGGER.debug(T("coal.services.dataset.workspace_files_found").format(count=len(existing_files)))
-         LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="file listing", time=list_time))
-
-         if not existing_files:
-             LOGGER.warning(T("coal.services.dataset.no_files_found").format(file_name=file_name))
-             return content, Path(tmp_dataset_dir)
-
-         # Download and process each file
-         for _file_name in existing_files:
-             download_start = time.time()
-             LOGGER.debug(T("coal.services.dataset.downloading_file").format(file_name=_file_name))
-
-             dl_file = api_ws.download_workspace_file(
-                 organization_id=organization_id,
-                 workspace_id=workspace_id,
-                 file_name=_file_name,
-             )
-
-             target_file = os.path.join(tmp_dataset_dir, _file_name.split("/")[-1])
-             with open(target_file, "wb") as tmp_file:
-                 tmp_file.write(dl_file)
-
-             download_time = time.time() - download_start
-             LOGGER.debug(T("coal.services.dataset.file_downloaded").format(file_name=_file_name, path=target_file))
-             LOGGER.debug(
-                 T("coal.common.timing.operation_completed").format(
-                     operation=f"download {_file_name}", time=download_time
-                 )
-             )
-
-             if not read_files:
-                 continue
-
-             # Process file based on type
-             process_start = time.time()
-
-             if ".xls" in _file_name:
-                 LOGGER.debug(T("coal.services.dataset.processing_excel").format(file_name=target_file))
-                 wb = load_workbook(target_file, data_only=True)
-
-                 for sheet_name in wb.sheetnames:
-                     sheet = wb[sheet_name]
-                     content[sheet_name] = list()
-                     headers = next(sheet.iter_rows(max_row=1, values_only=True))
-
-                     def item(_row: tuple) -> dict:
-                         return {k: v for k, v in zip(headers, _row)}
-
-                     row_count = 0
-                     for r in sheet.iter_rows(min_row=2, values_only=True):
-                         row = item(r)
-                         new_row = dict()
-
-                         for key, value in row.items():
-                             try:
-                                 converted_value = json.load(io.StringIO(value))
-                             except (json.decoder.JSONDecodeError, TypeError):
-                                 converted_value = value
-
-                             if converted_value is not None:
-                                 new_row[key] = converted_value
-
-                         if new_row:
-                             content[sheet_name].append(new_row)
-                             row_count += 1
-
-                     LOGGER.debug(
-                         T("coal.services.dataset.sheet_processed").format(sheet_name=sheet_name, rows=row_count)
-                     )
-
-             elif ".csv" in _file_name:
-                 LOGGER.debug(T("coal.services.dataset.processing_csv").format(file_name=target_file))
-                 with open(target_file, "r") as file:
-                     current_filename = os.path.basename(target_file)[: -len(".csv")]
-                     content[current_filename] = list()
-
-                     row_count = 0
-                     for csv_row in csv.DictReader(file):
-                         csv_row: dict
-                         new_row = dict()
-
-                         for key, value in csv_row.items():
-                             try:
-                                 # Try to convert any json row to dict object
-                                 converted_value = json.load(io.StringIO(value))
-                             except json.decoder.JSONDecodeError:
-                                 converted_value = value
-
-                             if converted_value == "":
-                                 converted_value = None
-
-                             if converted_value is not None:
-                                 new_row[key] = converted_value
-
-                         content[current_filename].append(new_row)
-                         row_count += 1
-
-                     LOGGER.debug(
-                         T("coal.services.dataset.csv_processed").format(file_name=current_filename, rows=row_count)
-                     )
-
-             elif ".json" in _file_name:
-                 LOGGER.debug(T("coal.services.dataset.processing_json").format(file_name=target_file))
-                 with open(target_file, "r") as _file:
-                     current_filename = os.path.basename(target_file)
-                     content[current_filename] = json.load(_file)
-
-                 if isinstance(content[current_filename], dict):
-                     item_count = len(content[current_filename])
-                 elif isinstance(content[current_filename], list):
-                     item_count = len(content[current_filename])
-                 else:
-                     item_count = 1
-
-                 LOGGER.debug(
-                     T("coal.services.dataset.json_processed").format(file_name=current_filename, items=item_count)
-                 )
-
-             else:
-                 LOGGER.debug(T("coal.services.dataset.processing_text").format(file_name=target_file))
-                 with open(target_file, "r") as _file:
-                     current_filename = os.path.basename(target_file)
-                     content[current_filename] = "\n".join(line for line in _file)
-
-                 line_count = content[current_filename].count("\n") + 1
-                 LOGGER.debug(
-                     T("coal.services.dataset.text_processed").format(file_name=current_filename, lines=line_count)
-                 )
-
-             process_time = time.time() - process_start
-             LOGGER.debug(
-                 T("coal.common.timing.operation_completed").format(operation=f"process {_file_name}", time=process_time)
-             )
-
-     elapsed_time = time.time() - start_time
-     LOGGER.info(T("coal.common.timing.operation_completed").format(operation="File download", time=elapsed_time))
-     LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="File"))
-
-     return content, Path(tmp_dataset_dir)
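
Note: download_file_dataset keyed its returned content dict differently per file type: Excel workbooks by sheet name, CSV files by base name without the .csv suffix, and JSON or text files by full base name. A minimal sketch (the IDs and the file-name prefix are placeholders):

from cosmotech.coal.cosmotech_api.dataset.download.file import download_file_dataset  # removed in 2.0.0

content, folder = download_file_dataset(
    organization_id="o-example",  # placeholder
    workspace_id="w-example",     # placeholder
    file_name="inputs/",          # placeholder; workspace files are matched by prefix
    target_folder="./files",
    read_files=True,              # False downloads the files without parsing them
)
# e.g. content["customers"] for inputs/customers.csv, content["Sheet1"] for an Excel sheet
print(list(content.keys()), folder)
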
cosmotech/coal/cosmotech_api/dataset/download/twingraph.py
@@ -1,188 +0,0 @@
- # Copyright (C) - 2023 - 2025 - Cosmo Tech
- # This document and all information contained herein is the exclusive property -
- # including all intellectual property rights pertaining thereto - of Cosmo Tech.
- # Any use, reproduction, translation, broadcasting, transmission, distribution,
- # etc., to any person is prohibited unless it has been previously and
- # specifically authorized by written means by Cosmo Tech.
-
- import time
- import tempfile
- from pathlib import Path
- from typing import Dict, List, Any, Optional, Union, Tuple
-
- from cosmotech_api import (
-     DatasetApi,
-     DatasetTwinGraphQuery,
-     TwinGraphQuery,
-     TwingraphApi,
- )
-
- from cosmotech.coal.utils.logger import LOGGER
- from cosmotech.orchestrator.utils.translate import T
- from cosmotech.coal.cosmotech_api.connection import get_api_client
- from cosmotech.coal.cosmotech_api.dataset.utils import get_content_from_twin_graph_data
- from cosmotech.coal.cosmotech_api.dataset.converters import convert_dataset_to_files
-
-
- def download_twingraph_dataset(
-     organization_id: str,
-     dataset_id: str,
-     target_folder: Optional[Union[str, Path]] = None,
- ) -> Tuple[Dict[str, Any], Path]:
-     """
-     Download dataset from TwinGraph.
-
-     Args:
-         organization_id: Organization ID
-         dataset_id: Dataset ID
-         target_folder: Optional folder to save files (if None, uses temp dir)
-
-     Returns:
-         Tuple of (content dict, folder path)
-     """
-     start_time = time.time()
-     LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="TwinGraph"))
-     LOGGER.debug(
-         T("coal.services.dataset.twingraph_downloading").format(organization_id=organization_id, dataset_id=dataset_id)
-     )
-
-     with get_api_client()[0] as api_client:
-         dataset_api = DatasetApi(api_client)
-
-         # Query nodes
-         nodes_start = time.time()
-         LOGGER.debug(T("coal.services.dataset.twingraph_querying_nodes").format(dataset_id=dataset_id))
-         nodes_query = DatasetTwinGraphQuery(query="MATCH(n) RETURN n")
-
-         nodes = dataset_api.twingraph_query(
-             organization_id=organization_id,
-             dataset_id=dataset_id,
-             dataset_twin_graph_query=nodes_query,
-         )
-
-         nodes_time = time.time() - nodes_start
-         LOGGER.debug(T("coal.services.dataset.twingraph_nodes_found").format(count=len(nodes)))
-         LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="nodes query", time=nodes_time))
-
-         # Query edges
-         edges_start = time.time()
-         LOGGER.debug(T("coal.services.dataset.twingraph_querying_edges").format(dataset_id=dataset_id))
-         edges_query = DatasetTwinGraphQuery(query="MATCH(n)-[r]->(m) RETURN n as src, r as rel, m as dest")
-
-         edges = dataset_api.twingraph_query(
-             organization_id=organization_id,
-             dataset_id=dataset_id,
-             dataset_twin_graph_query=edges_query,
-         )
-
-         edges_time = time.time() - edges_start
-         LOGGER.debug(T("coal.services.dataset.twingraph_edges_found").format(count=len(edges)))
-         LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="edges query", time=edges_time))
-
-     # Process results
-     process_start = time.time()
-     content = get_content_from_twin_graph_data(nodes, edges, True)
-     process_time = time.time() - process_start
-
-     LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="data processing", time=process_time))
-
-     # Convert to files if target_folder is provided
-     if target_folder:
-         dataset_info = {
-             "type": "twincache",
-             "content": content,
-             "name": f"TwinGraph Dataset {dataset_id}",
-         }
-         target_folder = convert_dataset_to_files(dataset_info, target_folder)
-     else:
-         target_folder = tempfile.mkdtemp()
-
-     elapsed_time = time.time() - start_time
-     LOGGER.info(T("coal.common.timing.operation_completed").format(operation="TwinGraph download", time=elapsed_time))
-     LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="TwinGraph"))
-
-     return content, Path(target_folder)
-
-
- def download_legacy_twingraph_dataset(
-     organization_id: str,
-     cache_name: str,
-     target_folder: Optional[Union[str, Path]] = None,
- ) -> Tuple[Dict[str, Any], Path]:
-     """
-     Download dataset from legacy TwinGraph.
-
-     Args:
-         organization_id: Organization ID
-         cache_name: Twin cache name
-         target_folder: Optional folder to save files (if None, uses temp dir)
-
-     Returns:
-         Tuple of (content dict, folder path)
-     """
-     start_time = time.time()
-     LOGGER.info(T("coal.services.dataset.download_started").format(dataset_type="Legacy TwinGraph"))
-     LOGGER.debug(
-         T("coal.services.dataset.legacy_twingraph_downloading").format(
-             organization_id=organization_id, cache_name=cache_name
-         )
-     )
-
-     with get_api_client()[0] as api_client:
-         api_instance = TwingraphApi(api_client)
-
-         # Query nodes
-         nodes_start = time.time()
-         LOGGER.debug(T("coal.services.dataset.legacy_twingraph_querying_nodes").format(cache_name=cache_name))
-         _query_nodes = TwinGraphQuery(query="MATCH(n) RETURN n")
-
-         nodes = api_instance.query(
-             organization_id=organization_id,
-             graph_id=cache_name,
-             twin_graph_query=_query_nodes,
-         )
-
-         nodes_time = time.time() - nodes_start
-         LOGGER.debug(T("coal.services.dataset.legacy_twingraph_nodes_found").format(count=len(nodes)))
-         LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="nodes query", time=nodes_time))
-
-         # Query relationships
-         rel_start = time.time()
-         LOGGER.debug(T("coal.services.dataset.legacy_twingraph_querying_relations").format(cache_name=cache_name))
-         _query_rel = TwinGraphQuery(query="MATCH(n)-[r]->(m) RETURN n as src, r as rel, m as dest")
-
-         rel = api_instance.query(
-             organization_id=organization_id,
-             graph_id=cache_name,
-             twin_graph_query=_query_rel,
-         )
-
-         rel_time = time.time() - rel_start
-         LOGGER.debug(T("coal.services.dataset.legacy_twingraph_relations_found").format(count=len(rel)))
-         LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="relations query", time=rel_time))
-
-     # Process results
-     process_start = time.time()
-     content = get_content_from_twin_graph_data(nodes, rel, False)
-     process_time = time.time() - process_start
-
-     LOGGER.debug(T("coal.common.timing.operation_completed").format(operation="data processing", time=process_time))
-
-     # Convert to files if target_folder is provided
-     if target_folder:
-         dataset_info = {
-             "type": "twincache",
-             "content": content,
-             "name": f"Legacy TwinGraph Dataset {cache_name}",
-         }
-         target_folder = convert_dataset_to_files(dataset_info, target_folder)
-     else:
-         target_folder = tempfile.mkdtemp()
-
-     elapsed_time = time.time() - start_time
-     LOGGER.info(
-         T("coal.common.timing.operation_completed").format(operation="Legacy TwinGraph download", time=elapsed_time)
-     )
-     LOGGER.info(T("coal.services.dataset.download_completed").format(dataset_type="Legacy TwinGraph"))
-
-     return content, Path(target_folder)
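
Note: the two helpers above differ mainly in the API surface they query: download_twingraph_dataset issues DatasetTwinGraphQuery calls through DatasetApi.twingraph_query, while download_legacy_twingraph_dataset uses the older TwingraphApi.query against a named cache. A minimal sketch of the non-legacy path (IDs are placeholders):

from cosmotech.coal.cosmotech_api.dataset.download.twingraph import download_twingraph_dataset  # removed in 2.0.0

content, folder = download_twingraph_dataset(
    organization_id="o-example",  # placeholder
    dataset_id="d-example",       # placeholder
    target_folder="./twingraph",
)
# content is built from the two Cypher queries shown in the diff:
#   MATCH(n) RETURN n
#   MATCH(n)-[r]->(m) RETURN n as src, r as rel, m as dest
print(folder)
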