carbonarc 1.0.0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
carbonarc/data.py ADDED
@@ -0,0 +1,545 @@
1
+ import os
2
+ import logging
3
+ from io import BytesIO
4
+ from typing import Optional
5
+ import base64
6
+
7
+ from carbonarc.base.client import BaseAPIClient
8
+ from carbonarc.base.utils import is_valid_date
9
+
10
+ log = logging.getLogger(__name__)
11
+
12
+
13
+ class DataAPIClient(BaseAPIClient):
14
+ """
15
+ A client for interacting with the Carbon Arc Data API.
16
+ """
17
+
18
+ def __init__(
19
+ self,
20
+ token: str,
21
+ host: str = "https://platform.carbonarc.co",
22
+ version: str = "v2",
23
+ ):
24
+ """
25
+ Initialize the DataAPIClient with an authentication token, host, and API version.
26
+
27
+ Args:
28
+ token: The authentication token to be used for requests.
29
+ host: The base URL of the Carbon Arc API.
30
+ version: The API version to use.
31
+ """
32
+ super().__init__(token=token, host=host, version=version)
33
+
34
+ self.base_data_url = self._build_base_url("library")
35
+
36
+ def get_datasets(
37
+ self,
38
+ ) -> dict:
39
+ url = f"{self.base_data_url}/data"
40
+
41
+ return self._get(url)
42
+
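For reference, a minimal construction-and-listing sketch. The CARBONARC_TOKEN environment variable is an assumption (token management is up to the caller), and the shape of the returned dictionary is defined by the API, not by this module:

    import os
    from carbonarc.data import DataAPIClient

    # Defaults: host="https://platform.carbonarc.co", version="v2"
    client = DataAPIClient(token=os.environ["CARBONARC_TOKEN"])

    # GET request to the library "data" endpoint; returns the dataset listing as a dict
    datasets = client.get_datasets()
    print(datasets)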
43
+ def get_dataset_information(self, data_identifier: str) -> dict:
44
+ """
45
+ Get the information for a specific dataset from the Carbon Arc API.
46
+
47
+ Args:
48
+ data_identifier (str): The identifier of the data to retrieve information for.
49
+
50
+ Returns:
51
+ dict: A dictionary containing the information for the specified dataset.
52
+ """
53
+ endpoint = f"data/{data_identifier}/information"
54
+ url = f"{self.base_data_url}/{endpoint}"
55
+
56
+ return self._get(url)
57
+
58
+ def get_data_manifest(
59
+ self,
60
+ data_identifier: str,
61
+ created_since: Optional[str] = None,
62
+ updated_since: Optional[str] = None,
63
+ ) -> dict:
64
+ """
65
+ Get the manifest for a specific data identifier from the Carbon Arc API.
66
+
67
+ Args:
68
+ data_identifier (str): The identifier of the data to retrieve manifest for.
69
+ created_since (Optional[str]): Filter by creation timestamp. Format is YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS.
70
+ updated_since (Optional[str]): Filter by update timestamp (the modification_time field). Format is YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS.
71
+
72
+ Returns:
73
+ dict: A dictionary containing the manifest for the specified data identifier.
74
+ """
75
+ endpoint = f"data/{data_identifier}/manifest"
76
+ url = f"{self.base_data_url}/{endpoint}"
77
+ params = {}
78
+ if created_since:
79
+ # validate created_since format
80
+ if not is_valid_date(created_since):
81
+ raise ValueError(
82
+ "created_since must be in YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS format."
83
+ )
84
+ params["created_since"] = created_since
85
+ if updated_since:
86
+ # validate updated_since format
87
+ if not is_valid_date(updated_since):
88
+ raise ValueError(
89
+ "updated_since must be in YYYY-MM-DD or YYYY-MM-DDTHH:MM:SS format."
90
+ )
91
+ params["updated_since"] = updated_since
92
+ return self._get(url, params=params)
93
+
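A usage sketch for the manifest call. The dataset identifier and token source are illustrative; the timestamps must use the formats validated above:

    import os
    from carbonarc.data import DataAPIClient

    client = DataAPIClient(token=os.environ["CARBONARC_TOKEN"])  # hypothetical token source
    manifest = client.get_data_manifest(
        "example-dataset",                    # illustrative identifier from get_datasets()
        created_since="2024-01-01",           # YYYY-MM-DD
        updated_since="2024-01-01T00:00:00",  # YYYY-MM-DDTHH:MM:SS
    )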
94
+ def buy_data(self, order: dict) -> dict:
95
+ """
96
+ Buy data from the Carbon Arc API.
97
+
98
+ Args:
99
+ order (dict): The order payload describing the data to purchase.
100
+
101
+ Returns:
102
+ dict: A dictionary containing the information for the specified order.
103
+ """
104
+ endpoint = "data/buy"
105
+ url = f"{self.base_data_url}/{endpoint}"
106
+
107
+ return self._post(url, json=order)
108
+
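The structure of the order payload is defined by the API rather than by this client, so the sketch below only illustrates how the call is made; the keys shown are assumptions:

    import os
    from carbonarc.data import DataAPIClient

    client = DataAPIClient(token=os.environ["CARBONARC_TOKEN"])  # hypothetical token source
    order = {
        # Illustrative keys only; consult the platform documentation for the real order schema.
        "data_identifier": "example-dataset",
        "file_ids": ["example-file-id"],
    }
    confirmation = client.buy_data(order)  # POST to the data/buy endpoint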
109
+ def get_order_details(self, order_id: str) -> dict:
110
+ """
111
+ Get the details of an order from the Carbon Arc API.
112
+
113
+ Args:
114
+ order_id (str): The ID of the order to get details for.
115
+
116
+ Returns:
117
+ dict: A dictionary containing the details of the order.
118
+ """
119
+ endpoint = f"data/order/{order_id}"
120
+ url = f"{self.base_data_url}/{endpoint}"
121
+
122
+ return self._get(url)
123
+
124
+ def download_file(self, file_id: str) -> dict:
125
+ """
126
+ Download a data file from the Carbon Arc API.
127
+
128
+ Args:
129
+ file_id (str): The ID of the file to download.
130
+
131
+ Returns:
132
+ dict: The API response for the requested file.
133
+ """
134
+ endpoint = f"data/files/{file_id}"
135
+ url = f"{self.base_data_url}/{endpoint}"
136
+
137
+ return self._get(url)
138
+
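A sketch of inspecting an order and fetching a file record afterwards. The IDs and token source are illustrative, and the exact response fields are defined by the API:

    import os
    from carbonarc.data import DataAPIClient

    client = DataAPIClient(token=os.environ["CARBONARC_TOKEN"])  # hypothetical token source
    details = client.get_order_details("example-order-id")  # GET data/order/{order_id}
    file_record = client.download_file("example-file-id")   # GET data/files/{file_id}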
139
+ def __stream_data(
140
+ self,
141
+ url: str,
142
+ chunk_size: int = 1024 * 1024 * 250, # 250MB
143
+ ):
144
+ """
145
+ Download a file stream from the Carbon Arc API.
146
+
147
+ Args:
148
+ url (str): The URL of the file to download.
149
+ chunk_size (int): The size of each chunk to download.
150
+
151
+ Returns:
152
+ generator: A generator yielding the raw stream of the file.
153
+ """
154
+ response = self.request_manager.get_stream(url)
155
+ for chunk in response.iter_content(chunk_size=chunk_size):
156
+ yield chunk
157
+
158
+ def download_data_to_file(
159
+ self, url: str, output_file: str, chunk_size: int = 1024 * 1024 * 250
160
+ ):
161
+ """
162
+ Download data from a file URL and save it to a local file.
163
+
164
+ Args:
165
+ url (str): The URL of the file to download.
166
+ output_file (str): The path to the file where the data should be saved.
167
+ chunk_size (int): The size of each chunk to download.
168
+
169
+ Returns:
170
+ None. The downloaded data is written to output_file.
171
+ """
172
+ # check if output_file directory exists
173
+ output_dir = os.path.dirname(output_file)
174
+ if output_dir and not os.path.exists(output_dir):
175
+ raise FileNotFoundError(f"Output directory {output_dir} does not exist.")
176
+
177
+ with open(output_file, "wb") as f:
178
+ for chunk in self.__stream_data(url, chunk_size):
179
+ f.write(chunk)
180
+
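A sketch of saving a file locally. The URL is illustrative and would normally come from an API response; note that the target directory must already exist:

    import os
    from carbonarc.data import DataAPIClient

    client = DataAPIClient(token=os.environ["CARBONARC_TOKEN"])  # hypothetical token source
    client.download_data_to_file(
        url="https://platform.carbonarc.co/example-download-url",  # illustrative
        output_file="downloads/data.parquet",  # the "downloads" directory must exist
    )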
181
+ def download_data_to_s3(
182
+ self,
183
+ s3_client,
184
+ file_url: str,
185
+ s3_bucket: str,
186
+ s3_key_prefix: str,
187
+ chunk_size: int = 5 * 1024 * 1024, # Default to 5MB
188
+ ):
189
+ log.info(f"Downloading file {file_url} to S3...")
190
+
191
+ # Ensure chunk size is at least 5MB (AWS requirement for multipart uploads)
192
+ if chunk_size < 5 * 1024 * 1024:
193
+ chunk_size = 5 * 1024 * 1024
194
+ log.info(
195
+ "Chunk size adjusted to 5MB to meet AWS minimum part size requirement"
196
+ )
197
+
198
+ # Make the request
199
+ response = self.request_manager.get_stream(file_url)
200
+ response.raise_for_status()
201
+
202
+ # Extract filename from response headers
203
+ filename = (
204
+ response.headers["Content-Disposition"].split("filename=")[1].strip('"')
205
+ )
206
+
207
+ # Create the full S3 key (path + filename)
208
+ s3_key = f"{s3_key_prefix.rstrip('/')}/{filename}"
209
+
210
+ # Check if file is small enough for direct upload
211
+ content_length = int(response.headers.get("Content-Length", 0))
212
+
213
+ # If the file has a known size under 10MB, use a simple upload; otherwise use multipart
214
+ if content_length > 0 and content_length < 10 * 1024 * 1024:
215
+ log.warning(f"File is small ({content_length} bytes), using simple upload")
216
+ content = response.content
217
+ s3_client.put_object(Bucket=s3_bucket, Key=s3_key, Body=content)
218
+ log.info(f"File uploaded successfully to s3://{s3_bucket}/{s3_key}")
219
+ return f"s3://{s3_bucket}/{s3_key}"
220
+
221
+ # For larger files, use multipart upload
222
+ log.info(f"Initiating multipart upload to s3://{s3_bucket}/{s3_key}")
223
+ multipart_upload = s3_client.create_multipart_upload(
224
+ Bucket=s3_bucket, Key=s3_key
225
+ )
226
+
227
+ upload_id = multipart_upload["UploadId"]
228
+ parts = []
229
+ part_number = 1
230
+
231
+ try:
232
+ # Use a buffer to collect chunks until we have at least 5MB
233
+ buffer = BytesIO()
234
+ buffer_size = 0
235
+
236
+ for chunk in response.iter_content(
237
+ chunk_size=1024 * 1024
238
+ ): # Read in 1MB chunks
239
+ if not chunk:
240
+ continue
241
+
242
+ # Add the chunk to our buffer
243
+ buffer.write(chunk)
244
+ buffer_size += len(chunk)
245
+
246
+ # Once the buffer holds at least one part's worth of data, upload it as a part
247
+ if buffer_size >= chunk_size:
248
+ # Reset buffer position to beginning for reading
249
+ buffer.seek(0)
250
+
251
+ # Upload the part
252
+ part = s3_client.upload_part(
253
+ Bucket=s3_bucket,
254
+ Key=s3_key,
255
+ PartNumber=part_number,
256
+ UploadId=upload_id,
257
+ Body=buffer.read(),
258
+ )
259
+
260
+ # Add the part info to our parts list
261
+ parts.append({"PartNumber": part_number, "ETag": part["ETag"]})
262
+
263
+ log.info(f"Uploaded part {part_number} ({buffer_size} bytes)")
264
+ part_number += 1
265
+
266
+ # Reset the buffer
267
+ buffer = BytesIO()
268
+ buffer_size = 0
269
+
270
+ # Upload any remaining data as the final part (can be less than 5MB)
271
+ if buffer_size > 0:
272
+ buffer.seek(0)
273
+ part = s3_client.upload_part(
274
+ Bucket=s3_bucket,
275
+ Key=s3_key,
276
+ PartNumber=part_number,
277
+ UploadId=upload_id,
278
+ Body=buffer.read(),
279
+ )
280
+
281
+ parts.append({"PartNumber": part_number, "ETag": part["ETag"]})
282
+
283
+ log.info(f"Uploaded final part {part_number} ({buffer_size} bytes)")
284
+
285
+ # Complete the multipart upload only if we have parts
286
+ if parts:
287
+ s3_client.complete_multipart_upload(
288
+ Bucket=s3_bucket,
289
+ Key=s3_key,
290
+ UploadId=upload_id,
291
+ MultipartUpload={"Parts": parts},
292
+ )
293
+
294
+ log.info(f"File uploaded successfully to s3://{s3_bucket}/{s3_key}")
295
+ else:
296
+ # No parts were uploaded, likely an empty file
297
+ s3_client.abort_multipart_upload(
298
+ Bucket=s3_bucket, Key=s3_key, UploadId=upload_id
299
+ )
300
+
301
+ # Upload an empty file instead
302
+ s3_client.put_object(Bucket=s3_bucket, Key=s3_key, Body=b"")
303
+ log.warning(f"Empty file uploaded to s3://{s3_bucket}/{s3_key}")
304
+
305
+ return f"s3://{s3_bucket}/{s3_key}"
306
+
307
+ except Exception as e:
308
+ # Abort the multipart upload if something goes wrong
309
+ s3_client.abort_multipart_upload(
310
+ Bucket=s3_bucket, Key=s3_key, UploadId=upload_id
311
+ )
312
+ log.error(f"Multipart upload aborted due to: {str(e)}")
313
+ raise
314
+
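A sketch of streaming a file into S3 with boto3; the bucket, key prefix, file URL, and credential handling are all assumptions:

    import os
    import boto3
    from carbonarc.data import DataAPIClient

    client = DataAPIClient(token=os.environ["CARBONARC_TOKEN"])  # hypothetical token source
    s3 = boto3.client("s3")  # uses the ambient AWS credential chain
    s3_uri = client.download_data_to_s3(
        s3_client=s3,
        file_url="https://platform.carbonarc.co/example-download-url",  # illustrative
        s3_bucket="example-bucket",
        s3_key_prefix="carbonarc/raw",
    )
    print(s3_uri)  # s3://example-bucket/carbonarc/raw/<filename from Content-Disposition>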
315
+ def download_data_to_azure(
316
+ self,
317
+ blob_service_client,
318
+ file_url: str,
319
+ container_name: str,
320
+ blob_prefix: str,
321
+ chunk_size: int = 4 * 1024 * 1024, # Default to 4MB (Azure recommendation)
322
+ ):
323
+ log.info(f"Downloading file {file_url} to Azure Blob Storage...")
324
+
325
+ # Ensure chunk size is at least 4MB (Azure recommendation for block blobs)
326
+ if chunk_size < 4 * 1024 * 1024:
327
+ chunk_size = 4 * 1024 * 1024
328
+ log.info(
329
+ "Chunk size adjusted to 4MB for optimal Azure Blob Storage performance"
330
+ )
331
+
332
+ # Make the request
333
+ response = self.request_manager.get_stream(file_url)
334
+ response.raise_for_status()
335
+
336
+ # Extract filename from response headers
337
+ filename = (
338
+ response.headers["Content-Disposition"].split("filename=")[1].strip('"')
339
+ )
340
+
341
+ # Create the full blob path (prefix + filename)
342
+ blob_name = f"{blob_prefix.rstrip('/')}/{filename}"
343
+
344
+ # Check if file is small enough for direct upload
345
+ content_length = int(response.headers.get("Content-Length", 0))
346
+
347
+ # If the file has a known size under 10MB, use a simple upload; otherwise use a block blob upload
348
+ if content_length > 0 and content_length < 10 * 1024 * 1024:
349
+ log.warning(f"File is small ({content_length} bytes), using simple upload")
350
+ content = response.content
351
+
352
+ # Get blob client
353
+ blob_client = blob_service_client.get_blob_client(
354
+ container=container_name, blob=blob_name
355
+ )
356
+
357
+ # Upload the content
358
+ blob_client.upload_blob(content, overwrite=True)
359
+ log.info(f"File uploaded successfully to azure://{container_name}/{blob_name}")
360
+ return f"azure://{container_name}/{blob_name}"
361
+
362
+ # For larger files, use block blob upload
363
+ log.info(f"Initiating block blob upload to azure://{container_name}/{blob_name}")
364
+
365
+ # Get blob client
366
+ blob_client = blob_service_client.get_blob_client(
367
+ container=container_name, blob=blob_name
368
+ )
369
+
370
+ block_list = []
371
+ block_number = 0
372
+
373
+ try:
374
+ # Use a buffer to collect chunks until we have the required size
375
+ buffer = BytesIO()
376
+ buffer_size = 0
377
+
378
+ for chunk in response.iter_content(
379
+ chunk_size=1024 * 1024
380
+ ): # Read in 1MB chunks
381
+ if not chunk:
382
+ continue
383
+
384
+ # Add the chunk to our buffer
385
+ buffer.write(chunk)
386
+ buffer_size += len(chunk)
387
+
388
+ # If we have enough data, upload the block
389
+ if buffer_size >= chunk_size:
390
+ # Reset buffer position to beginning for reading
391
+ buffer.seek(0)
392
+
393
+ # Generate block ID (must be base64 encoded)
394
+ block_id = base64.b64encode(f"block-{block_number:06d}".encode()).decode()
395
+
396
+ # Upload the block
397
+ blob_client.stage_block(block_id, buffer.read())
398
+
399
+ # Add the block ID to our list
400
+ block_list.append(block_id)
401
+
402
+ log.info(f"Uploaded block {block_number} ({buffer_size} bytes)")
403
+ block_number += 1
404
+
405
+ # Reset the buffer
406
+ buffer = BytesIO()
407
+ buffer_size = 0
408
+
409
+ # Upload any remaining data as the final block
410
+ if buffer_size > 0:
411
+ buffer.seek(0)
412
+ block_id = base64.b64encode(f"block-{block_number:06d}".encode()).decode()
413
+ blob_client.stage_block(block_id, buffer.read())
414
+ block_list.append(block_id)
415
+
416
+ log.info(f"Uploaded final block {block_number} ({buffer_size} bytes)")
417
+
418
+ # Commit the block list only if we have blocks
419
+ if block_list:
420
+ blob_client.commit_block_list(block_list)
421
+ log.info(f"File uploaded successfully to azure://{container_name}/{blob_name}")
422
+ else:
423
+ # No blocks were uploaded, likely an empty file
424
+ blob_client.upload_blob(b"", overwrite=True)
425
+ log.warning(f"Empty file uploaded to azure://{container_name}/{blob_name}")
426
+
427
+ return f"azure://{container_name}/{blob_name}"
428
+
429
+ except Exception as e:
430
+ log.error(f"Azure blob upload failed due to: {str(e)}")
431
+ raise
432
+
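A sketch of the Azure path using the azure-storage-blob SDK; the connection string variable, container, and file URL are assumptions:

    import os
    from azure.storage.blob import BlobServiceClient
    from carbonarc.data import DataAPIClient

    client = DataAPIClient(token=os.environ["CARBONARC_TOKEN"])  # hypothetical token source
    blob_service = BlobServiceClient.from_connection_string(
        os.environ["AZURE_STORAGE_CONNECTION_STRING"]  # hypothetical variable name
    )
    uri = client.download_data_to_azure(
        blob_service_client=blob_service,
        file_url="https://platform.carbonarc.co/example-download-url",  # illustrative
        container_name="example-container",
        blob_prefix="carbonarc/raw",
    )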
433
+ def download_data_to_gcp(
434
+ self,
435
+ storage_client,
436
+ file_url: str,
437
+ bucket_name: str,
438
+ blob_prefix: str,
439
+ chunk_size: int = 5 * 1024 * 1024, # Default to 5MB
440
+ ):
441
+ log.info(f"Downloading file {file_url} to Google Cloud Storage...")
442
+
443
+ # Ensure chunk size is at least 5MB (GCP recommendation for resumable uploads)
444
+ if chunk_size < 5 * 1024 * 1024:
445
+ chunk_size = 5 * 1024 * 1024
446
+ log.info(
447
+ "Chunk size adjusted to 5MB for optimal Google Cloud Storage performance"
448
+ )
449
+
450
+ # Make the request
451
+ response = self.request_manager.get_stream(file_url)
452
+ response.raise_for_status()
453
+
454
+ # Extract filename from response headers
455
+ filename = (
456
+ response.headers["Content-Disposition"].split("filename=")[1].strip('"')
457
+ )
458
+
459
+ # Create the full blob path (prefix + filename)
460
+ blob_name = f"{blob_prefix.rstrip('/')}/{filename}"
461
+
462
+ # Check if file is small enough for direct upload
463
+ content_length = int(response.headers.get("Content-Length", 0))
464
+
465
+ # If the file has a known size under 10MB, use a simple upload; otherwise use a resumable upload
466
+ if content_length > 0 and content_length < 10 * 1024 * 1024:
467
+ log.warning(f"File is small ({content_length} bytes), using simple upload")
468
+ content = response.content
469
+
470
+ # Get bucket and blob
471
+ bucket = storage_client.bucket(bucket_name)
472
+ blob = bucket.blob(blob_name)
473
+
474
+ # Upload the content
475
+ blob.upload_from_string(content)
476
+ log.info(f"File uploaded successfully to gs://{bucket_name}/{blob_name}")
477
+ return f"gs://{bucket_name}/{blob_name}"
478
+
479
+ # For larger files, use resumable upload
480
+ log.info(f"Initiating resumable upload to gs://{bucket_name}/{blob_name}")
481
+
482
+ # Get bucket and blob
483
+ bucket = storage_client.bucket(bucket_name)
484
+ blob = bucket.blob(blob_name)
485
+
486
487
+ try:
488
+ # GCS requires resumable upload chunk sizes to be a multiple of 256 KiB,
489
+ # so round the requested chunk size up to the nearest valid value.
490
+ gcs_chunk_size = ((chunk_size + 256 * 1024 - 1) // (256 * 1024)) * (256 * 1024)
491
+ total_uploaded = 0
492
+
493
+ # blob.open("wb") returns a writable file object that streams the data
494
+ # to Google Cloud Storage as a resumable, chunked upload.
495
+ with blob.open("wb", chunk_size=gcs_chunk_size) as gcs_file:
496
+ for chunk in response.iter_content(chunk_size=1024 * 1024):  # Read in 1MB chunks
497
+ if not chunk:
498
+ continue
499
+
500
+ # Write the chunk; the writer flushes full chunks to GCS automatically
501
+ gcs_file.write(chunk)
502
+ total_uploaded += len(chunk)
503
+
504
+ log.info(f"Uploaded {total_uploaded} bytes")
505
+ log.info(f"File uploaded successfully to gs://{bucket_name}/{blob_name}")
506
+
507
+ return f"gs://{bucket_name}/{blob_name}"
536
+
537
+ except Exception as e:
538
+ log.error(f"Google Cloud Storage upload failed due to: {str(e)}")
539
+ raise
540
+
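A sketch of the Google Cloud Storage path using google-cloud-storage; the bucket, prefix, file URL, and credential setup are assumptions:

    import os
    from google.cloud import storage
    from carbonarc.data import DataAPIClient

    client = DataAPIClient(token=os.environ["CARBONARC_TOKEN"])  # hypothetical token source
    gcs = storage.Client()  # uses Application Default Credentials
    uri = client.download_data_to_gcp(
        storage_client=gcs,
        file_url="https://platform.carbonarc.co/example-download-url",  # illustrative
        bucket_name="example-bucket",
        blob_prefix="carbonarc/raw",
    )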
541
+ def get_graphs(self) -> dict:
542
+ raise NotImplementedError("get_graphs is not implemented yet.")
543
+
544
+ def get_graph_data(self, data_identifier: str) -> dict:
545
+ raise NotImplementedError("get_graph_data is not implemented yet.")