FlowerPower 0.11.6.19__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (80)
  1. flowerpower/cfg/__init__.py +3 -3
  2. flowerpower/cfg/pipeline/__init__.py +5 -3
  3. flowerpower/cfg/project/__init__.py +3 -3
  4. flowerpower/cfg/project/job_queue.py +1 -128
  5. flowerpower/cli/__init__.py +5 -5
  6. flowerpower/cli/cfg.py +0 -3
  7. flowerpower/cli/job_queue.py +401 -133
  8. flowerpower/cli/pipeline.py +14 -413
  9. flowerpower/cli/utils.py +0 -1
  10. flowerpower/flowerpower.py +537 -28
  11. flowerpower/job_queue/__init__.py +5 -94
  12. flowerpower/job_queue/base.py +201 -3
  13. flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -3
  14. flowerpower/job_queue/rq/manager.py +388 -77
  15. flowerpower/pipeline/__init__.py +2 -0
  16. flowerpower/pipeline/base.py +2 -2
  17. flowerpower/pipeline/io.py +14 -16
  18. flowerpower/pipeline/manager.py +21 -642
  19. flowerpower/pipeline/pipeline.py +571 -0
  20. flowerpower/pipeline/registry.py +242 -10
  21. flowerpower/pipeline/visualizer.py +1 -2
  22. flowerpower/plugins/_io/__init__.py +8 -0
  23. flowerpower/plugins/mqtt/manager.py +6 -6
  24. flowerpower/settings/backend.py +0 -2
  25. flowerpower/settings/job_queue.py +1 -57
  26. flowerpower/utils/misc.py +0 -256
  27. flowerpower/utils/monkey.py +1 -83
  28. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/METADATA +308 -152
  29. flowerpower-0.20.0.dist-info/RECORD +58 -0
  30. flowerpower/fs/__init__.py +0 -29
  31. flowerpower/fs/base.py +0 -662
  32. flowerpower/fs/ext.py +0 -2143
  33. flowerpower/fs/storage_options.py +0 -1420
  34. flowerpower/job_queue/apscheduler/__init__.py +0 -11
  35. flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
  36. flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
  37. flowerpower/job_queue/apscheduler/manager.py +0 -1051
  38. flowerpower/job_queue/apscheduler/setup.py +0 -554
  39. flowerpower/job_queue/apscheduler/trigger.py +0 -169
  40. flowerpower/job_queue/apscheduler/utils.py +0 -311
  41. flowerpower/pipeline/job_queue.py +0 -583
  42. flowerpower/pipeline/runner.py +0 -603
  43. flowerpower/plugins/io/base.py +0 -2520
  44. flowerpower/plugins/io/helpers/datetime.py +0 -298
  45. flowerpower/plugins/io/helpers/polars.py +0 -875
  46. flowerpower/plugins/io/helpers/pyarrow.py +0 -570
  47. flowerpower/plugins/io/helpers/sql.py +0 -202
  48. flowerpower/plugins/io/loader/__init__.py +0 -28
  49. flowerpower/plugins/io/loader/csv.py +0 -37
  50. flowerpower/plugins/io/loader/deltatable.py +0 -190
  51. flowerpower/plugins/io/loader/duckdb.py +0 -19
  52. flowerpower/plugins/io/loader/json.py +0 -37
  53. flowerpower/plugins/io/loader/mqtt.py +0 -159
  54. flowerpower/plugins/io/loader/mssql.py +0 -26
  55. flowerpower/plugins/io/loader/mysql.py +0 -26
  56. flowerpower/plugins/io/loader/oracle.py +0 -26
  57. flowerpower/plugins/io/loader/parquet.py +0 -35
  58. flowerpower/plugins/io/loader/postgres.py +0 -26
  59. flowerpower/plugins/io/loader/pydala.py +0 -19
  60. flowerpower/plugins/io/loader/sqlite.py +0 -23
  61. flowerpower/plugins/io/metadata.py +0 -244
  62. flowerpower/plugins/io/saver/__init__.py +0 -28
  63. flowerpower/plugins/io/saver/csv.py +0 -36
  64. flowerpower/plugins/io/saver/deltatable.py +0 -186
  65. flowerpower/plugins/io/saver/duckdb.py +0 -19
  66. flowerpower/plugins/io/saver/json.py +0 -36
  67. flowerpower/plugins/io/saver/mqtt.py +0 -28
  68. flowerpower/plugins/io/saver/mssql.py +0 -26
  69. flowerpower/plugins/io/saver/mysql.py +0 -26
  70. flowerpower/plugins/io/saver/oracle.py +0 -26
  71. flowerpower/plugins/io/saver/parquet.py +0 -36
  72. flowerpower/plugins/io/saver/postgres.py +0 -26
  73. flowerpower/plugins/io/saver/pydala.py +0 -20
  74. flowerpower/plugins/io/saver/sqlite.py +0 -24
  75. flowerpower/utils/scheduler.py +0 -311
  76. flowerpower-0.11.6.19.dist-info/RECORD +0 -102
  77. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/WHEEL +0 -0
  78. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/entry_points.txt +0 -0
  79. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/licenses/LICENSE +0 -0
  80. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/top_level.txt +0 -0
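Items 30-33 above drop the entire flowerpower/fs/ package and items 43-74 drop the flowerpower/plugins/io/ tree, so imports against those paths stop resolving in 0.20.0. The sketch below is a hypothetical downstream guard, not code from either wheel; it only assumes what the file list shows, namely that flowerpower.fs.storage_options ships in 0.11.6.19 and is gone in 0.20.0.

    # Hypothetical guard for downstream code (not part of either release):
    # flowerpower.fs.storage_options exists in 0.11.6.19 but not in 0.20.0.
    try:
        from flowerpower.fs.storage_options import AwsStorageOptions  # 0.11.x layout
    except ImportError:
        # 0.20.0: flowerpower/fs/ no longer ships; fall back or fail loudly here.
        AwsStorageOptions = None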
flowerpower/fs/storage_options.py
@@ -1,1420 +0,0 @@
- import configparser
- import os
- from typing import Any, TypeVar, Union
-
- import msgspec
- import yaml
- from fsspec import AbstractFileSystem, filesystem
- from fsspec.utils import infer_storage_options
-
-
- class BaseStorageOptions(msgspec.Struct):
-     """Base class for filesystem storage configuration options.
-
-     Provides common functionality for all storage option classes including:
-     - YAML serialization/deserialization
-     - Dictionary conversion
-     - Filesystem instance creation
-     - Configuration updates
-
-     Attributes:
-         protocol (str): Storage protocol identifier (e.g., "s3", "gs", "file")
-
-     Example:
-         >>> # Create and save options
-         >>> options = BaseStorageOptions(protocol="s3")
-         >>> options.to_yaml("config.yml")
-         >>>
-         >>> # Load from YAML
-         >>> loaded = BaseStorageOptions.from_yaml("config.yml")
-         >>> print(loaded.protocol)
-         's3'
-     """
-
-     protocol: str
-
-     def to_dict(self, with_protocol: bool = False) -> dict:
-         """Convert storage options to dictionary.
-
-         Args:
-             with_protocol: Whether to include protocol in output dictionary
-
-         Returns:
-             dict: Dictionary of storage options with non-None values
-
-         Example:
-             >>> options = BaseStorageOptions(protocol="s3")
-             >>> print(options.to_dict())
-             {}
-             >>> print(options.to_dict(with_protocol=True))
-             {'protocol': 's3'}
-         """
-         data = msgspec.structs.asdict(self)
-         result = {}
-         for key, value in data.items():
-             if value is None:
-                 continue
-
-             if key == "protocol":
-                 if with_protocol:
-                     result[key] = value
-             else:
-                 result[key] = value
-         return result
-
-     @classmethod
-     def from_yaml(
-         cls, path: str, fs: AbstractFileSystem = None
-     ) -> "BaseStorageOptions":
-         """Load storage options from YAML file.
-
-         Args:
-             path: Path to YAML configuration file
-             fs: Filesystem to use for reading file
-
-         Returns:
-             BaseStorageOptions: Loaded storage options instance
-
-         Example:
-             >>> # Load from local file
-             >>> options = BaseStorageOptions.from_yaml("config.yml")
-             >>> print(options.protocol)
-             's3'
-         """
-         if fs is None:
-             fs = filesystem("file")
-         with fs.open(path) as f:
-             data = yaml.safe_load(f)
-         return cls(**data)
-
-     def to_yaml(self, path: str, fs: AbstractFileSystem = None) -> None:
-         """Save storage options to YAML file.
-
-         Args:
-             path: Path where to save configuration
-             fs: Filesystem to use for writing
-
-         Example:
-             >>> options = BaseStorageOptions(protocol="s3")
-             >>> options.to_yaml("config.yml")
-         """
-         if fs is None:
-             fs = filesystem("file")
-         data = self.to_dict()
-         with fs.open(path, "w") as f:
-             yaml.safe_dump(data, f)
-
-     def to_filesystem(self) -> AbstractFileSystem:
-         """Create fsspec filesystem instance from options.
-
-         Returns:
-             AbstractFileSystem: Configured filesystem instance
-
-         Example:
-             >>> options = BaseStorageOptions(protocol="file")
-             >>> fs = options.to_filesystem()
-             >>> files = fs.ls("/path/to/data")
-         """
-         return filesystem(**self.to_dict(with_protocol=True))
-
-     def update(self, **kwargs: Any) -> "BaseStorageOptions":
-         """Update storage options with new values.
-
-         Args:
-             **kwargs: New option values to set
-
-         Returns:
-             BaseStorageOptions: Updated instance
-
-         Example:
-             >>> options = BaseStorageOptions(protocol="s3")
-             >>> options = options.update(region="us-east-1")
-             >>> print(options.region)
-             'us-east-1'
-         """
-         return self.replace(**kwargs)
-
-
- class AzureStorageOptions(BaseStorageOptions):
-     """Azure Storage configuration options.
-
-     Provides configuration for Azure storage services:
-     - Azure Blob Storage (az://)
-     - Azure Data Lake Storage Gen2 (abfs://)
-     - Azure Data Lake Storage Gen1 (adl://)
-
-     Supports multiple authentication methods:
-     - Connection string
-     - Account key
-     - Service principal
-     - Managed identity
-     - SAS token
-
-     Attributes:
-         protocol (str): Storage protocol ("az", "abfs", or "adl")
-         account_name (str): Storage account name
-         account_key (str): Storage account access key
-         connection_string (str): Full connection string
-         tenant_id (str): Azure AD tenant ID
-         client_id (str): Service principal client ID
-         client_secret (str): Service principal client secret
-         sas_token (str): SAS token for limited access
-
-     Example:
-         >>> # Blob Storage with account key
-         >>> options = AzureStorageOptions(
-         ...     protocol="az",
-         ...     account_name="mystorageacct",
-         ...     account_key="key123..."
-         ... )
-         >>>
-         >>> # Data Lake with service principal
-         >>> options = AzureStorageOptions(
-         ...     protocol="abfs",
-         ...     account_name="mydatalake",
-         ...     tenant_id="tenant123",
-         ...     client_id="client123",
-         ...     client_secret="secret123"
-         ... )
-         >>>
-         >>> # Simple connection string auth
-         >>> options = AzureStorageOptions(
-         ...     protocol="az",
-         ...     connection_string="DefaultEndpoints..."
-         ... )
-     """
-
-     protocol: str
-     account_name: str | None = None
-     account_key: str | None = None
-     connection_string: str | None = None
-     tenant_id: str | None = None
-     client_id: str | None = None
-     client_secret: str | None = None
-     sas_token: str | None = None
-
-     @classmethod
-     def from_env(cls) -> "AzureStorageOptions":
-         """Create storage options from environment variables.
-
-         Reads standard Azure environment variables:
-         - AZURE_STORAGE_ACCOUNT_NAME
-         - AZURE_STORAGE_ACCOUNT_KEY
-         - AZURE_STORAGE_CONNECTION_STRING
-         - AZURE_TENANT_ID
-         - AZURE_CLIENT_ID
-         - AZURE_CLIENT_SECRET
-         - AZURE_STORAGE_SAS_TOKEN
-
-         Returns:
-             AzureStorageOptions: Configured storage options
-
-         Example:
-             >>> # With environment variables set:
-             >>> options = AzureStorageOptions.from_env()
-             >>> print(options.account_name)  # From AZURE_STORAGE_ACCOUNT_NAME
-             'mystorageacct'
-         """
-         return cls(
-             protocol=os.getenv("AZURE_STORAGE_PROTOCOL", "az"),
-             account_name=os.getenv("AZURE_STORAGE_ACCOUNT_NAME"),
-             account_key=os.getenv("AZURE_STORAGE_ACCOUNT_KEY"),
-             connection_string=os.getenv("AZURE_STORAGE_CONNECTION_STRING"),
-             tenant_id=os.getenv("AZURE_TENANT_ID"),
-             client_id=os.getenv("AZURE_CLIENT_ID"),
-             client_secret=os.getenv("AZURE_CLIENT_SECRET"),
-             sas_token=os.getenv("AZURE_STORAGE_SAS_TOKEN"),
-         )
-
-     def to_env(self) -> None:
-         """Export options to environment variables.
-
-         Sets standard Azure environment variables.
-
-         Example:
-             >>> options = AzureStorageOptions(
-             ...     protocol="az",
-             ...     account_name="mystorageacct",
-             ...     account_key="key123"
-             ... )
-             >>> options.to_env()
-             >>> print(os.getenv("AZURE_STORAGE_ACCOUNT_NAME"))
-             'mystorageacct'
-         """
-         env = {
-             "AZURE_STORAGE_PROTOCOL": self.protocol,
-             "AZURE_STORAGE_ACCOUNT_NAME": self.account_name,
-             "AZURE_STORAGE_ACCOUNT_KEY": self.account_key,
-             "AZURE_STORAGE_CONNECTION_STRING": self.connection_string,
-             "AZURE_TENANT_ID": self.tenant_id,
-             "AZURE_CLIENT_ID": self.client_id,
-             "AZURE_CLIENT_SECRET": self.client_secret,
-             "AZURE_STORAGE_SAS_TOKEN": self.sas_token,
-         }
-         env = {k: v for k, v in env.items() if v is not None}
-         os.environ.update(env)
-
-
- class GcsStorageOptions(BaseStorageOptions):
-     """Google Cloud Storage configuration options.
-
-     Provides configuration for GCS access with support for:
-     - Service account authentication
-     - Default application credentials
-     - Token-based authentication
-     - Project configuration
-     - Custom endpoints
-
-     Attributes:
-         protocol (str): Storage protocol ("gs" or "gcs")
-         token (str): Path to service account JSON file
-         project (str): Google Cloud project ID
-         access_token (str): OAuth2 access token
-         endpoint_url (str): Custom storage endpoint
-         timeout (int): Request timeout in seconds
-
-     Example:
-         >>> # Service account auth
-         >>> options = GcsStorageOptions(
-         ...     protocol="gs",
-         ...     token="path/to/service-account.json",
-         ...     project="my-project-123"
-         ... )
-         >>>
-         >>> # Application default credentials
-         >>> options = GcsStorageOptions(
-         ...     protocol="gcs",
-         ...     project="my-project-123"
-         ... )
-         >>>
-         >>> # Custom endpoint (e.g., test server)
-         >>> options = GcsStorageOptions(
-         ...     protocol="gs",
-         ...     endpoint_url="http://localhost:4443",
-         ...     token="test-token.json"
-         ... )
-     """
-
-     protocol: str
-     token: str | None = None
-     project: str | None = None
-     access_token: str | None = None
-     endpoint_url: str | None = None
-     timeout: int | None = None
-
-     @classmethod
-     def from_env(cls) -> "GcsStorageOptions":
-         """Create storage options from environment variables.
-
-         Reads standard GCP environment variables:
-         - GOOGLE_CLOUD_PROJECT: Project ID
-         - GOOGLE_APPLICATION_CREDENTIALS: Service account file path
-         - STORAGE_EMULATOR_HOST: Custom endpoint (for testing)
-         - GCS_OAUTH_TOKEN: OAuth2 access token
-
-         Returns:
-             GcsStorageOptions: Configured storage options
-
-         Example:
-             >>> # With environment variables set:
-             >>> options = GcsStorageOptions.from_env()
-             >>> print(options.project)  # From GOOGLE_CLOUD_PROJECT
-             'my-project-123'
-         """
-         return cls(
-             protocol="gs",
-             project=os.getenv("GOOGLE_CLOUD_PROJECT"),
-             token=os.getenv("GOOGLE_APPLICATION_CREDENTIALS"),
-             endpoint_url=os.getenv("STORAGE_EMULATOR_HOST"),
-             access_token=os.getenv("GCS_OAUTH_TOKEN"),
-         )
-
-     def to_env(self) -> None:
-         """Export options to environment variables.
-
-         Sets standard GCP environment variables.
-
-         Example:
-             >>> options = GcsStorageOptions(
-             ...     protocol="gs",
-             ...     project="my-project",
-             ...     token="service-account.json"
-             ... )
-             >>> options.to_env()
-             >>> print(os.getenv("GOOGLE_CLOUD_PROJECT"))
-             'my-project'
-         """
-         env = {
-             "GOOGLE_CLOUD_PROJECT": self.project,
-             "GOOGLE_APPLICATION_CREDENTIALS": self.token,
-             "STORAGE_EMULATOR_HOST": self.endpoint_url,
-             "GCS_OAUTH_TOKEN": self.access_token,
-         }
-         env = {k: v for k, v in env.items() if v is not None}
-         os.environ.update(env)
-
-     def to_fsspec_kwargs(self) -> dict:
-         """Convert options to fsspec filesystem arguments.
-
-         Returns:
-             dict: Arguments suitable for GCSFileSystem
-
-         Example:
-             >>> options = GcsStorageOptions(
-             ...     protocol="gs",
-             ...     token="service-account.json",
-             ...     project="my-project"
-             ... )
-             >>> kwargs = options.to_fsspec_kwargs()
-             >>> fs = filesystem("gcs", **kwargs)
-         """
-         kwargs = {
-             "token": self.token,
-             "project": self.project,
-             "access_token": self.access_token,
-             "endpoint_url": self.endpoint_url,
-             "timeout": self.timeout,
-         }
-         return {k: v for k, v in kwargs.items() if v is not None}
-
-
- class AwsStorageOptions(BaseStorageOptions):
-     """AWS S3 storage configuration options.
-
-     Provides comprehensive configuration for S3 access with support for:
-     - Multiple authentication methods (keys, profiles, environment)
-     - Custom endpoints for S3-compatible services
-     - Region configuration
-     - SSL/TLS settings
-
-     Attributes:
-         protocol (str): Always "s3" for S3 storage
-         access_key_id (str): AWS access key ID
-         secret_access_key (str): AWS secret access key
-         session_token (str): AWS session token
-         endpoint_url (str): Custom S3 endpoint URL
-         region (str): AWS region name
-         allow_invalid_certificates (bool): Skip SSL certificate validation
-         allow_http (bool): Allow unencrypted HTTP connections
-         profile (str): AWS credentials profile name
-
-     Example:
-         >>> # Basic credentials
-         >>> options = AwsStorageOptions(
-         ...     access_key_id="AKIAXXXXXXXX",
-         ...     secret_access_key="SECRETKEY",
-         ...     region="us-east-1"
-         ... )
-         >>>
-         >>> # Profile-based auth
-         >>> options = AwsStorageOptions.create(profile="dev")
-         >>>
-         >>> # S3-compatible service (MinIO)
-         >>> options = AwsStorageOptions(
-         ...     endpoint_url="http://localhost:9000",
-         ...     access_key_id="minioadmin",
-         ...     secret_access_key="minioadmin",
-         ...     allow_http=True
-         ... )
-     """
-
-     protocol: str = "s3"
-     access_key_id: str | None = None
-     secret_access_key: str | None = None
-     session_token: str | None = None
-     endpoint_url: str | None = None
-     region: str | None = None
-     allow_invalid_certificates: bool | None = None
-     allow_http: bool | None = None
-
-     @classmethod
-     def create(
-         cls,
-         protocol: str = "s3",
-         access_key_id: str | None = None,
-         secret_access_key: str | None = None,
-         session_token: str | None = None,
-         endpoint_url: str | None = None,
-         region: str | None = None,
-         allow_invalid_certificates: bool | None = None,
-         allow_http: bool | None = None,
-         # Alias and loading params
-         key: str | None = None,
-         secret: str | None = None,
-         token: str | None = None,  # maps to session_token
-         profile: str | None = None,
-     ) -> "AwsStorageOptions":
-         """Creates an AwsStorageOptions instance, handling aliases and profile loading.
-
-         Args:
-             protocol: Storage protocol, defaults to "s3".
-             access_key_id: AWS access key ID.
-             secret_access_key: AWS secret access key.
-             session_token: AWS session token.
-             endpoint_url: Custom S3 endpoint URL.
-             region: AWS region name.
-             allow_invalid_certificates: Skip SSL certificate validation.
-             allow_http: Allow unencrypted HTTP connections.
-             key: Alias for access_key_id.
-             secret: Alias for secret_access_key.
-             token: Alias for session_token.
-             profile: AWS credentials profile name to load credentials from.
-
-         Returns:
-             An initialized AwsStorageOptions instance.
-         """
-
-         # Initial values from explicit args or their aliases
-         args = {
-             "protocol": protocol,
-             "access_key_id": access_key_id if access_key_id is not None else key,
-             "secret_access_key": secret_access_key
-             if secret_access_key is not None
-             else secret,
-             "session_token": session_token if session_token is not None else token,
-             "endpoint_url": endpoint_url,
-             "region": region,
-             "allow_invalid_certificates": allow_invalid_certificates,
-             "allow_http": allow_http,
-         }
-
-         if profile is not None:
-             # Note: allow_invalid_certificates and allow_http are passed to from_aws_credentials.
-             # If they are None here, from_aws_credentials will use its own defaults for those flags when reading.
-             profile_instance = cls.from_aws_credentials(
-                 profile=profile,
-                 allow_invalid_certificates=args["allow_invalid_certificates"],
-                 allow_http=args["allow_http"],
-             )
-             # Fill in missing values from profile if not already set by direct/aliased args
-             if args["access_key_id"] is None:
-                 args["access_key_id"] = profile_instance.access_key_id
-             if args["secret_access_key"] is None:
-                 args["secret_access_key"] = profile_instance.secret_access_key
-             if args["session_token"] is None:
-                 args["session_token"] = profile_instance.session_token
-             if args["endpoint_url"] is None:
-                 args["endpoint_url"] = profile_instance.endpoint_url
-             if args["region"] is None:
-                 args["region"] = profile_instance.region
-             # If allow_invalid_certificates/allow_http were None in args, and from_aws_credentials
-             # used its defaults to set them on profile_instance, we update args.
-             if (
-                 args["allow_invalid_certificates"] is None
-                 and profile_instance.allow_invalid_certificates is not None
-             ):
-                 args["allow_invalid_certificates"] = (
-                     profile_instance.allow_invalid_certificates
-                 )
-             if args["allow_http"] is None and profile_instance.allow_http is not None:
-                 args["allow_http"] = profile_instance.allow_http
-
-         # Ensure protocol is 's3' if it somehow became None
-         if args["protocol"] is None:
-             args["protocol"] = "s3"
-
-         return cls(**args)
-
-     @classmethod
-     def from_aws_credentials(
-         cls,
-         profile: str,
-         allow_invalid_certificates: bool = False,
-         allow_http: bool = False,
-     ) -> "AwsStorageOptions":
-         """Create storage options from AWS credentials file.
-
-         Loads credentials from ~/.aws/credentials and ~/.aws/config files.
-
-         Args:
-             profile: AWS credentials profile name
-             allow_invalid_certificates: Skip SSL certificate validation
-             allow_http: Allow unencrypted HTTP connections
-
-         Returns:
-             AwsStorageOptions: Configured storage options
-
-         Raises:
-             ValueError: If profile not found
-             FileNotFoundError: If credentials files missing
-
-         Example:
-             >>> # Load developer profile
-             >>> options = AwsStorageOptions.from_aws_credentials(
-             ...     profile="dev",
-             ...     allow_http=True  # For local testing
-             ... )
-         """
-         cp = configparser.ConfigParser()
-         cp.read(os.path.expanduser("~/.aws/credentials"))
-         cp.read(os.path.expanduser("~/.aws/config"))
-         if profile not in cp:
-             raise ValueError(f"Profile '{profile}' not found in AWS credentials file")
-
-         return cls(
-             protocol="s3",
-             access_key_id=cp[profile].get("aws_access_key_id", None),
-             secret_access_key=cp[profile].get("aws_secret_access_key", None),
-             session_token=cp[profile].get("aws_session_token", None),
-             endpoint_url=cp[profile].get("aws_endpoint_url", None)
-             or cp[profile].get("endpoint_url", None)
-             or cp[profile].get("aws_endpoint", None)
-             or cp[profile].get("endpoint", None),
-             region=(
-                 cp[profile].get("region", None)
-                 or cp[f"profile {profile}"].get("region", None)
-                 if f"profile {profile}" in cp
-                 else None
-             ),
-             allow_invalid_certificates=allow_invalid_certificates,
-             allow_http=allow_http,
-         )
-
-     @classmethod
-     def from_env(cls) -> "AwsStorageOptions":
-         """Create storage options from environment variables.
-
-         Reads standard AWS environment variables:
-         - AWS_ACCESS_KEY_ID
-         - AWS_SECRET_ACCESS_KEY
-         - AWS_SESSION_TOKEN
-         - AWS_ENDPOINT_URL
-         - AWS_DEFAULT_REGION
-         - ALLOW_INVALID_CERTIFICATES
-         - AWS_ALLOW_HTTP
-
-         Returns:
-             AwsStorageOptions: Configured storage options
-
-         Example:
-             >>> # Load from environment
-             >>> options = AwsStorageOptions.from_env()
-             >>> print(options.region)
-             'us-east-1'  # From AWS_DEFAULT_REGION
-         """
-         return cls(
-             access_key_id=os.getenv("AWS_ACCESS_KEY_ID"),
-             secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
-             session_token=os.getenv("AWS_SESSION_TOKEN"),
-             endpoint_url=os.getenv("AWS_ENDPOINT_URL"),
-             region=os.getenv("AWS_DEFAULT_REGION"),
-             allow_invalid_certificates="true"
-             == (os.getenv("ALLOW_INVALID_CERTIFICATES", "False").lower()),
-             allow_http="true" == (os.getenv("AWS_ALLOW_HTTP", "False").lower()),
-         )
-
-     def to_fsspec_kwargs(self) -> dict:
-         """Convert options to fsspec filesystem arguments.
-
-         Returns:
-             dict: Arguments suitable for fsspec S3FileSystem
-
-         Example:
-             >>> options = AwsStorageOptions(
-             ...     access_key_id="KEY",
-             ...     secret_access_key="SECRET",
-             ...     region="us-west-2"
-             ... )
-             >>> kwargs = options.to_fsspec_kwargs()
-             >>> fs = filesystem("s3", **kwargs)
-         """
-         fsspec_kwargs = {
-             "key": self.access_key_id,
-             "secret": self.secret_access_key,
-             "token": self.session_token,
-             "endpoint_url": self.endpoint_url,
-             "client_kwargs": {
-                 "region_name": self.region,
-                 "verify": not self.allow_invalid_certificates
-                 if self.allow_invalid_certificates is not None
-                 else True,
-                 "use_ssl": not self.allow_http if self.allow_http is not None else True,
-             },
-         }
-         return {k: v for k, v in fsspec_kwargs.items() if v is not None}
-
-     def to_object_store_kwargs(self, with_conditional_put: bool = False) -> dict:
-         """Convert options to object store arguments.
-
-         Args:
-             with_conditional_put: Add etag-based conditional put support
-
-         Returns:
-             dict: Arguments suitable for object store clients
-
-         Example:
-             >>> options = AwsStorageOptions(
-             ...     access_key_id="KEY",
-             ...     secret_access_key="SECRET"
-             ... )
-             >>> kwargs = options.to_object_store_kwargs()
-             >>> client = ObjectStore(**kwargs)
-         """
-         kwargs = {
-             k: str(v)
-             for k, v in self.to_dict().items()
-             if v is not None and k != "protocol"
-         }
-         if with_conditional_put:
-             kwargs["conditional_put"] = "etag"
-         return kwargs
-
-     def to_env(self) -> None:
-         """Export options to environment variables.
-
-         Sets standard AWS environment variables.
-
-         Example:
-             >>> options = AwsStorageOptions(
-             ...     access_key_id="KEY",
-             ...     secret_access_key="SECRET",
-             ...     region="us-east-1"
-             ... )
-             >>> options.to_env()
-             >>> print(os.getenv("AWS_ACCESS_KEY_ID"))
-             'KEY'
-         """
-         env = {
-             "AWS_ACCESS_KEY_ID": self.access_key_id,
-             "AWS_SECRET_ACCESS_KEY": self.secret_access_key,
-             "AWS_SESSION_TOKEN": self.session_token,
-             "AWS_ENDPOINT_URL": self.endpoint_url,
-             "AWS_DEFAULT_REGION": self.region,
-             "ALLOW_INVALID_CERTIFICATES": str(self.allow_invalid_certificates),
-             "AWS_ALLOW_HTTP": str(self.allow_http),
-         }
-         env = {k: v for k, v in env.items() if v is not None}
-         os.environ.update(env)
-
-     def to_filesystem(self):
-         return filesystem(self.protocol, **self.to_fsspec_kwargs())
-
-
- class GitHubStorageOptions(BaseStorageOptions):
-     """GitHub repository storage configuration options.
-
-     Provides access to files in GitHub repositories with support for:
-     - Public and private repositories
-     - Branch/tag/commit selection
-     - Token-based authentication
-     - Custom GitHub Enterprise instances
-
-     Attributes:
-         protocol (str): Always "github" for GitHub storage
-         org (str): Organization or user name
-         repo (str): Repository name
-         ref (str): Git reference (branch, tag, or commit SHA)
-         token (str): GitHub personal access token
-         api_url (str): Custom GitHub API URL for enterprise instances
-
-     Example:
-         >>> # Public repository
-         >>> options = GitHubStorageOptions(
-         ...     org="microsoft",
-         ...     repo="vscode",
-         ...     ref="main"
-         ... )
-         >>>
-         >>> # Private repository
-         >>> options = GitHubStorageOptions(
-         ...     org="myorg",
-         ...     repo="private-repo",
-         ...     token="ghp_xxxx",
-         ...     ref="develop"
-         ... )
-         >>>
-         >>> # Enterprise instance
-         >>> options = GitHubStorageOptions(
-         ...     org="company",
-         ...     repo="internal",
-         ...     api_url="https://github.company.com/api/v3",
-         ...     token="ghp_xxxx"
-         ... )
-     """
-
-     protocol: str = "github"
-     org: str | None = None
-     repo: str | None = None
-     ref: str | None = None
-     token: str | None = None
-     api_url: str | None = None
-
-     @classmethod
-     def from_env(cls) -> "GitHubStorageOptions":
-         """Create storage options from environment variables.
-
-         Reads standard GitHub environment variables:
-         - GITHUB_ORG: Organization or user name
-         - GITHUB_REPO: Repository name
-         - GITHUB_REF: Git reference
-         - GITHUB_TOKEN: Personal access token
-         - GITHUB_API_URL: Custom API URL
-
-         Returns:
-             GitHubStorageOptions: Configured storage options
-
-         Example:
-             >>> # With environment variables set:
-             >>> options = GitHubStorageOptions.from_env()
-             >>> print(options.org)  # From GITHUB_ORG
-             'microsoft'
-         """
-         return cls(
-             protocol="github",
-             org=os.getenv("GITHUB_ORG"),
-             repo=os.getenv("GITHUB_REPO"),
-             ref=os.getenv("GITHUB_REF"),
-             token=os.getenv("GITHUB_TOKEN"),
-             api_url=os.getenv("GITHUB_API_URL"),
-         )
-
-     def to_env(self) -> None:
-         """Export options to environment variables.
-
-         Sets standard GitHub environment variables.
-
-         Example:
-             >>> options = GitHubStorageOptions(
-             ...     org="microsoft",
-             ...     repo="vscode",
-             ...     token="ghp_xxxx"
-             ... )
-             >>> options.to_env()
-             >>> print(os.getenv("GITHUB_ORG"))
-             'microsoft'
-         """
-         env = {
-             "GITHUB_ORG": self.org,
-             "GITHUB_REPO": self.repo,
-             "GITHUB_REF": self.ref,
-             "GITHUB_TOKEN": self.token,
-             "GITHUB_API_URL": self.api_url,
-         }
-         env = {k: v for k, v in env.items() if v is not None}
-         os.environ.update(env)
-
-     def to_fsspec_kwargs(self) -> dict:
-         """Convert options to fsspec filesystem arguments.
-
-         Returns:
-             dict: Arguments suitable for GitHubFileSystem
-
-         Example:
-             >>> options = GitHubStorageOptions(
-             ...     org="microsoft",
-             ...     repo="vscode",
-             ...     token="ghp_xxxx"
-             ... )
-             >>> kwargs = options.to_fsspec_kwargs()
-             >>> fs = filesystem("github", **kwargs)
-         """
-         kwargs = {
-             "org": self.org,
-             "repo": self.repo,
-             "ref": self.ref,
-             "token": self.token,
-             "api_url": self.api_url,
-         }
-         return {k: v for k, v in kwargs.items() if v is not None}
-
-
- class GitLabStorageOptions(BaseStorageOptions):
-     """GitLab repository storage configuration options.
-
-     Provides access to files in GitLab repositories with support for:
-     - Public and private repositories
-     - Self-hosted GitLab instances
-     - Project ID or name-based access
-     - Branch/tag/commit selection
-     - Token-based authentication
-
-     Attributes:
-         protocol (str): Always "gitlab" for GitLab storage
-         base_url (str): GitLab instance URL, defaults to gitlab.com
-         project_id (str | int): Project ID number
-         project_name (str): Project name/path
-         ref (str): Git reference (branch, tag, or commit SHA)
-         token (str): GitLab personal access token
-         api_version (str): API version to use
-
-     Example:
-         >>> # Public project on gitlab.com
-         >>> options = GitLabStorageOptions(
-         ...     project_name="group/project",
-         ...     ref="main"
-         ... )
-         >>>
-         >>> # Private project with token
-         >>> options = GitLabStorageOptions(
-         ...     project_id=12345,
-         ...     token="glpat_xxxx",
-         ...     ref="develop"
-         ... )
-         >>>
-         >>> # Self-hosted instance
-         >>> options = GitLabStorageOptions(
-         ...     base_url="https://gitlab.company.com",
-         ...     project_name="internal/project",
-         ...     token="glpat_xxxx"
-         ... )
-     """
-
-     protocol: str = "gitlab"
-     base_url: str = "https://gitlab.com"
-     project_id: str | int | None = None
-     project_name: str | None = None
-     ref: str | None = None
-     token: str | None = None
-     api_version: str = "v4"
-
-     def __post_init__(self) -> None:
-         """Validate GitLab configuration after initialization.
-
-         Ensures either project_id or project_name is provided.
-
-         Args:
-             __context: Pydantic validation context (unused)
-
-         Raises:
-             ValueError: If neither project_id nor project_name is provided
-
-         Example:
-             >>> # Valid initialization
-             >>> options = GitLabStorageOptions(project_id=12345)
-             >>>
-             >>> # Invalid initialization
-             >>> try:
-             ...     options = GitLabStorageOptions()
-             ... except ValueError as e:
-             ...     print(str(e))
-             'Either project_id or project_name must be provided'
-         """
-         if self.project_id is None and self.project_name is None:
-             raise ValueError("Either project_id or project_name must be provided")
-
-     @classmethod
-     def from_env(cls) -> "GitLabStorageOptions":
-         """Create storage options from environment variables.
-
-         Reads standard GitLab environment variables:
-         - GITLAB_URL: Instance URL
-         - GITLAB_PROJECT_ID: Project ID
-         - GITLAB_PROJECT_NAME: Project name/path
-         - GITLAB_REF: Git reference
-         - GITLAB_TOKEN: Personal access token
-         - GITLAB_API_VERSION: API version
-
-         Returns:
-             GitLabStorageOptions: Configured storage options
-
-         Example:
-             >>> # With environment variables set:
-             >>> options = GitLabStorageOptions.from_env()
-             >>> print(options.project_id)  # From GITLAB_PROJECT_ID
-             '12345'
-         """
-         return cls(
-             protocol="gitlab",
-             base_url=os.getenv("GITLAB_URL", "https://gitlab.com"),
-             project_id=os.getenv("GITLAB_PROJECT_ID"),
-             project_name=os.getenv("GITLAB_PROJECT_NAME"),
-             ref=os.getenv("GITLAB_REF"),
-             token=os.getenv("GITLAB_TOKEN"),
-             api_version=os.getenv("GITLAB_API_VERSION", "v4"),
-         )
-
-     def to_env(self) -> None:
-         """Export options to environment variables.
-
-         Sets standard GitLab environment variables.
-
-         Example:
-             >>> options = GitLabStorageOptions(
-             ...     project_id=12345,
-             ...     token="glpat_xxxx"
-             ... )
-             >>> options.to_env()
-             >>> print(os.getenv("GITLAB_PROJECT_ID"))
-             '12345'
-         """
-         env = {
-             "GITLAB_URL": self.base_url,
-             "GITLAB_PROJECT_ID": str(self.project_id) if self.project_id else None,
-             "GITLAB_PROJECT_NAME": self.project_name,
-             "GITLAB_REF": self.ref,
-             "GITLAB_TOKEN": self.token,
-             "GITLAB_API_VERSION": self.api_version,
-         }
-         env = {k: v for k, v in env.items() if v is not None}
-         os.environ.update(env)
-
-     def to_fsspec_kwargs(self) -> dict:
-         """Convert options to fsspec filesystem arguments.
-
-         Returns:
-             dict: Arguments suitable for GitLabFileSystem
-
-         Example:
-             >>> options = GitLabStorageOptions(
-             ...     project_id=12345,
-             ...     token="glpat_xxxx"
-             ... )
-             >>> kwargs = options.to_fsspec_kwargs()
-             >>> fs = filesystem("gitlab", **kwargs)
-         """
-         kwargs = {
-             "base_url": self.base_url,
-             "project_id": self.project_id,
-             "project_name": self.project_name,
-             "ref": self.ref,
-             "token": self.token,
-             "api_version": self.api_version,
-         }
-         return {k: v for k, v in kwargs.items() if v is not None}
-
-
- class LocalStorageOptions(BaseStorageOptions):
-     """Local filesystem configuration options.
-
-     Provides basic configuration for local file access. While this class
-     is simple, it maintains consistency with other storage options and
-     enables transparent switching between local and remote storage.
-
-     Attributes:
-         protocol (str): Always "file" for local filesystem
-         auto_mkdir (bool): Create directories automatically
-         mode (int): Default file creation mode (unix-style)
-
-     Example:
-         >>> # Basic local access
-         >>> options = LocalStorageOptions()
-         >>> fs = options.to_filesystem()
-         >>> files = fs.ls("/path/to/data")
-         >>>
-         >>> # With auto directory creation
-         >>> options = LocalStorageOptions(auto_mkdir=True)
-         >>> fs = options.to_filesystem()
-         >>> with fs.open("/new/path/file.txt", "w") as f:
-         ...     f.write("test")  # Creates /new/path/ automatically
-     """
-
-     protocol: str = "file"
-     auto_mkdir: bool = False
-     mode: int | None = None
-
-     def to_fsspec_kwargs(self) -> dict:
-         """Convert options to fsspec filesystem arguments.
-
-         Returns:
-             dict: Arguments suitable for LocalFileSystem
-
-         Example:
-             >>> options = LocalStorageOptions(auto_mkdir=True)
-             >>> kwargs = options.to_fsspec_kwargs()
-             >>> fs = filesystem("file", **kwargs)
-         """
-         kwargs = {
-             "auto_mkdir": self.auto_mkdir,
-             "mode": self.mode,
-         }
-         return {k: v for k, v in kwargs.items() if v is not None}
-
-
- def from_dict(protocol: str, storage_options: dict) -> BaseStorageOptions:
-     """Create appropriate storage options instance from dictionary.
-
-     Factory function that creates the correct storage options class based on protocol.
-
-     Args:
-         protocol: Storage protocol identifier (e.g., "s3", "gs", "file")
-         storage_options: Dictionary of configuration options
-
-     Returns:
-         BaseStorageOptions: Appropriate storage options instance
-
-     Raises:
-         ValueError: If protocol is not supported
-
-     Example:
-         >>> # Create S3 options
-         >>> options = from_dict("s3", {
-         ...     "access_key_id": "KEY",
-         ...     "secret_access_key": "SECRET"
-         ... })
-         >>> print(type(options).__name__)
-         'AwsStorageOptions'
-     """
-     if protocol == "s3":
-         if (
-             "profile" in storage_options
-             or "key" in storage_options
-             or "secret" in storage_options
-         ):
-             return AwsStorageOptions.create(**storage_options)
-         return AwsStorageOptions(**storage_options)
-     elif protocol in ["az", "abfs", "adl"]:
-         return AzureStorageOptions(**storage_options)
-     elif protocol in ["gs", "gcs"]:
-         return GcsStorageOptions(**storage_options)
-     elif protocol == "github":
-         return GitHubStorageOptions(**storage_options)
-     elif protocol == "gitlab":
-         return GitLabStorageOptions(**storage_options)
-     elif protocol == "file":
-         return LocalStorageOptions()
-     else:
-         raise ValueError(f"Unsupported protocol: {protocol}")
-
-
- def from_env(protocol: str) -> BaseStorageOptions:
-     """Create storage options from environment variables.
-
-     Factory function that creates and configures storage options from
-     protocol-specific environment variables.
-
-     Args:
-         protocol: Storage protocol identifier (e.g., "s3", "github")
-
-     Returns:
-         BaseStorageOptions: Configured storage options instance
-
-     Raises:
-         ValueError: If protocol is not supported
-
-     Example:
-         >>> # With AWS credentials in environment
-         >>> options = from_env("s3")
-         >>> print(options.access_key_id)  # From AWS_ACCESS_KEY_ID
-         'AKIAXXXXXX'
-     """
-     if protocol == "s3":
-         return AwsStorageOptions.from_env()
-     elif protocol == "github":
-         return GitHubStorageOptions.from_env()
-     elif protocol == "gitlab":
-         return GitLabStorageOptions.from_env()
-     elif protocol == "file":
-         return LocalStorageOptions()
-     else:
-         raise ValueError(f"Unsupported protocol: {protocol}")
-
-
- class StorageOptions(msgspec.Struct):
-     """High-level storage options container and factory.
-
-     Provides a unified interface for creating and managing storage options
-     for different protocols.
-
-     Attributes:
-         storage_options (BaseStorageOptions): Underlying storage options instance
-
-     Example:
-         >>> # Create from protocol
-         >>> options = StorageOptions.create(
-         ...     protocol="s3",
-         ...     access_key_id="KEY",
-         ...     secret_access_key="SECRET"
-         ... )
-         >>>
-         >>> # Create from existing options
-         >>> s3_opts = AwsStorageOptions(access_key_id="KEY")
-         >>> options = StorageOptions(storage_options=s3_opts)
-     """
-
-     storage_options: BaseStorageOptions
-
-     @classmethod
-     def create(cls, **data: Any) -> "StorageOptions":
-         """Create storage options from arguments.
-
-         Args:
-             **data: Either:
-                 - protocol and configuration options
-                 - storage_options=pre-configured instance
-
-         Returns:
-             StorageOptions: Configured storage options instance
-
-         Raises:
-             ValueError: If protocol missing or invalid
-
-         Example:
-             >>> # Direct protocol config
-             >>> options = StorageOptions.create(
-             ...     protocol="s3",
-             ...     region="us-east-1"
-             ... )
-         """
-         protocol = data.get("protocol")
-         if protocol is None and "storage_options" not in data:
-             raise ValueError("protocol must be specified")
-
-         if "storage_options" not in data:
-             if protocol == "s3":
-                 if "profile" in data or "key" in data or "secret" in data:
-                     storage_options = AwsStorageOptions.create(**data)
-                 else:
-                     storage_options = AwsStorageOptions(**data)
-             elif protocol == "github":
-                 storage_options = GitHubStorageOptions(**data)
-             elif protocol == "gitlab":
-                 storage_options = GitLabStorageOptions(**data)
-             elif protocol in ["az", "abfs", "adl"]:
-                 storage_options = AzureStorageOptions(**data)
-             elif protocol in ["gs", "gcs"]:
-                 storage_options = GcsStorageOptions(**data)
-             elif protocol == "file":
-                 storage_options = LocalStorageOptions(**data)
-             else:
-                 raise ValueError(f"Unsupported protocol: {protocol}")
-
-             return cls(storage_options=storage_options)
-         else:
-             return cls(**data)
-
-     @classmethod
-     def from_yaml(cls, path: str, fs: AbstractFileSystem = None) -> "StorageOptions":
-         """Create storage options from YAML configuration.
-
-         Args:
-             path: Path to YAML configuration file
-             fs: Filesystem for reading configuration
-
-         Returns:
-             StorageOptions: Configured storage options
-
-         Example:
-             >>> # Load from config file
-             >>> options = StorageOptions.from_yaml("storage.yml")
-             >>> print(options.storage_options.protocol)
-             's3'
-         """
-         with fs.open(path, "r") as f:
-             data = yaml.safe_load(f)
-         return cls(**data)
-
-     @classmethod
-     def from_env(cls, protocol: str) -> "StorageOptions":
-         """Create storage options from environment variables.
-
-         Args:
-             protocol: Storage protocol to configure
-
-         Returns:
-             StorageOptions: Environment-configured options
-
-         Example:
-             >>> # Load AWS config from environment
-             >>> options = StorageOptions.from_env("s3")
-         """
-         if protocol == "s3":
-             return cls(storage_options=AwsStorageOptions.from_env())
-         elif protocol == "github":
-             return cls(storage_options=GitHubStorageOptions.from_env())
-         elif protocol == "gitlab":
-             return cls(storage_options=GitLabStorageOptions.from_env())
-         elif protocol == "file":
-             return cls(storage_options=LocalStorageOptions())
-         else:
-             raise ValueError(f"Unsupported protocol: {protocol}")
-
-     def to_filesystem(self) -> AbstractFileSystem:
-         """Create fsspec filesystem instance.
-
-         Returns:
-             AbstractFileSystem: Configured filesystem instance
-
-         Example:
-             >>> options = StorageOptions(protocol="file")
-             >>> fs = options.to_filesystem()
-             >>> files = fs.ls("/data")
-         """
-         return self.storage_options.to_filesystem()
-
-     def to_dict(self, protocol: bool = False) -> dict:
-         """Convert storage options to dictionary.
-
-         Args:
-             protocol: Whether to include protocol in output
-
-         Returns:
-             dict: Storage options as dictionary
-
-         Example:
-             >>> options = StorageOptions(
-             ...     protocol="s3",
-             ...     region="us-east-1"
-             ... )
-             >>> print(options.to_dict())
-             {'region': 'us-east-1'}
-         """
-         return self.storage_options.to_dict(protocol=protocol)
-
-     def to_object_store_kwargs(self, with_conditional_put: bool = False) -> dict:
-         """Get options formatted for object store clients.
-
-         Args:
-             with_conditional_put: Add etag-based conditional put support
-
-         Returns:
-             dict: Object store configuration dictionary
-
-         Example:
-             >>> options = StorageOptions(protocol="s3")
-             >>> kwargs = options.to_object_store_kwargs()
-             >>> store = ObjectStore(**kwargs)
-         """
-         return self.storage_options.to_object_store_kwargs(
-             with_conditional_put=with_conditional_put
-         )
-
-
- def infer_protocol_from_uri(uri: str) -> str:
-     """Infer the storage protocol from a URI string.
-
-     Analyzes the URI to determine the appropriate storage protocol based on
-     the scheme or path format.
-
-     Args:
-         uri: URI or path string to analyze. Examples:
-             - "s3://bucket/path"
-             - "gs://bucket/path"
-             - "github://org/repo"
-             - "/local/path"
-
-     Returns:
-         str: Inferred protocol identifier
-
-     Example:
-         >>> # S3 protocol
-         >>> infer_protocol_from_uri("s3://my-bucket/data")
-         's3'
-         >>>
-         >>> # Local file
-         >>> infer_protocol_from_uri("/home/user/data")
-         'file'
-         >>>
-         >>> # GitHub repository
-         >>> infer_protocol_from_uri("github://microsoft/vscode")
-         'github'
-     """
-     if uri.startswith("s3://"):
-         return "s3"
-     elif uri.startswith("gs://") or uri.startswith("gcs://"):
-         return "gs"
-     elif uri.startswith("github://"):
-         return "github"
-     elif uri.startswith("gitlab://"):
-         return "gitlab"
-     elif uri.startswith(("az://", "abfs://", "adl://")):
-         return uri.split("://")[0]
-     else:
-         return "file"
-
-
- def storage_options_from_uri(uri: str) -> BaseStorageOptions:
-     """Create storage options instance from a URI string.
-
-     Infers the protocol and extracts relevant configuration from the URI
-     to create appropriate storage options.
-
-     Args:
-         uri: URI string containing protocol and optional configuration.
-             Examples:
-             - "s3://bucket/path"
-             - "gs://project/bucket/path"
-             - "github://org/repo"
-
-     Returns:
-         BaseStorageOptions: Configured storage options instance
-
-     Example:
-         >>> # S3 options
-         >>> opts = storage_options_from_uri("s3://my-bucket/data")
-         >>> print(opts.protocol)
-         's3'
-         >>>
-         >>> # GitHub options
-         >>> opts = storage_options_from_uri("github://microsoft/vscode")
-         >>> print(opts.org)
-         'microsoft'
-         >>> print(opts.repo)
-         'vscode'
-     """
-     protocol = infer_protocol_from_uri(uri)
-     options = infer_storage_options(uri)
-
-     if protocol == "s3":
-         return AwsStorageOptions(protocol=protocol, **options)
-     elif protocol in ["gs", "gcs"]:
-         return GcsStorageOptions(protocol=protocol, **options)
-     elif protocol == "github":
-         parts = uri.replace("github://", "").split("/")
-         return GitHubStorageOptions(
-             protocol=protocol, org=parts[0], repo=parts[1] if len(parts) > 1 else None
-         )
-     elif protocol == "gitlab":
-         parts = uri.replace("gitlab://", "").split("/")
-         return GitLabStorageOptions(
-             protocol=protocol, project_name=parts[-1] if parts else None
-         )
-     elif protocol in ["az", "abfs", "adl"]:
-         return AzureStorageOptions(protocol=protocol, **options)
-     else:
-         return LocalStorageOptions()
-
-
- def merge_storage_options(
-     *options: BaseStorageOptions | dict | None, overwrite: bool = True
- ) -> BaseStorageOptions:
-     """Merge multiple storage options into a single configuration.
-
-     Combines options from multiple sources with control over precedence.
-
-     Args:
-         *options: Storage options to merge. Can be:
-             - BaseStorageOptions instances
-             - Dictionaries of options
-             - None values (ignored)
-         overwrite: Whether later options override earlier ones
-
-     Returns:
-         BaseStorageOptions: Combined storage options
-
-     Example:
-         >>> # Merge with overwrite
-         >>> base = AwsStorageOptions(
-         ...     region="us-east-1",
-         ...     access_key_id="OLD_KEY"
-         ... )
-         >>> override = {"access_key_id": "NEW_KEY"}
-         >>> merged = merge_storage_options(base, override)
-         >>> print(merged.access_key_id)
-         'NEW_KEY'
-         >>>
-         >>> # Preserve existing values
-         >>> merged = merge_storage_options(
-         ...     base,
-         ...     override,
-         ...     overwrite=False
-         ... )
-         >>> print(merged.access_key_id)
-         'OLD_KEY'
-     """
-     result = {}
-     protocol = None
-
-     for opts in options:
-         if opts is None:
-             continue
-         if isinstance(opts, BaseStorageOptions):
-             opts = opts.to_dict(with_protocol=True)
-         if not protocol and "protocol" in opts:
-             protocol = opts["protocol"]
-         for k, v in opts.items():
-             if overwrite or k not in result:
-                 result[k] = v
-
-     if not protocol:
-         protocol = "file"
-     return from_dict(protocol, result)
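For reference, a minimal sketch of how the storage-options API removed above was typically wired together under 0.11.x, based solely on the class and function definitions shown in this hunk; the import path, MinIO endpoint, and bucket name are placeholder assumptions.

    # Build S3 options from AWS_* environment variables, override the endpoint
    # (e.g. a local MinIO), and obtain an fsspec filesystem from the result.
    from flowerpower.fs.storage_options import (  # 0.11.x path, removed in 0.20.0
        AwsStorageOptions,
        merge_storage_options,
    )

    base = AwsStorageOptions.from_env()            # reads AWS_ACCESS_KEY_ID, AWS_DEFAULT_REGION, ...
    override = {
        "endpoint_url": "http://localhost:9000",   # placeholder S3-compatible endpoint
        "allow_http": True,
    }
    opts = merge_storage_options(base, override)   # later values win because overwrite=True
    fs = opts.to_filesystem()                      # filesystem("s3", key=..., secret=..., ...)
    print(fs.ls("my-bucket/"))                     # placeholder bucket name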