outerbounds 0.3.171__py3-none-any.whl → 0.3.173rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,612 @@
+ import os
+ import sys
+ import time
+ import tarfile
+ import json
+ from io import BytesIO
+ from typing import List, Tuple, Dict, Any, Optional, Callable, Union
+
+ from metaflow.datastore.content_addressed_store import ContentAddressedStore
+ from metaflow.metaflow_config import (
+     DATASTORE_SYSROOT_S3,
+     DATASTORE_SYSROOT_AZURE,
+     DATASTORE_SYSROOT_GS,
+     DATASTORE_SYSROOT_LOCAL,
+ )
+
+ # Default prefix for code packages in content addressed store
+ CODE_PACKAGE_PREFIX = "apps-code-packages"
+
+
+ class CodePackager:
+     """
+     A datastore-agnostic class for packaging code.
+
+     This class handles creating a code package (tarball) for deployment
+     and provides methods for storing and retrieving it using Metaflow's
+     ContentAddressedStore directly.
+
+     Usage examples:
+     ```python
+     packager = CodePackager(
+         datastore_type="s3",
+         datastore_root=None,
+         code_package_prefix=None,
+     )
+
+     package_url, package_key = packager.store(
+         paths_to_include=["./"],
+         file_suffixes=[".py", ".txt", ".yaml", ".yml", ".json"],
+     )
+
+     package_url, package_key = packager.store(
+         package_create_fn=lambda: my_custom_package_create_fn(),
+     )
+     ```
+     """
+
+     def __init__(
+         self,
+         datastore_type: str = "s3",
+         datastore_root: Optional[str] = None,
+         code_package_prefix: Optional[str] = None,
+     ):
+         """
+         Initialize the CodePackager with datastore configuration.
+
+         Parameters
+         ----------
+         datastore_type : str, default "s3"
+             The type of datastore to use: "s3", "azure", "gs", or "local"
+         datastore_root : str, optional
+             Root path for the datastore. If not provided, uses the default for the datastore type.
+         code_package_prefix : str, optional
+             The prefix to use for storing code packages in the content addressed store.
+             If not provided, uses the CODE_PACKAGE_PREFIX configuration value.
+         """
+         self._datastore_type = datastore_type
+         self._datastore_root = datastore_root
+         self._code_package_prefix = code_package_prefix
+
+     def store(
+         self,
+         package_create_fn: Optional[Callable[[], bytes]] = None,
+         paths_to_include: Optional[List[str]] = None,
+         file_suffixes: Optional[List[str]] = None,
+         metadata: Optional[Dict[str, Any]] = None,
+     ) -> Tuple[str, str]:
+         """
+         Create and store a code package using Metaflow's ContentAddressedStore.
+
+         This method can be called in two ways:
+         1. With paths_to_include and file_suffixes to use the default packaging
+         2. With a custom package_create_fn for custom packaging logic
+
+         Parameters
+         ----------
+         package_create_fn : Callable[[], bytes], optional
+             A function that creates and returns a package as bytes.
+             This allows for custom packaging logic without dependency on specific objects.
+         paths_to_include : List[str], optional
+             List of paths to include in the package. Used by default_package_create.
+         file_suffixes : List[str], optional
+             List of file suffixes to include. Used by default_package_create.
+         metadata : Dict[str, Any], optional
+             Metadata to include in the package when using default_package_create.
+
+         Returns
+         -------
+         Tuple[str, str]
+             A tuple containing (package_url, package_key) that identifies the location
+             and content-addressed key of the stored package.
+         """
+         # Prepare default values
+         _paths_to_include = paths_to_include or []
+         _file_suffixes = file_suffixes or [
+             ".py",
+             ".txt",
+             ".yaml",
+             ".yml",
+             ".json",
+             ".html",
+             ".css",
+             ".js",
+             ".jsx",
+             ".ts",
+             ".tsx",
+             ".md",
+             ".rst",
+         ]
+         _metadata = metadata or {}
+
+         # If no package_create_fn provided, use default_package_create
+         if package_create_fn is None:
+             _package_create_fn = lambda: self.default_package_create(
+                 _paths_to_include, _file_suffixes, _metadata
+             )
+         else:
+             _package_create_fn = package_create_fn
+
+         # Create the package
+         code_package = _package_create_fn()
+
+         # Get the ContentAddressedStore for the specified datastore
+         ca_store = self.get_content_addressed_store(
+             datastore_type=self._datastore_type,
+             datastore_root=self._datastore_root,
+             prefix=(
+                 str(self._code_package_prefix)
+                 if self._code_package_prefix is not None
+                 else str(CODE_PACKAGE_PREFIX)
+             ),
+         )
+
+         # Store the package using raw=True to ensure we can access it directly via URL
+         results = ca_store.save_blobs([code_package], raw=True, len_hint=1)
+         package_url, package_key = results[0].uri, results[0].key
+
+         return package_url, package_key
+
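+     # Illustrative usage note (editorial, not part of the released module): the pair
+     # returned by store() can be fed straight into the command helpers below, e.g.
+     #   url, key = packager.store(paths_to_include=["./"])
+     #   cmds = packager.get_package_commands(code_package_url=url)
+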
+     @staticmethod
+     def get_content_addressed_store(
+         datastore_type: str = "s3",
+         datastore_root: Optional[str] = None,
+         prefix: Optional[str] = None,
+     ) -> ContentAddressedStore:
+         """
+         Get a ContentAddressedStore instance for the specified datastore.
+
+         Parameters
+         ----------
+         datastore_type : str, default "s3"
+             Type of datastore: "s3", "azure", "gs", or "local"
+         datastore_root : str, optional
+             Root path for the datastore. If not provided, uses the default for the datastore type.
+         prefix : str, optional
+             Prefix to use when storing objects in the datastore.
+             If not provided, uses the CODE_PACKAGE_PREFIX configuration value.
+
+         Returns
+         -------
+         ContentAddressedStore
+             A ContentAddressedStore instance configured for the specified datastore
+         """
+         from metaflow.plugins import DATASTORES
+
+         datastore_impls = [i for i in DATASTORES if i.TYPE == datastore_type]
+         if len(datastore_impls) == 0:
+             raise ValueError(f"Unsupported datastore type: {datastore_type}")
+         if len(datastore_impls) > 1:
+             raise ValueError(
+                 f"Multiple datastore implementations found for type: {datastore_type}"
+             )
+         datastore_impl = datastore_impls[0]
+         root = None
+         # Pick the datastore root for the given type, falling back to the configured default
+         if datastore_type == "s3":
+             root = datastore_root or DATASTORE_SYSROOT_S3
+         elif datastore_type == "azure":
+             root = datastore_root or DATASTORE_SYSROOT_AZURE
+         elif datastore_type == "gs":
+             root = datastore_root or DATASTORE_SYSROOT_GS
+         elif datastore_type == "local":
+             root = datastore_root or DATASTORE_SYSROOT_LOCAL
+
+         # Ensure prefix is a string
+         store_prefix = str(prefix) if prefix is not None else str(CODE_PACKAGE_PREFIX)
+
+         storage_impl = datastore_impl(root=root)
+         # Create and return a ContentAddressedStore
+         return ContentAddressedStore(prefix=store_prefix, storage_impl=storage_impl)
+
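+     # Illustrative sketch (editorial; assumes a locally configured Metaflow datastore,
+     # and "/tmp/mf-datastore" is a hypothetical path):
+     #   store = CodePackager.get_content_addressed_store(
+     #       datastore_type="local", datastore_root="/tmp/mf-datastore"
+     #   )
+     # This is the same store that store() constructs internally before calling save_blobs().
+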
+     @staticmethod
+     def get_download_cmd(
+         package_url: str,
+         datastore_type: str,
+         python_cmd: str = "python",
+         target_file: str = "job.tar",
+         escape_quotes: bool = True,
+     ) -> str:
+         """
+         Generate a command to download the code package.
+
+         Parameters
+         ----------
+         package_url : str
+             The URL of the package to download
+         datastore_type : str
+             The type of datastore (s3, azure, gs, local)
+         python_cmd : str, optional
+             The Python command to use
+         target_file : str, optional
+             The target file name to save the package as
+         escape_quotes : bool, optional
+             Whether to escape quotes in the command
+
+         Returns
+         -------
+         str
+             A shell command string to download the package
+         """
+         if datastore_type == "s3":
+             from metaflow.plugins.aws.aws_utils import parse_s3_full_path
+
+             bucket, s3_object = parse_s3_full_path(package_url)
+             # Build the download script as a one-liner; single quotes around it in the returned command avoid shell escaping issues
+             script = 'import boto3, os; ep=os.getenv({quote}METAFLOW_S3_ENDPOINT_URL{quote}); boto3.client("s3", **({{"endpoint_url":ep}} if ep else {{}})).download_file({quote}{bucket}{quote}, {quote}{s3_object}{quote}, {quote}{target_file}{quote})'.format(
+                 quote='\\"' if escape_quotes else '"',
+                 bucket=bucket,
+                 s3_object=s3_object,
+                 target_file=target_file,
+             )
+             # Format the command with proper quoting
+             return f"{python_cmd} -c '{script}'"
+         elif datastore_type == "azure":
+             from metaflow.plugins.azure.azure_utils import parse_azure_full_path
+
+             container_name, blob = parse_azure_full_path(package_url)
+             # remove a trailing slash, if present
+             blob_endpoint = "${METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT%/}"
+             return "download-azure-blob --blob-endpoint={blob_endpoint} --container={container} --blob={blob} --output-file={target}".format(
+                 blob_endpoint=blob_endpoint,
+                 blob=blob,
+                 container=container_name,
+                 target=target_file,
+             )
+         elif datastore_type == "gs":
+             from metaflow.plugins.gcp.gs_utils import parse_gs_full_path
+
+             bucket_name, gs_object = parse_gs_full_path(package_url)
+             return "download-gcp-object --bucket=%s --object=%s --output-file=%s" % (
+                 bucket_name,
+                 gs_object,
+                 target_file,
+             )
+         elif datastore_type == "local":
+             # For local storage, simply copy the file
+             return "cp %s %s" % (package_url, target_file)
+         else:
+             raise NotImplementedError(
+                 f"Download command not implemented for datastore type: {datastore_type}"
+             )
+
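+     # Editorial note: escape_quotes=True emits \" instead of " so the returned command
+     # can presumably be embedded inside an already double-quoted context (for example a
+     # JSON-encoded container command); with escape_quotes=False the quotes are emitted
+     # verbatim for direct shell use.
+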
+     def get_package_commands(
+         self,
+         code_package_url: str,
+         python_cmd: str = "python",
+         target_file: str = "job.tar",
+         working_dir: str = "metaflow",
+         retries: int = 5,
+         escape_quotes: bool = True,
+     ) -> List[str]:
+         """
+         Get a complete list of shell commands to download and extract a code package.
+
+         This method generates a comprehensive set of shell commands for downloading
+         and extracting a code package, similar to MetaflowEnvironment.get_package_commands.
+
+         Parameters
+         ----------
+         code_package_url : str
+             The URL of the code package to download
+         python_cmd : str, optional
+             The Python command to use
+         target_file : str, optional
+             The target file name to save the package as
+         working_dir : str, optional
+             The directory to create and extract the package into
+         retries : int, optional
+             Number of download retries to attempt
+         escape_quotes : bool, optional
+             Whether to escape quotes in the command
+
+         Returns
+         -------
+         List[str]
+             List of shell commands to execute
+         """
+         # Use the datastore type configured at initialization
+         datastore_type = self._datastore_type
+
+         # Helper function to create dependency installation command
+         def _get_install_dependencies_cmd():
+             base_cmd = "{} -m pip install -qqq --no-compile --no-cache-dir --disable-pip-version-check".format(
+                 python_cmd
+             )
+
+             datastore_packages = {
+                 "s3": ["boto3"],
+                 "azure": [
+                     "azure-identity",
+                     "azure-storage-blob",
+                     "azure-keyvault-secrets",
+                     "simple-azure-blob-downloader",
+                 ],
+                 "gs": [
+                     "google-cloud-storage",
+                     "google-auth",
+                     "simple-gcp-object-downloader",
+                     "google-cloud-secret-manager",
+                 ],
+                 "local": [],
+             }
+
+             if datastore_type not in datastore_packages:
+                 raise NotImplementedError(
+                     "Unknown datastore type: {}".format(datastore_type)
+                 )
+
+             if not datastore_packages[datastore_type]:
+                 return "# No dependencies required for local datastore"
+
+             cmd = "{} {}".format(
+                 base_cmd, " ".join(datastore_packages[datastore_type] + ["requests"])
+             )
+             # Skip pip installs if we know packages might already be available
+             return "if [ -z $METAFLOW_SKIP_INSTALL_DEPENDENCIES ]; then {}; fi".format(
+                 cmd
+             )
+
+         download_cmd = self.get_download_cmd(
+             code_package_url, datastore_type, python_cmd, target_file, escape_quotes
+         )
+
+         # Define the log functions for bash
+         bash_mflog = (
+             'function mflog() { echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")]" "$@"; }'
+         )
+         bash_flush_logs = 'function flush_mflogs() { echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Flushing logs"; }'
+
+         cmds = [
+             bash_mflog,
+             bash_flush_logs,
+             "mflog 'Setting up task environment.'",
+             _get_install_dependencies_cmd(),
+             f"mkdir -p {working_dir}",
+             f"cd {working_dir}",
+             "mkdir -p .metaflow",  # mute local datastore creation log
+             f"i=0; while [ $i -le {retries} ]; do "
+             "mflog 'Downloading code package...'; "
+             + download_cmd
+             + " && mflog 'Code package downloaded.' && break; "
+             "sleep 10; i=$((i+1)); "
+             "done",
+             f"if [ $i -gt {retries} ]; then "
+             "mflog 'Failed to download code package from %s "
+             f"after {retries+1} tries. Exiting...' && exit 1; "
+             "fi" % code_package_url,
+             "TAR_OPTIONS='--warning=no-timestamp' tar xf %s" % target_file,
+             "mflog 'Task is starting.'",
+             "flush_mflogs",
+         ]
+
+         return cmds
+
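+     # Editorial note: the commands above are returned as plain strings; the caller is
+     # expected to join and execute them (for example with " && ".join(cmds) under a
+     # shell). This class itself never runs them.
+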
+     @staticmethod
+     def directory_walker(
+         root, exclude_hidden=True, suffixes=None, follow_symlinks=True
+     ) -> List[Tuple[str, str]]:
+         """
+         Walk a directory and return a list of (file_path, relative_arcname) tuples
+         for files that match the given suffix filters.
+
+         This function is similar to MetaflowPackage._walk and handles symlinks safely.
+
+         Parameters
+         ----------
+         root : str
+             The root directory to walk
+         exclude_hidden : bool, default True
+             Whether to exclude hidden files and directories (those starting with '.')
+         suffixes : List[str], optional
+             List of file suffixes to include (e.g. ['.py', '.txt'])
+         follow_symlinks : bool, default True
+             Whether to follow symlinks (with cycle detection)
+
+         Returns
+         -------
+         List[Tuple[str, str]]
+             List of tuples (file_path, relative_arcname) where:
+             - file_path is the full path to the file
+             - relative_arcname is the path to use within the archive
+         """
+         if suffixes is None:
+             suffixes = []
+
+         # Convert root to unicode to handle files/folders with non-ascii chars
+         root = str(root)
+
+         # Calculate the prefix length to strip from paths
+         prefixlen = len(os.path.dirname(root)) + 1  # +1 for the path separator
+
+         # Use a set to track visited symlinks to avoid cycles
+         seen = set()
+
+         def _walk_without_cycles(walk_root):
+             for parent, dirs, files in os.walk(walk_root, followlinks=follow_symlinks):
+                 # If not following symlinks, we're done
+                 if not follow_symlinks:
+                     yield parent, files
+                     continue
+
+                 # When following symlinks, we need to check for cycles
+                 for d_idx in range(
+                     len(dirs) - 1, -1, -1
+                 ):  # Iterate backwards to safely remove
+                     d = dirs[d_idx]
+                     path = os.path.join(parent, d)
+                     if os.path.islink(path):
+                         # Break cycles by never following the same symlink twice
+                         reallink = os.path.realpath(path)
+                         if reallink in seen:
+                             # Remove from dirs to avoid following it
+                             dirs.pop(d_idx)
+                         else:
+                             seen.add(reallink)
+
+                 yield parent, files
+
+         # Build the list of path tuples
+         result = []
+         for path, files in _walk_without_cycles(root):
+             # Skip hidden directories if requested
+             if exclude_hidden and "/." in path:
+                 continue
+
+             for fname in files:
+                 # Skip hidden files if requested, unless they have a specified suffix
+                 if (fname[0] == "." and fname in suffixes) or (
+                     (fname[0] != "." or not exclude_hidden)
+                     and any(fname.endswith(suffix) for suffix in suffixes)
+                 ):
+                     file_path = os.path.join(path, fname)
+                     rel_path = file_path[prefixlen:]
+                     result.append((file_path, rel_path))
+
+         return result
+
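+     # Illustrative behavior (editorial; hypothetical layout, suffixes=[".py"]): for
+     # root="./myapp" containing "./myapp/flow.py", the walker returns
+     # [("./myapp/flow.py", "myapp/flow.py")], i.e. archive names keep the final
+     # component of `root` as their top-level folder.
+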
+     @staticmethod
+     def default_package_create(
+         paths: List[str], suffixes: List[str], metadata: Optional[Dict[str, Any]] = None
+     ) -> bytes:
+         """
+         Create a default tarball package from specified paths.
+
+         Parameters
+         ----------
+         paths : List[str]
+             List of paths to include in the package
+         suffixes : List[str]
+             List of file suffixes to include
+         metadata : Dict[str, Any], optional
+             Metadata to include in the package
+
+         Returns
+         -------
+         bytes
+             The binary content of the tarball
+         """
+         buf = BytesIO()
+
+         with tarfile.open(fileobj=buf, mode="w:gz", compresslevel=3) as tar:
+             # Add metadata if provided
+             if metadata:
+                 metadata_buf = BytesIO()
+                 metadata_buf.write(json.dumps(metadata).encode("utf-8"))
+                 metadata_buf.seek(0)
+                 info = tarfile.TarInfo("metadata.json")
+                 info.size = len(metadata_buf.getvalue())
+                 info.mtime = 1747158696  # 13 May 2025 10:31:36 (fixed so the hash does not change on every run)
+                 tar.addfile(info, metadata_buf)
+
+             def no_mtime(tarinfo):
+                 # A modification time change should not change the hash of
+                 # the package. Only content modifications will.
+                 tarinfo.mtime = 1747158696  # 13 May 2025 10:31:36 (fixed so the hash does not change on every run)
+                 return tarinfo
+
+             # Add files from specified paths
+             for path in paths:
+                 if os.path.isdir(path):
+                     # Use directory_walker for directories to handle symlinks properly
+                     for file_path, rel_path in CodePackager.directory_walker(
+                         path,
+                         exclude_hidden=True,
+                         suffixes=suffixes,
+                         follow_symlinks=True,
+                     ):
+                         tar.add(
+                             file_path,
+                             arcname=rel_path,
+                             filter=no_mtime,
+                             recursive=False,
+                         )
+                 elif os.path.isfile(path):
+                     if any(path.endswith(suffix) for suffix in suffixes):
+                         tar.add(path, arcname=os.path.basename(path), filter=no_mtime)
+
+         tarball = bytearray(buf.getvalue())
+         tarball[4:8] = [0] * 4  # Zero the gzip header mtime field (bytes 4-7) so the archive hash is stable
+         return tarball
+
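+     # Editorial note: per-member mtimes are pinned and the gzip header mtime bytes are
+     # zeroed above, so packaging identical content twice yields byte-identical tarballs
+     # and therefore the same content-addressed key in store().
+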
+     @staticmethod
+     def _add_tar_file(tar, filename, buf):
+         tarinfo = tarfile.TarInfo(name=filename)
+         tarinfo.size = len(buf.getvalue())
+         buf.seek(0)
+         tarinfo.mtime = 1747158696  # 13 May 2025 10:31:36 (fixed so the hash does not change on every run)
+         tar.addfile(tarinfo, fileobj=buf)
+
+     @classmethod
+     def package_directory(
+         cls,
+         directory_path: str,
+         suffixes: Optional[List[str]] = None,
+         exclude_hidden: bool = True,
+         metadata: Optional[Dict[str, Any]] = None,
+         follow_symlinks: bool = True,
+     ) -> bytes:
+         """
+         Package a directory and all of its contents that match the given suffixes.
+
+         This is a convenience method that works similarly to MetaflowPackage._walk
+         to package a directory for deployment.
+
+         Parameters
+         ----------
+         directory_path : str
+             The directory to package
+         suffixes : List[str], optional
+             List of file suffixes to include (defaults to standard code extensions)
+         exclude_hidden : bool, default True
+             Whether to exclude hidden files and directories
+         metadata : Dict[str, Any], optional
+             Metadata to include in the package
+         follow_symlinks : bool, default True
+             Whether to follow symlinks when walking the directory
+
+         Returns
+         -------
+         bytes
+             The binary content of the tarball
+         """
+         if not os.path.isdir(directory_path):
+             raise ValueError(f"The path '{directory_path}' is not a directory")
+
+         # Use default suffixes if none provided
+         if suffixes is None:
+             suffixes = [".py", ".txt", ".yaml", ".yml", ".json"]
+
+         buf = BytesIO()
+
+         def no_mtime(tarinfo):
+             # A modification time change should not change the hash of
+             # the package. Only content modifications will.
+             # Set a fixed date so that we don't have a changing hash every time we run.
+             tarinfo.mtime = 1747158696  # 13 May 2025 10:31:36
+             return tarinfo
+
+         with tarfile.open(
+             fileobj=buf, mode="w:gz", compresslevel=3, dereference=True
+         ) as tar:
+             # Add metadata if provided
+             if metadata:
+                 cls._add_tar_file(
+                     tar, "metadata.json", BytesIO(json.dumps(metadata).encode("utf-8"))
+                 )
+
+             # Walk the directory and add matching files
+             for file_path, rel_path in cls.directory_walker(
+                 directory_path,
+                 exclude_hidden=exclude_hidden,
+                 suffixes=suffixes,
+                 follow_symlinks=follow_symlinks,
+             ):
+                 # Add with a stable mtime and without recursing into directories
+                 tar.add(file_path, arcname=rel_path, recursive=False, filter=no_mtime)
+
+         tarball = bytearray(buf.getvalue())
+         tarball[4:8] = [0] * 4  # Zero the gzip header mtime field (bytes 4-7) so the archive hash is stable
+         return tarball
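+
+
+ # Minimal end-to-end sketch (editorial, illustrative only; "./app", "/tmp/mf-datastore"
+ # and the metadata value are hypothetical, not part of the released module):
+ #   packager = CodePackager(datastore_type="local", datastore_root="/tmp/mf-datastore")
+ #   url, key = packager.store(
+ #       package_create_fn=lambda: CodePackager.package_directory("./app", metadata={"kind": "demo"})
+ #   )
+ #   print(CodePackager.get_download_cmd(url, "local", target_file="job.tar"))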