outerbounds 0.3.183rc1__py3-none-any.whl → 0.3.185__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,610 +0,0 @@
- import os
- import sys
- import time
- import tarfile
- import json
- from io import BytesIO
- from typing import List, Tuple, Dict, Any, Optional, Callable, Union
-
- from metaflow.datastore.content_addressed_store import ContentAddressedStore
- from metaflow.util import to_unicode
- from metaflow.metaflow_config import (
-     DATASTORE_SYSROOT_S3,
-     DATASTORE_SYSROOT_AZURE,
-     DATASTORE_SYSROOT_GS,
-     DATASTORE_SYSROOT_LOCAL,
- )
-
- # Default prefix for code packages in the content addressed store
- CODE_PACKAGE_PREFIX = "apps-code-packages"
-
-
- # this is os.walk(followlinks=True) with cycle detection
- def walk_without_cycles(top_root):
-     seen = set()
-
-     def _recurse(root):
-         for parent, dirs, files in os.walk(root):
-             for d in dirs:
-                 path = os.path.join(parent, d)
-                 if os.path.islink(path):
-                     # Breaking loops: never follow the same symlink twice
-                     #
-                     # NOTE: this also means that links to sibling links are
-                     # not followed. In this case:
-                     #
-                     #   x -> y
-                     #   y -> oo
-                     #   oo/real_file
-                     #
-                     # real_file is only included twice, not three times
-                     reallink = os.path.realpath(path)
-                     if reallink not in seen:
-                         seen.add(reallink)
-                         for x in _recurse(path):
-                             yield x
-             yield parent, files
-
-     for x in _recurse(top_root):
-         yield x
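
For orientation, a minimal sketch of the walker's behavior on a self-referencing layout; the directory names here are hypothetical:

```python
# Hypothetical layout: pkg/real_file, plus a symlink pkg/loop -> pkg.
# os.walk("pkg", followlinks=True) would recurse forever here; this walker
# follows the symlink once and refuses to re-enter the cycle.
for parent, files in walk_without_cycles("pkg"):
    print(parent, files)
# pkg/loop ['real_file']   <- the symlinked copy is visited exactly once
# pkg ['real_file']
```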
-
-
- def symlink_friendly_walk(root, exclude_hidden=True, suffixes=None):
-     if suffixes is None:
-         suffixes = []
-     root = to_unicode(root)  # handle files/folders with non-ascii chars
-     prefixlen = len("%s/" % os.path.dirname(root))
-     for (
-         path,
-         files,
-     ) in walk_without_cycles(root):
-         if exclude_hidden and "/." in path:
-             continue
-         # path = path[2:] # strip the ./ prefix
-         # if path and (path[0] == '.' or './' in path):
-         #     continue
-         for fname in files:
-             if (fname[0] == "." and fname in suffixes) or (
-                 fname[0] != "." and any(fname.endswith(suffix) for suffix in suffixes)
-             ):
-                 p = os.path.join(path, fname)
-                 yield p, p[prefixlen:]
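
To illustrate the (path, arcname) pairs this yields, a sketch over a hypothetical ./myapp tree:

```python
# Hypothetical tree: ./myapp/main.py, ./myapp/conf/settings.yaml, ./myapp/.git/...
for path, arcname in symlink_friendly_walk("./myapp", suffixes=[".py", ".yaml"]):
    print(path, "->", arcname)
# ./myapp/main.py -> myapp/main.py
# ./myapp/conf/settings.yaml -> myapp/conf/settings.yaml
# .git is never reached: any path containing "/." is skipped
```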
-
-
- class CodePackager:
-     """
-     A datastore-agnostic class for packaging code.
-
-     This class handles creating a code package (tarball) for deployment
-     and provides methods for storing and retrieving it using Metaflow's
-     ContentAddressedStore directly.
-
-     Usage examples:
-     ```python
-     packager = CodePackager(
-         datastore_type="s3",
-         datastore_root=None,
-         code_package_prefix=None,
-     )
-
-     package_url, package_key = packager.store(
-         paths_to_include=["./"],
-         file_suffixes=[".py", ".txt", ".yaml", ".yml", ".json"],
-     )
-
-     package_url, package_key = packager.store(
-         package_create_fn=lambda: my_custom_package_create_fn(),
-     )
-     ```
-     """
-
-     def __init__(
-         self,
-         datastore_type: str = "s3",
-         datastore_root: Optional[str] = None,
-         code_package_prefix: Optional[str] = None,
-     ):
-         """
-         Initialize the CodePackager with datastore configuration.
-
-         Parameters
-         ----------
-         datastore_type : str, default "s3"
-             The type of datastore to use: "s3", "azure", "gs", or "local"
-         datastore_root : str, optional
-             Root path for the datastore. If not provided, uses the default for the datastore type.
-         code_package_prefix : str, optional
-             The prefix to use for storing code packages in the content addressed store.
-             If not provided, uses the CODE_PACKAGE_PREFIX configuration value.
-         """
-         self._datastore_type = datastore_type
-         self._datastore_root = datastore_root
-         self._code_package_prefix = code_package_prefix
-
-     def store(
-         self,
-         package_create_fn: Optional[Callable[[], bytes]] = None,
-         paths_to_include: Optional[List[str]] = None,
-         file_suffixes: Optional[List[str]] = None,
-         metadata: Optional[Dict[str, Any]] = None,
-     ) -> Tuple[str, str]:
-         """
-         Create and store a code package using Metaflow's ContentAddressedStore.
-
-         This method can be called in two ways:
-         1. With paths_to_include and file_suffixes to use the default packaging
-         2. With a custom package_create_fn for custom packaging logic
-
-         Parameters
-         ----------
-         package_create_fn : Callable[[], bytes], optional
-             A function that creates and returns a package as bytes.
-             This allows for custom packaging logic without dependency on specific objects.
-         paths_to_include : List[str], optional
-             List of paths to include in the package. Used by default_package_create.
-         file_suffixes : List[str], optional
-             List of file suffixes to include. Used by default_package_create.
-         metadata : Dict[str, Any], optional
-             Metadata to include in the package when using default_package_create.
-
-         Returns
-         -------
-         Tuple[str, str]
-             A tuple containing (package_url, package_key) that identifies the location
-             and content-addressed key of the stored package.
-         """
-         # Prepare default values
-         _paths_to_include = paths_to_include or []
-         _file_suffixes = file_suffixes or [
-             ".py",
-             ".txt",
-             ".yaml",
-             ".yml",
-             ".json",
-             ".html",
-             ".css",
-             ".js",
-             ".jsx",
-             ".ts",
-             ".tsx",
-             ".md",
-             ".rst",
-         ]
-         _metadata = metadata or {}
-
-         # If no package_create_fn is provided, use default_package_create
-         if package_create_fn is None:
-             _package_create_fn = lambda: self.default_package_create(
-                 _paths_to_include, _file_suffixes, _metadata
-             )
-         else:
-             _package_create_fn = package_create_fn
-
-         # Create the package
-         code_package = _package_create_fn()
-
-         # Get the ContentAddressedStore for the specified datastore
-         ca_store = self.get_content_addressed_store(
-             datastore_type=self._datastore_type,
-             datastore_root=self._datastore_root,
-             prefix=(
-                 str(self._code_package_prefix)
-                 if self._code_package_prefix is not None
-                 else str(CODE_PACKAGE_PREFIX)
-             ),
-         )
-
-         # Store the package using raw=True so it can be accessed directly via URL
-         results = ca_store.save_blobs([code_package], raw=True, len_hint=1)
-         package_url, package_key = results[0].uri, results[0].key
-
-         return package_url, package_key
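
End to end, store() can be exercised roughly as follows; the local datastore root and project directory are assumptions for the sketch:

```python
packager = CodePackager(datastore_type="local", datastore_root="/tmp/mf-datastore")
url, key = packager.store(
    paths_to_include=["./myapp"],       # hypothetical project directory
    file_suffixes=[".py", ".yaml"],
    metadata={"version": "0.1.0"},      # illustrative metadata only
)
# The key is content-addressed: storing identical bytes again returns the same key.
```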
-
-     @staticmethod
-     def get_content_addressed_store(
-         datastore_type: str = "s3",
-         datastore_root: Optional[str] = None,
-         prefix: Optional[str] = None,
-     ) -> ContentAddressedStore:
-         """
-         Get a ContentAddressedStore instance for the specified datastore.
-
-         Parameters
-         ----------
-         datastore_type : str, default "s3"
-             Type of datastore: "s3", "azure", "gs", or "local"
-         datastore_root : str, optional
-             Root path for the datastore. If not provided, uses the default for the datastore type.
-         prefix : str, optional
-             Prefix to use when storing objects in the datastore.
-             If not provided, uses the CODE_PACKAGE_PREFIX configuration value.
-
-         Returns
-         -------
-         ContentAddressedStore
-             A ContentAddressedStore instance configured for the specified datastore
-         """
-         from metaflow.plugins import DATASTORES
-
-         datastore_impls = [i for i in DATASTORES if i.TYPE == datastore_type]
-         if len(datastore_impls) == 0:
-             raise ValueError(f"Unsupported datastore type: {datastore_type}")
-         if len(datastore_impls) > 1:
-             raise ValueError(
-                 f"Multiple datastore implementations found for type: {datastore_type}"
-             )
-         datastore_impl = datastore_impls[0]
-         root = None
-         # Resolve the datastore root based on datastore_type
-         if datastore_type == "s3":
-             root = datastore_root or DATASTORE_SYSROOT_S3
-         elif datastore_type == "azure":
-             root = datastore_root or DATASTORE_SYSROOT_AZURE
-         elif datastore_type == "gs":
-             root = datastore_root or DATASTORE_SYSROOT_GS
-         elif datastore_type == "local":
-             root = datastore_root or DATASTORE_SYSROOT_LOCAL
-
-         # Ensure the prefix is a string
-         store_prefix = str(prefix) if prefix is not None else str(CODE_PACKAGE_PREFIX)
-
-         storage_impl = datastore_impl(root=root)
-         # Create and return a ContentAddressedStore
-         return ContentAddressedStore(prefix=store_prefix, storage_impl=storage_impl)
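
Since this returns a plain Metaflow ContentAddressedStore, a stored package can be read back by key. A sketch, assuming load_blobs yields (key, bytes) pairs and that force_raw mirrors the raw=True used in store(); the datastore root is again hypothetical:

```python
ca_store = CodePackager.get_content_addressed_store(
    datastore_type="local", datastore_root="/tmp/mf-datastore"  # assumed root
)
for blob_key, blob in ca_store.load_blobs([key], force_raw=True):
    with open("job.tar", "wb") as f:
        f.write(blob)  # the gzipped tarball created by store()
```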
-
-     @staticmethod
-     def get_download_cmd(
-         package_url: str,
-         datastore_type: str,
-         python_cmd: str = "python",
-         target_file: str = "job.tar",
-         escape_quotes: bool = True,
-     ) -> str:
-         """
-         Generate a command to download the code package.
-
-         Parameters
-         ----------
-         package_url : str
-             The URL of the package to download
-         datastore_type : str
-             The type of datastore (s3, azure, gs, local)
-         python_cmd : str, optional
-             The Python command to use
-         target_file : str, optional
-             The target file name to save the package as
-         escape_quotes : bool, optional
-             Whether to escape quotes in the command
-
-         Returns
-         -------
-         str
-             A shell command string to download the package
-         """
-         if datastore_type == "s3":
-             from metaflow.plugins.aws.aws_utils import parse_s3_full_path
-
-             bucket, s3_object = parse_s3_full_path(package_url)
-             # Keep the script simple and use single quotes to avoid shell escaping issues
-             script = 'import boto3, os; ep=os.getenv({quote}METAFLOW_S3_ENDPOINT_URL{quote}); boto3.client("s3", **({{"endpoint_url":ep}} if ep else {{}})).download_file({quote}{bucket}{quote}, {quote}{s3_object}{quote}, {quote}{target_file}{quote})'.format(
-                 quote='\\"' if escape_quotes else '"',
-                 bucket=bucket,
-                 s3_object=s3_object,
-                 target_file=target_file,
-             )
-             # Format the command with proper quoting
-             return f"{python_cmd} -c '{script}'"
-         elif datastore_type == "azure":
-             from metaflow.plugins.azure.azure_utils import parse_azure_full_path
-
-             container_name, blob = parse_azure_full_path(package_url)
-             # remove a trailing slash, if present
-             blob_endpoint = "${METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT%/}"
-             return "download-azure-blob --blob-endpoint={blob_endpoint} --container={container} --blob={blob} --output-file={target}".format(
-                 blob_endpoint=blob_endpoint,
-                 blob=blob,
-                 container=container_name,
-                 target=target_file,
-             )
-         elif datastore_type == "gs":
-             from metaflow.plugins.gcp.gs_utils import parse_gs_full_path
-
-             bucket_name, gs_object = parse_gs_full_path(package_url)
-             return "download-gcp-object --bucket=%s --object=%s --output-file=%s" % (
-                 bucket_name,
-                 gs_object,
-                 target_file,
-             )
-         elif datastore_type == "local":
-             # For local storage, simply copy the file
-             return "cp %s %s" % (package_url, target_file)
-         else:
-             raise NotImplementedError(
-                 f"Download command not implemented for datastore type: {datastore_type}"
-             )
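
For illustration, the s3 branch produces a one-liner along these lines (bucket and key are hypothetical, output abbreviated):

```python
cmd = CodePackager.get_download_cmd(
    "s3://my-bucket/apps-code-packages/abc123",  # hypothetical package URL
    "s3",
    escape_quotes=False,
)
# python -c 'import boto3, os; ep=os.getenv("METAFLOW_S3_ENDPOINT_URL"); ...'
```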
-
-     def get_package_commands(
-         self,
-         code_package_url: str,
-         python_cmd: str = "python",
-         target_file: str = "job.tar",
-         working_dir: str = "metaflow",
-         retries: int = 5,
-         escape_quotes: bool = True,
-     ) -> List[str]:
-         """
-         Get a complete list of shell commands to download and extract a code package.
-
-         This method generates a comprehensive set of shell commands for downloading
-         and extracting a code package, similar to MetaflowEnvironment.get_package_commands.
-
-         Parameters
-         ----------
-         code_package_url : str
-             The URL of the code package to download
-         python_cmd : str, optional
-             The Python command to use
-         target_file : str, optional
-             The target file name to save the package as
-         working_dir : str, optional
-             The directory to create and extract the package into
-         retries : int, optional
-             Number of download retries to attempt
-         escape_quotes : bool, optional
-             Whether to escape quotes in the command
-
-         Returns
-         -------
-         List[str]
-             List of shell commands to execute
-         """
-         # Use the datastore_type from initialization
-         datastore_type = self._datastore_type
-
-         # Helper function to create the dependency installation command
-         def _get_install_dependencies_cmd():
-             base_cmd = "{} -m pip install -qqq --no-compile --no-cache-dir --disable-pip-version-check".format(
-                 python_cmd
-             )
-
-             datastore_packages = {
-                 "s3": ["boto3"],
-                 "azure": [
-                     "azure-identity",
-                     "azure-storage-blob",
-                     "azure-keyvault-secrets",
-                     "simple-azure-blob-downloader",
-                 ],
-                 "gs": [
-                     "google-cloud-storage",
-                     "google-auth",
-                     "simple-gcp-object-downloader",
-                     "google-cloud-secret-manager",
-                 ],
-                 "local": [],
-             }
-
-             if datastore_type not in datastore_packages:
-                 raise NotImplementedError(
-                     "Unknown datastore type: {}".format(datastore_type)
-                 )
-
-             if not datastore_packages[datastore_type]:
-                 return "# No dependencies required for local datastore"
-
-             cmd = "{} {}".format(
-                 base_cmd, " ".join(datastore_packages[datastore_type] + ["requests"])
-             )
-             # Skip pip installs if we know packages might already be available
-             return 'if [ -z "$METAFLOW_SKIP_INSTALL_DEPENDENCIES" ]; then {}; fi'.format(
-                 cmd
-             )
-
-         download_cmd = self.get_download_cmd(
-             code_package_url, datastore_type, python_cmd, target_file, escape_quotes
-         )
-
-         # Define the log functions for bash
-         bash_mflog = (
-             'function mflog() { echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")]" "$@"; }'
-         )
-         bash_flush_logs = 'function flush_mflogs() { echo "[$(date -u +"%Y-%m-%dT%H:%M:%SZ")] Flushing logs"; }'
-
-         cmds = [
-             bash_mflog,
-             bash_flush_logs,
-             "mflog 'Setting up task environment.'",
-             _get_install_dependencies_cmd(),
-             f"mkdir -p {working_dir}",
-             f"cd {working_dir}",
-             "mkdir -p .metaflow",  # mute local datastore creation log
-             f"i=0; while [ $i -le {retries} ]; do "
-             "mflog 'Downloading code package...'; "
-             + download_cmd
-             + " && mflog 'Code package downloaded.' && break; "
-             "sleep 10; i=$((i+1)); "
-             "done",
-             f"if [ $i -gt {retries} ]; then "
-             "mflog 'Failed to download code package from %s "
-             f"after {retries+1} tries. Exiting...' && exit 1; "
-             "fi" % code_package_url,
-             "TAR_OPTIONS='--warning=no-timestamp' tar xf %s" % target_file,
-             "mflog 'Task is starting.'",
-             "flush_mflogs",
-         ]
-
-         return cmds
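
The returned commands are meant to be chained into a single bootstrap invocation, for example in a container entrypoint; one plausible pattern, reusing the packager and url from the store() sketch above:

```python
cmds = packager.get_package_commands(code_package_url=url, retries=3)
bootstrap = " && ".join(cmds)  # e.g. handed to `bash -c` in a container spec
```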
-
-     @staticmethod
-     def directory_walker(
-         root,
-         exclude_hidden=True,
-         suffixes=None,
-     ) -> List[Tuple[str, str]]:
-         """
-         Walk a directory and return a list of (file_path, relative_arcname) tuples
-         for files that match the given suffix filters. Symlinks are followed, but
-         cycles are not.
-
-         This function is similar to MetaflowPackage._walk and handles symlinks safely.
-
-         Parameters
-         ----------
-         root : str
-             The root directory to walk
-         exclude_hidden : bool, default True
-             Whether to exclude hidden files and directories (those starting with '.')
-         suffixes : List[str], optional
-             List of file suffixes to include (e.g. ['.py', '.txt'])
-
-         Returns
-         -------
-         List[Tuple[str, str]]
-             List of tuples (file_path, relative_arcname) where:
-             - file_path is the full path to the file
-             - relative_arcname is the path to use within the archive
-         """
-         files = []
-         for file_path, rel_path in symlink_friendly_walk(
-             root, exclude_hidden, suffixes
-         ):
-             files.append((file_path, rel_path))
-         return files
-
-     @staticmethod
-     def default_package_create(
-         paths: List[str], suffixes: List[str], metadata: Optional[Dict[str, Any]] = None
-     ) -> bytes:
-         """
-         Create a default tarball package from the specified paths.
-
-         Parameters
-         ----------
-         paths : List[str]
-             List of paths to include in the package
-         suffixes : List[str]
-             List of file suffixes to include
-         metadata : Dict[str, Any], optional
-             Metadata to include in the package
-
-         Returns
-         -------
-         bytes
-             The binary content of the tarball
-         """
-         buf = BytesIO()
-
-         with tarfile.open(fileobj=buf, mode="w:gz", compresslevel=3) as tar:
-             # Add metadata if provided
-             if metadata:
-                 metadata_buf = BytesIO()
-                 metadata_buf.write(json.dumps(metadata).encode("utf-8"))
-                 metadata_buf.seek(0)
-                 info = tarfile.TarInfo("metadata.json")
-                 info.size = len(metadata_buf.getvalue())
-                 info.mtime = 1747158696  # 13 May 2025 10:31:36 (fixed so the hash doesn't change on every run)
-                 tar.addfile(info, metadata_buf)
-
-             def no_mtime(tarinfo):
-                 # A modification time change should not change the hash of
-                 # the package; only content modifications should. Pin the
-                 # mtime to a fixed date so the hash is stable across runs.
-                 tarinfo.mtime = 1747158696  # 13 May 2025 10:31:36
-                 return tarinfo
-
-             # Add files from the specified paths
-             for path in paths:
-                 if os.path.isdir(path):
-                     # Use directory_walker for directories to handle symlinks properly
-                     for file_path, rel_path in CodePackager.directory_walker(
-                         path,
-                         exclude_hidden=True,
-                         suffixes=suffixes,
-                     ):
-                         tar.add(
-                             file_path,
-                             arcname=rel_path,
-                             filter=no_mtime,
-                             recursive=False,
-                         )
-                 elif os.path.isfile(path):
-                     if any(path.endswith(suffix) for suffix in suffixes):
-                         tar.add(path, arcname=os.path.basename(path), filter=no_mtime)
-
-         tarball = bytearray(buf.getvalue())
-         tarball[4:8] = [0] * 4  # Zero the gzip MTIME field (bytes 4-7) so the hash is stable
-         return tarball
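
The last two lines are the hash-stability trick: bytes 4-7 of a gzip stream are its MTIME header field, which is populated with the current time at compression, so zeroing them (together with the pinned tarinfo.mtime) makes the archive byte-identical for identical content:

```python
# Repackaging unchanged content yields identical bytes, hence the same
# content-addressed key (paths are hypothetical).
a = CodePackager.default_package_create(["./myapp"], [".py"])
b = CodePackager.default_package_create(["./myapp"], [".py"])
assert bytes(a) == bytes(b)
```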
-
-     @staticmethod
-     def _add_tar_file(tar, filename, buf):
-         tarinfo = tarfile.TarInfo(name=filename)
-         tarinfo.size = len(buf.getvalue())
-         buf.seek(0)
-         tarinfo.mtime = 1747158696  # 13 May 2025 10:31:36 (fixed so the hash doesn't change on every run)
-         tar.addfile(tarinfo, fileobj=buf)
-
-     @classmethod
-     def package_directory(
-         cls,
-         directory_path: str,
-         suffixes: Optional[List[str]] = None,
-         exclude_hidden: bool = True,
-         metadata: Optional[Dict[str, Any]] = None,
-     ) -> bytes:
-         """
-         Package a directory and all of its contents that match the given suffixes.
-
-         This is a convenience method that works similarly to MetaflowPackage._walk
-         to package a directory for deployment. Symlinks are followed by default
-         (the tarball is created with dereference=True).
-
-         Parameters
-         ----------
-         directory_path : str
-             The directory to package
-         suffixes : List[str], optional
-             List of file suffixes to include (defaults to standard code extensions)
-         exclude_hidden : bool, default True
-             Whether to exclude hidden files and directories
-         metadata : Dict[str, Any], optional
-             Metadata to include in the package
-
-         Returns
-         -------
-         bytes
-             The binary content of the tarball
-         """
-         if not os.path.isdir(directory_path):
-             raise ValueError(f"The path '{directory_path}' is not a directory")
-
-         # Use default suffixes if none provided
-         if suffixes is None:
-             suffixes = [".py", ".txt", ".yaml", ".yml", ".json"]
-
-         buf = BytesIO()
-
-         def no_mtime(tarinfo):
-             # A modification time change should not change the hash of
-             # the package; only content modifications should. Pin the
-             # mtime to a fixed date so the hash is stable across runs.
-             tarinfo.mtime = 1747158696  # 13 May 2025 10:31:36
-             return tarinfo
-
-         with tarfile.open(
-             fileobj=buf, mode="w:gz", compresslevel=3, dereference=True
-         ) as tar:
-             # Add metadata if provided
-             if metadata:
-                 cls._add_tar_file(
-                     tar, "metadata.json", BytesIO(json.dumps(metadata).encode("utf-8"))
-                 )
-
-             # Walk the directory and add matching files
-             for file_path, rel_path in cls.directory_walker(
-                 directory_path,
-                 exclude_hidden=exclude_hidden,
-                 suffixes=suffixes,
-             ):
-                 tar.add(file_path, arcname=rel_path, recursive=False, filter=no_mtime)
-
-         tarball = bytearray(buf.getvalue())
-         tarball[4:8] = [0] * 4  # Zero the gzip MTIME field (bytes 4-7) so the hash is stable
-         return tarball
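
A closing sketch of the round trip, under the same hypothetical paths as above:

```python
blob = CodePackager.package_directory("./myapp", suffixes=[".py"], metadata={"v": 1})
with tarfile.open(fileobj=BytesIO(bytes(blob)), mode="r:gz") as tar:
    print(tar.getnames())  # e.g. ['metadata.json', 'myapp/main.py', ...]
```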