metaflow 2.8.1__py2.py3-none-any.whl → 2.8.3__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. metaflow/client/core.py +14 -4
  2. metaflow/cmd/configure_cmd.py +3 -3
  3. metaflow/cmd/main_cli.py +9 -14
  4. metaflow/current.py +15 -0
  5. metaflow/datastore/datastore_set.py +7 -7
  6. metaflow/datastore/flow_datastore.py +1 -2
  7. metaflow/extension_support/__init__.py +1 -0
  8. metaflow/extension_support/integrations.py +141 -0
  9. metaflow/integrations.py +29 -0
  10. metaflow/metaflow_config.py +21 -0
  11. metaflow/metaflow_environment.py +5 -4
  12. metaflow/package.py +1 -1
  13. metaflow/plugins/airflow/airflow.py +0 -1
  14. metaflow/plugins/argo/argo_workflows.py +2 -0
  15. metaflow/plugins/argo/argo_workflows_cli.py +11 -1
  16. metaflow/plugins/aws/aws_utils.py +6 -1
  17. metaflow/plugins/aws/batch/batch.py +30 -8
  18. metaflow/plugins/aws/batch/batch_cli.py +12 -0
  19. metaflow/plugins/aws/batch/batch_client.py +39 -2
  20. metaflow/plugins/aws/batch/batch_decorator.py +23 -0
  21. metaflow/plugins/aws/step_functions/step_functions.py +7 -4
  22. metaflow/plugins/aws/step_functions/step_functions_cli.py +11 -1
  23. metaflow/plugins/cards/card_modules/bundle.css +56 -56
  24. metaflow/plugins/cards/card_modules/convert_to_native_type.py +67 -5
  25. metaflow/plugins/cards/card_modules/main.js +14 -7
  26. metaflow/plugins/conda/conda_environment.py +2 -2
  27. metaflow/plugins/conda/conda_step_decorator.py +7 -1
  28. metaflow/plugins/datatools/s3/s3.py +2 -2
  29. metaflow/plugins/env_escape/communication/channel.py +1 -1
  30. metaflow/plugins/kubernetes/kubernetes.py +4 -0
  31. metaflow/plugins/kubernetes/kubernetes_decorator.py +6 -2
  32. metaflow/plugins/kubernetes/kubernetes_job.py +17 -2
  33. metaflow/plugins/metadata/service.py +3 -2
  34. metaflow/runtime.py +5 -3
  35. metaflow/tutorials/02-statistics/README.md +4 -9
  36. metaflow/tutorials/02-statistics/stats.py +38 -11
  37. metaflow/tutorials/03-playlist-redux/playlist.py +24 -16
  38. metaflow/tutorials/04-playlist-plus/playlist.py +14 -23
  39. metaflow/tutorials/05-hello-cloud/README.md +45 -0
  40. metaflow/tutorials/{05-helloaws/helloaws.ipynb → 05-hello-cloud/hello-cloud.ipynb} +10 -5
  41. metaflow/tutorials/{05-helloaws/helloaws.py → 05-hello-cloud/hello-cloud.py} +11 -13
  42. metaflow/tutorials/06-statistics-redux/README.md +6 -29
  43. metaflow/tutorials/06-statistics-redux/stats.ipynb +2 -2
  44. metaflow/tutorials/07-worldview/README.md +3 -11
  45. metaflow/tutorials/07-worldview/worldview.ipynb +3 -3
  46. metaflow/tutorials/08-autopilot/README.md +10 -17
  47. metaflow/tutorials/08-autopilot/autopilot.ipynb +12 -7
  48. {metaflow-2.8.1.dist-info → metaflow-2.8.3.dist-info}/METADATA +1 -6
  49. {metaflow-2.8.1.dist-info → metaflow-2.8.3.dist-info}/RECORD +53 -51
  50. metaflow/tutorials/05-helloaws/README.md +0 -27
  51. {metaflow-2.8.1.dist-info → metaflow-2.8.3.dist-info}/LICENSE +0 -0
  52. {metaflow-2.8.1.dist-info → metaflow-2.8.3.dist-info}/WHEEL +0 -0
  53. {metaflow-2.8.1.dist-info → metaflow-2.8.3.dist-info}/entry_points.txt +0 -0
  54. {metaflow-2.8.1.dist-info → metaflow-2.8.3.dist-info}/top_level.txt +0 -0
metaflow/plugins/conda/conda_environment.py CHANGED
@@ -132,5 +132,5 @@ class CondaEnvironment(MetaflowEnvironment):
     def get_package_commands(self, code_package_url, datastore_type):
         return self.base_env.get_package_commands(code_package_url, datastore_type)
 
-    def get_environment_info(self):
-        return self.base_env.get_environment_info()
+    def get_environment_info(self, include_ext_info=False):
+        return self.base_env.get_environment_info(include_ext_info)
metaflow/plugins/conda/conda_step_decorator.py CHANGED
@@ -285,7 +285,13 @@ class CondaStepDecorator(StepDecorator):
             mode="wt",
             encoding="utf-8",
         ) as f:
-            f.write(json.dumps(self._cur_environment.get_environment_info()))
+            f.write(
+                json.dumps(
+                    self._cur_environment.get_environment_info(
+                        include_ext_info=True
+                    )
+                )
+            )
 
         # Do the same for EXT_PKG
         try:
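These two hunks thread a new `include_ext_info` flag from the conda environment down to its base environment, so the step decorator can embed Metaflow extension metadata in the serialized environment info. A minimal sketch of the opt-in call, assuming metaflow >= 2.8.3 (`MetaflowEnvironment`'s constructor takes a flow object that this method does not use, so `None` is passed here purely for illustration):

```python
import json

from metaflow.metaflow_environment import MetaflowEnvironment

env = MetaflowEnvironment(flow=None)  # flow is unused by get_environment_info()
info = env.get_environment_info(include_ext_info=True)
# Extension metadata is included in the returned dict when extensions are installed.
print(json.dumps(info, indent=2))
```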
metaflow/plugins/datatools/s3/s3.py CHANGED
@@ -17,6 +17,7 @@ from metaflow.metaflow_config import (
     DATATOOLS_S3ROOT,
     S3_RETRY_COUNT,
     S3_TRANSIENT_RETRY_COUNT,
+    TEMPDIR,
 )
 from metaflow.util import (
     namedtuple_with_defaults,
@@ -142,7 +143,6 @@ class S3Object(object):
         range_info: Optional[RangeInfo] = None,
         last_modified: int = None,
     ):
-
         # all fields of S3Object should return a unicode object
         prefix, url, path = map(ensure_unicode, (prefix, url, path))
 
@@ -481,7 +481,7 @@ class S3(object):
 
     def __init__(
         self,
-        tmproot: str = ".",
+        tmproot: str = TEMPDIR,
         bucket: Optional[str] = None,
         prefix: Optional[str] = None,
         run: Optional[Union[FlowSpec, "Run"]] = None,
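With this change, `S3`'s temporary download buffers follow the `TEMPDIR` configuration value instead of always landing in the current working directory. A hedged usage sketch, assuming the variable follows Metaflow's usual `METAFLOW_` env-var prefix and that the bucket path is a placeholder:

```python
import os

# Hypothetical override: point the scratch area at a large volume before
# metaflow is imported (config is read from the environment at import time).
os.environ["METAFLOW_TEMPDIR"] = "/mnt/scratch"

from metaflow import S3

with S3(s3root="s3://my-bucket/some/prefix") as s3:  # placeholder path
    obj = s3.get("data.csv")
    print(obj.path)  # the temp file now lives under /mnt/scratch, not "."
```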
metaflow/plugins/env_escape/communication/channel.py CHANGED
@@ -36,7 +36,7 @@ class Channel(object):
             sz_bytes = self._stream.read(self._fmt.size, timeout)
             msg_sz = self._fmt.unpack(sz_bytes)[0]
             obj_bytes = self._stream.read(msg_sz, timeout)
-            return json.loads(obj_bytes, encoding="utf-8")
+            return json.loads(obj_bytes)
        except EOFError as e:
            raise RuntimeError("Cannot receive object over streaming interface: %s" % e)
        except BaseException as e:
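The dropped keyword is a compatibility fix: `json.loads` ignored its `encoding` argument since Python 3.1 and removed it outright in Python 3.9, where the old call raises `TypeError`. No replacement argument is needed:

```python
import json

payload = b'{"status": "ok"}'
# Since Python 3.6, json.loads accepts bytes directly and detects the
# UTF-8/16/32 encoding itself; on 3.9+ passing encoding= raises TypeError.
print(json.loads(payload))  # {'status': 'ok'}
```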
metaflow/plugins/kubernetes/kubernetes.py CHANGED
@@ -15,6 +15,7 @@ from metaflow.metaflow_config import (
     DEFAULT_AWS_CLIENT_PROVIDER,
     DEFAULT_METADATA,
     KUBERNETES_SANDBOX_INIT_SCRIPT,
+    KUBERNETES_FETCH_EC2_METADATA,
     S3_ENDPOINT_URL,
     AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
     DATASTORE_SYSROOT_AZURE,
@@ -199,6 +200,9 @@ class Kubernetes(object):
             .environment_variable("METAFLOW_DEFAULT_DATASTORE", self._datastore.TYPE)
             .environment_variable("METAFLOW_DEFAULT_METADATA", DEFAULT_METADATA)
             .environment_variable("METAFLOW_KUBERNETES_WORKLOAD", 1)
+            .environment_variable(
+                "METAFLOW_KUBERNETES_FETCH_EC2_METADATA", KUBERNETES_FETCH_EC2_METADATA
+            )
             .environment_variable("METAFLOW_RUNTIME_ENVIRONMENT", "kubernetes")
             .environment_variable(
                 "METAFLOW_DEFAULT_SECRETS_BACKEND_TYPE", DEFAULT_SECRETS_BACKEND_TYPE
metaflow/plugins/kubernetes/kubernetes_decorator.py CHANGED
@@ -17,6 +17,7 @@ from metaflow.metaflow_config import (
     KUBERNETES_TOLERATIONS,
     KUBERNETES_SERVICE_ACCOUNT,
     KUBERNETES_SECRETS,
+    KUBERNETES_FETCH_EC2_METADATA,
 )
 from metaflow.plugins.resources_decorator import ResourcesDecorator
 from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
@@ -327,8 +328,11 @@ class KubernetesDecorator(StepDecorator):
 
             # TODO (savin): Introduce equivalent support for Microsoft Azure and
             # Google Cloud Platform
-            instance_meta = get_ec2_instance_metadata()
-            meta.update(instance_meta)
+            # TODO: Introduce a way to detect Cloud Provider, so unnecessary requests (and delays)
+            # can be avoided by not having to try out all providers.
+            if KUBERNETES_FETCH_EC2_METADATA:
+                instance_meta = get_ec2_instance_metadata()
+                meta.update(instance_meta)
 
             # Unfortunately, there doesn't seem to be any straight forward way right
             # now to attach the Batch/v1 name - While we can rely on a hacky approach
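Both Kubernetes hunks wire the same switch through: the launcher exports it into the pod and the decorator consults it before probing the EC2 instance-metadata endpoint, whose timeout slows tasks on non-AWS clusters. A hedged sketch of turning the probe on (the env-var name is taken from the launcher hunk above; the probe appears to be opt-in, i.e. off by default):

```python
import os

# Hypothetical opt-in before launching a flow; Metaflow reads its config
# from METAFLOW_-prefixed environment variables at import time.
os.environ["METAFLOW_KUBERNETES_FETCH_EC2_METADATA"] = "true"

# @kubernetes tasks will now attempt the EC2 instance-metadata lookup and
# attach the result (e.g. instance type) to each task's metadata.
```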
metaflow/plugins/kubernetes/kubernetes_job.py CHANGED
@@ -413,8 +413,23 @@ class RunningJob(object):
                 except:
                     # Best effort. It's likely that this API call could be
                     # blocked for the user.
-                    pass
-                    # raise
+                    # --------------------------------------------------------
+                    # We try patching Job parallelism anyway. Stopping any runaway
+                    # jobs (and their pods) is secondary to correctly showing
+                    # "Killed" status on the Kubernetes pod.
+                    #
+                    # This has the effect of pausing the job.
+                    try:
+                        client.BatchV1Api().patch_namespaced_job(
+                            name=self._name,
+                            namespace=self._namespace,
+                            field_manager="metaflow",
+                            body={"spec": {"parallelism": 0}},
+                        )
+                    except:
+                        # Best effort.
+                        pass
+                        # raise
             else:
                 # Case 2.
                 # This has the effect of pausing the job.
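Setting `spec.parallelism` to 0 is the standard way to pause a batch/v1 Job: the controller scales the Job's pods down without deleting the Job object, so its status can still be reported. A standalone sketch of the same call with the official Kubernetes Python client (job name and namespace are placeholders; cluster credentials are required):

```python
from kubernetes import client, config

config.load_kube_config()  # or config.load_incluster_config() inside a pod
client.BatchV1Api().patch_namespaced_job(
    name="my-job",            # placeholder
    namespace="default",      # placeholder
    field_manager="metaflow",
    body={"spec": {"parallelism": 0}},  # scale the Job's pods to zero
)
```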
metaflow/plugins/metadata/service.py CHANGED
@@ -137,9 +137,10 @@ class ServiceMetadataProvider(MetadataProvider):
             payload[HB_URL_KEY] = self.url_run_template.format(**data)
         else:
             raise Exception("invalid heartbeat type")
-        payload["service_version"] = self.version()
+        service_version = self.version()
+        payload["service_version"] = service_version
         # start sidecar
-        if self.version() is None or LooseVersion(self.version()) < LooseVersion(
+        if service_version is None or LooseVersion(service_version) < LooseVersion(
             "2.0.4"
         ):
             # if old version of the service is running
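Caching `self.version()` in a local avoids up to three round trips to the metadata service per heartbeat registration. The comparison itself relies on `distutils`' LooseVersion ordering, which handles multi-digit components correctly where plain string comparison would not:

```python
from distutils.version import LooseVersion

# The sidecar fallback above triggers for service versions older than 2.0.4:
print(LooseVersion("2.0.3") < LooseVersion("2.0.4"))   # True  -> legacy path
print(LooseVersion("2.0.10") < LooseVersion("2.0.4"))  # False -> modern path
```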
metaflow/runtime.py CHANGED
@@ -198,9 +198,11 @@ class NativeRuntime(object):
         self._is_cloned[task.path] = task.is_cloned
 
     def execute(self):
-        run_url = None
-        if UI_URL:
-            run_url = "%s/%s/%s" % (UI_URL.rstrip("/"), self._flow.name, self._run_id)
+        run_url = (
+            "%s/%s/%s" % (UI_URL.rstrip("/"), self._flow.name, self._run_id)
+            if UI_URL
+            else None
+        )
 
         if run_url:
             self._logger(
metaflow/tutorials/02-statistics/README.md CHANGED
@@ -1,26 +1,21 @@
 # Episode 02-statistics: Is this Data Science?
 
-**Use metaflow to load the movie metadata CSV file into a Pandas Dataframe and
+**Use metaflow to load the movie metadata CSV file into a dataframe and
 compute some movie genre-specific statistics. These statistics are then used in
 later examples to improve our playlist generator. You can optionally use the
 Metaflow client to eyeball the results in a Notebook, and make some simple
 plots using the Matplotlib library.**
 
-Please note that Episode 04, a follow-on to this episode, requires Pandas version 1.3.3.
-Please make sure that you install or upgrade/downgrade to Pandas 1.3.3.
-
 #### Showcasing:
 - Fan-out over a set of parameters using Metaflow foreach.
-- Using external packages like Pandas.
 - Plotting results in a Notebook.
 
 #### Before playing this episode:
-1. ```python -m pip install pandas==1.3.3```
-2. ```python -m pip install notebook```
-3. ```python -m pip install matplotlib```
+1. ```python -m pip install notebook```
+2. ```python -m pip install matplotlib```
 
 #### To play this episode:
 1. ```cd metaflow-tutorials```
 2. ```python 02-statistics/stats.py show```
 3. ```python 02-statistics/stats.py run```
-4. ```jupyter-notebook 02-statistics/stats.ipynb```
+4. ```jupyter-notebook 02-statistics/stats.ipynb```
metaflow/tutorials/02-statistics/stats.py CHANGED
@@ -19,7 +19,7 @@ class MovieStatsFlow(FlowSpec):
     A flow to generate some statistics about the movie genres.
 
     The flow performs the following steps:
-    1) Ingests a CSV into a Pandas Dataframe.
+    1) Ingests a CSV into a dataframe.
     2) Fan-out over genre using Metaflow foreach.
     3) Compute quartiles for each genre.
     4) Save a dictionary of genre-specific statistics.
@@ -36,16 +36,29 @@ class MovieStatsFlow(FlowSpec):
     def start(self):
         """
         The start step:
-        1) Loads the movie metadata into pandas dataframe.
+        1) Loads the movie metadata into a dataframe.
         2) Finds all the unique genres.
         3) Launches parallel statistics computation for each genre.
 
         """
-        import pandas
+        import csv
         from io import StringIO
 
-        # Load the data set into a pandas dataframe.
-        self.dataframe = pandas.read_csv(StringIO(self.movie_data))
+        # Load the data set into a dataframe structure.
+        self.dataframe = {
+            "movie_title": [],
+            "title_year": [],
+            "genres": [],
+            "gross": [],
+        }
+
+        for row in csv.reader(StringIO(self.movie_data), delimiter=","):
+            if row[0] == "movie_title":
+                continue
+            self.dataframe["movie_title"].append(row[0])
+            self.dataframe["title_year"].append(int(row[1]))
+            self.dataframe["genres"].append(row[2])
+            self.dataframe["gross"].append(int(row[3]))
 
         # The column 'genres' has a list of genres for each movie. Let's get
         # all the unique genres.
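The pandas dependency is gone: the CSV is now parsed into a plain dict of column lists with the standard library. A quick check of the parsing logic with a hypothetical row in the same four-column layout:

```python
import csv
from io import StringIO

# Hypothetical sample mirroring the movies.csv layout.
sample = "movie_title,title_year,genres,gross\nToy Story,1995,Animation|Comedy,373554033\n"
rows = [r for r in csv.reader(StringIO(sample)) if r[0] != "movie_title"]
print(rows[0][0], int(rows[0][1]), rows[0][2], int(rows[0][3]))
# Toy Story 1995 Animation|Comedy 373554033
```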
@@ -63,8 +76,8 @@ class MovieStatsFlow(FlowSpec):
     def compute_statistics(self):
         """
         Compute statistics for a single genre.
-
         """
+
         # The genre currently being processed is a class property called
         # 'input'.
         self.genre = self.input
@@ -72,13 +85,27 @@ class MovieStatsFlow(FlowSpec):
 
         # Find all the movies that have this genre and build a dataframe with
         # just those movies and just the columns of interest.
-        selector = self.dataframe["genres"].apply(lambda row: self.genre in row)
-        self.dataframe = self.dataframe[selector]
-        self.dataframe = self.dataframe[["movie_title", "genres", "gross"]]
+        selector = [self.genre in row for row in self.dataframe["genres"]]
+
+        for col in self.dataframe.keys():
+            self.dataframe[col] = [
+                col for col, is_genre in zip(self.dataframe[col], selector) if is_genre
+            ]
+
+        # Sort by gross box office and drop unused column.
+        argsort_indices = sorted(
+            range(len(self.dataframe["gross"])), key=self.dataframe["gross"].__getitem__
+        )
+        for col in self.dataframe.keys():
+            self.dataframe[col] = [self.dataframe[col][idx] for idx in argsort_indices]
+        del self.dataframe["title_year"]
 
         # Get some statistics on the gross box office for these titles.
-        points = [0.25, 0.5, 0.75]
-        self.quartiles = self.dataframe["gross"].quantile(points).values
+        n_points = len(self.dataframe["movie_title"])
+        self.quartiles = []
+        for cut in [0.25, 0.5, 0.75]:
+            idx = 0 if n_points < 2 else round(n_points * cut)
+            self.quartiles.append(self.dataframe["gross"][idx])
 
         # Join the results from other genres.
         self.next(self.join)
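With the rows sorted by gross, the quartiles are read off by index instead of via `Series.quantile`. A worked example of the index arithmetic on hypothetical data:

```python
# Hypothetical sorted gross figures for one genre.
gross = [10, 20, 30, 40, 50, 60, 70, 80]
n_points = len(gross)
quartiles = []
for cut in [0.25, 0.5, 0.75]:
    idx = 0 if n_points < 2 else round(n_points * cut)
    quartiles.append(gross[idx])
print(quartiles)  # [30, 50, 70]
```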
metaflow/tutorials/03-playlist-redux/playlist.py CHANGED
@@ -53,19 +53,25 @@ class PlayListFlow(FlowSpec):
         This step chooses a random title for a different movie genre.
 
         """
-        import pandas
-
-        # Concatenate all the genre-specific data frames and choose a random
-        # movie.
-        df = pandas.concat(
-            [
-                data["dataframe"]
-                for genre, data in self.genre_stats.items()
-                if genre != self.genre.lower()
-            ]
-        )
-        df = df.sample(n=1)
-        self.bonus = (df["movie_title"].values[0], df["genres"].values[0])
+        import random
+
+        # Concatenate all the genre-specific data frames.
+        df = {"movie_title": [], "genres": []}
+        for genre, data in self.genre_stats.items():
+            if genre != self.genre.lower():
+                for row_idx in range(len(data["dataframe"]["movie_title"])):
+                    if (
+                        self.genre.lower()
+                        not in data["dataframe"]["genres"][row_idx].lower()
+                    ):
+                        df["movie_title"].append(
+                            data["dataframe"]["movie_title"][row_idx]
+                        )
+                        df["genres"].append(data["dataframe"]["genres"][row_idx])
+
+        # Choose a random movie.
+        random_index = random.randint(0, len(df["genres"]) - 1)
+        self.bonus = (df["movie_title"][random_index], df["genres"][random_index])
 
         self.next(self.join)
 
@@ -82,12 +88,14 @@ class PlayListFlow(FlowSpec):
         genre = self.genre.lower()
         if genre not in self.genre_stats:
             self.movies = []
-
         else:
             df = self.genre_stats[genre]["dataframe"]
             quartiles = self.genre_stats[genre]["quartiles"]
-            selector = df["gross"] >= quartiles[-1]
-            self.movies = list(df[selector]["movie_title"])
+            self.movies = [
+                df["movie_title"][i]
+                for i, g in enumerate(df["gross"])
+                if g >= quartiles[-1]
+            ]
 
         # Shuffle the playlist.
         shuffle(self.movies)
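Two patterns replace the pandas idioms here: a single random index drawn over parallel column lists stands in for `DataFrame.sample`, and a list comprehension stands in for boolean-mask selection. The single index keeps the `movie_title` and `genres` columns aligned; drawing a `random.choice` over zipped pairs would be equivalent, as this illustrative sketch shows:

```python
import random

# Illustrative parallel columns, mirroring the df dict above.
df = {"movie_title": ["A", "B", "C"], "genres": ["Drama", "Comedy", "Horror"]}
title, genre = random.choice(list(zip(df["movie_title"], df["genres"])))
print(title, genre)  # one aligned (title, genre) pair
```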
metaflow/tutorials/04-playlist-plus/playlist.py CHANGED
@@ -10,7 +10,7 @@ def get_python_version():
     """
     import platform
 
-    versions = {"2": "2.7.15", "3": "3.7.3"}
+    versions = {"2": "2.7.15", "3": "3.9.10"}
     return versions[platform.python_version_tuple()[0]]
 
 
@@ -48,17 +48,12 @@ class PlayListFlow(FlowSpec):
         default=5,
     )
 
-    @conda(libraries={"pandas": "1.3.3"})
     @step
     def start(self):
         """
         Use the Metaflow client to retrieve the latest successful run from our
         MovieStatsFlow and assign them as data artifacts in this flow.
 
-        This step uses 'conda' to isolate the environment. This step will
-        always use pandas==1.3.3 regardless of what is installed on the
-        system.
-
         """
         # Load the analysis from the MovieStatsFlow.
         from metaflow import Flow, get_metadata
@@ -80,7 +75,7 @@ class PlayListFlow(FlowSpec):
         # Compute our two recommendation types in parallel.
         self.next(self.bonus_movie, self.genre_movies)
 
-    @conda(libraries={"editdistance": "0.5.3", "pandas": "1.3.3"})
+    @conda(libraries={"editdistance": "0.5.3"})
     @step
     def bonus_movie(self):
         """
@@ -90,9 +85,7 @@ class PlayListFlow(FlowSpec):
         This step uses 'conda' to isolate the environment. Note that the
         package 'editdistance' need not be installed in your python
         environment.
-
         """
-        import pandas
         import editdistance
 
         # Define a helper function to compute the similarity between two
@@ -101,27 +94,23 @@ class PlayListFlow(FlowSpec):
             return editdistance.eval(self.hint, movie_title)
 
         # Compute the distance and take the argmin to find the closest title.
-        distance = self.dataframe["movie_title"].apply(_edit_distance)
-        index = distance.idxmin()
+        distance = [
+            _edit_distance(movie_title) for movie_title in self.dataframe["movie_title"]
+        ]
+        index = distance.index(min(distance))
         self.bonus = (
-            self.dataframe["movie_title"].values[index],
-            self.dataframe["genres"].values[index],
+            self.dataframe["movie_title"][index],
+            self.dataframe["genres"][index],
         )
 
         self.next(self.join)
 
-    @conda(libraries={"pandas": "1.3.3"})
     @step
     def genre_movies(self):
         """
         Select the top performing movies from the user-specified genre.
-
-        This step uses 'conda' to isolate the environment. This step will
-        always use pandas==1.3.3 regardless of what is installed on the
-        system.
-
         """
-        import pandas
+
        from random import shuffle
 
        # For the genre of interest, generate a potential playlist using only
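`distance.index(min(distance))` reproduces pandas' `idxmin` for a plain list. A tiny sanity check of the argmin pattern, with a hypothetical hint and titles (requires the `editdistance` package, which the `@conda` decorator above supplies on the remote side):

```python
import editdistance  # provided by @conda(libraries={"editdistance": "0.5.3"})

hint = "the matrx"
titles = ["The Matrix", "Titanic", "The Mask"]
distance = [editdistance.eval(hint, t.lower()) for t in titles]
print(titles[distance.index(min(distance))])  # The Matrix
```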
@@ -129,12 +118,14 @@ class PlayListFlow(FlowSpec):
         genre = self.genre.lower()
         if genre not in self.genre_stats:
             self.movies = []
-
         else:
             df = self.genre_stats[genre]["dataframe"]
             quartiles = self.genre_stats[genre]["quartiles"]
-            selector = df["gross"] >= quartiles[-1]
-            self.movies = list(df[selector]["movie_title"])
+            self.movies = [
+                df["movie_title"][i]
+                for i, g in enumerate(df["gross"])
+                if g >= quartiles[-1]
+            ]
 
         # Shuffle the content.
         shuffle(self.movies)
metaflow/tutorials/05-hello-cloud/README.md ADDED
@@ -0,0 +1,45 @@
+# Episode 05-hellocloud: Look Mom, We're in the Cloud.
+
+**This flow is a simple linear workflow that verifies your Kubernetes
+configuration. The 'start' and 'end' steps will run locally, while the 'hello'
+step will run remotely on Kubernetes. After configuring Metaflow to run on the cloud,
+data and metadata about your runs will be stored remotely. This means you can
+use the client to access information about any flow from anywhere.**
+
+#### Showcasing:
+- Kubernetes decorator.
+- Accessing data artifacts generated remotely in a local notebook.
+- retry decorator.
+
+#### To play this episode:
+Open ```05-hello-cloud/hello-cloud.ipynb```
+
+# Episode 5: Hello Cloud
+
+## Look Mom, We're in the Cloud.
+
+This flow is a simple linear workflow that verifies your cloud configuration. The `start` and `end` steps will run locally, while the `hello` step will [run remotely](/scaling/remote-tasks/introduction). After [configuring Metaflow](/getting-started/infrastructure) to run in the cloud, data and metadata about your runs will be stored remotely. This means you can use the client to access information about any flow from anywhere.
+
+You can find the tutorial code on [GitHub](https://github.com/Netflix/metaflow/tree/master/metaflow/tutorials/05-hello-cloud)
+
+**Showcasing:**
+
+- [Kubernetes](https://docs.metaflow.org/scaling/remote-tasks/kubernetes) and the [`@kubernetes`](https://docs.metaflow.org/scaling/remote-tasks/introduction) decorator.
+- Using the [Client API](../../../metaflow/client) to access data artifacts generated remotely in a local notebook.
+- [`@retry`](https://docs.metaflow.org/scaling/failures#retrying-tasks-with-the-retry-decorator) decorator.
+
+**Before playing this episode:**
+
+1. `python -m pip install notebook`
+2. This tutorial requires access to compute and storage resources in the cloud, which can be configured by
+   1. Following the instructions [here](https://outerbounds.com/docs/engineering-welcome/) or
+   2. Requesting [a sandbox](https://outerbounds.com/sandbox/).
+
+**To play this episode:**
+
+1. `cd metaflow-tutorials`
+2. `python 05-hello-cloud/hello-cloud.py run`
+3. `jupyter-notebook 05-hello-cloud/hello-cloud.ipynb`
+4. Open _**hello-cloud.ipynb**_ in a notebook
+
+<TutorialsLink link="../../tutorials"/>
metaflow/tutorials/{05-helloaws/helloaws.ipynb → 05-hello-cloud/hello-cloud.ipynb} RENAMED
@@ -4,9 +4,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Episode 05-helloaws: Look Mom, We're in the Cloud\n",
+    "# Episode 05-hellocloud: Look Mom, We're in the Cloud\n",
     "\n",
-    "### In HelloAWSFlow, the 'start' and 'end' steps were run locally, while the 'hello' step was run remotely on AWS batch. Since we are using AWS, data artifacts and metadata were stored remotely. This means you can use the client to access information about any flow from anywhere. This notebook shows you how. "
+    "### In HelloCloudFlow, the 'start' and 'end' steps were run locally, while the 'hello' step was run remotely on Kubernetes. Since we are using AWS, data artifacts and metadata were stored remotely. This means you can use the client to access information about any flow from anywhere. This notebook shows you how. "
    ]
   },
   {
@@ -41,7 +41,7 @@
    "source": [
     "# Set namespace to None to search over all namespaces\n",
     "namespace(None)\n",
-    "run = Flow('HelloAWSFlow').latest_successful_run\n",
+    "run = Flow('HelloCloudFlow').latest_successful_run\n",
     "print(\"Using run: %s\" % str(run))\n",
     "print(run.data.message)"
    ]
@@ -49,7 +49,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3.10.6 64-bit ('3.10.6')",
    "language": "python",
    "name": "python3"
   },
@@ -63,7 +63,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.0"
+   "version": "3.10.6"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "60d98827d7482d2a0f6aae287a18990d3a1d423e0f66197ec6cdef8a2e07b41f"
+   }
   }
  },
 "nbformat": 4,
metaflow/tutorials/{05-helloaws/helloaws.py → 05-hello-cloud/hello-cloud.py} RENAMED
@@ -1,11 +1,11 @@
-from metaflow import FlowSpec, step, batch, retry
+from metaflow import FlowSpec, step, kubernetes, retry
 
 
-class HelloAWSFlow(FlowSpec):
+class HelloCloudFlow(FlowSpec):
     """
-    A flow where Metaflow prints 'Metaflow says Hi from AWS!'
+    A flow where Metaflow prints 'Metaflow says Hi from the cloud!'
 
-    Run this flow to validate your AWS configuration.
+    Run this flow to validate your Kubernetes configuration.
 
     """
 
@@ -18,30 +18,28 @@ class HelloAWSFlow(FlowSpec):
         """
         from metaflow import get_metadata
 
-        print("HelloAWS is starting.")
+        print("HelloCloud is starting.")
         print("")
         print("Using metadata provider: %s" % get_metadata())
         print("")
         print("The start step is running locally. Next, the ")
-        print("'hello' step will run remotely on AWS batch. ")
-        print("If you are running in the Netflix sandbox, ")
-        print("it may take some time to acquire a compute resource.")
+        print("'hello' step will run remotely on Kubernetes. ")
 
         self.next(self.hello)
 
-    @batch(cpu=1, memory=500)
+    @kubernetes(cpu=1, memory=500)
     @retry
     @step
     def hello(self):
         """
-        This step runs remotely on AWS batch using 1 virtual CPU and 500Mb of
+        This step runs remotely on Kubernetes using 1 virtual CPU and 500Mb of
         memory. Since we are now using a remote metadata service and data
         store, the flow information and artifacts are available from
         anywhere. The step also uses the retry decorator, so that if something
         goes wrong, the step will be automatically retried.
 
         """
-        self.message = "Hi from AWS!"
+        self.message = "Hi from the cloud!"
         print("Metaflow says: %s" % self.message)
         self.next(self.end)
 
@@ -52,8 +50,8 @@ class HelloAWSFlow(FlowSpec):
         which the flow is executed.
 
         """
-        print("HelloAWS is finished.")
+        print("HelloCloud is finished.")
 
 
 if __name__ == "__main__":
-    HelloAWSFlow()
+    HelloCloudFlow()
metaflow/tutorials/06-statistics-redux/README.md CHANGED
@@ -3,42 +3,19 @@
 **This example revisits 'Episode 02-statistics: Is this Data Science?'. With
 Metaflow, you don't need to make any code changes to scale-up your flow by
 running on remote compute. In this example we re-run the 'stats.py' workflow
-adding the '--with batch' command line argument. This instructs Metaflow to run
-all your steps on AWS batch without changing any code. You can control the
+adding the '--with kubernetes' command line argument. This instructs Metaflow to run
+all your steps on Kubernetes without changing any code. You can control the
 behavior with additional arguments, like '--max-workers'. For this example,
 'max-workers' is used to limit the number of parallel genre-specific statistics
 computations.
 You can then access the data artifacts (even the local CSV file) from anywhere
-because the data is being stored in AWS S3.
-This tutorial uses `pandas` which may not be available in your environment.
-Use the 'conda' package manager with the `conda-forge` channel added to run
-this tutorial in any environment**
+because the data is being stored in AWS S3.**
 
 #### Showcasing:
-- '--with batch' command line option
+- '--with kubernetes' command line option
 - '--max-workers' command line option
 - Accessing data locally or remotely
-- Metaflow's conda based dependency management.
-
-
-#### Before playing this episode:
-1. ```python -m pip install pandas```
-2. ```python -m pip install notebook```
-3. ```python -m pip install matplotlib```
-4. This tutorial requires the 'conda' package manager to be installed with the
-   conda-forge channel added.
-   a. Download Miniconda at https://docs.conda.io/en/latest/miniconda.html
-   b. ```conda config --add channels conda-forge```
-5. This tutorial requires access to compute and storage resources on AWS, which
-   can be configured by
-   1. Following the instructions at
-      https://docs.metaflow.org/metaflow-on-aws/deploy-to-aws or
-   2. Requesting a sandbox at
-      https://docs.metaflow.org/metaflow-on-aws/metaflow-sandbox
-
 
 #### To play this episode:
-1. ```cd metaflow-tutorials```
-2. ```python 02-statistics/stats.py --environment conda run --with batch --max-workers 4 --with conda:python=3.7,libraries="{pandas:1.3.3}"```
-3. ```jupyter-notebook 06-statistics-redux/stats.ipynb```
-4. Open 'stats.ipynb' in your remote Sagemaker notebook
+1. ```python 02-statistics/stats.py run --with kubernetes --max-workers 4```
+2. Open ```06-statistics-redux/stats.ipynb```
metaflow/tutorials/06-statistics-redux/stats.ipynb CHANGED
@@ -113,7 +113,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -127,7 +127,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.0"
+   "version": "3.11.0"
   }
  },
 "nbformat": 4,