metaflow 2.8.1__py2.py3-none-any.whl → 2.8.3__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow/client/core.py +14 -4
- metaflow/cmd/configure_cmd.py +3 -3
- metaflow/cmd/main_cli.py +9 -14
- metaflow/current.py +15 -0
- metaflow/datastore/datastore_set.py +7 -7
- metaflow/datastore/flow_datastore.py +1 -2
- metaflow/extension_support/__init__.py +1 -0
- metaflow/extension_support/integrations.py +141 -0
- metaflow/integrations.py +29 -0
- metaflow/metaflow_config.py +21 -0
- metaflow/metaflow_environment.py +5 -4
- metaflow/package.py +1 -1
- metaflow/plugins/airflow/airflow.py +0 -1
- metaflow/plugins/argo/argo_workflows.py +2 -0
- metaflow/plugins/argo/argo_workflows_cli.py +11 -1
- metaflow/plugins/aws/aws_utils.py +6 -1
- metaflow/plugins/aws/batch/batch.py +30 -8
- metaflow/plugins/aws/batch/batch_cli.py +12 -0
- metaflow/plugins/aws/batch/batch_client.py +39 -2
- metaflow/plugins/aws/batch/batch_decorator.py +23 -0
- metaflow/plugins/aws/step_functions/step_functions.py +7 -4
- metaflow/plugins/aws/step_functions/step_functions_cli.py +11 -1
- metaflow/plugins/cards/card_modules/bundle.css +56 -56
- metaflow/plugins/cards/card_modules/convert_to_native_type.py +67 -5
- metaflow/plugins/cards/card_modules/main.js +14 -7
- metaflow/plugins/conda/conda_environment.py +2 -2
- metaflow/plugins/conda/conda_step_decorator.py +7 -1
- metaflow/plugins/datatools/s3/s3.py +2 -2
- metaflow/plugins/env_escape/communication/channel.py +1 -1
- metaflow/plugins/kubernetes/kubernetes.py +4 -0
- metaflow/plugins/kubernetes/kubernetes_decorator.py +6 -2
- metaflow/plugins/kubernetes/kubernetes_job.py +17 -2
- metaflow/plugins/metadata/service.py +3 -2
- metaflow/runtime.py +5 -3
- metaflow/tutorials/02-statistics/README.md +4 -9
- metaflow/tutorials/02-statistics/stats.py +38 -11
- metaflow/tutorials/03-playlist-redux/playlist.py +24 -16
- metaflow/tutorials/04-playlist-plus/playlist.py +14 -23
- metaflow/tutorials/05-hello-cloud/README.md +45 -0
- metaflow/tutorials/{05-helloaws/helloaws.ipynb → 05-hello-cloud/hello-cloud.ipynb} +10 -5
- metaflow/tutorials/{05-helloaws/helloaws.py → 05-hello-cloud/hello-cloud.py} +11 -13
- metaflow/tutorials/06-statistics-redux/README.md +6 -29
- metaflow/tutorials/06-statistics-redux/stats.ipynb +2 -2
- metaflow/tutorials/07-worldview/README.md +3 -11
- metaflow/tutorials/07-worldview/worldview.ipynb +3 -3
- metaflow/tutorials/08-autopilot/README.md +10 -17
- metaflow/tutorials/08-autopilot/autopilot.ipynb +12 -7
- {metaflow-2.8.1.dist-info → metaflow-2.8.3.dist-info}/METADATA +1 -6
- {metaflow-2.8.1.dist-info → metaflow-2.8.3.dist-info}/RECORD +53 -51
- metaflow/tutorials/05-helloaws/README.md +0 -27
- {metaflow-2.8.1.dist-info → metaflow-2.8.3.dist-info}/LICENSE +0 -0
- {metaflow-2.8.1.dist-info → metaflow-2.8.3.dist-info}/WHEEL +0 -0
- {metaflow-2.8.1.dist-info → metaflow-2.8.3.dist-info}/entry_points.txt +0 -0
- {metaflow-2.8.1.dist-info → metaflow-2.8.3.dist-info}/top_level.txt +0 -0
metaflow/plugins/conda/conda_environment.py
CHANGED
@@ -132,5 +132,5 @@ class CondaEnvironment(MetaflowEnvironment):
     def get_package_commands(self, code_package_url, datastore_type):
         return self.base_env.get_package_commands(code_package_url, datastore_type)
 
-    def get_environment_info(self):
-        return self.base_env.get_environment_info()
+    def get_environment_info(self, include_ext_info=False):
+        return self.base_env.get_environment_info(include_ext_info)
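The new `include_ext_info` flag is threaded straight through to the base environment. A minimal sketch of the call-site difference, assuming `env` is a `CondaEnvironment` instance:

```python
# Default keeps the 2.8.1 behavior; passing include_ext_info=True also
# returns information about installed Metaflow extensions (see the
# extension_support/ entries in the file list above).
info = env.get_environment_info()
info_with_ext = env.get_environment_info(include_ext_info=True)
```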
metaflow/plugins/conda/conda_step_decorator.py
CHANGED
@@ -285,7 +285,13 @@ class CondaStepDecorator(StepDecorator):
             mode="wt",
             encoding="utf-8",
         ) as f:
-            f.write(json.dumps(self._cur_environment.get_environment_info()))
+            f.write(
+                json.dumps(
+                    self._cur_environment.get_environment_info(
+                        include_ext_info=True
+                    )
+                )
+            )
 
         # Do the same for EXT_PKG
         try:
metaflow/plugins/datatools/s3/s3.py
CHANGED
@@ -17,6 +17,7 @@ from metaflow.metaflow_config import (
     DATATOOLS_S3ROOT,
     S3_RETRY_COUNT,
     S3_TRANSIENT_RETRY_COUNT,
+    TEMPDIR,
 )
 from metaflow.util import (
     namedtuple_with_defaults,
@@ -142,7 +143,6 @@ class S3Object(object):
         range_info: Optional[RangeInfo] = None,
         last_modified: int = None,
     ):
-
         # all fields of S3Object should return a unicode object
         prefix, url, path = map(ensure_unicode, (prefix, url, path))
 
@@ -481,7 +481,7 @@ class S3(object):
 
     def __init__(
         self,
-        tmproot: str = ".",
+        tmproot: str = TEMPDIR,
         bucket: Optional[str] = None,
         prefix: Optional[str] = None,
         run: Optional[Union[FlowSpec, "Run"]] = None,
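With `TEMPDIR` as the default `tmproot`, the directory where `S3` stages temporary files is now configurable instead of hard-coded. A hedged usage sketch (bucket, prefix, and scratch path are hypothetical):

```python
from metaflow import S3

# tmproot is where S3 stages temporary files during uploads/downloads; in
# 2.8.3 it defaults to the TEMPDIR config value but can still be overridden
# per client instance.
with S3(s3root="s3://my-bucket/some/prefix", tmproot="/mnt/scratch") as s3:
    s3.put("greeting", "hello")       # -> s3://my-bucket/some/prefix/greeting
    print(s3.get("greeting").text)    # -> hello
```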
metaflow/plugins/env_escape/communication/channel.py
CHANGED
@@ -36,7 +36,7 @@ class Channel(object):
             sz_bytes = self._stream.read(self._fmt.size, timeout)
             msg_sz = self._fmt.unpack(sz_bytes)[0]
             obj_bytes = self._stream.read(msg_sz, timeout)
-            return json.loads(obj_bytes…
+            return json.loads(obj_bytes)
         except EOFError as e:
             raise RuntimeError("Cannot receive object over streaming interface: %s" % e)
         except BaseException as e:
metaflow/plugins/kubernetes/kubernetes.py
CHANGED
@@ -15,6 +15,7 @@ from metaflow.metaflow_config import (
     DEFAULT_AWS_CLIENT_PROVIDER,
     DEFAULT_METADATA,
     KUBERNETES_SANDBOX_INIT_SCRIPT,
+    KUBERNETES_FETCH_EC2_METADATA,
     S3_ENDPOINT_URL,
     AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
     DATASTORE_SYSROOT_AZURE,
@@ -199,6 +200,9 @@ class Kubernetes(object):
             .environment_variable("METAFLOW_DEFAULT_DATASTORE", self._datastore.TYPE)
             .environment_variable("METAFLOW_DEFAULT_METADATA", DEFAULT_METADATA)
             .environment_variable("METAFLOW_KUBERNETES_WORKLOAD", 1)
+            .environment_variable(
+                "METAFLOW_KUBERNETES_FETCH_EC2_METADATA", KUBERNETES_FETCH_EC2_METADATA
+            )
             .environment_variable("METAFLOW_RUNTIME_ENVIRONMENT", "kubernetes")
             .environment_variable(
                 "METAFLOW_DEFAULT_SECRETS_BACKEND_TYPE", DEFAULT_SECRETS_BACKEND_TYPE
metaflow/plugins/kubernetes/kubernetes_decorator.py
CHANGED
@@ -17,6 +17,7 @@ from metaflow.metaflow_config import (
     KUBERNETES_TOLERATIONS,
     KUBERNETES_SERVICE_ACCOUNT,
     KUBERNETES_SECRETS,
+    KUBERNETES_FETCH_EC2_METADATA,
 )
 from metaflow.plugins.resources_decorator import ResourcesDecorator
 from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
@@ -327,8 +328,11 @@ class KubernetesDecorator(StepDecorator):
 
         # TODO (savin): Introduce equivalent support for Microsoft Azure and
        # Google Cloud Platform
-        instance_meta = get_ec2_instance_metadata()
-        meta.update(instance_meta)
+        # TODO: Introduce a way to detect Cloud Provider, so unnecessary requests (and delays)
+        # can be avoided by not having to try out all providers.
+        if KUBERNETES_FETCH_EC2_METADATA:
+            instance_meta = get_ec2_instance_metadata()
+            meta.update(instance_meta)
 
         # Unfortunately, there doesn't seem to be any straight forward way right
         # now to attach the Batch/v1 name - While we can rely on a hacky approach
metaflow/plugins/kubernetes/kubernetes_job.py
CHANGED
@@ -413,8 +413,23 @@ class RunningJob(object):
                 except:
                     # Best effort. It's likely that this API call could be
                     # blocked for the user.
-
-                    #
+                    # --------------------------------------------------------
+                    # We try patching Job parallelism anyway. Stopping any runaway
+                    # jobs (and their pods) is secondary to correctly showing
+                    # "Killed" status on the Kubernetes pod.
+                    #
+                    # This has the effect of pausing the job.
+                    try:
+                        client.BatchV1Api().patch_namespaced_job(
+                            name=self._name,
+                            namespace=self._namespace,
+                            field_manager="metaflow",
+                            body={"spec": {"parallelism": 0}},
+                        )
+                    except:
+                        # Best effort.
+                        pass
+                        # raise
             else:
                 # Case 2.
                 # This has the effect of pausing the job.
metaflow/plugins/metadata/service.py
CHANGED
@@ -137,9 +137,10 @@ class ServiceMetadataProvider(MetadataProvider):
             payload[HB_URL_KEY] = self.url_run_template.format(**data)
         else:
             raise Exception("invalid heartbeat type")
-
+        service_version = self.version()
+        payload["service_version"] = service_version
         # start sidecar
-        if LooseVersion(self.version()) < LooseVersion(
+        if service_version is None or LooseVersion(service_version) < LooseVersion(
             "2.0.4"
         ):
             # if old version of the service is running
metaflow/runtime.py
CHANGED
@@ -198,9 +198,11 @@ class NativeRuntime(object):
         self._is_cloned[task.path] = task.is_cloned
 
     def execute(self):
-        run_url = …
-        …
-        …
+        run_url = (
+            "%s/%s/%s" % (UI_URL.rstrip("/"), self._flow.name, self._run_id)
+            if UI_URL
+            else None
+        )
 
         if run_url:
             self._logger(
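The run link is only built when `UI_URL` is configured, and `rstrip("/")` tolerates a configured URL with a trailing slash without producing a double slash in the link. For example (hypothetical values):

```python
ui_url = "https://ui.example.com/"  # note the trailing slash
flow_name, run_id = "MovieStatsFlow", "1668"

print("%s/%s/%s" % (ui_url, flow_name, run_id))
# https://ui.example.com//MovieStatsFlow/1668  (double slash)
print("%s/%s/%s" % (ui_url.rstrip("/"), flow_name, run_id))
# https://ui.example.com/MovieStatsFlow/1668
```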
metaflow/tutorials/02-statistics/README.md
CHANGED
@@ -1,26 +1,21 @@
 # Episode 02-statistics: Is this Data Science?
 
-**Use metaflow to load the movie metadata CSV file into a pandas dataframe and
+**Use metaflow to load the movie metadata CSV file into a dataframe and
 compute some movie genre-specific statistics. These statistics are then used in
 later examples to improve our playlist generator. You can optionally use the
 Metaflow client to eyeball the results in a Notebook, and make some simple
 plots using the Matplotlib library.**
 
-Please note that Episode 04, a follow-on to this episode, requires Pandas version 1.3.3.
-Please make sure that you install or upgrade/downgrade to Pandas 1.3.3.
-
 #### Showcasing:
 - Fan-out over a set of parameters using Metaflow foreach.
-- Using external packages like Pandas.
 - Plotting results in a Notebook.
 
 #### Before playing this episode:
-1. ```python -m pip install pandas```
-2. ```python -m pip install notebook```
-3. ```python -m pip install matplotlib```
+1. ```python -m pip install notebook```
+2. ```python -m pip install matplotlib```
 
 #### To play this episode:
 1. ```cd metaflow-tutorials```
 2. ```python 02-statistics/stats.py show```
 3. ```python 02-statistics/stats.py run```
-4. ```jupyter-notebook 02-statistics/stats.ipynb```
+4. ```jupyter-notebook 02-statistics/stats.ipynb```
metaflow/tutorials/02-statistics/stats.py
CHANGED
@@ -19,7 +19,7 @@ class MovieStatsFlow(FlowSpec):
     A flow to generate some statistics about the movie genres.
 
     The flow performs the following steps:
-    1) Ingests a CSV into a pandas dataframe.
+    1) Ingests a CSV into a dataframe.
     2) Fan-out over genre using Metaflow foreach.
     3) Compute quartiles for each genre.
     4) Save a dictionary of genre-specific statistics.
@@ -36,16 +36,29 @@ class MovieStatsFlow(FlowSpec):
     def start(self):
         """
         The start step:
-        1) Loads the movie metadata into a pandas dataframe.
+        1) Loads the movie metadata into dataframe.
         2) Finds all the unique genres.
         3) Launches parallel statistics computation for each genre.
 
         """
-        import pandas
+        import csv
         from io import StringIO
 
-        # Load the data set into a pandas dataframe.
-        self.dataframe = pandas.read_csv(StringIO(self.movie_data))
+        # Load the data set into a dataframe structure.
+        self.dataframe = {
+            "movie_title": [],
+            "title_year": [],
+            "genres": [],
+            "gross": [],
+        }
+
+        for row in csv.reader(StringIO(self.movie_data), delimiter=","):
+            if row[0] == "movie_title":
+                continue
+            self.dataframe["movie_title"].append(row[0])
+            self.dataframe["title_year"].append(int(row[1]))
+            self.dataframe["genres"].append(row[2])
+            self.dataframe["gross"].append(int(row[3]))
 
         # The column 'genres' has a list of genres for each movie. Let's get
         # all the unique genres.
@@ -63,8 +76,8 @@ class MovieStatsFlow(FlowSpec):
     def compute_statistics(self):
         """
         Compute statistics for a single genre.
-
         """
+
         # The genre currently being processed is a class property called
         # 'input'.
         self.genre = self.input
@@ -72,13 +85,27 @@ class MovieStatsFlow(FlowSpec):
 
         # Find all the movies that have this genre and build a dataframe with
         # just those movies and just the columns of interest.
-        selector = self.dataframe["genres"]…
-        …
-        …
+        selector = [self.genre in row for row in self.dataframe["genres"]]
+
+        for col in self.dataframe.keys():
+            self.dataframe[col] = [
+                col for col, is_genre in zip(self.dataframe[col], selector) if is_genre
+            ]
+
+        # Sort by gross box office and drop unused column.
+        argsort_indices = sorted(
+            range(len(self.dataframe["gross"])), key=self.dataframe["gross"].__getitem__
+        )
+        for col in self.dataframe.keys():
+            self.dataframe[col] = [self.dataframe[col][idx] for idx in argsort_indices]
+        del self.dataframe["title_year"]
 
         # Get some statistics on the gross box office for these titles.
-        …
-        self.quartiles = …
+        n_points = len(self.dataframe["movie_title"])
+        self.quartiles = []
+        for cut in [0.25, 0.5, 0.75]:
+            idx = 0 if n_points < 2 else round(n_points * cut)
+            self.quartiles.append(self.dataframe["gross"][idx])
 
         # Join the results from other genres.
         self.next(self.join)
metaflow/tutorials/03-playlist-redux/playlist.py
CHANGED
@@ -53,19 +53,25 @@ class PlayListFlow(FlowSpec):
         This step chooses a random title for a different movie genre.
 
         """
-        import pandas
-
-        # Concatenate all the genre-specific data frames
-        …
-                data["dataframe"]
-        …
+        import random
+
+        # Concatenate all the genre-specific data frames.
+        df = {"movie_title": [], "genres": []}
+        for genre, data in self.genre_stats.items():
+            if genre != self.genre.lower():
+                for row_idx in range(len(data["dataframe"]["movie_title"])):
+                    if (
+                        self.genre.lower()
+                        not in data["dataframe"]["genres"][row_idx].lower()
+                    ):
+                        df["movie_title"].append(
+                            data["dataframe"]["movie_title"][row_idx]
+                        )
+                        df["genres"].append(data["dataframe"]["genres"][row_idx])
+
+        # Choose a random movie.
+        random_index = random.randint(0, len(df["genres"]) - 1)
+        self.bonus = (df["movie_title"][random_index], df["genres"][random_index])
 
         self.next(self.join)
 
@@ -82,12 +88,14 @@ class PlayListFlow(FlowSpec):
         genre = self.genre.lower()
         if genre not in self.genre_stats:
             self.movies = []
-
         else:
             df = self.genre_stats[genre]["dataframe"]
             quartiles = self.genre_stats[genre]["quartiles"]
-            …
-            …
+            self.movies = [
+                df["movie_title"][i]
+                for i, g in enumerate(df["gross"])
+                if g >= quartiles[-1]
+            ]
 
         # Shuffle the playlist.
         shuffle(self.movies)
metaflow/tutorials/04-playlist-plus/playlist.py
CHANGED
@@ -10,7 +10,7 @@ def get_python_version():
     """
     import platform
 
-    versions = {"2": "2.7.15", "3": "3.…
+    versions = {"2": "2.7.15", "3": "3.9.10"}
     return versions[platform.python_version_tuple()[0]]
 
 
@@ -48,17 +48,12 @@ class PlayListFlow(FlowSpec):
         default=5,
     )
 
-    @conda(libraries={"pandas": "1.3.3"})
     @step
     def start(self):
         """
         Use the Metaflow client to retrieve the latest successful run from our
         MovieStatsFlow and assign them as data artifacts in this flow.
 
-        This step uses 'conda' to isolate the environment. This step will
-        always use pandas==1.3.3 regardless of what is installed on the
-        system.
-
         """
         # Load the analysis from the MovieStatsFlow.
         from metaflow import Flow, get_metadata
@@ -80,7 +75,7 @@ class PlayListFlow(FlowSpec):
         # Compute our two recommendation types in parallel.
         self.next(self.bonus_movie, self.genre_movies)
 
-    @conda(libraries={"editdistance": "0.5.3", "pandas": "1.3.3"})
+    @conda(libraries={"editdistance": "0.5.3"})
     @step
     def bonus_movie(self):
         """
@@ -90,9 +85,7 @@ class PlayListFlow(FlowSpec):
         This step uses 'conda' to isolate the environment. Note that the
         package 'editdistance' need not be installed in your python
         environment.
-
         """
-        import pandas
         import editdistance
 
         # Define a helper function to compute the similarity between two
@@ -101,27 +94,23 @@ class PlayListFlow(FlowSpec):
             return editdistance.eval(self.hint, movie_title)
 
         # Compute the distance and take the argmin to find the closest title.
-        distance = …
-        …
+        distance = [
+            _edit_distance(movie_title) for movie_title in self.dataframe["movie_title"]
+        ]
+        index = distance.index(min(distance))
         self.bonus = (
-            self.dataframe["movie_title"]…
-            self.dataframe["genres"]…
+            self.dataframe["movie_title"][index],
+            self.dataframe["genres"][index],
         )
 
         self.next(self.join)
 
-    @conda(libraries={"pandas": "1.3.3"})
     @step
     def genre_movies(self):
         """
         Select the top performing movies from the use specified genre.
-
-        This step uses 'conda' to isolate the environment. This step will
-        always use pandas==1.3.3 regardless of what is installed on the
-        system.
-
         """
-
+
         from random import shuffle
 
         # For the genre of interest, generate a potential playlist using only
@@ -129,12 +118,14 @@ class PlayListFlow(FlowSpec):
         genre = self.genre.lower()
         if genre not in self.genre_stats:
             self.movies = []
-
         else:
             df = self.genre_stats[genre]["dataframe"]
             quartiles = self.genre_stats[genre]["quartiles"]
-            …
-            …
+            self.movies = [
+                df["movie_title"][i]
+                for i, g in enumerate(df["gross"])
+                if g >= quartiles[-1]
+            ]
 
         # Shuffle the content.
         shuffle(self.movies)
metaflow/tutorials/05-hello-cloud/README.md
ADDED
@@ -0,0 +1,45 @@
+# Episode 05-hellocloud: Look Mom, We're in the Cloud.
+
+**This flow is a simple linear workflow that verifies your Kubernetes
+configuration. The 'start' and 'end' steps will run locally, while the 'hello'
+step will run remotely on Kubernetes. After configuring Metaflow to run on the cloud,
+data and metadata about your runs will be stored remotely. This means you can
+use the client to access information about any flow from anywhere.**
+
+#### Showcasing:
+- Kubernetes decorator.
+- Accessing data artifacts generated remotely in a local notebook.
+- retry decorator.
+
+#### To play this episode:
+Open ```05-hello-cloud/hello-cloud.ipynb```
+
+# Episode 5: Hello Cloud
+
+## Look Mom, We're in the Cloud.
+
+This flow is a simple linear workflow that verifies your cloud configuration. The `start` and `end` steps will run locally, while the `hello` step will [run remotely](/scaling/remote-tasks/introduction). After [configuring Metaflow](/getting-started/infrastructure) to run in the cloud, data and metadata about your runs will be stored remotely. This means you can use the client to access information about any flow from anywhere.
+
+You can find the tutorial code on [GitHub](https://github.com/Netflix/metaflow/tree/master/metaflow/tutorials/05-hello-cloud)
+
+**Showcasing:**
+
+- [Kubernetes](https://docs.metaflow.org/scaling/remote-tasks/kubernetes) and the [`@kubernetes`](https://docs.metaflow.org/scaling/remote-tasks/introduction) decorator.
+- Using the [Client API](../../../metaflow/client) to access data artifacts generated remotely in a local notebook.
+- [`@retry`](https://docs.metaflow.org/scaling/failures#retrying-tasks-with-the-retry-decorator) decorator.
+
+**Before playing this episode:**
+
+1. `python -m pip install notebook`
+2. This tutorial requires access to compute and storage resources in the cloud, which can be configured by
+    1. Following the instructions [here](https://outerbounds.com/docs/engineering-welcome/) or
+    2. Requesting [a sandbox](https://outerbounds.com/sandbox/).
+
+**To play this episode:**
+
+1. `cd metaflow-tutorials`
+2. `python 05-hello-cloud/hello-cloud.py run`
+3. `jupyter-notebook 05-hello-cloud/hello-cloud.ipynb`
+4. Open _**hello-cloud.ipynb**_ in a notebook
+
+<TutorialsLink link="../../tutorials"/>
metaflow/tutorials/{05-helloaws/helloaws.ipynb → 05-hello-cloud/hello-cloud.ipynb}
RENAMED
@@ -4,9 +4,9 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Episode 05-helloaws: Look Mom, We're in the Cloud\n",
+    "# Episode 05-hellocloud: Look Mom, We're in the Cloud\n",
     "\n",
-    "### In …
+    "### In HelloCloudFlow, the 'start' and 'end' steps were run locally, while the 'hello' step was run remotely on Kubernetes. Since we are using AWS, data artifacts and metadata were stored remotely. This means you can use the client to access information about any flow from anywhere. This notebook shows you how. "
    ]
   },
   {
@@ -41,7 +41,7 @@
    "source": [
     "# Set namespace to None to search over all namespaces\n",
     "namespace(None)\n",
-    "run = Flow('HelloAWSFlow').latest_successful_run\n",
+    "run = Flow('HelloCloudFlow').latest_successful_run\n",
     "print(\"Using run: %s\" % str(run))\n",
     "print(run.data.message)"
    ]
@@ -49,7 +49,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3.10.6 64-bit ('3.10.6')",
    "language": "python",
    "name": "python3"
   },
@@ -63,7 +63,12 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.…
+   "version": "3.10.6"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "60d98827d7482d2a0f6aae287a18990d3a1d423e0f66197ec6cdef8a2e07b41f"
+   }
   }
  },
 "nbformat": 4,
metaflow/tutorials/{05-helloaws/helloaws.py → 05-hello-cloud/hello-cloud.py}
RENAMED
@@ -1,11 +1,11 @@
-from metaflow import FlowSpec, step, batch, retry
+from metaflow import FlowSpec, step, kubernetes, retry
 
 
-class HelloAWSFlow(FlowSpec):
+class HelloCloudFlow(FlowSpec):
     """
-    A flow where Metaflow prints 'Metaflow says Hi from AWS!'
+    A flow where Metaflow prints 'Metaflow says Hi from the cloud!'
 
-    Run this flow to validate your AWS configuration.
+    Run this flow to validate your Kubernetes configuration.
 
     """
 
@@ -18,30 +18,28 @@ class HelloAWSFlow(FlowSpec):
         """
        from metaflow import get_metadata
 
-        print("HelloAWS is starting.")
+        print("HelloCloud is starting.")
         print("")
         print("Using metadata provider: %s" % get_metadata())
         print("")
         print("The start step is running locally. Next, the ")
-        print("'hello' step will run remotely on AWS batch. ")
-        print("If you are running in the Netflix sandbox, ")
-        print("it may take some time to acquire a compute resource.")
+        print("'hello' step will run remotely on Kubernetes. ")
 
         self.next(self.hello)
 
-    @batch(cpu=1, memory=500)
+    @kubernetes(cpu=1, memory=500)
     @retry
     @step
     def hello(self):
         """
-        This steps runs remotely on AWS batch using 1 virtual CPU and 500Mb of
+        This steps runs remotely on Kubernetes using 1 virtual CPU and 500Mb of
         memory. Since we are now using a remote metadata service and data
         store, the flow information and artifacts are available from
         anywhere. The step also uses the retry decorator, so that if something
         goes wrong, the step will be automatically retried.
 
         """
-        self.message = "Hi from AWS!"
+        self.message = "Hi from the cloud!"
         print("Metaflow says: %s" % self.message)
         self.next(self.end)
 
@@ -52,8 +50,8 @@ class HelloAWSFlow(FlowSpec):
         which the flow is executed.
 
         """
-        print("HelloAWS is finished.")
+        print("HelloCloud is finished.")
 
 
 if __name__ == "__main__":
-    HelloAWSFlow()
+    HelloCloudFlow()
metaflow/tutorials/06-statistics-redux/README.md
CHANGED
@@ -3,42 +3,19 @@
 **This example revisits 'Episode 02-statistics: Is this Data Science?'. With
 Metaflow, you don't need to make any code changes to scale-up your flow by
 running on remote compute. In this example we re-run the 'stats.py' workflow
-adding the '--with batch' command line argument. This instructs Metaflow to run
-all your steps on AWS Batch without changing any code. You can control the
+adding the '--with kubernetes' command line argument. This instructs Metaflow to run
+all your steps on AWS Kubernetes without changing any code. You can control the
 behavior with additional arguments, like '--max-workers'. For this example,
 'max-workers' is used to limit the number of parallel genre-specific statistics
 computations.
 You can then access the data artifacts (even the local CSV file) from anywhere
-because the data is being stored in AWS S3.
-This tutorial uses `pandas` which may not be available in your environment.
-Use the 'conda' package manager with the `conda-forge` channel added to run
-this tutorial in any environment**
+because the data is being stored in AWS S3.**
 
 #### Showcasing:
-- '--with batch' command line option
+- '--with kubernetes' command line option
 - '--max-workers' command line option
 - Accessing data locally or remotely
-- Metaflow's conda based dependency management.
-
-
-#### Before playing this episode:
-1. ```python -m pip install pandas```
-2. ```python -m pip install notebook```
-3. ```python -m pip install matplotlib```
-4. This tutorial requires the 'conda' package manager to be installed with the
-   conda-forge channel added.
-   a. Download Miniconda at https://docs.conda.io/en/latest/miniconda.html
-   b. ```conda config --add channels conda-forge```
-5. This tutorial requires access to compute and storage resources on AWS, which
-   can be configured by
-   1. Following the instructions at
-      https://docs.metaflow.org/metaflow-on-aws/deploy-to-aws or
-   2. Requesting a sandbox at
-      https://docs.metaflow.org/metaflow-on-aws/metaflow-sandbox
-
 
 #### To play this episode:
-1. ```…
-2. ```…
-3. ```jupyter-notebook 06-statistics-redux/stats.ipynb```
-4. Open 'stats.ipynb' in your remote Sagemaker notebook
+1. ```python 02-statistics/stats.py run --with kubernetes --max-workers 4```
+2. Open ```06-statistics-redux/stats.ipynb```
metaflow/tutorials/06-statistics-redux/stats.ipynb
CHANGED
@@ -113,7 +113,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -127,7 +127,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.…
+  "version": "3.11.0"
  }
 },
 "nbformat": 4,