metaflow 2.14.0__py2.py3-none-any.whl → 2.14.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,10 +2,12 @@ import collections
  import glob
  import json
  import os
+ import re
  import random
  import tempfile
  import time
  from collections import namedtuple
+ from typing import List

  from metaflow.exception import MetaflowInternalError, MetaflowTaggingError
  from metaflow.metadata_provider.metadata import ObjectOrder
@@ -202,6 +204,70 @@ class LocalMetadataProvider(MetadataProvider):
                "Tagging failed due to too many conflicting updates from other processes"
            )

+     @classmethod
+     def filter_tasks_by_metadata(
+         cls,
+         flow_name: str,
+         run_id: str,
+         step_name: str,
+         field_name: str,
+         pattern: str,
+     ) -> List[str]:
+         """
+         Filter tasks by metadata field and pattern, returning task pathspecs that match criteria.
+
+         Parameters
+         ----------
+         flow_name : str
+             Identifier for the flow
+         run_id : str
+             Identifier for the run
+         step_name : str
+             Name of the step to query tasks from
+         field_name : str
+             Name of metadata field to query
+         pattern : str
+             Pattern to match in metadata field value
+
+         Returns
+         -------
+         List[str]
+             List of task pathspecs that match the query criteria
+         """
+         tasks = cls.get_object("step", "task", {}, None, flow_name, run_id, step_name)
+         if not tasks:
+             return []
+
+         regex = re.compile(pattern)
+         matching_task_pathspecs = []
+
+         for task in tasks:
+             task_id = task.get("task_id")
+             if not task_id:
+                 continue
+
+             if pattern == ".*":
+                 # If the pattern is ".*", we can match all tasks without reading metadata
+                 matching_task_pathspecs.append(
+                     f"{flow_name}/{run_id}/{step_name}/{task_id}"
+                 )
+                 continue
+
+             metadata = cls.get_object(
+                 "task", "metadata", {}, None, flow_name, run_id, step_name, task_id
+             )
+
+             if any(
+                 meta.get("field_name") == field_name
+                 and regex.match(meta.get("value", ""))
+                 for meta in metadata
+             ):
+                 matching_task_pathspecs.append(
+                     f"{flow_name}/{run_id}/{step_name}/{task_id}"
+                 )
+
+         return matching_task_pathspecs
+
      @classmethod
      def _get_object_internal(
          cls, obj_type, obj_order, sub_type, sub_order, filters, attempt, *args
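The new `LocalMetadataProvider.filter_tasks_by_metadata` classmethod scans a step's tasks and returns the pathspecs whose metadata field matches a regular expression. A minimal usage sketch, assuming a finished run is present in the local metadata store; the flow, run, and step names are invented and the import path is inferred from the source layout:

```python
from metaflow.plugins.metadata_providers.local import LocalMetadataProvider

# All tasks of step "process" in run 1234 of MyFlow
# (".*" short-circuits without reading per-task metadata files)
all_tasks = LocalMetadataProvider.filter_tasks_by_metadata(
    "MyFlow", "1234", "process", "foreach-execution-path", ".*"
)

# Only the tasks whose recorded foreach path contains "process:0"
first_split = LocalMetadataProvider.filter_tasks_by_metadata(
    "MyFlow", "1234", "process", "foreach-execution-path", r".*process:0.*"
)
print(first_split)  # e.g. ['MyFlow/1234/process/567']
```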
@@ -4,6 +4,7 @@ import time

  import requests

+ from typing import List
  from metaflow.exception import (
      MetaflowException,
      MetaflowInternalError,
@@ -13,6 +14,7 @@ from metaflow.metadata_provider import MetadataProvider
  from metaflow.metadata_provider.heartbeat import HB_URL_KEY
  from metaflow.metaflow_config import SERVICE_HEADERS, SERVICE_RETRY_COUNT, SERVICE_URL
  from metaflow.sidecar import Message, MessageTypes, Sidecar
+ from urllib.parse import urlencode
  from metaflow.util import version_parse

@@ -318,6 +320,55 @@ class ServiceMetadataProvider(MetadataProvider):
          self._register_system_metadata(run_id, step_name, task["task_id"], attempt)
          return task["task_id"], did_create

+     @classmethod
+     def filter_tasks_by_metadata(
+         cls,
+         flow_name: str,
+         run_id: str,
+         step_name: str,
+         field_name: str,
+         pattern: str,
+     ) -> List[str]:
+         """
+         Filter tasks by metadata field and pattern, returning task pathspecs that match criteria.
+
+         Parameters
+         ----------
+         flow_name : str
+             Flow name, that the run belongs to.
+         run_id: str
+             Run id, together with flow_id, that identifies the specific Run whose tasks to query
+         step_name: str
+             Step name to query tasks from
+         field_name: str
+             Metadata field name to query
+         pattern: str
+             Pattern to match in metadata field value
+
+         Returns
+         -------
+         List[str]
+             List of task pathspecs that satisfy the query
+         """
+         query_params = {
+             "metadata_field_name": field_name,
+             "pattern": pattern,
+             "step_name": step_name,
+         }
+         url = ServiceMetadataProvider._obj_path(flow_name, run_id, step_name)
+         url = f"{url}/filtered_tasks?{urlencode(query_params)}"
+         try:
+             resp = cls._request(None, url, "GET")
+         except Exception as e:
+             if e.http_code == 404:
+                 # filter_tasks_by_metadata endpoint does not exist in the version of metadata service
+                 # deployed currently. Raise a more informative error message.
+                 raise MetaflowInternalError(
+                     "The version of metadata service deployed currently does not support filtering tasks by metadata. "
+                     "Upgrade Metadata service to version 2.15 or greater to use this feature."
+                 ) from e
+         return resp
+
      @staticmethod
      def _obj_path(
          flow_name,
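The service-backed provider delegates the same query to a new `filtered_tasks` endpoint and turns a 404 from an older metadata service into a `MetaflowInternalError` asking for an upgrade. A rough sketch of the URL the method builds; the path produced by `_obj_path` is assumed here and the identifiers are made up:

```python
from urllib.parse import urlencode

query = urlencode(
    {
        "metadata_field_name": "foreach-execution-path",
        "pattern": ".*process:0.*",
        "step_name": "process",
    }
)
# Assumed step resource path; _obj_path() builds the real prefix
url = f"/flows/MyFlow/runs/1234/steps/process/filtered_tasks?{query}"
print(url)
# /flows/MyFlow/runs/1234/steps/process/filtered_tasks?metadata_field_name=foreach-execution-path&pattern=.%2Aprocess%3A0.%2A&step_name=process
```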
@@ -10,7 +10,7 @@ import tarfile
  import time
  from urllib.error import URLError
  from urllib.request import urlopen
- from metaflow.metaflow_config import DATASTORE_LOCAL_DIR, CONDA_USE_FAST_INIT
+ from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
  from metaflow.plugins import DATASTORES
  from metaflow.plugins.pypi.utils import MICROMAMBA_MIRROR_URL, MICROMAMBA_URL
  from metaflow.util import which
@@ -329,8 +329,6 @@ if __name__ == "__main__":

      @timer
      def fast_setup_environment(architecture, storage, env, prefix, pkgs_dir):
-         install_fast_initializer(architecture)
-
          # Get package urls
          conda_pkgs = env["conda"]
          pypi_pkgs = env.get("pypi", [])
@@ -381,7 +379,9 @@ if __name__ == "__main__":
      with open(os.path.join(manifest_dir, MAGIC_FILE)) as f:
          env = json.load(f)[id_][architecture]

-     if CONDA_USE_FAST_INIT:
+     if datastore_type == "s3":
+         # TODO: Remove this once fast-initializer is ready for all datastores
+         install_fast_initializer(architecture)
          fast_setup_environment(architecture, storage, env, prefix, pkgs_dir)
      else:
          setup_environment(
@@ -97,12 +97,13 @@ def _method_sanity_check(
              check_type(supplied_v, annotations[supplied_k])
          except TypeCheckError:
              raise TypeError(
-                 "Invalid type for '%s' (%s), expected: '%s', default is '%s'"
+                 "Invalid type for '%s' (%s), expected: '%s', default is '%s' but found '%s'"
                  % (
                      supplied_k,
                      type(supplied_k),
                      annotations[supplied_k],
                      defaults[supplied_k],
+                     str(supplied_v),
                  )
              )

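The extra `%s` puts the offending value into the error text instead of leaving it to be guessed. A small sketch of what the enriched message looks like; the parameter name and values are invented:

```python
# Reproduce the message format used by _method_sanity_check
supplied_k, supplied_v = "max_workers", "sixteen"
expected, default = int, 16
print(
    "Invalid type for '%s' (%s), expected: '%s', default is '%s' but found '%s'"
    % (supplied_k, type(supplied_k), expected, default, str(supplied_v))
)
# Invalid type for 'max_workers' (<class 'str'>), expected: '<class 'int'>', default is '16' but found 'sixteen'
```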
@@ -218,7 +219,7 @@ def get_inspect_param_obj(p: Union[click.Argument, click.Option], kind: str):
              default=inspect.Parameter.empty if is_vararg else p.default,
              annotation=annotation,
          ),
-         annotation,
+         Optional[TTuple[annotation]] if is_vararg else annotation,
      )

@@ -392,7 +393,9 @@ class MetaflowAPI(object):
          options = params.pop("options", {})

          for _, v in args.items():
-             if isinstance(v, list):
+             if v is None:
+                 continue
+             if isinstance(v, (list, tuple)):
                  for i in v:
                      components.append(i)
              else:
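The loop flattens the collected argument values into the final command-line component list; the change skips unset (`None`) values and accepts tuples as well as lists, which matters now that varargs are typed as optional tuples. A standalone sketch of that flattening logic; the input dict is invented:

```python
args = {"paths": ("a.py", "b.py"), "tag": None, "run_id": "1234"}

components = []
for _, v in args.items():
    if v is None:  # unset optional arguments are skipped entirely
        continue
    if isinstance(v, (list, tuple)):  # varargs may arrive as tuples
        for i in v:
            components.append(i)
    else:
        components.append(v)

print(components)  # ['a.py', 'b.py', '1234']
```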
@@ -49,7 +49,7 @@ def process_messages(worker_type, worker):


  @click.command(help="Initialize workers")
- @tracing.cli_entrypoint("sidecar")
+ @tracing.cli("sidecar")
  @click.argument("worker-type")
  def main(worker_type):
      sidecar_type = SIDECARS.get(worker_type)
metaflow/task.py CHANGED
@@ -493,6 +493,25 @@ class MetaflowTask(object):
                      )
                  )

+             # Add runtime dag information to the metadata of the task
+             foreach_execution_path = ",".join(
+                 [
+                     "{}:{}".format(foreach_frame.step, foreach_frame.index)
+                     for foreach_frame in foreach_stack
+                 ]
+             )
+             if foreach_execution_path:
+                 metadata.extend(
+                     [
+                         MetaDatum(
+                             field="foreach-execution-path",
+                             value=foreach_execution_path,
+                             type="foreach-execution-path",
+                             tags=metadata_tags,
+                         ),
+                     ]
+                 )
+
          self.metadata.register_metadata(
              run_id,
              step_name,
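Each task now records a `foreach-execution-path` metadata entry derived from its foreach stack, which is exactly what the new `filter_tasks_by_metadata` queries can match on. A small sketch of the value format; the step names and indices are illustrative:

```python
from collections import namedtuple

# Stand-in for the frames on a task's foreach stack
Frame = namedtuple("Frame", ["step", "index"])
foreach_stack = [Frame("split_countries", 2), Frame("split_cities", 0)]

foreach_execution_path = ",".join(
    "{}:{}".format(frame.step, frame.index) for frame in foreach_stack
)
print(foreach_execution_path)  # split_countries:2,split_cities:0
```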
@@ -559,6 +578,7 @@ class MetaflowTask(object):
          self.flow._success = False
          self.flow._task_ok = None
          self.flow._exception = None
+
          # Note: All internal flow attributes (ie: non-user artifacts)
          # should either be set prior to running the user code or listed in
          # FlowSpec._EPHEMERAL to allow for proper merging/importing of
@@ -616,7 +636,6 @@ class MetaflowTask(object):
                      "graph_info": self.flow._graph_info,
                  }
              )
-
              for deco in decorators:
                  deco.task_pre_step(
                      step_name,
@@ -728,7 +747,7 @@ class MetaflowTask(object):
                          value=attempt_ok,
                          type="internal_attempt_status",
                          tags=["attempt_id:{0}".format(retry_count)],
-                     )
+                     ),
                  ],
              )

@@ -20,15 +20,15 @@ def post_fork():
      yield


- def cli_entrypoint(name: str):
-     def cli_entrypoint_wrap(func):
+ def cli(name: str):
+     def cli_wrap(func):
          @wraps(func)
          def wrapper_func(*args, **kwargs):
              return func(*args, **kwargs)

          return wrapper_func

-     return cli_entrypoint_wrap
+     return cli_wrap


  def inject_tracing_vars(env_dict: Dict[str, str]) -> Dict[str, str]:
@@ -40,7 +40,9 @@ def get_trace_id() -> str:


  @contextlib.contextmanager
- def traced(name, attrs={}):
+ def traced(name, attrs=None):
+     if attrs is None:
+         attrs = {}
      yield


@@ -54,17 +56,15 @@ def tracing(func):

  if not DISABLE_TRACING and (CONSOLE_TRACE_ENABLED or OTEL_ENDPOINT or ZIPKIN_ENDPOINT):
      try:
-         # Overrides No-Op implementations if a specific provider is configured.
          from .tracing_modules import (
              init_tracing,
              post_fork,
-             cli_entrypoint,
+             cli,
              inject_tracing_vars,
              get_trace_id,
              traced,
              tracing,
          )
-
      except ImportError as e:
          # We keep the errors silent by default so that having tracing environment variables present
          # does not affect users with no need for tracing.
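Swapping the mutable `attrs={}` default for `attrs=None` avoids the classic shared-default pitfall: the dict in the signature is created once and reused by every call. A short illustration of the hazard being guarded against; this is a toy function, not code from the diff:

```python
def buggy(name, attrs={}):  # one dict shared across all calls
    attrs[name] = True
    return attrs

def fixed(name, attrs=None):  # fresh dict per call unless one is passed in
    if attrs is None:
        attrs = {}
    attrs[name] = True
    return attrs

print(buggy("a"), buggy("b"))  # {'a': True, 'b': True} {'a': True, 'b': True}
print(fixed("a"), fixed("b"))  # {'a': True} {'b': True}
```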
@@ -3,60 +3,53 @@ from metaflow.metaflow_config import (
      OTEL_ENDPOINT,
      ZIPKIN_ENDPOINT,
      CONSOLE_TRACE_ENABLED,
+     SERVICE_AUTH_KEY,
+     SERVICE_HEADERS,
  )


  def get_span_exporter():
-     if OTEL_ENDPOINT:
-         return set_otel_exporter()
+     exporter_map = {
+         OTEL_ENDPOINT: _create_otel_exporter,
+         ZIPKIN_ENDPOINT: _create_zipkin_exporter,
+         CONSOLE_TRACE_ENABLED: _create_console_exporter,
+     }

-     elif ZIPKIN_ENDPOINT:
-         return set_zipkin_exporter()
+     for config, create_exporter in exporter_map.items():
+         if config:
+             return create_exporter()

-     elif CONSOLE_TRACE_ENABLED:
-         return set_console_exporter()
-     else:
-         print("WARNING: endpoints not set up for Opentelemetry", file=sys.stderr)
-         return
+     print("WARNING: endpoints not set up for OpenTelemetry", file=sys.stderr)
+     return None


- def set_otel_exporter():
+ def _create_otel_exporter():
      from opentelemetry.exporter.otlp.proto.http.trace_exporter import OTLPSpanExporter

-     from metaflow.metaflow_config import (
-         SERVICE_AUTH_KEY,
-         SERVICE_HEADERS,
-     )
+     if not any([SERVICE_AUTH_KEY, SERVICE_HEADERS]):
+         print("WARNING: no auth settings for OpenTelemetry", file=sys.stderr)
+         return None
+
+     config = {
+         "endpoint": OTEL_ENDPOINT,
+         "timeout": 1,
+     }

      if SERVICE_AUTH_KEY:
-         span_exporter = OTLPSpanExporter(
-             endpoint=OTEL_ENDPOINT,
-             headers={"x-api-key": SERVICE_AUTH_KEY},
-             timeout=1,
-         )
+         config["headers"] = {"x-api-key": SERVICE_AUTH_KEY}
      elif SERVICE_HEADERS:
-         span_exporter = OTLPSpanExporter(
-             endpoint=OTEL_ENDPOINT,
-             headers=SERVICE_HEADERS,
-             timeout=1,
-         )
-     else:
-         print("WARNING: no auth settings for Opentelemetry", file=sys.stderr)
-         return
-     return span_exporter
-
-
- def set_zipkin_exporter():
+         config["headers"] = SERVICE_HEADERS
+
+     return OTLPSpanExporter(**config)
+
+
+ def _create_zipkin_exporter():
      from opentelemetry.exporter.zipkin.proto.http import ZipkinExporter

-     span_exporter = ZipkinExporter(
-         endpoint=ZIPKIN_ENDPOINT,
-     )
-     return span_exporter
+     return ZipkinExporter(endpoint=ZIPKIN_ENDPOINT)


- def set_console_exporter():
+ def _create_console_exporter():
      from opentelemetry.sdk.trace.export import ConsoleSpanExporter

-     span_exporter = ConsoleSpanExporter()
-     return span_exporter
+     return ConsoleSpanExporter()
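The dict-based dispatch keeps the original precedence (OTLP endpoint first, then Zipkin, then console) because Python dicts iterate in insertion order and the first truthy config value wins. A stripped-down sketch of the selection pattern; the config values are invented:

```python
OTEL_ENDPOINT = ""  # not configured
ZIPKIN_ENDPOINT = "http://zipkin:9411/api/v2/spans"
CONSOLE_TRACE_ENABLED = True

exporter_map = {
    OTEL_ENDPOINT: "otlp",  # checked first
    ZIPKIN_ENDPOINT: "zipkin",  # checked second
    CONSOLE_TRACE_ENABLED: "console",  # checked last
}

for config, exporter in exporter_map.items():
    if config:
        print("selected:", exporter)  # selected: zipkin
        break
```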
@@ -13,45 +13,51 @@ from typing import Dict, List, Optional
  from opentelemetry import trace as trace_api, context
  from .span_exporter import get_span_exporter

- tracer_provider = None
+ tracer_provider: Optional[TracerProvider] = None


  def init_tracing():
      global tracer_provider
      if tracer_provider is not None:
-         print("Tracing already initialized", file=sys.stderr)
          return

      from .propagator import EnvPropagator

      set_global_textmap(EnvPropagator(None))
-     span_exporter = get_span_exporter()

-     if "METAFLOW_KUBERNETES_POD_NAMESPACE" in os.environ:
-         service_name = "metaflow-kubernetes"
-     elif "AWS_BATCH_JOB_ID" in os.environ:
-         service_name = "metaflow-awsbatch"
-     else:
-         service_name = "metaflow-local"
+     span_exporter = get_span_exporter()
+     if span_exporter is None:
+         return

      tracer_provider = TracerProvider(
-         resource=Resource.create({SERVICE_NAME: service_name})
+         resource=Resource.create(
+             {
+                 SERVICE_NAME: "metaflow",
+             }
+         )
      )
      trace_api.set_tracer_provider(tracer_provider)

      span_processor = BatchSpanProcessor(span_exporter)
      tracer_provider.add_span_processor(span_processor)

-     from opentelemetry.instrumentation.requests import RequestsInstrumentor
+     try:
+         from opentelemetry.instrumentation.requests import RequestsInstrumentor

-     RequestsInstrumentor().instrument()
+         RequestsInstrumentor().instrument(
+             tracer_provider=tracer_provider,
+         )
+     except ImportError:
+         pass


  @contextlib.contextmanager
  def post_fork():
      global tracer_provider
+
      tracer_provider = None
      init_tracing()
+
      token = context.attach(extract(os.environ))
      try:
          tracer = trace_api.get_tracer_provider().get_tracer(__name__)
@@ -59,47 +65,27 @@ def post_fork():
              "fork", kind=trace_api.SpanKind.SERVER
          ) as span:
              span.set_attribute("cmd", " ".join(sys.argv))
+             span.set_attribute("pid", str(os.getpid()))
              yield
      finally:
          context.detach(token)


- def _extract_token_after(tokens: List[str], before_token: str) -> Optional[str]:
-     for i, tok in enumerate(tokens):
-         if i > 0 and tokens[i - 1] == before_token:
-             return tok
-
-
- def cli_entrypoint(name: str):
-     def cli_entrypoint_wrap(func):
+ def cli(name: str):
+     def cli_wrap(func):
          @wraps(func)
          def wrapper_func(*args, **kwargs):
              global tracer_provider
-
              init_tracing()

-             assert tracer_provider is not None  # make type checker happy
+             if tracer_provider is None:
+                 return func(*args, **kwargs)

              token = context.attach(extract(os.environ))
              try:
                  tracer = trace_api.get_tracer_provider().get_tracer(__name__)
-
-                 card_subcommand = _extract_token_after(sys.argv, "card")
-
-                 step_name = _extract_token_after(sys.argv, "step")
-                 task_id = _extract_token_after(sys.argv, "--task-id")
-                 run_id = _extract_token_after(sys.argv, "--run-id")
-                 if step_name and task_id and run_id:
-                     better_name = "/".join([run_id, step_name, task_id])
-                 elif card_subcommand:
-                     better_name = "card/" + card_subcommand
-                 elif "run" in sys.argv:
-                     better_name = "run"
-                 else:
-                     better_name = None
-
                  with tracer.start_as_current_span(
-                     better_name or name, kind=trace_api.SpanKind.SERVER
+                     name, kind=trace_api.SpanKind.SERVER
                  ) as span:
                      span.set_attribute("cmd", " ".join(sys.argv))
                      span.set_attribute("pid", str(os.getpid()))
@@ -113,7 +99,7 @@ def cli_entrypoint(name: str):

          return wrapper_func

-     return cli_entrypoint_wrap
+     return cli_wrap


  def inject_tracing_vars(env_dict: Dict[str, str]) -> Dict[str, str]:
@@ -122,23 +108,32 @@ def inject_tracing_vars(env_dict: Dict[str, str]) -> Dict[str, str]:


  def get_trace_id() -> str:
-     return format_trace_id(trace_api.get_current_span().get_span_context().trace_id)
+     try:
+         return format_trace_id(trace_api.get_current_span().get_span_context().trace_id)
+     except Exception:
+         return ""


  @contextlib.contextmanager
- def traced(name, attrs={}):
+ def traced(name: str, attrs: Optional[Dict] = None):
+     if tracer_provider is None:
+         yield
+         return
      tracer = trace_api.get_tracer_provider().get_tracer(__name__)
      with tracer.start_as_current_span(name) as span:
-         for k, v in attrs.items():
-             span.set_attribute(k, v)
+         if attrs:
+             for k, v in attrs.items():
+                 span.set_attribute(k, v)
          yield


  def tracing(func):
      @wraps(func)
      def wrapper_func(*args, **kwargs):
-         tracer = trace_api.get_tracer_provider().get_tracer(func.__module__.__name__)
+         if tracer_provider is None:
+             return func(*args, **kwargs)

+         tracer = trace_api.get_tracer_provider().get_tracer(func.__module__)
          with tracer.start_as_current_span(func.__name__):
              return func(*args, **kwargs)

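With these guards the tracing helpers degrade gracefully when no exporter is configured: `traced` simply yields, `tracing`-wrapped functions run untraced, and `get_trace_id` returns an empty string instead of raising. A minimal sketch of how calling code can use them either way; the span name and attributes are illustrative:

```python
from metaflow import tracing

@tracing.tracing
def fetch_artifacts():
    # Runs inside a span when a tracer is configured, as a plain call otherwise
    return ["model.pkl"]

with tracing.traced("load-data", {"rows": 1000}):
    trace_id = tracing.get_trace_id()  # "" when tracing is disabled
    fetch_artifacts()
```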
metaflow/version.py CHANGED
@@ -1 +1 @@
- metaflow_version = "2.14.0"
+ metaflow_version = "2.14.1"
@@ -1,6 +1,6 @@
  Metadata-Version: 2.2
  Name: metaflow
- Version: 2.14.0
+ Version: 2.14.1
  Summary: Metaflow: More Data Science, Less Engineering
  Author: Metaflow Developers
  Author-email: help@metaflow.org
@@ -26,7 +26,7 @@ License-File: LICENSE
  Requires-Dist: requests
  Requires-Dist: boto3
  Provides-Extra: stubs
- Requires-Dist: metaflow-stubs==2.14.0; extra == "stubs"
+ Requires-Dist: metaflow-stubs==2.14.1; extra == "stubs"
  Dynamic: author
  Dynamic: author-email
  Dynamic: classifier