flowcept 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowcept/commons/daos/docdb_dao/mongodb_dao.py +11 -2
- flowcept/commons/daos/mq_dao/mq_dao_mofka.py +4 -4
- flowcept/commons/flowcept_dataclasses/workflow_object.py +4 -2
- flowcept/configs.py +1 -1
- flowcept/flowcept_api/flowcept_controller.py +1 -0
- flowcept/flowceptor/adapters/base_interceptor.py +13 -5
- flowcept/flowceptor/adapters/dask/dask_interceptor.py +2 -0
- flowcept/flowceptor/consumers/consumer_utils.py +1 -3
- flowcept/flowceptor/consumers/document_inserter.py +3 -1
- flowcept/flowceptor/telemetry_capture.py +6 -3
- flowcept/instrumentation/flowcept_loop.py +5 -5
- flowcept/instrumentation/flowcept_torch.py +1 -1
- flowcept/instrumentation/task_capture.py +9 -6
- flowcept/version.py +1 -1
- {flowcept-0.8.4.dist-info → flowcept-0.8.6.dist-info}/METADATA +29 -15
- {flowcept-0.8.4.dist-info → flowcept-0.8.6.dist-info}/RECORD +19 -19
- resources/sample_settings.yaml +20 -18
- {flowcept-0.8.4.dist-info → flowcept-0.8.6.dist-info}/WHEEL +0 -0
- {flowcept-0.8.4.dist-info → flowcept-0.8.6.dist-info}/licenses/LICENSE +0 -0
flowcept/commons/daos/docdb_dao/mongodb_dao.py
CHANGED

@@ -61,9 +61,18 @@ class MongoDBDAO(DocumentDBDAO):
         self.logger = FlowceptLogger()

         if MONGO_URI is not None:
-            self._client = MongoClient(MONGO_URI)
+            self._client = MongoClient(MONGO_URI,
+                                       maxPoolSize=1000,  # TODO: conf file
+                                       socketTimeoutMS=60000,
+                                       connectTimeoutMS=60000,
+                                       serverSelectionTimeoutMS=60000
+                                       )
         else:
-            self._client = MongoClient(MONGO_HOST, MONGO_PORT)
+            self._client = MongoClient(MONGO_HOST, MONGO_PORT, maxPoolSize=1000,
+                                       socketTimeoutMS=60000,
+                                       connectTimeoutMS=60000,
+                                       serverSelectionTimeoutMS=60000
+                                       )
         self._db = self._client[MONGO_DB]

         self._tasks_collection = self._db["tasks"]
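Note: for readers unfamiliar with these pymongo options, here is a minimal standalone sketch of what the new client construction does (the URI is illustrative; Flowcept takes MONGO_URI or MONGO_HOST/MONGO_PORT from its settings, and the pool size is still hard-coded, as the TODO above notes):

    from pymongo import MongoClient

    # maxPoolSize caps the number of concurrent sockets in the connection pool;
    # the *TimeoutMS options make slow or unreachable servers fail after 60 s
    # instead of blocking indefinitely.
    client = MongoClient(
        "mongodb://localhost:27017",  # assumed URI for illustration
        maxPoolSize=1000,
        socketTimeoutMS=60000,
        connectTimeoutMS=60000,
        serverSelectionTimeoutMS=60000,
    )
    client.admin.command("ping")  # raises within serverSelectionTimeoutMS if no server responds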
flowcept/commons/daos/mq_dao/mq_dao_mofka.py
CHANGED

@@ -65,7 +65,7 @@ class MQDaoMofka(MQDao):

     def _bulk_publish(self, buffer, channel=MQ_CHANNEL, serializer=msgpack.dumps):
         try:
-            self.logger.debug(f"Going to send Message:\n\t[BEGIN_MSG]{buffer}\n[END_MSG]\t")
+            #self.logger.debug(f"Going to send Message:\n\t[BEGIN_MSG]{buffer}\n[END_MSG]\t")
             for m in buffer:
                 self.producer.push(m)

@@ -75,14 +75,14 @@ class MQDaoMofka(MQDao):
             self.logger.error(f"Message that caused error: {buffer}")
         try:
             self.producer.flush()
-            self.logger.info(f"Flushed {len(buffer)} msgs to MQ!")
+            #self.logger.info(f"Flushed {len(buffer)} msgs to MQ!")
         except Exception as e:
             self.logger.exception(e)

     def _bulk_publish_timed(self, buffer, channel=MQ_CHANNEL, serializer=msgpack.dumps):
         total = 0
         try:
-            self.logger.debug(f"Going to send Message:\n\t[BEGIN_MSG]{buffer}\n[END_MSG]\t")
+            #self.logger.debug(f"Going to send Message:\n\t[BEGIN_MSG]{buffer}\n[END_MSG]\t")

             for m in buffer:
                 self.producer.push(m)

@@ -97,7 +97,7 @@ class MQDaoMofka(MQDao):
             self.producer.flush()
             t2 = time()
             self._flush_events.append(["bulk", t1, t2, t2 - t1, total])
-            self.logger.info(f"Flushed {len(buffer)} msgs to MQ!")
+            # self.logger.info(f"Flushed {len(buffer)} msgs to MQ!")
         except Exception as e:
             self.logger.exception(e)
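Note: these hunks silence per-buffer logging on the hot publish path; `_bulk_publish_timed` keeps its timing records instead. A sketch of that pattern, assuming only a producer with push/flush methods as used above:

    from time import time

    flush_events = []  # each entry: [label, t_start, t_end, duration_s, n_msgs]

    def timed_bulk_publish(producer, buffer):
        # Record wall-clock time around the flush rather than logging every message.
        t1 = time()
        for m in buffer:
            producer.push(m)
        producer.flush()
        t2 = time()
        flush_events.append(["bulk", t1, t2, t2 - t1, len(buffer)])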
flowcept/commons/flowcept_dataclasses/workflow_object.py
CHANGED

@@ -1,4 +1,4 @@
-"""Workflow module."""
+"""Workflow Object module."""

 from typing import Dict, AnyStr, List
 import msgpack

@@ -12,6 +12,7 @@ from flowcept.configs import (
     SYS_NAME,
     EXTRA_METADATA,
     ENVIRONMENT_ID,
+    SETTINGS_PATH,
 )


@@ -23,6 +24,7 @@ class WorkflowObject:
     workflow_id: AnyStr = None
     parent_workflow_id: AnyStr = None
     machine_info: Dict = None
+    conf: Dict = None
     flowcept_settings: Dict = None
     flowcept_version: AnyStr = None
     utc_timestamp: float = None

@@ -70,7 +72,7 @@ class WorkflowObject:
         """Enrich it."""
         self.utc_timestamp = get_utc_now()
         self.flowcept_settings = OmegaConf.to_container(settings) if isinstance(settings, DictConfig) else settings
-
+        self.conf = {"settings_path": SETTINGS_PATH}
         if adapter_key is not None:
             # TODO :base-interceptor-refactor: :code-reorg: :usability:
             # revisit all times we assume settings is not none
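Note: the new `conf` field records which settings file produced the captured workflow. A hypothetical usage sketch (the enrich call and its effect are inferred from the hunk above, not from the package docs):

    from flowcept.commons.flowcept_dataclasses.workflow_object import WorkflowObject

    wf = WorkflowObject()
    wf.enrich()     # assumed call; fills utc_timestamp, flowcept_settings, and now conf
    print(wf.conf)  # e.g., {"settings_path": "/absolute/path/to/settings.yaml"}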
flowcept/configs.py
CHANGED

@@ -146,7 +146,7 @@ PERF_LOG = settings["project"].get("performance_logging", False)
 JSON_SERIALIZER = settings["project"].get("json_serializer", "default")
 REPLACE_NON_JSON_SERIALIZABLE = settings["project"].get("replace_non_json_serializable", True)
 ENRICH_MESSAGES = settings["project"].get("enrich_messages", True)
-
+

 TELEMETRY_CAPTURE = settings.get("telemetry_capture", None)
flowcept/flowceptor/adapters/base_interceptor.py
CHANGED

@@ -8,7 +8,7 @@ from flowcept.commons.flowcept_dataclasses.workflow_object import (
     WorkflowObject,
 )
 from flowcept.configs import (
-    ENRICH_MESSAGES,
+    ENRICH_MESSAGES, INSTRUMENTATION,
 )
 from flowcept.commons.flowcept_logger import FlowceptLogger
 from flowcept.commons.daos.mq_dao.mq_dao_base import MQDao

@@ -49,15 +49,23 @@ class BaseInterceptor(object):
         elif kind in "dask":
             # This is dask's client interceptor. We essentially use it to store the dask workflow.
             # That's why we don't need another special interceptor and we can reuse the instrumentation one.
-
-
-            return InstrumentationInterceptor.get_instance()
+            return BaseInterceptor._build_instrumentation_interceptor()
         elif kind == "instrumentation":
+            return BaseInterceptor._build_instrumentation_interceptor()
+        else:
+            raise NotImplementedError
+
+    @staticmethod
+    def _build_instrumentation_interceptor():
+        # By using singleton, we lose the thread safety for the Interceptor, particularly, its MQ buffer.
+        # Since some use cases need threads, this allows disabling the singleton for more thread safety.
+        is_singleton = INSTRUMENTATION.get("singleton", True)
+        if is_singleton:
             from flowcept.flowceptor.adapters.instrumentation_interceptor import InstrumentationInterceptor

             return InstrumentationInterceptor.get_instance()
         else:
-
+            return BaseInterceptor(kind="instrumentation")

     def __init__(self, plugin_key=None, kind=None):
         self.logger = FlowceptLogger()
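Note: the new toggle is read from the `instrumentation` section of the settings file. A short sketch of the resulting behavior, with an illustrative settings dict (the real INSTRUMENTATION dict is loaded by flowcept/configs.py):

    # Illustrative shape of the INSTRUMENTATION config (see resources/sample_settings.yaml below):
    INSTRUMENTATION = {"enabled": True, "singleton": False}

    # With singleton: true (the default), requesting an instrumentation interceptor
    # returns the shared InstrumentationInterceptor; with singleton: false, each
    # request constructs a fresh BaseInterceptor(kind="instrumentation"), so
    # concurrent threads get private MQ buffers.
    is_singleton = INSTRUMENTATION.get("singleton", True)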
flowcept/flowceptor/adapters/dask/dask_interceptor.py
CHANGED

@@ -80,6 +80,8 @@ class DaskWorkerInterceptor(BaseInterceptor):
             self._generated_workflow_id = True
         super().start(bundle_exec_id=self._worker.scheduler.address)

+        self._worker._interceptor = self
+
         instrumentation = INSTRUMENTATION.get("enabled", False)
         if instrumentation:
             InstrumentationInterceptor.get_instance().start(
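Note: storing the interceptor on the worker object makes it reachable from code running inside Dask tasks. A hypothetical sketch (`get_worker` is Dask's accessor; the `_interceptor` attribute is what this hunk adds):

    from distributed import get_worker

    def my_task(x):
        # Inside a running Dask task, fetch the interceptor the worker-side
        # plugin attached to the worker (attribute added in this release).
        interceptor = getattr(get_worker(), "_interceptor", None)
        return x * 2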
flowcept/flowceptor/consumers/consumer_utils.py
CHANGED

@@ -45,13 +45,11 @@ def curate_task_msg(task_msg_dict: dict, convert_times=True):
         task_msg_dict["workflow_id"] = task_msg_dict["used"].pop("workflow_id")

     if convert_times:
-        has_time_fields = False
         for time_field in TaskObject.get_time_field_names():
             if time_field in task_msg_dict:
-                has_time_fields = True
                 task_msg_dict[time_field] = datetime.fromtimestamp(task_msg_dict[time_field], pytz.utc)

-        if not has_time_fields:
+        if "registered_at" not in task_msg_dict:
             task_msg_dict["registered_at"] = datetime.fromtimestamp(time(), pytz.utc)
flowcept/flowceptor/consumers/document_inserter.py
CHANGED

@@ -77,6 +77,7 @@ class DocumentInserter:
             flush_interval=INSERTION_BUFFER_TIME,
         )

+
     def _set_buffer_size(self):
         if not ADAPTIVE_DB_BUFFER_SIZE:
             return

@@ -211,10 +212,11 @@ class DocumentInserter:
         return True

     def stop(self, bundle_exec_id=None):
-        """Stop
+        """Stop document inserter."""
         if self.check_safe_stops:
             trial = 0
             while not self._mq_dao.all_time_based_threads_ended(bundle_exec_id):
+                self.logger.debug(f"# time_based_threads for bundle_exec_id {bundle_exec_id} is {self._mq_dao._keyvalue_dao.set_count(bundle_exec_id)}")
                 trial += 1
                 self.logger.info(
                     f"Doc Inserter {id(self)}: It's still not safe to stop DocInserter. "
flowcept/flowceptor/telemetry_capture.py
CHANGED

@@ -119,8 +119,8 @@ class GPUCapture:
         if "name" in gpu_conf:
             flowcept_gpu_info["name"] = nvmlDeviceGetName(device)

-        if "uuid" in gpu_conf:
-            flowcept_gpu_info["uuid"] = nvmlDeviceGetUUID(device)
+        if "id" in gpu_conf:
+            flowcept_gpu_info["id"] = nvmlDeviceGetUUID(device)

         return flowcept_gpu_info

@@ -160,7 +160,6 @@ class GPUCapture:
         }
         if "others" in gpu_conf:
             flowcept_gpu_info["others"] = {
-                "uuid": amdsmi_get_gpu_device_uuid(device),
                 "current_gfxclk": all_metrics["current_gfxclk"],
                 "current_socclk": all_metrics["current_socclk"],
                 "current_uclk": all_metrics["current_uclk"],

@@ -168,6 +167,9 @@ class GPUCapture:
                 "current_dclk0": all_metrics["current_dclk0"],
             }

+        if "id" in gpu_conf:
+            flowcept_gpu_info["id"] = amdsmi_get_gpu_device_uuid(device)
+
         return flowcept_gpu_info

@@ -193,6 +195,7 @@ elif GPUCapture.GPU_VENDOR == "nvidia":
     nvmlDeviceGetTemperature,
     nvmlDeviceGetPowerUsage,
     NVML_TEMPERATURE_GPU,
+    nvmlDeviceGetUUID,
 )

 FlowceptLogger().debug("Imported Nvidia modules!")
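Note: the GPU UUID is now reported under a vendor-neutral "id" key instead of a vendor-specific "uuid" entry. A minimal standalone sketch of the NVIDIA call path imported above (requires the nvidia-ml-py package and an NVIDIA GPU):

    from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetUUID

    nvmlInit()
    device = nvmlDeviceGetHandleByIndex(0)
    print(nvmlDeviceGetUUID(device))  # the value Flowcept now stores under "id"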
flowcept/instrumentation/flowcept_loop.py
CHANGED

@@ -156,13 +156,13 @@ class FlowceptLoop:
             "used": {"i": self._next_counter, self._item_name: self._current_item},
             "parent_task_id": self._parent_task_id,
         }
-        tel = FlowceptLoop._interceptor.telemetry_capture.capture()
-        if tel:
-            iteration_task["telemetry_at_start"] = tel.to_dict()
         return iteration_task

-    def _end_iteration_task(self,
-
+    def _end_iteration_task(self, _):
+        self._last_iteration_task["status"] = Status.FINISHED.value
+        tel = FlowceptLoop._interceptor.telemetry_capture.capture()
+        if tel:
+            self._last_iteration_task["telemetry_at_end"] = tel.to_dict()
         FlowceptLoop._interceptor.intercept(self._last_iteration_task)

     def _do_nothing_in_end_iter(self, *args, **kwargs):
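Note: with this change, loop iterations sample telemetry once, at iteration end, instead of at both ends. A hypothetical usage sketch of FlowceptLoop (the constructor arguments are assumed from the field names in the hunk above):

    from flowcept import Flowcept
    from flowcept.instrumentation.flowcept_loop import FlowceptLoop

    with Flowcept(workflow_name="loop_example"):  # assumed entry point
        for epoch in FlowceptLoop(range(3), loop_name="epochs", item_name="epoch"):
            pass  # each iteration is captured as a task with status and telemetry_at_end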
flowcept/instrumentation/flowcept_torch.py
CHANGED

@@ -17,7 +17,6 @@ from flowcept.commons.flowcept_dataclasses.workflow_object import (
 )
 from flowcept.commons.vocabulary import Status
 from flowcept.configs import (
-    REGISTER_WORKFLOW,
     INSTRUMENTATION,
     TELEMETRY_CAPTURE,
     REPLACE_NON_JSON_SERIALIZABLE,

@@ -30,6 +29,7 @@ from flowcept.instrumentation.flowcept_task import get_current_context_task_id

 TORCH_CONFIG = INSTRUMENTATION.get("torch")

+REGISTER_WORKFLOW = TORCH_CONFIG.get("register_workflow", True)

 def flowcept_torch(cls):
     """
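Note: REGISTER_WORKFLOW thus moves from a global config key to the torch-specific `register_workflow` key, read with a default of true. A hedged sketch of the class decorator this module provides, with an illustrative model:

    import torch.nn as nn
    from flowcept.instrumentation.flowcept_torch import flowcept_torch

    # With instrumentation.torch.register_workflow: true in the settings file,
    # the decorated model is also registered as a workflow in the database.
    @flowcept_torch
    class MyNet(nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(4, 2)

        def forward(self, x):
            return self.fc(x)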
flowcept/instrumentation/task_capture.py
CHANGED

@@ -49,9 +49,6 @@ class FlowceptTask(object):
     are no-ops, and no data is captured.
     """

-    if INSTRUMENTATION_ENABLED:
-        _interceptor = InstrumentationInterceptor.get_instance()
-
     def __init__(
         self,
         task_id: str = None,

@@ -60,12 +57,18 @@ class FlowceptTask(object):
         activity_id: str = None,
         used: Dict = None,
         custom_metadata: Dict = None,
+        flowcept: 'Flowcept' = None
     ):
         if not INSTRUMENTATION_ENABLED:
             self._ended = True
             return
+        if flowcept is not None and flowcept._interceptor_instances[0].kind == "instrumentation":
+            self._interceptor = flowcept._interceptor_instances[0]
+        else:
+            self._interceptor = InstrumentationInterceptor.get_instance()
+
         self._task = TaskObject()
-        self._task.telemetry_at_start = FlowceptTask._interceptor.telemetry_capture.capture()
+        self._task.telemetry_at_start = self._interceptor.telemetry_capture.capture()
         self._task.activity_id = activity_id
         self._task.started_at = time()
         self._task.task_id = task_id or str(self._task.started_at)

@@ -117,11 +120,11 @@ class FlowceptTask(object):
         """
         if not INSTRUMENTATION_ENABLED:
             return
-        self._task.telemetry_at_end = FlowceptTask._interceptor.telemetry_capture.capture()
+        self._task.telemetry_at_end = self._interceptor.telemetry_capture.capture()
         self._task.ended_at = ended_at or time()
         self._task.status = status
         self._task.stderr = stderr
         self._task.stdout = stdout
         self._task.generated = generated
-        FlowceptTask._interceptor.intercept(self._task.to_dict())
+        self._interceptor.intercept(self._task.to_dict())
         self._ended = True
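Note: a FlowceptTask can now be bound to a specific Flowcept controller via the new `flowcept` argument, instead of always using the process-wide singleton interceptor. A hypothetical usage sketch (constructor and end() keywords are assumed from the hunks above):

    from flowcept import Flowcept
    from flowcept.instrumentation.task_capture import FlowceptTask

    f = Flowcept(workflow_name="example")  # assumed constructor
    f.start()
    t = FlowceptTask(activity_id="square", used={"x": 3}, flowcept=f)  # reuses f's interceptor
    t.end(generated={"y": 9})  # telemetry_at_end is captured and the task is intercepted
    f.stop()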
flowcept/version.py
CHANGED

{flowcept-0.8.4.dist-info → flowcept-0.8.6.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: flowcept
-Version: 0.8.4
+Version: 0.8.6
 Summary: Capture and query workflow provenance data using data observability
 Project-URL: GitHub, https://github.com/ORNL/flowcept
 Author: Oak Ridge National Laboratory

@@ -96,20 +96,34 @@ Description-Content-Type: text/markdown

 # Flowcept

+## Table of Contents
+
+- [Overview](#overview)
+- [Features](#features)
+- [Installation](#installation)
+- [Setup and the Settings File](#setup)
+- [Running with Containers](#running-with-containers)
+- [Examples](#examples)
+- [Data Persistence](#data-persistence)
+- [Performance Tuning](#performance-tuning-for-performance-evaluation)
+- [AMD GPU Setup](#install-amd-gpu-lib)
+
+## Overview
+
 Flowcept is a runtime data integration system that captures and queries workflow provenance with minimal or no code changes. It unifies data across diverse workflows and tools, enabling integrated analysis and insights, especially in federated environments. Designed for scenarios involving critical data from multiple workflows, Flowcept seamlessly integrates data at runtime, providing a unified view for end-to-end monitoring and analysis, and enhanced support for Machine Learning (ML) workflows.

-
-
-- Automatic
-- Data observability
-- Explicit
-- ML data capture in various levels of details: workflow, model fitting or evaluation task, epoch iteration, layer forwarding
-- ML model management
-- Adapter-based, loosely-coupled system architecture, making it easy to plug and play with different data processing systems and backend database (e.g., MongoDB) or MQ services (e.g., Redis, Kafka)
-- Low-overhead focused system architecture, to avoid adding performance overhead particularly to workloads that run on HPC machines
-- Telemetry data capture (e.g., CPU, GPU, Memory consumption) linked to the application dataflow
-- Highly customizable to multiple use cases, enabling easy toggle between settings (e.g., with/without provenance capture; with/without telemetry and which telemetry type to capture; which adapters or backend services to run with)
-- [W3C PROV](https://www.w3.org/TR/prov-overview/) adherence
+## Features
+
+- Automatic workflow provenance data capture from heterogeneous workflows
+- Data observability with no or minimal intrusion to application workflows
+- Explicit application instrumentation, if this is preferred over data observability
+- ML data capture in various levels of details: workflow, model fitting or evaluation task, epoch iteration, layer forwarding
+- ML model management (e.g., model storage and retrieval, along with their metadata and provenance)
+- Adapter-based, loosely-coupled system architecture, making it easy to plug and play with different data processing systems and backend database (e.g., MongoDB) or MQ services (e.g., Redis, Kafka)
+- Low-overhead focused system architecture, to avoid adding performance overhead particularly to workloads that run on HPC machines
+- Telemetry data capture (e.g., CPU, GPU, Memory consumption) linked to the application dataflow
+- Highly customizable to multiple use cases, enabling easy toggle between settings (e.g., with/without provenance capture; with/without telemetry and which telemetry type to capture; which adapters or backend services to run with)
+- [W3C PROV](https://www.w3.org/TR/prov-overview/) adherence

 Notes:

@@ -192,7 +206,8 @@ To use Flowcept, one needs to start a MQ system `$> make services`. This will st

 ### Flowcept Settings File

-Flowcept requires a settings file for configuration.
+Flowcept requires a settings file for configuration.
+You can find an example configuration file [here](resources/sample_settings.yaml), with documentation for each parameter provided as inline comments.

 #### What You Can Configure:

@@ -214,7 +229,6 @@ export FLOWCEPT_SETTINGS_PATH=/absolute/path/to/your/settings.yaml

 If this variable is not set, Flowcept will use the default values from the [example](resources/sample_settings.yaml) file.

-
 # Running with Containers

 To use containers instead of installing Flowcept's dependencies on your host system, we provide a [Dockerfile](deployment/Dockerfile) alongside a [docker-compose.yml](deployment/compose.yml) for dependent services (e.g., Redis, MongoDB).
{flowcept-0.8.4.dist-info → flowcept-0.8.6.dist-info}/RECORD
CHANGED

@@ -1,6 +1,6 @@
 flowcept/__init__.py,sha256=CukmdzTUvm6Y_plTKPq4kKn7w9LdR36j7V_C_UQyjhU,2011
-flowcept/configs.py,sha256=
-flowcept/version.py,sha256=
+flowcept/configs.py,sha256=NDUAqqoKfztt6Qjwxy95eTQU71AovVJWXalI1x3HJ7Y,7441
+flowcept/version.py,sha256=3Xeg2e7hNlxinsNn_IjT041Xp9p4RX_yIks40uVvSJM,306
 flowcept/analytics/__init__.py,sha256=46q-7vsHq_ddPNrzNnDgEOiRgvlx-5Ggu2ocyROMV0w,641
 flowcept/analytics/analytics_utils.py,sha256=FRJdBtQa7Hrk2oR_FFhmhmMf3X6YyZ4nbH5RIYh7KL4,8753
 flowcept/analytics/data_augmentation.py,sha256=Dyr5x316Zf-k1e8rVoQMCpFOrklYVHjfejRPrtoycmc,1641

@@ -17,20 +17,20 @@ flowcept/commons/daos/keyvalue_dao.py,sha256=03xHhQIfZas0LQLP1DbGJ5DoskXyZNXQKIN
 flowcept/commons/daos/docdb_dao/__init__.py,sha256=qRvXREeUJ4mkhxdC9bzpOsVX6M2FB5hDyLFxhMxTGhs,30
 flowcept/commons/daos/docdb_dao/docdb_dao_base.py,sha256=YbfSVJPwZGK2GBYkeapRC83HkmP0c6Msv5TriD88RcI,11812
 flowcept/commons/daos/docdb_dao/lmdb_dao.py,sha256=dJOLgCx_lwdz6MKiMpM_UE4rm0angDCPaVz_WU5KqIA,10407
-flowcept/commons/daos/docdb_dao/mongodb_dao.py,sha256=
+flowcept/commons/daos/docdb_dao/mongodb_dao.py,sha256=WGjlB_8fIsMPRx7LXPj52Uexuh-E69LKsW8vN_zcpPU,38568
 flowcept/commons/daos/mq_dao/__init__.py,sha256=Xxm4FmbBUZDQ7XIAmSFbeKE_AdHsbgFmSuftvMWSykQ,21
 flowcept/commons/daos/mq_dao/mq_dao_base.py,sha256=EAqOhy7Q8V29JFDG8C50nRK34KsPxEICkG4elk4ZfX8,9020
 flowcept/commons/daos/mq_dao/mq_dao_kafka.py,sha256=bf-bZvWw9JJk8Kdfzx2UkAnQC95rSrKXDEyYkrcncOk,4400
-flowcept/commons/daos/mq_dao/mq_dao_mofka.py,sha256=
+flowcept/commons/daos/mq_dao/mq_dao_mofka.py,sha256=_9KlJwUuFKgTB8ZPQQSz4yQSe6oYKb0vrhm3XIT8jPw,3945
 flowcept/commons/daos/mq_dao/mq_dao_redis.py,sha256=Br97SoDIkt4dHH937Yjg3wtkn1xGT-x9t-8E3VD5TeU,4277
 flowcept/commons/flowcept_dataclasses/__init__.py,sha256=8KkiJh0WSRAB50waVluxCSI8Tb9X1L9nup4c8RN3ulc,30
 flowcept/commons/flowcept_dataclasses/base_settings_dataclasses.py,sha256=Cjw2PGYtZDfnwecz6G3S42Ncmxj7AIZVEBx05bsxRUo,399
 flowcept/commons/flowcept_dataclasses/task_object.py,sha256=3DD5ZNMz7EVILS9PRkQ3khboav7lIKoUC5W6sKMFauQ,4694
 flowcept/commons/flowcept_dataclasses/telemetry.py,sha256=9_5ONCo-06r5nKHXmi5HfIhiZSuPgmTECiq_u9MlxXM,2822
-flowcept/commons/flowcept_dataclasses/workflow_object.py,sha256=
+flowcept/commons/flowcept_dataclasses/workflow_object.py,sha256=f8aB0b3xcUr3KQTlloF7R_P6xQejzDPOm-s6dLhGMeA,4383
 flowcept/flowcept_api/__init__.py,sha256=T1ty86YlocQ5Z18l5fUqHj_CC6Unq_iBv0lFyiI7Ao8,22
 flowcept/flowcept_api/db_api.py,sha256=hKXep-n50rp9cAzV0ljk2QVEF8O64yxi3ujXv5_Ibac,9723
-flowcept/flowcept_api/flowcept_controller.py,sha256=
+flowcept/flowcept_api/flowcept_controller.py,sha256=lkHR7O0zAAfbGtVa4o9tjZMdZquYN7vdnymRKzc4B8s,11933
 flowcept/flowcept_api/task_query_api.py,sha256=SrwB0OCVtbpvCPECkE2ySM10G_g8Wlk5PJ8h-0xEaNc,23821
 flowcept/flowcept_webserver/__init__.py,sha256=8411GIXGddKTKoHUvbo_Rq6svosNG7tG8VzvUEBd7WI,28
 flowcept/flowcept_webserver/app.py,sha256=VUV8_JZbIbx9u_1O7m7XtRdhZb_7uifUa-iNlPhmZws,658

@@ -38,14 +38,14 @@ flowcept/flowcept_webserver/resources/__init__.py,sha256=XOk5yhLeLU6JmVXxbl3TY2z
 flowcept/flowcept_webserver/resources/query_rsrc.py,sha256=Mk1XDC_wVYkMk0eaazqWWrTC07gQU9U0toKfip0ihZE,1353
 flowcept/flowcept_webserver/resources/task_messages_rsrc.py,sha256=0u68it2W-9NzUUx5fWOZCqvRKe5EsLI8oyvto9634Ng,666
 flowcept/flowceptor/__init__.py,sha256=wVxRXUv07iNx6SMRRma2vqhR_GIcRl0re_WCYG65PUs,29
-flowcept/flowceptor/telemetry_capture.py,sha256=
+flowcept/flowceptor/telemetry_capture.py,sha256=wSXyQJ-vPVzeldD4KqoLQA2rg7V0EOQo_11ErJE5oQQ,13743
 flowcept/flowceptor/adapters/__init__.py,sha256=SuZbSZVVQeBJ9zXW-M9jF09dw3XIjre3lSGrUO1Y8Po,27
-flowcept/flowceptor/adapters/base_interceptor.py,sha256=
+flowcept/flowceptor/adapters/base_interceptor.py,sha256=5guTHuuFpB7PYk9tQopdak5MBxcPMnsxDiZVUcEvJks,6483
 flowcept/flowceptor/adapters/instrumentation_interceptor.py,sha256=DhK2bBnpghqPSeA62BUqRg6pl8zxuYrP33dK4x6PhRE,733
 flowcept/flowceptor/adapters/interceptor_state_manager.py,sha256=xRzmi5YFKBEqNtX8F5s6XlMTRe27ml4BmQtBO4WtG2c,919
 flowcept/flowceptor/adapters/dask/__init__.py,sha256=GKreb5L_nliD2BEckyB943zOQ-b6Gn1fLDj81FqSK2Y,23
 flowcept/flowceptor/adapters/dask/dask_dataclasses.py,sha256=6LTG-kdcc6AUuVINvkqB5QHw6pchg1aMqj0sdWt2Ef8,580
-flowcept/flowceptor/adapters/dask/dask_interceptor.py,sha256=
+flowcept/flowceptor/adapters/dask/dask_interceptor.py,sha256=uBQpLluYXzlT1gBDfTe4_WueC_fWBEs5Xr8ntpOmljE,5869
 flowcept/flowceptor/adapters/dask/dask_plugins.py,sha256=s1ENAi9N61PC_6RiFvOYhJsgWzSm_lFWm3w87V-R1YY,2473
 flowcept/flowceptor/adapters/mlflow/__init__.py,sha256=3mzHrvh1XQOy68qx1A3so9Nq27tIb0i2mSXfv3F6gZg,25
 flowcept/flowceptor/adapters/mlflow/interception_event_handler.py,sha256=-SsIRdOcZjQUTzWgsZ41ouqpla4Qd32jIWXIAGU1pPw,494

@@ -59,15 +59,15 @@ flowcept/flowceptor/adapters/zambeze/__init__.py,sha256=1e9_hK2cUKDXhQ0kBRftwcJj
 flowcept/flowceptor/adapters/zambeze/zambeze_dataclasses.py,sha256=nn9MxvcdzgmOa8n5Jwdl7UzlSzxEu9bA-Ls6cHyb91c,849
 flowcept/flowceptor/adapters/zambeze/zambeze_interceptor.py,sha256=Bjyi48JW0DXJLJuvwPxaD8zxxsSoEFgSoXl8YcbwFWk,3782
 flowcept/flowceptor/consumers/__init__.py,sha256=foxtVEb2ZEe9g1slfYIKM4tIFv-He1l7XS--SYs7nlQ,28
-flowcept/flowceptor/consumers/consumer_utils.py,sha256=
-flowcept/flowceptor/consumers/document_inserter.py,sha256=
+flowcept/flowceptor/consumers/consumer_utils.py,sha256=7bvFJWusJkfA4j0gwZLDIIsIOyfk9wRq6s5liS3JAV0,5665
+flowcept/flowceptor/consumers/document_inserter.py,sha256=m9B57j6C4vdwSZhbafgtqHKTvi4YMttmv-iShW5MYqs,9508
 flowcept/instrumentation/__init__.py,sha256=M5bTmg80E4QyN91gUX3qfw_nbtJSXwGWcKxdZP3vJz0,34
-flowcept/instrumentation/flowcept_loop.py,sha256=
+flowcept/instrumentation/flowcept_loop.py,sha256=RvETm3Pn37dIw_a1RXigyh2U7MCBHqi46dPmbrz3RMQ,12171
 flowcept/instrumentation/flowcept_task.py,sha256=l_BAYEUZ_SeBt8QJN_E9D9QcZVYRnW9qO_XRnqvmePE,5993
-flowcept/instrumentation/flowcept_torch.py,sha256=
-flowcept/instrumentation/task_capture.py,sha256=
-resources/sample_settings.yaml,sha256=
-flowcept-0.8.4.dist-info/METADATA,sha256=
-flowcept-0.8.4.dist-info/WHEEL,sha256=
-flowcept-0.8.4.dist-info/licenses/LICENSE,sha256=
-flowcept-0.8.4.dist-info/RECORD,,
+flowcept/instrumentation/flowcept_torch.py,sha256=BgJ0eOeCGOe0hjbyDQf-bHK_kFOvj9iSXC4PI1EikXE,23436
+flowcept/instrumentation/task_capture.py,sha256=DdudnrjiXS6EGsirdykMLi1rFFHWkJMX1C7uibjsjL4,4944
+resources/sample_settings.yaml,sha256=aKeHf8895vrHIbi0QS1w2WT5n8ZNI9Ep5PVPF5Y5MEQ,4957
+flowcept-0.8.6.dist-info/METADATA,sha256=nzqP4fruoqw-264-XNXJ0rlR_UfwsmqmmOW_J0GPYRM,18086
+flowcept-0.8.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+flowcept-0.8.6.dist-info/licenses/LICENSE,sha256=r5-2P6tFTuRGWT5TiX32s1y0tnp4cIqBEC1QjTaXe2k,1086
+flowcept-0.8.6.dist-info/RECORD,,
resources/sample_settings.yaml
CHANGED

@@ -1,19 +1,20 @@
+flowcept_version: 0.8.0 # Version of the Flowcept package. This setting file is compatible with this version.
+
 project:
-  debug: true
-  json_serializer: default # or complex. If "complex", Flowcept will deal with complex python dicts that may contain JSON unserializable values
-  replace_non_json_serializable: true
-  performance_logging: false
-
-
-  db_flush_mode: online # or offline
+  debug: true # Toggle debug mode. This will add a property `debug: true` to all saved data, making it easier to retrieve/delete them later.
+  json_serializer: default # JSON serialization mode: default or complex. If "complex", Flowcept will deal with complex python dicts that may contain JSON unserializable values
+  replace_non_json_serializable: true # Replace values that can't be JSON serialized
+  performance_logging: false # Enable performance logging if true. Particularly useful for MQ flushes.
+  enrich_messages: true # Add extra metadata to task messages, such as IP addresses and UTC timestamps.
+  db_flush_mode: online # Mode for flushing DB entries: "online" or "offline". If online, flushes to the DB will happen before the workflow ends.

 log:
-  log_path: "default"
-  log_file_level: error # use
-  log_stream_level: error
+  log_path: "default" # Path for log file output; "default" will write the log in the directory where the main executable is running from.
+  log_file_level: error # Logging level (error, debug, info, critical) for file logs; use "disable" to turn off.
+  log_stream_level: error # Logging level (error, debug, info, critical) for console/stream logs; use "disable" to turn off.

-telemetry_capture:
-  gpu: ~ # ~ means None. This is a list with GPU metrics. AMD=[activity,used,power,temperature,others]; NVIDIA=[used,temperature,power,name,
+telemetry_capture: # This toggles each individual type of telemetry capture. GPU capture is treated different depending on the vendor (AMD or NVIDIA).
+  gpu: ~ # ~ means None. This is a list with GPU metrics. AMD=[activity,used,power,temperature,others,id]; NVIDIA=[used,temperature,power,name,id]
   cpu: true
   per_cpu: true
   process_info: true

@@ -23,17 +24,18 @@ telemetry_capture:
   machine_info: true

 instrumentation:
-  enabled: true
+  enabled: true # This toggles data capture for instrumentation.
+  singleton: true # Use a single instrumentation instance per process. Defaults to true
   torch:
-    what: parent_and_children # parent_only, parent_and_children, ~
-    children_mode: telemetry_and_tensor_inspection # tensor_inspection, telemetry, telemetry_and_tensor_inspection
+    what: parent_and_children # Scope of instrumentation: "parent_only" -- will capture only at the main model level, "parent_and_children" -- will capture the inner layers, or ~ (disable).
+    children_mode: telemetry_and_tensor_inspection # What to capture if parent_and_children is chosen in the scope. Possible values: "tensor_inspection" (i.e., tensor metadata), "telemetry", "telemetry_and_tensor_inspection"
     epoch_loop: lightweight # lightweight, ~ (disable), or default (default will use the default telemetry capture method)
     batch_loop: lightweight # lightweight, ~ (disable), or default (default will use the default telemetry capture method)
-    capture_epochs_at_every: 1 #epochs; please use a value that is multiple of #epochs
-    #
+    capture_epochs_at_every: 1 # Will capture data at every N epochs; please use a value that is multiple of the total number of #epochs.
+    register_workflow: true # Will store the parent model forward as a workflow itself in the database.

 experiment:
-  user: root
+  user: root # Optionally identify the user running the experiment. The logged username will be captured anyways.

 mq:
   type: redis # or kafka or mofka; Please adjust the port (kafka's default is 9092; redis is 6379). If mofka, adjust the group_file.
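Note: as the METADATA section above documents, a custom copy of this file is selected through the FLOWCEPT_SETTINGS_PATH environment variable. A minimal sketch of switching the configuration, under the assumption (suggested by the configs.py hunk) that SETTINGS_PATH is resolved when flowcept is first imported:

    import os

    os.environ["FLOWCEPT_SETTINGS_PATH"] = "/absolute/path/to/your/settings.yaml"

    import flowcept  # the settings file is read at import time under this assumption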
{flowcept-0.8.4.dist-info → flowcept-0.8.6.dist-info}/WHEEL
File without changes

{flowcept-0.8.4.dist-info → flowcept-0.8.6.dist-info}/licenses/LICENSE
File without changes