flowcept 0.8.4__py3-none-any.whl → 0.8.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
flowcept/commons/daos/docdb_dao/mongodb_dao.py CHANGED
@@ -61,9 +61,18 @@ class MongoDBDAO(DocumentDBDAO):
         self.logger = FlowceptLogger()
 
         if MONGO_URI is not None:
-            self._client = MongoClient(MONGO_URI)
+            self._client = MongoClient(MONGO_URI,
+                                       maxPoolSize=1000,  # TODO: conf file
+                                       socketTimeoutMS=60000,
+                                       connectTimeoutMS=60000,
+                                       serverSelectionTimeoutMS=60000
+                                       )
         else:
-            self._client = MongoClient(MONGO_HOST, MONGO_PORT)
+            self._client = MongoClient(MONGO_HOST, MONGO_PORT, maxPoolSize=1000,
+                                       socketTimeoutMS=60000,
+                                       connectTimeoutMS=60000,
+                                       serverSelectionTimeoutMS=60000
+                                       )
         self._db = self._client[MONGO_DB]
 
         self._tasks_collection = self._db["tasks"]
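The new connection options are hard-coded, with a `# TODO: conf file` note. A minimal sketch of what hoisting them into the settings file could look like — the `mongodb.*` keys are hypothetical (they are not in the released sample_settings.yaml); the keyword arguments are standard pymongo `MongoClient` options:

```python
from pymongo import MongoClient

# Hypothetical settings block; flowcept's real settings object comes from
# its YAML file, and these key names are illustrative only.
settings = {"mongodb": {"uri": "mongodb://localhost:27017"}}

mongo_conf = settings.get("mongodb", {})
client = MongoClient(
    mongo_conf.get("uri", "mongodb://localhost:27017"),
    maxPoolSize=mongo_conf.get("max_pool_size", 1000),
    socketTimeoutMS=mongo_conf.get("socket_timeout_ms", 60000),
    connectTimeoutMS=mongo_conf.get("connect_timeout_ms", 60000),
    serverSelectionTimeoutMS=mongo_conf.get("server_selection_timeout_ms", 60000),
)
```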
flowcept/commons/daos/mq_dao/mq_dao_mofka.py CHANGED
@@ -65,7 +65,7 @@ class MQDaoMofka(MQDao):
 
     def _bulk_publish(self, buffer, channel=MQ_CHANNEL, serializer=msgpack.dumps):
         try:
-            self.logger.debug(f"Going to send Message:\n\t[BEGIN_MSG]{buffer}\n[END_MSG]\t")
+            #self.logger.debug(f"Going to send Message:\n\t[BEGIN_MSG]{buffer}\n[END_MSG]\t")
             for m in buffer:
                 self.producer.push(m)
 
@@ -75,14 +75,14 @@ class MQDaoMofka(MQDao):
             self.logger.error(f"Message that caused error: {buffer}")
         try:
             self.producer.flush()
-            self.logger.info(f"Flushed {len(buffer)} msgs to MQ!")
+            #self.logger.info(f"Flushed {len(buffer)} msgs to MQ!")
         except Exception as e:
             self.logger.exception(e)
 
     def _bulk_publish_timed(self, buffer, channel=MQ_CHANNEL, serializer=msgpack.dumps):
         total = 0
         try:
-            self.logger.debug(f"Going to send Message:\n\t[BEGIN_MSG]{buffer}\n[END_MSG]\t")
+            #self.logger.debug(f"Going to send Message:\n\t[BEGIN_MSG]{buffer}\n[END_MSG]\t")
 
             for m in buffer:
                 self.producer.push(m)
@@ -97,7 +97,7 @@ class MQDaoMofka(MQDao):
             self.producer.flush()
             t2 = time()
             self._flush_events.append(["bulk", t1, t2, t2 - t1, total])
-            self.logger.info(f"Flushed {len(buffer)} msgs to MQ!")
+            # self.logger.info(f"Flushed {len(buffer)} msgs to MQ!")
         except Exception as e:
             self.logger.exception(e)
 
flowcept/commons/flowcept_dataclasses/workflow_object.py CHANGED
@@ -1,4 +1,4 @@
-"""Workflow module."""
+"""Workflow Object module."""
 
 from typing import Dict, AnyStr, List
 import msgpack
@@ -12,6 +12,7 @@ from flowcept.configs import (
     SYS_NAME,
     EXTRA_METADATA,
     ENVIRONMENT_ID,
+    SETTINGS_PATH,
 )
 
 
@@ -23,6 +24,7 @@ class WorkflowObject:
     workflow_id: AnyStr = None
     parent_workflow_id: AnyStr = None
     machine_info: Dict = None
+    conf: Dict = None
     flowcept_settings: Dict = None
     flowcept_version: AnyStr = None
     utc_timestamp: float = None
@@ -70,7 +72,7 @@ class WorkflowObject:
         """Enrich it."""
         self.utc_timestamp = get_utc_now()
         self.flowcept_settings = OmegaConf.to_container(settings) if isinstance(settings, DictConfig) else settings
-
+        self.conf = {"settings_path": SETTINGS_PATH}
         if adapter_key is not None:
             # TODO :base-interceptor-refactor: :code-reorg: :usability:
             # revisit all times we assume settings is not none
flowcept/configs.py CHANGED
@@ -146,7 +146,7 @@ PERF_LOG = settings["project"].get("performance_logging", False)
 JSON_SERIALIZER = settings["project"].get("json_serializer", "default")
 REPLACE_NON_JSON_SERIALIZABLE = settings["project"].get("replace_non_json_serializable", True)
 ENRICH_MESSAGES = settings["project"].get("enrich_messages", True)
-REGISTER_WORKFLOW = settings["project"].get("register_workflow", True)
+
 
 TELEMETRY_CAPTURE = settings.get("telemetry_capture", None)
 
flowcept/flowcept_api/flowcept_controller.py CHANGED
@@ -262,6 +262,7 @@ class Flowcept(object):
         if not MQDao.build().liveness_test():
             logger.error("MQ Not Ready!")
             return False
+
         if MONGO_ENABLED:
             from flowcept.commons.daos.docdb_dao.mongodb_dao import MongoDBDAO
 
flowcept/flowceptor/adapters/base_interceptor.py CHANGED
@@ -8,7 +8,7 @@ from flowcept.commons.flowcept_dataclasses.workflow_object import (
     WorkflowObject,
 )
 from flowcept.configs import (
-    ENRICH_MESSAGES,
+    ENRICH_MESSAGES, INSTRUMENTATION,
 )
 from flowcept.commons.flowcept_logger import FlowceptLogger
 from flowcept.commons.daos.mq_dao.mq_dao_base import MQDao
@@ -49,15 +49,23 @@ class BaseInterceptor(object):
         elif kind in "dask":
             # This is dask's client interceptor. We essentially use it to store the dask workflow.
             # That's why we don't need another special interceptor and we can reuse the instrumentation one.
-            from flowcept.flowceptor.adapters.instrumentation_interceptor import InstrumentationInterceptor
-
-            return InstrumentationInterceptor.get_instance()
+            return BaseInterceptor._build_instrumentation_interceptor()
         elif kind == "instrumentation":
+            return BaseInterceptor._build_instrumentation_interceptor()
+        else:
+            raise NotImplementedError
+
+    @staticmethod
+    def _build_instrumentation_interceptor():
+        # By using singleton, we lose the thread safety for the Interceptor, particularly, its MQ buffer.
+        # Since some use cases need threads, this allows disabling the singleton for more thread safety.
+        is_singleton = INSTRUMENTATION.get("singleton", True)
+        if is_singleton:
             from flowcept.flowceptor.adapters.instrumentation_interceptor import InstrumentationInterceptor
 
             return InstrumentationInterceptor.get_instance()
         else:
-            raise NotImplementedError
+            return BaseInterceptor(kind="instrumentation")
 
     def __init__(self, plugin_key=None, kind=None):
         self.logger = FlowceptLogger()
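The new `singleton` toggle trades convenience for thread safety: the shared singleton also shares its MQ buffer across threads. A rough sketch of the per-thread pattern the non-singleton branch enables — it relies only on `BaseInterceptor(kind="instrumentation")`, which is exactly what that branch returns; everything else is illustrative:

```python
import threading

from flowcept.flowceptor.adapters.base_interceptor import BaseInterceptor

def worker():
    # With instrumentation.singleton: false, each thread can own its
    # interceptor (and therefore its own MQ buffer).
    interceptor = BaseInterceptor(kind="instrumentation")
    # ... interceptor.start(...) / interceptor.intercept(...) as needed ...

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
```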
flowcept/flowceptor/adapters/dask/dask_interceptor.py CHANGED
@@ -80,6 +80,8 @@ class DaskWorkerInterceptor(BaseInterceptor):
             self._generated_workflow_id = True
         super().start(bundle_exec_id=self._worker.scheduler.address)
 
+        self._worker._interceptor = self
+
         instrumentation = INSTRUMENTATION.get("enabled", False)
         if instrumentation:
             InstrumentationInterceptor.get_instance().start(
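Only the `self._worker._interceptor = self` assignment appears in the diff; one plausible use, sketched here purely as an assumption, is letting code inside a dask task reach the worker's interceptor:

```python
from distributed import get_worker

def my_task(x):
    # get_worker() only resolves inside a task running on a dask worker;
    # _interceptor is the attribute set in DaskWorkerInterceptor.start() above.
    interceptor = getattr(get_worker(), "_interceptor", None)
    return x * 2
```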
flowcept/flowceptor/consumers/consumer_utils.py CHANGED
@@ -45,13 +45,11 @@ def curate_task_msg(task_msg_dict: dict, convert_times=True):
         task_msg_dict["workflow_id"] = task_msg_dict["used"].pop("workflow_id")
 
     if convert_times:
-        has_time_fields = False
        for time_field in TaskObject.get_time_field_names():
            if time_field in task_msg_dict:
-                has_time_fields = True
                task_msg_dict[time_field] = datetime.fromtimestamp(task_msg_dict[time_field], pytz.utc)
 
-        if not has_time_fields:
+        if "registered_at" not in task_msg_dict:
            task_msg_dict["registered_at"] = datetime.fromtimestamp(time(), pytz.utc)
 
 
flowcept/flowceptor/consumers/document_inserter.py CHANGED
@@ -77,6 +77,7 @@ class DocumentInserter:
             flush_interval=INSERTION_BUFFER_TIME,
         )
 
+
     def _set_buffer_size(self):
         if not ADAPTIVE_DB_BUFFER_SIZE:
             return
@@ -211,10 +212,11 @@ class DocumentInserter:
             return True
 
     def stop(self, bundle_exec_id=None):
-        """Stop it."""
+        """Stop document inserter."""
         if self.check_safe_stops:
             trial = 0
             while not self._mq_dao.all_time_based_threads_ended(bundle_exec_id):
+                self.logger.debug(f"# time_based_threads for bundle_exec_id {bundle_exec_id} is {self._mq_dao._keyvalue_dao.set_count(bundle_exec_id)}")
                 trial += 1
                 self.logger.info(
                     f"Doc Inserter {id(self)}: It's still not safe to stop DocInserter. "
flowcept/flowceptor/telemetry_capture.py CHANGED
@@ -119,8 +119,8 @@ class GPUCapture:
         if "name" in gpu_conf:
             flowcept_gpu_info["name"] = nvmlDeviceGetName(device)
 
-        if "ix" in gpu_conf:
-            flowcept_gpu_info["gpu_ix"] = gpu_ix
+        if "id" in gpu_conf:
+            flowcept_gpu_info["id"] = nvmlDeviceGetUUID(device)
 
         return flowcept_gpu_info
 
@@ -160,7 +160,6 @@ class GPUCapture:
         }
         if "others" in gpu_conf:
             flowcept_gpu_info["others"] = {
-                "uuid": amdsmi_get_gpu_device_uuid(device),
                 "current_gfxclk": all_metrics["current_gfxclk"],
                 "current_socclk": all_metrics["current_socclk"],
                 "current_uclk": all_metrics["current_uclk"],
@@ -168,6 +167,9 @@ class GPUCapture:
                 "current_dclk0": all_metrics["current_dclk0"],
             }
 
+        if "id" in gpu_conf:
+            flowcept_gpu_info["id"] = amdsmi_get_gpu_device_uuid(device)
+
         return flowcept_gpu_info
 
 
@@ -193,6 +195,7 @@ elif GPUCapture.GPU_VENDOR == "nvidia":
         nvmlDeviceGetTemperature,
         nvmlDeviceGetPowerUsage,
         NVML_TEMPERATURE_GPU,
+        nvmlDeviceGetUUID,
     )
 
     FlowceptLogger().debug("Imported Nvidia modules!")
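The `ix` → `id` rename replaces the positional GPU index with the device UUID, which stays stable across reboots and device reordering. For reference, a standalone pynvml check of what the new field holds (requires an NVIDIA GPU and the pynvml package):

```python
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetUUID, nvmlShutdown

nvmlInit()
device = nvmlDeviceGetHandleByIndex(0)
print(nvmlDeviceGetUUID(device))  # e.g. "GPU-a1b2c3d4-..." — stable per device
nvmlShutdown()
```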
flowcept/instrumentation/flowcept_loop.py CHANGED
@@ -156,13 +156,13 @@ class FlowceptLoop:
             "used": {"i": self._next_counter, self._item_name: self._current_item},
             "parent_task_id": self._parent_task_id,
         }
-        tel = FlowceptLoop._interceptor.telemetry_capture.capture()
-        if tel:
-            iteration_task["telemetry_at_start"] = tel.to_dict()
         return iteration_task
 
-    def _end_iteration_task(self, iteration_task):
-        iteration_task["status"] = Status.FINISHED.value
+    def _end_iteration_task(self, _):
+        self._last_iteration_task["status"] = Status.FINISHED.value
+        tel = FlowceptLoop._interceptor.telemetry_capture.capture()
+        if tel:
+            self._last_iteration_task["telemetry_at_end"] = tel.to_dict()
         FlowceptLoop._interceptor.intercept(self._last_iteration_task)
 
     def _do_nothing_in_end_iter(self, *args, **kwargs):
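This moves telemetry capture from the start of an iteration to its end, so loop iterations now carry `telemetry_at_end` rather than `telemetry_at_start`. A usage sketch based on flowcept's documented loop pattern (the `end_iter` call follows the project's examples; exact signatures may vary):

```python
import random

from flowcept import Flowcept
from flowcept.instrumentation.flowcept_loop import FlowceptLoop

with Flowcept():
    loop = FlowceptLoop(range(3))
    for i in loop:
        loss = random.random()
        # Telemetry is now captured here, when the iteration ends,
        # and stored under the task's telemetry_at_end field.
        loop.end_iter({"loss": loss})
```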
flowcept/instrumentation/flowcept_torch.py CHANGED
@@ -17,7 +17,6 @@ from flowcept.commons.flowcept_dataclasses.workflow_object import (
 )
 from flowcept.commons.vocabulary import Status
 from flowcept.configs import (
-    REGISTER_WORKFLOW,
     INSTRUMENTATION,
     TELEMETRY_CAPTURE,
     REPLACE_NON_JSON_SERIALIZABLE,
@@ -30,6 +29,7 @@ from flowcept.instrumentation.flowcept_task import get_current_context_task_id
 
 TORCH_CONFIG = INSTRUMENTATION.get("torch")
 
+REGISTER_WORKFLOW = TORCH_CONFIG.get("register_workflow", True)
 
 def flowcept_torch(cls):
     """
flowcept/instrumentation/task_capture.py CHANGED
@@ -49,9 +49,6 @@ class FlowceptTask(object):
     are no-ops, and no data is captured.
     """
 
-    if INSTRUMENTATION_ENABLED:
-        _interceptor = InstrumentationInterceptor.get_instance()
-
     def __init__(
         self,
         task_id: str = None,
@@ -60,12 +57,18 @@ class FlowceptTask(object):
         activity_id: str = None,
         used: Dict = None,
         custom_metadata: Dict = None,
+        flowcept: 'Flowcept' = None
     ):
         if not INSTRUMENTATION_ENABLED:
             self._ended = True
             return
+        if flowcept is not None and flowcept._interceptor_instances[0].kind == "instrumentation":
+            self._interceptor = flowcept._interceptor_instances[0]
+        else:
+            self._interceptor = InstrumentationInterceptor.get_instance()
+
         self._task = TaskObject()
-        self._task.telemetry_at_start = FlowceptTask._interceptor.telemetry_capture.capture()
+        self._task.telemetry_at_start = self._interceptor.telemetry_capture.capture()
         self._task.activity_id = activity_id
         self._task.started_at = time()
         self._task.task_id = task_id or str(self._task.started_at)
@@ -117,11 +120,11 @@ class FlowceptTask(object):
         """
         if not INSTRUMENTATION_ENABLED:
             return
-        self._task.telemetry_at_end = FlowceptTask._interceptor.telemetry_capture.capture()
+        self._task.telemetry_at_end = self._interceptor.telemetry_capture.capture()
         self._task.ended_at = ended_at or time()
         self._task.status = status
         self._task.stderr = stderr
         self._task.stdout = stdout
         self._task.generated = generated
-        FlowceptTask._interceptor.intercept(self._task.to_dict())
+        self._interceptor.intercept(self._task.to_dict())
         self._ended = True
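With the class-level `_interceptor` gone, each `FlowceptTask` now resolves its interceptor per instance, optionally taking it from a running `Flowcept` controller via the new `flowcept=` argument. A hedged usage sketch (the `end(generated=...)` signature is inferred from the attribute assignments above):

```python
from flowcept import Flowcept
from flowcept.instrumentation.task_capture import FlowceptTask

with Flowcept() as f:
    # Reuses f's own instrumentation interceptor instead of the global
    # singleton, per the new branch in __init__ above.
    t = FlowceptTask(activity_id="square", used={"x": 3}, flowcept=f)
    t.end(generated={"y": 9})
```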
flowcept/version.py CHANGED
@@ -4,4 +4,4 @@
 # The expected format is: <Major>.<Minor>.<Patch>
 # This file is supposed to be automatically modified by the CI Bot.
 # See .github/workflows/version_bumper.py
-__version__ = "0.8.4"
+__version__ = "0.8.6"
flowcept-0.8.6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: flowcept
-Version: 0.8.4
+Version: 0.8.6
 Summary: Capture and query workflow provenance data using data observability
 Project-URL: GitHub, https://github.com/ORNL/flowcept
 Author: Oak Ridge National Laboratory
@@ -96,20 +96,34 @@ Description-Content-Type: text/markdown
 
 # Flowcept
 
+## Table of Contents
+
+- [Overview](#overview)
+- [Features](#features)
+- [Installation](#installation)
+- [Setup and the Settings File](#setup)
+- [Running with Containers](#running-with-containers)
+- [Examples](#examples)
+- [Data Persistence](#data-persistence)
+- [Performance Tuning](#performance-tuning-for-performance-evaluation)
+- [AMD GPU Setup](#install-amd-gpu-lib)
+
+## Overview
+
 Flowcept is a runtime data integration system that captures and queries workflow provenance with minimal or no code changes. It unifies data across diverse workflows and tools, enabling integrated analysis and insights, especially in federated environments. Designed for scenarios involving critical data from multiple workflows, Flowcept seamlessly integrates data at runtime, providing a unified view for end-to-end monitoring and analysis, and enhanced support for Machine Learning (ML) workflows.
 
-Other capabilities include:
-
-- Automatic multi-workflow provenance data capture;
-- Data observability, enabling minimal intrusion to user workflows;
-- Explicit user workflow instrumentation, if this is preferred over implicit data observability;
-- ML data capture in various levels of details: workflow, model fitting or evaluation task, epoch iteration, layer forwarding;
-- ML model management;
-- Adapter-based, loosely-coupled system architecture, making it easy to plug and play with different data processing systems and backend database (e.g., MongoDB) or MQ services (e.g., Redis, Kafka);
-- Low-overhead focused system architecture, to avoid adding performance overhead particularly to workloads that run on HPC machines;
-- Telemetry data capture (e.g., CPU, GPU, Memory consumption) linked to the application dataflow;
-- Highly customizable to multiple use cases, enabling easy toggle between settings (e.g., with/without provenance capture; with/without telemetry and which telemetry type to capture; which adapters or backend services to run with);
-- [W3C PROV](https://www.w3.org/TR/prov-overview/) adherence;
+## Features
+
+- Automatic workflow provenance data capture from heterogeneous workflows
+- Data observability with no or minimal intrusion to application workflows
+- Explicit application instrumentation, if this is preferred over data observability
+- ML data capture in various levels of details: workflow, model fitting or evaluation task, epoch iteration, layer forwarding
+- ML model management (e.g., model storage and retrieval, along with their metadata and provenance)
+- Adapter-based, loosely-coupled system architecture, making it easy to plug and play with different data processing systems and backend database (e.g., MongoDB) or MQ services (e.g., Redis, Kafka)
+- Low-overhead focused system architecture, to avoid adding performance overhead particularly to workloads that run on HPC machines
+- Telemetry data capture (e.g., CPU, GPU, Memory consumption) linked to the application dataflow
+- Highly customizable to multiple use cases, enabling easy toggle between settings (e.g., with/without provenance capture; with/without telemetry and which telemetry type to capture; which adapters or backend services to run with)
+- [W3C PROV](https://www.w3.org/TR/prov-overview/) adherence
 
 Notes:
 
@@ -192,7 +206,8 @@ To use Flowcept, one needs to start a MQ system `$> make services`. This will st
 
 ### Flowcept Settings File
 
-Flowcept requires a settings file for configuration. You can find an example [here](resources/sample_settings.yaml).
+Flowcept requires a settings file for configuration.
+You can find an example configuration file [here](resources/sample_settings.yaml), with documentation for each parameter provided as inline comments.
 
 #### What You Can Configure:
 
@@ -214,7 +229,6 @@ export FLOWCEPT_SETTINGS_PATH=/absolute/path/to/your/settings.yaml
 
 If this variable is not set, Flowcept will use the default values from the [example](resources/sample_settings.yaml) file.
 
-
 # Running with Containers
 
 To use containers instead of installing Flowcept's dependencies on your host system, we provide a [Dockerfile](deployment/Dockerfile) alongside a [docker-compose.yml](deployment/compose.yml) for dependent services (e.g., Redis, MongoDB).
flowcept-0.8.6.dist-info/RECORD CHANGED
@@ -1,6 +1,6 @@
 flowcept/__init__.py,sha256=CukmdzTUvm6Y_plTKPq4kKn7w9LdR36j7V_C_UQyjhU,2011
-flowcept/configs.py,sha256=_-jhoI_HGKjzymjYTlDuysbM38Gr2aunc0Q-Stlmcwk,7511
-flowcept/version.py,sha256=sgsQa7sBlHkp-P60Bs4HRcOzfCc2iDmfQsjJbMii7Xg,306
+flowcept/configs.py,sha256=NDUAqqoKfztt6Qjwxy95eTQU71AovVJWXalI1x3HJ7Y,7441
+flowcept/version.py,sha256=3Xeg2e7hNlxinsNn_IjT041Xp9p4RX_yIks40uVvSJM,306
 flowcept/analytics/__init__.py,sha256=46q-7vsHq_ddPNrzNnDgEOiRgvlx-5Ggu2ocyROMV0w,641
 flowcept/analytics/analytics_utils.py,sha256=FRJdBtQa7Hrk2oR_FFhmhmMf3X6YyZ4nbH5RIYh7KL4,8753
 flowcept/analytics/data_augmentation.py,sha256=Dyr5x316Zf-k1e8rVoQMCpFOrklYVHjfejRPrtoycmc,1641
@@ -17,20 +17,20 @@ flowcept/commons/daos/keyvalue_dao.py,sha256=03xHhQIfZas0LQLP1DbGJ5DoskXyZNXQKIN
 flowcept/commons/daos/docdb_dao/__init__.py,sha256=qRvXREeUJ4mkhxdC9bzpOsVX6M2FB5hDyLFxhMxTGhs,30
 flowcept/commons/daos/docdb_dao/docdb_dao_base.py,sha256=YbfSVJPwZGK2GBYkeapRC83HkmP0c6Msv5TriD88RcI,11812
 flowcept/commons/daos/docdb_dao/lmdb_dao.py,sha256=dJOLgCx_lwdz6MKiMpM_UE4rm0angDCPaVz_WU5KqIA,10407
-flowcept/commons/daos/docdb_dao/mongodb_dao.py,sha256=-Kxjep1FbjKiGjvzyvePVHDf-Q1lOIce1EzBURSKubc,38037
+flowcept/commons/daos/docdb_dao/mongodb_dao.py,sha256=WGjlB_8fIsMPRx7LXPj52Uexuh-E69LKsW8vN_zcpPU,38568
 flowcept/commons/daos/mq_dao/__init__.py,sha256=Xxm4FmbBUZDQ7XIAmSFbeKE_AdHsbgFmSuftvMWSykQ,21
 flowcept/commons/daos/mq_dao/mq_dao_base.py,sha256=EAqOhy7Q8V29JFDG8C50nRK34KsPxEICkG4elk4ZfX8,9020
 flowcept/commons/daos/mq_dao/mq_dao_kafka.py,sha256=bf-bZvWw9JJk8Kdfzx2UkAnQC95rSrKXDEyYkrcncOk,4400
-flowcept/commons/daos/mq_dao/mq_dao_mofka.py,sha256=aZ810wN5Wkjk7oRUxDWJWOIREUsmq57oI4AxY1bWBuk,3940
+flowcept/commons/daos/mq_dao/mq_dao_mofka.py,sha256=_9KlJwUuFKgTB8ZPQQSz4yQSe6oYKb0vrhm3XIT8jPw,3945
 flowcept/commons/daos/mq_dao/mq_dao_redis.py,sha256=Br97SoDIkt4dHH937Yjg3wtkn1xGT-x9t-8E3VD5TeU,4277
 flowcept/commons/flowcept_dataclasses/__init__.py,sha256=8KkiJh0WSRAB50waVluxCSI8Tb9X1L9nup4c8RN3ulc,30
 flowcept/commons/flowcept_dataclasses/base_settings_dataclasses.py,sha256=Cjw2PGYtZDfnwecz6G3S42Ncmxj7AIZVEBx05bsxRUo,399
 flowcept/commons/flowcept_dataclasses/task_object.py,sha256=3DD5ZNMz7EVILS9PRkQ3khboav7lIKoUC5W6sKMFauQ,4694
 flowcept/commons/flowcept_dataclasses/telemetry.py,sha256=9_5ONCo-06r5nKHXmi5HfIhiZSuPgmTECiq_u9MlxXM,2822
-flowcept/commons/flowcept_dataclasses/workflow_object.py,sha256=t9M0eVdcq3NLZgT4fwXrR3x2oOgjtE5Jo5_MAi4-0YM,4283
+flowcept/commons/flowcept_dataclasses/workflow_object.py,sha256=f8aB0b3xcUr3KQTlloF7R_P6xQejzDPOm-s6dLhGMeA,4383
 flowcept/flowcept_api/__init__.py,sha256=T1ty86YlocQ5Z18l5fUqHj_CC6Unq_iBv0lFyiI7Ao8,22
 flowcept/flowcept_api/db_api.py,sha256=hKXep-n50rp9cAzV0ljk2QVEF8O64yxi3ujXv5_Ibac,9723
-flowcept/flowcept_api/flowcept_controller.py,sha256=zq4cRM14xoeRA6HPL6JwLwDLpsVra5ej1EAPmiVKUIs,11932
+flowcept/flowcept_api/flowcept_controller.py,sha256=lkHR7O0zAAfbGtVa4o9tjZMdZquYN7vdnymRKzc4B8s,11933
 flowcept/flowcept_api/task_query_api.py,sha256=SrwB0OCVtbpvCPECkE2ySM10G_g8Wlk5PJ8h-0xEaNc,23821
 flowcept/flowcept_webserver/__init__.py,sha256=8411GIXGddKTKoHUvbo_Rq6svosNG7tG8VzvUEBd7WI,28
 flowcept/flowcept_webserver/app.py,sha256=VUV8_JZbIbx9u_1O7m7XtRdhZb_7uifUa-iNlPhmZws,658
@@ -38,14 +38,14 @@ flowcept/flowcept_webserver/resources/__init__.py,sha256=XOk5yhLeLU6JmVXxbl3TY2z
 flowcept/flowcept_webserver/resources/query_rsrc.py,sha256=Mk1XDC_wVYkMk0eaazqWWrTC07gQU9U0toKfip0ihZE,1353
 flowcept/flowcept_webserver/resources/task_messages_rsrc.py,sha256=0u68it2W-9NzUUx5fWOZCqvRKe5EsLI8oyvto9634Ng,666
 flowcept/flowceptor/__init__.py,sha256=wVxRXUv07iNx6SMRRma2vqhR_GIcRl0re_WCYG65PUs,29
-flowcept/flowceptor/telemetry_capture.py,sha256=8LK4o3OaZD8B6KTpgpvD9D834dFpAJpX-NN2bc91jZU,13658
+flowcept/flowceptor/telemetry_capture.py,sha256=wSXyQJ-vPVzeldD4KqoLQA2rg7V0EOQo_11ErJE5oQQ,13743
 flowcept/flowceptor/adapters/__init__.py,sha256=SuZbSZVVQeBJ9zXW-M9jF09dw3XIjre3lSGrUO1Y8Po,27
-flowcept/flowceptor/adapters/base_interceptor.py,sha256=99a_Ipnj6g8qZMHWLBEYJh0Cox033ADxOKPFrivr9gw,6056
+flowcept/flowceptor/adapters/base_interceptor.py,sha256=5guTHuuFpB7PYk9tQopdak5MBxcPMnsxDiZVUcEvJks,6483
 flowcept/flowceptor/adapters/instrumentation_interceptor.py,sha256=DhK2bBnpghqPSeA62BUqRg6pl8zxuYrP33dK4x6PhRE,733
 flowcept/flowceptor/adapters/interceptor_state_manager.py,sha256=xRzmi5YFKBEqNtX8F5s6XlMTRe27ml4BmQtBO4WtG2c,919
 flowcept/flowceptor/adapters/dask/__init__.py,sha256=GKreb5L_nliD2BEckyB943zOQ-b6Gn1fLDj81FqSK2Y,23
 flowcept/flowceptor/adapters/dask/dask_dataclasses.py,sha256=6LTG-kdcc6AUuVINvkqB5QHw6pchg1aMqj0sdWt2Ef8,580
-flowcept/flowceptor/adapters/dask/dask_interceptor.py,sha256=Dzrwu9Y9A6k2Qq8tZKXx3zmi-CpmtFrJehUTfNjZxDM,5827
+flowcept/flowceptor/adapters/dask/dask_interceptor.py,sha256=uBQpLluYXzlT1gBDfTe4_WueC_fWBEs5Xr8ntpOmljE,5869
 flowcept/flowceptor/adapters/dask/dask_plugins.py,sha256=s1ENAi9N61PC_6RiFvOYhJsgWzSm_lFWm3w87V-R1YY,2473
 flowcept/flowceptor/adapters/mlflow/__init__.py,sha256=3mzHrvh1XQOy68qx1A3so9Nq27tIb0i2mSXfv3F6gZg,25
 flowcept/flowceptor/adapters/mlflow/interception_event_handler.py,sha256=-SsIRdOcZjQUTzWgsZ41ouqpla4Qd32jIWXIAGU1pPw,494
@@ -59,15 +59,15 @@ flowcept/flowceptor/adapters/zambeze/__init__.py,sha256=1e9_hK2cUKDXhQ0kBRftwcJj
 flowcept/flowceptor/adapters/zambeze/zambeze_dataclasses.py,sha256=nn9MxvcdzgmOa8n5Jwdl7UzlSzxEu9bA-Ls6cHyb91c,849
 flowcept/flowceptor/adapters/zambeze/zambeze_interceptor.py,sha256=Bjyi48JW0DXJLJuvwPxaD8zxxsSoEFgSoXl8YcbwFWk,3782
 flowcept/flowceptor/consumers/__init__.py,sha256=foxtVEb2ZEe9g1slfYIKM4tIFv-He1l7XS--SYs7nlQ,28
-flowcept/flowceptor/consumers/consumer_utils.py,sha256=JmyjQeZPqMj_yqFlxxw9k2_JZvZkAmX7kySV__YvEVc,5719
-flowcept/flowceptor/consumers/document_inserter.py,sha256=rAK3rs3VNW5a6koesE05scQ1mR_4BhuxLurP10ipURs,9339
+flowcept/flowceptor/consumers/consumer_utils.py,sha256=7bvFJWusJkfA4j0gwZLDIIsIOyfk9wRq6s5liS3JAV0,5665
+flowcept/flowceptor/consumers/document_inserter.py,sha256=m9B57j6C4vdwSZhbafgtqHKTvi4YMttmv-iShW5MYqs,9508
 flowcept/instrumentation/__init__.py,sha256=M5bTmg80E4QyN91gUX3qfw_nbtJSXwGWcKxdZP3vJz0,34
-flowcept/instrumentation/flowcept_loop.py,sha256=9Ap7-PfpNdwS7DaRDaB-R9G3X_G3RZvGVkNVUZAix5A,12164
+flowcept/instrumentation/flowcept_loop.py,sha256=RvETm3Pn37dIw_a1RXigyh2U7MCBHqi46dPmbrz3RMQ,12171
 flowcept/instrumentation/flowcept_task.py,sha256=l_BAYEUZ_SeBt8QJN_E9D9QcZVYRnW9qO_XRnqvmePE,5993
-flowcept/instrumentation/flowcept_torch.py,sha256=KXA1HBwz8l5Qp7PkZ7nsbYlM8IcwWD_u04NxaAcZPzM,23395
-flowcept/instrumentation/task_capture.py,sha256=u82r_SgzoVKyb6_SWtfB-meBUZgjrXvF5dxkH9vnMDs,4776
-resources/sample_settings.yaml,sha256=eYMO3rlS4m1sjkgoKuRIYaTuZldiu50bN-E3Bu3on_I,3424
-flowcept-0.8.4.dist-info/METADATA,sha256=aOyqBxuMdXSO5h4n9vAqLQDY7U3ZRRzJoHKTrgHYfoE,17543
-flowcept-0.8.4.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-flowcept-0.8.4.dist-info/licenses/LICENSE,sha256=r5-2P6tFTuRGWT5TiX32s1y0tnp4cIqBEC1QjTaXe2k,1086
-flowcept-0.8.4.dist-info/RECORD,,
+flowcept/instrumentation/flowcept_torch.py,sha256=BgJ0eOeCGOe0hjbyDQf-bHK_kFOvj9iSXC4PI1EikXE,23436
+flowcept/instrumentation/task_capture.py,sha256=DdudnrjiXS6EGsirdykMLi1rFFHWkJMX1C7uibjsjL4,4944
+resources/sample_settings.yaml,sha256=aKeHf8895vrHIbi0QS1w2WT5n8ZNI9Ep5PVPF5Y5MEQ,4957
+flowcept-0.8.6.dist-info/METADATA,sha256=nzqP4fruoqw-264-XNXJ0rlR_UfwsmqmmOW_J0GPYRM,18086
+flowcept-0.8.6.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+flowcept-0.8.6.dist-info/licenses/LICENSE,sha256=r5-2P6tFTuRGWT5TiX32s1y0tnp4cIqBEC1QjTaXe2k,1086
+flowcept-0.8.6.dist-info/RECORD,,
resources/sample_settings.yaml CHANGED
@@ -1,19 +1,20 @@
+flowcept_version: 0.8.0 # Version of the Flowcept package. This setting file is compatible with this version.
+
 project:
-  debug: true
-  json_serializer: default # or complex. If "complex", Flowcept will deal with complex python dicts that may contain JSON unserializable values
-  replace_non_json_serializable: true
-  performance_logging: false
-  register_workflow: true
-  enrich_messages: true
-  db_flush_mode: online # or offline
+  debug: true # Toggle debug mode. This will add a property `debug: true` to all saved data, making it easier to retrieve/delete them later.
+  json_serializer: default # JSON serialization mode: default or complex. If "complex", Flowcept will deal with complex python dicts that may contain JSON unserializable values
+  replace_non_json_serializable: true # Replace values that can't be JSON serialized
+  performance_logging: false # Enable performance logging if true. Particularly useful for MQ flushes.
+  enrich_messages: true # Add extra metadata to task messages, such as IP addresses and UTC timestamps.
+  db_flush_mode: online # Mode for flushing DB entries: "online" or "offline". If online, flushes to the DB will happen before the workflow ends.
 
 log:
-  log_path: "default"
-  log_file_level: error # use 'disable' to disable logs
-  log_stream_level: error
+  log_path: "default" # Path for log file output; "default" will write the log in the directory where the main executable is running from.
+  log_file_level: error # Logging level (error, debug, info, critical) for file logs; use "disable" to turn off.
+  log_stream_level: error # Logging level (error, debug, info, critical) for console/stream logs; use "disable" to turn off.
 
-telemetry_capture:
-  gpu: ~ # ~ means None. This is a list with GPU metrics. AMD=[activity,used,power,temperature,others]; NVIDIA=[used,temperature,power,name,ix]
+telemetry_capture: # This toggles each individual type of telemetry capture. GPU capture is treated different depending on the vendor (AMD or NVIDIA).
+  gpu: ~ # ~ means None. This is a list with GPU metrics. AMD=[activity,used,power,temperature,others,id]; NVIDIA=[used,temperature,power,name,id]
   cpu: true
   per_cpu: true
   process_info: true
@@ -23,17 +24,18 @@ telemetry_capture:
   machine_info: true
 
 instrumentation:
-  enabled: true
+  enabled: true # This toggles data capture for instrumentation.
+  singleton: true # Use a single instrumentation instance per process. Defaults to true
   torch:
-    what: parent_and_children # parent_only, parent_and_children, ~
-    children_mode: telemetry_and_tensor_inspection # tensor_inspection, telemetry, telemetry_and_tensor_inspection
+    what: parent_and_children # Scope of instrumentation: "parent_only" -- will capture only at the main model level, "parent_and_children" -- will capture the inner layers, or ~ (disable).
+    children_mode: telemetry_and_tensor_inspection # What to capture if parent_and_children is chosen in the scope. Possible values: "tensor_inspection" (i.e., tensor metadata), "telemetry", "telemetry_and_tensor_inspection"
     epoch_loop: lightweight # lightweight, ~ (disable), or default (default will use the default telemetry capture method)
     batch_loop: lightweight # lightweight, ~ (disable), or default (default will use the default telemetry capture method)
-    capture_epochs_at_every: 1 #epochs; please use a value that is multiple of #epochs
-    # enable to set between train, evaluate, and test
+    capture_epochs_at_every: 1 # Will capture data at every N epochs; please use a value that is multiple of the total number of #epochs.
+    register_workflow: true # Will store the parent model forward as a workflow itself in the database.
 
 experiment:
-  user: root
+  user: root # Optionally identify the user running the experiment. The logged username will be captured anyways.
 
 mq:
   type: redis # or kafka or mofka; Please adjust the port (kafka's default is 9092; redis is 6379). If mofka, adjust the group_file.
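For orientation, the settings file above is plain YAML; flowcept uses OmegaConf (see the `OmegaConf.to_container(settings)` call in the workflow_object.py hunk), so a standalone way to inspect it is sketched below — the actual loading logic in configs.py may differ:

```python
import os

from omegaconf import OmegaConf

# FLOWCEPT_SETTINGS_PATH is the documented override (see the README hunk
# above); the fallback path just points at the sample file in the package.
path = os.environ.get("FLOWCEPT_SETTINGS_PATH", "resources/sample_settings.yaml")
settings = OmegaConf.load(path)
print(settings.project.db_flush_mode)      # "online"
print(settings.instrumentation.singleton)  # the new thread-safety toggle
```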