flowcept 0.8.11__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowcept/__init__.py +7 -4
- flowcept/agents/__init__.py +5 -0
- flowcept/{flowceptor/consumers/agent/client_agent.py → agents/agent_client.py} +22 -12
- flowcept/agents/agents_utils.py +181 -0
- flowcept/agents/dynamic_schema_tracker.py +191 -0
- flowcept/agents/flowcept_agent.py +30 -0
- flowcept/agents/flowcept_ctx_manager.py +175 -0
- flowcept/agents/gui/__init__.py +5 -0
- flowcept/agents/gui/agent_gui.py +76 -0
- flowcept/agents/gui/gui_utils.py +239 -0
- flowcept/agents/llms/__init__.py +1 -0
- flowcept/agents/llms/claude_gcp.py +139 -0
- flowcept/agents/llms/gemini25.py +119 -0
- flowcept/agents/prompts/__init__.py +1 -0
- flowcept/{flowceptor/adapters/agents/prompts.py → agents/prompts/general_prompts.py} +18 -0
- flowcept/agents/prompts/in_memory_query_prompts.py +297 -0
- flowcept/agents/tools/__init__.py +1 -0
- flowcept/agents/tools/general_tools.py +102 -0
- flowcept/agents/tools/in_memory_queries/__init__.py +1 -0
- flowcept/agents/tools/in_memory_queries/in_memory_queries_tools.py +704 -0
- flowcept/agents/tools/in_memory_queries/pandas_agent_utils.py +309 -0
- flowcept/cli.py +286 -44
- flowcept/commons/daos/docdb_dao/mongodb_dao.py +47 -0
- flowcept/commons/daos/mq_dao/mq_dao_base.py +24 -13
- flowcept/commons/daos/mq_dao/mq_dao_kafka.py +18 -2
- flowcept/commons/flowcept_dataclasses/task_object.py +16 -21
- flowcept/commons/flowcept_dataclasses/workflow_object.py +9 -1
- flowcept/commons/task_data_preprocess.py +260 -60
- flowcept/commons/utils.py +25 -6
- flowcept/configs.py +41 -26
- flowcept/flowcept_api/flowcept_controller.py +73 -6
- flowcept/flowceptor/adapters/base_interceptor.py +11 -5
- flowcept/flowceptor/consumers/agent/base_agent_context_manager.py +25 -1
- flowcept/flowceptor/consumers/base_consumer.py +4 -0
- flowcept/flowceptor/consumers/consumer_utils.py +5 -4
- flowcept/flowceptor/consumers/document_inserter.py +2 -2
- flowcept/flowceptor/telemetry_capture.py +5 -2
- flowcept/instrumentation/flowcept_agent_task.py +294 -0
- flowcept/instrumentation/flowcept_decorator.py +43 -0
- flowcept/instrumentation/flowcept_loop.py +3 -3
- flowcept/instrumentation/flowcept_task.py +64 -24
- flowcept/instrumentation/flowcept_torch.py +5 -5
- flowcept/instrumentation/task_capture.py +83 -6
- flowcept/version.py +1 -1
- {flowcept-0.8.11.dist-info → flowcept-0.9.1.dist-info}/METADATA +42 -14
- {flowcept-0.8.11.dist-info → flowcept-0.9.1.dist-info}/RECORD +50 -36
- resources/sample_settings.yaml +12 -4
- flowcept/flowceptor/adapters/agents/__init__.py +0 -1
- flowcept/flowceptor/adapters/agents/agents_utils.py +0 -89
- flowcept/flowceptor/adapters/agents/flowcept_agent.py +0 -292
- flowcept/flowceptor/adapters/agents/flowcept_llm_prov_capture.py +0 -186
- flowcept/flowceptor/consumers/agent/flowcept_agent_context_manager.py +0 -145
- flowcept/flowceptor/consumers/agent/flowcept_qa_manager.py +0 -112
- {flowcept-0.8.11.dist-info → flowcept-0.9.1.dist-info}/WHEEL +0 -0
- {flowcept-0.8.11.dist-info → flowcept-0.9.1.dist-info}/entry_points.txt +0 -0
- {flowcept-0.8.11.dist-info → flowcept-0.9.1.dist-info}/licenses/LICENSE +0 -0
flowcept/commons/flowcept_dataclasses/task_object.py
CHANGED
@@ -24,6 +24,7 @@ class TaskObject:
     utc_timestamp: float = None
     adapter_id: AnyStr = None
     user: AnyStr = None
+    data: Any = None
     used: Dict[AnyStr, Any] = None  # Used parameter and files
     campaign_id: AnyStr = None
     generated: Dict[AnyStr, Any] = None  # Generated results and files
@@ -53,6 +54,7 @@ class TaskObject:
     dependencies: List = None
     dependents: List = None
     tags: List = None
+    agent_id: str = None
 
     _DEFAULT_ENRICH_VALUES = {
         "node_name": NODE_NAME,
@@ -104,20 +106,16 @@ class TaskObject:
         if self.utc_timestamp is None:
             self.utc_timestamp = flowcept.commons.utils.get_utc_now()
 
-
-        self
-
-        if self.login_name is None and LOGIN_NAME is not None:
-            self.login_name = LOGIN_NAME
-
-        if self.public_ip is None and PUBLIC_IP is not None:
-            self.public_ip = PUBLIC_IP
-
-        if self.private_ip is None and PRIVATE_IP is not None:
-            self.private_ip = PRIVATE_IP
+        for key, fallback_value in TaskObject._DEFAULT_ENRICH_VALUES.items():
+            if getattr(self, key) is None and fallback_value is not None:
+                setattr(self, key, fallback_value)
 
-
-
+    @staticmethod
+    def enrich_task_dict(task_dict: dict):
+        """Enrich the task."""
+        for key, fallback_value in TaskObject._DEFAULT_ENRICH_VALUES.items():
+            if (key not in task_dict or task_dict[key] is None) and fallback_value is not None:
+                task_dict[key] = fallback_value
 
     def to_dict(self):
         """Convert to dictionary."""
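The per-field `if ... is None` blocks collapse into one loop over `_DEFAULT_ENRICH_VALUES`. A minimal standalone sketch of the fallback-enrichment pattern (the field names and fallback values below are made up, not Flowcept's actual table):

    # Sketch of the fallback-enrichment idiom used by TaskObject above.
    class Record:
        _DEFAULT_ENRICH_VALUES = {"node_name": "node-0", "login_name": "alice"}

        def __init__(self, node_name=None, login_name=None):
            self.node_name = node_name
            self.login_name = login_name
            # Fill only fields that are still None, same as TaskObject.
            for key, fallback_value in Record._DEFAULT_ENRICH_VALUES.items():
                if getattr(self, key) is None and fallback_value is not None:
                    setattr(self, key, fallback_value)

    r = Record(node_name="gpu-7")
    assert r.node_name == "gpu-7" and r.login_name == "alice"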
@@ -139,13 +137,6 @@ class TaskObject:
         """Serialize it."""
         return msgpack.dumps(self.to_dict())
 
-    @staticmethod
-    def enrich_task_dict(task_dict: dict):
-        """Enrich the task."""
-        for key, fallback_value in TaskObject._DEFAULT_ENRICH_VALUES.items():
-            if (key not in task_dict or task_dict[key] is None) and fallback_value is not None:
-                task_dict[key] = fallback_value
-
     @staticmethod
     def from_dict(task_obj_dict: Dict[AnyStr, Any]) -> "TaskObject":
         """Create a TaskObject from a dictionary.
@@ -177,6 +168,10 @@ class TaskObject:
 
     def __repr__(self):
         """Return an unambiguous string representation of the TaskObject."""
-        attrs = ["task_id", "workflow_id", "campaign_id", "activity_id", "
+        attrs = ["task_id", "workflow_id", "campaign_id", "activity_id", "started_at", "ended_at"]
+        optionals = ["subtype", "parent_task_id", "agent_id"]
+        for opt in optionals:
+            if getattr(self, opt) is not None:
+                attrs.append(opt)
         attr_str = ", ".join(f"{attr}={repr(getattr(self, attr))}" for attr in attrs)
         return f"TaskObject({attr_str})"
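After this change, `repr(task)` prints the core identifiers and timestamps unconditionally and appends `subtype`, `parent_task_id`, and `agent_id` only when set; an illustrative result (made-up values):

    TaskObject(task_id='t1', workflow_id='wf1', campaign_id='c1', activity_id='train_model', started_at=1720000000.0, ended_at=1720000042.7, agent_id='agent-1')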
flowcept/commons/flowcept_dataclasses/workflow_object.py
CHANGED
@@ -5,7 +5,7 @@ import msgpack
 from omegaconf import OmegaConf, DictConfig
 
 from flowcept.version import __version__
-from flowcept.commons.utils import get_utc_now
+from flowcept.commons.utils import get_utc_now, get_git_info
 from flowcept.configs import (
     settings,
     FLOWCEPT_USER,
@@ -38,6 +38,7 @@ class WorkflowObject:
     sys_name: str = None
     extra_metadata: str = None
     used: Dict = None
+    code_repository: Dict = None
     generated: Dict = None
 
     def __init__(self, workflow_id=None, name=None, used=None, generated=None):
@@ -93,6 +94,13 @@ class WorkflowObject:
         )
         self.extra_metadata = _extra_metadata
 
+        if self.code_repository is None:
+            try:
+                self.code_repository = get_git_info()
+            except Exception as e:
+                print(e)
+                pass
+
         if self.flowcept_version is None:
             self.flowcept_version = __version__
 
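Given the `get_git_info` helper added to `flowcept/commons/utils.py` (shown further down), a captured `code_repository` would look roughly like this (illustrative values only):

    # Shape of WorkflowObject.code_repository after __init__ (illustrative)
    {
        "sha": "9fceb02d0ae598e95dc970b74767f19372d61af8",
        "short_sha": "9fceb02",
        "branch": "main",
        "root": "/home/user/my-workflow-repo",
        "remote": "https://github.com/example/my-workflow-repo.git",
        "dirty": "clean",
    }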
flowcept/commons/task_data_preprocess.py
CHANGED
@@ -1,35 +1,13 @@
-"""
-The base of this code was generated using ChatGPT.
-
-Prompt:
-
-Here I have a list containing one real task.
-
-<Paste one real task here>
-
-I want to create a list of summarized task data, per task, containing:
-- activity_id
-- task_id
-- used
-- generated
-- task_duration (ended_at - started_at)
-- hostname
-- cpu_info
-- disk_info
-- mem_info
-- network_info
-<Consider adding GPU info too, if you have gpu in your task data>
-
-Where info about cpu, disk, mem, and network must consider telemetry_at_end and telemetry_at_start.
-
-We will use this summarized data as input for LLM questions to find patterns in the resource usage and how they relate
-to input (used) and output (generated) of each task.
-"""
-
+from datetime import datetime
 from typing import Dict, List
+import copy
+from collections import defaultdict
+from typing import Any
+
+import pytz
 
 
-def summarize_telemetry(task: Dict) -> Dict:
+def summarize_telemetry(task: Dict, logger) -> Dict:
     """
     Extract and compute the telemetry summary for a task based on start and end telemetry snapshots.
 
@@ -79,24 +57,55 @@ def summarize_telemetry(task: Dict) -> Dict:
         "packets_recv_diff": net_end["packets_recv"] - net_start["packets_recv"],
     }
 
-
-
+    tel_funcs = {
+        "cpu": extract_cpu_info,
+        "disk": extract_disk_info,
+        "memory": extract_mem_info,
+        "network": extract_network_info,
+    }
+
+    start_tele = task.get("telemetry_at_start", {})
+    end_tele = task.get("telemetry_at_end", {})
 
-
-    ended_at = task["ended_at"]
-    duration = ended_at - started_at
+    telemetry_summary = {}
 
-
-    "
-
-
-
-
-
+    try:
+        started_at = task.get("started_at", None)
+        ended_at = task.get("ended_at", None)
+        if started_at is None or ended_at is None:
+            logger.warning(f"We can't summarize telemetry for duration_sec for task {task}")
+        else:
+            duration = ended_at - started_at
+            telemetry_summary["duration_sec"] = duration
+    except Exception as e:
+        logger.error(f"Error to summarize telemetry for duration_sec in {task}")
+        logger.exception(e)
+
+    for key in start_tele.keys():
+        try:
+            if key not in tel_funcs:
+                continue
+            func = tel_funcs[key]
+            if key in end_tele:
+                telemetry_summary[key] = func(start_tele[key], end_tele[key])
+            else:
+                logger.warning(
+                    f"We can't summarize telemetry {key} for task {task} because the key is not in the end_tele"
+                )
+        except Exception as e:
+            logger.warning(f"Error to summarize telemetry for {key} for task {task}. Exception: {e}")
+            logger.exception(e)
 
     return telemetry_summary
 
 
+def _safe_get(task, key):
+    try:
+        return task.get(key)
+    except Exception:
+        return None
+
+
 def summarize_task(task: Dict, thresholds: Dict = None, logger=None) -> Dict:
     """
     Summarize key metadata and telemetry for a task, optionally tagging critical conditions.
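The rewrite replaces ad-hoc field math with a dispatch table (`tel_funcs`) mapping each telemetry section to its reducer, with per-key error isolation. A minimal standalone sketch of the idiom, with a hypothetical reducer standing in for `extract_cpu_info` and friends:

    import logging

    logging.basicConfig(level=logging.WARNING)
    log = logging.getLogger("flowcept-example")

    # Hypothetical reducer standing in for extract_cpu_info etc.
    def diff_percent(start, end):
        return {"percent_all_diff": end["percent_all"] - start["percent_all"]}

    tel_funcs = {"cpu": diff_percent}

    task = {
        "started_at": 10.0,
        "ended_at": 52.7,
        "telemetry_at_start": {"cpu": {"percent_all": 12.0}},
        "telemetry_at_end": {"cpu": {"percent_all": 97.2}},
    }

    summary = {"duration_sec": task["ended_at"] - task["started_at"]}
    for key, section in task["telemetry_at_start"].items():
        func = tel_funcs.get(key)
        if func and key in task["telemetry_at_end"]:
            summary[key] = func(section, task["telemetry_at_end"][key])
        else:
            log.warning("cannot summarize %s", key)

    print(summary)  # -> {'duration_sec': 42.7, 'cpu': {'percent_all_diff': 85.2}}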
@@ -113,23 +122,51 @@ def summarize_task(task: Dict, thresholds: Dict = None, logger=None) -> Dict:
     dict
         Summary of the task including identifiers, telemetry summary, and optional critical tags.
     """
-    task_summary = {
-
-
-
-        "
-        "
-        "
-        "
-
+    task_summary = {}
+
+    # Keys that can be copied directly
+    for key in [
+        "workflow_id",
+        "task_id",
+        "parent_task_id",
+        "activity_id",
+        "used",
+        "generated",
+        "hostname",
+        "status",
+        "agent_id",
+        "campaign_id",
+        "subtype",
+        "custom_metadata",
+    ]:
+        value = _safe_get(task, key)
+        if value is not None:
+            if "_id" in key:
+                task_summary[key] = str(value)
+            else:
+                task_summary[key] = value
+
+    # Special handling for timestamp field
+    try:
+        time_keys = ["started_at", "ended_at"]
+        for time_key in time_keys:
+            timestamp = _safe_get(task, time_key)
+            if timestamp is not None:
+                task_summary[time_key] = datetime.fromtimestamp(timestamp, pytz.utc)
+    except Exception as e:
+        if logger:
+            logger.exception(f"Error {e} converting timestamp for task {task.get('task_id', 'unknown')}")
 
     try:
-        telemetry_summary = summarize_telemetry(task)
-
-
-
-
-
+        telemetry_summary = summarize_telemetry(task, logger)
+        try:
+            tags = tag_critical_task(
+                generated=task.get("generated", {}), telemetry_summary=telemetry_summary, thresholds=thresholds
+            )
+            if tags:
+                task_summary["tags"] = tags
+        except Exception as e:
+            logger.exception(e)
         task_summary["telemetry_summary"] = telemetry_summary
     except Exception as e:
         if logger:
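A call sketch under the new signature; the module path follows the file list above, and the task values are made up:

    import logging

    from flowcept.commons.task_data_preprocess import summarize_task

    logging.basicConfig(level=logging.INFO)
    task = {
        "task_id": "t9",                 # made-up task
        "activity_id": "train_model",
        "started_at": 1720000000.0,      # epoch seconds, as produced by get_utc_now()
        "ended_at": 1720000042.7,
        "used": {"params": {"epochs": 5}},
        "generated": {"model": {"accuracy": 0.98}},
    }
    summary = summarize_task(task, logger=logging.getLogger("flowcept"))
    # summary copies the identifiers and used/generated, converts started_at and
    # ended_at to UTC datetimes, and nests a telemetry_summary whose
    # duration_sec is about 42.7.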
@@ -172,12 +209,13 @@ def tag_critical_task(
         "high_output": 0.9,
     }
 
-    cpu = abs(telemetry_summary
-    mem = telemetry_summary
-    disk = telemetry_summary
+    cpu = abs(telemetry_summary.get("cpu", {}).get("percent_all_diff", 0))
+    mem = telemetry_summary.get("mem", {}).get("used_mem_diff", 0)
+    disk = telemetry_summary.get("disk", {}).get("read_bytes_diff", 0) + telemetry_summary.get("disk", {}).get(
         "write_bytes_diff", 0
     )
-
+    # TODO gpu
+    duration = telemetry_summary.get("duration_sec", 0)
 
     tags = []
 
@@ -198,3 +236,165 @@
         tags.append("high_output")
 
     return tags
+
+
+sample_tasks = [
+    {
+        "task_id": "t1",
+        "activity_id": "train_model",
+        "used": {
+            "dataset": {"name": "MNIST", "size": 60000, "source": {"url": "http://example.com/mnist", "format": "csv"}},
+            "params": {"epochs": 5, "batch_size": 32, "shuffle": True},
+        },
+        "generated": {"model": {"accuracy": 0.98, "layers": [64, 64, 10], "saved_path": "/models/mnist_v1.pth"}},
+        "telemetry_summary": {"duration_sec": 42.7, "cpu_percent": 85.2},
+    },
+    {
+        "task_id": "t2",
+        "activity_id": "train_model",
+        "used": {
+            "dataset": {
+                "name": "CIFAR-10",
+                "size": 50000,
+                "source": {"url": "http://example.com/cifar", "format": "jpeg"},
+            },
+            "params": {"epochs": 10, "batch_size": 64, "shuffle": False},
+        },
+        "generated": {"model": {"accuracy": 0.91, "layers": [128, 128, 10], "saved_path": "/models/cifar_v1.pth"}},
+        "telemetry_summary": {"duration_sec": 120.5, "cpu_percent": 92.0},
+    },
+    {
+        "task_id": "t3",
+        "activity_id": "evaluate_model",
+        "used": {"model_path": "/models/mnist_v1.pth", "test_data": {"name": "MNIST-test", "samples": 10000}},
+        "generated": {"metrics": {"accuracy": 0.97, "confusion_matrix": [[8500, 100], [50, 1350]]}},
+        "telemetry_summary": {"duration_sec": 15.3},
+    },
+    {
+        "task_id": "t4",
+        "activity_id": "evaluate_model",
+        "used": {"model_path": "/models/cifar_v1.pth", "test_data": {"name": "CIFAR-test", "samples": 10000}},
+        "generated": {"metrics": {"accuracy": 0.88, "confusion_matrix": [[4000, 500], [300, 5200]]}},
+        "telemetry_summary": {"duration_sec": 18.9},
+    },
+]
+
+
+def infer_dtype(value: Any) -> str:
+    """Infer a simplified dtype label for the value."""
+    if isinstance(value, bool):
+        return "bool"
+    elif isinstance(value, int):
+        return "int"
+    elif isinstance(value, float):
+        return "float"
+    elif isinstance(value, str):
+        return "str"
+    elif isinstance(value, list):
+        return "list"
+    return "str"  # fallback for other types
+
+
+def flatten_dict(d: dict, parent_key: str = "", sep: str = ".") -> dict:
+    """Recursively flatten nested dicts using dot notation."""
+    items = {}
+    for k, v in d.items():
+        new_key = f"{parent_key}{sep}{k}" if parent_key else k
+        if isinstance(v, dict):
+            items.update(flatten_dict(v, new_key, sep=sep))
+        else:
+            items[new_key] = v
+    return items
+
+
+def update_schema(schema_section: list, flat_fields: dict):
+    """Update schema section with flattened fields and example values."""
+    field_map = {f["n"]: f for f in schema_section}
+
+    for key, value in flat_fields.items():
+        dtype = infer_dtype(value)
+        if isinstance(value, float):
+            val_repr = round(value, 2)
+        elif isinstance(value, (dict, list)):
+            val_repr = str(value)
+        else:
+            val_repr = value
+
+        if isinstance(val_repr, str) and len(val_repr) > 100:
+            val_repr = val_repr[:100] + "#TRUNCATED"
+
+        if key not in field_map:
+            field = {
+                "n": key,
+                "d": dtype,
+                "v": [val_repr] if val_repr is not None else [],
+            }
+            schema_section.append(field)
+            field_map[key] = field
+        else:
+            field = field_map[key]
+            if val_repr not in field["v"] and len(field["v"]) < 3:
+                field["v"].append(val_repr)
+
+
+def update_tasks_summary_schema(tasks: list[dict], schema) -> dict:
+    """Update tasks_summary schema."""
+    act_schema = update_activity_schema(tasks)
+    merged_schema = deep_merge_dicts(act_schema, schema)
+    return merged_schema
+
+
+def update_activity_schema(tasks: list[dict]) -> dict:
+    """Build schema for each activity_id from list of task dicts."""
+    schema = defaultdict(
+        lambda: {
+            "in": [],
+            "out": [],
+            # "tel": [],
+        }
+    )
+
+    for task in tasks:
+        activity_id = task.get("activity_id")
+        if not activity_id:
+            continue
+
+        activity_schema = schema[activity_id]
+
+        for section_key, schema_key in [
+            ("used", "in"),
+            ("generated", "out"),
+            # ("telemetry_summary", "tel"),
+        ]:
+            section_data = task.get(section_key)
+            if isinstance(section_data, dict):
+                flat_fields = flatten_dict(section_data, parent_key=section_key)
+                update_schema(activity_schema[schema_key], flat_fields)
+
+    schema = dict(schema)
+    return schema
+
+
+def deep_merge_dicts(a: dict, b: dict) -> dict:
+    """
+    Recursively merge dict b into dict a:
+    - Does not overwrite existing values in a.
+    - If both values are dicts, merges recursively.
+    - If both values are lists, concatenates and deduplicates.
+    - Otherwise, keeps value from a.
+    Returns a new dict (does not mutate inputs).
+    """
+    result = copy.deepcopy(a)
+
+    for key, b_val in b.items():
+        if key not in result:
+            result[key] = copy.deepcopy(b_val)
+        else:
+            a_val = result[key]
+            if isinstance(a_val, dict) and isinstance(b_val, dict):
+                result[key] = deep_merge_dicts(a_val, b_val)
+            elif isinstance(a_val, list) and isinstance(b_val, list):
+                combined = a_val + [item for item in b_val if item not in a_val]
+                result[key] = combined
+            # preserve a_val otherwise
+    return result
flowcept/commons/utils.py
CHANGED
@@ -1,5 +1,6 @@
 """Utilities."""
 
+import argparse
 from datetime import datetime, timedelta, timezone
 import json
 from time import time, sleep
@@ -9,7 +10,6 @@ import platform
 import subprocess
 import types
 import numpy as np
-import pytz
 
 from flowcept import configs
 from flowcept.commons.flowcept_dataclasses.task_object import TaskObject
@@ -19,7 +19,7 @@ from flowcept.commons.vocabulary import Status
 
 
 def get_utc_now() -> float:
-    """Get UTC time."""
+    """Get current UTC time as a timestamp (seconds since epoch)."""
     now = datetime.now(timezone.utc)
     return now.timestamp()
 
@@ -159,11 +159,14 @@ class GenericJSONEncoder(json.JSONEncoder):
         return super().default(obj)
 
 
-def replace_non_serializable_times(obj, tz=
-    """Replace non-serializable
+def replace_non_serializable_times(obj, tz=timezone.utc):
+    """Replace non-serializable datetimes in an object with ISO 8601 strings (ms precision)."""
     for time_field in TaskObject.get_time_field_names():
-        if time_field in obj:
-            obj[time_field] = obj[time_field].
+        if time_field in obj and isinstance(obj[time_field], datetime):
+            obj[time_field] = obj[time_field].astimezone(tz).isoformat(timespec="milliseconds")
+
+
+__DICT__CLASSES = (argparse.Namespace,)
 
 
 def replace_non_serializable(obj):
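A behavior sketch for the hardened function, assuming `TaskObject.get_time_field_names()` returns names such as `started_at` (that method is not shown in this diff):

    from datetime import datetime, timezone

    from flowcept.commons.utils import replace_non_serializable_times

    obj = {"started_at": datetime(2025, 1, 1, tzinfo=timezone.utc), "ended_at": None}
    replace_non_serializable_times(obj)
    # started_at -> "2025-01-01T00:00:00.000+00:00"
    # ended_at stays None: the new isinstance(..., datetime) guard skips it,
    # where the old version converted any present value unconditionally.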
@@ -180,6 +183,8 @@ def replace_non_serializable(obj):
         return obj.to_flowcept_dict()
     elif hasattr(obj, "to_dict"):
         return obj.to_dict()
+    elif isinstance(obj, __DICT__CLASSES):
+        return obj.__dict__
     else:
         # Replace non-serializable values with id()
         return f"{obj.__class__.__name__}_instance_id_{id(obj)}"
@@ -262,6 +267,20 @@ class GenericJSONDecoder(json.JSONDecoder):
         return inst
 
 
+def get_git_info(path: str = "."):
+    """Get Git Repo metadata."""
+    from git import Repo
+
+    repo = Repo(path, search_parent_directories=True)
+    head = repo.head.commit.hexsha
+    short = repo.git.rev_parse(head, short=True)
+    branch = repo.active_branch.name if not repo.head.is_detached else "HEAD"
+    remote = next(iter(repo.remotes)).url if repo.remotes else None
+    dirty = "dirty" if repo.is_dirty() else "clean"
+    root = repo.working_tree_dir
+    return {"sha": head, "short_sha": short, "branch": branch, "root": root, "remote": remote, "dirty": dirty}
+
+
 class ClassProperty:
     """Wrapper to simulate property of class methods, removed in py313."""
 
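`get_git_info` depends on GitPython (hence the lazy `from git import Repo`), and `Repo(...)` raises when no repository encloses the path, which is why `WorkflowObject.__init__` wraps the call in try/except. A usage sketch:

    from flowcept.commons.utils import get_git_info

    try:
        info = get_git_info()  # searches parent dirs from "." for a repo
        print(info["short_sha"], info["branch"], info["dirty"])
    except Exception:
        # No enclosing git repository; WorkflowObject catches this case
        # and leaves code_repository as None.
        pass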
flowcept/configs.py
CHANGED
@@ -4,29 +4,34 @@ import os
 import socket
 import getpass
 
+from flowcept.version import __version__
+
 PROJECT_NAME = "flowcept"
+
+DEFAULT_SETTINGS = {
+    "version": __version__,
+    "log": {"log_file_level": "disable", "log_stream_level": "disable"},
+    "project": {"dump_buffer_path": "flowcept_messages.jsonl"},
+    "telemetry_capture": {},
+    "instrumentation": {},
+    "experiment": {},
+    "mq": {"enabled": False},
+    "kv_db": {"enabled": False},
+    "web_server": {},
+    "sys_metadata": {},
+    "extra_metadata": {},
+    "analytics": {},
+    "db_buffer": {},
+    "databases": {"mongodb": {"enabled": False}, "lmdb": {"enabled": False}},
+    "adapters": {},
+    "agent": {},
+}
+
 USE_DEFAULT = os.getenv("FLOWCEPT_USE_DEFAULT", "False").lower() == "true"
-########################
-# Project Settings #
-########################
 
 if USE_DEFAULT:
-    settings = {
-
-        "project": {},
-        "telemetry_capture": {},
-        "instrumentation": {},
-        "experiment": {},
-        "mq": {},
-        "kv_db": {},
-        "web_server": {},
-        "sys_metadata": {},
-        "extra_metadata": {},
-        "analytics": {},
-        "buffer": {},
-        "databases": {},
-        "adapters": {},
-    }
+    settings = DEFAULT_SETTINGS.copy()
+
 else:
     from omegaconf import OmegaConf
 
@@ -42,7 +47,13 @@ else:
             settings = OmegaConf.load(f)
     else:
         settings = OmegaConf.load(SETTINGS_PATH)
-
+
+    # Making sure all settings are in place.
+    keys = DEFAULT_SETTINGS.keys() - settings.keys()
+    if len(keys):
+        for k in keys:
+            settings[k] = DEFAULT_SETTINGS[k]
+
 ########################
 # Log Settings #
 ########################
@@ -68,6 +79,7 @@ FLOWCEPT_USER = settings["experiment"].get("user", "blank_user")
 
 MQ_INSTANCES = settings["mq"].get("instances", None)
 MQ_SETTINGS = settings["mq"]
+MQ_ENABLED = os.getenv("MQ_ENABLED", settings["mq"].get("enabled", True))
 MQ_TYPE = os.getenv("MQ_TYPE", settings["mq"].get("type", "redis"))
 MQ_CHANNEL = os.getenv("MQ_CHANNEL", settings["mq"].get("channel", "interception"))
 MQ_PASSWORD = settings["mq"].get("password", None)
@@ -87,7 +99,7 @@ KVDB_PASSWORD = settings["kv_db"].get("password", None)
 KVDB_HOST = os.getenv("KVDB_HOST", settings["kv_db"].get("host", "localhost"))
 KVDB_PORT = int(os.getenv("KVDB_PORT", settings["kv_db"].get("port", "6379")))
 KVDB_URI = os.getenv("KVDB_URI", settings["kv_db"].get("uri", None))
-KVDB_ENABLED = settings["kv_db"].get("enabled",
+KVDB_ENABLED = settings["kv_db"].get("enabled", False)
 
 
 DATABASES = settings.get("databases", {})
@@ -136,19 +148,20 @@ DB_INSERTER_MAX_TRIALS_STOP = db_buffer_settings.get("stop_max_trials", 240)
 DB_INSERTER_SLEEP_TRIALS_STOP = db_buffer_settings.get("stop_trials_sleep", 0.01)
 
 
-
+###########################
 # PROJECT SYSTEM SETTINGS #
-
+###########################
 
-DB_FLUSH_MODE = settings["project"].get("db_flush_mode", "
+DB_FLUSH_MODE = settings["project"].get("db_flush_mode", "offline")
 # DEBUG_MODE = settings["project"].get("debug", False)
 PERF_LOG = settings["project"].get("performance_logging", False)
 JSON_SERIALIZER = settings["project"].get("json_serializer", "default")
 REPLACE_NON_JSON_SERIALIZABLE = settings["project"].get("replace_non_json_serializable", True)
 ENRICH_MESSAGES = settings["project"].get("enrich_messages", True)
-
+DUMP_BUFFER_PATH = settings["project"].get("dump_buffer_path", None)
 
 TELEMETRY_CAPTURE = settings.get("telemetry_capture", None)
+TELEMETRY_ENABLED = TELEMETRY_CAPTURE is not None and len(TELEMETRY_CAPTURE)
 
 ######################
 # SYS METADATA #
@@ -219,9 +232,11 @@ ANALYTICS = settings.get("analytics", None)
 ####################
 
 INSTRUMENTATION = settings.get("instrumentation", {})
-INSTRUMENTATION_ENABLED = INSTRUMENTATION.get("enabled",
+INSTRUMENTATION_ENABLED = INSTRUMENTATION.get("enabled", True)
 
 AGENT = settings.get("agent", {})
+AGENT_HOST = os.getenv("AGENT_HOST", settings["agent"].get("mcp_host", "localhost"))
+AGENT_PORT = int(os.getenv("AGENT_PORT", settings["agent"].get("mcp_port", "8000")))
 
 ####################
 # Enabled ADAPTERS #