flowcept 0.9.17__py3-none-any.whl → 0.9.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. flowcept/agents/agent_client.py +10 -4
  2. flowcept/agents/agents_utils.py +54 -19
  3. flowcept/agents/flowcept_agent.py +116 -12
  4. flowcept/agents/flowcept_ctx_manager.py +116 -46
  5. flowcept/agents/gui/gui_utils.py +21 -3
  6. flowcept/agents/prompts/general_prompts.py +1 -1
  7. flowcept/agents/prompts/in_memory_query_prompts.py +158 -45
  8. flowcept/agents/tools/general_tools.py +20 -3
  9. flowcept/agents/tools/in_memory_queries/in_memory_queries_tools.py +14 -31
  10. flowcept/commons/daos/docdb_dao/lmdb_dao.py +48 -0
  11. flowcept/commons/daos/keyvalue_dao.py +12 -3
  12. flowcept/commons/daos/mq_dao/mq_dao_base.py +37 -20
  13. flowcept/commons/daos/mq_dao/mq_dao_kafka.py +2 -2
  14. flowcept/commons/daos/mq_dao/mq_dao_redis.py +33 -2
  15. flowcept/commons/flowcept_dataclasses/task_object.py +4 -1
  16. flowcept/configs.py +17 -3
  17. flowcept/flowcept_api/flowcept_controller.py +5 -1
  18. flowcept/flowceptor/adapters/mlflow/interception_event_handler.py +33 -2
  19. flowcept/flowceptor/adapters/mlflow/mlflow_interceptor.py +18 -4
  20. flowcept/flowceptor/adapters/tensorboard/tensorboard_interceptor.py +1 -0
  21. flowcept/flowceptor/consumers/agent/base_agent_context_manager.py +9 -10
  22. flowcept/flowceptor/consumers/base_consumer.py +22 -4
  23. flowcept/flowceptor/consumers/document_inserter.py +22 -1
  24. flowcept/instrumentation/flowcept_task.py +147 -51
  25. flowcept/instrumentation/task_capture.py +10 -1
  26. flowcept/version.py +1 -1
  27. {flowcept-0.9.17.dist-info → flowcept-0.9.19.dist-info}/METADATA +8 -1
  28. {flowcept-0.9.17.dist-info → flowcept-0.9.19.dist-info}/RECORD +32 -32
  29. {flowcept-0.9.17.dist-info → flowcept-0.9.19.dist-info}/WHEEL +1 -1
  30. resources/sample_settings.yaml +2 -1
  31. {flowcept-0.9.17.dist-info → flowcept-0.9.19.dist-info}/entry_points.txt +0 -0
  32. {flowcept-0.9.17.dist-info → flowcept-0.9.19.dist-info}/licenses/LICENSE +0 -0
flowcept/agents/prompts/in_memory_query_prompts.py
@@ -1,32 +1,117 @@
  # flake8: noqa: E501
  # flake8: noqa: D103
 
- COMMON_TASK_FIELDS = """
- | Column | Data Type | Description |
- |-------------------------------|-------------|
- | `workflow_id` | string | Workflow the task belongs to. Use this field when the query is asking about workflow execution |
- | `task_id` | string | Task identifier. |
- | `parent_task_id` | string | A task may be directly linked to others. Use this field when the query asks for a task informed by (or associated with or linked to) other task. |
- | `activity_id` | string | Type of task (e.g., 'choose_option'). Use this for "task type" queries. One activity_id is linked to multiple task_ids. |
- | `campaign_id` | string | A group of workflows. |
- | `hostname` | string | Compute node name. |
- | `agent_id` | string | Set if executed by an agent. |
- | `started_at` | datetime64[ns, UTC] | Start time of a task. Always use this field when the query is has any temporal reference related to the workflow execution, such as 'get the first 10 workflow executions' or 'the last workflow execution'. |
- | `ended_at` | datetime64[ns, UTC] | End time of a task. |
- | `subtype` | string | Subtype of a task. |
- | `tags` | List[str] | List of descriptive tags. |
- | `image` | blob | Raw binary data related to an image. |
- | `telemetry_summary.duration_sec` | float | Task duration (seconds). |
- | `telemetry_summary.cpu.percent_all_diff` | float | Difference in overall CPU utilization percentage across all cores between task end and start.|
- | `telemetry_summary.cpu.user_time_diff` | float | Difference average per core CPU user time ( seconds ) between task start and end times.|
- | `telemetry_summary.cpu.system_time_diff` | float | Difference in CPU system (kernel) time (seconds) used during the task execution.|
- | `telemetry_summary.cpu.idle_time_diff` | float | Difference in CPU idle time (seconds) during task end and start.|
- ---
- For any queries involving CPU, use fields that begin with telemetry_summary.cpu
+
+ def generate_common_task_fields(current_fields):
+     # TODO: make this better
+     common_task_fields = """
+ | Column | Data Type | Description |
+ |-------------------------------|-------------|
  """
+     common_task_fields += (
+         "| `workflow_id` | string | Workflow the task belongs to. Use this field when the query is asking about workflow execution |\n"
+         if "workflow_id" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `task_id` | string | Task identifier. |\n" if "task_id" in current_fields else ""
+     )
+     common_task_fields += (
+         "| `parent_task_id` | string | A task may be directly linked to others. Use this field when the query asks for a task informed by (or associated with or linked to) other task. |\n"
+         if "parent_task_id" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `activity_id` | string | Type of task (e.g., 'choose_option'). Use this for \"task type\" queries. One activity_id is linked to multiple task_ids. |\n"
+         if "activity_id" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `campaign_id` | string | A group of workflows. |\n"
+         if "campaign_id" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `hostname` | string | Compute node name. |\n" if "hostname" in current_fields else ""
+     )
+     common_task_fields += (
+         "| `agent_id` | string | Set if executed by an agent. |\n"
+         if "agent_id" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `started_at` | datetime64[ns, UTC] | Start time of a task. Always use this field when the query has any temporal reference related to the workflow execution, such as 'get the first 10 workflow executions' or 'the last workflow execution'. |\n"
+         if "started_at" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `ended_at` | datetime64[ns, UTC] | End time of a task. |\n"
+         if "ended_at" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `subtype` | string | Subtype of a task. |\n" if "subtype" in current_fields else ""
+     )
+     common_task_fields += (
+         "| `tags` | List[str] | List of descriptive tags. |\n"
+         if "tags" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `image` | blob | Raw binary data related to an image. |\n"
+         if "image" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `telemetry_summary.duration_sec` | float | Task duration (seconds). |\n"
+         if "telemetry_summary.duration_sec" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `telemetry_summary.cpu.percent_all_diff` | float | Difference in overall CPU utilization percentage across all cores between task end and start. |\n"
+         if "telemetry_summary.cpu.percent_all_diff" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `telemetry_summary.cpu.user_time_diff` | float | Difference average per core CPU user time (seconds) between task start and end times. |\n"
+         if "telemetry_summary.cpu.user_time_diff" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `telemetry_summary.cpu.system_time_diff` | float | Difference in CPU system (kernel) time (seconds) used during the task execution. |\n"
+         if "telemetry_summary.cpu.system_time_diff" in current_fields
+         else ""
+     )
+     common_task_fields += (
+         "| `telemetry_summary.cpu.idle_time_diff` | float | Difference in CPU idle time (seconds) during task end and start. |\n"
+         if "telemetry_summary.cpu.idle_time_diff" in current_fields
+         else ""
+     )
+
+     common_task_fields += "\n For any queries involving CPU, use fields that begin with telemetry_summary.cpu"
+
+     return common_task_fields
+
 
  DF_FORM = "The user has a pandas DataFrame called `df`, created from flattened task objects using `pd.json_normalize`."
 
+ CURRENT_DF_COLUMNS_PROMPT = """
+ ### ABSOLUTE FIELD CONSTRAINT -- THIS IS CRITICAL
+
+ The following list is the ONLY valid field names in df. Treat this as the schema:
+
+ ALLOWED_FIELDS = [COLS]
+
+ You MUST treat this list as authoritative.
+
+ - You may only use fields names that appear EXACTLY (string match) in ALLOWED_FIELDS.
+ - You are NOT allowed to create new field names by:
+   - adding or removing prefixes like "used." or "generated."
+   - combining words
+   - guessing.
+ - If a field name is not in ALLOWED_FIELDS, you MUST NOT use it.
+ """
+
 
  def get_example_values_prompt(example_values):
      values_prompt = f"""
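Later hunks pass `list(df.columns)` as `current_fields`, so the rewritten helper presumably emits only the schema rows whose columns exist in the live DataFrame. A minimal sketch (the column names here are invented for illustration):

```python
import pandas as pd

# Hypothetical df flattened via pd.json_normalize, as the prompts assume.
df = pd.DataFrame(columns=["workflow_id", "task_id", "started_at", "telemetry_summary.duration_sec"])

# Emits only the four matching rows of the Markdown schema table.
print(generate_common_task_fields(list(df.columns)))
```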
@@ -39,7 +124,7 @@ def get_example_values_prompt(example_values):
      return values_prompt
 
 
- def get_df_schema_prompt(dynamic_schema, example_values):
+ def get_df_schema_prompt(dynamic_schema, example_values, current_fields):
      schema_prompt = f"""
  ## DATAFRAME STRUCTURE
 
@@ -53,14 +138,19 @@ def get_df_schema_prompt(dynamic_schema, example_values):
  The schema for these fields is defined in the dictionary below.
  It maps each activity ID to its inputs (i) and outputs (o), using flattened field names that include `used.` or `generated.` prefixes to indicate the role the field played in the task. These names match the columns in the dataframe `df`.
 
- ```python
  {dynamic_schema}
- ```
  Use this schema and fields to understand what inputs and outputs are valid for each activity.
+
+ IMPORTANT: The user might say used for outputs or generated for inputs, which might confuse you. Do not get tricked by the user.
+ Ignore the natural-language words "used" and "generated".
+ - The English phrase "used in the calculation" does NOT mean you must use a `used.` column.
+ - The English word "generated" in the question does NOT force you to use a `generated.` column either.
+
+ ALWAYS CHECK THE ALLOWED_FIELDS list before proceeding. THIS IS CRITICAL.
 
  ### 2. Additional fields for tasks:
 
- {COMMON_TASK_FIELDS}
+ {generate_common_task_fields(current_fields)}
  ---
  """
 
@@ -70,12 +160,12 @@ def get_df_schema_prompt(dynamic_schema, example_values):
      return prompt
 
 
- def generate_plot_code_prompt(query, dynamic_schema, example_values) -> str:
+ def generate_plot_code_prompt(query, dynamic_schema, example_values, current_fields) -> str:
      PLOT_PROMPT = f"""
  You are a Streamlit chart expert.
  {DF_FORM}
 
- {get_df_schema_prompt(dynamic_schema, example_values)}
+ {get_df_schema_prompt(dynamic_schema, example_values, current_fields)}
 
  ### 3. Guidelines
 
@@ -121,10 +211,14 @@ def generate_plot_code_prompt(query, dynamic_schema, example_values) -> str:
  "plot_code": "import matplotlib.pyplot as plt\nplt.hist(result['n_controls'])\nst.pyplot(plt)"
  }}
 
+ Your response must be only the raw Python code in the format:
+ result = ...
+ Except for the `result` variable, YOU MUST NEVER CREATE ANY OTHER VARIABLE. NEVER!
+
  User request:
  {query}
 
- THE OUTPUT MUST BE A VALID JSON ONLY. DO NOT SAY ANYTHING ELSE.
+
 
  """
      return PLOT_PROMPT
@@ -139,7 +233,7 @@ QUERY_GUIDELINES = """
 
  - Use `df` as the base DataFrame.
  - Use `activity_id` to filter by task type (valid values = schema keys).
- - Use `used.` for parameters (inputs) and `generated.` for outputs (metrics).
+ - ONLY IF the ALLOWED_FIELDS list allow, use `used.` for parameters (inputs) and `generated.` for outputs (metrics).
  - Use `telemetry_summary.duration_sec` for performance-related questions.
  - Use `hostname` when user mentions *where* a task ran.
  - Use `agent_id` when the user refers to agents (non-null means task was agent-run).
@@ -153,7 +247,7 @@ QUERY_GUIDELINES = """
  **THE COLUMN 'used' DOES NOT EXIST**
  **THE COLUMN 'generated' DOES NOT EXIST**
  - **When filtering by `activity_id`, only select columns that belong to that activity’s schema.**
- - Use only `used.` and `generated.` fields listed in the schema for that `activity_id`.
+ - Always observing the ALLOWED_FIELDS list, use only `used.` and `generated.` fields listed in the schema for that `activity_id`.
  - Explicitly list the selected columns — **never return all columns**
  - **Only include telemetry columns if used in the query logic.**
  -THERE IS NOT A FIELD NAMED `telemetry_summary.start_time` or `telemetry_summary.end_time` or `used.start_time` or `used.end_time`. Use `started_at` and `ended_at` instead when you want to find the duration of a task, activity, or workflow execution.
@@ -187,6 +281,17 @@ QUERY_GUIDELINES = """
  -**Do NOT use any of those: df[df['started_at'].idxmax()], df[df['started_at'].idxmin()], df[df['ended_at'].idxmin()], df[df['ended_at'].idxmax()]. Those are not valid Pandas Code.**
  - When the query mentions "each task", or "each activity", or "each workflow", make sure you show (project) the correct id column in the results (i.e., respectively: `task_id`, `activity_id`, `workflow_id`) to identify those in the results.
  - Use df[<role>.field_name] == True or df[<role>.field_name] == False when user queries boolean fields, where <role> is either used or generated, depending on the field name. Make sure field_name is a valid field in the DataFrame.
+
+ If the query asks you to report which values appear in one or more columns
+ (for example “which X were used”, “list all Y”, “what X and Y were generated”), then:
+
+ For each relevant column, select that column from df.
+ Call .dropna() on that column to remove missing values.
+ After dropping NaNs, apply .unique(), .value_counts(), or any other aggregation as needed.
+ Select that column.
+ Call .dropna() on it.
+ Then call .unique(), .value_counts(), or any other aggregation.
+
 
  - **Do not include metadata columns unless explicitly required by the user query.**
  """
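The dropna-then-aggregate pattern this new guideline prescribes, as a concrete sketch (`hostname` is one of the schema columns listed earlier):

```python
# "Which hostnames were used?": select the column, drop missing values, aggregate.
result = df["hostname"].dropna().unique()

# Counted variant of the same pattern.
result = df["hostname"].dropna().value_counts()
```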
@@ -200,15 +305,16 @@ FEW_SHOTS = """
  # Q: How many tasks for each activity?
  result = df['activity_id'].value_counts()
 
- # Q: What is the average loss across all tasks?
- result = df['generated.loss'].mean()
-
- # Q: select the 'choose_option' tasks executed by the agent, and show the planned controls, generated option, scores, explanations
- result = df[(df['activity_id'] == 'choose_option') & (df['agent_id'].notna())][['used.planned_controls', 'generated.option', 'used.scores.scores', 'generated.explanation']].copy()
-
- # Q: Show duration and generated scores for 'simulate_layer' tasks
- result = df[df['activity_id'] == 'simulate_layer'][['telemetry_summary.duration_sec', 'generated.scores']]
  """
+ # # Q: What is the average loss across all tasks?
+ # result = df['generated.loss'].mean()
+ #
+ # # Q: select the 'choose_option' tasks executed by the agent, and show the planned controls, generated option, scores, explanations
+ # result = df[(df['activity_id'] == 'choose_option') & (df['agent_id'].notna())][
+ #     ['used.planned_controls', 'generated.option', 'used.scores.scores', 'generated.explanation']].copy()
+ #
+ # # Q: Show duration and generated scores for 'simulate_layer' tasks
+ # result = df[df['activity_id'] == 'simulate_layer'][['telemetry_summary.duration_sec', 'generated.scores']]
 
  OUTPUT_FORMATTING = """
  6. Final Instructions
@@ -226,7 +332,7 @@ OUTPUT_FORMATTING = """
  """
 
 
- def generate_pandas_code_prompt(query: str, dynamic_schema, example_values, custom_user_guidances):
+ def generate_pandas_code_prompt(query: str, dynamic_schema, example_values, custom_user_guidances, current_fields):
      if custom_user_guidances is not None and isinstance(custom_user_guidances, list) and len(custom_user_guidances):
          concatenated_guidance = "\n".join(f"- {msg}" for msg in custom_user_guidances)
          custom_user_guidance_prompt = (
@@ -236,11 +342,14 @@ def generate_pandas_code_prompt(query: str, dynamic_schema, example_values, cust
          )
      else:
          custom_user_guidance_prompt = ""
+
+     curr_cols = CURRENT_DF_COLUMNS_PROMPT.replace("[COLS]", str(current_fields))
      prompt = (
          f"{ROLE}"
          f"{JOB}"
          f"{DF_FORM}"
-         f"{get_df_schema_prompt(dynamic_schema, example_values)}"  # main tester
+         f"{curr_cols}"
+         f"{get_df_schema_prompt(dynamic_schema, example_values, current_fields)}"  # main tester
          f"{QUERY_GUIDELINES}"  # main tester
          f"{FEW_SHOTS}"  # main tester
          f"{custom_user_guidance_prompt}"
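For reference, the `[COLS]` placeholder in `CURRENT_DF_COLUMNS_PROMPT` is filled by plain string substitution; a minimal sketch with an invented column list (in the package this is `list(df.columns)`):

```python
# Hypothetical columns, for illustration only.
current_fields = ["workflow_id", "used.lr", "generated.loss"]
curr_cols = CURRENT_DF_COLUMNS_PROMPT.replace("[COLS]", str(current_fields))
# The prompt now contains: ALLOWED_FIELDS = ['workflow_id', 'used.lr', 'generated.loss']
```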
@@ -251,7 +360,7 @@ def generate_pandas_code_prompt(query: str, dynamic_schema, example_values, cust
      return prompt
 
 
- def dataframe_summarizer_context(code, reduced_df, dynamic_schema, example_values, query) -> str:
+ def dataframe_summarizer_context(code, reduced_df, dynamic_schema, example_values, query, current_fields) -> str:
      job = "You are a Workflow Provenance Specialist analyzing a DataFrame that was obtained to answer a query."
 
      if "image" in reduced_df.columns:
@@ -272,7 +381,7 @@ def dataframe_summarizer_context(code, reduced_df, dynamic_schema, example_value
  {reduced_df}
 
  **Original df (before reduction) had this schema:
- {get_df_schema_prompt(dynamic_schema, example_values)}
+ {get_df_schema_prompt(dynamic_schema, example_values, current_fields)}
 
  Your task is to find a concise and direct answer as an English sentence to the user query.
 
@@ -310,7 +419,7 @@ def extract_or_fix_json_code_prompt(raw_text) -> str:
      return prompt
 
 
- def extract_or_fix_python_code_prompt(raw_text):
+ def extract_or_fix_python_code_prompt(raw_text, current_fields):
      prompt = f"""
  You are a Pandas DataFrame code extractor and fixer. Pandas is a well-known data science Python library for querying datasets.
  You are given a raw user message that may include explanations, markdown fences, or partial DataFrame code that queries a DataFrame `df`.
@@ -319,9 +428,13 @@ def extract_or_fix_python_code_prompt(raw_text):
  1. Check if the message contains a valid DataFrame code.
  2. If it does, extract the code.
  3. If there are any syntax errors, fix them.
- 4. Return only the corrected DataFrame query code no explanations, no comments, no markdown.
+ 4. Carefully analyze the list of columns in the query. The query must only use fields in this list:
+ ALLOWED_FIELDS = {current_fields}.
+ If there are fields not in this list, replace the fields to match according to the ALLOWED_FIELDS list.
+ 5. Return only the corrected DataFrame query code — no explanations, no comments, no markdown.
 
  The output must be valid Python code, and must not include any other text.
+ Your output can only contain fields in the ALLOWED_FIELDS list.
  This output will be parsed by another program.
 
  ONCE AGAIN, ONLY PRODUCE THE PYTHON CODE. DO NOT SAY ANYTHING ELSE!
flowcept/agents/tools/general_tools.py
@@ -1,7 +1,7 @@
  import json
  from typing import List
 
- from flowcept.agents.agents_utils import build_llm_model, ToolResult
+ from flowcept.agents.agents_utils import build_llm_model, ToolResult, normalize_message
  from flowcept.agents.flowcept_ctx_manager import mcp_flowcept
  from flowcept.agents.prompts.general_prompts import ROUTING_PROMPT, SMALL_TALK_PROMPT
 
@@ -105,6 +105,19 @@ def reset_records() -> ToolResult:
          return ToolResult(code=499, result=str(e))
 
 
+ @mcp_flowcept.tool()
+ def reset_context() -> ToolResult:
+     """
+     Resets all context.
+     """
+     try:
+         ctx = mcp_flowcept.get_context()
+         ctx.request_context.lifespan_context.reset_context()
+         return ToolResult(code=201, result="Context reset.")
+     except Exception as e:
+         return ToolResult(code=499, result=str(e))
+
+
  @mcp_flowcept.tool()
  def prompt_handler(message: str) -> ToolResult:
      """
@@ -120,20 +133,24 @@ def prompt_handler(message: str) -> ToolResult:
      TextContent
          The AI response or routing feedback.
      """
-     df_key_words = ["df", "save", "result = df", "reset context"]
+     df_key_words = ["df", "save", "result = df"]
      for key in df_key_words:
          if key in message:
              return run_df_query(llm=None, query=message, plot=False)
 
+     if "reset context" in message:
+         return reset_context()
      if "@record" in message:
          return record_guidance(message)
      if "@show records" in message:
          return show_records()
      if "@reset records" in message:
-         return reset_records(message)
+         return reset_records()
 
      llm = build_llm_model()
 
+     message = normalize_message(message)
+
      prompt = ROUTING_PROMPT + message
      route = llm.invoke(prompt)
 
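With this reordering, a plain "reset context" message bypasses `run_df_query` and the LLM entirely. A sketch of the expected happy-path behavior (return values per the `reset_context` hunk above):

```python
# "reset context" matches none of ["df", "save", "result = df"], so the
# dedicated branch dispatches to the new tool before any LLM is built.
res = prompt_handler("reset context")
assert res.code == 201 and res.result == "Context reset."
```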
flowcept/agents/tools/in_memory_queries/in_memory_queries_tools.py
@@ -63,11 +63,6 @@ def run_df_query(llm, query: str, plot=False) -> ToolResult:
 
      Examples
      --------
-     Reset the context:
-
-     >>> run_df_query(llm, "reset context")
-     ToolResult(code=201, result="Context Reset!")
-
      Save the current DataFrame:
 
      >>> run_df_query(llm, "save")
@@ -90,10 +85,6 @@ def run_df_query(llm, query: str, plot=False) -> ToolResult:
      custom_user_guidance = ctx.request_context.lifespan_context.custom_guidance
      if df is None or not len(df):
          return ToolResult(code=404, result="Current df is empty or null.")
-
-     if "reset context" in query:
-         ctx.request_context.lifespan_context.df = pd.DataFrame()
-         return ToolResult(code=201, result="Context Reset!")
      elif "save" in query:
          return save_df(df, schema, value_examples)
      elif "result = df" in query:
@@ -173,7 +164,7 @@ def generate_plot_code(llm, query, dynamic_schema, value_examples, df, custom_us
      >>> print(result.result["plot_code"])
      plt.bar(result_df["region"], result_df["total_sales"])
      """
-     plot_prompt = generate_plot_code_prompt(query, dynamic_schema, value_examples)
+     plot_prompt = generate_plot_code_prompt(query, dynamic_schema, value_examples, list(df.columns))
      try:
          response = llm(plot_prompt)
      except Exception as e:
@@ -300,7 +291,9 @@ def generate_result_df(
      if llm is None:
          llm = build_llm_model()
      try:
-         prompt = generate_pandas_code_prompt(query, dynamic_schema, example_values, custom_user_guidance)
+         prompt = generate_pandas_code_prompt(
+             query, dynamic_schema, example_values, custom_user_guidance, list(df.columns)
+         )
          response = llm(prompt)
      except Exception as e:
          return ToolResult(code=400, result=str(e), extra=prompt)
@@ -317,9 +310,10 @@ def generate_result_df(
              extra={"generated_code": result_code, "exception": str(e), "prompt": prompt},
          )
      else:
-         tool_result = extract_or_fix_python_code(llm, result_code)
+         tool_result = extract_or_fix_python_code(llm, result_code, list(df.columns))
          if tool_result.code == 201:
              new_result_code = tool_result.result
+             result_code = new_result_code
              try:
                  result_df = safe_execute(df, new_result_code)
              except Exception as e:
@@ -357,12 +351,7 @@ def generate_result_df(
      if summarize:
          try:
              tool_result = summarize_result(
-                 llm,
-                 result_code,
-                 result_df,
-                 query,
-                 dynamic_schema,
-                 example_values,
+                 llm, result_code, result_df, query, dynamic_schema, example_values, list(df.columns)
              )
              if tool_result.is_success():
                  return_code = 301
@@ -377,7 +366,7 @@ def generate_result_df(
              return_code = 303
 
      try:
-         result_df = format_result_df(result_df)
+         result_df_str = format_result_df(result_df)
      except Exception as e:
          return ToolResult(
              code=405,
@@ -387,7 +376,8 @@ def generate_result_df(
 
      this_result = {
          "result_code": result_code,
-         "result_df": result_df,
+         "result_df": result_df_str,
+         "result_df_markdown": result_df.to_markdown(index=False),
          "summary": summary,
          "summary_error": summary_error,
      }
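`DataFrame.to_markdown` delegates to the optional `tabulate` package, so the new `result_df_markdown` entry assumes that dependency is installed. A quick illustration (data invented):

```python
import pandas as pd

result_df = pd.DataFrame({"activity_id": ["train", "eval"], "count": [3, 1]})
print(result_df.to_markdown(index=False))  # pipe-style Markdown table; needs `tabulate`
```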
@@ -473,7 +463,7 @@ def run_df_code(user_code: str, df):
 
 
  @mcp_flowcept.tool()
- def extract_or_fix_python_code(llm, raw_text):
+ def extract_or_fix_python_code(llm, raw_text, current_fields):
      """
      Extract or repair JSON code from raw text using an LLM.
 
@@ -523,7 +513,7 @@ def extract_or_fix_python_code(llm, raw_text):
      >>> print(res)
      ToolResult(code=499, result='LLM service unavailable')
      """
-     prompt = extract_or_fix_python_code_prompt(raw_text)
+     prompt = extract_or_fix_python_code_prompt(raw_text, current_fields)
      try:
          response = llm(prompt)
          return ToolResult(code=201, result=response)
@@ -582,14 +572,7 @@ def extract_or_fix_json_code(llm, raw_text) -> ToolResult:
 
 
  @mcp_flowcept.tool()
- def summarize_result(
-     llm,
-     code,
-     result,
-     query: str,
-     dynamic_schema,
-     example_values,
- ) -> ToolResult:
+ def summarize_result(llm, code, result, query: str, dynamic_schema, example_values, current_fields) -> ToolResult:
      """
      Summarize the pandas result with local reduction for large DataFrames.
      - For wide DataFrames, selects top columns based on variance and uniqueness.
@@ -597,7 +580,7 @@ def summarize_result(
      - Constructs a detailed prompt for the LLM with original column context.
      """
      summarized_df = summarize_df(result, code)
-     prompt = dataframe_summarizer_context(code, summarized_df, dynamic_schema, example_values, query)
+     prompt = dataframe_summarizer_context(code, summarized_df, dynamic_schema, example_values, query, current_fields)
      try:
          response = llm(prompt)
          return ToolResult(code=201, result=response)
flowcept/commons/daos/docdb_dao/lmdb_dao.py
@@ -115,6 +115,54 @@ class LMDBDAO(DocumentDBDAO):
              self.logger.exception(e)
              return False
 
+     def delete_task_keys(self, key_name, keys_list: List[str]) -> bool:
+         """Delete task documents by a key value list.
+
+         When deleting by task_id, deletes keys directly. Otherwise, scans
+         tasks and deletes matching entries.
+         """
+         if self._is_closed:
+             self._open()
+         if type(keys_list) is not list:
+             keys_list = [keys_list]
+         try:
+             with self._env.begin(write=True, db=self._tasks_db) as txn:
+                 if key_name == "task_id":
+                     for key in keys_list:
+                         if key is None:
+                             continue
+                         txn.delete(str(key).encode())
+                 else:
+                     cursor = txn.cursor()
+                     for key, value in cursor:
+                         entry = json.loads(value.decode())
+                         if entry.get(key_name) in keys_list:
+                             cursor.delete()
+             return True
+         except Exception as e:
+             self.logger.exception(e)
+             return False
+
+     def count_tasks(self) -> int:
+         """Count number of docs in tasks collection."""
+         if self._is_closed:
+             self._open()
+         try:
+             return self._env.stat(db=self._tasks_db).get("entries", 0)
+         except Exception as e:
+             self.logger.exception(e)
+             return -1
+
+     def count_workflows(self) -> int:
+         """Count number of docs in workflows collection."""
+         if self._is_closed:
+             self._open()
+         try:
+             return self._env.stat(db=self._workflows_db).get("entries", 0)
+         except Exception as e:
+             self.logger.exception(e)
+             return -1
+
      @staticmethod
      def _match_filter(entry, filter):
          """
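A hedged usage sketch for the three new helpers (DAO construction elided; assumes a default-configured `LMDBDAO` instance):

```python
dao = LMDBDAO()  # assumes default settings resolve the LMDB path

# Fast path: task_id is the storage key, so deletion is direct.
dao.delete_task_keys("task_id", ["t-1", "t-2"])

# Any other key name triggers a full scan of the tasks collection.
dao.delete_task_keys("workflow_id", ["wf-123"])

print(dao.count_tasks(), dao.count_workflows())  # -1 signals a logged error
```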
flowcept/commons/daos/keyvalue_dao.py
@@ -1,7 +1,5 @@
  """Key value module."""
 
- from flowcept.commons.daos.redis_conn import RedisConn
-
  from flowcept.commons.flowcept_logger import FlowceptLogger
  from flowcept.configs import (
      KVDB_HOST,
@@ -26,12 +24,23 @@ class KeyValueDAO:
 
      def __init__(self):
          if not hasattr(self, "_initialized"):
-             self._initialized = True
              self.logger = FlowceptLogger()
+             from flowcept.commons.daos.redis_conn import RedisConn
+
              self.redis_conn = RedisConn.build_redis_conn_pool(
                  host=KVDB_HOST, port=KVDB_PORT, password=KVDB_PASSWORD, uri=KVDB_URI
              )
 
+             self._initialized = True
+
+     @staticmethod
+     def get_set_name(set_id: str, exec_bundle_id=None) -> str:
+         """Return a consistent set name for KVDB sets."""
+         set_name = set_id
+         if exec_bundle_id is not None:
+             set_name += "_" + str(exec_bundle_id)
+         return set_name
+
      def delete_set(self, set_name: str):
          """Delete it."""
          self.redis_conn.delete(set_name)
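The new static helper simply encodes the `<set_id>_<exec_bundle_id>` naming rule:

```python
KeyValueDAO.get_set_name("my_set")        # -> "my_set"
KeyValueDAO.get_set_name("my_set", 42)    # -> "my_set_42"
```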