flowcept-0.9.17-py3-none-any.whl → flowcept-0.9.18-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowcept/agents/agents_utils.py +42 -0
- flowcept/agents/flowcept_agent.py +4 -1
- flowcept/agents/flowcept_ctx_manager.py +99 -36
- flowcept/agents/gui/gui_utils.py +21 -3
- flowcept/agents/prompts/general_prompts.py +1 -1
- flowcept/agents/prompts/in_memory_query_prompts.py +158 -45
- flowcept/agents/tools/general_tools.py +20 -3
- flowcept/agents/tools/in_memory_queries/in_memory_queries_tools.py +14 -31
- flowcept/commons/daos/docdb_dao/lmdb_dao.py +48 -0
- flowcept/commons/daos/mq_dao/mq_dao_kafka.py +2 -2
- flowcept/commons/daos/mq_dao/mq_dao_redis.py +33 -2
- flowcept/commons/flowcept_dataclasses/task_object.py +4 -1
- flowcept/configs.py +4 -1
- flowcept/flowcept_api/flowcept_controller.py +5 -1
- flowcept/flowceptor/adapters/mlflow/interception_event_handler.py +33 -2
- flowcept/flowceptor/adapters/mlflow/mlflow_interceptor.py +18 -4
- flowcept/flowceptor/adapters/tensorboard/tensorboard_interceptor.py +1 -0
- flowcept/flowceptor/consumers/agent/base_agent_context_manager.py +7 -8
- flowcept/instrumentation/flowcept_task.py +147 -51
- flowcept/instrumentation/task_capture.py +10 -1
- flowcept/version.py +1 -1
- {flowcept-0.9.17.dist-info → flowcept-0.9.18.dist-info}/METADATA +8 -1
- {flowcept-0.9.17.dist-info → flowcept-0.9.18.dist-info}/RECORD +27 -27
- {flowcept-0.9.17.dist-info → flowcept-0.9.18.dist-info}/WHEEL +1 -1
- resources/sample_settings.yaml +2 -1
- {flowcept-0.9.17.dist-info → flowcept-0.9.18.dist-info}/entry_points.txt +0 -0
- {flowcept-0.9.17.dist-info → flowcept-0.9.18.dist-info}/licenses/LICENSE +0 -0
flowcept/agents/agents_utils.py
CHANGED

@@ -1,4 +1,6 @@
 import os
+import re
+import unicodedata
 from typing import Union, Dict

 from flowcept.flowceptor.consumers.agent.base_agent_context_manager import BaseAgentContextManager

@@ -194,3 +196,43 @@ def build_llm_model(
     if tool_task:
         llm.parent_task_id = tool_task.task_id
     return llm
+
+
+def normalize_message(user_msg: str) -> str:
+    """
+    Normalize a user message into a canonical, comparison-friendly form.
+
+    The function standardizes text by trimming whitespace, applying Unicode
+    normalization, normalizing dash characters, collapsing repeated whitespace,
+    removing trailing punctuation that does not affect semantics, and converting
+    the result to lowercase.
+
+    Parameters
+    ----------
+    user_msg : str
+        Raw user input message.
+
+    Returns
+    -------
+    str
+        Normalized message suitable for matching, comparison, or hashing.
+    """
+    # 1) Strip leading/trailing whitespace
+    user_msg = user_msg.strip()
+
+    # 2) Unicode normalize to avoid weird characters (like fancy quotes, dashes)
+    user_msg = unicodedata.normalize("NFKC", user_msg)
+
+    # 3) Normalize dashes commonly used in chemistry (C–H, C—H, etc.)
+    user_msg = user_msg.replace("–", "-").replace("—", "-")
+
+    # 4) Collapse multiple spaces / newlines into a single space
+    user_msg = re.sub(r"\s+", " ", user_msg)
+
+    # 5) Remove trailing punctuation that doesn't change semantics
+    # e.g., "?", "!", "." at the VERY end
+    user_msg = re.sub(r"[?!.\s]+$", "", user_msg)
+
+    user_msg = user_msg.lower()
+
+    return user_msg
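For reference, a minimal sketch of what the new `normalize_message` produces (the input below is illustrative, not from the package's tests):

```python
from flowcept.agents.agents_utils import normalize_message

normalize_message("  What  is the C–H bond enthalpy?!  ")
# -> 'what is the c-h bond enthalpy'
```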
flowcept/agents/flowcept_agent.py
CHANGED

@@ -20,11 +20,14 @@ def main():
     def run():
         uvicorn.run(mcp_flowcept.streamable_http_app, host=AGENT_HOST, port=AGENT_PORT, lifespan="on")

-    Thread(target=run)
+    server_thread = Thread(target=run, daemon=False)
+    server_thread.start()
     sleep(2)
     # Wake up tool call
     print(run_tool(check_liveness, host=AGENT_HOST, port=AGENT_PORT)[0])

+    server_thread.join()
+

 if __name__ == "__main__":
     main()
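The original code constructed the `Thread` but never called `start()`, so the uvicorn server never actually ran. A minimal standalone sketch of the corrected pattern (names illustrative):

```python
from threading import Thread
from time import sleep

def run():
    print("server loop would run here")

server_thread = Thread(target=run, daemon=False)  # Thread(...) alone does nothing
server_thread.start()                             # the target only runs after start()
sleep(2)
server_thread.join()  # non-daemon + join keeps the process alive until the server exits
```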
flowcept/agents/flowcept_ctx_manager.py
CHANGED

@@ -1,6 +1,9 @@
 from flowcept.agents.dynamic_schema_tracker import DynamicSchemaTracker
 from flowcept.agents.tools.in_memory_queries.pandas_agent_utils import load_saved_df
 from flowcept.commons.flowcept_dataclasses.task_object import TaskObject
+from flowcept.commons.flowcept_logger import FlowceptLogger
+from flowcept.commons.vocabulary import Status
+from flowcept.configs import AGENT
 from mcp.server.fastmcp import FastMCP

 import json
@@ -12,11 +15,12 @@ import pandas as pd

 from flowcept.flowceptor.consumers.agent.base_agent_context_manager import BaseAgentContextManager, BaseAppContext

-
-from flowcept.agents import agent_client
 from flowcept.commons.task_data_preprocess import summarize_task


+AGENT_DEBUG = AGENT.get("debug", False)
+
+
 @dataclass
 class FlowceptAppContext(BaseAppContext):
     """
@@ -39,6 +43,39 @@ class FlowceptAppContext(BaseAppContext):
     tracker_config: Dict | None
     custom_guidance: List[str] | None

+    def __init__(self):
+        self.logger = FlowceptLogger()
+        self.reset_context()
+
+    def reset_context(self):
+        """
+        Reset the agent's context to a clean state, initializing a new QA setup.
+        """
+        self.tasks = []
+        self.task_summaries = []
+        self.critical_tasks = []
+        self.df = pd.DataFrame()
+        self.tasks_schema = {}
+        self.value_examples = {}
+        self.custom_guidance = []
+        self.tracker_config = {}
+
+        if AGENT_DEBUG:
+            from flowcept.commons.flowcept_logger import FlowceptLogger
+
+            FlowceptLogger().warning("Running agent in DEBUG mode!")
+            df_path = "/tmp/current_agent_df.csv"
+            if os.path.exists(df_path):
+                self.logger.warning("Going to load df into context")
+                df = load_saved_df(df_path)
+                self.df = df
+            if os.path.exists("/tmp/current_tasks_schema.json"):
+                with open("/tmp/current_tasks_schema.json") as f:
+                    self.tasks_schema = json.load(f)
+            if os.path.exists("/tmp/value_examples.json"):
+                with open("/tmp/value_examples.json") as f:
+                    self.value_examples = json.load(f)
+

 class FlowceptAgentContextManager(BaseAgentContextManager):
     """
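With this change, construction and reset share one code path: `__init__` simply delegates to `reset_context()`. A hedged usage sketch (the appended task dict is illustrative):

```python
ctx = FlowceptAppContext()            # starts clean via reset_context()
ctx.tasks.append({"task_id": "t1"})   # hypothetical task record
ctx.reset_context()                   # back to a clean state; with AGENT debug on,
                                      # it also reloads /tmp/current_agent_df.csv if present
assert ctx.tasks == []
```

Note that defining `__init__` by hand on a `@dataclass` replaces the generated initializer, which is presumably why the keyword-argument construction in the old `reset_context` (removed further below) was dropped.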
@@ -61,7 +98,7 @@ class FlowceptAgentContextManager(BaseAgentContextManager):
     """

     def __init__(self):
-        self.context
+        self.context = FlowceptAppContext()
         self.tracker_config = dict(max_examples=3, max_str_len=50)
         self.schema_tracker = DynamicSchemaTracker(**self.tracker_config)
         self.msgs_counter = 0
@@ -82,7 +119,6 @@ class FlowceptAgentContextManager(BaseAgentContextManager):
         bool
             True if the message was handled successfully.
         """
-        print("Received:", msg_obj)
         msg_type = msg_obj.get("type", None)
         if msg_type == "task":
             task_msg = TaskObject.from_dict(msg_obj)
@@ -90,8 +126,62 @@ class FlowceptAgentContextManager(BaseAgentContextManager):
                 self.logger.info(f"Going to ignore our own LLM messages: {task_msg}")
                 return True

-            self.msgs_counter += 1
             self.logger.debug("Received task msg!")
+            if task_msg.subtype == "call_agent_task":
+                from flowcept.instrumentation.task_capture import FlowceptTask
+
+                if task_msg.activity_id == "reset_user_context":
+                    self.context.reset_context()
+                    self.msgs_counter = 0
+                    FlowceptTask(
+                        agent_id=self.agent_id,
+                        generated={"msg": "Provenance Agent reset context."},
+                        subtype="agent_task",
+                        activity_id="reset_user_context",
+                    ).send()
+                    return True
+                elif task_msg.activity_id == "provenance_query":
+                    self.logger.info("Received a prov query message!")
+                    query_text = task_msg.used.get("query")
+                    from flowcept.agents import ToolResult
+                    from flowcept.agents.tools.general_tools import prompt_handler
+                    from flowcept.agents.agent_client import run_tool
+
+                    resp = run_tool(tool_name=prompt_handler, kwargs={"message": query_text})[0]
+
+                    try:
+                        error = None
+                        status = Status.FINISHED
+                        tool_result = ToolResult(**json.loads(resp))
+                        if tool_result.result_is_str():
+                            generated = {"text": tool_result.result}
+                        else:
+                            generated = tool_result.result
+                    except Exception as e:
+                        status = Status.ERROR
+                        error = f"Could not convert the following into a ToolResult:\n{resp}\nException: {e}"
+                        generated = {"text": str(resp)}
+                    FlowceptTask(
+                        agent_id=self.agent_id,
+                        generated=generated,
+                        stderr=error,
+                        status=status,
+                        subtype="agent_task",
+                        activity_id="provenance_query_response",
+                    ).send()
+
+                    return True
+
+            elif (
+                task_msg.subtype == "agent_task"
+                and task_msg.agent_id is not None
+                and task_msg.agent_id == self.agent_id
+            ):
+                self.logger.info(f"Ignoring agent tasks from myself: {task_msg}")
+                return True
+
+            self.msgs_counter += 1
+
             self.context.tasks.append(msg_obj)

             task_summary = summarize_task(msg_obj, logger=self.logger)
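For orientation, these are the message shapes the handler now routes, reconstructed from the fields read in the diff above (the values are illustrative):

```python
# Resets the agent's context and acknowledges with an "agent_task" message.
reset_msg = {"type": "task", "subtype": "call_agent_task", "activity_id": "reset_user_context"}

# Runs prompt_handler on the query text and replies with a provenance_query_response.
query_msg = {
    "type": "task",
    "subtype": "call_agent_task",
    "activity_id": "provenance_query",
    "used": {"query": "how many tasks per activity?"},
}
```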
@@ -136,7 +226,9 @@ class FlowceptAgentContextManager(BaseAgentContextManager):
         Perform LLM-based analysis on the current chunk of task messages and send the results.
         """
         self.logger.debug(f"Going to begin LLM job! {self.msgs_counter}")
-
+        from flowcept.agents.agent_client import run_tool
+
+        result = run_tool("analyze_task_chunk")
         if len(result):
             content = result[0].text
             if content != "Error executing tool":
@@ -146,36 +238,7 @@ class FlowceptAgentContextManager(BaseAgentContextManager):
             else:
                 self.logger.error(content)

-    def reset_context(self):
-        """
-        Reset the agent's context to a clean state, initializing a new QA setup.
-        """
-        self.context = FlowceptAppContext(
-            tasks=[],
-            task_summaries=[],
-            critical_tasks=[],
-            df=pd.DataFrame(),
-            tasks_schema={},
-            value_examples={},
-            custom_guidance=[],
-            tracker_config=self.tracker_config,
-        )
-        DEBUG = True  # TODO debugging!
-        if DEBUG:
-            self.logger.warning("Running agent in DEBUG mode!")
-            df_path = "/tmp/current_agent_df.csv"
-            if os.path.exists(df_path):
-                self.logger.warning("Going to load df into context")
-                df = load_saved_df(df_path)
-                self.context.df = df
-            if os.path.exists("/tmp/current_tasks_schema.json"):
-                with open("/tmp/current_tasks_schema.json") as f:
-                    self.context.tasks_schema = json.load(f)
-            if os.path.exists("/tmp/value_examples.json"):
-                with open("/tmp/value_examples.json") as f:
-                    self.context.value_examples = json.load(f)
-

 # Exporting the ctx_manager and the mcp_flowcept
 ctx_manager = FlowceptAgentContextManager()
-mcp_flowcept = FastMCP("FlowceptAgent",
+mcp_flowcept = FastMCP("FlowceptAgent", lifespan=ctx_manager.lifespan, stateless_http=True)
flowcept/agents/gui/gui_utils.py
CHANGED

@@ -351,10 +351,28 @@ def exec_st_plot_code(code, result_df, st_module):
     >>> code = "st.line_chart(result)"
     >>> exec_st_plot_code(code, df, st)
     """
-
+    # 1) Make a copy of result_df and rename columns with dots
+    plot_df = result_df.copy()
+    col_map = {}
+
+    for col in plot_df.columns:
+        if "." in col:
+            new_col = col.replace(".", "_")
+            col_map[col] = new_col
+            plot_df.rename(columns={col: new_col}, inplace=True)
+
+    # 2) Rewrite the code so column names match the renamed columns
+    sanitized_code = code
+    for old, new in col_map.items():
+        # replace only inside quotes: 'generated.bd_enthalpy' → 'generated_bd_enthalpy'
+        sanitized_code = sanitized_code.replace(f"'{old}'", f"'{new}'")
+        sanitized_code = sanitized_code.replace(f'"{old}"', f'"{new}"')
+
+    print("SANITIZED CODE:\n", sanitized_code)
+    print(f"Renamed DF columms: {plot_df}")
     exec(
-
-        {"result":
+        sanitized_code,
+        {"result": plot_df, "st": st_module, "plt": __import__("matplotlib.pyplot"), "alt": __import__("altair")},
     )
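The sanitization step exists because `pd.json_normalize` produces dotted column names (e.g., `generated.bd_enthalpy`) that the LLM-generated plot code may quote literally. A self-contained sketch of the renaming logic, with assumed inputs:

```python
import pandas as pd

df = pd.DataFrame({"generated.bd_enthalpy": [1.2, 3.4]})  # hypothetical column
code = "st.line_chart(result['generated.bd_enthalpy'])"

col_map = {c: c.replace(".", "_") for c in df.columns if "." in c}
df = df.rename(columns=col_map)
for old, new in col_map.items():
    # rewrite only quoted occurrences, mirroring the diff above
    code = code.replace(f"'{old}'", f"'{new}'").replace(f'"{old}"', f'"{new}"')

print(code)  # st.line_chart(result['generated_bd_enthalpy'])
```

One caveat worth noting: `__import__("matplotlib.pyplot")` returns the top-level `matplotlib` package rather than the `pyplot` submodule, so the `plt` name exposed to the executed code may not behave like a plain `import matplotlib.pyplot as plt`.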
flowcept/agents/prompts/general_prompts.py
CHANGED

@@ -28,7 +28,7 @@ ROUTING_PROMPT = (
    # "- in_context_query: if the user asks questions about tasks or data in running workflow (or a workflow that ran recently) or if the user mentions the in-memory 'df' or a dataframe.\n"
    # "- historical_prov_query: if the user wants to query historical provenance data\n"
    "- in_chat_query: if the user appears to be asking about something that has said recently in this chat.\n"
-    "-
+    "- in_context_query: if you don't know.\n"
    "Respond with only the route label."
    "User message is below:\n "
 )
flowcept/agents/prompts/in_memory_query_prompts.py
CHANGED

@@ -1,32 +1,117 @@
 # flake8: noqa: E501
 # flake8: noqa: D103

-
-
-
-
-
-
-| `activity_id` | string | Type of task (e.g., 'choose_option'). Use this for "task type" queries. One activity_id is linked to multiple task_ids. |
-| `campaign_id` | string | A group of workflows. |
-| `hostname` | string | Compute node name. |
-| `agent_id` | string | Set if executed by an agent. |
-| `started_at` | datetime64[ns, UTC] | Start time of a task. Always use this field when the query is has any temporal reference related to the workflow execution, such as 'get the first 10 workflow executions' or 'the last workflow execution'. |
-| `ended_at` | datetime64[ns, UTC] | End time of a task. |
-| `subtype` | string | Subtype of a task. |
-| `tags` | List[str] | List of descriptive tags. |
-| `image` | blob | Raw binary data related to an image. |
-| `telemetry_summary.duration_sec` | float | Task duration (seconds). |
-| `telemetry_summary.cpu.percent_all_diff` | float | Difference in overall CPU utilization percentage across all cores between task end and start.|
-| `telemetry_summary.cpu.user_time_diff` | float | Difference average per core CPU user time ( seconds ) between task start and end times.|
-| `telemetry_summary.cpu.system_time_diff` | float | Difference in CPU system (kernel) time (seconds) used during the task execution.|
-| `telemetry_summary.cpu.idle_time_diff` | float | Difference in CPU idle time (seconds) during task end and start.|
----
-For any queries involving CPU, use fields that begin with telemetry_summary.cpu
+
+def generate_common_task_fields(current_fields):
+    # TODO: make this better
+    common_task_fields = """
+    | Column | Data Type | Description |
+    |-------------------------------|-------------|
     """
+    common_task_fields += (
+        "| `workflow_id` | string | Workflow the task belongs to. Use this field when the query is asking about workflow execution |\n"
+        if "workflow_id" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `task_id` | string | Task identifier. |\n" if "task_id" in current_fields else ""
+    )
+    common_task_fields += (
+        "| `parent_task_id` | string | A task may be directly linked to others. Use this field when the query asks for a task informed by (or associated with or linked to) other task. |\n"
+        if "parent_task_id" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `activity_id` | string | Type of task (e.g., 'choose_option'). Use this for \"task type\" queries. One activity_id is linked to multiple task_ids. |\n"
+        if "activity_id" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `campaign_id` | string | A group of workflows. |\n"
+        if "campaign_id" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `hostname` | string | Compute node name. |\n" if "hostname" in current_fields else ""
+    )
+    common_task_fields += (
+        "| `agent_id` | string | Set if executed by an agent. |\n"
+        if "agent_id" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `started_at` | datetime64[ns, UTC] | Start time of a task. Always use this field when the query has any temporal reference related to the workflow execution, such as 'get the first 10 workflow executions' or 'the last workflow execution'. |\n"
+        if "started_at" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `ended_at` | datetime64[ns, UTC] | End time of a task. |\n"
+        if "ended_at" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `subtype` | string | Subtype of a task. |\n" if "subtype" in current_fields else ""
+    )
+    common_task_fields += (
+        "| `tags` | List[str] | List of descriptive tags. |\n"
+        if "tags" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `image` | blob | Raw binary data related to an image. |\n"
+        if "image" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `telemetry_summary.duration_sec` | float | Task duration (seconds). |\n"
+        if "telemetry_summary.duration_sec" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `telemetry_summary.cpu.percent_all_diff` | float | Difference in overall CPU utilization percentage across all cores between task end and start. |\n"
+        if "telemetry_summary.cpu.percent_all_diff" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `telemetry_summary.cpu.user_time_diff` | float | Difference average per core CPU user time (seconds) between task start and end times. |\n"
+        if "telemetry_summary.cpu.user_time_diff" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `telemetry_summary.cpu.system_time_diff` | float | Difference in CPU system (kernel) time (seconds) used during the task execution. |\n"
+        if "telemetry_summary.cpu.system_time_diff" in current_fields
+        else ""
+    )
+    common_task_fields += (
+        "| `telemetry_summary.cpu.idle_time_diff` | float | Difference in CPU idle time (seconds) during task end and start. |\n"
+        if "telemetry_summary.cpu.idle_time_diff" in current_fields
+        else ""
+    )
+
+    common_task_fields += "\n For any queries involving CPU, use fields that begin with telemetry_summary.cpu"
+
+    return common_task_fields
+

 DF_FORM = "The user has a pandas DataFrame called `df`, created from flattened task objects using `pd.json_normalize`."

+CURRENT_DF_COLUMNS_PROMPT = """
+### ABSOLUTE FIELD CONSTRAINT -- THIS IS CRITICAL
+
+The following list is the ONLY valid field names in df. Treat this as the schema:
+
+ALLOWED_FIELDS = [COLS]
+
+You MUST treat this list as authoritative.
+
+- You may only use fields names that appear EXACTLY (string match) in ALLOWED_FIELDS.
+- You are NOT allowed to create new field names by:
+  - adding or removing prefixes like "used." or "generated."
+  - combining words
+  - guessing.
+- If a field name is not in ALLOWED_FIELDS, you MUST NOT use it.
+"""
+

 def get_example_values_prompt(example_values):
     values_prompt = f"""
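In effect, the static field table was replaced by a generator that emits a row only for columns present in the live DataFrame. A hedged usage sketch:

```python
# Only the rows whose columns exist in current_fields appear in the prompt.
print(generate_common_task_fields(["workflow_id", "started_at"]))
# -> table header, the workflow_id and started_at rows, then the CPU-fields note
```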
@@ -39,7 +124,7 @@ def get_example_values_prompt(example_values):
     return values_prompt


-def get_df_schema_prompt(dynamic_schema, example_values):
+def get_df_schema_prompt(dynamic_schema, example_values, current_fields):
     schema_prompt = f"""
 ## DATAFRAME STRUCTURE

@@ -53,14 +138,19 @@ def get_df_schema_prompt(dynamic_schema, example_values):
 The schema for these fields is defined in the dictionary below.
 It maps each activity ID to its inputs (i) and outputs (o), using flattened field names that include `used.` or `generated.` prefixes to indicate the role the field played in the task. These names match the columns in the dataframe `df`.

-```python
 {dynamic_schema}
-```
 Use this schema and fields to understand what inputs and outputs are valid for each activity.
+
+IMPORTANT: The user might say used for outputs or generated for inputs, which might confuse you. Do not get tricked by the user.
+Ignore the natural-language words "used" and "generated".
+- The English phrase "used in the calculation" does NOT mean you must use a `used.` column.
+- The English word "generated" in the question does NOT force you to use a `generated.` column either.
+
+ALWAYS CHECK THE ALLOWED_FIELDS list before proceeding. THIS IS CRITICAL.

 ### 2. Additional fields for tasks:

-{
+{generate_common_task_fields(current_fields)}
 ---
 """
@@ -70,12 +160,12 @@ def get_df_schema_prompt(dynamic_schema, example_values):
     return prompt


-def generate_plot_code_prompt(query, dynamic_schema, example_values) -> str:
+def generate_plot_code_prompt(query, dynamic_schema, example_values, current_fields) -> str:
     PLOT_PROMPT = f"""
 You are a Streamlit chart expert.
 {DF_FORM}

-{get_df_schema_prompt(dynamic_schema, example_values)}
+{get_df_schema_prompt(dynamic_schema, example_values, current_fields)}

 ### 3. Guidelines

@@ -121,10 +211,14 @@ def generate_plot_code_prompt(query, dynamic_schema, example_values) -> str:
     "plot_code": "import matplotlib.pyplot as plt\nplt.hist(result['n_controls'])\nst.pyplot(plt)"
 }}

+Your response must be only the raw Python code in the format:
+result = ...
+Except for the `result` variable, YOU MUST NEVER CREATE ANY OTHER VARIABLE. NEVER!
+
 User request:
 {query}

-
+

 """
     return PLOT_PROMPT
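Per the added instructions, the only output shape the plot prompt now accepts is a single `result` assignment. An illustrative example of compliant generated code (the column names are assumed):

```python
result = df[df['activity_id'] == 'simulate_layer'][['started_at', 'generated.scores']]
# defining any variable other than `result` violates the prompt contract
```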
@@ -139,7 +233,7 @@ QUERY_GUIDELINES = """

 - Use `df` as the base DataFrame.
 - Use `activity_id` to filter by task type (valid values = schema keys).
--
+- ONLY IF the ALLOWED_FIELDS list allow, use `used.` for parameters (inputs) and `generated.` for outputs (metrics).
 - Use `telemetry_summary.duration_sec` for performance-related questions.
 - Use `hostname` when user mentions *where* a task ran.
 - Use `agent_id` when the user refers to agents (non-null means task was agent-run).
@@ -153,7 +247,7 @@ QUERY_GUIDELINES = """
 **THE COLUMN 'used' DOES NOT EXIST**
 **THE COLUMN 'generated' DOES NOT EXIST**
 - **When filtering by `activity_id`, only select columns that belong to that activity’s schema.**
--
+- Always observing the ALLOWED_FIELDS list, use only `used.` and `generated.` fields listed in the schema for that `activity_id`.
 - Explicitly list the selected columns — **never return all columns**
 - **Only include telemetry columns if used in the query logic.**
 -THERE IS NOT A FIELD NAMED `telemetry_summary.start_time` or `telemetry_summary.end_time` or `used.start_time` or `used.end_time`. Use `started_at` and `ended_at` instead when you want to find the duration of a task, activity, or workflow execution.
@@ -187,6 +281,17 @@ QUERY_GUIDELINES = """
 -**Do NOT use any of those: df[df['started_at'].idxmax()], df[df['started_at'].idxmin()], df[df['ended_at'].idxmin()], df[df['ended_at'].idxmax()]. Those are not valid Pandas Code.**
 - When the query mentions "each task", or "each activity", or "each workflow", make sure you show (project) the correct id column in the results (i.e., respectively: `task_id`, `activity_id`, `workflow_id`) to identify those in the results.
 - Use df[<role>.field_name] == True or df[<role>.field_name] == False when user queries boolean fields, where <role> is either used or generated, depending on the field name. Make sure field_name is a valid field in the DataFrame.
+
+If the query asks you to report which values appear in one or more columns
+(for example “which X were used”, “list all Y”, “what X and Y were generated”), then:
+
+For each relevant column, select that column from df.
+Call .dropna() on that column to remove missing values.
+After dropping NaNs, apply .unique(), .value_counts(), or any other aggregation as needed.
+Select that column.
+Call .dropna() on it.
+Then call .unique(), .value_counts(), or any other aggregation.
+

 - **Do not include metadata columns unless explicitly required by the user query.**
 """
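The new guideline describes a dropna-then-aggregate pattern; an illustrative instance (the column name is hypothetical):

```python
result = df['used.solvent'].dropna().unique()  # drop NaNs first, then aggregate
```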
@@ -200,15 +305,16 @@ FEW_SHOTS = """
 # Q: How many tasks for each activity?
 result = df['activity_id'].value_counts()

-# Q: What is the average loss across all tasks?
-result = df['generated.loss'].mean()
-
-# Q: select the 'choose_option' tasks executed by the agent, and show the planned controls, generated option, scores, explanations
-result = df[(df['activity_id'] == 'choose_option') & (df['agent_id'].notna())][['used.planned_controls', 'generated.option', 'used.scores.scores', 'generated.explanation']].copy()
-
-# Q: Show duration and generated scores for 'simulate_layer' tasks
-result = df[df['activity_id'] == 'simulate_layer'][['telemetry_summary.duration_sec', 'generated.scores']]
 """
+# # Q: What is the average loss across all tasks?
+# result = df['generated.loss'].mean()
+#
+# # Q: select the 'choose_option' tasks executed by the agent, and show the planned controls, generated option, scores, explanations
+# result = df[(df['activity_id'] == 'choose_option') & (df['agent_id'].notna())][
+#     ['used.planned_controls', 'generated.option', 'used.scores.scores', 'generated.explanation']].copy()
+#
+# # Q: Show duration and generated scores for 'simulate_layer' tasks
+# result = df[df['activity_id'] == 'simulate_layer'][['telemetry_summary.duration_sec', 'generated.scores']]

 OUTPUT_FORMATTING = """
 6. Final Instructions
@@ -226,7 +332,7 @@ OUTPUT_FORMATTING = """
 """


-def generate_pandas_code_prompt(query: str, dynamic_schema, example_values, custom_user_guidances):
+def generate_pandas_code_prompt(query: str, dynamic_schema, example_values, custom_user_guidances, current_fields):
     if custom_user_guidances is not None and isinstance(custom_user_guidances, list) and len(custom_user_guidances):
         concatenated_guidance = "\n".join(f"- {msg}" for msg in custom_user_guidances)
         custom_user_guidance_prompt = (
@@ -236,11 +342,14 @@ def generate_pandas_code_prompt(query: str, dynamic_schema, example_values, cust
         )
     else:
         custom_user_guidance_prompt = ""
+
+    curr_cols = CURRENT_DF_COLUMNS_PROMPT.replace("[COLS]", str(current_fields))
     prompt = (
         f"{ROLE}"
         f"{JOB}"
         f"{DF_FORM}"
-        f"{
+        f"{curr_cols}"
+        f"{get_df_schema_prompt(dynamic_schema, example_values, current_fields)}"  # main tester
         f"{QUERY_GUIDELINES}"  # main tester
         f"{FEW_SHOTS}"  # main tester
         f"{custom_user_guidance_prompt}"
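The `[COLS]` placeholder in `CURRENT_DF_COLUMNS_PROMPT` is filled with the stringified column list before the prompt pieces are concatenated. A minimal sketch with assumed columns:

```python
cols = ["task_id", "activity_id", "generated.loss"]
curr_cols = CURRENT_DF_COLUMNS_PROMPT.replace("[COLS]", str(cols))
# curr_cols now pins ALLOWED_FIELDS = ['task_id', 'activity_id', 'generated.loss']
```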
@@ -251,7 +360,7 @@ def generate_pandas_code_prompt(query: str, dynamic_schema, example_values, cust
     return prompt


-def dataframe_summarizer_context(code, reduced_df, dynamic_schema, example_values, query) -> str:
+def dataframe_summarizer_context(code, reduced_df, dynamic_schema, example_values, query, current_fields) -> str:
     job = "You are a Workflow Provenance Specialist analyzing a DataFrame that was obtained to answer a query."

     if "image" in reduced_df.columns:
@@ -272,7 +381,7 @@ def dataframe_summarizer_context(code, reduced_df, dynamic_schema, example_value
 {reduced_df}

 **Original df (before reduction) had this schema:
-{get_df_schema_prompt(dynamic_schema, example_values)}
+{get_df_schema_prompt(dynamic_schema, example_values, current_fields)}

 Your task is to find a concise and direct answer as an English sentence to the user query.

@@ -310,7 +419,7 @@ def extract_or_fix_json_code_prompt(raw_text) -> str:
     return prompt


-def extract_or_fix_python_code_prompt(raw_text):
+def extract_or_fix_python_code_prompt(raw_text, current_fields):
     prompt = f"""
 You are a Pandas DataFrame code extractor and fixer. Pandas is a well-known data science Python library for querying datasets.
 You are given a raw user message that may include explanations, markdown fences, or partial DataFrame code that queries a DataFrame `df`.
@@ -319,9 +428,13 @@ def extract_or_fix_python_code_prompt(raw_text):
 1. Check if the message contains a valid DataFrame code.
 2. If it does, extract the code.
 3. If there are any syntax errors, fix them.
-4.
+4. Carefully analyze the list of columns in the query. The query must only use fields in this list:
+ALLOWED_FIELDS = {current_fields}.
+If there are fields not in this list, replace the fields to match according to the ALLOWED_FIELDS list.
+5. Return only the corrected DataFrame query code — no explanations, no comments, no markdown.

 The output must be valid Python code, and must not include any other text.
+Your output can only contain fields in the ALLOWED_FIELDS list.
 This output will be parsed by another program.

 ONCE AGAIN, ONLY PRODUCE THE PYTHON CODE. DO NOT SAY ANYTHING ELSE!
|