npm - windmill-components - Versions diffs - 1.677.0 → 1.687.0 - Mend

windmill-components 1.677.0 → 1.687.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (358) hide show

package/package/components/copilot/chat/__tests__/flow/expected/test7_modify_complex.json DELETED Viewed

@@ -1,136 +0,0 @@
-{
-    "summary": "Data enrichment flow with parallel processing",
-    "value": {
-        "modules": [
-            {
-                "id": "get_item",
-                "summary": "Get item from input",
-                "value": {
-                    "type": "rawscript",
-                    "language": "bun",
-                    "content": "export async function main(item_id: string) {\n  // Mock item lookup\n  return {\n    id: item_id,\n    name: \"Product \" + item_id,\n    sku: \"SKU-\" + item_id\n  };\n}",
-                    "input_transforms": {
-                        "item_id": {
-                            "type": "javascript",
-                            "expr": "flow_input.item_id"
-                        }
-                    }
-                }
-            },
-            {
-                "id": "parallel_enrichment",
-                "summary": "Enrich data in parallel",
-                "value": {
-                    "type": "branchall",
-                    "branches": [
-                        {
-                            "summary": "Price enrichment",
-                            "modules": [
-                                {
-                                    "id": "enrich_price",
-                                    "summary": "Call pricing API",
-                                    "value": {
-                                        "type": "rawscript",
-                                        "language": "bun",
-                                        "content": "export async function main(item: any) {\n  // Mock pricing API call with timeout handling\n  try {\n    return {\n      itemId: item.id,\n      price: 99.99,\n      currency: \"USD\",\n      discount: 10\n    };\n  } catch (e) {\n    return { itemId: item.id, price: 0, currency: \"USD\", fallback: true };\n  }\n}",
-                                        "input_transforms": {
-                                            "item": {
-                                                "type": "javascript",
-                                                "expr": "results.get_item"
-                                            }
-                                        }
-                                    }
-                                }
-                            ]
-                        },
-                        {
-                            "summary": "Inventory enrichment",
-                            "modules": [
-                                {
-                                    "id": "enrich_inventory",
-                                    "summary": "Call inventory API",
-                                    "value": {
-                                        "type": "rawscript",
-                                        "language": "bun",
-                                        "content": "export async function main(item: any) {\n  // Mock inventory API call with timeout handling\n  try {\n    return {\n      itemId: item.id,\n      inStock: true,\n      quantity: 150,\n      warehouse: \"WH-001\"\n    };\n  } catch (e) {\n    return { itemId: item.id, inStock: false, quantity: 0, fallback: true };\n  }\n}",
-                                        "input_transforms": {
-                                            "item": {
-                                                "type": "javascript",
-                                                "expr": "results.get_item"
-                                            }
-                                        }
-                                    }
-                                }
-                            ]
-                        },
-                        {
-                            "summary": "Reviews enrichment",
-                            "modules": [
-                                {
-                                    "id": "enrich_reviews",
-                                    "summary": "Call reviews API",
-                                    "value": {
-                                        "type": "rawscript",
-                                        "language": "bun",
-                                        "content": "export async function main(item: any) {\n  // Mock reviews API call with timeout handling\n  try {\n    return {\n      itemId: item.id,\n      averageRating: 4.5,\n      reviewCount: 127,\n      topReview: \"Great product!\"\n    };\n  } catch (e) {\n    return { itemId: item.id, averageRating: 0, reviewCount: 0, fallback: true };\n  }\n}",
-                                        "input_transforms": {
-                                            "item": {
-                                                "type": "javascript",
-                                                "expr": "results.get_item"
-                                            }
-                                        }
-                                    }
-                                }
-                            ]
-                        }
-                    ]
-                }
-            },
-            {
-                "id": "combine_data",
-                "summary": "Combine all enrichment data",
-                "value": {
-                    "type": "rawscript",
-                    "language": "bun",
-                    "content": "export async function main(item: any, parallel_results: any) {\n  // Extract results from parallel branches\n  const [priceResult, inventoryResult, reviewsResult] = parallel_results;\n  return {\n    ...item,\n    pricing: priceResult,\n    inventory: inventoryResult,\n    reviews: reviewsResult,\n    hasFallbacks: priceResult?.fallback || inventoryResult?.fallback || reviewsResult?.fallback\n  };\n}",
-                    "input_transforms": {
-                        "item": {
-                            "type": "javascript",
-                            "expr": "results.get_item"
-                        },
-                        "parallel_results": {
-                            "type": "javascript",
-                            "expr": "results.parallel_enrichment"
-                        }
-                    }
-                }
-            },
-            {
-                "id": "return_result",
-                "summary": "Return final result",
-                "value": {
-                    "type": "rawscript",
-                    "language": "bun",
-                    "content": "export async function main(enriched_item: any) {\n  return {\n    success: true,\n    data: enriched_item,\n    enrichedAt: new Date().toISOString()\n  };\n}",
-                    "input_transforms": {
-                        "enriched_item": {
-                            "type": "javascript",
-                            "expr": "results.combine_data"
-                        }
-                    }
-                }
-            }
-        ]
-    },
-    "schema": {
-        "$schema": "https://json-schema.org/draft/2020-12/schema",
-        "properties": {
-            "item_id": {
-                "type": "string",
-                "description": "The ID of the item to enrich"
-            }
-        },
-        "required": ["item_id"],
-        "type": "object"
-    }
-}

package/package/components/copilot/chat/__tests__/flow/flowChat.eval.test.js DELETED Viewed

@@ -1,294 +0,0 @@
-import { describe, it, expect } from 'vitest';
-import { runVariantComparison, writeFlowComparisonResults } from './flowEvalRunner';
-import { BASELINE_VARIANT, MINIMAL_SINGLE_TOOL_VARIANT } from './variants';
-// @ts-ignore - JSON import
-import expectedTest1 from './expected/test1.json';
-// @ts-ignore - JSON import
-import expectedTest2 from './expected/test2.json';
-// @ts-ignore - JSON import
-import expectedTest3 from './expected/test3.json';
-// @ts-ignore - JSON import
-import expectedTest4 from './expected/test4.json';
-// @ts-ignore - JSON import
-import expectedTest5 from './expected/test5_modify_simple.json';
-// @ts-ignore - JSON import
-import expectedTest6 from './expected/test6_modify_medium.json';
-// @ts-ignore - JSON import
-import expectedTest7 from './expected/test7_modify_complex.json';
-// @ts-ignore - JSON import
-import initialTest5 from './initial/test5_initial.json';
-// @ts-ignore - JSON import
-import initialTest6 from './initial/test6_initial.json';
-// @ts-ignore - JSON import
-import initialTest7 from './initial/test7_initial.json';
-// Get API keys from environment - tests will be skipped if none are set
-// @ts-ignore
-const OPENAI_API_KEY = process.env.OPENAI_API_KEY;
-// @ts-ignore
-const ANTHROPIC_API_KEY = process.env.ANTHROPIC_API_KEY;
-const hasAnyKey = OPENAI_API_KEY || ANTHROPIC_API_KEY;
-const describeWithApiKey = hasAnyKey ? describe : describe.skip;
-const MODEL_VARIANTS = [
-    ...(OPENAI_API_KEY
-        ? [{ model: 'gpt-4o', provider: 'openai', apiKey: OPENAI_API_KEY }]
-        : []),
-    ...(ANTHROPIC_API_KEY
-        ? [
-            {
-                model: 'claude-haiku-4-5-20241022',
-                provider: 'anthropic',
-                apiKey: ANTHROPIC_API_KEY
-            }
-        ]
-        : [])
-];
-const VARIANTS = [
-    ...MODEL_VARIANTS.map((mv) => ({
-        ...BASELINE_VARIANT,
-        model: mv.model,
-        name: `baseline-${mv.provider}-${mv.model}`,
-        _provider: mv.provider,
-        _apiKey: mv.apiKey
-    })),
-    ...MODEL_VARIANTS.map((mv) => ({
-        ...MINIMAL_SINGLE_TOOL_VARIANT,
-        model: mv.model,
-        name: `minimal-single-tool-${mv.provider}-${mv.model}`,
-        _provider: mv.provider,
-        _apiKey: mv.apiKey
-    }))
-];
-describeWithApiKey('Flow Chat LLM Evaluation', () => {
-    const TEST_TIMEOUT = 120_000;
-    if (!hasAnyKey) {
-        console.warn('No API keys set (OPENAI_API_KEY or ANTHROPIC_API_KEY), skipping tests');
-    }
-    it('test1: user role-based actions with loop and branches', async () => {
-        const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-STEP 1: Fetch mock users from api
-STEP 2: Filter only active users:
-STEP 3: Loop on all users
-STEP 4: Do branches based on user's role, do different action based on that. Roles are admin, user, moderator
-STEP 5: Return action taken for each user
-`;
-        const results = await runVariantComparison(USER_PROMPT, VARIANTS, VARIANTS[0]._apiKey, {
-            expectedFlow: expectedTest1
-        }, VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })));
-        // Write results to files
-        const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results);
-        console.log(`\nResults written to: ${summaryPath}`);
-        console.log(`Flow files: ${flowPaths.join(', ')}`);
-        // Assert all variants succeeded
-        for (const result of results) {
-            expect(true).toBe(true);
-            // Log evaluation results
-            if (result.evaluationResult) {
-                console.log(`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`);
-                console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`);
-                if (result.evaluationResult.missingRequirements &&
-                    result.evaluationResult.missingRequirements.length > 0) {
-                    console.log(`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`);
-                }
-            }
-        }
-    }, TEST_TIMEOUT);
-    it('test2: e-commerce order processing with inventory check and branching', async () => {
-        const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-STEP 1: Receive order data from input (order has items array with name/price/quantity, customer_email, shipping_address)
-STEP 2: Validate order - check all items have valid price > 0 and quantity > 0, return validation result
-STEP 3: Calculate order total with 8% tax rate
-STEP 4: Check inventory for each item (loop through items, return mock availability)
-STEP 5: Branch based on inventory - if all items available, create shipment record; otherwise create backorder record
-STEP 6: Send confirmation (mock email to customer_email)
-STEP 7: Return final order summary with status
-`;
-        const results = await runVariantComparison(USER_PROMPT, VARIANTS, VARIANTS[0]._apiKey, {
-            expectedFlow: expectedTest2
-        }, VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })));
-        const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results);
-        console.log(`\nResults written to: ${summaryPath}`);
-        console.log(`Flow files: ${flowPaths.join(', ')}`);
-        for (const result of results) {
-            expect(true).toBe(true);
-            if (result.evaluationResult) {
-                console.log(`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`);
-                console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`);
-                if (result.evaluationResult.missingRequirements &&
-                    result.evaluationResult.missingRequirements.length > 0) {
-                    console.log(`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`);
-                }
-            }
-        }
-    }, TEST_TIMEOUT);
-    it('test3: data pipeline with parallel processing and quality-based routing', async () => {
-        const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-STEP 1: Fetch list of data sources from configuration (return mock array of 3 source objects with id and url)
-STEP 2: For each data source in parallel:
-  - Fetch raw data from the source (mock fetch returning sample records)
-  - Transform/clean the data (filter out invalid entries)
-  - Validate the transformed data (return validation score 0-100)
-STEP 3: Aggregate all validated data into single dataset with combined records
-STEP 4: Calculate overall data quality score (average of all validation scores)
-STEP 5: Branch based on quality score:
-  - If score >= 90: Store in primary database and return success
-  - If score >= 70 and < 90: Store in secondary database with warning flag
-  - If score < 70: Store in quarantine and send alert
-STEP 6: Return processing report with statistics (total records, quality score, destination)
-`;
-        const results = await runVariantComparison(USER_PROMPT, VARIANTS, VARIANTS[0]._apiKey, {
-            expectedFlow: expectedTest3
-        }, VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })));
-        const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results);
-        console.log(`\nResults written to: ${summaryPath}`);
-        console.log(`Flow files: ${flowPaths.join(', ')}`);
-        for (const result of results) {
-            expect(true).toBe(true);
-            if (result.evaluationResult) {
-                console.log(`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`);
-                console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`);
-                if (result.evaluationResult.missingRequirements &&
-                    result.evaluationResult.missingRequirements.length > 0) {
-                    console.log(`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`);
-                }
-            }
-        }
-    }, TEST_TIMEOUT);
-    it('test4: AI agent with tools for customer support', async () => {
-        const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-Create a customer support flow with an AI agent:
-STEP 1: Receive customer query from input (customer_id string, query_text string)
-STEP 2: Fetch customer profile and order history (mock data based on customer_id)
-STEP 3: Use an AI agent to handle the customer query. The agent should have access to these tools:
-  - lookup_order: Takes order_id, returns order details (mock data)
-  - check_refund_eligibility: Takes order_id, returns eligibility status and reason
-  - create_support_ticket: Takes description and priority (low/medium/high), returns ticket_id
-  - search_faq: Takes search_query, returns relevant FAQ answers
-  The agent should use the customer profile context and respond helpfully.
-STEP 4: Log the interaction to audit trail (customer_id, query, response summary)
-STEP 5: Return the agent's response and any actions taken
-`;
-        const results = await runVariantComparison(USER_PROMPT, VARIANTS, VARIANTS[0]._apiKey, {
-            expectedFlow: expectedTest4
-        }, VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })));
-        const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results);
-        console.log(`\nResults written to: ${summaryPath}`);
-        console.log(`Flow files: ${flowPaths.join(', ')}`);
-        for (const result of results) {
-            expect(true).toBe(true);
-            if (result.evaluationResult) {
-                console.log(`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`);
-                console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`);
-                if (result.evaluationResult.missingRequirements &&
-                    result.evaluationResult.missingRequirements.length > 0) {
-                    console.log(`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`);
-                }
-            }
-        }
-    }, TEST_TIMEOUT);
-    // ==================== MODIFICATION TESTS ====================
-    // These tests evaluate the LLM's ability to modify existing flows
-    it('test5: simple modification - add validation step to existing flow', async () => {
-        const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-Modify this existing flow to add error handling:
-- Add a new step after process_data called "validate_data" to validate the processed data
-- The validation step should check if the data array is not empty
-- If validation fails (empty array), it should return an error object with message "No data to save"
-- If validation passes, return the data for the next step
-- Update save_results to handle the validation result appropriately
-`;
-        const results = await runVariantComparison(USER_PROMPT, VARIANTS, VARIANTS[0]._apiKey, {
-            initialModules: initialTest5.value.modules,
-            initialSchema: initialTest5.schema,
-            expectedFlow: expectedTest5
-        }, VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })));
-        const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results);
-        console.log(`\nResults written to: ${summaryPath}`);
-        console.log(`Flow files: ${flowPaths.join(', ')}`);
-        for (const result of results) {
-            expect(true).toBe(true);
-            if (result.evaluationResult) {
-                console.log(`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`);
-                console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`);
-                if (result.evaluationResult.missingRequirements &&
-                    result.evaluationResult.missingRequirements.length > 0) {
-                    console.log(`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`);
-                }
-            }
-        }
-    }, TEST_TIMEOUT);
-    it('test6: medium modification - add branching inside existing loop', async () => {
-        const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-Modify the order processing loop to handle different order types:
-- Inside the loop_orders, replace the simple process_order step with branching based on order.type
-- For type "express": add a step called handle_express that marks as priority and calculates express shipping cost ($15.99)
-- For type "standard": add a step called handle_standard that calculates standard shipping cost ($5.99)
-- For type "pickup": add a step called handle_pickup that marks as no shipping required (cost $0)
-- Move the original process_order step to the default branch for unknown order types
-- Each branch step should return the orderId, shipping cost, and shipping type
-`;
-        const results = await runVariantComparison(USER_PROMPT, VARIANTS, VARIANTS[0]._apiKey, {
-            initialModules: initialTest6.value.modules,
-            initialSchema: initialTest6.schema,
-            expectedFlow: expectedTest6
-        }, VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })));
-        const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results);
-        console.log(`\nResults written to: ${summaryPath}`);
-        console.log(`Flow files: ${flowPaths.join(', ')}`);
-        for (const result of results) {
-            expect(true).toBe(true);
-            if (result.evaluationResult) {
-                console.log(`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`);
-                console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`);
-                if (result.evaluationResult.missingRequirements &&
-                    result.evaluationResult.missingRequirements.length > 0) {
-                    console.log(`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`);
-                }
-            }
-        }
-    }, TEST_TIMEOUT);
-    it('test7: complex modification - refactor sequential to parallel execution', async () => {
-        const USER_PROMPT = `
-THIS IS A TEST, CODE SHOULD BE MINIMAL FUNCTIONING CODE, IF WE NEED RETURN VALUES RETURN EXAMPLE VALUES
-Refactor this flow for better performance by parallelizing the enrichment steps:
-- The three enrichment steps (enrich_price, enrich_inventory, enrich_reviews) currently run sequentially
-- Wrap them in a parallel branch (branchall) called "parallel_enrichment" so they run concurrently
-- Each enrichment step should include basic error handling with try/catch that returns a fallback value if it fails
-- Update the combine_data step to receive results from the parallel branch (results.parallel_enrichment returns an array of branch results)
-- The combine_data step should check if any enrichment used a fallback value and set a hasFallbacks flag
-- Keep get_item as the first step and return_result as the last step unchanged
-`;
-        const results = await runVariantComparison(USER_PROMPT, VARIANTS, VARIANTS[0]._apiKey, {
-            initialModules: initialTest7.value.modules,
-            initialSchema: initialTest7.schema,
-            expectedFlow: expectedTest7
-        }, VARIANTS.map((v) => ({ provider: v._provider, apiKey: v._apiKey })));
-        const { summaryPath, flowPaths } = await writeFlowComparisonResults(USER_PROMPT, results);
-        console.log(`\nResults written to: ${summaryPath}`);
-        console.log(`Flow files: ${flowPaths.join(', ')}`);
-        for (const result of results) {
-            expect(true).toBe(true);
-            if (result.evaluationResult) {
-                console.log(`[${result.variantName}] Resemblance Score: ${result.evaluationResult.resemblanceScore}/100`);
-                console.log(`[${result.variantName}] Statement: ${result.evaluationResult.statement}`);
-                if (result.evaluationResult.missingRequirements &&
-                    result.evaluationResult.missingRequirements.length > 0) {
-                    console.log(`[${result.variantName}] Missing: ${result.evaluationResult.missingRequirements.join(', ')}`);
-                }
-            }
-        }
-    }, TEST_TIMEOUT);
-});

package/package/components/copilot/chat/__tests__/flow/flowEvalComparison.d.ts DELETED Viewed

@@ -1,17 +0,0 @@
-import type { FlowModule } from '../../../../../gen';
-import type { EvaluationResult } from '../shared';
-/**
- * Expected flow structure for evaluation.
- */
-export interface ExpectedFlow {
-    summary?: string;
-    value: {
-        modules: FlowModule[];
-    };
-    schema?: Record<string, any>;
-}
-/**
- * Evaluates how well a generated flow matches an expected flow and user request using an LLM.
- * Returns a resemblance score (0-100), a qualitative statement, and any missing requirements.
- */
-export declare function evaluateFlowComparison(generatedFlow: ExpectedFlow, expectedFlow: ExpectedFlow, userPrompt: string): Promise<EvaluationResult>;

package/package/components/copilot/chat/__tests__/flow/flowEvalComparison.js DELETED Viewed

@@ -1,49 +0,0 @@
-import { evaluateWithLLM, BASE_EVALUATOR_RESPONSE_FORMAT } from '../shared';
-/**
- * Flow-specific evaluator system prompt.
- */
-const FLOW_EVALUATOR_SYSTEM_PROMPT = `You are an expert evaluator for Windmill flow definitions. Your task is to evaluate a generated flow against:
-1. The original user request/prompt
-2. An expected reference flow
-## Windmill Flow Context
-- Flows consist of modules (steps) that execute sequentially
-- Module types include: rawscript, forloopflow, branchone, branchall, script, flow, aiagent
-- Each module has an id, value (containing type and config), and may have input_transforms
-- input_transforms connect modules using expressions like "results.previous_step". Valid input_transforms are: static, javascript. Valid variables in javascript expressions are: results, flow_input, flow_input.iter.value (for forloopflow), flow_input.iter.index (for forloopflow).
-- forloopflow contains nested modules that execute per iteration with access to flow_input.iter.value
-- branchone executes first matching branch, branchall executes all matching branches
-- Branches have conditional expressions (expr) that determine execution
-- aiagent modules contain tools array with tool definitions
-## Evaluation Criteria
-1. **User Request Fulfillment**: Does the generated flow address ALL requirements from the user's original prompt?
-   - Are all requested steps present?
-   - Are the requested features implemented (loops, branches, specific logic)?
-   - Does the schema match what the user requested for inputs?
-2. **Structure**: Are the module types and nesting structure appropriate for the task?
-3. **Logic**: Does the flow accomplish the intended logical task?
-4. **Connections**: Are input_transforms connecting data correctly between steps?
-5. **Completeness**: Are all required steps present with no major omissions?
-6. **Code Quality**: Is the code functionally correct (exact syntax doesn't need to match)?
-## Important Notes
-- Minor differences in variable names, code formatting, or exact wording are acceptable
-- Focus on functional equivalence, not character-by-character matching
-- The generated flow should achieve the same outcome as described in the user request
-- Extra helper steps or slightly different approaches can still score high if they accomplish the goal
-- If the user requested specific module types (like aiagent), verify they are used correctly
-${BASE_EVALUATOR_RESPONSE_FORMAT}`;
-/**
- * Evaluates how well a generated flow matches an expected flow and user request using an LLM.
- * Returns a resemblance score (0-100), a qualitative statement, and any missing requirements.
- */
-export async function evaluateFlowComparison(generatedFlow, expectedFlow, userPrompt) {
-    return evaluateWithLLM({
-        userPrompt,
-        generatedOutput: generatedFlow,
-        expectedOutput: expectedFlow,
-        evaluatorSystemPrompt: FLOW_EVALUATOR_SYSTEM_PROMPT
-    });
-}

package/package/components/copilot/chat/__tests__/flow/flowEvalHelpers.d.ts DELETED Viewed

@@ -1,12 +0,0 @@
-import type { FlowAIChatHelpers } from '../../flow/core';
-import type { FlowModule } from '../../../../../gen';
-import type { ExtendedOpenFlow } from '../../../../flows/types';
-/**
- * Creates mock FlowAIChatHelpers for eval testing.
- * Tracks flow state in memory and allows tool functions to modify it.
- */
-export declare function createFlowEvalHelpers(initialModules?: FlowModule[], initialSchema?: Record<string, any>): {
-    helpers: FlowAIChatHelpers;
-    getFlow: () => ExtendedOpenFlow;
-    getModules: () => FlowModule[];
-};

package/package/components/copilot/chat/__tests__/flow/flowEvalHelpers.js DELETED Viewed

@@ -1,79 +0,0 @@
-import { findModuleById } from '../../shared';
-import { inlineScriptStore, restoreInlineScriptReferences } from '../../flow/inlineScriptsUtils';
-/**
- * Creates mock FlowAIChatHelpers for eval testing.
- * Tracks flow state in memory and allows tool functions to modify it.
- */
-export function createFlowEvalHelpers(initialModules = [], initialSchema) {
-    let flow = {
-        value: { modules: structuredClone(initialModules) },
-        summary: '',
-        schema: initialSchema ?? {
-            $schema: 'https://json-schema.org/draft/2020-12/schema',
-            properties: {},
-            required: [],
-            type: 'object'
-        }
-    };
-    const helpers = {
-        getFlowAndSelectedId: () => ({ flow, selectedId: '' }),
-        getModules: (id) => {
-            if (!id)
-                return flow.value.modules;
-            const module = findModuleById(flow.value.modules, id);
-            return module ? [module] : [];
-        },
-        setSnapshot: () => {
-            // No-op for eval - we don't need snapshot tracking
-        },
-        revertToSnapshot: () => {
-            // No-op for eval
-        },
-        setCode: async (id, code) => {
-            const module = findModuleById(flow.value.modules, id);
-            if (module && module.value.type === 'rawscript') {
-                module.value.content = code;
-            }
-            // Keep store coherent for subsequent set_flow_json calls with references
-            inlineScriptStore.set(id, code);
-        },
-        setFlowJson: async (modules, schema) => {
-            if (modules) {
-                // Restore inline script references back to full content
-                const restoredModules = restoreInlineScriptReferences(modules);
-                flow.value.modules = restoredModules;
-            }
-            // Update schema if provided
-            if (schema !== undefined) {
-                flow.schema = schema;
-            }
-        },
-        getFlowInputsSchema: async () => flow.schema ?? {},
-        updateExprsToSet: (_id, _inputTransforms) => {
-            // No-op for eval - UI-only functionality
-        },
-        acceptAllModuleActions: () => {
-            // No-op for eval
-        },
-        rejectAllModuleActions: () => {
-            // No-op for eval
-        },
-        hasPendingChanges: () => false,
-        selectStep: (_id) => {
-            // No-op for eval
-        },
-        testFlow: async () => {
-            // Return mock job ID - we don't actually run flows in eval
-            return 'mock-job-id-' + Date.now();
-        },
-        getLintErrors: async () => {
-            // Return empty lint result for eval
-            return { errorCount: 0, warningCount: 0, errors: [], warnings: [] };
-        }
-    };
-    return {
-        helpers,
-        getFlow: () => flow,
-        getModules: () => flow.value.modules
-    };
-}

package/package/components/copilot/chat/__tests__/flow/flowEvalRunner.d.ts DELETED Viewed

@@ -1,50 +0,0 @@
-import type { FlowModule } from '../../../../../gen';
-import type { AIProvider } from '../../../../../gen/types.gen';
-import type { ExtendedOpenFlow } from '../../../../flows/types';
-import { type ExpectedFlow } from './flowEvalComparison';
-import { type VariantConfig, type BaseEvalResult } from '../shared';
-export type { ExpectedFlow } from './flowEvalComparison';
-/**
- * Flow-specific evaluation result.
- */
-export interface FlowEvalResult extends BaseEvalResult<ExtendedOpenFlow> {
-    /** Alias for output to maintain API compatibility */
-    flow: ExtendedOpenFlow;
-}
-/**
- * Options for running a flow evaluation.
- */
-export interface FlowEvalOptions {
-    initialModules?: FlowModule[];
-    initialSchema?: Record<string, any>;
-    model?: string;
-    customSystemPrompt?: string;
-    maxIterations?: number;
-    variant?: VariantConfig;
-    expectedFlow?: ExpectedFlow;
-    /** AI provider (inferred from model name if omitted) */
-    provider?: AIProvider;
-}
-/**
- * Runs a flow chat evaluation using the shared chat loop (same code path as production).
- */
-export declare function runFlowEval(userPrompt: string, apiKey: string, options?: FlowEvalOptions): Promise<FlowEvalResult>;
-/**
- * Per-variant provider override.
- */
-export interface VariantProviderOverride {
-    provider: AIProvider;
-    apiKey: string;
-}
-/**
- * Runs the same prompt against multiple variants sequentially for comparison.
- * Accepts optional per-variant provider/apiKey overrides.
- */
-export declare function runVariantComparison(userPrompt: string, variants: VariantConfig[], defaultApiKey: string, baseOptions?: Omit<FlowEvalOptions, 'variant'>, providerOverrides?: VariantProviderOverride[]): Promise<FlowEvalResult[]>;
-/**
- * Writes flow comparison results to files.
- */
-export declare function writeFlowComparisonResults(userPrompt: string, results: FlowEvalResult[], outputDir?: string): Promise<{
-    summaryPath: string;
-    flowPaths: string[];
-}>;