safestar 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,48 @@
1
/**
 * Evaluate a batch of runs against a scenario's checks and an optional baseline.
 *
 * @param {object} scenario - Scenario definition. `name` is copied into the
 *   report; `checks` (`max_length`, `must_contain`, `must_not_contain`) is optional.
 * @param {Array<{length: number, output: string}>} currentRuns - Runs to evaluate.
 * @param {Array<{length: number}>|null|undefined} baselineRuns - Baseline runs;
 *   a falsy value skips the baseline comparison.
 * @returns {{scenario: string, status: string, length: {baseline: number, current: number, deltaPercent: number}, variance: {score: number}, violations: Array<{check: string, count: number}>}}
 *   `status` is 'FAIL' when any check was violated, otherwise 'WARN' when the
 *   average length drifted more than 50% from the baseline, otherwise 'PASS'.
 */
export function evaluate(scenario, currentRuns, baselineRuns) {
  // Mean that yields 0 for an empty list instead of NaN (0 / 0).
  const mean = (values) =>
    values.length === 0 ? 0 : values.reduce((sum, value) => sum + value, 0) / values.length;

  const report = {
    scenario: scenario.name,
    status: 'PASS',
    length: { baseline: 0, current: 0, deltaPercent: 0 },
    variance: { score: 0 },
    violations: [],
  };

  const currentLengths = currentRuns.map((run) => run.length);
  const avgCurrent = mean(currentLengths);
  // Population standard deviation of the run lengths; it is reported under
  // `variance.score` to keep the report shape unchanged.
  const stdDev = Math.sqrt(mean(currentLengths.map((len) => (len - avgCurrent) ** 2)));
  report.length.current = Math.round(avgCurrent);
  report.variance.score = Number.parseFloat(stdDev.toFixed(2));

  if (baselineRuns) {
    const avgBase = mean(baselineRuns.map((run) => run.length));
    report.length.baseline = Math.round(avgBase);
    // A zero baseline has no meaningful relative drift; deltaPercent stays 0.
    if (avgBase > 0) {
      report.length.deltaPercent = Math.round(((avgCurrent - avgBase) / avgBase) * 100);
    }
    if (Math.abs(report.length.deltaPercent) > 50) {
      report.status = 'WARN';
    }
  }

  const checks = scenario.checks ?? {};
  const maxLength = checks.max_length;
  // Test the type rather than truthiness so that `max_length: 0` is enforced.
  const hasMaxLength = typeof maxLength === 'number';
  const mustContain = checks.must_contain ?? [];
  const mustNotContain = checks.must_not_contain ?? [];

  // Map keeps first-seen order, which is the order violations are reported in.
  const counts = new Map();
  const bump = (check) => counts.set(check, (counts.get(check) ?? 0) + 1);

  for (const run of currentRuns) {
    if (hasMaxLength && run.length > maxLength) {
      bump('max_length');
    }
    // Lower-case once per run; a missing output is treated as empty text.
    const haystack = String(run.output ?? '').toLowerCase();
    for (const word of mustContain) {
      if (!haystack.includes(word.toLowerCase())) {
        bump(`must_contain: "${word}"`);
      }
    }
    for (const word of mustNotContain) {
      if (haystack.includes(word.toLowerCase())) {
        bump(`must_not_contain: "${word}"`);
      }
    }
  }

  for (const [check, count] of counts) {
    report.violations.push({ check, count });
  }
  // Any recorded violation overrides a PASS or WARN status.
  if (report.violations.length > 0) {
    report.status = 'FAIL';
  }
  return report;
}
package/dist/index.js ADDED
@@ -0,0 +1,94 @@
1
+ #!/usr/bin/env node
2
+ import { Command } from 'commander';
3
+ import chalk from 'chalk';
4
+ import { ensureDirs, loadScenario, saveRuns, saveBaseline, loadBaseline, loadLatestRun } from './utils.js';
5
+ import { runScenario } from './runner.js';
6
+ import { evaluate } from './evaluator.js';
7
// Root CLI object; the metadata below is what the commands are registered on.
const program = new Command();
program.name('safestar');
program.description('Snapshot and diff AI behavior');
program.version('1.0.0');
12
// COMMAND: RUN
// Executes the scenario, stores the runs, then prints a report against the
// saved baseline (if any) so the user gets immediate feedback.
program.command('run <scenarioPath>')
  .description('Execute a scenario and save runs locally')
  .action(async (scenarioPath) => {
    try {
      ensureDirs();
      const scenario = loadScenario(scenarioPath);
      const results = await runScenario(scenario);
      const savedPath = saveRuns(scenario.name, results);
      console.log(chalk.green(`✓ Runs completed. Saved to ${savedPath}`));
      // Auto-run diff logic to show immediate feedback
      const baseline = loadBaseline(scenario.name);
      const report = evaluate(scenario, results, baseline);
      printReport(report);
    } catch (e) {
      console.error(chalk.red('Error:'), e instanceof Error ? e.message : String(e));
      // Exit non-zero so shells and CI pipelines can detect the failure.
      process.exitCode = 1;
    }
  });
31
// COMMAND: BASELINE
// Copies the most recent saved run of a scenario into the baseline store.
program.command('baseline <scenarioName>')
  .description('Promote the latest run to be the new baseline')
  .action((scenarioName) => {
    try {
      const latest = loadLatestRun(scenarioName);
      if (!latest) {
        // Report on stderr, like the diff command does for the same condition.
        console.error(chalk.red('No runs found. Run "safestar run <scenario>" first.'));
        process.exitCode = 1;
        return;
      }
      saveBaseline(scenarioName, latest);
      console.log(chalk.green(`✓ Baseline updated for ${scenarioName}`));
    } catch (e) {
      console.error(chalk.red('Error:'), e instanceof Error ? e.message : String(e));
      // Exit non-zero so shells and CI pipelines can detect the failure.
      process.exitCode = 1;
    }
  });
48
// COMMAND: DIFF
// Re-evaluates the most recent saved run against the baseline without
// executing the scenario again.
program.command('diff <scenarioPath>')
  .description('Compare latest runs against baseline')
  .action((scenarioPath) => {
    try {
      const scenario = loadScenario(scenarioPath);
      const current = loadLatestRun(scenario.name);
      const baseline = loadBaseline(scenario.name);
      if (!current) {
        console.error(chalk.red('No current runs found.'));
        process.exitCode = 1;
        return;
      }
      const report = evaluate(scenario, current, baseline);
      printReport(report);
    } catch (e) {
      console.error(chalk.red('Error:'), e instanceof Error ? e.message : String(e));
      // Exit non-zero so shells and CI pipelines can detect the failure.
      process.exitCode = 1;
    }
  });
67
// Writes an evaluation report to stdout: status line, length metrics,
// drift versus baseline (only when a baseline average exists), then violations.
function printReport(report) {
  const { status, length, variance, violations } = report;

  console.log(chalk.bold('\n--- SAFESTAR REPORT ---'));

  // Anything that is not FAIL or WARN is shown as PASS.
  let statusLabel;
  switch (status) {
    case 'FAIL':
      statusLabel = chalk.red.bold('FAIL');
      break;
    case 'WARN':
      statusLabel = chalk.yellow.bold('WARN');
      break;
    default:
      statusLabel = chalk.green.bold('PASS');
  }
  console.log(`Status: ${statusLabel}`);

  console.log(`\nMetrics:`);
  console.log(` Avg Length: ${length.current} chars`);
  if (length.baseline > 0) {
    // Growth is highlighted in yellow; shrinkage or no change in blue.
    const paint = length.deltaPercent > 0 ? chalk.yellow : chalk.blue;
    console.log(` Drift: ${paint(`${length.deltaPercent}%`)} vs baseline`);
  }
  console.log(` Variance: ${variance.score} (std dev)`);

  if (violations.length === 0) {
    console.log(chalk.green(`\nNo heuristic violations.`));
  } else {
    console.log(chalk.red(`\nViolations:`));
    for (const violation of violations) {
      console.log(` - ${violation.check}: failed in ${violation.count} runs`);
    }
  }
  console.log('-----------------------\n');
}
94
// Parse the command line and dispatch to the matching command.
// process.argv is what parse() reads by default when run under Node.
program.parse(process.argv);
package/dist/runner.js ADDED
@@ -0,0 +1,34 @@
1
+ import { execSync } from 'child_process';
2
/**
 * Execute a scenario `scenario.runs` times and collect one record per run.
 *
 * With `scenario.exec` set, the command is run synchronously and its stdout
 * becomes the run output; the prompt is handed over via the PROMPT
 * environment variable. Without it, a random mock response is generated so
 * the tool can be tried without a real backend.
 *
 * @param {{name: string, runs: number, exec?: string, prompt?: string}} scenario
 * @returns {Promise<Array<{scenario: string, output: string, length: number, timestamp: string}>>}
 *   `output` is whitespace-trimmed and `length` is the length of that trimmed text.
 */
export async function runScenario(scenario) {
  const results = [];
  console.log(`Running scenario: ${scenario.name} (${scenario.runs} times)...`);
  for (let i = 0; i < scenario.runs; i++) {
    let rawOutput;
    if (scenario.exec) {
      // 1. REAL MODE: the user provided an exec command.
      // NOTE(review): `exec` is passed to a shell as-is, so scenario files
      // must come from a trusted source.
      try {
        rawOutput = execSync(scenario.exec, {
          encoding: 'utf-8',
          // The prompt reaches the user's script through the environment.
          env: { ...process.env, PROMPT: scenario.prompt },
          stdio: ['ignore', 'pipe', 'ignore'], // capture stdout only, drop stderr
        });
      } catch (error) {
        console.error(`Execution failed: ${error.message}`);
        // A failed execution is still recorded as a run, with a sentinel output.
        rawOutput = 'ERROR_IN_EXECUTION';
      }
    } else {
      // 2. DEMO MODE: no exec provided, fall back to a mock response.
      rawOutput = `Mock Response ${Math.random().toString(36).substring(7)}`;
    }
    // Trim once and measure the trimmed text so `length` always matches the
    // stored `output` (a trailing newline from the command is not counted).
    const output = rawOutput.trim();
    results.push({
      scenario: scenario.name,
      output,
      length: output.length,
      timestamp: new Date().toISOString(),
    });
  }
  return results;
}
package/dist/types.js ADDED
@@ -0,0 +1,15 @@
1
+ import { z } from 'zod';
2
// Optional heuristic checks a scenario may declare; every field is optional.
const ChecksSchema = z.object({
  max_length: z.number().optional(),
  must_contain: z.array(z.string()).optional(),
  must_not_contain: z.array(z.string()).optional(),
});

// Zod schema for the user's YAML scenario file.
export const ScenarioSchema = z.object({
  name: z.string(),
  description: z.string().optional(),
  prompt: z.string(),
  // Command to execute for each run (e.g., "python bot.py").
  exec: z.string().optional(),
  // Number of runs; a positive integer that defaults to 5 when omitted.
  runs: z.number().int().min(1).default(5),
  checks: ChecksSchema.optional(),
});
package/dist/utils.js ADDED
@@ -0,0 +1,47 @@
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+ import yaml from 'js-yaml';
4
+ import { ScenarioSchema } from './types.js';
5
// Root directory for saved runs; each scenario gets its own subdirectory.
const RUNS_DIR = '.safestar/runs';
// Root directory for promoted baselines; each scenario gets its own subdirectory.
const BASELINE_DIR = '.baselines';
7
/**
 * Create the runs and baselines root directories when they do not exist yet.
 */
export function ensureDirs() {
  for (const dir of [RUNS_DIR, BASELINE_DIR]) {
    if (fs.existsSync(dir)) {
      continue;
    }
    fs.mkdirSync(dir, { recursive: true });
  }
}
13
/**
 * Read a YAML scenario file and validate it against ScenarioSchema.
 *
 * @param {string} filepath - Path to the scenario YAML file.
 * @returns {object} The validated scenario as returned by ScenarioSchema.parse.
 * @throws When the file cannot be read, the YAML is invalid, or validation fails.
 */
export function loadScenario(filepath) {
  const parsedYaml = yaml.load(fs.readFileSync(filepath, 'utf-8'));
  return ScenarioSchema.parse(parsedYaml);
}
18
/**
 * Write a batch of runs to a new timestamped JSON file under the scenario's
 * runs directory, creating that directory if needed.
 *
 * @param {string} scenarioName - Subdirectory name under the runs root.
 * @param {*} runs - JSON-serializable run data.
 * @returns {string} Path of the file that was written.
 */
export function saveRuns(scenarioName, runs) {
  const scenarioDir = path.join(RUNS_DIR, scenarioName);
  const dirMissing = !fs.existsSync(scenarioDir);
  if (dirMissing) {
    fs.mkdirSync(scenarioDir, { recursive: true });
  }
  // The millisecond timestamp in the name is what orders runs over time.
  const outputPath = path.join(scenarioDir, `run_${Date.now()}.json`);
  const serialized = JSON.stringify(runs, null, 2);
  fs.writeFileSync(outputPath, serialized);
  return outputPath;
}
26
/**
 * Store the given runs as the scenario's baseline, overwriting `latest.json`
 * in the scenario's baseline directory (created if needed).
 *
 * @param {string} scenarioName - Subdirectory name under the baselines root.
 * @param {*} runs - JSON-serializable run data.
 */
export function saveBaseline(scenarioName, runs) {
  const scenarioDir = path.join(BASELINE_DIR, scenarioName);
  const dirMissing = !fs.existsSync(scenarioDir);
  if (dirMissing) {
    fs.mkdirSync(scenarioDir, { recursive: true });
  }
  const baselinePath = path.join(scenarioDir, 'latest.json');
  const serialized = JSON.stringify(runs, null, 2);
  fs.writeFileSync(baselinePath, serialized);
}
33
/**
 * Load the stored baseline for a scenario.
 *
 * @param {string} scenarioName - Subdirectory name under the baselines root.
 * @returns {*} Parsed contents of `latest.json`, or null when no baseline file exists.
 */
export function loadBaseline(scenarioName) {
  const baselinePath = path.join(BASELINE_DIR, scenarioName, 'latest.json');
  return fs.existsSync(baselinePath)
    ? JSON.parse(fs.readFileSync(baselinePath, 'utf-8'))
    : null;
}
39
/**
 * Load the most recent saved run file for a scenario.
 *
 * Only files named `run_<digits>.json` are considered, and the newest is
 * chosen by comparing the numeric timestamp in the name. A plain string sort
 * would rank `run_999` above `run_1000` and could pick an unrelated file
 * that happens to sort last.
 *
 * @param {string} scenarioName - Subdirectory name under the runs root.
 * @returns {*} Parsed JSON of the newest run file, or null when the
 *   directory is missing or contains no run files.
 */
export function loadLatestRun(scenarioName) {
  const targetDir = path.join(RUNS_DIR, scenarioName);
  if (!fs.existsSync(targetDir)) {
    return null;
  }

  const runFilePattern = /^run_(\d+)\.json$/;
  let latestName = null;
  let latestStamp = -1;
  for (const entry of fs.readdirSync(targetDir)) {
    const match = runFilePattern.exec(entry);
    if (match === null) {
      continue; // not a run file (editor backup, OS metadata, ...)
    }
    const stamp = Number(match[1]);
    if (stamp > latestStamp) {
      latestStamp = stamp;
      latestName = entry;
    }
  }

  if (latestName === null) {
    return null;
  }
  return JSON.parse(fs.readFileSync(path.join(targetDir, latestName), 'utf-8'));
}
package/package.json ADDED
@@ -0,0 +1,43 @@
1
+ {
2
+ "name": "safestar",
3
+ "version": "1.0.0",
4
+ "description": "Snapshot, version, and diff AI behavior over time.",
5
+ "main": "dist/index.js",
6
+ "bin": {
7
+ "safestar": "./dist/index.js"
8
+ },
9
+ "type": "module",
10
+ "files": [
11
+ "dist",
12
+ "README.md",
13
+ "package.json"
14
+ ],
15
+ "scripts": {
16
+ "dev": "tsx src/index.ts",
17
+ "build": "tsc",
18
+ "prepublishOnly": "npm run build"
19
+ },
20
+ "keywords": [
21
+ "ai",
22
+ "testing",
23
+ "drift",
24
+ "snapshot",
25
+ "cli"
26
+ ],
27
+ "author": "Aditya Pandey",
28
+ "license": "ISC",
29
+ "dependencies": {
30
+ "chalk": "^5.3.0",
31
+ "commander": "^11.1.0",
32
+ "glob": "^10.3.10",
33
+ "js-yaml": "^4.1.0",
34
+ "zod": "^3.22.4"
35
+ },
36
+ "devDependencies": {
37
+ "@types/node": "^20.11.0",
38
+ "tsx": "^4.7.0",
39
+ "typescript": "^5.3.3",
40
+ "@types/glob": "^8.1.0",
41
+ "@types/js-yaml": "^4.0.9"
42
+ }
43
+ }