@artemiskit/reports 0.1.6 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,84 @@
1
1
  # @artemiskit/reports
2
2
 
3
+ ## 0.2.0
4
+
5
+ ### Minor Changes
6
+
7
+ - d2c3835: ## v0.2.0 - Enhanced Evaluation Features
8
+
9
+ ### CLI (`@artemiskit/cli`)
10
+
11
+ #### New Features
12
+
13
+ - **Multi-turn mutations**: Added `--mutations multi_turn` flag for red team testing with 4 built-in strategies:
14
+ - `gradual_escalation`: Gradually intensifies requests over conversation turns
15
+ - `context_switching`: Shifts topics to lower defenses before attack
16
+ - `persona_building`: Establishes trust through roleplay
17
+ - `distraction`: Uses side discussions to slip in harmful requests
18
+ - **Custom multi-turn conversations**: Support for array prompts in red team scenarios (consistent with `run` command format). The last user message becomes the attack target; the preceding messages form the conversation context.
19
+ - **Custom attacks**: Added `--custom-attacks` flag to load custom attack patterns from YAML files with template variables and variations.
20
+ - **Encoding mutations**: Added `--mutations encoding` for obfuscation attacks (base64, ROT13, hex, unicode).
21
+ - **Directory scanning**: Run all scenarios in a directory with `akit run scenarios/`
22
+ - **Glob pattern matching**: Use patterns like `akit run scenarios/**/*.yaml`
23
+ - **Parallel execution**: Added `--parallel` flag for concurrent scenario execution
24
+ - **Scenario tags**: Filter scenarios with `--tags` flag
25
+
26
+ ### Core (`@artemiskit/core`)
27
+
28
+ #### New Features
29
+
30
+ - **Combined matchers**: New `type: combined` expectation with `operator: and|or` for complex assertion logic
31
+ - **`not_contains` expectation**: Negative containment check to ensure responses don't include specific text
32
+ - **`similarity` expectation**: Semantic similarity matching with two modes:
33
+ - Embedding-based: Uses vector embeddings for fast semantic comparison
34
+ - LLM-based fallback: Uses an LLM to evaluate semantic similarity when embeddings are unavailable
35
+ - Configurable threshold (default 0.75)
36
+ - **`inline` expectation**: Safe expression-based custom matchers in YAML using JavaScript-like expressions (e.g., `response.length > 100`, `response.includes('hello')`)
37
+ - **p90 latency metric**: Added p90 percentile to stress test latency metrics
38
+ - **Token usage tracking**: Monitor token consumption per request in stress tests
39
+ - **Cost estimation**: Estimate API costs with model pricing data
40
+
41
+ ### Red Team (`@artemiskit/redteam`)
42
+
43
+ #### New Features
44
+
45
+ - **MultiTurnMutation class**: Full implementation with strategy support and custom conversation prefixes
46
+ - **Custom attack loader**: Parse and load custom attack patterns from YAML
47
+ - **Encoding mutation**: Obfuscate attack payloads using various encoding schemes
48
+ - **CVSS-like severity scoring**: Detailed attack severity scoring with:
49
+ - `CvssScore` interface with attack vector, complexity, impact metrics
50
+ - `CvssCalculator` class for score calculation and aggregation
51
+ - Predefined scores for all mutations and detection categories
52
+ - Human-readable score descriptions and vector strings
53
+
54
+ ### Reports (`@artemiskit/reports`)
55
+
56
+ #### New Features
57
+
58
+ - **Run comparison HTML report**: Visual diff between two runs showing:
59
+ - Metrics overview with baseline vs current comparison
60
+ - Change summary (regressions, improvements, unchanged)
61
+ - Case-by-case comparison table with filtering
62
+ - Side-by-side response comparison for each case
63
+ - **Comparison JSON export**: Structured comparison data for programmatic use
64
+
65
+ ### CLI Enhancements
66
+
67
+ - **Compare command `--html` flag**: Generate HTML comparison report
68
+ - **Compare command `--json` flag**: Generate JSON comparison data
69
+
70
+ ### Documentation
71
+
72
+ - Updated all CLI command documentation
73
+ - Added comprehensive examples for custom multi-turn scenarios
74
+ - Documented combined matchers and `not_contains` expectations
75
+ - Added mutation strategy reference tables
76
+
77
+ ### Patch Changes
78
+
79
+ - Updated dependencies [d2c3835]
80
+ - @artemiskit/core@0.2.0
81
+
3
82
  ## 0.1.6
4
83
 
5
84
  ### Patch Changes
@@ -0,0 +1,52 @@
1
+ /**
2
+ * HTML Comparison Report Generator
3
+ * Generates a visual comparison between two runs
4
+ */
5
+ import type { CaseResult, RunManifest } from '@artemiskit/core';
6
+ /**
7
+ * Case-level comparison data
8
+ */
9
+ export interface CaseComparison {
10
+ caseId: string;
11
+ name?: string;
12
+ baselineStatus: 'passed' | 'failed' | null;
13
+ currentStatus: 'passed' | 'failed' | null;
14
+ baselineScore: number | null;
15
+ currentScore: number | null;
16
+ scoreDelta: number;
17
+ baselineLatency: number | null;
18
+ currentLatency: number | null;
19
+ latencyDelta: number;
20
+ changeType: 'regressed' | 'improved' | 'unchanged' | 'new' | 'removed';
21
+ baselineCase?: CaseResult;
22
+ currentCase?: CaseResult;
23
+ }
24
+ /**
25
+ * Comparison data structure
26
+ */
27
+ export interface ComparisonData {
28
+ baseline: RunManifest;
29
+ current: RunManifest;
30
+ metrics: {
31
+ successRateDelta: number;
32
+ medianLatencyDelta: number;
33
+ totalTokensDelta: number;
34
+ };
35
+ caseComparisons: CaseComparison[];
36
+ summary: {
37
+ totalRegressions: number;
38
+ totalImprovements: number;
39
+ totalUnchanged: number;
40
+ casesRemoved: number;
41
+ casesAdded: number;
42
+ };
43
+ }
44
+ /**
45
+ * Build comparison data from two manifests
46
+ */
47
+ export declare function buildComparisonData(baseline: RunManifest, current: RunManifest): ComparisonData;
48
+ /**
49
+ * Generate an HTML comparison report
50
+ */
51
+ export declare function generateCompareHTMLReport(baseline: RunManifest, current: RunManifest): string;
52
+ //# sourceMappingURL=compare-generator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compare-generator.d.ts","sourceRoot":"","sources":["../../src/html/compare-generator.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAGhE;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,cAAc,EAAE,QAAQ,GAAG,QAAQ,GAAG,IAAI,CAAC;IAC3C,aAAa,EAAE,QAAQ,GAAG,QAAQ,GAAG,IAAI,CAAC;IAC1C,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,YAAY,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,UAAU,EAAE,MAAM,CAAC;IACnB,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,WAAW,GAAG,UAAU,GAAG,WAAW,GAAG,KAAK,GAAG,SAAS,CAAC;IACvE,YAAY,CAAC,EAAE,UAAU,CAAC;IAC1B,WAAW,CAAC,EAAE,UAAU,CAAC;CAC1B;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,QAAQ,EAAE,WAAW,CAAC;IACtB,OAAO,EAAE,WAAW,CAAC;IACrB,OAAO,EAAE;QACP,gBAAgB,EAAE,MAAM,CAAC;QACzB,kBAAkB,EAAE,MAAM,CAAC;QAC3B,gBAAgB,EAAE,MAAM,CAAC;KAC1B,CAAC;IACF,eAAe,EAAE,cAAc,EAAE,CAAC;IAClC,OAAO,EAAE;QACP,gBAAgB,EAAE,MAAM,CAAC;QACzB,iBAAiB,EAAE,MAAM,CAAC;QAC1B,cAAc,EAAE,MAAM,CAAC;QACvB,YAAY,EAAE,MAAM,CAAC;QACrB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;CACH;AAwfD;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,WAAW,EAAE,OAAO,EAAE,WAAW,GAAG,cAAc,CAsG/F;AAED;;GAEG;AACH,wBAAgB,yBAAyB,CAAC,QAAQ,EAAE,WAAW,EAAE,OAAO,EAAE,WAAW,GAAG,MAAM,CA0H7F"}
@@ -1 +1 @@
1
- {"version":3,"file":"generator.d.ts","sourceRoot":"","sources":["../../src/html/generator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAkOpD,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,WAAW,GAAG,MAAM,CA2BhE"}
1
+ {"version":3,"file":"generator.d.ts","sourceRoot":"","sources":["../../src/html/generator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAmgBpD,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,WAAW,GAAG,MAAM,CA+BhE"}
@@ -1 +1 @@
1
- {"version":3,"file":"redteam-generator.d.ts","sourceRoot":"","sources":["../../src/html/redteam-generator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AAkVxD,wBAAgB,yBAAyB,CAAC,QAAQ,EAAE,eAAe,GAAG,MAAM,CA0C3E"}
1
+ {"version":3,"file":"redteam-generator.d.ts","sourceRoot":"","sources":["../../src/html/redteam-generator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AA6nBxD,wBAAgB,yBAAyB,CAAC,QAAQ,EAAE,eAAe,GAAG,MAAM,CA8C3E"}
@@ -1 +1 @@
1
- {"version":3,"file":"stress-generator.d.ts","sourceRoot":"","sources":["../../src/html/stress-generator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AA0UvD,wBAAgB,wBAAwB,CAAC,QAAQ,EAAE,cAAc,GAAG,MAAM,CAsCzE"}
1
+ {"version":3,"file":"stress-generator.d.ts","sourceRoot":"","sources":["../../src/html/stress-generator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AAiavD,wBAAgB,wBAAwB,CAAC,QAAQ,EAAE,cAAc,GAAG,MAAM,CAsCzE"}
package/dist/index.d.ts CHANGED
@@ -6,4 +6,5 @@ export { generateHTMLReport } from './html/generator';
6
6
  export { generateJSONReport, type JSONReportOptions } from './json/generator';
7
7
  export { generateRedTeamHTMLReport } from './html/redteam-generator';
8
8
  export { generateStressHTMLReport } from './html/stress-generator';
9
+ export { generateCompareHTMLReport, buildComparisonData, type ComparisonData, type CaseComparison, } from './html/compare-generator';
9
10
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,KAAK,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AAG9E,OAAO,EAAE,yBAAyB,EAAE,MAAM,0BAA0B,CAAC;AAGrE,OAAO,EAAE,wBAAwB,EAAE,MAAM,yBAAyB,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,KAAK,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AAG9E,OAAO,EAAE,yBAAyB,EAAE,MAAM,0BAA0B,CAAC;AAGrE,OAAO,EAAE,wBAAwB,EAAE,MAAM,yBAAyB,CAAC;AAGnE,OAAO,EACL,yBAAyB,EACzB,mBAAmB,EACnB,KAAK,cAAc,EACnB,KAAK,cAAc,GACpB,MAAM,0BAA0B,CAAC"}