@artemiskit/reports 0.1.6 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,98 @@
1
1
  # @artemiskit/reports
2
2
 
3
+ ## 0.2.2
4
+
5
+ ### Patch Changes
6
+
7
+ - Updated dependencies [d5ca7c6]
8
+ - @artemiskit/core@0.2.2
9
+
10
+ ## 0.2.1
11
+
12
+ ### Patch Changes
13
+
14
+ - Updated dependencies
15
+ - @artemiskit/core@0.2.1
16
+
17
+ ## 0.2.0
18
+
19
+ ### Minor Changes
20
+
21
+ - d2c3835: ## v0.2.0 - Enhanced Evaluation Features
22
+
23
+ ### CLI (`@artemiskit/cli`)
24
+
25
+ #### New Features
26
+
27
+ - **Multi-turn mutations**: Added `--mutations multi_turn` flag for red team testing with 4 built-in strategies:
28
+ - `gradual_escalation`: Gradually intensifies requests over conversation turns
29
+ - `context_switching`: Shifts topics to lower defenses before the attack
30
+ - `persona_building`: Establishes trust through roleplay
31
+ - `distraction`: Uses side discussions to slip in harmful requests
32
+ - **Custom multi-turn conversations**: Support for array prompts in red team scenarios (consistent with `run` command format). The last user message becomes the attack target; the preceding messages form the conversation context.
33
+ - **Custom attacks**: Added `--custom-attacks` flag to load custom attack patterns from YAML files with template variables and variations.
34
+ - **Encoding mutations**: Added `--mutations encoding` for obfuscation attacks (base64, ROT13, hex, unicode).
35
+ - **Directory scanning**: Run all scenarios in a directory with `akit run scenarios/`
36
+ - **Glob pattern matching**: Use patterns like `akit run scenarios/**/*.yaml`
37
+ - **Parallel execution**: Added `--parallel` flag for concurrent scenario execution
38
+ - **Scenario tags**: Filter scenarios with `--tags` flag
39
+
40
+ ### Core (`@artemiskit/core`)
41
+
42
+ #### New Features
43
+
44
+ - **Combined matchers**: New `type: combined` expectation with `operator: and|or` for complex assertion logic
45
+ - **`not_contains` expectation**: Negative containment check to ensure responses don't include specific text
46
+ - **`similarity` expectation**: Semantic similarity matching with two modes:
47
+ - Embedding-based: Uses vector embeddings for fast semantic comparison
48
+ - LLM-based fallback: Uses an LLM to evaluate semantic similarity when embeddings are unavailable
49
+ - Configurable threshold (default 0.75)
50
+ - **`inline` expectation**: Safe expression-based custom matchers in YAML using JavaScript-like expressions (e.g., `response.length > 100`, `response.includes('hello')`)
51
+ - **p90 latency metric**: Added p90 percentile to stress test latency metrics
52
+ - **Token usage tracking**: Monitor token consumption per request in stress tests
53
+ - **Cost estimation**: Estimate API costs with model pricing data
54
+
55
+ ### Red Team (`@artemiskit/redteam`)
56
+
57
+ #### New Features
58
+
59
+ - **MultiTurnMutation class**: Full implementation with strategy support and custom conversation prefixes
60
+ - **Custom attack loader**: Parse and load custom attack patterns from YAML
61
+ - **Encoding mutation**: Obfuscate attack payloads using various encoding schemes
62
+ - **CVSS-like severity scoring**: Detailed attack severity scoring with:
63
+ - `CvssScore` interface with attack vector, complexity, impact metrics
64
+ - `CvssCalculator` class for score calculation and aggregation
65
+ - Predefined scores for all mutations and detection categories
66
+ - Human-readable score descriptions and vector strings
67
+
68
+ ### Reports (`@artemiskit/reports`)
69
+
70
+ #### New Features
71
+
72
+ - **Run comparison HTML report**: Visual diff between two runs showing:
73
+ - Metrics overview with baseline vs current comparison
74
+ - Change summary (regressions, improvements, unchanged)
75
+ - Case-by-case comparison table with filtering
76
+ - Side-by-side response comparison for each case
77
+ - **Comparison JSON export**: Structured comparison data for programmatic use
78
+
79
+ ### CLI Enhancements
80
+
81
+ - **Compare command `--html` flag**: Generate HTML comparison report
82
+ - **Compare command `--json` flag**: Generate JSON comparison data
83
+
84
+ ### Documentation
85
+
86
+ - Updated all CLI command documentation
87
+ - Added comprehensive examples for custom multi-turn scenarios
88
+ - Documented combined matchers and `not_contains` expectations
89
+ - Added mutation strategy reference tables
90
+
91
+ ### Patch Changes
92
+
93
+ - Updated dependencies [d2c3835]
94
+ - @artemiskit/core@0.2.0
95
+
3
96
  ## 0.1.6
4
97
 
5
98
  ### Patch Changes
@@ -0,0 +1,52 @@
1
+ /**
2
+ * HTML Comparison Report Generator
3
+ * Generates a visual comparison between two runs
4
+ */
5
+ import type { CaseResult, RunManifest } from '@artemiskit/core';
6
+ /**
7
+ * Case-level comparison data
8
+ */
9
+ export interface CaseComparison {
10
+ caseId: string;
11
+ name?: string;
12
+ baselineStatus: 'passed' | 'failed' | null;
13
+ currentStatus: 'passed' | 'failed' | null;
14
+ baselineScore: number | null;
15
+ currentScore: number | null;
16
+ scoreDelta: number;
17
+ baselineLatency: number | null;
18
+ currentLatency: number | null;
19
+ latencyDelta: number;
20
+ changeType: 'regressed' | 'improved' | 'unchanged' | 'new' | 'removed';
21
+ baselineCase?: CaseResult;
22
+ currentCase?: CaseResult;
23
+ }
24
+ /**
25
+ * Comparison data structure
26
+ */
27
+ export interface ComparisonData {
28
+ baseline: RunManifest;
29
+ current: RunManifest;
30
+ metrics: {
31
+ successRateDelta: number;
32
+ medianLatencyDelta: number;
33
+ totalTokensDelta: number;
34
+ };
35
+ caseComparisons: CaseComparison[];
36
+ summary: {
37
+ totalRegressions: number;
38
+ totalImprovements: number;
39
+ totalUnchanged: number;
40
+ casesRemoved: number;
41
+ casesAdded: number;
42
+ };
43
+ }
44
+ /**
45
+ * Build comparison data from two manifests
46
+ */
47
+ export declare function buildComparisonData(baseline: RunManifest, current: RunManifest): ComparisonData;
48
+ /**
49
+ * Generate an HTML comparison report
50
+ */
51
+ export declare function generateCompareHTMLReport(baseline: RunManifest, current: RunManifest): string;
52
+ //# sourceMappingURL=compare-generator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"compare-generator.d.ts","sourceRoot":"","sources":["../../src/html/compare-generator.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAEH,OAAO,KAAK,EAAE,UAAU,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAGhE;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,CAAC,EAAE,MAAM,CAAC;IACd,cAAc,EAAE,QAAQ,GAAG,QAAQ,GAAG,IAAI,CAAC;IAC3C,aAAa,EAAE,QAAQ,GAAG,QAAQ,GAAG,IAAI,CAAC;IAC1C,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,YAAY,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,UAAU,EAAE,MAAM,CAAC;IACnB,eAAe,EAAE,MAAM,GAAG,IAAI,CAAC;IAC/B,cAAc,EAAE,MAAM,GAAG,IAAI,CAAC;IAC9B,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,WAAW,GAAG,UAAU,GAAG,WAAW,GAAG,KAAK,GAAG,SAAS,CAAC;IACvE,YAAY,CAAC,EAAE,UAAU,CAAC;IAC1B,WAAW,CAAC,EAAE,UAAU,CAAC;CAC1B;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,QAAQ,EAAE,WAAW,CAAC;IACtB,OAAO,EAAE,WAAW,CAAC;IACrB,OAAO,EAAE;QACP,gBAAgB,EAAE,MAAM,CAAC;QACzB,kBAAkB,EAAE,MAAM,CAAC;QAC3B,gBAAgB,EAAE,MAAM,CAAC;KAC1B,CAAC;IACF,eAAe,EAAE,cAAc,EAAE,CAAC;IAClC,OAAO,EAAE;QACP,gBAAgB,EAAE,MAAM,CAAC;QACzB,iBAAiB,EAAE,MAAM,CAAC;QAC1B,cAAc,EAAE,MAAM,CAAC;QACvB,YAAY,EAAE,MAAM,CAAC;QACrB,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;CACH;AAwfD;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,WAAW,EAAE,OAAO,EAAE,WAAW,GAAG,cAAc,CAsG/F;AAED;;GAEG;AACH,wBAAgB,yBAAyB,CAAC,QAAQ,EAAE,WAAW,EAAE,OAAO,EAAE,WAAW,GAAG,MAAM,CA0H7F"}
@@ -1 +1 @@
1
- {"version":3,"file":"generator.d.ts","sourceRoot":"","sources":["../../src/html/generator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAkOpD,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,WAAW,GAAG,MAAM,CA2BhE"}
1
+ {"version":3,"file":"generator.d.ts","sourceRoot":"","sources":["../../src/html/generator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAmgBpD,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,WAAW,GAAG,MAAM,CA+BhE"}
@@ -1 +1 @@
1
- {"version":3,"file":"redteam-generator.d.ts","sourceRoot":"","sources":["../../src/html/redteam-generator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AAkVxD,wBAAgB,yBAAyB,CAAC,QAAQ,EAAE,eAAe,GAAG,MAAM,CA0C3E"}
1
+ {"version":3,"file":"redteam-generator.d.ts","sourceRoot":"","sources":["../../src/html/redteam-generator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AA6nBxD,wBAAgB,yBAAyB,CAAC,QAAQ,EAAE,eAAe,GAAG,MAAM,CA8C3E"}
@@ -1 +1 @@
1
- {"version":3,"file":"stress-generator.d.ts","sourceRoot":"","sources":["../../src/html/stress-generator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AA0UvD,wBAAgB,wBAAwB,CAAC,QAAQ,EAAE,cAAc,GAAG,MAAM,CAsCzE"}
1
+ {"version":3,"file":"stress-generator.d.ts","sourceRoot":"","sources":["../../src/html/stress-generator.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,kBAAkB,CAAC;AAiavD,wBAAgB,wBAAwB,CAAC,QAAQ,EAAE,cAAc,GAAG,MAAM,CAsCzE"}
package/dist/index.d.ts CHANGED
@@ -6,4 +6,5 @@ export { generateHTMLReport } from './html/generator';
6
6
  export { generateJSONReport, type JSONReportOptions } from './json/generator';
7
7
  export { generateRedTeamHTMLReport } from './html/redteam-generator';
8
8
  export { generateStressHTMLReport } from './html/stress-generator';
9
+ export { generateCompareHTMLReport, buildComparisonData, type ComparisonData, type CaseComparison, } from './html/compare-generator';
9
10
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,KAAK,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AAG9E,OAAO,EAAE,yBAAyB,EAAE,MAAM,0BAA0B,CAAC;AAGrE,OAAO,EAAE,wBAAwB,EAAE,MAAM,yBAAyB,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAGH,OAAO,EAAE,kBAAkB,EAAE,MAAM,kBAAkB,CAAC;AACtD,OAAO,EAAE,kBAAkB,EAAE,KAAK,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AAG9E,OAAO,EAAE,yBAAyB,EAAE,MAAM,0BAA0B,CAAC;AAGrE,OAAO,EAAE,wBAAwB,EAAE,MAAM,yBAAyB,CAAC;AAGnE,OAAO,EACL,yBAAyB,EACzB,mBAAmB,EACnB,KAAK,cAAc,EACnB,KAAK,cAAc,GACpB,MAAM,0BAA0B,CAAC"}