kimi-agent-swarm-cli 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +49 -0
- package/fixtures/asset-mgmt-roles.json +1543 -0
- package/fixtures/basic-sources.json +58 -0
- package/fixtures/github-repo-landscape.json +308 -0
- package/fixtures/golden-answers.ts +56 -0
- package/fixtures/jsonl-provider.ts +41 -0
- package/fixtures/market-scan.json +246 -0
- package/fixtures/paul-graham-corpus.json +272 -0
- package/fixtures/sellside-research-roles.json +1709 -0
- package/fixtures/youtube-niche.json +262 -0
- package/package.json +45 -0
- package/src/benchmark.ts +151 -0
- package/src/cache.ts +86 -0
- package/src/cli.ts +377 -0
- package/src/command-provider.ts +99 -0
- package/src/config.ts +134 -0
- package/src/costs.ts +134 -0
- package/src/distributed/memory-adapter.ts +152 -0
- package/src/distributed/queue-adapter.ts +29 -0
- package/src/distributed/redis-adapter.ts +185 -0
- package/src/distributed/runner.ts +325 -0
- package/src/distributed/task-splitter.ts +78 -0
- package/src/export.ts +70 -0
- package/src/init.ts +138 -0
- package/src/leaderboard.ts +201 -0
- package/src/providers/brave-provider.ts +161 -0
- package/src/providers/github-provider.ts +151 -0
- package/src/providers/index.ts +49 -0
- package/src/providers/mock-search-provider.ts +45 -0
- package/src/providers/search-provider.ts +12 -0
- package/src/providers/serper-provider.ts +154 -0
- package/src/providers/tavily-provider.ts +158 -0
- package/src/runtime.ts +349 -0
- package/src/scorer.ts +103 -0
- package/src/types.ts +246 -0
- package/src/verifier.ts +369 -0
package/src/scorer.ts
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
import type { EnrichedSource, Source } from "./types";
|
|
2
|
+
|
|
3
|
+
/**
 * Weight applied to each score dimension when the weighted score is computed.
 * The five weights sum to 1, so the weighted base score stays on the same
 * scale as the individual dimension scores.
 */
const SCORE_WEIGHTS = {
  // Dominant signals.
  relevance: 0.35,
  authority: 0.25,
  // Secondary signals.
  freshness: 0.2,
  diversity: 0.1,
  extractionValue: 0.1,
};
|
|
10
|
+
|
|
11
|
+
/** Domains whose pages (including pages on their subdomains) earn the top authority bonus. */
const HIGH_AUTHORITY_DOMAINS = new Set([
  "github.com",
  "arxiv.org",
  "ssrn.com",
  "crunchbase.com",
  "sec.gov",
  "who.int",
  "imf.org",
  "worldbank.org",
  "bloomberg.com",
  "reuters.com",
  "ft.com",
  "wsj.com",
  "economist.com",
  "nature.com",
  "science.org",
]);

/**
 * Returns the lower-cased hostname of `url`.
 *
 * @param url - Absolute URL string.
 * @returns The hostname, or `undefined` when `url` cannot be parsed by `URL`.
 */
function extractDomain(url: string): string | undefined {
  try {
    return new URL(url).hostname.toLowerCase();
  } catch {
    return undefined;
  }
}

/**
 * Reports whether `host` is one of the high-authority domains or a subdomain
 * of one. Each dot-separated suffix is tested in turn, so "www.github.com"
 * matches "github.com" while "notgithub.com" does not.
 */
function isHighAuthorityHost(host: string): boolean {
  let candidate = host;
  for (;;) {
    if (HIGH_AUTHORITY_DOMAINS.has(candidate)) return true;
    const dot = candidate.indexOf(".");
    if (dot === -1) return false;
    candidate = candidate.slice(dot + 1);
  }
}

/**
 * Additive score bonus derived from the host of a source URL.
 *
 * @param url - URL of the source.
 * @returns 0.5 for a high-authority domain or any of its subdomains, 0.4 for
 *   `.edu` / `.gov` / `.ac.uk` hosts, 0.15 for other `.org` hosts, and 0 for
 *   everything else, including URLs that cannot be parsed.
 */
function domainAuthorityBonus(url: string): number {
  // A fully-qualified host may carry a trailing dot ("github.com."); drop it
  // so suffix checks and set lookups still match.
  const domain = extractDomain(url)?.replace(/\.$/, "");
  if (!domain) return 0;

  if (isHighAuthorityHost(domain)) return 0.5;
  if (domain.endsWith(".edu") || domain.endsWith(".gov") || domain.endsWith(".ac.uk")) {
    return 0.4;
  }
  if (domain.endsWith(".org")) return 0.15;
  return 0;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Score penalty based on the publication year of a source.
 *
 * The value must start with a four-digit year (optionally preceded by
 * whitespace). Anything else — missing, "unknown", or free text such as
 * "March 2020" — is treated as an unknown date. Comparing by year instead of
 * by raw string keeps non-date text from sorting above the newest threshold
 * and puts year-only values ("2025") in the correct bucket.
 *
 * @param publishedAt - Publication date, expected to begin with `YYYY`.
 * @returns 0 for 2026 or later, -0.2 for 2025, -0.5 for 2024, -0.8 for
 *   anything older, and -0.3 when the date is missing or unrecognisable.
 */
function freshnessPenalty(publishedAt?: string): number {
  const yearMatch = /^\s*(\d{4})/.exec(publishedAt ?? "");
  if (!yearMatch) return -0.3;

  const year = Number(yearMatch[1]);
  if (year >= 2026) return 0;
  if (year >= 2025) return -0.2;
  if (year >= 2024) return -0.5;
  return -0.8;
}
|
|
56
|
+
|
|
57
|
+
/**
 * Computes the overall score of a source.
 *
 * Each score dimension (missing ones count as 0) is multiplied by its entry
 * in `SCORE_WEIGHTS`; the domain-authority bonus and the freshness penalty are
 * then added and the total is clamped to the range 0..5.
 *
 * @param source - Source whose `scores`, `url` and `publishedAt` are read.
 * @returns The clamped weighted score.
 */
export function calculateWeightedScore(source: Source): number {
  const scores = source.scores ?? {};
  const weigh = (value: number | undefined, weight: number): number => (value ?? 0) * weight;

  // Summed in a fixed order so floating-point results stay reproducible.
  const baseScore =
    weigh(scores.relevance, SCORE_WEIGHTS.relevance) +
    weigh(scores.authority, SCORE_WEIGHTS.authority) +
    weigh(scores.freshness, SCORE_WEIGHTS.freshness) +
    weigh(scores.diversity, SCORE_WEIGHTS.diversity) +
    weigh(scores.extractionValue, SCORE_WEIGHTS.extractionValue);

  const adjusted = baseScore + domainAuthorityBonus(source.url) + freshnessPenalty(source.publishedAt);
  const cappedAtFive = Math.min(5, adjusted);
  return Math.max(0, cappedAtFive);
}
|
|
71
|
+
|
|
72
|
+
/**
 * Decides whether a source is accepted and records the reason.
 *
 * A source is accepted when its weighted score is at least 3.0 and both its
 * relevance and authority scores are at least 2. Otherwise the first failing
 * criterion, checked in the order relevance, authority, weighted score,
 * becomes the rejection reason.
 *
 * @param source - Source to evaluate; it is not mutated.
 * @returns A copy of `source` with `decision` and `reason` added.
 */
export function scoreSource(source: Source): EnrichedSource {
  const weightedScore = calculateWeightedScore(source);
  const relevance = source.scores?.relevance ?? 0;
  const authority = source.scores?.authority ?? 0;

  const rejectedBecause = (reason: string): EnrichedSource => ({
    ...source,
    decision: "rejected",
    reason,
  });

  if (weightedScore >= 3.0 && relevance >= 2 && authority >= 2) {
    return {
      ...source,
      decision: "accepted",
      reason: `weighted score ${weightedScore.toFixed(2)} meets acceptance threshold`,
    };
  }
  if (relevance < 2) return rejectedBecause("low relevance");
  if (authority < 2) return rejectedBecause("low authority");
  if (weightedScore < 3.0) {
    return rejectedBecause(
      `weighted score ${weightedScore.toFixed(2)} below acceptance threshold`,
    );
  }
  // Only reachable when every comparison above is false, e.g. a NaN score.
  return rejectedBecause("duplicate or low-value source");
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,246 @@
|
|
|
1
|
+
/** Named search-depth levels. */
export type SearchDepth = "light" | "standard" | "deep" | "maximum";

/**
 * Identifier of the execution profile used by a run.
 *
 * NOTE(review): the `fixture*` members presumably select bundled fixture
 * data, while "local-command" and "web-search" select live providers —
 * confirm against the code that consumes this type.
 */
export type ExecutionProfile =
  | "fixture"
  | "fixture-asset-mgmt"
  | "fixture-sellside-research"
  | "fixture-youtube-niche"
  | "fixture-paul-graham-corpus"
  | "fixture-github-repo-landscape"
  | "fixture-market-scan"
  | "local-command"
  | "web-search";
|
|
13
|
+
|
|
14
|
+
/**
 * Per-dimension scores of a source. Only `relevance` and `authority` are
 * required; the remaining dimensions are optional.
 *
 * NOTE(review): values look like they are on a 0–5 scale — confirm with the
 * providers that populate them.
 */
export interface SourceScores {
  relevance: number;
  authority: number;
  freshness?: number;
  diversity?: number;
  extractionValue?: number;
}

/** A discovered source before any accept/reject decision has been made. */
export interface Source {
  /** Identifier that claims refer to through their `sourceIds`. */
  id: string;
  url: string;
  title: string;
  /** Category label of the source, e.g. "primary-analysis" or "secondary". */
  sourceClass: string;
  /** Publication date string; may be absent. */
  publishedAt?: string;
  discoveredBy: string;
  scores: SourceScores;
  claims?: string[];
}

/** A source together with the scoring decision and the reason for it. */
export interface EnrichedSource extends Source {
  decision: "accepted" | "rejected";
  reason: string;
}
|
|
37
|
+
|
|
38
|
+
/** Confidence level attached to a claim. */
export type ClaimConfidence = "high" | "medium" | "low";

/** Freshness label attached to a claim. */
export type ClaimFreshness = "current" | "stale" | "unknown";

/** A single claim together with the ids of the sources that back it. */
export interface Claim {
  id: string;
  /** The claim text. */
  claim: string;
  /** Ids of the supporting sources. */
  sourceIds: string[];
  confidence: ClaimConfidence;
  freshness: ClaimFreshness;
}
|
|
49
|
+
|
|
50
|
+
/** Call counters and optional token/cost figures collected for a run. */
export interface UsageMetrics {
  providerCalls: number;
  apiCalls: number;
  estimatedTokens?: number;
  estimatedCostUsd?: number;
  actualCostUsd?: number;
  notes?: string;
}

/** Metadata record describing one run. */
export interface Run {
  runId: string;
  objective: string;
  executionProfile: ExecutionProfile;
  status: "completed" | "failed";
  createdAt: string;
  usageMetrics: UsageMetrics;
  /** NOTE(review): presumably the id of the run this one replays — confirm. */
  replayedFrom?: string;
  cached?: boolean;
}

/** Plan describing what a run searches for and when it stops. */
export interface ResearchPlan {
  objective: string;
  searchDepth: SearchDepth;
  executionProfile: ExecutionProfile;
  queryFamilies: string[];
  sourceTargets: string[];
  stopConditions: string[];
}
|
|
78
|
+
|
|
79
|
+
/** A set of claims judged to say the same thing. */
export interface DuplicateClaimGroup {
  /** Id of the claim chosen to stand for the group. */
  representativeClaimId: string;
  /** Ids of every claim in the group. */
  claimIds: string[];
  similarityReason: string;
}

/** Two claims that appear to contradict each other about a shared entity. */
export interface ConflictingClaimPair {
  claimIdA: string;
  claimIdB: string;
  entity: string;
  reason: string;
}

/** Result of verifying a run. */
export interface VerificationReport {
  status: "passed" | "failed";

  // Source counts.
  acceptedSources: number;
  rejectedSources: number;

  // Claim counts.
  unsupportedClaims: number;
  staleClaims: number;
  unknownFreshnessClaims: number;
  lowConfidenceClaims: number;

  // Detailed findings.
  duplicateClaimGroups: DuplicateClaimGroup[];
  conflictingClaimPairs: ConflictingClaimPair[];
  coverageGaps: string[];

  // Messages collected during verification.
  failures: string[];
  warnings: string[];
}
|
|
106
|
+
|
|
107
|
+
/** Options for starting a wide-search run. Every field is optional. */
export interface RunWideSearchOptions {
  objective?: string;
  profile?: ExecutionProfile;
  searchDepth?: SearchDepth;

  // Provider selection.
  providerName?: string;
  providerCommand?: string;
  providerArgs?: string[];

  // Execution environment and limits.
  workDir?: string;
  budget?: BudgetOptions;
  useCache?: boolean;
  replayRunId?: string;
  distributed?: DistributedRunOptions;
}
|
|
120
|
+
|
|
121
|
+
/** Settings for distributed execution of a run. */
export interface DistributedRunOptions {
  enabled: boolean;
  workers?: number;
  maxRetries?: number;
  resumeJobId?: string;
  queueType?: "memory" | "redis";
}

/** A distributed job and the tasks it is made of. */
export interface DistributedJob {
  jobId: string;
  objective: string;
  executionProfile: ExecutionProfile;
  providerName: string;
  searchDepth: SearchDepth;
  queueType: "memory" | "redis";
  status: DistributedJobStatus;
  tasks: DistributedTask[];
  createdAt: string;
  updatedAt: string;
}

/** Lifecycle state of a distributed job. */
export type DistributedJobStatus = "pending" | "running" | "completed" | "failed";

/** One unit of work inside a distributed job. */
export interface DistributedTask {
  taskId: string;
  /** Id of the job this task belongs to. */
  jobId: string;
  queryFamily: string;
  query: string;
  status: DistributedTaskStatus;
  attempts: number;
  maxRetries: number;
  workerId?: string;
  result?: WorkerResult;
  error?: string;
  startedAt?: string;
  completedAt?: string;
}

/** Lifecycle state of a distributed task. */
export type DistributedTaskStatus = "pending" | "running" | "completed" | "failed";

/** Sources and usage metrics carried in a task's `result`. */
export interface WorkerResult {
  sources: Source[];
  usageMetrics: UsageMetrics;
}
|
|
165
|
+
|
|
166
|
+
/** Options for loading sources; `profile` and `objective` are required. */
export interface LoadSourcesOptions {
  profile: ExecutionProfile;
  objective: string;

  // Provider selection.
  providerName?: string;
  providerCommand?: string;
  providerArgs?: string[];

  searchDepth?: SearchDepth;
  metrics?: UsageMetrics;
  useCache?: boolean;
}
|
|
176
|
+
|
|
177
|
+
/**
 * Options accepted by `verifyRun`.
 *
 * Kept in step with the interface of the same name declared in
 * `verifier.ts`, so code importing the type from either module can set every
 * threshold. All fields are optional; `verifyRun` supplies the defaults noted
 * below.
 */
export interface VerifyRunOptions {
  /** Directory containing the run's ledgers; `verifyRun` throws without it. */
  runDir?: string;
  /** Minimum number of accepted sources (default 1). */
  minAcceptedSources?: number;
  /** Largest tolerated share of low-confidence claims (default 0.5). */
  maxLowConfidenceRatio?: number;
  /** Largest tolerated share of stale claims (default 0.5). */
  maxStaleRatio?: number;
  /** Largest tolerated number of duplicate claim groups (default: no practical limit). */
  maxDuplicateClaimGroups?: number;
}
|
|
181
|
+
|
|
182
|
+
/** Result of a wide-search run: the run id, its directory and its verification report. */
export interface RunWideSearchResult {
  runId: string;
  runDir: string;
  verification: VerificationReport;
}

/** Pricing of a provider in US dollars. */
export interface ProviderPricing {
  perCallUsd: number;
  per1kTokensUsd?: number;
}

/** Optional spending and call limits for a run. */
export interface BudgetOptions {
  maxCostUsd?: number;
  maxProviderCalls?: number;
  maxApiCalls?: number;
  dryRun?: boolean;
}

/** Supported export file formats. */
export type ExportFormat = "json" | "csv";

/** Options for exporting a run. */
export interface ExportOptions {
  runDir: string;
  format: ExportFormat;
  outPath?: string;
}
|
|
207
|
+
|
|
208
|
+
/** Expected claims, and optionally expected source URLs, for a benchmark. */
export interface GoldenAnswer {
  expectedClaims: string[];
  expectedSourceUrls?: string[];
}

/** Scores produced by benchmarking one run. */
export interface BenchmarkResult {
  profile: string;
  runId: string;
  runDir: string;

  // Quality metrics.
  precision: number;
  recall: number;
  citationAccuracy: number;
  f1: number;

  passed: boolean;
}

/** A leaderboard row: run identity plus its benchmark scores. */
export interface LeaderboardEntry {
  runId: string;
  profile: string;
  runDir: string;
  timestamp: string;
  gitCommit?: string;
  scores: BenchmarkResult;
}

/** Estimated call counts and cost for a provider at a given depth. */
export interface CostEstimate {
  providerName: string;
  depth: SearchDepth;
  estimatedProviderCalls: number;
  estimatedApiCalls: number;
  estimatedCostUsd: number;
}

/** Fields that together identify a cached provider result. */
export interface CacheKey {
  provider: string;
  objective: string;
  depth: SearchDepth;
  maxResults: number;
}
|
package/src/verifier.ts
ADDED
|
@@ -0,0 +1,369 @@
|
|
|
1
|
+
import { readFile, writeFile } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import type {
|
|
4
|
+
Claim,
|
|
5
|
+
ClaimConfidence,
|
|
6
|
+
ClaimFreshness,
|
|
7
|
+
ConflictingClaimPair,
|
|
8
|
+
DuplicateClaimGroup,
|
|
9
|
+
EnrichedSource,
|
|
10
|
+
VerificationReport,
|
|
11
|
+
} from "./types";
|
|
12
|
+
|
|
13
|
+
/**
 * Reads a JSON Lines file and parses every record.
 *
 * Lines are trimmed before parsing, so blank or whitespace-only lines and a
 * leading byte-order mark do not break the read. A record that is not valid
 * JSON raises a `SyntaxError` naming the file and the 1-based line number.
 *
 * @param path - Location of the `.jsonl` file.
 * @returns The parsed records in file order, or `null` when the file does not
 *   exist.
 * @throws SyntaxError when a line is not valid JSON.
 * @throws Any other file-system error raised by `readFile`, unchanged.
 */
async function readJsonl<T>(path: string): Promise<T[] | null> {
  let text: string;
  try {
    text = await readFile(path, "utf8");
  } catch (error) {
    // A missing ledger is an expected condition that callers handle.
    if ((error as NodeJS.ErrnoException).code === "ENOENT") {
      return null;
    }
    throw error;
  }

  const records: T[] = [];
  for (const [index, rawLine] of text.split(/\r?\n/).entries()) {
    const line = rawLine.trim();
    if (line === "") continue;
    try {
      records.push(JSON.parse(line) as T);
    } catch (error) {
      const detail = error instanceof Error ? error.message : String(error);
      throw new SyntaxError(`${path}:${index + 1}: invalid JSONL record: ${detail}`);
    }
  }
  return records;
}
|
|
27
|
+
|
|
28
|
+
/**
 * Canonicalises claim text for comparison: lower-cases it, replaces every
 * character that is not a letter, digit or whitespace with a space, collapses
 * whitespace runs to one space and trims both ends.
 *
 * @param text - Raw claim text.
 * @returns The normalised text.
 */
function normalizeClaimText(text: string): string {
  const lowered = text.toLowerCase();
  const withoutPunctuation = lowered.replace(/[^\p{L}\p{N}\s]/gu, " ");
  const singleSpaced = withoutPunctuation.replace(/\s+/g, " ");
  return singleSpaced.trim();
}
|
|
35
|
+
|
|
36
|
+
/**
 * Splits normalised claim text into its distinct words.
 *
 * @param text - Raw claim text.
 * @returns The set of non-empty tokens of the normalised text.
 */
function tokenSet(text: string): Set<string> {
  const words = normalizeClaimText(text).split(" ");
  const nonEmpty = words.filter((word) => word.length > 0);
  return new Set(nonEmpty);
}
|
|
39
|
+
|
|
40
|
+
/**
 * Jaccard similarity of the token sets of two texts: the number of shared
 * tokens divided by the number of distinct tokens overall.
 *
 * @returns A value between 0 and 1; two texts without any tokens count as
 *   identical (1).
 */
function jaccardSimilarity(a: string, b: string): number {
  const left = tokenSet(a);
  const right = tokenSet(b);
  if (left.size === 0 && right.size === 0) return 1;

  let shared = 0;
  for (const token of left) {
    if (right.has(token)) shared += 1;
  }
  // |A ∪ B| = |A| + |B| − |A ∩ B|
  const distinct = left.size + right.size - shared;
  return shared / distinct;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Groups claims that say essentially the same thing.
 *
 * Claims are visited from the shortest raw text to the longest. Each claim
 * not yet assigned to a group becomes a base, and every later unassigned
 * claim is compared with it. Two claims are duplicates when their Jaccard
 * token similarity is at least 0.7, or when the shorter normalised text is
 * longer than 10 characters and is contained in the other one.
 *
 * The length guard is applied to whichever normalised text is the contained
 * one. Without that, a later claim whose normalised form is empty or very
 * short (e.g. punctuation only) would be "contained" in every base.
 *
 * @param claims - Claims to inspect; the array is not mutated.
 * @returns One group per base claim that has at least one duplicate.
 */
function findDuplicateClaims(claims: Claim[]): DuplicateClaimGroup[] {
  const groups: DuplicateClaimGroup[] = [];
  const assigned = new Set<string>();

  // Normalise each claim once up front instead of once per compared pair.
  const candidates = [...claims]
    .sort((a, b) => a.claim.length - b.claim.length)
    .map((claim) => ({ claim, normalized: normalizeClaimText(claim.claim) }));

  for (let i = 0; i < candidates.length; i++) {
    const base = candidates[i];
    if (assigned.has(base.claim.id)) continue;

    const duplicates: Claim[] = [base.claim];
    for (let j = i + 1; j < candidates.length; j++) {
      const other = candidates[j];
      if (assigned.has(other.claim.id)) continue;

      const [shorter, longer] =
        base.normalized.length <= other.normalized.length
          ? [base.normalized, other.normalized]
          : [other.normalized, base.normalized];
      const isSubstring = shorter.length > 10 && longer.includes(shorter);

      if (isSubstring || jaccardSimilarity(base.claim.claim, other.claim.claim) >= 0.7) {
        duplicates.push(other.claim);
      }
    }

    if (duplicates.length > 1) {
      const claimIds = duplicates.map((c) => c.id);
      for (const id of claimIds) assigned.add(id);
      groups.push({
        representativeClaimId: base.claim.id,
        claimIds,
        similarityReason: "jaccard token similarity >= 0.7 or substring containment",
      });
    }
  }

  return groups;
}
|
|
86
|
+
|
|
87
|
+
/** Words that mark a claim as positive. Matched against whole tokens only. */
const POSITIVE_POLARITY: ReadonlySet<string> = new Set([
  "increase",
  "increases",
  "increased",
  "rising",
  "rises",
  "rose",
  "grows",
  "grew",
  "growth",
  "up",
  "higher",
  "more",
  "positive",
  "good",
  "bullish",
  "strong",
  "buy",
  "outperform",
  "above",
  "exceeds",
]);

/** Words that mark a claim as negative. Matched against whole tokens only. */
const NEGATIVE_POLARITY: ReadonlySet<string> = new Set([
  "decrease",
  "decreases",
  "decreased",
  "falling",
  "falls",
  "fell",
  "shrinks",
  "shrank",
  "shrunk",
  "down",
  "lower",
  "less",
  "negative",
  "bad",
  "bearish",
  "weak",
  "sell",
  "underperform",
  "below",
  "misses",
]);

/**
 * Pulls candidate entities out of a claim: double-quoted phrases of 3–80
 * characters, runs of one to four capitalised words, and numbers followed by
 * a unit (%, percent, bp, million, USD, ...).
 *
 * NOTE(review): a capitalised sentence-initial word such as "The" also
 * matches the capitalised-phrase pattern, so unrelated claims can share it.
 *
 * @param claim - Raw claim text.
 * @returns Distinct lower-cased, trimmed entities longer than two characters,
 *   in the order quoted, capitalised, numeric.
 */
function extractEntities(claim: string): string[] {
  // Quoted phrases, with the surrounding quotes removed.
  const quoted = (claim.match(/"([^"]{3,80})"/g) ?? []).map((m) => m.slice(1, -1));

  // Capitalized phrases (1-4 words) - likely proper nouns or product names.
  const capitalized = claim.match(/\b[A-Z][a-zA-Z0-9]*(?:\s+[A-Z][a-zA-Z0-9]*){0,3}\b/g) ?? [];

  // Numbers with units.
  const numbers =
    claim.match(
      /\d+(?:\.\d+)?\s*(?:%|percent|bp|basis points|million|billion|trillion|KRW|USD|EUR|GBP)/gi,
    ) ?? [];

  const unique = new Set(
    [...quoted, ...capitalized, ...numbers].map((e) => e.toLowerCase().trim()),
  );
  return [...unique].filter((e) => e.length > 2);
}

/**
 * Classifies the direction of a claim from its vocabulary.
 *
 * The claim is lower-cased and split into runs of letters and digits, and
 * each token is looked up in the polarity word sets. Whole-token matching
 * keeps short entries from firing inside unrelated words ("up" in "supply",
 * "less" in "unless", "sell" in "sellside"). Inflected forms that are not in
 * the lists (e.g. "outperforms") are not matched.
 *
 * @param claim - Raw claim text.
 * @returns "positive" or "negative" when only words of that polarity occur,
 *   "neutral" when neither or both occur.
 */
function detectPolarity(claim: string): "positive" | "negative" | "neutral" {
  const tokens = claim
    .toLowerCase()
    .split(/[^\p{L}\p{N}]+/u)
    .filter(Boolean);
  const hasPositive = tokens.some((token) => POSITIVE_POLARITY.has(token));
  const hasNegative = tokens.some((token) => NEGATIVE_POLARITY.has(token));

  if (hasPositive && !hasNegative) return "positive";
  if (hasNegative && !hasPositive) return "negative";
  return "neutral";
}
|
|
163
|
+
|
|
164
|
+
/**
 * Finds pairs of claims that mention a shared entity with opposite polarity.
 *
 * Polarity and entities are derived once per claim rather than once per
 * compared pair. A pair is reported when both claims are non-neutral, their
 * polarities differ, and at least one entity of the earlier claim also occurs
 * in the later one. The first such entity, in the earlier claim's order, is
 * reported.
 *
 * @param claims - Claims to compare pairwise.
 * @returns One entry per conflicting pair, at most once per pair of claim ids.
 */
function findConflictingClaims(claims: Claim[]): ConflictingClaimPair[] {
  const analysed = claims.map((claim) => {
    const polarity = detectPolarity(claim.claim);
    // Neutral claims never take part in a conflict, so skip entity extraction.
    const entities = polarity === "neutral" ? [] : extractEntities(claim.claim);
    return { id: claim.id, polarity, entities, entitySet: new Set(entities) };
  });

  const pairs: ConflictingClaimPair[] = [];
  const seen = new Set<string>();

  for (let i = 0; i < analysed.length; i++) {
    const first = analysed[i];
    if (first.polarity === "neutral" || first.entities.length === 0) continue;

    for (let j = i + 1; j < analysed.length; j++) {
      const second = analysed[j];
      if (second.polarity === "neutral" || second.polarity === first.polarity) continue;

      const sharedEntity = first.entities.find((entity) => second.entitySet.has(entity));
      if (sharedEntity === undefined) continue;

      // Guards against reporting a pair twice when claim ids repeat in the input.
      const pairKey = [first.id, second.id].sort().join("::");
      if (seen.has(pairKey)) continue;
      seen.add(pairKey);

      pairs.push({
        claimIdA: first.id,
        claimIdB: second.id,
        entity: sharedEntity,
        reason: `opposing polarity (${first.polarity} vs ${second.polarity}) on shared entity "${sharedEntity}"`,
      });
    }
  }

  return pairs;
}
|
|
200
|
+
|
|
201
|
+
/**
 * Counts the claims whose `freshness` equals the given label.
 *
 * @param claims - Claims to scan.
 * @param freshness - Label to count.
 * @returns Number of matching claims.
 */
function countByFreshness(claims: Claim[], freshness: ClaimFreshness): number {
  let matches = 0;
  for (const candidate of claims) {
    if (candidate.freshness === freshness) matches += 1;
  }
  return matches;
}
|
|
204
|
+
|
|
205
|
+
/**
 * Counts the claims whose `confidence` equals the given level.
 *
 * @param claims - Claims to scan.
 * @param confidence - Level to count.
 * @returns Number of matching claims.
 */
function countByConfidence(claims: Claim[], confidence: ClaimConfidence): number {
  let matches = 0;
  for (const candidate of claims) {
    if (candidate.confidence === confidence) matches += 1;
  }
  return matches;
}
|
|
208
|
+
|
|
209
|
+
/**
 * Describes coverage problems in the accepted sources.
 *
 * A gap is reported for each of the "primary-analysis" and "secondary" source
 * classes that occurs among all sources but not among the accepted ones, and
 * when fewer than 25% of all sources were accepted.
 *
 * @param sources - Every source of the run.
 * @param acceptedSources - The accepted subset of `sources`.
 * @returns Human-readable gap descriptions; empty when nothing is missing.
 */
function findCoverageGaps(
  sources: EnrichedSource[],
  acceptedSources: EnrichedSource[],
): string[] {
  const seenClasses = new Set(sources.map((s) => s.sourceClass));
  const acceptedClasses = new Set(acceptedSources.map((s) => s.sourceClass));

  const classChecks: Array<[string, string]> = [
    ["primary-analysis", "no accepted primary-analysis sources"],
    ["secondary", "no accepted secondary sources"],
  ];

  const gaps: string[] = [];
  for (const [sourceClass, message] of classChecks) {
    if (seenClasses.has(sourceClass) && !acceptedClasses.has(sourceClass)) {
      gaps.push(message);
    }
  }

  // Math.max keeps the division defined when there are no sources at all.
  const acceptedShare = acceptedSources.length / Math.max(sources.length, 1);
  if (acceptedShare < 0.25) {
    gaps.push("accepted source ratio below 25%");
  }

  return gaps;
}
|
|
231
|
+
|
|
232
|
+
/**
 * Lists claim-to-source references that point at unknown sources.
 *
 * @param claims - Claims whose `sourceIds` are checked; a missing `sourceIds`
 *   is treated as empty.
 * @param sourceIds - Ids of all known sources.
 * @returns One "claimId -> sourceId" entry per dangling reference, in claim
 *   order and then reference order.
 */
function findBrokenSourceReferences(claims: Claim[], sourceIds: Set<string>): string[] {
  return claims.flatMap((claim) =>
    (claim.sourceIds ?? [])
      .filter((referenced) => !sourceIds.has(referenced))
      .map((referenced) => `${claim.id} -> ${referenced}`),
  );
}
|
|
243
|
+
|
|
244
|
+
/**
 * Options for `verifyRun`. All fields are optional in the type, but
 * `verifyRun` throws when `runDir` is not supplied.
 */
export interface VerifyRunOptions {
  /** Directory holding the ledgers to verify and receiving the report. */
  runDir?: string;
  /** Minimum number of accepted sources; defaults to 1. */
  minAcceptedSources?: number;
  /** Highest tolerated share of low-confidence claims; defaults to 0.5. */
  maxLowConfidenceRatio?: number;
  /** Highest tolerated share of stale claims; defaults to 0.5. */
  maxStaleRatio?: number;
  /** Highest tolerated number of duplicate claim groups; defaults to `Number.MAX_SAFE_INTEGER`. */
  maxDuplicateClaimGroups?: number;
}
|
|
251
|
+
|
|
252
|
+
/**
 * Verifies the ledgers of a run and writes `verification-report.json` into
 * the run directory.
 *
 * Reads `source-ledger.jsonl` and `claim-ledger.jsonl` from `runDir`, then
 * checks, in this order: ledger presence, accepted-source count, claims
 * without sources, share of duplicate/low-value sources, stale and
 * low-confidence claim ratios, duplicate claim groups, conflicting claim
 * pairs, coverage gaps and dangling source references. Problems are recorded
 * as failures or warnings; the report status is "passed" only when there are
 * no failures.
 *
 * @param options - See `VerifyRunOptions`; `runDir` is mandatory in practice.
 * @returns The report that was written to disk.
 * @throws Error when `runDir` is missing; file-system and JSON parse errors
 *   from reading the ledgers or writing the report propagate.
 */
export async function verifyRun({
  runDir,
  minAcceptedSources = 1,
  maxLowConfidenceRatio = 0.5,
  maxStaleRatio = 0.5,
  maxDuplicateClaimGroups = Number.MAX_SAFE_INTEGER,
}: VerifyRunOptions = {}): Promise<VerificationReport> {
  if (!runDir) {
    throw new Error("verifyRun requires runDir");
  }

  const failures: string[] = [];
  const warnings: string[] = [];

  // Ledgers are read one after the other; a missing file yields null.
  const sourceLedger = await readJsonl<EnrichedSource>(join(runDir, "source-ledger.jsonl"));
  const claimLedger = await readJsonl<Claim>(join(runDir, "claim-ledger.jsonl"));
  if (!sourceLedger) failures.push("missing source ledger");
  if (!claimLedger) failures.push("missing claim ledger");

  const allSources = sourceLedger ?? [];
  const allClaims = claimLedger ?? [];

  // --- Sources -----------------------------------------------------------
  const acceptedSources = allSources.filter((entry) => entry.decision === "accepted");
  const rejectedSources = allSources.filter((entry) => entry.decision === "rejected");

  if (sourceLedger && acceptedSources.length < minAcceptedSources) {
    failures.push(
      `accepted source count ${acceptedSources.length} below minimum ${minAcceptedSources}`,
    );
  }

  // --- Claims without any supporting source -------------------------------
  const unsupportedClaims = allClaims
    .filter((entry) => !Array.isArray(entry.sourceIds) || entry.sourceIds.length === 0)
    .map((entry) => entry.id ?? entry.claim ?? "unknown claim");
  if (unsupportedClaims.length > 0) {
    failures.push(`unsupported claims: ${unsupportedClaims.join(", ")}`);
  }

  // --- Duplicate / low-value sources --------------------------------------
  const duplicateSourceCount = allSources.filter(
    (entry) => entry.reason === "duplicate or low-value source",
  ).length;
  if (sourceLedger && duplicateSourceCount / Math.max(sourceLedger.length, 1) > 0.5) {
    warnings.push("duplicate or low-value source ratio is high");
  }

  // --- Freshness and confidence ratios ------------------------------------
  const staleClaims = countByFreshness(allClaims, "stale");
  const unknownFreshnessClaims = countByFreshness(allClaims, "unknown");
  const lowConfidenceClaims = countByConfidence(allClaims, "low");

  const totalClaims = allClaims.length;
  if (totalClaims > 0) {
    const staleRatio = staleClaims / totalClaims;
    if (staleRatio > maxStaleRatio) {
      failures.push(`stale claim ratio ${staleRatio.toFixed(2)} exceeds ${maxStaleRatio}`);
    } else if (staleRatio > maxStaleRatio / 2) {
      // More than half of the allowed ratio already earns a warning.
      warnings.push(`stale claim ratio is ${staleRatio.toFixed(2)}`);
    }

    const lowConfidenceRatio = lowConfidenceClaims / totalClaims;
    if (lowConfidenceRatio > maxLowConfidenceRatio) {
      failures.push(
        `low-confidence claim ratio ${lowConfidenceRatio.toFixed(2)} exceeds ${maxLowConfidenceRatio}`,
      );
    } else if (lowConfidenceRatio > maxLowConfidenceRatio / 2) {
      warnings.push(`low-confidence claim ratio is ${lowConfidenceRatio.toFixed(2)}`);
    }
  }

  // --- Duplicate and conflicting claims -----------------------------------
  const duplicateClaimGroups = findDuplicateClaims(allClaims);
  if (duplicateClaimGroups.length > maxDuplicateClaimGroups) {
    failures.push(
      `duplicate claim groups ${duplicateClaimGroups.length} exceeds ${maxDuplicateClaimGroups}`,
    );
  } else if (duplicateClaimGroups.length > 0) {
    warnings.push(`${duplicateClaimGroups.length} duplicate claim groups detected`);
  }

  const conflictingClaimPairs = findConflictingClaims(allClaims);
  if (conflictingClaimPairs.length > 0) {
    warnings.push(
      `${conflictingClaimPairs.length} conflicting claim pairs detected; review needed`,
    );
  }

  // --- Coverage and reference integrity -----------------------------------
  const coverageGaps = findCoverageGaps(allSources, acceptedSources);
  if (coverageGaps.length > 0) {
    warnings.push(`coverage gaps: ${coverageGaps.join("; ")}`);
  }

  const knownSourceIds = new Set(allSources.map((entry) => entry.id));
  const brokenReferences = findBrokenSourceReferences(allClaims, knownSourceIds);
  if (brokenReferences.length > 0) {
    failures.push(`broken source references: ${brokenReferences.join(", ")}`);
  }

  const report: VerificationReport = {
    status: failures.length === 0 ? "passed" : "failed",
    acceptedSources: acceptedSources.length,
    rejectedSources: rejectedSources.length,
    unsupportedClaims: unsupportedClaims.length,
    staleClaims,
    unknownFreshnessClaims,
    lowConfidenceClaims,
    duplicateClaimGroups,
    conflictingClaimPairs,
    coverageGaps,
    failures,
    warnings,
  };

  await writeFile(join(runDir, "verification-report.json"), `${JSON.stringify(report, null, 2)}\n`);
  return report;
}
|