@duckmind/deepquark-darwin-arm64 0.9.83 → 0.9.90
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.deepquark/skills/bundled/knowledge-graph/SKILL.md +385 -0
- package/.deepquark/skills/bundled/knowledge-graph/STANDARDS.md +461 -0
- package/.deepquark/skills/bundled/knowledge-graph/lib/cli.ts +588 -0
- package/.deepquark/skills/bundled/knowledge-graph/lib/config.ts +630 -0
- package/.deepquark/skills/bundled/knowledge-graph/lib/connection-profile.ts +629 -0
- package/.deepquark/skills/bundled/knowledge-graph/lib/container.ts +756 -0
- package/.deepquark/skills/bundled/knowledge-graph/lib/mcp-client.ts +1310 -0
- package/.deepquark/skills/bundled/knowledge-graph/lib/output-formatter.ts +997 -0
- package/.deepquark/skills/bundled/knowledge-graph/lib/token-metrics.ts +335 -0
- package/.deepquark/skills/bundled/knowledge-graph/lib/transformation-log.ts +137 -0
- package/.deepquark/skills/bundled/knowledge-graph/lib/wrapper-config.ts +113 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/.env.example +129 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/compare-embeddings.ts +175 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/config-falkordb.yaml +108 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/config-neo4j.yaml +111 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/diagnose.ts +483 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/docker-compose-falkordb-dev.yml +146 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/docker-compose-falkordb.yml +151 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/docker-compose-neo4j-dev-local.yml +161 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/docker-compose-neo4j-dev.yml +161 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/docker-compose-neo4j.yml +169 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/docker-compose-production.yml +128 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/docker-compose-test.yml +10 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/docker-compose.yml +84 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/entrypoint.sh +40 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/install.ts +2054 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/podman-compose-falkordb.yml +78 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/podman-compose-neo4j.yml +88 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/podman-compose.yml +83 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-all-llms-mcp.ts +387 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-embedding-models.ts +201 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-embedding-providers.ts +641 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-graphiti-model.ts +217 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-grok-correct.ts +141 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-grok-llms-mcp.ts +386 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-grok-models.ts +173 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-llama-extraction.ts +188 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-mcp-final.ts +240 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-mcp-live.ts +187 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-mcp-session.ts +127 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-model-combinations.ts +316 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-ollama-models.ts +228 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-openrouter-models.ts +460 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-real-life-mcp.ts +311 -0
- package/.deepquark/skills/bundled/knowledge-graph/server/test-search-debug.ts +199 -0
- package/.deepquark/skills/bundled/knowledge-graph/tools/Install.md +104 -0
- package/.deepquark/skills/bundled/knowledge-graph/tools/README.md +120 -0
- package/.deepquark/skills/bundled/knowledge-graph/tools/knowledge-cli.ts +996 -0
- package/.deepquark/skills/bundled/knowledge-graph/tools/server-cli.ts +531 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/BulkImport.md +514 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/CaptureEpisode.md +242 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/ClearGraph.md +392 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/GetRecent.md +352 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/GetStatus.md +373 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/HealthReport.md +212 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/InvestigateEntity.md +142 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/OntologyManagement.md +201 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/RunMaintenance.md +302 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/SearchByDate.md +255 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/SearchFacts.md +382 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/SearchKnowledge.md +374 -0
- package/.deepquark/skills/bundled/knowledge-graph/workflows/StixImport.md +212 -0
- package/bin/deepquark +0 -0
- package/package.json +1 -1
- package/.deepquark/skills/bundled/ge-payroll/SKILL.md +0 -153
- package/.deepquark/skills/bundled/ge-payroll/evals/evals.json +0 -23
- package/.deepquark/skills/bundled/ge-payroll/references/pain-points-improvements.md +0 -106
- package/.deepquark/skills/bundled/ge-payroll/references/process-detail.md +0 -217
- package/.deepquark/skills/bundled/ge-payroll/references/raci-stakeholders.md +0 -85
- package/.deepquark/skills/bundled/ge-payroll/references/timeline-mandays.md +0 -64
|
@@ -0,0 +1,460 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
/**
|
|
3
|
+
* Test OpenRouter LLM models for cost vs performance
|
|
4
|
+
* Evaluates: entity extraction quality, response time, and cost
|
|
5
|
+
*
|
|
6
|
+
* Usage: OPENROUTER_API_KEY=sk-or-... bun run test-openrouter-models.ts
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
// Root of the OpenRouter REST API; request paths are appended to it.
const OPENROUTER_BASE_URL = 'https://openrouter.ai/api/v1';

// The credential is taken from the environment only.
const OPENROUTER_API_KEY = process.env.OPENROUTER_API_KEY;

// Without a key there is nothing to run: explain how to get one and exit with status 1.
if (!OPENROUTER_API_KEY) {
  const help = [
    '❌ OPENROUTER_API_KEY environment variable required',
    ' Get your key at: https://openrouter.ai/keys',
    ' Usage: OPENROUTER_API_KEY=sk-or-... bun run test-openrouter-models.ts',
  ];
  for (const line of help) {
    console.error(line);
  }
  process.exit(1);
}
|
|
18
|
+
|
|
19
|
+
// Models under test. Each row is [OpenRouter model id, display name, input price,
// output price]; prices are per 1M tokens as of Jan 2026, taken from
// https://openrouter.ai/models. Rows are expanded into
// { id, name, inputPrice, outputPrice } objects below.
const LLM_MODELS = (
  [
    // Top tier - highest quality
    ['anthropic/claude-sonnet-4', 'Claude Sonnet 4', 3.0, 15.0],
    ['openai/gpt-4o', 'GPT-4o', 2.5, 10.0],
    ['google/gemini-2.0-flash-001', 'Gemini 2.0 Flash', 0.1, 0.4],

    // Mid tier - good balance
    ['openai/gpt-4o-mini', 'GPT-4o Mini', 0.15, 0.6],
    ['anthropic/claude-3.5-haiku', 'Claude 3.5 Haiku', 0.8, 4.0],
    ['google/gemini-flash-1.5', 'Gemini 1.5 Flash', 0.075, 0.3],

    // Budget tier - cost effective
    ['meta-llama/llama-3.3-70b-instruct', 'Llama 3.3 70B', 0.4, 0.4],
    ['meta-llama/llama-3.1-8b-instruct', 'Llama 3.1 8B', 0.055, 0.055],
    ['mistralai/mistral-7b-instruct', 'Mistral 7B', 0.055, 0.055],
    ['qwen/qwen-2.5-72b-instruct', 'Qwen 2.5 72B', 0.35, 0.4],

    // Deep reasoning (may be slower)
    ['deepseek/deepseek-r1', 'DeepSeek R1', 0.55, 2.19],
    ['deepseek/deepseek-chat', 'DeepSeek V3', 0.14, 0.28],
  ] as [string, string, number, number][]
).map(([id, name, inputPrice, outputPrice]) => ({ id, name, inputPrice, outputPrice }));
|
|
62
|
+
|
|
63
|
+
// Extraction fixtures (similar to Graphiti requirements). Each case pairs an input
// text with the entity names and relationship types a good extraction should surface.
const TEST_CASES = [
  // Plain people / organisation / place sentence.
  {
    name: 'Basic Entity Extraction',
    text: 'John Smith works at Acme Corp in New York. He met Sarah Jones yesterday to discuss the Q4 budget.',
    expectedEntities: [
      'John Smith',
      'Acme Corp',
      'New York',
      'Sarah Jones',
    ],
    expectedRelationships: [
      'works_at',
      'located_in',
      'met',
    ],
  },

  // Product and technology names mixed with a person.
  {
    name: 'Technical Content',
    text: 'The PAI system uses Neo4j for graph storage and OpenAI for embeddings. It was created by Daniel Miessler to help people build personalized AI infrastructure.',
    expectedEntities: [
      'PAI',
      'Neo4j',
      'OpenAI',
      'Daniel Miessler',
    ],
    expectedRelationships: [
      'uses',
      'created_by',
    ],
  },

  // Several roles and an acquisition in one passage.
  {
    name: 'Complex Relationships',
    text: "Alice, the CTO of TechStart, acquired DataFlow Inc last month. Bob, who was DataFlow's CEO, now reports to Alice. The deal was worth $50M.",
    expectedEntities: [
      'Alice',
      'TechStart',
      'DataFlow Inc',
      'Bob',
    ],
    expectedRelationships: [
      'cto_of',
      'acquired',
      'reports_to',
    ],
  },
];
|
|
84
|
+
|
|
85
|
+
/**
 * Builds the user prompt sent to each model: an instruction line, the quoted
 * input text, and the exact JSON shape the reply must follow.
 *
 * @param text - Passage to extract entities and relationships from.
 * @returns The full prompt, with sections separated by blank lines.
 */
const EXTRACTION_PROMPT = (text: string) =>
  [
    'Extract entities and relationships from this text. Return ONLY valid JSON, no other text.',
    '',
    `Text: "${text}"`,
    '',
    'Return this exact format:',
    '{"entities": [{"name": "string", "type": "PERSON|ORGANIZATION|LOCATION|CONCEPT"}], "relationships": [{"source": "string", "target": "string", "type": "string"}]}',
  ].join('\n');
|
|
93
|
+
|
|
94
|
+
/** Outcome of a single test case run against a single model. */
interface TestCaseResult {
  /** Name of the test case this result belongs to. */
  name: string;
  /** True when a usable entities/relationships payload was extracted from the reply. */
  passed: boolean;
  /** Number of entities in the extracted payload (0 on failure). */
  entities: number;
  /** Number of relationships in the extracted payload (0 on failure). */
  relationships: number;
  /** Elapsed milliseconds for the request. */
  responseMs: number;
  /** Prompt tokens reported by the API (0 when not available). */
  inputTokens: number;
  /** Completion tokens reported by the API (0 when not available). */
  outputTokens: number;
  /** Failure description; absent when the case passed. */
  error?: string;
}

/** Aggregated figures for one model across all test cases. */
interface ModelResult {
  /** OpenRouter model id. */
  model: string;
  /** Human-readable model name used in the printed table. */
  modelName: string;
  /** Input price per 1M tokens. */
  inputPrice: number;
  /** Output price per 1M tokens. */
  outputPrice: number;
  /** Mean response time in milliseconds. */
  avgResponseMs: number;
  /** Mean prompt tokens per call. */
  avgInputTokens: number;
  /** Mean completion tokens per call. */
  avgOutputTokens: number;
  /** Estimated cost of 1000 calls at the average token usage. */
  costPer1000Calls: number;
  /** Mean quality score (0-100) over the passing test cases. */
  qualityScore: number;
  /** Percentage of test cases that passed. */
  passRate: number;
  /** Per-test-case details. */
  results: TestCaseResult[];
  /** Set when the whole model run failed. */
  error?: string;
}
|
|
119
|
+
|
|
120
|
+
/**
 * Extracts the `{ entities, relationships }` payload from a raw model reply.
 *
 * Markdown code fences and `<think>…</think>` sections are stripped first. The
 * remaining text is parsed as JSON directly; if that fails, the span from the
 * first `{` to the last `}` is parsed instead so replies that wrap the JSON in
 * prose still work, regardless of the order of the two keys.
 *
 * @param text - Raw completion text returned by the model.
 * @returns The parsed payload when both `entities` and `relationships` are
 *   arrays, otherwise `null`.
 */
function extractJSON(text: string): { entities: any[]; relationships: any[] } | null {
  type Payload = { entities: any[]; relationships: any[] };

  // Both keys must be arrays: callers call `.length`, `.map` and `.every` on them.
  const asPayload = (value: unknown): Payload | null => {
    if (typeof value !== 'object' || value === null) return null;
    const candidate = value as { entities?: unknown; relationships?: unknown };
    return Array.isArray(candidate.entities) && Array.isArray(candidate.relationships)
      ? (value as Payload)
      : null;
  };

  // Parse failures are expected for chatty replies, so they map to null rather than throwing.
  const tryParse = (candidate: string): Payload | null => {
    try {
      return asPayload(JSON.parse(candidate));
    } catch {
      return null;
    }
  };

  const clean = text
    // Remove markdown code fences (with or without a "json" tag, any letter case).
    .replace(/```(?:json)?\n?/gi, '')
    // Remove thinking tags (DeepSeek uses these).
    .replace(/<think>[\s\S]*?<\/think>/g, '')
    .trim();

  const direct = tryParse(clean);
  if (direct) return direct;

  // Fall back to the outermost brace-delimited span inside the surrounding prose.
  const start = clean.indexOf('{');
  const end = clean.lastIndexOf('}');
  if (start === -1 || end <= start) return null;

  return tryParse(clean.slice(start, end + 1));
}
|
|
145
|
+
|
|
146
|
+
/**
 * Scores an extraction result on a 0-100 scale.
 *
 * - Entity coverage (40 points): share of expected entity names that match a
 *   returned name, case-insensitively, where either string may contain the other.
 * - Relationship presence (30 points): 15 for at least one relationship plus 15
 *   when the count is between 1 and 10.
 * - Valid structure (30 points): 15 when every entity has a non-empty string
 *   `name`, 15 when every relationship has truthy `source`, `target` and `type`.
 *
 * Malformed items (missing or non-string names, null entries) cost the structure
 * points instead of throwing.
 *
 * @param result - Parsed model output.
 * @param expected - Test case; only `expectedEntities` is read.
 * @returns The score, capped at 100.
 */
function calculateQuality(
  result: { entities: any[]; relationships: any[] },
  expected: { expectedEntities: readonly string[] }
): number {
  const maxScore = 100;
  let score = 0;

  // Entity coverage (40 points). Only usable names take part in matching: an
  // empty name would otherwise match every expected entity via includes('').
  const foundNames = result.entities
    .map((e) => (typeof e?.name === 'string' ? e.name.trim().toLowerCase() : ''))
    .filter((name) => name.length > 0);
  const wanted = expected.expectedEntities.map((e) => e.toLowerCase());
  if (wanted.length === 0) {
    // Nothing was expected, so coverage is vacuously complete (and 0/0 is avoided).
    score += 40;
  } else {
    const matched = wanted.filter((w) => foundNames.some((f) => f.includes(w) || w.includes(f)));
    score += (matched.length / wanted.length) * 40;
  }

  // Relationship presence (30 points)
  const relCount = result.relationships.length;
  if (relCount > 0) score += 15;
  if (relCount >= 1 && relCount <= 10) score += 15;

  // Valid structure (30 points). `every` is true for an empty list, so an empty
  // entities or relationships array still earns its 15 points here.
  const validEntities = result.entities.every(
    (e) => typeof e?.name === 'string' && e.name.length > 0
  );
  const validRels = result.relationships.every((r) => Boolean(r?.source && r?.target && r?.type));
  if (validEntities) score += 15;
  if (validRels) score += 15;

  return Math.min(score, maxScore);
}
|
|
175
|
+
|
|
176
|
+
/**
 * Runs every entry of TEST_CASES against one OpenRouter model and aggregates
 * latency, token usage, estimated cost, quality and pass rate.
 *
 * Failures of individual test cases (HTTP errors, unparsable replies, thrown
 * exceptions) are recorded in the returned `results` rather than rethrown.
 *
 * The quality score is the mean of the scores computed from the real extraction
 * of each passing case. Averages for response time and tokens run over every
 * attempted case, so failed requests (recorded with 0 tokens) lower the token
 * and cost figures.
 *
 * @param model - Entry of LLM_MODELS to test.
 * @returns Aggregated result for the model.
 */
async function testModel(model: (typeof LLM_MODELS)[0]): Promise<ModelResult> {
  console.log(`\n🔄 Testing: ${model.name} (${model.id})`);
  console.log(` ${'─'.repeat(50)}`);

  const results: TestCaseResult[] = [];
  // Quality of each passing case, captured while the actual payload is at hand.
  const qualities: number[] = [];

  const recordFailure = (
    name: string,
    responseMs: number,
    error: string,
    inputTokens = 0,
    outputTokens = 0
  ): void => {
    results.push({
      name,
      passed: false,
      entities: 0,
      relationships: 0,
      responseMs,
      inputTokens,
      outputTokens,
      error,
    });
  };

  for (const testCase of TEST_CASES) {
    const start = Date.now();

    try {
      const response = await fetch(`${OPENROUTER_BASE_URL}/chat/completions`, {
        method: 'POST',
        headers: {
          Authorization: `Bearer ${OPENROUTER_API_KEY}`,
          'Content-Type': 'application/json',
          'HTTP-Referer': 'https://pai.dev',
          'X-Title': 'PAI Knowledge System Test',
        },
        body: JSON.stringify({
          model: model.id,
          messages: [{ role: 'user', content: EXTRACTION_PROMPT(testCase.text) }],
          temperature: 0.1,
          max_tokens: 1000,
        }),
      });

      // Measured when the response headers arrive, before the body is read.
      const responseMs = Date.now() - start;

      if (!response.ok) {
        const body = await response.text();
        console.log(` ❌ ${testCase.name}: HTTP ${response.status}`);
        recordFailure(testCase.name, responseMs, `HTTP ${response.status}: ${body.slice(0, 100)}`);
      } else {
        const data = (await response.json()) as {
          choices?: { message?: { content?: unknown } }[];
          usage?: { prompt_tokens?: number; completion_tokens?: number };
        };
        const rawContent = data.choices?.[0]?.message?.content;
        const content = typeof rawContent === 'string' ? rawContent : '';
        const inputTokens = data.usage?.prompt_tokens ?? 0;
        const outputTokens = data.usage?.completion_tokens ?? 0;

        const json = extractJSON(content);

        if (json) {
          const quality = calculateQuality(json, testCase);
          console.log(
            ` ✅ ${testCase.name}: ${json.entities.length} entities, ${json.relationships.length} rels (${responseMs}ms, quality: ${quality.toFixed(0)}%)`
          );
          results.push({
            name: testCase.name,
            passed: true,
            entities: json.entities.length,
            relationships: json.relationships.length,
            responseMs,
            inputTokens,
            outputTokens,
          });
          qualities.push(quality);
        } else {
          console.log(` ❌ ${testCase.name}: Invalid JSON (${responseMs}ms)`);
          recordFailure(
            testCase.name,
            responseMs,
            'Invalid JSON structure',
            inputTokens,
            outputTokens
          );
        }
      }
    } catch (err: unknown) {
      const message = err instanceof Error ? err.message : String(err);
      console.log(` ❌ ${testCase.name}: ${message}`);
      recordFailure(testCase.name, Date.now() - start, message);
    }

    // Rate limiting - wait between calls. This runs after every attempt,
    // including HTTP failures.
    await new Promise((r) => setTimeout(r, 500));
  }

  // Calculate aggregates
  const mean = (pick: (r: TestCaseResult) => number): number =>
    results.length > 0
      ? Math.round(results.reduce((sum, r) => sum + pick(r), 0) / results.length)
      : 0;

  const passedCount = results.filter((r) => r.passed).length;
  const avgResponseMs = mean((r) => r.responseMs);
  const avgInputTokens = mean((r) => r.inputTokens);
  const avgOutputTokens = mean((r) => r.outputTokens);

  // Cost per 1000 API calls (prices are per 1M tokens).
  const costPer1000 =
    (avgInputTokens / 1_000_000) * model.inputPrice * 1000 +
    (avgOutputTokens / 1_000_000) * model.outputPrice * 1000;

  // Quality score (average across passed tests)
  const qualityScore =
    qualities.length > 0 ? qualities.reduce((sum, q) => sum + q, 0) / qualities.length : 0;

  return {
    model: model.id,
    modelName: model.name,
    inputPrice: model.inputPrice,
    outputPrice: model.outputPrice,
    avgResponseMs,
    avgInputTokens,
    avgOutputTokens,
    costPer1000Calls: costPer1000,
    qualityScore,
    // Guard the division so an empty run reports 0% rather than NaN.
    passRate: results.length > 0 ? (passedCount / results.length) * 100 : 0,
    results,
  };
}
|
|
329
|
+
|
|
330
|
+
/**
 * Prints the cost/performance table, the list of models that passed nothing,
 * and the recommendations derived from the ranking.
 *
 * Models with a pass rate above 0 are ranked by value score (quality divided by
 * cost per 1000 calls). A cost of 0 means no tokens were reported or the prices
 * are 0, so no ratio can be computed: such rows show "n/a" and rank last.
 *
 * @param results - One aggregated result per tested model; not mutated.
 */
function printComparisonTable(results: ModelResult[]) {
  console.log(`\n${'═'.repeat(120)}`);
  console.log('📊 COST VS PERFORMANCE COMPARISON TABLE');
  console.log(`${'═'.repeat(120)}\n`);

  const valueOf = (r: ModelResult): number =>
    r.costPer1000Calls > 0 ? r.qualityScore / r.costPer1000Calls : 0;

  // Sort by value score (quality / cost); filter() already returns a fresh array.
  const ranked = results.filter((r) => r.passRate > 0).sort((a, b) => valueOf(b) - valueOf(a));

  // Header, separator and rows are all derived from this one column list so they stay aligned.
  const columns = [
    { title: 'Rank', width: 4, alignRight: true },
    { title: 'Model', width: 24, alignRight: false },
    { title: 'Pass Rate', width: 9, alignRight: true },
    { title: 'Quality', width: 7, alignRight: true },
    { title: 'Avg Time', width: 8, alignRight: true },
    { title: 'Input$/M', width: 8, alignRight: true },
    { title: 'Output$/M', width: 9, alignRight: true },
    { title: 'Cost/1K Calls', width: 13, alignRight: true },
    { title: 'Value Score', width: 11, alignRight: true },
  ];
  const renderRow = (cells: string[]): string => {
    const padded = columns.map((col, i) => {
      const cell = cells[i] ?? '';
      return col.alignRight ? cell.padStart(col.width) : cell.padEnd(col.width);
    });
    return `| ${padded.join(' | ')} |`;
  };

  console.log(renderRow(columns.map((col) => col.title)));
  console.log(`|${columns.map((col) => '-'.repeat(col.width + 2)).join('|')}|`);

  ranked.forEach((r, i) => {
    console.log(
      renderRow([
        String(i + 1),
        r.modelName,
        `${r.passRate.toFixed(0)}%`,
        `${r.qualityScore.toFixed(0)}%`,
        `${r.avgResponseMs}ms`,
        `$${r.inputPrice.toFixed(2)}`,
        `$${r.outputPrice.toFixed(2)}`,
        `$${r.costPer1000Calls.toFixed(4)}`,
        r.costPer1000Calls > 0 ? valueOf(r).toFixed(1) : 'n/a',
      ])
    );
  });

  // Failed models
  const failed = results.filter((r) => r.passRate === 0);
  if (failed.length > 0) {
    console.log('\n❌ Failed Models:');
    for (const r of failed) {
      // Prefer the model-level error (set when the whole run threw), then the
      // first per-test error. `||` is deliberate so empty strings fall through too.
      const reason = r.error || r.results.find((t) => t.error)?.error || 'Unknown error';
      console.log(` - ${r.modelName}: ${reason}`);
    }
  }

  // Recommendations
  console.log(`\n${'─'.repeat(120)}`);
  console.log('📌 RECOMMENDATIONS');
  console.log('─'.repeat(120));

  const bestValue = ranked[0]; // Already sorted by value
  if (bestValue === undefined) {
    console.log('\nNo model passed any test case, so there is nothing to recommend.');
    return;
  }

  const bestQuality = ranked.reduce(
    (a, b) => (a.qualityScore > b.qualityScore ? a : b),
    bestValue
  );
  // Only models with a known (non-zero) cost compete for "cheapest" when any exist.
  const priced = ranked.filter((r) => r.costPer1000Calls > 0);
  const cheapest = (priced.length > 0 ? priced : ranked).reduce((a, b) =>
    a.costPer1000Calls < b.costPer1000Calls ? a : b
  );
  const fastest = ranked.reduce((a, b) => (a.avgResponseMs < b.avgResponseMs ? a : b), bestValue);

  console.log(
    `\n🏆 Best Quality: ${bestQuality.modelName} (${bestQuality.qualityScore.toFixed(0)}% quality)`
  );
  console.log(
    `💰 Best Value: ${bestValue.modelName} (${bestValue.qualityScore.toFixed(0)}% quality at $${bestValue.costPer1000Calls.toFixed(4)}/1K calls)`
  );
  console.log(
    `🪙 Cheapest: ${cheapest.modelName} ($${cheapest.costPer1000Calls.toFixed(4)}/1K calls)`
  );
  console.log(`⚡ Fastest: ${fastest.modelName} (${fastest.avgResponseMs}ms avg)`);

  // Hybrid recommendation
  console.log('\n🎯 RECOMMENDED FOR KNOWLEDGE SYSTEM:');
  console.log(` Primary (best balance): ${bestValue.modelName}`);
  console.log(` Budget option: ${cheapest.modelName}`);
  console.log(` Premium option: ${bestQuality.modelName}`);
}
|
|
405
|
+
|
|
406
|
+
/**
 * Entry point: tests every model in LLM_MODELS sequentially, prints the
 * comparison table, and writes the raw results to
 * `openrouter-test-results.json` next to this script.
 *
 * A model whose run throws is recorded with zeroed metrics and the error
 * message, so one failure does not stop the remaining models.
 */
async function main() {
  console.log('═'.repeat(60));
  console.log('🧪 OpenRouter LLM Model Comparison Test');
  console.log(` Testing ${LLM_MODELS.length} models for entity extraction`);
  console.log('═'.repeat(60));

  const results: ModelResult[] = [];

  for (const model of LLM_MODELS) {
    try {
      results.push(await testModel(model));
    } catch (err: unknown) {
      const message = err instanceof Error ? err.message : String(err);
      console.log(`\n❌ ${model.name}: ${message}`);
      results.push({
        model: model.id,
        modelName: model.name,
        inputPrice: model.inputPrice,
        outputPrice: model.outputPrice,
        avgResponseMs: 0,
        avgInputTokens: 0,
        avgOutputTokens: 0,
        costPer1000Calls: 0,
        qualityScore: 0,
        passRate: 0,
        results: [],
        error: message,
      });
    }

    // Rate limiting between models
    await new Promise((r) => setTimeout(r, 1000));
  }

  // Print comparison table
  printComparisonTable(results);

  // Save results to file. URL#pathname is percent-encoded (a space becomes
  // "%20"), so convert the file URL to a real filesystem path instead.
  const { fileURLToPath } = await import('node:url');
  const outputPath = fileURLToPath(new URL('./openrouter-test-results.json', import.meta.url));
  await Bun.write(
    outputPath,
    JSON.stringify(
      {
        results,
        timestamp: new Date().toISOString(),
        testCases: TEST_CASES.map((t) => t.name),
      },
      null,
      2
    )
  );
  console.log(`\n📁 Results saved to: ${outputPath}`);
}

// Report an unexpected failure and make the process exit with a non-zero status.
main().catch((err: unknown) => {
  console.error(err);
  process.exitCode = 1;
});
|