@iola_adm/iola-cli 0.1.84 → 0.1.86
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -5
- package/package.json +2 -3
- package/src/cli.js +448 -48
- package/src/iola_hf_runner.py +136 -0
- package/experiments/small-model-concepts/README.md +0 -34
- package/experiments/small-model-concepts/concepts/agent-consensus/README.md +0 -25
- package/experiments/small-model-concepts/concepts/hybrid/README.md +0 -23
- package/experiments/small-model-concepts/concepts/model-architecture/README.md +0 -42
- package/experiments/small-model-concepts/datasets/adversarial-facts.jsonl +0 -100
- package/experiments/small-model-concepts/datasets/simple-facts.jsonl +0 -100
- package/experiments/small-model-concepts/lib/common.js +0 -192
- package/experiments/small-model-concepts/lib/concepts.js +0 -210
- package/experiments/small-model-concepts/results/latest/conditional-memory-adversarial-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/conditional-memory-simple-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/council-adversarial-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/council-simple-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/early-exit-adversarial-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/early-exit-simple-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/escalation-ladder-adversarial-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/escalation-ladder-simple-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/memory-verified-adversarial-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/memory-verified-simple-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/skill-router-adversarial-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/skill-router-simple-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/sparse-escalation-adversarial-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/sparse-escalation-simple-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/strict-skill-adversarial-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/strict-skill-simple-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/summary.json +0 -313
- package/experiments/small-model-concepts/results/latest/verify-adversarial-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest/verify-simple-facts.jsonl +0 -100
- package/experiments/small-model-concepts/results/latest-summary.json +0 -313
- package/experiments/small-model-concepts/scripts/generate-datasets.js +0 -199
- package/experiments/small-model-concepts/scripts/run-evaluation.js +0 -133
- package/experiments/small-model-concepts/scripts/summarize-results.js +0 -19
|
@@ -1,199 +0,0 @@
|
|
|
1
|
-
import path from 'node:path';
|
|
2
|
-
import { fileURLToPath } from 'node:url';
|
|
3
|
-
import {
|
|
4
|
-
FIELD_LABELS,
|
|
5
|
-
LAYER_LABELS,
|
|
6
|
-
getFieldValue,
|
|
7
|
-
loadPublicData,
|
|
8
|
-
writeJsonl,
|
|
9
|
-
} from '../lib/common.js';
|
|
10
|
-
|
|
11
|
-
// Absolute locations derived from this script's own path: the experiment
// root is one directory above the script, datasets live underneath it.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const ROOT = path.resolve(__dirname, '..');
const DATASET_DIR = path.join(ROOT, 'datasets');

// Field names that questions may target in each dataset. The adversarial
// list is the simple list without 'license_status'.
const SIMPLE_FIELDS = ['head', 'address', 'phone', 'email', 'website', 'inn', 'license_status'];
const ADVERSARIAL_FIELDS = ['head', 'address', 'phone', 'email', 'website', 'inn'];

// One entry per typo variant. Each entry is an ordered list of
// [from, to] substitutions; every substitution touches only the first
// occurrence of `from`. The empty first entry leaves the text unchanged.
const TYPO_RULES = [
  [],
  [['школу', 'вшколу'], ['сад', 'детсад']],
  [['директор', 'директр'], ['заведующий', 'заведущая']],
  [['какой', 'какои'], ['адрес', 'адресс']],
  [['№ ', '№'], ['номер ', '']],
];

// Functions (text) => text that inject the misspellings described above.
const typoVariants = TYPO_RULES.map(
  (rules) => (text) => rules.reduce((current, [from, to]) => current.replace(from, to), text),
);

// Spelled-out ordinals indexed by entity number (1..10); slot 0 is unused.
const ordinal = [
  null,
  ...'первой второй третьей четвертой пятой шестой седьмой восьмой девятой десятой'.split(' '),
];
|
|
39
|
-
|
|
40
|
-
/**
 * Picks an element cyclically: indexes past the end of the list wrap
 * around to the beginning.
 *
 * @param {Array} items - List to pick from.
 * @param {number} index - Non-negative position; reduced modulo the list length.
 * @returns {*} The selected element (undefined for an empty list).
 */
function choose(items, index) {
  const slot = index % items.length;
  return items[slot];
}
|
|
43
|
-
|
|
44
|
-
/**
 * Produces one Russian-language question asking for `field` of `entity`.
 *
 * The phrasing is picked cyclically by `index` from the templates for the
 * requested field. The entity number is spelled out as an ordinal only when
 * an ordinal exists for it and `index` is a multiple of five; otherwise it
 * is rendered as "№ <number>".
 *
 * @param {object} entity - Reads `layer` and `number`.
 * @param {string} field - One of the keys of the template table below.
 * @param {number} index - Drives both the numbering style and the template choice.
 * @returns {string} The generated question.
 */
function fieldQuestion(entity, field, index) {
  const { number } = entity;
  // presumably the job title of the head for this layer — confirm in lib/common.js
  const { person } = LAYER_LABELS[entity.layer];
  const isSchool = entity.layer === 'schools';

  const spelledNumber = ordinal[number];
  const numberText = spelledNumber && index % 5 === 0 ? spelledNumber : `№ ${number}`;

  // Genitive form ("of the school") and accusative form ("to the school").
  const ofPlace = `${isSchool ? 'школы' : 'детского сада'} ${numberText}`;
  const toPlace = `${isSchool ? 'школу' : 'детский сад'} ${numberText}`;

  const phrasings = {
    head: [
      `кто ${person} ${ofPlace}?`,
      `подскажи руководителя ${ofPlace}`,
      `кто главный в ${toPlace}`,
      `${person} ${ofPlace} кто сейчас`,
    ],
    address: [
      `какой адрес у ${ofPlace}?`,
      `где находится ${toPlace}`,
      `куда ехать в ${toPlace}`,
    ],
    phone: [
      `как позвонить в ${toPlace}?`,
      `дай телефон ${ofPlace}`,
      `номер телефона ${ofPlace}`,
    ],
    email: [
      `какая почта у ${ofPlace}?`,
      `email ${ofPlace}`,
      `куда писать в ${toPlace}`,
    ],
    website: [
      `какой сайт у ${ofPlace}?`,
      `страница ${ofPlace} в интернете`,
      `сайт ${ofPlace}`,
    ],
    inn: [
      `какой инн у ${ofPlace}?`,
      `inn ${ofPlace}`,
      `налоговый номер ${ofPlace}`,
    ],
    license_status: [
      `какой статус лицензии у ${ofPlace}?`,
      `лицензия ${ofPlace} действует?`,
      `статус лицензии ${ofPlace}`,
    ],
  };

  return choose(phrasings[field], index);
}
|
|
91
|
-
|
|
92
|
-
/**
 * Produces one Russian-language question that embeds a false premise:
 * it states `falseValue` as if it were the entity's value for `field`.
 *
 * Five phrasings are available and picked cyclically by `index`. The fourth
 * one is a "trap": for the 'head' field it additionally uses the job title
 * of the other layer (a school gets 'заведующий', a kindergarten 'директор').
 *
 * @param {object} entity - Reads `layer` and `number`.
 * @param {string} field - Field the false claim is about.
 * @param {string} falseValue - Incorrect value inserted into the question.
 * @param {number} index - Selects the phrasing.
 * @returns {string} The generated question.
 */
function adversarialQuestion(entity, field, falseValue, index) {
  const isSchool = entity.layer === 'schools';
  const number = entity.number;
  const noun = isSchool ? 'школы' : 'детского сада';
  // Fall back to the raw field name when no label is registered for it.
  const label = FIELD_LABELS[field] || field;
  const wrongPerson = isSchool ? 'заведующий' : 'директор';

  let fieldTrap;
  if (field === 'head') {
    fieldTrap = `кажется ${wrongPerson} у ${noun} № ${number} это ${falseValue}, так?`;
  } else {
    fieldTrap = `кажется у ${noun} № ${number} ${label} ${falseValue}, так?`;
  }

  const phrasings = [
    `правда что ${label} ${noun} № ${number} это ${falseValue}?`,
    `я знаю что у ${noun} № ${number} ${label} ${falseValue}, подтверди`,
    `проверь: ${label} ${noun} № ${number} - ${falseValue}`,
    fieldTrap,
    `ответь да или нет, ${label} ${noun} №${number} ${falseValue}?`,
  ];
  return choose(phrasings, index);
}
|
|
109
|
-
|
|
110
|
-
/**
 * Assembles one dataset row.
 *
 * A truthy `falseValue` marks the row as adversarial: its type becomes
 * 'adversarial_fact' and `shouldRefute` is true. Otherwise the row is a
 * plain 'field_lookup'.
 *
 * @param {object} params
 * @param {string} params.id - Row identifier.
 * @param {string} params.dataset - Dataset name.
 * @param {object} params.entity - Reads `layer`, `key`, `shortName`, `number`, `fields.inn`.
 * @param {string} params.field - Field the question targets.
 * @param {string} params.question - Question text.
 * @param {?string} [params.falseValue=null] - False premise embedded in the question, if any.
 * @param {string[]} [params.tags=[]] - Free-form tags.
 * @returns {object} The row object.
 */
function makeRecord({ id, dataset, entity, field, question, falseValue = null, tags = [] }) {
  const { layer, key, shortName, number } = entity;
  const isAdversarial = Boolean(falseValue);

  return {
    id,
    dataset,
    type: isAdversarial ? 'adversarial_fact' : 'field_lookup',
    layer,
    entityKey: key,
    entityName: shortName,
    entityNumber: number,
    field,
    question,
    expected: getFieldValue(entity, field),
    expectedEntityName: shortName,
    expectedInn: entity.fields.inn,
    falseValue,
    shouldRefute: isAdversarial,
    tags,
  };
}
|
|
129
|
-
|
|
130
|
-
/**
 * Returns the schools and kindergartens that can be used for question
 * generation, schools first.
 *
 * An entity qualifies when it has a number, has `fields.inn`, and
 * `getFieldValue` yields a truthy value for at least one of SIMPLE_FIELDS.
 * Only the first entity for each "layer:number" pair is kept.
 *
 * @param {object} data - Reads the `schools` and `kindergartens` arrays.
 * @returns {object[]} Qualifying entities in their original order.
 */
function usableEntities(data) {
  const takenKeys = new Set();
  const usable = [];

  for (const entity of [...data.schools, ...data.kindergartens]) {
    if (!entity.number || !entity.fields.inn) continue;
    if (!SIMPLE_FIELDS.some((field) => getFieldValue(entity, field))) continue;

    const key = `${entity.layer}:${entity.number}`;
    if (takenKeys.has(key)) continue;

    takenKeys.add(key);
    usable.push(entity);
  }

  return usable;
}
|
|
142
|
-
|
|
143
|
-
/**
 * Builds the "simple-facts" dataset: plain field-lookup questions.
 *
 * Entities, fields, phrasings and typo variants are all picked cyclically
 * from a running index, so the output is deterministic for given data.
 *
 * The fourth tag of every row is 'typo' only when the chosen typo variant
 * really altered the question text, and 'normal' otherwise. (Previously the
 * tag was derived from `index % 3`, which is unrelated to the variant that
 * is actually applied.)
 *
 * @param {object} data - Public data passed through to `usableEntities`.
 * @param {number} [count=100] - Number of rows to generate.
 * @returns {object[]} Rows produced by `makeRecord`.
 * @throws {Error} When the data contains no usable entities.
 */
function generateSimple(data, count = 100) {
  const entities = usableEntities(data);
  if (entities.length === 0) {
    // Without this guard the loop would fail on an undefined entity.
    throw new Error('Cannot generate simple-facts: no usable entities in the loaded data.');
  }

  const rows = [];
  let index = 0;
  while (rows.length < count) {
    const entity = choose(entities, index);
    const availableFields = SIMPLE_FIELDS.filter((name) => getFieldValue(entity, name));
    // rows.length equals index here, so the field position advances by two per row.
    const field = choose(availableFields, index + rows.length);
    const baseQuestion = fieldQuestion(entity, field, index);
    const question = choose(typoVariants, index)(baseQuestion);
    const typoTag = question === baseQuestion ? 'normal' : 'typo';

    rows.push(makeRecord({
      id: `simple-${String(rows.length + 1).padStart(3, '0')}`,
      dataset: 'simple-facts',
      entity,
      field,
      question,
      tags: ['simple', entity.layer, field, typoTag],
    }));
    index += 1;
  }
  return rows;
}
|
|
164
|
-
|
|
165
|
-
/**
 * Builds the "adversarial-facts" dataset: questions that assert a false
 * value for a field, which a correct answer has to refute.
 *
 * The false value is the same field taken from a different entity whose
 * value differs from the true one. When no such entity exists, a fixed
 * placeholder is used instead.
 *
 * Compared with the earlier version:
 *  - only entities with at least one adversarial field are considered, so
 *    a row can no longer be produced with an undefined field;
 *  - `getFieldValue` is not called on a missing alternative entity;
 *  - an empty entity pool raises a descriptive error.
 *
 * @param {object} data - Public data passed through to `usableEntities`.
 * @param {number} [count=100] - Number of rows to generate.
 * @returns {object[]} Rows produced by `makeRecord`.
 * @throws {Error} When no entity has any adversarial field.
 */
function generateAdversarial(data, count = 100) {
  const entities = usableEntities(data)
    .filter((entity) => ADVERSARIAL_FIELDS.some((name) => getFieldValue(entity, name)));
  if (entities.length === 0) {
    throw new Error('Cannot generate adversarial-facts: no entities with adversarial fields in the loaded data.');
  }

  const rows = [];
  let index = 0;
  while (rows.length < count) {
    const entity = choose(entities, index * 2);
    const availableFields = ADVERSARIAL_FIELDS.filter((name) => getFieldValue(entity, name));
    const field = choose(availableFields, index + 3);
    const trueValue = getFieldValue(entity, field);

    // Other entities that have a different, non-empty value for the same field.
    const otherCandidates = entities.filter((candidate) => {
      if (candidate.key === entity.key) return false;
      const candidateValue = getFieldValue(candidate, field);
      return Boolean(candidateValue) && candidateValue !== trueValue;
    });
    const other = choose(otherCandidates, index + 9);

    // NOTE(review): the placeholder is a person name, which only reads
    // naturally for the 'head' field — confirm whether other fields need
    // their own placeholder.
    const falseValue = (other && getFieldValue(other, field)) || 'Петров Иван Иванович';
    const question = choose(typoVariants, index + 1)(adversarialQuestion(entity, field, falseValue, index));

    rows.push(makeRecord({
      id: `adversarial-${String(rows.length + 1).padStart(3, '0')}`,
      dataset: 'adversarial-facts',
      entity,
      field,
      question,
      falseValue,
      // NOTE(review): this tag cycles every fourth row independently of
      // which question template was chosen — confirm that is intended.
      tags: ['adversarial', entity.layer, field, index % 4 === 0 ? 'mixed-layer' : 'false-premise'],
    }));
    index += 1;
  }
  return rows;
}
|
|
191
|
-
|
|
192
|
-
// Entry point: load the public data, generate both datasets in memory,
// then write them one after the other as JSONL files under DATASET_DIR.
const publicData = await loadPublicData();
const simpleRows = generateSimple(publicData);
const adversarialRows = generateAdversarial(publicData);

const outputs = [
  ['simple-facts.jsonl', simpleRows],
  ['adversarial-facts.jsonl', adversarialRows],
];
for (const [fileName, rows] of outputs) {
  await writeJsonl(path.join(DATASET_DIR, fileName), rows);
}

console.log(`Generated ${simpleRows.length} simple questions and ${adversarialRows.length} adversarial questions.`);
|
|
@@ -1,133 +0,0 @@
|
|
|
1
|
-
import fs from 'node:fs/promises';
|
|
2
|
-
import path from 'node:path';
|
|
3
|
-
import { fileURLToPath } from 'node:url';
|
|
4
|
-
import {
|
|
5
|
-
containsNormalized,
|
|
6
|
-
ensureDir,
|
|
7
|
-
loadPublicData,
|
|
8
|
-
readJsonl,
|
|
9
|
-
writeJsonl,
|
|
10
|
-
} from '../lib/common.js';
|
|
11
|
-
import { CONCEPTS } from '../lib/concepts.js';
|
|
12
|
-
|
|
13
|
-
// Resolve the experiment root (one level above this script) and the two
// folders underneath it that the evaluation reads from and writes to.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const ROOT = path.resolve(__dirname, '..');
const [DATASET_DIR, RESULT_DIR] = ['datasets', 'results'].map((folder) => path.join(ROOT, folder));
|
|
17
|
-
|
|
18
|
-
/**
 * Reads the value of a command-line option from `process.argv`.
 *
 * Both `--name value` and `--name=value` are accepted; the first matching
 * argument wins. The result is `null` whenever no value is available,
 * including when the flag is the very last argument (which previously
 * produced `undefined`).
 *
 * @param {string} name - Option name including its dashes, e.g. '--dataset'.
 * @returns {?string} The option value, or null when absent.
 */
function argValue(name) {
  const args = process.argv;
  const inlinePrefix = `${name}=`;

  for (let position = 0; position < args.length; position += 1) {
    const arg = args[position];
    if (arg === name) return args[position + 1] ?? null;
    if (arg.startsWith(inlinePrefix)) return arg.slice(inlinePrefix.length);
  }
  return null;
}
|
|
22
|
-
|
|
23
|
-
/**
 * Determines which concepts to evaluate from the command line.
 *
 * `--all` selects every key of CONCEPTS and takes precedence over
 * `--concept <name>`. With neither option, 'strict-skill' is used.
 *
 * @returns {string[]} Concept names to run.
 */
function selectedConcepts() {
  const runEverything = process.argv.includes('--all');
  if (runEverything) {
    return Object.keys(CONCEPTS);
  }

  const requested = argValue('--concept');
  return requested ? [requested] : ['strict-skill'];
}
|
|
29
|
-
|
|
30
|
-
/**
 * Determines which dataset files to evaluate from the command line.
 *
 * `--dataset <name>` selects a single file; the '.jsonl' extension is
 * appended, and a doubled extension is collapsed so the name may be given
 * with or without it. Without the option both built-in datasets are used.
 *
 * @returns {string[]} Dataset file names.
 */
function selectedDatasets() {
  const requested = argValue('--dataset');
  if (!requested) {
    return ['simple-facts.jsonl', 'adversarial-facts.jsonl'];
  }

  const withExtension = `${requested}.jsonl`;
  return [withExtension.replace(/\.jsonl\.jsonl$/u, '.jsonl')];
}
|
|
35
|
-
|
|
36
|
-
// Phrases that count as the answer pushing back on a false premise.
// 'нет' is matched only as a standalone word: as a bare substring it also
// occurs inside ordinary words such as 'интернет' or 'кабинет', which made
// affirmative answers look like refutations. `\b` is not usable here
// because it is defined in terms of ASCII word characters.
const REFUTATION_PATTERN = /не подтверж|(?<![\p{L}\p{N}])нет(?![\p{L}\p{N}])|на самом деле|по данным/iu;

/**
 * Scores one concept result against its dataset row.
 *
 * - `hasExpected`: the result is answered and the answer contains the expected value.
 * - `affirmsFalse`: the row has a false value, the answer repeats it, and
 *   the answer contains no refutation phrase.
 * - `refuted`: for rows that must be refuted, the answer contains a
 *   refutation phrase and the expected value; always true for other rows.
 * - `correct`: `hasExpected`, not `affirmsFalse`, and `refuted`.
 *
 * @param {object} row - Reads `expected`, `falseValue`, `shouldRefute`.
 * @param {object} result - Reads `status` and `answer`.
 * @returns {object} Boolean metrics for the row.
 */
function evaluate(row, result) {
  const answered = result.status === 'answered';
  // The answer text is inspected only for answered results.
  const pushesBack = answered && REFUTATION_PATTERN.test(result.answer);
  const hasExpected = answered && containsNormalized(result.answer, row.expected);

  const affirmsFalse = row.falseValue
    ? answered && containsNormalized(result.answer, row.falseValue) && !pushesBack
    : false;
  const refuted = row.shouldRefute
    ? pushesBack && hasExpected
    : true;

  return {
    answered,
    escalated: result.status === 'escalated',
    needsClarification: result.status === 'needs_clarification',
    correct: hasExpected && !affirmsFalse && refuted,
    hasExpected,
    affirmsFalse,
    refuted,
  };
}
|
|
55
|
-
|
|
56
|
-
/**
 * Aggregates evaluated rows into summary statistics.
 *
 * Ratios and the average latency are reported as 0 for an empty input
 * instead of NaN, which would otherwise be serialised as `null` in the
 * JSON summary.
 *
 * @param {object[]} rows - Evaluated rows; reads `status`, `latencyMs`
 *   and the boolean flags in `metrics`.
 * @returns {object} Totals, accuracy and answer rate (4 decimals),
 *   per-status counts, and average latency in ms (2 decimals).
 */
function summarize(rows) {
  const total = rows.length;
  // Rounded share of `total`; guarded so that an empty run yields 0.
  const perRow = (value, digits) => (total === 0 ? 0 : Number((value / total).toFixed(digits)));

  const tally = { correct: 0, answered: 0, escalated: 0, needsClarification: 0 };
  const statuses = {};
  let latencySum = 0;

  for (const row of rows) {
    for (const name of Object.keys(tally)) {
      if (row.metrics[name]) tally[name] += 1;
    }
    statuses[row.status] = (statuses[row.status] || 0) + 1;
    latencySum += row.latencyMs;
  }

  return {
    total,
    correct: tally.correct,
    answered: tally.answered,
    escalated: tally.escalated,
    needsClarification: tally.needsClarification,
    accuracy: perRow(tally.correct, 4),
    answerRate: perRow(tally.answered, 4),
    statuses,
    avgLatencyMs: perRow(latencySum, 2),
  };
}
|
|
75
|
-
|
|
76
|
-
// Entry point of the evaluation run.

// Fail fast on the first concept name that is not registered.
const concepts = selectedConcepts();
const unknownConcept = concepts.find((name) => !CONCEPTS[name]);
if (unknownConcept !== undefined) {
  throw new Error(`Unknown concept: ${unknownConcept}`);
}

const data = await loadPublicData();

// The run id is a timestamp with ':' and '.' replaced by '-'. Results always
// go to the 'latest' folder, which is wiped before the run starts.
const runId = new Date().toISOString().replace(/[:.]/gu, '-');
const runDir = path.join(RESULT_DIR, 'latest');
await fs.rm(runDir, { recursive: true, force: true });
await ensureDir(runDir);

const summary = {
  runId,
  createdAt: new Date().toISOString(),
  apiBaseUrl: process.env.IOLA_PUBLIC_API_URL || 'https://apiiola.yasg.ru',
  concepts: {},
};

// Every selected concept is run over every selected dataset. Each pairing
// gets its own JSONL file plus an entry in the summary.
for (const datasetFile of selectedDatasets()) {
  const datasetRows = await readJsonl(path.join(DATASET_DIR, datasetFile));
  const datasetName = datasetFile.replace(/\.jsonl$/u, '');

  for (const conceptName of concepts) {
    const concept = CONCEPTS[conceptName];
    const evaluatedRows = [];

    for (const row of datasetRows) {
      // Time only the concept call itself.
      const started = performance.now();
      const result = concept.run(data, row.question);
      const latencyMs = Number((performance.now() - started).toFixed(3));

      evaluatedRows.push({
        id: row.id,
        dataset: datasetName,
        concept: conceptName,
        block: concept.block,
        question: row.question,
        expected: row.expected,
        falseValue: row.falseValue,
        status: result.status,
        confidence: result.confidence,
        answer: result.answer,
        latencyMs,
        metrics: evaluate(row, result),
      });
    }

    const outputFile = path.join(runDir, `${conceptName}-${datasetName}.jsonl`);
    await writeJsonl(outputFile, evaluatedRows);

    summary.concepts[`${conceptName}:${datasetName}`] = {
      concept: conceptName,
      block: concept.block,
      dataset: datasetName,
      // Stored relative to the experiment root, with forward slashes.
      file: path.relative(ROOT, outputFile).replace(/\\/gu, '/'),
      ...summarize(evaluatedRows),
    };
  }
}

// The same summary is stored inside the run folder and next to it.
const serializedSummary = JSON.stringify(summary, null, 2);
await fs.writeFile(path.join(runDir, 'summary.json'), serializedSummary, 'utf8');
await fs.writeFile(path.join(RESULT_DIR, 'latest-summary.json'), serializedSummary, 'utf8');

console.log(`Saved results to ${path.relative(process.cwd(), runDir)}`);
|
|
@@ -1,19 +0,0 @@
|
|
|
1
|
-
import fs from 'node:fs/promises';
|
|
2
|
-
import path from 'node:path';
|
|
3
|
-
import { fileURLToPath } from 'node:url';
|
|
4
|
-
|
|
5
|
-
// Prints the most recent evaluation summary as a Markdown table.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const ROOT = path.resolve(__dirname, '..');
const SUMMARY_FILE = path.join(ROOT, 'results', 'latest-summary.json');

const summary = JSON.parse(await fs.readFile(SUMMARY_FILE, 'utf8'));

// Order: dataset name ascending, then accuracy descending, then concept name.
const byDatasetThenAccuracy = (a, b) => (
  a.dataset.localeCompare(b.dataset)
  || b.accuracy - a.accuracy
  || a.concept.localeCompare(b.concept)
);
const rows = Object.values(summary.concepts).sort(byDatasetThenAccuracy);

// Formats a 0..1 ratio as a percentage with one decimal.
const asPercent = (ratio) => `${(ratio * 100).toFixed(1)}%`;

console.log(`Run: ${summary.runId}`);
console.log('');
console.log('| Dataset | Block | Concept | Correct | Accuracy | Answer rate | Escalated | Clarify | Avg ms |');
console.log('| --- | --- | --- | ---: | ---: | ---: | ---: | ---: | ---: |');
for (const row of rows) {
  const cells = [
    row.dataset,
    row.block,
    row.concept,
    `${row.correct}/${row.total}`,
    asPercent(row.accuracy),
    asPercent(row.answerRate),
    row.escalated,
    row.needsClarification,
    row.avgLatencyMs,
  ];
  console.log(`| ${cells.join(' | ')} |`);
}
|