079project 8.0.0 → 9.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +165 -0
- package/README.en.md +81 -1
- package/README.md +85 -1
- package/Redis-8.0.3-Windows-x64-cygwin-with-Service/dump.rdb +0 -0
- package/groupWorker.cjs +253 -0
- package/inferenceWorker.cjs +94 -0
- package/main.cjs +1263 -173
- package/mainFailedOfJing1Xi4Hua4Zhi4Duan3Yu3.cjs +6320 -0
- package/optimization.cjs +720 -0
- package/package.json +3 -2
- package/test_automatic/answer.csv +401 -0
- package/test_automatic/generate_daily_qa.py +645 -0
- package/test_automatic/question.csv +401 -0
- package/test_automatic.cjs +441 -0
package/main.cjs
CHANGED
|
@@ -20,19 +20,111 @@ const safeRequire = (name) => {
|
|
|
20
20
|
}
|
|
21
21
|
};
|
|
22
22
|
|
|
23
|
-
|
|
24
|
-
const
|
|
25
|
-
const
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
23
|
+
// Startup optimisation: heavy dependencies (pdf-parse, cheerio, natural, ...) are
// resolved on first use instead of being loaded while the process boots.
const __lazyModules = new Map();

/**
 * Resolve a module through safeRequire() at most once per module name.
 *
 * Whatever the first safeRequire() call returns is memoised as-is, so later
 * calls for the same name never reach safeRequire() again.
 *
 * @param {string} name - Module specifier handed to safeRequire().
 * @returns {*} The memoised result of safeRequire(name).
 */
const lazyRequire = (name) => {
    if (!__lazyModules.has(name)) {
        __lazyModules.set(name, safeRequire(name));
    }
    return __lazyModules.get(name);
};
|
|
33
|
+
|
|
34
|
+
// Thin accessors for optional heavy dependencies. Each one defers to
// lazyRequire(), so the module is only resolved when the accessor is first called.
const getNatural = () => {
    return lazyRequire('natural');
};
const getCsvParse = () => {
    return lazyRequire('csv-parse/sync');
};
const getUmap = () => {
    return lazyRequire('umap-js');
};
const getAxios = () => {
    return lazyRequire('axios');
};
const getCheerio = () => {
    return lazyRequire('cheerio');
};
const getPdfParse = () => {
    return lazyRequire('pdf-parse');
};
|
|
40
|
+
/**
 * Lazily resolve the Matrix export of 'ml-matrix'.
 *
 * Prefers the module's `Matrix` property, falls back to the module value
 * itself, and yields null when neither is available. The lookup runs once;
 * every later call returns the remembered value.
 *
 * @returns {*} The resolved Matrix export, or null.
 */
const getMatrix = (() => {
    const state = { resolved: false, Matrix: null };
    return () => {
        if (!state.resolved) {
            // Flag is raised before the lookup, so a throwing lookup is not retried.
            state.resolved = true;
            const lib = lazyRequire('ml-matrix');
            state.Matrix = lib?.Matrix ?? lib ?? null;
        }
        return state.Matrix;
    };
})();
|
|
51
|
+
|
|
52
|
+
/**
 * Lazily fetch the stop-word list exposed as `stopwords` by the 'natural' module.
 *
 * Returns an empty array when the module is unavailable or when `stopwords`
 * is not an array. The lookup runs once; later calls return the same array.
 *
 * @returns {Array} The stop-word list (possibly empty).
 */
const getStopWords = (() => {
    const state = { resolved: false, words: [] };
    return () => {
        if (!state.resolved) {
            // Flag is raised before the lookup, so a throwing lookup is not retried.
            state.resolved = true;
            const candidate = getNatural()?.stopwords ?? [];
            state.words = Array.isArray(candidate) ? candidate : [];
        }
        return state.words;
    };
})();
|
|
34
64
|
const DEFAULT_CHANNEL = process.env.AI_REDIS_CHANNEL || 'AI-model-workspace';
|
|
35
65
|
|
|
66
|
+
/**
|
|
67
|
+
* 外参列表(所有“外部可控参数”的入口汇总)
|
|
68
|
+
*
|
|
69
|
+
* 1) 启动时参数(CLI flags,形如 --k=v 或 --flag=true)
|
|
70
|
+
* - --base-dir: 运行时数据目录(默认:./runtime_store;对应 ENV: AI_BASE_DIR)
|
|
71
|
+
* - --gateway-host: 网关监听 host(默认:127.0.0.1;对应 ENV: AI_GATEWAY_HOST)
|
|
72
|
+
* - --port: 网关端口(默认:5080;对应 ENV: CONTROLLER_PORT)
|
|
73
|
+
* - --study-port: study/前端进程端口(默认:5081;对应 ENV: AI_STUDY_PORT)
|
|
74
|
+
* - --ai-count: 旧版兼容字段;用于 aiCount(默认:7;对应 ENV: AI_COUNT / AI_NUM)
|
|
75
|
+
* - --group-size: 每个工作组 AI 数量(默认:ai-count;对应 ENV: AI_GROUP_SIZE / GROUP_SIZE)
|
|
76
|
+
* - --group-count: 组数量(默认:3;对应 ENV: AI_GROUP_COUNT / GROUP_COUNT)
|
|
77
|
+
* - --spark-num-ai: SparkArray 每组参与汇聚的 AI 数量(默认:group-size;对应 ENV: AI_SPARK_NUM_AI)
|
|
78
|
+
* - --spark-budget: SparkArray 低精度预算预设(default/low/high 或 JSON;对应 ENV: AI_SPARK_BUDGET)
|
|
79
|
+
* - --robots-limit: robots 预热/导入条数上限(默认:200;对应 ENV: AI_ROBOTS_LIMIT)
|
|
80
|
+
* - --redis-url: Redis 连接串(默认:redis://127.0.0.1:6379;对应 ENV: REDIS_URL)
|
|
81
|
+
* - --channel: Redis pubsub 频道(默认:AI_REDIS_CHANNEL 或 'AI-model-workspace')
|
|
82
|
+
* - --snapshot-dir: 快照目录(默认:./snapshots)
|
|
83
|
+
* - --lmdb-dir: LMDB 根目录(默认:./lmdb;对应 ENV: LMDB_DIR)
|
|
84
|
+
* - --search-endpoint: 在线检索/搜索服务端点(默认:'';对应 ENV: AI_SEARCH_ENDPOINT)
|
|
85
|
+
* - --robots-dir: robots 语料目录(默认:./robots;对应 ENV: AI_ROBOTS_DIR)
|
|
86
|
+
* - --lemma-csv: lemma 词形还原表路径(默认:./lemma.csv;对应 ENV: AI_LEMMA_CSV)
|
|
87
|
+
* - --robots-autoload: 启动时是否自动加载 robots(默认:true;对应 ENV: AI_ROBOTS_AUTOLOAD)
|
|
88
|
+
* - --disable-memebarrier: 启动默认禁用 MemeBarrier(对应 ENV: AI_DISABLE_MEMEBARRIER)
|
|
89
|
+
* - --disable-rl: 启动默认禁用 RL(对应 ENV: AI_DISABLE_RL)
|
|
90
|
+
* - --disable-adv: 启动默认禁用 ADV(对应 ENV: AI_DISABLE_ADV)
|
|
91
|
+
* - --disable-learning: 启动默认禁用学习总开关(对应 ENV: AI_DISABLE_LEARNING)
|
|
92
|
+
* - --export-dir: 图导出目录(默认:./runtime_store;对应 ENV: AI_EXPORT_DIR)
|
|
93
|
+
*
|
|
94
|
+
* 2) 环境变量(ENV)
|
|
95
|
+
* - AI_REDIS_CHANNEL: Redis 频道默认值(被 --channel 覆盖)
|
|
96
|
+
* - AI_AUTH_ENABLED: 是否启用 /api/* 鉴权(默认:true;'false' 关闭)
|
|
97
|
+
* - AI_AUTH_JWT_SECRET / AUTH_JWT_SECRET: JWT 密钥(默认:'dev-secret-change-me')
|
|
98
|
+
* - 说明:鉴权默认保护 /api/*,仅 /api/system/status 在 publicPaths 白名单
|
|
99
|
+
*
|
|
100
|
+
* 3) 运行时参数(HTTP API,可在不重启的情况下调整)
|
|
101
|
+
* - POST /api/chat
|
|
102
|
+
* - body.text / body.message: 输入文本
|
|
103
|
+
* - body.sessionId: 会话 ID(可选;未提供则自动分配)
|
|
104
|
+
* - body.tokens / body.words / body.vocab: 直接提供分词(可选;否则对 text 做 tokenize)
|
|
105
|
+
*
|
|
106
|
+
* - GET /api/model/params: 读取当前模型参数
|
|
107
|
+
* - POST /api/model/params: patch 模型参数(部分字段)
|
|
108
|
+
* - POST /api/model/params/reset: 重置为 modelDefaults
|
|
109
|
+
* - 可 patch 的 key(见下方 modelDefaults):
|
|
110
|
+
* decayFactor, maxMemeWords, minOverlapThreshold, memeNgramMin, memeNgramMax,
|
|
111
|
+
* maliciousThreshold, learningIterations, iteration, threshold, decay, decayK,
|
|
112
|
+
* maxLen, edgeWeight, activationType, transferType, activationCustom, transferCustom
|
|
113
|
+
*
|
|
114
|
+
* - GET /api/runtime/features: 读取运行时功能开关状态
|
|
115
|
+
* - PATCH /api/runtime/features: patch 运行时功能开关(字段如下)
|
|
116
|
+
* - memebarrierEnabled: boolean
|
|
117
|
+
* - maliciousThreshold: number
|
|
118
|
+
* - learningEnabled: boolean(总开关;false 会同时关闭 rl/adv/dialogLearning)
|
|
119
|
+
* - rlEnabled / advEnabled: boolean
|
|
120
|
+
* - dialogLearningEnabled: boolean
|
|
121
|
+
* - rlEvery / advEvery: number(对话触发学习的阈值)
|
|
122
|
+
* - 注意:CLI 的 disable* 表示“启动默认禁用”,运行时仍允许 override(会返回 warnings)
|
|
123
|
+
*
|
|
124
|
+
* - POST /api/learn/thresholds: { rlEvery, advEvery }
|
|
125
|
+
* - POST /api/learn/reinforce: { cycles }(默认:3)
|
|
126
|
+
*/
|
|
127
|
+
|
|
36
128
|
const ensureDir = (dir) => {
|
|
37
129
|
if (!fs.existsSync(dir)) {
|
|
38
130
|
fs.mkdirSync(dir, { recursive: true });
|
|
@@ -60,8 +152,16 @@ const CONFIG = (() => {
|
|
|
60
152
|
return !(normalized === '0' || normalized === 'false' || normalized === 'off' || normalized === 'no');
|
|
61
153
|
};
|
|
62
154
|
const robotsLimitRaw = args['robots-limit'] || process.env.AI_ROBOTS_LIMIT || 200;
|
|
155
|
+
const robotsChunkMinRaw = args['robots-chunk-min'] || process.env.AI_ROBOTS_CHUNK_MIN || 3;
|
|
156
|
+
const robotsChunkMaxRaw = args['robots-chunk-max'] || process.env.AI_ROBOTS_CHUNK_MAX || 20;
|
|
157
|
+
const lmdbMapMbRaw = args['lmdb-map-mb'] || process.env.AI_LMDB_MAP_MB || 512;
|
|
158
|
+
const kvmCacheMaxRaw = args['kvm-cache-max'] || process.env.AI_KVM_CACHE_MAX || 50_000;
|
|
159
|
+
const lemmaMaxMbRaw = args['lemma-max-mb'] || process.env.AI_LEMMA_MAX_MB || 64;
|
|
63
160
|
const aiCountRaw = args['ai-count'] || process.env.AI_COUNT || process.env.AI_NUM || 7;
|
|
64
161
|
const groupCountRaw = args['group-count'] || process.env.AI_GROUP_COUNT || process.env.GROUP_COUNT || 3;
|
|
162
|
+
const groupSizeRaw = args['group-size'] || process.env.AI_GROUP_SIZE || process.env.GROUP_SIZE || aiCountRaw;
|
|
163
|
+
const sparkNumAiRaw = args['spark-num-ai'] || process.env.AI_SPARK_NUM_AI || groupSizeRaw;
|
|
164
|
+
const sparkBudgetRaw = args['spark-budget'] || process.env.AI_SPARK_BUDGET || 'default';
|
|
65
165
|
return {
|
|
66
166
|
baseDir: path.resolve(args['base-dir'] || process.env.AI_BASE_DIR || path.join(__dirname, 'runtime_store')),
|
|
67
167
|
gatewayHost: String(args['gateway-host'] || process.env.AI_GATEWAY_HOST || '127.0.0.1'),
|
|
@@ -69,7 +169,12 @@ const CONFIG = (() => {
|
|
|
69
169
|
portStudy: Number(args['study-port'] || process.env.AI_STUDY_PORT || 5081),
|
|
70
170
|
aiCount: Math.max(3, Number(aiCountRaw) || 7),
|
|
71
171
|
groupCount: Math.max(1, Number(groupCountRaw) || 3),
|
|
72
|
-
groupSize
|
|
172
|
+
// 兼容:若只传了 --ai-count,则默认 groupSize=aiCount
|
|
173
|
+
groupSize: Math.max(1, Number(groupSizeRaw) || (Number(aiCountRaw) || 7)),
|
|
174
|
+
// SparkArray:每组参与汇聚的实例数量(不超过该组实际 controller 数量)
|
|
175
|
+
sparkNumAI: Math.max(1, Number(sparkNumAiRaw) || (Number(groupSizeRaw) || (Number(aiCountRaw) || 7))),
|
|
176
|
+
// SparkArray:低精度预算(可传 preset:default/low/high,或 JSON 对象)
|
|
177
|
+
sparkBudget: sparkBudgetRaw,
|
|
73
178
|
redisUrl: process.env.REDIS_URL || args['redis-url'] || 'redis://127.0.0.1:6379',
|
|
74
179
|
redisChannel: args.channel || DEFAULT_CHANNEL,
|
|
75
180
|
snapshotDir: args['snapshot-dir'] || path.join(__dirname, 'snapshots'),
|
|
@@ -77,11 +182,24 @@ const CONFIG = (() => {
|
|
|
77
182
|
maxWorkers: Math.max(1, (os.cpus()?.length ?? 2) - 1),
|
|
78
183
|
shardCache: path.join(__dirname, 'shards_cache.json'),
|
|
79
184
|
lmdbRoot: path.join(args['lmdb-dir'] || process.env.LMDB_DIR || path.join(__dirname, 'lmdb')),
|
|
185
|
+
lmdbMapSizeBytes: Math.max(64, Number(lmdbMapMbRaw) || 512) * 1024 * 1024,
|
|
80
186
|
searchEndpoint: args['search-endpoint'] || process.env.AI_SEARCH_ENDPOINT || '',
|
|
81
187
|
robotsDir: path.resolve(args['robots-dir'] || process.env.AI_ROBOTS_DIR || path.join(__dirname, 'robots')),
|
|
82
188
|
lemmaCsv: path.resolve(args['lemma-csv'] || process.env.AI_LEMMA_CSV || path.join(__dirname, 'lemma.csv')),
|
|
189
|
+
lemmaAutoload: boolFrom(args['lemma-autoload'] ?? process.env.AI_LEMMA_AUTOLOAD, false),
|
|
190
|
+
lemmaMaxBytes: Math.max(1, Number(lemmaMaxMbRaw) || 64) * 1024 * 1024,
|
|
191
|
+
lemmaForce: boolFrom(args['lemma-force'] ?? process.env.AI_LEMMA_FORCE, false),
|
|
83
192
|
robotsWarmupLimit: Math.max(0, Number(robotsLimitRaw) || 0),
|
|
84
193
|
robotsAutoload: boolFrom(args['robots-autoload'] ?? process.env.AI_ROBOTS_AUTOLOAD, true),
|
|
194
|
+
robotsWarmupShuffle: boolFrom(args['robots-warmup-shuffle'] ?? process.env.AI_ROBOTS_WARMUP_SHUFFLE, false),
|
|
195
|
+
robotsChunkMinWords: Math.max(1, Number(robotsChunkMinRaw) || 2),
|
|
196
|
+
robotsChunkMaxWords: Math.max(1, Number(robotsChunkMaxRaw) || 5),
|
|
197
|
+
kvmCacheMaxEntries: Math.max(0, Number(kvmCacheMaxRaw) || 0),
|
|
198
|
+
learningWarmup: boolFrom(args['learning-warmup'] ?? process.env.AI_LEARNING_WARMUP, false),
|
|
199
|
+
// 启动时是否把 serving 的 snapshot 同步到 standby/validation(可能很慢;默认关闭以保证 fast-boot)
|
|
200
|
+
syncStandbyOnBoot: boolFrom(args['sync-standby'] ?? process.env.AI_SYNC_STANDBY_ON_BOOT, false),
|
|
201
|
+
// tests 语料预加载(可能较慢;默认开启,fast-boot 可设为 false)
|
|
202
|
+
testsAutoload: boolFrom(args['tests-autoload'] ?? process.env.AI_TESTS_AUTOLOAD, true),
|
|
85
203
|
// Feature toggles via CLI/env
|
|
86
204
|
disableBarrier: boolFrom(args['disable-memebarrier'] ?? process.env.AI_DISABLE_MEMEBARRIER, false) === true,
|
|
87
205
|
disableRL: boolFrom(args['disable-rl'] ?? process.env.AI_DISABLE_RL, false) === true,
|
|
@@ -99,7 +217,7 @@ ensureDir(CONFIG.robotsDir);
|
|
|
99
217
|
const LMDB = safeRequire('lmdb');
|
|
100
218
|
|
|
101
219
|
class LmdbStore {
|
|
102
|
-
constructor({ name, rootDir, encodeJSON = true }) {
|
|
220
|
+
constructor({ name, rootDir, encodeJSON = true, mapSizeBytes } = {}) {
|
|
103
221
|
this.name = name;
|
|
104
222
|
this.rootDir = rootDir;
|
|
105
223
|
this.encodeJSON = encodeJSON;
|
|
@@ -109,10 +227,13 @@ class LmdbStore {
|
|
|
109
227
|
try {
|
|
110
228
|
const envPath = path.join(rootDir, name);
|
|
111
229
|
ensureDir(envPath);
|
|
230
|
+
const resolvedMapSize = Number.isFinite(Number(mapSizeBytes)) && Number(mapSizeBytes) > 0
|
|
231
|
+
? Number(mapSizeBytes)
|
|
232
|
+
: 512 * 1024 * 1024;
|
|
112
233
|
this.env = this.backend.open({
|
|
113
234
|
path: envPath,
|
|
114
235
|
maxReaders: 64,
|
|
115
|
-
mapSize:
|
|
236
|
+
mapSize: resolvedMapSize,
|
|
116
237
|
useWritemap: true,
|
|
117
238
|
noSync: false
|
|
118
239
|
});
|
|
@@ -352,10 +473,40 @@ const compileCustomFunctionSafely = (source, argNames, fallback) => {
|
|
|
352
473
|
}
|
|
353
474
|
};
|
|
354
475
|
|
|
476
|
+
/**
|
|
477
|
+
* 模型外参/超参(面向调参/评测;与 CLI/ENV 无关)
|
|
478
|
+
*
|
|
479
|
+
* 修改方式:
|
|
480
|
+
* - 运行时 patch:POST /api/model/params 传入 { key: value }
|
|
481
|
+
* - 恢复默认:POST /api/model/params/reset
|
|
482
|
+
* - 读取当前值:GET /api/model/params
|
|
483
|
+
*
|
|
484
|
+
* 逐项说明(默认值以此处为准):
|
|
485
|
+
* - iteration (5): 传播迭代步数;用于 RuntimeState.runPropagation()/exportGraphToFile() -> TensorEngine.iteratePropagation().
|
|
486
|
+
* - decayK (1): 传播衰减系数;传给 TensorEngine.iteratePropagation(csr, seeds, steps, actFn, decayK, damp).
|
|
487
|
+
*
|
|
488
|
+
* - memeNgramMin (3) / memeNgramMax (14): 构建“短语模因(ngram)”的长度范围;用于 mapWordsToMemes() 与 _buildMemeSequenceFromTokens().
|
|
489
|
+
* - minOverlapThreshold (2): tokenSet 与既有 meme 的最小重合词数;满足则“融合”到该 meme(link 词 -> meme)。
|
|
490
|
+
* - maxMemeWords (100): tokenSet 去重后的最大词数上限(用于限制短语模因的词集合大小)。
|
|
491
|
+
*
|
|
492
|
+
* - maliciousThreshold (0.7): MemeBarrier 判定阈值(网关侧安全屏障);也可通过 PATCH /api/runtime/features 调整。
|
|
493
|
+
*
|
|
494
|
+
* - activationType ('relu'): 激活函数类型;用于 _activation()。
|
|
495
|
+
* - 可用类型见 module.exports.BUILTIN_ACTIVATION_TYPES;当为 'custom' 时使用 activationCustom。
|
|
496
|
+
* - activationCustom (''): 自定义激活函数源码(function(x){...} 或表达式);仅 activationType='custom' 时生效。
|
|
497
|
+
*
|
|
498
|
+
* - transferType ('linear') / transferCustom (''):
|
|
499
|
+
* - 预留:目前本文件内未在主传播路径中调用(仅实现了 _transfer() 与 BuiltinTransfers)。
|
|
500
|
+
*
|
|
501
|
+
* - decayFactor (0.5), learningIterations (3), threshold (3), decay (1), maxLen (16), edgeWeight (1):
|
|
502
|
+
* - 预留:当前版本 main.cjs 中未发现显式读取点(可能供未来/外部实验使用)。
|
|
503
|
+
*/
|
|
355
504
|
const modelDefaults = {
|
|
356
505
|
decayFactor: 0.5,
|
|
357
506
|
maxMemeWords: 100,
|
|
358
507
|
minOverlapThreshold: 2,
|
|
508
|
+
memeNgramMin: 3,
|
|
509
|
+
memeNgramMax: 14,
|
|
359
510
|
maliciousThreshold: 0.7,
|
|
360
511
|
learningIterations: 3,
|
|
361
512
|
iteration: 5,
|
|
@@ -367,7 +518,12 @@ const modelDefaults = {
|
|
|
367
518
|
activationType: 'relu',
|
|
368
519
|
transferType: 'linear',
|
|
369
520
|
activationCustom: '',
|
|
370
|
-
transferCustom: ''
|
|
521
|
+
transferCustom: '',
|
|
522
|
+
// 多次映射/镜面反射层(文明演算法思想):words -> memes -> words -> memes ...
|
|
523
|
+
mappingDepth: 1,
|
|
524
|
+
reflectionTopMemes: 18,
|
|
525
|
+
reflectionTopWords: 24,
|
|
526
|
+
reflectionMinScore: 1e-6
|
|
371
527
|
};
|
|
372
528
|
|
|
373
529
|
const hashString = (str) => {
|
|
@@ -389,7 +545,7 @@ const tokenize = (text) => {
|
|
|
389
545
|
if (!part) {
|
|
390
546
|
continue;
|
|
391
547
|
}
|
|
392
|
-
if (/^[a-z0-9_\-]+$/.test(part) &&
|
|
548
|
+
if (/^[a-z0-9_\-]+$/.test(part) && getStopWords().includes(part)) {
|
|
393
549
|
continue;
|
|
394
550
|
}
|
|
395
551
|
tokens.push(part);
|
|
@@ -397,21 +553,388 @@ const tokenize = (text) => {
|
|
|
397
553
|
return tokens;
|
|
398
554
|
};
|
|
399
555
|
|
|
556
|
+
/**
 * Cut text into short word chunks used as lightweight learnable/retrievable units.
 *
 * The text is first split on line breaks and after sentence-ending punctuation.
 * Each segment is tokenised with tokenize() and sliced into runs of up to
 * 10 tokens joined by single spaces. Segments with fewer than 2 tokens are
 * dropped; a trailing run shorter than 2 tokens is appended to the previous
 * chunk instead of standing alone. At most 12 chunks are returned.
 *
 * @param {*} text - Input text; falsy values are treated as empty.
 * @returns {string[]} Up to 12 space-joined token chunks.
 */
const splitSentences = (text) => {
    const source = String(text || '').trim();
    if (source === '') {
        return [];
    }

    const MIN_WORDS = 2;
    const MAX_WORDS = 10;
    const MAX_UNITS = 12;

    // Coarse pass: line breaks and sentence-final punctuation are natural boundaries.
    const segments = source
        .split(/[\r\n]+|(?<=[。!?!?])\s*/g)
        .map((s) => String(s || '').trim())
        .filter(Boolean);

    const units = [];
    for (const segment of segments) {
        const words = tokenize(segment);
        if (words.length < MIN_WORDS) {
            continue;
        }

        for (let start = 0; start < words.length; start += MAX_WORDS) {
            const piece = words.slice(start, start + MAX_WORDS);
            if (piece.length >= MIN_WORDS) {
                units.push(piece.join(' '));
                if (units.length >= MAX_UNITS) {
                    return units;
                }
                continue;
            }
            // Leftover shorter than MIN_WORDS: fold it into the preceding chunk when one exists.
            if (units.length > 0) {
                const last = units.length - 1;
                units[last] = `${units[last]} ${piece.join(' ')}`.trim();
            }
        }
    }

    return units;
};
|
|
603
|
+
|
|
604
|
+
/**
 * Collect weighted surface phrases from a piece of text.
 *
 * Candidates are gathered in this order, de-duplicated by exact string:
 *   1. chunks from splitSentences(text)          (weight 1)
 *   2. single tokens from tokenize(text)         (weight 2)
 *   3. token n-grams for n = 2..5                (weight 3 for n=2, 2 for n=3, else 1)
 * Candidates shorter than 2 or longer than 160 characters are ignored, and
 * collection stops as soon as `maxPhrases` entries have been gathered.
 *
 * @param {*} text - Input text.
 * @param {{maxPhrases?: number}} [options] - Upper bound on returned entries (default 24).
 * @returns {{phrase: string, weight: number}[]} Phrases in collection order.
 */
const extractSurfacePhrases = (text, { maxPhrases = 24 } = {}) => {
    const collected = [];
    const known = new Set();
    const capped = () => collected.slice(0, maxPhrases);

    const add = (candidate, weight = 1) => {
        const phrase = String(candidate || '').trim();
        const acceptable = phrase.length >= 2 && phrase.length <= 160 && !known.has(phrase);
        if (!acceptable) {
            return;
        }
        known.add(phrase);
        collected.push({ phrase, weight });
    };

    // Sentence-level chunks go in first, with the lowest weight.
    splitSentences(text).forEach((sentence) => add(sentence, 1));

    const words = tokenize(text);
    if (words.length === 0) {
        return capped();
    }

    // Unigrams.
    for (const word of words.slice(0, maxPhrases)) {
        add(word, 2);
        if (collected.length >= maxPhrases) {
            return capped();
        }
    }

    // N-grams (short phrases); bigrams weigh most, then trigrams.
    const NGRAM_WEIGHTS = { 2: 3, 3: 2 };
    const longest = Math.min(5, words.length);
    for (let size = 2; size <= longest; size += 1) {
        for (let start = 0; start + size <= words.length; start += 1) {
            add(words.slice(start, start + size).join(' '), NGRAM_WEIGHTS[size] ?? 1);
            if (collected.length >= maxPhrases) {
                return capped();
            }
        }
    }

    return capped();
};
|
|
650
|
+
|
|
651
|
+
/**
 * Per-meme table of weighted surface phrases, persisted in a key/value store.
 *
 * Records live under `m:<memeId>` and have the shape
 * `{ phrases: { [phrase]: score }, updatedAt }`. The store is only required
 * to provide get(key), put(key, value) and entries(prefix).
 */
class MemeSurfaceLexicon {
    /**
     * @param {object} store - Backing key/value store.
     * @param {{maxEntriesPerMeme?: number, decay?: number}} [options]
     *   maxEntriesPerMeme: phrases kept per meme (floor of 8, default 64).
     *   decay: multiplier applied to existing scores on every learn() (default 0.985).
     */
    constructor(store, options = {}) {
        const { maxEntriesPerMeme = 64, decay = 0.985 } = options;
        const decayValue = Number(decay);
        this.store = store;
        this.maxEntriesPerMeme = Math.max(8, Number(maxEntriesPerMeme) || 64);
        this.decay = Number.isFinite(decayValue) ? decayValue : 0.985;
    }

    /** Store key for a meme id. */
    _key(memeId) {
        return `m:${String(memeId)}`;
    }

    /** Read a meme record, substituting an empty one when nothing usable is stored. */
    _load(memeId) {
        const stored = this.store.get(this._key(memeId));
        if (!stored || typeof stored !== 'object') {
            return { phrases: {}, updatedAt: 0 };
        }
        const hasPhraseTable = stored.phrases && typeof stored.phrases === 'object';
        return {
            phrases: hasPhraseTable ? stored.phrases : {},
            updatedAt: Number(stored.updatedAt || 0) || 0
        };
    }

    /** Persist a meme record. */
    _save(memeId, rec) {
        this.store.put(this._key(memeId), rec);
    }

    /**
     * Fold the phrases of `replyText` into the record of `memeId`.
     *
     * Existing scores are decayed first (entries at or below 1e-6 are removed),
     * then every phrase from extractSurfacePhrases() adds `phrase.weight * weight`,
     * and finally only the highest-scoring `maxEntriesPerMeme` phrases are kept.
     *
     * @param {*} memeId - Meme identifier; falsy ids are ignored.
     * @param {*} replyText - Text to extract phrases from.
     * @param {{weight?: number}} [options] - Multiplier for this observation (default 1).
     */
    learn(memeId, replyText, { weight = 1 } = {}) {
        if (!memeId) {
            return;
        }
        const numericWeight = Number(weight);
        const scale = Number.isFinite(numericWeight) ? numericWeight : 1;
        const { phrases: stored } = this._load(memeId);
        const updatedAt = Date.now();

        // Light decay of old entries so that early noise does not dominate forever.
        const scores = { ...stored };
        for (const phrase of Object.keys(scores)) {
            const decayed = (Number(scores[phrase]) || 0) * this.decay;
            if (decayed <= 1e-6) {
                delete scores[phrase];
            } else {
                scores[phrase] = decayed;
            }
        }

        for (const { phrase, weight: phraseWeight } of extractSurfacePhrases(replyText, { maxPhrases: 24 })) {
            scores[phrase] = (Number(scores[phrase]) || 0) + (phraseWeight * scale);
        }

        // Trim to the top N by score.
        const byScoreDesc = (left, right) => (Number(right[1]) || 0) - (Number(left[1]) || 0);
        const kept = Object.entries(scores).sort(byScoreDesc).slice(0, this.maxEntriesPerMeme);
        this._save(memeId, { phrases: Object.fromEntries(kept), updatedAt });
    }

    /**
     * Highest-scoring phrases for a meme.
     *
     * @param {*} memeId - Meme identifier.
     * @param {{limit?: number}} [options] - Number of phrases (at least 1, default 6).
     * @returns {{phrase: string, score: number}[]} Phrases sorted by descending score.
     */
    getTop(memeId, { limit = 6 } = {}) {
        const { phrases } = this._load(memeId);
        const count = Math.max(1, Number(limit) || 6);
        const ranked = Object.entries(phrases || {});
        ranked.sort((left, right) => (Number(right[1]) || 0) - (Number(left[1]) || 0));
        return ranked.slice(0, count).map(([phrase, score]) => ({ phrase, score: Number(score) || 0 }));
    }

    /**
     * Dump `[key, value]` pairs for stored meme records.
     *
     * @param {{limitMemes?: number}} [options] - Stop after this many records (default 512).
     * @returns {Array} Array of `[key, value]` pairs.
     */
    exportSnapshot({ limitMemes = 512 } = {}) {
        const snapshot = [];
        for (const [key, value] of this.store.entries('m:')) {
            snapshot.push([key, value]);
            if (snapshot.length >= limitMemes) {
                break;
            }
        }
        return snapshot;
    }

    /**
     * Write back pairs produced by exportSnapshot(). Anything that is not a
     * two-element array whose key is a string starting with `m:` is skipped.
     *
     * @param {*} entries - Array of `[key, value]` pairs; other values are ignored.
     */
    importSnapshot(entries) {
        if (!Array.isArray(entries)) {
            return;
        }
        for (const item of entries) {
            const wellFormed = Array.isArray(item) && item.length === 2;
            if (!wellFormed) {
                continue;
            }
            const [key, value] = item;
            if (typeof key === 'string' && key.startsWith('m:')) {
                this.store.put(key, value);
            }
        }
    }
}
|
|
736
|
+
|
|
737
|
+
/**
 * Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.
 *
 * Sets are used directly and arrays are converted to sets; any other value
 * counts as an empty collection. Two empty collections are defined to have
 * similarity 1; exactly one empty collection gives 0.
 *
 * @param {Set|Array|*} a - First collection.
 * @param {Set|Array|*} b - Second collection.
 * @returns {number} Similarity in [0, 1].
 */
const jaccard = (a, b) => {
    const toSet = (value) => {
        if (value instanceof Set) {
            return value;
        }
        return new Set(Array.isArray(value) ? value : []);
    };
    const left = toSet(a);
    const right = toSet(b);

    if (left.size === 0 || right.size === 0) {
        return left.size === right.size ? 1 : 0;
    }

    const shared = [...left].filter((item) => right.has(item)).length;
    const union = left.size + right.size - shared;
    return union <= 0 ? 0 : shared / union;
};
|
|
747
|
+
|
|
748
|
+
/**
 * Store-backed memory of past dialog turns, retrievable by meme id and by
 * signature similarity.
 *
 * Key layout inside `store`:
 *   - `d:<id>`     -> dialog record; `id` is hashString(signature)
 *   - `i:<memeId>` -> array of dialog ids, most recently remembered first
 *
 * The store is only required to provide get(key), put(key, value) and
 * entries(prefix). Signatures are compared as sets of their `|`-separated parts.
 */
class DialogMemory {
    /**
     * @param {object} store - Backing key/value store.
     * @param {{maxItems?: number, maxPerIndex?: number}} [options]
     *   maxItems: floor of 128, default 2048.
     *   maxPerIndex: dialog ids kept per meme index (floor of 8, default 64).
     */
    constructor(store, {
        maxItems = 2048,
        maxPerIndex = 64
    } = {}) {
        this.store = store;
        // NOTE(review): maxItems is recorded here but nothing in this class evicts
        // dialog records, so the number of `d:` entries is not bounded by it.
        this.maxItems = Math.max(128, Number(maxItems) || 2048);
        this.maxPerIndex = Math.max(8, Number(maxPerIndex) || 64);
    }

    /** Store key of a dialog record. */
    _kDialog(id) {
        return `d:${String(id)}`;
    }

    /** Store key of the dialog-id list for a meme. */
    _kIndex(memeId) {
        return `i:${String(memeId)}`;
    }

    /** Dialog id derived from a signature. */
    _makeId(signature) {
        return hashString(String(signature || ''));
    }

    /** Canonical form of a signature; shared by remember() and retrieve() so both hash the same text. */
    _normalizeSignature(signature) {
        return String(signature || '').trim();
    }

    /** Read the dialog-id list of a meme with a single store lookup; non-arrays count as empty. */
    _readIndex(memeId) {
        const stored = this.store.get(this._kIndex(memeId));
        return Array.isArray(stored) ? stored : [];
    }

    /**
     * Insert or refresh the dialog identified by `signature` and index it under each meme.
     *
     * @param {object} [entry]
     * @param {*} [entry.signature] - Identity of the dialog; trimmed before use.
     * @param {Array} [entry.memes] - Meme ids (first 32 kept).
     * @param {*} [entry.question] - Question text (first 800 characters kept).
     * @param {*} [entry.reply] - Reply text; trimmed, first 1200 characters kept.
     * @param {*} [entry.scoreHint] - Numeric hint; non-finite values fall back to the previous record's hint or 0.
     * @returns {object|null} The stored record, or null when signature or reply is empty.
     */
    remember({ signature, memes = [], question = '', reply = '', scoreHint = 0 } = {}) {
        const sig = this._normalizeSignature(signature);
        const rep = String(reply || '').trim();
        if (!sig || !rep) return null;
        const id = this._makeId(sig);
        const key = this._kDialog(id);
        const prev = this.store.get(key);
        const next = {
            id,
            signature: sig,
            memes: Array.isArray(memes) ? memes.slice(0, 32) : [],
            question: String(question || '').slice(0, 800),
            reply: rep.slice(0, 1200),
            updatedAt: Date.now(),
            count: (prev && Number(prev.count)) ? (Number(prev.count) + 1) : 1,
            scoreHint: Number.isFinite(Number(scoreHint)) ? Number(scoreHint) : (prev?.scoreHint ?? 0)
        };
        this.store.put(key, next);

        // Index: memeId -> dialogIds[], with this dialog moved to the front.
        for (const memeId of new Set(next.memes)) {
            const others = this._readIndex(memeId).filter((x) => x && x !== id);
            this.store.put(this._kIndex(memeId), [id, ...others].slice(0, this.maxPerIndex));
        }

        return next;
    }

    /**
     * Find the best remembered dialog for a set of memes and a signature.
     *
     * Candidates come from the indexes of the first 8 memes; when none is found,
     * the dialog whose id matches the signature exactly is tried. Each candidate
     * is scored as `similarity * (1 + 0.15 * ln(1 + count))`, where similarity is
     * the Jaccard similarity of the `|`-separated signature parts.
     *
     * @param {object} [query]
     * @param {Array} [query.memes] - Meme ids used to look up candidates.
     * @param {*} [query.signature] - Signature to compare against; trimmed like in remember().
     * @param {number} [query.minSim] - Minimum similarity for a candidate (default 0.45).
     * @returns {object|null} Best record extended with `similarity` and `score`, or null.
     */
    retrieve({ memes = [], signature = '', minSim = 0.45 } = {}) {
        // Use the same normalisation as remember(): otherwise a padded signature
        // hashes to a different id and its outer parts never match the stored ones.
        const sig = this._normalizeSignature(signature);
        const sigSet = new Set(sig.split('|').filter(Boolean));
        const memeList = Array.isArray(memes) ? memes.slice(0, 8) : [];
        const candidateIds = new Set();
        for (const memeId of memeList) {
            for (const id of this._readIndex(memeId)) {
                if (id) candidateIds.add(id);
            }
        }

        // No index hit: fall back to an exact signature match.
        if (candidateIds.size === 0 && sig) {
            candidateIds.add(this._makeId(sig));
        }

        let best = null;
        let bestScore = 0;
        for (const id of candidateIds) {
            const rec = this.store.get(this._kDialog(id));
            if (!rec || !rec.reply) continue;
            const recSet = new Set(String(rec.signature || '').split('|').filter(Boolean));
            const sim = jaccard(sigSet, recSet);
            if (sim < minSim) continue;
            const freq = Math.log(1 + (Number(rec.count) || 0));
            const score = sim * (1 + 0.15 * freq);
            if (score > bestScore) {
                bestScore = score;
                best = { ...rec, similarity: sim, score };
            }
        }
        return best;
    }

    /**
     * Dump dialog records and meme indexes as `[key, value]` pairs.
     *
     * @param {{limit?: number}} [options] - Stop each list after this many pairs (default 512).
     * @returns {{dialogs: Array, indexes: Array}} Snapshot accepted by importSnapshot().
     */
    exportSnapshot({ limit = 512 } = {}) {
        const out = { dialogs: [], indexes: [] };
        for (const [key, value] of this.store.entries('d:')) {
            out.dialogs.push([key, value]);
            if (out.dialogs.length >= limit) break;
        }
        for (const [key, value] of this.store.entries('i:')) {
            out.indexes.push([key, value]);
            if (out.indexes.length >= limit) break;
        }
        return out;
    }

    /**
     * Write back a snapshot produced by exportSnapshot(). Pairs that are not
     * two-element arrays, or whose key lacks the expected `d:` / `i:` prefix, are skipped.
     *
     * @param {*} snapshot - Object with `dialogs` and `indexes` arrays; other values are ignored.
     */
    importSnapshot(snapshot) {
        if (!snapshot || typeof snapshot !== 'object') return;
        const restore = (pairs, prefix) => {
            for (const item of Array.isArray(pairs) ? pairs : []) {
                if (!Array.isArray(item) || item.length !== 2) continue;
                const [key, value] = item;
                if (typeof key === 'string' && key.startsWith(prefix)) this.store.put(key, value);
            }
        };
        restore(snapshot.dialogs, 'd:');
        restore(snapshot.indexes, 'i:');
    }
}
|
|
866
|
+
|
|
400
867
|
class RobotsCorpus {
|
|
401
|
-
constructor({
|
|
868
|
+
constructor({
|
|
869
|
+
dir,
|
|
870
|
+
lemmaCsv,
|
|
871
|
+
lemmaAutoload = false,
|
|
872
|
+
lemmaMaxBytes,
|
|
873
|
+
lemmaForce = false,
|
|
874
|
+
chunkMinWords = 2,
|
|
875
|
+
chunkMaxWords = 5
|
|
876
|
+
} = {}) {
|
|
402
877
|
this.dir = dir;
|
|
403
878
|
this.lemmaCsv = lemmaCsv;
|
|
404
|
-
this.
|
|
879
|
+
this.lemmaAutoload = Boolean(lemmaAutoload);
|
|
880
|
+
this.lemmaForce = Boolean(lemmaForce);
|
|
881
|
+
this.lemmaMaxBytes = Number.isFinite(Number(lemmaMaxBytes)) && Number(lemmaMaxBytes) > 0
|
|
882
|
+
? Number(lemmaMaxBytes)
|
|
883
|
+
: 64 * 1024 * 1024;
|
|
884
|
+
this.chunkMinWords = Math.max(1, Number(chunkMinWords) || 2);
|
|
885
|
+
this.chunkMaxWords = Math.max(this.chunkMinWords, Number(chunkMaxWords) || 5);
|
|
886
|
+
this._lemmaLoaded = false;
|
|
887
|
+
this.lemmaMap = new Map();
|
|
405
888
|
this.maxArticleSize = 5_000_000;
|
|
406
889
|
this.minParagraphLength = 12;
|
|
407
890
|
}
|
|
408
891
|
|
|
892
|
+
_chunkTokens(tokens) {
|
|
893
|
+
const out = [];
|
|
894
|
+
if (!Array.isArray(tokens) || tokens.length === 0) {
|
|
895
|
+
return out;
|
|
896
|
+
}
|
|
897
|
+
const maxN = Math.max(1, this.chunkMaxWords);
|
|
898
|
+
const minN = Math.max(1, this.chunkMinWords);
|
|
899
|
+
for (let i = 0; i < tokens.length; i += maxN) {
|
|
900
|
+
const chunk = tokens.slice(i, i + maxN);
|
|
901
|
+
if (chunk.length >= minN) {
|
|
902
|
+
out.push(chunk);
|
|
903
|
+
}
|
|
904
|
+
}
|
|
905
|
+
return out;
|
|
906
|
+
}
|
|
907
|
+
|
|
908
|
+
_ensureLemmaMapLoaded() {
|
|
909
|
+
if (this._lemmaLoaded) {
|
|
910
|
+
return;
|
|
911
|
+
}
|
|
912
|
+
this._lemmaLoaded = true;
|
|
913
|
+
if (!this.lemmaAutoload && !this.lemmaForce) {
|
|
914
|
+
return;
|
|
915
|
+
}
|
|
916
|
+
this.lemmaMap = this._loadLemmaMap();
|
|
917
|
+
}
|
|
918
|
+
|
|
409
919
|
_loadLemmaMap() {
|
|
410
920
|
const map = new Map();
|
|
921
|
+
const csvParse = getCsvParse();
|
|
411
922
|
if (!csvParse || !this.lemmaCsv || !fs.existsSync(this.lemmaCsv)) {
|
|
412
923
|
return map;
|
|
413
924
|
}
|
|
414
925
|
try {
|
|
926
|
+
try {
|
|
927
|
+
const st = fs.statSync(this.lemmaCsv);
|
|
928
|
+
if (!this.lemmaForce && st && Number.isFinite(st.size) && st.size > this.lemmaMaxBytes) {
|
|
929
|
+
console.warn(
|
|
930
|
+
`[RobotsCorpus] lemma.csv too large (${Math.round(st.size / 1024 / 1024)}MB), skip autoload. ` +
|
|
931
|
+
`Set AI_LEMMA_FORCE=true or increase AI_LEMMA_MAX_MB to load.`
|
|
932
|
+
);
|
|
933
|
+
return map;
|
|
934
|
+
}
|
|
935
|
+
} catch (_e) {
|
|
936
|
+
// ignore stat failure
|
|
937
|
+
}
|
|
415
938
|
const csvContent = fs.readFileSync(this.lemmaCsv, 'utf8');
|
|
416
939
|
const rows = csvParse.parse(csvContent, { skip_empty_lines: true, relax_column_count: true });
|
|
417
940
|
for (const row of rows) {
|
|
@@ -437,6 +960,7 @@ class RobotsCorpus {
|
|
|
437
960
|
}
|
|
438
961
|
|
|
439
962
|
lemmatize(word) {
|
|
963
|
+
this._ensureLemmaMapLoaded();
|
|
440
964
|
const lower = String(word || '').toLowerCase();
|
|
441
965
|
return this.lemmaMap.get(lower) || lower;
|
|
442
966
|
}
|
|
@@ -466,11 +990,21 @@ class RobotsCorpus {
|
|
|
466
990
|
_readFile(file) {
|
|
467
991
|
const full = path.join(this.dir, file);
|
|
468
992
|
try {
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
993
|
+
// 关键优化:避免对超大语料文件做一次性 readFileSync(会把整个文件读进内存)
|
|
994
|
+
// 这里仅读取前 maxArticleSize 字节(近似等价于之前的 slice 行为)。
|
|
995
|
+
const fd = fs.openSync(full, 'r');
|
|
996
|
+
try {
|
|
997
|
+
const maxBytes = Math.max(1, Number(this.maxArticleSize) || 1);
|
|
998
|
+
const buf = Buffer.allocUnsafe(maxBytes);
|
|
999
|
+
const bytesRead = fs.readSync(fd, buf, 0, maxBytes, 0);
|
|
1000
|
+
return buf.toString('utf8', 0, Math.max(0, bytesRead || 0));
|
|
1001
|
+
} finally {
|
|
1002
|
+
try {
|
|
1003
|
+
fs.closeSync(fd);
|
|
1004
|
+
} catch (_e) {
|
|
1005
|
+
// ignore close failure
|
|
1006
|
+
}
|
|
472
1007
|
}
|
|
473
|
-
return content;
|
|
474
1008
|
} catch (err) {
|
|
475
1009
|
console.warn(`[RobotsCorpus] Failed to read ${full}:`, err.message);
|
|
476
1010
|
return '';
|
|
@@ -510,30 +1044,38 @@ class RobotsCorpus {
|
|
|
510
1044
|
for (const paragraph of paragraphs) {
|
|
511
1045
|
const trimmed = paragraph.trim();
|
|
512
1046
|
if (trimmed.length < this.minParagraphLength) {
|
|
513
|
-
localIndex += 1;
|
|
514
|
-
continue;
|
|
515
|
-
}
|
|
516
|
-
if (skip > 0) {
|
|
517
|
-
skip -= 1;
|
|
518
|
-
localIndex += 1;
|
|
519
1047
|
continue;
|
|
520
1048
|
}
|
|
521
|
-
|
|
522
|
-
|
|
1049
|
+
|
|
1050
|
+
// 新逻辑:按句子切分,再按 2-5 词(可配置)切块生成 doc
|
|
1051
|
+
// 这样每条 doc 更接近“短语/局部词共现”,利于词表/模因边的细粒度学习。
|
|
1052
|
+
// 注意:splitSentences() 已改为输出“2-10 词”的短片段;这里不再做二次切块。
|
|
1053
|
+
const units = splitSentences(trimmed);
|
|
1054
|
+
for (const unit of units) {
|
|
1055
|
+
const unitText = String(unit || '').trim();
|
|
1056
|
+
if (unitText.length < this.minParagraphLength) {
|
|
1057
|
+
continue;
|
|
1058
|
+
}
|
|
1059
|
+
const normalizedTokens = this.normalizeWords(tokenize(unitText));
|
|
1060
|
+
if (!normalizedTokens.length) {
|
|
1061
|
+
continue;
|
|
1062
|
+
}
|
|
1063
|
+
if (skip > 0) {
|
|
1064
|
+
skip -= 1;
|
|
1065
|
+
continue;
|
|
1066
|
+
}
|
|
1067
|
+
docs.push({
|
|
1068
|
+
id: `robots:${file}#${localIndex}`,
|
|
1069
|
+
file,
|
|
1070
|
+
source: `robots:${file}`,
|
|
1071
|
+
index: localIndex,
|
|
1072
|
+
text: unitText,
|
|
1073
|
+
tokens: normalizedTokens
|
|
1074
|
+
});
|
|
523
1075
|
localIndex += 1;
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
id: `robots:${file}#${localIndex}`,
|
|
528
|
-
file,
|
|
529
|
-
source: `robots:${file}`,
|
|
530
|
-
index: localIndex,
|
|
531
|
-
text: trimmed,
|
|
532
|
-
tokens
|
|
533
|
-
});
|
|
534
|
-
localIndex += 1;
|
|
535
|
-
if (maxDocs !== null && docs.length >= maxDocs) {
|
|
536
|
-
return shuffle && docs.length > 1 ? this._shuffle(docs) : docs;
|
|
1076
|
+
if (maxDocs !== null && docs.length >= maxDocs) {
|
|
1077
|
+
return shuffle && docs.length > 1 ? this._shuffle(docs) : docs;
|
|
1078
|
+
}
|
|
537
1079
|
}
|
|
538
1080
|
}
|
|
539
1081
|
}
|
|
@@ -546,9 +1088,41 @@ class RobotsCorpus {
|
|
|
546
1088
|
}
|
|
547
1089
|
|
|
548
1090
|
class KVMStore {
|
|
549
|
-
constructor(store) {
|
|
1091
|
+
constructor(store, { maxCacheEntries } = {}) {
|
|
550
1092
|
this.store = store;
|
|
551
1093
|
this.cache = new Map();
|
|
1094
|
+
this.maxCacheEntries = Number.isFinite(Number(maxCacheEntries)) && Number(maxCacheEntries) >= 0
|
|
1095
|
+
? Number(maxCacheEntries)
|
|
1096
|
+
: 50_000;
|
|
1097
|
+
}
|
|
1098
|
+
|
|
1099
|
+
_cacheGet(key) {
|
|
1100
|
+
if (!this.maxCacheEntries) {
|
|
1101
|
+
return null;
|
|
1102
|
+
}
|
|
1103
|
+
if (!this.cache.has(key)) {
|
|
1104
|
+
return null;
|
|
1105
|
+
}
|
|
1106
|
+
const value = this.cache.get(key);
|
|
1107
|
+
// LRU: bump to most-recent
|
|
1108
|
+
this.cache.delete(key);
|
|
1109
|
+
this.cache.set(key, value);
|
|
1110
|
+
return value;
|
|
1111
|
+
}
|
|
1112
|
+
|
|
1113
|
+
_cacheSet(key, value) {
|
|
1114
|
+
if (!this.maxCacheEntries) {
|
|
1115
|
+
return;
|
|
1116
|
+
}
|
|
1117
|
+
if (this.cache.has(key)) {
|
|
1118
|
+
this.cache.delete(key);
|
|
1119
|
+
}
|
|
1120
|
+
this.cache.set(key, value);
|
|
1121
|
+
while (this.cache.size > this.maxCacheEntries) {
|
|
1122
|
+
const oldest = this.cache.keys().next().value;
|
|
1123
|
+
if (oldest === undefined) break;
|
|
1124
|
+
this.cache.delete(oldest);
|
|
1125
|
+
}
|
|
552
1126
|
}
|
|
553
1127
|
|
|
554
1128
|
_key(type, value) {
|
|
@@ -557,23 +1131,21 @@ class KVMStore {
|
|
|
557
1131
|
|
|
558
1132
|
getWordMemeSet(word) {
|
|
559
1133
|
const key = this._key('word', word);
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
}
|
|
1134
|
+
const cached = this._cacheGet(key);
|
|
1135
|
+
if (cached) return cached;
|
|
563
1136
|
const value = this.store.get(key) || [];
|
|
564
1137
|
const set = new Set(value);
|
|
565
|
-
this.
|
|
1138
|
+
this._cacheSet(key, set);
|
|
566
1139
|
return set;
|
|
567
1140
|
}
|
|
568
1141
|
|
|
569
1142
|
getMemeWords(memeId) {
|
|
570
1143
|
const key = this._key('meme', memeId);
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
}
|
|
1144
|
+
const cached = this._cacheGet(key);
|
|
1145
|
+
if (cached) return cached;
|
|
574
1146
|
const value = this.store.get(key) || [];
|
|
575
1147
|
const set = new Set(value);
|
|
576
|
-
this.
|
|
1148
|
+
this._cacheSet(key, set);
|
|
577
1149
|
return set;
|
|
578
1150
|
}
|
|
579
1151
|
|
|
@@ -585,12 +1157,12 @@ class KVMStore {
|
|
|
585
1157
|
if (!wordSet.has(memeId)) {
|
|
586
1158
|
wordSet.add(memeId);
|
|
587
1159
|
this.store.put(wordKey, Array.from(wordSet));
|
|
588
|
-
this.
|
|
1160
|
+
this._cacheSet(wordKey, wordSet);
|
|
589
1161
|
}
|
|
590
1162
|
if (!memeSet.has(word)) {
|
|
591
1163
|
memeSet.add(word);
|
|
592
1164
|
this.store.put(memeKey, Array.from(memeSet));
|
|
593
|
-
this.
|
|
1165
|
+
this._cacheSet(memeKey, memeSet);
|
|
594
1166
|
}
|
|
595
1167
|
}
|
|
596
1168
|
|
|
@@ -626,36 +1198,80 @@ class CSRMatrix {
|
|
|
626
1198
|
}
|
|
627
1199
|
|
|
628
1200
|
class MemeGraph {
|
|
629
|
-
constructor(store) {
|
|
1201
|
+
constructor(store, { eagerLoad = false } = {}) {
|
|
630
1202
|
this.store = store;
|
|
631
1203
|
this.nodes = new Map();
|
|
632
1204
|
this.meta = new Map();
|
|
633
1205
|
this.windowSize = 4096;
|
|
634
|
-
this.
|
|
1206
|
+
this._fullyLoaded = false;
|
|
1207
|
+
if (eagerLoad) {
|
|
1208
|
+
this._loadAllFromStore();
|
|
1209
|
+
}
|
|
635
1210
|
}
|
|
636
1211
|
|
|
637
|
-
|
|
1212
|
+
_loadAllFromStore() {
|
|
1213
|
+
if (this._fullyLoaded) {
|
|
1214
|
+
return;
|
|
1215
|
+
}
|
|
638
1216
|
const entries = this.store.entries('node:');
|
|
639
1217
|
for (const [key, value] of entries) {
|
|
640
1218
|
const memeId = key.slice(5);
|
|
641
|
-
this.meta.
|
|
1219
|
+
if (!this.meta.has(memeId)) {
|
|
1220
|
+
this.meta.set(memeId, value || {});
|
|
1221
|
+
}
|
|
642
1222
|
}
|
|
643
1223
|
const rowEntries = this.store.entries('row:');
|
|
644
1224
|
for (const [key, row] of rowEntries) {
|
|
645
1225
|
const memeId = key.slice(4);
|
|
1226
|
+
if (this.nodes.has(memeId)) {
|
|
1227
|
+
continue;
|
|
1228
|
+
}
|
|
646
1229
|
if (!row || !Array.isArray(row.neighbors)) {
|
|
647
1230
|
continue;
|
|
648
1231
|
}
|
|
649
1232
|
const map = new Map();
|
|
650
1233
|
for (const { to, weight, direction } of row.neighbors) {
|
|
651
|
-
map.set(to, { weight, direction });
|
|
1234
|
+
map.set(String(to), { weight, direction });
|
|
652
1235
|
}
|
|
653
1236
|
this.nodes.set(memeId, map);
|
|
654
1237
|
}
|
|
1238
|
+
this._fullyLoaded = true;
|
|
1239
|
+
}
|
|
1240
|
+
|
|
1241
|
+
_ensureRowLoaded(memeId) {
|
|
1242
|
+
const id = String(memeId);
|
|
1243
|
+
if (this.nodes.has(id)) {
|
|
1244
|
+
return this.nodes.get(id);
|
|
1245
|
+
}
|
|
1246
|
+
const row = this.store.get(`row:${id}`);
|
|
1247
|
+
if (row && Array.isArray(row.neighbors)) {
|
|
1248
|
+
const map = new Map();
|
|
1249
|
+
for (const { to, weight, direction } of row.neighbors) {
|
|
1250
|
+
map.set(String(to), { weight, direction });
|
|
1251
|
+
}
|
|
1252
|
+
this.nodes.set(id, map);
|
|
1253
|
+
return map;
|
|
1254
|
+
}
|
|
1255
|
+
return null;
|
|
1256
|
+
}
|
|
1257
|
+
|
|
1258
|
+
_ensureMetaLoaded(memeId) {
|
|
1259
|
+
const id = String(memeId);
|
|
1260
|
+
if (this.meta.has(id)) {
|
|
1261
|
+
return this.meta.get(id);
|
|
1262
|
+
}
|
|
1263
|
+
const meta = this.store.get(`node:${id}`);
|
|
1264
|
+
if (meta && typeof meta === 'object') {
|
|
1265
|
+
this.meta.set(id, meta);
|
|
1266
|
+
return meta;
|
|
1267
|
+
}
|
|
1268
|
+
return null;
|
|
655
1269
|
}
|
|
656
1270
|
|
|
657
1271
|
// 返回当前图中所有模因节点的ID列表,供扫描器等模块使用
|
|
658
1272
|
getAllPoints() {
|
|
1273
|
+
// 需要全量枚举时才进行全量加载,避免启动时扫描整个 store。
|
|
1274
|
+
this._loadAllFromStore();
|
|
659
1275
|
return Array.from(this.meta.keys());
|
|
660
1276
|
}
|
|
661
1277
|
|
|
@@ -668,11 +1284,26 @@ class MemeGraph {
|
|
|
668
1284
|
}
|
|
669
1285
|
|
|
670
1286
|
ensureNode(memeId) {
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
|
|
1287
|
+
const id = String(memeId);
|
|
1288
|
+
if (!this.nodes.has(id)) {
|
|
1289
|
+
const loaded = this._ensureRowLoaded(id);
|
|
1290
|
+
if (!loaded) {
|
|
1291
|
+
this.nodes.set(id, new Map());
|
|
1292
|
+
}
|
|
1293
|
+
}
|
|
1294
|
+
if (!this.meta.has(id)) {
|
|
1295
|
+
const loadedMeta = this._ensureMetaLoaded(id);
|
|
1296
|
+
if (!loadedMeta) {
|
|
1297
|
+
this.meta.set(id, { degree: 0, lastTouched: Date.now() });
|
|
1298
|
+
this.store.put(`node:${id}`, this.meta.get(id));
|
|
1299
|
+
}
|
|
1300
|
+
}
|
|
1301
|
+
if (!this.nodes.get(id)) {
|
|
1302
|
+
this.nodes.set(id, new Map());
|
|
1303
|
+
}
|
|
1304
|
+
// 若节点此前不存在于 store,确保 row 持久化
|
|
1305
|
+
if (!this.store.get(`row:${id}`)) {
|
|
1306
|
+
this._persistNode(id);
|
|
676
1307
|
}
|
|
677
1308
|
}
|
|
678
1309
|
|
|
@@ -725,7 +1356,7 @@ class MemeGraph {
|
|
|
725
1356
|
while (border.length && visited.size < this.windowSize && depth < radius) {
|
|
726
1357
|
const next = [];
|
|
727
1358
|
for (const id of border) {
|
|
728
|
-
const table = this.nodes.get(id);
|
|
1359
|
+
const table = this.nodes.get(id) || this._ensureRowLoaded(id);
|
|
729
1360
|
if (!table) {
|
|
730
1361
|
continue;
|
|
731
1362
|
}
|
|
@@ -746,7 +1377,7 @@ class MemeGraph {
|
|
|
746
1377
|
const weights = [];
|
|
747
1378
|
for (let i = 0; i < ids.length; i++) {
|
|
748
1379
|
const id = ids[i];
|
|
749
|
-
const table = this.nodes.get(id) || new Map();
|
|
1380
|
+
const table = this.nodes.get(id) || this._ensureRowLoaded(id) || new Map();
|
|
750
1381
|
rowPtr[i] = edges.length;
|
|
751
1382
|
for (const [toId, { weight }] of table.entries()) {
|
|
752
1383
|
if (!index.has(toId)) {
|
|
@@ -768,6 +1399,8 @@ class MemeGraph {
|
|
|
768
1399
|
}
|
|
769
1400
|
|
|
770
1401
|
exportSnapshot() {
|
|
1402
|
+
// 导出需要全量内容;若尚未加载则在此时执行全量加载。
|
|
1403
|
+
this._loadAllFromStore();
|
|
771
1404
|
const nodes = [];
|
|
772
1405
|
for (const [id, table] of this.nodes.entries()) {
|
|
773
1406
|
nodes.push({
|
|
@@ -796,6 +1429,8 @@ class MemeGraph {
|
|
|
796
1429
|
}
|
|
797
1430
|
|
|
798
1431
|
removeNode(memeId) {
|
|
1432
|
+
// 删除需要一致性:先全量加载再删,避免遗漏未加载节点中的反向边。
|
|
1433
|
+
this._loadAllFromStore();
|
|
799
1434
|
if (!this.nodes.has(memeId)) {
|
|
800
1435
|
return false;
|
|
801
1436
|
}
|
|
@@ -1071,6 +1706,7 @@ class DimReducer {
|
|
|
1071
1706
|
}
|
|
1072
1707
|
|
|
1073
1708
|
project2D(emb, method = 'auto') {
|
|
1709
|
+
const umap = getUmap();
|
|
1074
1710
|
if (method === 'umap' && umap) {
|
|
1075
1711
|
const dataset = [];
|
|
1076
1712
|
for (let row = 0; row < emb.nRows; row++) {
|
|
@@ -1182,7 +1818,7 @@ class OnlineResearcher {
|
|
|
1182
1818
|
maxPdfBytes: clampInt(crawlReq?.maxPdfBytes, 64 * 1024, 40 * 1024 * 1024, 20 * 1024 * 1024),
|
|
1183
1819
|
userAgent: typeof crawlReq?.userAgent === 'string' && crawlReq.userAgent.trim() ? crawlReq.userAgent.trim() : '079ProjectCrawler/1.0'
|
|
1184
1820
|
};
|
|
1185
|
-
const crawler = new SiteCrawler({ axios, cheerio, pdfParse });
|
|
1821
|
+
const crawler = new SiteCrawler({ axios: getAxios(), cheerio: getCheerio(), pdfParse: getPdfParse() });
|
|
1186
1822
|
const result = await crawler.crawl(startUrl, crawlOptions);
|
|
1187
1823
|
if (!options?.forceRemote) {
|
|
1188
1824
|
const key = this._normalize(tokenize(`crawl ${startUrl}`));
|
|
@@ -1204,6 +1840,7 @@ class OnlineResearcher {
|
|
|
1204
1840
|
}
|
|
1205
1841
|
|
|
1206
1842
|
let payload = null;
|
|
1843
|
+
const axios = getAxios();
|
|
1207
1844
|
if (axios && this.endpoint && !options.skipRemote) {
|
|
1208
1845
|
try {
|
|
1209
1846
|
const resp = await axios.get(this.endpoint, {
|
|
@@ -1502,14 +2139,6 @@ class SessionManager {
|
|
|
1502
2139
|
this.idleMs = idleMs;
|
|
1503
2140
|
this.maxSessions = maxSessions;
|
|
1504
2141
|
this.active = new Map();
|
|
1505
|
-
this._load();
|
|
1506
|
-
}
|
|
1507
|
-
|
|
1508
|
-
_load() {
|
|
1509
|
-
const entries = this.store.entries('session:');
|
|
1510
|
-
for (const [key, value] of entries) {
|
|
1511
|
-
this.active.set(key.slice(8), value);
|
|
1512
|
-
}
|
|
1513
2142
|
}
|
|
1514
2143
|
|
|
1515
2144
|
_save(sessionId) {
|
|
@@ -1524,12 +2153,28 @@ class SessionManager {
|
|
|
1524
2153
|
}
|
|
1525
2154
|
|
|
1526
2155
|
ensure(sessionId) {
|
|
1527
|
-
if (sessionId
|
|
1528
|
-
const
|
|
1529
|
-
|
|
1530
|
-
|
|
1531
|
-
|
|
1532
|
-
|
|
2156
|
+
if (sessionId) {
|
|
2157
|
+
const sid = String(sessionId);
|
|
2158
|
+
if (this.active.has(sid)) {
|
|
2159
|
+
const data = this.active.get(sid);
|
|
2160
|
+
data.lastActivity = Date.now();
|
|
2161
|
+
data.count = (data.count || 0) + 1;
|
|
2162
|
+
this._save(sid);
|
|
2163
|
+
return sid;
|
|
2164
|
+
}
|
|
2165
|
+
// 懒加载:仅在客户端携带 sessionId 时才从 store 读取
|
|
2166
|
+
const stored = this.store.get(`session:${sid}`);
|
|
2167
|
+
if (stored && typeof stored === 'object') {
|
|
2168
|
+
const data = { ...stored, id: stored.id || sid };
|
|
2169
|
+
data.lastActivity = Date.now();
|
|
2170
|
+
data.count = (data.count || 0) + 1;
|
|
2171
|
+
this.active.set(sid, data);
|
|
2172
|
+
this._save(sid);
|
|
2173
|
+
if (this.active.size > this.maxSessions) {
|
|
2174
|
+
this._truncate();
|
|
2175
|
+
}
|
|
2176
|
+
return sid;
|
|
2177
|
+
}
|
|
1533
2178
|
}
|
|
1534
2179
|
const id = this._newId();
|
|
1535
2180
|
this.active.set(id, { id, createdAt: Date.now(), lastActivity: Date.now(), count: 1, meta: {} });
|
|
@@ -1591,7 +2236,7 @@ class SnapshotManager {
|
|
|
1591
2236
|
}
|
|
1592
2237
|
const raw = JSON.parse(fs.readFileSync(file, 'utf8'));
|
|
1593
2238
|
await this.runtime.fromSnapshot(raw);
|
|
1594
|
-
return
|
|
2239
|
+
return raw;
|
|
1595
2240
|
}
|
|
1596
2241
|
|
|
1597
2242
|
delete(id) {
|
|
@@ -1647,16 +2292,28 @@ class GraphExportBuilder {
|
|
|
1647
2292
|
|
|
1648
2293
|
class RuntimeState {
|
|
1649
2294
|
constructor({ kvmStore, memeStore, sessionStore, params, config }) {
|
|
1650
|
-
this.
|
|
2295
|
+
this.config = { ...(config || {}) };
|
|
2296
|
+
this.config.robotsDir = this.config.robotsDir || path.join(__dirname, 'robots');
|
|
2297
|
+
this.config.lemmaCsv = this.config.lemmaCsv || path.join(__dirname, 'lemma.csv');
|
|
2298
|
+
this.config.lemmaAutoload = this.config.lemmaAutoload ?? false;
|
|
2299
|
+
this.config.lemmaMaxBytes = this.config.lemmaMaxBytes ?? (64 * 1024 * 1024);
|
|
2300
|
+
this.config.lemmaForce = this.config.lemmaForce ?? false;
|
|
2301
|
+
this.config.kvmCacheMaxEntries = this.config.kvmCacheMaxEntries ?? 50_000;
|
|
2302
|
+
this.config.robotsChunkMinWords = this.config.robotsChunkMinWords ?? 2;
|
|
2303
|
+
this.config.robotsChunkMaxWords = this.config.robotsChunkMaxWords ?? 5;
|
|
2304
|
+
|
|
2305
|
+
this.kvm = new KVMStore(kvmStore, { maxCacheEntries: this.config.kvmCacheMaxEntries });
|
|
1651
2306
|
this.graph = new MemeGraph(memeStore);
|
|
1652
2307
|
this.sessions = new SessionManager(sessionStore);
|
|
2308
|
+
// “反接”层:从模因层回到用户可读表达(短语/句子)
|
|
2309
|
+
this.surfaceStore = new NamespacedStore(sessionStore, 'surface');
|
|
2310
|
+
this.dialogStore = new NamespacedStore(sessionStore, 'dialog');
|
|
2311
|
+
this.surfaceLexicon = new MemeSurfaceLexicon(this.surfaceStore);
|
|
2312
|
+
this.dialogMemory = new DialogMemory(this.dialogStore);
|
|
1653
2313
|
this.tensor = new TensorEngine();
|
|
1654
2314
|
this.pattern = new PatternMatrix(this);
|
|
1655
2315
|
this.params = { ...modelDefaults, ...(params || {}) };
|
|
1656
2316
|
this.metrics = { requests: 0, lastLatency: 0, updatedAt: Date.now() };
|
|
1657
|
-
this.config = { ...(config || {}) };
|
|
1658
|
-
this.config.robotsDir = this.config.robotsDir || path.join(__dirname, 'robots');
|
|
1659
|
-
this.config.lemmaCsv = this.config.lemmaCsv || path.join(__dirname, 'lemma.csv');
|
|
1660
2317
|
// 在线搜索配置:支持运行时开关与 endpoint 库
|
|
1661
2318
|
this.config.search = {
|
|
1662
2319
|
enabled: this.config.search?.enabled ?? true,
|
|
@@ -1740,8 +2397,14 @@ class RuntimeState {
|
|
|
1740
2397
|
}
|
|
1741
2398
|
|
|
1742
2399
|
mapWordsToMemes(words) {
|
|
2400
|
+
// 细化:输入不仅映射到“单词模因”,还会生成/融合“多词模因(短语模因)”。
|
|
2401
|
+
const tokens = Array.isArray(words) ? words.map((w) => String(w || '').trim()).filter(Boolean) : [];
|
|
1743
2402
|
const memeStrength = new Map();
|
|
1744
|
-
|
|
2403
|
+
const maxUnits = 128;
|
|
2404
|
+
let units = 0;
|
|
2405
|
+
|
|
2406
|
+
// 1) unigram:保持兼容
|
|
2407
|
+
for (const word of tokens) {
|
|
1745
2408
|
const memes = this.kvm.getWordMemeSet(word);
|
|
1746
2409
|
if (!memes || memes.size === 0) {
|
|
1747
2410
|
const memeId = `meme_${hashString(word)}`;
|
|
@@ -1754,9 +2417,143 @@ class RuntimeState {
|
|
|
1754
2417
|
memeStrength.set(memeId, (memeStrength.get(memeId) ?? 0) + 1);
|
|
1755
2418
|
}
|
|
1756
2419
|
}
|
|
2420
|
+
|
|
2421
|
+
// 2) phrase/word-structure meme:以 ngram 作为概念单元,模因仍为“多个词的集合”
|
|
2422
|
+
const nMin = Math.max(2, Number(this.params.memeNgramMin ?? 2) || 2);
|
|
2423
|
+
const nMax = Math.max(nMin, Number(this.params.memeNgramMax ?? 4) || 4);
|
|
2424
|
+
const minOverlap = Math.max(1, Number(this.params.minOverlapThreshold ?? 2) || 2);
|
|
2425
|
+
const maxWordSet = Math.max(4, Number(this.params.maxMemeWords ?? 100) || 100);
|
|
2426
|
+
|
|
2427
|
+
const resolveOrCreateMemeForTokenSet = (tokenSet) => {
|
|
2428
|
+
const uniq = Array.from(new Set(tokenSet.map((x) => String(x || '').trim()).filter(Boolean))).slice(0, maxWordSet);
|
|
2429
|
+
if (uniq.length <= 1) {
|
|
2430
|
+
const w = uniq[0];
|
|
2431
|
+
return w ? `meme_${hashString(w)}` : null;
|
|
2432
|
+
}
|
|
2433
|
+
// 统计每个候选 meme 与 tokenSet 的重合词数
|
|
2434
|
+
const counts = new Map();
|
|
2435
|
+
for (const w of uniq) {
|
|
2436
|
+
const memes = this.kvm.getWordMemeSet(w);
|
|
2437
|
+
if (!memes || memes.size === 0) {
|
|
2438
|
+
continue;
|
|
2439
|
+
}
|
|
2440
|
+
for (const mid of memes) {
|
|
2441
|
+
counts.set(mid, (counts.get(mid) ?? 0) + 1);
|
|
2442
|
+
}
|
|
2443
|
+
}
|
|
2444
|
+
let best = null;
|
|
2445
|
+
let bestOverlap = 0;
|
|
2446
|
+
for (const [mid, c] of counts.entries()) {
|
|
2447
|
+
if (c > bestOverlap) {
|
|
2448
|
+
bestOverlap = c;
|
|
2449
|
+
best = mid;
|
|
2450
|
+
}
|
|
2451
|
+
}
|
|
2452
|
+
if (best && bestOverlap >= minOverlap) {
|
|
2453
|
+
// “融合”:把当前 tokenSet 的词也挂到 best meme 上
|
|
2454
|
+
for (const w of uniq) {
|
|
2455
|
+
this.kvm.link(w, best);
|
|
2456
|
+
}
|
|
2457
|
+
this.graph.ensureNode(best);
|
|
2458
|
+
return best;
|
|
2459
|
+
}
|
|
2460
|
+
// 新建短语模因:ID 由“词集合”决定,保证稳定
|
|
2461
|
+
const sorted = uniq.slice().sort();
|
|
2462
|
+
const memeId = `meme_p_${hashString(sorted.join('|'))}`;
|
|
2463
|
+
this.graph.ensureNode(memeId);
|
|
2464
|
+
for (const w of sorted) {
|
|
2465
|
+
this.kvm.link(w, memeId);
|
|
2466
|
+
}
|
|
2467
|
+
return memeId;
|
|
2468
|
+
};
|
|
2469
|
+
|
|
2470
|
+
// 生成 ngram 单元并映射到 meme,权重按长度提升
|
|
2471
|
+
for (let i = 0; i < tokens.length && units < maxUnits; i++) {
|
|
2472
|
+
for (let n = nMin; n <= nMax && units < maxUnits; n++) {
|
|
2473
|
+
if (i + n > tokens.length) {
|
|
2474
|
+
break;
|
|
2475
|
+
}
|
|
2476
|
+
const gram = tokens.slice(i, i + n);
|
|
2477
|
+
const memeId = resolveOrCreateMemeForTokenSet(gram);
|
|
2478
|
+
if (!memeId) {
|
|
2479
|
+
continue;
|
|
2480
|
+
}
|
|
2481
|
+
const w = 1 + 0.5 * (n - 1);
|
|
2482
|
+
memeStrength.set(memeId, (memeStrength.get(memeId) ?? 0) + w);
|
|
2483
|
+
units += 1;
|
|
2484
|
+
}
|
|
2485
|
+
}
|
|
2486
|
+
|
|
1757
2487
|
return memeStrength;
|
|
1758
2488
|
}
|
|
1759
2489
|
|
|
2490
|
+
_buildMemeSequenceFromTokens(tokens) {
|
|
2491
|
+
const list = Array.isArray(tokens) ? tokens.map((t) => String(t || '').trim()).filter(Boolean) : [];
|
|
2492
|
+
const nMin = Math.max(2, Number(this.params.memeNgramMin ?? 2) || 2);
|
|
2493
|
+
const nMax = Math.max(nMin, Number(this.params.memeNgramMax ?? 4) || 4);
|
|
2494
|
+
const minOverlap = Math.max(1, Number(this.params.minOverlapThreshold ?? 2) || 2);
|
|
2495
|
+
const maxWordSet = Math.max(4, Number(this.params.maxMemeWords ?? 100) || 100);
|
|
2496
|
+
|
|
2497
|
+
const resolveOrCreate = (tokenSet) => {
|
|
2498
|
+
const uniq = Array.from(new Set(tokenSet.map((x) => String(x || '').trim()).filter(Boolean))).slice(0, maxWordSet);
|
|
2499
|
+
if (uniq.length <= 1) {
|
|
2500
|
+
const w = uniq[0];
|
|
2501
|
+
return w ? `meme_${hashString(w)}` : null;
|
|
2502
|
+
}
|
|
2503
|
+
const counts = new Map();
|
|
2504
|
+
for (const w of uniq) {
|
|
2505
|
+
const memes = this.kvm.getWordMemeSet(w);
|
|
2506
|
+
if (!memes || memes.size === 0) continue;
|
|
2507
|
+
for (const mid of memes) counts.set(mid, (counts.get(mid) ?? 0) + 1);
|
|
2508
|
+
}
|
|
2509
|
+
let best = null;
|
|
2510
|
+
let bestOverlap = 0;
|
|
2511
|
+
for (const [mid, c] of counts.entries()) {
|
|
2512
|
+
if (c > bestOverlap) {
|
|
2513
|
+
bestOverlap = c;
|
|
2514
|
+
best = mid;
|
|
2515
|
+
}
|
|
2516
|
+
}
|
|
2517
|
+
if (best && bestOverlap >= minOverlap) {
|
|
2518
|
+
for (const w of uniq) this.kvm.link(w, best);
|
|
2519
|
+
this.graph.ensureNode(best);
|
|
2520
|
+
return best;
|
|
2521
|
+
}
|
|
2522
|
+
const sorted = uniq.slice().sort();
|
|
2523
|
+
const memeId = `meme_p_${hashString(sorted.join('|'))}`;
|
|
2524
|
+
this.graph.ensureNode(memeId);
|
|
2525
|
+
for (const w of sorted) this.kvm.link(w, memeId);
|
|
2526
|
+
return memeId;
|
|
2527
|
+
};
|
|
2528
|
+
|
|
2529
|
+
const seq = [];
|
|
2530
|
+
for (let i = 0; i < list.length; i++) {
|
|
2531
|
+
// 用更长的 ngram 优先,减少“句子级”颗粒
|
|
2532
|
+
let picked = null;
|
|
2533
|
+
for (let n = nMax; n >= nMin; n--) {
|
|
2534
|
+
if (i + n > list.length) continue;
|
|
2535
|
+
picked = resolveOrCreate(list.slice(i, i + n));
|
|
2536
|
+
if (picked) {
|
|
2537
|
+
break;
|
|
2538
|
+
}
|
|
2539
|
+
}
|
|
2540
|
+
if (!picked) {
|
|
2541
|
+
const w = list[i];
|
|
2542
|
+
picked = w ? `meme_${hashString(w)}` : null;
|
|
2543
|
+
if (picked) {
|
|
2544
|
+
this.graph.ensureNode(picked);
|
|
2545
|
+
this.kvm.link(w, picked);
|
|
2546
|
+
}
|
|
2547
|
+
}
|
|
2548
|
+
if (picked) {
|
|
2549
|
+
if (seq.length === 0 || seq[seq.length - 1] !== picked) {
|
|
2550
|
+
seq.push(picked);
|
|
2551
|
+
}
|
|
2552
|
+
}
|
|
2553
|
+
}
|
|
2554
|
+
return seq;
|
|
2555
|
+
}
|
|
2556
|
+
|
|
1760
2557
|
_buildSeedVector(windowInfo, seeds) {
|
|
1761
2558
|
const vec = new Float32Array(windowInfo.ids.length);
|
|
1762
2559
|
for (const [memeId, strength] of seeds.entries()) {
|
|
@@ -1768,16 +2565,101 @@ class RuntimeState {
|
|
|
1768
2565
|
return vec;
|
|
1769
2566
|
}
|
|
1770
2567
|
|
|
1771
|
-
runPropagation(seeds) {
|
|
1772
|
-
const
|
|
2568
|
+
runPropagation(seeds, options = {}) {
|
|
2569
|
+
const radiusRaw = options.radius ?? options.windowRadius;
|
|
2570
|
+
const radius = Math.max(1, Math.min(6, Number(radiusRaw ?? 2) || 2));
|
|
2571
|
+
const windowInfo = this.graph.buildWindow(Array.from(seeds.keys()), radius);
|
|
1773
2572
|
const seedVector = this._buildSeedVector(windowInfo, seeds);
|
|
1774
2573
|
const act = this._activation();
|
|
1775
|
-
const
|
|
2574
|
+
const iteration = Math.max(1, Number(options.iteration ?? this.params.iteration ?? 5) || 5);
|
|
2575
|
+
const output = this.tensor.iteratePropagation(windowInfo.csr, seedVector, iteration, act, this.params.decayK, 0.02);
|
|
1776
2576
|
this.pattern.rebuild(windowInfo);
|
|
1777
2577
|
return { windowInfo, seedVector, activation: output };
|
|
1778
2578
|
}
|
|
1779
2579
|
|
|
2580
|
+
_pickTopActivatedMemes(result, seeds, { limit = 18, minScore = 1e-6 } = {}) {
|
|
2581
|
+
const { windowInfo, activation } = result || {};
|
|
2582
|
+
if (!windowInfo || !Array.isArray(windowInfo.ids) || !activation) return [];
|
|
2583
|
+
const seedIds = new Set(seeds ? Array.from(seeds.keys()) : []);
|
|
2584
|
+
|
|
2585
|
+
const isConnectedToSeeds = (memeId) => {
|
|
2586
|
+
if (seedIds.size === 0 || seedIds.has(memeId)) {
|
|
2587
|
+
return true;
|
|
2588
|
+
}
|
|
2589
|
+
const table = this.graph.nodes.get(memeId);
|
|
2590
|
+
if (table) {
|
|
2591
|
+
for (const neighborId of table.keys()) {
|
|
2592
|
+
if (seedIds.has(neighborId)) {
|
|
2593
|
+
return true;
|
|
2594
|
+
}
|
|
2595
|
+
}
|
|
2596
|
+
}
|
|
2597
|
+
for (const seedId of seedIds) {
|
|
2598
|
+
const seedTable = this.graph.nodes.get(seedId);
|
|
2599
|
+
if (seedTable && seedTable.has(memeId)) {
|
|
2600
|
+
return true;
|
|
2601
|
+
}
|
|
2602
|
+
}
|
|
2603
|
+
return false;
|
|
2604
|
+
};
|
|
2605
|
+
|
|
2606
|
+
const scored = [];
|
|
2607
|
+
for (let i = 0; i < windowInfo.ids.length; i++) {
|
|
2608
|
+
const memeId = windowInfo.ids[i];
|
|
2609
|
+
const score = activation[i];
|
|
2610
|
+
if (!Number.isFinite(score) || score <= minScore) continue;
|
|
2611
|
+
if (!isConnectedToSeeds(memeId)) continue;
|
|
2612
|
+
scored.push({ memeId, score });
|
|
2613
|
+
}
|
|
2614
|
+
scored.sort((a, b) => b.score - a.score);
|
|
2615
|
+
return scored.slice(0, Math.max(1, Number(limit) || 18));
|
|
2616
|
+
}
|
|
2617
|
+
|
|
2618
|
+
_makeSignatureFromTopMemes(topMemes, { limit = 12 } = {}) {
|
|
2619
|
+
const ids = (Array.isArray(topMemes) ? topMemes : [])
|
|
2620
|
+
.map((x) => (typeof x === 'string' ? x : x?.memeId))
|
|
2621
|
+
.filter(Boolean)
|
|
2622
|
+
.slice(0, Math.max(3, Number(limit) || 12));
|
|
2623
|
+
// signature 用 memeId 列表,排序保证稳定
|
|
2624
|
+
const uniq = Array.from(new Set(ids));
|
|
2625
|
+
uniq.sort();
|
|
2626
|
+
return uniq.join('|');
|
|
2627
|
+
}
|
|
2628
|
+
|
|
1780
2629
|
composeReply(result, words, seeds) {
|
|
2630
|
+
const topMemes = this._pickTopActivatedMemes(result, seeds, { limit: 18 });
|
|
2631
|
+
const signature = this._makeSignatureFromTopMemes(topMemes, { limit: 12 });
|
|
2632
|
+
|
|
2633
|
+
// 1) 优先:对话记忆检索(更像“在训练集中找答案”)
|
|
2634
|
+
const memoryHit = this.dialogMemory.retrieve({
|
|
2635
|
+
memes: topMemes.map((x) => x.memeId),
|
|
2636
|
+
signature,
|
|
2637
|
+
minSim: 0.45
|
|
2638
|
+
});
|
|
2639
|
+
if (memoryHit && typeof memoryHit.reply === 'string' && memoryHit.reply.trim()) {
|
|
2640
|
+
return memoryHit.reply.trim();
|
|
2641
|
+
}
|
|
2642
|
+
|
|
2643
|
+
// 2) 其次:模因 -> 表层表达(短语/句子)反接
|
|
2644
|
+
const phraseScores = new Map();
|
|
2645
|
+
for (const item of topMemes) {
|
|
2646
|
+
const list = this.surfaceLexicon.getTop(item.memeId, { limit: 4 });
|
|
2647
|
+
for (const c of list) {
|
|
2648
|
+
const p = String(c.phrase || '').trim();
|
|
2649
|
+
if (!p) continue;
|
|
2650
|
+
const prev = phraseScores.get(p) ?? 0;
|
|
2651
|
+
// meme 激活分数做门控,词典分数做权重
|
|
2652
|
+
phraseScores.set(p, prev + (Math.max(0, item.score) * (0.5 + Math.max(0, c.score))));
|
|
2653
|
+
}
|
|
2654
|
+
}
|
|
2655
|
+
const phraseOrdered = Array.from(phraseScores.entries())
|
|
2656
|
+
.sort((a, b) => b[1] - a[1])
|
|
2657
|
+
.map(([p]) => p);
|
|
2658
|
+
if (phraseOrdered.length) {
|
|
2659
|
+
return phraseOrdered.slice(0, 2).join('。');
|
|
2660
|
+
}
|
|
2661
|
+
|
|
2662
|
+
// 3) 回退:旧逻辑(meme -> words)
|
|
1781
2663
|
const { windowInfo, activation } = result;
|
|
1782
2664
|
const seedIds = new Set(seeds ? Array.from(seeds.keys()) : []);
|
|
1783
2665
|
const baseWords = Array.from(new Set((words || []).map((w) => String(w).trim()).filter(Boolean)));
|
|
@@ -1857,6 +2739,48 @@ class RuntimeState {
|
|
|
1857
2739
|
return finalWords.slice(0, 30).join(' ');
|
|
1858
2740
|
}
|
|
1859
2741
|
|
|
2742
|
+
learnFromDialog({ payload, result } = {}) {
|
|
2743
|
+
try {
|
|
2744
|
+
const question = typeof payload?.text === 'string'
|
|
2745
|
+
? payload.text
|
|
2746
|
+
: (Array.isArray(payload?.tokens) ? payload.tokens.join(' ') : '');
|
|
2747
|
+
const reply = typeof result?.reply === 'string' ? result.reply : '';
|
|
2748
|
+
if (!question.trim() || !reply.trim()) {
|
|
2749
|
+
return { ok: false, reason: 'missing-text' };
|
|
2750
|
+
}
|
|
2751
|
+
|
|
2752
|
+
// 以当前推理结果为主:topMemes -> signature
|
|
2753
|
+
const seeds = Array.isArray(result?.seeds)
|
|
2754
|
+
? new Map(result.seeds.map((pair) => [pair[0], pair[1]]))
|
|
2755
|
+
: this.mapWordsToMemes(tokenize(question));
|
|
2756
|
+
const windowInfo = Array.isArray(result?.memes) ? { ids: result.memes } : null;
|
|
2757
|
+
const activation = Array.isArray(result?.activation) ? Float32Array.from(result.activation) : null;
|
|
2758
|
+
const resObj = (windowInfo && activation) ? { windowInfo, activation } : this.runPropagation(seeds);
|
|
2759
|
+
|
|
2760
|
+
const topMemes = this._pickTopActivatedMemes(resObj, seeds, { limit: 18 });
|
|
2761
|
+
const signature = this._makeSignatureFromTopMemes(topMemes, { limit: 12 });
|
|
2762
|
+
const memeIds = topMemes.map((x) => x.memeId);
|
|
2763
|
+
|
|
2764
|
+
// 学习:每个高激活 meme 绑定 reply 的表层表达
|
|
2765
|
+
for (const item of topMemes.slice(0, 10)) {
|
|
2766
|
+
const w = Math.max(0.5, Math.min(3, item.score));
|
|
2767
|
+
this.surfaceLexicon.learn(item.memeId, reply, { weight: w });
|
|
2768
|
+
}
|
|
2769
|
+
|
|
2770
|
+
// 学习:签名级别的“对话记忆”(检索更直接)
|
|
2771
|
+
this.dialogMemory.remember({
|
|
2772
|
+
signature,
|
|
2773
|
+
memes: memeIds,
|
|
2774
|
+
question,
|
|
2775
|
+
reply,
|
|
2776
|
+
scoreHint: topMemes[0]?.score ?? 0
|
|
2777
|
+
});
|
|
2778
|
+
return { ok: true, memes: memeIds.length, signatureLen: signature ? signature.split('|').length : 0 };
|
|
2779
|
+
} catch (err) {
|
|
2780
|
+
return { ok: false, error: err.message };
|
|
2781
|
+
}
|
|
2782
|
+
}
|
|
2783
|
+
|
|
1860
2784
|
processInput(payload) {
|
|
1861
2785
|
const started = Date.now();
|
|
1862
2786
|
const text = payload.text != null ? payload.text : (payload.message != null ? String(payload.message) : '');
|
|
@@ -1866,8 +2790,44 @@ class RuntimeState {
|
|
|
1866
2790
|
: Array.isArray(payload.vocab) && payload.vocab.length ? payload.vocab
|
|
1867
2791
|
: null;
|
|
1868
2792
|
const words = tokensFromPayload ? tokensFromPayload.map((w) => String(w)) : tokenize(text);
|
|
1869
|
-
|
|
1870
|
-
const
|
|
2793
|
+
|
|
2794
|
+
const budget = payload && typeof payload === 'object' ? payload.budget : null;
|
|
2795
|
+
const depth = Math.max(1, Number(budget?.mappingDepth ?? budget?.depth ?? this.params.mappingDepth ?? 1) || 1);
|
|
2796
|
+
const topMemesK = Math.max(3, Number(budget?.reflectionTopMemes ?? budget?.topMemes ?? this.params.reflectionTopMemes ?? 18) || 18);
|
|
2797
|
+
const topWordsK = Math.max(3, Number(budget?.reflectionTopWords ?? budget?.topWords ?? this.params.reflectionTopWords ?? 24) || 24);
|
|
2798
|
+
const minScoreRaw = budget?.reflectionMinScore ?? budget?.minScore ?? this.params.reflectionMinScore;
|
|
2799
|
+
const minScore = Number.isFinite(Number(minScoreRaw)) ? Number(minScoreRaw) : 1e-6;
|
|
2800
|
+
const iterRaw = budget?.iteration;
|
|
2801
|
+
const iteration = Math.max(1, Number(iterRaw ?? this.params.iteration ?? 5) || 5);
|
|
2802
|
+
const radiusRaw = budget?.radius ?? budget?.windowRadius;
|
|
2803
|
+
const radius = Math.max(1, Math.min(6, Number(radiusRaw ?? 2) || 2));
|
|
2804
|
+
|
|
2805
|
+
let seeds = this.mapWordsToMemes(words);
|
|
2806
|
+
let result = null;
|
|
2807
|
+
if (depth > 1) {
|
|
2808
|
+
for (let hop = 1; hop < depth; hop++) {
|
|
2809
|
+
result = this.runPropagation(seeds, { iteration, radius });
|
|
2810
|
+
const topMemes = this._pickTopActivatedMemes(result, seeds, { limit: topMemesK, minScore });
|
|
2811
|
+
const wordScore = new Map();
|
|
2812
|
+
for (const m of topMemes) {
|
|
2813
|
+
const linked = this.kvm.getMemeWords(m.memeId);
|
|
2814
|
+
if (!linked) continue;
|
|
2815
|
+
for (const w of linked) {
|
|
2816
|
+
const ww = String(w || '').trim();
|
|
2817
|
+
if (!ww) continue;
|
|
2818
|
+
const prev = wordScore.get(ww) ?? 0;
|
|
2819
|
+
wordScore.set(ww, Math.max(prev, Number(m.score) || 0));
|
|
2820
|
+
}
|
|
2821
|
+
}
|
|
2822
|
+
const expanded = Array.from(wordScore.entries())
|
|
2823
|
+
.sort((a, b) => b[1] - a[1])
|
|
2824
|
+
.slice(0, topWordsK)
|
|
2825
|
+
.map(([w]) => w);
|
|
2826
|
+
const merged = Array.from(new Set([...(words || []).slice(0, 64), ...expanded]));
|
|
2827
|
+
seeds = this.mapWordsToMemes(merged);
|
|
2828
|
+
}
|
|
2829
|
+
}
|
|
2830
|
+
result = result || this.runPropagation(seeds, { iteration, radius });
|
|
1871
2831
|
const reply = this.composeReply(result, words, seeds);
|
|
1872
2832
|
const latency = Date.now() - started;
|
|
1873
2833
|
this.metrics.requests += 1;
|
|
@@ -1895,7 +2855,8 @@ class RuntimeState {
|
|
|
1895
2855
|
}
|
|
1896
2856
|
|
|
1897
2857
|
const seeds = this.mapWordsToMemes(tokens);
|
|
1898
|
-
|
|
2858
|
+
// 使用有序“词结构/短语结构”序列建边(从句子级细化到词/短语级)
|
|
2859
|
+
const memeIds = this._buildMemeSequenceFromTokens(tokens);
|
|
1899
2860
|
for (let i = 0; i < memeIds.length - 1; i++) {
|
|
1900
2861
|
this.graph.link(memeIds[i], memeIds[i + 1], 1, 0);
|
|
1901
2862
|
}
|
|
@@ -1925,7 +2886,12 @@ class RuntimeState {
|
|
|
1925
2886
|
if (!this.robotsCorpus) {
|
|
1926
2887
|
this.robotsCorpus = new RobotsCorpus({
|
|
1927
2888
|
dir: this.config.robotsDir,
|
|
1928
|
-
lemmaCsv: this.config.lemmaCsv
|
|
2889
|
+
lemmaCsv: this.config.lemmaCsv,
|
|
2890
|
+
lemmaAutoload: this.config.lemmaAutoload,
|
|
2891
|
+
lemmaMaxBytes: this.config.lemmaMaxBytes,
|
|
2892
|
+
lemmaForce: this.config.lemmaForce,
|
|
2893
|
+
chunkMinWords: this.config.robotsChunkMinWords,
|
|
2894
|
+
chunkMaxWords: this.config.robotsChunkMaxWords
|
|
1929
2895
|
});
|
|
1930
2896
|
}
|
|
1931
2897
|
return this.robotsCorpus;
|
|
@@ -1981,7 +2947,9 @@ class RuntimeState {
|
|
|
1981
2947
|
params: this.params,
|
|
1982
2948
|
graph: this.graph.exportSnapshot(),
|
|
1983
2949
|
sessions: this.sessions.export(),
|
|
1984
|
-
kvm: this.kvm.exportEntries()
|
|
2950
|
+
kvm: this.kvm.exportEntries(),
|
|
2951
|
+
surface: this.surfaceLexicon ? this.surfaceLexicon.exportSnapshot({ limitMemes: 512 }) : null,
|
|
2952
|
+
dialog: this.dialogMemory ? this.dialogMemory.exportSnapshot({ limit: 512 }) : null
|
|
1985
2953
|
};
|
|
1986
2954
|
}
|
|
1987
2955
|
|
|
@@ -1999,6 +2967,12 @@ class RuntimeState {
|
|
|
1999
2967
|
}
|
|
2000
2968
|
}
|
|
2001
2969
|
}
|
|
2970
|
+
if (snapshot.surface && this.surfaceLexicon) {
|
|
2971
|
+
this.surfaceLexicon.importSnapshot(snapshot.surface);
|
|
2972
|
+
}
|
|
2973
|
+
if (snapshot.dialog && this.dialogMemory) {
|
|
2974
|
+
this.dialogMemory.importSnapshot(snapshot.dialog);
|
|
2975
|
+
}
|
|
2002
2976
|
}
|
|
2003
2977
|
|
|
2004
2978
|
// 将当前窗口或指定种子集合导出为 Go 侧 Graph 结构并写入文件
|
|
@@ -2266,11 +3240,19 @@ class StudyEngine {
|
|
|
2266
3240
|
this.running = false;
|
|
2267
3241
|
this.queue = [];
|
|
2268
3242
|
this.metrics = { enqueued: 0, processed: 0, lastTickAt: 0, lastError: null };
|
|
3243
|
+
this.poolWorker = null;
|
|
3244
|
+
}
|
|
3245
|
+
|
|
3246
|
+
_ensureWorkerPool() {
|
|
3247
|
+
if (this.poolWorker) {
|
|
3248
|
+
return this.poolWorker;
|
|
3249
|
+
}
|
|
2269
3250
|
this.poolWorker = workerpool.pool(CONFIG.workerFile, {
|
|
2270
3251
|
minWorkers: 1,
|
|
2271
3252
|
maxWorkers: CONFIG.maxWorkers,
|
|
2272
3253
|
workerType: 'process'
|
|
2273
3254
|
});
|
|
3255
|
+
return this.poolWorker;
|
|
2274
3256
|
}
|
|
2275
3257
|
|
|
2276
3258
|
start() {
|
|
@@ -2301,7 +3283,8 @@ class StudyEngine {
|
|
|
2301
3283
|
try {
|
|
2302
3284
|
const text = String(doc?.text || '');
|
|
2303
3285
|
const tokens = tokenize(text);
|
|
2304
|
-
|
|
3286
|
+
const wp = this._ensureWorkerPool();
|
|
3287
|
+
await wp.exec('batchLemmatize', [[tokens], this.pool.getActive().runtime?.config?.lemmaCsv]);
|
|
2305
3288
|
} catch (_e) {
|
|
2306
3289
|
// ignore
|
|
2307
3290
|
}
|
|
@@ -2701,6 +3684,66 @@ class PersonaForestAverager {
|
|
|
2701
3684
|
}
|
|
2702
3685
|
}
|
|
2703
3686
|
|
|
3687
|
+
/**
 * Normalize a caller-supplied compute budget into a plain options object.
 *
 * Accepted inputs:
 *  - `undefined`, `null`, `''`, `false`            -> `null` (no budget)
 *  - preset names (case-insensitive, trimmed):
 *      'default' | 'balanced' | 'medium' | 'none'  -> `null`
 *      'low' | 'fast'                              -> small preset
 *      'high' | 'slow' | 'quality'                 -> large preset
 *  - a JSON object string (`"{...}"`)              -> parsed, then normalized
 *  - an object with any of the canonical keys or their aliases:
 *      mappingDepth (depth), iteration (iters),
 *      reflectionTopMemes (topMemes), reflectionTopWords (topWords),
 *      reflectionMinScore (minScore), radius (windowRadius)
 *
 * Only finite numbers and non-blank numeric strings are accepted as values.
 * `null`, `''`, booleans, arrays and other objects are treated as "not set"
 * so they cannot be coerced to 0/1 and shadow a valid alias.
 *
 * @param {*} raw Budget as a preset name, JSON string, or object.
 * @returns {Object<string, number>|null} Object containing only the
 *   recognized canonical keys, or `null` when nothing usable was supplied.
 */
const normalizeBudget = (raw) => {
    if (raw === undefined || raw === null || raw === '' || raw === false) {
        return null;
    }

    if (typeof raw === 'string') {
        const s = raw.trim();
        const lowered = s.toLowerCase();
        if (lowered === 'default' || lowered === 'balanced' || lowered === 'medium' || lowered === 'none') {
            return null;
        }
        if (lowered === 'low' || lowered === 'fast') {
            return { iteration: 3, reflectionTopMemes: 12, reflectionTopWords: 16 };
        }
        if (lowered === 'high' || lowered === 'slow' || lowered === 'quality') {
            return { iteration: 7, reflectionTopMemes: 22, reflectionTopWords: 32 };
        }
        if (s.startsWith('{') && s.endsWith('}')) {
            try {
                return normalizeBudget(JSON.parse(s));
            } catch (_e) {
                // Malformed JSON is treated the same as "no budget".
                return null;
            }
        }
        return null;
    }

    if (typeof raw !== 'object') {
        return null;
    }

    // Strict numeric read: Number(null), Number(''), Number(false) and
    // Number([]) are all 0, so those values must be rejected before coercion.
    const toFiniteNumber = (value) => {
        if (typeof value !== 'number' && typeof value !== 'string') {
            return undefined;
        }
        if (typeof value === 'string' && value.trim() === '') {
            return undefined;
        }
        const n = Number(value);
        return Number.isFinite(n) ? n : undefined;
    };

    const out = {};
    // Store the first usable value among the canonical key and its aliases.
    const pickNum = (key, ...aliases) => {
        for (const name of [key, ...aliases]) {
            const n = toFiniteNumber(raw[name]);
            if (n !== undefined) {
                out[key] = n;
                return;
            }
        }
    };
    pickNum('mappingDepth', 'depth');
    pickNum('iteration', 'iters');
    pickNum('reflectionTopMemes', 'topMemes');
    pickNum('reflectionTopWords', 'topWords');
    pickNum('reflectionMinScore', 'minScore');
    pickNum('radius', 'windowRadius');
    return Object.keys(out).length ? out : null;
};
|
|
3739
|
+
|
|
3740
|
+
/**
 * Combine a default budget with a per-request override.
 *
 * When both are present a new object is returned in which the override's
 * keys win. When only one is present that same object is returned as-is;
 * when neither is present the result is `null`.
 *
 * @param {object|null|undefined} base Default budget.
 * @param {object|null|undefined} override Per-call budget.
 * @returns {object|null} Merged budget, the single available one, or null.
 */
const mergeBudgets = (base, override) => {
    if (base && override) {
        return { ...base, ...override };
    }
    // At most one side is usable here; fall back to null when neither is.
    return base || override || null;
};
|
|
3746
|
+
|
|
2704
3747
|
class SparkArray {
|
|
2705
3748
|
/**
|
|
2706
3749
|
* @param {ControllerPool} pool
|
|
@@ -2716,9 +3759,10 @@ class SparkArray {
|
|
|
2716
3759
|
const available = typeof pool?.listControllersInGroup === 'function'
|
|
2717
3760
|
? pool.listControllersInGroup(this.groupId)
|
|
2718
3761
|
: (typeof pool?.listControllerNames === 'function' ? pool.listControllerNames() : Object.keys(pool?.controllers || {}));
|
|
2719
|
-
const
|
|
3762
|
+
const wantedRaw = options.numAI ?? options.groupSize ?? CONFIG.sparkNumAI ?? CONFIG.groupSize ?? 7;
|
|
3763
|
+
const wanted = Math.max(1, Math.round(Number(wantedRaw) || 7));
|
|
2720
3764
|
const numAI = Math.max(1, Math.min(available.length || wanted, wanted));
|
|
2721
|
-
// 组内小 SparkArray
|
|
3765
|
+
// 组内小 SparkArray:默认 numAI 个 AI(不足则截断)
|
|
2722
3766
|
this.layers = Array.from({ length: numAI }, (_, i) => ({
|
|
2723
3767
|
name: `${this.groupId}:a${i + 1}`,
|
|
2724
3768
|
controllers: [available[i]],
|
|
@@ -2729,6 +3773,7 @@ class SparkArray {
|
|
|
2729
3773
|
this.layers = options.layers.map((layer) => ({ strategy: 'max', ...layer }));
|
|
2730
3774
|
}
|
|
2731
3775
|
this.personaForest = new PersonaForestAverager(options.personaForest || {});
|
|
3776
|
+
this.defaultBudget = normalizeBudget(options.budget ?? CONFIG.sparkBudget);
|
|
2732
3777
|
this.history = [];
|
|
2733
3778
|
}
|
|
2734
3779
|
|
|
@@ -2751,9 +3796,15 @@ class SparkArray {
|
|
|
2751
3796
|
async dispatch(payload, options = {}) {
|
|
2752
3797
|
const requestEmbedding = textToMiniEmbedding(payload.text || '', 64);
|
|
2753
3798
|
const variants = buildVariants(payload.text || '', options.perturbations || 0);
|
|
2754
|
-
|
|
3799
|
+
let layers = options.multiLayer === false ? [this.layers[0]] : this.layers;
|
|
3800
|
+
if (Number.isFinite(Number(options.numAI)) && Number(options.numAI) > 0) {
|
|
3801
|
+
const cap = Math.max(1, Math.floor(Number(options.numAI)));
|
|
3802
|
+
layers = layers.slice(0, cap);
|
|
3803
|
+
}
|
|
2755
3804
|
const layerResults = [];
|
|
2756
3805
|
|
|
3806
|
+
const budget = mergeBudgets(this.defaultBudget, normalizeBudget(options.budget ?? payload?.budget));
|
|
3807
|
+
|
|
2757
3808
|
for (const layer of layers) {
|
|
2758
3809
|
const controllers = [];
|
|
2759
3810
|
for (const controllerSpec of layer.controllers) {
|
|
@@ -2773,7 +3824,7 @@ class SparkArray {
|
|
|
2773
3824
|
const weightedText = weight <= 1
|
|
2774
3825
|
? String(payload.text || '')
|
|
2775
3826
|
: Array.from({ length: weight }, () => String(payload.text || '')).join(' ');
|
|
2776
|
-
baseResult = await ctrl.respond({ ...payload, text: weightedText });
|
|
3827
|
+
baseResult = await ctrl.respond({ ...payload, text: weightedText, ...(budget ? { budget } : {}) });
|
|
2777
3828
|
} catch (err) {
|
|
2778
3829
|
controllers.push({
|
|
2779
3830
|
controller: controllerName,
|
|
@@ -2786,7 +3837,7 @@ class SparkArray {
|
|
|
2786
3837
|
const variantResults = [];
|
|
2787
3838
|
for (const variant of variants) {
|
|
2788
3839
|
try {
|
|
2789
|
-
const vr = await ctrl.respond({ ...payload, text: variant });
|
|
3840
|
+
const vr = await ctrl.respond({ ...payload, text: variant, ...(budget ? { budget } : {}) });
|
|
2790
3841
|
variantResults.push({
|
|
2791
3842
|
text: variant,
|
|
2792
3843
|
response: vr,
|
|
@@ -2855,7 +3906,7 @@ class SparkArray {
|
|
|
2855
3906
|
}
|
|
2856
3907
|
}
|
|
2857
3908
|
}
|
|
2858
|
-
|
|
3909
|
+
|
|
2859
3910
|
// 随机森林式“中途平均/投票”:在不改变对外结构的前提下,优先选择更稳定且共识更强的回复
|
|
2860
3911
|
try {
|
|
2861
3912
|
const picked = this.personaForest.pick({
|
|
@@ -3160,7 +4211,7 @@ class ReinforcementLearner {
|
|
|
3160
4211
|
this.improvementThreshold = improvementThreshold;
|
|
3161
4212
|
this.history = [];
|
|
3162
4213
|
// 统一使用上方安全引用的 Matrix(可能为 null)
|
|
3163
|
-
this.Matrix =
|
|
4214
|
+
this.Matrix = getMatrix();
|
|
3164
4215
|
this.kmeans = safeRequire('ml-kmeans');
|
|
3165
4216
|
this.numeric = safeRequire('numeric');
|
|
3166
4217
|
}
|
|
@@ -3353,7 +4404,7 @@ class AdversarialLearner {
|
|
|
3353
4404
|
this.benchLimit = benchLimit;
|
|
3354
4405
|
this.rng = safeRequire('seedrandom') ? safeRequire('seedrandom')('phoenix-adv') : Math.random;
|
|
3355
4406
|
this.history = [];
|
|
3356
|
-
this.Matrix =
|
|
4407
|
+
this.Matrix = getMatrix();
|
|
3357
4408
|
}
|
|
3358
4409
|
|
|
3359
4410
|
_perturbTokens(tokens) {
|
|
@@ -3491,8 +4542,8 @@ class GatewayServer {
|
|
|
3491
4542
|
this.redisSync = redisSync;
|
|
3492
4543
|
this.study = study;
|
|
3493
4544
|
this.spark = sparkArray || new SparkArray(pool, shardManager);
|
|
3494
|
-
this.rl = learners.rl ||
|
|
3495
|
-
this.adv = learners.adv ||
|
|
4545
|
+
this.rl = learners.rl || null;
|
|
4546
|
+
this.adv = learners.adv || null;
|
|
3496
4547
|
this.rlDisabled = false;
|
|
3497
4548
|
this.advDisabled = false;
|
|
3498
4549
|
this.dialogLearningEnabled = true;
|
|
@@ -3556,6 +4607,18 @@ class GatewayServer {
|
|
|
3556
4607
|
this._setupRoutes();
|
|
3557
4608
|
}
|
|
3558
4609
|
|
|
4610
|
+
_ensureRL() {
|
|
4611
|
+
if (this.rl) return this.rl;
|
|
4612
|
+
this.rl = new ReinforcementLearner(this.pool, { testsDir: path.join(__dirname, 'tests') });
|
|
4613
|
+
return this.rl;
|
|
4614
|
+
}
|
|
4615
|
+
|
|
4616
|
+
_ensureADV() {
|
|
4617
|
+
if (this.adv) return this.adv;
|
|
4618
|
+
this.adv = new AdversarialLearner(this.pool, {});
|
|
4619
|
+
return this.adv;
|
|
4620
|
+
}
|
|
4621
|
+
|
|
3559
4622
|
_setupAuthMiddleware() {
|
|
3560
4623
|
let jwt;
|
|
3561
4624
|
try {
|
|
@@ -3767,14 +4830,14 @@ class GatewayServer {
|
|
|
3767
4830
|
return;
|
|
3768
4831
|
}
|
|
3769
4832
|
const cycles = Number(req.body?.cycles ?? 3) || 3;
|
|
3770
|
-
const out = await this.
|
|
4833
|
+
const out = await this._ensureRL().learn(cycles);
|
|
3771
4834
|
res.json({ ok: true, result: out });
|
|
3772
4835
|
} catch (err) {
|
|
3773
4836
|
res.status(500).json({ ok: false, error: err.message });
|
|
3774
4837
|
}
|
|
3775
4838
|
});
|
|
3776
4839
|
this.app.get('/api/learn/reinforce/latest', (req, res) => {
|
|
3777
|
-
res.json({ ok: true, latest: this.rl.latest() });
|
|
4840
|
+
res.json({ ok: true, latest: this.rl ? this.rl.latest() : null });
|
|
3778
4841
|
});
|
|
3779
4842
|
// Adversarial Learning endpoints
|
|
3780
4843
|
this.app.post('/api/learn/adversarial', async (req, res) => {
|
|
@@ -3788,14 +4851,14 @@ class GatewayServer {
|
|
|
3788
4851
|
res.status(400).json({ ok: false, error: 'samples required' });
|
|
3789
4852
|
return;
|
|
3790
4853
|
}
|
|
3791
|
-
const out = await this.
|
|
4854
|
+
const out = await this._ensureADV().attackAndDefend(samples);
|
|
3792
4855
|
res.json({ ok: true, result: out });
|
|
3793
4856
|
} catch (err) {
|
|
3794
4857
|
res.status(500).json({ ok: false, error: err.message });
|
|
3795
4858
|
}
|
|
3796
4859
|
});
|
|
3797
4860
|
this.app.get('/api/learn/adversarial/latest', (req, res) => {
|
|
3798
|
-
res.json({ ok: true, latest: this.adv.latest() });
|
|
4861
|
+
res.json({ ok: true, latest: this.adv ? this.adv.latest() : null });
|
|
3799
4862
|
});
|
|
3800
4863
|
this.app.post('/api/learn/thresholds', (req, res) => {
|
|
3801
4864
|
const { rlEvery, advEvery } = req.body || {};
|
|
@@ -3885,6 +4948,8 @@ class GatewayServer {
|
|
|
3885
4948
|
config: {
|
|
3886
4949
|
groupCount: CONFIG.groupCount,
|
|
3887
4950
|
groupSize: CONFIG.groupSize,
|
|
4951
|
+
sparkNumAI: CONFIG.sparkNumAI,
|
|
4952
|
+
sparkBudget: CONFIG.sparkBudget,
|
|
3888
4953
|
groupIds: typeof this.pool.listGroupIds === 'function' ? this.pool.listGroupIds() : [],
|
|
3889
4954
|
gatewayHost: CONFIG.gatewayHost,
|
|
3890
4955
|
portGateway: CONFIG.portGateway,
|
|
@@ -4336,9 +5401,21 @@ class GatewayServer {
|
|
|
4336
5401
|
if (!this.dialogLearningEnabled) {
|
|
4337
5402
|
return;
|
|
4338
5403
|
}
|
|
5404
|
+
|
|
5405
|
+
// 反接学习:把本轮对话沉淀为“模因层 -> 表层答案”的映射与可检索记忆
|
|
5406
|
+
try {
|
|
5407
|
+
const runtime = this.pool?.getActive?.()?.runtime;
|
|
5408
|
+
if (runtime && typeof runtime.learnFromDialog === 'function') {
|
|
5409
|
+
runtime.learnFromDialog({ payload, result });
|
|
5410
|
+
}
|
|
5411
|
+
} catch (e) {
|
|
5412
|
+
// 学习失败不影响主流程
|
|
5413
|
+
console.warn('[Learn] surface/dialog memory update failed:', e.message);
|
|
5414
|
+
}
|
|
5415
|
+
|
|
4339
5416
|
if (!this.rlDisabled && (total - this.dialogCounters.lastRL >= this.dialogThresholds.rlEvery)) {
|
|
4340
5417
|
this.dialogCounters.lastRL = total;
|
|
4341
|
-
Promise.resolve().then(() => this.
|
|
5418
|
+
Promise.resolve().then(() => this._ensureRL().learn(1)).catch((e) => console.warn('[Learn] RL trigger failed:', e.message));
|
|
4342
5419
|
}
|
|
4343
5420
|
if (!this.advDisabled && (total - this.dialogCounters.lastADV >= this.dialogThresholds.advEvery)) {
|
|
4344
5421
|
this.dialogCounters.lastADV = total;
|
|
@@ -4347,7 +5424,7 @@ class GatewayServer {
|
|
|
4347
5424
|
if (text && text.trim()) samples.push(text.trim());
|
|
4348
5425
|
if (result?.reply && typeof result.reply === 'string') samples.push(result.reply);
|
|
4349
5426
|
if (samples.length) {
|
|
4350
|
-
Promise.resolve().then(() => this.
|
|
5427
|
+
Promise.resolve().then(() => this._ensureADV().attackAndDefend(samples)).catch((e) => console.warn('[Learn] ADV trigger failed:', e.message));
|
|
4351
5428
|
}
|
|
4352
5429
|
}
|
|
4353
5430
|
} catch (e) {
|
|
@@ -4363,9 +5440,9 @@ class GatewayServer {
|
|
|
4363
5440
|
}
|
|
4364
5441
|
|
|
4365
5442
|
const bootstrap = async () => {
|
|
4366
|
-
const kvmStore = new LmdbStore({ name: 'kvm', rootDir: CONFIG.lmdbRoot });
|
|
4367
|
-
const memeStore = new LmdbStore({ name: 'meme_graph', rootDir: CONFIG.lmdbRoot });
|
|
4368
|
-
const sessionStore = new LmdbStore({ name: 'session', rootDir: CONFIG.lmdbRoot });
|
|
5443
|
+
const kvmStore = new LmdbStore({ name: 'kvm', rootDir: CONFIG.lmdbRoot, mapSizeBytes: CONFIG.lmdbMapSizeBytes });
|
|
5444
|
+
const memeStore = new LmdbStore({ name: 'meme_graph', rootDir: CONFIG.lmdbRoot, mapSizeBytes: CONFIG.lmdbMapSizeBytes });
|
|
5445
|
+
const sessionStore = new LmdbStore({ name: 'session', rootDir: CONFIG.lmdbRoot, mapSizeBytes: CONFIG.lmdbMapSizeBytes });
|
|
4369
5446
|
console.log('checkpoint1');
|
|
4370
5447
|
const pool = new ControllerPool({ kvmStore, memeStore, sessionStore, config: CONFIG });
|
|
4371
5448
|
const rotation = new RotationManager(pool, {});
|
|
@@ -4380,19 +5457,33 @@ const bootstrap = async () => {
|
|
|
4380
5457
|
study.start();
|
|
4381
5458
|
const snapshots = new SnapshotManager(pool.getActive().runtime, CONFIG.snapshotDir);
|
|
4382
5459
|
const shards = new ShardManager(pool);
|
|
4383
|
-
const spark = new BigSparkArray(pool, shards, {
|
|
5460
|
+
const spark = new BigSparkArray(pool, shards, {
|
|
5461
|
+
groupIds: pool.listGroupIds(),
|
|
5462
|
+
groupOptions: {
|
|
5463
|
+
numAI: CONFIG.sparkNumAI,
|
|
5464
|
+
budget: CONFIG.sparkBudget
|
|
5465
|
+
}
|
|
5466
|
+
});
|
|
4384
5467
|
// Try auto-restore latest snapshot to skip warmup/pretraining
|
|
4385
5468
|
let __restoredFromSnapshot = false;
|
|
4386
5469
|
try {
|
|
4387
5470
|
const list = snapshots.list().sort((a, b) => b.localeCompare(a));
|
|
4388
5471
|
if (list.length > 0) {
|
|
4389
|
-
await snapshots.restore(list[0]);
|
|
5472
|
+
const restoredSnapshot = await snapshots.restore(list[0]);
|
|
4390
5473
|
__restoredFromSnapshot = true;
|
|
4391
5474
|
console.log(`[Bootstrap] Restored latest snapshot: ${list[0]}`);
|
|
4392
|
-
|
|
4393
|
-
|
|
4394
|
-
|
|
4395
|
-
|
|
5475
|
+
if (CONFIG.syncStandbyOnBoot) {
|
|
5476
|
+
// 注意:applySnapshot 内部包含大量同步写入,依然会阻塞;仅在你明确需要时开启。
|
|
5477
|
+
try {
|
|
5478
|
+
await pool.standby.applySnapshot(restoredSnapshot);
|
|
5479
|
+
await pool.validation.applySnapshot(restoredSnapshot);
|
|
5480
|
+
console.log('[Bootstrap] Standby/validation synced from snapshot');
|
|
5481
|
+
} catch (e) {
|
|
5482
|
+
console.warn('[Bootstrap] Standby/validation sync skipped:', e.message);
|
|
5483
|
+
}
|
|
5484
|
+
} else {
|
|
5485
|
+
console.log('[Bootstrap] Standby/validation sync skipped (fast-boot)');
|
|
5486
|
+
}
|
|
4396
5487
|
}
|
|
4397
5488
|
} catch (err) {
|
|
4398
5489
|
console.warn('[Bootstrap] Snapshot restore skipped:', err.message);
|
|
@@ -4401,7 +5492,7 @@ const bootstrap = async () => {
|
|
|
4401
5492
|
try {
|
|
4402
5493
|
const preloadDocs = pool.getActive().runtime.collectRobotsDocuments({
|
|
4403
5494
|
limit: CONFIG.robotsWarmupLimit,
|
|
4404
|
-
shuffle:
|
|
5495
|
+
shuffle: Boolean(CONFIG.robotsWarmupShuffle)
|
|
4405
5496
|
});
|
|
4406
5497
|
if (preloadDocs.length) {
|
|
4407
5498
|
console.log(`[Bootstrap] Preloading ${preloadDocs.length} robots documents...`);
|
|
@@ -4423,64 +5514,60 @@ const bootstrap = async () => {
|
|
|
4423
5514
|
}
|
|
4424
5515
|
|
|
4425
5516
|
// 额外:将 tests 目录用例按哈希分片到不同 AI,形成差异化“训练集”
|
|
4426
|
-
|
|
4427
|
-
|
|
4428
|
-
|
|
4429
|
-
|
|
4430
|
-
|
|
4431
|
-
|
|
4432
|
-
|
|
4433
|
-
const
|
|
4434
|
-
const
|
|
4435
|
-
|
|
4436
|
-
|
|
4437
|
-
|
|
4438
|
-
|
|
5517
|
+
if (CONFIG.testsAutoload) {
|
|
5518
|
+
try {
|
|
5519
|
+
const testsDir = path.join(__dirname, 'tests');
|
|
5520
|
+
if (fs.existsSync(testsDir)) {
|
|
5521
|
+
const files = fs.readdirSync(testsDir).filter((f) => /\.txt$/i.test(f));
|
|
5522
|
+
if (files.length) {
|
|
5523
|
+
console.log(`[Bootstrap] Preloading tests corpus (${files.length} files)...`);
|
|
5524
|
+
const groups = pool.listGroupIds();
|
|
5525
|
+
for (const f of files) {
|
|
5526
|
+
const full = path.join(testsDir, f);
|
|
5527
|
+
const text = fs.readFileSync(full, 'utf8');
|
|
5528
|
+
const key = `tests:${f}`;
|
|
5529
|
+
const idx = groups.length ? (hashStrSimple(key) % groups.length) : 0;
|
|
5530
|
+
const targetGroup = groups[idx] || groups[0] || 'G1';
|
|
5531
|
+
await pool.ingestDocumentToGroup(targetGroup, { text, source: key });
|
|
5532
|
+
}
|
|
5533
|
+
console.log(`[Bootstrap] Sharded tests corpus into ${groups.length} groups.`);
|
|
4439
5534
|
}
|
|
4440
|
-
console.log(`[Bootstrap] Sharded tests corpus into ${groups.length} groups.`);
|
|
4441
5535
|
}
|
|
5536
|
+
} catch (err) {
|
|
5537
|
+
console.warn('[Bootstrap] Tests sharded preload skipped:', err.message);
|
|
4442
5538
|
}
|
|
4443
|
-
}
|
|
4444
|
-
console.
|
|
5539
|
+
} else {
|
|
5540
|
+
console.log('[Bootstrap] Tests preload skipped (fast-boot)');
|
|
4445
5541
|
}
|
|
4446
|
-
//
|
|
4447
|
-
const
|
|
4448
|
-
const adv = new AdversarialLearner(pool, {});
|
|
4449
|
-
const gateway = new GatewayServer(pool, shards, snapshots, rotation, redisSync, study, spark, { rl, adv });
|
|
5542
|
+
// 学习模块改为网关侧按需创建(降低启动时间与内存峰值)
|
|
5543
|
+
const gateway = new GatewayServer(pool, shards, snapshots, rotation, redisSync, study, spark, { rl: null, adv: null });
|
|
4450
5544
|
gateway.listen(CONFIG.portGateway, CONFIG.gatewayHost);
|
|
4451
|
-
//
|
|
4452
|
-
|
|
4453
|
-
|
|
4454
|
-
|
|
4455
|
-
|
|
4456
|
-
|
|
4457
|
-
|
|
4458
|
-
|
|
4459
|
-
|
|
4460
|
-
|
|
5545
|
+
// 可选预热:默认关闭;需要时加 --learning-warmup=true
|
|
5546
|
+
if (CONFIG.learningWarmup) {
|
|
5547
|
+
(async () => {
|
|
5548
|
+
if (!CONFIG.disableLearning && !CONFIG.disableRL) {
|
|
5549
|
+
try {
|
|
5550
|
+
await gateway._ensureRL().learn(1);
|
|
5551
|
+
} catch (e) {
|
|
5552
|
+
console.warn('[Bootstrap] RL warmup failed:', e.message);
|
|
5553
|
+
try { gateway.rlDisabled = true; } catch (_) {}
|
|
5554
|
+
}
|
|
4461
5555
|
}
|
|
4462
|
-
|
|
4463
|
-
|
|
4464
|
-
|
|
4465
|
-
|
|
4466
|
-
|
|
4467
|
-
|
|
4468
|
-
|
|
4469
|
-
|
|
4470
|
-
|
|
4471
|
-
|
|
4472
|
-
|
|
4473
|
-
if (samples.length) {
|
|
4474
|
-
await adv.attackAndDefend(samples);
|
|
5556
|
+
|
|
5557
|
+
if (!CONFIG.disableLearning && !CONFIG.disableADV) {
|
|
5558
|
+
try {
|
|
5559
|
+
const runtime = pool.getActive().runtime;
|
|
5560
|
+
const docs = runtime.collectRobotsDocuments({ limit: 3, shuffle: true });
|
|
5561
|
+
const samples = docs.map((d) => d.text).filter(Boolean).slice(0, 3);
|
|
5562
|
+
if (samples.length) {
|
|
5563
|
+
await gateway._ensureADV().attackAndDefend(samples);
|
|
5564
|
+
}
|
|
5565
|
+
} catch (e) {
|
|
5566
|
+
console.warn('[Bootstrap] Adversarial warmup failed:', e.message);
|
|
4475
5567
|
}
|
|
4476
|
-
} catch (e) {
|
|
4477
|
-
console.warn('[Bootstrap] Adversarial warmup failed:', e.message);
|
|
4478
5568
|
}
|
|
4479
|
-
}
|
|
4480
|
-
|
|
4481
|
-
console.log('[Bootstrap] ADV disabled by config');
|
|
4482
|
-
}
|
|
4483
|
-
})();
|
|
5569
|
+
})();
|
|
5570
|
+
}
|
|
4484
5571
|
process.on('SIGINT', async () => {
|
|
4485
5572
|
console.log('Received SIGINT, saving snapshot...');
|
|
4486
5573
|
try {
|
|
@@ -4502,5 +5589,8 @@ if (require.main === module) {
|
|
|
4502
5589
|
|
|
4503
5590
|
module.exports = {
|
|
4504
5591
|
bootstrap,
|
|
4505
|
-
CONFIG
|
|
5592
|
+
CONFIG,
|
|
5593
|
+
MODEL_DEFAULTS: modelDefaults,
|
|
5594
|
+
BUILTIN_ACTIVATION_TYPES: Object.keys(BuiltinActivations),
|
|
5595
|
+
BUILTIN_TRANSFER_TYPES: Object.keys(BuiltinTransfers)
|
|
4506
5596
|
};
|