079project 8.0.0 → 9.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/main.cjs CHANGED
@@ -20,19 +20,111 @@ const safeRequire = (name) => {
20
20
  }
21
21
  };
22
22
 
23
- const natural = safeRequire('natural');
24
- const csvParse = safeRequire('csv-parse/sync');
25
- const umap = safeRequire('umap-js');
26
- const axios = safeRequire('axios');
27
- const cheerio = safeRequire('cheerio');
28
- const pdfParse = safeRequire('pdf-parse');
29
- // 安全引用 ml-matrix,兼容不同导出结构
30
- const MatrixLib = safeRequire('ml-matrix');
31
- const Matrix = MatrixLib?.Matrix ?? MatrixLib ?? null;
32
-
33
- const STOP_WORDS = natural?.stopwords ?? [];
23
// Startup optimisation: heavy dependencies (pdf-parse, cheerio, natural, ...)
// are resolved on first use instead of when this module is loaded.
const __lazyModules = new Map();

/**
 * Resolve a dependency through safeRequire() once and memoise the outcome.
 *
 * Whatever safeRequire() returns for `name` is cached as-is, so later calls
 * with the same name never hit safeRequire() again.
 *
 * @param {string} name - Module specifier handed to safeRequire().
 * @returns {*} The cached result of safeRequire(name).
 */
const lazyRequire = (name) => {
  if (!__lazyModules.has(name)) {
    __lazyModules.set(name, safeRequire(name));
  }
  return __lazyModules.get(name);
};

// Accessors for the individual lazily loaded modules.
const getNatural = () => lazyRequire('natural');
const getCsvParse = () => lazyRequire('csv-parse/sync');
const getUmap = () => lazyRequire('umap-js');
const getAxios = () => lazyRequire('axios');
const getCheerio = () => lazyRequire('cheerio');
const getPdfParse = () => lazyRequire('pdf-parse');
40
/**
 * Lazily resolve the Matrix export of 'ml-matrix'.
 *
 * Accepts either a module exposing `.Matrix` or a module that is itself the
 * export; yields null when neither is available. The lookup runs at most
 * once; every later call returns the remembered result.
 */
const getMatrix = (() => {
  const state = { resolved: false, value: null };
  return () => {
    if (!state.resolved) {
      // Flag first, so the lookup is never retried on later calls.
      state.resolved = true;
      const lib = lazyRequire('ml-matrix');
      state.value = lib?.Matrix ?? lib ?? null;
    }
    return state.value;
  };
})();
51
+
52
/**
 * Lazily obtain the stop-word list from the 'natural' module.
 *
 * Returns `natural.stopwords` when it is an array and an empty array
 * otherwise. The lookup runs at most once; the same array instance is
 * returned on every call.
 */
const getStopWords = (() => {
  let resolved = false;
  let words = [];
  return () => {
    if (!resolved) {
      // Flag first, so the lookup is never retried on later calls.
      resolved = true;
      const candidate = getNatural()?.stopwords ?? [];
      words = Array.isArray(candidate) ? candidate : [];
    }
    return words;
  };
})();
34
64
  const DEFAULT_CHANNEL = process.env.AI_REDIS_CHANNEL || 'AI-model-workspace';
35
65
 
66
+ /**
67
+ * 外参列表(所有“外部可控参数”的入口汇总)
68
+ *
69
+ * 1) 启动时参数(CLI flags,形如 --k=v 或 --flag=true)
70
+ * - --base-dir: 运行时数据目录(默认:./runtime_store;对应 ENV: AI_BASE_DIR)
71
+ * - --gateway-host: 网关监听 host(默认:127.0.0.1;对应 ENV: AI_GATEWAY_HOST)
72
+ * - --port: 网关端口(默认:5080;对应 ENV: CONTROLLER_PORT)
73
+ * - --study-port: study/前端进程端口(默认:5081;对应 ENV: AI_STUDY_PORT)
74
+ * - --ai-count: 旧版兼容字段;用于 aiCount(默认:7;对应 ENV: AI_COUNT / AI_NUM)
75
+ * - --group-size: 每个工作组 AI 数量(默认:ai-count;对应 ENV: AI_GROUP_SIZE / GROUP_SIZE)
76
+ * - --group-count: 组数量(默认:3;对应 ENV: AI_GROUP_COUNT / GROUP_COUNT)
77
+ * - --spark-num-ai: SparkArray 每组参与汇聚的 AI 数量(默认:group-size;对应 ENV: AI_SPARK_NUM_AI)
78
+ * - --spark-budget: SparkArray 低精度预算预设(default/low/high 或 JSON;对应 ENV: AI_SPARK_BUDGET)
79
+ * - --robots-limit: robots 预热/导入条数上限(默认:200;对应 ENV: AI_ROBOTS_LIMIT)
80
+ * - --redis-url: Redis 连接串(默认:redis://127.0.0.1:6379;对应 ENV: REDIS_URL)
81
+ * - --channel: Redis pubsub 频道(默认:AI_REDIS_CHANNEL 或 'AI-model-workspace')
82
+ * - --snapshot-dir: 快照目录(默认:./snapshots)
83
+ * - --lmdb-dir: LMDB 根目录(默认:./lmdb;对应 ENV: LMDB_DIR)
84
+ * - --search-endpoint: 在线检索/搜索服务端点(默认:'';对应 ENV: AI_SEARCH_ENDPOINT)
85
+ * - --robots-dir: robots 语料目录(默认:./robots;对应 ENV: AI_ROBOTS_DIR)
86
+ * - --lemma-csv: lemma 词形还原表路径(默认:./lemma.csv;对应 ENV: AI_LEMMA_CSV)
87
+ * - --robots-autoload: 启动时是否自动加载 robots(默认:true;对应 ENV: AI_ROBOTS_AUTOLOAD)
88
+ * - --disable-memebarrier: 启动默认禁用 MemeBarrier(对应 ENV: AI_DISABLE_MEMEBARRIER)
89
+ * - --disable-rl: 启动默认禁用 RL(对应 ENV: AI_DISABLE_RL)
90
+ * - --disable-adv: 启动默认禁用 ADV(对应 ENV: AI_DISABLE_ADV)
91
+ * - --disable-learning: 启动默认禁用学习总开关(对应 ENV: AI_DISABLE_LEARNING)
92
+ * - --export-dir: 图导出目录(默认:./runtime_store;对应 ENV: AI_EXPORT_DIR)
93
+ *
94
+ * 2) 环境变量(ENV)
95
+ * - AI_REDIS_CHANNEL: Redis 频道默认值(被 --channel 覆盖)
96
+ * - AI_AUTH_ENABLED: 是否启用 /api/* 鉴权(默认:true;'false' 关闭)
97
+ * - AI_AUTH_JWT_SECRET / AUTH_JWT_SECRET: JWT 密钥(默认:'dev-secret-change-me')
98
+ * - 说明:鉴权默认保护 /api/*,仅 /api/system/status 在 publicPaths 白名单
99
+ *
100
+ * 3) 运行时参数(HTTP API,可在不重启的情况下调整)
101
+ * - POST /api/chat
102
+ * - body.text / body.message: 输入文本
103
+ * - body.sessionId: 会话 ID(可选;未提供则自动分配)
104
+ * - body.tokens / body.words / body.vocab: 直接提供分词(可选;否则对 text 做 tokenize)
105
+ *
106
+ * - GET /api/model/params: 读取当前模型参数
107
+ * - POST /api/model/params: patch 模型参数(部分字段)
108
+ * - POST /api/model/params/reset: 重置为 modelDefaults
109
+ * - 可 patch 的 key(见下方 modelDefaults):
110
+ * decayFactor, maxMemeWords, minOverlapThreshold, memeNgramMin, memeNgramMax,
111
+ * maliciousThreshold, learningIterations, iteration, threshold, decay, decayK,
112
+ * maxLen, edgeWeight, activationType, transferType, activationCustom, transferCustom
113
+ *
114
+ * - GET /api/runtime/features: 读取运行时功能开关状态
115
+ * - PATCH /api/runtime/features: patch 运行时功能开关(字段如下)
116
+ * - memebarrierEnabled: boolean
117
+ * - maliciousThreshold: number
118
+ * - learningEnabled: boolean(总开关;false 会同时关闭 rl/adv/dialogLearning)
119
+ * - rlEnabled / advEnabled: boolean
120
+ * - dialogLearningEnabled: boolean
121
+ * - rlEvery / advEvery: number(对话触发学习的阈值)
122
+ * - 注意:CLI 的 disable* 表示“启动默认禁用”,运行时仍允许 override(会返回 warnings)
123
+ *
124
+ * - POST /api/learn/thresholds: { rlEvery, advEvery }
125
+ * - POST /api/learn/reinforce: { cycles }(默认:3)
126
+ */
127
+
36
128
  const ensureDir = (dir) => {
37
129
  if (!fs.existsSync(dir)) {
38
130
  fs.mkdirSync(dir, { recursive: true });
@@ -60,8 +152,16 @@ const CONFIG = (() => {
60
152
  return !(normalized === '0' || normalized === 'false' || normalized === 'off' || normalized === 'no');
61
153
  };
62
154
  const robotsLimitRaw = args['robots-limit'] || process.env.AI_ROBOTS_LIMIT || 200;
155
+ const robotsChunkMinRaw = args['robots-chunk-min'] || process.env.AI_ROBOTS_CHUNK_MIN || 3;
156
+ const robotsChunkMaxRaw = args['robots-chunk-max'] || process.env.AI_ROBOTS_CHUNK_MAX || 20;
157
+ const lmdbMapMbRaw = args['lmdb-map-mb'] || process.env.AI_LMDB_MAP_MB || 512;
158
+ const kvmCacheMaxRaw = args['kvm-cache-max'] || process.env.AI_KVM_CACHE_MAX || 50_000;
159
+ const lemmaMaxMbRaw = args['lemma-max-mb'] || process.env.AI_LEMMA_MAX_MB || 64;
63
160
  const aiCountRaw = args['ai-count'] || process.env.AI_COUNT || process.env.AI_NUM || 7;
64
161
  const groupCountRaw = args['group-count'] || process.env.AI_GROUP_COUNT || process.env.GROUP_COUNT || 3;
162
+ const groupSizeRaw = args['group-size'] || process.env.AI_GROUP_SIZE || process.env.GROUP_SIZE || aiCountRaw;
163
+ const sparkNumAiRaw = args['spark-num-ai'] || process.env.AI_SPARK_NUM_AI || groupSizeRaw;
164
+ const sparkBudgetRaw = args['spark-budget'] || process.env.AI_SPARK_BUDGET || 'default';
65
165
  return {
66
166
  baseDir: path.resolve(args['base-dir'] || process.env.AI_BASE_DIR || path.join(__dirname, 'runtime_store')),
67
167
  gatewayHost: String(args['gateway-host'] || process.env.AI_GATEWAY_HOST || '127.0.0.1'),
@@ -69,7 +169,12 @@ const CONFIG = (() => {
69
169
  portStudy: Number(args['study-port'] || process.env.AI_STUDY_PORT || 5081),
70
170
  aiCount: Math.max(3, Number(aiCountRaw) || 7),
71
171
  groupCount: Math.max(1, Number(groupCountRaw) || 3),
72
- groupSize: 7,
172
+ // 兼容:若只传了 --ai-count,则默认 groupSize=aiCount
173
+ groupSize: Math.max(1, Number(groupSizeRaw) || (Number(aiCountRaw) || 7)),
174
+ // SparkArray:每组参与汇聚的实例数量(不超过该组实际 controller 数量)
175
+ sparkNumAI: Math.max(1, Number(sparkNumAiRaw) || (Number(groupSizeRaw) || (Number(aiCountRaw) || 7))),
176
+ // SparkArray:低精度预算(可传 preset:default/low/high,或 JSON 对象)
177
+ sparkBudget: sparkBudgetRaw,
73
178
  redisUrl: process.env.REDIS_URL || args['redis-url'] || 'redis://127.0.0.1:6379',
74
179
  redisChannel: args.channel || DEFAULT_CHANNEL,
75
180
  snapshotDir: args['snapshot-dir'] || path.join(__dirname, 'snapshots'),
@@ -77,11 +182,24 @@ const CONFIG = (() => {
77
182
  maxWorkers: Math.max(1, (os.cpus()?.length ?? 2) - 1),
78
183
  shardCache: path.join(__dirname, 'shards_cache.json'),
79
184
  lmdbRoot: path.join(args['lmdb-dir'] || process.env.LMDB_DIR || path.join(__dirname, 'lmdb')),
185
+ lmdbMapSizeBytes: Math.max(64, Number(lmdbMapMbRaw) || 512) * 1024 * 1024,
80
186
  searchEndpoint: args['search-endpoint'] || process.env.AI_SEARCH_ENDPOINT || '',
81
187
  robotsDir: path.resolve(args['robots-dir'] || process.env.AI_ROBOTS_DIR || path.join(__dirname, 'robots')),
82
188
  lemmaCsv: path.resolve(args['lemma-csv'] || process.env.AI_LEMMA_CSV || path.join(__dirname, 'lemma.csv')),
189
+ lemmaAutoload: boolFrom(args['lemma-autoload'] ?? process.env.AI_LEMMA_AUTOLOAD, false),
190
+ lemmaMaxBytes: Math.max(1, Number(lemmaMaxMbRaw) || 64) * 1024 * 1024,
191
+ lemmaForce: boolFrom(args['lemma-force'] ?? process.env.AI_LEMMA_FORCE, false),
83
192
  robotsWarmupLimit: Math.max(0, Number(robotsLimitRaw) || 0),
84
193
  robotsAutoload: boolFrom(args['robots-autoload'] ?? process.env.AI_ROBOTS_AUTOLOAD, true),
194
+ robotsWarmupShuffle: boolFrom(args['robots-warmup-shuffle'] ?? process.env.AI_ROBOTS_WARMUP_SHUFFLE, false),
195
+ robotsChunkMinWords: Math.max(1, Number(robotsChunkMinRaw) || 2),
196
+ robotsChunkMaxWords: Math.max(1, Number(robotsChunkMaxRaw) || 5),
197
+ kvmCacheMaxEntries: Math.max(0, Number(kvmCacheMaxRaw) || 0),
198
+ learningWarmup: boolFrom(args['learning-warmup'] ?? process.env.AI_LEARNING_WARMUP, false),
199
+ // 启动时是否把 serving 的 snapshot 同步到 standby/validation(可能很慢;默认关闭以保证 fast-boot)
200
+ syncStandbyOnBoot: boolFrom(args['sync-standby'] ?? process.env.AI_SYNC_STANDBY_ON_BOOT, false),
201
+ // tests 语料预加载(可能较慢;默认开启,fast-boot 可设为 false)
202
+ testsAutoload: boolFrom(args['tests-autoload'] ?? process.env.AI_TESTS_AUTOLOAD, true),
85
203
  // Feature toggles via CLI/env
86
204
  disableBarrier: boolFrom(args['disable-memebarrier'] ?? process.env.AI_DISABLE_MEMEBARRIER, false) === true,
87
205
  disableRL: boolFrom(args['disable-rl'] ?? process.env.AI_DISABLE_RL, false) === true,
@@ -99,7 +217,7 @@ ensureDir(CONFIG.robotsDir);
99
217
  const LMDB = safeRequire('lmdb');
100
218
 
101
219
  class LmdbStore {
102
- constructor({ name, rootDir, encodeJSON = true }) {
220
+ constructor({ name, rootDir, encodeJSON = true, mapSizeBytes } = {}) {
103
221
  this.name = name;
104
222
  this.rootDir = rootDir;
105
223
  this.encodeJSON = encodeJSON;
@@ -109,10 +227,13 @@ class LmdbStore {
109
227
  try {
110
228
  const envPath = path.join(rootDir, name);
111
229
  ensureDir(envPath);
230
+ const resolvedMapSize = Number.isFinite(Number(mapSizeBytes)) && Number(mapSizeBytes) > 0
231
+ ? Number(mapSizeBytes)
232
+ : 512 * 1024 * 1024;
112
233
  this.env = this.backend.open({
113
234
  path: envPath,
114
235
  maxReaders: 64,
115
- mapSize: 1024 * 1024 * 1024,
236
+ mapSize: resolvedMapSize,
116
237
  useWritemap: true,
117
238
  noSync: false
118
239
  });
@@ -352,10 +473,40 @@ const compileCustomFunctionSafely = (source, argNames, fallback) => {
352
473
  }
353
474
  };
354
475
 
476
+ /**
477
+ * 模型外参/超参(面向调参/评测;与 CLI/ENV 无关)
478
+ *
479
+ * 修改方式:
480
+ * - 运行时 patch:POST /api/model/params 传入 { key: value }
481
+ * - 恢复默认:POST /api/model/params/reset
482
+ * - 读取当前值:GET /api/model/params
483
+ *
484
+ * 逐项说明(默认值以此处为准):
485
+ * - iteration (5): 传播迭代步数;用于 RuntimeState.runPropagation()/exportGraphToFile() -> TensorEngine.iteratePropagation().
486
+ * - decayK (1): 传播衰减系数;传给 TensorEngine.iteratePropagation(csr, seeds, steps, actFn, decayK, damp).
487
+ *
488
+ * - memeNgramMin (3) / memeNgramMax (14): 构建“短语模因(ngram)”的长度范围;用于 mapWordsToMemes() 与 _buildMemeSequenceFromTokens().
489
+ * - minOverlapThreshold (2): tokenSet 与既有 meme 的最小重合词数;满足则“融合”到该 meme(link 词 -> meme)。
490
+ * - maxMemeWords (100): tokenSet 去重后的最大词数上限(用于限制短语模因的词集合大小)。
491
+ *
492
+ * - maliciousThreshold (0.7): MemeBarrier 判定阈值(网关侧安全屏障);也可通过 PATCH /api/runtime/features 调整。
493
+ *
494
+ * - activationType ('relu'): 激活函数类型;用于 _activation()。
495
+ * - 可用类型见 module.exports.BUILTIN_ACTIVATION_TYPES;当为 'custom' 时使用 activationCustom。
496
+ * - activationCustom (''): 自定义激活函数源码(function(x){...} 或表达式);仅 activationType='custom' 时生效。
497
+ *
498
+ * - transferType ('linear') / transferCustom (''):
499
+ * - 预留:目前本文件内未在主传播路径中调用(仅实现了 _transfer() 与 BuiltinTransfers)。
500
+ *
501
+ * - decayFactor (0.5), learningIterations (3), threshold (3), decay (1), maxLen (16), edgeWeight (1):
502
+ * - 预留:当前版本 main.cjs 中未发现显式读取点(可能供未来/外部实验使用)。
503
+ */
355
504
  const modelDefaults = {
356
505
  decayFactor: 0.5,
357
506
  maxMemeWords: 100,
358
507
  minOverlapThreshold: 2,
508
+ memeNgramMin: 3,
509
+ memeNgramMax: 14,
359
510
  maliciousThreshold: 0.7,
360
511
  learningIterations: 3,
361
512
  iteration: 5,
@@ -367,7 +518,12 @@ const modelDefaults = {
367
518
  activationType: 'relu',
368
519
  transferType: 'linear',
369
520
  activationCustom: '',
370
- transferCustom: ''
521
+ transferCustom: '',
522
+ // 多次映射/镜面反射层(文明演算法思想):words -> memes -> words -> memes ...
523
+ mappingDepth: 1,
524
+ reflectionTopMemes: 18,
525
+ reflectionTopWords: 24,
526
+ reflectionMinScore: 1e-6
371
527
  };
372
528
 
373
529
  const hashString = (str) => {
@@ -389,7 +545,7 @@ const tokenize = (text) => {
389
545
  if (!part) {
390
546
  continue;
391
547
  }
392
- if (/^[a-z0-9_\-]+$/.test(part) && STOP_WORDS.includes(part)) {
548
+ if (/^[a-z0-9_\-]+$/.test(part) && getStopWords().includes(part)) {
393
549
  continue;
394
550
  }
395
551
  tokens.push(part);
@@ -397,21 +553,388 @@ const tokenize = (text) => {
397
553
  return tokens;
398
554
  };
399
555
 
556
/**
 * Cut free text into short, space-joined token chunks.
 *
 * The text is first split coarsely on line breaks and sentence-final
 * punctuation; each coarse unit is tokenized with tokenize() and its tokens
 * are grouped into runs of at most `maxWords`. A trailing run shorter than
 * `minWords` is appended to the previous chunk (so that chunk may exceed
 * `maxWords`); units with fewer than `minWords` tokens are dropped.
 *
 * @param {string} text - Input text; falsy values yield an empty list.
 * @param {object} [options]
 * @param {number} [options.minWords=2] - Minimum tokens per chunk.
 * @param {number} [options.maxWords=10] - Maximum tokens per chunk before merging.
 * @param {number} [options.maxUnits=12] - Maximum number of chunks returned.
 * @returns {string[]} Up to `maxUnits` chunks, tokens joined by single spaces.
 */
const splitSentences = (text, { minWords = 2, maxWords = 10, maxUnits = 12 } = {}) => {
  const raw = String(text || '').trim();
  if (!raw) {
    return [];
  }

  // Invalid or non-positive option values fall back to the historical constants.
  const toCount = (value, fallback) => {
    const n = Math.floor(Number(value));
    return n >= 1 ? n : fallback;
  };
  const minLen = toCount(minWords, 2);
  const maxLen = Math.max(minLen, toCount(maxWords, 10));
  const unitCap = toCount(maxUnits, 12);

  // Coarse pass: line breaks and sentence-final punctuation are natural boundaries.
  const rough = raw
    .split(/[\r\n]+|(?<=[。!?!?])\s*/g)
    .map((s) => String(s || '').trim())
    .filter(Boolean);

  const out = [];
  for (const unit of rough) {
    const tokens = tokenize(unit);
    if (tokens.length < minLen) {
      continue;
    }

    for (let i = 0; i < tokens.length; i += maxLen) {
      const chunk = tokens.slice(i, i + maxLen);
      if (chunk.length < minLen) {
        // Tail too short to stand alone: fold it into the previous chunk.
        if (out.length) {
          out[out.length - 1] = `${out[out.length - 1]} ${chunk.join(' ')}`.trim();
        }
        continue;
      }
      out.push(chunk.join(' '));
      if (out.length >= unitCap) {
        return out;
      }
    }
  }

  return out.slice(0, unitCap);
};
603
+
604
/**
 * Collect weighted surface phrases from a text.
 *
 * Candidates are emitted in this order, de-duplicated by exact string:
 *   1. chunks from splitSentences(text)            (weight 1)
 *   2. single tokens from tokenize(text)           (weight 2)
 *   3. n-grams of 2..5 tokens                      (weight 3, 2, then 1)
 * Phrases shorter than 2 or longer than 160 characters are ignored.
 *
 * @param {string} text - Source text.
 * @param {object} [options]
 * @param {number} [options.maxPhrases=24] - Upper bound on returned entries.
 * @returns {{phrase: string, weight: number}[]} At most `maxPhrases` entries.
 */
const extractSurfacePhrases = (text, { maxPhrases = 24 } = {}) => {
  const collected = [];
  const known = new Set();

  const add = (candidate, weight = 1) => {
    const phrase = String(candidate || '').trim();
    const acceptable = phrase.length >= 2 && phrase.length <= 160 && !known.has(phrase);
    if (!acceptable) {
      return;
    }
    known.add(phrase);
    collected.push({ phrase, weight });
  };
  const isFull = () => collected.length >= maxPhrases;
  const capped = () => collected.slice(0, maxPhrases);

  // Chunk-level phrases are only a weak feature.
  splitSentences(text).forEach((sentence) => add(sentence, 1));

  const tokens = tokenize(text);
  if (tokens.length === 0) {
    return capped();
  }

  // Unigrams.
  for (const token of tokens.slice(0, maxPhrases)) {
    add(token, 2);
    if (isFull()) {
      return capped();
    }
  }

  // N-grams: bigrams weigh most, trigrams next, longer runs least.
  const gramWeights = { 2: 3, 3: 2 };
  const longest = Math.min(5, tokens.length);
  for (let size = 2; size <= longest; size += 1) {
    const weight = gramWeights[size] ?? 1;
    for (let start = 0; start + size <= tokens.length; start += 1) {
      add(tokens.slice(start, start + size).join(' '), weight);
      if (isFull()) {
        return capped();
      }
    }
  }

  return capped();
};
650
+
651
/**
 * Per-meme table of weighted surface phrases, persisted in a key/value store.
 *
 * Records live under the key `m:<memeId>` and have the shape
 * `{ phrases: { [phrase]: score }, updatedAt }`. The store only needs to
 * provide `get(key)`, `put(key, value)` and `entries(prefix)`.
 */
class MemeSurfaceLexicon {
  /**
   * @param {object} store - Backing store (get / put / entries).
   * @param {object} [options]
   * @param {number} [options.maxEntriesPerMeme=64] - Phrases kept per meme (minimum 8).
   * @param {number} [options.decay=0.985] - Factor applied to existing scores on every learn().
   */
  constructor(store, {
    maxEntriesPerMeme = 64,
    decay = 0.985
  } = {}) {
    this.store = store;
    this.maxEntriesPerMeme = Math.max(8, Number(maxEntriesPerMeme) || 64);
    this.decay = Number.isFinite(Number(decay)) ? Number(decay) : 0.985;
  }

  /** Store key for a meme record. */
  _key(memeId) {
    return `m:${String(memeId)}`;
  }

  /** Read a meme record, substituting an empty one for missing or malformed data. */
  _load(memeId) {
    const raw = this.store.get(this._key(memeId));
    if (!raw || typeof raw !== 'object') {
      return { phrases: {}, updatedAt: 0 };
    }
    const phrases = raw.phrases && typeof raw.phrases === 'object' ? raw.phrases : {};
    return { phrases, updatedAt: Number(raw.updatedAt || 0) || 0 };
  }

  /** Persist a meme record. */
  _save(memeId, rec) {
    this.store.put(this._key(memeId), rec);
  }

  /**
   * Fold the phrases extracted from `replyText` into the record of `memeId`.
   *
   * Existing scores are decayed first (entries at or below 1e-6 are dropped),
   * then each extracted phrase gains `phrase.weight * weight`, and finally
   * only the `maxEntriesPerMeme` highest-scoring phrases are kept.
   *
   * @param {*} memeId - Meme identifier; falsy values are ignored.
   * @param {string} replyText - Text passed to extractSurfacePhrases().
   * @param {object} [options]
   * @param {number} [options.weight=1] - Multiplier for the extracted weights.
   */
  learn(memeId, replyText, { weight = 1 } = {}) {
    if (!memeId) return;
    const w = Number.isFinite(Number(weight)) ? Number(weight) : 1;
    const rec = this._load(memeId);
    const next = { phrases: { ...rec.phrases }, updatedAt: Date.now() };

    // Light decay so that early noise does not dominate forever.
    for (const [k, v] of Object.entries(next.phrases)) {
      const nv = (Number(v) || 0) * this.decay;
      if (nv <= 1e-6) {
        delete next.phrases[k];
      } else {
        next.phrases[k] = nv;
      }
    }

    const phrases = extractSurfacePhrases(replyText, { maxPhrases: 24 });
    for (const p of phrases) {
      next.phrases[p.phrase] = (Number(next.phrases[p.phrase]) || 0) + (p.weight * w);
    }

    // Trim to the top-N phrases by score.
    const ordered = Object.entries(next.phrases)
      .sort((a, b) => (Number(b[1]) || 0) - (Number(a[1]) || 0))
      .slice(0, this.maxEntriesPerMeme);
    next.phrases = Object.fromEntries(ordered);
    this._save(memeId, next);
  }

  /**
   * Highest-scoring phrases of a meme.
   *
   * @param {*} memeId
   * @param {object} [options]
   * @param {number} [options.limit=6] - Number of phrases (at least 1).
   * @returns {{phrase: string, score: number}[]} Sorted by descending score.
   */
  getTop(memeId, { limit = 6 } = {}) {
    const rec = this._load(memeId);
    const ordered = Object.entries(rec.phrases || {})
      .sort((a, b) => (Number(b[1]) || 0) - (Number(a[1]) || 0))
      .slice(0, Math.max(1, Number(limit) || 6))
      .map(([phrase, score]) => ({ phrase, score: Number(score) || 0 }));
    return ordered;
  }

  /**
   * Dump up to `limitMemes` raw `[key, value]` records.
   *
   * A limit of zero or less yields an empty list; a non-numeric limit falls
   * back to 512 instead of disabling the cap.
   *
   * @param {object} [options]
   * @param {number} [options.limitMemes=512]
   * @returns {Array<[string, *]>}
   */
  exportSnapshot({ limitMemes = 512 } = {}) {
    const requested = Number(limitMemes);
    const cap = Number.isNaN(requested) ? 512 : Math.max(0, Math.ceil(requested));
    const out = [];
    if (cap === 0) {
      return out;
    }
    const items = this.store.entries('m:');
    for (const [key, value] of items) {
      out.push([key, value]);
      if (out.length >= cap) break;
    }
    return out;
  }

  /**
   * Restore records produced by exportSnapshot(). Items that are not
   * two-element arrays, or whose key does not start with `m:`, are skipped.
   *
   * @param {Array<[string, *]>} entries
   */
  importSnapshot(entries) {
    if (!Array.isArray(entries)) return;
    for (const item of entries) {
      if (!Array.isArray(item) || item.length !== 2) continue;
      const [key, value] = item;
      if (typeof key !== 'string' || !key.startsWith('m:')) continue;
      this.store.put(key, value);
    }
  }
}
736
+
737
/**
 * Jaccard similarity |A ∩ B| / |A ∪ B| of two collections.
 *
 * Sets are used as given, arrays are converted to sets, and any other value
 * counts as the empty set. Two empty inputs are considered identical (1);
 * exactly one empty input gives 0.
 *
 * @param {Set|Array|*} a
 * @param {Set|Array|*} b
 * @returns {number} Similarity in [0, 1].
 */
const jaccard = (a, b) => {
  const toSet = (value) => (value instanceof Set ? value : new Set(Array.isArray(value) ? value : []));
  const left = toSet(a);
  const right = toSet(b);

  if (left.size === 0 || right.size === 0) {
    return left.size === 0 && right.size === 0 ? 1 : 0;
  }

  const shared = [...left].reduce((count, item) => (right.has(item) ? count + 1 : count), 0);
  const union = left.size + right.size - shared;
  return union <= 0 ? 0 : shared / union;
};
747
+
748
/**
 * Store-backed memory of past dialog turns, retrievable by meme overlap.
 *
 * Two key families are written to the store:
 *   - `d:<id>`     dialog record ({ id, signature, memes, question, reply, ... })
 *   - `i:<memeId>` list of dialog ids that mention the meme, newest first
 * The store only needs to provide `get(key)`, `put(key, value)` and
 * `entries(prefix)`.
 */
class DialogMemory {
  /**
   * @param {object} store - Backing store (get / put / entries).
   * @param {object} [options]
   * @param {number} [options.maxItems=2048] - Intended dialog cap (minimum 128).
   * @param {number} [options.maxPerIndex=64] - Dialog ids kept per meme index (minimum 8).
   */
  constructor(store, {
    maxItems = 2048,
    maxPerIndex = 64
  } = {}) {
    this.store = store;
    // NOTE(review): maxItems is recorded here but nothing in this class evicts
    // dialogs once the cap is exceeded — confirm whether eviction happens elsewhere.
    this.maxItems = Math.max(128, Number(maxItems) || 2048);
    this.maxPerIndex = Math.max(8, Number(maxPerIndex) || 64);
  }

  /** Store key of a dialog record. */
  _kDialog(id) {
    return `d:${String(id)}`;
  }

  /** Store key of a meme -> dialog ids index. */
  _kIndex(memeId) {
    return `i:${String(memeId)}`;
  }

  /** Derive a dialog id from its signature via hashString(). */
  _makeId(signature) {
    return hashString(String(signature || ''));
  }

  /** Read an index list with a single store access; non-arrays count as empty. */
  _readIndex(memeId) {
    const stored = this.store.get(this._kIndex(memeId));
    return Array.isArray(stored) ? stored : [];
  }

  /** Collect up to `cap` raw [key, value] pairs under a key prefix. */
  _collect(prefix, cap) {
    const out = [];
    if (cap <= 0) {
      return out;
    }
    for (const [key, value] of this.store.entries(prefix)) {
      out.push([key, value]);
      if (out.length >= cap) break;
    }
    return out;
  }

  /**
   * Insert or refresh the dialog identified by `signature` and index it under
   * each of its memes.
   *
   * `count` is incremented when the signature was seen before. `scoreHint`
   * is taken from the call when it is a finite number; when it is omitted
   * (or not numeric) the previously stored hint is kept, defaulting to 0.
   *
   * @param {object} [entry]
   * @param {string} entry.signature - Identity of the dialog; required.
   * @param {Array} [entry.memes=[]] - Meme ids (first 32 are kept).
   * @param {string} [entry.question=''] - Stored truncated to 800 characters.
   * @param {string} [entry.reply=''] - Required; stored truncated to 1200 characters.
   * @param {number} [entry.scoreHint] - Optional score hint.
   * @returns {object|null} The stored record, or null when signature or reply is empty.
   */
  remember({ signature, memes = [], question = '', reply = '', scoreHint } = {}) {
    const sig = String(signature || '').trim();
    const rep = String(reply || '').trim();
    if (!sig || !rep) return null;
    const id = this._makeId(sig);
    const key = this._kDialog(id);
    const prev = this.store.get(key);
    const hint = Number(scoreHint);
    const hasHint = scoreHint != null && Number.isFinite(hint);
    const next = {
      id,
      signature: sig,
      memes: Array.isArray(memes) ? memes.slice(0, 32) : [],
      question: String(question || '').slice(0, 800),
      reply: rep.slice(0, 1200),
      updatedAt: Date.now(),
      count: (prev && Number(prev.count)) ? (Number(prev.count) + 1) : 1,
      scoreHint: hasHint ? hint : (prev?.scoreHint ?? 0)
    };
    this.store.put(key, next);

    // Index: memeId -> dialog ids, most recent first, capped at maxPerIndex.
    const uniq = Array.from(new Set(next.memes));
    for (const memeId of uniq) {
      const filtered = this._readIndex(memeId).filter((x) => x && x !== id);
      filtered.unshift(id);
      this.store.put(this._kIndex(memeId), filtered.slice(0, this.maxPerIndex));
    }

    return next;
  }

  /**
   * Find the best stored dialog for a query.
   *
   * Candidates come from the indexes of the first 8 memes; when none match,
   * the exact signature id is tried. Each candidate is scored as
   * `sim * (1 + 0.15 * ln(1 + count))`, where `sim` is the Jaccard similarity
   * of the `|`-separated signature parts, and candidates below `minSim` are
   * discarded.
   *
   * @param {object} [query]
   * @param {Array} [query.memes=[]]
   * @param {string} [query.signature='']
   * @param {number} [query.minSim=0.45]
   * @returns {object|null} Best record extended with `similarity` and `score`, or null.
   */
  retrieve({ memes = [], signature = '', minSim = 0.45 } = {}) {
    const memeList = Array.isArray(memes) ? memes.slice(0, 24) : [];
    const sigSet = new Set(String(signature || '').split('|').filter(Boolean));
    const candidateIds = new Set();
    for (const memeId of memeList.slice(0, 8)) {
      for (const id of this._readIndex(memeId)) {
        if (id) candidateIds.add(id);
      }
    }

    // No index hit: fall back to an exact signature lookup.
    if (candidateIds.size === 0 && signature) {
      candidateIds.add(this._makeId(signature));
    }

    let best = null;
    let bestScore = 0;
    for (const id of candidateIds) {
      const rec = this.store.get(this._kDialog(id));
      if (!rec || !rec.reply) continue;
      const recSet = new Set(String(rec.signature || '').split('|').filter(Boolean));
      const sim = jaccard(sigSet, recSet);
      if (sim < minSim) continue;
      const freq = Math.log(1 + (Number(rec.count) || 0));
      const score = sim * (1 + 0.15 * freq);
      if (score > bestScore) {
        bestScore = score;
        best = { ...rec, similarity: sim, score };
      }
    }
    return best;
  }

  /**
   * Dump up to `limit` dialog records and up to `limit` index records.
   *
   * A limit of zero or less yields empty lists; a non-numeric limit falls
   * back to 512 instead of disabling the cap.
   *
   * @param {object} [options]
   * @param {number} [options.limit=512]
   * @returns {{dialogs: Array<[string, *]>, indexes: Array<[string, *]>}}
   */
  exportSnapshot({ limit = 512 } = {}) {
    const requested = Number(limit);
    const cap = Number.isNaN(requested) ? 512 : Math.max(0, Math.ceil(requested));
    return {
      dialogs: this._collect('d:', cap),
      indexes: this._collect('i:', cap)
    };
  }

  /**
   * Restore a snapshot produced by exportSnapshot(). Items that are not
   * two-element arrays, or whose key lacks the expected prefix, are skipped.
   *
   * @param {{dialogs?: Array, indexes?: Array}} snapshot
   */
  importSnapshot(snapshot) {
    if (!snapshot || typeof snapshot !== 'object') return;
    for (const item of Array.isArray(snapshot.dialogs) ? snapshot.dialogs : []) {
      if (!Array.isArray(item) || item.length !== 2) continue;
      const [key, value] = item;
      if (typeof key === 'string' && key.startsWith('d:')) this.store.put(key, value);
    }
    for (const item of Array.isArray(snapshot.indexes) ? snapshot.indexes : []) {
      if (!Array.isArray(item) || item.length !== 2) continue;
      const [key, value] = item;
      if (typeof key === 'string' && key.startsWith('i:')) this.store.put(key, value);
    }
  }
}
866
+
400
867
  class RobotsCorpus {
401
- constructor({ dir, lemmaCsv }) {
868
+ constructor({
869
+ dir,
870
+ lemmaCsv,
871
+ lemmaAutoload = false,
872
+ lemmaMaxBytes,
873
+ lemmaForce = false,
874
+ chunkMinWords = 2,
875
+ chunkMaxWords = 5
876
+ } = {}) {
402
877
  this.dir = dir;
403
878
  this.lemmaCsv = lemmaCsv;
404
- this.lemmaMap = this._loadLemmaMap();
879
+ this.lemmaAutoload = Boolean(lemmaAutoload);
880
+ this.lemmaForce = Boolean(lemmaForce);
881
+ this.lemmaMaxBytes = Number.isFinite(Number(lemmaMaxBytes)) && Number(lemmaMaxBytes) > 0
882
+ ? Number(lemmaMaxBytes)
883
+ : 64 * 1024 * 1024;
884
+ this.chunkMinWords = Math.max(1, Number(chunkMinWords) || 2);
885
+ this.chunkMaxWords = Math.max(this.chunkMinWords, Number(chunkMaxWords) || 5);
886
+ this._lemmaLoaded = false;
887
+ this.lemmaMap = new Map();
405
888
  this.maxArticleSize = 5_000_000;
406
889
  this.minParagraphLength = 12;
407
890
  }
408
891
 
892
+ _chunkTokens(tokens) {
893
+ const out = [];
894
+ if (!Array.isArray(tokens) || tokens.length === 0) {
895
+ return out;
896
+ }
897
+ const maxN = Math.max(1, this.chunkMaxWords);
898
+ const minN = Math.max(1, this.chunkMinWords);
899
+ for (let i = 0; i < tokens.length; i += maxN) {
900
+ const chunk = tokens.slice(i, i + maxN);
901
+ if (chunk.length >= minN) {
902
+ out.push(chunk);
903
+ }
904
+ }
905
+ return out;
906
+ }
907
+
908
+ _ensureLemmaMapLoaded() {
909
+ if (this._lemmaLoaded) {
910
+ return;
911
+ }
912
+ this._lemmaLoaded = true;
913
+ if (!this.lemmaAutoload && !this.lemmaForce) {
914
+ return;
915
+ }
916
+ this.lemmaMap = this._loadLemmaMap();
917
+ }
918
+
409
919
  _loadLemmaMap() {
410
920
  const map = new Map();
921
+ const csvParse = getCsvParse();
411
922
  if (!csvParse || !this.lemmaCsv || !fs.existsSync(this.lemmaCsv)) {
412
923
  return map;
413
924
  }
414
925
  try {
926
+ try {
927
+ const st = fs.statSync(this.lemmaCsv);
928
+ if (!this.lemmaForce && st && Number.isFinite(st.size) && st.size > this.lemmaMaxBytes) {
929
+ console.warn(
930
+ `[RobotsCorpus] lemma.csv too large (${Math.round(st.size / 1024 / 1024)}MB), skip autoload. ` +
931
+ `Set AI_LEMMA_FORCE=true or increase AI_LEMMA_MAX_MB to load.`
932
+ );
933
+ return map;
934
+ }
935
+ } catch (_e) {
936
+ // ignore stat failure
937
+ }
415
938
  const csvContent = fs.readFileSync(this.lemmaCsv, 'utf8');
416
939
  const rows = csvParse.parse(csvContent, { skip_empty_lines: true, relax_column_count: true });
417
940
  for (const row of rows) {
@@ -437,6 +960,7 @@ class RobotsCorpus {
437
960
  }
438
961
 
439
962
  lemmatize(word) {
963
+ this._ensureLemmaMapLoaded();
440
964
  const lower = String(word || '').toLowerCase();
441
965
  return this.lemmaMap.get(lower) || lower;
442
966
  }
@@ -466,11 +990,21 @@ class RobotsCorpus {
466
990
  _readFile(file) {
467
991
  const full = path.join(this.dir, file);
468
992
  try {
469
- let content = fs.readFileSync(full, 'utf8');
470
- if (content.length > this.maxArticleSize) {
471
- content = content.slice(0, this.maxArticleSize);
993
+ // 关键优化:避免对超大语料文件做一次性 readFileSync(会把整个文件读进内存)
994
+ // 这里仅读取前 maxArticleSize 字节(近似等价于之前的 slice 行为)。
995
+ const fd = fs.openSync(full, 'r');
996
+ try {
997
+ const maxBytes = Math.max(1, Number(this.maxArticleSize) || 1);
998
+ const buf = Buffer.allocUnsafe(maxBytes);
999
+ const bytesRead = fs.readSync(fd, buf, 0, maxBytes, 0);
1000
+ return buf.toString('utf8', 0, Math.max(0, bytesRead || 0));
1001
+ } finally {
1002
+ try {
1003
+ fs.closeSync(fd);
1004
+ } catch (_e) {
1005
+ // ignore close failure
1006
+ }
472
1007
  }
473
- return content;
474
1008
  } catch (err) {
475
1009
  console.warn(`[RobotsCorpus] Failed to read ${full}:`, err.message);
476
1010
  return '';
@@ -510,30 +1044,38 @@ class RobotsCorpus {
510
1044
  for (const paragraph of paragraphs) {
511
1045
  const trimmed = paragraph.trim();
512
1046
  if (trimmed.length < this.minParagraphLength) {
513
- localIndex += 1;
514
- continue;
515
- }
516
- if (skip > 0) {
517
- skip -= 1;
518
- localIndex += 1;
519
1047
  continue;
520
1048
  }
521
- const tokens = this.normalizeWords(tokenize(trimmed));
522
- if (!tokens.length) {
1049
+
1050
+ // 新逻辑:按句子切分,再按 2-5 词(可配置)切块生成 doc
1051
+ // 这样每条 doc 更接近“短语/局部词共现”,利于词表/模因边的细粒度学习。
1052
+ // 注意:splitSentences() 已改为输出“2-10 词”的短片段;这里不再做二次切块。
1053
+ const units = splitSentences(trimmed);
1054
+ for (const unit of units) {
1055
+ const unitText = String(unit || '').trim();
1056
+ if (unitText.length < this.minParagraphLength) {
1057
+ continue;
1058
+ }
1059
+ const normalizedTokens = this.normalizeWords(tokenize(unitText));
1060
+ if (!normalizedTokens.length) {
1061
+ continue;
1062
+ }
1063
+ if (skip > 0) {
1064
+ skip -= 1;
1065
+ continue;
1066
+ }
1067
+ docs.push({
1068
+ id: `robots:${file}#${localIndex}`,
1069
+ file,
1070
+ source: `robots:${file}`,
1071
+ index: localIndex,
1072
+ text: unitText,
1073
+ tokens: normalizedTokens
1074
+ });
523
1075
  localIndex += 1;
524
- continue;
525
- }
526
- docs.push({
527
- id: `robots:${file}#${localIndex}`,
528
- file,
529
- source: `robots:${file}`,
530
- index: localIndex,
531
- text: trimmed,
532
- tokens
533
- });
534
- localIndex += 1;
535
- if (maxDocs !== null && docs.length >= maxDocs) {
536
- return shuffle && docs.length > 1 ? this._shuffle(docs) : docs;
1076
+ if (maxDocs !== null && docs.length >= maxDocs) {
1077
+ return shuffle && docs.length > 1 ? this._shuffle(docs) : docs;
1078
+ }
537
1079
  }
538
1080
  }
539
1081
  }
@@ -546,9 +1088,41 @@ class RobotsCorpus {
546
1088
  }
547
1089
 
548
1090
  class KVMStore {
549
- constructor(store) {
1091
+ constructor(store, { maxCacheEntries } = {}) {
550
1092
  this.store = store;
551
1093
  this.cache = new Map();
1094
+ this.maxCacheEntries = Number.isFinite(Number(maxCacheEntries)) && Number(maxCacheEntries) >= 0
1095
+ ? Number(maxCacheEntries)
1096
+ : 50_000;
1097
+ }
1098
+
1099
+ _cacheGet(key) {
1100
+ if (!this.maxCacheEntries) {
1101
+ return null;
1102
+ }
1103
+ if (!this.cache.has(key)) {
1104
+ return null;
1105
+ }
1106
+ const value = this.cache.get(key);
1107
+ // LRU: bump to most-recent
1108
+ this.cache.delete(key);
1109
+ this.cache.set(key, value);
1110
+ return value;
1111
+ }
1112
+
1113
+ _cacheSet(key, value) {
1114
+ if (!this.maxCacheEntries) {
1115
+ return;
1116
+ }
1117
+ if (this.cache.has(key)) {
1118
+ this.cache.delete(key);
1119
+ }
1120
+ this.cache.set(key, value);
1121
+ while (this.cache.size > this.maxCacheEntries) {
1122
+ const oldest = this.cache.keys().next().value;
1123
+ if (oldest === undefined) break;
1124
+ this.cache.delete(oldest);
1125
+ }
552
1126
  }
553
1127
 
554
1128
  _key(type, value) {
@@ -557,23 +1131,21 @@ class KVMStore {
557
1131
 
558
1132
  getWordMemeSet(word) {
559
1133
  const key = this._key('word', word);
560
- if (this.cache.has(key)) {
561
- return this.cache.get(key);
562
- }
1134
+ const cached = this._cacheGet(key);
1135
+ if (cached) return cached;
563
1136
  const value = this.store.get(key) || [];
564
1137
  const set = new Set(value);
565
- this.cache.set(key, set);
1138
+ this._cacheSet(key, set);
566
1139
  return set;
567
1140
  }
568
1141
 
569
1142
  getMemeWords(memeId) {
570
1143
  const key = this._key('meme', memeId);
571
- if (this.cache.has(key)) {
572
- return this.cache.get(key);
573
- }
1144
+ const cached = this._cacheGet(key);
1145
+ if (cached) return cached;
574
1146
  const value = this.store.get(key) || [];
575
1147
  const set = new Set(value);
576
- this.cache.set(key, set);
1148
+ this._cacheSet(key, set);
577
1149
  return set;
578
1150
  }
579
1151
 
@@ -585,12 +1157,12 @@ class KVMStore {
585
1157
  if (!wordSet.has(memeId)) {
586
1158
  wordSet.add(memeId);
587
1159
  this.store.put(wordKey, Array.from(wordSet));
588
- this.cache.set(wordKey, wordSet);
1160
+ this._cacheSet(wordKey, wordSet);
589
1161
  }
590
1162
  if (!memeSet.has(word)) {
591
1163
  memeSet.add(word);
592
1164
  this.store.put(memeKey, Array.from(memeSet));
593
- this.cache.set(memeKey, memeSet);
1165
+ this._cacheSet(memeKey, memeSet);
594
1166
  }
595
1167
  }
596
1168
 
@@ -626,36 +1198,80 @@ class CSRMatrix {
626
1198
  }
627
1199
 
628
1200
  class MemeGraph {
629
- constructor(store) {
1201
+ constructor(store, { eagerLoad = false } = {}) {
630
1202
  this.store = store;
631
1203
  this.nodes = new Map();
632
1204
  this.meta = new Map();
633
1205
  this.windowSize = 4096;
634
- this._load();
1206
+ this._fullyLoaded = false;
1207
+ if (eagerLoad) {
1208
+ this._loadAllFromStore();
1209
+ }
635
1210
  }
636
1211
 
637
- _load() {
1212
+ _loadAllFromStore() {
1213
+ if (this._fullyLoaded) {
1214
+ return;
1215
+ }
638
1216
  const entries = this.store.entries('node:');
639
1217
  for (const [key, value] of entries) {
640
1218
  const memeId = key.slice(5);
641
- this.meta.set(memeId, value || {});
1219
+ if (!this.meta.has(memeId)) {
1220
+ this.meta.set(memeId, value || {});
1221
+ }
642
1222
  }
643
1223
  const rowEntries = this.store.entries('row:');
644
1224
  for (const [key, row] of rowEntries) {
645
1225
  const memeId = key.slice(4);
1226
+ if (this.nodes.has(memeId)) {
1227
+ continue;
1228
+ }
646
1229
  if (!row || !Array.isArray(row.neighbors)) {
647
1230
  continue;
648
1231
  }
649
1232
  const map = new Map();
650
1233
  for (const { to, weight, direction } of row.neighbors) {
651
- map.set(to, { weight, direction });
1234
+ map.set(String(to), { weight, direction });
652
1235
  }
653
1236
  this.nodes.set(memeId, map);
654
1237
  }
1238
+ this._fullyLoaded = true;
1239
+ }
1240
+
1241
+ _ensureRowLoaded(memeId) {
1242
+ const id = String(memeId);
1243
+ if (this.nodes.has(id)) {
1244
+ return this.nodes.get(id);
1245
+ }
1246
+ const row = this.store.get(`row:${id}`);
1247
+ if (row && Array.isArray(row.neighbors)) {
1248
+ const map = new Map();
1249
+ for (const { to, weight, direction } of row.neighbors) {
1250
+ map.set(String(to), { weight, direction });
1251
+ }
1252
+ this.nodes.set(id, map);
1253
+ return map;
1254
+ }
1255
+ return null;
1256
+ }
1257
+
1258
+ _ensureMetaLoaded(memeId) {
1259
+ const id = String(memeId);
1260
+ if (this.meta.has(id)) {
1261
+ return this.meta.get(id);
1262
+ }
1263
+ const meta = this.store.get(`node:${id}`);
1264
+ if (meta && typeof meta === 'object') {
1265
+ this.meta.set(id, meta);
1266
+ return meta;
1267
+ }
1268
+ return null;
655
1269
  }
656
1270
 
657
1271
  // 返回当前图中所有模因节点的ID列表,供扫描器等模块使用
658
1272
  getAllPoints() {
1273
+ // 需要全量枚举时才进行全量加载,避免启动时扫描整个 store。
1274
+ this._loadAllFromStore();
659
1275
  return Array.from(this.meta.keys());
660
1276
  }
661
1277
 
@@ -668,11 +1284,26 @@ class MemeGraph {
668
1284
  }
669
1285
 
670
1286
  ensureNode(memeId) {
671
- if (!this.nodes.has(memeId)) {
672
- this.nodes.set(memeId, new Map());
673
- this.meta.set(memeId, { degree: 0, lastTouched: Date.now() });
674
- this.store.put(`node:${memeId}`, this.meta.get(memeId));
675
- this._persistNode(memeId);
1287
+ const id = String(memeId);
1288
+ if (!this.nodes.has(id)) {
1289
+ const loaded = this._ensureRowLoaded(id);
1290
+ if (!loaded) {
1291
+ this.nodes.set(id, new Map());
1292
+ }
1293
+ }
1294
+ if (!this.meta.has(id)) {
1295
+ const loadedMeta = this._ensureMetaLoaded(id);
1296
+ if (!loadedMeta) {
1297
+ this.meta.set(id, { degree: 0, lastTouched: Date.now() });
1298
+ this.store.put(`node:${id}`, this.meta.get(id));
1299
+ }
1300
+ }
1301
+ if (!this.nodes.get(id)) {
1302
+ this.nodes.set(id, new Map());
1303
+ }
1304
+ // 若节点此前不存在于 store,确保 row 持久化
1305
+ if (!this.store.get(`row:${id}`)) {
1306
+ this._persistNode(id);
676
1307
  }
677
1308
  }
678
1309
 
@@ -725,7 +1356,7 @@ class MemeGraph {
725
1356
  while (border.length && visited.size < this.windowSize && depth < radius) {
726
1357
  const next = [];
727
1358
  for (const id of border) {
728
- const table = this.nodes.get(id);
1359
+ const table = this.nodes.get(id) || this._ensureRowLoaded(id);
729
1360
  if (!table) {
730
1361
  continue;
731
1362
  }
@@ -746,7 +1377,7 @@ class MemeGraph {
746
1377
  const weights = [];
747
1378
  for (let i = 0; i < ids.length; i++) {
748
1379
  const id = ids[i];
749
- const table = this.nodes.get(id) || new Map();
1380
+ const table = this.nodes.get(id) || this._ensureRowLoaded(id) || new Map();
750
1381
  rowPtr[i] = edges.length;
751
1382
  for (const [toId, { weight }] of table.entries()) {
752
1383
  if (!index.has(toId)) {
@@ -768,6 +1399,8 @@ class MemeGraph {
768
1399
  }
769
1400
 
770
1401
  exportSnapshot() {
1402
+ // 导出需要全量内容;若尚未加载则在此时执行全量加载。
1403
+ this._loadAllFromStore();
771
1404
  const nodes = [];
772
1405
  for (const [id, table] of this.nodes.entries()) {
773
1406
  nodes.push({
@@ -796,6 +1429,8 @@ class MemeGraph {
796
1429
  }
797
1430
 
798
1431
  removeNode(memeId) {
1432
+ // 删除需要一致性:先全量加载再删,避免遗漏未加载节点中的反向边。
1433
+ this._loadAllFromStore();
799
1434
  if (!this.nodes.has(memeId)) {
800
1435
  return false;
801
1436
  }
@@ -1071,6 +1706,7 @@ class DimReducer {
1071
1706
  }
1072
1707
 
1073
1708
  project2D(emb, method = 'auto') {
1709
+ const umap = getUmap();
1074
1710
  if (method === 'umap' && umap) {
1075
1711
  const dataset = [];
1076
1712
  for (let row = 0; row < emb.nRows; row++) {
@@ -1182,7 +1818,7 @@ class OnlineResearcher {
1182
1818
  maxPdfBytes: clampInt(crawlReq?.maxPdfBytes, 64 * 1024, 40 * 1024 * 1024, 20 * 1024 * 1024),
1183
1819
  userAgent: typeof crawlReq?.userAgent === 'string' && crawlReq.userAgent.trim() ? crawlReq.userAgent.trim() : '079ProjectCrawler/1.0'
1184
1820
  };
1185
- const crawler = new SiteCrawler({ axios, cheerio, pdfParse });
1821
+ const crawler = new SiteCrawler({ axios: getAxios(), cheerio: getCheerio(), pdfParse: getPdfParse() });
1186
1822
  const result = await crawler.crawl(startUrl, crawlOptions);
1187
1823
  if (!options?.forceRemote) {
1188
1824
  const key = this._normalize(tokenize(`crawl ${startUrl}`));
@@ -1204,6 +1840,7 @@ class OnlineResearcher {
1204
1840
  }
1205
1841
 
1206
1842
  let payload = null;
1843
+ const axios = getAxios();
1207
1844
  if (axios && this.endpoint && !options.skipRemote) {
1208
1845
  try {
1209
1846
  const resp = await axios.get(this.endpoint, {
@@ -1502,14 +2139,6 @@ class SessionManager {
1502
2139
  this.idleMs = idleMs;
1503
2140
  this.maxSessions = maxSessions;
1504
2141
  this.active = new Map();
1505
- this._load();
1506
- }
1507
-
1508
- _load() {
1509
- const entries = this.store.entries('session:');
1510
- for (const [key, value] of entries) {
1511
- this.active.set(key.slice(8), value);
1512
- }
1513
2142
  }
1514
2143
 
1515
2144
  _save(sessionId) {
@@ -1524,12 +2153,28 @@ class SessionManager {
1524
2153
  }
1525
2154
 
1526
2155
  ensure(sessionId) {
1527
- if (sessionId && this.active.has(sessionId)) {
1528
- const data = this.active.get(sessionId);
1529
- data.lastActivity = Date.now();
1530
- data.count = (data.count || 0) + 1;
1531
- this._save(sessionId);
1532
- return sessionId;
2156
+ if (sessionId) {
2157
+ const sid = String(sessionId);
2158
+ if (this.active.has(sid)) {
2159
+ const data = this.active.get(sid);
2160
+ data.lastActivity = Date.now();
2161
+ data.count = (data.count || 0) + 1;
2162
+ this._save(sid);
2163
+ return sid;
2164
+ }
2165
+ // 懒加载:仅在客户端携带 sessionId 时才从 store 读取
2166
+ const stored = this.store.get(`session:${sid}`);
2167
+ if (stored && typeof stored === 'object') {
2168
+ const data = { ...stored, id: stored.id || sid };
2169
+ data.lastActivity = Date.now();
2170
+ data.count = (data.count || 0) + 1;
2171
+ this.active.set(sid, data);
2172
+ this._save(sid);
2173
+ if (this.active.size > this.maxSessions) {
2174
+ this._truncate();
2175
+ }
2176
+ return sid;
2177
+ }
1533
2178
  }
1534
2179
  const id = this._newId();
1535
2180
  this.active.set(id, { id, createdAt: Date.now(), lastActivity: Date.now(), count: 1, meta: {} });
@@ -1591,7 +2236,7 @@ class SnapshotManager {
1591
2236
  }
1592
2237
  const raw = JSON.parse(fs.readFileSync(file, 'utf8'));
1593
2238
  await this.runtime.fromSnapshot(raw);
1594
- return true;
2239
+ return raw;
1595
2240
  }
1596
2241
 
1597
2242
  delete(id) {
@@ -1647,16 +2292,28 @@ class GraphExportBuilder {
1647
2292
 
1648
2293
  class RuntimeState {
1649
2294
  constructor({ kvmStore, memeStore, sessionStore, params, config }) {
1650
- this.kvm = new KVMStore(kvmStore);
2295
+ this.config = { ...(config || {}) };
2296
+ this.config.robotsDir = this.config.robotsDir || path.join(__dirname, 'robots');
2297
+ this.config.lemmaCsv = this.config.lemmaCsv || path.join(__dirname, 'lemma.csv');
2298
+ this.config.lemmaAutoload = this.config.lemmaAutoload ?? false;
2299
+ this.config.lemmaMaxBytes = this.config.lemmaMaxBytes ?? (64 * 1024 * 1024);
2300
+ this.config.lemmaForce = this.config.lemmaForce ?? false;
2301
+ this.config.kvmCacheMaxEntries = this.config.kvmCacheMaxEntries ?? 50_000;
2302
+ this.config.robotsChunkMinWords = this.config.robotsChunkMinWords ?? 2;
2303
+ this.config.robotsChunkMaxWords = this.config.robotsChunkMaxWords ?? 5;
2304
+
2305
+ this.kvm = new KVMStore(kvmStore, { maxCacheEntries: this.config.kvmCacheMaxEntries });
1651
2306
  this.graph = new MemeGraph(memeStore);
1652
2307
  this.sessions = new SessionManager(sessionStore);
2308
+ // “反接”层:从模因层回到用户可读表达(短语/句子)
2309
+ this.surfaceStore = new NamespacedStore(sessionStore, 'surface');
2310
+ this.dialogStore = new NamespacedStore(sessionStore, 'dialog');
2311
+ this.surfaceLexicon = new MemeSurfaceLexicon(this.surfaceStore);
2312
+ this.dialogMemory = new DialogMemory(this.dialogStore);
1653
2313
  this.tensor = new TensorEngine();
1654
2314
  this.pattern = new PatternMatrix(this);
1655
2315
  this.params = { ...modelDefaults, ...(params || {}) };
1656
2316
  this.metrics = { requests: 0, lastLatency: 0, updatedAt: Date.now() };
1657
- this.config = { ...(config || {}) };
1658
- this.config.robotsDir = this.config.robotsDir || path.join(__dirname, 'robots');
1659
- this.config.lemmaCsv = this.config.lemmaCsv || path.join(__dirname, 'lemma.csv');
1660
2317
  // 在线搜索配置:支持运行时开关与 endpoint 库
1661
2318
  this.config.search = {
1662
2319
  enabled: this.config.search?.enabled ?? true,
@@ -1740,8 +2397,14 @@ class RuntimeState {
1740
2397
  }
1741
2398
 
1742
2399
  mapWordsToMemes(words) {
2400
+ // 细化:输入不仅映射到“单词模因”,还会生成/融合“多词模因(短语模因)”。
2401
+ const tokens = Array.isArray(words) ? words.map((w) => String(w || '').trim()).filter(Boolean) : [];
1743
2402
  const memeStrength = new Map();
1744
- for (const word of words) {
2403
+ const maxUnits = 128;
2404
+ let units = 0;
2405
+
2406
+ // 1) unigram:保持兼容
2407
+ for (const word of tokens) {
1745
2408
  const memes = this.kvm.getWordMemeSet(word);
1746
2409
  if (!memes || memes.size === 0) {
1747
2410
  const memeId = `meme_${hashString(word)}`;
@@ -1754,9 +2417,143 @@ class RuntimeState {
1754
2417
  memeStrength.set(memeId, (memeStrength.get(memeId) ?? 0) + 1);
1755
2418
  }
1756
2419
  }
2420
+
2421
+ // 2) phrase/word-structure meme:以 ngram 作为概念单元,模因仍为“多个词的集合”
2422
+ const nMin = Math.max(2, Number(this.params.memeNgramMin ?? 2) || 2);
2423
+ const nMax = Math.max(nMin, Number(this.params.memeNgramMax ?? 4) || 4);
2424
+ const minOverlap = Math.max(1, Number(this.params.minOverlapThreshold ?? 2) || 2);
2425
+ const maxWordSet = Math.max(4, Number(this.params.maxMemeWords ?? 100) || 100);
2426
+
2427
+ const resolveOrCreateMemeForTokenSet = (tokenSet) => {
2428
+ const uniq = Array.from(new Set(tokenSet.map((x) => String(x || '').trim()).filter(Boolean))).slice(0, maxWordSet);
2429
+ if (uniq.length <= 1) {
2430
+ const w = uniq[0];
2431
+ return w ? `meme_${hashString(w)}` : null;
2432
+ }
2433
+ // 统计每个候选 meme 与 tokenSet 的重合词数
2434
+ const counts = new Map();
2435
+ for (const w of uniq) {
2436
+ const memes = this.kvm.getWordMemeSet(w);
2437
+ if (!memes || memes.size === 0) {
2438
+ continue;
2439
+ }
2440
+ for (const mid of memes) {
2441
+ counts.set(mid, (counts.get(mid) ?? 0) + 1);
2442
+ }
2443
+ }
2444
+ let best = null;
2445
+ let bestOverlap = 0;
2446
+ for (const [mid, c] of counts.entries()) {
2447
+ if (c > bestOverlap) {
2448
+ bestOverlap = c;
2449
+ best = mid;
2450
+ }
2451
+ }
2452
+ if (best && bestOverlap >= minOverlap) {
2453
+ // “融合”:把当前 tokenSet 的词也挂到 best meme 上
2454
+ for (const w of uniq) {
2455
+ this.kvm.link(w, best);
2456
+ }
2457
+ this.graph.ensureNode(best);
2458
+ return best;
2459
+ }
2460
+ // 新建短语模因:ID 由“词集合”决定,保证稳定
2461
+ const sorted = uniq.slice().sort();
2462
+ const memeId = `meme_p_${hashString(sorted.join('|'))}`;
2463
+ this.graph.ensureNode(memeId);
2464
+ for (const w of sorted) {
2465
+ this.kvm.link(w, memeId);
2466
+ }
2467
+ return memeId;
2468
+ };
2469
+
2470
+ // 生成 ngram 单元并映射到 meme,权重按长度提升
2471
+ for (let i = 0; i < tokens.length && units < maxUnits; i++) {
2472
+ for (let n = nMin; n <= nMax && units < maxUnits; n++) {
2473
+ if (i + n > tokens.length) {
2474
+ break;
2475
+ }
2476
+ const gram = tokens.slice(i, i + n);
2477
+ const memeId = resolveOrCreateMemeForTokenSet(gram);
2478
+ if (!memeId) {
2479
+ continue;
2480
+ }
2481
+ const w = 1 + 0.5 * (n - 1);
2482
+ memeStrength.set(memeId, (memeStrength.get(memeId) ?? 0) + w);
2483
+ units += 1;
2484
+ }
2485
+ }
2486
+
1757
2487
  return memeStrength;
1758
2488
  }
1759
2489
 
2490
+ _buildMemeSequenceFromTokens(tokens) {
2491
+ const list = Array.isArray(tokens) ? tokens.map((t) => String(t || '').trim()).filter(Boolean) : [];
2492
+ const nMin = Math.max(2, Number(this.params.memeNgramMin ?? 2) || 2);
2493
+ const nMax = Math.max(nMin, Number(this.params.memeNgramMax ?? 4) || 4);
2494
+ const minOverlap = Math.max(1, Number(this.params.minOverlapThreshold ?? 2) || 2);
2495
+ const maxWordSet = Math.max(4, Number(this.params.maxMemeWords ?? 100) || 100);
2496
+
2497
+ const resolveOrCreate = (tokenSet) => {
2498
+ const uniq = Array.from(new Set(tokenSet.map((x) => String(x || '').trim()).filter(Boolean))).slice(0, maxWordSet);
2499
+ if (uniq.length <= 1) {
2500
+ const w = uniq[0];
2501
+ return w ? `meme_${hashString(w)}` : null;
2502
+ }
2503
+ const counts = new Map();
2504
+ for (const w of uniq) {
2505
+ const memes = this.kvm.getWordMemeSet(w);
2506
+ if (!memes || memes.size === 0) continue;
2507
+ for (const mid of memes) counts.set(mid, (counts.get(mid) ?? 0) + 1);
2508
+ }
2509
+ let best = null;
2510
+ let bestOverlap = 0;
2511
+ for (const [mid, c] of counts.entries()) {
2512
+ if (c > bestOverlap) {
2513
+ bestOverlap = c;
2514
+ best = mid;
2515
+ }
2516
+ }
2517
+ if (best && bestOverlap >= minOverlap) {
2518
+ for (const w of uniq) this.kvm.link(w, best);
2519
+ this.graph.ensureNode(best);
2520
+ return best;
2521
+ }
2522
+ const sorted = uniq.slice().sort();
2523
+ const memeId = `meme_p_${hashString(sorted.join('|'))}`;
2524
+ this.graph.ensureNode(memeId);
2525
+ for (const w of sorted) this.kvm.link(w, memeId);
2526
+ return memeId;
2527
+ };
2528
+
2529
+ const seq = [];
2530
+ for (let i = 0; i < list.length; i++) {
2531
+ // 用更长的 ngram 优先,减少“句子级”颗粒
2532
+ let picked = null;
2533
+ for (let n = nMax; n >= nMin; n--) {
2534
+ if (i + n > list.length) continue;
2535
+ picked = resolveOrCreate(list.slice(i, i + n));
2536
+ if (picked) {
2537
+ break;
2538
+ }
2539
+ }
2540
+ if (!picked) {
2541
+ const w = list[i];
2542
+ picked = w ? `meme_${hashString(w)}` : null;
2543
+ if (picked) {
2544
+ this.graph.ensureNode(picked);
2545
+ this.kvm.link(w, picked);
2546
+ }
2547
+ }
2548
+ if (picked) {
2549
+ if (seq.length === 0 || seq[seq.length - 1] !== picked) {
2550
+ seq.push(picked);
2551
+ }
2552
+ }
2553
+ }
2554
+ return seq;
2555
+ }
2556
+
1760
2557
  _buildSeedVector(windowInfo, seeds) {
1761
2558
  const vec = new Float32Array(windowInfo.ids.length);
1762
2559
  for (const [memeId, strength] of seeds.entries()) {
@@ -1768,16 +2565,101 @@ class RuntimeState {
1768
2565
  return vec;
1769
2566
  }
1770
2567
 
1771
- runPropagation(seeds) {
1772
- const windowInfo = this.graph.buildWindow(Array.from(seeds.keys()), 2);
2568
+ runPropagation(seeds, options = {}) {
2569
+ const radiusRaw = options.radius ?? options.windowRadius;
2570
+ const radius = Math.max(1, Math.min(6, Number(radiusRaw ?? 2) || 2));
2571
+ const windowInfo = this.graph.buildWindow(Array.from(seeds.keys()), radius);
1773
2572
  const seedVector = this._buildSeedVector(windowInfo, seeds);
1774
2573
  const act = this._activation();
1775
- const output = this.tensor.iteratePropagation(windowInfo.csr, seedVector, this.params.iteration || 5, act, this.params.decayK, 0.02);
2574
+ const iteration = Math.max(1, Number(options.iteration ?? this.params.iteration ?? 5) || 5);
2575
+ const output = this.tensor.iteratePropagation(windowInfo.csr, seedVector, iteration, act, this.params.decayK, 0.02);
1776
2576
  this.pattern.rebuild(windowInfo);
1777
2577
  return { windowInfo, seedVector, activation: output };
1778
2578
  }
1779
2579
 
2580
+ _pickTopActivatedMemes(result, seeds, { limit = 18, minScore = 1e-6 } = {}) {
2581
+ const { windowInfo, activation } = result || {};
2582
+ if (!windowInfo || !Array.isArray(windowInfo.ids) || !activation) return [];
2583
+ const seedIds = new Set(seeds ? Array.from(seeds.keys()) : []);
2584
+
2585
+ const isConnectedToSeeds = (memeId) => {
2586
+ if (seedIds.size === 0 || seedIds.has(memeId)) {
2587
+ return true;
2588
+ }
2589
+ const table = this.graph.nodes.get(memeId);
2590
+ if (table) {
2591
+ for (const neighborId of table.keys()) {
2592
+ if (seedIds.has(neighborId)) {
2593
+ return true;
2594
+ }
2595
+ }
2596
+ }
2597
+ for (const seedId of seedIds) {
2598
+ const seedTable = this.graph.nodes.get(seedId);
2599
+ if (seedTable && seedTable.has(memeId)) {
2600
+ return true;
2601
+ }
2602
+ }
2603
+ return false;
2604
+ };
2605
+
2606
+ const scored = [];
2607
+ for (let i = 0; i < windowInfo.ids.length; i++) {
2608
+ const memeId = windowInfo.ids[i];
2609
+ const score = activation[i];
2610
+ if (!Number.isFinite(score) || score <= minScore) continue;
2611
+ if (!isConnectedToSeeds(memeId)) continue;
2612
+ scored.push({ memeId, score });
2613
+ }
2614
+ scored.sort((a, b) => b.score - a.score);
2615
+ return scored.slice(0, Math.max(1, Number(limit) || 18));
2616
+ }
2617
+
2618
+ _makeSignatureFromTopMemes(topMemes, { limit = 12 } = {}) {
2619
+ const ids = (Array.isArray(topMemes) ? topMemes : [])
2620
+ .map((x) => (typeof x === 'string' ? x : x?.memeId))
2621
+ .filter(Boolean)
2622
+ .slice(0, Math.max(3, Number(limit) || 12));
2623
+ // signature 用 memeId 列表,排序保证稳定
2624
+ const uniq = Array.from(new Set(ids));
2625
+ uniq.sort();
2626
+ return uniq.join('|');
2627
+ }
2628
+
1780
2629
  composeReply(result, words, seeds) {
2630
+ const topMemes = this._pickTopActivatedMemes(result, seeds, { limit: 18 });
2631
+ const signature = this._makeSignatureFromTopMemes(topMemes, { limit: 12 });
2632
+
2633
+ // 1) 优先:对话记忆检索(更像“在训练集中找答案”)
2634
+ const memoryHit = this.dialogMemory.retrieve({
2635
+ memes: topMemes.map((x) => x.memeId),
2636
+ signature,
2637
+ minSim: 0.45
2638
+ });
2639
+ if (memoryHit && typeof memoryHit.reply === 'string' && memoryHit.reply.trim()) {
2640
+ return memoryHit.reply.trim();
2641
+ }
2642
+
2643
+ // 2) 其次:模因 -> 表层表达(短语/句子)反接
2644
+ const phraseScores = new Map();
2645
+ for (const item of topMemes) {
2646
+ const list = this.surfaceLexicon.getTop(item.memeId, { limit: 4 });
2647
+ for (const c of list) {
2648
+ const p = String(c.phrase || '').trim();
2649
+ if (!p) continue;
2650
+ const prev = phraseScores.get(p) ?? 0;
2651
+ // meme 激活分数做门控,词典分数做权重
2652
+ phraseScores.set(p, prev + (Math.max(0, item.score) * (0.5 + Math.max(0, c.score))));
2653
+ }
2654
+ }
2655
+ const phraseOrdered = Array.from(phraseScores.entries())
2656
+ .sort((a, b) => b[1] - a[1])
2657
+ .map(([p]) => p);
2658
+ if (phraseOrdered.length) {
2659
+ return phraseOrdered.slice(0, 2).join('。');
2660
+ }
2661
+
2662
+ // 3) 回退:旧逻辑(meme -> words)
1781
2663
  const { windowInfo, activation } = result;
1782
2664
  const seedIds = new Set(seeds ? Array.from(seeds.keys()) : []);
1783
2665
  const baseWords = Array.from(new Set((words || []).map((w) => String(w).trim()).filter(Boolean)));
@@ -1857,6 +2739,48 @@ class RuntimeState {
1857
2739
  return finalWords.slice(0, 30).join(' ');
1858
2740
  }
1859
2741
 
2742
+ learnFromDialog({ payload, result } = {}) {
2743
+ try {
2744
+ const question = typeof payload?.text === 'string'
2745
+ ? payload.text
2746
+ : (Array.isArray(payload?.tokens) ? payload.tokens.join(' ') : '');
2747
+ const reply = typeof result?.reply === 'string' ? result.reply : '';
2748
+ if (!question.trim() || !reply.trim()) {
2749
+ return { ok: false, reason: 'missing-text' };
2750
+ }
2751
+
2752
+ // 以当前推理结果为主:topMemes -> signature
2753
+ const seeds = Array.isArray(result?.seeds)
2754
+ ? new Map(result.seeds.map((pair) => [pair[0], pair[1]]))
2755
+ : this.mapWordsToMemes(tokenize(question));
2756
+ const windowInfo = Array.isArray(result?.memes) ? { ids: result.memes } : null;
2757
+ const activation = Array.isArray(result?.activation) ? Float32Array.from(result.activation) : null;
2758
+ const resObj = (windowInfo && activation) ? { windowInfo, activation } : this.runPropagation(seeds);
2759
+
2760
+ const topMemes = this._pickTopActivatedMemes(resObj, seeds, { limit: 18 });
2761
+ const signature = this._makeSignatureFromTopMemes(topMemes, { limit: 12 });
2762
+ const memeIds = topMemes.map((x) => x.memeId);
2763
+
2764
+ // 学习:每个高激活 meme 绑定 reply 的表层表达
2765
+ for (const item of topMemes.slice(0, 10)) {
2766
+ const w = Math.max(0.5, Math.min(3, item.score));
2767
+ this.surfaceLexicon.learn(item.memeId, reply, { weight: w });
2768
+ }
2769
+
2770
+ // 学习:签名级别的“对话记忆”(检索更直接)
2771
+ this.dialogMemory.remember({
2772
+ signature,
2773
+ memes: memeIds,
2774
+ question,
2775
+ reply,
2776
+ scoreHint: topMemes[0]?.score ?? 0
2777
+ });
2778
+ return { ok: true, memes: memeIds.length, signatureLen: signature ? signature.split('|').length : 0 };
2779
+ } catch (err) {
2780
+ return { ok: false, error: err.message };
2781
+ }
2782
+ }
2783
+
1860
2784
  processInput(payload) {
1861
2785
  const started = Date.now();
1862
2786
  const text = payload.text != null ? payload.text : (payload.message != null ? String(payload.message) : '');
@@ -1866,8 +2790,44 @@ class RuntimeState {
1866
2790
  : Array.isArray(payload.vocab) && payload.vocab.length ? payload.vocab
1867
2791
  : null;
1868
2792
  const words = tokensFromPayload ? tokensFromPayload.map((w) => String(w)) : tokenize(text);
1869
- const seeds = this.mapWordsToMemes(words);
1870
- const result = this.runPropagation(seeds);
2793
+
2794
+ const budget = payload && typeof payload === 'object' ? payload.budget : null;
2795
+ const depth = Math.max(1, Number(budget?.mappingDepth ?? budget?.depth ?? this.params.mappingDepth ?? 1) || 1);
2796
+ const topMemesK = Math.max(3, Number(budget?.reflectionTopMemes ?? budget?.topMemes ?? this.params.reflectionTopMemes ?? 18) || 18);
2797
+ const topWordsK = Math.max(3, Number(budget?.reflectionTopWords ?? budget?.topWords ?? this.params.reflectionTopWords ?? 24) || 24);
2798
+ const minScoreRaw = budget?.reflectionMinScore ?? budget?.minScore ?? this.params.reflectionMinScore;
2799
+ const minScore = Number.isFinite(Number(minScoreRaw)) ? Number(minScoreRaw) : 1e-6;
2800
+ const iterRaw = budget?.iteration;
2801
+ const iteration = Math.max(1, Number(iterRaw ?? this.params.iteration ?? 5) || 5);
2802
+ const radiusRaw = budget?.radius ?? budget?.windowRadius;
2803
+ const radius = Math.max(1, Math.min(6, Number(radiusRaw ?? 2) || 2));
2804
+
2805
+ let seeds = this.mapWordsToMemes(words);
2806
+ let result = null;
2807
+ if (depth > 1) {
2808
+ for (let hop = 1; hop < depth; hop++) {
2809
+ result = this.runPropagation(seeds, { iteration, radius });
2810
+ const topMemes = this._pickTopActivatedMemes(result, seeds, { limit: topMemesK, minScore });
2811
+ const wordScore = new Map();
2812
+ for (const m of topMemes) {
2813
+ const linked = this.kvm.getMemeWords(m.memeId);
2814
+ if (!linked) continue;
2815
+ for (const w of linked) {
2816
+ const ww = String(w || '').trim();
2817
+ if (!ww) continue;
2818
+ const prev = wordScore.get(ww) ?? 0;
2819
+ wordScore.set(ww, Math.max(prev, Number(m.score) || 0));
2820
+ }
2821
+ }
2822
+ const expanded = Array.from(wordScore.entries())
2823
+ .sort((a, b) => b[1] - a[1])
2824
+ .slice(0, topWordsK)
2825
+ .map(([w]) => w);
2826
+ const merged = Array.from(new Set([...(words || []).slice(0, 64), ...expanded]));
2827
+ seeds = this.mapWordsToMemes(merged);
2828
+ }
2829
+ }
2830
+ result = result || this.runPropagation(seeds, { iteration, radius });
1871
2831
  const reply = this.composeReply(result, words, seeds);
1872
2832
  const latency = Date.now() - started;
1873
2833
  this.metrics.requests += 1;
@@ -1895,7 +2855,8 @@ class RuntimeState {
1895
2855
  }
1896
2856
 
1897
2857
  const seeds = this.mapWordsToMemes(tokens);
1898
- const memeIds = Array.from(seeds.keys());
2858
+ // 使用有序“词结构/短语结构”序列建边(从句子级细化到词/短语级)
2859
+ const memeIds = this._buildMemeSequenceFromTokens(tokens);
1899
2860
  for (let i = 0; i < memeIds.length - 1; i++) {
1900
2861
  this.graph.link(memeIds[i], memeIds[i + 1], 1, 0);
1901
2862
  }
@@ -1925,7 +2886,12 @@ class RuntimeState {
1925
2886
  if (!this.robotsCorpus) {
1926
2887
  this.robotsCorpus = new RobotsCorpus({
1927
2888
  dir: this.config.robotsDir,
1928
- lemmaCsv: this.config.lemmaCsv
2889
+ lemmaCsv: this.config.lemmaCsv,
2890
+ lemmaAutoload: this.config.lemmaAutoload,
2891
+ lemmaMaxBytes: this.config.lemmaMaxBytes,
2892
+ lemmaForce: this.config.lemmaForce,
2893
+ chunkMinWords: this.config.robotsChunkMinWords,
2894
+ chunkMaxWords: this.config.robotsChunkMaxWords
1929
2895
  });
1930
2896
  }
1931
2897
  return this.robotsCorpus;
@@ -1981,7 +2947,9 @@ class RuntimeState {
1981
2947
  params: this.params,
1982
2948
  graph: this.graph.exportSnapshot(),
1983
2949
  sessions: this.sessions.export(),
1984
- kvm: this.kvm.exportEntries()
2950
+ kvm: this.kvm.exportEntries(),
2951
+ surface: this.surfaceLexicon ? this.surfaceLexicon.exportSnapshot({ limitMemes: 512 }) : null,
2952
+ dialog: this.dialogMemory ? this.dialogMemory.exportSnapshot({ limit: 512 }) : null
1985
2953
  };
1986
2954
  }
1987
2955
 
@@ -1999,6 +2967,12 @@ class RuntimeState {
1999
2967
  }
2000
2968
  }
2001
2969
  }
2970
+ if (snapshot.surface && this.surfaceLexicon) {
2971
+ this.surfaceLexicon.importSnapshot(snapshot.surface);
2972
+ }
2973
+ if (snapshot.dialog && this.dialogMemory) {
2974
+ this.dialogMemory.importSnapshot(snapshot.dialog);
2975
+ }
2002
2976
  }
2003
2977
 
2004
2978
  // 将当前窗口或指定种子集合导出为 Go 侧 Graph 结构并写入文件
@@ -2266,11 +3240,19 @@ class StudyEngine {
2266
3240
  this.running = false;
2267
3241
  this.queue = [];
2268
3242
  this.metrics = { enqueued: 0, processed: 0, lastTickAt: 0, lastError: null };
3243
+ this.poolWorker = null;
3244
+ }
3245
+
3246
+ _ensureWorkerPool() {
3247
+ if (this.poolWorker) {
3248
+ return this.poolWorker;
3249
+ }
2269
3250
  this.poolWorker = workerpool.pool(CONFIG.workerFile, {
2270
3251
  minWorkers: 1,
2271
3252
  maxWorkers: CONFIG.maxWorkers,
2272
3253
  workerType: 'process'
2273
3254
  });
3255
+ return this.poolWorker;
2274
3256
  }
2275
3257
 
2276
3258
  start() {
@@ -2301,7 +3283,8 @@ class StudyEngine {
2301
3283
  try {
2302
3284
  const text = String(doc?.text || '');
2303
3285
  const tokens = tokenize(text);
2304
- await this.poolWorker.exec('batchLemmatize', [[tokens], this.pool.getActive().runtime?.config?.lemmaCsv]);
3286
+ const wp = this._ensureWorkerPool();
3287
+ await wp.exec('batchLemmatize', [[tokens], this.pool.getActive().runtime?.config?.lemmaCsv]);
2305
3288
  } catch (_e) {
2306
3289
  // ignore
2307
3290
  }
@@ -2701,6 +3684,66 @@ class PersonaForestAverager {
2701
3684
  }
2702
3685
  }
2703
3686
 
3687
+ const normalizeBudget = (raw) => {
3688
+ if (raw === undefined || raw === null || raw === '' || raw === false) {
3689
+ return null;
3690
+ }
3691
+ if (typeof raw === 'string') {
3692
+ const s = raw.trim();
3693
+ const lowered = s.toLowerCase();
3694
+ if (lowered === 'default' || lowered === 'balanced' || lowered === 'medium' || lowered === 'none') {
3695
+ return null;
3696
+ }
3697
+ if (lowered === 'low' || lowered === 'fast') {
3698
+ return { iteration: 3, reflectionTopMemes: 12, reflectionTopWords: 16 };
3699
+ }
3700
+ if (lowered === 'high' || lowered === 'slow' || lowered === 'quality') {
3701
+ return { iteration: 7, reflectionTopMemes: 22, reflectionTopWords: 32 };
3702
+ }
3703
+ if (s.startsWith('{') && s.endsWith('}')) {
3704
+ try {
3705
+ return normalizeBudget(JSON.parse(s));
3706
+ } catch (_e) {
3707
+ return null;
3708
+ }
3709
+ }
3710
+ return null;
3711
+ }
3712
+
3713
+ if (typeof raw !== 'object') {
3714
+ return null;
3715
+ }
3716
+ const out = {};
3717
+ const pickNum = (key, ...aliases) => {
3718
+ const v = raw[key];
3719
+ if (Number.isFinite(Number(v))) {
3720
+ out[key] = Number(v);
3721
+ return;
3722
+ }
3723
+ for (const a of aliases) {
3724
+ const av = raw[a];
3725
+ if (Number.isFinite(Number(av))) {
3726
+ out[key] = Number(av);
3727
+ return;
3728
+ }
3729
+ }
3730
+ };
3731
+ pickNum('mappingDepth', 'depth');
3732
+ pickNum('iteration', 'iters');
3733
+ pickNum('reflectionTopMemes', 'topMemes');
3734
+ pickNum('reflectionTopWords', 'topWords');
3735
+ pickNum('reflectionMinScore', 'minScore');
3736
+ pickNum('radius', 'windowRadius');
3737
+ return Object.keys(out).length ? out : null;
3738
+ };
3739
+
3740
+ const mergeBudgets = (base, override) => {
3741
+ if (!base && !override) return null;
3742
+ if (!base) return override;
3743
+ if (!override) return base;
3744
+ return { ...base, ...override };
3745
+ };
3746
+
2704
3747
  class SparkArray {
2705
3748
  /**
2706
3749
  * @param {ControllerPool} pool
@@ -2716,9 +3759,10 @@ class SparkArray {
2716
3759
  const available = typeof pool?.listControllersInGroup === 'function'
2717
3760
  ? pool.listControllersInGroup(this.groupId)
2718
3761
  : (typeof pool?.listControllerNames === 'function' ? pool.listControllerNames() : Object.keys(pool?.controllers || {}));
2719
- const wanted = 7; // 每个工作组固定 7 AI
3762
+ const wantedRaw = options.numAI ?? options.groupSize ?? CONFIG.sparkNumAI ?? CONFIG.groupSize ?? 7;
3763
+ const wanted = Math.max(1, Math.round(Number(wantedRaw) || 7));
2720
3764
  const numAI = Math.max(1, Math.min(available.length || wanted, wanted));
2721
- // 组内小 SparkArray:固定 7 个 AI(不足则截断)
3765
+ // 组内小 SparkArray:默认 numAI 个 AI(不足则截断)
2722
3766
  this.layers = Array.from({ length: numAI }, (_, i) => ({
2723
3767
  name: `${this.groupId}:a${i + 1}`,
2724
3768
  controllers: [available[i]],
@@ -2729,6 +3773,7 @@ class SparkArray {
2729
3773
  this.layers = options.layers.map((layer) => ({ strategy: 'max', ...layer }));
2730
3774
  }
2731
3775
  this.personaForest = new PersonaForestAverager(options.personaForest || {});
3776
+ this.defaultBudget = normalizeBudget(options.budget ?? CONFIG.sparkBudget);
2732
3777
  this.history = [];
2733
3778
  }
2734
3779
 
@@ -2751,9 +3796,15 @@ class SparkArray {
2751
3796
  async dispatch(payload, options = {}) {
2752
3797
  const requestEmbedding = textToMiniEmbedding(payload.text || '', 64);
2753
3798
  const variants = buildVariants(payload.text || '', options.perturbations || 0);
2754
- const layers = options.multiLayer === false ? [this.layers[0]] : this.layers;
3799
+ let layers = options.multiLayer === false ? [this.layers[0]] : this.layers;
3800
+ if (Number.isFinite(Number(options.numAI)) && Number(options.numAI) > 0) {
3801
+ const cap = Math.max(1, Math.floor(Number(options.numAI)));
3802
+ layers = layers.slice(0, cap);
3803
+ }
2755
3804
  const layerResults = [];
2756
3805
 
3806
+ const budget = mergeBudgets(this.defaultBudget, normalizeBudget(options.budget ?? payload?.budget));
3807
+
2757
3808
  for (const layer of layers) {
2758
3809
  const controllers = [];
2759
3810
  for (const controllerSpec of layer.controllers) {
@@ -2773,7 +3824,7 @@ class SparkArray {
2773
3824
  const weightedText = weight <= 1
2774
3825
  ? String(payload.text || '')
2775
3826
  : Array.from({ length: weight }, () => String(payload.text || '')).join(' ');
2776
- baseResult = await ctrl.respond({ ...payload, text: weightedText });
3827
+ baseResult = await ctrl.respond({ ...payload, text: weightedText, ...(budget ? { budget } : {}) });
2777
3828
  } catch (err) {
2778
3829
  controllers.push({
2779
3830
  controller: controllerName,
@@ -2786,7 +3837,7 @@ class SparkArray {
2786
3837
  const variantResults = [];
2787
3838
  for (const variant of variants) {
2788
3839
  try {
2789
- const vr = await ctrl.respond({ ...payload, text: variant });
3840
+ const vr = await ctrl.respond({ ...payload, text: variant, ...(budget ? { budget } : {}) });
2790
3841
  variantResults.push({
2791
3842
  text: variant,
2792
3843
  response: vr,
@@ -2855,7 +3906,7 @@ class SparkArray {
2855
3906
  }
2856
3907
  }
2857
3908
  }
2858
-
3909
+
2859
3910
  // 随机森林式“中途平均/投票”:在不改变对外结构的前提下,优先选择更稳定且共识更强的回复
2860
3911
  try {
2861
3912
  const picked = this.personaForest.pick({
@@ -3160,7 +4211,7 @@ class ReinforcementLearner {
3160
4211
  this.improvementThreshold = improvementThreshold;
3161
4212
  this.history = [];
3162
4213
  // 统一使用上方安全引用的 Matrix(可能为 null)
3163
- this.Matrix = Matrix;
4214
+ this.Matrix = getMatrix();
3164
4215
  this.kmeans = safeRequire('ml-kmeans');
3165
4216
  this.numeric = safeRequire('numeric');
3166
4217
  }
@@ -3353,7 +4404,7 @@ class AdversarialLearner {
3353
4404
  this.benchLimit = benchLimit;
3354
4405
  this.rng = safeRequire('seedrandom') ? safeRequire('seedrandom')('phoenix-adv') : Math.random;
3355
4406
  this.history = [];
3356
- this.Matrix = safeRequire('ml-matrix');
4407
+ this.Matrix = getMatrix();
3357
4408
  }
3358
4409
 
3359
4410
  _perturbTokens(tokens) {
@@ -3491,8 +4542,8 @@ class GatewayServer {
3491
4542
  this.redisSync = redisSync;
3492
4543
  this.study = study;
3493
4544
  this.spark = sparkArray || new SparkArray(pool, shardManager);
3494
- this.rl = learners.rl || new ReinforcementLearner(this.pool, {});
3495
- this.adv = learners.adv || new AdversarialLearner(this.pool, {});
4545
+ this.rl = learners.rl || null;
4546
+ this.adv = learners.adv || null;
3496
4547
  this.rlDisabled = false;
3497
4548
  this.advDisabled = false;
3498
4549
  this.dialogLearningEnabled = true;
@@ -3556,6 +4607,18 @@ class GatewayServer {
3556
4607
  this._setupRoutes();
3557
4608
  }
3558
4609
 
4610
+ _ensureRL() {
4611
+ if (this.rl) return this.rl;
4612
+ this.rl = new ReinforcementLearner(this.pool, { testsDir: path.join(__dirname, 'tests') });
4613
+ return this.rl;
4614
+ }
4615
+
4616
+ _ensureADV() {
4617
+ if (this.adv) return this.adv;
4618
+ this.adv = new AdversarialLearner(this.pool, {});
4619
+ return this.adv;
4620
+ }
4621
+
3559
4622
  _setupAuthMiddleware() {
3560
4623
  let jwt;
3561
4624
  try {
@@ -3767,14 +4830,14 @@ class GatewayServer {
3767
4830
  return;
3768
4831
  }
3769
4832
  const cycles = Number(req.body?.cycles ?? 3) || 3;
3770
- const out = await this.rl.learn(cycles);
4833
+ const out = await this._ensureRL().learn(cycles);
3771
4834
  res.json({ ok: true, result: out });
3772
4835
  } catch (err) {
3773
4836
  res.status(500).json({ ok: false, error: err.message });
3774
4837
  }
3775
4838
  });
3776
4839
  this.app.get('/api/learn/reinforce/latest', (req, res) => {
3777
- res.json({ ok: true, latest: this.rl.latest() });
4840
+ res.json({ ok: true, latest: this.rl ? this.rl.latest() : null });
3778
4841
  });
3779
4842
  // Adversarial Learning endpoints
3780
4843
  this.app.post('/api/learn/adversarial', async (req, res) => {
@@ -3788,14 +4851,14 @@ class GatewayServer {
3788
4851
  res.status(400).json({ ok: false, error: 'samples required' });
3789
4852
  return;
3790
4853
  }
3791
- const out = await this.adv.attackAndDefend(samples);
4854
+ const out = await this._ensureADV().attackAndDefend(samples);
3792
4855
  res.json({ ok: true, result: out });
3793
4856
  } catch (err) {
3794
4857
  res.status(500).json({ ok: false, error: err.message });
3795
4858
  }
3796
4859
  });
3797
4860
  this.app.get('/api/learn/adversarial/latest', (req, res) => {
3798
- res.json({ ok: true, latest: this.adv.latest() });
4861
+ res.json({ ok: true, latest: this.adv ? this.adv.latest() : null });
3799
4862
  });
3800
4863
  this.app.post('/api/learn/thresholds', (req, res) => {
3801
4864
  const { rlEvery, advEvery } = req.body || {};
@@ -3885,6 +4948,8 @@ class GatewayServer {
3885
4948
  config: {
3886
4949
  groupCount: CONFIG.groupCount,
3887
4950
  groupSize: CONFIG.groupSize,
4951
+ sparkNumAI: CONFIG.sparkNumAI,
4952
+ sparkBudget: CONFIG.sparkBudget,
3888
4953
  groupIds: typeof this.pool.listGroupIds === 'function' ? this.pool.listGroupIds() : [],
3889
4954
  gatewayHost: CONFIG.gatewayHost,
3890
4955
  portGateway: CONFIG.portGateway,
@@ -4336,9 +5401,21 @@ class GatewayServer {
4336
5401
  if (!this.dialogLearningEnabled) {
4337
5402
  return;
4338
5403
  }
5404
+
5405
+ // 反接学习:把本轮对话沉淀为“模因层 -> 表层答案”的映射与可检索记忆
5406
+ try {
5407
+ const runtime = this.pool?.getActive?.()?.runtime;
5408
+ if (runtime && typeof runtime.learnFromDialog === 'function') {
5409
+ runtime.learnFromDialog({ payload, result });
5410
+ }
5411
+ } catch (e) {
5412
+ // 学习失败不影响主流程
5413
+ console.warn('[Learn] surface/dialog memory update failed:', e.message);
5414
+ }
5415
+
4339
5416
  if (!this.rlDisabled && (total - this.dialogCounters.lastRL >= this.dialogThresholds.rlEvery)) {
4340
5417
  this.dialogCounters.lastRL = total;
4341
- Promise.resolve().then(() => this.rl.learn(1)).catch((e) => console.warn('[Learn] RL trigger failed:', e.message));
5418
+ Promise.resolve().then(() => this._ensureRL().learn(1)).catch((e) => console.warn('[Learn] RL trigger failed:', e.message));
4342
5419
  }
4343
5420
  if (!this.advDisabled && (total - this.dialogCounters.lastADV >= this.dialogThresholds.advEvery)) {
4344
5421
  this.dialogCounters.lastADV = total;
@@ -4347,7 +5424,7 @@ class GatewayServer {
4347
5424
  if (text && text.trim()) samples.push(text.trim());
4348
5425
  if (result?.reply && typeof result.reply === 'string') samples.push(result.reply);
4349
5426
  if (samples.length) {
4350
- Promise.resolve().then(() => this.adv.attackAndDefend(samples)).catch((e) => console.warn('[Learn] ADV trigger failed:', e.message));
5427
+ Promise.resolve().then(() => this._ensureADV().attackAndDefend(samples)).catch((e) => console.warn('[Learn] ADV trigger failed:', e.message));
4351
5428
  }
4352
5429
  }
4353
5430
  } catch (e) {
@@ -4363,9 +5440,9 @@ class GatewayServer {
4363
5440
  }
4364
5441
 
4365
5442
  const bootstrap = async () => {
4366
- const kvmStore = new LmdbStore({ name: 'kvm', rootDir: CONFIG.lmdbRoot });
4367
- const memeStore = new LmdbStore({ name: 'meme_graph', rootDir: CONFIG.lmdbRoot });
4368
- const sessionStore = new LmdbStore({ name: 'session', rootDir: CONFIG.lmdbRoot });
5443
+ const kvmStore = new LmdbStore({ name: 'kvm', rootDir: CONFIG.lmdbRoot, mapSizeBytes: CONFIG.lmdbMapSizeBytes });
5444
+ const memeStore = new LmdbStore({ name: 'meme_graph', rootDir: CONFIG.lmdbRoot, mapSizeBytes: CONFIG.lmdbMapSizeBytes });
5445
+ const sessionStore = new LmdbStore({ name: 'session', rootDir: CONFIG.lmdbRoot, mapSizeBytes: CONFIG.lmdbMapSizeBytes });
4369
5446
  console.log('checkpoint1');
4370
5447
  const pool = new ControllerPool({ kvmStore, memeStore, sessionStore, config: CONFIG });
4371
5448
  const rotation = new RotationManager(pool, {});
@@ -4380,19 +5457,33 @@ const bootstrap = async () => {
4380
5457
  study.start();
4381
5458
  const snapshots = new SnapshotManager(pool.getActive().runtime, CONFIG.snapshotDir);
4382
5459
  const shards = new ShardManager(pool);
4383
- const spark = new BigSparkArray(pool, shards, { groupIds: pool.listGroupIds() });
5460
+ const spark = new BigSparkArray(pool, shards, {
5461
+ groupIds: pool.listGroupIds(),
5462
+ groupOptions: {
5463
+ numAI: CONFIG.sparkNumAI,
5464
+ budget: CONFIG.sparkBudget
5465
+ }
5466
+ });
4384
5467
  // Try auto-restore latest snapshot to skip warmup/pretraining
4385
5468
  let __restoredFromSnapshot = false;
4386
5469
  try {
4387
5470
  const list = snapshots.list().sort((a, b) => b.localeCompare(a));
4388
5471
  if (list.length > 0) {
4389
- await snapshots.restore(list[0]);
5472
+ const restoredSnapshot = await snapshots.restore(list[0]);
4390
5473
  __restoredFromSnapshot = true;
4391
5474
  console.log(`[Bootstrap] Restored latest snapshot: ${list[0]}`);
4392
- // Keep standby/validation in sync with serving
4393
- const snap = pool.getActive().snapshot();
4394
- await pool.standby.applySnapshot(snap);
4395
- await pool.validation.applySnapshot(snap);
5475
+ if (CONFIG.syncStandbyOnBoot) {
5476
+ // 注意:applySnapshot 内部包含大量同步写入,依然会阻塞;仅在你明确需要时开启。
5477
+ try {
5478
+ await pool.standby.applySnapshot(restoredSnapshot);
5479
+ await pool.validation.applySnapshot(restoredSnapshot);
5480
+ console.log('[Bootstrap] Standby/validation synced from snapshot');
5481
+ } catch (e) {
5482
+ console.warn('[Bootstrap] Standby/validation sync skipped:', e.message);
5483
+ }
5484
+ } else {
5485
+ console.log('[Bootstrap] Standby/validation sync skipped (fast-boot)');
5486
+ }
4396
5487
  }
4397
5488
  } catch (err) {
4398
5489
  console.warn('[Bootstrap] Snapshot restore skipped:', err.message);
@@ -4401,7 +5492,7 @@ const bootstrap = async () => {
4401
5492
  try {
4402
5493
  const preloadDocs = pool.getActive().runtime.collectRobotsDocuments({
4403
5494
  limit: CONFIG.robotsWarmupLimit,
4404
- shuffle: true
5495
+ shuffle: Boolean(CONFIG.robotsWarmupShuffle)
4405
5496
  });
4406
5497
  if (preloadDocs.length) {
4407
5498
  console.log(`[Bootstrap] Preloading ${preloadDocs.length} robots documents...`);
@@ -4423,64 +5514,60 @@ const bootstrap = async () => {
4423
5514
  }
4424
5515
 
4425
5516
  // 额外:将 tests 目录用例按哈希分片到不同 AI,形成差异化“训练集”
4426
- try {
4427
- const testsDir = path.join(__dirname, 'tests');
4428
- if (fs.existsSync(testsDir)) {
4429
- const files = fs.readdirSync(testsDir).filter((f) => /\.txt$/i.test(f));
4430
- if (files.length) {
4431
- const groups = pool.listGroupIds();
4432
- for (const f of files) {
4433
- const full = path.join(testsDir, f);
4434
- const text = fs.readFileSync(full, 'utf8');
4435
- const key = `tests:${f}`;
4436
- const idx = groups.length ? (hashStrSimple(key) % groups.length) : 0;
4437
- const targetGroup = groups[idx] || groups[0] || 'G1';
4438
- await pool.ingestDocumentToGroup(targetGroup, { text, source: key });
5517
+ if (CONFIG.testsAutoload) {
5518
+ try {
5519
+ const testsDir = path.join(__dirname, 'tests');
5520
+ if (fs.existsSync(testsDir)) {
5521
+ const files = fs.readdirSync(testsDir).filter((f) => /\.txt$/i.test(f));
5522
+ if (files.length) {
5523
+ console.log(`[Bootstrap] Preloading tests corpus (${files.length} files)...`);
5524
+ const groups = pool.listGroupIds();
5525
+ for (const f of files) {
5526
+ const full = path.join(testsDir, f);
5527
+ const text = fs.readFileSync(full, 'utf8');
5528
+ const key = `tests:${f}`;
5529
+ const idx = groups.length ? (hashStrSimple(key) % groups.length) : 0;
5530
+ const targetGroup = groups[idx] || groups[0] || 'G1';
5531
+ await pool.ingestDocumentToGroup(targetGroup, { text, source: key });
5532
+ }
5533
+ console.log(`[Bootstrap] Sharded tests corpus into ${groups.length} groups.`);
4439
5534
  }
4440
- console.log(`[Bootstrap] Sharded tests corpus into ${groups.length} groups.`);
4441
5535
  }
5536
+ } catch (err) {
5537
+ console.warn('[Bootstrap] Tests sharded preload skipped:', err.message);
4442
5538
  }
4443
- } catch (err) {
4444
- console.warn('[Bootstrap] Tests sharded preload skipped:', err.message);
5539
+ } else {
5540
+ console.log('[Bootstrap] Tests preload skipped (fast-boot)');
4445
5541
  }
4446
- // 初始化学习模块
4447
- const rl = new ReinforcementLearner(pool, { testsDir: path.join(__dirname, 'tests') });
4448
- const adv = new AdversarialLearner(pool, {});
4449
- const gateway = new GatewayServer(pool, shards, snapshots, rotation, redisSync, study, spark, { rl, adv });
5542
+ // 学习模块改为网关侧按需创建(降低启动时间与内存峰值)
5543
+ const gateway = new GatewayServer(pool, shards, snapshots, rotation, redisSync, study, spark, { rl: null, adv: null });
4450
5544
  gateway.listen(CONFIG.portGateway, CONFIG.gatewayHost);
4451
- // 轻量启动:尝试执行一次强化学习与对抗学习以预热
4452
- (async () => {
4453
- // RL 预热(可通过 CLI 关闭)
4454
- if (!CONFIG.disableLearning && !CONFIG.disableRL) {
4455
- try {
4456
- await rl.learn(1);
4457
- } catch (e) {
4458
- console.warn('[Bootstrap] RL warmup failed:', e.message);
4459
- // 预热失败后禁用 RL,后续触发与端点均短路
4460
- try { gateway.rlDisabled = true; } catch (_) {}
5545
+ // 可选预热:默认关闭;需要时加 --learning-warmup=true
5546
+ if (CONFIG.learningWarmup) {
5547
+ (async () => {
5548
+ if (!CONFIG.disableLearning && !CONFIG.disableRL) {
5549
+ try {
5550
+ await gateway._ensureRL().learn(1);
5551
+ } catch (e) {
5552
+ console.warn('[Bootstrap] RL warmup failed:', e.message);
5553
+ try { gateway.rlDisabled = true; } catch (_) {}
5554
+ }
4461
5555
  }
4462
- } else {
4463
- try { gateway.rlDisabled = true; } catch (_) {}
4464
- console.log('[Bootstrap] RL disabled by config');
4465
- }
4466
- // ADV 预热(可通过 CLI 关闭)
4467
- if (!CONFIG.disableLearning && !CONFIG.disableADV) {
4468
- try {
4469
- // 从 robots 目录挑选少量样本作为对抗试探
4470
- const runtime = pool.getActive().runtime;
4471
- const docs = runtime.collectRobotsDocuments({ limit: 3, shuffle: true });
4472
- const samples = docs.map(d => d.text).filter(Boolean).slice(0, 3);
4473
- if (samples.length) {
4474
- await adv.attackAndDefend(samples);
5556
+
5557
+ if (!CONFIG.disableLearning && !CONFIG.disableADV) {
5558
+ try {
5559
+ const runtime = pool.getActive().runtime;
5560
+ const docs = runtime.collectRobotsDocuments({ limit: 3, shuffle: true });
5561
+ const samples = docs.map((d) => d.text).filter(Boolean).slice(0, 3);
5562
+ if (samples.length) {
5563
+ await gateway._ensureADV().attackAndDefend(samples);
5564
+ }
5565
+ } catch (e) {
5566
+ console.warn('[Bootstrap] Adversarial warmup failed:', e.message);
4475
5567
  }
4476
- } catch (e) {
4477
- console.warn('[Bootstrap] Adversarial warmup failed:', e.message);
4478
5568
  }
4479
- } else {
4480
- try { gateway.advDisabled = true; } catch (_) {}
4481
- console.log('[Bootstrap] ADV disabled by config');
4482
- }
4483
- })();
5569
+ })();
5570
+ }
4484
5571
  process.on('SIGINT', async () => {
4485
5572
  console.log('Received SIGINT, saving snapshot...');
4486
5573
  try {
@@ -4502,5 +5589,8 @@ if (require.main === module) {
4502
5589
 
4503
5590
  module.exports = {
4504
5591
  bootstrap,
4505
- CONFIG
5592
+ CONFIG,
5593
+ MODEL_DEFAULTS: modelDefaults,
5594
+ BUILTIN_ACTIVATION_TYPES: Object.keys(BuiltinActivations),
5595
+ BUILTIN_TRANSFER_TYPES: Object.keys(BuiltinTransfers)
4506
5596
  };