079project 3.0.0 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/GroupStarter.cjs +396 -30
- package/forwarder.js +312 -55
- package/main_Serve.cjs +583 -105
- package/main_Study.cjs +581 -68
- package/notes.txt +241 -0
- package/package.json +7 -1
- package/note.txt +0 -5
- package/notebook.txt +0 -8
package/GroupStarter.cjs
CHANGED
|
@@ -99,6 +99,196 @@ const metrics = {
|
|
|
99
99
|
gpu: [], // 简化 [{ util, memUsed, memTotal }]
|
|
100
100
|
};
|
|
101
101
|
|
|
102
|
+
// 记录所有 group 的“模型分片”状态
|
|
103
|
+
// shardId 就是 groupId(0,1,2,...),后续可以映射到 notes.txt 里的 ModelShard 概念
|
|
104
|
+
|
|
105
|
+
// --- 轻量文本清洗与 hash-embedding,用 group 的语料生成一个语义中心 ---
|
|
106
|
+
/**
 * Normalises a raw text snippet before it is tokenised.
 *
 * Removes http(s) URLs, collapses every whitespace run into one space,
 * trims both ends and caps the result at 512 UTF-16 code units. If the cap
 * would cut a surrogate pair in half, the dangling high surrogate is dropped
 * so the result never ends in a lone surrogate.
 *
 * @param {*} s - Value to clean; any falsy value yields ''.
 * @returns {string} Cleaned text, at most 512 code units long.
 */
function basicClean(s) {
  if (!s) return '';
  const MAX_LEN = 512;
  let text = String(s)
    .replace(/https?:\/\/[\w\-._~:/?#[\]@!$&'()*+,;=%]+/gi, ' ')
    .replace(/\s+/g, ' ')
    .trim();
  if (text.length > MAX_LEN) {
    text = text.slice(0, MAX_LEN);
    // A high surrogate at the cut means its low half was sliced away.
    const last = text.charCodeAt(MAX_LEN - 1);
    if (last >= 0xd800 && last <= 0xdbff) text = text.slice(0, MAX_LEN - 1);
  }
  return text;
}
|
|
115
|
+
|
|
116
|
+
/**
 * Small non-cryptographic string hash.
 *
 * Starting from the seed, each UTF-16 code unit is folded into the state
 * with a shift-add-xor step.
 *
 * @param {string} str - Text to hash.
 * @param {number} [seed=1315423911] - Initial state; coerced to uint32.
 * @returns {number} Unsigned 32-bit hash value.
 */
function hashStrSimple(str, seed = 1315423911) {
  let state = seed >>> 0;
  let pos = 0;
  while (pos < str.length) {
    const mixed = (state << 5) + str.charCodeAt(pos) + (state >>> 2);
    state ^= mixed >>> 0;
    pos += 1;
  }
  return state >>> 0;
}
|
|
123
|
+
|
|
124
|
+
/**
 * Turns a piece of text into a small fixed-size vector.
 *
 * The text is cleaned with basicClean, lower-cased and split into tokens
 * made of a-z, 0-9, '_', '-' and CJK characters in U+4E00..U+9FA5. Each
 * token increments the bucket `hashStrSimple(token) % dim`; the bucket
 * counts are then scaled to unit Euclidean length in float32 precision.
 *
 * @param {*} text - Input text (anything basicClean accepts).
 * @param {number} [dim=64] - Number of buckets / vector length.
 * @returns {number[]} Plain array of `dim` numbers; all zeros when the
 *   text has no tokens.
 */
function textToMiniEmbedding(text, dim = 64) {
  const buckets = new Float32Array(dim);
  const tokens = basicClean(text)
    .toLowerCase()
    .split(/[^a-z0-9_\-\u4e00-\u9fa5]+/)
    .filter(Boolean);
  if (tokens.length === 0) return Array.from(buckets);

  tokens.forEach((token) => {
    buckets[hashStrSimple(token) % dim] += 1;
  });

  let sumSquares = 0;
  buckets.forEach((count) => {
    sumSquares += count * count;
  });
  // Fall back to 1 so an all-zero vector is not divided by zero.
  const norm = Math.sqrt(sumSquares) || 1;

  // Float32Array#map stores each quotient as float32, like in-place division.
  return Array.from(buckets.map((count) => count / norm));
}
|
|
139
|
+
|
|
140
|
+
/**
 * Component-wise mean of a list of vectors, computed in float32.
 *
 * Entries that are not arrays contribute nothing to the sums but still
 * count towards the divisor (`vectors.length`). Vectors longer than `dim`
 * are truncated; shorter ones are treated as zero-padded.
 *
 * @param {Array<number[]>} vectors - Vectors to average.
 * @param {number} [dim=64] - Length of the result.
 * @returns {number[]} Plain array of `dim` numbers; all zeros when
 *   `vectors` is missing or empty.
 */
function averageEmbedding(vectors, dim = 64) {
  const sums = new Float32Array(dim);
  if (!vectors || !vectors.length) return Array.from(sums);

  for (const candidate of vectors) {
    if (!Array.isArray(candidate)) continue;
    const limit = Math.min(dim, candidate.length);
    for (let k = 0; k < limit; k += 1) sums[k] += candidate[k];
  }

  // Float32Array#map keeps the float32 rounding of an in-place division.
  return Array.from(sums.map((total) => total / vectors.length));
}
|
|
150
|
+
|
|
151
|
+
// 每组用 robots/*.txt 采样若干行,构成其“语料摘要+embedding”
|
|
152
|
+
/**
 * Samples lines from `<groupDir>/robots/*.txt` and summarises them as one
 * centroid vector.
 *
 * Up to `maxFiles` .txt files (in sorted name order, so the sample is the
 * same on every run) are read; non-empty trimmed lines of at least four
 * characters are collected until `maxLines` is reached. Each line is embedded
 * with textToMiniEmbedding and the results are averaged with averageEmbedding.
 *
 * A missing or unreadable `robots` directory yields an all-zero centroid and
 * no lines. A file that cannot be read is skipped with a warning instead of
 * being ignored silently.
 *
 * @param {string} groupDir - Directory of the group.
 * @param {{maxFiles?: number, maxLines?: number, dim?: number}} [options]
 * @returns {{dim: number, centroid: number[], lines: string[]}}
 */
function sampleGroupCorpusEmbedding(groupDir, { maxFiles = 16, maxLines = 200, dim = 64 } = {}) {
  const robotsDir = path.join(groupDir, 'robots');
  const emptyResult = () => ({ dim, centroid: Array.from(new Float32Array(dim)), lines: [] });
  if (!fs.existsSync(robotsDir)) return emptyResult();

  let files;
  try {
    files = fs.readdirSync(robotsDir)
      .filter((f) => f.toLowerCase().endsWith('.txt'))
      .sort() // readdir order is filesystem-dependent; sort for a stable sample
      .slice(0, maxFiles);
  } catch (err) {
    // e.g. `robots` exists but is a regular file, or is not readable.
    console.warn(`[SHARD] 无法读取语料目录 ${robotsDir}:`, err.message);
    return emptyResult();
  }

  const lines = [];
  for (const file of files) {
    if (lines.length >= maxLines) break;
    const full = path.join(robotsDir, file);
    let content;
    try {
      content = fs.readFileSync(full, 'utf-8');
    } catch (err) {
      console.warn(`[SHARD] 读取语料文件失败 ${full}:`, err.message);
      continue;
    }
    for (const raw of content.split(/\r?\n/)) {
      if (lines.length >= maxLines) break;
      const ln = raw.trim();
      // Lines shorter than four characters are skipped.
      if (ln.length < 4) continue;
      lines.push(ln);
    }
  }

  const vecs = lines.map((ln) => textToMiniEmbedding(ln, dim));
  const centroid = averageEmbedding(vecs, dim);
  return { dim, centroid, lines };
}
|
|
175
|
+
// 记录每个 group 的“模型分片”状态
|
|
176
|
+
// shardId 就是 groupId(0,1,2,...),后续可以映射到 notes.txt 里的 ModelShard 概念
|
|
177
|
+
// Registry of per-group shard metadata, keyed by shardId (the numeric group id).
// Besides the fields listed on the declaration below, startAllGroups later
// attaches `embedding` and `sampleLines` to each entry.
const shardMeta = new Map(); // shardId -> { id, groupId, type, tags, state, createdAt, readyAt, warmScore, memHistory, centroid }
|
|
178
|
+
|
|
179
|
+
// 初始化时创建空 meta,后续由监控填充
|
|
180
|
+
/**
 * Returns the shard metadata record for a group, creating it on first use.
 *
 * An existing record is returned untouched, so `type` only takes effect the
 * first time a given group id is seen.
 *
 * @param {number} groupId - Group id, also used as the shardMeta key.
 * @param {string} [type='text'] - Shard type stored on a new record.
 * @returns {object} The record stored in shardMeta for this group.
 */
function initShardMetaForGroup(groupId, type = 'text') {
  if (shardMeta.has(groupId)) return shardMeta.get(groupId);

  const fresh = {
    id: `group_${groupId}`,
    groupId,
    type, // 'text' | 'image' | 'audio', ...
    tags: [], // domain tags, empty at creation
    state: 'booting', // booting -> training -> sealed -> warm/cold
    createdAt: Date.now(),
    readyAt: null, // set by the resource monitor when the shard is sealed
    warmScore: 0,
    memHistory: [], // recent RSS samples in MB
    centroid: null, // semantic centre vector, not filled in here
  };
  shardMeta.set(groupId, fresh);
  return fresh;
}
|
|
197
|
+
/**
 * Starts the once-per-second resource monitor.
 *
 * Allocates the emergency buffer once, then on every tick:
 *  1. records CPU and memory usage through pushMetric;
 *  2. sums the resident set size (MB) of each registered group's serve,
 *     study and forwarder processes and appends it to that group's
 *     `memHistory` (capped at 32 samples);
 *  3. while a shard is 'booting' or 'training', marks it 'sealed' when the
 *     latest sample is at least 30 % and at least 200 MB below the maximum
 *     of the last 10 samples; otherwise a 'booting' shard moves to 'training';
 *  4. decays `warmScore` of 'sealed'/'warm' shards by 1 % per tick;
 *  5. refreshes disk and GPU metrics when `metrics.ts.length % 30 === 0`;
 *  6. releases the emergency buffer when memory use reaches 92 % and
 *     re-allocates it once use is back at or below 80 %.
 *
 * RSS is read from the `VmRSS` line of `/proc/<pid>/status`, which is
 * reported in kB and therefore does not depend on the kernel page size.
 * On Windows, or wherever /proc is unavailable, no per-group memory is
 * recorded, so shards are never auto-sealed there.
 *
 * A failure inside a tick is logged and the monitor keeps running.
 */
function startResourceMonitor() {
  const TICK_MS = 1000;
  const MEM_HISTORY_LIMIT = 32;
  const SEAL_WINDOW = 10; // samples inspected for a memory drop
  const SEAL_DROP_RATIO = 0.3; // relative drop that counts as "sealed"
  const SEAL_DROP_MB = 200; // absolute drop that counts as "sealed"
  const WARM_DECAY = 0.99;

  // Resident set size of one process in MB, or null when it cannot be read.
  const readPidRssMb = (pid) => {
    if (process.platform === 'win32') return null; // no /proc on Windows
    try {
      const status = fs.readFileSync(`/proc/${pid}/status`, 'utf-8');
      const found = /^VmRSS:\s*(\d+)\s*kB/m.exec(status);
      return found ? Number(found[1]) / 1024 : null;
    } catch (_) {
      // Process already exited or /proc is not available: skip it.
      return null;
    }
  };

  // groupId -> { rss: MB, procs: count } for groups with a readable process.
  const sampleGroupMemory = () => {
    const perGroupMem = new Map();
    for (const reg of groupRegistry) {
      const pids = [
        ...(reg.servePorts || []).map((s) => s.pid),
        reg.study?.pid,
        reg.forwarder?.pid,
      ].filter(Boolean);

      let rss = 0;
      let procs = 0;
      for (const pid of pids) {
        const mb = readPidRssMb(pid);
        if (mb === null) continue;
        rss += mb;
        procs += 1;
      }
      if (procs > 0) perGroupMem.set(reg.id, { rss, procs });
    }
    return perGroupMem;
  };

  // Appends one RSS sample to a shard and advances its state machine.
  const updateShardFromSample = (gid, rssMb) => {
    const meta = initShardMetaForGroup(gid, 'text');
    meta.memHistory.push(rssMb);
    if (meta.memHistory.length > MEM_HISTORY_LIMIT) meta.memHistory.shift();

    const building = meta.state === 'booting' || meta.state === 'training';
    if (building && meta.memHistory.length >= SEAL_WINDOW) {
      const recent = meta.memHistory.slice(-SEAL_WINDOW);
      const maxRecent = Math.max(...recent);
      const nowMem = recent[recent.length - 1];
      const drop = maxRecent - nowMem;
      const dropped = maxRecent > 0 && drop / maxRecent >= SEAL_DROP_RATIO && drop >= SEAL_DROP_MB;

      if (!dropped) {
        // No sharp drop yet: the group is still training/building.
        if (meta.state === 'booting') meta.state = 'training';
      } else if (!meta.readyAt) {
        meta.readyAt = Date.now();
        meta.state = 'sealed';
        console.log(`[SHARD] group_${gid} 训练完成并封存:内存峰值约 ${maxRecent.toFixed(1)}MB -> ${nowMem.toFixed(1)}MB`);
      }
    }

    // warmScore currently only decays over time.
    if (meta.state === 'sealed' || meta.state === 'warm') {
      meta.warmScore *= WARM_DECAY;
    }
  };

  tryAllocEmergencyBuffer();
  setInterval(async () => {
    try {
      pushMetric('cpu', sampleCpu());
      const { usedRatio } = sampleMem();
      pushMetric('mem', usedRatio);

      for (const [gid, info] of sampleGroupMemory()) {
        updateShardFromSample(gid, info.rss);
      }

      // NOTE(review): this fires only while metrics.ts.length is a multiple
      // of 30; if pushMetric caps that array the cadence changes — confirm.
      if (metrics.ts.length % 30 === 0) {
        // Best effort: the disk/GPU probes may be unavailable on this host.
        try {
          metrics.disk = await sampleDiskOnce();
        } catch (_) {}
        try {
          metrics.gpu = await sampleGpuOnce();
        } catch (_) {}
      }

      // Adaptive: enter emergency mode under memory pressure and give the
      // reserve back; re-reserve once pressure has eased.
      if (usedRatio >= 0.92 && !swapActive) {
        swapActive = true;
        releaseEmergencyBuffer();
      } else if (swapActive && usedRatio <= 0.8) {
        swapActive = false;
        tryAllocEmergencyBuffer();
      }
    } catch (err) {
      // An async interval callback has nobody awaiting it; without this an
      // exception would surface as an unhandled rejection.
      console.warn('[MONITOR] 资源采样失败:', err?.message ?? err);
    }
  }, TICK_MS);
}
|
|
102
292
|
let lastCpuSample = os.cpus();
|
|
103
293
|
let emergencyBuffer = null; // 应急释放缓冲,避免立刻 OOM
|
|
104
294
|
let swapActive = false; // 是否处于“内存应急”状态
|
|
@@ -206,40 +396,81 @@ function releaseEmergencyBuffer() {
|
|
|
206
396
|
}
|
|
207
397
|
}
|
|
208
398
|
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
const
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
}
|
|
221
|
-
|
|
222
|
-
metrics.gpu = await sampleGpuOnce();
|
|
223
|
-
} catch (_) {}
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
/**
 * Polls shardMeta until the given group's shard reaches the 'sealed' state.
 *
 * The state is checked first, then the timeout, then the function sleeps for
 * `pollMs` and repeats — so an already-sealed group resolves immediately.
 *
 * @param {number} groupId - Key into shardMeta.
 * @param {{timeoutMs?: number, pollMs?: number}} [options] - Maximum wait
 *   (default one hour) and polling interval (default five seconds).
 * @returns {Promise<boolean>} true once sealed, false on timeout.
 */
async function waitGroupSealed(groupId, { timeoutMs = 60 * 60 * 1000, pollMs = 5000 } = {}) {
  const startedAt = Date.now();
  const sleep = (ms) => new Promise((resolve) => {
    setTimeout(resolve, ms);
  });

  while (true) {
    if (shardMeta.get(groupId)?.state === 'sealed') {
      console.log(`[TRAIN] group_${groupId} 已检测到 sealed 状态`);
      return true;
    }

    const elapsed = Date.now() - startedAt;
    if (elapsed > timeoutMs) {
      console.warn(`[TRAIN] group_${groupId} 等待 sealed 超时 (${timeoutMs}ms)`);
      return false;
    }

    await sleep(pollMs);
  }
}
|
|
416
|
+
// 封存后杀死该组所有子进程
|
|
417
|
+
/**
 * Sends the default termination signal to every child process of a group.
 *
 * Collects the pids of the group's serve instances, study process and
 * forwarder (entries without a pid are skipped) and calls process.kill on
 * each. A process that has already exited (ESRCH) is ignored; any other
 * failure is logged instead of being swallowed. The signal is only sent,
 * not awaited, and the final log line is printed regardless of failures.
 *
 * NOTE: this file declares killGroupProcesses twice; in CommonJS the later
 * declaration replaces this one at runtime.
 *
 * @param {{id: number, servePorts?: Array<{pid: ?number}>, study?: ?{pid: ?number}, forwarder?: ?{pid: ?number}}} reg
 *   Registry entry of the group.
 * @returns {void}
 */
function killGroupProcesses(reg) {
  const pids = [
    ...(reg.servePorts || []).map((sp) => sp.pid),
    reg.study?.pid,
    reg.forwarder?.pid,
  ].filter(Boolean);

  for (const pid of pids) {
    try {
      process.kill(pid);
    } catch (err) {
      // ESRCH: the process is already gone, which is the desired outcome.
      if (err?.code !== 'ESRCH') {
        console.warn(`[TRAIN] group_${reg.id} 终止进程 ${pid} 失败:`, err?.message ?? err);
      }
    }
  }
  console.log(`[TRAIN] group_${reg.id} 所有子进程已终止(进入冷态)`);
}
|
|
225
427
|
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
428
|
+
// 调用该 group forwarder 的 /api/snapshot/create ,再 /api/snapshot/list 做一次硬盘快照
|
|
429
|
+
/**
 * Asks a group's first serve instance to write a snapshot.
 *
 * Looks the group up in groupRegistry by index and POSTs
 * `{ name: 'auto-train-complete' }` to `/api/snapshot/create` on its first
 * serve port, with a 10-minute timeout. A failed request is reported as a
 * failure rather than being treated as sent.
 *
 * @param {number} groupId - Index of the group in groupRegistry.
 * @param {number} forwarderPort - Currently unused: the request goes to a
 *   serve instance, not to the group forwarder. Kept for interface
 *   compatibility.
 * @returns {Promise<boolean>} true when the request completed, false when
 *   the group has no serve port or the request failed.
 */
async function snapshotGroup(groupId, forwarderPort) {
  try {
    console.log(`[TRAIN] group_${groupId} 尝试触发快照(通过任一 serve 实例)`);

    const reg = groupRegistry[groupId];
    const firstServe = reg?.servePorts?.[0]?.port;
    if (!firstServe) {
      console.warn(`[TRAIN] group_${groupId} 无 serve 端口,跳过快照`);
      return false;
    }

    const serveApiBase = `http://localhost:${firstServe}`;
    // No .catch here: a rejected request must reach the handler below.
    await axios.post(`${serveApiBase}/api/snapshot/create`, { name: 'auto-train-complete' }, { timeout: 600_000 });
    console.log(`[TRAIN] group_${groupId} 快照请求已发送到 serve@${firstServe}`);
    return true;
  } catch (e) {
    console.warn(`[TRAIN] group_${groupId} 快照失败:`, e.message);
    return false;
  }
}
|
|
236
453
|
|
|
454
|
+
// 封存后杀死该组所有子进程
|
|
455
|
+
/**
 * Sends the default termination signal to every child process of a group.
 *
 * Collects the pids of the group's serve instances, study process and
 * forwarder (entries without a pid are skipped) and calls process.kill on
 * each. A process that has already exited (ESRCH) is ignored; any other
 * failure is logged instead of being swallowed. The signal is only sent,
 * not awaited, and the final log line is printed regardless of failures.
 *
 * NOTE: a same-named declaration appears earlier in this file; in CommonJS
 * this later declaration is the one bound at runtime.
 *
 * @param {{id: number, servePorts?: Array<{pid: ?number}>, study?: ?{pid: ?number}, forwarder?: ?{pid: ?number}}} reg
 *   Registry entry of the group.
 * @returns {void}
 */
function killGroupProcesses(reg) {
  const pids = [
    ...(reg.servePorts || []).map((sp) => sp.pid),
    reg.study?.pid,
    reg.forwarder?.pid,
  ].filter(Boolean);

  for (const pid of pids) {
    try {
      process.kill(pid);
    } catch (err) {
      // ESRCH: the process is already gone, which is the desired outcome.
      if (err?.code !== 'ESRCH') {
        console.warn(`[TRAIN] group_${reg.id} 终止进程 ${pid} 失败:`, err?.message ?? err);
      }
    }
  }
  console.log(`[TRAIN] group_${reg.id} 所有子进程已终止(进入冷态)`);
}
|
|
237
465
|
|
|
238
466
|
async function startAllGroups() {
|
|
239
467
|
startLogger();
|
|
240
468
|
for (let i = 0; i < groupFolders.length; i++) {
|
|
241
469
|
const groupDir = groupFolders[i];
|
|
242
470
|
const reg = { id: i, dir: groupDir, servePorts: [], study: null, forwarder: null };
|
|
471
|
+
// 初始化 shard 元信息(类型按 GROUPS_DIR 推断)
|
|
472
|
+
const type = GROUPS_DIR.includes('image') ? 'image' : GROUPS_DIR.includes('audio') ? 'audio' : 'text';
|
|
473
|
+
initShardMetaForGroup(i, type);
|
|
243
474
|
|
|
244
475
|
// 预计算本组端口
|
|
245
476
|
const groupServePorts = Array.from({ length: SERVE_COUNT }, (_, j) => PORT_BASE + i * (SERVE_COUNT + 2) + j);
|
|
@@ -268,7 +499,6 @@ async function startAllGroups() {
|
|
|
268
499
|
cwd: groupDir,
|
|
269
500
|
stdio: 'inherit',
|
|
270
501
|
});
|
|
271
|
-
// 注册到日志子进程
|
|
272
502
|
portIndex.set(port, { groupId: i, role: 'serve' });
|
|
273
503
|
loggerSend({ type: 'register', processes: [{ groupId: i, role: 'serve', port, pid: null }] });
|
|
274
504
|
groupProcesses.push(proc);
|
|
@@ -289,7 +519,7 @@ async function startAllGroups() {
|
|
|
289
519
|
}
|
|
290
520
|
}
|
|
291
521
|
|
|
292
|
-
//
|
|
522
|
+
// Study
|
|
293
523
|
if (fs.existsSync(path.join(groupDir, 'main_Study.cjs'))) {
|
|
294
524
|
const proc = spawn(
|
|
295
525
|
'node',
|
|
@@ -305,8 +535,8 @@ async function startAllGroups() {
|
|
|
305
535
|
loggerSend({ type: 'update', port: studyPort, pid: proc.pid, role: 'study', groupId: i });
|
|
306
536
|
});
|
|
307
537
|
}
|
|
308
|
-
|
|
309
|
-
//
|
|
538
|
+
|
|
539
|
+
// Forwarder
|
|
310
540
|
groupForwarderPorts.push(forwarderPort);
|
|
311
541
|
const forwarderScript = path.join(groupDir, 'forwarder.js');
|
|
312
542
|
if (fs.existsSync(forwarderScript)) {
|
|
@@ -331,14 +561,132 @@ async function startAllGroups() {
|
|
|
331
561
|
});
|
|
332
562
|
}
|
|
333
563
|
|
|
334
|
-
// 写入组配置文件
|
|
335
564
|
const config = { servePorts, studyPort, forwarderPort };
|
|
336
565
|
fs.writeFileSync(path.join(groupDir, 'group_ports.json'), JSON.stringify(config, null, 2));
|
|
337
566
|
console.log(`[START] group_${i} 端口分配: serve=${servePorts.join(',')}, study=${studyPort}, forwarder=${forwarderPort}`);
|
|
338
567
|
groupRegistry.push(reg);
|
|
568
|
+
|
|
569
|
+
// === 关键:串行训练 + 封存 + 冷杀 ===
|
|
570
|
+
console.log(`[TRAIN] 等待 group_${i} 完成首次训练/封存...`);
|
|
571
|
+
await waitGroupSealed(i, { timeoutMs: 60 * 60 * 1000, pollMs: 5000 });
|
|
572
|
+
|
|
573
|
+
// 生成语义 embedding + 标签,并写入 shard_meta.json
|
|
574
|
+
const meta = shardMeta.get(i) || initShardMetaForGroup(i, type);
|
|
575
|
+
const { dim, centroid, lines } = sampleGroupCorpusEmbedding(groupDir, { maxFiles: 16, maxLines: 200, dim: 64 });
|
|
576
|
+
meta.type = type;
|
|
577
|
+
meta.tags = meta.tags && meta.tags.length ? meta.tags : [type];
|
|
578
|
+
meta.embedding = centroid;
|
|
579
|
+
meta.sampleLines = lines.slice(0, 16);
|
|
580
|
+
try {
|
|
581
|
+
fs.writeFileSync(
|
|
582
|
+
path.join(groupDir, 'shard_meta.json'),
|
|
583
|
+
JSON.stringify({
|
|
584
|
+
id: meta.id,
|
|
585
|
+
groupId: meta.groupId,
|
|
586
|
+
type: meta.type,
|
|
587
|
+
tags: meta.tags,
|
|
588
|
+
dim,
|
|
589
|
+
centroid,
|
|
590
|
+
createdAt: meta.createdAt,
|
|
591
|
+
readyAt: meta.readyAt
|
|
592
|
+
}, null, 2),
|
|
593
|
+
'utf-8'
|
|
594
|
+
);
|
|
595
|
+
console.log(`[SHARD] 已写入 ${groupDir}/shard_meta.json`);
|
|
596
|
+
} catch (e) {
|
|
597
|
+
console.warn(`[SHARD] 写入 shard_meta.json 失败:`, e.message);
|
|
598
|
+
}
|
|
599
|
+
|
|
600
|
+
// 触发一次快照(使训练结果落盘)
|
|
601
|
+
await snapshotGroup(i, forwarderPort);
|
|
602
|
+
|
|
603
|
+
// 冷杀该组所有进程,避免同时训练引发 OOM
|
|
604
|
+
killGroupProcesses(reg);
|
|
605
|
+
|
|
606
|
+
// 小间隔再启动下一组,避免端口释放延迟
|
|
339
607
|
await new Promise((r) => setTimeout(r, GROUP_START_DELAY));
|
|
340
608
|
}
|
|
341
|
-
console.log(`[INFO]
|
|
609
|
+
console.log(`[INFO] 所有 group 已串行训练并封存,接下来将统一冷启动`);
|
|
610
|
+
|
|
611
|
+
// === 统一冷启动:把所有 group 再启动一次,此时不再等待 sealed,直接作为推理实例运行 ===
|
|
612
|
+
for (let i = 0; i < groupFolders.length; i++) {
|
|
613
|
+
const groupDir = groupFolders[i];
|
|
614
|
+
const reg = groupRegistry[i];
|
|
615
|
+
const portsConf = JSON.parse(fs.readFileSync(path.join(groupDir, 'group_ports.json'), 'utf-8'));
|
|
616
|
+
const servePorts = portsConf.servePorts || [];
|
|
617
|
+
const studyPort = portsConf.studyPort;
|
|
618
|
+
const forwarderPort = portsConf.forwarderPort;
|
|
619
|
+
|
|
620
|
+
// Serve
|
|
621
|
+
reg.servePorts = [];
|
|
622
|
+
for (const port of servePorts) {
|
|
623
|
+
const serveScript = path.join(groupDir, 'main_Serve.cjs');
|
|
624
|
+
if (!fs.existsSync(serveScript)) continue;
|
|
625
|
+
const peers = servePorts.filter(p => p !== port).join(',');
|
|
626
|
+
const proc = spawn('node', [
|
|
627
|
+
serveScript,
|
|
628
|
+
String(port),
|
|
629
|
+
'--expose-gc',
|
|
630
|
+
'--group-id', String(i),
|
|
631
|
+
'--forwarder-port', String(forwarderPort),
|
|
632
|
+
'--study-port', String(studyPort),
|
|
633
|
+
'--peers', peers
|
|
634
|
+
], { cwd: groupDir, stdio: 'inherit' });
|
|
635
|
+
portIndex.set(port, { groupId: i, role: 'serve' });
|
|
636
|
+
loggerSend({ type: 'register', processes: [{ groupId: i, role: 'serve', port, pid: null }] });
|
|
637
|
+
groupProcesses.push(proc);
|
|
638
|
+
reg.servePorts.push({ pid: null, port });
|
|
639
|
+
proc.on('spawn', () => {
|
|
640
|
+
const idx = reg.servePorts.findIndex((p) => p.port === port);
|
|
641
|
+
if (idx >= 0) reg.servePorts[idx].pid = proc.pid;
|
|
642
|
+
loggerSend({ type: 'update', port, pid: proc.pid, role: 'serve', groupId: i });
|
|
643
|
+
});
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
// Study
|
|
647
|
+
if (fs.existsSync(path.join(groupDir, 'main_Study.cjs'))) {
|
|
648
|
+
const proc = spawn(
|
|
649
|
+
'node',
|
|
650
|
+
[path.join(groupDir, 'main_Study.cjs'), String(studyPort), '--max-old-space-size=16384', '--expose-gc'],
|
|
651
|
+
{ cwd: groupDir, stdio: 'inherit' }
|
|
652
|
+
);
|
|
653
|
+
portIndex.set(studyPort, { groupId: i, role: 'study' });
|
|
654
|
+
loggerSend({ type: 'register', processes: [{ groupId: i, role: 'study', port: studyPort, pid: null }] });
|
|
655
|
+
groupProcesses.push(proc);
|
|
656
|
+
reg.study = { pid: null, port: studyPort };
|
|
657
|
+
proc.on('spawn', () => {
|
|
658
|
+
reg.study.pid = proc.pid;
|
|
659
|
+
loggerSend({ type: 'update', port: studyPort, pid: proc.pid, role: 'study', groupId: i });
|
|
660
|
+
});
|
|
661
|
+
}
|
|
662
|
+
// Forwarder
|
|
663
|
+
if (fs.existsSync(path.join(groupDir, 'forwarder.js'))) {
|
|
664
|
+
const proc = spawn(
|
|
665
|
+
'node',
|
|
666
|
+
[
|
|
667
|
+
path.join(groupDir, 'forwarder.js'),
|
|
668
|
+
String(forwarderPort),
|
|
669
|
+
...servePorts.map(p => String(p)),
|
|
670
|
+
String(studyPort),
|
|
671
|
+
'--expose-gc'
|
|
672
|
+
],
|
|
673
|
+
{ cwd: groupDir, stdio: 'inherit' }
|
|
674
|
+
);
|
|
675
|
+
portIndex.set(forwarderPort, { groupId: i, role: 'forwarder' });
|
|
676
|
+
loggerSend({ type: 'register', processes: [{ groupId: i, role: 'forwarder', port: forwarderPort, pid: null }] });
|
|
677
|
+
groupProcesses.push(proc);
|
|
678
|
+
reg.forwarder = { pid: null, port: forwarderPort };
|
|
679
|
+
proc.on('spawn', () => {
|
|
680
|
+
reg.forwarder.pid = proc.pid;
|
|
681
|
+
loggerSend({ type: 'update', port: forwarderPort, pid: proc.pid, role: 'forwarder', groupId: i });
|
|
682
|
+
});
|
|
683
|
+
}
|
|
684
|
+
|
|
685
|
+
console.log(`[COLD-START] group_${i} 已从封存状态冷启动: serve=${servePorts.join(',')}, study=${studyPort}, forwarder=${forwarderPort}`);
|
|
686
|
+
await new Promise((r) => setTimeout(r, GROUP_START_DELAY));
|
|
687
|
+
}
|
|
688
|
+
|
|
689
|
+
console.log(`[INFO] 共启动 ${groupFolders.length} 个 group(冷启动完成)`);
|
|
342
690
|
}
|
|
343
691
|
|
|
344
692
|
function startMasterForwarder() {
|
|
@@ -689,7 +1037,24 @@ function startMasterForwarder() {
|
|
|
689
1037
|
app.get('/monitor', (req, res) => {
|
|
690
1038
|
res.sendFile(path.join(__dirname, 'public', 'monitor.html'));
|
|
691
1039
|
});
|
|
692
|
-
|
|
1040
|
+
// GET /api/shards — lists every shard's metadata. Each entry carries the
// last 8 memory samples and the embedding, or null if none has been set.
app.get('/api/shards', (req, res) => {
  const shards = Array.from(shardMeta.entries(), ([gid, meta]) => ({
    id: meta.id,
    groupId: gid,
    type: meta.type,
    tags: meta.tags,
    state: meta.state,
    createdAt: meta.createdAt,
    readyAt: meta.readyAt,
    warmScore: meta.warmScore,
    memHistory: meta.memHistory.slice(-8),
    embedding: meta.embedding || null,
  }));
  res.json({ ok: true, shards });
});
|
|
693
1058
|
// 启动服务
|
|
694
1059
|
app.listen(MASTER_PORT, () => {
|
|
695
1060
|
console.log(`[MASTER] GroupStarter master forwarder已启动,端口 ${MASTER_PORT}`);
|
|
@@ -699,6 +1064,7 @@ function startMasterForwarder() {
|
|
|
699
1064
|
);
|
|
700
1065
|
console.log(`[MASTER] 对外API: POST /api/chat(前端不会调用,可留作他用)`);
|
|
701
1066
|
});
|
|
1067
|
+
|
|
702
1068
|
}
|
|
703
1069
|
// 僵尸端口检测 + 自动重启
|
|
704
1070
|
async function probePort(port) {
|