@synapseia-network/node 0.8.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +105 -0
- package/README.md +232 -0
- package/dist/bid-responder-Q725ZIUC.js +86 -0
- package/dist/bootstrap.js +22 -0
- package/dist/chain-info-lightweight-2UWAQZBF.js +303 -0
- package/dist/chat-stream-handler-BSHSGMFF.js +127 -0
- package/dist/chunk-2X7MSWD4.js +270 -0
- package/dist/chunk-3BHRQWSM.js +531 -0
- package/dist/chunk-5QFTU52A.js +442 -0
- package/dist/chunk-5ZAJBIAV.js +25 -0
- package/dist/chunk-7FLDR5NT.js +186 -0
- package/dist/chunk-C5XRYLYP.js +137 -0
- package/dist/chunk-D7ADMHK2.js +36 -0
- package/dist/chunk-DXUYWRO7.js +23 -0
- package/dist/chunk-F5UDK56Z.js +289 -0
- package/dist/chunk-NEHR6XY7.js +111 -0
- package/dist/chunk-NMJVODKH.js +453 -0
- package/dist/chunk-PRVT22SM.js +324 -0
- package/dist/chunk-T2ZRG5CX.js +1380 -0
- package/dist/chunk-V2L5SXTL.js +88 -0
- package/dist/chunk-XL2NJWFY.js +702 -0
- package/dist/embedding-C6GE3WVM.js +16 -0
- package/dist/hardware-ITQQJ5YI.js +37 -0
- package/dist/index.js +16836 -0
- package/dist/inference-server-CIGRJ36H.js +25 -0
- package/dist/local-cors-J6RWNMMD.js +44 -0
- package/dist/model-catalog-C53SDFMG.js +15 -0
- package/dist/model-discovery-LA6YMT3I.js +10 -0
- package/dist/ollama-XVXA3A37.js +9 -0
- package/dist/rewards-vault-cli-HW7H4EMD.js +147 -0
- package/dist/scripts/create_nodes.sh +6 -0
- package/dist/scripts/diloco_train.py +319 -0
- package/dist/scripts/train_lora.py +237 -0
- package/dist/scripts/train_micro.py +586 -0
- package/dist/trainer-HQMV2ZAR.js +21 -0
- package/package.json +128 -0
- package/scripts/create_nodes.sh +6 -0
- package/scripts/diloco_train.py +319 -0
- package/scripts/train_lora.py +237 -0
- package/scripts/train_micro.py +586 -0
|
@@ -0,0 +1,442 @@
|
|
|
1
|
+
import { fileURLToPath as __synFup } from "url";import { dirname as __synDn } from "path";const __filename = __synFup(import.meta.url);const __dirname = __synDn(__filename);
|
|
2
|
+
import {
|
|
3
|
+
init_logger,
|
|
4
|
+
logger_default
|
|
5
|
+
} from "./chunk-V2L5SXTL.js";
|
|
6
|
+
import {
|
|
7
|
+
__name
|
|
8
|
+
} from "./chunk-D7ADMHK2.js";
|
|
9
|
+
|
|
10
|
+
// src/modules/model/trainer.ts
|
|
11
|
+
init_logger();
|
|
12
|
+
import { Injectable } from "@nestjs/common";
|
|
13
|
+
import { spawn } from "child_process";
|
|
14
|
+
import { existsSync } from "fs";
|
|
15
|
+
import { resolve } from "path";
|
|
16
|
+
import { totalmem } from "os";
|
|
17
|
+
// TypeScript decorator runtime helper (emitted by the bundler; tslib-style
// `__decorate`). With fewer than 3 args it decorates a class constructor;
// with 3-4 args it decorates a member via its property descriptor. Prefers
// the native `Reflect.decorate` when present, otherwise applies `decorators`
// right-to-left, letting each decorator replace the result. For member
// decoration (c > 3) the final descriptor is re-installed on the target.
function _ts_decorate(decorators, target, key, desc) {
  var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
  if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
  else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
  return c > 3 && r && Object.defineProperty(target, key, r), r;
}
__name(_ts_decorate, "_ts_decorate");
|
|
24
|
+
// Returns this module's directory (the bundler-injected __dirname shim).
// Kept as a function so call sites don't reference the shim binding directly.
function moduleDir() {
  const dir = __dirname;
  return dir;
}
__name(moduleDir, "moduleDir");
|
|
28
|
+
// Fixed RSS cost (MB) assumed for `python3` + torch import before the model
// itself allocates anything; used as the base term in checkMemoryHeadroom.
var TRAINING_MEM_FLOOR_MB = 900;
// Sentinel valLoss value the Python trainer emits when validation could not
// run (see the valLossEvalFailed handling in trainMicroModelInner).
var TRAINER_EVAL_FAILED_SENTINEL = 1e30;
|
|
30
|
+
// Rough free-RAM estimate for the training pre-check: total system memory
// minus this process's resident set, floored and clamped to >= 0 MB.
function readTrainingHeadroomMB() {
  const megabyte = 1024 * 1024;
  const totalMB = totalmem() / megabyte;
  const residentMB = process.memoryUsage().rss / megabyte;
  const headroom = Math.floor(totalMB - residentMB);
  return headroom > 0 ? headroom : 0;
}
__name(readTrainingHeadroomMB, "readTrainingHeadroomMB");
|
|
36
|
+
var TrainerHelper = class _TrainerHelper {
  static {
    __name(this, "TrainerHelper");
  }
  // Sticky "torch is importable" flag. Only success is cached: a failed probe
  // (torch missing, slow cold start) may be retried on a later call.
  static pyTorchCache = null;
  // Process-wide lock flag: at most one training subprocess per node.
  static trainingInProgress = false;
  /**
   * Probe whether `python3` can import torch. Spawns a throwaway interpreter
   * with a 30s watchdog; resolves false on non-zero exit, spawn error, or
   * timeout. A positive result is cached for the life of the process.
   */
  async isPyTorchAvailable() {
    if (_TrainerHelper.pyTorchCache === true) return true;
    const result = await new Promise((res) => {
      const proc = spawn("python3", [
        "-c",
        "import torch; print(torch.__version__)"
      ], {
        stdio: [
          "ignore",
          "pipe",
          "pipe"
        ]
      });
      let settled = false;
      let timer = null;
      const settle = /* @__PURE__ */ __name((v) => {
        if (!settled) {
          settled = true;
          // Fix: clear the 30s watchdog as soon as the probe settles, so a
          // fast probe does not keep the event loop alive for the full 30s.
          if (timer !== null) clearTimeout(timer);
          res(v);
        }
      }, "settle");
      proc.on("close", (code) => settle(code === 0));
      proc.on("error", () => settle(false));
      timer = setTimeout(() => {
        try {
          proc.kill();
        } catch {
        }
        settle(false);
      }, 3e4);
    });
    if (result) _TrainerHelper.pyTorchCache = true;
    return result;
  }
  /**
   * Public entry point: run one training job while holding the node-wide
   * single-run lock. The lock is released even when training throws.
   * @throws when another run is already in progress on this node.
   */
  async trainMicroModel(options) {
    if (_TrainerHelper.trainingInProgress) {
      throw new Error("A training run is already in progress on this node \u2014 refusing concurrent execution");
    }
    _TrainerHelper.trainingInProgress = true;
    try {
      return await this.trainMicroModelInner(options);
    } finally {
      _TrainerHelper.trainingInProgress = false;
    }
  }
  /**
   * Validate a proposal's hyperparams against the allowed search space.
   * Returns { valid: true } or { valid: false, error } — first failure wins.
   */
  validateTrainingConfig(proposal) {
    const { hyperparams } = proposal;
    if (hyperparams.learningRate < 1e-4 || hyperparams.learningRate > 0.01) return {
      valid: false,
      error: "learningRate must be between 0.0001 and 0.01"
    };
    if (![16, 32, 64, 128].includes(hyperparams.batchSize)) return {
      valid: false,
      error: "batchSize must be one of: 16, 32, 64, 128"
    };
    if (![64, 128, 192, 256].includes(hyperparams.hiddenDim)) return {
      valid: false,
      error: "hiddenDim must be one of: 64, 128, 192, 256"
    };
    if (hyperparams.numLayers < 2 || hyperparams.numLayers > 8) return {
      valid: false,
      error: "numLayers must be between 2 and 8"
    };
    if (![2, 4, 8].includes(hyperparams.numHeads)) return {
      valid: false,
      error: "numHeads must be one of: 2, 4, 8"
    };
    if (!["gelu", "silu", "relu"].includes(hyperparams.activation)) return {
      valid: false,
      error: "activation must be one of: gelu, silu, relu"
    };
    if (!["layernorm", "rmsnorm"].includes(hyperparams.normalization)) return {
      valid: false,
      error: "normalization must be one of: layernorm, rmsnorm"
    };
    if (!["xavier", "kaiming", "normal"].includes(hyperparams.initScheme)) return {
      valid: false,
      error: "initScheme must be one of: xavier, kaiming, normal"
    };
    if (hyperparams.maxTrainSeconds < 10 || hyperparams.maxTrainSeconds > 300) return {
      valid: false,
      error: "maxTrainSeconds must be between 10 and 300"
    };
    return {
      valid: true
    };
  }
  /**
   * Percentage improvement of currentLoss relative to bestLoss (positive =
   * better). Returns 0 when bestLoss is non-positive (no usable baseline).
   */
  calculateImprovement(currentLoss, bestLoss) {
    if (bestLoss <= 0) return 0;
    return (bestLoss - currentLoss) / bestLoss * 100;
  }
  // ── Private ───────────────────────────────────────────────────────────────
  /**
   * Ask Ollama to unload every currently-loaded model so its weights free RAM
   * before we spawn Python + torch. Uses /api/ps to list residents, then posts
   * {model, keep_alive: 0} to /api/generate for each (Ollama interprets
   * keep_alive=0 as "unload now"). Fully best-effort: 2s total budget, every
   * error swallowed — a stale Ollama must never block training.
   */
  async unloadOllamaModels() {
    const baseUrl = process.env.OLLAMA_URL ?? "http://localhost:11434";
    const deadline = Date.now() + 2e3;
    try {
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), Math.max(500, deadline - Date.now()));
      const listRes = await fetch(`${baseUrl}/api/ps`, {
        signal: controller.signal
      });
      clearTimeout(timer);
      if (!listRes.ok) return;
      const listJson = await listRes.json();
      // Ollama versions differ on the field name; accept both `name`/`model`.
      const loaded = (listJson.models ?? []).map((m) => m.name ?? m.model).filter((n) => typeof n === "string" && n.length > 0);
      if (loaded.length === 0) return;
      logger_default.log(`[trainer] unloading ${loaded.length} ollama model(s) to free RAM: ${loaded.join(", ")}`);
      await Promise.all(loaded.map(async (name) => {
        const ctl = new AbortController();
        const t = setTimeout(() => ctl.abort(), Math.max(300, deadline - Date.now()));
        try {
          await fetch(`${baseUrl}/api/generate`, {
            method: "POST",
            headers: {
              "content-type": "application/json"
            },
            body: JSON.stringify({
              model: name,
              keep_alive: 0,
              prompt: ""
            }),
            signal: ctl.signal
          });
        } catch {
          // Best-effort: individual unload failures are deliberately ignored.
        } finally {
          clearTimeout(t);
        }
      }));
    } catch {
      // Best-effort: an unreachable/stale Ollama must never block training.
    }
  }
  /**
   * Estimate the resident-set memory the training spawn will need and
   * compare to current free RAM. Heuristic, deliberately conservative:
   *
   *   python+torch import  ≈ TRAINING_MEM_FLOOR_MB (fixed; observed range
   *                          700–900 MB before the user model is touched,
   *                          see SIGKILL post-mortem 2026-04-30)
   *   model params (fp32)  ≈ hiddenDim² × numLayers × 12 × 4 bytes
   *   Adam state           ≈ 2× params
   *   activations          ≈ batchSize × 256 (seq) × hiddenDim × numLayers × 4 bytes
   *
   * `numHeads` and the few other knobs don't move the RSS needle enough
   * to bother modeling. The 50% safety multiplier below covers transient
   * spikes (torch graph compilation, dataloader prefetch) that the static
   * estimate misses; previously the pre-check passed yet the cgroup OOM
   * killer still SIGKILL'd at runtime — bump tightens that gap.
   */
  checkMemoryHeadroom(hp) {
    const SEQ_LEN = 256;
    const FLOAT_BYTES = 4;
    const paramsBytes = hp.hiddenDim * hp.hiddenDim * hp.numLayers * 12 * FLOAT_BYTES;
    const adamBytes = paramsBytes * 2;
    const activationsBytes = hp.batchSize * SEQ_LEN * hp.hiddenDim * hp.numLayers * FLOAT_BYTES;
    const estimatedMB = Math.round(TRAINING_MEM_FLOOR_MB + (paramsBytes + adamBytes + activationsBytes) / (1024 * 1024));
    const freeMB = readTrainingHeadroomMB();
    const totalMB = Math.round(totalmem() / (1024 * 1024));
    // 50% safety multiplier (see doc comment above).
    const requiredMB = Math.round(estimatedMB * 1.5);
    return {
      ok: freeMB >= requiredMB,
      freeMB,
      totalMB,
      estimatedMB
    };
  }
  /**
   * Locate scripts/train_micro.py. Dist layouts vary between install modes,
   * so probe several directories relative to this module, then the CWD.
   * Falls back to the first candidate even when nothing exists, so the
   * eventual spawn error names a concrete path.
   */
  resolveTrainScript() {
    try {
      const here = moduleDir();
      const candidates = [
        resolve(here, "scripts/train_micro.py"),
        resolve(here, "../scripts/train_micro.py"),
        resolve(here, "../../scripts/train_micro.py"),
        resolve(here, "../../../scripts/train_micro.py"),
        resolve(here, "../../../../scripts/train_micro.py"),
        resolve(process.cwd(), "scripts/train_micro.py")
      ];
      for (const c of candidates) {
        if (existsSync(c)) return c;
      }
      return candidates[0];
    } catch {
      return resolve(moduleDir(), "scripts/train_micro.py");
    }
  }
  /**
   * Core training run (callers go through trainMicroModel for the lock):
   * free Ollama RAM, pre-check memory headroom, spawn the Python trainer with
   * hyperparams piped as JSON on stdin, stream JSON-lines progress from
   * stdout, classify stderr, and race the whole thing against a hard timeout
   * (TRAINING_TIMEOUT_MS env var, default 20 min) that SIGTERMs the child.
   * @throws on insufficient memory, spawn failure, non-zero exit, missing
   *         result payload, or timeout.
   */
  async trainMicroModelInner(options) {
    const { proposal, datasetPath, hardware, pythonScriptPath = this.resolveTrainScript(), runNumber = 1 } = options;
    const TRAINING_TIMEOUT_MS = parseInt(process.env.TRAINING_TIMEOUT_MS || "1200000", 10);
    await this.unloadOllamaModels().catch(() => {
    });
    const memCheck = this.checkMemoryHeadroom(proposal.hyperparams);
    if (!memCheck.ok) {
      throw new Error(`Insufficient memory for training: free=${memCheck.freeMB}MB, estimated need=${memCheck.estimatedMB}MB (hiddenDim=${proposal.hyperparams.hiddenDim}, batchSize=${proposal.hyperparams.batchSize}, numLayers=${proposal.hyperparams.numLayers}, total=${memCheck.totalMB}MB). Free RAM (raise mem_limit), or wait for other processes to release memory.`);
    }
    let killProcess = null;
    // Shared settle flag so the timeout branch and the close handler cannot
    // both resolve the race.
    const settledHolder = {
      current: false
    };
    const freememAtSpawnMB = readTrainingHeadroomMB();
    const totalMemMB = Math.round(totalmem() / (1024 * 1024));
    const trainingPromise = new Promise((res, reject) => {
      const startTime = Date.now();
      const lossCurve = [];
      logger_default.log(`Spawning: python3 ${pythonScriptPath}`);
      logger_default.log(`Training timeout: ${(TRAINING_TIMEOUT_MS / 1e3).toFixed(0)}s (TRAINING_TIMEOUT_MS env var overrides)`);
      logger_default.log(`Script exists: ${existsSync(pythonScriptPath)}`);
      logger_default.log(`Memory at spawn: free=${freememAtSpawnMB}MB / total=${totalMemMB}MB`);
      // Pin BLAS/OpenMP pools to one thread so torch doesn't oversubscribe
      // the node while inference keeps running.
      const pythonProcess = spawn("python3", [
        "-u",
        pythonScriptPath
      ], {
        stdio: [
          "pipe",
          "pipe",
          "pipe"
        ],
        env: {
          ...process.env,
          OMP_NUM_THREADS: "1",
          MKL_NUM_THREADS: "1",
          OPENBLAS_NUM_THREADS: "1",
          NUMEXPR_NUM_THREADS: "1",
          VECLIB_MAXIMUM_THREADS: "1"
        }
      });
      killProcess = /* @__PURE__ */ __name(() => {
        if (!pythonProcess.killed) pythonProcess.kill("SIGTERM");
      }, "killProcess");
      const hyperparamsPayload = {
        ...proposal.hyperparams,
        dataPath: datasetPath,
        hardware
      };
      const settle = /* @__PURE__ */ __name((err, result) => {
        if (settledHolder.current) return;
        settledHolder.current = true;
        killProcess = null;
        if (err) reject(err);
        else res(result);
      }, "settle");
      pythonProcess.on("error", (error) => {
        settle(new Error(`Failed to spawn python3: ${error.message}. Is python3 installed?`));
      });
      // EPIPE is expected when the child dies before reading its stdin.
      pythonProcess.stdin.on("error", (err) => {
        if (err.code !== "EPIPE") logger_default.warn(`[trainer] stdin error: ${err.message}`);
      });
      const payload = JSON.stringify(hyperparamsPayload);
      logger_default.log(`Sending payload (${payload.length} bytes): dataPath=${hyperparamsPayload.dataPath}, hardware=${hyperparamsPayload.hardware}`);
      pythonProcess.stdin.write(payload);
      pythonProcess.stdin.end();
      let stdout = "";
      let stderr = "";
      let finalResult = null;
      // stdout protocol: one JSON object per line — {step, loss} progress
      // records plus a final {result: {...}} payload. Non-JSON lines ignored.
      pythonProcess.stdout.on("data", (data) => {
        const lines = data.toString().split("\n").filter((line) => line.trim());
        for (const line of lines) {
          try {
            const parsed = JSON.parse(line);
            if (parsed.step !== void 0 && parsed.loss !== void 0) lossCurve.push(parsed.loss);
            if (parsed.result) {
              finalResult = {
                finalLoss: parsed.result.finalLoss,
                valLoss: parsed.result.valLoss,
                // New 2026-05-05: trainer emits these when val set was empty
                // or deadline beat the first batch. Propagated to executor so
                // `improved` gating and reward payout can short-circuit on
                // failed eval rather than trusting the 1e30 sentinel.
                valLossEvalFailed: parsed.result.valLossEvalFailed === true,
                valLossEvalFailureReason: typeof parsed.result.valLossEvalFailureReason === "string" ? parsed.result.valLossEvalFailureReason : void 0
              };
            }
          } catch {
          }
        }
        stdout += data.toString();
      });
      let stderrLineBuf = "";
      // Route stderr lines to the right log level: structured {"stage": ...}
      // traces → info, Python errors → error, everything else → warn.
      const classifyAndLog = /* @__PURE__ */ __name((line) => {
        const trimmed = line.trim();
        if (!trimmed) return;
        if (trimmed.startsWith('{"stage"') || /^\{.*"stage"\s*:/.test(trimmed)) {
          logger_default.info(`[trainer] python3 trace: ${trimmed}`);
          return;
        }
        if (/Error:|Traceback/.test(trimmed)) {
          logger_default.error(`[trainer] python3 stderr: ${trimmed}`);
          return;
        }
        if (/WARNING:/.test(trimmed)) {
          logger_default.warn(`[trainer] python3 stderr: ${trimmed}`);
          return;
        }
        logger_default.warn(`[trainer] python3 stderr: ${trimmed}`);
      }, "classifyAndLog");
      pythonProcess.stderr.on("data", (data) => {
        const chunk = data.toString();
        stderr += chunk;
        // Buffer partial lines so multi-chunk stderr lines classify correctly.
        stderrLineBuf += chunk;
        const nl = stderrLineBuf.lastIndexOf("\n");
        if (nl < 0) return;
        const complete = stderrLineBuf.slice(0, nl);
        stderrLineBuf = stderrLineBuf.slice(nl + 1);
        for (const line of complete.split("\n")) classifyAndLog(line);
      });
      pythonProcess.on("close", (code, signal) => {
        const durationMs = Date.now() - startTime;
        if (stderrLineBuf.trim()) {
          classifyAndLog(stderrLineBuf);
          stderrLineBuf = "";
        }
        // code is null when the child was killed by a signal.
        if (code === null || code !== 0) {
          const killedByOom = code === null && signal === "SIGKILL";
          const base = stderr.trim() ? `Training failed (exit ${code}, signal ${signal ?? "none"}): ${stderr.trim().slice(0, 500)}` : `Training process exited with code ${code ?? "null"}, signal ${signal ?? "none"} \u2014 no output received`;
          const oomContext = killedByOom ? ` \u2014 likely OOM (SIGKILL by cgroup). hiddenDim=${proposal.hyperparams.hiddenDim}, batchSize=${proposal.hyperparams.batchSize}, numLayers=${proposal.hyperparams.numLayers}; memAtSpawn=${freememAtSpawnMB}MB/${totalMemMB}MB. Raise container mem_limit or lower hiddenDim/batchSize.` : "";
          settle(new Error(base + oomContext));
          return;
        }
        if (!finalResult) {
          settle(new Error("Training completed but no result received from Python script"));
          return;
        }
        settle(void 0, {
          runNumber,
          finalLoss: finalResult.finalLoss ?? 0,
          valLoss: finalResult.valLoss ?? 0,
          improvementPercent: 0,
          durationMs,
          config: proposal.hyperparams,
          lossCurve,
          hardwareUsed: hardware,
          valLossEvalFailed: finalResult.valLossEvalFailed === true,
          valLossEvalFailureReason: finalResult.valLossEvalFailureReason
        });
      });
    });
    const timeoutPromise = new Promise((_res, reject) => {
      const timeoutHandle = setTimeout(() => {
        // Mark settled first so the child's eventual close event is ignored.
        settledHolder.current = true;
        if (killProcess) {
          killProcess();
          killProcess = null;
        }
        reject(new Error(`Training timed out after ${TRAINING_TIMEOUT_MS / 1e3}s`));
      }, TRAINING_TIMEOUT_MS);
      trainingPromise.finally(() => clearTimeout(timeoutHandle)).catch(() => {
      });
    });
    return await Promise.race([
      trainingPromise,
      timeoutPromise
    ]);
  }
};
|
|
425
|
+
// Register TrainerHelper with Nest's DI container.
TrainerHelper = _ts_decorate([
  Injectable()
], TrainerHelper);
// Module-level singleton plus free-function wrappers so non-Nest callers can
// use the trainer without going through dependency injection.
var _trainerInstance = new TrainerHelper();
var isPyTorchAvailable = /* @__PURE__ */ __name(() => _trainerInstance.isPyTorchAvailable(), "isPyTorchAvailable");
var trainMicroModel = /* @__PURE__ */ __name((options) => _trainerInstance.trainMicroModel(options), "trainMicroModel");
var validateTrainingConfig = /* @__PURE__ */ __name((proposal) => _trainerInstance.validateTrainingConfig(proposal), "validateTrainingConfig");
var calculateImprovement = /* @__PURE__ */ __name((currentLoss, bestLoss) => _trainerInstance.calculateImprovement(currentLoss, bestLoss), "calculateImprovement");
|
|
433
|
+
|
|
434
|
+
export {
|
|
435
|
+
TRAINING_MEM_FLOOR_MB,
|
|
436
|
+
TRAINER_EVAL_FAILED_SENTINEL,
|
|
437
|
+
TrainerHelper,
|
|
438
|
+
isPyTorchAvailable,
|
|
439
|
+
trainMicroModel,
|
|
440
|
+
validateTrainingConfig,
|
|
441
|
+
calculateImprovement
|
|
442
|
+
};
|
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
import { fileURLToPath as __synFup } from "url";import { dirname as __synDn } from "path";const __filename = __synFup(import.meta.url);const __dirname = __synDn(__filename);
|
|
2
|
+
import {
|
|
3
|
+
__name
|
|
4
|
+
} from "./chunk-D7ADMHK2.js";
|
|
5
|
+
|
|
6
|
+
// src/modules/inference/chat-inference-state.ts
|
|
7
|
+
// Module-level counter of in-flight chat inference requests.
var active = 0;
// Record that one more chat inference has started.
function beginChatInference() {
  active = active + 1;
}
__name(beginChatInference, "beginChatInference");
// Record that one chat inference has finished; never drops below zero.
function endChatInference() {
  active = active > 0 ? active - 1 : 0;
}
__name(endChatInference, "endChatInference");
// True while at least one chat inference is in flight.
function isChatInferenceActive() {
  return active !== 0;
}
__name(isChatInferenceActive, "isChatInferenceActive");
|
|
20
|
+
|
|
21
|
+
export {
|
|
22
|
+
beginChatInference,
|
|
23
|
+
endChatInference,
|
|
24
|
+
isChatInferenceActive
|
|
25
|
+
};
|
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
import { fileURLToPath as __synFup } from "url";import { dirname as __synDn } from "path";const __filename = __synFup(import.meta.url);const __dirname = __synDn(__filename);
|
|
2
|
+
import {
|
|
3
|
+
MODEL_CATALOG,
|
|
4
|
+
ModelCatalogHelper
|
|
5
|
+
} from "./chunk-PRVT22SM.js";
|
|
6
|
+
import {
|
|
7
|
+
init_logger,
|
|
8
|
+
logger_default
|
|
9
|
+
} from "./chunk-V2L5SXTL.js";
|
|
10
|
+
import {
|
|
11
|
+
__esm,
|
|
12
|
+
__name
|
|
13
|
+
} from "./chunk-D7ADMHK2.js";
|
|
14
|
+
|
|
15
|
+
// src/utils/node-auth.ts
|
|
16
|
+
import * as ed from "@noble/ed25519";
|
|
17
|
+
import { sha256, sha512 } from "@noble/hashes/sha2.js";
|
|
18
|
+
// Build the signed X-* auth headers for a coordinator HTTP request.
// The signed message is `peerId:timestamp:path:bodyHash`, where the body is
// canonicalized (object keys sorted recursively) before hashing so both
// sides hash identical bytes. NOTE(review): `method` is accepted but not
// part of the signed message — presumably matching the verifier; confirm.
async function buildAuthHeaders(params) {
  const { method, path, body, privateKey, publicKey, peerId } = params;
  const encoder = new TextEncoder();
  const timestamp = Date.now();
  const isJsonBody = typeof body === "object" && body !== null;
  const canonicalBody = isJsonBody ? JSON.stringify(sortObjectKeys(body)) : String(body ?? "");
  const bodyHash = Buffer.from(sha256(encoder.encode(canonicalBody))).toString("base64");
  const signedMessage = `${peerId}:${timestamp}:${path}:${bodyHash}`;
  const signature = sign(encoder.encode(signedMessage), privateKey);
  return {
    "X-Peer-Id": peerId,
    "X-Public-Key": Buffer.from(publicKey).toString("base64"),
    "X-Timestamp": String(timestamp),
    "X-Signature": Buffer.from(signature).toString("base64")
  };
}
|
|
33
|
+
// Normalize an Ed25519 key to its raw 32-byte form. A 32-byte input is
// returned as-is; longer inputs (e.g. DER/PKCS8-wrapped keys, where the raw
// key is the trailing bytes) are sliced to their last 32 bytes.
// @throws when the input is shorter than 32 bytes.
function rawEd25519Key(buf) {
  const RAW_LEN = 32;
  if (buf.length < RAW_LEN) {
    throw new Error(`rawEd25519Key: input too short (${buf.length} bytes)`);
  }
  return buf.length === RAW_LEN ? buf : buf.slice(buf.length - RAW_LEN);
}
|
|
40
|
+
// Build the signed fields for the WebSocket handshake. Both keys are first
// normalized to raw 32-byte form; the signed message is
// `timestamp:websocket:handshake`, and key/signature are hex-encoded
// (unlike the base64 used by the HTTP auth headers).
function buildWsHandshakeAuth(params) {
  const { peerId } = params;
  const rawPrivate = rawEd25519Key(params.privateKey);
  const rawPublic = rawEd25519Key(params.publicKey);
  const timestamp = Date.now();
  const payload = new TextEncoder().encode(`${timestamp}:websocket:handshake`);
  const signature = sign(payload, rawPrivate);
  return {
    peerId,
    publicKey: Buffer.from(rawPublic).toString("hex"),
    timestamp: String(timestamp),
    signature: Buffer.from(signature).toString("hex")
  };
}
|
|
55
|
+
// Recursively rebuild a JSON-like value with object keys in sorted order so
// JSON.stringify produces a canonical byte sequence for signing. Arrays keep
// their element order; primitives pass through unchanged.
function sortObjectKeys(obj) {
  if (obj === null || typeof obj !== "object") return obj;
  if (Array.isArray(obj)) return obj.map(sortObjectKeys);
  const canonical = {};
  for (const key of Object.keys(obj).sort()) {
    canonical[key] = sortObjectKeys(obj[key]);
  }
  return canonical;
}
|
|
67
|
+
// `sign` is populated by init_node_auth() below from @noble/ed25519.
var sign;
// Lazy module initializer (esbuild __esm wrapper): the body runs once, on the
// first init_node_auth() call. It wires a sha512 implementation into
// @noble/ed25519 — presumably required before ed.sign can hash synchronously
// (TODO confirm against @noble/ed25519 docs) — extracts `sign`, and registers
// display names for the helpers defined above.
var init_node_auth = __esm({
  "src/utils/node-auth.ts"() {
    "use strict";
    ed.hashes.sha512 = sha512;
    ({ sign } = ed);
    __name(buildAuthHeaders, "buildAuthHeaders");
    __name(rawEd25519Key, "rawEd25519Key");
    __name(buildWsHandshakeAuth, "buildWsHandshakeAuth");
    __name(sortObjectKeys, "sortObjectKeys");
  }
});
|
|
79
|
+
|
|
80
|
+
// src/modules/discovery/model-discovery.ts
|
|
81
|
+
init_logger();
|
|
82
|
+
import axios from "axios";
|
|
83
|
+
import { Injectable } from "@nestjs/common";
|
|
84
|
+
init_node_auth();
|
|
85
|
+
// TypeScript decorator runtime helper — duplicated per chunk by the bundler
// (same shape as the copy in the trainer chunk). Decorates a class (c < 3)
// or a member descriptor, preferring native Reflect.decorate, otherwise
// applying `decorators` right-to-left and re-installing the descriptor.
function _ts_decorate(decorators, target, key, desc) {
  var c = arguments.length, r = c < 3 ? target : desc === null ? desc = Object.getOwnPropertyDescriptor(target, key) : desc, d;
  if (typeof Reflect === "object" && typeof Reflect.decorate === "function") r = Reflect.decorate(decorators, target, key, desc);
  else for (var i = decorators.length - 1; i >= 0; i--) if (d = decorators[i]) r = (c < 3 ? d(r) : c > 3 ? d(target, key, r) : d(target, key)) || r;
  return c > 3 && r && Object.defineProperty(target, key, r), r;
}
__name(_ts_decorate, "_ts_decorate");
|
|
92
|
+
var ModelDiscovery = class {
  static {
    __name(this, "ModelDiscovery");
  }
  // Hash of the last model set successfully announced; used only to gate the
  // "Registered N model(s)" log line on actual changes.
  lastRegisteredHash = "";
  /**
   * Discover locally available models and register them with the coordinator.
   * Called periodically after heartbeat to keep the model registry in sync.
   * Signs the request when an identity is available; registration failures
   * are logged as warnings, never thrown.
   */
  async registerModels(coordinatorUrl, peerId, hardware, identity, ollamaUrl) {
    try {
      const discovered = new ModelCatalogHelper().getLocalModels(ollamaUrl);
      if (discovered.length === 0) {
        logger_default.log("[ModelDiscovery] No local models found, skipping registration");
        return;
      }
      const models = this.buildModelList(discovered, hardware);
      const hash = this.hashModels(models);
      const modelsChanged = hash !== this.lastRegisteredHash;
      const payload = {
        peerId,
        models,
        inferencePort: Number(process.env.INFERENCE_PORT) || 8080,
        inferencePublicUrl: process.env.INFERENCE_PUBLIC_URL || void 0
      };
      const headers = {
        "Content-Type": "application/json"
      };
      if (identity?.privateKey && identity?.publicKey) {
        // Best-effort signing: on failure the request still goes out unsigned.
        try {
          const auth = await buildAuthHeaders({
            method: "POST",
            path: "/inference/register",
            body: payload,
            privateKey: Buffer.from(identity.privateKey, "hex"),
            publicKey: Buffer.from(identity.publicKey, "hex"),
            peerId: identity.peerId
          });
          Object.assign(headers, auth);
        } catch (signErr) {
          logger_default.warn("[ModelDiscovery] Failed to sign request:", signErr.message);
        }
      }
      await axios.post(`${coordinatorUrl}/inference/register`, payload, {
        timeout: 5e3,
        headers
      });
      if (modelsChanged) {
        this.lastRegisteredHash = hash;
        logger_default.log(`[ModelDiscovery] Registered ${models.length} model(s) with coordinator`);
      }
    } catch (error) {
      logger_default.warn(`[ModelDiscovery] Failed to register models: ${error.message}`);
    }
  }
  /**
   * Build model info list matching coordinator's expected format.
   * Names are "base[:tag]"; the tag becomes the quantization field and the
   * base name is matched (case-insensitively) against MODEL_CATALOG for
   * capability and VRAM metadata.
   */
  buildModelList(localModelNames, hardware) {
    return localModelNames.map((name) => {
      const [baseName, tag] = name.split(":");
      const catalogEntry = MODEL_CATALOG.find((m) => m.name.toLowerCase() === baseName.toLowerCase());
      const isEmbedding = catalogEntry?.category === "embedding";
      const capabilities = isEmbedding ? ["embedding"] : ["inference"];
      if (!isEmbedding && catalogEntry?.category === "code") capabilities.push("code");
      return {
        name: baseName,
        quantization: tag ?? "default",
        vram: catalogEntry?.minVram ?? 0,
        maxContextLength: catalogEntry?.minVram && catalogEntry.minVram >= 4 ? 8192 : 4096,
        capabilities
      };
    });
  }
  /**
   * Simple hash of model names for change detection (order-insensitive:
   * sorted "name:quantization" pairs joined with commas).
   */
  hashModels(models) {
    const keys = models.map((m) => `${m.name}:${m.quantization}`);
    keys.sort();
    return keys.join(",");
  }
};
|
|
177
|
+
// Register ModelDiscovery with Nest's DI container.
ModelDiscovery = _ts_decorate([
  Injectable()
], ModelDiscovery);
|
|
180
|
+
|
|
181
|
+
export {
|
|
182
|
+
buildAuthHeaders,
|
|
183
|
+
buildWsHandshakeAuth,
|
|
184
|
+
init_node_auth,
|
|
185
|
+
ModelDiscovery
|
|
186
|
+
};
|