@tradejs/cli 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +65 -0
- package/dist/cli.js +10727 -0
- package/dist/lib/runBot.js +157 -0
- package/dist/scripts/backtest.js +2044 -0
- package/dist/scripts/bot.js +153 -0
- package/dist/scripts/cleanDir.js +40 -0
- package/dist/scripts/cleanRedis.js +40 -0
- package/dist/scripts/cleanTests.js +81 -0
- package/dist/scripts/continuity.js +183 -0
- package/dist/scripts/derivativesIngest.js +107 -0
- package/dist/scripts/derivativesIngestCoinalyzeAll.js +391 -0
- package/dist/scripts/doctor.js +5143 -0
- package/dist/scripts/findMlSignalsByTestSuite.js +83 -0
- package/dist/scripts/infraCommon.js +135 -0
- package/dist/scripts/infraDown.js +82 -0
- package/dist/scripts/infraInit.js +107 -0
- package/dist/scripts/infraUp.js +82 -0
- package/dist/scripts/migration.js +67 -0
- package/dist/scripts/mlExport.js +95 -0
- package/dist/scripts/mlExportSelect.js +100 -0
- package/dist/scripts/mlInspect.js +553 -0
- package/dist/scripts/mlTrainLatestSelect.js +1056 -0
- package/dist/scripts/results.js +1909 -0
- package/dist/scripts/selectStrategy.js +99 -0
- package/dist/scripts/signals.js +300 -0
- package/dist/scripts/test-ml.js +133 -0
- package/dist/scripts/test.js +16 -0
- package/dist/scripts/user-add.js +64 -0
- package/dist/workers/testerWorker.js +54 -0
- package/package.json +75 -0
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Bundler-generated CommonJS/ESM interop helpers.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Define a live getter on `to` for every own property of `from` that `to`
 * does not already own, skipping the key named by `except`.
 * Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = from && typeof from === "object" || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

/**
 * Wrap a required module so it can be consumed through ESM-style access.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  const base = needsDefault ? __defProp(target, "default", { value: mod, enumerable: true }) : target;
  return __copyProps(base, mod);
};
|
|
24
|
+
|
|
25
|
+
// src/scripts/mlExportSelect.ts
|
|
26
|
+
var import_child_process = require("child_process");
|
|
27
|
+
|
|
28
|
+
// src/scripts/selectStrategy.ts
|
|
29
|
+
var import_readline = __toESM(require("readline"));
|
|
30
|
+
var import_chalk = __toESM(require("chalk"));
|
|
31
|
+
var import_strategies = require("@tradejs/node/strategies");
|
|
32
|
+
/** Strategy name that is highlighted and used when no explicit choice is made. */
var defaultStrategy = "TrendLine";

/**
 * Resolve the strategy names offered to the user.
 *
 * Asks `getAvailableStrategyNames()` for the list and returns it when it is
 * non-empty. If the lookup throws (a warning is logged) or yields an empty
 * list, a built-in list of names is returned instead.
 *
 * @returns {Promise<string[]>} Non-empty list of strategy names.
 */
var getStrategyChoices = async () => {
  const builtInChoices = [
    "Breakout",
    "MaStrategy",
    "AdaptiveMomentumRibbon",
    "TrendLine",
    "VolumeDivergence"
  ];
  try {
    const names = await (0, import_strategies.getAvailableStrategyNames)();
    return names.length ? names : builtInChoices;
  } catch (error) {
    console.warn(`Failed to load strategy list: ${String(error)}`);
    return builtInChoices;
  }
};
|
|
50
|
+
/**
 * Interactively ask the user which strategy to use.
 *
 * When stdin is not a TTY no prompt is shown and the fallback strategy is
 * returned immediately. The fallback is `defaultStrategy` when it is among
 * the available choices, otherwise the first available choice.
 *
 * The answer may be a 1-based list index or a case-insensitive strategy name.
 * An empty or unrecognised answer resolves to the fallback strategy.
 *
 * @param {string} [promptLabel="Select strategy"] Text shown before the prompt.
 * @returns {Promise<string>} The selected strategy name.
 */
var selectStrategy = async (promptLabel = "Select strategy") => {
  const strategies = await getStrategyChoices();
  const fallbackStrategy = strategies.includes(defaultStrategy) ? defaultStrategy : strategies[0];
  if (!process.stdin.isTTY) {
    return fallbackStrategy;
  }
  console.log(import_chalk.default.cyan("Available strategies:"));
  strategies.forEach((name, index) => {
    const isDefault = name === defaultStrategy;
    const label = isDefault ? import_chalk.default.green(name) : name;
    const suffix = isDefault ? import_chalk.default.gray(" (default)") : "";
    console.log(` ${import_chalk.default.yellow(String(index + 1))}) ${label}${suffix}`);
  });
  const rl = import_readline.default.createInterface({
    input: process.stdin,
    output: process.stdout
  });
  let answer;
  try {
    answer = await new Promise((resolve) => {
      rl.question(`${promptLabel} [${import_chalk.default.green(fallbackStrategy)}]: `, resolve);
    });
  } finally {
    // Always release stdin, even if prompting throws.
    rl.close();
  }
  const trimmed = answer.trim();
  if (!trimmed) {
    return fallbackStrategy;
  }
  const asNumber = Number(trimmed);
  // Only whole numbers are valid list positions; a fractional answer such as
  // "1.5" would otherwise index the array with 0.5 and yield undefined.
  if (Number.isInteger(asNumber) && asNumber >= 1 && asNumber <= strategies.length) {
    return strategies[asNumber - 1];
  }
  const wanted = trimmed.toLowerCase();
  const byName = strategies.find((name) => name.toLowerCase() === wanted);
  if (byName) {
    return byName;
  }
  console.warn(`Unknown strategy "${trimmed}", using ${fallbackStrategy}.`);
  return fallbackStrategy;
};
|
|
89
|
+
|
|
90
|
+
// src/scripts/mlExportSelect.ts
|
|
91
|
+
/**
 * Prompt for a strategy, then run the ML export script for it through
 * `yarn ts-node`, inheriting stdio. The process exits with the child's exit
 * status, or 1 when no status is available.
 */
var run = async () => {
  const selected = await selectStrategy();
  const args = ["ts-node", "./src/scripts/mlExport", "--strategy", selected];
  const result = (0, import_child_process.spawnSync)("yarn", args, { stdio: "inherit" });
  if (result.error) {
    // spawnSync reports launch failures (e.g. yarn not found) through
    // `error` with a null status; surface it instead of exiting silently.
    console.error("Failed to launch yarn:", result.error);
  }
  process.exit(result.status ?? 1);
};
run().catch((err) => {
  console.error("Failed to select strategy:", err);
  process.exit(1);
});
|
|
@@ -0,0 +1,553 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Bundler-generated CommonJS/ESM interop helpers.
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;

/**
 * Define a live getter on `to` for every own property of `from` that `to`
 * does not already own, skipping the key named by `except`.
 * Returns `to`.
 */
var __copyProps = (to, from, except, desc) => {
  const isCopyable = from && typeof from === "object" || typeof from === "function";
  if (!isCopyable) {
    return to;
  }
  for (const key of __getOwnPropNames(from)) {
    if (__hasOwnProp.call(to, key) || key === except) {
      continue;
    }
    desc = __getOwnPropDesc(from, key);
    __defProp(to, key, {
      get: () => from[key],
      enumerable: !desc || desc.enumerable
    });
  }
  return to;
};

/**
 * Wrap a required module so it can be consumed through ESM-style access.
 */
var __toESM = (mod, isNodeMode, target) => {
  target = mod != null ? __create(__getProtoOf(mod)) : {};
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  const needsDefault = isNodeMode || !mod || !mod.__esModule;
  const base = needsDefault ? __defProp(target, "default", { value: mod, enumerable: true }) : target;
  return __copyProps(base, mod);
};
|
|
24
|
+
|
|
25
|
+
// src/scripts/mlInspect.ts
|
|
26
|
+
var import_args = __toESM(require("args"));
|
|
27
|
+
var import_chalk = __toESM(require("chalk"));
|
|
28
|
+
var import_promises = __toESM(require("fs/promises"));
|
|
29
|
+
var import_fs = require("fs");
|
|
30
|
+
var import_path = __toESM(require("path"));
|
|
31
|
+
var import_readline = __toESM(require("readline"));
|
|
32
|
+
var import_child_process = require("child_process");
|
|
33
|
+
// Command-line interface definition for the ml-inspect script.
import_args.default.example(
  "yarn ml-inspect --rows 10000 --mode sample",
  "Inspect the latest ML dataset chunk and highlight problematic features"
);
// Options are registered in a single chain; each entry is
// ([short, long], description, default value).
import_args.default
  .option(["d", "dir"], "Dataset directory", "data/ml/export")
  .option(["r", "rows"], "Rows to inspect", 1e4)
  .option(["m", "mode"], "head | tail | sample", "sample")
  .option(["S", "strategy"], "Strategy token in dataset filename", "")
  .option(["f", "file"], "Explicit dataset file path (overrides auto-select)")
  .option(["L", "limitIssues"], "How many fields to print in report", 25)
  .option(["M", "minFieldValues"], "Min valid values per numeric field", 50)
  .option(["T", "tool"], "quick | ydata", "");
// Parsed flag values, read later by main().
var flags = import_args.default.parse(process.argv);
|
|
49
|
+
/**
 * Normalise a row-selection mode.
 *
 * @param {*} value Raw flag value; falsy values count as "sample".
 * @returns {"head"|"tail"|"sample"} The lower-cased mode when it is one of the
 *   three known modes, otherwise "sample".
 */
var toMode = (value) => {
  const knownModes = ["head", "tail", "sample"];
  const requested = String(value || "sample").toLowerCase();
  return knownModes.includes(requested) ? requested : "sample";
};
|
|
56
|
+
/**
 * Normalise an inspect-tool name.
 *
 * @param {*} value Raw flag value.
 * @returns {"quick"|"ydata"|null} The trimmed, lower-cased tool name when it
 *   is "quick" or "ydata"; null for empty or unrecognised input.
 */
var toInspectTool = (value) => {
  const normalized = String(value || "").trim().toLowerCase();
  switch (normalized) {
    case "quick":
    case "ydata":
      return normalized;
    default:
      return null;
  }
};
|
|
62
|
+
/**
 * Interactively ask the user which inspect tool to run.
 *
 * When stdin is not a TTY no prompt is shown and `defaultTool` is returned.
 * The answer may be a 1-based list index or a tool name (case-insensitive).
 * An empty or unrecognised answer resolves to `defaultTool`.
 *
 * @param {string} [defaultTool="quick"] Tool used when nothing valid is chosen.
 * @returns {Promise<string>} "quick", "ydata", or `defaultTool`.
 */
var selectInspectTool = async (defaultTool = "quick") => {
  if (!process.stdin.isTTY) {
    return defaultTool;
  }
  const options = ["quick", "ydata"];
  console.log(import_chalk.default.cyan("Available inspect tools:"));
  options.forEach((name, index) => {
    const isDefault = name === defaultTool;
    const label = isDefault ? import_chalk.default.green(name) : name;
    const suffix = isDefault ? import_chalk.default.gray(" (default)") : "";
    console.log(` ${import_chalk.default.yellow(String(index + 1))}) ${label}${suffix}`);
  });
  const rl = import_readline.default.createInterface({
    input: process.stdin,
    output: process.stdout
  });
  let answer;
  try {
    answer = await new Promise((resolve) => {
      rl.question(`Select inspect tool [${import_chalk.default.green(defaultTool)}]: `, resolve);
    });
  } finally {
    // Always release stdin, even if prompting throws.
    rl.close();
  }
  const trimmed = answer.trim().toLowerCase();
  if (!trimmed) return defaultTool;
  const asNumber = Number(trimmed);
  // Only whole numbers are valid list positions; a fractional answer such as
  // "1.5" would otherwise index the array with 0.5 and yield undefined.
  if (Number.isInteger(asNumber) && asNumber >= 1 && asNumber <= options.length) {
    return options[asNumber - 1];
  }
  if (options.includes(trimmed)) {
    return trimmed;
  }
  console.warn(
    `Unknown inspect tool "${answer.trim()}", using ${defaultTool}.`
  );
  return defaultTool;
};
|
|
97
|
+
/**
 * Parse a value as a base-10 integer and accept it only when it is positive.
 *
 * @param {*} value Raw value; null/undefined are treated as an empty string.
 * @param {number} fallback Returned when parsing fails or the result is <= 0.
 * @returns {number} The parsed positive integer, or `fallback`.
 */
var asPositiveInt = (value, fallback) => {
  const text = String(value ?? "");
  const parsed = Number.parseInt(text, 10);
  if (!Number.isFinite(parsed)) {
    return fallback;
  }
  return parsed > 0 ? parsed : fallback;
};
|
|
101
|
+
/**
 * Read up to `maxRows` rows from the start of a JSON-lines file.
 *
 * Blank lines, lines that are not valid JSON, and JSON values that are not
 * plain objects (arrays, primitives, null) are skipped.
 *
 * @param {string} filePath Path of the file to read.
 * @param {number} maxRows Number of rows after which reading stops.
 * @returns {Promise<object[]>} Parsed rows in file order.
 */
var readRowsHead = async (filePath, maxRows) => {
  const rows = [];
  const input = (0, import_fs.createReadStream)(filePath, { encoding: "utf8" });
  const rl = import_readline.default.createInterface({
    input,
    crlfDelay: Infinity
  });
  try {
    for await (const line of rl) {
      const trimmed = line.trim();
      if (!trimmed) continue;
      try {
        const row = JSON.parse(trimmed);
        if (row && typeof row === "object" && !Array.isArray(row)) {
          rows.push(row);
        }
      } catch {
        // Deliberately best-effort: malformed lines are ignored.
      }
      if (rows.length >= maxRows) {
        break;
      }
    }
  } finally {
    // Closing the readline interface does not close its input stream, so the
    // stream is destroyed explicitly to release the file descriptor when
    // reading stops before end of file.
    rl.close();
    input.destroy();
  }
  return rows;
};
|
|
124
|
+
/**
 * Read the last `maxRows` rows of a JSON-lines file.
 *
 * The whole file is streamed while only the most recent `maxRows` rows are
 * kept in a circular buffer. Blank lines, lines that are not valid JSON, and
 * JSON values that are not plain objects are skipped.
 *
 * @param {string} filePath Path of the file to read.
 * @param {number} maxRows Maximum number of rows to keep.
 * @returns {Promise<object[]>} The retained rows in file order.
 */
var readRowsTail = async (filePath, maxRows) => {
  const buffer = [];
  // Position of the oldest retained row once the buffer is full.
  let oldest = 0;
  const lineReader = import_readline.default.createInterface({
    input: (0, import_fs.createReadStream)(filePath, { encoding: "utf8" }),
    crlfDelay: Infinity
  });
  for await (const rawLine of lineReader) {
    const text = rawLine.trim();
    if (text === "") continue;
    let parsed;
    try {
      parsed = JSON.parse(text);
    } catch {
      continue;
    }
    const isPlainRow = parsed && typeof parsed === "object" && !Array.isArray(parsed);
    if (!isPlainRow) continue;
    if (buffer.length < maxRows) {
      buffer.push(parsed);
      continue;
    }
    buffer[oldest] = parsed;
    oldest = (oldest + 1) % maxRows;
  }
  const alreadyOrdered = buffer.length < maxRows || oldest === 0;
  if (alreadyOrdered) {
    return buffer;
  }
  // Rotate so the oldest retained row comes first.
  return [...buffer.slice(oldest), ...buffer.slice(0, oldest)];
};
|
|
151
|
+
/**
 * Draw a random sample of up to `maxRows` rows from a JSON-lines file.
 *
 * The file is streamed once. The first `maxRows` eligible rows fill the
 * sample; each later row picks a random index in [0, rows seen so far) and
 * overwrites that sample slot when the index is below `maxRows`.
 * Blank lines, lines that are not valid JSON, and JSON values that are not
 * plain objects are skipped and not counted.
 *
 * @param {string} filePath Path of the file to read.
 * @param {number} maxRows Maximum sample size.
 * @returns {Promise<object[]>} The sampled rows.
 */
var readRowsSample = async (filePath, maxRows) => {
  const reservoir = [];
  let eligibleCount = 0;
  const lineReader = import_readline.default.createInterface({
    input: (0, import_fs.createReadStream)(filePath, { encoding: "utf8" }),
    crlfDelay: Infinity
  });
  for await (const rawLine of lineReader) {
    const text = rawLine.trim();
    if (text === "") continue;
    let candidate;
    try {
      candidate = JSON.parse(text);
    } catch {
      continue;
    }
    if (!candidate || typeof candidate !== "object" || Array.isArray(candidate)) continue;
    eligibleCount += 1;
    if (reservoir.length < maxRows) {
      reservoir.push(candidate);
      continue;
    }
    const slot = Math.floor(Math.random() * eligibleCount);
    if (slot < maxRows) {
      reservoir[slot] = candidate;
    }
  }
  return reservoir;
};
|
|
178
|
+
/**
 * Linearly interpolated quantile of an ascending-sorted numeric array.
 *
 * @param {number[]} sorted Values sorted in ascending order.
 * @param {number} q Quantile as a fraction (callers pass values in [0, 1]).
 * @returns {number} 0 for an empty array, the sole element for a
 *   single-element array, otherwise the interpolated value.
 */
var quantileSorted = (sorted, q) => {
  const lastIndex = sorted.length - 1;
  if (lastIndex < 0) return 0;
  if (lastIndex === 0) return sorted[0];
  const position = lastIndex * q;
  const lowerIndex = Math.floor(position);
  const upperIndex = Math.min(lowerIndex + 1, lastIndex);
  const fraction = position - lowerIndex;
  const lowerValue = sorted[lowerIndex];
  const upperValue = sorted[upperIndex];
  return lowerValue + (upperValue - lowerValue) * fraction;
};
|
|
188
|
+
/**
 * Arithmetic mean of a numeric array.
 *
 * @param {number[]} values Numbers to average.
 * @returns {number} The mean, or 0 for an empty array.
 */
var mean = (values) => {
  const count = values.length;
  if (count === 0) return 0;
  const sum = values.reduce((running, value) => running + value, 0);
  return sum / count;
};
|
|
194
|
+
/**
 * Standard deviation of a numeric array around a supplied mean, dividing the
 * summed squared deviations by the number of values.
 *
 * @param {number[]} values Numbers to measure.
 * @param {number} valuesMean Mean to measure deviations from.
 * @returns {number} The standard deviation, or 0 for an empty array.
 */
var std = (values, valuesMean) => {
  const count = values.length;
  if (count === 0) return 0;
  const squaredDeviationSum = values.reduce((running, value) => {
    const deviation = value - valuesMean;
    return running + deviation * deviation;
  }, 0);
  return Math.sqrt(squaredDeviationSum / count);
};
|
|
203
|
+
/**
 * Locate the most recently modified dataset file in a directory.
 *
 * Candidates are regular files ending in ".jsonl" whose names contain
 * neither ".train." nor ".test.". With a strategy token the lower-cased name
 * must contain `ml-dataset-<token>-`; without one it must start with
 * "ml-dataset-".
 *
 * @param {{dir: string, strategy: string}} params Directory to scan and the
 *   strategy token (may be empty).
 * @returns {Promise<string|null>} Path of the candidate with the greatest
 *   modification time, or null when nothing matches.
 */
var findLatestDataset = async (params) => {
  const { dir, strategy } = params;
  const entries = await import_promises.default.readdir(dir, { withFileTypes: true });
  const strategyToken = strategy.trim().toLowerCase();
  const isCandidate = (name) => {
    if (!name.endsWith(".jsonl")) return false;
    if (name.includes(".train.") || name.includes(".test.")) return false;
    const lowered = name.toLowerCase();
    return strategyToken ? lowered.includes(`ml-dataset-${strategyToken}-`) : lowered.startsWith("ml-dataset-");
  };
  const candidates = [];
  for (const entry of entries) {
    if (entry.isFile() && isCandidate(entry.name)) {
      candidates.push(entry.name);
    }
  }
  if (candidates.length === 0) {
    return null;
  }
  const stamped = await Promise.all(
    candidates.map(async (name) => {
      const { mtimeMs } = await import_promises.default.stat(import_path.default.join(dir, name));
      return { name, mtime: mtimeMs };
    })
  );
  // Strict ">" keeps the earliest-listed file when modification times tie.
  const newest = stamped.reduce((best, current) => current.mtime > best.mtime ? current : best);
  return import_path.default.join(dir, newest.name);
};
|
|
226
|
+
/**
 * Compute per-field numeric statistics for a set of rows and score each field
 * against a fixed list of data-quality rules.
 *
 * Every key seen in any row is treated as a field. null, undefined and ""
 * count as missing; other values are converted with Number() and counted as
 * non-finite when the result is not a finite number. Fields with fewer than
 * `minFieldValues` finite values are left out of the result.
 *
 * @param {object[]} rows Parsed dataset rows.
 * @param {number} minFieldValues Minimum finite values for a field to be kept.
 * @returns {object[]} One stats record per kept field, sorted by descending
 *   score and then by descending outlier rate.
 */
var buildNumericStats = (rows, minFieldValues) => {
  // Union of keys across all rows, in first-seen order.
  const fieldNames = new Set(rows.flatMap((row) => Object.keys(row)));
  const asPercent = (rate) => (rate * 100).toFixed(1);
  const records = [];

  for (const field of fieldNames) {
    const finiteValues = [];
    const tally = { missing: 0, nonFinite: 0, zeros: 0, present: 0 };
    for (const row of rows) {
      const raw = row[field];
      if (raw === null || raw === void 0 || raw === "") {
        tally.missing += 1;
        continue;
      }
      tally.present += 1;
      const numeric = Number(raw);
      if (!Number.isFinite(numeric)) {
        tally.nonFinite += 1;
        continue;
      }
      finiteValues.push(numeric);
      if (numeric === 0) {
        tally.zeros += 1;
      }
    }
    if (finiteValues.length < minFieldValues) {
      continue;
    }

    const ascending = [...finiteValues].sort((a, b) => a - b);
    const q1 = quantileSorted(ascending, 0.25);
    const median = quantileSorted(ascending, 0.5);
    const q3 = quantileSorted(ascending, 0.75);
    const p95 = quantileSorted(ascending, 0.95);
    const p99 = quantileSorted(ascending, 0.99);
    const valuesMean = mean(finiteValues);
    const valuesStd = std(finiteValues, valuesMean);
    // Outlier fences sit three interquartile ranges beyond the quartiles.
    const spread = q3 - q1;
    const lowerFence = q1 - 3 * spread;
    const upperFence = q3 + 3 * spread;
    let outlierCount = 0;
    for (const value of finiteValues) {
      if (value < lowerFence || value > upperFence) {
        outlierCount += 1;
      }
    }
    const validDivisor = Math.max(finiteValues.length, 1);

    records.push({
      field,
      count: rows.length,
      validCount: finiteValues.length,
      missingRate: tally.missing / rows.length,
      nonFiniteRate: tally.nonFinite / Math.max(tally.present, 1),
      zeroRate: tally.zeros / validDivisor,
      uniqueCount: new Set(finiteValues).size,
      min: ascending[0],
      max: ascending[ascending.length - 1],
      mean: valuesMean,
      std: valuesStd,
      median,
      q1,
      q3,
      p95,
      p99,
      outlierRate: outlierCount / validDivisor,
      scaleRatio: 0,
      issues: [],
      score: 0
    });
  }

  // Reference scale: median of the non-zero absolute field medians (1 if none).
  const nonZeroAbsMedians = records
    .map((record) => Math.abs(record.median))
    .filter((value) => value > 0)
    .sort((a, b) => a - b);
  const globalMedianAbs = quantileSorted(nonZeroAbsMedians, 0.5) || 1;

  for (const record of records) {
    const found = [];
    let points = 0;
    const flag = (message, weight) => {
      found.push(message);
      points += weight;
    };
    const absMedian = Math.abs(record.median);
    const p99ToMedian = Math.abs(record.p99) / Math.max(absMedian, 1e-12);
    const scaleRatio = absMedian / globalMedianAbs;
    record.scaleRatio = scaleRatio;

    if (record.nonFiniteRate > 0) {
      flag("has NaN/Inf values", 5);
    }

    if (record.missingRate > 0.2) {
      flag(`high missing rate ${asPercent(record.missingRate)}%`, 4);
    } else if (record.missingRate > 0.05) {
      flag(`noticeable missing rate ${asPercent(record.missingRate)}%`, 2);
    }

    if (record.validCount > 0 && record.uniqueCount <= 1) {
      flag("constant value (no signal for model)", 5);
    }

    if (record.zeroRate > 0.98) {
      flag(`almost always zero ${asPercent(record.zeroRate)}%`, 4);
    } else if (record.zeroRate > 0.9) {
      flag(`mostly zero ${asPercent(record.zeroRate)}%`, 2);
    }

    // The p99/median ratio is only judged when the median is at least 1e-4.
    if (absMedian >= 1e-4) {
      if (p99ToMedian > 1e3) {
        flag(`extreme scale spread p99/median=${p99ToMedian.toFixed(0)}`, 5);
      } else if (p99ToMedian > 100) {
        flag(`large scale spread p99/median=${p99ToMedian.toFixed(0)}`, 3);
      }
    }

    if (record.outlierRate > 0.1) {
      flag(`very high outlier rate ${asPercent(record.outlierRate)}%`, 4);
    } else if (record.outlierRate > 0.03) {
      flag(`high outlier rate ${asPercent(record.outlierRate)}%`, 2);
    }

    // Fields with at most two distinct values are exempt from the scale check.
    const hasFewDistinctValues = record.uniqueCount <= 2;
    if (!hasFewDistinctValues && (scaleRatio > 1e3 || scaleRatio < 1e-3)) {
      flag(`feature scale differs from dataset median by x${scaleRatio.toExponential(2)}`, 3);
    }

    record.issues = found;
    record.score = points;
  }

  records.sort((a, b) => b.score - a.score || b.outlierRate - a.outlierRate);
  return records;
};
|
|
367
|
+
/**
 * Format a number for the inspection report.
 *
 * @param {number} value Value to format.
 * @returns {string} "n/a" for anything that is not a finite number;
 *   exponential notation with 3 fraction digits when the magnitude is at
 *   least 1e6 or is non-zero and below 1e-4; otherwise fixed notation with
 *   6 fraction digits.
 */
var formatNumber = (value) => {
  if (!Number.isFinite(value)) return "n/a";
  const magnitude = Math.abs(value);
  const isHuge = magnitude >= 1e6;
  const isTiny = magnitude > 0 && magnitude < 1e-4;
  return isHuge || isTiny ? value.toExponential(3) : value.toFixed(6);
};
|
|
375
|
+
/**
 * Run the built-in ("quick") dataset inspection and print a report.
 *
 * Rows are read with the reader matching `mode` ("head", "tail", anything
 * else samples), statistics are built with buildNumericStats, and every field
 * with a positive score is reported, up to `limitIssues` fields. The process
 * exits with code 1 when no rows or no numeric fields are available and with
 * code 0 when no field has a positive score.
 *
 * @param {{datasetPath: string, mode: string, rowsToInspect: number,
 *   limitIssues: number, minFieldValues: number}} params Inspection settings.
 * @returns {Promise<void>}
 */
var runQuickInspect = async (params) => {
  const { datasetPath, mode, rowsToInspect, limitIssues, minFieldValues } = params;
  const chalk = import_chalk.default;
  const asPercent = (rate) => (rate * 100).toFixed(2);

  const readRows = mode === "head" ? readRowsHead : mode === "tail" ? readRowsTail : readRowsSample;
  const rows = await readRows(datasetPath, rowsToInspect);
  if (!rows.length) {
    console.error(chalk.red("No rows could be read from dataset."));
    process.exit(1);
  }

  const numericStats = buildNumericStats(rows, minFieldValues);
  if (!numericStats.length) {
    console.error(
      chalk.red("No numeric fields with enough values were found in sampled rows.")
    );
    process.exit(1);
  }

  const problematic = numericStats.filter((item) => item.score > 0);
  const summaryLines = [
    "ML dataset inspection (quick)",
    `file: ${datasetPath}`,
    `mode: ${mode}`,
    `rows inspected: ${rows.length}`,
    `numeric fields analyzed: ${numericStats.length}`,
    `fields with recommendations: ${problematic.length}`
  ];
  for (const line of summaryLines) {
    console.log(chalk.gray(line));
  }
  console.log("");

  if (!problematic.length) {
    console.log(chalk.green("No problematic fields detected by current rules."));
    process.exit(0);
  }

  for (const item of problematic.slice(0, limitIssues)) {
    console.log(chalk.yellow(item.field));
    console.log(` score=${item.score} issues=${item.issues.join("; ") || "none"}`);
    console.log(
      ` median=${formatNumber(item.median)} p99=${formatNumber(item.p99)} min=${formatNumber(item.min)} max=${formatNumber(item.max)}`
    );
    console.log(
      ` missing=${asPercent(item.missingRate)}% nonFinite=${asPercent(item.nonFiniteRate)}% outliers=${asPercent(item.outlierRate)}% zeros=${asPercent(item.zeroRate)}%`
    );
  }

  const hiddenCount = problematic.length - limitIssues;
  if (problematic.length > limitIssues) {
    console.log("");
    console.log(
      chalk.gray(`... ${hiddenCount} more fields hidden (raise --limitIssues)`)
    );
  }

  console.log("");
  console.log(chalk.cyan("How to fix common issues:"));
  const remedies = [
    " - Large scale spread: normalize feature (log1p / relative to price / robust scaling).",
    " - High outliers: winsorize/clip or rebuild the source transform.",
    " - Missing values: fill consistently or remove the feature.",
    " - Constant/mostly zero: drop the feature or redefine its window."
  ];
  for (const remedy of remedies) {
    console.log(remedy);
  }
};
|
|
446
|
+
/**
 * Generate a ydata-profiling HTML report for a dataset by running the
 * `ml-profile` service from docker-compose.ml.yml.
 *
 * The dataset must live inside `<cwd>/data`, because paths are handed to the
 * container relative to `/app/data`. The report is written next to the
 * dataset as `<dataset name without extension>.profile.html`. The process
 * exits with a non-zero code when the dataset is outside the data directory
 * or when the docker command fails.
 *
 * @param {{datasetPath: string, mode: string, rowsToInspect: number}} params
 *   Dataset location, row-selection mode and row count passed to the profiler.
 * @returns {Promise<void>}
 */
var runYDataInspect = async (params) => {
  const { datasetPath, mode, rowsToInspect } = params;
  const path = import_path.default;
  const cwd = process.cwd();
  const absDatasetPath = path.resolve(datasetPath);
  const dataRoot = path.resolve(cwd, "data");
  const relToData = path.relative(dataRoot, absDatasetPath);
  // A plain string-prefix test would also accept sibling directories such as
  // "<cwd>/data-backup", so containment is decided from the relative path: it
  // must not be empty, climb out with "..", or be absolute (other drive).
  const isInsideDataRoot = relToData !== "" && relToData !== ".." && !relToData.startsWith(`..${path.sep}`) && !path.isAbsolute(relToData);
  if (!isInsideDataRoot) {
    console.error(
      import_chalk.default.red(
        `ydata mode supports only files under ${dataRoot}. Use --file inside data/`
      )
    );
    process.exit(1);
  }
  // Container paths always use "/" regardless of the host path separator.
  const toContainerPath = (relative) => `/app/data/${relative.split(path.sep).join("/")}`;
  const inputInContainer = toContainerPath(relToData);
  const reportDir = path.dirname(absDatasetPath);
  await import_promises.default.mkdir(reportDir, { recursive: true });
  const reportPath = path.join(
    reportDir,
    `${path.basename(datasetPath, path.extname(datasetPath))}.profile.html`
  );
  const reportInContainer = toContainerPath(path.relative(dataRoot, reportPath));
  console.log(
    import_chalk.default.gray(
      "Running ydata-profiling report via docker ml-profile service..."
    )
  );
  const result = (0, import_child_process.spawnSync)(
    "docker",
    [
      "compose",
      "-f",
      "docker-compose.ml.yml",
      "run",
      "--rm",
      "ml-profile",
      "python",
      "/app/ml/profile.py",
      "--input",
      inputInContainer,
      "--rows",
      String(rowsToInspect),
      "--mode",
      mode,
      "--output",
      reportInContainer,
      "--title",
      `ML Profile: ${path.basename(datasetPath)}`
    ],
    {
      stdio: "inherit",
      env: {
        ...process.env,
        PYTHONUNBUFFERED: "1"
      }
    }
  );
  if (result.error) {
    // The docker executable could not be started at all; rebuilding the
    // image would not help, so report the launch error itself.
    console.error(
      import_chalk.default.red(`Failed to launch docker: ${result.error.message}`)
    );
    process.exit(result.status ?? 1);
  }
  if ((result.status ?? 1) !== 0) {
    console.error(
      import_chalk.default.red(
        "ydata-profiling failed. Build profile image first: docker compose -f docker-compose.ml.yml build ml-profile"
      )
    );
    process.exit(result.status ?? 1);
  }
  console.log(import_chalk.default.green(`Profile report saved: ${reportPath}`));
};
|
|
514
|
+
/**
 * Entry point of the ml-inspect script.
 *
 * Reads the parsed CLI flags, resolves the dataset (the explicit --file, or
 * else the latest matching dataset in the directory), picks the inspect tool
 * (the --tool flag, or else an interactive choice defaulting to "quick") and
 * runs the matching inspection. Exits with code 1 when no dataset is found.
 *
 * @returns {Promise<void>}
 */
var main = async () => {
  const dir = String(flags.dir || "data/ml/export");
  const rowsToInspect = asPositiveInt(flags.rows, 1e4);
  const mode = toMode(flags.mode);
  const strategy = String(flags.strategy || "");
  const limitIssues = asPositiveInt(flags.limitIssues, 25);
  const minFieldValues = asPositiveInt(flags.minFieldValues, 50);
  const toolFromFlags = toInspectTool(flags.tool);

  let datasetPath = flags.file ? String(flags.file) : "";
  if (!datasetPath) {
    datasetPath = await findLatestDataset({ dir, strategy });
  }
  if (!datasetPath) {
    console.error(
      import_chalk.default.red(`No dataset found. Expected ml-dataset-*.jsonl in ${dir}`)
    );
    process.exit(1);
  }

  const tool = toolFromFlags ?? await selectInspectTool("quick");
  if (tool === "ydata") {
    await runYDataInspect({ datasetPath, mode, rowsToInspect });
  } else {
    await runQuickInspect({ datasetPath, mode, rowsToInspect, limitIssues, minFieldValues });
  }
};
// Any failure that escapes main() is logged and turned into exit code 1.
main().catch((err) => {
  console.error("ml-inspect failed:", err);
  process.exit(1);
});
|