chainlesschain 0.47.8 → 0.47.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,268 @@
1
+ /**
2
+ * Threat Intelligence Store — SQLite-backed IoC (indicator of
3
+ * compromise) catalog. Imports STIX 2.1 bundles, deduplicates
4
+ * indicators by (type, value), and supports fast O(1) matching of
5
+ * an arbitrary observable against the stored feed.
6
+ *
7
+ * Designed to pair with `compliance-manager.js`: a compliance scan
8
+ * can call `matchObservable` to flag blocklisted artefacts during
9
+ * evidence ingestion.
10
+ */
11
+
12
+ import crypto from "crypto";
13
+ import fs from "fs";
14
+ import {
15
+ extractIndicatorsFromBundle,
16
+ classifyObservable,
17
+ IOC_TYPES,
18
+ } from "./stix-parser.js";
19
+
20
+ /* ── Schema ────────────────────────────────────────────────── */
21
+
22
/**
 * Create the indicator table and its lookup indexes if they do not
 * exist yet. Idempotent — safe to call on every startup.
 *
 * Deduplication is enforced at the schema level via UNIQUE(type, value);
 * the two single-column indexes back the type filter in listIndicators
 * and the equality lookup in matchObservable.
 *
 * @param {Object} db prepared-statement DB handle exposing `exec`
 */
export function ensureThreatIntelTables(db) {
  const ddl = [
    `
    CREATE TABLE IF NOT EXISTS threat_intel_indicators (
      id TEXT PRIMARY KEY,
      type TEXT NOT NULL,
      value TEXT NOT NULL,
      labels TEXT,
      confidence INTEGER,
      source_id TEXT,
      source_name TEXT,
      valid_from TEXT,
      valid_until TEXT,
      first_seen_at TEXT DEFAULT (datetime('now')),
      last_seen_at TEXT DEFAULT (datetime('now')),
      UNIQUE(type, value)
    )
  `,
    `CREATE INDEX IF NOT EXISTS idx_threat_intel_type ON threat_intel_indicators(type)`,
    `CREATE INDEX IF NOT EXISTS idx_threat_intel_value ON threat_intel_indicators(value)`,
  ];
  for (const statement of ddl) {
    db.exec(statement);
  }
}
46
+
47
+ /* ── Import ────────────────────────────────────────────────── */
48
+
49
/**
 * Import a parsed STIX bundle (or a loose array of STIX objects) into
 * the store. Returns `{imported, updated, skipped, total}`.
 *
 * - `imported`: new (type,value) pairs inserted.
 * - `updated`: existing pairs whose metadata was refreshed.
 * - `skipped`: objects that yielded no usable indicator (unknown
 *   observable type, non-stix pattern_type, malformed pattern, etc.).
 *
 * @param {Object} db prepared-statement DB handle
 * @param {Object|Array} bundle STIX bundle object or array of STIX objects
 * @throws {Error} when `db` is falsy
 */
export function importStixBundle(db, bundle) {
  if (!db) throw new Error("Database is required");

  const iocs = extractIndicatorsFromBundle(bundle);
  const total = _countIndicatorObjects(bundle);

  const insertStmt = db.prepare(
    `INSERT INTO threat_intel_indicators
       (id, type, value, labels, confidence, source_id, source_name,
        valid_from, valid_until, first_seen_at, last_seen_at)
     VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'), datetime('now'))`,
  );
  const updateStmt = db.prepare(
    `UPDATE threat_intel_indicators
        SET labels = ?,
            confidence = ?,
            source_id = ?,
            source_name = ?,
            valid_from = ?,
            valid_until = ?,
            last_seen_at = datetime('now')
      WHERE type = ? AND value = ?`,
  );
  const findStmt = db.prepare(
    `SELECT id FROM threat_intel_indicators WHERE type = ? AND value = ?`,
  );

  let imported = 0;
  let updated = 0;

  for (const rawIoc of iocs) {
    // File hashes are case-insensitive — normalize on write so that
    // equality lookups in `matchObservable` don't depend on SQLite's
    // COLLATE NOCASE (which the mock-db used in tests doesn't emulate).
    const ioc = rawIoc.type.startsWith("file-")
      ? { ...rawIoc, value: String(rawIoc.value).toLowerCase() }
      : rawIoc;

    // Metadata columns in statement order (shared by INSERT and UPDATE).
    const meta = [
      JSON.stringify(ioc.source?.labels || []),
      ioc.source?.confidence ?? null,
      ioc.source?.indicatorId || null,
      ioc.source?.name || null,
      ioc.source?.validFrom || null,
      ioc.source?.validUntil || null,
    ];

    if (findStmt.get(ioc.type, ioc.value)) {
      updateStmt.run(...meta, ioc.type, ioc.value);
      updated += 1;
    } else {
      insertStmt.run(crypto.randomUUID(), ioc.type, ioc.value, ...meta);
      imported += 1;
    }
  }

  const skipped = Math.max(total - iocs.length, 0);
  return { imported, updated, skipped, total };
}

// Count the raw STIX "indicator" objects the caller handed us (bundle
// with an `objects` array, or a loose array), so `skipped` can be
// reported as total-minus-extracted.
function _countIndicatorObjects(bundle) {
  const objects = Array.isArray(bundle?.objects)
    ? bundle.objects
    : Array.isArray(bundle)
      ? bundle
      : [];
  return objects.filter((o) => o?.type === "indicator").length;
}
136
+
137
/**
 * Import a STIX bundle from a JSON file on disk. Convenience wrapper
 * over `importStixBundle`.
 *
 * Read errors (missing file, permissions) propagate untouched from
 * `fs.readFileSync`; only JSON syntax errors are wrapped so the failing
 * path appears in the message. Parsing happens before any DB work, so a
 * malformed file never touches the store.
 *
 * @param {Object} db prepared-statement DB handle
 * @param {string} filePath path to a STIX 2.1 JSON bundle
 * @returns {{imported:number, updated:number, skipped:number, total:number}}
 */
export function importStixFile(db, filePath) {
  const text = fs.readFileSync(filePath, "utf-8");
  let parsed;
  try {
    parsed = JSON.parse(text);
  } catch (err) {
    throw new Error(`Invalid JSON in ${filePath}: ${err.message}`);
  }
  return importStixBundle(db, parsed);
}
152
+
153
+ /* ── Query ─────────────────────────────────────────────────── */
154
+
155
/**
 * Map a raw `threat_intel_indicators` row to the camelCase indicator
 * shape returned by the query API. Returns null for a missing row.
 *
 * `labels` is stored as a JSON string; malformed JSON — or valid JSON
 * that is not an array — degrades to an empty list instead of leaking a
 * non-array value into a field every consumer treats as `string[]`.
 */
function _rowToIndicator(r) {
  if (!r) return null;
  let labels = [];
  if (r.labels) {
    try {
      const parsed = JSON.parse(r.labels);
      // Defensive: this module always writes an array, but tolerate rows
      // written by other tools (object/string payloads become []).
      if (Array.isArray(parsed)) labels = parsed;
    } catch {
      labels = [];
    }
  }
  return {
    id: r.id,
    type: r.type,
    value: r.value,
    labels,
    confidence: r.confidence ?? null,
    sourceId: r.source_id || null,
    sourceName: r.source_name || null,
    validFrom: r.valid_from || null,
    validUntil: r.valid_until || null,
    firstSeenAt: r.first_seen_at,
    lastSeenAt: r.last_seen_at,
  };
}
177
+
178
/**
 * List indicators, optionally filtered by type, most recently seen first.
 *
 * @param {Object} db prepared-statement DB handle
 * @param {Object} [options]
 * @param {string} [options.type] restrict to one IOC type; must be a
 *   member of IOC_TYPES, otherwise an Error is thrown
 * @param {number} [options.limit=100] max rows; 0 (or negative) = unlimited
 * @returns {Array<Object>} camelCase indicator objects
 * @throws {Error} on an unrecognized `type`
 */
export function listIndicators(db, options = {}) {
  const { type, limit = 100 } = options;
  if (type && !IOC_TYPES.includes(type)) {
    throw new Error(
      `Unknown IOC type: ${type}. Valid: ${IOC_TYPES.join(", ")}`,
    );
  }

  // Assemble the statement from clauses so filter and limit stay in sync
  // with the bound parameter list.
  const pieces = ["SELECT * FROM threat_intel_indicators"];
  const params = [];
  if (type) {
    pieces.push("WHERE type = ?");
    params.push(type);
  }
  pieces.push("ORDER BY last_seen_at DESC");
  if (limit > 0) {
    pieces.push("LIMIT ?");
    params.push(limit);
  }

  return db
    .prepare(pieces.join(" "))
    .all(...params)
    .map(_rowToIndicator);
}
205
+
206
/**
 * Match an arbitrary observable against the stored feed.
 *
 * Returns one of:
 *   {matched:true,  type, indicator} — the observable is a known IoC
 *   {matched:false, type}           — classified, but not in the feed
 *   {matched:false, type:"unknown"} — input could not be classified
 *
 * File-hash lookups are lowercased to mirror the normalization done at
 * import time, keeping hash matching case-insensitive without relying
 * on SQLite collation.
 */
export function matchObservable(db, value) {
  const type = classifyObservable(value);
  if (type === "unknown") return { matched: false, type: "unknown" };

  let needle = String(value).trim();
  if (type.startsWith("file-")) needle = needle.toLowerCase();

  const row = db
    .prepare(
      `SELECT * FROM threat_intel_indicators
         WHERE type = ? AND value = ?
         LIMIT 1`,
    )
    .get(type, needle);

  return row
    ? { matched: true, type, indicator: _rowToIndicator(row) }
    : { matched: false, type };
}
230
+
231
/**
 * Aggregate stats over the indicator store.
 *
 * @param {Object} db prepared-statement DB handle
 * @returns {{total:number, byType:Object<string,number>}} total row
 *   count plus per-type counts (ordered by the GROUP BY query, though
 *   plain-object key order is incidental)
 */
export function getStats(db) {
  const countRow = db
    .prepare(`SELECT COUNT(*) AS n FROM threat_intel_indicators`)
    .get();

  const perType = db
    .prepare(
      `SELECT type, COUNT(*) AS n
         FROM threat_intel_indicators
         GROUP BY type
         ORDER BY n DESC`,
    )
    .all();

  const byType = Object.fromEntries(perType.map((row) => [row.type, row.n]));
  return { total: countRow?.n ?? 0, byType };
}
250
+
251
/**
 * Remove a single indicator by its (type, value) natural key.
 *
 * @param {Object} db prepared-statement DB handle
 * @param {string} type IOC type
 * @param {string} value indicator value (exact match; hashes are stored
 *   lowercased at import time)
 * @returns {boolean} true if a row was deleted
 */
export function removeIndicator(db, type, value) {
  const { changes } = db
    .prepare(`DELETE FROM threat_intel_indicators WHERE type = ? AND value = ?`)
    .run(type, value);
  return changes > 0;
}
261
+
262
/**
 * Drop every stored indicator.
 *
 * @param {Object} db prepared-statement DB handle
 * @returns {number} number of rows removed
 */
export function clearAll(db) {
  const { changes } = db.prepare(`DELETE FROM threat_intel_indicators`).run();
  return changes;
}
@@ -0,0 +1,400 @@
1
+ /**
2
+ * Topic Classifier — language-aware, multilingual.
3
+ *
4
+ * Pragmatic alternative to a neural classifier. Improves on boolean keyword
5
+ * matching by:
6
+ * 1. Detecting language via Unicode ranges (zh / ja / en / other) so
7
+ * CJK text is tokenized per-character instead of being stringified
8
+ * as one whitespace-less blob.
9
+ * 2. Scoring with term frequency (TF) against per-language lexicons,
10
+ * then normalizing so scores are comparable across topics.
11
+ *
12
+ * Deep-learning-enhanced classification is deferred to the LLM manager
13
+ * (Desktop main-process); this CLI classifier is offline and fully
14
+ * deterministic, which is what `cc social analyze` needs.
15
+ *
16
+ * Languages: zh (Chinese), ja (Japanese), en (English), other.
17
+ */
18
+
19
/* ── Unicode ranges ────────────────────────────────────────── */

// Han ideographs (CJK Unified + Extension-A). Shared by zh and ja.
const RE_HAN = /[\u3400-\u4dbf\u4e00-\u9fff]/;
// Hiragana + Katakana (distinctive for Japanese).
const RE_KANA = /[\u3040-\u309f\u30a0-\u30ff]/;
// Latin alphanumerics.
// NOTE(review): RE_LATIN is not referenced anywhere in this file chunk —
// detectLanguage() and _matchCount() inline their own Latin regexes.
// Candidate for removal (it is not exported), pending a full-file check.
const RE_LATIN = /[A-Za-z]/;
27
+
28
/**
 * Detect the dominant language of a string.
 *
 * Heuristic, checked in priority order:
 *   1. any Hiragana/Katakana → "ja" (kana is distinctive of Japanese)
 *   2. any Han ideograph     → "zh" (Han-only text is treated as Chinese)
 *   3. ≥50% of letter-like characters are Latin → "en"
 *   4. anything else (including non-string or blank input) → "other"
 */
export function detectLanguage(text) {
  if (typeof text !== "string" || !text) return "other";
  const sample = text.trim();
  if (sample === "") return "other";

  if (RE_KANA.test(sample)) return "ja";
  if (RE_HAN.test(sample)) return "zh";

  // Ratio of Latin letters among all "letter-like" characters — i.e.
  // everything that is not whitespace, a digit, punctuation, or a symbol.
  const latin = (sample.match(/[A-Za-z]/g) || []).length;
  const letters = (sample.match(/[^\s\d\p{P}\p{S}]/gu) || []).length;
  return letters > 0 && latin / letters >= 0.5 ? "en" : "other";
}
46
+
47
+ /* ── Tokenization ──────────────────────────────────────────── */
48
+
49
/**
 * Tokenize a piece of text. The strategy depends on language:
 * - en / other: split on non-alphanumerics, lowercase.
 * - zh: each Han character becomes a token (ideograms carry meaning
 *   individually, and without a real segmenter this is more useful
 *   than "one giant token per whitespace-less blob").
 * - ja: Kana + Han characters become tokens; Latin words split normally.
 *
 * CJK character tokens precede Latin word tokens in the result.
 */
export function tokenize(text, lang) {
  if (!text) return [];
  const language = lang || detectLanguage(text);

  const latinWords = (s) =>
    s
      .toLowerCase()
      .split(/[^a-z0-9_]+/u)
      .filter(Boolean);

  if (language === "en" || language === "other") {
    return latinWords(text);
  }

  // zh/ja: one token per CJK character, then any embedded Latin words.
  const tokens = [];
  for (const ch of text) {
    if (RE_KANA.test(ch) || RE_HAN.test(ch)) tokens.push(ch);
  }
  tokens.push(...latinWords(text));
  return tokens;
}
78
+
79
+ /* ── Lexicons ──────────────────────────────────────────────── */
80
+
81
/**
 * Default lexicons. Shape:
 *   DEFAULT_TOPIC_LEXICONS[topic][lang] = Array<string | [string, number]>
 *
 * Keywords are matched as exact tokens (language-aware — English keywords
 * match tokenized English words; CN/JP keywords can be either single CJK
 * chars or multi-char phrases, which we match via substring inclusion).
 *
 * Frozen at the top level to prevent topic add/remove at runtime;
 * Object.freeze is shallow, so inner arrays are technically mutable —
 * use registerTopicLexicon() to override a topic instead.
 */
export const DEFAULT_TOPIC_LEXICONS = Object.freeze({
  // Technology / software / AI.
  tech: {
    en: ["tech", "technology", "software", "ai", "algorithm", "code", "developer", "programming", "cloud", "startup"],
    zh: ["科技", "技术", "软件", "算法", "编程", "程序", "开发", "云计算", "人工智能", "互联网"],
    ja: ["テクノロジー", "技術", "ソフトウェア", "プログラム", "開発", "人工知能", "クラウド", "システム"],
  },
  // Sports & competitions.
  sports: {
    en: ["sport", "sports", "game", "match", "football", "basketball", "soccer", "olympic", "tennis", "team"],
    zh: ["体育", "运动", "比赛", "足球", "篮球", "奥运", "网球", "联赛", "冠军"],
    ja: ["スポーツ", "試合", "サッカー", "野球", "バスケ", "オリンピック", "優勝"],
  },
  // Health & medicine.
  health: {
    en: ["health", "medical", "doctor", "hospital", "disease", "vaccine", "patient", "therapy", "clinic", "surgery"],
    zh: ["健康", "医疗", "医生", "医院", "疾病", "疫苗", "治疗", "诊所", "手术"],
    ja: ["健康", "医療", "医師", "病院", "病気", "ワクチン", "治療"],
  },
  // Food & dining.
  food: {
    en: ["food", "restaurant", "cuisine", "cooking", "chef", "recipe", "dish", "meal", "dessert"],
    zh: ["美食", "餐厅", "料理", "烹饪", "厨师", "菜谱", "甜点", "小吃"],
    ja: ["料理", "レストラン", "グルメ", "料理人", "デザート", "食事"],
  },
  // Travel & tourism.
  travel: {
    en: ["travel", "tourism", "hotel", "trip", "flight", "vacation", "tourist", "destination", "cruise"],
    zh: ["旅行", "旅游", "酒店", "景点", "机票", "度假", "邮轮"],
    ja: ["旅行", "ホテル", "観光", "フライト", "休暇", "旅"],
  },
  // Politics & government.
  politics: {
    en: ["politics", "government", "election", "policy", "president", "congress", "senate", "vote", "campaign"],
    zh: ["政治", "政府", "选举", "政策", "总统", "议会", "投票", "竞选"],
    ja: ["政治", "政府", "選挙", "政策", "大統領", "国会", "投票"],
  },
  // Finance & markets.
  finance: {
    en: ["finance", "economy", "stock", "investment", "market", "bank", "trade", "currency", "crypto", "inflation"],
    zh: ["金融", "经济", "股票", "投资", "市场", "银行", "贸易", "货币", "加密"],
    ja: ["金融", "経済", "株式", "投資", "市場", "銀行", "取引", "通貨"],
  },
  // Entertainment & media.
  entertainment: {
    en: ["entertainment", "movie", "film", "music", "celebrity", "concert", "series", "actor", "singer"],
    zh: ["娱乐", "电影", "音乐", "明星", "演唱会", "电视剧", "演员", "歌手"],
    ja: ["映画", "音楽", "芸能", "コンサート", "ドラマ", "俳優", "歌手"],
  },
});
273
+
274
// User-registered lexicon overrides. Indexed by topic → lang → keywords[].
// A custom entry fully replaces the default lexicon for that topic when
// the two are merged for classification; entries persist for the process
// lifetime unless removed via unregisterTopicLexicon()/_resetState().
const _customLexicons = new Map();
276
+
277
/**
 * Register or override a topic lexicon. Pass `lexicon` as:
 *   { en: [...], zh: [...], ja: [...] }
 *
 * The custom lexicon replaces the default for that topic wholesale (no
 * per-language merging).
 *
 * @param {string} topic topic key to register or override
 * @param {Object} lexicon per-language keyword lists
 * @throws {Error} when `topic` is falsy or `lexicon` is not an object
 */
export function registerTopicLexicon(topic, lexicon) {
  if (!topic) throw new Error("topic is required");
  const usable = lexicon !== null && typeof lexicon === "object";
  if (!usable) throw new Error("lexicon object is required");
  _customLexicons.set(topic, lexicon);
}
288
+
289
/**
 * Remove a previously registered custom lexicon for `topic`. The built-in
 * DEFAULT_TOPIC_LEXICONS entry (if any) becomes visible again after the
 * merge in listTopicLexicons(). Returns true if a custom entry existed.
 */
export function unregisterTopicLexicon(topic) {
  return _customLexicons.delete(topic);
}
292
+
293
/**
 * Merge the built-in lexicons with all user-registered overrides.
 * Custom entries win per topic (wholesale replacement, no per-language
 * merge). Returns a fresh plain object on every call.
 */
export function listTopicLexicons() {
  return Object.assign(
    {},
    DEFAULT_TOPIC_LEXICONS,
    Object.fromEntries(_customLexicons),
  );
}
300
+
301
+ /* ── Scoring ───────────────────────────────────────────────── */
302
+
303
// Normalize a lexicon entry to {keyword, weight}. Entries are either a
// bare string (weight 1) or a [keyword, weight] pair; a nullish weight
// in the pair also defaults to 1.
function _keywordWeight(entry) {
  if (!Array.isArray(entry)) {
    return { keyword: entry, weight: 1 };
  }
  const [keyword, weight] = entry;
  return { keyword, weight: weight ?? 1 };
}
307
+
308
// Count keyword occurrences in the input.
// - Latin/alphanumeric keywords: exact token equality, case-insensitive.
// - Anything else (CJK single chars or multi-char phrases): non-overlapping
//   substring count over the raw text, so phrases like "加密货币" still
//   match even though tokenize() splits them into single characters.
// `lang` is accepted for signature stability but is currently unused.
function _matchCount(text, tokens, keyword, lang) {
  if (/^[A-Za-z0-9_]+$/.test(keyword)) {
    const needle = keyword.toLowerCase();
    return tokens.reduce((n, token) => (token === needle ? n + 1 : n), 0);
  }

  if (!text) return 0;
  let occurrences = 0;
  for (
    let at = text.indexOf(keyword);
    at !== -1;
    at = text.indexOf(keyword, at + keyword.length)
  ) {
    occurrences += 1;
  }
  return occurrences;
}
330
+
331
/**
 * Classify `text` into one or more topics.
 *
 * @param {string} text
 * @param {Object} [opts]
 * @param {Object} [opts.lexicons] Override per-topic lexicons for this call.
 * @param {number} [opts.topK=3] Return the top-K topics.
 * @param {string} [opts.lang] Override detected language.
 * @param {number} [opts.minScore=0] Topics whose raw (pre-normalization)
 *   score is not strictly greater than this are dropped; the default of
 *   0 therefore discards zero-hit topics.
 * @returns {{
 *   language: string,
 *   tokens: string[],
 *   topics: Array<{ topic: string, score: number, rawScore: number, hits: number }>
 * }}
 */
export function classifyTopic(text, opts = {}) {
  const {
    lexicons: overrideLexicons,
    topK = 3,
    lang: forceLang,
    minScore = 0,
  } = opts;

  const language = forceLang || detectLanguage(text);
  const tokens = tokenize(text, language);
  if (!text || tokens.length === 0) {
    return { language, tokens: [], topics: [] };
  }

  const lexiconSet = overrideLexicons || listTopicLexicons();
  const scored = [];
  for (const [topic, perLang] of Object.entries(lexiconSet)) {
    if (!perLang || typeof perLang !== "object") continue;

    // Use the target language's keywords; fall back to English when the
    // topic has no lexicon for that language.
    let candidates = perLang[language] || [];
    if (candidates.length === 0) candidates = perLang.en || [];

    let rawScore = 0;
    let hits = 0;
    for (const entry of candidates) {
      const { keyword, weight } = _keywordWeight(entry);
      const count = _matchCount(text, tokens, keyword, language);
      if (count > 0) {
        rawScore += count * weight;
        hits += count;
      }
    }
    if (rawScore > minScore) {
      scored.push({ topic, rawScore, hits });
    }
  }

  // Normalize raw scores so they sum to 1 across surviving topics
  // (guard against division by zero when nothing survived).
  const mass = scored.reduce((sum, s) => sum + s.rawScore, 0) || 1;
  scored.sort((a, b) => b.rawScore - a.rawScore);

  const topics = scored.slice(0, topK).map(({ topic, rawScore, hits }) => ({
    topic,
    score: rawScore / mass,
    rawScore,
    hits,
  }));

  return { language, tokens, topics };
}
395
+
396
+ /* ── Reset (for testing) ───────────────────────────────────── */
397
+
398
/**
 * Test hook: drop every user-registered lexicon, restoring the
 * default-only lexicon set. DEFAULT_TOPIC_LEXICONS is untouched.
 */
export function _resetState() {
  _customLexicons.clear();
}