@tikoci/rosetta 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,194 @@
1
+ /**
2
+ * extract-devices.ts — Load MikroTik product matrix CSV into the devices table.
3
+ *
4
+ * Idempotent: deletes all existing device rows, then inserts from CSV.
5
+ * FTS5 index auto-populated via triggers defined in db.ts.
6
+ *
7
+ * Usage: bun run src/extract-devices.ts [path/to/matrix.csv]
8
+ */
9
+
10
+ import { readFileSync } from "node:fs";
11
+ import { db, initDb } from "./db.ts";
12
+
13
/** Product-matrix CSV that is loaded when no path is given on the command line. */
const DEFAULT_CSV = "matrix/2026-03-25/matrix.csv";
/** CSV to load: the first CLI argument, or DEFAULT_CSV when that argument is missing or empty. */
const csvPath = process.argv[2] ? process.argv[2] : DEFAULT_CSV;
15
+
16
/**
 * Parse a single CSV line into its fields, respecting quoted fields.
 *
 * Inside a quoted field commas are literal and a doubled quote (`""`) stands
 * for one quote character. Unquoted fields are taken verbatim (no trimming).
 * A trailing separator yields a final empty field, so `a,b,` has three
 * fields; an empty line has none.
 *
 * @param line One CSV record without its line terminator.
 * @returns The field values in column order.
 */
function parseCsvLine(line: string): string[] {
  const fields: string[] = [];
  let i = 0;
  // True when the last thing consumed was a separator comma, i.e. one more
  // (possibly empty) field is still owed to the caller.
  let fieldPending = false;

  while (i < line.length) {
    fieldPending = false;

    if (line[i] === '"') {
      // Quoted field: scan to the closing quote, unescaping doubled quotes.
      i++; // skip opening quote
      let value = "";
      while (i < line.length) {
        const ch = line.charAt(i);
        if (ch !== '"') {
          value += ch;
          i++;
        } else if (line[i + 1] === '"') {
          value += '"';
          i += 2;
        } else {
          i++; // skip closing quote
          break;
        }
      }
      fields.push(value);
      if (line[i] === ",") {
        i++; // skip separator
        fieldPending = true;
      }
    } else {
      // Unquoted field: everything up to the next comma (or end of line).
      const nextComma = line.indexOf(",", i);
      if (nextComma === -1) {
        fields.push(line.slice(i));
        break;
      }
      fields.push(line.slice(i, nextComma));
      i = nextComma + 1;
      fieldPending = true;
    }
  }

  // The line ended right after a separator: the last column is empty, not absent.
  if (fieldPending) fields.push("");
  return fields;
}
54
+
55
/**
 * Parse a size string like "512 MB" or "16 GB" into whole megabytes.
 *
 * Surrounding whitespace is ignored and the unit is matched
 * case-insensitively. Gigabytes are converted at 1024 MB per GB and the
 * result is rounded to the nearest integer.
 *
 * @param value Raw cell text, e.g. "512 MB".
 * @returns Megabytes, or null when the value is empty, does not start with a
 *   number followed by MB/GB, or the numeric part is not a valid number.
 */
function parseSizeMb(value: string): number | null {
  if (!value) return null;
  // Trim first: the pattern is anchored at the start, so leading whitespace would otherwise block the match.
  const match = /^([\d.]+)\s*(MB|GB)/i.exec(value.trim());
  if (!match) return null;
  const [, digits = "", unit = ""] = match;
  const amount = Number.parseFloat(digits);
  // `[\d.]+` also accepts dot-only text such as "."; report that as "no value" rather than NaN.
  if (Number.isNaN(amount)) return null;
  return unit.toUpperCase() === "GB" ? Math.round(amount * 1024) : Math.round(amount);
}
64
+
65
/**
 * Parse a power string like "8 W" or "800 W" to watts.
 *
 * Surrounding whitespace is ignored and the unit is matched
 * case-insensitively.
 *
 * @param value Raw cell text, e.g. "24 W".
 * @returns Watts, or null when the value is empty, does not start with a
 *   number followed by "W", or the numeric part is not a valid number.
 */
function parseWatts(value: string): number | null {
  if (!value) return null;
  // Trim first: the pattern is anchored at the start, so leading whitespace would otherwise block the match.
  const match = /^([\d.]+)\s*W/i.exec(value.trim());
  if (!match) return null;
  const watts = Number.parseFloat(match[1] ?? "");
  // `[\d.]+` also accepts dot-only text such as "."; report that as "no value" rather than NaN.
  return Number.isNaN(watts) ? null : watts;
}
71
+
72
/**
 * Read a base-10 integer from the start of a cell.
 *
 * @returns The parsed integer, or null when the cell is empty or does not
 *   begin with a number.
 */
function parseIntOrNull(value: string): number | null {
  if (value) {
    const parsed = Number.parseInt(value, 10);
    if (!Number.isNaN(parsed)) return parsed;
  }
  return null;
}
78
+
79
/**
 * Read a floating-point number from the start of a cell.
 *
 * @returns The parsed number, or null when the cell is empty or does not
 *   begin with a number.
 */
function parseFloatOrNull(value: string): number | null {
  if (value) {
    const parsed = Number.parseFloat(value);
    if (!Number.isNaN(parsed)) return parsed;
  }
  return null;
}
85
+
86
/**
 * Convert a price cell such as "2,795.00" or "89.00" to a number.
 *
 * Commas and dollar signs are removed before the remaining text is handed to
 * parseFloatOrNull; an empty cell yields null.
 */
function parsePrice(value: string): number | null {
  return value ? parseFloatOrNull(value.replace(/[,$]/g, "")) : null;
}
92
+
93
// ── Main ──

initDb();

// Read the CSV, drop a leading UTF-8 BOM, and keep only non-blank lines.
// Records are split on line breaks, so a quoted field that itself contains a
// newline is not supported here.
const raw = readFileSync(csvPath, "utf-8");
const content = raw.replace(/^\ufeff/, "");
const lines = content.split(/\r?\n/).filter((l) => l.trim());

if (lines.length < 2) {
  console.error("CSV has no data rows");
  process.exit(1);
}

// Skip header row
const dataLines = lines.slice(1);

// One placeholder per column, in the same order as the CSV columns are bound below.
const insert = db.prepare(`INSERT INTO devices (
  product_name, product_code, architecture, cpu, cpu_cores, cpu_frequency,
  license_level, operating_system, ram, ram_mb, storage, storage_mb,
  dimensions, poe_in, poe_out, poe_out_ports, poe_in_voltage,
  dc_inputs, dc_jack_voltage, max_power_w,
  wireless_24_chains, antenna_24_dbi, wireless_5_chains, antenna_5_dbi,
  eth_fast, eth_gigabit, eth_2500, usb_ports, combo_ports,
  sfp_ports, sfp_plus_ports, eth_multigig, sim_slots,
  memory_cards, usb_type, msrp_usd
) VALUES (
  ?, ?, ?, ?, ?, ?,
  ?, ?, ?, ?, ?, ?,
  ?, ?, ?, ?, ?,
  ?, ?, ?,
  ?, ?, ?, ?,
  ?, ?, ?, ?, ?,
  ?, ?, ?, ?,
  ?, ?, ?
)`);

let inserted = 0; // rows written to the devices table
let skipped = 0; // rows rejected: fewer than 34 fields or an empty product name

const insertAll = db.transaction(() => {
  // Idempotent: clear existing data (FTS triggers handle cleanup).
  // The DELETE runs inside the same transaction as the inserts so the reload
  // is all-or-nothing: if any insert throws, the previous rows are kept
  // instead of leaving an empty table.
  // NOTE(review): relies on db.transaction() rolling back on a thrown error,
  // as bun:sqlite documents — confirm db.ts exports a bun:sqlite Database.
  db.run("DELETE FROM devices");

  for (const line of dataLines) {
    const f = parseCsvLine(line);
    // Every one of the 34 columns must be present (indices 0–33 are read below).
    if (f.length < 34) {
      skipped++;
      continue;
    }

    const productName = f[0].trim();
    if (!productName) {
      skipped++;
      continue;
    }

    // Text columns are stored trimmed, with empty cells stored as NULL.
    insert.run(
      productName,
      f[1].trim() || null, // product_code
      f[2].trim() || null, // architecture
      f[3].trim() || null, // cpu
      parseIntOrNull(f[4]), // cpu_cores
      f[5].trim() || null, // cpu_frequency
      parseIntOrNull(f[6]), // license_level
      f[7].trim() || null, // operating_system
      f[8].trim() || null, // ram
      parseSizeMb(f[8]), // ram_mb
      f[9].trim() || null, // storage
      parseSizeMb(f[9]), // storage_mb
      f[10].trim() || null, // dimensions
      f[11].trim() || null, // poe_in
      f[12].trim() || null, // poe_out
      f[13].trim() || null, // poe_out_ports
      f[14].trim() || null, // poe_in_voltage
      parseIntOrNull(f[15]), // dc_inputs
      f[16].trim() || null, // dc_jack_voltage
      parseWatts(f[17]), // max_power_w
      parseIntOrNull(f[18]), // wireless_24_chains
      parseFloatOrNull(f[19]), // antenna_24_dbi
      parseIntOrNull(f[20]), // wireless_5_chains
      parseFloatOrNull(f[21]), // antenna_5_dbi
      parseIntOrNull(f[22]), // eth_fast
      parseIntOrNull(f[23]), // eth_gigabit
      parseIntOrNull(f[24]), // eth_2500
      parseIntOrNull(f[25]), // usb_ports
      parseIntOrNull(f[26]), // combo_ports
      parseIntOrNull(f[27]), // sfp_ports
      parseIntOrNull(f[28]), // sfp_plus_ports
      parseIntOrNull(f[29]), // eth_multigig
      parseIntOrNull(f[30]), // sim_slots
      f[31].trim() || null, // memory_cards
      f[32].trim() || null, // usb_type
      parsePrice(f[33]), // msrp_usd
    );
    inserted++;
  }
});

insertAll();

console.log(`Devices: ${inserted} inserted, ${skipped} skipped from ${csvPath}`);
@@ -0,0 +1,379 @@
1
+ #!/usr/bin/env bun
2
+
3
+ /**
4
+ * extract-html.ts — Parse Confluence HTML export into SQLite pages table.
5
+ *
6
+ * Reads all HTML files from the export directory, extracts:
7
+ * - Page ID and slug from filename
8
+ * - Title from #title-text (stripped of "RouterOS : " prefix)
9
+ * - Breadcrumb path from #breadcrumbs
10
+ * - Parent page ID from last breadcrumb link
11
+ * - Plain text from #main-content (HTML stripped)
12
+ * - Code blocks from pre.syntaxhighlighter-pre
13
+ * - Author and last_updated from .page-metadata
14
+ *
15
+ * Populates: pages, pages_fts, callouts, callouts_fts (via triggers)
16
+ *
17
+ * Usage: bun run src/extract-html.ts [html-dir]
18
+ */
19
+
20
+ import { readdirSync, readFileSync } from "node:fs";
21
+ import { basename, resolve } from "node:path";
22
+ import { parseHTML } from "linkedom";
23
+ import { db, initDb } from "./db.ts";
24
+
25
/**
 * Directory holding the exported HTML files: the first CLI argument, or
 * "../box/latest/ROS" resolved against this module's directory when the
 * argument is missing or empty.
 */
const HTML_DIR = process.argv[2]
  ? process.argv[2]
  : resolve(import.meta.dirname, "../box/latest/ROS");
27
+
28
+ // Exported page filenames are "Slug_PageID.html" or bare "PageID.html": capture 1 is the optional slug, capture 2 the numeric page ID.
29
+ const filenameRe = /^(?:(.+?)_)?(\d+)\.html$/;
30
+
31
/** One extracted documentation page, shaped like a row of the `pages` table. */
interface PageRow {
  /** Numeric page ID taken from the export filename. */
  id: number;
  /** Slug part of the filename, or the page ID as a string when the filename has no slug. */
  slug: string;
  /** Page title with the "RouterOS : " prefix removed. */
  title: string;
  /** Breadcrumb labels followed by the title, joined with " > ". */
  path: string;
  /** Number of breadcrumb entries plus one. */
  depth: number;
  /** Page ID parsed from the last breadcrumb link, or null when none could be parsed. */
  parent_id: number | null;
  /** Link to the page on help.mikrotik.com. */
  url: string;
  /** Trimmed text content of #main-content (code block text included). */
  text: string;
  /** All code blocks of the page, separated by blank lines. */
  code: string;
  /** Comma-separated syntax-highlighter brush names, or null when none were found. */
  code_lang: string | null;
  /** Author parsed from .page-metadata, or null. */
  author: string | null;
  /** Date text parsed from .page-metadata (e.g. "March 5, 2024"), or null. */
  last_updated: string | null;
  /** Count of whitespace-separated tokens in `text`. */
  word_count: number;
  /** Count of non-blank lines in `code`. */
  code_lines: number;
  /** Name of the HTML file the page was read from. */
  html_file: string;
}
48
+
49
/** One note/warning/info block found on a page, shaped like a row of the `callouts` table. */
interface CalloutRow {
  /** ID of the page the callout was found on. */
  page_id: number;
  /** Lower-cased aria-label of the callout element; "note" when the label is empty. */
  type: string;
  /** Trimmed text of the callout body. */
  content: string;
  /** Zero-based position among the page's non-empty callouts. */
  sort_order: number;
}
55
+
56
/** One heading-delimited section of a page, shaped like a row of the `sections` table. */
interface SectionRow {
  /** ID of the page the section belongs to. */
  page_id: number;
  /** Heading text with markup removed. */
  heading: string;
  /** Heading level: 1, 2 or 3. */
  level: number;
  /** Value of the heading's id attribute. */
  anchor_id: string;
  /** Trimmed text between this heading and the next one (or the end of the content). */
  text: string;
  /** Code blocks inside the section, separated by blank lines. */
  code: string;
  /** Count of whitespace-separated tokens in `text`. */
  word_count: number;
  /** Zero-based position of the section within the page. */
  sort_order: number;
}
66
+
67
/**
 * Break a page's main content into sections, one per h1–h3 heading that
 * carries an id attribute.
 *
 * Heading boundaries are located with a regex over the element's innerHTML;
 * the markup between one heading and the next (or the end of the content) is
 * then parsed on its own to pull out its text and code blocks. The heading
 * with id "title-heading" is ignored, and content before the first heading
 * does not belong to any section.
 *
 * @param mainContent The page's main content element.
 * @param pageId ID stored on every returned row.
 * @returns Sections in document order; empty when no heading matches.
 */
function extractSections(mainContent: Element, pageId: number): SectionRow[] {
  const html = mainContent.innerHTML;
  const headingRe = /<h([1-3])\s[^>]*?id="([^"]+)"[^>]*>([\s\S]*?)<\/h\1>/g;

  // Pass 1: record where every qualifying heading starts and ends.
  const marks: Array<{
    level: number;
    anchorId: string;
    heading: string;
    start: number;
    end: number;
  }> = [];

  for (const found of html.matchAll(headingRe)) {
    const [whole, levelDigit, anchorId, innerHtml] = found;
    if (anchorId === "title-heading") continue;
    const start = found.index ?? 0;
    // Parse the heading's inner markup so nested tags/entities collapse to plain text.
    const label = parseHTML(`<span>${innerHtml}</span>`).document.querySelector("span");
    marks.push({
      level: Number.parseInt(levelDigit, 10),
      anchorId,
      heading: label?.textContent?.trim() || "",
      start,
      end: start + whole.length,
    });
  }

  // Pass 2: turn the markup following each heading into a row.
  const rows: SectionRow[] = [];
  marks.forEach((mark, idx) => {
    const stop = idx + 1 < marks.length ? marks[idx + 1].start : html.length;
    const chunk = html.slice(mark.end, stop);
    const root = parseHTML(`<div>${chunk}</div>`).document.querySelector("div");

    const codeChunks = [...(root?.querySelectorAll("pre.syntaxhighlighter-pre") ?? [])].map(
      (pre) => pre.textContent?.trim() || "",
    );
    const text = root?.textContent?.trim() || "";

    rows.push({
      page_id: pageId,
      heading: mark.heading,
      level: mark.level,
      anchor_id: mark.anchorId,
      text,
      code: codeChunks.join("\n\n"),
      word_count: text.split(/\s+/).filter(Boolean).length,
      sort_order: idx,
    });
  });
  return rows;
}
123
+
124
/**
 * Pull the numeric page ID out of a link target.
 *
 * Only the final path component of `href` is examined; it must match the
 * export filename pattern.
 *
 * @returns The page ID, or null when the link does not point at an exported page file.
 */
function extractPageId(href: string): number | null {
  const idDigits = filenameRe.exec(basename(href))?.[2];
  return idDigits === undefined ? null : Number(idDigits);
}
128
+
129
/** Trimmed text of an element; an empty string when the element is null or has no text. */
function textContent(el: Element | null): string {
  const raw = el?.textContent;
  if (!raw) return "";
  return raw.trim();
}
132
+
133
/**
 * Extract one exported Confluence page into a page row plus its callouts and
 * sections.
 *
 * @param file File name of the export (e.g. "Slug_12345.html"); supplies the
 *   page ID and slug and is stored as `html_file`.
 * @param html Full HTML source of that file.
 * @returns The extracted page, or null when the filename does not match the
 *   export pattern or the page has no title.
 */
function extractPage(file: string, html: string): (PageRow & { callouts: CalloutRow[]; sections: SectionRow[] }) | null {
  // Check the filename first so files that cannot be pages are rejected
  // without paying for an HTML parse.
  const match = basename(file).match(filenameRe);
  if (!match) return null;

  const slug = match[1] || String(match[2]);
  const id = Number(match[2]);

  const { document } = parseHTML(html);

  // Title: strip "RouterOS : " prefix
  const title = textContent(document.querySelector("#title-text"))
    .replace(/^\s*RouterOS\s*:\s*/i, "")
    .trim();

  if (!title) return null;

  // Breadcrumbs: collect every label; the parent is whatever the LAST link
  // resolves to (null when that link is not an exported page file).
  const breadcrumbLinks = document.querySelectorAll("#breadcrumbs li a");
  const breadcrumbs: string[] = [];
  let parentId: number | null = null;
  for (const a of breadcrumbLinks) {
    breadcrumbs.push(textContent(a));
    const href = a.getAttribute("href") || "";
    parentId = extractPageId(href);
  }
  const path = [...breadcrumbs, title].join(" > ");
  const depth = breadcrumbs.length + 1;

  // URL: Confluence pattern (spaces become "+"). Percent-encode first and
  // only then turn encoded spaces into "+"; doing it the other way round
  // makes encodeURIComponent escape the "+" signs themselves as "%2B".
  const urlSlug = encodeURIComponent(title).replace(/%20/g, "+");
  const url = `https://help.mikrotik.com/docs/spaces/ROS/pages/${id}/${urlSlug}`;

  // Main content
  const mainContent = document.querySelector("#main-content");

  // Code blocks, plus the set of highlighter "brush" languages they declare.
  const codeEls = mainContent?.querySelectorAll("pre.syntaxhighlighter-pre") || [];
  const codeChunks: string[] = [];
  const codeLangs = new Set<string>();
  for (const el of codeEls) {
    codeChunks.push(el.textContent?.trim() || "");
    const params = el.getAttribute("data-syntaxhighlighter-params") || "";
    const brushMatch = params.match(/brush:\s*(\w+)/);
    if (brushMatch) codeLangs.add(brushMatch[1]);
  }
  const code = codeChunks.join("\n\n");
  const codeLang = codeLangs.size > 0 ? [...codeLangs].join(",") : null;
  const codeLines = code.split("\n").filter((l) => l.trim()).length;

  // Plain text from main content (includes code block text too, which is fine for FTS)
  const text = mainContent?.textContent?.trim() || "";
  const wordCount = text.split(/\s+/).filter(Boolean).length;

  // Callouts: extract note/warning/info blocks; blocks with an empty body are dropped.
  const calloutEls = mainContent?.querySelectorAll('div[role="region"].confluence-information-macro') || [];
  const callouts: CalloutRow[] = [];
  let calloutOrder = 0;
  for (const el of calloutEls) {
    const label = (el.getAttribute("aria-label") || "").toLowerCase().trim();
    // The label itself is the type ("warning", "note", "info", or anything else); unlabeled blocks count as notes.
    const type = label || "note";
    const body = el.querySelector(".confluence-information-macro-body");
    const content = body?.textContent?.trim() || "";
    if (content) {
      callouts.push({ page_id: id, type, content, sort_order: calloutOrder++ });
    }
  }

  // Metadata: author, last_updated
  const metaEl = document.querySelector(".page-metadata");
  const metaText = metaEl?.textContent || "";
  const authorMatch = metaText.match(/Created by\s+(.+?)(?:,|\s*last)/i);
  const author = authorMatch?.[1]?.trim() || null;
  const dateMatch = metaText.match(/on\s+(\w+ \d{1,2}, \d{4})/);
  const lastUpdated = dateMatch?.[1] || null;

  // Sections: split content by h1–h3 headings
  const sections = mainContent ? extractSections(mainContent, id) : [];

  return {
    id,
    slug,
    title,
    path,
    depth,
    parent_id: parentId,
    url,
    text,
    code,
    code_lang: codeLang,
    author,
    last_updated: lastUpdated,
    word_count: wordCount,
    code_lines: codeLines,
    html_file: file,
    callouts,
    sections,
  };
}
231
+
232
// ---- Main ----

console.log("Initializing database...");
initDb();

// Start from a clean slate. The statements run strictly in this order:
// child tables are emptied before pages, each FTS index is rebuilt right
// after its table is cleared, and foreign-key enforcement is switched off
// only around the pages delete.
[
  "DELETE FROM sections;",
  "DELETE FROM callouts;",
  "INSERT INTO callouts_fts(callouts_fts) VALUES('rebuild');",
  "DELETE FROM properties;",
  "INSERT INTO properties_fts(properties_fts) VALUES('rebuild');",
  "PRAGMA foreign_keys = OFF;",
  "DELETE FROM pages;",
  "PRAGMA foreign_keys = ON;",
  "INSERT INTO pages_fts(pages_fts) VALUES('rebuild');",
].forEach((sql) => {
  db.run(sql);
});
247
+
248
// Every exported page file in the directory, in sorted order; the export's index.html is not a page.
const htmlFiles = readdirSync(HTML_DIR)
  .filter((name) => name !== "index.html" && name.endsWith(".html"))
  .sort();

console.log(`Extracting ${htmlFiles.length} HTML files from ${HTML_DIR}`);

// Pages are written in two steps: rows go in with parent_id = NULL first so
// that insertion order never matters for the parent reference, and the
// parent links are filled in afterwards.
const insertPage = db.prepare(`
  INSERT OR REPLACE INTO pages
  (id, slug, title, path, depth, parent_id, url, text, code, code_lang,
  author, last_updated, word_count, code_lines, html_file)
  VALUES (?, ?, ?, ?, ?, NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`);
const updateParent = db.prepare("UPDATE pages SET parent_id = ? WHERE id = ?");

// Running totals for the summary printed at the end.
let extracted = 0;
let skipped = 0;
let totalWords = 0;
let totalCodeLines = 0;
let totalCallouts = 0;

// Everything extracted in pass 1, kept for the later parent/callout/section passes.
const allPages: (PageRow & { callouts: CalloutRow[]; sections: SectionRow[] })[] = [];

// Pass 1: extract and insert all pages (parent_id = NULL)
const insertAll = db.transaction(() => {
  for (const file of htmlFiles) {
    const page = extractPage(file, readFileSync(resolve(HTML_DIR, file), "utf-8"));
    if (page === null) {
      skipped++;
      console.warn(` skipped: ${file}`);
      continue;
    }

    const { id, slug, title, path, depth, url, text, code } = page;
    insertPage.run(
      id,
      slug,
      title,
      path,
      depth,
      url,
      text,
      code,
      page.code_lang,
      page.author,
      page.last_updated,
      page.word_count,
      page.code_lines,
      page.html_file,
    );

    allPages.push(page);
    extracted++;
    totalWords += page.word_count;
    totalCodeLines += page.code_lines;
  }
});
insertAll();
305
+
306
// Pass 2: link each page to its parent, but only when that parent was itself extracted.
const pageIds = new Set<number>();
for (const extractedPage of allPages) pageIds.add(extractedPage.id);

const setParents = db.transaction(() => {
  for (const { id, parent_id: parentId } of allPages) {
    if (!parentId || !pageIds.has(parentId)) continue;
    updateParent.run(parentId, id);
  }
});
setParents();
316
+
317
// Pass 3: write every page's callouts, page by page, in one transaction.
const insertCallout = db.prepare(`
  INSERT INTO callouts (page_id, type, content, sort_order)
  VALUES (?, ?, ?, ?)
`);
const insertCallouts = db.transaction(() => {
  for (const callout of allPages.flatMap((page) => page.callouts)) {
    const { page_id, type, content, sort_order } = callout;
    insertCallout.run(page_id, type, content, sort_order);
    totalCallouts++;
  }
});
insertCallouts();
331
+
332
// Pass 4: write every page's sections in one transaction, counting how many pages have any.
let totalSections = 0;
let pagesWithSections = 0;
const insertSection = db.prepare(`
  INSERT INTO sections (page_id, heading, level, anchor_id, text, code, word_count, sort_order)
  VALUES (?, ?, ?, ?, ?, ?, ?, ?)
`);
const insertSections = db.transaction(() => {
  for (const { sections } of allPages) {
    if (sections.length === 0) continue;
    pagesWithSections++;
    for (const section of sections) {
      insertSection.run(
        section.page_id,
        section.heading,
        section.level,
        section.anchor_id,
        section.text,
        section.code,
        section.word_count,
        section.sort_order,
      );
      totalSections++;
    }
  }
});
insertSections();
351
+
352
// Row count of the pages full-text index, reported alongside the extraction totals.
const { c: ftsCount } = db.prepare("SELECT COUNT(*) as c FROM pages_fts").get() as { c: number };

[
  `\nExtraction complete:`,
  ` Pages extracted: ${extracted}`,
  ` Pages skipped: ${skipped}`,
  ` Total words: ${totalWords.toLocaleString()}`,
  ` Total code lines: ${totalCodeLines.toLocaleString()}`,
  ` Total callouts: ${totalCallouts}`,
  ` Total sections: ${totalSections} (across ${pagesWithSections} pages)`,
  ` FTS index rows: ${ftsCount}`,
].forEach((summaryLine) => {
  console.log(summaryLine);
});

// Smoke test: run one full-text query and print the five best-ranked hits with an excerpt.
const testResults = db
  .prepare(
    `SELECT s.id, s.title, s.path,
    snippet(pages_fts, 2, '>>>', '<<<', '...', 20) as excerpt
    FROM pages_fts fts
    JOIN pages s ON s.id = fts.rowid
    WHERE pages_fts MATCH 'firewall filter'
    ORDER BY rank LIMIT 5`,
  )
  .all();

console.log(`\nTest search for "firewall filter":`);
(testResults as Array<{ id: number; title: string; path: string; excerpt: string }>).forEach((hit) => {
  console.log(` [${hit.id}] ${hit.path}`);
  console.log(` ${hit.excerpt}`);
});