@tikoci/rosetta 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +333 -0
- package/bin/rosetta.js +34 -0
- package/matrix/2026-03-25/matrix.csv +145 -0
- package/matrix/CLAUDE.md +7 -0
- package/matrix/get-mikrotik-products-csv.sh +20 -0
- package/package.json +34 -0
- package/src/assess-html.ts +267 -0
- package/src/db.ts +360 -0
- package/src/extract-all-versions.ts +147 -0
- package/src/extract-changelogs.ts +266 -0
- package/src/extract-commands.ts +175 -0
- package/src/extract-devices.ts +194 -0
- package/src/extract-html.ts +379 -0
- package/src/extract-properties.ts +234 -0
- package/src/link-commands.ts +208 -0
- package/src/mcp.ts +725 -0
- package/src/query.test.ts +994 -0
- package/src/query.ts +990 -0
- package/src/release.test.ts +280 -0
- package/src/restraml.ts +65 -0
- package/src/search.ts +49 -0
- package/src/setup.ts +224 -0
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* extract-devices.ts — Load MikroTik product matrix CSV into the devices table.
|
|
3
|
+
*
|
|
4
|
+
* Idempotent: deletes all existing device rows, then inserts from CSV.
|
|
5
|
+
* FTS5 index auto-populated via triggers defined in db.ts.
|
|
6
|
+
*
|
|
7
|
+
* Usage: bun run src/extract-devices.ts [path/to/matrix.csv]
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { readFileSync } from "node:fs";
|
|
11
|
+
import { db, initDb } from "./db.ts";
|
|
12
|
+
|
|
13
|
+
// Product-matrix snapshot used when no CSV path is given on the command line.
// The path is relative, so it is resolved against the current working directory.
const DEFAULT_CSV = "matrix/2026-03-25/matrix.csv";
// First CLI argument wins; an absent (or empty) argument falls back to the default.
const csvPath = process.argv[2] || DEFAULT_CSV;
|
|
15
|
+
|
|
16
|
+
/**
 * Parse one CSV line into its fields, respecting quoted fields.
 *
 * Rules:
 * - A field that starts with `"` runs to the matching closing quote; a doubled
 *   quote (`""`) inside it is an escaped literal quote. Commas inside quotes
 *   are part of the value.
 * - Every comma outside quotes terminates a field, so a trailing comma yields
 *   a final empty field (`a,b,` → `["a", "b", ""]`). This keeps the column
 *   count stable when the last column of a row is empty.
 * - Characters found between a closing quote and the next comma are appended
 *   to the same field instead of starting a new one, so malformed input does
 *   not shift later columns.
 * - An unterminated quoted field runs to the end of the line.
 * - An empty line yields an empty array.
 *
 * Fields spanning several physical lines are not supported; callers pass a
 * single line.
 *
 * @param line One line of CSV text, without its line terminator.
 * @returns The decoded field values, in order.
 */
function parseCsvLine(line: string): string[] {
  const fields: string[] = [];
  if (line.length === 0) return fields;

  let i = 0;
  while (true) {
    let value = "";

    if (line[i] === '"') {
      i++; // skip opening quote
      while (i < line.length) {
        const ch = line.charAt(i);
        if (ch !== '"') {
          value += ch;
          i++;
        } else if (line[i + 1] === '"') {
          value += '"'; // escaped quote
          i += 2;
        } else {
          i++; // skip closing quote
          break;
        }
      }
    }

    // Consume everything up to the next delimiter. For an unquoted field this
    // is the whole value; after a quoted field it is normally empty.
    const nextComma = line.indexOf(",", i);
    const end = nextComma === -1 ? line.length : nextComma;
    value += line.slice(i, end);
    i = end;

    fields.push(value);

    if (i >= line.length) break; // no delimiter left: that was the last field
    i++; // skip the comma; if it was the last character the next pass emits ""
  }

  return fields;
}
|
|
54
|
+
|
|
55
|
+
/**
 * Parse a size string like "512 MB", "16 GB" or "1 TB" into whole megabytes.
 *
 * Surrounding whitespace is ignored and the unit is case-insensitive. Text
 * after the unit is ignored. GB and TB are converted with a factor of 1024
 * per step and the result is rounded to the nearest integer.
 *
 * @param value Raw cell text.
 * @returns Size in MB, or `null` when the value is empty or does not start
 *          with a number followed by MB/GB/TB.
 */
function parseSizeMb(value: string): number | null {
  if (!value) return null;
  // Require at least one digit so a lone "." cannot slip through as NaN.
  const match = value.trim().match(/^(\d+(?:\.\d*)?|\.\d+)\s*(MB|GB|TB)/i);
  if (!match) return null;

  const num = Number.parseFloat(match[1]);
  const unit = match[2].toUpperCase();
  const factor = unit === "TB" ? 1024 * 1024 : unit === "GB" ? 1024 : 1;
  return Math.round(num * factor);
}
|
|
64
|
+
|
|
65
|
+
/**
 * Parse a power string like "8 W" or "800 W" to watts.
 *
 * Surrounding whitespace is ignored and the unit is case-insensitive. Text
 * after the unit is ignored.
 *
 * @param value Raw cell text.
 * @returns Power in watts, or `null` when the value is empty or does not
 *          start with a number followed by "W".
 */
function parseWatts(value: string): number | null {
  if (!value) return null;
  // Require at least one digit so a lone "." cannot slip through as NaN.
  const match = value.trim().match(/^(\d+(?:\.\d*)?|\.\d+)\s*W/i);
  return match ? Number.parseFloat(match[1]) : null;
}
|
|
71
|
+
|
|
72
|
+
/**
 * Read a base-10 integer from a cell.
 *
 * Uses `Number.parseInt`, so leading whitespace is skipped and parsing stops
 * at the first non-digit character ("12abc" → 12).
 *
 * @returns The parsed integer, or `null` for an empty value or one that does
 *          not begin with a number.
 */
function parseIntOrNull(value: string): number | null {
  const parsed = value ? Number.parseInt(value, 10) : Number.NaN;
  return Number.isNaN(parsed) ? null : parsed;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Read a floating-point number from a cell.
 *
 * Uses `Number.parseFloat`, so leading whitespace is skipped and trailing
 * non-numeric text is ignored ("3dBi" → 3).
 *
 * @returns The parsed number, or `null` for an empty value or one that does
 *          not begin with a number.
 */
function parseFloatOrNull(value: string): number | null {
  if (value) {
    const parsed = Number.parseFloat(value);
    if (!Number.isNaN(parsed)) return parsed;
  }
  return null;
}
|
|
85
|
+
|
|
86
|
+
/**
 * Convert a price cell such as "2,795.00" or "89.00" into a number.
 *
 * Thousands separators (",") and dollar signs are removed before the
 * remainder is handed to `parseFloatOrNull`.
 *
 * @returns The numeric price, or `null` when the cell is empty or not numeric.
 */
function parsePrice(value: string): number | null {
  return value ? parseFloatOrNull(value.replace(/[,$]/g, "")) : null;
}
|
|
92
|
+
|
|
93
|
+
// ── Main ──
//
// Loads the product matrix CSV and replaces the contents of the `devices`
// table with it. The delete and all inserts run in one transaction, so a
// failure part-way through does not leave the table empty or half-filled
// (assuming `db.transaction` rolls back when its callback throws, as
// bun:sqlite's does — confirm against db.ts).

/** Number of CSV columns a data row must provide (f[0] … f[33]). */
const EXPECTED_COLUMNS = 34;

initDb();

const raw = readFileSync(csvPath, "utf-8");
// Strip UTF-8 BOM so it does not become part of the first field.
const content = raw.replace(/^\ufeff/, "");
// One record per physical line; blank lines are dropped.
const lines = content.split(/\r?\n/).filter((l) => l.trim());

if (lines.length < 2) {
  console.error("CSV has no data rows");
  process.exit(1);
}

// Skip header row
const dataLines = lines.slice(1);

// 36 bound parameters: the 34 CSV columns plus the derived ram_mb / storage_mb.
const insert = db.prepare(`INSERT INTO devices (
  product_name, product_code, architecture, cpu, cpu_cores, cpu_frequency,
  license_level, operating_system, ram, ram_mb, storage, storage_mb,
  dimensions, poe_in, poe_out, poe_out_ports, poe_in_voltage,
  dc_inputs, dc_jack_voltage, max_power_w,
  wireless_24_chains, antenna_24_dbi, wireless_5_chains, antenna_5_dbi,
  eth_fast, eth_gigabit, eth_2500, usb_ports, combo_ports,
  sfp_ports, sfp_plus_ports, eth_multigig, sim_slots,
  memory_cards, usb_type, msrp_usd
) VALUES (
  ?, ?, ?, ?, ?, ?,
  ?, ?, ?, ?, ?, ?,
  ?, ?, ?, ?, ?,
  ?, ?, ?,
  ?, ?, ?, ?,
  ?, ?, ?, ?, ?,
  ?, ?, ?, ?,
  ?, ?, ?
)`);

let inserted = 0;
let skipped = 0;

const insertAll = db.transaction(() => {
  // Idempotent: clear existing data (FTS triggers handle cleanup). Done inside
  // the transaction so the old rows survive if the import fails.
  db.run("DELETE FROM devices");

  for (const line of dataLines) {
    const f = parseCsvLine(line);
    // Rows with too few columns cannot be mapped positionally; count and skip.
    if (f.length < EXPECTED_COLUMNS) {
      skipped++;
      continue;
    }

    // A row without a product name is not a usable device record.
    const productName = f[0].trim();
    if (!productName) {
      skipped++;
      continue;
    }

    insert.run(
      productName,
      f[1].trim() || null, // product_code
      f[2].trim() || null, // architecture
      f[3].trim() || null, // cpu
      parseIntOrNull(f[4]), // cpu_cores
      f[5].trim() || null, // cpu_frequency
      parseIntOrNull(f[6]), // license_level
      f[7].trim() || null, // operating_system
      f[8].trim() || null, // ram
      parseSizeMb(f[8]), // ram_mb
      f[9].trim() || null, // storage
      parseSizeMb(f[9]), // storage_mb
      f[10].trim() || null, // dimensions
      f[11].trim() || null, // poe_in
      f[12].trim() || null, // poe_out
      f[13].trim() || null, // poe_out_ports
      f[14].trim() || null, // poe_in_voltage
      parseIntOrNull(f[15]), // dc_inputs
      f[16].trim() || null, // dc_jack_voltage
      parseWatts(f[17]), // max_power_w
      parseIntOrNull(f[18]), // wireless_24_chains
      parseFloatOrNull(f[19]), // antenna_24_dbi
      parseIntOrNull(f[20]), // wireless_5_chains
      parseFloatOrNull(f[21]), // antenna_5_dbi
      parseIntOrNull(f[22]), // eth_fast
      parseIntOrNull(f[23]), // eth_gigabit
      parseIntOrNull(f[24]), // eth_2500
      parseIntOrNull(f[25]), // usb_ports
      parseIntOrNull(f[26]), // combo_ports
      parseIntOrNull(f[27]), // sfp_ports
      parseIntOrNull(f[28]), // sfp_plus_ports
      parseIntOrNull(f[29]), // eth_multigig
      parseIntOrNull(f[30]), // sim_slots
      f[31].trim() || null, // memory_cards
      f[32].trim() || null, // usb_type
      parsePrice(f[33]), // msrp_usd
    );
    inserted++;
  }
});

insertAll();

console.log(`Devices: ${inserted} inserted, ${skipped} skipped from ${csvPath}`);
|
|
@@ -0,0 +1,379 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* extract-html.ts — Parse Confluence HTML export into SQLite pages table.
|
|
5
|
+
*
|
|
6
|
+
* Reads all HTML files from the export directory, extracts:
|
|
7
|
+
* - Page ID and slug from filename
|
|
8
|
+
* - Title from #title-text (stripped of "RouterOS : " prefix)
|
|
9
|
+
* - Breadcrumb path from #breadcrumbs
|
|
10
|
+
* - Parent page ID from last breadcrumb link
|
|
11
|
+
* - Plain text from #main-content (HTML stripped)
|
|
12
|
+
* - Code blocks from pre.syntaxhighlighter-pre
|
|
13
|
+
* - Author and last_updated from .page-metadata
|
|
14
|
+
*
|
|
15
|
+
* Populates: pages, pages_fts, callouts, callouts_fts (via triggers)
|
|
16
|
+
*
|
|
17
|
+
* Usage: bun run src/extract-html.ts [html-dir]
|
|
18
|
+
*/
|
|
19
|
+
|
|
20
|
+
import { readdirSync, readFileSync } from "node:fs";
|
|
21
|
+
import { basename, resolve } from "node:path";
|
|
22
|
+
import { parseHTML } from "linkedom";
|
|
23
|
+
import { db, initDb } from "./db.ts";
|
|
24
|
+
|
|
25
|
+
// Directory holding the Confluence HTML export: the first CLI argument, or
// ../box/latest/ROS resolved relative to this module's directory.
const HTML_DIR =
  process.argv[2] || resolve(import.meta.dirname, "../box/latest/ROS");

// Filename pattern: Slug_PageID.html or just PageID.html
// Group 1 (optional) is the slug — everything before the final "_<digits>.html";
// group 2 is the numeric page ID.
const filenameRe = /^(?:(.+?)_)?(\d+)\.html$/;
|
|
30
|
+
|
|
31
|
+
/** One documentation page as produced by `extractPage` and stored in the `pages` table. */
interface PageRow {
  // Numeric page ID parsed from the export filename.
  id: number;
  // Slug part of the filename, or the page ID as a string when the filename has no slug.
  slug: string;
  // Text of #title-text with a leading "RouterOS : " prefix removed.
  title: string;
  // Breadcrumb labels followed by the title, joined with " > ".
  path: string;
  // Number of breadcrumb links plus one.
  depth: number;
  // Page ID parsed from the last breadcrumb link, or null when it has none.
  parent_id: number | null;
  // help.mikrotik.com URL built from the page ID and title.
  url: string;
  // Trimmed text content of #main-content (code block text included).
  text: string;
  // Text of all pre.syntaxhighlighter-pre blocks, joined by blank lines.
  code: string;
  // Comma-joined distinct "brush" names found on the code blocks, or null when none.
  code_lang: string | null;
  // Name matched after "Created by" in .page-metadata, or null.
  author: string | null;
  // Date text such as "Month D, YYYY" matched in .page-metadata, kept as a string; or null.
  last_updated: string | null;
  // Count of whitespace-separated tokens in `text`.
  word_count: number;
  // Count of non-blank lines in `code`.
  code_lines: number;
  // File name the page was extracted from, as passed to `extractPage`.
  html_file: string;
}
|
|
48
|
+
|
|
49
|
+
/** One note/warning/info block found inside a page's main content. */
interface CalloutRow {
  // ID of the page the callout belongs to.
  page_id: number;
  // Lower-cased aria-label of the macro element ("warning", "note", "info", …); "note" when the label is empty.
  type: string;
  // Trimmed text of the macro body; callouts with empty text are not recorded.
  content: string;
  // Zero-based position among the page's recorded callouts.
  sort_order: number;
}
|
|
55
|
+
|
|
56
|
+
/** One heading-delimited section of a page, as produced by `extractSections`. */
interface SectionRow {
  // ID of the page the section belongs to.
  page_id: number;
  // Heading text with inner markup stripped.
  heading: string;
  // Heading level: 1, 2 or 3.
  level: number;
  // Value of the heading element's id attribute.
  anchor_id: string;
  // Trimmed text between this heading and the next one (code block text included).
  text: string;
  // Text of the section's pre.syntaxhighlighter-pre blocks, joined by blank lines.
  code: string;
  // Count of whitespace-separated tokens in `text`.
  word_count: number;
  // Zero-based index of the section in document order.
  sort_order: number;
}
|
|
66
|
+
|
|
67
|
+
/**
 * Split main content into sections by h1–h3 headings with id attributes.
 * Uses innerHTML + regex to locate heading boundaries, then parses each
 * section chunk independently for text and code extraction.
 *
 * A section's body is the HTML between the end of its heading and the start
 * of the next matched heading (or the end of the content). Content before the
 * first heading is not part of any section. Headings whose id is
 * "title-heading" are ignored.
 *
 * @param mainContent Element whose innerHTML is scanned.
 * @param pageId      Page ID copied into every returned row.
 * @returns One row per matched heading, in document order; empty when none match.
 */
function extractSections(mainContent: Element, pageId: number): SectionRow[] {
  const html = mainContent.innerHTML;
  // Whitespace is required directly before `id=` so that attributes merely
  // ending in "id" (e.g. data-id="…") are not mistaken for the element's id.
  const headingRe = /<h([1-3])(?=\s)[^>]*?\sid="([^"]+)"[^>]*>([\s\S]*?)<\/h\1>/g;

  const headings: Array<{
    level: number;
    anchorId: string;
    heading: string;
    start: number;
    end: number;
  }> = [];

  for (let m = headingRe.exec(html); m !== null; m = headingRe.exec(html)) {
    if (m[2] === "title-heading") continue;
    headings.push({
      level: Number.parseInt(m[1], 10),
      anchorId: m[2],
      // Re-parse the heading's inner HTML to strip markup and decode entities.
      heading: parseHTML(`<span>${m[3]}</span>`).document.querySelector("span")?.textContent?.trim() || "",
      start: m.index,
      end: m.index + m[0].length,
    });
  }

  if (headings.length === 0) return [];

  return headings.map((h, i) => {
    // Section body: from the end of this heading to the start of the next one.
    const sectionHtml = html.slice(h.end, headings[i + 1]?.start ?? html.length);
    const { document: doc } = parseHTML(`<div>${sectionHtml}</div>`);
    const root = doc.querySelector("div");

    const codeEls = root?.querySelectorAll("pre.syntaxhighlighter-pre") ?? [];
    const codeChunks: string[] = [];
    for (const ce of codeEls) {
      codeChunks.push(ce.textContent?.trim() || "");
    }

    // Text includes the code block text as well.
    const text = root?.textContent?.trim() || "";
    const code = codeChunks.join("\n\n");

    return {
      page_id: pageId,
      heading: h.heading,
      level: h.level,
      anchor_id: h.anchorId,
      text,
      code,
      word_count: text.split(/\s+/).filter(Boolean).length,
      sort_order: i,
    };
  });
}
|
|
123
|
+
|
|
124
|
+
/**
 * Extract the numeric page ID from a link to an exported page.
 *
 * The basename of `href` is matched against `filenameRe`. If that fails, any
 * query string or fragment is removed and the match is retried, so links like
 * "Page_123.html#section" still resolve to 123.
 *
 * @param href Link target, e.g. "Some-Page_123.html".
 * @returns The page ID, or `null` when the link does not name an exported page.
 */
function extractPageId(href: string): number | null {
  const m =
    basename(href).match(filenameRe) ??
    // Fallback only: hrefs that matched before keep matching exactly as before.
    basename(href.replace(/[?#][\s\S]*$/, "")).match(filenameRe);
  return m ? Number(m[2]) : null;
}
|
|
128
|
+
|
|
129
|
+
/**
 * Trimmed text content of an element.
 *
 * @returns The trimmed text, or "" when the element is null or has no text.
 */
function textContent(el: Element | null): string {
  const raw = el?.textContent;
  if (!raw) return "";
  return raw.trim();
}
|
|
132
|
+
|
|
133
|
+
/**
 * Extract one page row, plus its callouts and sections, from an exported HTML file.
 *
 * @param file File name of the export; its basename must match `filenameRe`.
 * @param html Full HTML source of that file.
 * @returns The populated row, or `null` when the filename does not match the
 *          expected pattern or the document has no title in #title-text.
 */
function extractPage(file: string, html: string): (PageRow & { callouts: CalloutRow[]; sections: SectionRow[] }) | null {
  // Validate the filename first so unrelated files are rejected without
  // paying for an HTML parse.
  const match = basename(file).match(filenameRe);
  if (!match) return null;

  const slug = match[1] || String(match[2]);
  const id = Number(match[2]);

  const { document } = parseHTML(html);

  // Title: strip "RouterOS : " prefix
  const title = textContent(document.querySelector("#title-text"))
    .replace(/^\s*RouterOS\s*:\s*/i, "")
    .trim();

  if (!title) return null;

  // Breadcrumbs: collect every label; the last link decides the parent ID
  // (null when that link does not point at an exported page).
  const breadcrumbLinks = document.querySelectorAll("#breadcrumbs li a");
  const breadcrumbs: string[] = [];
  let parentId: number | null = null;
  for (const a of breadcrumbLinks) {
    breadcrumbs.push(textContent(a));
    const href = a.getAttribute("href") || "";
    parentId = extractPageId(href);
  }
  const path = [...breadcrumbs, title].join(" > ");
  const depth = breadcrumbs.length + 1;

  // URL: Confluence pattern
  // NOTE(review): spaces become "+" before encodeURIComponent runs, so they
  // end up as "%2B" in the URL. Left unchanged to keep stored URLs stable;
  // confirm whether a literal "+" was intended.
  const urlSlug = encodeURIComponent(title.replace(/ /g, "+"));
  const url = `https://help.mikrotik.com/docs/spaces/ROS/pages/${id}/${urlSlug}`;

  // Main content
  const mainContent = document.querySelector("#main-content");

  // Code blocks: collect text and the distinct "brush" language names
  const codeEls = mainContent?.querySelectorAll("pre.syntaxhighlighter-pre") || [];
  const codeChunks: string[] = [];
  const codeLangs = new Set<string>();
  for (const el of codeEls) {
    codeChunks.push(el.textContent?.trim() || "");
    const params = el.getAttribute("data-syntaxhighlighter-params") || "";
    const brushMatch = params.match(/brush:\s*(\w+)/);
    if (brushMatch) codeLangs.add(brushMatch[1]);
  }
  const code = codeChunks.join("\n\n");
  const codeLang = codeLangs.size > 0 ? [...codeLangs].join(",") : null;
  const codeLines = code.split("\n").filter((l) => l.trim()).length;

  // Plain text from main content (includes code block text too, which is fine for FTS)
  const text = mainContent?.textContent?.trim() || "";
  const wordCount = text.split(/\s+/).filter(Boolean).length;

  // Callouts: extract note/warning/info blocks
  const calloutEls = mainContent?.querySelectorAll('div[role="region"].confluence-information-macro') || [];
  const callouts: CalloutRow[] = [];
  let calloutOrder = 0;
  for (const el of calloutEls) {
    const label = (el.getAttribute("aria-label") || "").toLowerCase().trim();
    // The label is used as the type as-is; an unlabeled macro counts as a note.
    const type = label || "note";
    const body = el.querySelector(".confluence-information-macro-body");
    const content = body?.textContent?.trim() || "";
    // Empty callouts are dropped and do not consume a sort_order slot.
    if (content) {
      callouts.push({ page_id: id, type, content, sort_order: calloutOrder++ });
    }
  }

  // Metadata: author, last_updated
  const metaEl = document.querySelector(".page-metadata");
  const metaText = metaEl?.textContent || "";
  const authorMatch = metaText.match(/Created by\s+(.+?)(?:,|\s*last)/i);
  const author = authorMatch?.[1]?.trim() || null;
  const dateMatch = metaText.match(/on\s+(\w+ \d{1,2}, \d{4})/);
  const lastUpdated = dateMatch?.[1] || null;

  // Sections: split content by h1–h3 headings
  const sections = mainContent ? extractSections(mainContent, id) : [];

  return {
    id,
    slug,
    title,
    path,
    depth,
    parent_id: parentId,
    url,
    text,
    code,
    code_lang: codeLang,
    author,
    last_updated: lastUpdated,
    word_count: wordCount,
    code_lines: codeLines,
    html_file: file,
    callouts,
    sections,
  };
}
|
|
231
|
+
|
|
232
|
+
// ---- Main ----
//
// Rebuilds the pages, callouts and sections tables from the HTML export:
// clears the old rows, inserts every page, links parents, then inserts
// callouts and sections, and finally prints a summary and a sample search.

console.log("Initializing database...");
initDb();

// Drop existing data for clean re-extraction (respect FK order)
// NOTE(review): these deletes run outside any transaction, so a failure later
// in the script leaves the tables cleared.
db.run("DELETE FROM sections;");
db.run("DELETE FROM callouts;");
// 'rebuild' is the FTS5 command that regenerates the index from its content table.
db.run("INSERT INTO callouts_fts(callouts_fts) VALUES('rebuild');");
db.run("DELETE FROM properties;");
db.run("INSERT INTO properties_fts(properties_fts) VALUES('rebuild');");
// FK enforcement is switched off only for the pages delete — presumably because
// other rows still reference pages; confirm against the schema in db.ts.
db.run("PRAGMA foreign_keys = OFF;");
db.run("DELETE FROM pages;");
db.run("PRAGMA foreign_keys = ON;");
db.run("INSERT INTO pages_fts(pages_fts) VALUES('rebuild');");

// All exported pages except the index, in sorted filename order.
const htmlFiles = readdirSync(HTML_DIR)
  .filter((f) => f.endsWith(".html") && f !== "index.html")
  .sort();

console.log(`Extracting ${htmlFiles.length} HTML files from ${HTML_DIR}`);

// Two-pass insert: first without parent_id (avoids FK ordering issues),
// then update parent relationships.
// parent_id is hard-coded to NULL here, leaving 14 bound parameters.
const insertPage = db.prepare(`
  INSERT OR REPLACE INTO pages
    (id, slug, title, path, depth, parent_id, url, text, code, code_lang,
     author, last_updated, word_count, code_lines, html_file)
  VALUES (?, ?, ?, ?, ?, NULL, ?, ?, ?, ?, ?, ?, ?, ?, ?)
`);
const updateParent = db.prepare("UPDATE pages SET parent_id = ? WHERE id = ?");

// Counters for the summary printed at the end.
let extracted = 0;
let skipped = 0;
let totalWords = 0;
let totalCodeLines = 0;
let totalCallouts = 0;

// Every successfully extracted page, kept in memory for the later passes.
const allPages: (PageRow & { callouts: CalloutRow[]; sections: SectionRow[] })[] = [];

// Pass 1: extract and insert all pages (parent_id = NULL)
const insertAll = db.transaction(() => {
  for (const file of htmlFiles) {
    const html = readFileSync(resolve(HTML_DIR, file), "utf-8");
    const page = extractPage(file, html);
    // extractPage returns null for unrecognized filenames or pages without a title.
    if (!page) {
      skipped++;
      console.warn(`  skipped: ${file}`);
      continue;
    }
    insertPage.run(
      page.id,
      page.slug,
      page.title,
      page.path,
      page.depth,
      page.url,
      page.text,
      page.code,
      page.code_lang,
      page.author,
      page.last_updated,
      page.word_count,
      page.code_lines,
      page.html_file,
    );
    allPages.push(page);
    extracted++;
    totalWords += page.word_count;
    totalCodeLines += page.code_lines;
  }
});
insertAll();

// Pass 2: set parent_id where the parent actually exists in the DB
// Parents that were not extracted are left as NULL rather than dangling.
const pageIds = new Set(allPages.map((p) => p.id));
const setParents = db.transaction(() => {
  for (const page of allPages) {
    if (page.parent_id && pageIds.has(page.parent_id)) {
      updateParent.run(page.parent_id, page.id);
    }
  }
});
setParents();

// Pass 3: insert callouts
const insertCallout = db.prepare(`
  INSERT INTO callouts (page_id, type, content, sort_order)
  VALUES (?, ?, ?, ?)
`);
const insertCallouts = db.transaction(() => {
  for (const page of allPages) {
    for (const c of page.callouts) {
      insertCallout.run(c.page_id, c.type, c.content, c.sort_order);
      totalCallouts++;
    }
  }
});
insertCallouts();

// Pass 4: insert sections
let totalSections = 0;
let pagesWithSections = 0;
const insertSection = db.prepare(`
  INSERT INTO sections (page_id, heading, level, anchor_id, text, code, word_count, sort_order)
  VALUES (?, ?, ?, ?, ?, ?, ?, ?)
`);
const insertSections = db.transaction(() => {
  for (const page of allPages) {
    if (page.sections.length > 0) {
      pagesWithSections++;
      for (const s of page.sections) {
        insertSection.run(s.page_id, s.heading, s.level, s.anchor_id, s.text, s.code, s.word_count, s.sort_order);
        totalSections++;
      }
    }
  }
});
insertSections();

// Row count of the pages FTS table, reported in the summary below.
const ftsCount = (db.prepare("SELECT COUNT(*) as c FROM pages_fts").get() as { c: number }).c;

console.log(`\nExtraction complete:`);
console.log(`  Pages extracted: ${extracted}`);
console.log(`  Pages skipped:   ${skipped}`);
console.log(`  Total words:     ${totalWords.toLocaleString()}`);
console.log(`  Total code lines: ${totalCodeLines.toLocaleString()}`);
console.log(`  Total callouts:  ${totalCallouts}`);
console.log(`  Total sections:  ${totalSections} (across ${pagesWithSections} pages)`);
console.log(`  FTS index rows:  ${ftsCount}`);

// Quick search test
// Runs a fixed full-text query and prints the top five hits.
// NOTE(review): snippet() takes its excerpt from FTS column index 2 — confirm
// which column that is in the pages_fts definition in db.ts.
const testResults = db
  .prepare(
    `SELECT s.id, s.title, s.path,
            snippet(pages_fts, 2, '>>>', '<<<', '...', 20) as excerpt
     FROM pages_fts fts
     JOIN pages s ON s.id = fts.rowid
     WHERE pages_fts MATCH 'firewall filter'
     ORDER BY rank LIMIT 5`,
  )
  .all();

console.log(`\nTest search for "firewall filter":`);
for (const r of testResults as Array<{ id: number; title: string; path: string; excerpt: string }>) {
  console.log(`  [${r.id}] ${r.path}`);
  console.log(`    ${r.excerpt}`);
}
|