@tikoci/rosetta 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,359 @@
1
+ /**
2
+ * extract-test-results.ts — Scrape MikroTik product pages for test results + block diagram URLs.
3
+ *
4
+ * Fetches each product page from mikrotik.com and extracts:
5
+ * - Ethernet test results (bridging/routing throughput at various packet sizes)
6
+ * - IPSec test results (tunnel throughput with various ciphers)
7
+ * - Block diagram PNG URL
8
+ * - Product page URL slug
9
+ *
10
+ * Idempotent: deletes all existing test results, updates device rows.
11
+ * Requires devices table to be populated first (via extract-devices.ts).
12
+ *
13
+ * Usage: bun run src/extract-test-results.ts [--concurrency N] [--delay MS]
14
+ *
15
+ * Product page URL slug discovery: generates candidate slugs from each device's
16
+ * product name and product code, then tries them in order until a page responds OK.
17
+ */
18
+
19
+ import { parseHTML } from "linkedom";
20
+ import { db, initDb } from "./db.ts";
21
+
22
+ // ── CLI flags ──
23
+
24
// Command-line arguments after the runtime and script path.
const args = process.argv.slice(2);

/**
 * Read a numeric CLI flag given as `--name value`.
 *
 * Falls back to `fallback` when the flag is absent, has no value, or its value
 * is not a finite, non-negative number (for example `--delay abc`, or
 * `--delay --concurrency 4` where the "value" is really the next flag).
 *
 * @param name     Flag name without the leading dashes.
 * @param fallback Value returned when the flag is missing or unusable.
 * @returns The parsed number, or `fallback`.
 */
function getFlag(name: string, fallback: number): number {
  const idx = args.indexOf(`--${name}`);
  if (idx === -1) return fallback;

  const raw = args[idx + 1];
  if (raw === undefined || raw.trim() === "") return fallback;

  const value = Number(raw);
  if (!Number.isFinite(value) || value < 0) {
    // Number("abc") is NaN; letting that through would silently break the
    // batching arithmetic that consumes these flags.
    console.warn(`Ignoring invalid value for --${name}: "${raw}" (using ${fallback})`);
    return fallback;
  }
  return value;
}
30
+
31
// Number of product pages fetched in parallel per batch. Must be an integer
// >= 1: the fetch loop advances its index by this value, so 0 would never
// terminate and NaN would skip every device.
const CONCURRENCY = (() => {
  const requested = Math.floor(getFlag("concurrency", 4));
  return Number.isFinite(requested) ? Math.max(1, requested) : 4;
})();

// Pause between batches, in milliseconds. Must be finite and >= 0.
const DELAY_MS = (() => {
  const requested = getFlag("delay", 500);
  return Number.isFinite(requested) ? Math.max(0, requested) : 500;
})();

// Prefix that a slug is appended to in order to form a product page URL.
const PRODUCT_BASE = "https://mikrotik.com/product/";
34
+
35
+ // ── Types ──
36
+
37
/** One measurement taken from a performance table: a (mode, configuration, packet size) cell pair. */
interface TestResultRow {
  /** Text of the first column of the table row. */
  mode: string;
  /** Text of the second column of the table row. */
  configuration: string;
  /** Packet size in bytes, taken from the table header (or a default). */
  packet_size: number;
  /** Throughput in kpps; null when the cell is not numeric. */
  throughput_kpps: number | null;
  /** Throughput in Mbps; null when the cell is missing or not numeric. */
  throughput_mbps: number | null;
}

/** Everything extracted from a single product page. */
interface ProductPageData {
  /** The URL slug that successfully resolved to this page. */
  slug: string;
  /** Rows from every table not classified as IPsec. */
  ethernet_results: TestResultRow[];
  /** Rows from tables whose header mentions IPsec. */
  ipsec_results: TestResultRow[];
  /** Absolute URL of the block diagram, or null if the page has no such link. */
  block_diagram_url: string | null;
}
51
+
52
+ // ── HTML Parsing ──
53
+
54
/**
 * Turn an HTML fragment into plain text, decoding character entities
 * (e.g. `&amp;` becomes `&`). Because the result is the element's text
 * content, any markup in the fragment is dropped as well.
 */
function decodeEntities(html: string): string {
  // A throwaway document supplies an element whose innerHTML setter does the parsing.
  const scratch = parseHTML("<div></div>").document.createElement("div");
  scratch.innerHTML = html;
  const text = scratch.textContent;
  return text ? text : "";
}
61
+
62
/**
 * Convert one performance table element into flat test-result rows.
 *
 * Expected layout:
 *  - `<thead>` row 1: [product code, test description]; a description containing
 *    "ipsec" (case-insensitive) marks the table as IPsec, anything else as ethernet.
 *  - `<thead>` row 2: [Mode, Configuration, "<N> byte", ...]; each size spans two
 *    data columns (kpps, then Mbps).
 *  - `<tbody>` rows: [mode, configuration, kpps, Mbps, kpps, Mbps, ...].
 *
 * @returns `testType` ("ipsec", "ethernet", or "unknown" when the header is
 *          missing or has fewer than two rows) and the parsed rows.
 */
function parsePerformanceTable(table: Element): { testType: string; rows: TestResultRow[] } {
  const textOf = (node: Element): string => (node.textContent || "").trim();
  const numberOrNull = (node: Element): number | null => {
    const value = Number.parseFloat(textOf(node));
    return Number.isNaN(value) ? null : value;
  };

  const parsed: TestResultRow[] = [];

  const headerRows = table.querySelector("thead")?.querySelectorAll("tr");
  if (!headerRows || headerRows.length < 2) {
    return { testType: "unknown", rows: parsed };
  }

  // Classify the table from the description cell of the first header row.
  const titleCells = headerRows[0].querySelectorAll("td");
  const description = titleCells.length >= 2 ? textOf(titleCells[1]).toLowerCase() : "";
  const testType = description.includes("ipsec") ? "ipsec" : "ethernet";

  // Collect packet sizes from header cells that read like "1518 byte".
  const packetSizes = Array.from(headerRows[1].querySelectorAll("td"))
    .map((cell) => /^(\d+)\s*byte/i.exec(textOf(cell)))
    .filter((hit): hit is RegExpExecArray => hit !== null)
    .map((hit) => Number.parseInt(hit[1], 10));

  // No sizes in the header: fall back to the defaults for this test type.
  if (packetSizes.length === 0) {
    packetSizes.push(testType === "ipsec" ? 1400 : 1518, 512, 64);
  }

  const body = table.querySelector("tbody");
  if (!body) return { testType, rows: parsed };

  for (const tr of body.querySelectorAll("tr")) {
    const cells = tr.querySelectorAll("td");
    if (cells.length < 2) continue;

    const mode = textOf(cells[0]);
    const configuration = textOf(cells[1]);

    for (const [n, packet_size] of packetSizes.entries()) {
      // Columns 0-1 are labels; size n occupies columns 2+2n (kpps) and 3+2n (Mbps).
      const kppsCol = 2 * (n + 1);
      if (kppsCol >= cells.length) break;
      const mbpsCol = kppsCol + 1;

      parsed.push({
        mode,
        configuration,
        packet_size,
        throughput_kpps: numberOrNull(cells[kppsCol]),
        throughput_mbps: mbpsCol < cells.length ? numberOrNull(cells[mbpsCol]) : null,
      });
    }
  }

  return { testType, rows: parsed };
}
133
+
134
/**
 * Build an ordered, de-duplicated list of candidate URL slugs for a product.
 *
 * Slugs on the product site follow no single convention, so several spellings
 * are produced from the product name and (when present) the product code;
 * the caller tries them in order. Superscript digits (¹ ² ³) are replaced by
 * plain digits first. Empty candidates are discarded.
 *
 * @param name Product display name.
 * @param code Product code, or null when unknown.
 * @returns Candidate slugs, most likely first.
 */
function generateSlugs(name: string, code: string | null): string[] {
  const plainDigits = (s: string) =>
    s.replace(/²/g, "2").replace(/³/g, "3").replace(/¹/g, "1");
  const plusAsWord = (s: string) => s.replace(/\+/g, "plus");
  const trimUnderscore = (s: string) => s.replace(/^_|_$/g, "");
  const trimHyphen = (s: string) => s.replace(/^-|-$/g, "");

  const candidates: string[] = [];

  const lowerName = plainDigits(name).toLowerCase();
  candidates.push(
    // 1. Lowercased name, "+" spelled out as "plus", other separators → "_".
    //    (The letters p/l/u/s in the class are already covered by a-z.)
    trimUnderscore(plusAsWord(lowerName).replace(/[^a-z0-9plus]+/g, "_")),
    // 2. Lowercased name, "+" treated like any other separator (→ "_").
    trimUnderscore(lowerName.replace(/[^a-z0-9]+/g, "_")),
  );

  if (code) {
    const plainCode = plainDigits(code);
    const lowerCode = plainCode.toLowerCase();
    candidates.push(
      // 3. Code in original casing, "+" → "plus", hyphens kept, other specials removed.
      trimHyphen(plusAsWord(plainCode).replace(/[^a-zA-Z0-9plus\-]+/g, "")),
      // 4. Code in original casing, "+" and other specials removed, hyphens kept.
      trimHyphen(plainCode.replace(/[^a-zA-Z0-9\-]+/g, "")),
      // 5. Lowercased code, "+" → "plus", other separators → "_".
      trimUnderscore(plusAsWord(lowerCode).replace(/[^a-z0-9plus]+/g, "_")),
      // 6. Lowercased code, every separator (including "+") → "_".
      trimUnderscore(lowerCode.replace(/[^a-z0-9]+/g, "_")),
    );
  }

  // A Set keeps first-insertion order, so earlier variants win over duplicates.
  return [...new Set(candidates)].filter((slug) => slug !== "");
}
179
+
180
/**
 * Fetch and parse a product page, trying each candidate slug in order.
 *
 * The first slug whose response is OK is parsed and returned. Any failure on
 * a slug (non-OK status, network error, or an error thrown while reading or
 * parsing the page) moves on to the next candidate. Only the final outcome is
 * logged, never the intermediate attempts.
 *
 * @param slugs Candidate slugs, most likely first.
 * @returns Parsed page data, or null when no candidate succeeded.
 */
async function fetchProductPage(slugs: string[]): Promise<ProductPageData | null> {
  if (slugs.length === 0) {
    console.warn(" [skip] no slug candidates to try");
    return null;
  }

  // Track why attempts failed so the summary does not call everything a 404.
  let everyAttemptWas404 = true;
  let lastFailure = "";

  for (const slug of slugs) {
    const url = `${PRODUCT_BASE}${slug}`;
    try {
      const resp = await fetch(url);
      if (resp.ok) {
        const html = await resp.text();
        return parseProductHtml(html, slug);
      }
      if (resp.status !== 404) {
        everyAttemptWas404 = false;
        lastFailure = `HTTP ${resp.status}`;
      }
    } catch (err: unknown) {
      // Network or parse failure: remember it and try the next slug.
      everyAttemptWas404 = false;
      lastFailure = err instanceof Error ? err.message : String(err);
    }
  }

  const label = everyAttemptWas404 ? "404" : `fail: ${lastFailure}`;
  console.warn(` [${label}] ${slugs[0]} (tried ${slugs.length} variants)`);
  return null;
}
198
+
199
/**
 * Resolve a block-diagram href to an absolute http(s) URL.
 *
 * Absolute http(s) hrefs are returned unchanged. Everything else is resolved
 * against the CDN origin, which also handles protocol-relative (`//host/x`)
 * and slash-less relative (`media/x.png`) hrefs that plain string
 * concatenation would mangle. Returns null for hrefs that resolve to a
 * non-http(s) scheme or cannot be parsed.
 */
function resolveBlockDiagramUrl(href: string): string | null {
  if (/^https?:\/\//i.test(href)) return href;
  try {
    const resolved = new URL(href, "https://cdn.mikrotik.com/");
    return resolved.protocol === "https:" || resolved.protocol === "http:" ? resolved.href : null;
  } catch {
    return null;
  }
}

/**
 * Parse product page HTML into structured data.
 *
 * Collects rows from every `table.performance-table` (split into IPsec and
 * ethernet by the table's detected type) and the target of the first link
 * whose text is exactly "Block Diagram".
 *
 * @param html Raw page markup.
 * @param slug Slug the page was fetched with; echoed back in the result.
 * @returns The extracted data. The signature allows null, but this
 *          implementation always returns an object.
 */
function parseProductHtml(html: string, slug: string): ProductPageData | null {
  const { document } = parseHTML(html);

  const ethernet_results: TestResultRow[] = [];
  const ipsec_results: TestResultRow[] = [];

  for (const table of document.querySelectorAll("table.performance-table")) {
    const { testType, rows } = parsePerformanceTable(table);
    // Anything not recognised as IPsec (including "unknown") is filed as ethernet.
    const bucket = testType === "ipsec" ? ipsec_results : ethernet_results;
    bucket.push(...rows);
  }

  let block_diagram_url: string | null = null;
  for (const a of document.querySelectorAll("a")) {
    if ((a.textContent || "").trim() !== "Block Diagram") continue;
    const href = a.getAttribute("href");
    if (href) block_diagram_url = resolveBlockDiagramUrl(href);
    // Only the first matching link is considered, even if it has no href.
    break;
  }

  return { slug, ethernet_results, ipsec_results, block_diagram_url };
}
236
+
237
/** Return a promise that resolves after roughly `ms` milliseconds; used to pace request batches. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
241
+
242
+ // ── Main ──
243
+
244
initDb();

// Load every known device; the name and code drive slug generation below.
const devices = db
  .prepare("SELECT id, product_name, product_code FROM devices ORDER BY product_name")
  .all() as Array<{
  id: number;
  product_name: string;
  product_code: string | null;
}>;

// Nothing to scrape without devices; bail out with a hint.
if (devices.length === 0) {
  console.error("No devices in database. Run extract-devices.ts first.");
  process.exit(1);
}

console.log(`Found ${devices.length} devices in database`);

// Pair each device with the ordered list of slugs to try for it.
const deviceSlugs: Array<{ id: number; name: string; slugs: string[] }> = devices.map(
  ({ id, product_name, product_code }) => ({
    id,
    name: product_name,
    slugs: generateSlugs(product_name, product_code),
  }),
);
266
+
267
// Prepared statements reused for every device.
// OR IGNORE: a row that conflicts with a table constraint is skipped rather
// than aborting the whole batch.
const insertTest = db.prepare(`INSERT OR IGNORE INTO device_test_results (
  device_id, test_type, mode, configuration, packet_size,
  throughput_kpps, throughput_mbps
) VALUES (?, ?, ?, ?, ?, ?, ?)`);

const updateDevice = db.prepare(`UPDATE devices
  SET product_url = ?, block_diagram_url = ?
  WHERE id = ?`);

console.log(`Fetching ${deviceSlugs.length} product pages (concurrency=${CONCURRENCY}, delay=${DELAY_MS}ms)...`);

// Summary counters, filled in while the transaction below runs.
let totalTests = 0; // insert attempts (rows skipped by OR IGNORE are still counted)
let devicesWithTests = 0;
let devicesWithDiagrams = 0;
let fetchErrors = 0;

/**
 * Replace the stored test results with freshly scraped ones.
 *
 * The old rows are deleted inside this transaction, not before the long fetch
 * phase, so a crash or interrupt while fetching leaves the previous data
 * intact. This relies on db.transaction() running the callback as a single
 * transaction (NOTE(review): confirm against ./db.ts).
 */
const insertAll = db.transaction(
  (results: Array<{ deviceId: number; data: ProductPageData | null }>) => {
    // Idempotent: clear existing test results before re-inserting.
    db.run("DELETE FROM device_test_results");

    for (const { deviceId, data } of results) {
      if (!data) {
        fetchErrors++;
        continue;
      }

      // Record which URL worked and where the block diagram lives.
      updateDevice.run(
        `https://mikrotik.com/product/${data.slug}`,
        data.block_diagram_url,
        deviceId,
      );

      if (data.block_diagram_url) devicesWithDiagrams++;

      // Tag each row with its test type and insert ethernet rows first.
      const allResults = [
        ...data.ethernet_results.map((r) => ({ ...r, test_type: "ethernet" as const })),
        ...data.ipsec_results.map((r) => ({ ...r, test_type: "ipsec" as const })),
      ];

      if (allResults.length > 0) devicesWithTests++;

      for (const r of allResults) {
        insertTest.run(
          deviceId,
          r.test_type,
          r.mode,
          r.configuration,
          r.packet_size,
          r.throughput_kpps,
          r.throughput_mbps,
        );
        totalTests++;
      }
    }
  },
);
327
+
328
// Fetch pages in batches of CONCURRENCY, pausing DELAY_MS between batches.
const allResults: Array<{ deviceId: number; data: ProductPageData | null }> = [];
let processed = 0;

for (let start = 0; start < deviceSlugs.length; start += CONCURRENCY) {
  const batch = deviceSlugs.slice(start, start + CONCURRENCY);

  // All pages of one batch are requested in parallel.
  const fetched = await Promise.all(
    batch.map(async ({ id, slugs }) => ({
      deviceId: id,
      data: await fetchProductPage(slugs),
    })),
  );
  for (const entry of fetched) allResults.push(entry);
  processed += batch.length;

  // Single-line progress indicator, rewritten in place via carriage return.
  const pct = Math.round((processed / deviceSlugs.length) * 100);
  process.stdout.write(`\r ${processed}/${deviceSlugs.length} (${pct}%)`);

  // No pause after the final batch.
  const moreBatches = start + CONCURRENCY < deviceSlugs.length;
  if (moreBatches) {
    await sleep(DELAY_MS);
  }
}
console.log(""); // newline after progress

// Persist everything in one transaction.
insertAll(allResults);

console.log(`Test results: ${totalTests} rows for ${devicesWithTests} devices`);
console.log(`Block diagrams: ${devicesWithDiagrams} devices`);
if (fetchErrors > 0) {
  console.warn(`Fetch errors: ${fetchErrors} products`);
}