domain-rank 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/export.ts ADDED
@@ -0,0 +1,98 @@
1
+ import fs from 'fs/promises';
2
+ import { createWriteStream } from 'fs';
3
+ import path from 'path';
4
+
5
/** Output formats the exporter knows how to write. */
type Format = 'json' | 'csv' | 'ndjson';

/** Formats produced when no `--formats=` argument is given. */
const defaultFormats: Array<Format> = ['json', 'csv', 'ndjson'];
8
+
9
/**
 * Read the exporter's inputs from the `../data` directory next to this file.
 *
 * `domain-rank-merged.json` is required: a read or parse failure rejects.
 * `domain-info.json` is optional: any failure while reading or parsing it is
 * ignored and `info` stays an empty object.
 *
 * @returns The parsed merged ranking and the (possibly empty) info map.
 */
async function loadData() {
  const dataDir = path.resolve(__dirname, '../data');
  const readJson = async (fileName: string) =>
    JSON.parse(await fs.readFile(path.join(dataDir, fileName), 'utf8'));

  const merged = await readJson('domain-rank-merged.json');

  let info = {};
  try {
    info = await readJson('domain-info.json');
  } catch {
    // Best effort: the info file is optional, so fall back to an empty map.
  }

  return { merged, info };
}
22
+
23
/**
 * Serialize records as CSV text (no trailing newline).
 *
 * - Columns are the union of all record keys in first-seen order, so a key
 *   that only appears in a later record is not silently dropped.
 * - Every data cell is double-quoted with embedded quotes doubled.
 * - Header cells are emitted bare, and quoted only when they contain a quote,
 *   comma or line break (plain keys therefore render exactly as before).
 * - `null`/`undefined` cells become empty strings; non-string values are
 *   JSON-encoded, falling back to `String(value)` for values JSON cannot
 *   represent (symbols, functions) instead of throwing.
 *
 * @param records Rows to serialize; an empty array yields an empty string.
 * @returns The CSV document.
 */
function toCSV(records: any[]): string {
  if (!records.length) return '';

  const quote = (text: string): string => '"' + text.replace(/"/g, '""') + '"';

  // Collect the column set across all rows, preserving first-seen order.
  const keys: string[] = [];
  const seen = new Set<string>();
  for (const record of records) {
    for (const key of Object.keys(record ?? {})) {
      if (!seen.has(key)) {
        seen.add(key);
        keys.push(key);
      }
    }
  }

  const lines = [keys.map(key => (/[",\r\n]/.test(key) ? quote(key) : key)).join(',')];
  for (const record of records) {
    const cells = keys.map(key => {
      const value = record?.[key] ?? '';
      const text = typeof value === 'string' ? value : (JSON.stringify(value) ?? String(value));
      return quote(text);
    });
    lines.push(cells.join(','));
  }
  return lines.join('\n');
}
36
+
37
/**
 * Write `data` as pretty-printed (2-space) JSON to `<outDir>/<name>.json`,
 * creating `outDir` (and any missing parents) first.
 */
async function writeJSON(outDir: string, name: string, data: any) {
  await fs.mkdir(outDir, { recursive: true });
  const target = path.join(outDir, name + '.json');
  const payload = JSON.stringify(data, null, 2);
  await fs.writeFile(target, payload, 'utf8');
}
41
+
42
/**
 * Write `arr` as newline-delimited JSON to `<outDir>/<name>.ndjson`, one
 * `JSON.stringify`-ed item per line, creating `outDir` if needed.
 *
 * The returned promise resolves once the stream has flushed (`'finish'`) and
 * rejects if the stream emits `'error'` or an item cannot be serialized.
 * Writing pauses while the stream's buffer is full and resumes on `'drain'`,
 * so a large array is not buffered in memory all at once.
 */
async function writeNDJSON(outDir: string, name: string, arr: any[]) {
  await fs.mkdir(outDir, { recursive: true });
  const fp = path.join(outDir, name + '.ndjson');
  const stream = createWriteStream(fp, { encoding: 'utf8' });

  await new Promise<void>((resolve, reject) => {
    stream.once('error', reject);
    stream.once('finish', () => resolve());

    let next = 0;
    const pump = (): void => {
      try {
        while (next < arr.length) {
          const accepted = stream.write(JSON.stringify(arr[next++]) + '\n');
          if (!accepted) {
            // Buffer is full: continue from `next` once it has drained.
            stream.once('drain', pump);
            return;
          }
        }
        stream.end();
      } catch (err) {
        // Serialization failed (e.g. circular structure): release the file handle.
        stream.destroy();
        reject(err);
      }
    };
    pump();
  });
}
52
+
53
/**
 * Render `arr` with `toCSV` and write the result to `<outDir>/<name>.csv`,
 * creating `outDir` (and any missing parents) first.
 */
async function writeCSVFile(outDir: string, name: string, arr: any[]) {
  await fs.mkdir(outDir, { recursive: true });
  const target = path.join(outDir, name + '.csv');
  await fs.writeFile(target, toCSV(arr), 'utf8');
}
58
+
59
/**
 * Project merged ranking rows onto the flat export record shape.
 *
 * Missing fields become `null`. The record's domain is `m.domain`, falling
 * back to `m.name`; the same resolved domain is used to look up `info`, so a
 * row that only carries `name` still gets its info entry. Only own properties
 * of `info` are considered, so a domain such as "constructor" cannot pick up
 * an inherited object member.
 *
 * @param merged Rows to convert.
 * @param info   Extra per-domain data keyed by domain.
 * @returns One `{ rank, domain, source, score, info }` record per input row.
 */
function makeRecords(merged: any[], info: Record<string, any>) {
  const infoFor = (domain: any) =>
    domain !== null && Object.prototype.hasOwnProperty.call(info, domain)
      ? info[domain] ?? null
      : null;

  return merged.map((m: any) => {
    const domain = m.domain ?? m.name ?? null;
    return {
      rank: m.rank ?? null,
      domain,
      source: m.source ?? null,
      score: m.score ?? null,
      info: infoFor(domain)
    };
  });
}
68
+
69
/**
 * CLI entry point: load the data files and write the requested exports.
 *
 * Arguments:
 *   --formats=json,csv,ndjson   comma-separated formats (default: all three)
 *   --out=<dir>                 output directory, resolved against the
 *                               current working directory (default: ./exports)
 *
 * Unknown format names are reported with a warning and skipped.
 */
async function main() {
  const args = process.argv.slice(2);

  // Take everything after the first '=' so values may themselves contain '='.
  const readOption = (flag: string): string | undefined => {
    const prefix = `--${flag}=`;
    const hit = args.find(a => a.startsWith(prefix));
    return hit === undefined ? undefined : hit.slice(prefix.length);
  };

  const formatsArg = readOption('formats');
  const outArg = readOption('out');

  const isFormat = (value: string): value is Format =>
    (defaultFormats as string[]).includes(value);

  let formats: Format[] = defaultFormats;
  if (formatsArg) {
    const requested = formatsArg.split(',').map(f => f.trim()).filter(f => f.length > 0);
    const unknown = requested.filter(f => !isFormat(f));
    if (unknown.length > 0) {
      console.warn(`Ignoring unsupported format(s): ${unknown.join(', ')}`);
    }
    formats = requested.filter(isFormat);
  }

  const outDir = path.resolve(process.cwd(), outArg ? outArg : 'exports');

  const { merged, info } = await loadData();

  // When the merged file is an object keyed by domain, keep the key as the
  // row's domain instead of discarding it with Object.values().
  const rows: any[] = Array.isArray(merged)
    ? merged
    : Object.entries(merged as Record<string, unknown>).map(([domain, value]) => {
        if (Array.isArray(value)) {
          // Tuple layout written by the merge script:
          // [name, domainRank, domainTitle, newsRank, newsTitle, langCode],
          // where 0 stands for a missing rank.
          const [name, domainRank] = value;
          return { domain, name, rank: domainRank || null };
        }
        if (value !== null && typeof value === 'object') {
          return { domain, ...(value as Record<string, unknown>) };
        }
        return { domain };
      });

  const records = makeRecords(rows, info || {});

  if (formats.includes('json')) {
    await writeJSON(outDir, 'domain-rank', records);
    console.log('Wrote JSON to', outDir);
  }
  if (formats.includes('ndjson')) {
    await writeNDJSON(outDir, 'domain-rank', records);
    console.log('Wrote NDJSON to', outDir);
  }
  if (formats.includes('csv')) {
    await writeCSVFile(outDir, 'domain-rank', records);
    console.log('Wrote CSV to', outDir);
  }
}
92
+
93
// Run the exporter only when this file is the process entry point; on failure
// print the error and exit with a non-zero status.
if (require.main === module) {
  main().catch((failure) => {
    console.error(failure);
    process.exit(1);
  });
}
@@ -0,0 +1,213 @@
1
+ import { Database } from "bun:sqlite";
2
+ import domains from "../data/domains-1m.js";
3
+
4
// Open (or create) the SQLite file that caches downloaded favicons.
const db = new Database("favicons.sqlite");

// One row per domain (domain is UNIQUE): the raw image bytes, the pixel size
// that was requested, and an insertion timestamp.
const CREATE_FAVICONS_TABLE = `
  CREATE TABLE IF NOT EXISTS favicons (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    domain TEXT UNIQUE,
    favicon BLOB,
    size INTEGER,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP
  )
`;

// Create table if it doesn't exist
db.run(CREATE_FAVICONS_TABLE);
16
+
17
+ /**
18
+ * Get favicon for a single domain using Google's favicon API
19
+ * @param {string} domain - Domain name to fetch favicon for
20
+ * @param {number} [size=16] - Size of the favicon in pixels
21
+ * @returns {Promise<Buffer|null>} Buffer containing favicon data or null if failed
22
+ */
23
+ async function getFaviconForSingleDomain(domain, size = 16) {
24
+ try {
25
+ // Create AbortController for timeout
26
+ const controller = new AbortController();
27
+ const timeoutId = setTimeout(() => controller.abort(), 10000); // 10 second timeout
28
+
29
+ const response = await fetch(
30
+ 'https://www.google.com/s2/favicons?domain=' + encodeURIComponent(domain) + '&sz=' + size,
31
+ {
32
+ signal: controller.signal,
33
+ headers: {
34
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
35
+ }
36
+ }
37
+ );
38
+
39
+ clearTimeout(timeoutId);
40
+
41
+ if (!response.ok) {
42
+ return null;
43
+ }
44
+
45
+ const arrayBuffer = await response.arrayBuffer();
46
+ const buffer = Buffer.from(arrayBuffer);
47
+
48
+ // Check if we got a valid image (not the default Google favicon)
49
+ if (buffer.length < 100) {
50
+ return null; // Too small, likely a default favicon
51
+ }
52
+
53
+ return buffer;
54
+ } catch (error) {
55
+ console.log(`Could not fetch favicon for ${domain}: ${error.message}`);
56
+ return null;
57
+ }
58
+ }
59
+
60
/**
 * Download favicons for a slice of the bundled domain list and cache them in
 * the SQLite database, one request at a time.
 *
 * A domain that already has a row for the requested size is skipped without
 * a network request or a pause. Progress and a final summary are logged.
 *
 * @param {object} options - Options for fetching favicons
 * @param {number} options.startIndex - First index in the domain list (default: 0)
 * @param {number} options.endIndex - Exclusive end index, clamped to the list length (default: 1000)
 * @param {number} options.size - Size of favicons in pixels (default: 16)
 * @param {number} options.delay - Pause after each attempted download in ms (default: 200)
 * @returns {Promise<void>} Resolves when every domain in the range has been handled
 */
async function fetchAndStoreFavicons(options = {}) {
  const {
    startIndex = 0,
    endIndex = 1000,
    size = 16,
    delay = 200,
  } = options;

  const allDomains = domains.split(",");
  const stopIndex = Math.min(endIndex, allDomains.length);

  // Prepared once and reused for every domain.
  const insertStmt = db.prepare(
    "INSERT OR REPLACE INTO favicons (domain, favicon, size) VALUES (?, ?, ?)"
  );
  const existsStmt = db.prepare(
    "SELECT id FROM favicons WHERE domain = ? AND size = ?"
  );

  const countStored = () => db.query("SELECT COUNT(*) as count FROM favicons").get().count;
  const pause = (ms) => new Promise(resolve => setTimeout(resolve, ms));

  console.log(`Starting with ${countStored()} favicons in database`);
  console.log(`Fetching favicons for domains ${startIndex} to ${stopIndex - 1}`);
  console.log(`Target: ${stopIndex - startIndex} domains`);

  const tally = { stored: 0, skipped: 0, failed: 0 };

  for (let index = startIndex; index < stopIndex; index += 1) {
    const domain = allDomains[index];
    const position = index + 1;

    // Already cached at this size: nothing to download.
    if (existsStmt.get(domain, size)) {
      console.log(`Skipping ${position}: ${domain} (already exists)`);
      tally.skipped += 1;
      continue;
    }

    console.log(`Fetching ${position}: ${domain}`);
    const icon = await getFaviconForSingleDomain(domain, size);

    if (!icon) {
      tally.failed += 1;
      console.log(`✗ Failed to download favicon for ${domain}`);
    } else {
      insertStmt.run(domain, icon, size);
      tally.stored += 1;
      console.log(`✓ Stored favicon for ${domain} (${icon.length} bytes)`);

      // Log progress every 20 successful downloads
      if (tally.stored % 20 === 0) {
        console.log(`Progress: ${tally.stored} new favicons downloaded, ${countStored()} total in database`);
      }
    }

    // Add delay to avoid overwhelming Google's API
    if (delay > 0) {
      await pause(delay);
    }
  }

  const finalCount = countStored();

  console.log(`\nCompleted fetching favicons!`);
  console.log(`Total favicons in database: ${finalCount}`);
  console.log(`New favicons: ${tally.stored}`);
  console.log(`Skipped: ${tally.skipped}`);
  console.log(`Failed: ${tally.failed}`);
}
142
+
143
/**
 * Look up a cached favicon by domain and size.
 * @param {string} domain - Domain to get favicon for
 * @param {number} [size=16] - Size of favicon to retrieve
 * @returns {Buffer|null} The stored favicon bytes, or null when no row matches
 */
function getFaviconFromDatabase(domain, size = 16) {
  const row = db
    .prepare("SELECT favicon FROM favicons WHERE domain = ? AND size = ?")
    .get(domain, size);
  if (!row) {
    return null;
  }
  return row.favicon;
}
154
+
155
/**
 * List every domain that has a favicon stored at the given size.
 * @param {number} [size=16] - Size filter
 * @returns {string[]} Domain names ordered by the database's `ORDER BY domain`
 */
function getDomainsWithFavicons(size = 16) {
  const rows = db
    .prepare("SELECT domain FROM favicons WHERE size = ? ORDER BY domain")
    .all(size);
  const names = [];
  for (const row of rows) {
    names.push(row.domain);
  }
  return names;
}
164
+
165
/**
 * Summarize the favicon cache.
 * @returns {object} `totalFavicons` (row count), `totalSizeBytes` (sum of blob
 *   lengths, 0 when empty), `totalSizeMB` (that sum in MiB as a 2-decimal
 *   string) and `sizeBreakdown` (`{ size, count }` rows ordered by size)
 */
function getDatabaseStats() {
  const totalFavicons = db.query("SELECT COUNT(*) as count FROM favicons").get().count;

  const sizeBreakdown = db.query(`
    SELECT size, COUNT(*) as count
    FROM favicons
    GROUP BY size
    ORDER BY size
  `).all();

  const byteSum = db.query(`
    SELECT SUM(LENGTH(favicon)) as totalBytes
    FROM favicons
  `).get().totalBytes;
  // SUM() over zero rows is NULL; report that as 0 bytes.
  const totalSizeBytes = byteSum || 0;

  const totalSizeMB = (totalSizeBytes / 1024 / 1024).toFixed(2);

  return { totalFavicons, totalSizeBytes, totalSizeMB, sizeBreakdown };
}
190
+
191
+ // Export functions for use as a module
192
+ export {
193
+ fetchAndStoreFavicons,
194
+ getFaviconFromDatabase,
195
+ getDomainsWithFavicons,
196
+ getDatabaseStats,
197
+ getFaviconForSingleDomain
198
+ };
199
+
200
+ // Close database connection gracefully on exit
201
+ process.on('SIGINT', () => {
202
+ console.log('\nClosing database connection...');
203
+ db.close();
204
+ process.exit(0);
205
+ });
206
+
207
+ // Run for top 1000 domains if this file is executed directly
208
+ if (import.meta.main) {
209
+ console.log('Starting favicon fetcher...');
210
+ await fetchAndStoreFavicons({ startIndex: 0, endIndex: 1000, delay: 200 });
211
+ console.log('\nDatabase stats:');
212
+ console.log(getDatabaseStats());
213
+ }
@@ -0,0 +1,170 @@
1
+ import fs from "fs";
2
+ import zlib from "zlib";
3
+ import readline from "readline";
4
+ import { writeFile } from 'fs/promises';
5
+ import { pipeline } from 'stream/promises';
6
+ import { Transform, Readable } from 'stream';
7
+ import unzipper from 'unzipper';
8
+ import grab from "grab-url";
9
+
10
+
11
+
12
+
13
/**
 * Domain Rank shows how trustworthy and influential a domain is based on links pointing to that
 * domain's pages across all 120+ million domains.
 *
 * Streams the gzip-compressed Common Crawl domain-ranks file, takes the fifth
 * tab-separated column of each data row (the reversed host name, e.g.
 * `com.example`), un-reverses it and writes the first `limit` domains to
 * `./data/domains-1m.js` as `export default 'a.com,b.com,...'`.
 *
 * Entries are separated by commas with no trailing separator. The promise
 * settles only after the output file has been flushed. Failures while
 * downloading or writing are logged rather than thrown; in that case the
 * output file may be incomplete.
 *
 * @see [CommonCrawl](https://commoncrawl.org/web-graphs)
 * @param {string} [urlCommonCrawl=''] - URL of a domain-ranks.txt.gz file; when
 *   empty it is discovered with getDomainCrawlerUrl() (whose errors propagate)
 * @param {number} [limit=1000000] - Maximum number of domains to write
 * @returns {Promise<void>}
 */
export async function importDomainsPageRankCrawler(urlCommonCrawl = '', limit = 1000000) {
  const url = urlCommonCrawl || await getDomainCrawlerUrl();
  const outputPath = "./data/domains-1m.js";

  let ticker;
  let writeStream;

  try {
    if (!fs.existsSync("./data")) fs.mkdirSync("./data", { recursive: true });
    try { fs.unlinkSync(outputPath); } catch (_) {}

    writeStream = fs.createWriteStream(outputPath, { flags: "w" });
    let lineNum = 0;
    let headerProcessed = false;
    writeStream.write("export default '");

    // Periodic progress report while the download is running.
    ticker = setInterval(() => {
      console.log(`Processed lines: ${lineNum.toLocaleString()}`);
      if (lineNum >= limit) clearInterval(ticker);
    }, 5000);

    await grab(url, {
      onStream: async (body) => {
        const decompressedStream = Readable.fromWeb(body).pipe(zlib.createGunzip());
        const rl = readline.createInterface({ input: decompressedStream, crlfDelay: Infinity });
        let streamClosed = false;

        // Terminate the string literal and finish the file exactly once.
        function closeStream() {
          if (streamClosed) return;
          streamClosed = true;
          writeStream.write("'");
          writeStream.end();
          clearInterval(ticker);
          console.log(`Output file: ./data/domains-1m.js`);
        }

        rl.on('line', (line) => {
          try {
            if (streamClosed) return;
            if (!headerProcessed) {
              if (line.includes("#host_rev")) { headerProcessed = true; return; }
            }
            const parts = line.split("\t");
            if (parts.length < 5) return;
            const domain = parts[4];
            if (!domain || domain === "#host_rev") return;
            // Separator goes BEFORE every entry but the first, so the list
            // never ends with a dangling comma.
            const separator = lineNum > 0 ? "," : "";
            writeStream.write(`${separator}${domain.split(".").reverse().join(".")}`);
            lineNum++;
            if (lineNum >= limit) { rl.close(); closeStream(); }
          } catch (error) {
            console.error("Error processing line:", error);
          }
        });

        await new Promise((resolve, reject) => {
          const fail = (err) => { clearInterval(ticker); reject(err); };
          // Resolve only once the file has actually been flushed.
          writeStream.on('finish', resolve);
          writeStream.on('error', fail);
          decompressedStream.on('error', fail);
          rl.on('error', fail);
          rl.on('close', closeStream);
        });
      }
    });

  } catch (error) {
    console.error("Error:", error);
  } finally {
    clearInterval(ticker);
    // If we failed before the file was finished, release the file handle.
    if (writeStream && !writeStream.writableEnded) writeStream.destroy();
  }
}
86
+
87
+
88
/**
 * Scrapes the Common Crawl web graphs page to find the domain-ranks.txt.gz URL
 * from the first date-like link on that page.
 *
 * Links are resolved the way a browser would resolve them: each href is
 * interpreted relative to the page it was found on, so absolute URLs,
 * root-relative paths and document-relative paths all work.
 *
 * @returns {Promise<string>} The full URL to the domain-ranks.txt.gz file
 * @throws {Error} When a page cannot be fetched as text, no date link is found,
 *   or no domain-ranks.txt.gz link is found
 */
export async function getDomainCrawlerUrl() {
  const indexUrl = 'https://commoncrawl.org/web-graphs';

  const mainHtml = await grab(indexUrl, { timeout: 10 });
  if (typeof mainHtml !== 'string') throw new Error(`Could not load ${indexUrl}`);

  // First href containing something like "2024-may".
  const dateMatch = mainHtml.match(/href="([^"]*\d{4}-\w+[^"]*)"/);
  if (!dateMatch) throw new Error('No date link found');
  const dateUrl = new URL(dateMatch[1], indexUrl).href;

  const dateHtml = await grab(dateUrl, { timeout: 10 });
  if (typeof dateHtml !== 'string') throw new Error(`Could not load ${dateUrl}`);

  const rankMatch = dateHtml.match(/href="([^"]*domain-ranks\.txt\.gz[^"]*)"/);
  if (!rankMatch) throw new Error('domain-ranks.txt.gz not found');

  return new URL(rankMatch[1], dateUrl).href;
}
116
+
117
+
118
+
119
/**
 * Download and extract the current Tranco top-1M domain ranking.
 *
 * The Tranco project aggregates multiple ranking providers (Cisco Umbrella,
 * Majestic, Farsight, Chrome UX Report, Cloudflare Radar) to generate
 * manipulation-resistant popularity lists. The list is updated daily (UTC).
 *
 * Source: https://tranco-list.eu/
 * Default dataset: https://tranco-list.eu/top-1m.csv.zip
 *
 * The zip is streamed and its single CSV entry is parsed line by line
 * (`rank,domain`). Parsing stops collecting once `limit` domains are found
 * and the result is written to ./data/domains-official-100k.js as
 * `export default 'a.com,b.com,...';`.
 *
 * @param {number} [limit=10000] - Maximum number of domains to keep
 * @returns {Promise<void>}
 */
async function importDomainsOfficialList(limit = 10000) {
  const url = 'https://tranco-list.eu/top-1m.csv.zip';
  const output = './data/domains-official-100k.js';

  console.log(`Streaming download and extraction (limit: ${limit.toLocaleString()})...`);

  const domains = [];
  let pending = '';        // incomplete trailing line carried over to the next chunk
  let isFirstLine = true;
  let done = false;

  const consumeLine = (line) => {
    const [rank, rawDomain] = line.split(',');
    if (isFirstLine) {
      isFirstLine = false;
      // Skip the first line only when it is a header, i.e. its first column
      // is not a number. NOTE(review): the published CSV appears to start
      // directly with "1,<domain>" — confirm against the live file.
      if (!/^\s*"?\d+"?\s*$/.test(rank)) return;
    }
    const domain = rawDomain?.replace(/"/g, '').trim();
    if (domain) domains.push(domain);
    if (domains.length >= limit) done = true;
  };

  await grab(url, {
    onStream: (body) => pipeline(
      Readable.fromWeb(body),
      unzipper.ParseOne(),
      new Transform({
        objectMode: false,
        transform(chunk, encoding, callback) {
          if (done) return callback();
          // Chunks end at arbitrary byte offsets: only complete lines are
          // parsed now, the unfinished tail waits for the next chunk.
          const lines = (pending + chunk.toString()).split('\n');
          pending = lines.pop() ?? '';
          for (const line of lines) {
            consumeLine(line);
            if (done) { pending = ''; break; }
          }
          callback();
        },
        flush(callback) {
          // Last line of a file that does not end with a newline.
          if (!done && pending) consumeLine(pending);
          callback();
        }
      })
    )
  });

  await writeFile(output, `export default '${domains.join(',')}';`);
  console.log(`Saved ${domains.length} domains to ${output}`);
}
163
+
164
+
165
+
166
// If run directly: build the official (Tranco) list. A failure is reported and
// reflected in the exit code instead of becoming an unhandled rejection.
if (import.meta.main) {
  // importDomainsPageRankCrawler();
  importDomainsOfficialList(10000).catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
}
@@ -0,0 +1,109 @@
1
+ import fs from "fs";
2
+
3
/** Source-file entry: `[rank, title]`. */
type OldDomainEntry = [number, string];

/** In-memory merged entry for one domain. */
type OutputEntry = {
  /** Display name: news title, else general title, else the domain itself. */
  name?: string;
  domainRank?: number;
  domainTitle?: string;
  newsRank?: number;
  newsTitle?: string;
  langCode?: string;
};

type DomainMap = Record<string, OutputEntry>;

/**
 * Merge the general domain list with the curated news list.
 *
 * Domains from the general list come first and keep their rank; when the same
 * domain is also in the news list, its news rank/title are attached and the
 * news title becomes the display name. Domains that only exist in the news
 * list are appended with ranks continuing after the highest general rank.
 * A missing input file is treated as an empty list.
 *
 * The result is written to `outputPath` as JSON mapping each domain to
 * `[name, domainRank, domainTitle, newsRank, newsTitle, langCode]`, using ""
 * for missing text, 0 for missing ranks, and "" for a title that merely
 * repeats `name`.
 *
 * @returns The merged map in its object form (not the array form on disk).
 */
export function mergeDomainLists(options: {
  domainInfoPath?: string;
  newsDomainRankPath?: string;
  outputPath?: string;
} = {}): DomainMap {
  const {
    domainInfoPath = "./data/domain-info.json",
    newsDomainRankPath = "./data/news-domain-rank.json",
    outputPath = "./data/domain-rank-merged.json",
  } = options;

  const loadList = (file: string): Record<string, OldDomainEntry> =>
    fs.existsSync(file) ? JSON.parse(fs.readFileSync(file, "utf8")) : {};
  const numberOrUndefined = (value: unknown): number | undefined =>
    typeof value === "number" ? value : undefined;
  function orDefault<T>(value: T | undefined, fallback: T): T {
    return value === undefined ? fallback : value;
  }

  const generalList = loadList(domainInfoPath);
  const newsList = loadList(newsDomainRankPath);

  const merged: DomainMap = {};
  let highestRank = 0;

  // General list first, enriched with news data where available.
  Object.entries(generalList).forEach(([domain, generalEntry]) => {
    const [rank, title] = generalEntry || [undefined, undefined];
    const newsEntry = newsList[domain];
    const newsTitle = newsEntry ? newsEntry[1] : undefined;

    const record: OutputEntry = {
      name: newsTitle || title || domain,
      domainRank: numberOrUndefined(rank),
      domainTitle: title || undefined,
      newsRank: numberOrUndefined(newsEntry ? newsEntry[0] : undefined),
      newsTitle: newsTitle || undefined,
      langCode: undefined,
    };
    merged[domain] = record;
    highestRank = Math.max(highestRank, record.domainRank || 0);
  });

  // News-only domains continue the numbering after the general list.
  let nextRank = highestRank + 1;
  Object.entries(newsList).forEach(([domain, newsEntry]) => {
    if (merged[domain]) return;
    const [newsRank, newsTitle] = newsEntry || [undefined, undefined];
    merged[domain] = {
      name: newsTitle || domain,
      domainRank: nextRank,
      newsRank: numberOrUndefined(newsRank),
      newsTitle: newsTitle || undefined,
      langCode: undefined,
    };
    nextRank += 1;
  });

  // Compact on-disk form: one fixed-order 6-element array per domain.
  const outputObj: Record<string, Array<string | number>> = {};
  Object.entries(merged).forEach(([domain, entry]) => {
    // A title identical to the display name is stored as "" to avoid repetition.
    const unlessSameAsName = (title: string | undefined): string =>
      title === undefined || title === entry.name ? "" : title;

    outputObj[domain] = [
      orDefault<string | number>(entry.name, ""),
      orDefault<string | number>(entry.domainRank, 0),
      unlessSameAsName(entry.domainTitle),
      orDefault<string | number>(entry.newsRank, 0),
      unlessSameAsName(entry.newsTitle),
      orDefault<string | number>(entry.langCode, ""),
    ];
  });

  fs.writeFileSync(outputPath, JSON.stringify(outputObj), "utf8");

  const generalCount = Object.keys(generalList).length;
  const newsCount = Object.keys(newsList).length;
  const totalCount = Object.keys(merged).length;
  console.log(
    `Merged ${generalCount} general + ${newsCount} news entries → ${totalCount} total → ${outputPath}`
  );

  return merged;
}
107
+
108
// Run when called directly: bun src/merge-domain-lists.ts
// Guarded so that importing mergeDomainLists from another module does not
// read and rewrite the data files as a side effect.
if (import.meta.main) {
  mergeDomainLists();
}
@@ -0,0 +1,99 @@
1
+ import domainsOfficial from "../data/domains-official-1m.js";
2
+ import fs from "fs";
3
+ import {
4
+ shouldRemoveDomain,
5
+ findMainDomain,
6
+ getTitleOverride,
7
+ formatDomainAsTitle,
8
+ cleanSourceTitle,
9
+ getSourceTitle,
10
+ } from "./domain-name-formatter.js";
11
+
12
/** On-disk result format: domain -> [rank, display title]. */
type DomainMap = Record<string, [number, string]>;

interface DomainInfoOptions {
  /** First index of the official list to process (default 0). */
  startIndex?: number;
  /** Exclusive end index, clamped to the list length (default 1000). */
  endIndex?: number;
  /** Continue after the last domain already present in the results file. */
  resume?: boolean;
}

/**
 * Build `./data/domain-info.json`: a map from domain to `[rank, title]` for a
 * slice of the official domain list.
 *
 * Domains flagged by `shouldRemoveDomain`, or for which `findMainDomain`
 * returns a main domain, are skipped and do not consume a rank. The title is
 * the override from `getTitleOverride` when there is one; otherwise the
 * formatted domain name, replaced by the cleaned web title from
 * `getSourceTitle` when that title has fewer than three words.
 *
 * With `resume`, processing continues after the last key of the existing
 * results file. Without `resume` and with `startIndex === 0`, existing results
 * are discarded. Results are checkpointed to disk every 100 stored domains and
 * once more when the loop ends or throws, rather than rewriting the whole
 * file for every domain.
 */
export async function domainInfo(options: DomainInfoOptions = {}): Promise<void> {
  const { startIndex = 0, endIndex = 1000, resume = false } = options;

  const domainsArray = domainsOfficial.split(",");
  const actualEndIndex = Math.min(endIndex, domainsArray.length);
  const dataPath = "./data/domain-info.json";
  const checkpointEvery = 100;

  let domainResults: DomainMap = {};
  let resumeFromIndex = startIndex;

  if (fs.existsSync(dataPath)) {
    domainResults = JSON.parse(fs.readFileSync(dataPath, "utf8"));

    if (resume) {
      const existing = Object.keys(domainResults);
      if (existing.length > 0) {
        const lastDomain = existing[existing.length - 1];
        const lastIndex = domainsArray.indexOf(lastDomain);
        if (lastIndex !== -1) {
          resumeFromIndex = lastIndex + 1;
          console.log(`Resuming from domain: ${lastDomain} (index ${resumeFromIndex})`);
        }
      }
    }
  } else {
    fs.writeFileSync(dataPath, "{}", "utf8");
  }

  if (!resume && startIndex === 0) {
    domainResults = {};
    fs.writeFileSync(dataPath, "{}", "utf8");
  }

  console.log(`Processing domains from index ${resumeFromIndex} to ${actualEndIndex - 1}`);

  // Ranks continue from the number of domains already stored.
  let actualRank = Object.keys(domainResults).length;

  let unsaved = 0;
  const saveResults = (): void => {
    fs.writeFileSync(dataPath, JSON.stringify(domainResults), "utf8");
    unsaved = 0;
  };

  try {
    for (let i = resumeFromIndex; i < actualEndIndex; i++) {
      const domain = domainsArray[i];

      if (shouldRemoveDomain(domain)) {
        console.log(`Skipping ${i + 1}: ${domain} (marked for removal)`);
        continue;
      }

      const mainDomain = findMainDomain(domain);
      if (mainDomain) {
        console.log(`Skipping ${i + 1}: ${domain} (alternative for ${mainDomain})`);
        continue;
      }

      actualRank++;
      console.log(`Processing ${actualRank}: ${domain}`);

      // Looked up once; assumes getTitleOverride is a pure lookup — TODO confirm.
      const override: string | null = getTitleOverride(domain);
      let source: string | null = override;

      if (!override) {
        source = formatDomainAsTitle(domain);

        const webTitle = await getSourceTitle(domain);
        if (webTitle) {
          const cleaned = cleanSourceTitle(webTitle)
            ?.replace(/homepage/gi, "")
            .replace(/home/gi, "")
            .replace(".com", "")
            .trim(); // the removals above can leave stray surrounding spaces
          if (cleaned && cleaned.length > 0) {
            const wordCount = cleaned.split(/\s+/).filter((w) => w.length > 0).length;
            // Only short titles are accepted as a display name.
            if (wordCount < 3) source = cleaned;
          }
        }
      }

      domainResults[domain] = [actualRank, source || domain];
      unsaved++;
      if (unsaved >= checkpointEvery) saveResults();
    }
  } finally {
    // Persist whatever has been collected since the last checkpoint.
    if (unsaved > 0) saveResults();
  }
}
98
+
99
// Start the full run over the official list, resuming from previous results.
// Failures are reported and reflected in the exit code instead of surfacing
// as an unhandled promise rejection.
domainInfo({ startIndex: 0, endIndex: 1000000, resume: true }).catch((error) => {
  console.error(error);
  process.exitCode = 1;
});
@@ -0,0 +1,13 @@
1
+ import {test, expect} from 'vitest';
2
+ import {importTopDomains} from '../src/download-1m'
3
+
4
// Smoke test: the import resolves with a defined value within 40 seconds.
// The import is awaited directly; an extra fixed 40 s sleep would always
// exhaust the 40 s test timeout.
test('import top domains', async () => {
  const result = await importTopDomains();

  console.log(result);
  expect(result).toBeDefined();
}, 40000)