domain-rank 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/exports/domain-rank.csv +10021 -0
- package/exports/domain-rank.json +70142 -0
- package/exports/domain-rank.ndjson +10020 -0
- package/package.json +33 -0
- package/packages/domain-rank/exports/domain-rank.csv +10021 -0
- package/packages/domain-rank/exports/domain-rank.json +70142 -0
- package/packages/domain-rank/exports/domain-rank.ndjson +10020 -0
- package/readme.md +33 -0
- package/src/domain-api.ts +79 -0
- package/src/domain-exceptions.ts +24 -0
- package/src/domain-name-formatter.ts +136 -0
- package/src/duplicates.d.ts +3 -0
- package/src/duplicates.js +413 -0
- package/src/export.ts +98 -0
- package/src/favicons.js +213 -0
- package/src/import-domains-1m.js +170 -0
- package/src/merge-domain-lists.ts +109 -0
- package/src/parse-domain-info.ts +99 -0
- package/test/domain.test.js +13 -0
- package/test/search.test.js +360 -0
- package/tsconfig.json +19 -0
- package/vite.config.ts +18 -0
package/src/export.ts
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import fs from 'fs/promises';
|
|
2
|
+
import { createWriteStream } from 'fs';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
|
|
5
|
+
/** Output formats the exporter can produce. */
type Format = 'json' | 'csv' | 'ndjson';

/** Formats written when no `--formats=` argument is given. */
const defaultFormats: Array<Format> = ['json', 'csv', 'ndjson'];
|
|
8
|
+
|
|
9
|
+
/**
 * Load the exporter inputs from the `data` directory that sits next to this
 * source folder.
 *
 * `domain-rank-merged.json` is required: a read or parse failure propagates to
 * the caller. `domain-info.json` is optional: when it cannot be loaded, `info`
 * stays an empty object so the export still proceeds.
 *
 * @returns The parsed merged ranking and the (possibly empty) info lookup.
 */
async function loadData() {
  const base = path.resolve(__dirname, '../data');
  const mergedPath = path.join(base, 'domain-rank-merged.json');
  const infoPath = path.join(base, 'domain-info.json');
  const merged = JSON.parse(await fs.readFile(mergedPath, 'utf8'));
  let info = {};
  try {
    info = JSON.parse(await fs.readFile(infoPath, 'utf8'));
  } catch (e) {
    // A missing file is the expected "no info available" case. Anything else
    // (malformed JSON, permission problems) is still tolerated, but reported so
    // an export without info does not go unnoticed.
    const code = (e as { code?: unknown } | null)?.code;
    if (code !== 'ENOENT') {
      console.warn(`Could not load ${infoPath}; exporting without domain info:`, e);
    }
  }
  return { merged, info };
}
|
|
22
|
+
|
|
23
|
+
/**
 * Serialize records to CSV text.
 *
 * Columns are taken from the keys of the first record. Every value cell is
 * double-quoted with embedded quotes doubled; non-string values are rendered
 * with `JSON.stringify`, and `null`/`undefined` become an empty cell.
 * Header cells are left bare unless they contain a quote, comma or line break,
 * in which case they are quoted the same way as values.
 *
 * @param records Rows to serialize.
 * @returns CSV text without a trailing newline, or `''` for an empty input.
 */
function toCSV(records: any[]) {
  if (!records.length) return '';
  const keys = Object.keys(records[0]);
  const quote = (text: string) => '"' + text.replace(/"/g, '""') + '"';
  // Plain header names stay unquoted so existing output is unchanged.
  const header = keys.map(k => (/[",\r\n]/.test(k) ? quote(k) : k)).join(',');
  const lines = [header];
  for (const r of records) {
    lines.push(keys.map(k => {
      const v = r[k] ?? '';
      // JSON.stringify yields undefined for functions/symbols; fall back to
      // String() so the row can still be written.
      const s = typeof v === 'string' ? v : JSON.stringify(v) ?? String(v);
      return quote(s);
    }).join(','));
  }
  return lines.join('\n');
}
|
|
36
|
+
|
|
37
|
+
/**
 * Write `data` as pretty-printed (2-space) JSON to `<outDir>/<name>.json`,
 * creating `outDir` first when necessary.
 */
async function writeJSON(outDir: string, name: string, data: any) {
  await fs.mkdir(outDir, { recursive: true });
  const target = path.join(outDir, name + '.json');
  const payload = JSON.stringify(data, null, 2);
  await fs.writeFile(target, payload, 'utf8');
}
|
|
41
|
+
|
|
42
|
+
/**
 * Write `arr` as newline-delimited JSON to `<outDir>/<name>.ndjson`, one
 * `JSON.stringify`-ed item per line, creating `outDir` first when necessary.
 *
 * Resolves once the file stream has finished flushing and rejects if the
 * stream reports an error (for example an unwritable path).
 */
async function writeNDJSON(outDir: string, name: string, arr: any[]) {
  await fs.mkdir(outDir, { recursive: true });
  const fp = path.join(outDir, name + '.ndjson');
  const stream = createWriteStream(fp, { encoding: 'utf8' });

  // Register both listeners before writing: without an 'error' listener a
  // failed write is an uncaught exception, and 'finish' alone would leave the
  // caller waiting forever.
  const settled = new Promise<void>((resolve, reject) => {
    stream.once('error', reject);
    stream.once('finish', () => resolve());
  });

  for (const item of arr) {
    stream.write(JSON.stringify(item) + '\n');
  }
  stream.end();
  await settled;
}
|
|
52
|
+
|
|
53
|
+
/**
 * Render `arr` with `toCSV` and write it to `<outDir>/<name>.csv`, creating
 * `outDir` first when necessary.
 */
async function writeCSVFile(outDir: string, name: string, arr: any[]) {
  await fs.mkdir(outDir, { recursive: true });
  const text = toCSV(arr);
  const destination = path.join(outDir, name + '.csv');
  await fs.writeFile(destination, text, 'utf8');
}
|
|
58
|
+
|
|
59
|
+
/**
 * Build flat export records from merged ranking entries.
 *
 * Each record carries `rank`, `domain`, `source`, `score` and `info`; any
 * field missing on the entry is `null`. `domain` falls back to the entry's
 * `name`, and the same resolved value is used as the key into `info`.
 *
 * @param merged Ranking entries to convert.
 * @param info Lookup from domain to extra information.
 * @returns One record per entry, in input order.
 */
function makeRecords(merged: any[], info: Record<string, any>) {
  return merged.map((m: any) => {
    const domain = m.domain ?? m.name ?? null;
    // Look up by the resolved domain (not only `m.domain`) and only accept own
    // keys, so names such as "constructor" cannot pick up inherited members.
    const known = domain !== null && Object.prototype.hasOwnProperty.call(info, domain);
    return {
      rank: m.rank ?? null,
      domain,
      source: m.source ?? null,
      score: m.score ?? null,
      info: known ? info[domain] ?? null : null
    };
  });
}
|
|
68
|
+
|
|
69
|
+
/** Read the value of a `--<name>=<value>` argument; everything after the first `=` is the value. */
function readCliOption(args: string[], name: string): string | undefined {
  const prefix = `--${name}=`;
  return args.find(a => a.startsWith(prefix))?.slice(prefix.length);
}

/**
 * Turn the parsed merged file into a list of entries for `makeRecords`.
 *
 * An array is used as-is. For a keyed object the key is kept as the entry's
 * `domain` whenever the entry does not already provide `domain` or `name`,
 * instead of being discarded.
 */
function rowsFromMerged(merged: unknown): any[] {
  if (Array.isArray(merged)) return merged;
  if (merged === null || typeof merged !== 'object') return [];
  return Object.entries(merged as Record<string, unknown>).map(([key, value]) => {
    if (Array.isArray(value)) {
      // NOTE(review): assumes the tuple layout written by merge-domain-lists,
      // [name, domainRank, domainTitle, newsRank, newsTitle, langCode] — confirm.
      return { domain: key, rank: typeof value[1] === 'number' ? value[1] : null };
    }
    if (value !== null && typeof value === 'object') {
      const entry = value as Record<string, unknown>;
      return { ...entry, domain: entry.domain ?? entry.name ?? key };
    }
    return { domain: key };
  });
}

/**
 * CLI entry point: load the data files and write the requested exports.
 *
 * Arguments:
 * - `--formats=json,csv,ndjson` — comma-separated formats (default: all three);
 *   unsupported names are reported and skipped.
 * - `--out=<dir>` — output directory, resolved against the current working
 *   directory (default: `exports`).
 */
async function main() {
  const args = process.argv.slice(2);
  const formatsArg = readCliOption(args, 'formats');
  const outArg = readCliOption(args, 'out');

  const supported: Format[] = ['json', 'csv', 'ndjson'];
  const requested: string[] = formatsArg
    ? formatsArg.split(',').map(f => f.trim()).filter(f => f.length > 0)
    : defaultFormats;
  const formats = requested.filter((f): f is Format => supported.includes(f as Format));
  const rejected = requested.filter(f => !supported.includes(f as Format));
  if (rejected.length) {
    console.warn(`Ignoring unsupported format(s): ${rejected.join(', ')}`);
  }

  const outDir = path.resolve(process.cwd(), outArg || 'exports');

  const { merged, info } = await loadData();
  const records = makeRecords(rowsFromMerged(merged), info || {});

  if (formats.includes('json')) {
    await writeJSON(outDir, 'domain-rank', records);
    console.log('Wrote JSON to', outDir);
  }
  if (formats.includes('ndjson')) {
    await writeNDJSON(outDir, 'domain-rank', records);
    console.log('Wrote NDJSON to', outDir);
  }
  if (formats.includes('csv')) {
    await writeCSVFile(outDir, 'domain-rank', records);
    console.log('Wrote CSV to', outDir);
  }
}
|
|
92
|
+
|
|
93
|
+
// Run the exporter only when this file is the entry script, not when it is
// imported; any failure is printed and turned into exit code 1.
if (require.main === module) {
  main().then(undefined, (failure) => {
    console.error(failure);
    process.exit(1);
  });
}
|
package/src/favicons.js
ADDED
|
@@ -0,0 +1,213 @@
|
|
|
1
|
+
import { Database } from "bun:sqlite";
|
|
2
|
+
import domains from "../data/domains-1m.js";
|
|
3
|
+
|
|
4
|
+
// Shared SQLite handle used by every function in this module.
const db = new Database("favicons.sqlite");

// Create table if it doesn't exist.
// Uniqueness is per (domain, size): lookups in this module filter on both
// columns, and with a domain-only constraint INSERT OR REPLACE for a second
// size would delete the row stored for the first one.
// NOTE: IF NOT EXISTS leaves an already-created table (old constraint) untouched.
db.run(`
  CREATE TABLE IF NOT EXISTS favicons (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    domain TEXT,
    favicon BLOB,
    size INTEGER,
    created_at DATETIME DEFAULT CURRENT_TIMESTAMP,
    UNIQUE (domain, size)
  )
`);
|
|
16
|
+
|
|
17
|
+
/**
 * Get favicon for a single domain using Google's favicon API.
 *
 * The whole request, including reading the response body, is aborted after
 * 10 seconds. Failures are logged and reported as `null` rather than thrown.
 *
 * @param {string} domain - Domain name to fetch favicon for
 * @param {number} [size=16] - Size of the favicon in pixels
 * @returns {Promise<Buffer|null>} Favicon bytes, or null when the request
 *   fails, the response is not OK, or the payload is shorter than 100 bytes
 */
async function getFaviconForSingleDomain(domain, size = 16) {
  const controller = new AbortController();
  const timeoutId = setTimeout(() => controller.abort(), 10000); // 10 second timeout

  try {
    const response = await fetch(
      'https://www.google.com/s2/favicons?domain=' + encodeURIComponent(domain) + '&sz=' + size,
      {
        signal: controller.signal,
        headers: {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
      }
    );

    if (!response.ok) {
      return null;
    }

    const buffer = Buffer.from(await response.arrayBuffer());

    // Payloads under 100 bytes are treated as "no real icon" (likely a default favicon).
    if (buffer.length < 100) {
      return null;
    }

    return buffer;
  } catch (error) {
    console.log(`Could not fetch favicon for ${domain}: ${error?.message ?? String(error)}`);
    return null;
  } finally {
    // Always disarm the timer: on success it is no longer needed, and on
    // failure a pending timer would keep the process alive for up to 10s.
    clearTimeout(timeoutId);
  }
}
|
|
59
|
+
|
|
60
|
+
/**
 * Download favicons for a slice of the bundled domain list and store them in
 * the SQLite database.
 *
 * Domains that already have a row for the requested size are skipped without
 * a request or a pause; every attempted download is followed by `delay` ms.
 *
 * @param {object} options - Options for fetching favicons
 * @param {number} options.startIndex - Start index in domains list (default: 0)
 * @param {number} options.endIndex - Exclusive end index, clamped to the list length (default: 1000)
 * @param {number} options.size - Size of favicons in pixels (default: 16)
 * @param {number} options.delay - Delay between requests in ms (default: 200)
 * @returns {Promise<void>} Resolves once the whole slice has been processed
 */
async function fetchAndStoreFavicons(options = {}) {
  const { startIndex = 0, endIndex = 1000, size = 16, delay = 200 } = options;

  const allDomains = domains.split(",");
  const stopIndex = Math.min(endIndex, allDomains.length);

  // Statements are prepared once and reused for every domain.
  const insertStmt = db.prepare(
    "INSERT OR REPLACE INTO favicons (domain, favicon, size) VALUES (?, ?, ?)"
  );
  const existsStmt = db.prepare(
    "SELECT id FROM favicons WHERE domain = ? AND size = ?"
  );
  const countFavicons = () => db.query("SELECT COUNT(*) as count FROM favicons").get().count;

  console.log(`Starting with ${countFavicons()} favicons in database`);
  console.log(`Fetching favicons for domains ${startIndex} to ${stopIndex - 1}`);
  console.log(`Target: ${stopIndex - startIndex} domains`);

  let stored = 0;
  let skipped = 0;
  let failed = 0;

  for (let index = startIndex; index < stopIndex; index += 1) {
    const domain = allDomains[index];
    const ordinal = index + 1;

    if (existsStmt.get(domain, size)) {
      console.log(`Skipping ${ordinal}: ${domain} (already exists)`);
      skipped += 1;
      continue;
    }

    console.log(`Fetching ${ordinal}: ${domain}`);
    const icon = await getFaviconForSingleDomain(domain, size);

    if (!icon) {
      failed += 1;
      console.log(`✗ Failed to download favicon for ${domain}`);
    } else {
      insertStmt.run(domain, icon, size);
      stored += 1;
      console.log(`✓ Stored favicon for ${domain} (${icon.length} bytes)`);

      // Report overall progress after every 20th stored icon.
      if (stored % 20 === 0) {
        const totalSoFar = countFavicons();
        console.log(`Progress: ${stored} new favicons downloaded, ${totalSoFar} total in database`);
      }
    }

    // Pause between attempts so the remote API is not hammered.
    if (delay > 0) {
      await new Promise(resolve => setTimeout(resolve, delay));
    }
  }

  const finalCount = countFavicons();

  console.log(`\nCompleted fetching favicons!`);
  console.log(`Total favicons in database: ${finalCount}`);
  console.log(`New favicons: ${stored}`);
  console.log(`Skipped: ${skipped}`);
  console.log(`Failed: ${failed}`);
}
|
|
142
|
+
|
|
143
|
+
/**
 * Look up a stored favicon.
 * @param {string} domain - Domain to get favicon for
 * @param {number} [size=16] - Size of favicon to retrieve
 * @returns {Buffer|null} The stored `favicon` column, or null when no row matches
 */
function getFaviconFromDatabase(domain, size = 16) {
  const row = db
    .prepare("SELECT favicon FROM favicons WHERE domain = ? AND size = ?")
    .get(domain, size);
  if (!row) {
    return null;
  }
  return row.favicon;
}
|
|
154
|
+
|
|
155
|
+
/**
 * List the domains that have a favicon stored for the given size.
 * @param {number} [size=16] - Size filter
 * @returns {string[]} Domain names, ordered by domain
 */
function getDomainsWithFavicons(size = 16) {
  const rows = db
    .prepare("SELECT domain FROM favicons WHERE size = ? ORDER BY domain")
    .all(size);
  const names = [];
  for (const row of rows) {
    names.push(row.domain);
  }
  return names;
}
|
|
164
|
+
|
|
165
|
+
/**
 * Summarize the favicon table.
 * @returns {object} `totalFavicons` (row count), `totalSizeBytes` (summed blob
 *   length, 0 when empty), `totalSizeMB` (two-decimal string) and
 *   `sizeBreakdown` (row count per size, ordered by size)
 */
function getDatabaseStats() {
  const totalFavicons = db.query("SELECT COUNT(*) as count FROM favicons").get().count;

  const sizeBreakdown = db.query(`
    SELECT size, COUNT(*) as count
    FROM favicons
    GROUP BY size
    ORDER BY size
  `).all();

  // SUM() over an empty table is NULL, hence the fallback to 0.
  const totalSizeBytes = db.query(`
    SELECT SUM(LENGTH(favicon)) as totalBytes
    FROM favicons
  `).get().totalBytes || 0;

  const totalSizeMB = (totalSizeBytes / 1024 / 1024).toFixed(2);

  return { totalFavicons, totalSizeBytes, totalSizeMB, sizeBreakdown };
}
|
|
190
|
+
|
|
191
|
+
// Export functions for use as a module
|
|
192
|
+
export {
|
|
193
|
+
fetchAndStoreFavicons,
|
|
194
|
+
getFaviconFromDatabase,
|
|
195
|
+
getDomainsWithFavicons,
|
|
196
|
+
getDatabaseStats,
|
|
197
|
+
getFaviconForSingleDomain
|
|
198
|
+
};
|
|
199
|
+
|
|
200
|
+
// On Ctrl+C: announce, close the SQLite handle, then exit with status 0.
process.on('SIGINT', function handleInterrupt() {
  console.log('\nClosing database connection...');
  db.close();
  process.exit(0);
});
|
|
206
|
+
|
|
207
|
+
// Run for top 1000 domains if this file is executed directly
if (import.meta.main) {
  console.log('Starting favicon fetcher...');
  try {
    await fetchAndStoreFavicons({ startIndex: 0, endIndex: 1000, delay: 200 });
    console.log('\nDatabase stats:');
    console.log(getDatabaseStats());
  } finally {
    // Release the SQLite handle whether the run completed or threw.
    db.close();
  }
}
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import zlib from "zlib";
|
|
3
|
+
import readline from "readline";
|
|
4
|
+
import { writeFile } from 'fs/promises';
|
|
5
|
+
import { pipeline } from 'stream/promises';
|
|
6
|
+
import { Transform, Readable } from 'stream';
|
|
7
|
+
import unzipper from 'unzipper';
|
|
8
|
+
import grab from "grab-url";
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
/**
 * Domain Rank shows how trustworthy and influential a domain is based on links pointing to that
 * domain's pages across all 120+ million domains.
 *
 * @see [CommonCrawl](https://commoncrawl.org/web-graphs)
 * CommonCrawl is a nonprofit for open source public dataset that crawls and downloads the entire
 * internet 100TB urls and html. CommonCrawl calculates domain rank for 100M domains, using
 * PageRank algorithm which randomly surfs links and counts travels to each page to find
 * probability of being at a domain, thus ranking influence among other reputable domains.
 *
 * Streams the gzipped, tab-separated ranking file, takes the fifth column of
 * each data row, reverses its dot-separated labels and writes the results as
 * a comma-separated string to `./data/domains-1m.js`
 * (`export default '<a>,<b>,...'`). Stops after `limit` domains.
 * Failures while streaming are logged, not thrown; the output file is still
 * terminated so it stays loadable.
 *
 * @param {string} [urlCommonCrawl=''] - Ranking file URL; discovered via getDomainCrawlerUrl() when empty
 * @param {number} [limit=1000000] - Maximum number of domains to write
 * @returns {Promise<void>}
 */
export async function importDomainsPageRankCrawler(urlCommonCrawl = '', limit = 1000000) {
  const url = urlCommonCrawl || await getDomainCrawlerUrl();

  let writeStream = null;
  let ticker = null;
  let streamClosed = false;

  // Terminate the string literal and close the file exactly once.
  function closeStream(announce) {
    if (streamClosed || !writeStream) return;
    streamClosed = true;
    writeStream.write("'");
    writeStream.end();
    clearInterval(ticker);
    if (announce) console.log(`Output file: ./data/domains-1m.js`);
  }

  try {
    if (!fs.existsSync("./data")) fs.mkdirSync("./data", { recursive: true });
    try { fs.unlinkSync("./data/domains-1m.js"); } catch (_) {}

    writeStream = fs.createWriteStream("./data/domains-1m.js", { flags: "w" });
    let lineNum = 0;
    let headerProcessed = false;
    writeStream.write("export default '");

    ticker = setInterval(() => {
      console.log(`Processed lines: ${lineNum.toLocaleString()}`);
      if (lineNum >= limit) clearInterval(ticker);
    }, 5000);

    await grab(url, {
      onStream: async (body) => {
        const source = Readable.fromWeb(body);
        const decompressedStream = source.pipe(zlib.createGunzip());
        const rl = readline.createInterface({ input: decompressedStream, crlfDelay: Infinity });

        rl.on('line', (line) => {
          try {
            if (streamClosed) return;
            if (!headerProcessed) {
              if (line.includes("#host_rev")) { headerProcessed = true; return; }
            }
            const parts = line.split("\t");
            if (parts.length < 5) return;
            const domain = parts[4];
            if (!domain || domain === "#host_rev") return;
            // Emit the separator BEFORE every entry but the first, so there is
            // no trailing comma to remove afterwards.
            const separator = lineNum === 0 ? "" : ",";
            writeStream.write(`${separator}${domain.split(".").reverse().join(".")}`);
            lineNum++;
            if (lineNum >= limit) { rl.close(); closeStream(true); }
          } catch (error) {
            console.error("Error processing line:", error);
          }
        });

        await new Promise((resolve, reject) => {
          rl.on('close', () => { closeStream(true); resolve(); });
          rl.on('error', reject);
          // pipe() does not forward errors, so each stage needs its own listener.
          source.on('error', reject);
          decompressedStream.on('error', reject);
          writeStream.on('error', reject);
        });
      }
    });

  } catch (error) {
    console.error("Error:", error);
  } finally {
    // Stop the progress ticker and close the file even when streaming failed.
    clearInterval(ticker);
    closeStream(false);
  }
}
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
/**
 * Scrapes Common Crawl web graphs page to find the domain-ranks.txt.gz URL
 * from the first available date listing.
 *
 * The first `href` containing a `YYYY-word` segment is followed, and the first
 * `href` containing `domain-ranks.txt.gz` on that page is returned. Relative
 * links are resolved against the page they were found on.
 *
 * @returns {Promise<string>} The full URL to the domain-ranks.txt.gz file
 * @throws {Error} When HTTP requests fail, date links are not found, or domain-ranks.txt.gz is not found
 */
export async function getDomainCrawlerUrl() {
  const indexUrl = 'https://commoncrawl.org/web-graphs';

  // Links starting with "http" are returned untouched; anything else is
  // resolved with URL semantics instead of string concatenation.
  const resolveHref = (href, pageUrl) =>
    href.startsWith('http') ? href : new URL(href, pageUrl).href;

  const mainHtml = await grab(indexUrl, { timeout: 10 });

  const datePattern = /href="([^"]*\d{4}-\w+[^"]*)"/;
  const dateMatch = mainHtml.match(datePattern);
  if (!dateMatch) throw new Error('No date link found');

  const dateUrl = resolveHref(dateMatch[1], indexUrl);

  const dateHtml = await grab(dateUrl, { timeout: 10 });

  const rankPattern = /href="([^"]*domain-ranks\.txt\.gz[^"]*)"/;
  const rankMatch = dateHtml.match(rankPattern);
  if (!rankMatch) throw new Error('domain-ranks.txt.gz not found');

  return resolveHref(rankMatch[1], dateUrl);
}
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
|
|
119
|
+
/**
 * Download and extract the current Tranco top-1M domain ranking.
 *
 * The Tranco project aggregates multiple ranking providers (Cisco Umbrella,
 * Majestic, Farsight, Chrome UX Report, Cloudflare Radar) to generate
 * manipulation-resistant popularity lists. The list is updated daily (UTC).
 *
 * Source: https://tranco-list.eu/
 * Default dataset: https://tranco-list.eu/top-1m.csv.zip
 *
 * Streams the zip, reads the second CSV column of each row and writes the
 * first `limit` domains as `export default '<a>,<b>,...';` to
 * `./data/domains-official-100k.js`.
 *
 * @param {number} [limit=10000] - Maximum number of domains to keep
 * @returns {Promise<void>}
 */

async function importDomainsOfficialList(limit = 10000) {
  const url = 'https://tranco-list.eu/top-1m.csv.zip';
  const output = './data/domains-official-100k.js';

  console.log(`Streaming download and extraction (limit: ${limit.toLocaleString()})...`);

  const domains = [];
  let isFirstLine = true;
  let done = false;
  let pending = ''; // partial last line of the previous chunk

  const takeLine = (line) => {
    if (done) return;
    const columns = line.split(',');
    if (isFirstLine) {
      isFirstLine = false;
      // Skip the first row only when it is a header, i.e. its first column is
      // not a numeric rank; a real "1,<domain>" row must not be dropped.
      if (!/^\s*"?\d+"?\s*$/.test(columns[0] ?? '')) return;
    }
    const domain = columns[1]?.replace(/"/g, '').trim();
    if (domain) domains.push(domain);
    if (domains.length >= limit) done = true;
  };

  await grab(url, {
    onStream: (body) => pipeline(
      Readable.fromWeb(body),
      unzipper.ParseOne(),
      new Transform({
        objectMode: false,
        transform(chunk, encoding, callback) {
          if (done) return callback();
          // Chunks end at arbitrary byte offsets: keep the unfinished tail and
          // prepend it to the next chunk so no row is cut in half.
          const lines = (pending + chunk.toString()).split('\n');
          pending = lines.pop();
          for (const line of lines) takeLine(line);
          callback();
        },
        flush(callback) {
          // The final row may not end with a newline.
          if (pending) takeLine(pending);
          pending = '';
          callback();
        }
      })
    )
  });

  fs.mkdirSync('./data', { recursive: true });
  await writeFile(output, `export default '${domains.join(',')}';`);
  console.log(`Saved ${domains.length} domains to ${output}`);
}
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
|
|
166
|
+
//if run directly
if (import.meta.main) {
  // importDomainsPageRankCrawler();
  // Report a failed import and signal it through the exit code instead of
  // leaving the promise rejection unhandled.
  importDomainsOfficialList(10000).catch((error) => {
    console.error(error);
    process.exitCode = 1;
  });
}
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
|
|
3
|
+
/** Entry as stored in the input files: `[rank, title]`. */
type OldDomainEntry = [number, string];

/** In-memory merged record for one domain; every field is optional. */
interface OutputEntry {
  /** Preferred display name (news title overrides). */
  name?: string;
  domainRank?: number;
  domainTitle?: string;
  newsRank?: number;
  newsTitle?: string;
  langCode?: string;
}

/** Merged records keyed by domain. */
type DomainMap = Record<string, OutputEntry>;
|
|
15
|
+
|
|
16
|
+
/**
 * Merge the general domain list with the curated news list and write the
 * result to disk.
 *
 * - Every general entry is kept with its own rank; when the same domain is in
 *   the news list, the news rank/title are attached and the news title becomes
 *   the display name.
 * - News-only domains are appended with ranks continuing after the highest
 *   general rank.
 * - The file written to `outputPath` maps each domain to
 *   `[name, domainRank, domainTitle, newsRank, newsTitle, langCode]`, using `""`
 *   / `0` for missing values and `""` for a title equal to `name`.
 * - A missing input file is treated as an empty list.
 *
 * @param options Optional overrides for the two input paths and the output path.
 * @returns The merged map (object form, not the array form written to disk).
 */
export function mergeDomainLists(options: {
  domainInfoPath?: string;
  newsDomainRankPath?: string;
  outputPath?: string;
} = {}): DomainMap {
  const {
    domainInfoPath = "./data/domain-info.json",
    newsDomainRankPath = "./data/news-domain-rank.json",
    outputPath = "./data/domain-rank-merged.json",
  } = options;

  const readList = (file: string): Record<string, OldDomainEntry> =>
    fs.existsSync(file) ? JSON.parse(fs.readFileSync(file, "utf8")) : {};

  const domainInfoRaw = readList(domainInfoPath);
  const newsDomainRaw = readList(newsDomainRankPath);

  const numberOrUndefined = (value: unknown): number | undefined =>
    typeof value === 'number' ? value : undefined;

  const merged: DomainMap = {};

  // 1) General list, enriched with news data where the domain appears in both.
  for (const domain of Object.keys(domainInfoRaw)) {
    const [rank, title] = domainInfoRaw[domain] || [undefined, undefined];
    const newsEntry = newsDomainRaw[domain];
    const newsRank = newsEntry ? newsEntry[0] : undefined;
    const newsTitle = newsEntry ? newsEntry[1] : undefined;

    merged[domain] = {
      name: newsTitle || title || domain,
      domainRank: numberOrUndefined(rank),
      domainTitle: title || undefined,
      newsRank: numberOrUndefined(newsRank),
      newsTitle: newsTitle || undefined,
      langCode: undefined,
    };
  }

  // 2) News-only domains continue after the highest general rank.
  let highestRank = 0;
  for (const existing of Object.values(merged)) {
    highestRank = Math.max(highestRank, existing.domainRank || 0);
  }
  let nextRank = highestRank + 1;

  for (const domain of Object.keys(newsDomainRaw)) {
    if (merged[domain]) continue;
    const [newsRank, newsTitle] = newsDomainRaw[domain] || [undefined, undefined];
    merged[domain] = {
      name: newsTitle || domain,
      domainRank: nextRank++,
      newsRank: numberOrUndefined(newsRank),
      newsTitle: newsTitle || undefined,
      langCode: undefined,
    };
  }

  // 3) Compact on-disk form: fixed-order arrays with "" / 0 for missing values.
  const outputObj: Record<string, Array<string | number>> = {};
  for (const domain of Object.keys(merged)) {
    const entry = merged[domain];
    // A title identical to the display name is stored as "" to avoid repetition.
    const unlessName = (title?: string): string =>
      title === undefined || title === entry.name ? "" : title;
    const textOrEmpty = (text?: string): string => (text === undefined ? "" : text);
    const rankOrZero = (rank?: number): number => (rank === undefined ? 0 : rank);

    outputObj[domain] = [
      textOrEmpty(entry.name),
      rankOrZero(entry.domainRank),
      unlessName(entry.domainTitle),
      rankOrZero(entry.newsRank),
      unlessName(entry.newsTitle),
      textOrEmpty(entry.langCode),
    ];
  }

  fs.writeFileSync(outputPath, JSON.stringify(outputObj), "utf8");
  console.log(
    `Merged ${Object.keys(domainInfoRaw).length} general + ${Object.keys(newsDomainRaw).length} news entries → ${Object.keys(merged).length} total → ${outputPath}`
  );

  return merged;
}
|
|
107
|
+
|
|
108
|
+
// Run when called directly: bun src/merge-domain-lists.ts
// Guarded so that importing `mergeDomainLists` from another module does not
// trigger a merge and rewrite the output file as a side effect.
if (import.meta.main) {
  mergeDomainLists();
}
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
import domainsOfficial from "../data/domains-official-1m.js";
|
|
2
|
+
import fs from "fs";
|
|
3
|
+
import {
|
|
4
|
+
shouldRemoveDomain,
|
|
5
|
+
findMainDomain,
|
|
6
|
+
getTitleOverride,
|
|
7
|
+
formatDomainAsTitle,
|
|
8
|
+
cleanSourceTitle,
|
|
9
|
+
getSourceTitle,
|
|
10
|
+
} from "./domain-name-formatter.js";
|
|
11
|
+
|
|
12
|
+
/** Results keyed by domain; each value is `[rank, title]`. */
type DomainMap = Record<string, [number, string]>;

/** Options accepted by `domainInfo`; all are optional. */
type DomainInfoOptions = {
  /** Index in the domain list to start from. */
  startIndex?: number;
  /** Exclusive end index in the domain list. */
  endIndex?: number;
  /** Continue after the last domain already present in the results file. */
  resume?: boolean;
};
|
|
19
|
+
|
|
20
|
+
/**
 * Build `./data/domain-info.json`, a map of domain → `[rank, title]`, from the
 * bundled official domain list.
 *
 * For each domain in `[startIndex, endIndex)` that is neither marked for
 * removal nor an alternative of another domain, a title is chosen:
 * a configured override if there is one; otherwise the fetched page title when
 * it cleans down to fewer than three words; otherwise the formatted domain.
 * Ranks are assigned sequentially, continuing from the number of entries
 * already loaded. The file is rewritten after every processed domain.
 *
 * @param options.startIndex First index to process (default 0).
 * @param options.endIndex Exclusive end index, clamped to the list length (default 1000).
 * @param options.resume Continue after the last domain already stored in the file.
 */
export async function domainInfo(options: DomainInfoOptions = {}): Promise<void> {
  const { startIndex = 0, endIndex = 1000, resume = false } = options;

  const domainsArray = domainsOfficial.split(",");
  const actualEndIndex = Math.min(endIndex, domainsArray.length);
  const dataPath = "./data/domain-info.json";

  let domainResults: DomainMap = {};
  let resumeFromIndex = startIndex;

  if (fs.existsSync(dataPath)) {
    domainResults = JSON.parse(fs.readFileSync(dataPath, "utf8"));

    if (resume) {
      const existing = Object.keys(domainResults);
      if (existing.length > 0) {
        const lastDomain = existing[existing.length - 1];
        const lastIndex = domainsArray.indexOf(lastDomain);
        if (lastIndex !== -1) {
          resumeFromIndex = lastIndex + 1;
          console.log(`Resuming from domain: ${lastDomain} (index ${resumeFromIndex})`);
        }
      }
    }
  } else {
    // First run: make sure the data directory exists before creating the file.
    fs.mkdirSync("./data", { recursive: true });
    fs.writeFileSync(dataPath, "{}", "utf8");
  }

  // A fresh (non-resumed) run from the start discards previous results.
  if (!resume && startIndex === 0) {
    domainResults = {};
    fs.writeFileSync(dataPath, "{}", "utf8");
  }

  console.log(`Processing domains from index ${resumeFromIndex} to ${actualEndIndex - 1}`);

  let actualRank = Object.keys(domainResults).length;

  for (let i = resumeFromIndex; i < actualEndIndex; i++) {
    const domain = domainsArray[i];

    if (shouldRemoveDomain(domain)) {
      console.log(`Skipping ${i + 1}: ${domain} (marked for removal)`);
      continue;
    }

    const mainDomain = findMainDomain(domain);
    if (mainDomain) {
      console.log(`Skipping ${i + 1}: ${domain} (alternative for ${mainDomain})`);
      continue;
    }

    actualRank++;
    console.log(`Processing ${actualRank}: ${domain}`);

    // Look the override up once; it wins over every other title source.
    const override: string | null = getTitleOverride(domain);
    let source: string | null = override || formatDomainAsTitle(domain);

    if (!override) {
      const webTitle = await getSourceTitle(domain);
      if (webTitle) {
        const cleaned = cleanSourceTitle(webTitle)
          ?.replace(/homepage/gi, "")
          .replace(/home/gi, "")
          .replace(".com", "")
          // The removals above can leave leading/trailing whitespace behind.
          .trim();
        if (cleaned && cleaned.length > 0) {
          const wordCount = cleaned.split(/\s+/).filter((w) => w.length > 0).length;
          // Only short titles (fewer than three words) replace the formatted domain.
          if (wordCount < 3) source = cleaned;
        }
      }
    }

    domainResults[domain] = [actualRank, source || domain];
    // Checkpoint after every domain so an interrupted run can be resumed.
    // NOTE(review): this rewrites the whole file each iteration; consider
    // batching if the list is large.
    fs.writeFileSync(dataPath, JSON.stringify(domainResults), "utf8");
  }
}
|
|
98
|
+
|
|
99
|
+
// Kick off the full run; a failure is printed and reported via exit code 1
// instead of being left as an unhandled promise rejection.
domainInfo({ startIndex: 0, endIndex: 1000000, resume: true }).catch((err) => {
  console.error(err);
  process.exit(1);
});
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import {test, expect} from 'vitest';
|
|
2
|
+
import {importTopDomains} from '../src/download-1m'
|
|
3
|
+
|
|
4
|
+
// Smoke test: the import resolves with a defined value within the 40s timeout.
// (The previous fixed 40s sleep inside the test equalled the timeout itself,
// so the test could never finish in time.)
test('import top domains', async () => {
  const result = await importTopDomains();

  console.log(result);
  expect(result).toBeDefined();
}, 40000)
|