rezo 1.0.71 → 1.0.73
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/entries/curl.d.ts +13 -2
- package/dist/adapters/entries/fetch.d.ts +13 -2
- package/dist/adapters/entries/http.d.ts +13 -2
- package/dist/adapters/entries/http2.d.ts +13 -2
- package/dist/adapters/entries/react-native.d.ts +13 -2
- package/dist/adapters/entries/xhr.d.ts +13 -2
- package/dist/adapters/index.cjs +6 -6
- package/dist/cache/index.cjs +9 -9
- package/dist/crawler/crawler-options.cjs +1 -1
- package/dist/crawler/crawler-options.js +1 -1
- package/dist/crawler/crawler.cjs +307 -86
- package/dist/crawler/crawler.js +307 -86
- package/dist/crawler/index.cjs +40 -40
- package/dist/crawler/plugin/capped-array.cjs +1 -0
- package/dist/crawler/plugin/capped-array.js +1 -0
- package/dist/crawler/plugin/capped-map.cjs +1 -0
- package/dist/crawler/plugin/capped-map.js +1 -0
- package/dist/crawler/plugin/file-cacher.cjs +20 -18
- package/dist/crawler/plugin/file-cacher.js +20 -18
- package/dist/crawler/plugin/health-metrics.cjs +2 -0
- package/dist/crawler/plugin/health-metrics.js +2 -0
- package/dist/crawler/plugin/index.cjs +1 -1
- package/dist/crawler/plugin/index.js +1 -1
- package/dist/crawler/plugin/memory-monitor.cjs +1 -0
- package/dist/crawler/plugin/memory-monitor.js +1 -0
- package/dist/crawler/plugin/navigation-history.cjs +5 -5
- package/dist/crawler/plugin/navigation-history.js +3 -3
- package/dist/crawler/plugin/result-stream.cjs +5 -0
- package/dist/crawler/plugin/result-stream.js +5 -0
- package/dist/crawler/plugin/sqlite-utils.cjs +1 -0
- package/dist/crawler/plugin/sqlite-utils.js +1 -0
- package/dist/crawler/plugin/url-store.cjs +5 -5
- package/dist/crawler/plugin/url-store.js +5 -5
- package/dist/crawler/scraper.cjs +1 -1
- package/dist/crawler/scraper.js +1 -1
- package/dist/crawler.d.ts +148 -25
- package/dist/entries/crawler.cjs +4 -4
- package/dist/index.cjs +30 -30
- package/dist/index.d.ts +13 -2
- package/dist/internal/agents/index.cjs +10 -10
- package/dist/platform/browser.d.ts +13 -2
- package/dist/platform/bun.d.ts +13 -2
- package/dist/platform/deno.d.ts +13 -2
- package/dist/platform/node.d.ts +13 -2
- package/dist/platform/react-native.d.ts +13 -2
- package/dist/platform/worker.d.ts +13 -2
- package/dist/proxy/index.cjs +4 -4
- package/dist/queue/index.cjs +8 -8
- package/dist/queue/queue.cjs +58 -13
- package/dist/queue/queue.js +58 -13
- package/dist/responses/universal/index.cjs +11 -11
- package/dist/utils/agent-pool.cjs +2 -0
- package/dist/utils/agent-pool.js +2 -0
- package/dist/utils/compression.cjs +6 -6
- package/dist/utils/compression.js +6 -6
- package/dist/version.cjs +1 -1
- package/dist/version.js +1 -1
- package/dist/wget/index.cjs +49 -49
- package/dist/wget/index.d.ts +12 -1
- package/package.json +1 -1
|
@@ -1,19 +1,21 @@
|
|
|
1
|
-
var
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
1
|
+
var d=require("node:fs"),f=require("node:path"),u=require("node:zlib"),{createDatabase:b}=require("./sqlite-utils.cjs"),E=typeof u.zstdCompressSync==="function";function p(e){if(E)return u.zstdCompressSync(e);return e}function R(e){if(E)return u.zstdDecompressSync(e);return e}class h{db=null;options;cacheDir;dbPath;closed=!1;initPromise=null;evictionInProgress=!1;constructor(e={}){if(this.options={cacheDir:e.cacheDir||"/tmp/rezo-crawler/cache",dbFileName:e.dbFileName||"cache.db",ttl:e.ttl||604800000,compression:e.compression??!1,maxEntries:e.maxEntries??1e5},this.cacheDir=f.resolve(this.options.cacheDir),this.dbPath=f.join(this.cacheDir,this.options.dbFileName),!d.existsSync(this.cacheDir))d.mkdirSync(this.cacheDir,{recursive:!0})}static async create(e={}){let t=new h(e);return await t.initialize(),t}async initialize(){if(this.initPromise)return this.initPromise;return this.initPromise=(async()=>{this.db=await b(this.dbPath),this.db.exec("PRAGMA journal_mode = WAL"),this.db.exec("PRAGMA synchronous = NORMAL"),this.db.exec("PRAGMA cache_size = -32000"),this.db.exec("PRAGMA temp_store = MEMORY"),this.db.exec("PRAGMA mmap_size = 134217728"),this.db.exec(`
|
|
2
|
+
CREATE TABLE IF NOT EXISTS cache (
|
|
3
|
+
key TEXT NOT NULL,
|
|
4
|
+
namespace TEXT NOT NULL,
|
|
5
|
+
value BLOB NOT NULL,
|
|
6
|
+
expiresAt INTEGER NOT NULL,
|
|
7
|
+
createdAt INTEGER NOT NULL,
|
|
8
|
+
compressed INTEGER DEFAULT 0,
|
|
9
|
+
PRIMARY KEY (namespace, key)
|
|
10
|
+
) WITHOUT ROWID
|
|
11
|
+
`),this.db.exec("CREATE INDEX IF NOT EXISTS idx_expires ON cache(expiresAt)"),this.db.exec("CREATE INDEX IF NOT EXISTS idx_created ON cache(createdAt)"),this.db.exec("CREATE INDEX IF NOT EXISTS idx_ns ON cache(namespace)")})(),this.initPromise}async ensureDb(){if(await this.initialize(),!this.db)throw Error("FileCacher database not initialized");return this.db}async set(e,t,s,c="default"){if(this.closed)throw Error("FileCacher is closed");let i=await this.ensureDb(),r=Date.now(),a=r+(s??this.options.ttl),n=Buffer.from(JSON.stringify(t),"utf-8"),o=0;if(this.options.compression&&E)try{n=p(n),o=1}catch{}i.run(`
|
|
12
|
+
INSERT OR REPLACE INTO cache (key, namespace, value, expiresAt, createdAt, compressed)
|
|
13
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
14
|
+
`,e,c,Buffer.from(n).toString("base64"),a,r,o),this.maybeEvict()}async setMany(e,t="default"){if(this.closed)throw Error("FileCacher is closed");if(e.length===0)return;let s=await this.ensureDb(),c=Date.now(),i=c+this.options.ttl;s.exec("BEGIN TRANSACTION");try{for(let r of e){let a=r.ttl?c+r.ttl:i,n=Buffer.from(JSON.stringify(r.value),"utf-8"),o=0;if(this.options.compression&&E)try{n=p(n),o=1}catch{}s.run(`
|
|
15
|
+
INSERT OR REPLACE INTO cache (key, namespace, value, expiresAt, createdAt, compressed)
|
|
16
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
17
|
+
`,r.key,t,Buffer.from(n).toString("base64"),a,c,o)}s.exec("COMMIT")}catch(r){throw s.exec("ROLLBACK"),r}this.maybeEvict()}maybeEvict(){if(this.options.maxEntries<=0)return;if(this.evictionInProgress)return;if(!this.db)return;let e=this.db.get("SELECT COUNT(*) as cnt FROM cache");if(!e||e.cnt<=this.options.maxEntries)return;this.evictionInProgress=!0;try{let t=Math.ceil(this.options.maxEntries*0.1);this.db.run("DELETE FROM cache WHERE expiresAt < ?",Date.now());let s=this.db.get("SELECT COUNT(*) as cnt FROM cache");if(s&&s.cnt>this.options.maxEntries)this.db.run(`
|
|
18
|
+
DELETE FROM cache WHERE rowid IN (
|
|
19
|
+
SELECT rowid FROM cache ORDER BY createdAt ASC LIMIT ?
|
|
15
20
|
)
|
|
16
|
-
`,
|
|
17
|
-
INSERT OR REPLACE INTO cache (key, value, expiresAt, createdAt, compressed)
|
|
18
|
-
VALUES (?, ?, ?, ?, ?)
|
|
19
|
-
`,a.key,Buffer.from(o).toString("base64"),n,t,i)}c.exec("COMMIT")}catch(a){throw c.exec("ROLLBACK"),a}}async get(e,r="default"){if(this.closed)throw Error("FileCacher is closed");let c=await this.getDatabase(r),t=c.get("SELECT value, expiresAt, compressed FROM cache WHERE key = ?",e);if(!t)return null;if(t.expiresAt<Date.now())return c.run("DELETE FROM cache WHERE key = ?",e),null;let s=Buffer.from(t.value,"base64");if(t.compressed)try{s=g(s)}catch{return null}try{return JSON.parse(Buffer.from(s).toString("utf-8"))}catch{return null}}async has(e,r="default"){if(this.closed)return!1;let t=(await this.getDatabase(r)).get("SELECT expiresAt FROM cache WHERE key = ?",e);if(!t)return!1;return t.expiresAt>=Date.now()}async hasMany(e,r="default"){if(this.closed)return new Set;if(e.length===0)return new Set;let c=await this.getDatabase(r),t=Date.now(),s=new Set,a=500;for(let n=0;n<e.length;n+=a){let o=e.slice(n,n+a),i=o.map(()=>"?").join(","),u=c.all(`SELECT key, expiresAt FROM cache WHERE key IN (${i})`,...o);for(let h of u)if(h.expiresAt>=t)s.add(h.key)}return s}async delete(e,r="default"){if(this.closed)return!1;return(await this.getDatabase(r)).run("DELETE FROM cache WHERE key = ?",e),!0}async clear(e="default"){if(this.closed)return;(await this.getDatabase(e)).exec("DELETE FROM cache")}async cleanup(e="default"){if(this.closed)return 0;let r=await this.getDatabase(e),c=Date.now(),t=r.get("SELECT COUNT(*) as cnt FROM cache");r.run("DELETE FROM cache WHERE expiresAt < ?",c);let s=r.get("SELECT COUNT(*) as cnt FROM cache");return(t?.cnt||0)-(s?.cnt||0)}async stats(e="default"){if(this.closed)return{count:0,expired:0};let r=await this.getDatabase(e),c=Date.now(),t=r.get("SELECT COUNT(*) as cnt FROM cache"),s=r.get("SELECT COUNT(*) as cnt FROM cache WHERE expiresAt < ?",c);return{count:t?.cnt||0,expired:s?.cnt||0}}async close(){if(this.closed)return;this.closed=!0;for(let e of this.databases.values())try{e.close()}catch{}this.databases.clear()}get isClosed(){return 
this.closed}get directory(){return this.cacheDir}}exports.FileCacher=l;exports.default=l;module.exports=Object.assign(l,exports);
|
|
21
|
+
`,t)}finally{this.evictionInProgress=!1}}async get(e,t="default"){if(this.closed)return null;let s=await this.ensureDb(),c=s.get("SELECT value, expiresAt, compressed FROM cache WHERE namespace = ? AND key = ?",t,e);if(!c)return null;if(c.expiresAt<Date.now())return s.run("DELETE FROM cache WHERE namespace = ? AND key = ?",t,e),null;let i=Buffer.from(c.value,"base64");if(c.compressed)try{i=R(i)}catch{return null}try{return JSON.parse(Buffer.from(i).toString("utf-8"))}catch{return null}}async has(e,t="default"){if(this.closed)return!1;let c=(await this.ensureDb()).get("SELECT expiresAt FROM cache WHERE namespace = ? AND key = ?",t,e);if(!c)return!1;return c.expiresAt>=Date.now()}async hasMany(e,t="default"){if(this.closed)return new Set;if(e.length===0)return new Set;let s=await this.ensureDb(),c=Date.now(),i=new Set,r=500;for(let a=0;a<e.length;a+=r){let n=e.slice(a,a+r),o=n.map(()=>"?").join(","),T=s.all(`SELECT key, expiresAt FROM cache WHERE namespace = ? AND key IN (${o})`,t,...n);for(let l of T)if(l.expiresAt>=c)i.add(l.key)}return i}async delete(e,t="default"){if(this.closed)return!1;return(await this.ensureDb()).run("DELETE FROM cache WHERE namespace = ? AND key = ?",t,e),!0}async clear(e){if(this.closed)return;let t=await this.ensureDb();if(e)t.run("DELETE FROM cache WHERE namespace = ?",e);else t.exec("DELETE FROM cache")}async cleanup(){if(this.closed)return 0;let e=await this.ensureDb(),t=Date.now(),s=e.get("SELECT COUNT(*) as cnt FROM cache");e.run("DELETE FROM cache WHERE expiresAt < ?",t);let c=e.get("SELECT COUNT(*) as cnt FROM cache");return(s?.cnt||0)-(c?.cnt||0)}async stats(e){if(this.closed)return{count:0,expired:0,namespaces:0};let t=await this.ensureDb(),s=Date.now(),c,i;if(e)c=t.get("SELECT COUNT(*) as cnt FROM cache WHERE namespace = ?",e),i=t.get("SELECT COUNT(*) as cnt FROM cache WHERE namespace = ? 
AND expiresAt < ?",e,s);else c=t.get("SELECT COUNT(*) as cnt FROM cache"),i=t.get("SELECT COUNT(*) as cnt FROM cache WHERE expiresAt < ?",s);let r=t.get("SELECT COUNT(DISTINCT namespace) as cnt FROM cache");return{count:c?.cnt||0,expired:i?.cnt||0,namespaces:r?.cnt||0}}async close(){if(this.closed)return;if(this.closed=!0,await this.initPromise,this.db){try{this.db.exec("PRAGMA wal_checkpoint(TRUNCATE)")}catch{}try{this.db.close()}catch{}this.db=null}}get isClosed(){return this.closed}get directory(){return this.cacheDir}get databasePath(){return this.dbPath}}exports.FileCacher=h;exports.default=h;module.exports=Object.assign(h,exports);
|
|
@@ -1,19 +1,21 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
1
|
+
import d from"node:fs";import f from"node:path";import*as h from"node:zlib";import{createDatabase as m}from"./sqlite-utils.js";var E=typeof h.zstdCompressSync==="function";function p(e){if(E)return h.zstdCompressSync(e);return e}function b(e){if(E)return h.zstdDecompressSync(e);return e}class l{db=null;options;cacheDir;dbPath;closed=!1;initPromise=null;evictionInProgress=!1;constructor(e={}){if(this.options={cacheDir:e.cacheDir||"/tmp/rezo-crawler/cache",dbFileName:e.dbFileName||"cache.db",ttl:e.ttl||604800000,compression:e.compression??!1,maxEntries:e.maxEntries??1e5},this.cacheDir=f.resolve(this.options.cacheDir),this.dbPath=f.join(this.cacheDir,this.options.dbFileName),!d.existsSync(this.cacheDir))d.mkdirSync(this.cacheDir,{recursive:!0})}static async create(e={}){let t=new l(e);return await t.initialize(),t}async initialize(){if(this.initPromise)return this.initPromise;return this.initPromise=(async()=>{this.db=await m(this.dbPath),this.db.exec("PRAGMA journal_mode = WAL"),this.db.exec("PRAGMA synchronous = NORMAL"),this.db.exec("PRAGMA cache_size = -32000"),this.db.exec("PRAGMA temp_store = MEMORY"),this.db.exec("PRAGMA mmap_size = 134217728"),this.db.exec(`
|
|
2
|
+
CREATE TABLE IF NOT EXISTS cache (
|
|
3
|
+
key TEXT NOT NULL,
|
|
4
|
+
namespace TEXT NOT NULL,
|
|
5
|
+
value BLOB NOT NULL,
|
|
6
|
+
expiresAt INTEGER NOT NULL,
|
|
7
|
+
createdAt INTEGER NOT NULL,
|
|
8
|
+
compressed INTEGER DEFAULT 0,
|
|
9
|
+
PRIMARY KEY (namespace, key)
|
|
10
|
+
) WITHOUT ROWID
|
|
11
|
+
`),this.db.exec("CREATE INDEX IF NOT EXISTS idx_expires ON cache(expiresAt)"),this.db.exec("CREATE INDEX IF NOT EXISTS idx_created ON cache(createdAt)"),this.db.exec("CREATE INDEX IF NOT EXISTS idx_ns ON cache(namespace)")})(),this.initPromise}async ensureDb(){if(await this.initialize(),!this.db)throw Error("FileCacher database not initialized");return this.db}async set(e,t,s,c="default"){if(this.closed)throw Error("FileCacher is closed");let i=await this.ensureDb(),r=Date.now(),a=r+(s??this.options.ttl),n=Buffer.from(JSON.stringify(t),"utf-8"),o=0;if(this.options.compression&&E)try{n=p(n),o=1}catch{}i.run(`
|
|
12
|
+
INSERT OR REPLACE INTO cache (key, namespace, value, expiresAt, createdAt, compressed)
|
|
13
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
14
|
+
`,e,c,Buffer.from(n).toString("base64"),a,r,o),this.maybeEvict()}async setMany(e,t="default"){if(this.closed)throw Error("FileCacher is closed");if(e.length===0)return;let s=await this.ensureDb(),c=Date.now(),i=c+this.options.ttl;s.exec("BEGIN TRANSACTION");try{for(let r of e){let a=r.ttl?c+r.ttl:i,n=Buffer.from(JSON.stringify(r.value),"utf-8"),o=0;if(this.options.compression&&E)try{n=p(n),o=1}catch{}s.run(`
|
|
15
|
+
INSERT OR REPLACE INTO cache (key, namespace, value, expiresAt, createdAt, compressed)
|
|
16
|
+
VALUES (?, ?, ?, ?, ?, ?)
|
|
17
|
+
`,r.key,t,Buffer.from(n).toString("base64"),a,c,o)}s.exec("COMMIT")}catch(r){throw s.exec("ROLLBACK"),r}this.maybeEvict()}maybeEvict(){if(this.options.maxEntries<=0)return;if(this.evictionInProgress)return;if(!this.db)return;let e=this.db.get("SELECT COUNT(*) as cnt FROM cache");if(!e||e.cnt<=this.options.maxEntries)return;this.evictionInProgress=!0;try{let t=Math.ceil(this.options.maxEntries*0.1);this.db.run("DELETE FROM cache WHERE expiresAt < ?",Date.now());let s=this.db.get("SELECT COUNT(*) as cnt FROM cache");if(s&&s.cnt>this.options.maxEntries)this.db.run(`
|
|
18
|
+
DELETE FROM cache WHERE rowid IN (
|
|
19
|
+
SELECT rowid FROM cache ORDER BY createdAt ASC LIMIT ?
|
|
15
20
|
)
|
|
16
|
-
`,
|
|
17
|
-
INSERT OR REPLACE INTO cache (key, value, expiresAt, createdAt, compressed)
|
|
18
|
-
VALUES (?, ?, ?, ?, ?)
|
|
19
|
-
`,a.key,Buffer.from(o).toString("base64"),n,t,i)}c.exec("COMMIT")}catch(a){throw c.exec("ROLLBACK"),a}}async get(e,r="default"){if(this.closed)throw Error("FileCacher is closed");let c=await this.getDatabase(r),t=c.get("SELECT value, expiresAt, compressed FROM cache WHERE key = ?",e);if(!t)return null;if(t.expiresAt<Date.now())return c.run("DELETE FROM cache WHERE key = ?",e),null;let s=Buffer.from(t.value,"base64");if(t.compressed)try{s=N(s)}catch{return null}try{return JSON.parse(Buffer.from(s).toString("utf-8"))}catch{return null}}async has(e,r="default"){if(this.closed)return!1;let t=(await this.getDatabase(r)).get("SELECT expiresAt FROM cache WHERE key = ?",e);if(!t)return!1;return t.expiresAt>=Date.now()}async hasMany(e,r="default"){if(this.closed)return new Set;if(e.length===0)return new Set;let c=await this.getDatabase(r),t=Date.now(),s=new Set,a=500;for(let n=0;n<e.length;n+=a){let o=e.slice(n,n+a),i=o.map(()=>"?").join(","),l=c.all(`SELECT key, expiresAt FROM cache WHERE key IN (${i})`,...o);for(let h of l)if(h.expiresAt>=t)s.add(h.key)}return s}async delete(e,r="default"){if(this.closed)return!1;return(await this.getDatabase(r)).run("DELETE FROM cache WHERE key = ?",e),!0}async clear(e="default"){if(this.closed)return;(await this.getDatabase(e)).exec("DELETE FROM cache")}async cleanup(e="default"){if(this.closed)return 0;let r=await this.getDatabase(e),c=Date.now(),t=r.get("SELECT COUNT(*) as cnt FROM cache");r.run("DELETE FROM cache WHERE expiresAt < ?",c);let s=r.get("SELECT COUNT(*) as cnt FROM cache");return(t?.cnt||0)-(s?.cnt||0)}async stats(e="default"){if(this.closed)return{count:0,expired:0};let r=await this.getDatabase(e),c=Date.now(),t=r.get("SELECT COUNT(*) as cnt FROM cache"),s=r.get("SELECT COUNT(*) as cnt FROM cache WHERE expiresAt < ?",c);return{count:t?.cnt||0,expired:s?.cnt||0}}async close(){if(this.closed)return;this.closed=!0;for(let e of this.databases.values())try{e.close()}catch{}this.databases.clear()}get isClosed(){return 
this.closed}get directory(){return this.cacheDir}}var z=E;export{z as default,E as FileCacher};
|
|
21
|
+
`,t)}finally{this.evictionInProgress=!1}}async get(e,t="default"){if(this.closed)return null;let s=await this.ensureDb(),c=s.get("SELECT value, expiresAt, compressed FROM cache WHERE namespace = ? AND key = ?",t,e);if(!c)return null;if(c.expiresAt<Date.now())return s.run("DELETE FROM cache WHERE namespace = ? AND key = ?",t,e),null;let i=Buffer.from(c.value,"base64");if(c.compressed)try{i=b(i)}catch{return null}try{return JSON.parse(Buffer.from(i).toString("utf-8"))}catch{return null}}async has(e,t="default"){if(this.closed)return!1;let c=(await this.ensureDb()).get("SELECT expiresAt FROM cache WHERE namespace = ? AND key = ?",t,e);if(!c)return!1;return c.expiresAt>=Date.now()}async hasMany(e,t="default"){if(this.closed)return new Set;if(e.length===0)return new Set;let s=await this.ensureDb(),c=Date.now(),i=new Set,r=500;for(let a=0;a<e.length;a+=r){let n=e.slice(a,a+r),o=n.map(()=>"?").join(","),T=s.all(`SELECT key, expiresAt FROM cache WHERE namespace = ? AND key IN (${o})`,t,...n);for(let u of T)if(u.expiresAt>=c)i.add(u.key)}return i}async delete(e,t="default"){if(this.closed)return!1;return(await this.ensureDb()).run("DELETE FROM cache WHERE namespace = ? AND key = ?",t,e),!0}async clear(e){if(this.closed)return;let t=await this.ensureDb();if(e)t.run("DELETE FROM cache WHERE namespace = ?",e);else t.exec("DELETE FROM cache")}async cleanup(){if(this.closed)return 0;let e=await this.ensureDb(),t=Date.now(),s=e.get("SELECT COUNT(*) as cnt FROM cache");e.run("DELETE FROM cache WHERE expiresAt < ?",t);let c=e.get("SELECT COUNT(*) as cnt FROM cache");return(s?.cnt||0)-(c?.cnt||0)}async stats(e){if(this.closed)return{count:0,expired:0,namespaces:0};let t=await this.ensureDb(),s=Date.now(),c,i;if(e)c=t.get("SELECT COUNT(*) as cnt FROM cache WHERE namespace = ?",e),i=t.get("SELECT COUNT(*) as cnt FROM cache WHERE namespace = ? 
AND expiresAt < ?",e,s);else c=t.get("SELECT COUNT(*) as cnt FROM cache"),i=t.get("SELECT COUNT(*) as cnt FROM cache WHERE expiresAt < ?",s);let r=t.get("SELECT COUNT(DISTINCT namespace) as cnt FROM cache");return{count:c?.cnt||0,expired:i?.cnt||0,namespaces:r?.cnt||0}}async close(){if(this.closed)return;if(this.closed=!0,await this.initPromise,this.db){try{this.db.exec("PRAGMA wal_checkpoint(TRUNCATE)")}catch{}try{this.db.close()}catch{}this.db=null}}get isClosed(){return this.closed}get directory(){return this.cacheDir}get databasePath(){return this.dbPath}}var y=l;export{y as default,l as FileCacher};
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
/**
 * Collects per-request timing/success samples over a sliding time window and
 * derives health figures (throughput, success rate, average and p95 latency)
 * from them, alongside lifetime totals that are never pruned.
 */
class HealthMetrics {
  windowSize;
  startTime;
  samples = [];
  totalRequests = 0;
  totalSuccesses = 0;
  totalFailures = 0;
  totalResponseTime = 0;

  /**
   * @param {{windowSize?: number}} [options] - `windowSize` is the sliding
   *   window length in milliseconds (defaults to 60000).
   */
  constructor(options = {}) {
    this.windowSize = options.windowSize ?? 60000;
    this.startTime = Date.now();
  }

  /**
   * Records one finished request and drops samples that fell out of the window.
   * @param {number} responseTime - Response time of the request.
   * @param {boolean} success - Whether the request counts as a success.
   */
  recordRequest(responseTime, success) {
    const now = Date.now();
    this.samples.push({ timestamp: now, responseTime, success });
    this.totalRequests++;
    this.totalResponseTime += responseTime;
    if (success) {
      this.totalSuccesses++;
    } else {
      this.totalFailures++;
    }
    this.pruneOldSamples(now);
  }

  /**
   * Removes samples older than `now - windowSize`.
   * Samples are appended in time order, so only a leading run can be stale.
   * @param {number} now - Current timestamp in milliseconds.
   */
  pruneOldSamples(now) {
    const cutoff = now - this.windowSize;
    let firstLive = 0;
    while (firstLive < this.samples.length && this.samples[firstLive].timestamp < cutoff) {
      firstLive++;
    }
    if (firstLive > 0) {
      this.samples = this.samples.slice(firstLive);
    }
  }

  /**
   * Builds a point-in-time view of windowed rates and lifetime totals.
   * @param {number} queueDepth - Echoed back as `queueDepth`.
   * @param {number} activeRequests - Echoed back as `activeRequests`.
   * @returns {object} Snapshot of the current metrics.
   */
  getSnapshot(queueDepth, activeRequests) {
    const now = Date.now();
    this.pruneOldSamples(now);

    const windowSamples = this.samples;
    const sampleCount = windowSamples.length;
    // The observed span is floored at one second so a burst of very recent
    // samples does not produce an inflated requests-per-second figure.
    const spanMs = sampleCount > 0 ? Math.max(1000, now - windowSamples[0].timestamp) : 1000;
    const requestsPerSecond = (sampleCount / spanMs) * 1000;

    const successCount = windowSamples.filter((sample) => sample.success).length;
    // An empty window is reported as 100% success.
    const successRate = sampleCount > 0 ? (successCount / sampleCount) * 100 : 100;
    const failureRate = 100 - successRate;

    const sortedTimes = windowSamples
      .map((sample) => sample.responseTime)
      .sort((left, right) => left - right);
    const avgResponseTime =
      sampleCount > 0 ? sortedTimes.reduce((sum, value) => sum + value, 0) / sampleCount : 0;
    const p95Index = Math.floor(sortedTimes.length * 0.95);
    const p95ResponseTime = sortedTimes.length > 0 ? sortedTimes[p95Index] : 0;

    return {
      timestamp: now,
      requestsPerSecond: Math.round(requestsPerSecond * 100) / 100,
      successRate: Math.round(successRate * 100) / 100,
      failureRate: Math.round(failureRate * 100) / 100,
      avgResponseTime: Math.round(avgResponseTime),
      p95ResponseTime: Math.round(p95ResponseTime),
      queueDepth,
      activeRequests,
      totalRequests: this.totalRequests,
      totalSuccesses: this.totalSuccesses,
      totalFailures: this.totalFailures,
      uptimeMs: now - this.startTime,
    };
  }

  /**
   * @returns {{requests: number, successes: number, failures: number, avgResponseTime: number}}
   *   Lifetime totals; the average is 0 when nothing has been recorded.
   */
  getTotals() {
    return {
      requests: this.totalRequests,
      successes: this.totalSuccesses,
      failures: this.totalFailures,
      avgResponseTime:
        this.totalRequests > 0 ? Math.round(this.totalResponseTime / this.totalRequests) : 0,
    };
  }

  /**
   * Compares the current snapshot against thresholds.
   * @param {{minSuccessRate?: number, maxAvgResponseTime?: number, maxP95ResponseTime?: number}} [thresholds]
   *   Defaults: 80, 10000 and 30000 respectively.
   * @returns {boolean} Always true until at least 10 requests have been recorded.
   */
  isHealthy(thresholds = {}) {
    const snapshot = this.getSnapshot(0, 0);
    const minSuccessRate = thresholds.minSuccessRate ?? 80;
    const maxAvgResponseTime = thresholds.maxAvgResponseTime ?? 1e4;
    const maxP95ResponseTime = thresholds.maxP95ResponseTime ?? 30000;
    if (snapshot.totalRequests < 10) {
      return true;
    }
    return (
      snapshot.successRate >= minSuccessRate &&
      snapshot.avgResponseTime <= maxAvgResponseTime &&
      snapshot.p95ResponseTime <= maxP95ResponseTime
    );
  }

  /**
   * Renders the current snapshot as HELP/TYPE/value triplets, one per line.
   * @param {string} [prefix="crawler"] - Metric name prefix.
   * @returns {string} Newline-separated metric lines.
   */
  toPrometheusFormat(prefix = "crawler") {
    const snapshot = this.getSnapshot(0, 0);
    const metrics = [
      ["requests_total", "Total requests processed", "counter", snapshot.totalRequests],
      ["requests_success_total", "Total successful requests", "counter", snapshot.totalSuccesses],
      ["requests_failed_total", "Total failed requests", "counter", snapshot.totalFailures],
      ["requests_per_second", "Current requests per second", "gauge", snapshot.requestsPerSecond],
      ["success_rate_percent", "Success rate percentage", "gauge", snapshot.successRate],
      ["response_time_avg_ms", "Average response time", "gauge", snapshot.avgResponseTime],
      ["response_time_p95_ms", "P95 response time", "gauge", snapshot.p95ResponseTime],
      ["uptime_seconds", "Crawler uptime", "gauge", Math.round(snapshot.uptimeMs / 1000)],
    ];
    const lines = [];
    for (const [suffix, help, type, value] of metrics) {
      const metricName = `${prefix}_${suffix}`;
      lines.push(`# HELP ${metricName} ${help}`);
      lines.push(`# TYPE ${metricName} ${type}`);
      lines.push(`${metricName} ${value}`);
    }
    // The separator must be exactly one newline; anything else yields
    // output that is not line-oriented.
    return lines.join("\n");
  }

  /**
   * @param {number} [queueDepth=0]
   * @param {number} [activeRequests=0]
   * @returns {string} JSON-serialized snapshot.
   */
  toJSON(queueDepth = 0, activeRequests = 0) {
    return JSON.stringify(this.getSnapshot(queueDepth, activeRequests));
  }

  /** Clears the window and all lifetime counters; `startTime` is left as is. */
  reset() {
    this.samples = [];
    this.totalRequests = 0;
    this.totalSuccesses = 0;
    this.totalFailures = 0;
    this.totalResponseTime = 0;
  }
}

exports.HealthMetrics = HealthMetrics;
exports.default = HealthMetrics;
module.exports = Object.assign(HealthMetrics, exports);
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
/**
 * Collects per-request timing/success samples over a sliding time window and
 * derives health figures (throughput, success rate, average and p95 latency)
 * from them, alongside lifetime totals that are never pruned.
 */
class HealthMetrics {
  windowSize;
  startTime;
  samples = [];
  totalRequests = 0;
  totalSuccesses = 0;
  totalFailures = 0;
  totalResponseTime = 0;

  /**
   * @param {{windowSize?: number}} [options] - `windowSize` is the sliding
   *   window length in milliseconds (defaults to 60000).
   */
  constructor(options = {}) {
    this.windowSize = options.windowSize ?? 60000;
    this.startTime = Date.now();
  }

  /**
   * Records one finished request and drops samples that fell out of the window.
   * @param {number} responseTime - Response time of the request.
   * @param {boolean} success - Whether the request counts as a success.
   */
  recordRequest(responseTime, success) {
    const now = Date.now();
    this.samples.push({ timestamp: now, responseTime, success });
    this.totalRequests++;
    this.totalResponseTime += responseTime;
    if (success) {
      this.totalSuccesses++;
    } else {
      this.totalFailures++;
    }
    this.pruneOldSamples(now);
  }

  /**
   * Removes samples older than `now - windowSize`.
   * Samples are appended in time order, so only a leading run can be stale.
   * @param {number} now - Current timestamp in milliseconds.
   */
  pruneOldSamples(now) {
    const cutoff = now - this.windowSize;
    let firstLive = 0;
    while (firstLive < this.samples.length && this.samples[firstLive].timestamp < cutoff) {
      firstLive++;
    }
    if (firstLive > 0) {
      this.samples = this.samples.slice(firstLive);
    }
  }

  /**
   * Builds a point-in-time view of windowed rates and lifetime totals.
   * @param {number} queueDepth - Echoed back as `queueDepth`.
   * @param {number} activeRequests - Echoed back as `activeRequests`.
   * @returns {object} Snapshot of the current metrics.
   */
  getSnapshot(queueDepth, activeRequests) {
    const now = Date.now();
    this.pruneOldSamples(now);

    const windowSamples = this.samples;
    const sampleCount = windowSamples.length;
    // The observed span is floored at one second so a burst of very recent
    // samples does not produce an inflated requests-per-second figure.
    const spanMs = sampleCount > 0 ? Math.max(1000, now - windowSamples[0].timestamp) : 1000;
    const requestsPerSecond = (sampleCount / spanMs) * 1000;

    const successCount = windowSamples.filter((sample) => sample.success).length;
    // An empty window is reported as 100% success.
    const successRate = sampleCount > 0 ? (successCount / sampleCount) * 100 : 100;
    const failureRate = 100 - successRate;

    const sortedTimes = windowSamples
      .map((sample) => sample.responseTime)
      .sort((left, right) => left - right);
    const avgResponseTime =
      sampleCount > 0 ? sortedTimes.reduce((sum, value) => sum + value, 0) / sampleCount : 0;
    const p95Index = Math.floor(sortedTimes.length * 0.95);
    const p95ResponseTime = sortedTimes.length > 0 ? sortedTimes[p95Index] : 0;

    return {
      timestamp: now,
      requestsPerSecond: Math.round(requestsPerSecond * 100) / 100,
      successRate: Math.round(successRate * 100) / 100,
      failureRate: Math.round(failureRate * 100) / 100,
      avgResponseTime: Math.round(avgResponseTime),
      p95ResponseTime: Math.round(p95ResponseTime),
      queueDepth,
      activeRequests,
      totalRequests: this.totalRequests,
      totalSuccesses: this.totalSuccesses,
      totalFailures: this.totalFailures,
      uptimeMs: now - this.startTime,
    };
  }

  /**
   * @returns {{requests: number, successes: number, failures: number, avgResponseTime: number}}
   *   Lifetime totals; the average is 0 when nothing has been recorded.
   */
  getTotals() {
    return {
      requests: this.totalRequests,
      successes: this.totalSuccesses,
      failures: this.totalFailures,
      avgResponseTime:
        this.totalRequests > 0 ? Math.round(this.totalResponseTime / this.totalRequests) : 0,
    };
  }

  /**
   * Compares the current snapshot against thresholds.
   * @param {{minSuccessRate?: number, maxAvgResponseTime?: number, maxP95ResponseTime?: number}} [thresholds]
   *   Defaults: 80, 10000 and 30000 respectively.
   * @returns {boolean} Always true until at least 10 requests have been recorded.
   */
  isHealthy(thresholds = {}) {
    const snapshot = this.getSnapshot(0, 0);
    const minSuccessRate = thresholds.minSuccessRate ?? 80;
    const maxAvgResponseTime = thresholds.maxAvgResponseTime ?? 1e4;
    const maxP95ResponseTime = thresholds.maxP95ResponseTime ?? 30000;
    if (snapshot.totalRequests < 10) {
      return true;
    }
    return (
      snapshot.successRate >= minSuccessRate &&
      snapshot.avgResponseTime <= maxAvgResponseTime &&
      snapshot.p95ResponseTime <= maxP95ResponseTime
    );
  }

  /**
   * Renders the current snapshot as HELP/TYPE/value triplets, one per line.
   * @param {string} [prefix="crawler"] - Metric name prefix.
   * @returns {string} Newline-separated metric lines.
   */
  toPrometheusFormat(prefix = "crawler") {
    const snapshot = this.getSnapshot(0, 0);
    const metrics = [
      ["requests_total", "Total requests processed", "counter", snapshot.totalRequests],
      ["requests_success_total", "Total successful requests", "counter", snapshot.totalSuccesses],
      ["requests_failed_total", "Total failed requests", "counter", snapshot.totalFailures],
      ["requests_per_second", "Current requests per second", "gauge", snapshot.requestsPerSecond],
      ["success_rate_percent", "Success rate percentage", "gauge", snapshot.successRate],
      ["response_time_avg_ms", "Average response time", "gauge", snapshot.avgResponseTime],
      ["response_time_p95_ms", "P95 response time", "gauge", snapshot.p95ResponseTime],
      ["uptime_seconds", "Crawler uptime", "gauge", Math.round(snapshot.uptimeMs / 1000)],
    ];
    const lines = [];
    for (const [suffix, help, type, value] of metrics) {
      const metricName = `${prefix}_${suffix}`;
      lines.push(`# HELP ${metricName} ${help}`);
      lines.push(`# TYPE ${metricName} ${type}`);
      lines.push(`${metricName} ${value}`);
    }
    // The separator must be exactly one newline; anything else yields
    // output that is not line-oriented.
    return lines.join("\n");
  }

  /**
   * @param {number} [queueDepth=0]
   * @param {number} [activeRequests=0]
   * @returns {string} JSON-serialized snapshot.
   */
  toJSON(queueDepth = 0, activeRequests = 0) {
    return JSON.stringify(this.getSnapshot(queueDepth, activeRequests));
  }

  /** Clears the window and all lifetime counters; `startTime` is left as is. */
  reset() {
    this.samples = [];
    this.totalRequests = 0;
    this.totalSuccesses = 0;
    this.totalFailures = 0;
    this.totalResponseTime = 0;
  }
}

export { HealthMetrics as default, HealthMetrics };
|
|
@@ -1 +1 @@
|
|
|
1
|
-
var e=require("./file-cacher.cjs");exports.FileCacher=e.FileCacher;var r=require("./url-store.cjs");exports.UrlStore=r.UrlStore;
|
|
1
|
+
var e=require("./file-cacher.cjs");exports.FileCacher=e.FileCacher;var r=require("./url-store.cjs");exports.UrlStore=r.UrlStore;var o=require("./result-stream.cjs");exports.ResultStream=o.ResultStream;var t=require("./memory-monitor.cjs");exports.MemoryMonitor=t.MemoryMonitor;var a=require("./health-metrics.cjs");exports.HealthMetrics=a.HealthMetrics;var i=require("./capped-map.cjs");exports.CappedMap=i.CappedMap;var m=require("./capped-array.cjs");exports.CappedArray=m.CappedArray;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
import{FileCacher as
|
|
1
|
+
import{FileCacher as e}from"./file-cacher.js";import{UrlStore as p}from"./url-store.js";import{ResultStream as a}from"./result-stream.js";import{MemoryMonitor as x}from"./memory-monitor.js";import{HealthMetrics as M}from"./health-metrics.js";import{CappedMap as C}from"./capped-map.js";import{CappedArray as d}from"./capped-array.js";export{p as UrlStore,a as ResultStream,x as MemoryMonitor,M as HealthMetrics,e as FileCacher,C as CappedMap,d as CappedArray};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
const v8 = require("node:v8");

/**
 * Classifies current heap usage as "ok", "warning" or "critical" by comparing
 * `process.memoryUsage().heapUsed` against the V8 heap size limit, and can
 * poll that status on a timer to report transitions.
 */
class MemoryMonitor {
  warningThreshold;
  criticalThreshold;
  checkInterval;
  maxHeap;
  monitorInterval;
  lastStatus = "ok";

  /**
   * @param {{warningRatio?: number, criticalRatio?: number, checkInterval?: number}} [options]
   *   Ratios are fractions of the heap limit (defaults 0.7 and 0.85);
   *   `checkInterval` is the polling period in milliseconds (default 10000).
   */
  constructor(options = {}) {
    this.warningThreshold = options.warningRatio ?? 0.7;
    this.criticalThreshold = options.criticalRatio ?? 0.85;
    this.checkInterval = options.checkInterval ?? 1e4;
    this.maxHeap = v8.getHeapStatistics().heap_size_limit;
  }

  /** @returns {"ok"|"warning"|"critical"} Status for the current heap usage ratio. */
  check() {
    const usedRatio = process.memoryUsage().heapUsed / this.maxHeap;
    if (usedRatio >= this.criticalThreshold) {
      return "critical";
    }
    if (usedRatio >= this.warningThreshold) {
      return "warning";
    }
    return "ok";
  }

  /** @returns {object} Status plus heap/external/RSS figures rounded to whole megabytes. */
  getReport() {
    const usage = process.memoryUsage();
    const toMB = (bytes) => Math.round(bytes / 1024 / 1024);
    const usagePercent = (usage.heapUsed / this.maxHeap) * 100;
    return {
      status: this.check(),
      heapUsedMB: toMB(usage.heapUsed),
      heapTotalMB: toMB(usage.heapTotal),
      heapLimitMB: toMB(this.maxHeap),
      usagePercent: Math.round(usagePercent * 10) / 10,
      externalMB: toMB(usage.external),
      rssMB: toMB(usage.rss),
    };
  }

  /** @returns {number} Heap usage as a percentage of the heap limit (unrounded). */
  getUsagePercent() {
    return (process.memoryUsage().heapUsed / this.maxHeap) * 100;
  }

  /**
   * Triggers a garbage collection when `global.gc` is available.
   * @returns {boolean} True if `global.gc` was called, false otherwise.
   */
  forceGC() {
    if (typeof global.gc === "function") {
      global.gc();
      return true;
    }
    return false;
  }

  /**
   * Starts polling every `checkInterval` ms and invokes a handler whenever
   * the status changes. Does nothing if a monitor is already running.
   * @param {{onWarning?: Function, onCritical?: Function, onRecovered?: Function}} [handlers]
   *   Each handler receives the report from `getReport()`. Defaults to `{}`
   *   so that a call without arguments cannot throw from inside the timer.
   */
  startAutoMonitor(handlers = {}) {
    if (this.monitorInterval) {
      return;
    }
    this.monitorInterval = setInterval(() => {
      const report = this.getReport();
      const status = report.status;
      if (status === this.lastStatus) {
        return;
      }
      // Within this branch the status differs from the previous one, so a
      // transition to "ok" is by definition a recovery.
      if (status === "warning" && handlers.onWarning) {
        handlers.onWarning(report);
      } else if (status === "critical" && handlers.onCritical) {
        handlers.onCritical(report);
      } else if (status === "ok" && handlers.onRecovered) {
        handlers.onRecovered(report);
      }
      this.lastStatus = status;
    }, this.checkInterval);
    // Do not let the polling timer keep the process alive.
    if (this.monitorInterval.unref) {
      this.monitorInterval.unref();
    }
  }

  /** Stops the polling timer if one is running. */
  stopAutoMonitor() {
    if (this.monitorInterval) {
      clearInterval(this.monitorInterval);
      this.monitorInterval = undefined;
    }
  }

  /**
   * Suggests a concurrency level for the current memory status.
   * @param {number} maxConcurrency - Concurrency returned when status is "ok".
   * @param {number} [minConcurrency=5] - Floor applied under memory pressure.
   * @returns {number} `maxConcurrency` when ok; half of it (not below the
   *   floor) on warning; the floor on critical. Never exceeds `maxConcurrency`.
   */
  getRecommendedConcurrency(maxConcurrency, minConcurrency = 5) {
    // Clamp the floor so memory pressure can never raise concurrency above
    // what the caller asked for (e.g. max 2 with the default floor of 5).
    const floor = Math.min(maxConcurrency, minConcurrency);
    switch (this.check()) {
      case "critical":
        return floor;
      case "warning":
        return Math.max(floor, Math.floor(maxConcurrency * 0.5));
      default:
        return maxConcurrency;
    }
  }

  /** Releases the polling timer. */
  destroy() {
    this.stopAutoMonitor();
  }
}

exports.MemoryMonitor = MemoryMonitor;
exports.default = MemoryMonitor;
module.exports = Object.assign(MemoryMonitor, exports);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
import v8 from "node:v8";

/**
 * Classifies current heap usage as "ok", "warning" or "critical" by comparing
 * `process.memoryUsage().heapUsed` against the V8 heap size limit, and can
 * poll that status on a timer to report transitions.
 */
class MemoryMonitor {
  warningThreshold;
  criticalThreshold;
  checkInterval;
  maxHeap;
  monitorInterval;
  lastStatus = "ok";

  /**
   * @param {{warningRatio?: number, criticalRatio?: number, checkInterval?: number}} [options]
   *   Ratios are fractions of the heap limit (defaults 0.7 and 0.85);
   *   `checkInterval` is the polling period in milliseconds (default 10000).
   */
  constructor(options = {}) {
    this.warningThreshold = options.warningRatio ?? 0.7;
    this.criticalThreshold = options.criticalRatio ?? 0.85;
    this.checkInterval = options.checkInterval ?? 1e4;
    this.maxHeap = v8.getHeapStatistics().heap_size_limit;
  }

  /** @returns {"ok"|"warning"|"critical"} Status for the current heap usage ratio. */
  check() {
    const usedRatio = process.memoryUsage().heapUsed / this.maxHeap;
    if (usedRatio >= this.criticalThreshold) {
      return "critical";
    }
    if (usedRatio >= this.warningThreshold) {
      return "warning";
    }
    return "ok";
  }

  /** @returns {object} Status plus heap/external/RSS figures rounded to whole megabytes. */
  getReport() {
    const usage = process.memoryUsage();
    const toMB = (bytes) => Math.round(bytes / 1024 / 1024);
    const usagePercent = (usage.heapUsed / this.maxHeap) * 100;
    return {
      status: this.check(),
      heapUsedMB: toMB(usage.heapUsed),
      heapTotalMB: toMB(usage.heapTotal),
      heapLimitMB: toMB(this.maxHeap),
      usagePercent: Math.round(usagePercent * 10) / 10,
      externalMB: toMB(usage.external),
      rssMB: toMB(usage.rss),
    };
  }

  /** @returns {number} Heap usage as a percentage of the heap limit (unrounded). */
  getUsagePercent() {
    return (process.memoryUsage().heapUsed / this.maxHeap) * 100;
  }

  /**
   * Triggers a garbage collection when `global.gc` is available.
   * @returns {boolean} True if `global.gc` was called, false otherwise.
   */
  forceGC() {
    if (typeof global.gc === "function") {
      global.gc();
      return true;
    }
    return false;
  }

  /**
   * Starts polling every `checkInterval` ms and invokes a handler whenever
   * the status changes. Does nothing if a monitor is already running.
   * @param {{onWarning?: Function, onCritical?: Function, onRecovered?: Function}} [handlers]
   *   Each handler receives the report from `getReport()`. Defaults to `{}`
   *   so that a call without arguments cannot throw from inside the timer.
   */
  startAutoMonitor(handlers = {}) {
    if (this.monitorInterval) {
      return;
    }
    this.monitorInterval = setInterval(() => {
      const report = this.getReport();
      const status = report.status;
      if (status === this.lastStatus) {
        return;
      }
      // Within this branch the status differs from the previous one, so a
      // transition to "ok" is by definition a recovery.
      if (status === "warning" && handlers.onWarning) {
        handlers.onWarning(report);
      } else if (status === "critical" && handlers.onCritical) {
        handlers.onCritical(report);
      } else if (status === "ok" && handlers.onRecovered) {
        handlers.onRecovered(report);
      }
      this.lastStatus = status;
    }, this.checkInterval);
    // Do not let the polling timer keep the process alive.
    if (this.monitorInterval.unref) {
      this.monitorInterval.unref();
    }
  }

  /** Stops the polling timer if one is running. */
  stopAutoMonitor() {
    if (this.monitorInterval) {
      clearInterval(this.monitorInterval);
      this.monitorInterval = undefined;
    }
  }

  /**
   * Suggests a concurrency level for the current memory status.
   * @param {number} maxConcurrency - Concurrency returned when status is "ok".
   * @param {number} [minConcurrency=5] - Floor applied under memory pressure.
   * @returns {number} `maxConcurrency` when ok; half of it (not below the
   *   floor) on warning; the floor on critical. Never exceeds `maxConcurrency`.
   */
  getRecommendedConcurrency(maxConcurrency, minConcurrency = 5) {
    // Clamp the floor so memory pressure can never raise concurrency above
    // what the caller asked for (e.g. max 2 with the default floor of 5).
    const floor = Math.min(maxConcurrency, minConcurrency);
    switch (this.check()) {
      case "critical":
        return floor;
      case "warning":
        return Math.max(floor, Math.floor(maxConcurrency * 0.5));
      default:
        return maxConcurrency;
    }
  }

  /** Releases the polling timer. */
  destroy() {
    this.stopAutoMonitor();
  }
}

export { MemoryMonitor as default, MemoryMonitor };
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
var
|
|
1
|
+
var d=require("node:fs"),o=require("node:path"),{createHash:a}=require("node:crypto"),{createDatabase:E}=require("./sqlite-utils.cjs");class n{db=null;options;storeDir;dbPath;closed=!1;initPromise=null;constructor(s={}){if(this.options={storeDir:s.storeDir||"/tmp/rezo-crawler/navigation",dbFileName:s.dbFileName||"navigation.db",hashUrls:s.hashUrls??!1},this.storeDir=o.resolve(this.options.storeDir),this.dbPath=o.join(this.storeDir,this.options.dbFileName),!d.existsSync(this.storeDir))d.mkdirSync(this.storeDir,{recursive:!0})}static async create(s={}){let i=new n(s);return await i.initialize(),i}async initialize(){if(this.initPromise)return this.initPromise;return this.initPromise=(async()=>{this.db=await E(this.dbPath),this.db.run("PRAGMA journal_mode = WAL"),this.db.run("PRAGMA synchronous = NORMAL"),this.db.run("PRAGMA cache_size = -32000"),this.db.run("PRAGMA temp_store = MEMORY"),this.db.run("PRAGMA mmap_size = 134217728"),this.db.run(`
|
|
2
2
|
CREATE TABLE IF NOT EXISTS sessions (
|
|
3
3
|
sessionId TEXT PRIMARY KEY,
|
|
4
4
|
baseUrl TEXT NOT NULL,
|
|
@@ -37,7 +37,7 @@ var h=Object.create;var{getPrototypeOf:T,defineProperty:a,getOwnPropertyNames:c}
|
|
|
37
37
|
errorMessage TEXT,
|
|
38
38
|
UNIQUE(sessionId, urlKey)
|
|
39
39
|
)
|
|
40
|
-
`),this.db.run("CREATE INDEX IF NOT EXISTS idx_queue_session ON queue(sessionId)"),this.db.run("CREATE INDEX IF NOT EXISTS idx_queue_priority ON queue(sessionId, priority DESC)"),this.db.run("CREATE INDEX IF NOT EXISTS idx_visited_session ON visited(sessionId)"),this.db.run("CREATE INDEX IF NOT EXISTS idx_sessions_status ON sessions(status)")})(),this.initPromise}getUrlKey(s){if(this.options.hashUrls)return
|
|
41
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,s,
|
|
42
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,s,i,e
|
|
43
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,s,i,e
|
|
40
|
+
`),this.db.run("CREATE INDEX IF NOT EXISTS idx_queue_session ON queue(sessionId)"),this.db.run("CREATE INDEX IF NOT EXISTS idx_queue_priority ON queue(sessionId, priority DESC)"),this.db.run("CREATE INDEX IF NOT EXISTS idx_visited_session ON visited(sessionId)"),this.db.run("CREATE INDEX IF NOT EXISTS idx_sessions_status ON sessions(status)")})(),this.initPromise}getUrlKey(s){if(this.options.hashUrls)return a("sha256").update(s).digest("hex");return s}async createSession(s,i,e){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let t=Date.now(),r={sessionId:s,baseUrl:i,startedAt:t,lastActivityAt:t,status:"running",urlsVisited:0,urlsQueued:0,urlsFailed:0,metadata:e?JSON.stringify(e):void 0};return this.db.run(`INSERT OR REPLACE INTO sessions (sessionId, baseUrl, startedAt, lastActivityAt, status, urlsVisited, urlsQueued, urlsFailed, metadata)
|
|
41
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,s,i,t,t,"running",0,0,0,r.metadata??null),r}async getSession(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.get("SELECT * FROM sessions WHERE sessionId = ?",s)}async updateSessionStatus(s,i){if(this.closed||!this.db)throw Error("NavigationHistory is closed");this.db.run("UPDATE sessions SET status = ?, lastActivityAt = ? WHERE sessionId = ?",i,Date.now(),s)}async updateSessionStats(s,i){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let e=["lastActivityAt = ?"],t=[Date.now()];if(i.urlsVisited!==void 0)e.push("urlsVisited = ?"),t.push(i.urlsVisited);if(i.urlsQueued!==void 0)e.push("urlsQueued = ?"),t.push(i.urlsQueued);if(i.urlsFailed!==void 0)e.push("urlsFailed = ?"),t.push(i.urlsFailed);t.push(s),this.db.run(`UPDATE sessions SET ${e.join(", ")} WHERE sessionId = ?`,...t)}async addToQueue(s,i,e={}){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let t=this.getUrlKey(i);if(this.db.get("SELECT id FROM queue WHERE sessionId = ? AND urlKey = ?",s,t))return!1;if(this.db.get("SELECT id FROM visited WHERE sessionId = ? AND urlKey = ?",s,t))return!1;return this.db.run(`INSERT INTO queue (sessionId, urlKey, originalUrl, method, priority, body, headers, metadata, addedAt)
|
|
42
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,s,t,i,e.method||"GET",e.priority||0,e.body?JSON.stringify(e.body):null,e.headers?JSON.stringify(e.headers):null,e.metadata?JSON.stringify(e.metadata):null,Date.now()),!0}async getNextFromQueue(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.get("SELECT originalUrl as url, method, priority, body, headers, metadata, addedAt FROM queue WHERE sessionId = ? ORDER BY priority DESC, addedAt ASC LIMIT 1",s)}async removeFromQueue(s,i){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let e=this.getUrlKey(i);return this.db.run("DELETE FROM queue WHERE sessionId = ? AND urlKey = ?",s,e),!0}async getQueueSize(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.get("SELECT COUNT(*) as count FROM queue WHERE sessionId = ?",s)?.count||0}async markVisited(s,i,e={}){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let t=this.getUrlKey(i);this.db.run("DELETE FROM queue WHERE sessionId = ? AND urlKey = ?",s,t),this.db.run(`INSERT OR REPLACE INTO visited (sessionId, urlKey, originalUrl, status, visitedAt, finalUrl, contentType, errorMessage)
|
|
43
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,s,t,i,e.status||0,Date.now(),e.finalUrl??null,e.contentType??null,e.errorMessage??null)}async isVisited(s,i){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let e=this.getUrlKey(i);return!!this.db.get("SELECT id FROM visited WHERE sessionId = ? AND urlKey = ?",s,e)}async getVisitedCount(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.get("SELECT COUNT(*) as count FROM visited WHERE sessionId = ?",s)?.count||0}async getFailedUrls(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.all("SELECT url, status, visitedAt, finalUrl, contentType, errorMessage FROM visited WHERE sessionId = ? AND (status >= 400 OR errorMessage IS NOT NULL)",s)}async getAllQueuedUrls(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.all("SELECT originalUrl as url, method, priority, body, headers, metadata, addedAt FROM queue WHERE sessionId = ? ORDER BY priority DESC, addedAt ASC",s)}async clearQueue(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");this.db.run("DELETE FROM queue WHERE sessionId = ?",s)}async clearVisited(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");this.db.run("DELETE FROM visited WHERE sessionId = ?",s)}async deleteSession(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");this.db.run("DELETE FROM queue WHERE sessionId = ?",s),this.db.run("DELETE FROM visited WHERE sessionId = ?",s),this.db.run("DELETE FROM sessions WHERE sessionId = ?",s)}async getResumableSessions(){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.all("SELECT * FROM sessions WHERE status IN ('running', 'paused') ORDER BY lastActivityAt DESC")}async close(){if(this.closed)return;if(this.closed=!0,this.db){try{this.db.run("PRAGMA wal_checkpoint(TRUNCATE)")}catch{}try{this.db.close()}catch{}this.db=null}}get isClosed(){return this.closed}get 
databasePath(){return this.dbPath}}exports.NavigationHistory=n;
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import
|
|
1
|
+
import o from"node:fs";import d from"node:path";import{createHash as a}from"node:crypto";import{createDatabase as E}from"./sqlite-utils.js";class n{db=null;options;storeDir;dbPath;closed=!1;initPromise=null;constructor(s={}){if(this.options={storeDir:s.storeDir||"/tmp/rezo-crawler/navigation",dbFileName:s.dbFileName||"navigation.db",hashUrls:s.hashUrls??!1},this.storeDir=d.resolve(this.options.storeDir),this.dbPath=d.join(this.storeDir,this.options.dbFileName),!o.existsSync(this.storeDir))o.mkdirSync(this.storeDir,{recursive:!0})}static async create(s={}){let i=new n(s);return await i.initialize(),i}async initialize(){if(this.initPromise)return this.initPromise;return this.initPromise=(async()=>{this.db=await E(this.dbPath),this.db.run("PRAGMA journal_mode = WAL"),this.db.run("PRAGMA synchronous = NORMAL"),this.db.run("PRAGMA cache_size = -32000"),this.db.run("PRAGMA temp_store = MEMORY"),this.db.run("PRAGMA mmap_size = 134217728"),this.db.run(`
|
|
2
2
|
CREATE TABLE IF NOT EXISTS sessions (
|
|
3
3
|
sessionId TEXT PRIMARY KEY,
|
|
4
4
|
baseUrl TEXT NOT NULL,
|
|
@@ -37,7 +37,7 @@ import{createRequire as R}from"node:module";var T=Object.create;var{getPrototype
|
|
|
37
37
|
errorMessage TEXT,
|
|
38
38
|
UNIQUE(sessionId, urlKey)
|
|
39
39
|
)
|
|
40
|
-
`),this.db.run("CREATE INDEX IF NOT EXISTS idx_queue_session ON queue(sessionId)"),this.db.run("CREATE INDEX IF NOT EXISTS idx_queue_priority ON queue(sessionId, priority DESC)"),this.db.run("CREATE INDEX IF NOT EXISTS idx_visited_session ON visited(sessionId)"),this.db.run("CREATE INDEX IF NOT EXISTS idx_sessions_status ON sessions(status)")})(),this.initPromise}getUrlKey(s){if(this.options.hashUrls)return
|
|
40
|
+
`),this.db.run("CREATE INDEX IF NOT EXISTS idx_queue_session ON queue(sessionId)"),this.db.run("CREATE INDEX IF NOT EXISTS idx_queue_priority ON queue(sessionId, priority DESC)"),this.db.run("CREATE INDEX IF NOT EXISTS idx_visited_session ON visited(sessionId)"),this.db.run("CREATE INDEX IF NOT EXISTS idx_sessions_status ON sessions(status)")})(),this.initPromise}getUrlKey(s){if(this.options.hashUrls)return a("sha256").update(s).digest("hex");return s}async createSession(s,i,t){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let e=Date.now(),r={sessionId:s,baseUrl:i,startedAt:e,lastActivityAt:e,status:"running",urlsVisited:0,urlsQueued:0,urlsFailed:0,metadata:t?JSON.stringify(t):void 0};return this.db.run(`INSERT OR REPLACE INTO sessions (sessionId, baseUrl, startedAt, lastActivityAt, status, urlsVisited, urlsQueued, urlsFailed, metadata)
|
|
41
41
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,s,i,e,e,"running",0,0,0,r.metadata??null),r}async getSession(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.get("SELECT * FROM sessions WHERE sessionId = ?",s)}async updateSessionStatus(s,i){if(this.closed||!this.db)throw Error("NavigationHistory is closed");this.db.run("UPDATE sessions SET status = ?, lastActivityAt = ? WHERE sessionId = ?",i,Date.now(),s)}async updateSessionStats(s,i){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let t=["lastActivityAt = ?"],e=[Date.now()];if(i.urlsVisited!==void 0)t.push("urlsVisited = ?"),e.push(i.urlsVisited);if(i.urlsQueued!==void 0)t.push("urlsQueued = ?"),e.push(i.urlsQueued);if(i.urlsFailed!==void 0)t.push("urlsFailed = ?"),e.push(i.urlsFailed);e.push(s),this.db.run(`UPDATE sessions SET ${t.join(", ")} WHERE sessionId = ?`,...e)}async addToQueue(s,i,t={}){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let e=this.getUrlKey(i);if(this.db.get("SELECT id FROM queue WHERE sessionId = ? AND urlKey = ?",s,e))return!1;if(this.db.get("SELECT id FROM visited WHERE sessionId = ? AND urlKey = ?",s,e))return!1;return this.db.run(`INSERT INTO queue (sessionId, urlKey, originalUrl, method, priority, body, headers, metadata, addedAt)
|
|
42
42
|
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,s,e,i,t.method||"GET",t.priority||0,t.body?JSON.stringify(t.body):null,t.headers?JSON.stringify(t.headers):null,t.metadata?JSON.stringify(t.metadata):null,Date.now()),!0}async getNextFromQueue(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.get("SELECT originalUrl as url, method, priority, body, headers, metadata, addedAt FROM queue WHERE sessionId = ? ORDER BY priority DESC, addedAt ASC LIMIT 1",s)}async removeFromQueue(s,i){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let t=this.getUrlKey(i);return this.db.run("DELETE FROM queue WHERE sessionId = ? AND urlKey = ?",s,t),!0}async getQueueSize(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.get("SELECT COUNT(*) as count FROM queue WHERE sessionId = ?",s)?.count||0}async markVisited(s,i,t={}){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let e=this.getUrlKey(i);this.db.run("DELETE FROM queue WHERE sessionId = ? AND urlKey = ?",s,e),this.db.run(`INSERT OR REPLACE INTO visited (sessionId, urlKey, originalUrl, status, visitedAt, finalUrl, contentType, errorMessage)
|
|
43
|
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,s,e,i,t.status||0,Date.now(),t.finalUrl??null,t.contentType??null,t.errorMessage??null)}async isVisited(s,i){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let t=this.getUrlKey(i);return!!this.db.get("SELECT id FROM visited WHERE sessionId = ? AND urlKey = ?",s,t)}async getVisitedCount(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.get("SELECT COUNT(*) as count FROM visited WHERE sessionId = ?",s)?.count||0}async getFailedUrls(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.all("SELECT url, status, visitedAt, finalUrl, contentType, errorMessage FROM visited WHERE sessionId = ? AND (status >= 400 OR errorMessage IS NOT NULL)",s)}async getAllQueuedUrls(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.all("SELECT originalUrl as url, method, priority, body, headers, metadata, addedAt FROM queue WHERE sessionId = ? ORDER BY priority DESC, addedAt ASC",s)}async clearQueue(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");this.db.run("DELETE FROM queue WHERE sessionId = ?",s)}async clearVisited(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");this.db.run("DELETE FROM visited WHERE sessionId = ?",s)}async deleteSession(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");this.db.run("DELETE FROM queue WHERE sessionId = ?",s),this.db.run("DELETE FROM visited WHERE sessionId = ?",s),this.db.run("DELETE FROM sessions WHERE sessionId = ?",s)}async getResumableSessions(){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.all("SELECT * FROM sessions WHERE status IN ('running', 'paused') ORDER BY lastActivityAt DESC")}async close(){if(this.closed)return;if(this.closed=!0,this.db)this.db.close()
|
|
43
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`,s,e,i,t.status||0,Date.now(),t.finalUrl??null,t.contentType??null,t.errorMessage??null)}async isVisited(s,i){if(this.closed||!this.db)throw Error("NavigationHistory is closed");let t=this.getUrlKey(i);return!!this.db.get("SELECT id FROM visited WHERE sessionId = ? AND urlKey = ?",s,t)}async getVisitedCount(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.get("SELECT COUNT(*) as count FROM visited WHERE sessionId = ?",s)?.count||0}async getFailedUrls(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.all("SELECT url, status, visitedAt, finalUrl, contentType, errorMessage FROM visited WHERE sessionId = ? AND (status >= 400 OR errorMessage IS NOT NULL)",s)}async getAllQueuedUrls(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.all("SELECT originalUrl as url, method, priority, body, headers, metadata, addedAt FROM queue WHERE sessionId = ? ORDER BY priority DESC, addedAt ASC",s)}async clearQueue(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");this.db.run("DELETE FROM queue WHERE sessionId = ?",s)}async clearVisited(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");this.db.run("DELETE FROM visited WHERE sessionId = ?",s)}async deleteSession(s){if(this.closed||!this.db)throw Error("NavigationHistory is closed");this.db.run("DELETE FROM queue WHERE sessionId = ?",s),this.db.run("DELETE FROM visited WHERE sessionId = ?",s),this.db.run("DELETE FROM sessions WHERE sessionId = ?",s)}async getResumableSessions(){if(this.closed||!this.db)throw Error("NavigationHistory is closed");return this.db.all("SELECT * FROM sessions WHERE status IN ('running', 'paused') ORDER BY lastActivityAt DESC")}async close(){if(this.closed)return;if(this.closed=!0,this.db){try{this.db.run("PRAGMA wal_checkpoint(TRUNCATE)")}catch{}try{this.db.close()}catch{}this.db=null}}get isClosed(){return this.closed}get 
databasePath(){return this.dbPath}}export{n as NavigationHistory};
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
var{createWriteStream:h}=require("node:fs"),o=require("node:path"),r=require("node:fs");class n{stream=null;count=0;bytesWritten=0;fileIndex=0;options;closed=!1;constructor(t){this.options={outputPath:t.outputPath,format:t.format??"jsonl",flushInterval:t.flushInterval??100,maxFileSize:t.maxFileSize??0,csvHeaders:t.csvHeaders??[]};let e=o.dirname(this.options.outputPath);if(!r.existsSync(e))r.mkdirSync(e,{recursive:!0});this.openStream()}openStream(){let t=this.getFilePath();if(this.stream=h(t,{flags:"a"}),this.options.format==="csv"&&this.options.csvHeaders.length>0){let e=r.existsSync(t)?r.statSync(t):null;if(!e||e.size===0){let s=this.options.csvHeaders.join(",")+`
|
|
2
|
+
`;this.stream.write(s),this.bytesWritten+=Buffer.byteLength(s)}}}getFilePath(){if(this.options.maxFileSize<=0||this.fileIndex===0)return this.options.outputPath;let t=o.extname(this.options.outputPath);return`${this.options.outputPath.slice(0,-t.length)}.${this.fileIndex}${t}`}write(t){if(this.closed||!this.stream)return;let e;if(this.options.format==="csv")e=this.toCSV(t)+`
|
|
3
|
+
`;else e=JSON.stringify(t)+`
|
|
4
|
+
`;let s=Buffer.byteLength(e);if(this.stream.write(e),this.count++,this.bytesWritten+=s,this.count%this.options.flushInterval===0)this.stream.cork(),process.nextTick(()=>this.stream?.uncork());if(this.options.maxFileSize>0&&this.bytesWritten>=this.options.maxFileSize)this.rotate()}writeMany(t){for(let e of t)this.write(e)}toCSV(t){if(this.options.csvHeaders.length===0)this.options.csvHeaders=Object.keys(t);return this.options.csvHeaders.map((e)=>{let s=t[e];if(s===void 0||s===null)return"";let i=String(s);if(i.includes(",")||i.includes('"')||i.includes(`
|
|
5
|
+
`))return`"${i.replace(/"/g,'""')}"`;return i}).join(",")}rotate(){if(!this.stream)return;this.stream.end(),this.fileIndex++,this.bytesWritten=0,this.openStream()}flush(){if(this.stream&&!this.closed)this.stream.cork(),process.nextTick(()=>this.stream?.uncork())}async close(){if(this.closed)return;return this.closed=!0,new Promise((t,e)=>{if(!this.stream){t();return}this.stream.end((s)=>{if(s)e(s);else t()})})}get recordCount(){return this.count}get totalBytes(){return this.bytesWritten}get isClosed(){return this.closed}get outputPath(){return this.getFilePath()}}exports.ResultStream=n;exports.default=n;module.exports=Object.assign(n,exports);
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import { createWriteStream } from "node:fs";
import path from "node:path";
import fs from "node:fs";

/**
 * Corks a stream and uncorks that same stream on the next tick.
 * The stream is captured so a rotation in between cannot redirect the uncork
 * to a different stream.
 */
function corkUntilNextTick(stream) {
  stream.cork();
  process.nextTick(() => stream.uncork());
}

/**
 * Appends records to a file as JSON lines or CSV rows, optionally rotating to
 * numbered files once `maxFileSize` bytes have been written.
 */
class ResultStream {
  stream = null;
  count = 0;
  bytesWritten = 0;
  fileIndex = 0;
  options;
  closed = false;

  /**
   * Creates the output directory if needed and opens the first file.
   *
   * @param {object} config
   * @param {string} config.outputPath - File that receives the records.
   * @param {"jsonl"|"csv"} [config.format="jsonl"] - Any value other than "csv" writes JSON lines.
   * @param {number} [config.flushInterval=100] - Cork/uncork the stream every N records.
   * @param {number} [config.maxFileSize=0] - Rotate after this many bytes; 0 disables rotation.
   * @param {string[]} [config.csvHeaders=[]] - CSV column order; taken from the first record when empty.
   */
  constructor(config) {
    this.options = {
      outputPath: config.outputPath,
      format: config.format ?? "jsonl",
      flushInterval: config.flushInterval ?? 100,
      maxFileSize: config.maxFileSize ?? 0,
      csvHeaders: config.csvHeaders ?? [],
    };
    const directory = path.dirname(this.options.outputPath);
    if (!fs.existsSync(directory)) {
      fs.mkdirSync(directory, { recursive: true });
    }
    this.openStream();
  }

  /**
   * Opens the current target file in append mode. For CSV with known headers,
   * a header row is written when the file is missing or empty.
   */
  openStream() {
    const target = this.getFilePath();
    this.stream = createWriteStream(target, { flags: "a" });

    const { format, csvHeaders } = this.options;
    if (format !== "csv" || csvHeaders.length === 0) return;

    const stats = fs.existsSync(target) ? fs.statSync(target) : null;
    if (!stats || stats.size === 0) {
      const headerLine = `${csvHeaders.join(",")}\n`;
      this.stream.write(headerLine);
      this.bytesWritten += Buffer.byteLength(headerLine);
    }
  }

  /**
   * Returns the path of the file currently written to. Rotated files insert
   * the file index before the extension (`out.jsonl` -> `out.1.jsonl`); a path
   * without an extension simply gets the index appended (`out` -> `out.1`).
   *
   * @returns {string}
   */
  getFilePath() {
    const { outputPath, maxFileSize } = this.options;
    if (maxFileSize <= 0 || this.fileIndex === 0) return outputPath;

    const extension = path.extname(outputPath);
    // slice(0, -0) would yield "", so only strip when an extension exists.
    const stem = extension ? outputPath.slice(0, -extension.length) : outputPath;
    return `${stem}.${this.fileIndex}${extension}`;
  }

  /**
   * Serializes one record and appends it. Does nothing once closed.
   * @param {object} record
   */
  write(record) {
    if (this.closed || !this.stream) return;

    const line =
      this.options.format === "csv"
        ? `${this.toCSV(record)}\n`
        : `${JSON.stringify(record)}\n`;
    const size = Buffer.byteLength(line);

    this.stream.write(line);
    this.count++;
    this.bytesWritten += size;

    if (this.count % this.options.flushInterval === 0) {
      corkUntilNextTick(this.stream);
    }
    if (this.options.maxFileSize > 0 && this.bytesWritten >= this.options.maxFileSize) {
      this.rotate();
    }
  }

  /**
   * Writes every record of an iterable, in order.
   * @param {Iterable<object>} records
   */
  writeMany(records) {
    for (const record of records) this.write(record);
  }

  /**
   * Formats a record as one CSV row following `csvHeaders` (derived from the
   * record's own keys when no headers are configured yet). Fields containing
   * a comma, double quote, line feed or carriage return are quoted, with
   * embedded quotes doubled. `null`/`undefined` become empty fields.
   *
   * @param {object} record
   * @returns {string} The row without a trailing newline.
   */
  toCSV(record) {
    if (this.options.csvHeaders.length === 0) {
      this.options.csvHeaders = Object.keys(record);
    }
    return this.options.csvHeaders
      .map((column) => {
        const value = record[column];
        if (value === undefined || value === null) return "";
        const text = String(value);
        const needsQuoting =
          text.includes(",") || text.includes('"') || text.includes("\n") || text.includes("\r");
        return needsQuoting ? `"${text.replace(/"/g, '""')}"` : text;
      })
      .join(",");
  }

  /** Ends the current file and continues in the next numbered file. */
  rotate() {
    if (!this.stream) return;
    this.stream.end();
    this.fileIndex++;
    this.bytesWritten = 0;
    this.openStream();
  }

  /** Corks the stream and uncorks it on the next tick. */
  flush() {
    if (this.stream && !this.closed) {
      corkUntilNextTick(this.stream);
    }
  }

  /**
   * Marks the writer closed and ends the stream.
   * @returns {Promise<void>} Settles from the stream's `end` callback;
   *   rejects when that callback reports an error.
   */
  async close() {
    if (this.closed) return;
    this.closed = true;
    return new Promise((resolve, reject) => {
      if (!this.stream) {
        resolve();
        return;
      }
      this.stream.end((error) => {
        if (error) reject(error);
        else resolve();
      });
    });
  }

  /** Number of records written so far. */
  get recordCount() {
    return this.count;
  }

  /** Bytes written to the current file (reset on rotation). */
  get totalBytes() {
    return this.bytesWritten;
  }

  get isClosed() {
    return this.closed;
  }

  /** Path of the file currently written to. */
  get outputPath() {
    return this.getFilePath();
  }
}

export { ResultStream as default, ResultStream };
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
var h=Object.create;var{getPrototypeOf:y,defineProperty:l,getOwnPropertyNames:f}=Object;var d=Object.prototype.hasOwnProperty;var u=(t,a,r)=>{r=t!=null?h(y(t)):{};let s=a||!t||!t.__esModule?l(r,"default",{value:t,enumerable:!0}):r;for(let e of f(t))if(!d.call(s,e))l(s,e,{get:()=>t[e],enumerable:!0});return s};var x=exports.isBun=typeof globalThis.Bun<"u",m={maxRetries:5,initialDelayMs:50,maxDelayMs:1000};function w(t){if(!t)return!1;let a=t.message||t.toString();return a.includes("SQLITE_BUSY")||a.includes("database is locked")||a.includes("database table is locked")}function D(t){return new Promise((a)=>setTimeout(a,t))}function i(t,a={}){let r={...m,...a},s;for(let e=0;e<=r.maxRetries;e++)try{return t()}catch(n){if(s=n,!w(n)||e>=r.maxRetries)throw n;let c=Math.min(r.initialDelayMs*Math.pow(2,e)+Math.random()*50,r.maxDelayMs),o=Date.now();while(Date.now()-o<c);}throw s}async function M(t,a={}){let r={...m,...a},s;for(let e=0;e<=r.maxRetries;e++)try{return await t()}catch(n){if(s=n,!w(n)||e>=r.maxRetries)throw n;let c=Math.min(r.initialDelayMs*Math.pow(2,e)+Math.random()*50,r.maxDelayMs);await D(c)}throw s}async function R(t,a={}){if(x){let{Database:e}=await import("bun:sqlite"),n=new e(t);return{run:(c,...o)=>i(()=>n.run(c,...o),a),get:(c,...o)=>i(()=>n.query(c).get(...o),a),all:(c,...o)=>i(()=>n.query(c).all(...o),a),exec:(c)=>i(()=>n.exec(c),a),close:()=>n.close()}}let{DatabaseSync:r}=await import("node:sqlite"),s=new r(t);return{run:(e,...n)=>i(()=>{if(n.length===0)s.exec(e);else s.prepare(e).run(...n)},a),get:(e,...n)=>i(()=>{return s.prepare(e).get(...n)},a),all:(e,...n)=>i(()=>{return s.prepare(e).all(...n)},a),exec:(e)=>i(()=>s.exec(e),a),close:()=>s.close()}}exports.withRetrySync=i;exports.withRetryAsync=M;exports.createDatabase=R;
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Import kept from the bundled output; nothing in this module uses it.
import { createRequire } from "node:module";

/** True when a global `Bun` object exists. */
const isBun = typeof globalThis.Bun !== "undefined";

/** Retry settings used when the caller does not override them. */
const DEFAULT_RETRY_OPTIONS = { maxRetries: 5, initialDelayMs: 50, maxDelayMs: 1000 };

/**
 * Tells whether an error looks like SQLite lock contention, judged by its text.
 * @param {unknown} error
 * @returns {boolean}
 */
function isBusyError(error) {
  if (!error) return false;
  const text = String(error.message || error);
  return (
    text.includes("SQLITE_BUSY") ||
    text.includes("database is locked") ||
    text.includes("database table is locked")
  );
}

/**
 * Delay before the retry that follows failed attempt number `attempt`:
 * `initialDelayMs * 2^attempt` plus up to 50 ms of random jitter, capped at `maxDelayMs`.
 */
function backoffDelay(attempt, settings) {
  return Math.min(
    settings.initialDelayMs * Math.pow(2, attempt) + Math.random() * 50,
    settings.maxDelayMs,
  );
}

/** Resolves after `ms` milliseconds. */
function sleep(ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Blocks the current thread for `ms` milliseconds.
 * Uses `Atomics.wait` on a private buffer so the wait does not spin the CPU;
 * falls back to polling the clock where that is unavailable or not permitted.
 */
function sleepSync(ms) {
  if (!(ms > 0)) return;
  try {
    Atomics.wait(new Int32Array(new SharedArrayBuffer(4)), 0, 0, ms);
  } catch {
    const startedAt = Date.now();
    while (Date.now() - startedAt < ms);
  }
}

/**
 * Runs a synchronous operation, retrying with exponential backoff while it
 * fails with a busy/locked error. The operation always runs at least once.
 *
 * @param {() => T} operation
 * @param {{maxRetries?: number, initialDelayMs?: number, maxDelayMs?: number}} [options]
 * @returns {T} The operation's return value.
 * @throws The operation's error when it is not a busy error or retries are exhausted.
 * @template T
 */
function withRetrySync(operation, options = {}) {
  const settings = { ...DEFAULT_RETRY_OPTIONS, ...options };
  for (let attempt = 0; ; attempt++) {
    try {
      return operation();
    } catch (error) {
      // `!(a < b)` also stops on a NaN/undefined maxRetries instead of looping forever.
      if (!isBusyError(error) || !(attempt < settings.maxRetries)) throw error;
      sleepSync(backoffDelay(attempt, settings));
    }
  }
}

/**
 * Asynchronous counterpart of `withRetrySync`: awaits the operation and waits
 * between attempts with a timer.
 *
 * @param {() => T | Promise<T>} operation
 * @param {{maxRetries?: number, initialDelayMs?: number, maxDelayMs?: number}} [options]
 * @returns {Promise<T>}
 * @template T
 */
async function withRetryAsync(operation, options = {}) {
  const settings = { ...DEFAULT_RETRY_OPTIONS, ...options };
  for (let attempt = 0; ; attempt++) {
    try {
      return await operation();
    } catch (error) {
      if (!isBusyError(error) || !(attempt < settings.maxRetries)) throw error;
      await sleep(backoffDelay(attempt, settings));
    }
  }
}

/**
 * Opens a SQLite database and returns a small wrapper whose `run`, `get`,
 * `all` and `exec` calls go through `withRetrySync`.
 * Uses `bun:sqlite` when `isBun` is true and `node:sqlite` otherwise.
 *
 * @param {string} filename - Passed to the driver's database constructor.
 * @param {{maxRetries?: number, initialDelayMs?: number, maxDelayMs?: number}} [retryOptions]
 * @returns {Promise<{run: Function, get: Function, all: Function, exec: Function, close: Function}>}
 */
async function createDatabase(filename, retryOptions = {}) {
  if (isBun) {
    const { Database } = await import("bun:sqlite");
    const db = new Database(filename);
    return {
      run: (sql, ...params) => withRetrySync(() => db.run(sql, ...params), retryOptions),
      get: (sql, ...params) => withRetrySync(() => db.query(sql).get(...params), retryOptions),
      all: (sql, ...params) => withRetrySync(() => db.query(sql).all(...params), retryOptions),
      exec: (sql) => withRetrySync(() => db.exec(sql), retryOptions),
      close: () => db.close(),
    };
  }

  const { DatabaseSync } = await import("node:sqlite");
  const db = new DatabaseSync(filename);
  return {
    run: (sql, ...params) =>
      withRetrySync(() => {
        // Statements without parameters go through exec(); the rest are prepared.
        if (params.length === 0) db.exec(sql);
        else db.prepare(sql).run(...params);
      }, retryOptions),
    get: (sql, ...params) => withRetrySync(() => db.prepare(sql).get(...params), retryOptions),
    all: (sql, ...params) => withRetrySync(() => db.prepare(sql).all(...params), retryOptions),
    exec: (sql) => withRetrySync(() => db.exec(sql), retryOptions),
    close: () => db.close(),
  };
}

export { withRetrySync, withRetryAsync, isBun, createDatabase };
|
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
var
|
|
1
|
+
var T=require("node:fs"),R=require("node:path"),{createHash:A}=require("node:crypto"),{createDatabase:b}=require("./sqlite-utils.cjs");class c{db=null;options;storeDir;dbPath;closed=!1;initPromise=null;constructor(t={}){if(this.options={storeDir:t.storeDir||"/tmp/rezo-crawler/urls",dbFileName:t.dbFileName||"urls.db",ttl:t.ttl||604800000,maxUrls:t.maxUrls??0,hashUrls:t.hashUrls??!0},this.storeDir=R.resolve(this.options.storeDir),this.dbPath=R.join(this.storeDir,this.options.dbFileName),!T.existsSync(this.storeDir))T.mkdirSync(this.storeDir,{recursive:!0})}static async create(t={}){let s=new c(t);return await s.initialize(),s}async initialize(){if(this.initPromise)return this.initPromise;return this.initPromise=(async()=>{this.db=await b(this.dbPath),this.db.exec("PRAGMA journal_mode = WAL"),this.db.exec("PRAGMA synchronous = NORMAL"),this.db.exec("PRAGMA cache_size = -32000"),this.db.exec("PRAGMA temp_store = MEMORY"),this.db.exec("PRAGMA mmap_size = 134217728"),this.db.exec(`
|
|
2
2
|
CREATE TABLE IF NOT EXISTS urls (
|
|
3
3
|
urlHash TEXT PRIMARY KEY,
|
|
4
4
|
visitedAt INTEGER NOT NULL,
|
|
5
5
|
expiresAt INTEGER NOT NULL,
|
|
6
6
|
namespace TEXT NOT NULL DEFAULT 'default'
|
|
7
7
|
) WITHOUT ROWID
|
|
8
|
-
`),this.db.exec("CREATE INDEX IF NOT EXISTS idx_ns_exp ON urls(namespace, expiresAt)")})(),this.initPromise}getUrlKey(t){if(this.options.hashUrls)return
|
|
8
|
+
`),this.db.exec("CREATE INDEX IF NOT EXISTS idx_ns_exp ON urls(namespace, expiresAt)")})(),this.initPromise}getUrlKey(t){if(this.options.hashUrls)return A("sha256").update(t).digest("hex");return t}async set(t,s="default",e){if(this.closed)throw Error("UrlStore is closed");await this.initialize();let i=this.getUrlKey(t),r=Date.now(),a=r+(e??this.options.ttl);if(this.db.run(`
|
|
9
9
|
INSERT OR REPLACE INTO urls (urlHash, visitedAt, expiresAt, namespace)
|
|
10
10
|
VALUES (?, ?, ?, ?)
|
|
11
|
-
`,i,
|
|
11
|
+
`,i,r,a,s),this.options.maxUrls>0){let l=this.db.get("SELECT COUNT(*) as cnt FROM urls");if(l&&l.cnt>this.options.maxUrls){let n=l.cnt-this.options.maxUrls;this.db.run(`
|
|
12
12
|
DELETE FROM urls WHERE urlHash IN (
|
|
13
13
|
SELECT urlHash FROM urls ORDER BY visitedAt ASC LIMIT ?
|
|
14
14
|
)
|
|
15
|
-
`,
|
|
15
|
+
`,n)}}}async setMany(t,s="default",e){if(this.closed)throw Error("UrlStore is closed");if(t.length===0)return;await this.initialize();let i=Date.now(),r=i+(e??this.options.ttl);this.db.exec("BEGIN TRANSACTION");try{for(let a of t){let l=this.getUrlKey(a);this.db.run(`
|
|
16
16
|
INSERT OR REPLACE INTO urls (urlHash, visitedAt, expiresAt, namespace)
|
|
17
17
|
VALUES (?, ?, ?, ?)
|
|
18
|
-
`,
|
|
18
|
+
`,l,i,r,s)}this.db.exec("COMMIT")}catch(a){throw this.db.exec("ROLLBACK"),a}}async has(t,s){if(this.closed)return!1;await this.initialize();let e=this.getUrlKey(t),i=Date.now(),r;if(s)r=this.db.get("SELECT expiresAt FROM urls WHERE urlHash = ? AND namespace = ?",e,s);else r=this.db.get("SELECT expiresAt FROM urls WHERE urlHash = ?",e);if(!r)return!1;return r.expiresAt>=i}async hasMany(t,s){if(this.closed)return new Set;if(t.length===0)return new Set;await this.initialize();let e=new Set,i=Date.now(),r=new Map;for(let n of t)r.set(this.getUrlKey(n),n);let a=Array.from(r.keys()),l=500;for(let n=0;n<a.length;n+=l){let h=a.slice(n,n+l),E=h.map(()=>"?").join(","),o;if(s)o=this.db.all(`SELECT urlHash, expiresAt FROM urls WHERE urlHash IN (${E}) AND namespace = ?`,...h,s);else o=this.db.all(`SELECT urlHash, expiresAt FROM urls WHERE urlHash IN (${E})`,...h);for(let u of o)if(u.expiresAt>=i){let d=r.get(u.urlHash);if(d)e.add(d)}}return e}async filterUnvisited(t,s){let e=await this.hasMany(t,s);return t.filter((i)=>!e.has(i))}async delete(t,s){if(this.closed)return!1;await this.initialize();let e=this.getUrlKey(t);if(s)this.db.run("DELETE FROM urls WHERE urlHash = ? AND namespace = ?",e,s);else this.db.run("DELETE FROM urls WHERE urlHash = ?",e);return!0}async clear(t){if(this.closed)return;if(await this.initialize(),t)this.db.run("DELETE FROM urls WHERE namespace = ?",t);else this.db.exec("DELETE FROM urls")}async cleanup(){if(this.closed)return 0;await this.initialize();let t=Date.now(),s=this.db.get("SELECT COUNT(*) as cnt FROM urls");this.db.run("DELETE FROM urls WHERE expiresAt < ?",t);let e=this.db.get("SELECT COUNT(*) as cnt FROM urls");return(s?.cnt||0)-(e?.cnt||0)}async count(t){if(this.closed)return 0;await this.initialize();let s=Date.now();if(t)return this.db.get("SELECT COUNT(*) as cnt FROM urls WHERE namespace = ? 
AND expiresAt >= ?",t,s)?.cnt||0;return this.db.get("SELECT COUNT(*) as cnt FROM urls WHERE expiresAt >= ?",s)?.cnt||0}async stats(t){if(this.closed)return{total:0,expired:0,namespaces:0};await this.initialize();let s=Date.now(),e,i;if(t)e=this.db.get("SELECT COUNT(*) as cnt FROM urls WHERE namespace = ?",t),i=this.db.get("SELECT COUNT(*) as cnt FROM urls WHERE namespace = ? AND expiresAt < ?",t,s);else e=this.db.get("SELECT COUNT(*) as cnt FROM urls"),i=this.db.get("SELECT COUNT(*) as cnt FROM urls WHERE expiresAt < ?",s);let r=this.db.get("SELECT COUNT(DISTINCT namespace) as cnt FROM urls");return{total:e?.cnt||0,expired:i?.cnt||0,namespaces:r?.cnt||0}}async close(){if(this.closed)return;if(this.closed=!0,await this.initPromise,this.db){try{this.db.exec("PRAGMA wal_checkpoint(TRUNCATE)")}catch{}try{this.db.close()}catch{}this.db=null}}get isClosed(){return this.closed}get path(){return this.dbPath}}exports.UrlStore=c;exports.default=c;module.exports=Object.assign(c,exports);
|
|
@@ -1,18 +1,18 @@
|
|
|
1
|
-
import
|
|
1
|
+
import T from"node:fs";import R from"node:path";import{createHash as A}from"node:crypto";import{createDatabase as f}from"./sqlite-utils.js";class c{db=null;options;storeDir;dbPath;closed=!1;initPromise=null;constructor(t={}){if(this.options={storeDir:t.storeDir||"/tmp/rezo-crawler/urls",dbFileName:t.dbFileName||"urls.db",ttl:t.ttl||604800000,maxUrls:t.maxUrls??0,hashUrls:t.hashUrls??!0},this.storeDir=R.resolve(this.options.storeDir),this.dbPath=R.join(this.storeDir,this.options.dbFileName),!T.existsSync(this.storeDir))T.mkdirSync(this.storeDir,{recursive:!0})}static async create(t={}){let s=new c(t);return await s.initialize(),s}async initialize(){if(this.initPromise)return this.initPromise;return this.initPromise=(async()=>{this.db=await f(this.dbPath),this.db.exec("PRAGMA journal_mode = WAL"),this.db.exec("PRAGMA synchronous = NORMAL"),this.db.exec("PRAGMA cache_size = -32000"),this.db.exec("PRAGMA temp_store = MEMORY"),this.db.exec("PRAGMA mmap_size = 134217728"),this.db.exec(`
|
|
2
2
|
CREATE TABLE IF NOT EXISTS urls (
|
|
3
3
|
urlHash TEXT PRIMARY KEY,
|
|
4
4
|
visitedAt INTEGER NOT NULL,
|
|
5
5
|
expiresAt INTEGER NOT NULL,
|
|
6
6
|
namespace TEXT NOT NULL DEFAULT 'default'
|
|
7
7
|
) WITHOUT ROWID
|
|
8
|
-
`),this.db.exec("CREATE INDEX IF NOT EXISTS idx_ns_exp ON urls(namespace, expiresAt)")})(),this.initPromise}getUrlKey(t){if(this.options.hashUrls)return
|
|
8
|
+
`),this.db.exec("CREATE INDEX IF NOT EXISTS idx_ns_exp ON urls(namespace, expiresAt)")})(),this.initPromise}getUrlKey(t){if(this.options.hashUrls)return A("sha256").update(t).digest("hex");return t}async set(t,s="default",e){if(this.closed)throw Error("UrlStore is closed");await this.initialize();let i=this.getUrlKey(t),r=Date.now(),a=r+(e??this.options.ttl);if(this.db.run(`
|
|
9
9
|
INSERT OR REPLACE INTO urls (urlHash, visitedAt, expiresAt, namespace)
|
|
10
10
|
VALUES (?, ?, ?, ?)
|
|
11
|
-
`,i,
|
|
11
|
+
`,i,r,a,s),this.options.maxUrls>0){let l=this.db.get("SELECT COUNT(*) as cnt FROM urls");if(l&&l.cnt>this.options.maxUrls){let n=l.cnt-this.options.maxUrls;this.db.run(`
|
|
12
12
|
DELETE FROM urls WHERE urlHash IN (
|
|
13
13
|
SELECT urlHash FROM urls ORDER BY visitedAt ASC LIMIT ?
|
|
14
14
|
)
|
|
15
|
-
`,
|
|
15
|
+
`,n)}}}async setMany(t,s="default",e){if(this.closed)throw Error("UrlStore is closed");if(t.length===0)return;await this.initialize();let i=Date.now(),r=i+(e??this.options.ttl);this.db.exec("BEGIN TRANSACTION");try{for(let a of t){let l=this.getUrlKey(a);this.db.run(`
|
|
16
16
|
INSERT OR REPLACE INTO urls (urlHash, visitedAt, expiresAt, namespace)
|
|
17
17
|
VALUES (?, ?, ?, ?)
|
|
18
|
-
`,
|
|
18
|
+
`,l,i,r,s)}this.db.exec("COMMIT")}catch(a){throw this.db.exec("ROLLBACK"),a}}async has(t,s){if(this.closed)return!1;await this.initialize();let e=this.getUrlKey(t),i=Date.now(),r;if(s)r=this.db.get("SELECT expiresAt FROM urls WHERE urlHash = ? AND namespace = ?",e,s);else r=this.db.get("SELECT expiresAt FROM urls WHERE urlHash = ?",e);if(!r)return!1;return r.expiresAt>=i}async hasMany(t,s){if(this.closed)return new Set;if(t.length===0)return new Set;await this.initialize();let e=new Set,i=Date.now(),r=new Map;for(let n of t)r.set(this.getUrlKey(n),n);let a=Array.from(r.keys()),l=500;for(let n=0;n<a.length;n+=l){let o=a.slice(n,n+l),E=o.map(()=>"?").join(","),h;if(s)h=this.db.all(`SELECT urlHash, expiresAt FROM urls WHERE urlHash IN (${E}) AND namespace = ?`,...o,s);else h=this.db.all(`SELECT urlHash, expiresAt FROM urls WHERE urlHash IN (${E})`,...o);for(let u of h)if(u.expiresAt>=i){let d=r.get(u.urlHash);if(d)e.add(d)}}return e}async filterUnvisited(t,s){let e=await this.hasMany(t,s);return t.filter((i)=>!e.has(i))}async delete(t,s){if(this.closed)return!1;await this.initialize();let e=this.getUrlKey(t);if(s)this.db.run("DELETE FROM urls WHERE urlHash = ? AND namespace = ?",e,s);else this.db.run("DELETE FROM urls WHERE urlHash = ?",e);return!0}async clear(t){if(this.closed)return;if(await this.initialize(),t)this.db.run("DELETE FROM urls WHERE namespace = ?",t);else this.db.exec("DELETE FROM urls")}async cleanup(){if(this.closed)return 0;await this.initialize();let t=Date.now(),s=this.db.get("SELECT COUNT(*) as cnt FROM urls");this.db.run("DELETE FROM urls WHERE expiresAt < ?",t);let e=this.db.get("SELECT COUNT(*) as cnt FROM urls");return(s?.cnt||0)-(e?.cnt||0)}async count(t){if(this.closed)return 0;await this.initialize();let s=Date.now();if(t)return this.db.get("SELECT COUNT(*) as cnt FROM urls WHERE namespace = ? 
AND expiresAt >= ?",t,s)?.cnt||0;return this.db.get("SELECT COUNT(*) as cnt FROM urls WHERE expiresAt >= ?",s)?.cnt||0}async stats(t){if(this.closed)return{total:0,expired:0,namespaces:0};await this.initialize();let s=Date.now(),e,i;if(t)e=this.db.get("SELECT COUNT(*) as cnt FROM urls WHERE namespace = ?",t),i=this.db.get("SELECT COUNT(*) as cnt FROM urls WHERE namespace = ? AND expiresAt < ?",t,s);else e=this.db.get("SELECT COUNT(*) as cnt FROM urls"),i=this.db.get("SELECT COUNT(*) as cnt FROM urls WHERE expiresAt < ?",s);let r=this.db.get("SELECT COUNT(DISTINCT namespace) as cnt FROM urls");return{total:e?.cnt||0,expired:i?.cnt||0,namespaces:r?.cnt||0}}async close(){if(this.closed)return;if(this.closed=!0,await this.initPromise,this.db){try{this.db.exec("PRAGMA wal_checkpoint(TRUNCATE)")}catch{}try{this.db.close()}catch{}this.db=null}}get isClosed(){return this.closed}get path(){return this.dbPath}}var w=c;export{w as default,c as UrlStore};
|
package/dist/crawler/scraper.cjs
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
var{parseHTML:w}=require("linkedom");class x extends Set{maxSize;constructor(e){super();this.maxSize=e}add(e){if(this.has(e))return this;if(this.size>=this.maxSize){let a=this.values().next().value;if(a)this.delete(a)}return super.add(e)}}class v{http;httpOptions;onEmailLeads;onEmailDiscovered;debug;discoveredEmails=new x(1e4);userAgents=[];fileExtensions=[];restrictedDomains=$();forbiddenProtocols=["mailto:","tel:","javascript:","data:","sms:","ftp:","file:","irc:","blob:","chrome:","about:","intent:"];constructor(e,a,n,o,t=!1){this.http=e,this.httpOptions=a,this.onEmailLeads=n,this.onEmailDiscovered=o,this.debug=t,this.userAgents=y()}sleep(e){return new Promise((a)=>setTimeout(a,e))}async executeHttp(e,a,n,o,t,i=0){let{getCache:r,saveCache:l,hasUrlInCache:d,saveUrl:c,httpConfig:s={}}=o;if(!e||e.length<3||this.forbiddenProtocols.some((u)=>e.startsWith(u)))return;try{let u=t?!1:await d(e),h=await r(e);if(u&&!h)return!1;if(u&&a!=="GET")return!1;let m=h&&a==="GET"?h:await(a==="GET"?this.http.get(e,s):a==="PATCH"?this.http.patch(e,n,s):a==="POST"?this.http.post(e,n,s):this.http.put(e,n,s));if(!h)await l(e,{data:m.data,contentType:m.contentType,finalUrl:m.finalUrl});if(!u)await c(e);if(!m.contentType||!m.contentType.includes("/html")||!m.contentType.includes("text/")||typeof m.data!=="string")return null;return{data:m.data,contentType:m.contentType,finalUrl:m.finalUrl}}catch(u){let h=u,m=this.httpOptions;if(h&&h.response){let g=h.response.status,p=m.retryDelay||100,b=m.maxRetryAttempts||3,f=m.retryWithoutProxyOnStatusCode||void 0,k=m.maxRetryOnProxyError||3;if(f&&s.proxy&&f.includes(g)&&i<b)return await this.sleep(p),delete s.proxy,await this.executeHttp(e,a,n,o,t,i+1);else if(m.retryOnStatusCode&&s.proxy&&m.retryOnStatusCode.includes(g)&&i<b)return await this.sleep(p),await this.executeHttp(e,a,n,o,t,i+1);else if(m.retryOnProxyError&&s.proxy&&i<k)return await this.sleep(p),await this.executeHttp(e,a,n,o,t,i+1)}if(this.debug){if(this.debug)console.log(`Error: unable to 
${a} ${e}: ${u.message}`)}return null}}extractEmails(e,a,n,o,t,i){let r=this.extractEmailsFromContent(e?.replaceAll("mailto:"," ")),l=[];for(let d of r)if(this.handleEmailDiscovery(d,a,n,t,i))l.push(d);if(o&&o.length>0&&l.length>0)t.add(async()=>Promise.all(o.map((d)=>d(l))));r.length=0,l.length=0}async parseExternalWebsite(e,a,n,o,t,i=!0,r,l){let d=o.httpConfig?.headers?o.httpConfig.headers instanceof Headers?Object.fromEntries(o.httpConfig.headers.entries()):o.httpConfig.headers:{};o.httpConfig=o.httpConfig||{},o.httpConfig.headers={"user-agent":this.getRandomUserAgent(),accept:"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8","accept-language":"en-US,en;q=0.9","cache-control":"no-cache",pragma:"no-cache",...d},o.httpConfig.timeout=o.httpConfig.timeout||15000,o.depth=o.depth||0,o.allowCrossDomainTravel=o.allowCrossDomainTravel||!1,t=t&&i;let c=[];try{let s=new URL(e),u=this.extractRootDomain(e);if(this.isLinktreeUrl(e))return await this.parseLinktreeProfile(e,o,t);if(this.isRestrictedDomain(e)){if(this.debug)console.warn(`⚠️ Skipped URL (restricted url): ${e}`);return c}let h=await this.executeHttp(e,a,n,o,t);if(!h){if(this.debug&&h===null)console.warn(`⚠️ Failed to fetch page content: ${e}`);if(this.debug&&h===!1)console.warn(`⚠️ Skipped URL (already visited): ${e}`);return c}let m=this.extractEmailsFromContent(h.data?.replaceAll("mailto:"," "));for(let g of m)if(this.handleEmailDiscovery(g,e,o.onEmailDiscovered,o.queue,o.emailMetadata))c.push(g);if(o.depth>0||!r){let g=w(h.data).document,p=this.extractRelevantLinks(g,s,u,o.depth,o.allowCrossDomainTravel);o.depth--;let b=await Promise.allSettled(p.map((f)=>this.parseExternalWebsite(f,"GET",null,{...o,depth:o.depth},t,!1,!0)));for(let f of b)if(f.status==="fulfilled")c.push(...f.value);else if(this.debug)console.warn("⚠️ Failed to parse child URL:",f.reason?.message)}}catch(s){if(this.debug)console.error(`❌ Error parsing external website: 
${e}`,s?.message)}if(i){if(o.onEmails&&o.onEmails.length>0)o.queue.add(async()=>Promise.all(o.onEmails.map((s)=>s(c))))}return c}async parseLinktreeProfile(e,a,n){let o=[];try{let t=await this.executeHttp(e,"GET",null,a,n);if(!t){if(this.debug)console.warn(`⚠️ Failed to fetch Linktree profile: ${e}`);return o}let r=w(t).document.getElementById("links-container");if(!r){if(this.debug)console.warn(`\uD83D\uDD0D No links container found in Linktree profile: ${e}`);return o}let l=this.extractLinktreeExternalUrls(r,e);if(l.length===0){if(this.debug)console.info("\uD83D\uDCED No valid external links found in Linktree profile");return o}if(this.debug)console.info(`\uD83C\uDFAF Found ${l.length} external links in Linktree profile`);let d=await Promise.allSettled(l.map((c)=>this.parseExternalWebsite(c,"GET",null,a,n,!1)));for(let c of d)if(c.status==="fulfilled")o.push(...c.value);else if(this.debug)console.warn("⚠️ Failed to parse Linktree external URL:",c.reason?.message)}catch(t){if(this.debug)console.error(`❌ Error parsing Linktree profile: ${e}`,t?.message)}return o}extractLinktreeExternalUrls(e,a){let n=new Set,o=e.querySelectorAll("a[href][target='_blank']");for(let t of o){let i=t.getAttribute("href");if(!i||i.length<3||this.forbiddenProtocols.some((r)=>i.startsWith(r)))continue;try{let r=new URL(i,a).href,l=this.extractRootDomain(r);if(l!=="linktr.ee"&&!this.isRestrictedDomain(r)&&l.length>3)n.add(r)}catch(r){if(this.debug)console.warn(`\uD83D\uDD17 Invalid URL in Linktree: ${i}`)}}return Array.from(n)}handleEmailDiscovery(e,a,n,o,t){if(!this.discoveredEmails.has(e)){this.discoveredEmails.add(e);let i={email:e,discoveredAt:a,timestamp:new Date,metadata:t||{}};if(n&&n.length>0)o.add(async()=>Promise.all(n.map((r)=>r(i))));if(this.debug)console.info(`\uD83D\uDCE7 New email discovered: ${e} at ${a}`);return!0}return!1}isDomainAccessAllowed(e,a,n,o){if(o)return!0;if(n===0)return e===a;return 
e===a||e.endsWith(`.${a}`)||a.endsWith(`.${e}`)}extractRelevantLinks(e,a,n,o,t){let i=[],r=["about","contact","help","support","reach","email","mail","message","company","team","staff","info","inquiry","feedback","service","assistance","connect","touch"],l=e.querySelectorAll("a[href]");for(let d of l){let c=d.getAttribute("href");if(!c||c.length<2)continue;try{let s=this.normalizeUrl(c,a),u=this.extractRootDomain(s);if(!this.isDomainAccessAllowed(u,n,o,t))continue;if(r.some((m)=>s.toLowerCase().includes(m))||this.isLinktreeUrl(s))i.push(s)}catch(s){if(this.debug)console.warn(`\uD83D\uDD17 Invalid link found: ${c}`,s?.message)}}return i}extractEmailsFromContent(e){let a=e.replace(/[^\w@.-\s]/g," "),n=/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g,o=(l)=>{let d=/^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$/,c=l.split("@")[1]?.toLowerCase(),s=l.split(".").pop()?.toLowerCase();return d.test(l)&&c!==void 0&&s!==void 0&&!this.fileExtensions.includes(`.${s}`)&&!this.isRestrictedDomain(`https://${c}`)},t=(l)=>{return(l.match(n)||[]).filter(o)},i=e.replace(/<[^>]*>/g," "),r=[...t(i),...t(a)];return[...new Set(r)]}isRestrictedDomain(e){try{let a=new URL(e).host.toLowerCase();return this.restrictedDomains.some((n)=>a===n.toLowerCase()||a.endsWith(`.${n.toLowerCase()}`))}catch{return!0}}isLinktreeUrl(e){try{return this.extractRootDomain(e)==="linktr.ee"}catch{return!1}}extractRootDomain(e){try{let n=new URL(e).hostname.toLowerCase();return n.startsWith("www.")?n.slice(4):n}catch{return""}}normalizeUrl(e,a){if(e.startsWith("http://")||e.startsWith("https://"))return e;if(e.startsWith("//"))return`${a.protocol}${e}`;return new URL(e,a.href).href}getRandomUserAgent(){return this.userAgents[Math.floor(Math.random()*this.userAgents.length)]}}function y(){let 
e=[{name:"Chrome",version:"91.0.4472.124",engine:"AppleWebKit/537.36"},{name:"Firefox",version:"89.0",engine:"Gecko/20100101"},{name:"Safari",version:"14.1.1",engine:"AppleWebKit/605.1.15"},{name:"Edge",version:"91.0.864.59",engine:"AppleWebKit/537.36"},{name:"Opera",version:"77.0.4054.277",engine:"AppleWebKit/537.36"},{name:"Vivaldi",version:"3.8.2259.42",engine:"AppleWebKit/537.36"},{name:"Brave",version:"1.26.74",engine:"AppleWebKit/537.36"},{name:"Chromium",version:"91.0.4472.101",engine:"AppleWebKit/537.36"},{name:"Yandex",version:"21.5.3.742",engine:"AppleWebKit/537.36"},{name:"Maxthon",version:"5.3.8.2000",engine:"AppleWebKit/537.36"}],a=["Windows NT 10.0","Windows NT 6.1","Macintosh; Intel Mac OS X 10_15_7","Macintosh; Intel Mac OS X 11_4_0","X11; Linux x86_64","X11; Ubuntu; Linux x86_64"],n=[];for(let o=0;o<200;o++){let t=e[Math.floor(Math.random()*e.length)],i=a[Math.floor(Math.random()*a.length)],r="";switch(t.name){case"Chrome":r=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36`;break;case"Firefox":r=`Mozilla/5.0 (${i}; rv:${t.version}) ${t.engine} Firefox/${t.version}`;break;case"Safari":r=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Version/${t.version} Safari/605.1.15`;break;case"Edge":r=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 Edg/${t.version}`;break;case"Opera":r=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 OPR/${t.version}`;break;case"Vivaldi":r=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 Vivaldi/${t.version}`;break;case"Brave":r=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 Brave/${t.version}`;break;case"Chromium":r=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chromium/${t.version} Chrome/${t.version} Safari/537.36`;break;case"Yandex":r=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} YaBrowser/${t.version} 
Safari/537.36`;break;case"Maxthon":r=`Mozilla/5.0 (${i}) ${t.engine} (KHTML, like Gecko) Chrome/${t.version} Safari/537.36 Maxthon/${t.version}`;break}n.push(r)}return n}function $(){return["facebook.com","fb.com","messenger.com","instagram.com","threads.net","twitter.com","x.com","linkedin.com","pinterest.com","pin.it","reddit.com","tumblr.com","snapchat.com","tiktok.com","douyin.com","youtube.com","youtu.be","whatsapp.com","telegram.org","t.me","medium.com","quora.com","flickr.com","vimeo.com","vk.com","weibo.com","sina.com.cn","line.me","discord.com","discordapp.com","twitch.tv","meetup.com","nextdoor.com","xing.com","yelp.com","zalo.me","mastodon.social","clubhouse.com","patreon.com","onlyfans.com","douban.com","goodreads.com","soundcloud.com","spotify.com","last.fm","behance.net","dribbble.com","deviantart.com","pixiv.net","slideshare.net","tinder.com","bumble.com","etsy.com","indeed.com","glassdoor.com","monster.com","careerbuilder.com","dice.com","ziprecruiter.com","simplyhired.com","upwork.com","freelancer.com","fiverr.com","stackoverflow.com","stackoverflow.co","angel.co","wellfound.com","quora.com","stackexchange.com","yahoo.com","answers.microsoft.com","askubuntu.com","superuser.com","serverfault.com","mathoverflow.net","xda-developers.com","gamespot.com","ign.com","4chan.org","9gag.com","gizmodo.com","slashdot.org","hacker-news.news","ycombinator.com","producthunt.com","discourse.org","google.com","google.co.uk","google.de","google.fr","google.co.jp","bing.com","yahoo.com","search.yahoo.com","duckduckgo.com","baidu.com","yandex.com","yandex.ru","ask.com","wolframalpha.com","ecosia.org","startpage.com","qwant.com","searx.me","gibiru.com","swisscows.com","gmail.com","googlemail.com","outlook.com","hotmail.com","live.com","msn.com","yahoo.com","ymail.com","aol.com","icloud.com","me.com","mac.com","protonmail.com","pm.me","zoho.com","mail.com","gmx.com","gmx.net","yandex.com","yandex.ru","tutanota.com","tutanota.de","fastmail.com","hushmail.com","mailbox.org
","posteo.de","runbox.com","disroot.org","163.com","qq.com","rambler.ru","mail.ru","yelp.com","yelp.ca","yelp.co.uk","yelp.com.au","yellowpages.com","yellowpages.ca","yell.com","tripadvisor.com","tripadvisor.co.uk","tripadvisor.ca","foursquare.com","angieslist.com","bbb.org","manta.com","thumbtack.com","homeadvisor.com","superpages.com","whitepages.com","local.com","citysearch.com","merchantcircle.com","insiderpages.com","kudzu.com","hotfrog.com","buildzoom.com","houzz.com","porch.com","mapquest.com","zagat.com","zomato.com","opentable.com","viator.com","expedia.com","booking.com","airbnb.com","vrbo.com","homeaway.com","craigslist.org","nextdoor.com","patch.com","meetup.com","eventbrite.com","groupon.com","livingsocial.com","gumtree.com","gumtree.com.au","kijiji.ca","leboncoin.fr","finn.no","blocket.se","58.com","dianping.com","tabelog.com","ypcdn.com"]}exports.Scraper=v;exports.CappedSet=x;
|
|
1
|
+
var{parseHTML:k}=require("linkedom");class w extends Set{maxSize;evictionCount;constructor(e,a=0.1){super();this.maxSize=e,this.evictionCount=Math.max(1,Math.ceil(e*a))}add(e){if(this.has(e))return this;if(this.size>=this.maxSize){let a=this.values();for(let n=0;n<this.evictionCount;n++){let t=a.next();if(t.done)break;this.delete(t.value)}}return super.add(e)}}class b{http;httpOptions;onEmailLeads;onEmailDiscovered;debug;static CLEAN_CONTENT_REGEX=/[^\w@.-\s]/g;static EMAIL_REGEX=/\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;static EMAIL_VALIDATION_REGEX=/^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$/;discoveredEmails=new w(1e4);userAgents=[];fileExtensions=[];restrictedDomains=C();forbiddenProtocols=["mailto:","tel:","javascript:","data:","sms:","ftp:","file:","irc:","blob:","chrome:","about:","intent:"];constructor(e,a,n,t,o=!1){this.http=e,this.httpOptions=a,this.onEmailLeads=n,this.onEmailDiscovered=t,this.debug=o,this.userAgents=$()}sleep(e){return new Promise((a)=>setTimeout(a,e))}async executeHttp(e,a,n,t,o,i=0){let{getCache:s,saveCache:r,hasUrlInCache:m,saveUrl:u,httpConfig:l={}}=t;if(!e||e.length<3||this.forbiddenProtocols.some((f)=>e.startsWith(f)))return;try{let f=o?!1:await m(e),h=await s(e);if(f&&!h)return!1;if(f&&a!=="GET")return!1;let c=h&&a==="GET"?h:await(a==="GET"?this.http.get(e,l):a==="PATCH"?this.http.patch(e,n,l):a==="POST"?this.http.post(e,n,l):this.http.put(e,n,l));if(!h)await r(e,{data:c.data,contentType:c.contentType,finalUrl:c.finalUrl});if(!f)await u(e);if(!c.contentType||!c.contentType.includes("/html")||!c.contentType.includes("text/")||typeof c.data!=="string")return null;return{data:c.data,contentType:c.contentType,finalUrl:c.finalUrl}}catch(f){let h=f,c=this.httpOptions;if(h&&h.response){let g=h.response.status,p=c.retryDelay||100,x=c.maxRetryAttempts||3,d=c.retryWithoutProxyOnStatusCode||void 0,v=c.maxRetryOnProxyError||3;if(d&&l.proxy&&d.includes(g)&&i<x)return await this.sleep(p),delete l.proxy,await 
this.executeHttp(e,a,n,t,o,i+1);else if(c.retryOnStatusCode&&l.proxy&&c.retryOnStatusCode.includes(g)&&i<x)return await this.sleep(p),await this.executeHttp(e,a,n,t,o,i+1);else if(c.retryOnProxyError&&l.proxy&&i<v)return await this.sleep(p),await this.executeHttp(e,a,n,t,o,i+1)}if(this.debug){if(this.debug)console.log(`Error: unable to ${a} ${e}: ${f.message}`)}return null}}extractEmails(e,a,n,t,o,i){let s=this.extractEmailsFromContent(e?.replaceAll("mailto:"," ")),r=[];for(let m of s)if(this.handleEmailDiscovery(m,a,n,o,i))r.push(m);if(t&&t.length>0&&r.length>0)o.add(async()=>Promise.all(t.map((m)=>m(r))));s.length=0,r.length=0}async parseExternalWebsite(e,a,n,t,o,i=!0,s,r){let m=t.httpConfig?.headers?t.httpConfig.headers instanceof Headers?Object.fromEntries(t.httpConfig.headers.entries()):t.httpConfig.headers:{};t.httpConfig=t.httpConfig||{},t.httpConfig.headers={"user-agent":this.getRandomUserAgent(),accept:"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8","accept-language":"en-US,en;q=0.9","cache-control":"no-cache",pragma:"no-cache",...m},t.httpConfig.timeout=t.httpConfig.timeout||15000,t.depth=t.depth||0,t.allowCrossDomainTravel=t.allowCrossDomainTravel||!1,o=o&&i;let u=[];try{let l=new URL(e),f=this.extractRootDomain(e);if(this.isLinktreeUrl(e))return await this.parseLinktreeProfile(e,t,o);if(this.isRestrictedDomain(e)){if(this.debug)console.warn(`⚠️ Skipped URL (restricted url): ${e}`);return u}let h=await this.executeHttp(e,a,n,t,o);if(!h){if(this.debug&&h===null)console.warn(`⚠️ Failed to fetch page content: ${e}`);if(this.debug&&h===!1)console.warn(`⚠️ Skipped URL (already visited): ${e}`);return u}let c=this.extractEmailsFromContent(h.data?.replaceAll("mailto:"," "));for(let g of c)if(this.handleEmailDiscovery(g,e,t.onEmailDiscovered,t.queue,t.emailMetadata))u.push(g);if(t.depth>0||!s){let g;{let{document:d}=k(h.data);g=this.extractRelevantLinks(d,l,f,t.depth,t.allowCrossDomainTravel)}t.depth--;let p=10,x=[];for(let 
d=0;d<g.length;d+=p){let v=g.slice(d,d+p),y=await Promise.allSettled(v.map((E)=>this.parseExternalWebsite(E,"GET",null,{...t,depth:t.depth},o,!1,!0)));x.push(...y)}for(let d of x)if(d.status==="fulfilled")u.push(...d.value);else if(this.debug)console.warn("⚠️ Failed to parse child URL:",d.reason?.message)}}catch(l){if(this.debug)console.error(`❌ Error parsing external website: ${e}`,l?.message)}if(i){if(t.onEmails&&t.onEmails.length>0)t.queue.add(async()=>Promise.all(t.onEmails.map((l)=>l(u))))}return u}async parseLinktreeProfile(e,a,n){let t=[];try{let o=await this.executeHttp(e,"GET",null,a,n);if(!o){if(this.debug)console.warn(`⚠️ Failed to fetch Linktree profile: ${e}`);return t}let i;{let{document:r}=k(o),m=r.getElementById("links-container");if(!m){if(this.debug)console.warn(`\uD83D\uDD0D No links container found in Linktree profile: ${e}`);return t}i=this.extractLinktreeExternalUrls(m,e)}if(i.length===0){if(this.debug)console.info("\uD83D\uDCED No valid external links found in Linktree profile");return t}if(this.debug)console.info(`\uD83C\uDFAF Found ${i.length} external links in Linktree profile`);let s=await Promise.allSettled(i.map((r)=>this.parseExternalWebsite(r,"GET",null,a,n,!1)));for(let r of s)if(r.status==="fulfilled")t.push(...r.value);else if(this.debug)console.warn("⚠️ Failed to parse Linktree external URL:",r.reason?.message)}catch(o){if(this.debug)console.error(`❌ Error parsing Linktree profile: ${e}`,o?.message)}return t}extractLinktreeExternalUrls(e,a){let n=new Set,t=e.querySelectorAll("a[href][target='_blank']");for(let o of t){let i=o.getAttribute("href");if(!i||i.length<3||this.forbiddenProtocols.some((s)=>i.startsWith(s)))continue;try{let s=new URL(i,a).href,r=this.extractRootDomain(s);if(r!=="linktr.ee"&&!this.isRestrictedDomain(s)&&r.length>3)n.add(s)}catch(s){if(this.debug)console.warn(`\uD83D\uDD17 Invalid URL in Linktree: ${i}`)}}return 
Array.from(n)}handleEmailDiscovery(e,a,n,t,o){if(!this.discoveredEmails.has(e)){this.discoveredEmails.add(e);let i={email:e,discoveredAt:a,timestamp:new Date,metadata:o||{}};if(n&&n.length>0)t.add(async()=>Promise.all(n.map((s)=>s(i))));if(this.debug)console.info(`\uD83D\uDCE7 New email discovered: ${e} at ${a}`);return!0}return!1}isDomainAccessAllowed(e,a,n,t){if(t)return!0;if(n===0)return e===a;return e===a||e.endsWith(`.${a}`)||a.endsWith(`.${e}`)}extractRelevantLinks(e,a,n,t,o){let i=[],s=["about","contact","help","support","reach","email","mail","message","company","team","staff","info","inquiry","feedback","service","assistance","connect","touch"],r=e.querySelectorAll("a[href]");for(let m of r){let u=m.getAttribute("href");if(!u||u.length<2)continue;try{let l=this.normalizeUrl(u,a),f=this.extractRootDomain(l);if(!this.isDomainAccessAllowed(f,n,t,o))continue;if(s.some((c)=>l.toLowerCase().includes(c))||this.isLinktreeUrl(l))i.push(l)}catch(l){if(this.debug)console.warn(`\uD83D\uDD17 Invalid link found: ${u}`,l?.message)}}return i}extractEmailsFromContent(e){let a=e.replace(b.CLEAN_CONTENT_REGEX," "),n=(s)=>{let r=s.split("@")[1]?.toLowerCase(),m=s.split(".").pop()?.toLowerCase();return b.EMAIL_VALIDATION_REGEX.test(s)&&r!==void 0&&m!==void 0&&!this.fileExtensions.includes(`.${m}`)&&!this.isRestrictedDomain(`https://${r}`)},t=(s)=>{return b.EMAIL_REGEX.lastIndex=0,(s.match(b.EMAIL_REGEX)||[]).filter(n)},o=e.replace(/<[^>]*>/g," "),i=[...t(o),...t(a)];return[...new Set(i)]}isRestrictedDomain(e){try{let a=new URL(e).host.toLowerCase();return this.restrictedDomains.some((n)=>a===n.toLowerCase()||a.endsWith(`.${n.toLowerCase()}`))}catch{return!0}}isLinktreeUrl(e){try{return this.extractRootDomain(e)==="linktr.ee"}catch{return!1}}extractRootDomain(e){try{let n=new URL(e).hostname.toLowerCase();return n.startsWith("www.")?n.slice(4):n}catch{return""}}normalizeUrl(e,a){if(e.startsWith("http://")||e.startsWith("https://"))return 
e;if(e.startsWith("//"))return`${a.protocol}${e}`;return new URL(e,a.href).href}getRandomUserAgent(){return this.userAgents[Math.floor(Math.random()*this.userAgents.length)]}}function $(){let e=[{name:"Chrome",version:"91.0.4472.124",engine:"AppleWebKit/537.36"},{name:"Firefox",version:"89.0",engine:"Gecko/20100101"},{name:"Safari",version:"14.1.1",engine:"AppleWebKit/605.1.15"},{name:"Edge",version:"91.0.864.59",engine:"AppleWebKit/537.36"},{name:"Opera",version:"77.0.4054.277",engine:"AppleWebKit/537.36"},{name:"Vivaldi",version:"3.8.2259.42",engine:"AppleWebKit/537.36"},{name:"Brave",version:"1.26.74",engine:"AppleWebKit/537.36"},{name:"Chromium",version:"91.0.4472.101",engine:"AppleWebKit/537.36"},{name:"Yandex",version:"21.5.3.742",engine:"AppleWebKit/537.36"},{name:"Maxthon",version:"5.3.8.2000",engine:"AppleWebKit/537.36"}],a=["Windows NT 10.0","Windows NT 6.1","Macintosh; Intel Mac OS X 10_15_7","Macintosh; Intel Mac OS X 11_4_0","X11; Linux x86_64","X11; Ubuntu; Linux x86_64"],n=[];for(let t=0;t<200;t++){let o=e[Math.floor(Math.random()*e.length)],i=a[Math.floor(Math.random()*a.length)],s="";switch(o.name){case"Chrome":s=`Mozilla/5.0 (${i}) ${o.engine} (KHTML, like Gecko) Chrome/${o.version} Safari/537.36`;break;case"Firefox":s=`Mozilla/5.0 (${i}; rv:${o.version}) ${o.engine} Firefox/${o.version}`;break;case"Safari":s=`Mozilla/5.0 (${i}) ${o.engine} (KHTML, like Gecko) Version/${o.version} Safari/605.1.15`;break;case"Edge":s=`Mozilla/5.0 (${i}) ${o.engine} (KHTML, like Gecko) Chrome/${o.version} Safari/537.36 Edg/${o.version}`;break;case"Opera":s=`Mozilla/5.0 (${i}) ${o.engine} (KHTML, like Gecko) Chrome/${o.version} Safari/537.36 OPR/${o.version}`;break;case"Vivaldi":s=`Mozilla/5.0 (${i}) ${o.engine} (KHTML, like Gecko) Chrome/${o.version} Safari/537.36 Vivaldi/${o.version}`;break;case"Brave":s=`Mozilla/5.0 (${i}) ${o.engine} (KHTML, like Gecko) Chrome/${o.version} Safari/537.36 Brave/${o.version}`;break;case"Chromium":s=`Mozilla/5.0 (${i}) ${o.engine} 
(KHTML, like Gecko) Chromium/${o.version} Chrome/${o.version} Safari/537.36`;break;case"Yandex":s=`Mozilla/5.0 (${i}) ${o.engine} (KHTML, like Gecko) Chrome/${o.version} YaBrowser/${o.version} Safari/537.36`;break;case"Maxthon":s=`Mozilla/5.0 (${i}) ${o.engine} (KHTML, like Gecko) Chrome/${o.version} Safari/537.36 Maxthon/${o.version}`;break}n.push(s)}return n}function C(){return["facebook.com","fb.com","messenger.com","instagram.com","threads.net","twitter.com","x.com","linkedin.com","pinterest.com","pin.it","reddit.com","tumblr.com","snapchat.com","tiktok.com","douyin.com","youtube.com","youtu.be","whatsapp.com","telegram.org","t.me","medium.com","quora.com","flickr.com","vimeo.com","vk.com","weibo.com","sina.com.cn","line.me","discord.com","discordapp.com","twitch.tv","meetup.com","nextdoor.com","xing.com","yelp.com","zalo.me","mastodon.social","clubhouse.com","patreon.com","onlyfans.com","douban.com","goodreads.com","soundcloud.com","spotify.com","last.fm","behance.net","dribbble.com","deviantart.com","pixiv.net","slideshare.net","tinder.com","bumble.com","etsy.com","indeed.com","glassdoor.com","monster.com","careerbuilder.com","dice.com","ziprecruiter.com","simplyhired.com","upwork.com","freelancer.com","fiverr.com","stackoverflow.com","stackoverflow.co","angel.co","wellfound.com","quora.com","stackexchange.com","yahoo.com","answers.microsoft.com","askubuntu.com","superuser.com","serverfault.com","mathoverflow.net","xda-developers.com","gamespot.com","ign.com","4chan.org","9gag.com","gizmodo.com","slashdot.org","hacker-news.news","ycombinator.com","producthunt.com","discourse.org","google.com","google.co.uk","google.de","google.fr","google.co.jp","bing.com","yahoo.com","search.yahoo.com","duckduckgo.com","baidu.com","yandex.com","yandex.ru","ask.com","wolframalpha.com","ecosia.org","startpage.com","qwant.com","searx.me","gibiru.com","swisscows.com","gmail.com","googlemail.com","outlook.com","hotmail.com","live.com","msn.com","yahoo.com","ymail.com","aol.com","i
cloud.com","me.com","mac.com","protonmail.com","pm.me","zoho.com","mail.com","gmx.com","gmx.net","yandex.com","yandex.ru","tutanota.com","tutanota.de","fastmail.com","hushmail.com","mailbox.org","posteo.de","runbox.com","disroot.org","163.com","qq.com","rambler.ru","mail.ru","yelp.com","yelp.ca","yelp.co.uk","yelp.com.au","yellowpages.com","yellowpages.ca","yell.com","tripadvisor.com","tripadvisor.co.uk","tripadvisor.ca","foursquare.com","angieslist.com","bbb.org","manta.com","thumbtack.com","homeadvisor.com","superpages.com","whitepages.com","local.com","citysearch.com","merchantcircle.com","insiderpages.com","kudzu.com","hotfrog.com","buildzoom.com","houzz.com","porch.com","mapquest.com","zagat.com","zomato.com","opentable.com","viator.com","expedia.com","booking.com","airbnb.com","vrbo.com","homeaway.com","craigslist.org","nextdoor.com","patch.com","meetup.com","eventbrite.com","groupon.com","livingsocial.com","gumtree.com","gumtree.com.au","kijiji.ca","leboncoin.fr","finn.no","blocket.se","58.com","dianping.com","tabelog.com","ypcdn.com"]}exports.Scraper=b;exports.CappedSet=w;
|
package/dist/crawler/scraper.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
import { parseHTML } from "linkedom";

/**
 * A Set holding at most `maxSize` entries. When full, the oldest entry (in
 * insertion order) is removed before a new one is added.
 */
class CappedSet extends Set {
  maxSize;

  /** @param {number} maxSize - maximum number of entries kept. */
  constructor(maxSize) {
    super();
    this.maxSize = maxSize;
  }

  /**
   * Adds `value`, evicting the oldest entry first when the cap is reached.
   * @param {*} value
   * @returns {this}
   */
  add(value) {
    if (this.has(value)) return this;
    if (this.size >= this.maxSize) {
      // Check `done` rather than the value's truthiness so that falsy
      // entries (0, "", null, ...) are evicted like any other.
      const oldest = this.values().next();
      if (!oldest.done) this.delete(oldest.value);
    }
    return super.add(value);
  }
}

/**
 * Fetches pages through an injected HTTP client, extracts e-mail addresses
 * from their content and follows contact-like links.
 */
class Scraper {
  http;
  httpOptions;
  onEmailLeads;
  onEmailDiscovered;
  debug;
  discoveredEmails = new CappedSet(1e4);
  userAgents = [];
  fileExtensions = [];
  restrictedDomains = getRestrictedDomains();
  forbiddenProtocols = ["mailto:", "tel:", "javascript:", "data:", "sms:", "ftp:", "file:", "irc:", "blob:", "chrome:", "about:", "intent:"];

  /**
   * @param {object} http - client exposing get/post/patch/put(url, ...).
   * @param {object} httpOptions - retry settings read by executeHttp.
   * @param {*} onEmailLeads - stored as-is; not used inside this class.
   * @param {*} onEmailDiscovered - stored as-is; not used inside this class.
   * @param {boolean} [debug=false] - enables console diagnostics.
   */
  constructor(http, httpOptions, onEmailLeads, onEmailDiscovered, debug = false) {
    this.http = http;
    this.httpOptions = httpOptions;
    this.onEmailLeads = onEmailLeads;
    this.onEmailDiscovered = onEmailDiscovered;
    this.debug = debug;
    this.userAgents = generateUserAgents();
  }

  /** Resolves after `ms` milliseconds. */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Performs one request, honouring the cache/visited callbacks in `options`.
   *
   * @param {string} url
   * @param {string} method - "GET", "PATCH" or "POST"; anything else is sent as PUT.
   * @param {*} body - payload for non-GET requests.
   * @param {object} options - provides getCache, saveCache, hasUrlInCache, saveUrl and optional httpConfig.
   * @param {boolean} skipVisitedCheck - when truthy, hasUrlInCache is not consulted.
   * @param {number} [attempt=0] - retry counter used by the recursive retries.
   * @returns {Promise<{data: string, contentType: string, finalUrl: *}|null|false|undefined>}
   *   the page; `false` when the URL was already visited; `null` on failure or
   *   non-HTML content; `undefined` when the URL is empty, too short or uses a
   *   forbidden protocol.
   */
  async executeHttp(url, method, body, options, skipVisitedCheck, attempt = 0) {
    const { getCache, saveCache, hasUrlInCache, saveUrl, httpConfig = {} } = options;
    if (!url || url.length < 3 || this.forbiddenProtocols.some((protocol) => url.startsWith(protocol))) return;

    const retry = () => this.executeHttp(url, method, body, options, skipVisitedCheck, attempt + 1);
    const send = () => {
      if (method === "GET") return this.http.get(url, httpConfig);
      if (method === "PATCH") return this.http.patch(url, body, httpConfig);
      if (method === "POST") return this.http.post(url, body, httpConfig);
      return this.http.put(url, body, httpConfig);
    };

    try {
      const alreadyVisited = skipVisitedCheck ? false : await hasUrlInCache(url);
      const cached = await getCache(url);
      if (alreadyVisited && !cached) return false;
      if (alreadyVisited && method !== "GET") return false;

      const response = cached && method === "GET" ? cached : await send();
      if (!cached) await saveCache(url, { data: response.data, contentType: response.contentType, finalUrl: response.finalUrl });
      if (!alreadyVisited) await saveUrl(url);

      // Both substrings are required, so only "text/...html" content types pass.
      const type = response.contentType;
      if (!type || !type.includes("/html") || !type.includes("text/") || typeof response.data !== "string") return null;
      return { data: response.data, contentType: response.contentType, finalUrl: response.finalUrl };
    } catch (error) {
      const settings = this.httpOptions;
      // Retries are only considered for errors carrying a `response`, and only
      // while a proxy is configured on httpConfig.
      if (error && error.response) {
        const status = error.response.status;
        const delay = settings.retryDelay || 100;
        const maxAttempts = settings.maxRetryAttempts || 3;
        const dropProxyStatuses = settings.retryWithoutProxyOnStatusCode || undefined;
        const maxProxyRetries = settings.maxRetryOnProxyError || 3;

        if (dropProxyStatuses && httpConfig.proxy && dropProxyStatuses.includes(status) && attempt < maxAttempts) {
          await this.sleep(delay);
          delete httpConfig.proxy; // retry the same request directly
          return await retry();
        }
        if (settings.retryOnStatusCode && httpConfig.proxy && settings.retryOnStatusCode.includes(status) && attempt < maxAttempts) {
          await this.sleep(delay);
          return await retry();
        }
        if (settings.retryOnProxyError && httpConfig.proxy && attempt < maxProxyRetries) {
          await this.sleep(delay);
          return await retry();
        }
      }
      if (this.debug) console.log(`Error: unable to ${method} ${url}: ${error?.message}`);
      return null;
    }
  }

  /**
   * Extracts e-mails from `content`, reports each new one through
   * handleEmailDiscovery and queues the `onEmails` callbacks with the list of
   * newly discovered addresses.
   *
   * @param {string|undefined} content
   * @param {string} sourceUrl - recorded as `discoveredAt`.
   * @param {Function[]|undefined} onEmailDiscovered - per-email callbacks.
   * @param {Function[]|undefined} onEmails - batch callbacks.
   * @param {object} queue - object exposing add(fn).
   * @param {object} [metadata]
   */
  extractEmails(content, sourceUrl, onEmailDiscovered, onEmails, queue, metadata) {
    const found = this.extractEmailsFromContent(content?.replaceAll("mailto:", " "));
    const fresh = found.filter((email) => this.handleEmailDiscovery(email, sourceUrl, onEmailDiscovered, queue, metadata));
    // `fresh` is handed to the queued callbacks and must not be emptied
    // afterwards: the queue may run them later, and async callbacks keep the
    // reference.
    if (onEmails && onEmails.length > 0 && fresh.length > 0) {
      queue.add(async () => Promise.all(onEmails.map((callback) => callback(fresh))));
    }
  }

  /**
   * Fetches `url`, collects newly discovered e-mails and follows relevant links.
   *
   * Defaults for httpConfig (headers, timeout), depth and
   * allowCrossDomainTravel are written onto `options`; `options.depth` itself
   * is not decremented, so an options object can be shared between calls.
   *
   * @param {string} url
   * @param {string} method
   * @param {*} body
   * @param {object} options - see executeHttp; also reads depth, allowCrossDomainTravel,
   *   onEmailDiscovered, onEmails, queue and emailMetadata.
   * @param {boolean} skipVisitedCheck - only honoured when `isRoot` is truthy.
   * @param {boolean} [isRoot=true] - when truthy, `options.onEmails` callbacks are queued with the result.
   * @param {boolean} [isChildPage] - when truthy, links are followed only while `options.depth > 0`.
   * @param {*} [_unused] - accepted for signature compatibility; never read.
   * @returns {Promise<string[]>} e-mails first seen during this call.
   */
  async parseExternalWebsite(url, method, body, options, skipVisitedCheck, isRoot = true, isChildPage, _unused) {
    const given = options.httpConfig?.headers;
    const callerHeaders = given ? (given instanceof Headers ? Object.fromEntries(given.entries()) : given) : {};

    options.httpConfig = options.httpConfig || {};
    // NOTE(review): headers already stored on options (including a user-agent
    // written by an earlier call) override these defaults.
    options.httpConfig.headers = {
      "user-agent": this.getRandomUserAgent(),
      accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
      "accept-language": "en-US,en;q=0.9",
      "cache-control": "no-cache",
      pragma: "no-cache",
      ...callerHeaders,
    };
    options.httpConfig.timeout = options.httpConfig.timeout || 15000;
    options.depth = options.depth || 0;
    options.allowCrossDomainTravel = options.allowCrossDomainTravel || false;
    const bypassVisited = skipVisitedCheck && isRoot;

    const emails = [];
    try {
      const pageUrl = new URL(url);
      const rootDomain = this.extractRootDomain(url);

      if (this.isLinktreeUrl(url)) return await this.parseLinktreeProfile(url, options, bypassVisited);
      if (this.isRestrictedDomain(url)) {
        if (this.debug) console.warn(`⚠️ Skipped URL (restricted url): ${url}`);
        return emails;
      }

      const page = await this.executeHttp(url, method, body, options, bypassVisited);
      if (!page) {
        if (this.debug && page === null) console.warn(`⚠️ Failed to fetch page content: ${url}`);
        if (this.debug && page === false) console.warn(`⚠️ Skipped URL (already visited): ${url}`);
        return emails;
      }

      for (const email of this.extractEmailsFromContent(page.data?.replaceAll("mailto:", " "))) {
        if (this.handleEmailDiscovery(email, url, options.onEmailDiscovered, options.queue, options.emailMetadata)) emails.push(email);
      }

      if (options.depth > 0 || !isChildPage) {
        const { document } = parseHTML(page.data);
        const links = this.extractRelevantLinks(document, pageUrl, rootDomain, options.depth, options.allowCrossDomainTravel);
        // Children get their own options copy with a reduced depth; the
        // caller's object is left untouched so concurrent siblings sharing it
        // all see the same depth.
        const childOptions = { ...options, depth: options.depth - 1 };
        const outcomes = await Promise.allSettled(
          links.map((link) => this.parseExternalWebsite(link, "GET", null, { ...childOptions }, bypassVisited, false, true)),
        );
        for (const outcome of outcomes) {
          if (outcome.status === "fulfilled") emails.push(...outcome.value);
          else if (this.debug) console.warn("⚠️ Failed to parse child URL:", outcome.reason?.message);
        }
      }
    } catch (error) {
      if (this.debug) console.error(`❌ Error parsing external website: ${url}`, error?.message);
    }

    if (isRoot && options.onEmails && options.onEmails.length > 0) {
      options.queue.add(async () => Promise.all(options.onEmails.map((callback) => callback(emails))));
    }
    return emails;
  }

  /**
   * Fetches a Linktree profile, reads the anchors inside `#links-container`
   * and scrapes every external target.
   *
   * @param {string} url
   * @param {object} options - forwarded to executeHttp / parseExternalWebsite.
   * @param {boolean} skipVisitedCheck
   * @returns {Promise<string[]>} e-mails first seen on the linked sites.
   */
  async parseLinktreeProfile(url, options, skipVisitedCheck) {
    const emails = [];
    try {
      const page = await this.executeHttp(url, "GET", null, options, skipVisitedCheck);
      if (!page) {
        if (this.debug) console.warn(`⚠️ Failed to fetch Linktree profile: ${url}`);
        return emails;
      }

      // executeHttp resolves to { data, contentType, finalUrl }; the markup is in `data`.
      const container = parseHTML(page.data).document.getElementById("links-container");
      if (!container) {
        if (this.debug) console.warn(`\uD83D\uDD0D No links container found in Linktree profile: ${url}`);
        return emails;
      }

      const targets = this.extractLinktreeExternalUrls(container, url);
      if (targets.length === 0) {
        if (this.debug) console.info("\uD83D\uDCED No valid external links found in Linktree profile");
        return emails;
      }
      if (this.debug) console.info(`\uD83C\uDFAF Found ${targets.length} external links in Linktree profile`);

      const outcomes = await Promise.allSettled(
        targets.map((target) => this.parseExternalWebsite(target, "GET", null, options, skipVisitedCheck, false)),
      );
      for (const outcome of outcomes) {
        if (outcome.status === "fulfilled") emails.push(...outcome.value);
        else if (this.debug) console.warn("⚠️ Failed to parse Linktree external URL:", outcome.reason?.message);
      }
    } catch (error) {
      if (this.debug) console.error(`❌ Error parsing Linktree profile: ${url}`, error?.message);
    }
    return emails;
  }

  /**
   * Collects unique absolute URLs of `a[href][target='_blank']` anchors that
   * are neither linktr.ee nor restricted.
   *
   * @param {object} container - element exposing querySelectorAll.
   * @param {string} baseUrl - base for resolving relative hrefs.
   * @returns {string[]}
   */
  extractLinktreeExternalUrls(container, baseUrl) {
    const urls = new Set();
    for (const anchor of container.querySelectorAll("a[href][target='_blank']")) {
      const href = anchor.getAttribute("href");
      if (!href || href.length < 3 || this.forbiddenProtocols.some((protocol) => href.startsWith(protocol))) continue;
      try {
        const absolute = new URL(href, baseUrl).href;
        const domain = this.extractRootDomain(absolute);
        if (domain !== "linktr.ee" && !this.isRestrictedDomain(absolute) && domain.length > 3) urls.add(absolute);
      } catch (error) {
        if (this.debug) console.warn(`\uD83D\uDD17 Invalid URL in Linktree: ${href}`);
      }
    }
    return Array.from(urls);
  }

  /**
   * Records `email` if unseen and queues the per-email callbacks.
   *
   * @param {string} email
   * @param {string} sourceUrl
   * @param {Function[]|undefined} callbacks
   * @param {object} queue - object exposing add(fn).
   * @param {object} [metadata]
   * @returns {boolean} true when the e-mail was not in `discoveredEmails` yet.
   */
  handleEmailDiscovery(email, sourceUrl, callbacks, queue, metadata) {
    if (this.discoveredEmails.has(email)) return false;
    this.discoveredEmails.add(email);
    const record = { email, discoveredAt: sourceUrl, timestamp: new Date(), metadata: metadata || {} };
    if (callbacks && callbacks.length > 0) queue.add(async () => Promise.all(callbacks.map((callback) => callback(record))));
    if (this.debug) console.info(`\uD83D\uDCE7 New email discovered: ${email} at ${sourceUrl}`);
    return true;
  }

  /**
   * @param {string} linkDomain
   * @param {string} pageDomain
   * @param {number} depth - at depth 0 only an exact domain match is accepted.
   * @param {boolean} allowCrossDomain - when truthy every domain is accepted.
   * @returns {boolean}
   */
  isDomainAccessAllowed(linkDomain, pageDomain, depth, allowCrossDomain) {
    if (allowCrossDomain) return true;
    if (linkDomain === pageDomain) return true;
    if (depth === 0) return false;
    return linkDomain.endsWith(`.${pageDomain}`) || pageDomain.endsWith(`.${linkDomain}`);
  }

  /**
   * Returns the unique links of `document` whose URL contains a contact-like
   * keyword (or points to Linktree) and whose domain is allowed.
   *
   * @param {object} document - exposes querySelectorAll.
   * @param {URL} pageUrl
   * @param {string} pageDomain
   * @param {number} depth
   * @param {boolean} allowCrossDomain
   * @returns {string[]}
   */
  extractRelevantLinks(document, pageUrl, pageDomain, depth, allowCrossDomain) {
    const keywords = ["about", "contact", "help", "support", "reach", "email", "mail", "message", "company", "team", "staff", "info", "inquiry", "feedback", "service", "assistance", "connect", "touch"];
    // A Set avoids fetching the same link several times when a page repeats it.
    const links = new Set();
    for (const anchor of document.querySelectorAll("a[href]")) {
      const href = anchor.getAttribute("href");
      if (!href || href.length < 2) continue;
      try {
        const absolute = this.normalizeUrl(href, pageUrl);
        if (!this.isDomainAccessAllowed(this.extractRootDomain(absolute), pageDomain, depth, allowCrossDomain)) continue;
        const lowered = absolute.toLowerCase();
        if (keywords.some((keyword) => lowered.includes(keyword)) || this.isLinktreeUrl(absolute)) links.add(absolute);
      } catch (error) {
        if (this.debug) console.warn(`\uD83D\uDD17 Invalid link found: ${href}`, error?.message);
      }
    }
    return Array.from(links);
  }

  /**
   * Finds e-mail addresses in raw page content. The text is scanned twice
   * (tags stripped, then punctuation stripped) and the results are merged.
   * Addresses on restricted domains or ending in a listed file extension are
   * dropped.
   *
   * @param {string} content
   * @returns {string[]} unique addresses; empty when `content` is not a string.
   */
  extractEmailsFromContent(content) {
    // Callers pass `x?.replaceAll(...)`, which is undefined for missing content.
    if (typeof content !== "string") return [];

    const candidatePattern = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
    const strictPattern = /^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$/;
    const isAcceptable = (candidate) => {
      const domain = candidate.split("@")[1]?.toLowerCase();
      const suffix = candidate.split(".").pop()?.toLowerCase();
      return strictPattern.test(candidate)
        && domain !== undefined
        && suffix !== undefined
        && !this.fileExtensions.includes(`.${suffix}`)
        && !this.isRestrictedDomain(`https://${domain}`);
    };
    const scan = (text) => (text.match(candidatePattern) || []).filter(isAcceptable);

    const withoutTags = content.replace(/<[^>]*>/g, " ");
    const withoutPunctuation = content.replace(/[^\w@.-\s]/g, " ");
    return [...new Set([...scan(withoutTags), ...scan(withoutPunctuation)])];
  }

  /**
   * @param {string} url
   * @returns {boolean} true when the hostname equals or is a subdomain of a
   *   restricted domain, or when `url` cannot be parsed.
   */
  isRestrictedDomain(url) {
    try {
      // `hostname` excludes the port, so "facebook.com:8080" is still matched.
      const hostname = new URL(url).hostname.toLowerCase();
      return this.restrictedDomains.some((entry) => {
        const domain = entry.toLowerCase();
        return hostname === domain || hostname.endsWith(`.${domain}`);
      });
    } catch {
      return true;
    }
  }

  /** @returns {boolean} true when the URL's root domain is linktr.ee. */
  isLinktreeUrl(url) {
    try {
      return this.extractRootDomain(url) === "linktr.ee";
    } catch {
      return false;
    }
  }

  /** @returns {string} lower-cased hostname without a leading "www.", or "" when unparsable. */
  extractRootDomain(url) {
    try {
      const hostname = new URL(url).hostname.toLowerCase();
      return hostname.startsWith("www.") ? hostname.slice(4) : hostname;
    } catch {
      return "";
    }
  }

  /**
   * Resolves `href` against `base`.
   * @param {string} href
   * @param {URL} base
   * @returns {string}
   */
  normalizeUrl(href, base) {
    if (href.startsWith("http://") || href.startsWith("https://")) return href;
    if (href.startsWith("//")) return `${base.protocol}${href}`;
    return new URL(href, base.href).href;
  }

  /** @returns {string} a random entry of `userAgents`. */
  getRandomUserAgent() {
    return this.userAgents[Math.floor(Math.random() * this.userAgents.length)];
  }
}

/**
 * Builds 200 user-agent strings from random browser / platform combinations.
 * @returns {string[]}
 */
function generateUserAgents() {
  const browsers = [
    { name: "Chrome", version: "91.0.4472.124", engine: "AppleWebKit/537.36" },
    { name: "Firefox", version: "89.0", engine: "Gecko/20100101" },
    { name: "Safari", version: "14.1.1", engine: "AppleWebKit/605.1.15" },
    { name: "Edge", version: "91.0.864.59", engine: "AppleWebKit/537.36" },
    { name: "Opera", version: "77.0.4054.277", engine: "AppleWebKit/537.36" },
    { name: "Vivaldi", version: "3.8.2259.42", engine: "AppleWebKit/537.36" },
    { name: "Brave", version: "1.26.74", engine: "AppleWebKit/537.36" },
    { name: "Chromium", version: "91.0.4472.101", engine: "AppleWebKit/537.36" },
    { name: "Yandex", version: "21.5.3.742", engine: "AppleWebKit/537.36" },
    { name: "Maxthon", version: "5.3.8.2000", engine: "AppleWebKit/537.36" },
  ];
  const platforms = ["Windows NT 10.0", "Windows NT 6.1", "Macintosh; Intel Mac OS X 10_15_7", "Macintosh; Intel Mac OS X 11_4_0", "X11; Linux x86_64", "X11; Ubuntu; Linux x86_64"];
  const formats = {
    Chrome: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36`,
    Firefox: (os, b) => `Mozilla/5.0 (${os}; rv:${b.version}) ${b.engine} Firefox/${b.version}`,
    Safari: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Version/${b.version} Safari/605.1.15`,
    Edge: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 Edg/${b.version}`,
    Opera: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 OPR/${b.version}`,
    Vivaldi: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 Vivaldi/${b.version}`,
    Brave: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 Brave/${b.version}`,
    Chromium: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chromium/${b.version} Chrome/${b.version} Safari/537.36`,
    Yandex: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} YaBrowser/${b.version} Safari/537.36`,
    Maxthon: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 Maxthon/${b.version}`,
  };

  const agents = [];
  for (let i = 0; i < 200; i++) {
    const browser = browsers[Math.floor(Math.random() * browsers.length)];
    const platform = platforms[Math.floor(Math.random() * platforms.length)];
    const format = formats[browser.name];
    agents.push(format ? format(platform, browser) : "");
  }
  return agents;
}

/**
 * Domains that are never crawled and whose e-mail addresses are discarded.
 * The literal list contains repeated entries; they are removed here.
 * @returns {string[]}
 */
function getRestrictedDomains() {
  return [...new Set([
    "facebook.com", "fb.com", "messenger.com", "instagram.com", "threads.net", "twitter.com", "x.com", "linkedin.com",
    "pinterest.com", "pin.it", "reddit.com", "tumblr.com", "snapchat.com", "tiktok.com", "douyin.com", "youtube.com",
    "youtu.be", "whatsapp.com", "telegram.org", "t.me", "medium.com", "quora.com", "flickr.com", "vimeo.com", "vk.com",
    "weibo.com", "sina.com.cn", "line.me", "discord.com", "discordapp.com", "twitch.tv", "meetup.com", "nextdoor.com",
    "xing.com", "yelp.com", "zalo.me", "mastodon.social", "clubhouse.com", "patreon.com", "onlyfans.com", "douban.com",
    "goodreads.com", "soundcloud.com", "spotify.com", "last.fm", "behance.net", "dribbble.com", "deviantart.com",
    "pixiv.net", "slideshare.net", "tinder.com", "bumble.com", "etsy.com", "indeed.com", "glassdoor.com", "monster.com",
    "careerbuilder.com", "dice.com", "ziprecruiter.com", "simplyhired.com", "upwork.com", "freelancer.com", "fiverr.com",
    "stackoverflow.com", "stackoverflow.co", "angel.co", "wellfound.com", "quora.com", "stackexchange.com", "yahoo.com",
    "answers.microsoft.com", "askubuntu.com", "superuser.com", "serverfault.com", "mathoverflow.net",
    "xda-developers.com", "gamespot.com", "ign.com", "4chan.org", "9gag.com", "gizmodo.com", "slashdot.org",
    "hacker-news.news", "ycombinator.com", "producthunt.com", "discourse.org", "google.com", "google.co.uk",
    "google.de", "google.fr", "google.co.jp", "bing.com", "yahoo.com", "search.yahoo.com", "duckduckgo.com",
    "baidu.com", "yandex.com", "yandex.ru", "ask.com", "wolframalpha.com", "ecosia.org", "startpage.com", "qwant.com",
    "searx.me", "gibiru.com", "swisscows.com", "gmail.com", "googlemail.com", "outlook.com", "hotmail.com", "live.com",
    "msn.com", "yahoo.com", "ymail.com", "aol.com", "icloud.com", "me.com", "mac.com", "protonmail.com", "pm.me",
    "zoho.com", "mail.com", "gmx.com", "gmx.net", "yandex.com", "yandex.ru", "tutanota.com", "tutanota.de",
    "fastmail.com", "hushmail.com", "mailbox.org", "posteo.de", "runbox.com", "disroot.org", "163.com", "qq.com",
    "rambler.ru", "mail.ru", "yelp.com", "yelp.ca", "yelp.co.uk", "yelp.com.au", "yellowpages.com", "yellowpages.ca",
    "yell.com", "tripadvisor.com", "tripadvisor.co.uk", "tripadvisor.ca", "foursquare.com", "angieslist.com", "bbb.org",
    "manta.com", "thumbtack.com", "homeadvisor.com", "superpages.com", "whitepages.com", "local.com", "citysearch.com",
    "merchantcircle.com", "insiderpages.com", "kudzu.com", "hotfrog.com", "buildzoom.com", "houzz.com", "porch.com",
    "mapquest.com", "zagat.com", "zomato.com", "opentable.com", "viator.com", "expedia.com", "booking.com",
    "airbnb.com", "vrbo.com", "homeaway.com", "craigslist.org", "nextdoor.com", "patch.com", "meetup.com",
    "eventbrite.com", "groupon.com", "livingsocial.com", "gumtree.com", "gumtree.com.au", "kijiji.ca", "leboncoin.fr",
    "finn.no", "blocket.se", "58.com", "dianping.com", "tabelog.com", "ypcdn.com",
  ])];
}

export { Scraper, CappedSet };
|
|
1
|
+
import { parseHTML } from "linkedom";

/**
 * A Set holding at most `maxSize` entries. When full, a batch of the oldest
 * entries (in insertion order) is removed before a new one is added.
 */
class CappedSet extends Set {
  maxSize;
  evictionCount;

  /**
   * @param {number} maxSize - maximum number of entries kept.
   * @param {number} [evictionRatio=0.1] - fraction of `maxSize` evicted at
   *   once when the cap is hit (rounded up, at least one entry).
   */
  constructor(maxSize, evictionRatio = 0.1) {
    super();
    this.maxSize = maxSize;
    this.evictionCount = Math.max(1, Math.ceil(maxSize * evictionRatio));
  }

  /**
   * Adds `value`, evicting the oldest entries first when the cap is reached.
   * @param {*} value
   * @returns {this}
   */
  add(value) {
    if (this.has(value)) return this;
    if (this.size >= this.maxSize) {
      const oldestFirst = this.values();
      for (let evicted = 0; evicted < this.evictionCount; evicted++) {
        const entry = oldestFirst.next();
        if (entry.done) break;
        this.delete(entry.value);
      }
    }
    return super.add(value);
  }
}

/**
 * Fetches pages through an injected HTTP client, extracts e-mail addresses
 * from their content and follows contact-like links.
 */
class Scraper {
  http;
  httpOptions;
  onEmailLeads;
  onEmailDiscovered;
  debug;
  static CLEAN_CONTENT_REGEX = /[^\w@.-\s]/g;
  static EMAIL_REGEX = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/g;
  static EMAIL_VALIDATION_REGEX = /^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$/;
  discoveredEmails = new CappedSet(1e4);
  userAgents = [];
  fileExtensions = [];
  restrictedDomains = getRestrictedDomains();
  forbiddenProtocols = ["mailto:", "tel:", "javascript:", "data:", "sms:", "ftp:", "file:", "irc:", "blob:", "chrome:", "about:", "intent:"];

  /**
   * @param {object} http - client exposing get/post/patch/put(url, ...).
   * @param {object} httpOptions - retry settings read by executeHttp.
   * @param {*} onEmailLeads - stored as-is; not used inside this class.
   * @param {*} onEmailDiscovered - stored as-is; not used inside this class.
   * @param {boolean} [debug=false] - enables console diagnostics.
   */
  constructor(http, httpOptions, onEmailLeads, onEmailDiscovered, debug = false) {
    this.http = http;
    this.httpOptions = httpOptions;
    this.onEmailLeads = onEmailLeads;
    this.onEmailDiscovered = onEmailDiscovered;
    this.debug = debug;
    this.userAgents = generateUserAgents();
  }

  /** Resolves after `ms` milliseconds. */
  sleep(ms) {
    return new Promise((resolve) => setTimeout(resolve, ms));
  }

  /**
   * Performs one request, honouring the cache/visited callbacks in `options`.
   *
   * @param {string} url
   * @param {string} method - "GET", "PATCH" or "POST"; anything else is sent as PUT.
   * @param {*} body - payload for non-GET requests.
   * @param {object} options - provides getCache, saveCache, hasUrlInCache, saveUrl and optional httpConfig.
   * @param {boolean} skipVisitedCheck - when truthy, hasUrlInCache is not consulted.
   * @param {number} [attempt=0] - retry counter used by the recursive retries.
   * @returns {Promise<{data: string, contentType: string, finalUrl: *}|null|false|undefined>}
   *   the page; `false` when the URL was already visited; `null` on failure or
   *   non-HTML content; `undefined` when the URL is empty, too short or uses a
   *   forbidden protocol.
   */
  async executeHttp(url, method, body, options, skipVisitedCheck, attempt = 0) {
    const { getCache, saveCache, hasUrlInCache, saveUrl, httpConfig = {} } = options;
    if (!url || url.length < 3 || this.forbiddenProtocols.some((protocol) => url.startsWith(protocol))) return;

    const retry = () => this.executeHttp(url, method, body, options, skipVisitedCheck, attempt + 1);
    const send = () => {
      if (method === "GET") return this.http.get(url, httpConfig);
      if (method === "PATCH") return this.http.patch(url, body, httpConfig);
      if (method === "POST") return this.http.post(url, body, httpConfig);
      return this.http.put(url, body, httpConfig);
    };

    try {
      const alreadyVisited = skipVisitedCheck ? false : await hasUrlInCache(url);
      const cached = await getCache(url);
      if (alreadyVisited && !cached) return false;
      if (alreadyVisited && method !== "GET") return false;

      const response = cached && method === "GET" ? cached : await send();
      if (!cached) await saveCache(url, { data: response.data, contentType: response.contentType, finalUrl: response.finalUrl });
      if (!alreadyVisited) await saveUrl(url);

      // Both substrings are required, so only "text/...html" content types pass.
      const type = response.contentType;
      if (!type || !type.includes("/html") || !type.includes("text/") || typeof response.data !== "string") return null;
      return { data: response.data, contentType: response.contentType, finalUrl: response.finalUrl };
    } catch (error) {
      const settings = this.httpOptions;
      // Retries are only considered for errors carrying a `response`, and only
      // while a proxy is configured on httpConfig.
      if (error && error.response) {
        const status = error.response.status;
        const delay = settings.retryDelay || 100;
        const maxAttempts = settings.maxRetryAttempts || 3;
        const dropProxyStatuses = settings.retryWithoutProxyOnStatusCode || undefined;
        const maxProxyRetries = settings.maxRetryOnProxyError || 3;

        if (dropProxyStatuses && httpConfig.proxy && dropProxyStatuses.includes(status) && attempt < maxAttempts) {
          await this.sleep(delay);
          delete httpConfig.proxy; // retry the same request directly
          return await retry();
        }
        if (settings.retryOnStatusCode && httpConfig.proxy && settings.retryOnStatusCode.includes(status) && attempt < maxAttempts) {
          await this.sleep(delay);
          return await retry();
        }
        if (settings.retryOnProxyError && httpConfig.proxy && attempt < maxProxyRetries) {
          await this.sleep(delay);
          return await retry();
        }
      }
      if (this.debug) console.log(`Error: unable to ${method} ${url}: ${error?.message}`);
      return null;
    }
  }

  /**
   * Extracts e-mails from `content`, reports each new one through
   * handleEmailDiscovery and queues the `onEmails` callbacks with the list of
   * newly discovered addresses.
   *
   * @param {string|undefined} content
   * @param {string} sourceUrl - recorded as `discoveredAt`.
   * @param {Function[]|undefined} onEmailDiscovered - per-email callbacks.
   * @param {Function[]|undefined} onEmails - batch callbacks.
   * @param {object} queue - object exposing add(fn).
   * @param {object} [metadata]
   */
  extractEmails(content, sourceUrl, onEmailDiscovered, onEmails, queue, metadata) {
    const found = this.extractEmailsFromContent(content?.replaceAll("mailto:", " "));
    const fresh = found.filter((email) => this.handleEmailDiscovery(email, sourceUrl, onEmailDiscovered, queue, metadata));
    // `fresh` is handed to the queued callbacks and must not be emptied
    // afterwards: the queue may run them later, and async callbacks keep the
    // reference.
    if (onEmails && onEmails.length > 0 && fresh.length > 0) {
      queue.add(async () => Promise.all(onEmails.map((callback) => callback(fresh))));
    }
  }

  /**
   * Fetches `url`, collects newly discovered e-mails and follows relevant
   * links in batches of 10 concurrent requests.
   *
   * Defaults for httpConfig (headers, timeout), depth and
   * allowCrossDomainTravel are written onto `options`; `options.depth` itself
   * is not decremented, so an options object can be shared between calls.
   *
   * @param {string} url
   * @param {string} method
   * @param {*} body
   * @param {object} options - see executeHttp; also reads depth, allowCrossDomainTravel,
   *   onEmailDiscovered, onEmails, queue and emailMetadata.
   * @param {boolean} skipVisitedCheck - only honoured when `isRoot` is truthy.
   * @param {boolean} [isRoot=true] - when truthy, `options.onEmails` callbacks are queued with the result.
   * @param {boolean} [isChildPage] - when truthy, links are followed only while `options.depth > 0`.
   * @param {*} [_unused] - accepted for signature compatibility; never read.
   * @returns {Promise<string[]>} e-mails first seen during this call.
   */
  async parseExternalWebsite(url, method, body, options, skipVisitedCheck, isRoot = true, isChildPage, _unused) {
    const given = options.httpConfig?.headers;
    const callerHeaders = given ? (given instanceof Headers ? Object.fromEntries(given.entries()) : given) : {};

    options.httpConfig = options.httpConfig || {};
    // NOTE(review): headers already stored on options (including a user-agent
    // written by an earlier call) override these defaults.
    options.httpConfig.headers = {
      "user-agent": this.getRandomUserAgent(),
      accept: "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
      "accept-language": "en-US,en;q=0.9",
      "cache-control": "no-cache",
      pragma: "no-cache",
      ...callerHeaders,
    };
    options.httpConfig.timeout = options.httpConfig.timeout || 15000;
    options.depth = options.depth || 0;
    options.allowCrossDomainTravel = options.allowCrossDomainTravel || false;
    const bypassVisited = skipVisitedCheck && isRoot;

    const emails = [];
    try {
      const pageUrl = new URL(url);
      const rootDomain = this.extractRootDomain(url);

      if (this.isLinktreeUrl(url)) return await this.parseLinktreeProfile(url, options, bypassVisited);
      if (this.isRestrictedDomain(url)) {
        if (this.debug) console.warn(`⚠️ Skipped URL (restricted url): ${url}`);
        return emails;
      }

      const page = await this.executeHttp(url, method, body, options, bypassVisited);
      if (!page) {
        if (this.debug && page === null) console.warn(`⚠️ Failed to fetch page content: ${url}`);
        if (this.debug && page === false) console.warn(`⚠️ Skipped URL (already visited): ${url}`);
        return emails;
      }

      for (const email of this.extractEmailsFromContent(page.data?.replaceAll("mailto:", " "))) {
        if (this.handleEmailDiscovery(email, url, options.onEmailDiscovered, options.queue, options.emailMetadata)) emails.push(email);
      }

      if (options.depth > 0 || !isChildPage) {
        let links;
        {
          // Block scope so the parsed document is unreachable while child pages are crawled.
          const { document } = parseHTML(page.data);
          links = this.extractRelevantLinks(document, pageUrl, rootDomain, options.depth, options.allowCrossDomainTravel);
        }

        // Children get their own options copy with a reduced depth; the
        // caller's object is left untouched so concurrent siblings sharing it
        // all see the same depth.
        const childDepth = options.depth - 1;
        const batchSize = 10;
        const outcomes = [];
        for (let start = 0; start < links.length; start += batchSize) {
          const batch = links.slice(start, start + batchSize);
          const settled = await Promise.allSettled(
            batch.map((link) => this.parseExternalWebsite(link, "GET", null, { ...options, depth: childDepth }, bypassVisited, false, true)),
          );
          outcomes.push(...settled);
        }
        for (const outcome of outcomes) {
          if (outcome.status === "fulfilled") emails.push(...outcome.value);
          else if (this.debug) console.warn("⚠️ Failed to parse child URL:", outcome.reason?.message);
        }
      }
    } catch (error) {
      if (this.debug) console.error(`❌ Error parsing external website: ${url}`, error?.message);
    }

    if (isRoot && options.onEmails && options.onEmails.length > 0) {
      options.queue.add(async () => Promise.all(options.onEmails.map((callback) => callback(emails))));
    }
    return emails;
  }

  /**
   * Fetches a Linktree profile, reads the anchors inside `#links-container`
   * and scrapes every external target.
   *
   * @param {string} url
   * @param {object} options - forwarded to executeHttp / parseExternalWebsite.
   * @param {boolean} skipVisitedCheck
   * @returns {Promise<string[]>} e-mails first seen on the linked sites.
   */
  async parseLinktreeProfile(url, options, skipVisitedCheck) {
    const emails = [];
    try {
      const page = await this.executeHttp(url, "GET", null, options, skipVisitedCheck);
      if (!page) {
        if (this.debug) console.warn(`⚠️ Failed to fetch Linktree profile: ${url}`);
        return emails;
      }

      let targets;
      {
        // executeHttp resolves to { data, contentType, finalUrl }; the markup is in `data`.
        const { document } = parseHTML(page.data);
        const container = document.getElementById("links-container");
        if (!container) {
          if (this.debug) console.warn(`\uD83D\uDD0D No links container found in Linktree profile: ${url}`);
          return emails;
        }
        targets = this.extractLinktreeExternalUrls(container, url);
      }

      if (targets.length === 0) {
        if (this.debug) console.info("\uD83D\uDCED No valid external links found in Linktree profile");
        return emails;
      }
      if (this.debug) console.info(`\uD83C\uDFAF Found ${targets.length} external links in Linktree profile`);

      const outcomes = await Promise.allSettled(
        targets.map((target) => this.parseExternalWebsite(target, "GET", null, options, skipVisitedCheck, false)),
      );
      for (const outcome of outcomes) {
        if (outcome.status === "fulfilled") emails.push(...outcome.value);
        else if (this.debug) console.warn("⚠️ Failed to parse Linktree external URL:", outcome.reason?.message);
      }
    } catch (error) {
      if (this.debug) console.error(`❌ Error parsing Linktree profile: ${url}`, error?.message);
    }
    return emails;
  }

  /**
   * Collects unique absolute URLs of `a[href][target='_blank']` anchors that
   * are neither linktr.ee nor restricted.
   *
   * @param {object} container - element exposing querySelectorAll.
   * @param {string} baseUrl - base for resolving relative hrefs.
   * @returns {string[]}
   */
  extractLinktreeExternalUrls(container, baseUrl) {
    const urls = new Set();
    for (const anchor of container.querySelectorAll("a[href][target='_blank']")) {
      const href = anchor.getAttribute("href");
      if (!href || href.length < 3 || this.forbiddenProtocols.some((protocol) => href.startsWith(protocol))) continue;
      try {
        const absolute = new URL(href, baseUrl).href;
        const domain = this.extractRootDomain(absolute);
        if (domain !== "linktr.ee" && !this.isRestrictedDomain(absolute) && domain.length > 3) urls.add(absolute);
      } catch (error) {
        if (this.debug) console.warn(`\uD83D\uDD17 Invalid URL in Linktree: ${href}`);
      }
    }
    return Array.from(urls);
  }

  /**
   * Records `email` if unseen and queues the per-email callbacks.
   *
   * @param {string} email
   * @param {string} sourceUrl
   * @param {Function[]|undefined} callbacks
   * @param {object} queue - object exposing add(fn).
   * @param {object} [metadata]
   * @returns {boolean} true when the e-mail was not in `discoveredEmails` yet.
   */
  handleEmailDiscovery(email, sourceUrl, callbacks, queue, metadata) {
    if (this.discoveredEmails.has(email)) return false;
    this.discoveredEmails.add(email);
    const record = { email, discoveredAt: sourceUrl, timestamp: new Date(), metadata: metadata || {} };
    if (callbacks && callbacks.length > 0) queue.add(async () => Promise.all(callbacks.map((callback) => callback(record))));
    if (this.debug) console.info(`\uD83D\uDCE7 New email discovered: ${email} at ${sourceUrl}`);
    return true;
  }

  /**
   * @param {string} linkDomain
   * @param {string} pageDomain
   * @param {number} depth - at depth 0 only an exact domain match is accepted.
   * @param {boolean} allowCrossDomain - when truthy every domain is accepted.
   * @returns {boolean}
   */
  isDomainAccessAllowed(linkDomain, pageDomain, depth, allowCrossDomain) {
    if (allowCrossDomain) return true;
    if (linkDomain === pageDomain) return true;
    if (depth === 0) return false;
    return linkDomain.endsWith(`.${pageDomain}`) || pageDomain.endsWith(`.${linkDomain}`);
  }

  /**
   * Returns the unique links of `document` whose URL contains a contact-like
   * keyword (or points to Linktree) and whose domain is allowed.
   *
   * @param {object} document - exposes querySelectorAll.
   * @param {URL} pageUrl
   * @param {string} pageDomain
   * @param {number} depth
   * @param {boolean} allowCrossDomain
   * @returns {string[]}
   */
  extractRelevantLinks(document, pageUrl, pageDomain, depth, allowCrossDomain) {
    const keywords = ["about", "contact", "help", "support", "reach", "email", "mail", "message", "company", "team", "staff", "info", "inquiry", "feedback", "service", "assistance", "connect", "touch"];
    // A Set avoids fetching the same link several times when a page repeats it.
    const links = new Set();
    for (const anchor of document.querySelectorAll("a[href]")) {
      const href = anchor.getAttribute("href");
      if (!href || href.length < 2) continue;
      try {
        const absolute = this.normalizeUrl(href, pageUrl);
        if (!this.isDomainAccessAllowed(this.extractRootDomain(absolute), pageDomain, depth, allowCrossDomain)) continue;
        const lowered = absolute.toLowerCase();
        if (keywords.some((keyword) => lowered.includes(keyword)) || this.isLinktreeUrl(absolute)) links.add(absolute);
      } catch (error) {
        if (this.debug) console.warn(`\uD83D\uDD17 Invalid link found: ${href}`, error?.message);
      }
    }
    return Array.from(links);
  }

  /**
   * Finds e-mail addresses in raw page content. The text is scanned twice
   * (tags stripped, then punctuation stripped) and the results are merged.
   * Addresses on restricted domains or ending in a listed file extension are
   * dropped.
   *
   * @param {string} content
   * @returns {string[]} unique addresses; empty when `content` is not a string.
   */
  extractEmailsFromContent(content) {
    // Callers pass `x?.replaceAll(...)`, which is undefined for missing content.
    if (typeof content !== "string") return [];

    const isAcceptable = (candidate) => {
      const domain = candidate.split("@")[1]?.toLowerCase();
      const suffix = candidate.split(".").pop()?.toLowerCase();
      return Scraper.EMAIL_VALIDATION_REGEX.test(candidate)
        && domain !== undefined
        && suffix !== undefined
        && !this.fileExtensions.includes(`.${suffix}`)
        && !this.isRestrictedDomain(`https://${domain}`);
    };
    const scan = (text) => {
      // The shared pattern carries the /g flag; reset its cursor before reuse.
      Scraper.EMAIL_REGEX.lastIndex = 0;
      return (text.match(Scraper.EMAIL_REGEX) || []).filter(isAcceptable);
    };

    const withoutTags = content.replace(/<[^>]*>/g, " ");
    const withoutPunctuation = content.replace(Scraper.CLEAN_CONTENT_REGEX, " ");
    return [...new Set([...scan(withoutTags), ...scan(withoutPunctuation)])];
  }

  /**
   * @param {string} url
   * @returns {boolean} true when the hostname equals or is a subdomain of a
   *   restricted domain, or when `url` cannot be parsed.
   */
  isRestrictedDomain(url) {
    try {
      // `hostname` excludes the port, so "facebook.com:8080" is still matched.
      const hostname = new URL(url).hostname.toLowerCase();
      return this.restrictedDomains.some((entry) => {
        const domain = entry.toLowerCase();
        return hostname === domain || hostname.endsWith(`.${domain}`);
      });
    } catch {
      return true;
    }
  }

  /** @returns {boolean} true when the URL's root domain is linktr.ee. */
  isLinktreeUrl(url) {
    try {
      return this.extractRootDomain(url) === "linktr.ee";
    } catch {
      return false;
    }
  }

  /** @returns {string} lower-cased hostname without a leading "www.", or "" when unparsable. */
  extractRootDomain(url) {
    try {
      const hostname = new URL(url).hostname.toLowerCase();
      return hostname.startsWith("www.") ? hostname.slice(4) : hostname;
    } catch {
      return "";
    }
  }

  /**
   * Resolves `href` against `base`.
   * @param {string} href
   * @param {URL} base
   * @returns {string}
   */
  normalizeUrl(href, base) {
    if (href.startsWith("http://") || href.startsWith("https://")) return href;
    if (href.startsWith("//")) return `${base.protocol}${href}`;
    return new URL(href, base.href).href;
  }

  /** @returns {string} a random entry of `userAgents`. */
  getRandomUserAgent() {
    return this.userAgents[Math.floor(Math.random() * this.userAgents.length)];
  }
}

/**
 * Builds 200 user-agent strings from random browser / platform combinations.
 * @returns {string[]}
 */
function generateUserAgents() {
  const browsers = [
    { name: "Chrome", version: "91.0.4472.124", engine: "AppleWebKit/537.36" },
    { name: "Firefox", version: "89.0", engine: "Gecko/20100101" },
    { name: "Safari", version: "14.1.1", engine: "AppleWebKit/605.1.15" },
    { name: "Edge", version: "91.0.864.59", engine: "AppleWebKit/537.36" },
    { name: "Opera", version: "77.0.4054.277", engine: "AppleWebKit/537.36" },
    { name: "Vivaldi", version: "3.8.2259.42", engine: "AppleWebKit/537.36" },
    { name: "Brave", version: "1.26.74", engine: "AppleWebKit/537.36" },
    { name: "Chromium", version: "91.0.4472.101", engine: "AppleWebKit/537.36" },
    { name: "Yandex", version: "21.5.3.742", engine: "AppleWebKit/537.36" },
    { name: "Maxthon", version: "5.3.8.2000", engine: "AppleWebKit/537.36" },
  ];
  const platforms = ["Windows NT 10.0", "Windows NT 6.1", "Macintosh; Intel Mac OS X 10_15_7", "Macintosh; Intel Mac OS X 11_4_0", "X11; Linux x86_64", "X11; Ubuntu; Linux x86_64"];
  const formats = {
    Chrome: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36`,
    Firefox: (os, b) => `Mozilla/5.0 (${os}; rv:${b.version}) ${b.engine} Firefox/${b.version}`,
    Safari: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Version/${b.version} Safari/605.1.15`,
    Edge: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 Edg/${b.version}`,
    Opera: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 OPR/${b.version}`,
    Vivaldi: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 Vivaldi/${b.version}`,
    Brave: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 Brave/${b.version}`,
    Chromium: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chromium/${b.version} Chrome/${b.version} Safari/537.36`,
    Yandex: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} YaBrowser/${b.version} Safari/537.36`,
    Maxthon: (os, b) => `Mozilla/5.0 (${os}) ${b.engine} (KHTML, like Gecko) Chrome/${b.version} Safari/537.36 Maxthon/${b.version}`,
  };

  const agents = [];
  for (let i = 0; i < 200; i++) {
    const browser = browsers[Math.floor(Math.random() * browsers.length)];
    const platform = platforms[Math.floor(Math.random() * platforms.length)];
    const format = formats[browser.name];
    agents.push(format ? format(platform, browser) : "");
  }
  return agents;
}

/**
 * Domains that are never crawled and whose e-mail addresses are discarded.
 * The literal list contains repeated entries; they are removed here.
 * @returns {string[]}
 */
function getRestrictedDomains() {
  return [...new Set([
    "facebook.com", "fb.com", "messenger.com", "instagram.com", "threads.net", "twitter.com", "x.com", "linkedin.com",
    "pinterest.com", "pin.it", "reddit.com", "tumblr.com", "snapchat.com", "tiktok.com", "douyin.com", "youtube.com",
    "youtu.be", "whatsapp.com", "telegram.org", "t.me", "medium.com", "quora.com", "flickr.com", "vimeo.com", "vk.com",
    "weibo.com", "sina.com.cn", "line.me", "discord.com", "discordapp.com", "twitch.tv", "meetup.com", "nextdoor.com",
    "xing.com", "yelp.com", "zalo.me", "mastodon.social", "clubhouse.com", "patreon.com", "onlyfans.com", "douban.com",
    "goodreads.com", "soundcloud.com", "spotify.com", "last.fm", "behance.net", "dribbble.com", "deviantart.com",
    "pixiv.net", "slideshare.net", "tinder.com", "bumble.com", "etsy.com", "indeed.com", "glassdoor.com", "monster.com",
    "careerbuilder.com", "dice.com", "ziprecruiter.com", "simplyhired.com", "upwork.com", "freelancer.com", "fiverr.com",
    "stackoverflow.com", "stackoverflow.co", "angel.co", "wellfound.com", "quora.com", "stackexchange.com", "yahoo.com",
    "answers.microsoft.com", "askubuntu.com", "superuser.com", "serverfault.com", "mathoverflow.net",
    "xda-developers.com", "gamespot.com", "ign.com", "4chan.org", "9gag.com", "gizmodo.com", "slashdot.org",
    "hacker-news.news", "ycombinator.com", "producthunt.com", "discourse.org", "google.com", "google.co.uk",
    "google.de", "google.fr", "google.co.jp", "bing.com", "yahoo.com", "search.yahoo.com", "duckduckgo.com",
    "baidu.com", "yandex.com", "yandex.ru", "ask.com", "wolframalpha.com", "ecosia.org", "startpage.com", "qwant.com",
    "searx.me", "gibiru.com", "swisscows.com", "gmail.com", "googlemail.com", "outlook.com", "hotmail.com", "live.com",
    "msn.com", "yahoo.com", "ymail.com", "aol.com", "icloud.com", "me.com", "mac.com", "protonmail.com", "pm.me",
    "zoho.com", "mail.com", "gmx.com", "gmx.net", "yandex.com", "yandex.ru", "tutanota.com", "tutanota.de",
    "fastmail.com", "hushmail.com", "mailbox.org", "posteo.de", "runbox.com", "disroot.org", "163.com", "qq.com",
    "rambler.ru", "mail.ru", "yelp.com", "yelp.ca", "yelp.co.uk", "yelp.com.au", "yellowpages.com", "yellowpages.ca",
    "yell.com", "tripadvisor.com", "tripadvisor.co.uk", "tripadvisor.ca", "foursquare.com", "angieslist.com", "bbb.org",
    "manta.com", "thumbtack.com", "homeadvisor.com", "superpages.com", "whitepages.com", "local.com", "citysearch.com",
    "merchantcircle.com", "insiderpages.com", "kudzu.com", "hotfrog.com", "buildzoom.com", "houzz.com", "porch.com",
    "mapquest.com", "zagat.com", "zomato.com", "opentable.com", "viator.com", "expedia.com", "booking.com",
    "airbnb.com", "vrbo.com", "homeaway.com", "craigslist.org", "nextdoor.com", "patch.com", "meetup.com",
    "eventbrite.com", "groupon.com", "livingsocial.com", "gumtree.com", "gumtree.com.au", "kijiji.ca", "leboncoin.fr",
    "finn.no", "blocket.se", "58.com", "dianping.com", "tabelog.com", "ypcdn.com",
  ])];
}

export { Scraper, CappedSet };
|