spectrawl 0.4.2 → 0.4.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/browse/index.js +15 -6
- package/src/crawl.js +6 -3
package/package.json
CHANGED
package/src/browse/index.js
CHANGED
|
@@ -127,12 +127,21 @@ class BrowseEngine {
|
|
|
127
127
|
|
|
128
128
|
await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 30000 })
|
|
129
129
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
130
|
+
if (opts.fastMode) {
|
|
131
|
+
// Crawl mode: minimal delays, just enough for lazy-load triggers
|
|
132
|
+
await page.waitForTimeout(400)
|
|
133
|
+
await page.evaluate(() => {
|
|
134
|
+
window.scrollBy({ top: 500, behavior: 'instant' })
|
|
135
|
+
})
|
|
136
|
+
await page.waitForTimeout(200)
|
|
137
|
+
} else {
|
|
138
|
+
// Normal browse: full human-like delays
|
|
139
|
+
await page.waitForTimeout(800 + Math.random() * 1500)
|
|
140
|
+
await page.evaluate(() => {
|
|
141
|
+
window.scrollBy({ top: Math.floor(Math.random() * 400) + 100, behavior: 'smooth' })
|
|
142
|
+
})
|
|
143
|
+
await page.waitForTimeout(300 + Math.random() * 700)
|
|
144
|
+
}
|
|
136
145
|
|
|
137
146
|
const result = {}
|
|
138
147
|
|
package/src/crawl.js
CHANGED
|
@@ -260,8 +260,10 @@ class CrawlEngine {
|
|
|
260
260
|
const totalMB = Math.floor(os.totalmem() / 1024 / 1024)
|
|
261
261
|
const freeMB = Math.floor(os.freemem() / 1024 / 1024)
|
|
262
262
|
const concurrency = detectConcurrency()
|
|
263
|
-
//
|
|
264
|
-
|
|
263
|
+
// Realistic: ~0.8s per page with fast mode before overhead; throughput is limited by the shared browser pipeline
|
|
264
|
+
// Concurrency helps but not linearly — shared browser bottleneck
|
|
265
|
+
const effectiveConcurrency = Math.min(concurrency, 5) // diminishing returns past 5
|
|
266
|
+
const pagesPerMinute = Math.floor(effectiveConcurrency * 30) // ~2s effective per page with overhead
|
|
265
267
|
return {
|
|
266
268
|
totalRamMB: totalMB,
|
|
267
269
|
freeRamMB: freeMB,
|
|
@@ -279,7 +281,8 @@ class CrawlEngine {
|
|
|
279
281
|
_cookies: cookies,
|
|
280
282
|
timeout: config.timeout,
|
|
281
283
|
html: true,
|
|
282
|
-
noCache: true
|
|
284
|
+
noCache: true,
|
|
285
|
+
fastMode: true // crawl mode: reduced delays for speed
|
|
283
286
|
})
|
|
284
287
|
if (result?.content) {
|
|
285
288
|
const linkSource = result.html || result.content
|