cockroach-crawler 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +159 -0
- package/assets/logo.png +0 -0
- package/bin/cockroach-crawl.js +157 -0
- package/package.json +62 -0
- package/src/index.js +386 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Ajnas
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/AjnasNB/cockroach-crawler/main/assets/logo.png" alt="Cockroach Crawler" width="620">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
# Cockroach Crawler
|
|
6
|
+
|
|
7
|
+
Cockroach Crawler is an open-source public-web crawler for agent workflows. It turns crawlable public pages into clean JSON or JSONL with titles, metadata, readable text, markdown, links, status codes, content types, and timestamps.
|
|
8
|
+
|
|
9
|
+
It is designed for documentation indexing, RAG ingestion, QA crawling, content inventory, public research, and agent toolchains that need a local crawler with a simple CLI and JavaScript API.
|
|
10
|
+
|
|
11
|
+
## What it does
|
|
12
|
+
|
|
13
|
+
- Crawls public HTTP/HTTPS pages from one or more seed URLs.
|
|
14
|
+
- Respects `robots.txt` by default.
|
|
15
|
+
- Discovers sitemaps from `robots.txt` and `/sitemap.xml`.
|
|
16
|
+
- Extracts readable text and markdown.
|
|
17
|
+
- Outputs JSON or JSONL for agent pipelines.
|
|
18
|
+
- Supports concurrency, per-origin delay, max pages, max depth, include/exclude filters, URL files, and same-origin controls.
|
|
19
|
+
- Uses a clear user agent and supports contact details.
|
|
20
|
+
- Avoids likely private account/admin/cart/login URLs by default.
|
|
21
|
+
- Depends on no hosted services: no account, API key, browser session, or LLM call is required.
|
|
22
|
+
|
|
23
|
+
## What it does not do
|
|
24
|
+
|
|
25
|
+
Cockroach Crawler is not a stealth scraper. It does not include bypass tooling for login walls, paywalls, CAPTCHA, anti-bot systems, authorization boundaries, or robots.txt. If a site owner requires permission, get permission and crawl within their terms.
|
|
26
|
+
|
|
27
|
+
## When to use it
|
|
28
|
+
|
|
29
|
+
Use Cockroach Crawler when an agent or data pipeline needs a repeatable public-web fetcher that returns compact, structured page records. It is intentionally smaller than full crawler frameworks and intentionally simpler than browser agents.
|
|
30
|
+
|
|
31
|
+
| Use case | Cockroach Crawler fit |
|
|
32
|
+
| --- | --- |
|
|
33
|
+
| Crawl public docs, blogs, help centers, or marketing pages | Strong |
|
|
34
|
+
| Export crawl results as JSONL or markdown for RAG | Strong |
|
|
35
|
+
| Respect robots.txt and crawl-delay style politeness controls | Strong |
|
|
36
|
+
| Crawl JavaScript-only apps that require clicks, login, or a live browser | Use Browser Use, Playwright, Puppeteer, Crawlee, or another browser stack |
|
|
37
|
+
| Bypass paywalls, CAPTCHA, anti-bot, auth, or owner restrictions | Not supported |
|
|
38
|
+
|
|
39
|
+
## Comparison
|
|
40
|
+
|
|
41
|
+
Cockroach Crawler is not a universal replacement for Browser Use, Crawlee, Scrapy, Firecrawl, Crawl4AI, or hosted browser tools. It is better for one narrow job: low-friction public HTML crawling into agent-friendly JSON/JSONL/Markdown from Node.js.
|
|
42
|
+
|
|
43
|
+
| Tool | Best at | Tradeoff compared with Cockroach Crawler |
|
|
44
|
+
| --- | --- | --- |
|
|
45
|
+
| Browser Use | LLM-driven browser control and interactive workflows | Heavier stack; better for clicking/forms, not a lightweight batch crawler |
|
|
46
|
+
| Crawlee | Production-grade crawling with browser and queue primitives | More powerful, larger API surface, more setup |
|
|
47
|
+
| Scrapy | Mature Python crawling framework | Excellent framework, but Python-first and less direct for Node agent pipelines |
|
|
48
|
+
| Firecrawl | Hosted/API-first page extraction and crawling | Great API product; self-hosting/licensing/deployment is heavier |
|
|
49
|
+
| Crawl4AI | LLM-oriented Python crawling and markdown extraction | Strong AI extraction focus; Python-first |
|
|
50
|
+
| Cockroach Crawler | Local public-web crawl to JSONL/Markdown for agents | Smaller scope; no JS browser rendering, no stealth, no hosted extraction |
|
|
51
|
+
|
|
52
|
+
The goal is not to beat every crawler at every job. The goal is to be the simplest useful crawler an agent can call locally when it needs public pages converted into clean records.
|
|
53
|
+
|
|
54
|
+
## Install
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
npm install -g cockroach-crawler
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
Or run without global install:
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
npx cockroach-crawler https://example.com --max-pages 20 --jsonl
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## CLI examples
|
|
67
|
+
|
|
68
|
+
Crawl one public site:
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
cockroach-crawl https://example.com --max-pages 50 --jsonl --output crawl.jsonl
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
Use sitemaps and include only docs URLs:
|
|
75
|
+
|
|
76
|
+
```bash
|
|
77
|
+
cockroach-crawl https://example.com --sitemaps --include "/docs/" --max-pages 200 --output docs.json
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
Read many seed URLs from a file:
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
cockroach-crawl --url-file urls.txt --max-pages 100 --jsonl
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
Add a contact-aware user agent:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
cockroach-crawl https://example.com --contact "mailto:you@example.com"
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## CLI options
|
|
93
|
+
|
|
94
|
+
- `--url-file <file>`: read seed URLs from a text file, one URL per line.
|
|
95
|
+
- `--max-pages <n>`: maximum pages to return. Default: `50`.
|
|
96
|
+
- `--max-depth <n>`: maximum link depth from seeds. Default: `2`.
|
|
97
|
+
- `--concurrency <n>`: concurrent workers. Default: `4`.
|
|
98
|
+
- `--delay <ms>`: minimum delay per origin. Default: `250`.
|
|
99
|
+
- `--timeout <ms>`: request timeout. Default: `15000`.
|
|
100
|
+
- `--sitemaps`: discover URLs from robots.txt sitemaps and `/sitemap.xml`.
|
|
101
|
+
- `--all-origins`: allow crawling across discovered origins.
|
|
102
|
+
- `--include <regex>`: only crawl URLs matching regex. Can be repeated.
|
|
103
|
+
- `--exclude <regex>`: skip URLs matching regex. Can be repeated.
|
|
104
|
+
- `--allow-non-public`: allow likely login/account/admin/cart URLs.
|
|
105
|
+
- `--jsonl`: output JSON Lines (one page record per line) instead of a single JSON document of the form `{ "pages": [...], "stats": {...} }`.
|
|
106
|
+
- `--output <file>`: write output to a file.
|
|
107
|
+
- `--user-agent <ua>`: custom user agent.
|
|
108
|
+
- `--contact <email/url>`: add contact detail to the default user agent.
|
|
109
|
+
|
|
110
|
+
## Library API
|
|
111
|
+
|
|
112
|
+
```js
|
|
113
|
+
import { crawl } from "cockroach-crawler";
|
|
114
|
+
|
|
115
|
+
const pages = await crawl({
|
|
116
|
+
seeds: ["https://example.com"],
|
|
117
|
+
maxPages: 25,
|
|
118
|
+
maxDepth: 2,
|
|
119
|
+
concurrency: 4,
|
|
120
|
+
includeSitemaps: true,
|
|
121
|
+
include: ["/docs/"],
|
|
122
|
+
exclude: ["/login", "/account"],
|
|
123
|
+
onPage(page) {
|
|
124
|
+
console.log(page.url, page.title);
|
|
125
|
+
}
|
|
126
|
+
});
|
|
127
|
+
|
|
128
|
+
console.log(pages[0].markdown);
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
## Output shape
|
|
132
|
+
|
|
133
|
+
```json
|
|
134
|
+
{
|
|
135
|
+
"url": "https://example.com/",
|
|
136
|
+
"canonical": "https://example.com/",
|
|
137
|
+
"title": "Example",
|
|
138
|
+
"description": "Example description",
|
|
139
|
+
"h1": "Example",
|
|
140
|
+
"text": "Readable text...",
|
|
141
|
+
"markdown": "# Example\n\nReadable markdown...",
|
|
142
|
+
"links": ["https://example.com/about"],
|
|
143
|
+
"fetchedAt": "2026-06-27T00:00:00.000Z",
|
|
144
|
+
"status": 200,
|
|
145
|
+
"contentType": "text/html; charset=utf-8"
|
|
146
|
+
}
|
|
147
|
+
```
|
|
148
|
+
|
|
149
|
+
## Development
|
|
150
|
+
|
|
151
|
+
```bash
|
|
152
|
+
npm install
|
|
153
|
+
npm test
|
|
154
|
+
npm run bench
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
## License
|
|
158
|
+
|
|
159
|
+
MIT
|
package/assets/logo.png
ADDED
|
Binary file
|
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { createWriteStream } from "node:fs";
|
|
3
|
+
import { readFile } from "node:fs/promises";
|
|
4
|
+
import { crawl } from "../src/index.js";
|
|
5
|
+
|
|
6
|
+
/**
 * Print the CLI help text (usage line, option list, and the robots.txt
 * notice) to standard output with a single console.log call.
 */
function usage() {
  const helpText = `
Cockroach Crawler

Usage:
cockroach-crawl <url> [more urls...] [options]

Options:
--url-file <file> Read seed URLs from a text file, one URL per line
--max-pages <n> Maximum pages to return. Default: 50
--max-depth <n> Maximum link depth from seeds. Default: 2
--concurrency <n> Concurrent workers. Default: 4
--delay <ms> Minimum delay per origin. Default: 250
--timeout <ms> Request timeout. Default: 15000
--sitemaps Discover URLs from robots.txt sitemaps and /sitemap.xml
--all-origins Allow crawling across origins discovered from links
--include <regex> Only crawl URLs matching regex. Can be repeated
--exclude <regex> Skip URLs matching regex. Can be repeated
--allow-non-public Allow likely login/account/admin/cart URLs
--jsonl Output JSON Lines instead of a JSON array
--output <file> Write output to a file
--user-agent <ua> Custom user agent
--contact <email/url> Add contact detail to the default user agent
--help Show this help

This crawler respects robots.txt by default and does not bypass access controls.
`;
  console.log(helpText);
}
|
|
34
|
+
|
|
35
|
+
/**
 * Read seed URLs from a UTF-8 text file, one entry per line.
 *
 * Lines are trimmed; blank lines and lines starting with "#" are dropped.
 * Entries are returned as-is (no URL validation happens here).
 *
 * @param {string} file - Path of the file to read.
 * @returns {Promise<string[]>} Remaining lines in file order.
 */
async function urlsFromFile(file) {
  const contents = await readFile(file, "utf8");
  const seeds = [];
  for (const rawLine of contents.split(/\r?\n/)) {
    const entry = rawLine.trim();
    if (entry === "" || entry.startsWith("#")) continue;
    seeds.push(entry);
  }
  return seeds;
}
|
|
42
|
+
|
|
43
|
+
/**
 * Parse CLI arguments into seed URLs and crawl/output options.
 *
 * Any argument that does not start with "-" is treated as a seed URL.
 * Flags that take a value consume the following argument; a missing value,
 * or a numeric value that is not a finite non-negative number, is rejected
 * with an Error instead of silently turning into `undefined` / `NaN`.
 *
 * @param {string[]} argv - Arguments after the node binary and script path.
 * @returns {Promise<{urls: string[], options: object}>} Seeds (positional
 *   arguments plus any read via --url-file) and the parsed options.
 * @throws {Error} On an unknown option, a missing option value, or an
 *   invalid numeric value.
 */
async function readArgs(argv) {
  const urls = [];
  const include = [];
  const exclude = [];
  const options = {
    maxPages: 50,
    maxDepth: 2,
    concurrency: 4,
    delayMs: 250,
    timeoutMs: 15_000,
    includeSitemaps: false,
    sameOrigin: true,
    publicOnly: true,
    jsonl: false,
    output: null,
    userAgent: undefined
  };

  let index = 0;

  // Consume the argument following the current flag, failing loudly if absent.
  const takeValue = (flag) => {
    index += 1;
    const value = argv[index];
    if (value === undefined) {
      throw new Error(`Missing value for ${flag}`);
    }
    return value;
  };

  // Consume a numeric value; NaN, infinities, negatives and "" are rejected.
  const takeNumber = (flag) => {
    const raw = takeValue(flag);
    const parsed = Number(raw);
    if (raw.trim() === "" || !Number.isFinite(parsed) || parsed < 0) {
      throw new Error(`Invalid value for ${flag}: ${raw}`);
    }
    return parsed;
  };

  for (; index < argv.length; index += 1) {
    const arg = argv[index];
    switch (arg) {
      case "--help":
      case "-h":
        options.help = true;
        break;
      case "--url-file":
        urls.push(...(await urlsFromFile(takeValue(arg))));
        break;
      case "--max-pages":
        options.maxPages = takeNumber(arg);
        break;
      case "--max-depth":
        options.maxDepth = takeNumber(arg);
        break;
      case "--concurrency":
        options.concurrency = takeNumber(arg);
        break;
      case "--delay":
        options.delayMs = takeNumber(arg);
        break;
      case "--timeout":
        options.timeoutMs = takeNumber(arg);
        break;
      case "--sitemaps":
        options.includeSitemaps = true;
        break;
      case "--all-origins":
        options.sameOrigin = false;
        break;
      case "--include":
        include.push(takeValue(arg));
        break;
      case "--exclude":
        exclude.push(takeValue(arg));
        break;
      case "--allow-non-public":
        options.publicOnly = false;
        break;
      case "--jsonl":
        options.jsonl = true;
        break;
      case "--output":
      case "-o":
        options.output = takeValue(arg);
        break;
      case "--user-agent":
        options.userAgent = takeValue(arg);
        break;
      case "--contact":
        options.userAgent = `CockroachCrawler/0.1 (+${takeValue(arg)})`;
        break;
      default:
        if (arg.startsWith("-")) {
          throw new Error(`Unknown option: ${arg}`);
        }
        urls.push(arg);
    }
  }

  options.include = include;
  options.exclude = exclude;
  return { urls, options };
}
|
|
107
|
+
|
|
108
|
+
/**
 * Serialize crawl results and write them to a file or to standard output.
 *
 * With `options.jsonl` each page becomes one JSON line; otherwise a single
 * pretty-printed JSON object `{ pages, stats }` is produced, where `stats`
 * is read from the `stats` property of the `pages` array (null when absent).
 *
 * @param {object[]} pages - Page records, optionally carrying a `stats` property.
 * @param {{jsonl?: boolean, output?: string|null}} options - Output settings.
 */
function writeOutput(pages, options) {
  let body;
  if (options.jsonl) {
    const records = pages.map((page) => JSON.stringify(page));
    body = `${records.join("\n")}\n`;
  } else {
    const document = { pages, stats: pages.stats || null };
    body = `${JSON.stringify(document, null, 2)}\n`;
  }

  if (options.output) {
    // The stream is ended immediately with the whole payload.
    createWriteStream(options.output, { encoding: "utf8" }).end(body);
    return;
  }
  process.stdout.write(body);
}
|
|
120
|
+
|
|
121
|
+
/**
 * CLI entry point: parse arguments, run the crawl, and emit the results.
 *
 * Shows the help text when --help is given or when no seed URL was supplied;
 * the latter also sets a failing exit code. Per-URL crawl failures are
 * reported on stderr as warnings and do not abort the run.
 */
async function main() {
  const { urls, options } = await readArgs(process.argv.slice(2));

  const wantsHelp = Boolean(options.help);
  if (wantsHelp || urls.length === 0) {
    usage();
    if (!wantsHelp) process.exitCode = 1;
    return;
  }

  const {
    maxPages,
    maxDepth,
    concurrency,
    delayMs,
    timeoutMs,
    includeSitemaps,
    sameOrigin,
    publicOnly,
    include,
    exclude,
    userAgent
  } = options;

  const reportFailure = (failure) => {
    process.stderr.write(`crawl warning: ${failure.url}: ${failure.error}\n`);
  };

  const pages = await crawl({
    seeds: urls,
    maxPages,
    maxDepth,
    concurrency,
    delayMs,
    timeoutMs,
    includeSitemaps,
    sameOrigin,
    publicOnly,
    include,
    exclude,
    userAgent,
    onError: reportFailure
  });

  writeOutput(pages, options);
}

// Any error that escapes main() is printed and turned into a failing exit code.
main().catch((error) => {
  const details = error.stack || error.message || String(error);
  process.stderr.write(`${details}\n`);
  process.exitCode = 1;
});
|
package/package.json
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "cockroach-crawler",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Public-web crawler for agent workflows with robots.txt, sitemaps, depth limits, filters, markdown extraction, and JSONL output.",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"author": {
|
|
7
|
+
"name": "Ajnas",
|
|
8
|
+
"email": "ajnasnb@gmail.com"
|
|
9
|
+
},
|
|
10
|
+
"type": "module",
|
|
11
|
+
"bin": {
|
|
12
|
+
"cockroach-crawl": "bin/cockroach-crawl.js",
|
|
13
|
+
"cockroach-crawler": "bin/cockroach-crawl.js"
|
|
14
|
+
},
|
|
15
|
+
"main": "src/index.js",
|
|
16
|
+
"exports": {
|
|
17
|
+
".": "./src/index.js"
|
|
18
|
+
},
|
|
19
|
+
"files": [
|
|
20
|
+
"bin/",
|
|
21
|
+
"src/",
|
|
22
|
+
"assets/",
|
|
23
|
+
"README.md",
|
|
24
|
+
"LICENSE"
|
|
25
|
+
],
|
|
26
|
+
"scripts": {
|
|
27
|
+
"test": "node --test",
|
|
28
|
+
"bench": "node bench/local-benchmark.mjs",
|
|
29
|
+
"crawl": "node bin/cockroach-crawl.js"
|
|
30
|
+
},
|
|
31
|
+
"keywords": [
|
|
32
|
+
"crawler",
|
|
33
|
+
"cockroach-crawler",
|
|
34
|
+
"agent",
|
|
35
|
+
"web-crawler",
|
|
36
|
+
"scraper",
|
|
37
|
+
"robots.txt",
|
|
38
|
+
"sitemap",
|
|
39
|
+
"markdown",
|
|
40
|
+
"jsonl",
|
|
41
|
+
"research"
|
|
42
|
+
],
|
|
43
|
+
"dependencies": {
|
|
44
|
+
"cheerio": "^1.0.0",
|
|
45
|
+
"robots-parser": "^3.0.1",
|
|
46
|
+
"turndown": "^7.2.0"
|
|
47
|
+
},
|
|
48
|
+
"engines": {
|
|
49
|
+
"node": ">=20"
|
|
50
|
+
},
|
|
51
|
+
"repository": {
|
|
52
|
+
"type": "git",
|
|
53
|
+
"url": "git+https://github.com/AjnasNB/cockroach-crawler.git"
|
|
54
|
+
},
|
|
55
|
+
"bugs": {
|
|
56
|
+
"url": "https://github.com/AjnasNB/cockroach-crawler/issues"
|
|
57
|
+
},
|
|
58
|
+
"homepage": "https://github.com/AjnasNB/cockroach-crawler#readme",
|
|
59
|
+
"publishConfig": {
|
|
60
|
+
"access": "public"
|
|
61
|
+
}
|
|
62
|
+
}
|
package/src/index.js
ADDED
|
@@ -0,0 +1,386 @@
|
|
|
1
|
+
import * as cheerio from "cheerio";
|
|
2
|
+
import robotsParser from "robots-parser";
|
|
3
|
+
import TurndownService from "turndown";
|
|
4
|
+
|
|
5
|
+
/** User-Agent header value used when the caller does not provide one. */
const DEFAULT_USER_AGENT = "CockroachCrawler/0.1 (+https://github.com/AjnasNB/cockroach-crawler)";

/** Default upper bound on a response body, in bytes (3 MiB). */
const DEFAULT_MAX_BYTES = 3 * 1024 ** 2;

/**
 * Return a promise that resolves after `ms` milliseconds.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
function sleep(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
|
9
|
+
|
|
10
|
+
/**
 * Resolve `value` (optionally relative to `base`) into an absolute URL string.
 *
 * @param {string} value - Absolute or relative URL.
 * @param {string} [base] - Base URL used to resolve relative values.
 * @returns {string|null} The serialized URL, or null when it cannot be parsed.
 */
function toUrl(value, base) {
  let resolved = null;
  try {
    resolved = new URL(value, base).toString();
  } catch {
    // Unparseable input is reported as null instead of throwing.
  }
  return resolved;
}
|
|
17
|
+
|
|
18
|
+
/**
 * Canonicalize a URL for de-duplication: drop the fragment and any explicit
 * default port (80 for http, 443 for https).
 *
 * @param {string} value - Absolute URL.
 * @returns {string} The normalized, serialized URL.
 * @throws {TypeError} When `value` is not a valid absolute URL.
 */
function normalizeUrl(value) {
  const parsed = new URL(value);
  const defaultPort = { "http:": "80", "https:": "443" }[parsed.protocol];
  parsed.hash = "";
  if (defaultPort !== undefined && parsed.port === defaultPort) {
    parsed.port = "";
  }
  return parsed.href;
}
|
|
26
|
+
|
|
27
|
+
/**
 * Tell whether `value` parses as an absolute URL using http or https.
 *
 * @param {string} value - Candidate URL.
 * @returns {boolean} False for unparseable input or any other scheme.
 */
function isHttpUrl(value) {
  let protocol;
  try {
    ({ protocol } = new URL(value));
  } catch {
    return false;
  }
  return protocol === "https:" || protocol === "http:";
}
|
|
35
|
+
|
|
36
|
+
/**
 * Compare the origins (scheme, host, port) of two absolute URLs.
 *
 * @param {string} a - First absolute URL.
 * @param {string} b - Second absolute URL.
 * @returns {boolean} True when both URLs share the same origin.
 * @throws {TypeError} When either argument is not a valid absolute URL.
 */
function sameOrigin(a, b) {
  const [left, right] = [a, b].map((value) => new URL(value).origin);
  return left === right;
}
|
|
39
|
+
|
|
40
|
+
/**
 * Turn a list of regex sources into RegExp objects.
 *
 * @param {Array<string|RegExp>} [values] - Pattern sources; a falsy value
 *   is treated as an empty list.
 * @returns {RegExp[]} One compiled pattern per input entry.
 * @throws {SyntaxError} When an entry is not a valid regular expression.
 */
function compilePatterns(values) {
  const sources = values || [];
  const toRegExp = (source) => new RegExp(source);
  return sources.map(toRegExp);
}
|
|
43
|
+
|
|
44
|
+
/**
 * Check whether `value` matches at least one of the given patterns.
 *
 * @param {string} value - Text to test (a URL in this module).
 * @param {RegExp[]} patterns - Compiled patterns.
 * @returns {boolean} True on the first match; false for an empty list.
 */
function matchesAny(value, patterns) {
  const hits = (pattern) => pattern.test(value);
  return patterns.some(hits);
}
|
|
47
|
+
|
|
48
|
+
/**
 * Fetch a URL and return its body decoded as UTF-8, enforcing a timeout and
 * a maximum body size.
 *
 * The timeout covers both the request and the body download. If anything
 * fails after the request was started (size limit exceeded, read error,
 * timeout) the request is aborted so the connection does not keep
 * downloading a body nobody will read.
 *
 * @param {string} url - URL to fetch.
 * @param {object} options
 * @param {number} options.timeoutMs - Abort the request after this many milliseconds.
 * @param {number} options.maxBytes - Maximum accepted body size in bytes.
 * @param {string} options.userAgent - Value of the User-Agent header.
 * @param {string} [options.accept] - Value of the Accept header.
 * @returns {Promise<{response: Response, text: string, contentType: string}>}
 *   The response object, the decoded body, and the Content-Type header
 *   ("" when absent).
 * @throws {Error} When the declared or received size exceeds `maxBytes`,
 *   or when the fetch fails or is aborted.
 */
async function fetchText(url, options) {
  const controller = new AbortController();
  const timeout = setTimeout(() => controller.abort(), options.timeoutMs);
  try {
    const response = await fetch(url, {
      headers: {
        "user-agent": options.userAgent,
        accept: options.accept || "text/html,application/xhtml+xml,application/xml;q=0.9,text/plain;q=0.8,*/*;q=0.5"
      },
      redirect: "follow",
      signal: controller.signal
    });

    const contentType = response.headers.get("content-type") || "";
    // Cheap early rejection when the server declares an oversized body.
    const length = Number(response.headers.get("content-length") || 0);
    if (length > options.maxBytes) {
      throw new Error(`Response too large: ${length} bytes`);
    }

    const reader = response.body?.getReader();
    if (!reader) {
      return { response, text: await response.text(), contentType };
    }

    // Content-Length may be missing or wrong, so count the bytes as they arrive.
    const chunks = [];
    let received = 0;
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      received += value.byteLength;
      if (received > options.maxBytes) {
        throw new Error(`Response exceeded maxBytes: ${options.maxBytes}`);
      }
      chunks.push(value);
    }
    return { response, text: Buffer.concat(chunks).toString("utf8"), contentType };
  } catch (error) {
    // Tear down the in-flight request; without this an oversized body would
    // keep streaming in the background after the error is thrown.
    controller.abort();
    throw error;
  } finally {
    clearTimeout(timeout);
  }
}
|
|
89
|
+
|
|
90
|
+
/**
 * Collect the values of all `Sitemap:` lines from a robots.txt body.
 *
 * The directive name is matched case-insensitively at the start of the
 * trimmed line; entries with an empty value are skipped.
 *
 * @param {string} robotsText - Raw robots.txt content.
 * @returns {string[]} Sitemap locations in file order (not validated).
 */
function parseRobotsSitemaps(robotsText) {
  const sitemaps = [];
  for (const rawLine of robotsText.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (!/^sitemap:/i.test(line)) continue;
    const location = line.replace(/^sitemap:\s*/i, "").trim();
    if (location) sitemaps.push(location);
  }
  return sitemaps;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Download and parse `/robots.txt` for an origin.
 *
 * The download is capped at 512 KiB (or `options.maxBytes` if smaller).
 * A non-OK response or any error while fetching/parsing yields a parser
 * built from empty rules together with an empty sitemap list.
 * NOTE(review): this means server errors are handled like a missing
 * robots.txt; RFC 9309 suggests treating 5xx as "disallow all" — confirm
 * the permissive fallback is intended.
 *
 * @param {string} origin - Origin (scheme + host + port) to load rules for.
 * @param {object} options - Fetch options, forwarded to fetchText.
 * @returns {Promise<{parser: object, sitemaps: string[]}>} The robots-parser
 *   instance and the sitemap URLs declared in the file.
 */
async function loadRobots(origin, options) {
  const robotsUrl = new URL("/robots.txt", origin).toString();
  const emptyRules = () => ({
    parser: robotsParser(robotsUrl, ""),
    sitemaps: []
  });

  try {
    const { response, text } = await fetchText(robotsUrl, {
      ...options,
      accept: "text/plain,*/*;q=0.5",
      maxBytes: Math.min(options.maxBytes, 512 * 1024)
    });
    if (response.ok) {
      return {
        parser: robotsParser(robotsUrl, text),
        sitemaps: parseRobotsSitemaps(text)
      };
    }
    return emptyRules();
  } catch {
    return emptyRules();
  }
}
|
|
124
|
+
|
|
125
|
+
/**
 * Collect page URLs from a sitemap, following sitemap index files.
 *
 * `<url><loc>` entries are returned as page URLs. `<sitemap><loc>` entries
 * (sitemap index files) point at further sitemaps, so they are fetched and
 * expanded recursively rather than being returned as if they were pages.
 * A shared `visited` set prevents loops and caps the total number of
 * sitemap documents fetched.
 *
 * Discovery is best effort: a sitemap that cannot be fetched or parsed
 * simply contributes no URLs.
 *
 * @param {string} sitemapUrl - URL of the sitemap or sitemap index.
 * @param {object} options - Fetch options forwarded to fetchText. An optional
 *   `maxSitemaps` (default 25) limits how many sitemap documents are fetched
 *   in total for one top-level call.
 * @param {Set<string>} [visited] - Sitemap URLs already fetched; used by the
 *   recursion and normally omitted by callers.
 * @returns {Promise<string[]>} Normalized http(s) page URLs (may contain duplicates).
 */
async function discoverSitemapUrls(sitemapUrl, options, visited = new Set()) {
  const discovered = [];
  const maxSitemaps = options?.maxSitemaps ?? 25;
  if (visited.has(sitemapUrl) || visited.size >= maxSitemaps) return discovered;
  visited.add(sitemapUrl);

  try {
    const { response, text, contentType } = await fetchText(sitemapUrl, {
      ...options,
      accept: "application/xml,text/xml,*/*;q=0.5"
    });
    if (!response.ok) return discovered;
    // Accept XML by header, or by sniffing when the header is missing/wrong.
    if (!contentType.includes("xml") && !text.trim().startsWith("<")) return discovered;

    const $ = cheerio.load(text, { xmlMode: true });
    $("url > loc").each((_, el) => {
      const value = $(el).text().trim();
      if (isHttpUrl(value)) discovered.push(normalizeUrl(value));
    });

    const childSitemaps = [];
    $("sitemap > loc").each((_, el) => {
      const value = $(el).text().trim();
      if (isHttpUrl(value)) childSitemaps.push(normalizeUrl(value));
    });

    // Sequential on purpose: keeps the request rate against the host low.
    for (const childUrl of childSitemaps) {
      const nested = await discoverSitemapUrls(childUrl, options, visited);
      for (const pageUrl of nested) discovered.push(pageUrl);
    }
  } catch {
    // Best effort: keep whatever was collected before the failure.
    return discovered;
  }
  return discovered;
}
|
|
149
|
+
|
|
150
|
+
/**
 * Gather the distinct http(s) link targets of a parsed document.
 *
 * Each `a[href]` value is resolved against `baseUrl`, normalized, and kept
 * once, in order of first appearance. Unparseable and non-http(s) targets
 * are ignored.
 *
 * @param {Function} $ - Cheerio instance for the loaded document.
 * @param {string} baseUrl - URL used to resolve relative hrefs.
 * @returns {string[]} Unique normalized absolute URLs.
 */
function extractLinks($, baseUrl) {
  const unique = new Set();
  $("a[href]").each((_, anchor) => {
    const absolute = toUrl($(anchor).attr("href"), baseUrl);
    if (absolute && isHttpUrl(absolute)) {
      unique.add(normalizeUrl(absolute));
    }
  });
  return [...unique];
}
|
|
161
|
+
|
|
162
|
+
/**
 * Strip elements that should not contribute to extracted content:
 * non-content tags first, then elements marked hidden.
 * Mutates the document behind `$` in place.
 *
 * @param {Function} $ - Cheerio instance for the loaded document.
 */
function cleanForExtraction($) {
  const removable = [
    "script, style, noscript, template, svg, canvas, iframe",
    "[hidden], [aria-hidden='true']"
  ];
  for (const selector of removable) {
    $(selector).remove();
  }
}
|
|
166
|
+
|
|
167
|
+
/**
 * Convert an HTML document into a structured page record.
 *
 * Non-content and hidden elements are removed first. Text and markdown are
 * taken from the first `<main>` element when present, otherwise `<body>`.
 * The title falls back to the first `<h1>` when `<title>` is empty, and the
 * canonical URL falls back to `url` when no canonical link is declared.
 *
 * @param {string} html - Raw HTML of the page.
 * @param {string} url - URL the HTML was fetched from; used to resolve
 *   relative links.
 * @returns {{url: string, canonical: (string|null), title: string,
 *   description: string, h1: string, text: string, markdown: string,
 *   links: string[], fetchedAt: string}} The page record; `fetchedAt` is the
 *   ISO timestamp of the extraction.
 */
export function extractPage(html, url) {
  const $ = cheerio.load(html);
  cleanForExtraction($);

  const collapse = (value) => value.replace(/\s+/g, " ");
  const firstText = (selector) => $(selector).first().text();

  const rawTitle = firstText("title") || firstText("h1") || "";
  const canonicalHref = $("link[rel='canonical']").attr("href") || url;

  const mainElement = $("main").first();
  const contentRoot = mainElement.length ? mainElement : $("body");

  const converter = new TurndownService({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    bulletListMarker: "-"
  });
  // Collapse runs of blank lines so the markdown stays compact.
  const markdown = converter
    .turndown(contentRoot.html() || "")
    .replace(/\n{3,}/g, "\n\n")
    .trim();

  return {
    url,
    canonical: toUrl(canonicalHref, url),
    title: collapse(rawTitle.trim()),
    description: ($("meta[name='description']").attr("content") || "").trim(),
    h1: collapse(firstText("h1").trim()),
    text: collapse(contentRoot.text()).trim(),
    markdown,
    links: extractLinks($, url),
    fetchedAt: new Date().toISOString()
  };
}
|
|
201
|
+
|
|
202
|
+
/**
 * FIFO queue of crawl items backed by an array and a read offset.
 *
 * Dequeuing advances `offset` instead of shifting the array; once more than
 * 1000 items have been consumed and they make up over half of the array,
 * the consumed prefix is dropped.
 */
class CrawlQueue {
  constructor() {
    this.items = [];
    this.offset = 0;
  }

  /**
   * Append an item. A bare string is wrapped as `{ url, depth: 0 }`.
   *
   * @param {string|{url: string, depth: number}} url - URL or queue item.
   */
  push(url) {
    this.items.push(typeof url === "string" ? { url, depth: 0 } : url);
  }

  /**
   * Remove and return the oldest item.
   *
   * @returns {{url: string, depth: number}|null} The item, or null when empty.
   */
  shift() {
    const { items, offset } = this;
    if (offset >= items.length) return null;

    const head = items[offset];
    const consumed = offset + 1;
    if (consumed > 1000 && consumed * 2 > items.length) {
      // Compact: discard the already-consumed prefix.
      this.items = items.slice(consumed);
      this.offset = 0;
    } else {
      this.offset = consumed;
    }
    return head;
  }

  /** Number of items still waiting in the queue. */
  get length() {
    return this.items.length - this.offset;
  }
}
|
|
228
|
+
|
|
229
|
+
/**
 * Crawl public pages breadth-first starting from one or more seed URLs.
 *
 * @param {object} [input]
 * @param {string[]} [input.seeds] - Seed URLs (`input.urls` is accepted as an
 *   alias). Entries that are not http(s) URLs are ignored.
 * @param {number} [input.maxPages=50] - Maximum number of page records returned.
 * @param {number} [input.maxDepth=2] - Maximum link depth from the seeds.
 * @param {number} [input.concurrency=4] - Number of concurrent workers.
 * @param {boolean} [input.sameOrigin=true] - Only follow URLs on a seed origin.
 * @param {Array<string|RegExp>} [input.include] - Only crawl URLs matching one of these.
 * @param {Array<string|RegExp>} [input.exclude] - Skip URLs matching one of these.
 * @param {boolean} [input.publicOnly=true] - Skip paths that look like
 *   login/account/admin/cart pages.
 * @param {boolean} [input.includeSitemaps=false] - Seed the queue from sitemaps
 *   listed in robots.txt, or from `/sitemap.xml` when none are listed.
 * @param {boolean} [input.obeyRobots=true] - Skip URLs disallowed by robots.txt.
 * @param {string} [input.userAgent] - User-Agent header value.
 * @param {number} [input.delayMs=250] - Minimum spacing between page requests
 *   to the same origin.
 * @param {number} [input.timeoutMs=15000] - Per-request timeout.
 * @param {number} [input.maxBytes] - Maximum response body size in bytes.
 * @param {Function} [input.onPage] - Called (and awaited) with each page record.
 * @param {Function} [input.onError] - Called (and awaited) with `{ url, error }`
 *   for each URL that failed.
 * @returns {Promise<object[]>} Page records. The array also carries a
 *   non-enumerable `stats` property with fetch/skip/error counters.
 * @throws {Error} When no valid http(s) seed URL is supplied.
 */
export async function crawl(input = {}) {
  const seeds = (input.seeds || input.urls || [])
    .map((seed) => (isHttpUrl(seed) ? normalizeUrl(seed) : null))
    .filter(Boolean);

  if (!seeds.length) {
    throw new Error("At least one http(s) seed URL is required.");
  }

  const options = {
    maxPages: input.maxPages ?? 50,
    concurrency: input.concurrency ?? 4,
    sameOrigin: input.sameOrigin ?? true,
    maxDepth: input.maxDepth ?? 2,
    include: compilePatterns(input.include),
    exclude: compilePatterns(input.exclude),
    publicOnly: input.publicOnly ?? true,
    includeSitemaps: input.includeSitemaps ?? false,
    obeyRobots: input.obeyRobots ?? true,
    userAgent: input.userAgent || DEFAULT_USER_AGENT,
    delayMs: input.delayMs ?? 250,
    timeoutMs: input.timeoutMs ?? 15_000,
    maxBytes: input.maxBytes ?? DEFAULT_MAX_BYTES,
    onPage: input.onPage || null,
    onError: input.onError || null
  };

  // How long an idle worker waits before re-checking the queue.
  const IDLE_POLL_MS = 10;
  const NON_PUBLIC_PATH = /(?:login|signin|sign-in|signup|account|admin|dashboard|checkout|cart|billing|private|wp-admin)/i;

  const queue = new CrawlQueue();
  const seen = new Set();
  const enqueued = new Set();
  const results = [];
  const stats = {
    fetched: 0,
    skippedRobots: 0,
    skippedFiltered: 0,
    skippedNonPublic: 0,
    errors: 0
  };
  // origin -> Promise of robots data; caching the promise (not the result)
  // lets concurrent workers share a single robots.txt request.
  const robotsByOrigin = new Map();
  // origin -> timestamp of the most recently reserved request slot.
  const lastFetchByOrigin = new Map();
  const seedOrigins = new Set(seeds.map((seed) => new URL(seed).origin));
  // Number of workers currently processing an item (and possibly about to enqueue links).
  let active = 0;

  const isAllowedByFilters = (url) => {
    if (options.include.length && !matchesAny(url, options.include)) return false;
    if (options.exclude.length && matchesAny(url, options.exclude)) return false;
    return true;
  };

  const isPublicUrl = (url) => {
    if (!options.publicOnly) return true;
    return !NON_PUBLIC_PATH.test(new URL(url).pathname);
  };

  const enqueue = (url, depth = 0) => {
    if (!url || enqueued.has(url) || seen.has(url)) return;
    if (options.sameOrigin && !seedOrigins.has(new URL(url).origin)) return;
    if (!isAllowedByFilters(url)) {
      stats.skippedFiltered += 1;
      return;
    }
    if (!isPublicUrl(url)) {
      stats.skippedNonPublic += 1;
      return;
    }
    if (depth > options.maxDepth) return;
    enqueued.add(url);
    queue.push({ url, depth });
  };

  for (const seed of seeds) enqueue(seed, 0);

  function getRobots(url) {
    const origin = new URL(url).origin;
    let pending = robotsByOrigin.get(origin);
    if (!pending) {
      pending = loadRobots(origin, options);
      robotsByOrigin.set(origin, pending);
    }
    return pending;
  }

  if (options.includeSitemaps) {
    for (const seed of seeds) {
      const robots = await getRobots(seed);
      const sitemapUrls = robots.sitemaps.length
        ? robots.sitemaps
        : [new URL("/sitemap.xml", new URL(seed).origin).toString()];
      for (const sitemapUrl of sitemapUrls) {
        const urls = await discoverSitemapUrls(sitemapUrl, options);
        for (const url of urls) enqueue(url, 0);
      }
    }
  }

  async function waitForOrigin(url) {
    const origin = new URL(url).origin;
    const now = Date.now();
    const last = lastFetchByOrigin.get(origin);
    // Reserve the next slot synchronously, before sleeping, so concurrent
    // workers queue up behind each other instead of all waking together.
    const slot = last === undefined ? now : Math.max(now, last + options.delayMs);
    lastFetchByOrigin.set(origin, slot);
    if (slot > now) await sleep(slot - now);
  }

  // Fetch one queued URL, record the page, and enqueue its links.
  async function visit({ url, depth }) {
    if (seen.has(url)) return;
    seen.add(url);

    try {
      if (options.obeyRobots) {
        const robots = await getRobots(url);
        if (robots.parser && !robots.parser.isAllowed(url, options.userAgent)) {
          stats.skippedRobots += 1;
          return;
        }
      }

      await waitForOrigin(url);
      const { response, text, contentType } = await fetchText(url, options);
      if (!response.ok) {
        throw new Error(`HTTP ${response.status}`);
      }
      stats.fetched += 1;
      if (!/html|xml|text\//i.test(contentType)) return;
      // Another worker may have filled the quota while this fetch was in
      // flight; do not report pages that would be cut from the result.
      if (results.length >= options.maxPages) return;

      const page = extractPage(text, response.url || url);
      page.status = response.status;
      page.contentType = contentType;
      results.push(page);
      if (options.onPage) await options.onPage(page);

      for (const link of page.links) {
        // Bound queue growth relative to the number of pages requested.
        if (results.length + queue.length >= options.maxPages * 6) break;
        enqueue(link, depth + 1);
      }
    } catch (error) {
      stats.errors += 1;
      const failure = { url, error: error.message || String(error) };
      if (options.onError) await options.onError(failure);
    }
  }

  async function worker() {
    while (results.length < options.maxPages) {
      const item = queue.shift();
      if (!item) {
        // An empty queue only means "done" when nobody is still working:
        // an active worker may be about to enqueue freshly discovered links.
        if (active === 0) return;
        await sleep(IDLE_POLL_MS);
        continue;
      }
      active += 1;
      try {
        await visit(item);
      } finally {
        active -= 1;
      }
    }
  }

  const workerCount = Math.max(1, Math.min(options.concurrency, options.maxPages));
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  const sliced = results.slice(0, options.maxPages);
  Object.defineProperty(sliced, "stats", {
    value: stats,
    enumerable: false
  });
  return sliced;
}
|
|
385
|
+
|
|
386
|
+
export { discoverSitemapUrls, normalizeUrl };
|