ddg-search 2026.2.15-1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +78 -0
- package/bin/ddg-search.js +4 -0
- package/package.json +52 -0
- package/src/args.js +57 -0
- package/src/cli.js +50 -0
- package/src/constants.js +4 -0
- package/src/formatters.js +169 -0
- package/src/index.js +16 -0
- package/src/parser.js +87 -0
- package/src/search.js +84 -0
- package/src/usage.js +33 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 camo@hiddendj.com
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
package/README.md
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# ddg-search
|
|
2
|
+
|
|
3
|
+
DuckDuckGo HTML search scraper with multiple output formats. Provides a CLI and small library helpers to fetch result pages, handle pagination, and emit OpenSearch-style structured data.
|
|
4
|
+
|
|
5
|
+
## Requirements
|
|
6
|
+
- Node.js 22 or newer
|
|
7
|
+
|
|
8
|
+
## Installation
|
|
9
|
+
- Global CLI (npm): `npm install -g ddg-search`
|
|
10
|
+
- One-off run (npx): `npx ddg-search --help`
|
|
11
|
+
- Project dependency: `npm install ddg-search`
|
|
12
|
+
- Local dev from this repo: `pnpm install` then `pnpm link --global` or `pnpm install -g .` (enable via `corepack enable` if needed)
|
|
13
|
+
|
|
14
|
+
## CLI usage
|
|
15
|
+
```
|
|
16
|
+
Usage: ddg-search [options] <query>
|
|
17
|
+
|
|
18
|
+
Search DuckDuckGo and output results in structured formats.
|
|
19
|
+
|
|
20
|
+
Options:
|
|
21
|
+
-f, --format <fmt> Output format (default: json). See formats below.
|
|
22
|
+
-p, --pages <n> Maximum pages to scrape, 0 for unlimited (default: 5)
|
|
23
|
+
-r, --region <code> Region code, e.g. us-en, uk-en (default: all regions)
|
|
24
|
+
-t, --time <range> Time filter: d (day), w (week), m (month), y (year)
|
|
25
|
+
-h, --help Show this help message
|
|
26
|
+
|
|
27
|
+
Formats:
|
|
28
|
+
json OpenSearch 1.1 response conventions in JSON
|
|
29
|
+
jsonl One JSON object per result line (streaming-friendly)
|
|
30
|
+
csv CSV with headers
|
|
31
|
+
opensearch OpenSearch 1.1 Atom XML
|
|
32
|
+
markdown Numbered markdown list (AI/LLM-friendly)
|
|
33
|
+
compact Minimal token format for LLM context windows
|
|
34
|
+
|
|
35
|
+
Results are written to stdout; progress is written to stderr.
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Examples
|
|
39
|
+
- `ddg-search "node.js tutorial"`
|
|
40
|
+
- `ddg-search -f csv -p 3 "linux kernel"`
|
|
41
|
+
- `ddg-search -f opensearch "rust programming" > results.xml`
|
|
42
|
+
- `ddg-search -f compact "api docs" | llm "summarize these results"`
|
|
43
|
+
- `ddg-search -p 0 "scrape everything"`
|
|
44
|
+
- `ddg-search -r us-en -t w "recent news"`
|
|
45
|
+
- `ddg-search "rust programming" | jq '.items[].link'`
|
|
46
|
+
|
|
47
|
+
## Programmatic usage
|
|
48
|
+
```js
|
|
49
|
+
import { search, formatJson } from 'ddg-search';
|
|
50
|
+
|
|
51
|
+
const { results, spelling, zeroClick } = await search('rust programming', {
|
|
52
|
+
maxPages: 2,
|
|
53
|
+
region: 'us-en',
|
|
54
|
+
time: 'w',
|
|
55
|
+
});
|
|
56
|
+
|
|
57
|
+
// Convert to OpenSearch-style JSON
|
|
58
|
+
const output = formatJson({ results, spelling, zeroClick });
|
|
59
|
+
console.log(output);
|
|
60
|
+
```
|
|
61
|
+
Exports also include `fetchPage`, `parsePage`, and formatters like `formatCsv`, `formatJsonl`, `formatMarkdown`, `formatOpenSearch`, and `formatCompact`.
|
|
62
|
+
|
|
63
|
+
## Notes
|
|
64
|
+
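- DuckDuckGo may present a bot-detection challenge. If it appears on the first request, `search()` throws an error; if it appears on a later page, the scraper stops early and returns the results collected so far.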
|
|
65
|
+
- Respect site terms of use and rate-limit your requests; `search()` inserts random delays between pages by default.
|
|
66
|
+
|
|
67
|
+
## Development
|
|
68
|
+
- Run tests: `pnpm test`
|
|
69
|
+
- Coverage: `pnpm run coverage`
|
|
70
|
+
- Lint: `pnpm run lint`
|
|
71
|
+
- Format check: `pnpm run format`; auto-fix: `pnpm run format:write`
|
|
72
|
+
|
|
73
|
+
## Links
|
|
74
|
+
- npm: https://www.npmjs.com/package/ddg-search
|
|
75
|
+
- GitHub: https://github.com/camohiddendj/ddg-search
|
|
76
|
+
|
|
77
|
+
## Contributing
|
|
78
|
+
Please see [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines on issues and pull requests.
|
package/package.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "ddg-search",
|
|
3
|
+
"version": "2026.2.15-1",
|
|
4
|
+
"description": "DuckDuckGo HTML search scraper with multiple output formats",
|
|
5
|
+
"license": "MIT",
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "https://github.com/camohiddendj/ddg-search.git"
|
|
9
|
+
},
|
|
10
|
+
"bugs": {
|
|
11
|
+
"url": "https://github.com/camohiddendj/ddg-search/issues"
|
|
12
|
+
},
|
|
13
|
+
"homepage": "https://github.com/camohiddendj/ddg-search#readme",
|
|
14
|
+
"type": "module",
|
|
15
|
+
"main": "src/index.js",
|
|
16
|
+
"files": [
|
|
17
|
+
"bin",
|
|
18
|
+
"src",
|
|
19
|
+
"README.md",
|
|
20
|
+
"LICENSE"
|
|
21
|
+
],
|
|
22
|
+
"bin": {
|
|
23
|
+
"ddg-search": "bin/ddg-search.js"
|
|
24
|
+
},
|
|
25
|
+
"scripts": {
|
|
26
|
+
"test": "node --test",
|
|
27
|
+
"coverage": "c8 node --test",
|
|
28
|
+
"lint": "eslint .",
|
|
29
|
+
"format": "prettier --check .",
|
|
30
|
+
"format:write": "prettier --write ."
|
|
31
|
+
},
|
|
32
|
+
"engines": {
|
|
33
|
+
"node": ">=22"
|
|
34
|
+
},
|
|
35
|
+
"keywords": [
|
|
36
|
+
"duckduckgo",
|
|
37
|
+
"search",
|
|
38
|
+
"cli",
|
|
39
|
+
"scraper",
|
|
40
|
+
"opensearch"
|
|
41
|
+
],
|
|
42
|
+
"dependencies": {
|
|
43
|
+
"cheerio": "^1.2.0"
|
|
44
|
+
},
|
|
45
|
+
"devDependencies": {
|
|
46
|
+
"c8": "^9.1.0",
|
|
47
|
+
"eslint": "^8.57.1",
|
|
48
|
+
"eslint-config-prettier": "^9.1.0",
|
|
49
|
+
"prettier": "^3.3.3"
|
|
50
|
+
},
|
|
51
|
+
"packageManager": "pnpm@10.29.3+sha512.498e1fb4cca5aa06c1dcf2611e6fafc50972ffe7189998c409e90de74566444298ffe43e6cd2acdc775ba1aa7cc5e092a8b7054c811ba8c5770f84693d33d2dc"
|
|
52
|
+
}
|
package/src/args.js
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
import { parseArgs } from 'node:util';
|
|
2
|
+
import { usage } from './usage.js';
|
|
3
|
+
|
|
4
|
+
const SUPPORTED_FORMATS = ['json', 'jsonl', 'csv', 'opensearch', 'markdown', 'compact'];

/**
 * Parse ddg-search command-line arguments into search options.
 *
 * Validation failures are reported on stderr and signalled through `exitFn(1)`.
 * When `exitFn` does not terminate the process (e.g. a test double), parsing
 * continues and a best-effort result object is still returned.
 *
 * @param {string[]} [argv] - Arguments without the node/script prefix.
 * @param {(code: number) => void} [exitFn] - Called with the exit status on error/help.
 * @returns {{ query: string, maxPages: number, format: string, region: string, time: string }}
 *   `maxPages` is `Infinity` when `--pages 0` is given and `NaN` when the value is invalid.
 */
export function parseCliArgs(argv = process.argv.slice(2), exitFn = process.exit) {
  let parsed;
  try {
    parsed = parseArgs({
      allowPositionals: true,
      args: argv,
      options: {
        format: { type: 'string', short: 'f', default: 'json' },
        pages: { type: 'string', short: 'p' },
        region: { type: 'string', short: 'r' },
        time: { type: 'string', short: 't' },
        help: { type: 'boolean', short: 'h', default: false },
      },
    });
  } catch (e) {
    console.error(`Error: ${e.message}`);
    exitFn(1);
    return { query: '', maxPages: 0, format: 'json', region: '', time: '' };
  }

  const { values, positionals } = parsed;

  if (values.help || positionals.length === 0) {
    usage(exitFn);
  }

  const query = positionals.join(' ');

  // Accept only plain decimal digits. parseInt() would silently truncate
  // inputs such as "3abc" or "1.5", and would read "0x10" as 0 (= unlimited).
  let parsedPages = 5;
  if (values.pages != null) {
    const rawPages = values.pages.trim();
    parsedPages = /^\d+$/.test(rawPages) ? Number(rawPages) : NaN;
  }
  const maxPages = parsedPages === 0 ? Infinity : parsedPages;
  const format = values.format;
  const region = values.region || '';
  const time = values.time || '';

  if (!SUPPORTED_FORMATS.includes(format)) {
    console.error(`Unknown format: ${format}. Supported: ${SUPPORTED_FORMATS.join(', ')}`);
    exitFn(1);
  }

  if (Number.isNaN(parsedPages) || parsedPages < 0) {
    console.error('--pages must be a non-negative integer (0 for unlimited)');
    exitFn(1);
  }

  if (time && !['d', 'w', 'm', 'y'].includes(time)) {
    // Name the rejected value so the user can see what was wrong.
    console.error(`Unknown time range: ${time}. Supported: d, w, m, y`);
    exitFn(1);
  }

  return { query, maxPages, format, region, time };
}
|
|
56
|
+
|
|
57
|
+
export { usage };
|
package/src/cli.js
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import { parseCliArgs } from './args.js';
|
|
2
|
+
import {
|
|
3
|
+
formatCompact,
|
|
4
|
+
formatCsv,
|
|
5
|
+
formatJson,
|
|
6
|
+
formatJsonl,
|
|
7
|
+
formatMarkdown,
|
|
8
|
+
formatOpenSearch,
|
|
9
|
+
} from './formatters.js';
|
|
10
|
+
import { search } from './search.js';
|
|
11
|
+
|
|
12
|
+
/**
 * CLI entry point: parse arguments, run the search and print the formatted
 * result followed by a newline.
 *
 * Any error raised while searching or formatting is reported on stderr as
 * `Error: <message>` and signalled through `exit(1)`.
 *
 * @param {string[]} [argv] - Arguments without the node/script prefix.
 * @param {object} [deps] - Injectable collaborators.
 * @param {Function} [deps.searchImpl] - Search implementation (defaults to `search`).
 * @param {{ write: Function }} [deps.stdout] - Stream receiving the output.
 * @param {(code: number) => void} [deps.exit] - Exit function.
 * @returns {Promise<void>}
 */
export async function main(
  argv = process.argv.slice(2),
  { searchImpl = search, stdout = process.stdout, exit = process.exit } = {},
) {
  const cliOptions = parseCliArgs(argv, exit);
  const { query, maxPages, format, region, time } = cliOptions;

  try {
    const data = await searchImpl(query, { maxPages, region, time });

    // A Map (rather than a plain object) keeps names such as "constructor"
    // from resolving to inherited properties.
    const renderers = new Map([
      ['json', formatJson],
      ['jsonl', formatJsonl],
      ['csv', formatCsv],
      ['opensearch', formatOpenSearch],
      ['markdown', formatMarkdown],
      ['compact', formatCompact],
    ]);

    const render = renderers.get(format);
    if (render === undefined) {
      throw new Error(`Unsupported format: ${format}`);
    }

    stdout.write(render(data) + '\n');
  } catch (err) {
    console.error(`Error: ${err.message}`);
    exit(1);
  }
}
|
package/src/formatters.js
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import { BASE_URL } from './constants.js';
|
|
2
|
+
|
|
3
|
+
/**
 * Escape a single CSV field.
 *
 * Fields containing a double quote, a comma, or a line break (CR or LF) are
 * wrapped in double quotes with embedded quotes doubled. Everything else is
 * returned unchanged.
 *
 * @param {*} str - Field value; `null`/`undefined` become an empty field and
 *   other non-string values are converted with `String()`.
 * @returns {string} The field, quoted when necessary.
 */
export function escapeCsv(str) {
  const text = str == null ? '' : String(str);
  // A bare carriage return also breaks the row for many CSV readers.
  if (/[",\r\n]/.test(text)) {
    return `"${text.replaceAll('"', '""')}"`;
  }
  return text;
}
|
|
9
|
+
|
|
10
|
+
/**
 * Escape text for safe inclusion in XML element content or attribute values.
 *
 * Replaces the five XML special characters with their predefined entities.
 * The ampersand is replaced first so the entities produced by the later
 * replacements are not escaped a second time.
 *
 * @param {*} str - Text to escape; `null`/`undefined` yield an empty string and
 *   other non-string values are converted with `String()`.
 * @returns {string} The escaped text.
 */
export function escapeXml(str) {
  return String(str ?? '')
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&apos;');
}
|
|
18
|
+
|
|
19
|
+
/**
 * Render search data as pretty-printed JSON using OpenSearch-style keys.
 *
 * `spelling` and `zeroClick` are only included when present on `data`.
 *
 * @param {object} data - Search data with a `results` array and optional
 *   `query`, `pagesScraped`, `spelling` and `zeroClick` properties.
 * @returns {string} JSON text indented with two spaces.
 */
export function formatJson(data) {
  const total = data.results.length;

  const items = data.results.map((result, index) => ({
    position: index + 1,
    title: result.title,
    link: result.url,
    description: result.description,
    displayUrl: result.displayUrl,
  }));

  const payload = {
    'opensearch:totalResults': total,
    'opensearch:startIndex': 1,
    'opensearch:itemsPerPage': total,
    'opensearch:Query': { role: 'request', searchTerms: data.query },
    pagesScraped: data.pagesScraped,
    ...(data.spelling ? { spelling: data.spelling } : {}),
    ...(data.zeroClick ? { zeroClick: data.zeroClick } : {}),
    items,
  };

  return JSON.stringify(payload, null, 2);
}
|
|
49
|
+
|
|
50
|
+
/**
 * Render search data as JSON Lines: one JSON object per line.
 *
 * When a zero-click answer is present it is emitted first, tagged with
 * `type: 'zeroClick'`; each result follows with its 1-based position.
 *
 * @param {object} data - Search data with a `results` array and optional `zeroClick`.
 * @returns {string} Newline-separated JSON objects (no trailing newline).
 */
export function formatJsonl(data) {
  const records = [];

  if (data.zeroClick) {
    records.push({ type: 'zeroClick', ...data.zeroClick });
  }

  data.results.forEach((result, index) => {
    records.push({
      position: index + 1,
      title: result.title,
      link: result.url,
      description: result.description,
    });
  });

  return records.map((record) => JSON.stringify(record)).join('\n');
}
|
|
67
|
+
|
|
68
|
+
/**
 * Render search results as CSV with a header row.
 *
 * Columns are `position,title,link,description`; text fields are passed
 * through `escapeCsv`.
 *
 * @param {object} data - Search data with a `results` array.
 * @returns {string} CSV text with rows separated by `\n` (no trailing newline).
 */
export function formatCsv(data) {
  const header = 'position,title,link,description';

  const rows = data.results.map((result, index) => {
    const cells = [
      index + 1,
      escapeCsv(result.title),
      escapeCsv(result.url),
      escapeCsv(result.description),
    ];
    return cells.join(',');
  });

  return [header, ...rows].join('\n');
}
|
|
76
|
+
|
|
77
|
+
/**
 * Render search data as an Atom feed carrying OpenSearch 1.1 response elements.
 *
 * The feed contains one entry for the zero-click answer (when present, marked
 * with `<category term="zeroClick"/>`) followed by one entry per result. All
 * interpolated text is passed through `escapeXml`.
 *
 * Every entry carries an `<updated>` element, which Atom (RFC 4287) requires;
 * the generation time is used because no per-result timestamp is available.
 *
 * @param {object} data - Search data with `query`, a `results` array and an
 *   optional `zeroClick` object.
 * @returns {string} The XML document (no trailing newline).
 */
export function formatOpenSearch(data) {
  const now = new Date().toISOString();
  const searchUrl = `${BASE_URL}?q=${encodeURIComponent(data.query)}`;

  let xml = '<?xml version="1.0" encoding="UTF-8"?>\n';
  xml += '<feed xmlns="http://www.w3.org/2005/Atom"\n';
  xml += ' xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">\n';
  xml += ` <title>DuckDuckGo: ${escapeXml(data.query)}</title>\n`;
  xml += ` <link href="${escapeXml(searchUrl)}"/>\n`;
  xml += ` <updated>${now}</updated>\n`;
  xml += ` <id>${escapeXml(searchUrl)}</id>\n`;
  xml += ` <opensearch:totalResults>${data.results.length}</opensearch:totalResults>\n`;
  xml += ' <opensearch:startIndex>1</opensearch:startIndex>\n';
  xml += ` <opensearch:itemsPerPage>${data.results.length}</opensearch:itemsPerPage>\n`;
  xml += ` <opensearch:Query role="request" searchTerms="${escapeXml(data.query)}"/>\n`;

  if (data.zeroClick) {
    const zc = data.zeroClick;
    xml += ' <entry>\n';
    xml += ` <title type="text">${escapeXml(zc.heading)}</title>\n`;
    xml += ` <link href="${escapeXml(zc.url)}"/>\n`;
    xml += ` <id>${escapeXml(zc.url)}</id>\n`;
    xml += ` <updated>${now}</updated>\n`;
    xml += ` <summary>${escapeXml(zc.abstract)}</summary>\n`;
    xml += ' <category term="zeroClick"/>\n';
    xml += ' </entry>\n';
  }

  for (const r of data.results) {
    xml += ' <entry>\n';
    xml += ` <title>${escapeXml(r.title)}</title>\n`;
    xml += ` <link href="${escapeXml(r.url)}"/>\n`;
    xml += ` <id>${escapeXml(r.url)}</id>\n`;
    xml += ` <updated>${now}</updated>\n`;
    xml += ` <summary>${escapeXml(r.description)}</summary>\n`;
    xml += ' </entry>\n';
  }

  xml += '</feed>';
  return xml;
}
|
|
116
|
+
|
|
117
|
+
/**
 * Render search data as a Markdown document.
 *
 * Output is a heading with the query, a summary line, optional block quotes
 * for the spelling suggestion and zero-click answer, then a numbered list of
 * linked results, each followed by its description (if any) and a blank line.
 *
 * @param {object} data - Search data with `query`, `pagesScraped`, a `results`
 *   array and optional `spelling` / `zeroClick` objects.
 * @returns {string} Markdown text.
 */
export function formatMarkdown(data) {
  const out = [
    `# Search: ${data.query}`,
    `${data.results.length} results from ${data.pagesScraped} page(s)\n`,
  ];

  if (data.spelling) {
    out.push(`> **Did you mean:** ${data.spelling.corrected}\n`);
  }

  if (data.zeroClick) {
    const { heading, abstract, url, source } = data.zeroClick;
    const attribution = source ? ` (${source})` : '';
    out.push(`> **${heading}** — ${abstract}`, `> [Read more](${url})${attribution}\n`);
  }

  data.results.forEach((result, index) => {
    out.push(`${index + 1}. [${result.title}](${result.url})`);
    if (result.description) {
      out.push(` ${result.description}`);
    }
    out.push('');
  });

  return out.join('\n');
}
|
|
144
|
+
|
|
145
|
+
/**
 * Render search data in a compact plain-text layout.
 *
 * A short header (query, result count, optional spelling suggestion and
 * zero-click answer) is followed by a `---` separator and then, per result,
 * a `[n] title` line, the URL, and the description when there is one.
 *
 * @param {object} data - Search data with `query`, a `results` array and
 *   optional `spelling` / `zeroClick` objects.
 * @returns {string} The compact text (no trailing newline).
 */
export function formatCompact(data) {
  const header = [`query: ${data.query}`, `results: ${data.results.length}`];

  if (data.spelling) {
    header.push(`did_you_mean: ${data.spelling.corrected}`);
  }

  if (data.zeroClick) {
    header.push(
      `zero_click: ${data.zeroClick.heading}`,
      ` ${data.zeroClick.url}`,
      ` ${data.zeroClick.abstract}`,
    );
  }

  const body = data.results.flatMap((result, index) => {
    const entry = [`[${index + 1}] ${result.title}`, ` ${result.url}`];
    if (result.description) {
      entry.push(` ${result.description}`);
    }
    return entry;
  });

  return [...header, '---', ...body].join('\n');
}
|
package/src/index.js
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
export { BASE_URL, USER_AGENT } from './constants.js';
|
|
2
|
+
export { usage } from './usage.js';
|
|
3
|
+
export { parseCliArgs } from './args.js';
|
|
4
|
+
export { parsePage, isBotDetection } from './parser.js';
|
|
5
|
+
export { fetchPage, randomDelay, search } from './search.js';
|
|
6
|
+
export {
|
|
7
|
+
escapeCsv,
|
|
8
|
+
escapeXml,
|
|
9
|
+
formatCompact,
|
|
10
|
+
formatCsv,
|
|
11
|
+
formatJson,
|
|
12
|
+
formatJsonl,
|
|
13
|
+
formatMarkdown,
|
|
14
|
+
formatOpenSearch,
|
|
15
|
+
} from './formatters.js';
|
|
16
|
+
export { main } from './cli.js';
|
package/src/parser.js
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import * as cheerio from 'cheerio';
|
|
2
|
+
|
|
3
|
+
/**
 * Report whether an HTML page contains one of the bot-detection markers
 * (`anomaly-modal` or `challenge-form`).
 *
 * @param {string} html - Raw page HTML.
 * @returns {boolean} `true` when either marker substring is present.
 */
export function isBotDetection(html) {
  const markers = ['anomaly-modal', 'challenge-form'];
  return markers.some((marker) => html.includes(marker));
}
|
|
6
|
+
|
|
7
|
+
/**
 * Extract structured data from one results page.
 *
 * @param {string} html - Raw page HTML.
 * @returns {{
 *   results: Array<{ title: string, url: string, description: string, displayUrl: string }>,
 *   spelling: ({ corrected: string, original?: string } | null),
 *   zeroClick: ({ heading: string, url: string, abstract: string, image?: string, source?: string } | null),
 *   noMoreResults: boolean,
 *   nextPageData: (Object<string, string> | null),
 * }} `nextPageData` holds the hidden form fields of the "Next" navigation
 *   form, or `null` when no such form is found.
 */
export function parsePage(html) {
  const $ = cheerio.load(html);

  // Spelling suggestion: the first link is the corrected query, the optional
  // second link is the original one (surrounding double quotes stripped).
  let spelling = null;
  const suggestionBox = $('#did_you_mean');
  if (suggestionBox.length > 0) {
    const suggestionLinks = suggestionBox.find('a');
    const correctedAnchor = suggestionLinks.first();
    if (correctedAnchor.length > 0) {
      spelling = { corrected: correctedAnchor.text().trim() };
      const originalAnchor = suggestionLinks.eq(1);
      if (originalAnchor.length > 0) {
        spelling.original = originalAnchor.text().trim().replace(/^"|"$/g, '');
      }
    }
  }

  // Zero-click answer box; only reported when it has a heading.
  let zeroClick = null;
  const answerBox = $('.zci-wrapper .zci');
  if (answerBox.length > 0) {
    const headingLink = answerBox.find('.zci__heading a');
    const abstractNode = answerBox.find('#zero_click_abstract');
    const imageNode = abstractNode.find('.zci__image');
    const sourceNode = abstractNode.find('a q');

    const heading = headingLink.text().trim();
    const url = headingLink.attr('href') || '';

    // Work on a copy so removing links does not alter the parsed document.
    const abstractCopy = abstractNode.clone();
    abstractCopy.find('a').remove();
    const abstract = abstractCopy.text().trim();

    if (heading) {
      zeroClick = { heading, url, abstract };

      const imageSrc = imageNode.attr('src');
      if (imageSrc) {
        zeroClick.image = imageSrc;
      }

      const sourceName = sourceNode.text().trim();
      if (sourceName) {
        zeroClick.source = sourceName;
      }
    }
  }

  // Organic results: ads and the "no results" placeholder are excluded, and a
  // result is kept only when it has both a title and a URL.
  const results = [];
  const resultNodes = $('.result.web-result')
    .not('.result--ad')
    .not('.result--no-result')
    .toArray();
  for (const node of resultNodes) {
    const $result = $(node);
    const titleLink = $result.find('.result__a');

    const title = titleLink.text().trim();
    const url = titleLink.attr('href') || '';
    const description = $result.find('.result__snippet').text().trim();
    const displayUrl = $result.find('.result__url').text().trim();

    if (title && url) {
      results.push({ title, url, description, displayUrl });
    }
  }

  const noMoreResults = $('.result--no-result').length > 0;

  // Pagination: collect the hidden inputs of the form whose submit button is
  // labelled "Next". If several forms match, the last one wins.
  let nextPageData = null;
  for (const navNode of $('.nav-link').toArray()) {
    const $form = $(navNode).find('form');
    if ($form.find('input[type="submit"]').val() !== 'Next') {
      continue;
    }

    nextPageData = {};
    for (const inputNode of $form.find('input[type="hidden"]').toArray()) {
      const $hidden = $(inputNode);
      const fieldName = $hidden.attr('name');
      if (fieldName) {
        nextPageData[fieldName] = $hidden.attr('value') || '';
      }
    }
  }

  return { results, spelling, zeroClick, noMoreResults, nextPageData };
}
|
package/src/search.js
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { BASE_URL, USER_AGENT } from './constants.js';
|
|
2
|
+
import { isBotDetection, parsePage } from './parser.js';
|
|
3
|
+
|
|
4
|
+
/**
 * Fetch a page and return its body as text.
 *
 * Sends a GET request by default. When `postData` is truthy, the request is a
 * form-encoded POST built from it.
 *
 * @param {string} url - Address to request.
 * @param {object|null} [postData] - Form fields for a POST request.
 * @param {Function} [fetchImpl] - fetch-compatible function (defaults to global `fetch`).
 * @returns {Promise<string>} The response body.
 * @throws {Error} `HTTP <status> <statusText>` when the response is not ok.
 */
export async function fetchPage(url, postData, fetchImpl = fetch) {
  const headers = { 'User-Agent': USER_AGENT };
  const request = { headers };

  if (postData) {
    headers['Content-Type'] = 'application/x-www-form-urlencoded';
    request.method = 'POST';
    request.body = new URLSearchParams(postData).toString();
  }

  const response = await fetchImpl(url, request);
  if (response.ok) {
    return response.text();
  }
  throw new Error(`HTTP ${response.status} ${response.statusText}`);
}
|
|
21
|
+
|
|
22
|
+
/**
 * Wait for a random interval of at least 800 ms and less than 2900 ms.
 *
 * @returns {Promise<void>} Resolves once the timer fires.
 */
export function randomDelay() {
  const jitterMs = Math.random() * 2100;
  const waitMs = 800 + jitterMs;
  return new Promise((done) => {
    setTimeout(done, waitMs);
  });
}
|
|
26
|
+
|
|
27
|
+
/**
 * Run a search and collect results across result pages.
 *
 * The first page is fetched with a GET request; later pages are fetched by
 * POSTing the hidden fields of the previous page's "Next" form, with `delay()`
 * awaited before each of those requests. Paging stops when there is no next
 * form, the page reports no more results, `maxPages` pages have been read, or
 * bot detection is hit on a later page (results so far are then returned).
 *
 * @param {string} query - Search terms.
 * @param {object} [options]
 * @param {number} [options.maxPages=5] - Maximum number of pages to read; pass
 *   `Infinity` for no limit. The first page is always fetched.
 * @param {string} [options.region] - Region code sent as `kl` when truthy.
 * @param {string} [options.time] - Time filter sent as `df` when truthy.
 * @param {Function} [options.fetchImpl] - fetch-compatible function.
 * @param {Function} [options.delay] - Async pause awaited between pages.
 * @param {object} [options.stderr] - Stream for progress; written to only when `isTTY` is truthy.
 * @returns {Promise<{ results: object[], spelling: (object|null), zeroClick: (object|null), pagesScraped: number, query: string }>}
 * @throws {Error} When bot detection is triggered on the first request, or a request fails.
 */
export async function search(
  query,
  {
    // Defaults make `search(query)` usable without an options object and keep
    // the library in line with the CLI's documented default of five pages.
    maxPages = 5,
    region,
    time,
    fetchImpl = fetch,
    delay = randomDelay,
    stderr = process.stderr,
  } = {},
) {
  const allResults = [];
  let spelling = null;
  let zeroClick = null;
  let page = 0;
  const showProgress = stderr.isTTY;

  // Overwrites the current terminal line with the running totals.
  const reportProgress = (pageResults) => {
    if (showProgress) {
      stderr.write(`\rPage ${page}: ${pageResults.length} results (${allResults.length} total)`);
    }
  };

  const params = new URLSearchParams({ q: query });
  if (region) params.set('kl', region);
  if (time) params.set('df', time);

  const firstHtml = await fetchPage(`${BASE_URL}?${params}`, null, fetchImpl);

  if (isBotDetection(firstHtml)) {
    throw new Error('Anti-bot detection triggered on first request. Try again later.');
  }

  let parsed = parsePage(firstHtml);
  allResults.push(...parsed.results);
  // Spelling and zero-click data are taken from the first page only.
  spelling = parsed.spelling;
  zeroClick = parsed.zeroClick;
  page++;
  reportProgress(parsed.results);

  while (parsed.nextPageData && !parsed.noMoreResults && page < maxPages) {
    await delay();

    const html = await fetchPage(BASE_URL, parsed.nextPageData, fetchImpl);

    if (isBotDetection(html)) {
      if (showProgress) {
        stderr.write('\n');
        stderr.write('Anti-bot detection hit. Returning results collected so far.\n');
      }
      break;
    }

    parsed = parsePage(html);
    allResults.push(...parsed.results);
    page++;
    reportProgress(parsed.results);
  }

  if (showProgress) {
    stderr.write('\n');
  }

  return { results: allResults, spelling, zeroClick, pagesScraped: page, query };
}
|
package/src/usage.js
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
/**
 * Print the command-line help text to stderr and call `exitFn(1)`.
 *
 * @param {(code: number) => void} [exitFn] - Exit function (defaults to `process.exit`).
 * @returns {void}
 */
export function usage(exitFn = process.exit) {
  const helpLines = [
    'Usage: ddg-search [options] <query>',
    '',
    'Search DuckDuckGo and output results in structured formats.',
    '',
    'Options:',
    '-f, --format <fmt> Output format (default: json). See formats below.',
    '-p, --pages <n> Maximum pages to scrape, 0 for unlimited (default: 5)',
    '-r, --region <code> Region code, e.g. us-en, uk-en (default: all regions)',
    '-t, --time <range> Time filter: d (day), w (week), m (month), y (year)',
    '-h, --help Show this help message',
    '',
    'Formats:',
    'json OpenSearch 1.1 response conventions in JSON',
    'jsonl One JSON object per result line (streaming-friendly)',
    'csv CSV with headers',
    'opensearch OpenSearch 1.1 Atom XML',
    'markdown Numbered markdown list (AI/LLM-friendly)',
    'compact Minimal token format for LLM context windows',
    '',
    'Results are written to stdout; progress is written to stderr.',
    '',
    'Examples:',
    'ddg-search "node.js tutorial"',
    'ddg-search -f csv -p 3 "linux kernel"',
    'ddg-search -f opensearch "rust programming" > results.xml',
    'ddg-search -f compact "api docs" | llm "summarize these results"',
    'ddg-search -p 0 "scrape everything"',
    'ddg-search -r us-en -t w "recent news"',
    `ddg-search "rust programming" | jq '.items[].link'`,
  ];

  console.error(helpLines.join('\n'));
  exitFn(1);
}
|