@nitpicker/cli 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +16 -0
- package/LICENSE +191 -0
- package/README.md +21 -0
- package/bin/nitpicker.js +3 -0
- package/lib/analyze/log.d.ts +12 -0
- package/lib/analyze/log.js +24 -0
- package/lib/analyze/logger.d.ts +10 -0
- package/lib/analyze/logger.js +16 -0
- package/lib/audit.d.ts +7 -0
- package/lib/audit.js +42 -0
- package/lib/bootstrap.d.ts +9 -0
- package/lib/bootstrap.js +22 -0
- package/lib/commands/analyze.d.ts +35 -0
- package/lib/commands/analyze.js +86 -0
- package/lib/commands/crawl.d.ts +125 -0
- package/lib/commands/crawl.js +249 -0
- package/lib/commands/report.d.ts +45 -0
- package/lib/commands/report.js +62 -0
- package/lib/crawl/debug.d.ts +6 -0
- package/lib/crawl/debug.js +10 -0
- package/lib/crawl/diff.d.ts +10 -0
- package/lib/crawl/diff.js +38 -0
- package/lib/crawl/event-assignments.d.ts +17 -0
- package/lib/crawl/event-assignments.js +33 -0
- package/lib/index.d.ts +1 -0
- package/lib/index.js +28 -0
- package/lib/log.d.ts +8 -0
- package/lib/log.js +45 -0
- package/lib/logger.d.ts +10 -0
- package/lib/logger.js +16 -0
- package/package.json +45 -0
- package/src/analyze/log.spec.ts +88 -0
- package/src/analyze/log.ts +28 -0
- package/src/commands/analyze.ts +107 -0
- package/src/commands/crawl.ts +303 -0
- package/src/commands/report.ts +73 -0
- package/src/crawl/debug.ts +14 -0
- package/src/crawl/diff.ts +48 -0
- package/src/crawl/event-assignments.ts +48 -0
- package/src/index.ts +32 -0
- package/tsconfig.json +15 -0
- package/tsconfig.tsbuildinfo +1 -0
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import { afterEach, describe, it, expect, vi, beforeEach } from 'vitest';
|
|
2
|
+
|
|
3
|
+
import { log } from './log.js';
|
|
4
|
+
|
|
5
|
+
describe('log', () => {
|
|
6
|
+
let consoleSpy: ReturnType<typeof vi.spyOn>;
|
|
7
|
+
let consoleErrorSpy: ReturnType<typeof vi.spyOn>;
|
|
8
|
+
let mockNitpicker: {
|
|
9
|
+
handlers: Map<string, ((...args: never[]) => void)[]>;
|
|
10
|
+
on: ReturnType<typeof vi.fn>;
|
|
11
|
+
emit: (event: string, data: unknown) => void;
|
|
12
|
+
};
|
|
13
|
+
|
|
14
|
+
beforeEach(() => {
|
|
15
|
+
consoleSpy = vi.spyOn(console, 'log').mockImplementation(() => {});
|
|
16
|
+
consoleErrorSpy = vi.spyOn(console, 'error').mockImplementation(() => {});
|
|
17
|
+
|
|
18
|
+
mockNitpicker = {
|
|
19
|
+
handlers: new Map(),
|
|
20
|
+
on: vi.fn((event: string, handler: (...args: never[]) => void) => {
|
|
21
|
+
const handlers = mockNitpicker.handlers.get(event) ?? [];
|
|
22
|
+
handlers.push(handler);
|
|
23
|
+
mockNitpicker.handlers.set(event, handlers);
|
|
24
|
+
}),
|
|
25
|
+
emit(event: string, data: unknown) {
|
|
26
|
+
const handlers = this.handlers.get(event) ?? [];
|
|
27
|
+
for (const handler of handlers) {
|
|
28
|
+
handler(data as never);
|
|
29
|
+
}
|
|
30
|
+
},
|
|
31
|
+
};
|
|
32
|
+
});
|
|
33
|
+
|
|
34
|
+
afterEach(() => {
|
|
35
|
+
vi.restoreAllMocks();
|
|
36
|
+
});
|
|
37
|
+
|
|
38
|
+
it('prints all start log lines', () => {
|
|
39
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
40
|
+
log(mockNitpicker as any, ['line1', 'line2', 'line3']);
|
|
41
|
+
|
|
42
|
+
expect(consoleSpy).toHaveBeenCalledTimes(3);
|
|
43
|
+
expect(consoleSpy).toHaveBeenNthCalledWith(1, 'line1');
|
|
44
|
+
expect(consoleSpy).toHaveBeenNthCalledWith(2, 'line2');
|
|
45
|
+
expect(consoleSpy).toHaveBeenNthCalledWith(3, 'line3');
|
|
46
|
+
});
|
|
47
|
+
|
|
48
|
+
it('registers writeFile event handler', () => {
|
|
49
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
50
|
+
log(mockNitpicker as any, []);
|
|
51
|
+
|
|
52
|
+
expect(mockNitpicker.on).toHaveBeenCalledWith('writeFile', expect.any(Function));
|
|
53
|
+
});
|
|
54
|
+
|
|
55
|
+
it('registers error event handler', () => {
|
|
56
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
57
|
+
log(mockNitpicker as any, []);
|
|
58
|
+
|
|
59
|
+
expect(mockNitpicker.on).toHaveBeenCalledWith('error', expect.any(Function));
|
|
60
|
+
});
|
|
61
|
+
|
|
62
|
+
it('logs write file path on writeFile event', () => {
|
|
63
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
64
|
+
log(mockNitpicker as any, []);
|
|
65
|
+
consoleSpy.mockClear();
|
|
66
|
+
|
|
67
|
+
mockNitpicker.emit('writeFile', { filePath: '/path/to/output.nitpicker' });
|
|
68
|
+
|
|
69
|
+
expect(consoleSpy).toHaveBeenCalledWith(' 📥 Write file: /path/to/output.nitpicker');
|
|
70
|
+
});
|
|
71
|
+
|
|
72
|
+
it('logs error message on error event', () => {
|
|
73
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
74
|
+
log(mockNitpicker as any, []);
|
|
75
|
+
|
|
76
|
+
mockNitpicker.emit('error', { message: 'something went wrong' });
|
|
77
|
+
|
|
78
|
+
expect(consoleErrorSpy).toHaveBeenCalledWith('something went wrong');
|
|
79
|
+
});
|
|
80
|
+
|
|
81
|
+
it('handles empty start log array', () => {
|
|
82
|
+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
|
83
|
+
log(mockNitpicker as any, []);
|
|
84
|
+
|
|
85
|
+
// No console.log calls for start lines (only event handlers registered)
|
|
86
|
+
expect(consoleSpy).not.toHaveBeenCalled();
|
|
87
|
+
});
|
|
88
|
+
});
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
import type { Nitpicker } from '@nitpicker/core';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Registers event listeners on a Nitpicker instance for analyze output,
|
|
5
|
+
* and prints the initial summary lines.
|
|
6
|
+
*
|
|
7
|
+
* Outputs the start log lines (site URL, file path), then listens for
|
|
8
|
+
* `writeFile` and `error` events to provide CLI feedback.
|
|
9
|
+
* Progress display is handled by Lanes in the calling code.
|
|
10
|
+
* @param nitpicker - The Nitpicker instance to listen on
|
|
11
|
+
* @param startLog - Lines to display at the start (URL + file path summary)
|
|
12
|
+
*/
|
|
13
|
+
export function log(nitpicker: Nitpicker, startLog: string[]): void {
|
|
14
|
+
for (const line of startLog) {
|
|
15
|
+
// eslint-disable-next-line no-console
|
|
16
|
+
console.log(line);
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
nitpicker.on('writeFile', ({ filePath }) => {
|
|
20
|
+
// eslint-disable-next-line no-console
|
|
21
|
+
console.log(` 📥 Write file: ${filePath}`);
|
|
22
|
+
});
|
|
23
|
+
|
|
24
|
+
nitpicker.on('error', (err) => {
|
|
25
|
+
// eslint-disable-next-line no-console
|
|
26
|
+
console.error(err.message);
|
|
27
|
+
});
|
|
28
|
+
}
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
import type { CommandDef, InferFlags } from '@d-zero/roar';
|
|
2
|
+
|
|
3
|
+
import path from 'node:path';
|
|
4
|
+
|
|
5
|
+
import { Lanes } from '@d-zero/dealer';
|
|
6
|
+
import { Nitpicker, readPluginLabels } from '@nitpicker/core';
|
|
7
|
+
import enquirer from 'enquirer';
|
|
8
|
+
|
|
9
|
+
import { log } from '../analyze/log.js';
|
|
10
|
+
|
|
11
|
+
const { prompt } = enquirer;
|
|
12
|
+
|
|
13
|
+
/**
 * Command definition for the `analyze` sub-command.
 *
 * Consumed by the roar CLI framework; flag types here drive the
 * `InferFlags` type used by {@link analyze}.
 * @see {@link analyze} for the main entry point
 */
export const commandDef = {
  desc: 'Analyze a .nitpicker archive',
  flags: {
    // Skip the interactive plugin prompt and run every configured plugin
    // (intended for CI/automation).
    all: {
      type: 'boolean',
      desc: 'Run all analysis plugins',
    },
    verbose: {
      type: 'boolean',
      desc: 'Output logs verbosely',
    },
  },
} as const satisfies CommandDef;
|
|
30
|
+
|
|
31
|
+
type AnalyzeFlags = InferFlags<typeof commandDef.flags>;
|
|
32
|
+
|
|
33
|
+
/**
|
|
34
|
+
* Main entry point for the `analyze` CLI command.
|
|
35
|
+
*
|
|
36
|
+
* Opens a `.nitpicker` archive, loads the configured analyze plugins,
|
|
37
|
+
* presents an interactive multi-select prompt (unless `--all` is specified),
|
|
38
|
+
* runs the selected plugins with per-plugin Lanes progress display, and
|
|
39
|
+
* writes results back to the archive.
|
|
40
|
+
*
|
|
41
|
+
* WHY enquirer prompt: Allows users to selectively run expensive plugins
|
|
42
|
+
* (e.g. Lighthouse) without re-running everything. The `--all` flag
|
|
43
|
+
* bypasses the prompt for CI/automation use cases.
|
|
44
|
+
* @param args - Positional arguments; first argument is the `.nitpicker` file path
|
|
45
|
+
* @param flags - Parsed CLI flags from the `analyze` command
|
|
46
|
+
*/
|
|
47
|
+
export async function analyze(args: string[], flags: AnalyzeFlags) {
|
|
48
|
+
const filePath = args[0];
|
|
49
|
+
|
|
50
|
+
if (!filePath) {
|
|
51
|
+
return;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
const isTTY = process.stdout.isTTY;
|
|
55
|
+
|
|
56
|
+
const absFilePath = path.isAbsolute(filePath)
|
|
57
|
+
? filePath
|
|
58
|
+
: path.resolve(process.cwd(), filePath);
|
|
59
|
+
// eslint-disable-next-line no-console
|
|
60
|
+
console.log(` 📦 Extracting archive: ${absFilePath}`);
|
|
61
|
+
const nitpicker = await Nitpicker.open(absFilePath);
|
|
62
|
+
|
|
63
|
+
const config = await nitpicker.getConfig();
|
|
64
|
+
const plugins = config.analyze || [];
|
|
65
|
+
const pluginNameList = plugins.map((plugin) => plugin.name);
|
|
66
|
+
|
|
67
|
+
if (pluginNameList.length === 0) {
|
|
68
|
+
// eslint-disable-next-line no-console
|
|
69
|
+
console.error(
|
|
70
|
+
'No analyze plugins found. Install @nitpicker/analyze-* packages or configure them in .nitpickerrc.',
|
|
71
|
+
);
|
|
72
|
+
return;
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
let filter: string[] | undefined;
|
|
76
|
+
|
|
77
|
+
if (!flags.all) {
|
|
78
|
+
const labels = await readPluginLabels(plugins);
|
|
79
|
+
const choices = plugins.map((plugin) => ({
|
|
80
|
+
name: plugin.name,
|
|
81
|
+
message: labels.get(plugin.name) || plugin.name,
|
|
82
|
+
}));
|
|
83
|
+
const res = await prompt<{ filter: string[] }>([
|
|
84
|
+
{
|
|
85
|
+
message: 'What do you analyze?',
|
|
86
|
+
name: 'filter',
|
|
87
|
+
type: 'multiselect',
|
|
88
|
+
choices,
|
|
89
|
+
},
|
|
90
|
+
]);
|
|
91
|
+
filter = res.filter;
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
const siteUrl = (await nitpicker.archive.getUrl()) || '<Unknown URL>';
|
|
95
|
+
|
|
96
|
+
log(nitpicker, [`🥢 ${siteUrl} (${filePath})`, ` 📤 Read file: ${absFilePath}`]);
|
|
97
|
+
|
|
98
|
+
const lanes = new Lanes({ verbose: !isTTY, indent: ' ' });
|
|
99
|
+
try {
|
|
100
|
+
await nitpicker.analyze(filter, { lanes, verbose: !isTTY });
|
|
101
|
+
} finally {
|
|
102
|
+
lanes.close();
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
await nitpicker.write();
|
|
106
|
+
await nitpicker.archive.close();
|
|
107
|
+
}
|
|
@@ -0,0 +1,303 @@
|
|
|
1
|
+
import type { CommandDef, InferFlags } from '@d-zero/roar';
|
|
2
|
+
import type { Config, CrawlerError } from '@nitpicker/crawler';
|
|
3
|
+
|
|
4
|
+
import path from 'node:path';
|
|
5
|
+
|
|
6
|
+
import { readList } from '@d-zero/readtext/list';
|
|
7
|
+
import { CrawlerOrchestrator } from '@nitpicker/crawler';
|
|
8
|
+
|
|
9
|
+
import { log, verbosely } from '../crawl/debug.js';
|
|
10
|
+
import { diff } from '../crawl/diff.js';
|
|
11
|
+
import { eventAssignments } from '../crawl/event-assignments.js';
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Command definition for the `crawl` sub-command.
|
|
15
|
+
* Defines all CLI flags with their types, defaults, and descriptions.
|
|
16
|
+
* @see {@link crawl} for the main entry point that dispatches to startCrawl/resumeCrawl/diff
|
|
17
|
+
*/
|
|
18
|
+
export const commandDef = {
|
|
19
|
+
desc: 'Crawl a website',
|
|
20
|
+
flags: {
|
|
21
|
+
resume: {
|
|
22
|
+
type: 'string',
|
|
23
|
+
shortFlag: 'R',
|
|
24
|
+
desc: 'Resume crawling from a stub file',
|
|
25
|
+
},
|
|
26
|
+
interval: {
|
|
27
|
+
type: 'number',
|
|
28
|
+
shortFlag: 'I',
|
|
29
|
+
desc: 'An interval time on request when crawles',
|
|
30
|
+
},
|
|
31
|
+
image: {
|
|
32
|
+
type: 'boolean',
|
|
33
|
+
default: true,
|
|
34
|
+
desc: 'Getting images (use --no-image to disable)',
|
|
35
|
+
},
|
|
36
|
+
fetchExternal: {
|
|
37
|
+
type: 'boolean',
|
|
38
|
+
default: true,
|
|
39
|
+
desc: 'Fetch external links (use --no-fetch-external to disable)',
|
|
40
|
+
},
|
|
41
|
+
parallels: {
|
|
42
|
+
type: 'number',
|
|
43
|
+
shortFlag: 'P',
|
|
44
|
+
desc: 'Number of parallel scraping',
|
|
45
|
+
},
|
|
46
|
+
recursive: {
|
|
47
|
+
type: 'boolean',
|
|
48
|
+
default: true,
|
|
49
|
+
desc: 'Recursive crawling (use --no-recursive to disable)',
|
|
50
|
+
},
|
|
51
|
+
scope: {
|
|
52
|
+
type: 'string',
|
|
53
|
+
desc: 'Set hosts and URLs as scope',
|
|
54
|
+
},
|
|
55
|
+
exclude: {
|
|
56
|
+
type: 'string',
|
|
57
|
+
isMultiple: true,
|
|
58
|
+
desc: 'Excluding page URL path (glob pattern)',
|
|
59
|
+
},
|
|
60
|
+
excludeKeyword: {
|
|
61
|
+
type: 'string',
|
|
62
|
+
isMultiple: true,
|
|
63
|
+
desc: 'Exclude keyword in document of page',
|
|
64
|
+
},
|
|
65
|
+
excludeUrl: {
|
|
66
|
+
type: 'string',
|
|
67
|
+
isMultiple: true,
|
|
68
|
+
desc: 'Exclude external URL prefix',
|
|
69
|
+
},
|
|
70
|
+
disableQueries: {
|
|
71
|
+
type: 'boolean',
|
|
72
|
+
shortFlag: 'Q',
|
|
73
|
+
desc: 'Disable queries that the URL has',
|
|
74
|
+
},
|
|
75
|
+
imageFileSizeThreshold: {
|
|
76
|
+
type: 'number',
|
|
77
|
+
desc: 'Image file size threshold',
|
|
78
|
+
},
|
|
79
|
+
single: {
|
|
80
|
+
type: 'boolean',
|
|
81
|
+
desc: 'Single page mode',
|
|
82
|
+
},
|
|
83
|
+
maxExcludedDepth: {
|
|
84
|
+
type: 'number',
|
|
85
|
+
desc: 'Avoid crawling depths above a set number',
|
|
86
|
+
},
|
|
87
|
+
retry: {
|
|
88
|
+
type: 'number',
|
|
89
|
+
default: 3,
|
|
90
|
+
desc: 'Number of retry attempts per URL on scrape failure',
|
|
91
|
+
},
|
|
92
|
+
list: {
|
|
93
|
+
type: 'string',
|
|
94
|
+
isMultiple: true,
|
|
95
|
+
desc: 'Running only each page from the list',
|
|
96
|
+
},
|
|
97
|
+
listFile: {
|
|
98
|
+
type: 'string',
|
|
99
|
+
desc: 'Running only each page from the list file',
|
|
100
|
+
},
|
|
101
|
+
userAgent: {
|
|
102
|
+
type: 'string',
|
|
103
|
+
desc: 'Custom User-Agent string for HTTP requests',
|
|
104
|
+
},
|
|
105
|
+
ignoreRobots: {
|
|
106
|
+
type: 'boolean',
|
|
107
|
+
desc: 'Ignore robots.txt restrictions (use responsibly)',
|
|
108
|
+
},
|
|
109
|
+
verbose: {
|
|
110
|
+
type: 'boolean',
|
|
111
|
+
desc: 'Output verbose log to standard out',
|
|
112
|
+
},
|
|
113
|
+
silent: {
|
|
114
|
+
type: 'boolean',
|
|
115
|
+
desc: 'No output log to standard out',
|
|
116
|
+
},
|
|
117
|
+
diff: {
|
|
118
|
+
type: 'boolean',
|
|
119
|
+
desc: 'Diff mode',
|
|
120
|
+
},
|
|
121
|
+
},
|
|
122
|
+
} as const satisfies CommandDef;
|
|
123
|
+
|
|
124
|
+
type CrawlFlags = InferFlags<typeof commandDef.flags>;
|
|
125
|
+
|
|
126
|
+
type LogType = 'verbose' | 'normal' | 'silent';
|
|
127
|
+
|
|
128
|
+
/**
|
|
129
|
+
* Sets up signal handlers for graceful shutdown and starts event logging.
|
|
130
|
+
*
|
|
131
|
+
* Registers SIGINT/SIGBREAK/SIGHUP/SIGABRT handlers that kill zombie
|
|
132
|
+
* Chromium processes before exiting, then delegates to {@link eventAssignments}
|
|
133
|
+
* for progress output.
|
|
134
|
+
* @param trigger - Display label for the crawl (URL or stub file path)
|
|
135
|
+
* @param orchestrator - The initialized CrawlerOrchestrator instance
|
|
136
|
+
* @param config - The resolved archive configuration
|
|
137
|
+
* @param logType - Output verbosity level
|
|
138
|
+
*/
|
|
139
|
+
function run(
|
|
140
|
+
trigger: string,
|
|
141
|
+
orchestrator: CrawlerOrchestrator,
|
|
142
|
+
config: Config,
|
|
143
|
+
logType: LogType,
|
|
144
|
+
) {
|
|
145
|
+
const killed = () => {
|
|
146
|
+
orchestrator.garbageCollect();
|
|
147
|
+
process.exit();
|
|
148
|
+
};
|
|
149
|
+
process.on('SIGINT', killed);
|
|
150
|
+
process.on('SIGBREAK', killed);
|
|
151
|
+
process.on('SIGHUP', killed);
|
|
152
|
+
process.on('SIGABRT', killed);
|
|
153
|
+
|
|
154
|
+
const head = [
|
|
155
|
+
`🐳 ${trigger} (New scraping)`,
|
|
156
|
+
...Object.entries(config).map(([key, value]) => ` ${key}: ${value}`),
|
|
157
|
+
];
|
|
158
|
+
return eventAssignments(orchestrator, head, logType);
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
/**
|
|
162
|
+
* Starts a fresh crawl session for the given URLs.
|
|
163
|
+
*
|
|
164
|
+
* Creates a CrawlerOrchestrator, runs the crawl, writes the archive,
|
|
165
|
+
* and cleans up browser processes. Exits with code 1 if errors occurred.
|
|
166
|
+
* @param siteUrl - One or more root URLs to crawl
|
|
167
|
+
* @param flags - Parsed CLI flags from the `crawl` command
|
|
168
|
+
*/
|
|
169
|
+
async function startCrawl(siteUrl: string[], flags: CrawlFlags) {
|
|
170
|
+
const errStack: (CrawlerError | Error)[] = [];
|
|
171
|
+
|
|
172
|
+
const isList = !!flags.list?.length;
|
|
173
|
+
const orchestrator = await CrawlerOrchestrator.crawling(
|
|
174
|
+
siteUrl,
|
|
175
|
+
{
|
|
176
|
+
...flags,
|
|
177
|
+
scope: flags.scope?.split(',').map((s) => s.trim()),
|
|
178
|
+
list: isList,
|
|
179
|
+
recursive: isList ? false : flags.recursive,
|
|
180
|
+
},
|
|
181
|
+
(orchestrator, config) => {
|
|
182
|
+
run(
|
|
183
|
+
config.baseUrl,
|
|
184
|
+
orchestrator,
|
|
185
|
+
config,
|
|
186
|
+
flags.verbose ? 'verbose' : flags.silent ? 'silent' : 'normal',
|
|
187
|
+
).catch((error) => errStack.push(error));
|
|
188
|
+
},
|
|
189
|
+
);
|
|
190
|
+
|
|
191
|
+
await orchestrator.write();
|
|
192
|
+
|
|
193
|
+
orchestrator.garbageCollect();
|
|
194
|
+
|
|
195
|
+
if (errStack.length > 0) {
|
|
196
|
+
formatCrawlErrors(errStack);
|
|
197
|
+
process.exit(1);
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
/**
|
|
202
|
+
* Resumes a previously interrupted crawl from a stub file (temporary directory).
|
|
203
|
+
*
|
|
204
|
+
* Restores the crawl state from the archive, applies any flag overrides,
|
|
205
|
+
* and continues crawling from where the previous session left off.
|
|
206
|
+
* @param stubFilePath - Path to the stub file or temporary directory to resume from
|
|
207
|
+
* @param flags - Parsed CLI flags from the `crawl` command
|
|
208
|
+
*/
|
|
209
|
+
async function resumeCrawl(stubFilePath: string, flags: CrawlFlags) {
|
|
210
|
+
const errStack: (CrawlerError | Error)[] = [];
|
|
211
|
+
const absFilePath = path.isAbsolute(stubFilePath)
|
|
212
|
+
? stubFilePath
|
|
213
|
+
: path.resolve(process.cwd(), stubFilePath);
|
|
214
|
+
|
|
215
|
+
const orchestrator = await CrawlerOrchestrator.resume(
|
|
216
|
+
absFilePath,
|
|
217
|
+
{
|
|
218
|
+
...flags,
|
|
219
|
+
scope: flags.scope?.split(',').map((s) => s.trim()),
|
|
220
|
+
list: false,
|
|
221
|
+
},
|
|
222
|
+
(orchestrator, config) => {
|
|
223
|
+
run(
|
|
224
|
+
stubFilePath,
|
|
225
|
+
orchestrator,
|
|
226
|
+
config,
|
|
227
|
+
flags.verbose ? 'verbose' : flags.silent ? 'silent' : 'normal',
|
|
228
|
+
).catch((error) => errStack.push(error));
|
|
229
|
+
},
|
|
230
|
+
);
|
|
231
|
+
|
|
232
|
+
await orchestrator.write();
|
|
233
|
+
|
|
234
|
+
orchestrator.garbageCollect();
|
|
235
|
+
|
|
236
|
+
if (errStack.length > 0) {
|
|
237
|
+
formatCrawlErrors(errStack);
|
|
238
|
+
process.exit(1);
|
|
239
|
+
}
|
|
240
|
+
}
|
|
241
|
+
|
|
242
|
+
/**
|
|
243
|
+
* Main entry point for the `crawl` CLI command.
|
|
244
|
+
*
|
|
245
|
+
* Dispatches to one of four modes based on the flags:
|
|
246
|
+
* 1. `--diff` mode: Compares two archive files and outputs URL lists
|
|
247
|
+
* 2. `--resume` mode: Resumes a previously interrupted crawl
|
|
248
|
+
* 3. `--list-file` / `--list` mode: Crawls a pre-defined URL list (non-recursive)
|
|
249
|
+
* 4. Default mode: Crawls from a single root URL
|
|
250
|
+
* @param args - Positional arguments (typically one or two URLs/file paths)
|
|
251
|
+
* @param flags - Parsed CLI flags from the `crawl` command
|
|
252
|
+
*/
|
|
253
|
+
export async function crawl(args: string[], flags: CrawlFlags) {
|
|
254
|
+
if (flags.verbose && !flags.silent) {
|
|
255
|
+
verbosely();
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
log('Options: %O', flags);
|
|
259
|
+
|
|
260
|
+
if (flags.diff) {
|
|
261
|
+
const a = args[0];
|
|
262
|
+
const b = args[1];
|
|
263
|
+
if (!a || !b) {
|
|
264
|
+
throw new Error('Please provide two file paths to compare');
|
|
265
|
+
}
|
|
266
|
+
await diff(a, b);
|
|
267
|
+
return;
|
|
268
|
+
}
|
|
269
|
+
|
|
270
|
+
if (flags.resume) {
|
|
271
|
+
await resumeCrawl(flags.resume, flags);
|
|
272
|
+
return;
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
if (flags.listFile) {
|
|
276
|
+
const list = await readList(path.resolve(process.cwd(), flags.listFile));
|
|
277
|
+
flags.list = list;
|
|
278
|
+
await startCrawl(list, flags);
|
|
279
|
+
return;
|
|
280
|
+
}
|
|
281
|
+
|
|
282
|
+
if (flags.list && flags.list.length > 0) {
|
|
283
|
+
const pageList = [...flags.list, ...args];
|
|
284
|
+
await startCrawl(pageList, flags);
|
|
285
|
+
return;
|
|
286
|
+
}
|
|
287
|
+
|
|
288
|
+
const siteUrl = args[0];
|
|
289
|
+
|
|
290
|
+
if (siteUrl) {
|
|
291
|
+
await startCrawl([siteUrl], flags);
|
|
292
|
+
return;
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
|
|
296
|
+
/**
|
|
297
|
+
* Prints a summary of errors that occurred during crawling to stderr.
|
|
298
|
+
* @param errStack - Array of errors collected during the crawl session
|
|
299
|
+
*/
|
|
300
|
+
function formatCrawlErrors(errStack: (CrawlerError | Error)[]) {
|
|
301
|
+
// eslint-disable-next-line no-console
|
|
302
|
+
console.error(`\nCompleted with ${errStack.length} error(s).`);
|
|
303
|
+
}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import type { CommandDef, InferFlags } from '@d-zero/roar';
|
|
2
|
+
|
|
3
|
+
import { report as runReport } from '@nitpicker/report-google-sheets';
|
|
4
|
+
|
|
5
|
+
/**
 * Command definition for the `report` sub-command.
 *
 * Consumed by the roar CLI framework; flag types here drive the
 * `InferFlags` type used by {@link report}.
 * @see {@link report} for the main entry point
 */
export const commandDef = {
  desc: 'Generate a Google Sheets report',
  flags: {
    sheet: {
      shortFlag: 'S',
      type: 'string',
      // A report has no destination without a sheet URL.
      isRequired: true,
      desc: 'Google Sheets URL',
    },
    credentials: {
      shortFlag: 'C',
      type: 'string',
      default: './credentials.json',
      desc: 'Path to credentials file (keep this file secure and out of version control)',
    },
    config: {
      shortFlag: 'c',
      type: 'string',
      desc: 'Path to config file',
    },
    limit: {
      shortFlag: 'l',
      type: 'number',
      // Safety cap on output size; Google Sheets has hard cell limits.
      default: 100_000,
      desc: 'Limit number of rows',
    },
  },
} as const satisfies CommandDef;
|
|
37
|
+
|
|
38
|
+
type ReportFlags = InferFlags<typeof commandDef.flags>;
|
|
39
|
+
|
|
40
|
+
/**
|
|
41
|
+
* Main entry point for the `report` CLI command.
|
|
42
|
+
*
|
|
43
|
+
* Reads a `.nitpicker` archive and generates a Google Sheets report
|
|
44
|
+
* by delegating to `@nitpicker/report-google-sheets`. Requires a Google
|
|
45
|
+
* Sheets URL and a service account credentials file.
|
|
46
|
+
* @param args - Positional arguments; first argument is the `.nitpicker` file path
|
|
47
|
+
* @param flags - Parsed CLI flags from the `report` command
|
|
48
|
+
*/
|
|
49
|
+
export async function report(args: string[], flags: ReportFlags) {
|
|
50
|
+
const filePath = args[0];
|
|
51
|
+
|
|
52
|
+
if (!filePath) {
|
|
53
|
+
return;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
const sheetUrl = flags.sheet;
|
|
57
|
+
|
|
58
|
+
if (!sheetUrl) {
|
|
59
|
+
return;
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
const credentialFilePath = flags.credentials;
|
|
63
|
+
const configFilePath = flags.config || null;
|
|
64
|
+
const limit = flags.limit;
|
|
65
|
+
|
|
66
|
+
await runReport({
|
|
67
|
+
filePath,
|
|
68
|
+
sheetUrl,
|
|
69
|
+
credentialFilePath,
|
|
70
|
+
configPath: configFilePath,
|
|
71
|
+
limit,
|
|
72
|
+
});
|
|
73
|
+
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import debug from 'debug';
|
|
2
|
+
|
|
3
|
+
// Namespaced debug logger shared by the CLI: logs under `Nitpicker:CLI`.
export const log = debug('Nitpicker').extend('CLI');

/**
 * Enables verbose debug output for the `Nitpicker*` namespaces, while
 * excluding the noisiest sub-namespaces (per-deal crawler logs, DOM detail
 * scraping, and resource scraping).
 *
 * No-op when the `Nitpicker` namespace is already enabled (e.g. via the
 * `DEBUG` environment variable), so user-supplied settings take precedence.
 */
export function verbosely() {
  if (!debug.enabled('Nitpicker')) {
    debug.enable(
      'Nitpicker*,-Nitpicker:Crawler:Deal,-Nitpicker:Scraper:DOM:Details:*,-Nitpicker:Scraper:Resource:*',
    );
  }
}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import type { Page } from '@nitpicker/crawler';
|
|
2
|
+
|
|
3
|
+
import fs from 'node:fs/promises';
|
|
4
|
+
|
|
5
|
+
import { sortUrl } from '@d-zero/shared/sort-url';
|
|
6
|
+
import { Archive } from '@nitpicker/crawler';
|
|
7
|
+
|
|
8
|
+
/**
|
|
9
|
+
* Compares two `.nitpicker` archives and writes their URL lists to `a.txt` and `b.txt`.
|
|
10
|
+
*
|
|
11
|
+
* Extracts active internal HTML pages (2xx/3xx status) from both archives,
|
|
12
|
+
* sorts them in natural URL order, and writes to the current working directory.
|
|
13
|
+
* The output files can then be compared using standard diff tools.
|
|
14
|
+
* @param a - File path to the first `.nitpicker` archive
|
|
15
|
+
* @param b - File path to the second `.nitpicker` archive
|
|
16
|
+
*/
|
|
17
|
+
export async function diff(a: string, b: string) {
|
|
18
|
+
const archiveA = await Archive.open({ filePath: a });
|
|
19
|
+
const archiveB = await Archive.open({ filePath: b });
|
|
20
|
+
const pagesA = await archiveA.getPages();
|
|
21
|
+
const pagesB = await archiveB.getPages();
|
|
22
|
+
const listA = pagesA.filter(isActive).map((page) => page.url.withoutHashAndAuth);
|
|
23
|
+
const listB = pagesB.filter(isActive).map((page) => page.url.withoutHashAndAuth);
|
|
24
|
+
|
|
25
|
+
const sortedA = sortUrl(listA).map((url) => url.withoutHashAndAuth);
|
|
26
|
+
const sortedB = sortUrl(listB).map((url) => url.withoutHashAndAuth);
|
|
27
|
+
|
|
28
|
+
await fs.writeFile('a.txt', sortedA.join('\n'), 'utf8');
|
|
29
|
+
await fs.writeFile('b.txt', sortedB.join('\n'), 'utf8');
|
|
30
|
+
|
|
31
|
+
await archiveA.close();
|
|
32
|
+
await archiveB.close();
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/**
|
|
36
|
+
* Filters for active internal HTML pages (status 200-399, non-external).
|
|
37
|
+
* @param page - The page to check
|
|
38
|
+
* @returns `true` if the page is an active internal HTML page
|
|
39
|
+
*/
|
|
40
|
+
function isActive(page: Page) {
|
|
41
|
+
return (
|
|
42
|
+
page.isPage() &&
|
|
43
|
+
!page.isExternal &&
|
|
44
|
+
page.status &&
|
|
45
|
+
page.status >= 200 &&
|
|
46
|
+
page.status < 400
|
|
47
|
+
);
|
|
48
|
+
}
|