rankforge 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,292 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+ import { unsafeRegexReason } from "./regex-guards.mjs";
4
+
5
/**
 * JSON Schema (draft 2020-12) describing a RankForge audit config file.
 *
 * Only `target` is required. Every object level sets
 * `additionalProperties: true`, so unknown keys are tolerated.
 *
 * `sitemap` and `respectRobots` are declared both at the top level and under
 * `crawl`: the validator in this module type-checks `crawl.sitemap` and
 * `crawl.respectRobots`, so the schema lists them there too.
 */
export const auditConfigSchema = {
  $schema: "https://json-schema.org/draft/2020-12/schema",
  $id: "https://rankforge.dev/schemas/rankforge-config.schema.json",
  title: "RankForge Config",
  type: "object",
  required: ["target"],
  additionalProperties: true,
  properties: {
    project: { type: "string" },
    target: { type: "string", minLength: 1 },
    sitemap: { type: "string" },
    urlList: { type: "string" },
    respectRobots: { type: "boolean" },
    brand: {
      type: "object",
      additionalProperties: true,
      properties: {
        name: { type: "string" },
        type: { type: "string" },
        sameAs: { type: "array", items: { type: "string" } },
      },
    },
    audience: { type: "array", items: { type: "string" } },
    targetQueries: { type: "array", items: { type: "string" } },
    competitors: { type: "array", items: { type: "string" } },
    crawl: {
      type: "object",
      additionalProperties: true,
      properties: {
        mode: { enum: ["full", "sample", "single"] },
        maxPages: { type: "integer", minimum: 1 },
        maxDepth: { type: "integer", minimum: 0 },
        include: { type: "array", items: { type: "string" } },
        exclude: { type: "array", items: { type: "string" } },
        sitemap: { type: "string" },
        respectRobots: { type: "boolean" },
      },
    },
    repo: {
      type: "object",
      additionalProperties: true,
      properties: {
        buildCommand: { type: "string" },
        previewCommand: { type: "string" },
        previewUrl: { type: "string" },
        staticDir: { type: "string" },
        routeList: { type: "string" },
        maxBuildMs: { type: "integer", minimum: 1 },
        maxPreviewMs: { type: "integer", minimum: 1 },
      },
    },
    render: {
      type: "object",
      additionalProperties: true,
      properties: {
        mode: { enum: ["auto", "always", "never"] },
        viewports: {
          type: "array",
          items: { enum: ["mobile", "desktop"] },
        },
      },
    },
    security: {
      type: "object",
      additionalProperties: true,
      properties: {
        mode: { enum: ["local", "restricted"] },
      },
    },
    limits: {
      type: "object",
      additionalProperties: true,
      properties: {
        timeoutMs: { type: "integer", minimum: 1 },
        maxHtmlBytes: { type: "integer", minimum: 1 },
        maxTextBytes: { type: "integer", minimum: 1 },
        maxFileBytes: { type: "integer", minimum: 1 },
        maxIntegrationBytes: { type: "integer", minimum: 1 },
      },
    },
    integrations: {
      type: "object",
      additionalProperties: true,
      properties: {
        searchConsole: { type: "string" },
        serp: { type: "string" },
        aiAnswers: { type: "string" },
        lighthouse: { type: "string" },
      },
    },
  },
};
95
+
96
/**
 * True for non-null values whose `typeof` is "object", excluding arrays.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
const isObject = (value) => {
  if (value === null || Array.isArray(value)) return false;
  return typeof value === "object";
};
97
+
98
/**
 * True when `value` is an http(s) URL or a scheme-less loopback target
 * (`localhost` or `127.0.0.1`, optionally followed by `:port`).
 *
 * The loopback host must be followed by `/` or the end of the string, so
 * look-alike file names such as `127.0.0.1.json` or `localhost-notes.txt`
 * are not mistaken for network targets.
 *
 * @param {string} value
 * @returns {boolean}
 */
const isUrlOrLocalhost = (value) =>
  /^https?:\/\//i.test(value) || /^(?:localhost|127\.0\.0\.1)(?::\d+)?(?:\/|$)/i.test(value);
100
+
101
/**
 * Resolves a relative filesystem path against `baseDir`.
 *
 * The value is returned untouched when it is not a non-empty string, when
 * `baseDir` is falsy, when it looks like a URL/localhost target, or when it
 * is already an absolute path.
 *
 * @param {unknown} value
 * @param {string} [baseDir]
 * @returns {unknown} The resolved absolute path, or `value` unchanged.
 */
const resolveMaybePath = (value, baseDir) => {
  const isResolvable = typeof value === "string" && value !== "" && Boolean(baseDir);
  if (!isResolvable) return value;

  const alreadyLocated = isUrlOrLocalhost(value) || path.isAbsolute(value);
  return alreadyLocated ? value : path.resolve(baseDir, value);
};
106
+
107
/**
 * Returns a shallow copy of `config` with its path-like settings resolved
 * against `baseDir` (via `resolveMaybePath`).
 *
 * Resolved keys: `target`, `urlList`, `repo.staticDir`, `repo.routeList` and
 * the four `integrations.*` entries. The input object is not mutated.
 *
 * `repo` and `integrations` are only expanded when they are plain objects;
 * any other value (string, array, null, ...) is passed through unchanged so
 * that the validator can report it instead of it being spread into a
 * nonsensical object.
 *
 * @param {unknown} config Parsed config; non-objects are returned as-is.
 * @param {string} [baseDir] Directory that relative paths are relative to.
 * @returns {unknown}
 */
export const resolveAuditConfigPaths = (config, baseDir) => {
  if (!isObject(config)) return config;

  const resolve = (value) => resolveMaybePath(value, baseDir);
  const { repo, integrations } = config;

  return {
    ...config,
    target: resolve(config.target),
    urlList: resolve(config.urlList),
    repo: isObject(repo)
      ? {
          ...repo,
          staticDir: resolve(repo.staticDir),
          routeList: resolve(repo.routeList),
        }
      : repo,
    integrations: isObject(integrations)
      ? {
          ...integrations,
          searchConsole: resolve(integrations.searchConsole),
          serp: resolve(integrations.serp),
          aiAnswers: resolve(integrations.aiAnswers),
          lighthouse: resolve(integrations.lighthouse),
        }
      : integrations,
  };
};
133
+
134
/**
 * Decides whether `value` is acceptable as an audit target.
 *
 * - Non-strings and blank strings are rejected.
 * - Strings starting with `http://` or `https://` must parse with `new URL`.
 * - `localhost` / `127.0.0.1` forms (optional port) are accepted.
 * - Anything else is treated as a local path and accepted when
 *   `path.normalize` yields a non-empty string.
 *
 * @param {unknown} value
 * @returns {boolean}
 */
const isTargetLike = (value) => {
  const isNonBlankString = typeof value === "string" && value.trim() !== "";
  if (!isNonBlankString) return false;

  if (/^https?:\/\//i.test(value)) {
    try {
      new URL(value);
    } catch {
      return false;
    }
    return true;
  }

  const looksLikeLocalhost = /^localhost(?::\d+)?\//i.test(value) || /^localhost(?::\d+)?$/i.test(value);
  const looksLikeLoopback = /^127\.0\.0\.1(?::\d+)?/i.test(value);
  if (looksLikeLocalhost || looksLikeLoopback) return true;

  return Boolean(path.normalize(value));
};
148
+
149
/**
 * Validates an optional list of regular-expression source strings and
 * appends a message to `errors` for every problem found.
 *
 * `undefined` is accepted silently. Each entry must be a string that
 * compiles with `new RegExp` and for which `unsafeRegexReason` reports no
 * reason.
 *
 * @param {string[]} errors Collector that messages are pushed onto.
 * @param {string} key Config key used as the message prefix.
 * @param {unknown} value The value to validate.
 * @returns {void}
 */
const validateRegexList = (errors, key, value) => {
  if (value === undefined) return;
  if (!Array.isArray(value)) {
    errors.push(`${key} must be an array`);
    return;
  }

  for (let index = 0; index < value.length; index++) {
    const pattern = value[index];

    if (typeof pattern !== "string") {
      errors.push(`${key}[${index}] must be a string`);
      continue;
    }

    let compiles = true;
    try {
      new RegExp(pattern);
    } catch {
      compiles = false;
    }
    if (!compiles) {
      errors.push(`${key}[${index}] must be a valid regular expression`);
      continue;
    }

    const unsafeReason = unsafeRegexReason(pattern);
    if (unsafeReason) {
      errors.push(`${key}[${index}] contains an unsafe regular expression: ${unsafeReason}`);
    }
  }
};
170
+
171
/**
 * Pushes `"<key> must be a positive integer"` onto `errors` unless `value`
 * is `undefined` or an integer greater than or equal to 1.
 *
 * @param {string[]} errors Collector that messages are pushed onto.
 * @param {string} key Config key used as the message prefix.
 * @param {unknown} value The value to validate.
 * @returns {void}
 */
const validatePositiveInteger = (errors, key, value) => {
  if (value === undefined) return;

  const isPositiveInteger = Number.isInteger(value) && value >= 1;
  if (!isPositiveInteger) {
    errors.push(`${key} must be a positive integer`);
  }
};
176
+
177
/**
 * Checks that an optional path setting points at something that exists on
 * disk, pushing a message onto `errors` otherwise.
 *
 * `undefined`, `null` and `""` are treated as "not configured". Relative
 * paths are resolved against `baseDir`, falling back to `process.cwd()`.
 *
 * @param {string[]} errors Collector that messages are pushed onto.
 * @param {string} key Config key used as the message prefix.
 * @param {unknown} filePath The configured path.
 * @param {string} [baseDir] Base directory for relative paths.
 * @returns {void}
 */
const validateExistingFile = (errors, key, filePath, baseDir) => {
  const isUnset = filePath === undefined || filePath === null || filePath === "";
  if (isUnset) return;

  if (typeof filePath !== "string") {
    errors.push(`${key} must be a string`);
    return;
  }

  const absolutePath = path.isAbsolute(filePath)
    ? filePath
    : path.resolve(baseDir || process.cwd(), filePath);
  if (fs.existsSync(absolutePath)) return;

  errors.push(`${key} file does not exist: ${filePath}`);
};
186
+
187
/**
 * Validates a parsed audit config and collects human-readable problems.
 *
 * Checks performed:
 * - `config` is a plain object and has a usable `target`;
 * - top-level `sitemap` (string), `respectRobots` (boolean), `urlList` (string);
 * - `crawl`: mode, maxPages, maxDepth, include/exclude regex lists, sitemap,
 *   respectRobots;
 * - `repo`: string settings and positive-integer timeouts;
 * - `render`: mode and `viewports` (array of "mobile" / "desktop");
 * - `security.mode`, the `limits.*` positive integers, and that
 *   `integrations` is an object;
 * - with `options.checkFiles`, that referenced files exist on disk.
 *
 * @param {unknown} config Parsed config.
 * @param {{ checkFiles?: boolean, baseDir?: string }} [options]
 *   `checkFiles` enables the on-disk existence checks; `baseDir` is the
 *   directory relative paths are resolved against for those checks.
 * @returns {{ ok: boolean, errors: string[] }} `ok` is true when `errors` is empty.
 */
export const validateAuditConfig = (config, options = {}) => {
  const errors = [];

  if (!isObject(config)) {
    return { ok: false, errors: ["config must be an object"] };
  }

  // Shared helpers so every enum/type message has exactly one format.
  const validateEnum = (key, value, allowed) => {
    if (value !== undefined && !allowed.includes(value)) {
      errors.push(`${key} must be one of: ${allowed.join(", ")}`);
    }
  };
  const validateType = (key, value, type) => {
    if (value !== undefined && typeof value !== type) {
      errors.push(`${key} must be a ${type}`);
    }
  };

  if (!("target" in config)) {
    errors.push("target is required");
  } else if (!isTargetLike(config.target)) {
    errors.push("target must be a URL, localhost target, or local path");
  }

  // Declared at the top level of the schema, so they are type-checked here
  // just like their `crawl.*` counterparts below.
  validateType("sitemap", config.sitemap, "string");
  validateType("respectRobots", config.respectRobots, "boolean");

  if (config.crawl !== undefined) {
    if (!isObject(config.crawl)) {
      errors.push("crawl must be an object");
    } else {
      validateEnum("crawl.mode", config.crawl.mode, ["full", "sample", "single"]);
      validatePositiveInteger(errors, "crawl.maxPages", config.crawl.maxPages);
      if (
        config.crawl.maxDepth !== undefined &&
        (!Number.isInteger(config.crawl.maxDepth) || config.crawl.maxDepth < 0)
      ) {
        errors.push("crawl.maxDepth must be a non-negative integer");
      }
      validateRegexList(errors, "crawl.include", config.crawl.include);
      validateRegexList(errors, "crawl.exclude", config.crawl.exclude);
      validateType("crawl.sitemap", config.crawl.sitemap, "string");
      validateType("crawl.respectRobots", config.crawl.respectRobots, "boolean");
    }
  }

  if (config.repo !== undefined) {
    if (!isObject(config.repo)) {
      errors.push("repo must be an object");
    } else {
      for (const key of ["buildCommand", "previewCommand", "previewUrl", "staticDir", "routeList"]) {
        validateType(`repo.${key}`, config.repo[key], "string");
      }
      validatePositiveInteger(errors, "repo.maxBuildMs", config.repo.maxBuildMs);
      validatePositiveInteger(errors, "repo.maxPreviewMs", config.repo.maxPreviewMs);
    }
  }

  if (config.render !== undefined) {
    if (!isObject(config.render)) {
      errors.push("render must be an object");
    } else {
      validateEnum("render.mode", config.render.mode, ["auto", "always", "never"]);

      const { viewports } = config.render;
      if (viewports !== undefined) {
        if (!Array.isArray(viewports)) {
          errors.push("render.viewports must be an array");
        } else {
          for (const [index, viewport] of viewports.entries()) {
            validateEnum(`render.viewports[${index}]`, viewport, ["mobile", "desktop"]);
          }
        }
      }
    }
  }

  if (config.security !== undefined) {
    if (!isObject(config.security)) {
      errors.push("security must be an object");
    } else {
      validateEnum("security.mode", config.security.mode, ["local", "restricted"]);
    }
  }

  if (config.limits !== undefined) {
    if (!isObject(config.limits)) {
      errors.push("limits must be an object");
    } else {
      for (const key of ["timeoutMs", "maxHtmlBytes", "maxTextBytes", "maxFileBytes", "maxIntegrationBytes"]) {
        validatePositiveInteger(errors, `limits.${key}`, config.limits[key]);
      }
    }
  }

  validateType("urlList", config.urlList, "string");

  if (config.integrations !== undefined && !isObject(config.integrations)) {
    errors.push("integrations must be an object");
  }

  if (options.checkFiles) {
    validateExistingFile(errors, "urlList", config.urlList, options.baseDir);
    validateExistingFile(errors, "repo.routeList", config.repo?.routeList, options.baseDir);
    for (const key of ["searchConsole", "serp", "aiAnswers", "lighthouse"]) {
      validateExistingFile(errors, `integrations.${key}`, config.integrations?.[key], options.baseDir);
    }
  }

  return { ok: errors.length === 0, errors };
};
291
+
292
/**
 * Reads and parses a JSON audit config from disk.
 *
 * A leading UTF-8 byte-order mark is stripped before parsing; `JSON.parse`
 * rejects it and some editors write one.
 *
 * @param {string} filePath Path of the JSON config file.
 * @returns {unknown} The parsed JSON value.
 * @throws {Error} Whatever `fs.readFileSync` throws when the file cannot be read.
 * @throws {SyntaxError} When the content is not valid JSON; the message
 *   names the file and the original error is attached as `cause`.
 */
export const readAuditConfig = (filePath) => {
  const raw = fs.readFileSync(filePath, "utf8");
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw;

  try {
    return JSON.parse(text);
  } catch (error) {
    throw new SyntaxError(`Invalid JSON in config file ${filePath}: ${error.message}`, { cause: error });
  }
};
package/src/crawl.mjs ADDED
@@ -0,0 +1,188 @@
1
+ import { collectSnapshot } from "./snapshot.mjs";
2
+ import { fetchWithGuards, readResponseTextLimited, resolveLimits } from "./io-guards.mjs";
3
+ import { parseRobotsTxt, isAllowedByRobots } from "./robots.mjs";
4
+ import { parseSitemap } from "./sitemap.mjs";
5
+ import { compileSafeRegex } from "./regex-guards.mjs";
6
+ import { isHttpUrl, normalizeUrl, sameOrigin } from "./url-utils.mjs";
7
+
8
// Value of the `user-agent` request header sent by fetchText.
const userAgent = "RankForgeBot";

/**
 * Best-effort release of a response whose body will not be read.
 *
 * NOTE(review): assumes `fetchWithGuards` resolves to a fetch-style Response
 * whose `body` is a ReadableStream; confirm against io-guards. The optional
 * chaining makes this a no-op for any other shape.
 *
 * @param {{ body?: { cancel?: () => Promise<void> } | null }} response
 * @returns {Promise<void>}
 */
const discardBody = async (response) => {
  try {
    await response?.body?.cancel?.();
  } catch {
    // Deliberately ignored: failing to cancel an unused body must not turn a
    // successful redirect hop or a plain HTTP error status into an exception.
  }
};

/**
 * Fetches `url` as text, following redirects manually so that every hop goes
 * through `fetchWithGuards` again.
 *
 * Statuses 301, 302, 303, 307 and 308 with a `location` header are followed,
 * up to `options.maxRedirects` times (default 5). Bodies of responses that
 * are not read (redirect hops and non-OK results) are cancelled.
 *
 * @param {string} url URL to fetch.
 * @param {{ security?: unknown, limits?: unknown, maxRedirects?: number }} [options]
 * @returns {Promise<{ url: string, status: number, ok: boolean, text: string }>}
 *   `url` is the final URL after redirects; `text` is `""` for non-OK responses.
 * @throws {Error} When the redirect budget is exhausted, plus anything thrown
 *   by `fetchWithGuards`, `readResponseTextLimited` or `new URL`.
 */
const fetchText = async (url, options = {}) => {
  const limits = resolveLimits(options.limits);
  const maxRedirects = options.maxRedirects ?? 5;
  let current = url;

  for (let attempt = 0; attempt <= maxRedirects; attempt++) {
    const response = await fetchWithGuards(current, {
      security: options.security,
      limits,
      fetchOptions: {
        headers: { "user-agent": userAgent },
        redirect: "manual",
      },
    });

    const location = response.headers.get("location");
    if ([301, 302, 303, 307, 308].includes(response.status) && location) {
      await discardBody(response);
      // Relative `location` values are resolved against the URL just fetched.
      current = new URL(location, current).href;
      continue;
    }

    if (!response.ok) await discardBody(response);

    return {
      url: current,
      status: response.status,
      ok: response.ok,
      text: response.ok
        ? await readResponseTextLimited(response, {
            limits,
            maxBytes: limits.maxTextBytes,
            label: current,
          })
        : "",
    };
  }

  throw new Error(`Too many redirects while fetching ${url}`);
};
46
+
47
/**
 * Builds the robots.txt URL at the root of `target`'s origin.
 *
 * @param {string} target Absolute URL of the site being crawled.
 * @returns {string}
 */
const robotsUrlFor = (target) => {
  const robotsUrl = new URL("/robots.txt", target);
  return robotsUrl.toString();
};
48
+
49
/**
 * Fetches and parses robots.txt for `target`.
 *
 * Returns `null` when `enabled` is falsy. A non-OK response or a thrown
 * error yields an empty rule set (`{ groups: [] }`); in the error case the
 * result also carries `status: null` and the error message.
 *
 * @param {string} target Site URL whose origin hosts robots.txt.
 * @param {boolean} enabled Whether robots.txt should be loaded at all.
 * @param {object} [options] Passed through to `fetchText`.
 * @returns {Promise<null | { url: string, status: number | null, ok: boolean, error?: string, parsed: object }>}
 */
const loadRobots = async (target, enabled, options = {}) => {
  if (!enabled) return null;

  const emptyRules = () => ({ groups: [] });

  let response;
  try {
    response = await fetchText(robotsUrlFor(target), options);
    const parsed = response.ok ? parseRobotsTxt(response.text) : emptyRules();
    return { url: response.url, status: response.status, ok: response.ok, parsed };
  } catch (error) {
    return {
      url: robotsUrlFor(target),
      status: null,
      ok: false,
      error: error.message,
      parsed: emptyRules(),
    };
  }
};
63
+
64
/**
 * Fetches and parses the sitemap at `url`.
 *
 * A non-OK response or a thrown error yields an empty "unknown" sitemap; in
 * the error case the result also carries `status: null` and the error
 * message. The returned `url` is always the requested one.
 *
 * @param {string} url Sitemap URL.
 * @param {object} [options] Passed through to `fetchText`.
 * @returns {Promise<{ url: string, status: number | null, ok: boolean, error?: string, parsed: object }>}
 */
const loadSitemap = async (url, options = {}) => {
  const emptySitemap = () => ({ type: "unknown", urls: [], sitemaps: [] });

  try {
    const response = await fetchText(url, options);
    const parsed = response.ok ? parseSitemap(response.text) : emptySitemap();
    return { url, status: response.status, ok: response.ok, parsed };
  } catch (error) {
    return {
      url,
      status: null,
      ok: false,
      error: error.message,
      parsed: emptySitemap(),
    };
  }
};
77
+
78
/**
 * Compiles every pattern with `compileSafeRegex`, labelling each one as
 * `<key>[<index>]`.
 *
 * @param {string[]} [patterns] Regular-expression sources.
 * @param {string} [key] Label prefix passed to `compileSafeRegex`.
 * @returns {Array} The compiled results, in input order.
 */
const compilePatterns = (patterns = [], key = "pattern") => {
  const compileAt = (pattern, index) => compileSafeRegex(pattern, `${key}[${index}]`);
  return patterns.map(compileAt);
};
80
+
81
/**
 * Builds the include/exclude URL filter for a crawl.
 *
 * Patterns are read from `config.include` / `config.exclude`, falling back
 * to `config.crawl.include` / `config.crawl.exclude`.
 *
 * The returned function yields `null` when the URL may be crawled, or the
 * skip reason `"excluded"` / `"not_included"`. A URL that normalizes to
 * `target` is always allowed.
 *
 * @param {object} config Crawl configuration.
 * @param {string} target Normalized crawl target.
 * @returns {(url: string) => (null | "excluded" | "not_included")}
 */
const createFilter = (config, target) => {
  const includePatterns = compilePatterns(config.include || config.crawl?.include || [], "crawl.include");
  const excludePatterns = compilePatterns(config.exclude || config.crawl?.exclude || [], "crawl.exclude");

  const matchesAny = (patterns, url) => patterns.some((pattern) => pattern.test(url));

  return (url) => {
    if (normalizeUrl(url) === target) return null;
    if (matchesAny(excludePatterns, url)) return "excluded";

    const hasIncludeList = includePatterns.length > 0;
    if (hasIncludeList && !matchesAny(includePatterns, url)) return "not_included";

    return null;
  };
};
92
+
93
/**
 * Crawls a site starting at `config.target` and returns a snapshot per page.
 *
 * URLs are processed from a FIFO queue seeded with the target and, when a
 * sitemap is configured, with the sitemap's same-origin URLs (depth 0).
 * Links found on a page are enqueued at `depth + 1` until `maxDepth`.
 *
 * Settings are read from the top level of `config` first, then from
 * `config.crawl`: `maxPages` (default 50), `maxDepth` (default 2),
 * `respectRobots` (default false), `sitemap`, `include`, `exclude`.
 *
 * A URL is skipped, and recorded in `skipped` with a reason, when it is not
 * http(s) (`not_http`), has another origin than the target (`cross_origin`),
 * is rejected by the include/exclude filter (`excluded` / `not_included`),
 * or is disallowed by robots.txt (`robots_blocked`).
 *
 * @param {object} config Crawl configuration (see above); `render`,
 *   `renderer`, `security` and `limits` are forwarded to the snapshot and
 *   fetch helpers.
 * @returns {Promise<{ pages: object[], skipped: Array<{ url: string, reason: string }>, robots: object | null, sitemaps: object[] }>}
 */
export const crawlSite = async (config) => {
  const target = normalizeUrl(config.target);
  const maxPages = config.maxPages || config.crawl?.maxPages || 50;
  const maxDepth = config.maxDepth ?? config.crawl?.maxDepth ?? 2;
  const respectRobots = config.respectRobots ?? config.crawl?.respectRobots ?? false;
  const sitemapUrl = config.sitemap || config.crawl?.sitemap || null;
  const guardOptions = { security: config.security, limits: config.limits };
  const filterReason = createFilter(config, target);
  const robots = await loadRobots(target, respectRobots, guardOptions);

  const sitemaps = [];
  const skipped = [];
  const pages = [];

  // `queue` is consumed through a moving head index instead of shift(), and
  // `queuedUrls` mirrors every URL ever enqueued, so "already queued?" is a
  // Set lookup rather than a scan of the whole queue for every link.
  const queue = [{ url: target, depth: 0 }];
  const queuedUrls = new Set([target]);
  const seen = new Set();

  /** Returns why `url` must not be crawled, or a falsy value if it may be. */
  const skipReasonFor = (url) => {
    if (!isHttpUrl(url)) return "not_http";
    if (!sameOrigin(target, url)) return "cross_origin";
    return filterReason(url);
  };

  /** Appends `entry` unless its URL has been enqueued before. */
  const enqueue = (entry) => {
    if (queuedUrls.has(entry.url)) return;
    queuedUrls.add(entry.url);
    queue.push(entry);
  };

  if (sitemapUrl) {
    const sitemap = await loadSitemap(sitemapUrl, guardOptions);
    sitemaps.push(sitemap);
    for (const url of sitemap.parsed.urls || []) {
      const normalized = normalizeUrl(url);
      const reason = skipReasonFor(normalized);
      if (reason) skipped.push({ url: normalized, reason });
      else enqueue({ url: normalized, depth: 0, source: "sitemap" });
    }
  }

  for (let head = 0; head < queue.length && pages.length < maxPages; head++) {
    const item = queue[head];
    if (seen.has(item.url)) continue;
    seen.add(item.url);

    const reason = skipReasonFor(item.url);
    if (reason) {
      skipped.push({ url: item.url, reason });
      continue;
    }

    if (respectRobots && robots?.parsed && !isAllowedByRobots(robots.parsed, item.url, userAgent)) {
      skipped.push({ url: item.url, reason: "robots_blocked" });
      continue;
    }

    const snapshot = await collectSnapshot(item.url, {
      render: config.render?.mode,
      renderer: config.renderer,
      security: config.security,
      limits: config.limits,
    });
    pages.push(snapshot);

    // Pages at the depth limit are captured but their links are not followed.
    if (item.depth >= maxDepth) continue;

    for (const link of snapshot.evidence.links || []) {
      if (!link.href) continue;
      const normalized = normalizeUrl(link.href);
      if (seen.has(normalized)) continue;

      const linkSkipReason = skipReasonFor(normalized);
      if (linkSkipReason) {
        skipped.push({ url: normalized, reason: linkSkipReason });
        continue;
      }
      enqueue({ url: normalized, depth: item.depth + 1 });
    }
  }

  return { pages, skipped, robots, sitemaps };
};
@@ -0,0 +1,9 @@
1
/**
 * Builds the implementation-task record for a triggered rule.
 *
 * @param {{ id: string, recommendation: string }} rule Rule whose
 *   recommendation becomes the task summary.
 * @param {string} owner Value stored as the task's `owner`.
 * @param {string} effort Value stored as the task's `effort`.
 * @returns {{ summary: string, owner: string, effort: string, acceptanceCriteria: string[] }}
 */
export const implementationTaskFor = (rule, owner, effort) => {
  const summary = rule.recommendation;
  const findingCleared = `The ${rule.id} finding is no longer triggered for the affected evidence.`;
  const evidenceIntact = "Updated evidence remains crawlable, visible, and aligned with the cited guidance.";

  return {
    summary,
    owner,
    effort,
    acceptanceCriteria: [findingCleared, evidenceIntact],
  };
};