@shriyanss/js-recon 1.2.2-alpha.7 → 1.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,410 @@
1
+ # Contributing New Next.js Discovery Methods
2
+
3
+ This guide explains how to add new JavaScript file discovery techniques for Next.js applications to `js-recon`.
4
+
5
+ ## Table of Contents
6
+
7
+ - [Architecture Overview](#architecture-overview)
8
+ - [Discovery Method Types](#discovery-method-types)
9
+ - [Adding a New Discovery Method](#adding-a-new-discovery-method)
10
+ - [Integration into NextJsCrawler](#integration-into-nextjscrawler)
11
+ - [Best Practices](#best-practices)
12
+ - [Testing Your Method](#testing-your-method)
13
+ - [Research Mode](#research-mode)
14
+
15
+ ---
16
+
17
+ ## Architecture Overview
18
+
19
+ The Next.js discovery system is built around the **`NextJsCrawler`** class (`src/lazyLoad/next_js/NextJsCrawler.ts`), which implements a **three-phase crawling strategy**:
20
+
21
+ ### Phase 1: Initial Discovery
22
+
23
+ Heavyweight methods that run **once** to bootstrap the crawl:
24
+
25
+ - Script tag parsing (`next_getJSScript`)
26
+ - Webpack runtime analysis (`next_GetLazyResourcesWebpackJs`) – uses Puppeteer
27
+ - `_buildManifest.js` parsing (`next_getLazyResourcesBuildManifestJs`)
28
+ - Subsequent requests with RSC headers (`subsequentRequests`)
29
+
30
+ ### Phase 2: Recursive Discovery
31
+
32
+ Lightweight methods that run **multiple times** on newly discovered URLs until convergence:
33
+
34
+ - Promise.all pattern detection (`next_promiseResolve`)
35
+ - Layout.js href extraction (`next_parseLayoutJs`)
36
+ - Script tag re-parsing on new pages (`next_getJSScript`)
37
+
38
+ ### Phase 3: Finalization
39
+
40
+ Post-processing on the final URL set:
41
+
42
+ - Source map brute-forcing (`next_bruteForceJsFiles`)
43
+
44
+ ---
45
+
46
+ ## Discovery Method Types
47
+
48
+ ### Initial Discovery Methods
49
+
50
+ **When to use:**
51
+
52
+ - The method is **expensive** (e.g., launches a browser, makes many requests)
53
+ - The method doesn't benefit from being run multiple times
54
+ - The method provides a bootstrap set of URLs
55
+
56
+ **Characteristics:**
57
+
58
+ - Runs exactly **once** per crawl
59
+ - Added to `initialDiscovery()` in `NextJsCrawler.ts`
60
+ - Examples: Puppeteer-based webpack analysis, manifest parsing
61
+
62
+ ### Recursive Discovery Methods
63
+
64
+ **When to use:**
65
+
66
+ - The method analyzes **JavaScript file contents** to find more JS files
67
+ - The method discovers **client-side paths** that need to be visited
68
+ - The method can find new URLs by examining URLs found in previous passes
69
+
70
+ **Characteristics:**
71
+
72
+ - Runs in a **loop** until no new URLs are discovered (max 10 iterations by default)
73
+ - Takes an array of URLs as input and returns newly discovered URLs
74
+ - Added to `recursivePass()` in `NextJsCrawler.ts`
75
+ - Examples: Promise.all pattern detection, layout href parsing
76
+
77
+ ---
78
+
79
+ ## Adding a New Discovery Method
80
+
81
+ ### Step 1: Create the Method Module
82
+
83
+ Create a new file in `src/lazyLoad/next_js/` following the naming convention `next_<methodName>.ts`.
84
+
85
+ #### Example: Detecting dynamic imports
86
+
87
+ ```typescript
88
+ // src/lazyLoad/next_js/next_dynamicImports.ts
89
+ import chalk from "chalk";
90
+ import makeRequest from "../../utility/makeReq.js";
91
+ import parser from "@babel/parser";
92
+ import _traverse from "@babel/traverse";
93
+ const traverse = _traverse.default;
94
+
95
+ /**
96
+ * Finds JS files referenced in dynamic import() statements.
97
+ *
98
+ * @param baseUrl - The base URL of the application
99
+ * @param urls - Array of JS file URLs to analyze
100
+ * @returns Array of newly discovered JS file URLs
101
+ */
102
+ const next_dynamicImports = async (baseUrl: string, urls: string[]): Promise<string[]> => {
103
+ console.log(chalk.cyan("[i] Analyzing dynamic import() statements"));
104
+
105
+ const discoveredUrls: string[] = [];
106
+
107
+ for (const url of urls) {
108
+ try {
109
+ const response = await makeRequest(url);
110
+ if (!response?.ok) continue;
111
+
112
+ const jsContent = await response.text();
113
+ const ast = parser.parse(jsContent, {
114
+ sourceType: "unambiguous",
115
+ plugins: ["jsx", "typescript"],
116
+ errorRecovery: true,
117
+ });
118
+
119
+ traverse(ast, {
120
+ Import(path) {
121
+ // Detect: import('path/to/file.js')
122
+ const parent = path.parent;
123
+ if (
124
+ parent.type === "CallExpression" &&
125
+ parent.arguments.length > 0 &&
126
+ parent.arguments[0].type === "StringLiteral"
127
+ ) {
128
+ const importPath = parent.arguments[0].value;
129
+ if (importPath.endsWith(".js")) {
130
+ // Resolve relative to the importing file's URL — relative
+ // specifiers resolve against the importing module, not the app root
131
+ const resolvedUrl = new URL(importPath, url).href;
132
+ discoveredUrls.push(resolvedUrl);
133
+ }
134
+ }
135
+ },
136
+ });
137
+ } catch (error) {
138
+ // Skip files that can't be parsed
139
+ continue;
140
+ }
141
+ }
142
+
143
+ const uniqueUrls = [...new Set(discoveredUrls)];
144
+
145
+ if (uniqueUrls.length > 0) {
146
+ console.log(chalk.green(`[✓] Found ${uniqueUrls.length} JS files from dynamic imports`));
147
+ }
148
+
149
+ return uniqueUrls;
150
+ };
151
+
152
+ export default next_dynamicImports;
153
+ ```
154
+
155
+ ### Step 2: Method Signature Guidelines
156
+
157
+ **For initial discovery methods:**
158
+
159
+ ```typescript
160
+ async function next_methodName(url: string): Promise<string[]>;
161
+ ```
162
+
163
+ - Takes the base URL as input
164
+ - Returns array of discovered URLs
165
+
166
+ **For recursive discovery methods:**
167
+
168
+ ```typescript
169
+ async function next_methodName(baseUrl: string, urls: string[]): Promise<string[]>;
170
+ ```
171
+
172
+ - Takes base URL and array of URLs to analyze
173
+ - Returns array of **newly** discovered URLs
174
+ - Should handle empty input gracefully
175
+
176
+ ### Step 3: Common Patterns
177
+
178
+ #### Pattern 1: AST-based Analysis
179
+
180
+ Use Babel parser to analyze JavaScript syntax:
181
+
182
+ ```typescript
183
+ import parser from "@babel/parser";
184
+ import _traverse from "@babel/traverse";
185
+ const traverse = _traverse.default;
186
+
187
+ const ast = parser.parse(jsContent, {
188
+ sourceType: "unambiguous",
189
+ plugins: ["jsx", "typescript"],
190
+ errorRecovery: true,
191
+ });
192
+
193
+ traverse(ast, {
194
+ // Visitor pattern
195
+ CallExpression(path) {
196
+ // Analyze specific AST nodes
197
+ },
198
+ });
199
+ ```
200
+
201
+ #### Pattern 2: String/Regex Matching
202
+
203
+ Quick pattern detection without parsing:
204
+
205
+ ```typescript
206
+ const matches = jsContent.matchAll(/static\/chunks\/[a-zA-Z0-9_\-]+\.js/g);
207
+ for (const match of matches) {
208
+ discoveredUrls.push(resolveUrl(match[0]));
209
+ }
210
+ ```
211
+
212
+ #### Pattern 3: Request-Based Discovery
213
+
214
+ Make HTTP requests to known patterns:
215
+
216
+ ```typescript
217
+ const candidateUrl = `${baseUrl}/_next/static/${hash}/chunk.js`;
218
+ const response = await makeRequest(candidateUrl);
219
+ if (response.status === 200) {
220
+ discoveredUrls.push(candidateUrl);
221
+ }
222
+ ```
223
+
224
+ ---
225
+
226
+ ## Integration into NextJsCrawler
227
+
228
+ ### For Initial Discovery Methods
229
+
230
+ Add to `initialDiscovery()` in `NextJsCrawler.ts`. Use this phase for methods that only need to run once, like those starting from the application's root URL:
231
+
232
+ ```typescript
233
+ private async initialDiscovery(): Promise<void> {
234
+ // ... existing methods ...
235
+
236
+ // 5. Your new initial discovery method
237
+ const jsFromPrefetch = await next_prefetchHints(this.url);
238
+ this.techniqueEfficiencyMapping["next_prefetchHints"] = jsFromPrefetch;
239
+ this.registerUrls(jsFromPrefetch);
240
+
241
+ // ... rest of the method ...
242
+ }
243
+ ```
244
+
245
+ **Don't forget to add the import at the top:**
246
+
247
+ ```typescript
248
+ import next_prefetchHints from "./next_prefetchHints.js";
249
+ ```
250
+
251
+ ### For Recursive Discovery Methods
252
+
253
+ Add to `recursivePass()` in `NextJsCrawler.ts`. Use this phase for methods that analyze JS file contents to find more JS files:
254
+
255
+ ```typescript
256
+ private async recursivePass(jsUrls: string[]): Promise<string[]> {
257
+ let newInThisPass: string[] = [];
258
+
259
+ // ... existing methods ...
260
+
261
+ // Your new recursive method
262
+ const jsFromDynamicImports = await next_dynamicImports(this.url, jsUrls);
263
+ this.techniqueEfficiencyMapping["next_dynamicImports"] = [
264
+ ...(this.techniqueEfficiencyMapping["next_dynamicImports"] || []),
265
+ ...jsFromDynamicImports,
266
+ ];
267
+ newInThisPass.push(...this.registerUrls(jsFromDynamicImports));
268
+
269
+ return newInThisPass;
270
+ }
271
+ ```
272
+
273
+ **Key points:**
274
+
275
+ - Append to `techniqueEfficiencyMapping` (not replace) for recursive methods
276
+ - Use `registerUrls()` to deduplicate and track new URLs
277
+ - Only add truly new URLs to `newInThisPass`
278
+
279
+ ---
280
+
281
+ ## Best Practices
282
+
283
+ ### 1. Error Handling
284
+
285
+ Always handle errors gracefully – don't crash the entire crawl:
286
+
287
+ ```typescript
288
+ try {
289
+ const response = await makeRequest(url);
290
+ if (!response?.ok) return [];
291
+ // ... process response ...
292
+ } catch (error) {
293
+ // Log if needed, but continue
294
+ return [];
295
+ }
296
+ ```
297
+
298
+ ### 2. URL Resolution
299
+
300
+ Use `URL` constructor for proper relative URL resolution:
301
+
302
+ ```typescript
303
+ const absoluteUrl = new URL(relativePath, baseUrl).href;
304
+ ```
305
+
306
+ ### 3. Deduplication
307
+
308
+ Always deduplicate before returning:
309
+
310
+ ```typescript
311
+ return [...new Set(discoveredUrls)];
312
+ ```
313
+
314
+ ### 4. Performance
315
+
316
+ - Cache expensive operations (e.g., don't re-fetch the same URL)
317
+ - Use `presentInCrawledUrls()` (from `globals.js`) to check if already visited
318
+ - Mark URLs as crawled with `addCrawledUrl()` (from `globals.js`)
319
+
320
+ ```typescript
321
+ import { presentInCrawledUrls, addCrawledUrl } from "../globals.js";
322
+
323
+ for (const url of urls) {
324
+ if (presentInCrawledUrls(url)) continue;
325
+
326
+ // ... analyze URL ...
327
+
328
+ addCrawledUrl(url);
329
+ }
330
+ ```
331
+
332
+ ### 5. Logging
333
+
334
+ Use `chalk` for consistent logging:
335
+
336
+ ```typescript
337
+ import chalk from "chalk";
338
+
339
+ console.log(chalk.cyan("[i] Starting analysis...")); // Info
340
+ console.log(chalk.green("[✓] Found 10 files")); // Success
341
+ console.log(chalk.yellow("[!] Warning message")); // Warning
342
+ console.log(chalk.red("[!] Error occurred")); // Error
343
+ ```
344
+
345
+ ---
346
+
347
+ ## Testing Your Method
348
+
349
+ ### 1. Integration Test with Real Sites
350
+
351
+ Test against known Next.js sites:
352
+
353
+ ```bash
354
+ npm run cleanup && npm run start -- run -u https://nextjs.org -y --research
355
+ ```
356
+
357
+ Check `research.json` to see how many URLs your method discovered:
358
+
359
+ ```json
360
+ {
361
+ "next_dynamicImports": [
362
+ "https://nextjs.org/_next/static/chunks/123.js",
363
+ "https://nextjs.org/_next/static/chunks/456.js"
364
+ ]
365
+ }
366
+ ```
367
+
368
+ ### 2. Verify Convergence
369
+
370
+ Ensure your recursive method doesn't cause infinite loops:
371
+
372
+ - Check that "Recursive crawl converged" message appears
373
+ - If it hits max iterations (defaults to 10), investigate why
374
+
375
+ ### 3. Performance Validation
376
+
377
+ Monitor execution time:
378
+
379
+ ```bash
380
+ time npm run start -- run -u https://example.com -y
381
+ ```
382
+
383
+ ---
384
+
385
+ ## Research Mode
386
+
387
+ Use research mode to validate and compare discovery methods:
388
+
389
+ ```bash
390
+ npm run start -- run -u https://example.com -y --research
391
+ ```
392
+
393
+ This generates `research.json` with per-method efficiency:
394
+
395
+ ```json
396
+ {
397
+ "next_getJSScript": [
398
+ "https://example.com/_next/static/chunks/main.js",
399
+ "https://example.com/_next/static/chunks/webpack.js"
400
+ ],
401
+ "next_dynamicImports": ["https://example.com/_next/static/chunks/lazy-component.js"],
402
+ "next_promiseResolve": ["https://example.com/_next/static/chunks/pages/about.js"]
403
+ }
404
+ ```
405
+
406
+ **Analysis tips:**
407
+
408
+ - **Unique discoveries**: Methods that find URLs no other method finds
409
+ - **Overlap**: Methods that discover the same URLs (candidates for removal)
410
+ - **Efficiency**: URLs found per method execution
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@shriyanss/js-recon",
3
- "version": "1.2.2-alpha.7",
3
+ "version": "1.2.2",
4
4
  "description": "JS Recon Tool",
5
5
  "main": "build/index.js",
6
6
  "type": "module",
@@ -12,7 +12,7 @@
12
12
  "build": "rm -rf build/ && tsc",
13
13
  "start": "node build/index.js",
14
14
  "test": "node build/index.js -h",
15
- "cleanup": "rm -rf build output .resp_cache.json endpoints.json extracted_urls{.txt,.json,-openapi.json} strings.json mapped{-openapi.json,.json} analyze.json test{.yaml,.js} shriyanss-js-recon-*.tgz js-recon.db report.{html,md} js_recon_run_output *_test.js extracted/ && tsc"
15
+ "cleanup": "rm -rf build output .resp_cache.json endpoints.json extracted_urls{.txt,.json,-openapi.json} strings.json mapped{-openapi.json,.json} analyze.json test{.yaml,.js} shriyanss-js-recon-*.tgz js-recon.db report.{html,md} js_recon_run_output *_test.js extracted/ research.json && tsc"
16
16
  },
17
17
  "keywords": [],
18
18
  "author": "Shriyans Sudhi",