@dependabit/detector 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +10 -0
- package/LICENSE +21 -0
- package/README.md +32 -0
- package/dist/detector.d.ts +64 -0
- package/dist/detector.d.ts.map +1 -0
- package/dist/detector.js +578 -0
- package/dist/detector.js.map +1 -0
- package/dist/diff-parser.d.ts +53 -0
- package/dist/diff-parser.d.ts.map +1 -0
- package/dist/diff-parser.js +203 -0
- package/dist/diff-parser.js.map +1 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +9 -0
- package/dist/index.js.map +1 -0
- package/dist/llm/client.d.ts +65 -0
- package/dist/llm/client.d.ts.map +1 -0
- package/dist/llm/client.js +12 -0
- package/dist/llm/client.js.map +1 -0
- package/dist/llm/copilot.d.ts +15 -0
- package/dist/llm/copilot.d.ts.map +1 -0
- package/dist/llm/copilot.js +119 -0
- package/dist/llm/copilot.js.map +1 -0
- package/dist/llm/prompts.d.ts +10 -0
- package/dist/llm/prompts.d.ts.map +1 -0
- package/dist/llm/prompts.js +94 -0
- package/dist/llm/prompts.js.map +1 -0
- package/dist/parsers/code-comments.d.ts +23 -0
- package/dist/parsers/code-comments.d.ts.map +1 -0
- package/dist/parsers/code-comments.js +139 -0
- package/dist/parsers/code-comments.js.map +1 -0
- package/dist/parsers/package-files.d.ts +31 -0
- package/dist/parsers/package-files.d.ts.map +1 -0
- package/dist/parsers/package-files.js +130 -0
- package/dist/parsers/package-files.js.map +1 -0
- package/dist/parsers/readme.d.ts +23 -0
- package/dist/parsers/readme.d.ts.map +1 -0
- package/dist/parsers/readme.js +151 -0
- package/dist/parsers/readme.js.map +1 -0
- package/package.json +41 -0
- package/src/detector.ts +746 -0
- package/src/diff-parser.ts +257 -0
- package/src/index.ts +43 -0
- package/src/llm/client.ts +85 -0
- package/src/llm/copilot.ts +147 -0
- package/src/llm/prompts.ts +102 -0
- package/src/parsers/code-comments.ts +178 -0
- package/src/parsers/package-files.ts +156 -0
- package/src/parsers/readme.ts +185 -0
- package/test/detector.test.ts +102 -0
- package/test/diff-parser.test.ts +187 -0
- package/test/llm/client.test.ts +31 -0
- package/test/llm/copilot.test.ts +55 -0
- package/test/parsers/code-comments.test.ts +98 -0
- package/test/parsers/package-files.test.ts +52 -0
- package/test/parsers/readme.test.ts +52 -0
- package/tsconfig.json +10 -0
- package/tsconfig.tsbuildinfo +1 -0
package/src/detector.ts
ADDED
@@ -0,0 +1,746 @@
/**
 * Detector Orchestrator
 * Coordinates content parsers and LLM analysis to detect external dependencies
 */

import { readdir, readFile } from 'node:fs/promises';
import { join, relative, resolve, normalize, sep } from 'node:path';
import { randomUUID } from 'node:crypto';
import type { LLMProvider } from './llm/client.js';
import { createClassificationPrompt } from './llm/prompts.js';
import { parseReadme } from './parsers/readme.js';
import { parseCodeComments } from './parsers/code-comments.js';
import {
  parsePackageJson,
  parseRequirementsTxt,
  parseCargoToml,
  parseGoMod
} from './parsers/package-files.js';
import type {
  DependencyEntry,
  DependencyType,
  AccessMethod,
  DetectionMethod
} from '@dependabit/manifest';

export interface DetectorOptions {
  repoPath: string;
  llmProvider: LLMProvider;
  ignorePatterns?: string[];
}

export interface DetectionResult {
  dependencies: DependencyEntry[];
  statistics: {
    filesScanned: number;
    urlsFound: number;
    llmCalls: number;
    totalTokens: number;
    totalLatencyMs: number;
  };
}

const DEFAULT_IGNORE_PATTERNS = [
  'node_modules',
  '.git',
  'dist',
  'build',
  'target',
  'vendor',
  '.venv',
  'venv',
  '__pycache__',
  'coverage',
  '.next',
  '.nuxt'
];

/**
 * Main detector class
 */
export class Detector {
  private options: Required<DetectorOptions>;

  constructor(options: DetectorOptions) {
    this.options = {
      ...options,
      ignorePatterns: options.ignorePatterns || DEFAULT_IGNORE_PATTERNS
    };
  }

  /**
   * Detect all external dependencies in the repository
   *
   * Implementation follows a hybrid approach:
   * 1. Programmatic parsing of repository files (README, code comments, package files)
   * 2. LLM analysis only for documents not fully parsed in step 1 (future enhancement)
   * 3. Programmatic type categorization based on URL patterns and context
   * 4. LLM fallback for uncategorized dependencies
   * 5. Programmatic access method determination based on URL patterns
   * 6. LLM fallback for access methods that can't be determined (future enhancement)
   * 7. Manifest entry creation with references and versioning
   */
  async detectDependencies(): Promise<DetectionResult> {
    const allReferences: Map<
      string,
      {
        url: string;
        contexts: Array<{ file: string; line?: number; text: string }>;
        detectionMethod: DetectionMethod;
      }
    > = new Map();

    let filesScanned = 0;
    let llmCalls = 0;
    let totalTokens = 0;
    let totalLatencyMs = 0;

    // Step 1: Parse repository for dependencies (programmatic)
    // 1a. Parse README files
    const readmeFiles = await this.findFiles(this.options.repoPath, /^README/i);
    for (const file of readmeFiles) {
      const content = await readFile(file, 'utf-8');
      const references = parseReadme(content, relative(this.options.repoPath, file));

      for (const ref of references) {
        this.addReference(
          allReferences,
          ref.url,
          {
            file: relative(this.options.repoPath, file),
            ...(ref.line !== undefined && { line: ref.line }),
            text: ref.context
          },
          'llm-analysis'
        );
      }

      filesScanned++;
    }

    // 1b. Parse package files for metadata (NOT dependencies)
    const packageFiles = await this.findPackageFiles(this.options.repoPath);
    for (const file of packageFiles) {
      const content = await readFile(file, 'utf-8');
      const metadata = this.parsePackageFile(file, content);

      for (const url of [
        ...(metadata.urls || []),
        metadata.repository,
        metadata.homepage,
        metadata.documentation
      ].filter(Boolean)) {
        this.addReference(
          allReferences,
          url!,
          {
            file: relative(this.options.repoPath, file),
            text: 'Package metadata'
          },
          'package-json'
        );
      }

      filesScanned++;
    }

    // 1c. Parse code comments from source files
    const sourceFiles = await this.findSourceFiles(this.options.repoPath);
    for (const file of sourceFiles.slice(0, 50)) {
      // Limit to 50 files for performance
      const content = await readFile(file, 'utf-8');
      const references = parseCodeComments(content, relative(this.options.repoPath, file));

      for (const ref of references) {
        this.addReference(
          allReferences,
          ref.url,
          {
            file: ref.file,
            line: ref.line,
            text: ref.context
          },
          'code-comment'
        );
      }

      filesScanned++;
    }

    // Step 2: LLM 2nd pass for documents not fully parsed in step 1
    // Analyze README files for dependency context that parsers might have missed
    const llmEnhancedReferences = new Set<string>();

    for (const file of readmeFiles.slice(0, 5)) {
      // Limit to 5 READMEs for LLM analysis
      try {
        const content = await readFile(file, 'utf-8');
        const relPath = relative(this.options.repoPath, file);

        // Use LLM to extract additional context from README
        const detectionPrompt = `Analyze this README file and identify any external dependencies or resources that might be referenced but not explicitly linked:

File: ${relPath}
Content:
${content.slice(0, 5000)}

Identify:
1. Documentation sites mentioned but not linked
2. Tools or libraries referenced in text
3. API services mentioned
4. Research papers or specifications cited

Return as JSON with "dependencies" array.`;

        const response = await this.options.llmProvider.analyze(content, detectionPrompt);
        llmCalls++;
        totalTokens += response.usage.totalTokens;
        totalLatencyMs += response.usage.latencyMs;

        // Add LLM-discovered references
        for (const dep of response.dependencies) {
          if (dep.url && !allReferences.has(dep.url)) {
            llmEnhancedReferences.add(dep.url);
            this.addReference(
              allReferences,
              dep.url,
              {
                file: relPath,
                text: dep.description || 'Discovered by LLM analysis'
              },
              'llm-analysis'
            );
          }
        }
      } catch (error) {
        console.error(`LLM document analysis failed for ${file}:`, error);
      }
    }

    // Steps 3-6: Categorize dependencies (programmatic first, LLM fallback)
    const dependencies: DependencyEntry[] = [];
    const now = new Date().toISOString();

    for (const [url, data] of allReferences) {
      // Prepare context for potential LLM use
      const context = data.contexts
        .map((c) => `${c.file}${c.line ? `:${c.line}` : ''}: ${c.text}`)
        .join('\n');
      const firstContext = data.contexts[0]?.text || '';

      // Step 3: Try programmatic type categorization
      let type: DependencyType | null = this.determineDependencyType(url, firstContext);
      let typeConfidence = type ? 0.9 : 0.5; // High confidence for programmatic

      // Step 4: If type couldn't be determined, use LLM fallback
      if (!type) {
        try {
          const classificationPrompt = createClassificationPrompt(url, context);
          const response = await this.options.llmProvider.analyze('', classificationPrompt);
          llmCalls++;
          totalTokens += response.usage.totalTokens;
          totalLatencyMs += response.usage.latencyMs;

          if (response.dependencies.length > 0) {
            const dep = response.dependencies[0];
            if (dep) {
              type = dep.type as DependencyType;
              typeConfidence = dep.confidence;
            }
          }
        } catch (error) {
          console.error(`LLM classification failed for ${url}:`, error);
        }
      }

      // Default to 'other' if still not determined
      if (!type) {
        type = 'other';
        typeConfidence = 0.3;
      }

      // Step 5: Try programmatic access method determination
      let accessMethod: AccessMethod | null = this.determineAccessMethod(url);

      // Step 6: If access method couldn't be determined, use LLM fallback
      if (!accessMethod) {
        try {
          const accessMethodPrompt = `Determine the best access method for this URL: ${url}

Context: ${firstContext}

Choose ONE of these access methods:
- "github-api": For GitHub repositories
- "arxiv": For arXiv papers
- "openapi": For API specifications
- "context7": For Context7 documentation
- "http": For general web resources

Return as JSON: {"accessMethod": "...", "confidence": 0.0-1.0}`;

          const response = await this.options.llmProvider.analyze('', accessMethodPrompt);
          llmCalls++;
          totalTokens += response.usage.totalTokens;
          totalLatencyMs += response.usage.latencyMs;

          // Parse LLM response for access method
          const content = response.rawResponse || '{}';
          try {
            const parsed = JSON.parse(content);
            if (parsed.accessMethod) {
              accessMethod = parsed.accessMethod as AccessMethod;
            }
          } catch {
            // If parsing fails, fall back to http
            accessMethod = 'http';
          }
        } catch (error) {
          console.error(`LLM access method determination failed for ${url}:`, error);
          accessMethod = 'http';
        }
      }

      // Ensure we have a valid access method
      if (!accessMethod) {
        accessMethod = 'http';
      }

      // Step 7: Create manifest entry with references and versioning
      const entry: DependencyEntry = {
        id: randomUUID(),
        url,
        type,
        accessMethod,
        name: this.extractName(url),
        description: firstContext,
        currentVersion: undefined,
        currentStateHash: '', // Will be populated by monitor
        detectionMethod: data.detectionMethod,
        detectionConfidence: typeConfidence,
        detectedAt: now,
        lastChecked: now,
        auth: undefined,
        monitoring: {
          enabled: true,
          checkFrequency: 'daily',
          ignoreChanges: false
        },
        referencedIn: data.contexts.map((c) => ({
          file: c.file,
          line: c.line,
          context: c.text
        })),
        changeHistory: []
      };

      dependencies.push(entry);
    }

    return {
      dependencies,
      statistics: {
        filesScanned,
        urlsFound: allReferences.size,
        llmCalls,
        totalTokens,
        totalLatencyMs
      }
    };
  }

  private addReference(
    map: Map<string, any>,
    url: string,
    context: { file: string; line?: number; text: string },
    detectionMethod: DetectionMethod
  ): void {
    if (!map.has(url)) {
      map.set(url, {
        url,
        contexts: [],
        detectionMethod
      });
    }
    map.get(url)!.contexts.push(context);
  }

  /**
   * Programmatically determine access method based on URL patterns
   * Returns null if cannot be determined programmatically
   */
  private determineAccessMethod(url: string): AccessMethod | null {
    // GitHub URLs
    if (url.includes('github.com')) return 'github-api';

    // arXiv papers
    if (url.includes('arxiv.org')) return 'arxiv';

    // OpenAPI/Swagger specs
    if (
      url.includes('openapi') ||
      url.includes('swagger') ||
      url.endsWith('.yaml') ||
      url.endsWith('.json') ||
      url.includes('/api/spec') ||
      url.includes('/api-docs')
    ) {
      return 'openapi';
    }

    // Context7 documentation
    if (url.includes('context7')) return 'context7';

    // Cannot determine programmatically - needs LLM
    return null;
  }

  /**
   * Programmatically determine dependency type based on URL patterns and context
   * Returns null if cannot be determined programmatically
   */
  private determineDependencyType(url: string, context: string): DependencyType | null {
    const lowerUrl = url.toLowerCase();
    const lowerContext = context.toLowerCase();

    // Research papers
    if (
      lowerUrl.includes('arxiv.org') ||
      lowerContext.includes('paper') ||
      lowerContext.includes('research')
    ) {
      return 'research-paper';
    }

    // Schemas
    if (
      lowerUrl.includes('schema') ||
      lowerUrl.includes('openapi') ||
      lowerUrl.includes('swagger') ||
      lowerUrl.includes('graphql') ||
      lowerUrl.includes('protobuf')
    ) {
      return 'schema';
    }

    // Documentation
    if (
      lowerUrl.includes('/docs') ||
      lowerUrl.includes('/documentation') ||
      lowerUrl.includes('/guide') ||
      lowerUrl.includes('/tutorial') ||
      lowerUrl.includes('/reference') ||
      lowerContext.includes('documentation') ||
      lowerContext.includes('docs')
    ) {
      return 'documentation';
    }

    // Reference implementations (GitHub repos)
    if (
      lowerUrl.includes('github.com') &&
      (lowerContext.includes('example') ||
        lowerContext.includes('implementation') ||
        lowerContext.includes('reference'))
    ) {
      return 'reference-implementation';
    }

    // API examples
    if (
      lowerContext.includes('example') &&
      (lowerContext.includes('api') || lowerContext.includes('endpoint'))
    ) {
      return 'api-example';
    }

    // Cannot determine programmatically - needs LLM
    return null;
  }

  private extractName(url: string): string {
    // Extract a reasonable name from URL
    try {
      const urlObj = new URL(url);
      const pathParts = urlObj.pathname.split('/').filter(Boolean);
      if (pathParts.length > 0) {
        const lastPart = pathParts[pathParts.length - 1];
        if (lastPart) {
          return lastPart.replace(/\.[^.]+$/, '');
        }
      }
      return urlObj.hostname;
    } catch {
      return url;
    }
  }

  private parsePackageFile(
    filePath: string,
    content: string
  ): { urls: string[]; repository?: string; homepage?: string; documentation?: string } {
    const fileName = filePath.split('/').pop() || '';

    if (fileName === 'package.json') {
      return parsePackageJson(content);
    }
    if (fileName === 'requirements.txt') {
      return parseRequirementsTxt(content);
    }
    if (fileName === 'Cargo.toml') {
      return parseCargoToml(content);
    }
    if (fileName === 'go.mod') {
      return parseGoMod(content);
    }

    return { urls: [] };
  }

  private async findFiles(dir: string, pattern: RegExp): Promise<string[]> {
    const files: string[] = [];

    try {
      const entries = await readdir(dir, { withFileTypes: true });

      for (const entry of entries) {
        const fullPath = join(dir, entry.name);

        if (this.shouldIgnore(entry.name)) {
          continue;
        }

        if (entry.isDirectory()) {
          const subFiles = await this.findFiles(fullPath, pattern);
          files.push(...subFiles);
        } else if (pattern.test(entry.name)) {
          files.push(fullPath);
        }
      }
    } catch {
      // Ignore errors (permission denied, etc.)
    }

    return files;
  }

  private async findPackageFiles(dir: string): Promise<string[]> {
    return this.findFiles(dir, /^(package\.json|requirements\.txt|Cargo\.toml|go\.mod)$/);
  }

  private async findSourceFiles(dir: string): Promise<string[]> {
    return this.findFiles(dir, /\.(ts|js|tsx|jsx|py|rs|go|java|kt|cs|rb|php)$/);
  }

  private shouldIgnore(name: string): boolean {
    return this.options.ignorePatterns.some((pattern) => name.includes(pattern));
  }

  /**
   * Analyze only specific files for dependencies (for incremental updates)
   * This is more efficient than full repository scan when only few files changed
   */
  async analyzeFiles(filePaths: string[]): Promise<DetectionResult> {
    const allReferences: Map<
      string,
      {
        url: string;
        contexts: Array<{ file: string; line?: number; text: string }>;
        detectionMethod: DetectionMethod;
      }
    > = new Map();

    let filesScanned = 0;
    let llmCalls = 0;
    let totalTokens = 0;
    let totalLatencyMs = 0;

    for (const filePath of filePaths) {
      // Validate that the file path is safe before joining
      const normalizedRepoPath = resolve(normalize(this.options.repoPath));
      const normalizedFilePath = normalize(filePath);
      const fullPath = resolve(normalizedRepoPath, normalizedFilePath);

      // Ensure the resolved path is within the repository boundaries
      // Use path.sep for cross-platform compatibility
      const repoPathWithSep = normalizedRepoPath.endsWith(sep)
        ? normalizedRepoPath
        : normalizedRepoPath + sep;

      if (!fullPath.startsWith(repoPathWithSep) && fullPath !== normalizedRepoPath) {
        // Skip files outside the repository to prevent path traversal
        if (process.env['DEBUG']) {
          console.warn(`Skipping file outside repository: ${filePath}`);
        }
        continue;
      }

      try {
        const content = await readFile(fullPath, 'utf-8');
        const relativePath = relative(this.options.repoPath, fullPath);
        const fileName = filePath.split('/').pop() || '';

        // Parse based on file type
        if (/^README/i.test(fileName)) {
          // README file
          const references = parseReadme(content, relativePath);
          for (const ref of references) {
            this.addReference(
              allReferences,
              ref.url,
              {
                file: relativePath,
                ...(ref.line !== undefined && { line: ref.line }),
                text: ref.context
              },
              'llm-analysis'
            );
          }
        } else if (/^(package\.json|requirements\.txt|Cargo\.toml|go\.mod)$/i.test(fileName)) {
          // Package file
          const metadata = this.parsePackageFile(fullPath, content);
          for (const url of [
            ...(metadata.urls || []),
            metadata.repository,
            metadata.homepage,
            metadata.documentation
          ].filter(Boolean)) {
            this.addReference(
              allReferences,
              url!,
              {
                file: relativePath,
                text: 'Package metadata'
              },
              'package-json'
            );
          }
        } else if (/\.(ts|js|tsx|jsx|py|rs|go|java|kt|cs|rb|php)$/.test(fileName)) {
          // Source file
          const references = parseCodeComments(content, relativePath);
          for (const ref of references) {
            this.addReference(
              allReferences,
              ref.url,
              {
                file: ref.file,
                line: ref.line,
                text: ref.context
              },
              'code-comment'
            );
          }
        } else if (/\.(md|txt|rst|adoc)$/.test(fileName)) {
          // Documentation file
          const references = parseReadme(content, relativePath);
          for (const ref of references) {
            this.addReference(
              allReferences,
              ref.url,
              {
                file: relativePath,
                ...(ref.line !== undefined && { line: ref.line }),
                text: ref.context
              },
              'llm-analysis'
            );
          }
        }

        filesScanned++;
      } catch (error) {
        // Skip files that can't be read - log but don't throw
        // Using console.warn since we don't have a logger instance here
        const message = error instanceof Error ? error.message : String(error);
        console.warn(`Failed to analyze ${filePath}: ${message}`);
        if (process.env['DEBUG']) {
          // Log full error details when DEBUG is enabled
          console.debug('Full error while analyzing %s:', filePath, error);
        }
      }
    }

    // Create dependency entries
    const dependencies: DependencyEntry[] = [];

    for (const [url, refData] of allReferences.entries()) {
      const contextText = refData.contexts.map((c) => c.text).join(' ');

      // Step 3: Programmatic type categorization
      let type = this.determineDependencyType(url, contextText);

      // Step 4: LLM fallback for type categorization (if needed)
      if (!type && refData.contexts.length > 0) {
        const startTime = Date.now();
        try {
          const prompt = createClassificationPrompt(url, contextText);
          const response = await this.options.llmProvider.analyze('', prompt);

          llmCalls++;
          totalTokens += response.usage?.totalTokens || 0;
          totalLatencyMs += Date.now() - startTime;

          // Use rawResponse for classification
          const responseText = (response.rawResponse || '').toLowerCase();
          type = (
            responseText.includes('schema')
              ? 'schema'
              : responseText.includes('documentation')
                ? 'documentation'
                : responseText.includes('research') || responseText.includes('paper')
                  ? 'research-paper'
                  : responseText.includes('implementation')
                    ? 'reference-implementation'
                    : responseText.includes('example')
                      ? 'api-example'
                      : 'other'
          ) as DependencyType;
        } catch {
          type = 'other';
        }
      }

      if (!type) {
        type = 'other';
      }

      // Step 5: Programmatic access method determination
      let accessMethod = this.determineAccessMethod(url);
      if (!accessMethod) {
        accessMethod = 'http'; // Default fallback
      }

      const dependency: DependencyEntry = {
        id: randomUUID(),
        url,
        type,
        accessMethod,
        name: this.extractName(url),
        currentStateHash: `sha256:pending`,
        detectionMethod: refData.detectionMethod,
        detectionConfidence: refData.detectionMethod === 'manual' ? 1.0 : 0.85,
        detectedAt: new Date().toISOString(),
        lastChecked: new Date().toISOString(),
        auth: undefined,
        referencedIn: refData.contexts.map((ctx) => ({
          file: ctx.file,
          line: ctx.line,
          context: ctx.text
        })),
        changeHistory: []
      };

      dependencies.push(dependency);
    }

    return {
      dependencies,
      statistics: {
        filesScanned,
        urlsFound: allReferences.size,
        llmCalls,
        totalTokens,
        totalLatencyMs
      }
    };
  }
}
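
For orientation, a minimal usage sketch of the Detector class above (illustrative only, not part of the published package). The stub provider is hypothetical and only approximates the LLMProvider shape as detector.ts consumes it; the real interface in src/llm/client.ts may declare additional members, and index.ts may expose Detector through a different entry point.

import { Detector } from './detector.js';
import type { LLMProvider } from './llm/client.js';

// Hypothetical no-op provider: analyze() returns only the fields detector.ts reads
// (dependencies, rawResponse, usage.totalTokens, usage.latencyMs), hence the cast.
const stubProvider = {
  async analyze(_content: string, _prompt: string) {
    return {
      dependencies: [] as Array<{ url: string; description?: string; type: string; confidence: number }>,
      rawResponse: '{}',
      usage: { totalTokens: 0, latencyMs: 0 }
    };
  }
} as unknown as LLMProvider;

// Scan the current working directory and print aggregate statistics.
const detector = new Detector({ repoPath: process.cwd(), llmProvider: stubProvider });
const result = await detector.detectDependencies();
console.log(result.statistics); // { filesScanned, urlsFound, llmCalls, totalTokens, totalLatencyMs }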