@dockerforge/core 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,525 @@
1
+ // backend/src/modules/ingestion/ingestion.js
2
+ // Fetches repo file tree + key files via provider APIs — no git binary needed.
3
+ // Works on Vercel, Railway, Render, etc.
4
+
5
+ const path = require('path');
6
+ const fs = require('fs-extra');
7
+ const { IGNORED_DIRS, ROOT_CONFIG_FILES } = require('../../../../shared/constants');
8
+
9
+ const IGNORED_SET = new Set(IGNORED_DIRS);
10
+
11
+ const KEY_FILES = [
12
+ 'package.json', 'package-lock.json', 'yarn.lock', 'pnpm-lock.yaml',
13
+ 'requirements.txt', 'pyproject.toml', 'setup.py', 'Pipfile',
14
+ '.python-version', 'Procfile', 'manage.py', 'app.py', 'main.py',
15
+ '.dockerignore', '.env.example', '.env.sample',
16
+ // Node entry points — fetched so the analyser can scan require('../xxx') for sibling deps
17
+ 'server.js', 'server.ts', 'index.js', 'index.ts', 'app.js', 'app.ts', 'main.js', 'main.ts',
18
+ // Framework config files — needed to detect buildOutputDir (vite: outDir, next: distDir, etc.)
19
+ 'vite.config.js', 'vite.config.mjs', 'vite.config.ts', 'vite.config.mts',
20
+ 'next.config.js', 'next.config.mjs', 'next.config.ts',
21
+ 'astro.config.js', 'astro.config.mjs', 'astro.config.ts',
22
+ 'svelte.config.js', 'svelte.config.ts',
23
+ ];
24
+ const KEY_EXTENSIONS = ['.csproj', '.fsproj'];
25
+ const ROOT_CONFIG_SET = new Set(ROOT_CONFIG_FILES);
26
+ const TSCONFIG_VARIANT_RE = /^tsconfig\..+\.json$/;
27
+ const SOURCE_DIR_NAMES = new Set([
28
+ 'src', 'public', 'app', 'pages', 'components', 'lib', 'libs',
29
+ 'shared', 'common', 'utils', 'styles', 'assets',
30
+ 'backend', 'server', 'api', 'routes', 'controllers', 'services',
31
+ 'middleware', 'workers',
32
+ ]);
33
+ const TEXT_SOURCE_EXTENSIONS = new Set([
34
+ '.js', '.jsx', '.ts', '.tsx', '.mjs', '.cjs',
35
+ '.css', '.scss', '.sass', '.less',
36
+ '.html', '.htm', '.json', '.svg', '.txt', '.xml',
37
+ '.yml', '.yaml', '.md', '.mdx',
38
+ ]);
39
+ const FETCH_TIMEOUT_MS = 10_000;
40
+ const MAX_INDEXED_PATHS = 5000;
41
+ const MAX_ZIP_FILES = 2000;
42
+ const MAX_ZIP_UNCOMPRESSED_BYTES = 100 * 1024 * 1024;
43
+
44
+ function validateSubFolder(subFolder) {
45
+ if (!subFolder) return null;
46
+ let decoded;
47
+ try {
48
+ decoded = decodeURIComponent(subFolder);
49
+ for (let i = 0; i < 2 && decoded.includes('%'); i += 1) {
50
+ const next = decodeURIComponent(decoded);
51
+ if (next === decoded) break;
52
+ decoded = next;
53
+ }
54
+ } catch {
55
+ throw new Error('Invalid subfolder path in URL');
56
+ }
57
+ const normalized = decoded.replace(/\\/g, '/');
58
+ if (
59
+ path.isAbsolute(normalized) ||
60
+ normalized.split('/').some(part => part === '..') ||
61
+ normalized.includes('\0')
62
+ ) {
63
+ throw new Error('Invalid subfolder path in URL');
64
+ }
65
+ return normalized.replace(/^\/+|\/+$/g, '');
66
+ }
67
+
68
+ function assertSafeRelativePath(filePath) {
69
+ if (!filePath || typeof filePath !== 'string') {
70
+ throw new Error('Invalid file path in input');
71
+ }
72
+
73
+ const normalized = filePath.replace(/\\/g, '/');
74
+ if (
75
+ path.isAbsolute(normalized) ||
76
+ normalized.split('/').some(part => part === '..') ||
77
+ normalized.includes('\0')
78
+ ) {
79
+ throw new Error(`Unsafe file path rejected: ${filePath}`);
80
+ }
81
+
82
+ return normalized.replace(/^\/+/, '');
83
+ }
84
+
85
+ function safeJoin(root, filePath) {
86
+ const safePath = assertSafeRelativePath(filePath);
87
+ const dest = path.resolve(root, safePath);
88
+ const resolvedRoot = path.resolve(root);
89
+ if (dest !== resolvedRoot && !dest.startsWith(resolvedRoot + path.sep)) {
90
+ throw new Error(`Unsafe file path rejected: ${filePath}`);
91
+ }
92
+ return dest;
93
+ }
94
+
95
+ function encodePathForUrl(filePath) {
96
+ return filePath.split('/').map(encodeURIComponent).join('/');
97
+ }
98
+
99
/**
 * Maps `items` through `mapper` with at most `limit` calls in flight at once.
 *
 * Results are returned in input order. If a mapper call throws or rejects, the
 * returned promise rejects with that first error and no further items are
 * started. Every internal promise is observed, so a failing mapper never
 * surfaces as an unhandled rejection.
 *
 * @param {Iterable<*>} items - Values to process.
 * @param {number} limit - Maximum concurrent mapper calls; values that are
 *   not a number >= 1 are treated as 1.
 * @param {function(*): (Promise<*>|*)} mapper - Called with a single item.
 * @returns {Promise<Array<*>>} Mapper results, index-aligned with `items`.
 */
async function mapLimit(items, limit, mapper) {
  const queue = Array.from(items);
  const results = new Array(queue.length);
  const workerCount = Math.min(queue.length, Math.max(1, Math.floor(limit) || 1));
  let nextIndex = 0;
  let failed = false;

  // Each worker repeatedly claims the next unprocessed index until the queue
  // is drained or some worker has failed.
  const worker = async () => {
    while (!failed && nextIndex < queue.length) {
      const index = nextIndex;
      nextIndex += 1;
      try {
        results[index] = await mapper(queue[index]);
      } catch (err) {
        failed = true; // stop the other workers from claiming more items
        throw err;
      }
    }
  };

  // Promise.all attaches a handler to every worker promise, so later failures
  // from in-flight items are still observed after the first rejection.
  await Promise.all(Array.from({ length: workerCount }, () => worker()));
  return results;
}
115
+
116
+ // ── URL Normalisation ───────────────────────────────────────
117
/**
 * Parses a GitHub, GitLab or Bitbucket repository URL into its components.
 *
 * Recognised shapes:
 *   - https://github.com/owner/repo[/tree/<branch>[/sub/folder]]
 *   - https://gitlab.com/group[/subgroup...]/repo[/-/tree/<branch>[/sub/folder]]
 *   - https://bitbucket.org/owner/repo[/src/<branch>[/sub/folder]]
 *
 * A trailing `.git` on the repo name and trailing slashes on the path are ignored.
 *
 * @param {string} rawUrl - URL as entered by the user.
 * @returns {{provider: string, owner: string, repo: string, branch: (string|null), subFolder: (string|null)}}
 * @throws {Error} If the URL contains a parent-directory segment, cannot be
 *   parsed, targets an unsupported host, or lacks an owner/repository name.
 */
function normaliseGitUrl(rawUrl) {
  const unsupported = `Unrecognised URL — only GitHub, GitLab, and Bitbucket are supported (e.g. https://github.com/owner/repo)`;
  if (typeof rawUrl !== 'string') {
    throw new Error(unsupported);
  }

  const url = rawUrl.trim();
  const rawPath = url.split(/[?#]/)[0];
  // Reject parent-directory segments on the raw text, because `new URL()` would
  // silently collapse them. The URL parser treats "..", ".%2e", "%2e." and
  // "%2e%2e" (any case) as the same segment and accepts "\" as a separator,
  // so all of those spellings are checked here.
  if (/(?:^|[\/\\])(?:\.|%2e){2}(?:[\/\\]|$)/i.test(rawPath)) {
    throw new Error('Invalid subfolder path in URL');
  }

  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    throw new Error(unsupported);
  }

  const host = parsed.hostname.toLowerCase();
  // Drop trailing slashes so ".../repo/" parses the same as ".../repo".
  const pathname = parsed.pathname.replace(/\/+$/, '');
  const parts = pathname.replace(/^\//, '').split('/');

  if (host === 'github.com') {
    const owner = parts[0];
    const repo = parts[1]?.replace(/\.git$/, '');
    if (!owner || !repo) {
      throw new Error(`GitHub URL is missing the repository name — use https://github.com/owner/repo`);
    }
    // URL shape: /owner/repo/tree/<branch>[/subfolder/...]
    // parts[2]='tree', parts[3]=branch, parts[4+]=subfolder path
    const isTree = parts[2] === 'tree';
    const branch = isTree ? (parts[3] || null) : null;
    const subFolder = validateSubFolder(isTree && parts.length > 4
      ? parts.slice(4).join('/')
      : null);
    return { provider: 'github', owner, repo, branch, subFolder };
  }

  if (host === 'gitlab.com') {
    const missingRepo = `GitLab URL is missing the repository name — use https://gitlab.com/owner/repo`;
    // URL shape: /group[/subgroup...]/repo/-/tree/<branch>[/subfolder/...]
    const treeMatch = pathname.match(/^(.*?)\/-\/tree\/([^/]+)(\/(.+))?$/);
    if (treeMatch) {
      const repoParts = treeMatch[1].replace(/^\//, '').split('/');
      const repo = repoParts.pop();
      const owner = repoParts.join('/');
      if (!owner || !repo) {
        throw new Error(missingRepo);
      }
      const branch = treeMatch[2];
      const subFolder = validateSubFolder(treeMatch[4] || null);
      return { provider: 'gitlab', owner, repo, branch, subFolder };
    }
    // Plain project URL: the last segment is the repo, everything before it
    // is the (possibly nested) namespace.
    const repo = parts[parts.length - 1].replace(/\.git$/, '');
    const owner = parts.slice(0, -1).join('/');
    if (!owner || !repo) {
      throw new Error(missingRepo);
    }
    return { provider: 'gitlab', owner, repo, branch: null, subFolder: null };
  }

  if (host === 'bitbucket.org') {
    const owner = parts[0];
    const repo = parts[1]?.replace(/\.git$/, '');
    if (!owner || !repo) {
      throw new Error(`Bitbucket URL is missing the repository name — use https://bitbucket.org/owner/repo`);
    }
    // URL shape: /owner/repo/src/<branch>[/subfolder/...]
    const isSrc = parts[2] === 'src';
    const branch = isSrc ? (parts[3] || null) : null;
    const subFolder = validateSubFolder(isSrc && parts.length > 4
      ? parts.slice(4).join('/')
      : null);
    return { provider: 'bitbucket', owner, repo, branch, subFolder };
  }

  throw new Error(unsupported);
}
175
+
176
+ // ── Fetch helpers ───────────────────────────────────────────
177
+
178
+ /**
179
+ * Maps an HTTP error status + URL context into a user-readable message.
180
+ * hasAuth = true when an Authorization / PRIVATE-TOKEN header was sent.
181
+ */
182
+ function classifyHttpError(status, url, hasAuth) {
183
+ const isGitHub = url.includes('api.github.com') || url.includes('raw.githubusercontent.com');
184
+ const isGitLab = url.includes('gitlab.com');
185
+ const isBitbucket = url.includes('bitbucket.org');
186
+ const provider = isGitHub ? 'GitHub'
187
+ : isGitLab ? 'GitLab'
188
+ : isBitbucket ? 'Bitbucket'
189
+ : 'provider';
190
+
191
+ switch (status) {
192
+ case 401:
193
+ return `${provider} authentication required — add an Access Token for private repos`;
194
+
195
+ case 403:
196
+ return hasAuth
197
+ ? `${provider} access denied: token may lack "repo" read scope or is expired`
198
+ : `${provider} access denied: this repo is private or you've hit the rate limit. Add an Access Token to continue`;
199
+
200
+ case 404:
201
+ return hasAuth
202
+ ? `${provider} repo not found: double-check the URL, branch name, or subfolder path`
203
+ : `${provider} repo not found: if it's private, add an Access Token; otherwise verify the URL is correct`;
204
+
205
+ case 409:
206
+ return `${provider} repo is empty: nothing to analyse`;
207
+
208
+ case 422:
209
+ return `${provider} rejected the request, check the URL format or branch name`;
210
+
211
+ case 429:
212
+ return `${provider} rate limit reached, add an Access Token to increase your quota`;
213
+
214
+ default:
215
+ if (status >= 500) return `${provider} server error (${status}) — try again in a moment`;
216
+ return `${provider} returned HTTP ${status} — check the URL and try again`;
217
+ }
218
+ }
219
+
220
+ async function fetchWithTimeout(url, options = {}) {
221
+ const controller = new AbortController();
222
+ const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
223
+ try {
224
+ return await fetch(url, { ...options, signal: controller.signal });
225
+ } catch (err) {
226
+ if (err.name === 'AbortError') {
227
+ throw new Error('Provider request timed out, try again later');
228
+ }
229
+ throw err;
230
+ } finally {
231
+ clearTimeout(timeout);
232
+ }
233
+ }
234
+
235
+ async function fetchJson(url, headers = {}) {
236
+ const res = await fetchWithTimeout(url, { headers });
237
+ if (!res.ok) {
238
+ const hasAuth = !!(headers['Authorization'] || headers['PRIVATE-TOKEN']);
239
+ throw new Error(classifyHttpError(res.status, url, hasAuth));
240
+ }
241
+ return res.json();
242
+ }
243
+
244
+ async function fetchText(url, headers = {}) {
245
+ const res = await fetchWithTimeout(url, { headers });
246
+ if (!res.ok) return null;
247
+ return res.text();
248
+ }
249
+
250
+ // ── Provider: GitHub ────────────────────────────────────────
251
+ async function fetchGitHub({ owner, repo, branch, subFolder }, pat) {
252
+ const headers = { 'User-Agent': 'dockerfile-builder' };
253
+ const token = pat || process.env.GITHUB_TOKEN;
254
+ if (token) headers['Authorization'] = `Bearer ${token}`;
255
+
256
+ let ref = branch;
257
+ if (!ref) {
258
+ const meta = await fetchJson(`https://api.github.com/repos/${owner}/${repo}`, headers);
259
+ ref = meta.default_branch;
260
+ }
261
+
262
+ // Always fetch the tree at repo root — subFolder filtering happens below
263
+ const tree = await fetchJson(
264
+ `https://api.github.com/repos/${owner}/${repo}/git/trees/${ref}?recursive=1`, headers
265
+ );
266
+ if (tree.truncated) {
267
+ throw new Error('GitHub repo tree is too large to analyse via API; use a subfolder URL or ZIP upload');
268
+ }
269
+
270
+ const prefix = subFolder ? subFolder + '/' : '';
271
+ const allBlobs = tree.tree.filter(f => f.type === 'blob').map(f => assertSafeRelativePath(f.path));
272
+
273
+ // If a subfolder was specified, narrow to only those files and strip the prefix
274
+ // so the analyser sees them as if they were at the project root
275
+ const allPaths = prefix
276
+ ? allBlobs.filter(p => p.startsWith(prefix)).map(p => p.slice(prefix.length))
277
+ : allBlobs;
278
+
279
+ const wanted = allPaths.filter(p => shouldFetchContent(p));
280
+ const files = {};
281
+
282
+ await mapLimit(wanted, 10, async (filePath) => {
283
+ const remotePath = prefix + filePath; // full path in the repo
284
+ const content = await fetchText(
285
+ `https://raw.githubusercontent.com/${owner}/${repo}/${encodeURIComponent(ref)}/${encodePathForUrl(remotePath)}`, headers
286
+ );
287
+ if (content !== null) files[filePath] = content;
288
+ });
289
+
290
+ return { files, allPaths };
291
+ }
292
+
293
+ // ── Provider: GitLab ────────────────────────────────────────
294
+ async function fetchGitLab({ owner, repo, branch, subFolder }, pat) {
295
+ const headers = { 'User-Agent': 'dockerfile-builder' };
296
+ const token = pat || process.env.GITLAB_TOKEN;
297
+ if (token) headers['PRIVATE-TOKEN'] = token;
298
+
299
+ const encodedNs = encodeURIComponent(`${owner}/${repo}`);
300
+ const baseUrl = `https://gitlab.com/api/v4/projects/${encodedNs}`;
301
+
302
+ let ref = branch;
303
+ if (!ref) {
304
+ const meta = await fetchJson(baseUrl, headers);
305
+ ref = meta.default_branch;
306
+ }
307
+
308
+ const tree = [];
309
+ for (let page = 1; page <= 50; page++) {
310
+ const batch = await fetchJson(
311
+ `${baseUrl}/repository/tree?recursive=true&ref=${encodeURIComponent(ref)}&per_page=100&page=${page}`,
312
+ headers
313
+ );
314
+ tree.push(...batch);
315
+ if (batch.length < 100 || tree.length >= MAX_INDEXED_PATHS) break;
316
+ }
317
+ if (tree.length >= MAX_INDEXED_PATHS) {
318
+ throw new Error(`GitLab repo tree exceeds ${MAX_INDEXED_PATHS} files; use a subfolder URL to narrow scope`);
319
+ }
320
+
321
+ const prefix = subFolder ? subFolder + '/' : '';
322
+ const allBlobs = tree.filter(f => f.type === 'blob').map(f => assertSafeRelativePath(f.path));
323
+
324
+ const allPaths = prefix
325
+ ? allBlobs.filter(p => p.startsWith(prefix)).map(p => p.slice(prefix.length))
326
+ : allBlobs;
327
+
328
+ const wanted = allPaths.filter(p => shouldFetchContent(p));
329
+ const files = {};
330
+
331
+ await mapLimit(wanted, 10, async (filePath) => {
332
+ const remotePath = prefix + filePath;
333
+ const content = await fetchText(
334
+ `${baseUrl}/repository/files/${encodeURIComponent(remotePath)}/raw?ref=${encodeURIComponent(ref)}`, headers
335
+ );
336
+ if (content !== null) files[filePath] = content;
337
+ });
338
+
339
+ return { files, allPaths };
340
+ }
341
+
342
+ // ── Provider: Bitbucket ─────────────────────────────────────
343
/**
 * Indexes a Bitbucket repository via the 2.0 REST API and downloads the files
 * selected by shouldFetchContent().
 *
 * @param {{owner: string, repo: string, branch: (string|null), subFolder: (string|null)}} info
 *   Parsed repository coordinates.
 * @param {string} [pat] - Access token; falls back to process.env.BITBUCKET_TOKEN.
 * @returns {Promise<{files: Object<string, string>, allPaths: string[]}>}
 *   `files` maps subfolder-relative paths to their text content; `allPaths`
 *   lists every indexed file path (also subfolder-relative).
 * @throws {Error} If a listing request fails or the listing reaches MAX_INDEXED_PATHS entries.
 */
async function fetchBitbucket({ owner, repo, branch, subFolder }, pat) {
  // The src listing returns a single directory level unless max_depth is set.
  // 10 matches the deepest path isBuildSourceFile() accepts.
  // NOTE(review): very large repositories may need a smaller depth — confirm against API limits.
  const LISTING_MAX_DEPTH = 10;

  const headers = { 'User-Agent': 'dockerfile-builder' };
  const token = pat || process.env.BITBUCKET_TOKEN;
  if (token) headers['Authorization'] = `Bearer ${token}`;

  const baseUrl = `https://api.bitbucket.org/2.0/repositories/${owner}/${repo}`;

  let ref = branch;
  if (!ref) {
    const meta = await fetchJson(baseUrl, headers);
    ref = meta.mainbranch?.name || 'main';
  }

  // Encode ref and subfolder the same way the per-file URL below does, so
  // characters such as spaces, '#' or '?' cannot corrupt the request URL.
  const encodedRef = encodeURIComponent(ref);
  const srcPath = subFolder
    ? `${encodedRef}/${encodePathForUrl(subFolder)}/`
    : `${encodedRef}/`;

  const values = [];
  let nextUrl = `${baseUrl}/src/${srcPath}?pagelen=100&max_depth=${LISTING_MAX_DEPTH}&fields=values.path,values.type,next`;
  // Follow the API's `next` links until the listing ends or the index cap is hit.
  while (nextUrl && values.length < MAX_INDEXED_PATHS) {
    const page = await fetchJson(nextUrl, headers);
    values.push(...(page.values || []));
    nextUrl = page.next || null;
  }
  if (values.length >= MAX_INDEXED_PATHS) {
    throw new Error(`Bitbucket repo tree exceeds ${MAX_INDEXED_PATHS} files; use a subfolder URL to narrow scope`);
  }

  const prefix = subFolder ? subFolder + '/' : '';
  const allBlobs = values
    .filter(f => f.type === 'commit_file')
    .map(f => assertSafeRelativePath(f.path));

  // With a subfolder, keep only its files and strip the prefix so downstream
  // code sees them as if they sat at the project root.
  const allPaths = prefix
    ? allBlobs.filter(p => p.startsWith(prefix)).map(p => p.slice(prefix.length))
    : allBlobs;

  const wanted = allPaths.filter(p => shouldFetchContent(p));
  const files = {};

  await mapLimit(wanted, 10, async (filePath) => {
    const remotePath = prefix + filePath; // full path inside the repo
    const content = await fetchText(`${baseUrl}/src/${encodedRef}/${encodePathForUrl(remotePath)}`, headers);
    // fetchText yields null for non-OK responses; such files are simply omitted.
    if (content !== null) files[filePath] = content;
  });

  return { files, allPaths };
}
386
+
387
+ // ── Key file filter ─────────────────────────────────────────
388
+ function isKeyFile(filePath) {
389
+ const base = path.basename(filePath);
390
+ const ext = path.extname(filePath);
391
+ const parts = filePath.split('/');
392
+ const depth = parts.length;
393
+ if (depth > 6) return false;
394
+ // Block if any directory component (not the filename itself) is ignored
395
+ if (parts.slice(0, -1).some(part => IGNORED_SET.has(part))) return false;
396
+ // Root config files (depth 1-2) are fetched so the analyser gets real content
397
+ // Depth 2 catches e.g. packages/tsconfig.base.json in yarn workspaces
398
+ if (depth <= 2 && ROOT_CONFIG_SET.has(base)) return true;
399
+ if (depth <= 2 && TSCONFIG_VARIANT_RE.test(base)) return true;
400
+ return KEY_FILES.includes(base) || KEY_EXTENSIONS.includes(ext);
401
+ }
402
+
403
+ // ── Main: ingestGitRepo ─────────────────────────────────────
404
+ function isBuildSourceFile(filePath) {
405
+ const ext = path.extname(filePath).toLowerCase();
406
+ const parts = filePath.split('/');
407
+ if (parts.length > 10) return false;
408
+ if (!TEXT_SOURCE_EXTENSIONS.has(ext)) return false;
409
+ if (parts.slice(0, -1).some(part => IGNORED_SET.has(part))) return false;
410
+ return parts.slice(0, -1).some(part => SOURCE_DIR_NAMES.has(part));
411
+ }
412
+
413
+ function shouldFetchContent(filePath) {
414
+ return isKeyFile(filePath) || isBuildSourceFile(filePath);
415
+ }
416
+
417
+ async function ingestGitRepo(gitUrl, workDir, pat) {
418
+ const info = normaliseGitUrl(gitUrl);
419
+ console.log(` Provider: ${info.provider} | ${info.owner}/${info.repo}${info.branch ? ` @ ${info.branch}` : ''}`);
420
+
421
+ let result;
422
+ if (info.provider === 'github') result = await fetchGitHub(info, pat);
423
+ else if (info.provider === 'gitlab') result = await fetchGitLab(info, pat);
424
+ else if (info.provider === 'bitbucket') result = await fetchBitbucket(info, pat);
425
+ else throw new Error(`Unsupported provider: ${info.provider}`);
426
+
427
+ const { files, allPaths } = result;
428
+
429
+ if (Object.keys(files).length === 0 && allPaths.length === 0) {
430
+ throw new Error('Could not read repo — check the URL is public, or set a token in .env');
431
+ }
432
+
433
+ // Write fetched files to workDir so the analyser reads them normally
434
+ await fs.ensureDir(workDir);
435
+ for (const [filePath, content] of Object.entries(files)) {
436
+ const dest = safeJoin(workDir, filePath);
437
+ await fs.ensureDir(path.dirname(dest));
438
+ await fs.writeFile(dest, content, 'utf-8');
439
+ }
440
+
441
+ // Write empty placeholders for the rest (analyser detects stacks by filename).
442
+ // Skip paths whose directory components are in IGNORED_SET — never materialise
443
+ // example/test/docs directories so the analyser cannot detect them as services.
444
+ for (const filePath of allPaths) {
445
+ const safePath = assertSafeRelativePath(filePath);
446
+ const parts = safePath.split('/');
447
+ if (parts.slice(0, -1).some(part => IGNORED_SET.has(part))) continue;
448
+ const dest = safeJoin(workDir, safePath);
449
+ if (!(await fs.pathExists(dest))) {
450
+ await fs.ensureDir(path.dirname(dest));
451
+ await fs.writeFile(dest, '');
452
+ }
453
+ }
454
+
455
+ console.log(` Fetched ${Object.keys(files).length} key files, ${allPaths.length} total paths indexed`);
456
+ return workDir;
457
+ }
458
+
459
+ // ── Zip Unpack ──────────────────────────────────────────────
460
/**
 * Extracts a ZIP archive into `workDir`, enforcing path-safety, file-count and
 * uncompressed-size limits.
 *
 * If the archive contains a single top-level directory, that directory's
 * contents are hoisted so they become the contents of `workDir`.
 *
 * @param {string} zipPath - Path of the archive to read.
 * @param {string} workDir - Destination directory (created if missing).
 * @returns {Promise<string>} `workDir`.
 * @throws {Error} If an entry path is unsafe, there are more than
 *   MAX_ZIP_FILES files, or the uncompressed size exceeds
 *   MAX_ZIP_UNCOMPRESSED_BYTES.
 */
async function ingestZip(zipPath, workDir) {
  const AdmZip = require('adm-zip');
  await fs.ensureDir(workDir);
  const zip = new AdmZip(zipPath);
  let fileCount = 0;
  let uncompressedBytes = 0;

  for (const entry of zip.getEntries()) {
    const entryName = assertSafeRelativePath(entry.entryName);
    if (entry.isDirectory) continue;

    // Enforce the limits BEFORE inflating, so an oversized entry is rejected
    // without first being decompressed into memory.
    fileCount += 1;
    if (fileCount > MAX_ZIP_FILES) {
      throw new Error(`ZIP contains too many files; maximum is ${MAX_ZIP_FILES}`);
    }
    const declaredSize = Number(entry.header?.size);
    if (Number.isFinite(declaredSize) && uncompressedBytes + declaredSize > MAX_ZIP_UNCOMPRESSED_BYTES) {
      throw new Error('ZIP uncompressed size is too large');
    }

    // The declared size comes from the archive itself and may be wrong, so the
    // real inflated size is checked as well.
    const data = entry.getData();
    uncompressedBytes += data.byteLength;
    if (uncompressedBytes > MAX_ZIP_UNCOMPRESSED_BYTES) {
      throw new Error('ZIP uncompressed size is too large');
    }

    const dest = safeJoin(workDir, entryName);
    await fs.ensureDir(path.dirname(dest));
    await fs.writeFile(dest, data);
  }

  // Archives often wrap everything in one top-level folder; unwrap it so the
  // project root is workDir itself.
  const rootEntries = await fs.readdir(workDir);
  if (rootEntries.length === 1) {
    const nested = path.join(workDir, rootEntries[0]);
    const stat = await fs.stat(nested);
    if (stat.isDirectory()) {
      const tmp = workDir + '_tmp';
      await fs.move(nested, tmp);
      await fs.remove(workDir);
      await fs.move(tmp, workDir);
    }
  }

  return workDir;
}
501
+
502
+ // ── Pasted File Tree ────────────────────────────────────────
503
+ async function ingestTree(treeText, workDir) {
504
+ await fs.ensureDir(workDir);
505
+ for (const line of treeText.split('\n')) {
506
+ const cleaned = line.replace(/[├└│─]/g, '').replace(/^\s+/, '').trim();
507
+ if (cleaned && !cleaned.endsWith('/')) {
508
+ const filePath = safeJoin(workDir, cleaned);
509
+ await fs.ensureDir(path.dirname(filePath));
510
+ await fs.writeFile(filePath, '');
511
+ }
512
+ }
513
+ return workDir;
514
+ }
515
+
516
+ module.exports = {
517
+ ingestGitRepo,
518
+ ingestZip,
519
+ ingestTree,
520
+ normaliseGitUrl,
521
+ validateSubFolder,
522
+ isKeyFile,
523
+ isBuildSourceFile,
524
+ shouldFetchContent,
525
+ };
@@ -0,0 +1,38 @@
1
+ // backend/src/modules/optimisation/optimiser.js
2
+ // Post-generation pass: check for obvious wins, add notes
3
+
4
/**
 * Post-generation pass that appends optimisation hints for a Dockerfile.
 *
 * Two checks are made, both scoped to the final (runtime) stage:
 *   1. more than three RUN instructions;
 *   2. a base image that is not an alpine/slim/distroless/scratch variant, or
 *      any use of a `:latest` tag in the Dockerfile.
 *
 * @param {{dockerfile: string, improvements?: string[]}} result - Generation
 *   result; it is not mutated.
 * @param {*} analysis - Unused; kept for interface compatibility.
 * @returns {Object} Copy of `result` with an extended `improvements` array.
 */
function optimise(result, analysis) {
  // Copy so the caller's array is never mutated; tolerate a missing list.
  const improvements = [...(result.improvements ?? [])];
  const dockerfile = result.dockerfile;
  const lines = dockerfile.split('\n');

  // FROM [--flag=value ...] <image> [AS <name>]
  const fromPattern = /^FROM\s+(?:--\S+\s+)*(\S+)(?:\s+[Aa][Ss]\s+(\S+))?/;
  const minimalImagePattern = /(?:^|[\/:-])(?:alpine|slim)|distroless|^scratch$/i;

  // Walk every FROM, remembering which real image each stage name resolves to,
  // so a final `FROM base` is judged by the image behind `base`.
  const stageImages = new Map();
  let finalFromIdx = 0;
  let finalImage = null;
  lines.forEach((line, idx) => {
    const match = fromPattern.exec(line.trim());
    if (!match) return;
    const [, image, alias] = match;
    const resolved = stageImages.get(image.toLowerCase()) ?? image;
    if (alias) stageImages.set(alias.toLowerCase(), resolved);
    finalFromIdx = idx;
    finalImage = resolved;
  });

  // Only count RUN instructions in the final stage. Builder-stage RUNs are
  // usually better kept separate because they preserve useful cache boundaries.
  const finalStageRunCount = lines
    .slice(finalFromIdx)
    .filter(line => /^RUN\s/.test(line.trim()))
    .length;
  if (finalStageRunCount > 3) {
    improvements.push(
      `There are ${finalStageRunCount} RUN instructions in the final stage - consider chaining related ones with && to reduce runtime layers`
    );
  }

  // Check: large base image. When the final image is unknown or comes from a
  // build ARG, fall back to scanning the whole Dockerfile text.
  const sizeHintSource = finalImage && !finalImage.includes('$') ? finalImage : dockerfile;
  if (dockerfile.includes(':latest') || !minimalImagePattern.test(sizeHintSource)) {
    improvements.push(
      'Consider using an alpine or slim variant of the base image to reduce final image size'
    );
  }

  return {
    ...result,
    dockerfile,
    improvements,
  };
}
37
+
38
+ module.exports = { optimise };
@@ -0,0 +1,91 @@
1
+ // backend/src/modules/security/security.js
2
+ // Checks for common Dockerfile security issues
3
+
4
+ function hasTaggedImage(image) {
5
+ if (image.includes('@sha256:')) return true;
6
+ const lastSegment = image.split('/').pop() || image;
7
+ return lastSegment.includes(':');
8
+ }
9
+
10
/**
 * Extracts the variable assignments from a single Dockerfile ENV line.
 *
 * Both syntaxes are handled:
 *   - `ENV KEY=value [KEY2="quoted value" ...]`
 *   - `ENV KEY value` (whitespace form: one variable, the rest of the line is
 *     the value)
 *
 * @param {string} line - Trimmed instruction line starting with `ENV`.
 * @returns {Array<{key: string, value: string}>} Assignments with one pair of
 *   surrounding quotes removed from each value.
 */
function parseEnvAssignments(line) {
  const body = line.replace(/^ENV\s+/, '').trim();
  const stripQuotes = (value) => value.replace(/^['"]|['"]$/g, '');

  // Whitespace form: the first token has no '=', so everything after it is the
  // value — even if that value itself contains '=' (e.g. `-Dfoo=bar`).
  const legacy = /^([A-Za-z_][A-Za-z0-9_]*)\s+(\S.*)$/.exec(body);
  if (legacy) {
    return [{ key: legacy[1], value: stripQuotes(legacy[2].trim()) }];
  }

  const assignments = [];
  const pairPattern = /([A-Za-z_][A-Za-z0-9_]*)=("[^"]*"|'[^']*'|[^\s\\]+)/g;
  let match;

  while ((match = pairPattern.exec(body)) !== null) {
    assignments.push({
      key: match[1],
      value: stripQuotes(match[2]),
    });
  }

  return assignments;
}
25
+
26
+ function isSecretLikeEnvKey(key = '') {
27
+ return /(?:TOKEN|SECRET|PASSWORD|PASS|API_KEY|PRIVATE_KEY|ACCESS_KEY|AUTH|CREDENTIAL|DATABASE_URL|DB_URL|MONGO|REDIS_URL|POSTGRES|JWT|ENCRYPTION|SIGNING_KEY|STRIPE)/i.test(key);
28
+ }
29
+
30
+ function isSecretLikeEnvValue(value = '') {
31
+ return /(?:postgres:\/\/|mysql:\/\/|mongodb(?:\+srv)?:\/\/|redis:\/\/|sk_(?:live|test)_|AKIA[0-9A-Z]{16}|-----BEGIN [A-Z ]*PRIVATE KEY-----)/i.test(value);
32
+ }
33
+
34
/**
 * Scans a generated Dockerfile (and optional .dockerignore) for common
 * security issues and returns human-readable notes.
 *
 * @param {{dockerfile: string, dockerignore?: string}} result - Generation result.
 * @param {*} analysis - Unused; kept for interface compatibility.
 * @returns {string[]} One note per finding; empty when nothing is flagged.
 */
function securityPass(result, analysis) {
  const notes = [];
  const dockerfile = result.dockerfile;
  const lines = dockerfile.split('\n');

  // Check 1: Root user
  // nginx handles its own privilege drop internally (master runs as root, workers as nginx).
  // Only skip the USER check when the FINAL runtime stage is an nginx image, not an intermediate builder.
  const fromLines = lines.filter(l => /^FROM\s/i.test(l));
  const lastFrom = fromLines[fromLines.length - 1] || '';
  const usesNginxRuntime = /^FROM nginx:/i.test(lastFrom);
  if (!usesNginxRuntime && !/^USER\s+\S+/m.test(dockerfile)) {
    notes.push('⚠️ SECURITY: No USER instruction found - container will run as root. Add a non-root user.');
  }

  // Check 2: Base image pinning
  if (dockerfile.includes(':latest')) {
    notes.push('⚠️ SECURITY: Avoid using :latest tag - pin to a specific version for reproducibility and security.');
  }
  // Names introduced by earlier `FROM ... AS <name>` lines; a later
  // `FROM <name>` refers to that stage, not to a registry image.
  const stageNames = new Set();
  for (const line of lines) {
    // FROM [--flag=value ...] <image> [AS <name>]
    const match = line.trim().match(/^FROM\s+(?:--\S+\s+)*(\S+)(?:\s+AS\s+(\S+))?$/i);
    if (!match) continue;
    const image = match[1];
    const alias = match[2];
    const refersToStage = stageNames.has(image.toLowerCase());
    if (alias) stageNames.add(alias.toLowerCase());
    // Nothing to pin for the empty `scratch` base, a reference to an earlier
    // stage, or an image supplied through a build ARG.
    if (refersToStage || image.toLowerCase() === 'scratch' || image.includes('$')) continue;
    if (!hasTaggedImage(image)) {
      notes.push(`⚠️ SECURITY: Base image has no tag: "${image}" - pin to a specific version.`);
    }
  }

  // Check 3: Secrets in ENV
  const envLines = lines.filter(l => l.trim().startsWith('ENV '));
  for (const line of envLines) {
    for (const env of parseEnvAssignments(line.trim())) {
      if (isSecretLikeEnvKey(env.key) || isSecretLikeEnvValue(env.value)) {
        notes.push(`⚠️ SECURITY: Possible secret in ENV instruction: "${env.key}" - use runtime secrets or env files instead.`);
      }
    }
  }

  // Check 4: .env not in dockerignore
  if (result.dockerignore && !result.dockerignore.includes('.env')) {
    notes.push('⚠️ SECURITY: Add .env to .dockerignore to prevent accidentally baking secrets into the image.');
  }

  // Check 5: curl/wget piped to shell
  if (/(?:curl|wget)\s+[^|\n]+\|\s*(?:ba)?sh\b/i.test(dockerfile)) {
    notes.push('⚠️ SECURITY: Detected curl | bash pattern - avoid piping remote scripts directly to a shell.');
  }

  // Check 6: ADD vs COPY
  if (dockerfile.includes('\nADD ') && !dockerfile.includes('ADD http')) {
    notes.push('💡 Use COPY instead of ADD unless you need tar auto-extraction or remote URL support. COPY is more explicit.');
  }

  return notes;
}
86
+
87
+ module.exports = {
88
+ securityPass,
89
+ isSecretLikeEnvKey,
90
+ isSecretLikeEnvValue,
91
+ };