archbyte 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/archbyte.js CHANGED
@@ -22,6 +22,11 @@ import { handleVersion, handleUpdate } from '../dist/cli/version.js';
22
22
  import { requireLicense } from '../dist/cli/license-gate.js';
23
23
  import { DEFAULT_PORT } from '../dist/cli/constants.js';
24
24
 
25
+ // When spawned by `archbyte serve` (internal), skip interactive license checks.
26
+ // The user already authenticated when they started the server.
27
+ const isInternal = process.env.ARCHBYTE_INTERNAL === '1';
28
+ const gate = isInternal ? async () => {} : requireLicense;
29
+
25
30
  const require = createRequire(import.meta.url);
26
31
  const { version: PKG_VERSION } = require('../package.json');
27
32
 
@@ -93,6 +98,7 @@ program
93
98
  .option('-v, --verbose', 'Show detailed output')
94
99
  .option('--force', 'Force full re-scan (skip incremental detection)')
95
100
  .option('--dry-run', 'Preview without running')
101
+ .option('--debug', 'Show transparency report (what data is collected and sent)')
96
102
  .action(async (options) => {
97
103
  // handleRun manages login + setup + requireLicense internally
98
104
  await handleRun(options);
@@ -110,8 +116,9 @@ program
110
116
  .option('--skip-llm', 'Alias for --static')
111
117
  .option('--force', 'Force full re-scan (skip incremental detection)')
112
118
  .option('--dry-run', 'Preview without running')
119
+ .option('--debug', 'Show transparency report (what data is collected and sent)')
113
120
  .action(async (options) => {
114
- await requireLicense('analyze');
121
+ await gate('analyze');
115
122
  await handleAnalyze(options);
116
123
  });
117
124
 
@@ -122,7 +129,7 @@ program
122
129
  .option('-o, --output <path>', 'Output diagram (default: .archbyte/architecture.json)')
123
130
  .option('-v, --verbose', 'Show detailed output')
124
131
  .action(async (options) => {
125
- await requireLicense('generate');
132
+ await gate('generate');
126
133
  await handleGenerate(options);
127
134
  });
128
135
 
@@ -131,6 +138,7 @@ program
131
138
  .description('Start the visualization UI server')
132
139
  .option('-p, --port <number>', `Server port (default: ${DEFAULT_PORT})`, parseInt)
133
140
  .option('-d, --diagram <path>', 'Path to architecture JSON (default: .archbyte/architecture.json)')
141
+ .option('--debug', 'Enable transparency endpoint (/api/transparency)')
134
142
  .action(async (options) => {
135
143
  await handleServe(options);
136
144
  });
@@ -33,13 +33,13 @@ export const serviceDescriber = {
33
33
  parts.push(`Detected language: ${ctx.structure.language}`);
34
34
  parts.push(`Languages: ${ctx.structure.languages.join(", ") || "none"}`);
35
35
  parts.push(`Framework: ${ctx.structure.framework ?? "none"}`);
36
- // Docs
36
+ // Docs — only project description, NOT externalDependencies.
37
+ // Doc-extracted dependency mentions prime the LLM to hallucinate phantom services
38
+ // (e.g., docs mention "MCP" → LLM creates "MCP Server" component).
39
+ // The LLM should discover services from actual code evidence only.
37
40
  if (ctx.docs.projectDescription) {
38
41
  parts.push(`\nFrom docs: ${ctx.docs.projectDescription}`);
39
42
  }
40
- if (ctx.docs.externalDependencies.length > 0) {
41
- parts.push(`\nExternal dependencies mentioned: ${ctx.docs.externalDependencies.join(", ")}`);
42
- }
43
43
  // Docker services — only include if infra/config files changed (or full scan)
44
44
  if (ctx.infra.docker.composeFile && (hasInfraChanges || hasConfigChanges)) {
45
45
  const svcInfo = ctx.infra.docker.services.map((s) => {
@@ -6,7 +6,7 @@ import type { IncrementalContext } from "./types.js";
6
6
  * Run the multi-agent pipeline: 3 parallel fast agents → 2 sequential agents.
7
7
  * Each agent gets a single chat() call with pre-collected static context.
8
8
  */
9
- export declare function runPipeline(ctx: StaticContext, provider: LLMProvider, config: ArchByteConfig, onProgress?: (msg: string) => void, incrementalContext?: IncrementalContext): Promise<StaticAnalysisResult & {
9
+ export declare function runPipeline(ctx: StaticContext, provider: LLMProvider, config: ArchByteConfig, onProgress?: (msg: string) => void, incrementalContext?: IncrementalContext, onDebug?: (agentId: string, model: string, system: string, user: string) => void): Promise<StaticAnalysisResult & {
10
10
  tokenUsage?: {
11
11
  input: number;
12
12
  output: number;
@@ -92,7 +92,7 @@ function getFallbackData(agentId, inc) {
92
92
  * Run the multi-agent pipeline: 3 parallel fast agents → 2 sequential agents.
93
93
  * Each agent gets a single chat() call with pre-collected static context.
94
94
  */
95
- export async function runPipeline(ctx, provider, config, onProgress, incrementalContext) {
95
+ export async function runPipeline(ctx, provider, config, onProgress, incrementalContext, onDebug) {
96
96
  const agentResults = {};
97
97
  const agentMeta = [];
98
98
  const skippedAgents = [];
@@ -118,7 +118,7 @@ export async function runPipeline(ctx, provider, config, onProgress, incremental
118
118
  agentResults[agent.id] = fallback;
119
119
  return Promise.resolve(null);
120
120
  }
121
- return runAgent(agent, ctx, provider, config, parallelPrior, onProgress);
121
+ return runAgent(agent, ctx, provider, config, parallelPrior, onProgress, onDebug);
122
122
  }));
123
123
  let authFailed = false;
124
124
  for (let i = 0; i < parallelTasks.length; i++) {
@@ -156,7 +156,7 @@ export async function runPipeline(ctx, provider, config, onProgress, incremental
156
156
  continue;
157
157
  }
158
158
  try {
159
- const result = await runAgent(agent, ctx, provider, config, agentResults, onProgress);
159
+ const result = await runAgent(agent, ctx, provider, config, agentResults, onProgress, onDebug);
160
160
  if (result) {
161
161
  agentResults[agent.id] = result.data;
162
162
  agentMeta.push(result);
@@ -214,10 +214,12 @@ const MAX_TOKENS = {
214
214
  "flow-detector": 4096,
215
215
  "validator": 4096,
216
216
  };
217
- async function runAgent(agent, ctx, provider, config, priorResults, onProgress) {
217
+ async function runAgent(agent, ctx, provider, config, priorResults, onProgress, onDebug) {
218
218
  const start = Date.now();
219
219
  const model = resolveModel(config.provider, agent.modelTier, config.modelOverrides, config.model);
220
220
  const { system, user } = agent.buildPrompt(ctx, priorResults);
221
+ // Debug callback — report what data is being sent
222
+ onDebug?.(agent.id, model, system, user);
221
223
  onProgress?.(` ${agent.name}: calling ${model}...`);
222
224
  const maxTokens = MAX_TOKENS[agent.id] ?? 4096;
223
225
  const response = await provider.chat({
@@ -1,5 +1,6 @@
1
1
  // Pipeline — Merger
2
2
  // Assembles all agent outputs into a StaticAnalysisResult
3
+ import { categorizeDep } from "../static/taxonomy.js";
3
4
  function sanitize(s) {
4
5
  if (!s)
5
6
  return s;
@@ -9,21 +10,24 @@ function sanitize(s) {
9
10
  * Build a set of "evidence tokens" from the static context — things that concretely
10
11
  * exist in the codebase (dependencies, env vars, docker images/services).
11
12
  * Used to gate LLM-generated databases/external services against hallucination.
13
+ *
14
+ * Uses the package taxonomy to resolve package names to their display names
15
+ * (e.g., "pg" → also adds "postgresql", "stripe" → also adds "stripe").
16
+ * This lets the LLM use human-readable names while still requiring code evidence.
12
17
  */
13
18
  function buildEvidenceTokens(ctx) {
14
19
  const tokens = new Set();
20
+ /** Add a dependency name + its taxonomy display name as tokens. */
21
+ function addDep(dep) {
22
+ tokens.add(dep.toLowerCase());
23
+ const cat = categorizeDep(dep);
24
+ if (cat)
25
+ tokens.add(cat.displayName.toLowerCase());
26
+ }
15
27
  // Package dependencies from import map (codeSamples.importMap: file → imported modules)
16
28
  for (const imports of Object.values(ctx.codeSamples.importMap)) {
17
- for (const imp of imports) {
18
- tokens.add(imp.toLowerCase());
19
- // Also add short name for scoped packages: @aws-sdk/client-s3 → client-s3, aws-sdk
20
- if (imp.startsWith("@")) {
21
- const parts = imp.split("/");
22
- if (parts[1])
23
- tokens.add(parts[1].toLowerCase());
24
- tokens.add(parts[0].slice(1).toLowerCase());
25
- }
26
- }
29
+ for (const imp of imports)
30
+ addDep(imp);
27
31
  }
28
32
  // Config files may contain dependency info (package.json deps etc.)
29
33
  for (const cfg of ctx.codeSamples.configFiles) {
@@ -31,13 +35,7 @@ function buildEvidenceTokens(ctx) {
31
35
  try {
32
36
  const pkg = JSON.parse(cfg.content);
33
37
  for (const dep of Object.keys({ ...pkg.dependencies, ...pkg.devDependencies })) {
34
- tokens.add(dep.toLowerCase());
35
- if (dep.startsWith("@")) {
36
- const parts = dep.split("/");
37
- if (parts[1])
38
- tokens.add(parts[1].toLowerCase());
39
- tokens.add(parts[0].slice(1).toLowerCase());
40
- }
38
+ addDep(dep);
41
39
  }
42
40
  }
43
41
  catch { /* ignore parse errors */ }
@@ -59,35 +57,29 @@ function buildEvidenceTokens(ctx) {
59
57
  for (const s of ctx.infra.cloud.services) {
60
58
  tokens.add(s.toLowerCase());
61
59
  }
62
- // External dependencies mentioned in docs
63
- for (const dep of ctx.docs.externalDependencies) {
64
- tokens.add(dep.toLowerCase());
65
- }
60
+ // NOTE: ctx.docs.externalDependencies intentionally excluded.
61
+ // Doc mentions (from markdown/README) are not concrete code evidence and cause
62
+ // hallucination — the LLM sees "MCP" in docs and creates phantom components.
63
+ // Only code-level signals (imports, deps, env vars, Docker, cloud) count.
66
64
  return tokens;
67
65
  }
68
66
  /**
69
- * Check if a service/database ID and type have concrete evidence in the static context.
70
- * Uses fuzzy matching: checks if any evidence token contains or is contained by the service keywords.
67
+ * Check if a service/database has concrete evidence in the static context.
68
+ * Strict exact-match only — no substring/regex fuzzy matching.
69
+ * The taxonomy enriches evidence tokens with display names (pg → PostgreSQL)
70
+ * so the LLM can use human-readable names and still match.
71
71
  */
72
72
  function hasEvidence(id, name, type, evidenceTokens) {
73
- // Build candidate keywords from the service
74
73
  const candidates = [
75
74
  id.toLowerCase(),
76
75
  name.toLowerCase(),
77
76
  type.toLowerCase(),
78
- // Split hyphenated IDs: "aws-sqs" → ["aws", "sqs"]
77
+ // Split hyphenated IDs: "aws-sqs" → also check "aws", "sqs"
79
78
  ...id.toLowerCase().split("-"),
80
79
  ].filter(Boolean);
81
80
  for (const candidate of candidates) {
82
- for (const token of evidenceTokens) {
83
- // Direct match or substring match (in both directions)
84
- if (token === candidate)
85
- return true;
86
- if (token.includes(candidate) && candidate.length >= 3)
87
- return true;
88
- if (candidate.includes(token) && token.length >= 3)
89
- return true;
90
- }
81
+ if (evidenceTokens.has(candidate))
82
+ return true;
91
83
  }
92
84
  return false;
93
85
  }
@@ -0,0 +1,12 @@
1
+ export interface IgnoreFilter {
2
+ /** Returns true if the relative path should be excluded from analysis */
3
+ isIgnored(relativePath: string): boolean;
4
+ /** Number of active patterns (excluding comments and blank lines) */
5
+ patternCount: number;
6
+ }
7
+ /**
8
+ * Load `.archbyteignore` from the project root.
9
+ * Returns an IgnoreFilter that matches paths against the patterns.
10
+ * If the file doesn't exist, returns a no-op filter that ignores nothing.
11
+ */
12
+ export declare function loadIgnoreFile(projectRoot: string): IgnoreFilter;
@@ -0,0 +1,140 @@
1
+ // .archbyteignore — File exclusion filter
2
+ // Supports .gitignore-style patterns: # comments, ! negation, ** globstar, * wildcard
3
+ import * as fs from "fs";
4
+ import * as path from "path";
5
+ /**
6
+ * Load `.archbyteignore` from the project root.
7
+ * Returns an IgnoreFilter that matches paths against the patterns.
8
+ * If the file doesn't exist, returns a no-op filter that ignores nothing.
9
+ */
10
+ export function loadIgnoreFile(projectRoot) {
11
+ const ignorePath = path.join(projectRoot, ".archbyteignore");
12
+ if (!fs.existsSync(ignorePath)) {
13
+ return { isIgnored: () => false, patternCount: 0 };
14
+ }
15
+ const content = fs.readFileSync(ignorePath, "utf-8");
16
+ const rules = parseIgnorePatterns(content);
17
+ return {
18
+ isIgnored(relativePath) {
19
+ // Normalize path separators
20
+ const normalized = relativePath.replace(/\\/g, "/").replace(/^\//, "");
21
+ let ignored = false;
22
+ for (const rule of rules) {
23
+ if (rule.pattern.test(normalized)) {
24
+ ignored = !rule.negated;
25
+ }
26
+ }
27
+ return ignored;
28
+ },
29
+ patternCount: rules.length,
30
+ };
31
+ }
32
+ /**
33
+ * Parse .gitignore-style content into an ordered list of rules.
34
+ */
35
+ function parseIgnorePatterns(content) {
36
+ const rules = [];
37
+ for (const rawLine of content.split("\n")) {
38
+ const line = rawLine.trim();
39
+ // Skip blank lines and comments
40
+ if (!line || line.startsWith("#"))
41
+ continue;
42
+ let pattern = line;
43
+ let negated = false;
44
+ // Handle negation
45
+ if (pattern.startsWith("!")) {
46
+ negated = true;
47
+ pattern = pattern.slice(1);
48
+ }
49
+ // Remove trailing spaces (unless escaped)
50
+ pattern = pattern.replace(/(?<!\\)\s+$/, "");
51
+ if (!pattern)
52
+ continue;
53
+ const regex = patternToRegex(pattern);
54
+ rules.push({ pattern: regex, negated });
55
+ }
56
+ return rules;
57
+ }
58
+ /**
59
+ * Convert a .gitignore-style pattern to a RegExp.
60
+ * Supports: * (any non-slash), ** (any including slashes), ? (single char),
61
+ * trailing / (directory match), leading / (root-anchored).
62
+ */
63
+ function patternToRegex(pattern) {
64
+ let anchored = false;
65
+ // Leading / means anchored to root
66
+ if (pattern.startsWith("/")) {
67
+ anchored = true;
68
+ pattern = pattern.slice(1);
69
+ }
70
+ // Trailing / means match directories — for our purposes, match the prefix
71
+ const dirOnly = pattern.endsWith("/");
72
+ if (dirOnly) {
73
+ pattern = pattern.slice(0, -1);
74
+ }
75
+ // Escape regex special chars, then convert glob patterns
76
+ let regex = "";
77
+ let i = 0;
78
+ while (i < pattern.length) {
79
+ const ch = pattern[i];
80
+ const next = pattern[i + 1];
81
+ if (ch === "*" && next === "*") {
82
+ // ** — match anything including path separators
83
+ if (pattern[i + 2] === "/") {
84
+ // **/ — match zero or more directories
85
+ regex += "(?:.*/)?";
86
+ i += 3;
87
+ }
88
+ else {
89
+ // ** at end or before non-slash
90
+ regex += ".*";
91
+ i += 2;
92
+ }
93
+ }
94
+ else if (ch === "*") {
95
+ // * — match anything except /
96
+ regex += "[^/]*";
97
+ i++;
98
+ }
99
+ else if (ch === "?") {
100
+ // ? — match single non-slash char
101
+ regex += "[^/]";
102
+ i++;
103
+ }
104
+ else if (ch === "[") {
105
+ // Character class — pass through until ]
106
+ const closeBracket = pattern.indexOf("]", i + 1);
107
+ if (closeBracket !== -1) {
108
+ regex += pattern.slice(i, closeBracket + 1);
109
+ i = closeBracket + 1;
110
+ }
111
+ else {
112
+ regex += escapeRegex(ch);
113
+ i++;
114
+ }
115
+ }
116
+ else {
117
+ regex += escapeRegex(ch);
118
+ i++;
119
+ }
120
+ }
121
+ if (dirOnly) {
122
+ // Match the directory itself or anything under it
123
+ regex += "(?:/.*)?";
124
+ }
125
+ if (anchored) {
126
+ // Must match from the start
127
+ return new RegExp(`^${regex}$`);
128
+ }
129
+ // Unanchored: match if the pattern matches the full path
130
+ // or any suffix after a /
131
+ // If pattern contains /, it's implicitly anchored
132
+ if (pattern.includes("/")) {
133
+ return new RegExp(`^${regex}$`);
134
+ }
135
+ // No slash: match against the basename OR any path segment
136
+ return new RegExp(`(?:^|/)${regex}(?:/.*)?$`);
137
+ }
138
+ function escapeRegex(ch) {
139
+ return ch.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
140
+ }
@@ -1,4 +1,5 @@
1
1
  import type { StaticAnalysisResult, StaticContext } from "./types.js";
2
+ import type { PrivacyConfig } from "../../cli/yaml-io.js";
2
3
  export type { StaticAnalysisResult, StaticContext } from "./types.js";
3
4
  export { validateAnalysis } from "./validator.js";
4
5
  /**
@@ -16,4 +17,4 @@ export declare function runStaticAnalysis(projectRoot: string, onProgress?: (msg
16
17
  * This runs ONLY fact-collectors (no component-detector, connection-mapper, or validator).
17
18
  * Output is consumed by the pipeline LLM agents.
18
19
  */
19
- export declare function runStaticContextCollection(projectRoot: string, onProgress?: (msg: string) => void): Promise<StaticContext>;
20
+ export declare function runStaticContextCollection(projectRoot: string, onProgress?: (msg: string) => void, privacy?: Required<PrivacyConfig>): Promise<StaticContext>;
@@ -11,6 +11,8 @@ import { mapConnections } from "./connection-mapper.js";
11
11
  import { validateAnalysis } from "./validator.js";
12
12
  import { collectFileTree } from "./file-tree-collector.js";
13
13
  import { collectCodeSamples } from "./code-sampler.js";
14
+ import { loadIgnoreFile } from "./ignore.js";
15
+ import { redactContext } from "./redactor.js";
14
16
  export { validateAnalysis } from "./validator.js";
15
17
  /**
16
18
  * Run all static analysis scanners.
@@ -22,7 +24,11 @@ export { validateAnalysis } from "./validator.js";
22
24
  * 4. Gap detection — identify what the LLM should resolve
23
25
  */
24
26
  export async function runStaticAnalysis(projectRoot, onProgress) {
25
- const tk = new StaticToolkit(projectRoot);
27
+ const ignoreFilter = loadIgnoreFile(projectRoot);
28
+ if (ignoreFilter.patternCount > 0) {
29
+ onProgress?.(`Loaded .archbyteignore: ${ignoreFilter.patternCount} pattern(s)`);
30
+ }
31
+ const tk = new StaticToolkit(projectRoot, ignoreFilter);
26
32
  // Phase 1: parallel scanners (no dependencies)
27
33
  onProgress?.("Running parallel scanners...");
28
34
  const [structure, docs, infra, events, envs] = await Promise.all([
@@ -292,8 +298,12 @@ async function collectGaps(analysis, tk) {
292
298
  * This runs ONLY fact-collectors (no component-detector, connection-mapper, or validator).
293
299
  * Output is consumed by the pipeline LLM agents.
294
300
  */
295
- export async function runStaticContextCollection(projectRoot, onProgress) {
296
- const tk = new StaticToolkit(projectRoot);
301
+ export async function runStaticContextCollection(projectRoot, onProgress, privacy) {
302
+ const ignoreFilter = loadIgnoreFile(projectRoot);
303
+ if (ignoreFilter.patternCount > 0) {
304
+ onProgress?.(`Loaded .archbyteignore: ${ignoreFilter.patternCount} pattern(s)`);
305
+ }
306
+ const tk = new StaticToolkit(projectRoot, ignoreFilter);
297
307
  onProgress?.("Collecting static context (7 scanners in parallel)...");
298
308
  const [structure, docs, infra, events, envs, fileTree, codeSamples] = await Promise.all([
299
309
  scanStructure(tk),
@@ -306,5 +316,43 @@ export async function runStaticContextCollection(projectRoot, onProgress) {
306
316
  ]);
307
317
  onProgress?.(`Context: ${fileTree.totalFiles} files, ${fileTree.totalDirs} dirs, ${codeSamples.configFiles.length} configs, ${codeSamples.samples.length} samples`);
308
318
  onProgress?.(`Detected: ${structure.language}, ${structure.framework ?? "no framework"}, monorepo=${structure.isMonorepo}`);
309
- return { structure, docs, infra, events, envs, fileTree, codeSamples };
319
+ let ctx = { structure, docs, infra, events, envs, fileTree, codeSamples };
320
+ // Apply privacy controls — zero out disabled fields
321
+ if (privacy) {
322
+ if (!privacy.sendCodeSamples) {
323
+ ctx.codeSamples = { ...ctx.codeSamples, samples: [] };
324
+ onProgress?.("Privacy: code samples excluded");
325
+ }
326
+ if (!privacy.sendImportMap) {
327
+ ctx.codeSamples = { ...ctx.codeSamples, importMap: {} };
328
+ onProgress?.("Privacy: import map excluded");
329
+ }
330
+ if (!privacy.sendEnvNames) {
331
+ ctx.envs = { ...ctx.envs, environments: ctx.envs.environments.map((e) => ({ ...e, variables: [] })) };
332
+ onProgress?.("Privacy: env variable names excluded");
333
+ }
334
+ if (!privacy.sendDocs) {
335
+ ctx.docs = { projectDescription: "", architectureNotes: [], apiEndpoints: [], externalDependencies: [] };
336
+ onProgress?.("Privacy: documentation excluded");
337
+ }
338
+ if (!privacy.sendFileTree) {
339
+ ctx.fileTree = { tree: [], totalFiles: ctx.fileTree.totalFiles, totalDirs: ctx.fileTree.totalDirs };
340
+ onProgress?.("Privacy: file tree excluded");
341
+ }
342
+ if (!privacy.sendInfra) {
343
+ ctx.infra = {
344
+ docker: { services: [], composeFile: false },
345
+ kubernetes: { resources: [] },
346
+ cloud: { provider: null, services: [], iac: null },
347
+ ci: { platform: null, pipelines: [] },
348
+ };
349
+ onProgress?.("Privacy: infrastructure details excluded");
350
+ }
351
+ // Redaction — hash identifiers before returning
352
+ if (privacy.redact) {
353
+ ctx = redactContext(ctx);
354
+ onProgress?.("Privacy: redaction applied — identifiers hashed");
355
+ }
356
+ }
357
+ return ctx;
310
358
  }
@@ -0,0 +1,12 @@
1
+ import type { StaticContext } from "./types.js";
2
+ /**
3
+ * Redact sensitive identifiers in a StaticContext.
4
+ * - File paths: hash each segment, preserve extensions and depth
5
+ * - Env var names: hash
6
+ * - Docker service names: hash
7
+ * - String literals in code samples: hash
8
+ * - Preserve: npm package names, language keywords, structural info
9
+ *
10
+ * Returns a deep copy — the original context is not modified.
11
+ */
12
+ export declare function redactContext(ctx: StaticContext): StaticContext;