jaku.sh 1.0.2 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,184 @@
1
+ import { isPassive } from '../../utils/safety.js';
2
+ import { OpenAIProvider } from './providers/openai-provider.js';
3
+ import { AnthropicProvider } from './providers/anthropic-provider.js';
4
+ import { NullProvider } from './providers/null-provider.js';
5
+
6
+ /**
7
+ * LLMClient — the single facade every consumer touches.
8
+ *
9
+ * Responsibilities:
10
+ * - Provider selection (openai | anthropic | null)
11
+ * - API key injection FROM ENV ONLY (never stored in config)
12
+ * - Enforce enablement + consent + safety-mode gating (egress disabled in passive)
13
+ * - Per-scan token + call budget, per-call timeout (AbortSignal.timeout)
14
+ * - Retry/backoff on 429, circuit-breaker on connection failure
15
+ * - Returns null whenever disabled/unavailable/over-budget so callers degrade
16
+ *
17
+ * Hard rule: ask() NEVER throws to callers and NEVER returns secrets. If the LLM
18
+ * is not usable for any reason, it returns null and the caller keeps its
19
+ * deterministic behavior.
20
+ */
21
+
22
/** Environment variable that carries the API key for each supported provider. */
const PROVIDER_ENV = {
  openai: 'OPENAI_API_KEY',
  anthropic: 'ANTHROPIC_API_KEY',
};

/** Model used for each provider when `llm.model` is not configured. */
const DEFAULT_MODELS = {
  openai: 'gpt-4o-mini',
  anthropic: 'claude-3-5-haiku-latest',
};

/**
 * Resolve the runtime LLM state from config + env. The returned `apiKey` (if
 * any) is for immediate provider construction only and is never persisted.
 *
 * Gates are evaluated in this order, and the first failing gate supplies the
 * `reason`: llm.enabled → passive safety mode → known provider → llm.consent
 * → API key present in the provider's env var.
 *
 * @param {object} [config] - scan config; only `config.llm` is read here
 *   (`isPassive` receives the whole config).
 * @returns {{providerName: string, model: (string|null), envVar: (string|undefined),
 *   enabled: boolean, reason: string, apiKey?: string}}
 *   `apiKey` is present only when `enabled` is true.
 */
function resolveLLMRuntime(config) {
  const llm = (config && config.llm) || {};
  const providerName = llm.provider || 'openai';

  // Own-property lookups only. A plain `PROVIDER_ENV[providerName]` would
  // resolve inherited keys such as "constructor" or "toString" to
  // Object.prototype members, letting a bogus provider slip past the
  // unknown-provider gate and leaking a function into `model` / `envVar`.
  const envVar = Object.hasOwn(PROVIDER_ENV, providerName)
    ? PROVIDER_ENV[providerName]
    : undefined;
  const defaultModel = Object.hasOwn(DEFAULT_MODELS, providerName)
    ? DEFAULT_MODELS[providerName]
    : null;
  const model = llm.model || defaultModel || null;
  const apiKey = envVar ? process.env[envVar] : null;

  const base = { providerName, model, envVar };

  if (!llm.enabled) {
    return { ...base, enabled: false, reason: 'not enabled (set llm.enabled or pass --llm)' };
  }
  if (isPassive(config)) {
    return { ...base, enabled: false, reason: 'disabled in passive safety mode (no third-party egress)' };
  }
  if (!envVar) {
    return { ...base, enabled: false, reason: `unknown provider "${providerName}" (use openai|anthropic)` };
  }
  if (!llm.consent) {
    return { ...base, enabled: false, reason: 'no consent (set llm.consent=true or pass --llm-consent)' };
  }
  if (!apiKey) {
    return { ...base, enabled: false, reason: `no API key in env ${envVar}` };
  }
  return { ...base, enabled: true, reason: 'active', apiKey };
}
62
+
63
/**
 * Build the provider adapter matching `name`.
 *
 * @param {string} name - 'openai' or 'anthropic'; any other value yields a NullProvider.
 * @param {object} opts - passed unchanged to the adapter constructor.
 * @returns {object} a new provider instance.
 */
function createProvider(name, opts) {
  switch (name) {
    case 'openai':
      return new OpenAIProvider(opts);
    case 'anthropic':
      return new AnthropicProvider(opts);
    default:
      return new NullProvider(opts);
  }
}
68
+
69
/**
 * Report whether an error carries one of the connection-failure codes the
 * client treats as "provider unreachable". The code on `err.cause` takes
 * precedence over the code on `err` itself.
 *
 * @param {*} err - caught error (may be null/undefined).
 * @returns {boolean}
 */
function isConnectionError(err) {
  const connectionCodes = new Set([
    'ECONNREFUSED',
    'ENOTFOUND',
    'EAI_AGAIN',
    'ECONNRESET',
    'UND_ERR_CONNECT_TIMEOUT',
  ]);
  const nestedCode = err?.cause?.code;
  const effectiveCode = nestedCode || err?.code;
  return connectionCodes.has(effectiveCode);
}
73
+
74
/**
 * LLMClient — gates, budgets and dispatches LLM calls for a single scan.
 *
 * Construction resolves enablement through resolveLLMRuntime(); when the LLM
 * is not usable the client holds a NullProvider and ask() short-circuits to
 * null so callers keep their deterministic behavior.
 */
export class LLMClient {
  /**
   * @param {object} [config] - scan config; `config.llm` supplies `max_calls`,
   *   `max_tokens`, `token_budget`, `timeout_seconds` and `base_url`.
   *   Non-finite or missing numeric values fall back to 50 / 1024 / 100000 / 30.
   * @param {object} [logger] - optional logger; `warn`, `debug` are called if present.
   */
  constructor(config, logger) {
    this.config = config || {};
    this.logger = logger || null;

    const llm = this.config.llm || {};
    this._maxCalls = Number.isFinite(llm.max_calls) ? llm.max_calls : 50;
    this._perCallTokens = Number.isFinite(llm.max_tokens) ? llm.max_tokens : 1024;
    this._tokenBudget = Number.isFinite(llm.token_budget) ? llm.token_budget : 100000;
    this._timeoutMs = (Number.isFinite(llm.timeout_seconds) ? llm.timeout_seconds : 30) * 1000;

    this._calls = 0;
    this._tokensUsed = 0;
    this._circuitOpen = false;
    this._warnedBudget = false;

    const runtime = resolveLLMRuntime(this.config);
    this.enabled = runtime.enabled;
    this.reason = runtime.reason;
    this.providerName = runtime.providerName;
    this.model = runtime.model;

    if (this.enabled) {
      this.provider = createProvider(runtime.providerName, {
        apiKey: runtime.apiKey,
        model: runtime.model,
        baseUrl: llm.base_url || null,
        logger,
      });
      // One-line consent/egress warning. Never logs the key.
      this.logger?.warn?.(
        `[LLM] Augmentation ENABLED via ${runtime.providerName}/${runtime.model}. ` +
        `Minimal finding/target data may be sent to a third-party API. ` +
        `Disable by removing --llm or setting llm.enabled=false.`
      );
    } else {
      this.provider = new NullProvider({ logger });
    }
  }

  /** True if a real provider is active and the circuit breaker has not tripped. */
  isEnabled() {
    return this.enabled && !this._circuitOpen;
  }

  /**
   * Human-readable one-line status (no secrets).
   * @param {object} [config]
   * @returns {string}
   */
  static describe(config) {
    const r = resolveLLMRuntime(config);
    if (r.enabled) return `enabled (${r.providerName}/${r.model})`;
    return `disabled — ${r.reason}`;
  }

  /**
   * Per-scan usage snapshot (for logging, never includes keys).
   * @returns {{calls: number, tokensUsed: number, circuitOpen: boolean}}
   */
  usage() {
    return { calls: this._calls, tokensUsed: this._tokensUsed, circuitOpen: this._circuitOpen };
  }

  /**
   * Ask the LLM. Returns the completion text, or null on any disablement /
   * budget exhaustion / error. Never throws.
   *
   * The call counter is incremented once per ask() before the first attempt,
   * so a failed call still consumes call budget. On HTTP 429 the request is
   * retried up to two more times with 1s then 2s backoff. A connection-level
   * failure opens the circuit for the rest of the scan.
   *
   * @param {object} [request]
   * @param {string} [request.system] - system instruction
   * @param {string} request.prompt - user prompt; an empty/missing prompt returns null
   * @param {number} [request.maxTokens] - requested output cap, clamped to the per-call limit
   * @param {number} [request.temperature=0]
   * @returns {Promise<string|null>}
   */
  async ask(request = {}) {
    // Destructure here rather than in the parameter list: parameter defaults
    // only apply to `undefined`, so ask(null) used to reject with a TypeError,
    // breaking the never-throws contract.
    const { system, prompt, maxTokens, temperature = 0 } = request ?? {};

    if (!this.enabled || this._circuitOpen || !prompt) return null;

    if (this._calls >= this._maxCalls || this._tokensUsed >= this._tokenBudget) {
      if (!this._warnedBudget) {
        this._warnedBudget = true;
        this.logger?.debug?.('[LLM] budget exhausted — further augmentation skipped');
      }
      return null;
    }

    const outTokens = Math.min(maxTokens || this._perCallTokens, this._perCallTokens);
    this._calls++;

    const maxRetries = 2;
    for (let attempt = 0; attempt <= maxRetries; attempt++) {
      try {
        const res = await this.provider.complete({
          system,
          prompt,
          maxTokens: outTokens,
          temperature,
          signal: AbortSignal.timeout(this._timeoutMs),
        });
        // Fall back to the requested cap when the provider reports no usage.
        this._tokensUsed += res?.usage?.total_tokens || outTokens;
        return res?.text ?? null;
      } catch (err) {
        if (err?.status === 429 && attempt < maxRetries) {
          const backoff = 1000 * Math.pow(2, attempt);
          await new Promise((resolve) => setTimeout(resolve, backoff));
          continue;
        }
        if (isConnectionError(err)) {
          this._circuitOpen = true;
          this.logger?.warn?.('[LLM] connection failure — disabling LLM augmentation for the remainder of this scan');
          return null;
        }
        // Logger format redacts secrets; keep the message terse regardless.
        this.logger?.debug?.(`[LLM] call failed (${err?.status || err?.name || 'error'}) — degrading to deterministic behavior`);
        return null;
      }
    }
    return null;
  }
}
182
+
183
+ export { resolveLLMRuntime, PROVIDER_ENV, DEFAULT_MODELS };
184
+ export default LLMClient;
@@ -0,0 +1,46 @@
1
+ import { BaseLLMProvider } from './base-provider.js';
2
+
3
/**
 * AnthropicProvider — thin HTTP adapter for the Anthropic Messages API.
 * Built on the global `fetch` (Node ≥20); no SDK dependency.
 */
export class AnthropicProvider extends BaseLLMProvider {
  get name() { return 'anthropic'; }

  /**
   * POST a single user message to `<baseUrl>/messages`.
   *
   * @param {object} [req]
   * @param {string} [req.system] - system instruction; omitted from the body when falsy
   * @param {string} req.prompt - user prompt
   * @param {number} [req.maxTokens=1024]
   * @param {number} [req.temperature=0]
   * @param {AbortSignal} [req.signal]
   * @returns {Promise<{text: string|null, usage: {total_tokens: number}}>}
   * @throws {Error} with a numeric `status` property when the response is not ok
   */
  async complete({ system, prompt, maxTokens = 1024, temperature = 0, signal } = {}) {
    const root = (this.baseUrl || 'https://api.anthropic.com/v1').replace(/\/$/, '');
    const endpoint = `${root}/messages`;

    const headers = {
      'Content-Type': 'application/json',
      'x-api-key': this._apiKey,
      'anthropic-version': '2023-06-01',
    };
    const payload = {
      model: this.model,
      max_tokens: maxTokens,
      temperature,
      system: system || undefined,
      messages: [{ role: 'user', content: prompt }],
    };

    const response = await fetch(endpoint, {
      method: 'POST',
      headers,
      body: JSON.stringify(payload),
      signal,
    });

    if (!response.ok) {
      // Drain the body but never surface it.
      await response.text().catch(() => '');
      throw this._httpError(response.status, response.statusText);
    }

    const data = await response.json();

    // `content` is an array of blocks; keep only the text blocks and join them.
    let text = null;
    if (Array.isArray(data?.content)) {
      const textBlocks = data.content.filter((block) => block?.type === 'text');
      const joined = textBlocks.map((block) => block.text).join('').trim();
      text = joined || null;
    }

    const inputTokens = data?.usage?.input_tokens || 0;
    const outputTokens = data?.usage?.output_tokens || 0;
    return { text, usage: { total_tokens: inputTokens + outputTokens } };
  }
}
45
+
46
+ export default AnthropicProvider;
@@ -0,0 +1,44 @@
1
/**
 * BaseLLMProvider — Abstract interface for LLM provider adapters.
 *
 * Subclasses MUST implement `name` and `complete()`. Providers are thin HTTP
 * adapters built on the Node ≥20 global `fetch` — no third-party SDKs.
 */
export class BaseLLMProvider {
  /**
   * @param {object} [opts]
   * @param {string} [opts.apiKey] - provider API key; stored as `null` when falsy
   * @param {string} [opts.model]
   * @param {string} [opts.baseUrl] - overrides the adapter's default endpoint root
   * @param {object} [opts.logger]
   * @throws {Error} when instantiated directly instead of via a subclass
   */
  constructor({ apiKey, model, baseUrl, logger } = {}) {
    if (new.target === BaseLLMProvider) {
      throw new Error('BaseLLMProvider is abstract — extend it, do not instantiate directly.');
    }
    // The API key is held only in this adapter instance. It is defined as a
    // NON-ENUMERABLE property so that JSON.stringify(), object spread,
    // Object.keys() and the default util.inspect() output of a provider do not
    // carry the secret into logs, reports or findings. Subclasses still read
    // and may reassign `this._apiKey` as before.
    Object.defineProperty(this, '_apiKey', {
      value: apiKey || null,
      writable: true,
      configurable: true,
      enumerable: false,
    });
    this.model = model || null;
    this.baseUrl = baseUrl || null;
    this.logger = logger || null;
  }

  /** Provider display name (e.g. "openai"). Must be overridden. */
  get name() { throw new Error('Provider must define a name'); }

  /**
   * Perform a single completion.
   * @param {object} req
   * @param {string} req.system - system instruction
   * @param {string} req.prompt - user prompt
   * @param {number} req.maxTokens - max output tokens
   * @param {number} req.temperature
   * @param {AbortSignal} req.signal
   * @returns {Promise<{text: string|null, usage: {total_tokens: number}}>}
   */
  async complete(_req) {
    throw new Error(`${this.name} must implement complete()`);
  }

  /**
   * Build an Error carrying an HTTP status so the client can branch (e.g. 429).
   * @param {number} status - HTTP status code, attached as `error.status`
   * @param {string} [label] - status text; falls back to the numeric status
   * @returns {Error}
   */
  _httpError(status, label) {
    return Object.assign(new Error(`${this.name} API error: ${label || status}`), { status });
  }
}
43
+
44
+ export default BaseLLMProvider;
@@ -0,0 +1,21 @@
1
+ import { BaseLLMProvider } from './base-provider.js';
2
+
3
/**
 * NullProvider — inert provider used when LLM augmentation is disabled, or in
 * tests. Its completion always carries `text: null`, so consumers fall back to
 * their deterministic, non-LLM behavior.
 */
export class NullProvider extends BaseLLMProvider {
  /**
   * @param {object} [opts] - forwarded to the base class with `apiKey` forced to null.
   */
  constructor(opts = {}) {
    // Direct instantiation is allowed (it's the safe default); never keep a key.
    const keyless = { ...opts, apiKey: null };
    super(keyless);
  }

  get name() { return 'null'; }

  /**
   * @returns {Promise<{text: null, usage: {total_tokens: number}}>} always empty, zero usage.
   */
  async complete() {
    const usage = { total_tokens: 0 };
    return { text: null, usage };
  }
}
20
+
21
+ export default NullProvider;
@@ -0,0 +1,47 @@
1
+ import { BaseLLMProvider } from './base-provider.js';
2
+
3
/**
 * OpenAIProvider — thin HTTP adapter for the OpenAI Chat Completions API.
 * Built on the global `fetch` (Node ≥20); no SDK dependency.
 */
export class OpenAIProvider extends BaseLLMProvider {
  get name() { return 'openai'; }

  /**
   * POST a chat completion request to `<baseUrl>/chat/completions`.
   *
   * @param {object} [req]
   * @param {string} [req.system] - system instruction; no system message is sent when falsy
   * @param {string} req.prompt - user prompt
   * @param {number} [req.maxTokens=1024]
   * @param {number} [req.temperature=0]
   * @param {AbortSignal} [req.signal]
   * @returns {Promise<{text: string|null, usage: {total_tokens: number}}>}
   * @throws {Error} with a numeric `status` property when the response is not ok
   */
  async complete({ system, prompt, maxTokens = 1024, temperature = 0, signal } = {}) {
    const root = (this.baseUrl || 'https://api.openai.com/v1').replace(/\/$/, '');
    const endpoint = `${root}/chat/completions`;

    const userMessage = { role: 'user', content: prompt };
    const messages = system
      ? [{ role: 'system', content: system }, userMessage]
      : [userMessage];

    const payload = {
      model: this.model,
      temperature,
      max_tokens: maxTokens,
      messages,
    };

    const response = await fetch(endpoint, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Bearer ${this._apiKey}`,
      },
      body: JSON.stringify(payload),
      signal,
    });

    if (!response.ok) {
      // Drain body without surfacing it (it may echo request data); never log keys.
      await response.text().catch(() => '');
      throw this._httpError(response.status, response.statusText);
    }

    const data = await response.json();
    const text = data?.choices?.[0]?.message?.content ?? null;

    // Prefer the reported total; otherwise sum prompt + completion counts.
    const usage = data?.usage;
    let total = usage?.total_tokens;
    if (total === undefined || total === null) {
      total = (usage?.prompt_tokens || 0) + (usage?.completion_tokens || 0);
    }
    return { text, usage: { total_tokens: total || 0 } };
  }
}
46
+
47
+ export default OpenAIProvider;
@@ -116,7 +116,7 @@ export class AccessBoundaryTester {
116
116
  */
117
117
  async _testIDOR(businessContext, surfaceInventory) {
118
118
  const findings = [];
119
- const apis = surfaceInventory.apis || [];
119
+ const apis = surfaceInventory.apiEndpoints || surfaceInventory.apis || [];
120
120
 
121
121
  for (const api of apis) {
122
122
  const url = api.url || api;
@@ -1,4 +1,5 @@
1
1
  import { createFinding } from '../../utils/finding.js';
2
+ import { inferBusinessDomains } from '../llm/augmentations.js';
2
3
 
3
4
  /**
4
5
  * BusinessRuleInferrer — Infers business rules from the surface inventory.
@@ -80,7 +81,9 @@ export class BusinessRuleInferrer {
80
81
 
81
82
  const pages = surfaceInventory.pages || [];
82
83
  const forms = surfaceInventory.forms || [];
83
- const apis = surfaceInventory.apis || [];
84
+ // The crawler emits `apiEndpoints` (not `apis`); read the correct field
85
+ // so business-logic API surfaces are actually categorized.
86
+ const apis = surfaceInventory.apiEndpoints || surfaceInventory.apis || [];
84
87
 
85
88
  // 1. Categorize pages by domain
86
89
  for (const page of pages) {
@@ -156,6 +159,52 @@ export class BusinessRuleInferrer {
156
159
  return context;
157
160
  }
158
161
 
162
+ /**
163
+ * Phase 3 — Augment the regex-based domain inference with LLM-inferred
164
+ * domains and security-relevant invariants. STRICTLY ADDITIVE: the regex
165
+ * DOMAIN_PATTERNS result above is untouched; results are attached under
166
+ * `context.llmInsights` (source:'llm'). No-op when LLM is disabled.
167
+ *
168
+ * Data minimization: only URL paths + form field NAMES are sent (no values,
169
+ * no bodies, no secrets).
170
+ */
171
+ async augmentWithLLM(context, surfaceInventory, llmClient) {
172
+ if (!llmClient?.isEnabled?.()) return context;
173
+
174
+ try {
175
+ const pages = surfaceInventory.pages || [];
176
+ const apis = surfaceInventory.apiEndpoints || surfaceInventory.apis || [];
177
+ const forms = surfaceInventory.forms || [];
178
+
179
+ const toPath = (u) => {
180
+ try { return new URL(typeof u === 'string' ? u : (u.url || '')).pathname; }
181
+ catch { return typeof u === 'string' ? u : (u?.url || ''); }
182
+ };
183
+
184
+ const paths = [...new Set([
185
+ ...pages.map(p => toPath(p.url || p)),
186
+ ...apis.map(a => toPath(a.url || a)),
187
+ ])].filter(Boolean);
188
+
189
+ const formFields = [...new Set(
190
+ forms.flatMap(f => (f.fields || []).map(fl => (fl.name || fl.id || '').toLowerCase()))
191
+ )].filter(Boolean);
192
+
193
+ const result = await inferBusinessDomains(llmClient, { paths, formFields });
194
+ if (result && (result.domains.length || result.invariants.length)) {
195
+ context.llmInsights = {
196
+ domains: result.domains,
197
+ invariants: result.invariants,
198
+ source: 'llm',
199
+ };
200
+ this.logger?.info?.(`Business Rule Inferrer (LLM): +${result.domains.length} domains, ${result.invariants.length} invariants`);
201
+ }
202
+ } catch (err) {
203
+ this.logger?.debug?.(`Business Rule Inferrer LLM augmentation skipped: ${err.message}`);
204
+ }
205
+ return context;
206
+ }
207
+
159
208
  /**
160
209
  * Detect multi-step flows (pages with step indicators in URLs).
161
210
  */