thumbgate 1.16.13 → 1.16.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. package/.claude-plugin/marketplace.json +2 -2
  2. package/.claude-plugin/plugin.json +1 -1
  3. package/.well-known/mcp/server-card.json +1 -1
  4. package/README.md +3 -1
  5. package/adapters/claude/.mcp.json +2 -2
  6. package/adapters/mcp/server-stdio.js +26 -1
  7. package/adapters/opencode/opencode.json +1 -1
  8. package/bin/cli.js +420 -1
  9. package/config/gate-templates.json +372 -0
  10. package/config/mcp-allowlists.json +25 -0
  11. package/config/model-candidates.json +59 -2
  12. package/config/model-tiers.json +4 -1
  13. package/package.json +79 -22
  14. package/public/compare.html +6 -0
  15. package/public/index.html +144 -11
  16. package/public/numbers.html +8 -8
  17. package/public/pro.html +22 -24
  18. package/scripts/agent-design-governance.js +211 -0
  19. package/scripts/agent-reasoning-traces.js +683 -0
  20. package/scripts/agent-reward-model.js +438 -0
  21. package/scripts/agent-stack-survival-audit.js +231 -0
  22. package/scripts/ai-engineering-stack-guardrails.js +256 -0
  23. package/scripts/billing.js +16 -4
  24. package/scripts/chatgpt-ads-readiness-pack.js +195 -0
  25. package/scripts/cli-schema.js +277 -0
  26. package/scripts/code-graph-guardrails.js +176 -0
  27. package/scripts/deepseek-v4-runtime-guardrails.js +253 -0
  28. package/scripts/gemini-embedding-policy.js +198 -0
  29. package/scripts/inference-cache-policy.js +39 -0
  30. package/scripts/judge-reward-function.js +396 -0
  31. package/scripts/llm-behavior-monitor.js +251 -0
  32. package/scripts/long-running-agent-context-guardrails.js +176 -0
  33. package/scripts/multimodal-retrieval-plan.js +31 -11
  34. package/scripts/oss-pr-opportunity-scout.js +240 -0
  35. package/scripts/proactive-agent-eval-guardrails.js +230 -0
  36. package/scripts/profile-router.js +5 -4
  37. package/scripts/prompting-operating-system.js +273 -0
  38. package/scripts/proxy-pointer-rag-guardrails.js +189 -0
  39. package/scripts/rag-precision-guardrails.js +202 -0
  40. package/scripts/rate-limiter.js +1 -1
  41. package/scripts/reasoning-efficiency-guardrails.js +176 -0
  42. package/scripts/reward-hacking-guardrails.js +251 -0
  43. package/scripts/seo-gsd.js +1201 -11
  44. package/scripts/single-use-credential-gate.js +182 -0
  45. package/scripts/structured-prompt-driven.js +226 -0
  46. package/scripts/telemetry-analytics.js +31 -6
  47. package/scripts/tool-registry.js +92 -0
  48. package/scripts/upstream-contribution-engine.js +379 -0
  49. package/scripts/vector-store.js +119 -4
  50. package/src/api/server.js +333 -100
  51. package/scripts/agents-sdk-sandbox-plan.js +0 -57
  52. package/scripts/ai-org-governance.js +0 -98
  53. package/scripts/artifact-agent-plan.js +0 -81
  54. package/scripts/enterprise-agent-rollout.js +0 -34
  55. package/scripts/experience-replay-governance.js +0 -69
  56. package/scripts/inference-economics.js +0 -53
  57. package/scripts/knowledge-layer-plan.js +0 -108
  58. package/scripts/memory-store-governance.js +0 -60
  59. package/scripts/post-training-governance.js +0 -34
  60. package/scripts/production-agent-readiness.js +0 -40
  61. package/scripts/scaling-law-claims.js +0 -60
  62. package/scripts/student-consistent-training.js +0 -73
@@ -1,98 +0,0 @@
1
- #!/usr/bin/env node
2
- 'use strict';
3
-
4
/**
 * Coerce any value to a trimmed string.
 *
 * @param {*} value - Value to normalize.
 * @returns {string} '' for null/undefined, otherwise `String(value)` with
 *   surrounding whitespace removed.
 */
function normalizeText(value) {
  return value == null ? '' : String(value).trim();
}
8
-
9
/**
 * Parse a budget amount, falling back when the value is missing or invalid.
 *
 * Only numbers, bigints and non-blank strings are parsed. Everything else
 * (null, undefined, booleans, arrays, objects, blank strings) is treated as
 * "not provided": `Number()` coerces several of those to 0, which would
 * otherwise turn a blank budget into a zero budget.
 *
 * @param {*} value - Raw budget value.
 * @param {*} fallback - Returned when `value` is not a finite number >= 0.
 * @returns {*} The parsed non-negative number, or `fallback`.
 */
function normalizeBudget(value, fallback) {
  const kind = typeof value;
  if (kind !== 'number' && kind !== 'string' && kind !== 'bigint') return fallback;
  if (kind === 'string' && value.trim() === '') return fallback;

  const amount = Number(value);
  return Number.isFinite(amount) && amount >= 0 ? amount : fallback;
}
13
-
14
/**
 * Build the governance plan for a three-role agent organization.
 *
 * @param {object} [input] - Optional overrides.
 * @param {*} [input.mission] - Organization mission text.
 * @param {*} [input.monthlyBudgetUsd] - Organization budget (default 25).
 * @param {*} [input.ceoBudgetUsd] - Planner budget (default min(10, org budget)).
 * @param {*} [input.researchBudgetUsd] - Analyst budget (default min(5, org budget)).
 * @param {*} [input.qaBudgetUsd] - QA budget (default min(5, org budget)).
 * @param {*} [input.generatedAt] - Timestamp; defaults to the current ISO time.
 * @param {*} [input.orgName] - Organization name.
 * @returns {object} Plan with roles, ticket templates, approval gates and
 *   audit cadence.
 */
function buildAiOrgGovernancePlan(input = {}) {
  const mission = normalizeText(input.mission)
    || 'Continuously improve agent-governed workflows while staying within budget.';
  const monthlyBudgetUsd = normalizeBudget(input.monthlyBudgetUsd, 25);

  // A role's default budget never exceeds the organization-wide budget.
  const defineRole = (id, title, roleMission, requestedBudget, defaultCapUsd) => ({
    id,
    title,
    mission: roleMission,
    monthlyBudgetUsd: normalizeBudget(requestedBudget, Math.min(defaultCapUsd, monthlyBudgetUsd)),
    canCreateAgents: false,
  });
  const defineTicket = (id, ownerRole, outputSchema) => ({ id, ownerRole, outputSchema });

  const roles = [
    defineRole(
      'ceo',
      'Planner',
      'Break goals into tickets, assign owners, and enforce ROI.',
      input.ceoBudgetUsd,
      10,
    ),
    defineRole(
      'research_analyst',
      'Research Analyst',
      'Collect market and technical signals into structured briefs.',
      input.researchBudgetUsd,
      5,
    ),
    defineRole(
      'qa_operator',
      'QA Operator',
      'Review evidence, tests, diffs, and spend anomalies before promotion.',
      input.qaBudgetUsd,
      5,
    ),
  ];

  const generatedAt = normalizeText(input.generatedAt) || new Date().toISOString();
  const orgName = normalizeText(input.orgName) || 'ThumbGate Agent Company';

  return {
    generatedAt,
    orgName,
    mission,
    monthlyBudgetUsd,
    roles,
    ticketTemplates: [
      defineTicket('market_signal_brief', 'research_analyst', ['source', 'claim', 'relevance', 'action', 'evidence']),
      defineTicket('workflow_hardening', 'ceo', ['risk', 'gate', 'test', 'rollback', 'owner']),
      defineTicket('evidence_review', 'qa_operator', ['claim', 'evidence', 'verdict', 'missing_proof']),
    ],
    approvalGates: [
      'new_agent_role',
      'budget_increase',
      'credentialed_connector_write',
      'production_release',
      'public_claim_without_evidence',
    ],
    audit: {
      daily: ['ticket outcomes', 'spend by role', 'blocked actions', 'open approvals'],
      weekly: ['low ROI tickets', 'stale agents', 'budget cap changes', 'policy drift'],
    },
  };
}
77
-
78
/**
 * Decide whether an agent action needs approval under a governance plan.
 *
 * NOTE(review): only four action types are mapped to gates here. A plan may
 * list further gates (the default plan includes 'production_release'), but
 * no action type handled below can trigger them — confirm whether that is
 * intended.
 *
 * @param {object} [action] - Action to evaluate.
 * @param {*} [action.type] - 'create_agent', 'raise_budget',
 *   'connector_write' or 'public_claim'; other values raise nothing.
 * @param {Array} [action.evidence] - Evidence for a 'public_claim'.
 * @param {object} [plan] - Plan whose `approvalGates` array filters the
 *   raised gates; defaults to `buildAiOrgGovernancePlan()`.
 * @returns {{decision: string, gateHits: string[], requiredApproval: boolean}}
 *   'warn' with the matching gates, or 'allow' when none match.
 */
function evaluateAiOrgAction(action = {}, plan = buildAiOrgGovernancePlan()) {
  const actionType = normalizeText(action.type);
  const raised = [];

  switch (actionType) {
    case 'create_agent':
      raised.push('new_agent_role');
      break;
    case 'raise_budget':
      raised.push('budget_increase');
      break;
    case 'connector_write':
      raised.push('credentialed_connector_write');
      break;
    case 'public_claim': {
      const hasEvidence = Array.isArray(action.evidence) && action.evidence.length > 0;
      if (!hasEvidence) {
        raised.push('public_claim_without_evidence');
      }
      break;
    }
    default:
      break;
  }

  // Only gates the plan actually declares count as hits.
  const gateHits = raised.filter((gate) => plan.approvalGates.includes(gate));
  const requiredApproval = gateHits.length > 0;

  return {
    decision: requiredApproval ? 'warn' : 'allow',
    gateHits,
    requiredApproval,
  };
}
94
-
95
- module.exports = {
96
- buildAiOrgGovernancePlan,
97
- evaluateAiOrgAction,
98
- };
@@ -1,81 +0,0 @@
1
- #!/usr/bin/env node
2
- 'use strict';
3
-
4
/**
 * Turn an arbitrary value into a trimmed string.
 *
 * @param {*} value - Value to normalize.
 * @returns {string} Empty string for null/undefined; otherwise the value's
 *   string form without leading or trailing whitespace.
 */
function normalizeText(value) {
  return String(value ?? '').trim();
}
8
-
9
/**
 * Normalize one task record into `{ id, description, branchName, priority }`.
 *
 * @param {object|null|undefined} task - Raw task; missing fields get defaults.
 * @param {number} [index=0] - Position of the task, used for default values.
 * @returns {{id: string, description: string, branchName: string, priority: number}}
 *   `id` defaults to `task-<index + 1>`; `branchName` defaults to the
 *   lower-cased id with each run of characters outside a-z/0-9 replaced by
 *   '-'; `priority` defaults to `index + 1` when not numeric.
 */
function normalizeTask(task, index = 0) {
  const ordinal = index + 1;
  const id = normalizeText(task?.id) || `task-${ordinal}`;
  const description = normalizeText(task?.description);

  const explicitBranch = normalizeText(task?.branchName);
  const branchName = explicitBranch || id.toLowerCase().replaceAll(/[^a-z0-9]+/g, '-');

  const numericPriority = Number(task?.priority);
  const priority = Number.isFinite(numericPriority) ? numericPriority : ordinal;

  return { id, description, branchName, priority };
}
18
-
19
/**
 * Build a plan that gives every task its own fork of a baseline repository.
 *
 * @param {object} [input] - Optional overrides.
 * @param {*} [input.baselineName] - Baseline name (default 'baseline').
 * @param {*} [input.gitUrl] - Baseline git URL.
 * @param {Array} [input.tasks] - Raw tasks, each passed through
 *   `normalizeTask`; anything that is not an array yields no tasks.
 * @param {*} [input.generatedAt] - Timestamp; defaults to the current ISO time.
 * @returns {object} Plan with the baseline, task schema, normalized tasks,
 *   one planned fork per task, and the runner, review and observability
 *   contracts.
 */
function buildArtifactAgentPlan(input = {}) {
  const baselineName = normalizeText(input.baselineName) || 'baseline';
  const gitUrl = normalizeText(input.gitUrl) || 'https://github.com/IgorGanapolsky/ThumbGate.git';
  const tasks = Array.isArray(input.tasks) ? input.tasks.map(normalizeTask) : [];

  // Fork names are lower-cased and reduced to a-z, 0-9 and '-'.
  const toFork = ({ id, branchName }) => {
    const rawForkName = `${baselineName}-${id}`;
    return {
      taskId: id,
      forkName: rawForkName.toLowerCase().replaceAll(/[^a-z0-9-]+/g, '-'),
      branchName,
      artifactRemote: null,
      tokenRef: `artifact_token_${id}`,
      status: 'planned',
    };
  };
  const forks = tasks.map((task) => toFork(task));

  const taskSchema = {
    required: ['id', 'description', 'branchName', 'priority'],
    properties: {
      id: 'stable task identifier',
      description: 'agent-readable task description',
      branchName: 'deterministic branch/fork suffix',
      priority: 'lower number runs first',
    },
  };

  const runnerContract = {
    filesystem: 'in_memory_git',
    tools: ['read(path)', 'write(path, contents)', 'run_tests(command)', 'commit(message)'],
    constraints: [
      'read before write',
      'minimize changes',
      'commit every successful task',
      'never expose artifact tokens in logs',
    ],
  };

  const reviewGate = {
    requiredBeforeMerge: [
      'diff summary',
      'changed files',
      'test output',
      'decision journal entry',
      'human or reviewer-agent approval',
    ],
    blockedWithout: ['baseline comparison', 'rollback path', 'evidence artifacts'],
  };

  const observability = {
    events: ['task_created', 'fork_created', 'agent_started', 'tool_call', 'commit_pushed', 'reviewed', 'merged_or_rejected'],
    metrics: ['task_latency_ms', 'tool_call_count', 'test_pass_rate', 'review_reject_rate'],
    traceKey: 'artifact_task_id',
  };

  return {
    generatedAt: normalizeText(input.generatedAt) || new Date().toISOString(),
    baseline: {
      name: baselineName,
      gitUrl,
      importIfMissing: true,
    },
    taskSchema,
    tasks,
    forks,
    runnerContract,
    reviewGate,
    observability,
  };
}
77
-
78
- module.exports = {
79
- buildArtifactAgentPlan,
80
- normalizeTask,
81
- };
@@ -1,34 +0,0 @@
1
- #!/usr/bin/env node
2
- 'use strict';
3
-
4
/**
 * Describe the enterprise agent rollout program for an industry.
 *
 * @param {object} [input] - Optional overrides.
 * @param {string} [input.industry] - Target industry; any falsy value falls
 *   back to 'enterprise software'.
 * @returns {object} Static program description: operating model, four
 *   phases, governance rules and metric names.
 */
function buildEnterpriseAgentRollout(input = {}) {
  const phaseOutcomes = [
    ['discover', 'rank workflows by measurable business value and risk'],
    ['prototype', 'ship one governed agent with evidence and rollback'],
    ['scale', 'publish reusable agent catalog and approval policies'],
    ['operate', 'review traces, ROI, incidents, and policy drift weekly'],
  ];

  return {
    program: 'ThumbGate Enterprise Agent Acceleration',
    industry: input.industry || 'enterprise software',
    operatingModel: {
      forwardDeployedEngineer: true,
      humanInTheLead: true,
      domainExpertsRequired: true,
      sovereignDeploymentOption: true,
    },
    phases: phaseOutcomes.map(([id, outcome]) => ({ id, outcome })),
    governance: [
      'human oversight for high-stakes recommendations',
      'sovereign data boundary when required',
      'agent catalog with owner and allowed tools',
      'decision journal for every business-critical action',
      'measurable outcome before expansion',
    ],
    metrics: [
      'cycle_time_saved',
      'blocked_risky_actions',
      'approved_agent_runs',
      'business_value_cents',
      'incident_rate',
    ],
  };
}
31
-
32
- module.exports = {
33
- buildEnterpriseAgentRollout,
34
- };
@@ -1,69 +0,0 @@
1
- 'use strict';
2
-
3
/**
 * Build the experience-replay policy for feedback trajectories.
 *
 * Each option is used only when it is already a finite number
 * (`Number.isFinite`, so numeric strings are not accepted).
 *
 * @param {object} [options] - Optional overrides.
 * @param {number} [options.maxStalenessHours=24] - Maximum sample age.
 * @param {number} [options.replayRatio=0.25] - Replay ratio.
 * @param {number} [options.minEntropy=0.65] - Minimum policy entropy.
 * @returns {object} Policy with `buffer`, `filters` and `monitors` sections.
 */
function buildExperienceReplayPolicy(options = {}) {
  const finiteOr = (candidate, fallback) => (Number.isFinite(candidate) ? candidate : fallback);

  const maxStalenessHours = finiteOr(options.maxStalenessHours, 24);
  const replayRatio = finiteOr(options.replayRatio, 0.25);
  const minEntropy = finiteOr(options.minEntropy, 0.65);

  const buffer = {
    strategy: 'fifo_with_quality_filters',
    maxStalenessHours,
    replayRatio,
    sampleWithoutRemoval: true,
  };

  const monitors = {
    maxStalenessHours,
    minEntropy,
    compareAgainstFreshOnly: true,
    metrics: [
      'gate_precision',
      'gate_recall',
      'unsupported_claim_rate',
      'policy_entropy',
      'compute_saved_percent',
    ],
  };

  return {
    policyId: 'feedback_experience_replay',
    purpose: 'Reuse high-signal feedback trajectories without letting stale lessons dominate training.',
    buffer,
    filters: [
      'redacted',
      'source_feedback_id_present',
      'outcome_evidence_present',
      'not_contradicted_by_newer_lesson',
      'not_low_confidence_or_vague_feedback',
    ],
    monitors,
  };
}
32
-
33
/**
 * Decide whether a feedback sample may enter the replay buffer.
 *
 * @param {object} [candidate] - Sample to evaluate.
 * @param {*} [candidate.sourceFeedbackId] - Required (truthy).
 * @param {*} [candidate.redacted] - Required (truthy).
 * @param {*} [candidate.outcomeEvidence] - Required (truthy).
 * @param {*} [candidate.contradictedByNewerLesson] - Rejects when truthy.
 * @param {*} [candidate.vagueFeedback] - Rejects when truthy.
 * @param {*} [candidate.ageHours] - Sample age; a missing value counts as 0.
 * @param {*} [candidate.qualityScore] - Quality score. Missing, blank or
 *   non-numeric values count as 1; negative values count as 0.
 * @param {object} [policy] - Policy providing `buffer.maxStalenessHours` and
 *   `buffer.replayRatio`; defaults to `buildExperienceReplayPolicy()`.
 * @returns {{decision: string, issues: string[], replayWeight: number}}
 *   'reject' with weight 0 when any issue is found, otherwise 'accept' with
 *   weight `min(replayRatio, qualityScore)`.
 */
function evaluateReplayCandidate(candidate = {}, policy = buildExperienceReplayPolicy()) {
  const issues = [];
  if (!candidate.sourceFeedbackId) issues.push('missing_source_feedback_id');
  if (!candidate.redacted) issues.push('redaction_required');
  if (!candidate.outcomeEvidence) issues.push('missing_outcome_evidence');
  if (candidate.contradictedByNewerLesson) issues.push('contradicted_by_newer_lesson');
  if (candidate.vagueFeedback) issues.push('vague_feedback_not_replayable');

  const ageHours = Number(candidate.ageHours || 0);
  if (ageHours > policy.buffer.maxStalenessHours) issues.push('stale_replay_sample');

  // An explicit score of 0 must stay 0: `qualityScore || 1` would promote it
  // to full weight. Only a missing or unparseable score defaults to 1.
  const rawScore = candidate.qualityScore;
  const scoreMissing = rawScore === undefined || rawScore === null || rawScore === '';
  const parsedScore = scoreMissing ? 1 : Number(rawScore);
  const qualityScore = Number.isFinite(parsedScore) ? Math.max(0, parsedScore) : 1;

  return {
    decision: issues.length ? 'reject' : 'accept',
    issues,
    replayWeight: issues.length ? 0 : Math.min(policy.buffer.replayRatio, qualityScore),
  };
}
50
-
51
/**
 * Check a replay training run against the policy's monitors.
 *
 * @param {object} [run] - Run metrics.
 * @param {*} [run.replayRatio] - Flagged when above 0.5.
 * @param {*} [run.policyEntropy] - Flagged when below `policy.monitors.minEntropy`.
 * @param {*} [run.freshOnlyBaseline] - Required (truthy).
 * @param {*} [run.computeSavedPercent] - Required; 0 counts as present.
 * @param {object} [policy] - Policy providing `monitors.minEntropy`;
 *   defaults to `buildExperienceReplayPolicy()`.
 * @returns {{decision: string, issues: string[], computeEfficient: boolean}}
 *   'warn' when any issue is found, otherwise 'allow'. `computeEfficient` is
 *   true when compute was saved and entropy meets the minimum.
 */
function evaluateReplayRun(run = {}, policy = buildExperienceReplayPolicy()) {
  const entropyFloor = policy.monitors.minEntropy;
  const replayRatio = Number(run.replayRatio || 0);
  const entropy = Number(run.policyEntropy || 0);
  const savedPercent = Number(run.computeSavedPercent || 0);
  // A reported saving of exactly 0 still counts as "metric present".
  const hasComputeMetric = Boolean(run.computeSavedPercent) || run.computeSavedPercent === 0;

  const findings = [
    [replayRatio > 0.5, 'replay_ratio_too_high'],
    [entropy < entropyFloor, 'policy_entropy_too_low'],
    [!run.freshOnlyBaseline, 'missing_fresh_only_baseline'],
    [!hasComputeMetric, 'missing_compute_saved_metric'],
  ];
  const issues = findings.filter(([failed]) => failed).map(([, code]) => code);

  return {
    decision: issues.length > 0 ? 'warn' : 'allow',
    issues,
    computeEfficient: savedPercent > 0 && entropy >= entropyFloor,
  };
}
64
-
65
- module.exports = {
66
- buildExperienceReplayPolicy,
67
- evaluateReplayCandidate,
68
- evaluateReplayRun,
69
- };
@@ -1,53 +0,0 @@
1
- #!/usr/bin/env node
2
- 'use strict';
3
-
4
/**
 * Score task difficulty from 0 to 100 using additive heuristics.
 *
 * @param {object} [input] - Task description.
 * @param {*} [input.task] - Task text; `input.prompt` is used when falsy.
 * @param {*} [input.prompt] - Alternative task text.
 * @param {*} [input.dollarImpact] - Adds 25 when >= 1000.
 * @param {Array} [input.files] - Adds 15 when more than 5 entries.
 * @param {boolean} [input.requiresHumanApproval] - Adds 15 when exactly true.
 * @returns {number} Sum of the matching weights, clamped to 0..100.
 */
function estimateDifficulty(input = {}) {
  const text = String(input.task || input.prompt || '');
  const riskKeywords = /ambiguous|research|architecture|security|production|migration|legal|financial/i;

  // Each pair is [signal applies, points awarded].
  const weightedSignals = [
    [text.length > 1200, 20],
    [riskKeywords.test(text), 25],
    [Number(input.dollarImpact || 0) >= 1000, 25],
    [Array.isArray(input.files) && input.files.length > 5, 15],
    [input.requiresHumanApproval === true, 15],
  ];
  const total = weightedSignals.reduce((sum, [applies, points]) => (applies ? sum + points : sum), 0);

  return Math.max(0, Math.min(100, total));
}
14
-
15
/**
 * Choose reasoning depth, effort and expert count for a task.
 *
 * @param {object} [input] - Task description.
 * @param {number} [input.difficulty] - Difficulty score; when not a finite
 *   number it is computed with `estimateDifficulty(input)`.
 * @param {*} [input.maxCostCents] - Cost cap in cents. Missing, null, blank
 *   or non-numeric values use the default of 50; an explicit 0 is kept.
 * @returns {object} `{ difficulty, maxCostCents, depth, reasoningEffort,
 *   activeExperts, humanHandoff, telemetry }`.
 */
function planInferenceBudget(input = {}) {
  const difficulty = Number.isFinite(input.difficulty) ? input.difficulty : estimateDifficulty(input);

  // `Number(null)` and `Number('')` are 0, so blank caps are detected first;
  // otherwise "no cap given" would act as a zero-cent cap.
  const rawCap = input.maxCostCents;
  const capIsBlank = rawCap === undefined
    || rawCap === null
    || (typeof rawCap === 'string' && rawCap.trim() === '');
  const parsedCap = capIsBlank ? Number.NaN : Number(rawCap);
  const maxCostCents = Number.isFinite(parsedCap) ? parsedCap : 50;

  let depth = 'shallow';
  let reasoningEffort = 'low';
  let expertCount = 1;
  let humanHandoff = false;

  if (difficulty >= 70) {
    depth = 'deep';
    reasoningEffort = 'high';
    expertCount = 4;
    humanHandoff = true;
  } else if (difficulty >= 35) {
    depth = 'standard';
    reasoningEffort = 'medium';
    expertCount = 2;
  }

  // A cap under 20 cents lowers depth and effort only; the expert count and
  // the human handoff chosen above are left as they are.
  if (maxCostCents < 20 && depth === 'deep') {
    depth = 'standard';
    reasoningEffort = 'medium';
  }

  return {
    difficulty,
    maxCostCents,
    depth,
    reasoningEffort,
    activeExperts: expertCount,
    humanHandoff,
    telemetry: ['difficulty', 'depth', 'reasoningEffort', 'activeExperts', 'latencyMs', 'costCents', 'outcome'],
  };
}
49
-
50
- module.exports = {
51
- estimateDifficulty,
52
- planInferenceBudget,
53
- };
@@ -1,108 +0,0 @@
1
- 'use strict';
2
-
3
/**
 * Describe the knowledge-graph layer for a domain.
 *
 * @param {object} [options] - Optional overrides.
 * @param {string} [options.domain] - Domain name; falsy values fall back to
 *   'agent_reliability'.
 * @param {string} [options.graph] - Graph backend name; falsy values fall
 *   back to 'neo4j'.
 * @returns {object} Plan with memory tiers, node and relationship types,
 *   high-ROI use cases and gates.
 */
function buildKnowledgeLayerPlan(options = {}) {
  // Each tuple is [id, purpose, ttl].
  const tierDefinitions = [
    ['short_term', 'Current session context so the agent does not re-ask answered questions.', 'session'],
    ['long_term', 'Durable user, product, workflow, and feedback profile facts.', 'durable'],
    ['reasoning_memory', 'Reusable decision paths that avoid recomputing expensive traversals.', 'versioned'],
  ];
  const memoryTiers = tierDefinitions.map(([id, purpose, ttl]) => ({ id, purpose, ttl }));

  const nodeTypes = [
    'User',
    'Agent',
    'Workflow',
    'Feedback',
    'Gate',
    'Decision',
    'Evidence',
    'Recommendation',
    'Outcome',
  ];

  const relationshipTypes = [
    'GAVE_FEEDBACK',
    'TRIGGERED_GATE',
    'USED_EVIDENCE',
    'RECOMMENDED_ACTION',
    'PRODUCED_OUTCOME',
    'SIMILAR_TO',
    'REUSES_REASONING',
  ];

  return {
    domain: options.domain || 'agent_reliability',
    graph: options.graph || 'neo4j',
    memoryTiers,
    nodeTypes,
    relationshipTypes,
    highRoiUseCases: [
      'conversion recommendations with explainable evidence paths',
      'compute savings from reasoning-memory cache hits',
      'compliance audit trail for why an agent recommended or blocked an action',
      'closed-loop profile updates from every feedback, purchase, or outcome event',
    ],
    gates: [
      'do not recommend without an evidence path',
      'do not reuse reasoning memory when source facts changed',
      'write audit node for every recommendation and blocked action',
      'record outcome feedback to update profile and graph edges',
    ],
  };
}
61
-
62
/**
 * Build the evidence path from a user to a recommendation.
 *
 * @param {object} [input] - Path ingredients.
 * @param {string} [input.userId] - Falsy values fall back to 'unknown_user'.
 * @param {string} [input.recommendationId] - Falsy values fall back to
 *   'rec_pending'.
 * @param {Array} [input.evidence] - Evidence items (`type`, `id`, `quote`).
 *   Null and undefined entries are dropped; a non-array counts as empty.
 * @param {Array} [input.similarProfiles] - Ids of similar profiles; a
 *   non-array counts as empty.
 * @returns {{recommendationId: string, path: object[], explainable: boolean}}
 *   Path nodes in order User, SimilarProfile…, evidence…, Recommendation.
 *   `explainable` is true when at least one evidence item remains.
 */
function buildRecommendationEvidencePath(input = {}) {
  const userId = input.userId || 'unknown_user';
  const recommendationId = input.recommendationId || 'rec_pending';
  // Null/undefined entries carry no evidence: they would throw on `.type`
  // below and must not make the path count as explainable.
  const evidence = (Array.isArray(input.evidence) ? input.evidence : [])
    .filter((item) => item !== null && item !== undefined);
  const similarProfiles = Array.isArray(input.similarProfiles) ? input.similarProfiles : [];

  return {
    recommendationId,
    path: [
      { type: 'User', id: userId },
      ...similarProfiles.map((id) => ({ type: 'SimilarProfile', id })),
      ...evidence.map((item, index) => ({
        type: item.type || 'Evidence',
        id: item.id || `evidence_${index + 1}`,
        quote: item.quote || null,
      })),
      { type: 'Recommendation', id: recommendationId },
    ],
    explainable: evidence.length > 0,
  };
}
83
-
84
/**
 * Check a knowledge-layer run for missing audit and evidence fields.
 *
 * @param {object} [run] - Run record.
 * @param {*} [run.userId] - Required (truthy).
 * @param {*} [run.recommendationId] - Required (truthy).
 * @param {object} [run.evidencePath] - Must have a truthy `explainable`.
 * @param {*} [run.auditNodeId] - Required (truthy).
 * @param {*} [run.reusedReasoning] - When truthy, `reasoningVersion` is required.
 * @param {*} [run.profileUpdate] - When truthy, `outcomeEventId` is required.
 * @returns {{decision: string, issues: string[], roiSignals: string[]}}
 *   'warn' when any issue is found, otherwise 'allow', plus the ROI signals
 *   the run demonstrates.
 */
function evaluateKnowledgeLayerRun(run = {}) {
  // Each pair is [check failed, issue code], in reporting order.
  const checks = [
    [!run.userId, 'missing_user_id'],
    [!run.recommendationId, 'missing_recommendation_id'],
    [!run.evidencePath?.explainable, 'missing_explainable_evidence_path'],
    [!run.auditNodeId, 'missing_audit_node_id'],
    [Boolean(run.reusedReasoning) && !run.reasoningVersion, 'missing_reasoning_version'],
    [Boolean(run.profileUpdate) && !run.outcomeEventId, 'missing_outcome_event_id'],
  ];
  const issues = [];
  for (const [failed, code] of checks) {
    if (failed) issues.push(code);
  }

  const roiSignals = [];
  if (run.reusedReasoning) roiSignals.push('lower_graph_query_and_token_cost');
  if (run.profileUpdate) roiSignals.push('closed_loop_personalization');
  if (run.auditNodeId) roiSignals.push('compliance_trace_available');

  return {
    decision: issues.length > 0 ? 'warn' : 'allow',
    issues,
    roiSignals,
  };
}
103
-
104
- module.exports = {
105
- buildKnowledgeLayerPlan,
106
- buildRecommendationEvidencePath,
107
- evaluateKnowledgeLayerRun,
108
- };
@@ -1,60 +0,0 @@
1
- #!/usr/bin/env node
2
- 'use strict';
3
-
4
/**
 * Convert a value to a whitespace-trimmed string.
 *
 * @param {*} value - Value to normalize.
 * @returns {string} '' when the value is null or undefined, else the
 *   trimmed result of `String(value)`.
 */
function normalizeText(value) {
  if (value === null || value === undefined) {
    return '';
  }
  const text = String(value);
  return text.trim();
}
8
-
9
/**
 * Classify a memory file by keywords in its path.
 *
 * The secret pattern is tested first so that a path naming both a secret and
 * a harmless topic (for example `password_format.md`) is blocked rather than
 * classified as a promotable preference. The remaining categories are tested
 * in the order preference, workflow state, sensitive context.
 *
 * @param {*} filePath - Path to classify; normalized to lower-case text.
 * @returns {string} 'blocked_secret', 'preference', 'workflow_state',
 *   'sensitive_context' or 'general'.
 */
function classifyMemoryFile(filePath) {
  const normalized = normalizeText(filePath).toLowerCase();
  if (/credential|token|secret|password|key/.test(normalized)) return 'blocked_secret';
  if (/preference|style|tone|format/.test(normalized)) return 'preference';
  if (/task|completed|todo|draft/.test(normalized)) return 'workflow_state';
  if (/account|customer|user|contact/.test(normalized)) return 'sensitive_context';
  return 'general';
}
17
-
18
/**
 * Map a memory-file classification to the governance action.
 *
 * @param {string} classification - Classification label.
 * @returns {string} 'block' for 'blocked_secret', 'redact_before_export'
 *   for 'sensitive_context', and 'allow_reviewed_promotion' for anything else.
 */
function actionForClassification(classification) {
  switch (classification) {
    case 'blocked_secret':
      return 'block';
    case 'sensitive_context':
      return 'redact_before_export';
    default:
      return 'allow_reviewed_promotion';
  }
}
23
-
24
/**
 * Build a governance report for a set of file-backed agent memory files.
 *
 * @param {object} [input] - Report input.
 * @param {Array} [input.files] - Path strings or objects with a `path`
 *   property. Null/undefined entries and entries without a usable path are
 *   dropped; a non-array counts as empty.
 * @param {*} [input.generatedAt] - Timestamp; defaults to the current ISO time.
 * @returns {object} Report with one record per file (`path`,
 *   `classification`, `promotable`, `action`), the fixed policy, and summary
 *   counts.
 */
function buildMemoryStoreGovernance(input = {}) {
  const files = Array.isArray(input.files) ? input.files : [];
  const records = files
    .map((file) => {
      // `file?.path` keeps a null/undefined entry from throwing; it becomes
      // an empty path and is removed by the filter below.
      const path = typeof file === 'string' ? file : file?.path;
      const classification = classifyMemoryFile(path);
      return {
        path: normalizeText(path),
        classification,
        promotable: !['blocked_secret', 'sensitive_context'].includes(classification),
        action: actionForClassification(classification),
      };
    })
    .filter((record) => record.path);

  return {
    generatedAt: normalizeText(input.generatedAt) || new Date().toISOString(),
    storeKind: 'file_backed_agent_memory',
    records,
    policy: {
      export: 'allowed_after_redaction',
      import: 'requires_schema_validation',
      promotion: 'requires_review_and_actionable_context',
      deletion: 'append_decision_journal_entry',
    },
    summary: {
      totalFiles: records.length,
      blocked: records.filter((record) => record.action === 'block').length,
      redactBeforeExport: records.filter((record) => record.action === 'redact_before_export').length,
      promotable: records.filter((record) => record.promotable).length,
    },
  };
}
55
-
56
- module.exports = {
57
- actionForClassification,
58
- buildMemoryStoreGovernance,
59
- classifyMemoryFile,
60
- };
@@ -1,34 +0,0 @@
1
- #!/usr/bin/env node
2
- 'use strict';
3
-
4
/**
 * Check a post-training plan for missing safety and budget prerequisites.
 *
 * @param {object} [input] - Plan to evaluate.
 * @param {string} [input.mode] - 'sft', 'rl', 'grpo' or 'gspo' (any case).
 * @param {*} [input.dataset] - Required (truthy).
 * @param {*} [input.baseCheckpoint] - Required (truthy).
 * @param {boolean} [input.piiRedacted] - Must be exactly true.
 * @param {boolean} [input.holdoutEval] - Must be exactly true.
 * @param {*} [input.rewardSpec] - Required for 'rl', 'grpo' and 'gspo'
 *   unless `rewardSpecRequired` is exactly false.
 * @param {boolean} [input.rewardSpecRequired] - Set to false to waive the
 *   reward specification.
 * @param {*} [input.maxSpendCents] - Spend cap; undefined and null both
 *   count as missing, while 0 is accepted.
 * @returns {{mode: string, decision: string, issues: string[], requiredArtifacts: string[]}}
 *   'allow' when no issue is found, otherwise 'warn'.
 */
function evaluatePostTrainingPlan(input = {}) {
  const mode = String(input.mode || '').toLowerCase();
  const usesRewardSignal = ['rl', 'grpo', 'gspo'].includes(mode);
  const issues = [];

  if (!['sft', 'rl', 'grpo', 'gspo'].includes(mode)) issues.push('unsupported_post_training_mode');
  if (!input.dataset) issues.push('missing_dataset');
  if (!input.baseCheckpoint) issues.push('missing_base_checkpoint');
  if (input.piiRedacted !== true) issues.push('pii_redaction_required');
  if (input.holdoutEval !== true) issues.push('holdout_eval_required');
  if (input.rewardSpecRequired !== false && usesRewardSignal && !input.rewardSpec) {
    issues.push('missing_reward_spec');
  }
  // A null cap (common for absent JSON fields) is not a cap.
  if (input.maxSpendCents === undefined || input.maxSpendCents === null) {
    issues.push('missing_spend_cap');
  }

  return {
    mode,
    decision: issues.length === 0 ? 'allow' : 'warn',
    issues,
    requiredArtifacts: [
      'dataset manifest',
      'PII redaction report',
      'base checkpoint',
      'holdout eval report',
      'spend cap',
      usesRewardSignal ? 'reward specification' : null,
    ].filter(Boolean),
  };
}
31
-
32
- module.exports = {
33
- evaluatePostTrainingPlan,
34
- };
@@ -1,40 +0,0 @@
1
- #!/usr/bin/env node
2
- 'use strict';
3
-
4
/**
 * Translate a readiness score and its missing signals into a status label.
 *
 * @param {number} score - Readiness score.
 * @param {Array} missing - Names of missing signals.
 * @returns {string} 'production_ready' when nothing is missing; otherwise
 *   'needs_hardening' for a score of 60 or more, else 'prototype'.
 */
function readinessStatus(score, missing) {
  const hasGaps = missing.length !== 0;
  if (!hasGaps) {
    return 'production_ready';
  }
  return score >= 60 ? 'needs_hardening' : 'prototype';
}
9
-
10
/**
 * Score an agent system against five production-readiness signals.
 *
 * @param {object} [input] - System description.
 * @param {Array} [input.subAgents] - Present when it has at least two entries.
 * @param {boolean} [input.structuredOutputs] - Present when exactly true.
 * @param {boolean} [input.dynamicRag] - Present when exactly true.
 * @param {boolean} [input.observability] - Present when this or
 *   `input.tracing` is exactly true.
 * @param {boolean} [input.tracing] - Alternative to `input.observability`.
 * @param {boolean} [input.circuitBreakers] - Present when exactly true.
 * @returns {{status: string, score: number, signals: object, missing: string[], requiredFixes: string[]}}
 *   `score` is the rounded percentage of present signals, `status` comes from
 *   `readinessStatus`, and `requiredFixes` holds one entry per missing signal.
 */
function evaluateProductionAgentReadiness(input = {}) {
  const fixGuidance = {
    subAgents: 'Split monolithic prompts into narrow sub-agent stages.',
    structuredOutputs: 'Use runtime-validated schemas instead of prompt-only JSON formatting.',
    dynamicRag: 'Replace hardcoded context with refreshed retrieval over indexed source material.',
    observability: 'Emit traces for model calls, tool calls, tokens, latency, and stage failures.',
    circuitBreakers: 'Set retry, timeout, loop, and spend limits before production use.',
  };

  const signals = {
    subAgents: Array.isArray(input.subAgents) && input.subAgents.length >= 2,
    structuredOutputs: input.structuredOutputs === true,
    dynamicRag: input.dynamicRag === true,
    observability: input.observability === true || input.tracing === true,
    circuitBreakers: input.circuitBreakers === true,
  };

  const missing = [];
  let presentCount = 0;
  for (const [name, present] of Object.entries(signals)) {
    if (present) {
      presentCount += 1;
    } else {
      missing.push(name);
    }
  }
  const score = Math.round((presentCount / Object.keys(signals).length) * 100);

  return {
    status: readinessStatus(score, missing),
    score,
    signals,
    missing,
    requiredFixes: missing.map((name) => fixGuidance[name]),
  };
}
36
-
37
- module.exports = {
38
- evaluateProductionAgentReadiness,
39
- readinessStatus,
40
- };
@@ -1,60 +0,0 @@
1
- #!/usr/bin/env node
2
- 'use strict';
3
-
4
/**
 * Produce a trimmed string from any value.
 *
 * @param {*} value - Value to normalize.
 * @returns {string} '' for null/undefined; otherwise `String(value)` with
 *   leading and trailing whitespace removed.
 */
function normalizeText(value) {
  const isMissing = value === undefined || value === null;
  return isMissing ? '' : String(value).trim();
}
8
-
9
/**
 * Classify a scaling claim by the vocabulary it uses.
 *
 * @param {*} claim - Claim text; normalized and lower-cased before matching.
 * @returns {string} 'pretraining_scaling' when a pretraining term matches,
 *   else 'feedback_policy_scaling' when a feedback/RL term matches, else
 *   'general_scaling'.
 */
function classifyScalingClaim(claim) {
  const text = normalizeText(claim).toLowerCase();

  // Rules are tried in order; the first matching pattern decides the label.
  const rules = [
    [/\b(pretrain|pretraining|parameters|training tokens|flops|cross entropy|test loss)\b/, 'pretraining_scaling'],
    [/\b(rl|reinforcement|feedback|dpo|kto|reward|policy|thumbs[-\s]?(up|down)|gate|prevention rule)\b/, 'feedback_policy_scaling'],
  ];
  const matched = rules.find(([pattern]) => pattern.test(text));

  return matched ? matched[1] : 'general_scaling';
}
19
-
20
/**
 * Check whether a scaling claim is backed by the evidence it needs.
 *
 * Claim-side keyword checks use word boundaries so that ordinary words do
 * not trigger them: "clearly" is not an RL claim, and "improves", "approved"
 * or "whenever" are not absolute claims.
 *
 * @param {object} [input] - Claim and evidence.
 * @param {*} [input.claim] - Claim text.
 * @param {Array} [input.evidence] - Evidence descriptions; falsy entries are
 *   ignored and a non-array counts as empty.
 * @returns {{claimType: string, decision: string, issues: string[], requiredEvidence: string[]}}
 *   'allow' when no issue is found, otherwise 'warn'.
 */
function evaluateScalingClaim(input = {}) {
  const claim = normalizeText(input.claim);
  const claimType = classifyScalingClaim(claim);
  const evidence = Array.isArray(input.evidence) ? input.evidence.filter(Boolean) : [];
  const evidenceMatches = (pattern) => evidence.some((entry) => pattern.test(String(entry)));

  const heldout = evidenceMatches(/held[-\s]?out|validation|eval|ablation|backtest/i);
  const production = evidenceMatches(/production|real user|workflow run|decision journal|blocked action/i);
  const rlCompute = evidenceMatches(/sampling compute|rollout|trajectory|policy update|reward model|rl compute/i);
  const sampling = evidenceMatches(/pass@|best-of-n|majority vote|sample budget|sampling/i);

  const isFeedbackClaim = claimType === 'feedback_policy_scaling';
  const issues = [];

  if (!claim) issues.push('missing_claim');
  if (isFeedbackClaim && !heldout) {
    issues.push('missing_heldout_feedback_eval');
  }
  // `\brl\b` keeps "rl" from matching inside words such as "clearly".
  if (isFeedbackClaim && /\brl\b|reinforcement|sampling/i.test(claim) && !rlCompute) {
    issues.push('missing_rl_compute_evidence');
  }
  if (isFeedbackClaim && /sampling|best-of|vote|pass@/i.test(claim) && !sampling) {
    issues.push('missing_sampling_budget_evidence');
  }
  if (claimType === 'pretraining_scaling' && evidence.length === 0) {
    issues.push('missing_model_scaling_evidence');
  }
  // Whole-word matching: prove/proves/proved/proven, always, never.
  if (/guarantee|\balways\b|\bnever\b|100%|\bprove[sdn]?\b/i.test(claim) && !production) {
    issues.push('absolute_claim_without_production_evidence');
  }

  return {
    claimType,
    decision: issues.length === 0 ? 'allow' : 'warn',
    issues,
    requiredEvidence: isFeedbackClaim
      ? ['held-out eval', 'ablation or backtest', 'RL/sampling compute budget when claimed', 'decision-journal production sample']
      : ['source data', 'validation metric', 'scope limits'],
  };
}
56
-
57
- module.exports = {
58
- classifyScalingClaim,
59
- evaluateScalingClaim,
60
- };