dialectic 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/.cursor/commands/setup-test.mdc +175 -0
  2. package/.cursor/rules/basic-code-cleanup.mdc +1110 -0
  3. package/.cursor/rules/riper5.mdc +96 -0
  4. package/.env.example +6 -0
  5. package/AGENTS.md +1052 -0
  6. package/LICENSE +21 -0
  7. package/README.md +93 -0
  8. package/WARP.md +113 -0
  9. package/dialectic-1.0.0.tgz +0 -0
  10. package/dialectic.js +10 -0
  11. package/docs/commands.md +375 -0
  12. package/docs/configuration.md +882 -0
  13. package/docs/context_summarization.md +1023 -0
  14. package/docs/debate_flow.md +1127 -0
  15. package/docs/eval_flow.md +795 -0
  16. package/docs/evaluator.md +141 -0
  17. package/examples/debate-config-openrouter.json +48 -0
  18. package/examples/debate_config1.json +48 -0
  19. package/examples/eval/eval1/eval_config1.json +13 -0
  20. package/examples/eval/eval1/result1.json +62 -0
  21. package/examples/eval/eval1/result2.json +97 -0
  22. package/examples/eval_summary_format.md +11 -0
  23. package/examples/example3/debate-config.json +64 -0
  24. package/examples/example3/eval_config2.json +25 -0
  25. package/examples/example3/problem.md +17 -0
  26. package/examples/example3/rounds_test/eval_run.sh +16 -0
  27. package/examples/example3/rounds_test/run_test.sh +16 -0
  28. package/examples/kata1/architect-only-solution_2-rounds.json +121 -0
  29. package/examples/kata1/architect-perf-solution_2-rounds.json +234 -0
  30. package/examples/kata1/debate-config-kata1.json +54 -0
  31. package/examples/kata1/eval_architect-only_2-rounds.json +97 -0
  32. package/examples/kata1/eval_architect-perf_2-rounds.json +97 -0
  33. package/examples/kata1/kata1-report.md +12224 -0
  34. package/examples/kata1/kata1-report_temps-01_01_01_07.md +2451 -0
  35. package/examples/kata1/kata1.md +5 -0
  36. package/examples/kata1/meta.txt +1 -0
  37. package/examples/kata2/debate-config.json +54 -0
  38. package/examples/kata2/eval_config1.json +21 -0
  39. package/examples/kata2/eval_config2.json +25 -0
  40. package/examples/kata2/kata2.md +5 -0
  41. package/examples/kata2/only_architect/debate-config.json +45 -0
  42. package/examples/kata2/only_architect/eval_run.sh +11 -0
  43. package/examples/kata2/only_architect/run_test.sh +5 -0
  44. package/examples/kata2/rounds_test/eval_run.sh +11 -0
  45. package/examples/kata2/rounds_test/run_test.sh +5 -0
  46. package/examples/kata2/summary_length_test/eval_run.sh +11 -0
  47. package/examples/kata2/summary_length_test/eval_run_w_clarify.sh +7 -0
  48. package/examples/kata2/summary_length_test/run_test.sh +5 -0
  49. package/examples/task-queue/debate-config.json +76 -0
  50. package/examples/task-queue/debate_report.md +566 -0
  51. package/examples/task-queue/task-queue-system.md +25 -0
  52. package/jest.config.ts +13 -0
  53. package/multi_agent_debate_spec.md +2980 -0
  54. package/package.json +38 -0
  55. package/sanity-check-problem.txt +9 -0
  56. package/src/agents/prompts/architect-prompts.ts +203 -0
  57. package/src/agents/prompts/generalist-prompts.ts +157 -0
  58. package/src/agents/prompts/index.ts +41 -0
  59. package/src/agents/prompts/judge-prompts.ts +19 -0
  60. package/src/agents/prompts/kiss-prompts.ts +230 -0
  61. package/src/agents/prompts/performance-prompts.ts +142 -0
  62. package/src/agents/prompts/prompt-types.ts +68 -0
  63. package/src/agents/prompts/security-prompts.ts +149 -0
  64. package/src/agents/prompts/shared.ts +144 -0
  65. package/src/agents/prompts/testing-prompts.ts +149 -0
  66. package/src/agents/role-based-agent.ts +386 -0
  67. package/src/cli/commands/debate.ts +761 -0
  68. package/src/cli/commands/eval.ts +475 -0
  69. package/src/cli/commands/report.ts +265 -0
  70. package/src/cli/index.ts +79 -0
  71. package/src/core/agent.ts +198 -0
  72. package/src/core/clarifications.ts +34 -0
  73. package/src/core/judge.ts +257 -0
  74. package/src/core/orchestrator.ts +432 -0
  75. package/src/core/state-manager.ts +322 -0
  76. package/src/eval/evaluator-agent.ts +130 -0
  77. package/src/eval/prompts/system.md +41 -0
  78. package/src/eval/prompts/user.md +64 -0
  79. package/src/providers/llm-provider.ts +25 -0
  80. package/src/providers/openai-provider.ts +84 -0
  81. package/src/providers/openrouter-provider.ts +122 -0
  82. package/src/providers/provider-factory.ts +64 -0
  83. package/src/types/agent.types.ts +141 -0
  84. package/src/types/config.types.ts +47 -0
  85. package/src/types/debate.types.ts +237 -0
  86. package/src/types/eval.types.ts +85 -0
  87. package/src/utils/common.ts +104 -0
  88. package/src/utils/context-formatter.ts +102 -0
  89. package/src/utils/context-summarizer.ts +143 -0
  90. package/src/utils/env-loader.ts +46 -0
  91. package/src/utils/exit-codes.ts +5 -0
  92. package/src/utils/id.ts +11 -0
  93. package/src/utils/logger.ts +48 -0
  94. package/src/utils/paths.ts +10 -0
  95. package/src/utils/progress-ui.ts +313 -0
  96. package/src/utils/prompt-loader.ts +79 -0
  97. package/src/utils/report-generator.ts +301 -0
  98. package/tests/clarifications.spec.ts +128 -0
  99. package/tests/cli.debate.spec.ts +144 -0
  100. package/tests/config-loading.spec.ts +206 -0
  101. package/tests/context-summarizer.spec.ts +131 -0
  102. package/tests/debate-config-custom.json +38 -0
  103. package/tests/env-loader.spec.ts +149 -0
  104. package/tests/eval.command.spec.ts +1191 -0
  105. package/tests/logger.spec.ts +19 -0
  106. package/tests/openai-provider.spec.ts +26 -0
  107. package/tests/openrouter-provider.spec.ts +279 -0
  108. package/tests/orchestrator-summary.spec.ts +386 -0
  109. package/tests/orchestrator.spec.ts +207 -0
  110. package/tests/prompt-loader.spec.ts +52 -0
  111. package/tests/prompts/architect.md +16 -0
  112. package/tests/provider-factory.spec.ts +150 -0
  113. package/tests/report.command.spec.ts +546 -0
  114. package/tests/role-based-agent-summary.spec.ts +476 -0
  115. package/tests/security-agent.spec.ts +221 -0
  116. package/tests/shared-prompts.spec.ts +318 -0
  117. package/tests/state-manager.spec.ts +251 -0
  118. package/tests/summary-prompts.spec.ts +153 -0
  119. package/tsconfig.json +49 -0
@@ -0,0 +1,54 @@
1
+ {
2
+ "agents": [
3
+ {
4
+ "id": "agent-architect",
5
+ "name": "System Architect",
6
+ "role": "architect",
7
+ "model": "anthropic/claude-sonnet-4.5",
8
+ "provider": "openrouter",
9
+ "temperature": 0.5,
10
+ "enabled": true
11
+ },
12
+ {
13
+ "id": "agent-performance",
14
+ "name": "Performance Engineer",
15
+ "role": "performance",
16
+ "model": "anthropic/claude-sonnet-4.5",
17
+ "provider": "openrouter",
18
+ "temperature": 0.6,
19
+ "enabled": true
20
+ },
21
+ {
22
+ "id": "agent-security",
23
+ "name": "Security Specialist",
24
+ "role": "security",
25
+ "model": "gpt-4",
26
+ "provider": "openai",
27
+ "temperature": 0.4,
28
+ "enabled": true
29
+ }
30
+ ],
31
+ "judge": {
32
+ "id": "judge-main",
33
+ "name": "Technical Judge",
34
+ "role": "generalist",
35
+ "model": "anthropic/claude-3-opus",
36
+ "provider": "openrouter",
37
+ "temperature": 0.5
38
+ },
39
+ "debate": {
40
+ "rounds": 3,
41
+ "terminationCondition": {
42
+ "type": "fixed"
43
+ },
44
+ "synthesisMethod": "judge",
45
+ "includeFullHistory": true,
46
+ "timeoutPerRound": 300000,
47
+ "summarization": {
48
+ "enabled": true,
49
+ "threshold": 2500,
50
+ "maxLength": 1250,
51
+ "method": "length-based"
52
+ }
53
+ }
54
+ }
@@ -0,0 +1,97 @@
1
+ {
2
+ "evaluation": {
3
+ "functional_completeness": {
4
+ "average_score": 10
5
+ },
6
+ "non_functional": {
7
+ "performance_scalability": {
8
+ "average_score": 10
9
+ },
10
+ "security": {
11
+ "average_score": 10
12
+ },
13
+ "maintainability_evolvability": {
14
+ "average_score": 10
15
+ },
16
+ "regulatory_compliance": {
17
+ "average_score": 9
18
+ },
19
+ "testability": {
20
+ "average_score": 9
21
+ }
22
+ }
23
+ },
24
+ "overall_score": 10,
25
+ "agents": {
26
+ "eval-1": {
27
+ "evaluation": {
28
+ "functional_completeness": {
29
+ "score": 10,
30
+ "reasoning": "The solution addresses all explicit requirements: student code upload, automated running/grading, persistence/auditability (PostgreSQL, immutable logs), plagiarism detection (MinHash + TurnItIn), and LMS integration (LTI 1.3). It also covers implied requirements like handling different programming languages and managing submission spikes."
31
+ },
32
+ "non_functional": {
33
+ "performance_scalability": {
34
+ "score": 10,
35
+ "reasoning": "The design is highly scalable, utilizing an event-driven architecture (Kafka/NATS), Kubernetes HPA, auto-scaling node pools, and presigned S3 uploads to handle deadline spikes (150-300 submissions in minutes). It explicitly details capacity planning and cost optimization (spot instances)."
36
+ },
37
+ "security": {
38
+ "score": 10,
39
+ "reasoning": "Security is a core design principle, employing defense-in-depth. It uses gVisor/Firecracker for kernel-level isolation of untrusted code, Kubernetes NetworkPolicies, TLS everywhere, Vault for secrets, and RBAC. This is a robust security posture for running arbitrary user code."
40
+ },
41
+ "maintainability_evolvability": {
42
+ "score": 10,
43
+ "reasoning": "The solution is built on event-driven microservices with clear component boundaries, which maximizes maintainability and reduces the blast radius of changes. The use of GitOps (ArgoCD) and a detailed operational model with runbooks and observability (OpenTelemetry, Prometheus) ensures high evolvability and ease of troubleshooting."
44
+ },
45
+ "regulatory_compliance": {
46
+ "score": 9,
47
+ "reasoning": "The solution explicitly mentions GDPR compliance via data anonymization and aligns retention policies with university governance (7 years). It includes an immutable audit log, which is crucial for compliance and auditing requirements."
48
+ },
49
+ "testability": {
50
+ "score": 9,
51
+ "reasoning": "The microservice architecture and clear interfaces (Kafka events) inherently support isolated testing. The GitOps pipeline includes CI/CD with unit and integration tests, and the roadmap includes automated load testing. The use of standard cloud components also simplifies mocking and environment setup."
52
+ }
53
+ }
54
+ },
55
+ "overall_summary": {
56
+ "strengths": "Exceptional depth in security (gVisor/Firecracker), scalability (event-driven, Kubernetes), and operations (GitOps, detailed observability stack). All requirements are met with production-ready, battle-tested patterns.",
57
+ "weaknesses": "The complexity of the proposed stack (Kafka, NATS, Kubernetes, gVisor) presents a high initial operational learning curve for the university's IT team, as acknowledged in the summary.",
58
+ "overall_score": 10
59
+ }
60
+ },
61
+ "eval-2": {
62
+ "evaluation": {
63
+ "functional_completeness": {
64
+ "score": 10,
65
+ "reasoning": "The solution addresses all explicit requirements: student code upload, automated running and grading, persistence and auditability (immutable audit log, retention policy), plagiarism detection (two-tier MinHash and TurnItIn integration), and LMS integration (LTI 1.3 Adapter). Edge cases like large files (presigned URLs) and resource contention (resource classes, quotas) are also covered."
66
+ },
67
+ "non_functional": {
68
+ "performance_scalability": {
69
+ "score": 10,
70
+ "reasoning": "Scalability is a core design principle. The architecture uses horizontal scaling (HPA, auto-scaled components), an event-driven model (Kafka/NATS) to buffer load spikes, and specific strategies for high-load scenarios (pre-warming node pools, increased Kafka partitions). Presigned uploads prevent the platform from becoming a network bottleneck. Cost optimization (Spot instances) is also explicitly addressed."
71
+ },
72
+ "security": {
73
+ "score": 10,
74
+ "reasoning": "Security is addressed with a robust defense-in-depth strategy. This includes kernel-level isolation for untrusted code (gVisor/Firecracker), network segmentation (Kubernetes NetworkPolicies), resource protection (PodSecurityPolicies, ResourceQuotas), data encryption (TLS, SSE-S3), and strong access control (Vault, RBAC). The solution explicitly mitigates the critical risk of sandbox escape."
75
+ },
76
+ "maintainability_evolvability": {
77
+ "score": 10,
78
+ "reasoning": "The solution is highly maintainable and evolvable due to its microservices architecture, clear component boundaries, and event-driven communication. GitOps (ArgoCD) ensures declarative, repeatable deployments and easy rollbacks. The roadmap explicitly plans for future language support (Java, C++, JS), demonstrating foresight for evolution. Observability (OpenTelemetry, Prometheus, Loki) is built-in, aiding troubleshooting."
79
+ },
80
+ "regulatory_compliance": {
81
+ "score": 9,
82
+ "reasoning": "The solution explicitly mentions GDPR compliance via data anonymization after course completion and aligns retention policies with university governance (7 years). It also includes an immutable audit log of state transitions, which is crucial for compliance and auditing requirements. The use of LTI 1.3 standard for LMS integration also promotes secure data handling."
83
+ },
84
+ "testability": {
85
+ "score": 9,
86
+ "reasoning": "The microservices architecture and clear interfaces (Kafka events) allow for isolated testing of components (e.g., LTI Adapter, Grading Engine). The GitOps pipeline includes automated unit and integration tests, and the deployment strategy uses Canary rollouts, which is excellent for testing changes in production safely. The roadmap includes automated load testing, indicating a strong focus on non-functional testing."
87
+ }
88
+ }
89
+ },
90
+ "overall_summary": {
91
+ "strengths": "Exceptional depth in security (gVisor/Firecracker), scalability (event-driven, HPA), and operational maturity (GitOps, Observability). All functional requirements are met with robust, production-ready patterns.",
92
+ "weaknesses": "The complexity of the proposed stack (Kubernetes, Kafka, NATS, gVisor) implies a high initial operational learning curve for the university's IT team, which is acknowledged as a risk.",
93
+ "overall_score": 10
94
+ }
95
+ }
96
+ }
97
+ }
@@ -0,0 +1,97 @@
1
+ {
2
+ "evaluation": {
3
+ "functional_completeness": {
4
+ "average_score": 10
5
+ },
6
+ "non_functional": {
7
+ "performance_scalability": {
8
+ "average_score": 10
9
+ },
10
+ "security": {
11
+ "average_score": 10
12
+ },
13
+ "maintainability_evolvability": {
14
+ "average_score": 9
15
+ },
16
+ "regulatory_compliance": {
17
+ "average_score": 9.5
18
+ },
19
+ "testability": {
20
+ "average_score": 10
21
+ }
22
+ }
23
+ },
24
+ "overall_score": 10,
25
+ "agents": {
26
+ "eval-1": {
27
+ "evaluation": {
28
+ "functional_completeness": {
29
+ "score": 10,
30
+ "reasoning": "The solution addresses all stated requirements: student upload, running/grading, persistent/auditable grades/runs (WORM storage, immutable audit trail), plagiarism detection (MinHash-LSH, TurnItIn integration), and LMS integration (implied via API Gateway/LMS/SPA block). It also covers implied requirements like performance, security, and operational resilience."
31
+ },
32
+ "non_functional": {
33
+ "performance_scalability": {
34
+ "score": 10,
35
+ "reasoning": "The design explicitly targets high performance and scalability using event-driven architecture (Kafka), presigned URLs to offload the gateway, warm container pools for low latency execution, detailed throughput targets (10x baseline), and specific optimizations like weighted Kafka lag and multi-level caching. It provides concrete SLA targets and a validation plan."
36
+ },
37
+ "security": {
38
+ "score": 10,
39
+ "reasoning": "Security is thoroughly addressed, focusing on sandbox hardening (gVisor/Firecracker, seccomp, network isolation), data protection (mTLS, encryption at rest, IAM), and image integrity (Docker Content Trust). The use of WORM storage also enhances audit security."
40
+ },
41
+ "maintainability_evolvability": {
42
+ "score": 9,
43
+ "reasoning": "The microservices and event-driven architecture promote clear separation of concerns (Submission, Sandbox, Grading, Plagiarism services). The use of Kubernetes, GitOps, and clear component specifications enhances maintainability. The design explicitly considers future enhancements (Nice-to-Have section), demonstrating evolvability, though the complexity of the Kafka/Outbox pattern adds a slight operational burden."
44
+ },
45
+ "regulatory_compliance": {
46
+ "score": 9,
47
+ "reasoning": "The solution explicitly mentions GDPR/FERPA compliance, addressing data hashing, subject access tooling, and configurable retention. The use of WORM storage (S3 Object Lock) directly addresses the auditability and non-tampering requirements essential for accreditation/compliance."
48
+ },
49
+ "testability": {
50
+ "score": 10,
51
+ "reasoning": "Testability is a core consideration, with clear component boundaries, a detailed performance testing plan (load, burst, mixed payload), and a dedicated section on Chaos Engineering to validate resilience. The use of a dedicated worker pool with a circuit breaker for the 3rd party TurnItIn integration ensures isolated testing and graceful degradation."
52
+ }
53
+ }
54
+ },
55
+ "overall_summary": {
56
+ "strengths": "Exceptional detail in addressing non-functional requirements, particularly performance, security, and auditability. The use of advanced patterns like weighted Kafka lag, warm container pools, and MinHash-LSH demonstrates a deep understanding of the problem domain and operational excellence. The solution is highly structured and includes a comprehensive implementation roadmap and risk mitigation plan.",
57
+ "weaknesses": "The complexity introduced by the Outbox pattern and managing a highly tuned Kafka cluster might require specialized expertise, potentially exceeding the 'operational complexity budget' of a typical university IT department, despite the claim.",
58
+ "overall_score": 10
59
+ }
60
+ },
61
+ "eval-2": {
62
+ "evaluation": {
63
+ "functional_completeness": {
64
+ "score": 10,
65
+ "reasoning": "The solution addresses all explicit requirements: student upload, persistent/auditable grades and runs (WORM storage, immutable audit trail), plagiarism detection (MinHash-LSH, TurnItIn integration), and LMS integration (implied via API Gateway/LMS/SPA component). It also covers implied requirements like resource isolation (sandbox) and high availability (HA PostgreSQL, Kafka replication)."
66
+ },
67
+ "non_functional": {
68
+ "performance_scalability": {
69
+ "score": 10,
70
+ "reasoning": "The solution is highly optimized for performance and scalability. Key decisions like Presigned URL uploads, Event-Driven architecture (Kafka), Warm container pools, Weighted Kafka Lag scaling, and MinHash-LSH for plagiarism directly address potential bottlenecks and provide significant headroom (10x peak capacity). Concrete SLA targets and throughput targets are provided."
71
+ },
72
+ "security": {
73
+ "score": 10,
74
+ "reasoning": "Security is comprehensively addressed, particularly the critical sandbox isolation (gVisor/Firecracker, seccomp, Falco monitoring, network isolation). Data protection includes mTLS, AES-256 encryption at rest, and granular IAM access control. Image integrity via Docker Content Trust is also included."
75
+ },
76
+ "maintainability_evolvability": {
77
+ "score": 9,
78
+ "reasoning": "The microservices and event-driven architecture promote high maintainability and low coupling. The use of GitOps (ArgoCD), clear component boundaries, and an observability stack (Prometheus, Grafana, OpenTelemetry) supports troubleshooting and evolution. The Outbox pattern ensures reliable state transitions. The only minor deduction is that the daily partitioning of the grades table, while good for performance, might require occasional maintenance for schema evolution."
79
+ },
80
+ "regulatory_compliance": {
81
+ "score": 10,
82
+ "reasoning": "The solution explicitly addresses regulatory compliance (GDPR/FERPA) through data hashing, configurable retention policies (5 years minimum), WORM storage (S3 Object Lock) for auditability, and tooling for data subject access. This is a strong and explicit coverage of compliance needs."
83
+ },
84
+ "testability": {
85
+ "score": 10,
86
+ "reasoning": "Testability is excellent. The component decomposition allows for isolated testing. The solution explicitly details performance testing (sustained, burst, mixed payload), security testing, and includes a robust plan for Chaos Engineering to validate resilience. The use of clear SLAs and metrics facilitates continuous validation."
87
+ }
88
+ }
89
+ },
90
+ "overall_summary": {
91
+ "strengths": "The solution is exceptionally detailed, covering all functional and non-functional requirements with high-confidence, production-ready patterns (Event-Driven, Outbox, Warm Pools). It provides concrete performance targets, robust security measures (sandbox hardening), and a clear, phased implementation roadmap.",
92
+ "weaknesses": "The complexity of integrating MinHash-LSH and the reliance on specific Kafka expertise are minor operational challenges, but these are acknowledged and mitigated in the plan.",
93
+ "overall_score": 10
94
+ }
95
+ }
96
+ }
97
+ }