dialectic 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (119) hide show
  1. package/.cursor/commands/setup-test.mdc +175 -0
  2. package/.cursor/rules/basic-code-cleanup.mdc +1110 -0
  3. package/.cursor/rules/riper5.mdc +96 -0
  4. package/.env.example +6 -0
  5. package/AGENTS.md +1052 -0
  6. package/LICENSE +21 -0
  7. package/README.md +93 -0
  8. package/WARP.md +113 -0
  9. package/dialectic-1.0.0.tgz +0 -0
  10. package/dialectic.js +10 -0
  11. package/docs/commands.md +375 -0
  12. package/docs/configuration.md +882 -0
  13. package/docs/context_summarization.md +1023 -0
  14. package/docs/debate_flow.md +1127 -0
  15. package/docs/eval_flow.md +795 -0
  16. package/docs/evaluator.md +141 -0
  17. package/examples/debate-config-openrouter.json +48 -0
  18. package/examples/debate_config1.json +48 -0
  19. package/examples/eval/eval1/eval_config1.json +13 -0
  20. package/examples/eval/eval1/result1.json +62 -0
  21. package/examples/eval/eval1/result2.json +97 -0
  22. package/examples/eval_summary_format.md +11 -0
  23. package/examples/example3/debate-config.json +64 -0
  24. package/examples/example3/eval_config2.json +25 -0
  25. package/examples/example3/problem.md +17 -0
  26. package/examples/example3/rounds_test/eval_run.sh +16 -0
  27. package/examples/example3/rounds_test/run_test.sh +16 -0
  28. package/examples/kata1/architect-only-solution_2-rounds.json +121 -0
  29. package/examples/kata1/architect-perf-solution_2-rounds.json +234 -0
  30. package/examples/kata1/debate-config-kata1.json +54 -0
  31. package/examples/kata1/eval_architect-only_2-rounds.json +97 -0
  32. package/examples/kata1/eval_architect-perf_2-rounds.json +97 -0
  33. package/examples/kata1/kata1-report.md +12224 -0
  34. package/examples/kata1/kata1-report_temps-01_01_01_07.md +2451 -0
  35. package/examples/kata1/kata1.md +5 -0
  36. package/examples/kata1/meta.txt +1 -0
  37. package/examples/kata2/debate-config.json +54 -0
  38. package/examples/kata2/eval_config1.json +21 -0
  39. package/examples/kata2/eval_config2.json +25 -0
  40. package/examples/kata2/kata2.md +5 -0
  41. package/examples/kata2/only_architect/debate-config.json +45 -0
  42. package/examples/kata2/only_architect/eval_run.sh +11 -0
  43. package/examples/kata2/only_architect/run_test.sh +5 -0
  44. package/examples/kata2/rounds_test/eval_run.sh +11 -0
  45. package/examples/kata2/rounds_test/run_test.sh +5 -0
  46. package/examples/kata2/summary_length_test/eval_run.sh +11 -0
  47. package/examples/kata2/summary_length_test/eval_run_w_clarify.sh +7 -0
  48. package/examples/kata2/summary_length_test/run_test.sh +5 -0
  49. package/examples/task-queue/debate-config.json +76 -0
  50. package/examples/task-queue/debate_report.md +566 -0
  51. package/examples/task-queue/task-queue-system.md +25 -0
  52. package/jest.config.ts +13 -0
  53. package/multi_agent_debate_spec.md +2980 -0
  54. package/package.json +38 -0
  55. package/sanity-check-problem.txt +9 -0
  56. package/src/agents/prompts/architect-prompts.ts +203 -0
  57. package/src/agents/prompts/generalist-prompts.ts +157 -0
  58. package/src/agents/prompts/index.ts +41 -0
  59. package/src/agents/prompts/judge-prompts.ts +19 -0
  60. package/src/agents/prompts/kiss-prompts.ts +230 -0
  61. package/src/agents/prompts/performance-prompts.ts +142 -0
  62. package/src/agents/prompts/prompt-types.ts +68 -0
  63. package/src/agents/prompts/security-prompts.ts +149 -0
  64. package/src/agents/prompts/shared.ts +144 -0
  65. package/src/agents/prompts/testing-prompts.ts +149 -0
  66. package/src/agents/role-based-agent.ts +386 -0
  67. package/src/cli/commands/debate.ts +761 -0
  68. package/src/cli/commands/eval.ts +475 -0
  69. package/src/cli/commands/report.ts +265 -0
  70. package/src/cli/index.ts +79 -0
  71. package/src/core/agent.ts +198 -0
  72. package/src/core/clarifications.ts +34 -0
  73. package/src/core/judge.ts +257 -0
  74. package/src/core/orchestrator.ts +432 -0
  75. package/src/core/state-manager.ts +322 -0
  76. package/src/eval/evaluator-agent.ts +130 -0
  77. package/src/eval/prompts/system.md +41 -0
  78. package/src/eval/prompts/user.md +64 -0
  79. package/src/providers/llm-provider.ts +25 -0
  80. package/src/providers/openai-provider.ts +84 -0
  81. package/src/providers/openrouter-provider.ts +122 -0
  82. package/src/providers/provider-factory.ts +64 -0
  83. package/src/types/agent.types.ts +141 -0
  84. package/src/types/config.types.ts +47 -0
  85. package/src/types/debate.types.ts +237 -0
  86. package/src/types/eval.types.ts +85 -0
  87. package/src/utils/common.ts +104 -0
  88. package/src/utils/context-formatter.ts +102 -0
  89. package/src/utils/context-summarizer.ts +143 -0
  90. package/src/utils/env-loader.ts +46 -0
  91. package/src/utils/exit-codes.ts +5 -0
  92. package/src/utils/id.ts +11 -0
  93. package/src/utils/logger.ts +48 -0
  94. package/src/utils/paths.ts +10 -0
  95. package/src/utils/progress-ui.ts +313 -0
  96. package/src/utils/prompt-loader.ts +79 -0
  97. package/src/utils/report-generator.ts +301 -0
  98. package/tests/clarifications.spec.ts +128 -0
  99. package/tests/cli.debate.spec.ts +144 -0
  100. package/tests/config-loading.spec.ts +206 -0
  101. package/tests/context-summarizer.spec.ts +131 -0
  102. package/tests/debate-config-custom.json +38 -0
  103. package/tests/env-loader.spec.ts +149 -0
  104. package/tests/eval.command.spec.ts +1191 -0
  105. package/tests/logger.spec.ts +19 -0
  106. package/tests/openai-provider.spec.ts +26 -0
  107. package/tests/openrouter-provider.spec.ts +279 -0
  108. package/tests/orchestrator-summary.spec.ts +386 -0
  109. package/tests/orchestrator.spec.ts +207 -0
  110. package/tests/prompt-loader.spec.ts +52 -0
  111. package/tests/prompts/architect.md +16 -0
  112. package/tests/provider-factory.spec.ts +150 -0
  113. package/tests/report.command.spec.ts +546 -0
  114. package/tests/role-based-agent-summary.spec.ts +476 -0
  115. package/tests/security-agent.spec.ts +221 -0
  116. package/tests/shared-prompts.spec.ts +318 -0
  117. package/tests/state-manager.spec.ts +251 -0
  118. package/tests/summary-prompts.spec.ts +153 -0
  119. package/tsconfig.json +49 -0
@@ -0,0 +1,141 @@
1
+ # Evaluator Command
2
+
3
+ The `eval` command runs one or more evaluator agents to assess the outcome of a completed debate stored as a debate state JSON file.
4
+
5
+ ## Usage
6
+
7
+ ```bash
8
+ # Basic usage (stdout Markdown table by default)
9
+ dialectic eval --config ./eval-config.json --debate ./debates/deb-20250101-010203-ABC.json
10
+
11
+ # With environment file and verbose diagnostics
12
+ dialectic eval --config ./eval-config.json --debate ./deb.json --env-file ./.env --verbose
13
+
14
+ # Write aggregated JSON output
15
+ dialectic eval --config ./eval-config.json --debate ./deb.json --output ./result.json
16
+ ```
17
+
18
+ Options:
19
+ - `-c, --config <path>`: Required. Evaluator configuration file path.
20
+ - `-d, --debate <path>`: Required. Debate JSON (saved `DebateState`) file path.
21
+ - `--env-file <path>`: Optional. Path to environment file.
22
+ - `-v, --verbose`: Optional. Verbose diagnostic output to stderr.
23
+ - `-o, --output <path>`: Optional. Output destination.
24
+ - If path ends with `.json`, writes aggregated JSON output (including per-agent results).
25
+   - Otherwise, writes a Markdown table to the given path (or prints it to stdout when no output path is provided).
26
+
27
+ Exit codes:
28
+ - `0`: Success
29
+ - `2`: Invalid arguments (missing files/fields, malformed JSON, missing final solution)
30
+ - `4`: Configuration error (e.g., missing API keys)
31
+
32
+ ## Evaluator Configuration
33
+
34
+ Schema (root):
35
+ ```json
36
+ {
37
+ "agents": [
38
+ {
39
+ "id": "eval-1",
40
+ "name": "Evaluator 1",
41
+ "model": "gpt-4",
42
+ "provider": "openai",
43
+ "systemPromptPath": "./prompts/system.md",
44
+ "userPromptPath": "./prompts/user.md",
45
+ "timeout": 30000,
46
+ "enabled": true
47
+ }
48
+ ]
49
+ }
50
+ ```
51
+
52
+ Notes:
53
+ - `role` is not required.
54
+ - `temperature` is ignored; evaluators always use temperature 0.1.
55
+ - `timeout` units: milliseconds. Default 30000.
56
+
57
+ ## Input Debate JSON
58
+ - Must contain non-empty `problem` and `finalSolution.description`.
59
+ - If missing, the command fails with exit code 2.
60
+ - Clarifications (if any) are included in the evaluator context as fenced code blocks; skipped answers appear as `NA`.
61
+
62
+ ## Default Prompts
63
+ Built-in prompts are bundled in the code and used if files are missing/invalid:
64
+ - `src/eval/prompts/system.md`
65
+ - `src/eval/prompts/user.md`
66
+
67
+ ## Evaluator Output Contract
68
+ Evaluators must return ONLY a JSON object with this structure:
69
+ ```json
70
+ {
71
+ "evaluation": {
72
+ "functional_completeness": { "score": 1, "reasoning": "..." },
73
+ "non_functional": {
74
+ "performance_scalability": { "score": 1, "reasoning": "..." },
75
+ "security": { "score": 1, "reasoning": "..." },
76
+ "maintainability_evolvability": { "score": 1, "reasoning": "..." },
77
+ "regulatory_compliance": { "score": 1, "reasoning": "..." },
78
+ "testability": { "score": 1, "reasoning": "..." }
79
+ }
80
+ },
81
+ "overall_summary": {
82
+ "strengths": "...",
83
+ "weaknesses": "...",
84
+ "overall_score": 1
85
+ }
86
+ }
87
+ ```
88
+
89
+ Rules:
90
+ - Scores must be integers in the range 1..10.
91
+ - If a score is unavailable, omit that field; the system averages only present values and warns.
92
+ - Out-of-range scores are clamped to [1..10] with a warning.
93
+ - Non-numeric scores are ignored with a warning.
94
+ - Extra keys are ignored (warned in verbose mode).
95
+
96
+ ## Aggregation and Output
97
+ - Categories averaged:
98
+ - Functional Completeness
99
+ - Performance & Scalability
100
+ - Security
101
+ - Maintainability & Evolvability
102
+ - Regulatory Compliance
103
+ - Testability
104
+ - Overall Score
105
+ - Averaging uses only present, valid scores; results are rounded to 2 decimals. A category missing across all agents is displayed as `N/A`.
106
+
107
+ ### Markdown Output
108
+ A single Markdown table is printed to stdout by default:
109
+ ```
110
+ | Functional Completeness | Performance & Scalability | Security | Maintainability & Evolvability | Regulatory Compliance | Testability | Overall Score |
111
+ |------------------------|---------------------------|----------|-------------------------------|------------------------|------------|---------------|
112
+ | 7.50 | 6.00 | 8.00 | 7.00 | N/A | 7.00 | 7.20 |
113
+ ```
114
+
115
+ ### JSON Output
116
+ When `--output` ends with `.json`, the file contains averages and per-agent results:
117
+ ```json
118
+ {
119
+ "evaluation": {
120
+ "functional_completeness": { "average_score": 7.5 },
121
+ "non_functional": {
122
+ "performance_scalability": { "average_score": 6.0 },
123
+ "security": { "average_score": 8.0 },
124
+ "maintainability_evolvability": { "average_score": 7.0 },
125
+ "regulatory_compliance": { "average_score": null },
126
+ "testability": { "average_score": 7.0 }
127
+ }
128
+ },
129
+ "overall_score": 7.2,
130
+ "agents": {
131
+ "eval-1": {}
132
+ }
133
+ }
134
+ ```
135
+
136
+ ## Verbose Mode
137
+ With `--verbose`, stderr includes:
138
+ - Provider/model per agent
139
+ - Prompt sources (built-in vs file path)
140
+ - Per-agent latency and any timeout
141
+ - JSON parsing/clamping/ignored-field warnings
@@ -0,0 +1,48 @@
1
+ {
2
+ "agents": [
3
+ {
4
+ "id": "agent-architect",
5
+ "name": "System Architect",
6
+ "role": "architect",
7
+ "model": "anthropic/claude-sonnet-4.5",
8
+ "provider": "openrouter",
9
+ "temperature": 0.5,
10
+ "enabled": true
11
+ },
12
+ {
13
+ "id": "agent-performance",
14
+ "name": "Performance Engineer",
15
+ "role": "performance",
16
+ "model": "anthropic/claude-3-opus",
17
+ "provider": "openrouter",
18
+ "temperature": 0.6,
19
+ "enabled": true
20
+ },
21
+ {
22
+ "id": "agent-security",
23
+ "name": "Security Specialist",
24
+ "role": "security",
25
+ "model": "gpt-4",
26
+ "provider": "openai",
27
+ "temperature": 0.4,
28
+ "enabled": true
29
+ }
30
+ ],
31
+ "judge": {
32
+ "id": "judge-main",
33
+ "name": "Technical Judge",
34
+ "role": "generalist",
35
+ "model": "openai/gpt-4",
36
+ "provider": "openrouter",
37
+ "temperature": 0.3
38
+ },
39
+ "debate": {
40
+ "rounds": 3,
41
+ "terminationCondition": {
42
+ "type": "fixed"
43
+ },
44
+ "synthesisMethod": "judge",
45
+ "includeFullHistory": true,
46
+ "timeoutPerRound": 300000
47
+ }
48
+ }
@@ -0,0 +1,48 @@
1
+ {
2
+ "agents": [
3
+ {
4
+ "id": "agent-architect",
5
+ "name": "System Architect",
6
+ "role": "architect",
7
+ "model": "gpt-4",
8
+ "provider": "openai",
9
+ "temperature": 0.5,
10
+ "enabled": true
11
+ },
12
+ {
13
+ "id": "agent-performance",
14
+ "name": "Performance Engineer",
15
+ "role": "performance",
16
+ "model": "gpt-4",
17
+ "provider": "openai",
18
+ "temperature": 0.6,
19
+ "enabled": true
20
+ },
21
+ {
22
+ "id": "agent-security",
23
+ "name": "Security Specialist",
24
+ "role": "security",
25
+ "model": "gpt-4",
26
+ "provider": "openai",
27
+ "temperature": 0.4,
28
+ "enabled": true
29
+ }
30
+ ],
31
+ "judge": {
32
+ "id": "judge-main",
33
+ "name": "Technical Judge",
34
+ "role": "generalist",
35
+ "model": "gpt-4",
36
+ "provider": "openai",
37
+ "temperature": 0.3
38
+ },
39
+ "debate": {
40
+ "rounds": 4,
41
+ "terminationCondition": {
42
+ "type": "fixed"
43
+ },
44
+ "synthesisMethod": "judge",
45
+ "includeFullHistory": true,
46
+ "timeoutPerRound": 450000
47
+ }
48
+ }
@@ -0,0 +1,13 @@
1
+ {
2
+ "agents": [
3
+ {
4
+ "id": "eval-1",
5
+ "name": "Gemini Flash Evaluator",
6
+ "model": "google/gemini-2.5-flash-preview-09-2025",
7
+ "provider": "openrouter",
8
+ "timeout": 30000,
9
+ "enabled": true
10
+ }
11
+ ]
12
+ }
13
+
@@ -0,0 +1,62 @@
1
+ {
2
+ "evaluation": {
3
+ "functional_completeness": {
4
+ "average_score": 9
5
+ },
6
+ "non_functional": {
7
+ "performance_scalability": {
8
+ "average_score": 10
9
+ },
10
+ "security": {
11
+ "average_score": 10
12
+ },
13
+ "maintainability_evolvability": {
14
+ "average_score": 9
15
+ },
16
+ "regulatory_compliance": {
17
+ "average_score": 9
18
+ },
19
+ "testability": {
20
+ "average_score": 9
21
+ }
22
+ }
23
+ },
24
+ "overall_score": 9,
25
+ "agents": {
26
+ "eval-1": {
27
+ "evaluation": {
28
+ "functional_completeness": {
29
+ "score": 9,
30
+ "reasoning": "All core functional requirements (upload, run/grade, persistence/audit, plagiarism detection, LMS integration) are addressed, with detailed technical solutions proposed for each. The solution goes beyond the minimum requirements."
31
+ },
32
+ "non_functional": {
33
+ "performance_scalability": {
34
+ "score": 10,
35
+ "reasoning": "Excellent focus on performance and scalability, including a phased architecture (monolith to microservices), multi-tier caching (L1/L2 Redis), optimized database design (partitioning, materialized views), and high-performance execution environment (Firecracker microVMs, pre-warmed VMs, dynamic scaling). Targets are concrete (150+ submissions/min, 95%+ cache hit rate)."
36
+ },
37
+ "security": {
38
+ "score": 10,
39
+ "reasoning": "Robust security measures are detailed, specifically addressing execution isolation (Firecracker), data protection (JWT rotation, device fingerprinting), auditability, and proactive measures (dependency scanning, patching)."
40
+ },
41
+ "maintainability_evolvability": {
42
+ "score": 9,
43
+ "reasoning": "The phased implementation (monolith MVP to microservices via Strangler Fig) is a strong strategy for maintainability and controlled evolution. The rigorous performance validation and continuous profiling also contribute to long-term health."
44
+ },
45
+ "regulatory_compliance": {
46
+ "score": 9,
47
+ "reasoning": "Explicit mention of compliance with FERPA, GDPR, and SOC 2 frameworks indicates a strong commitment to regulatory requirements, which is crucial for a university system handling student data."
48
+ },
49
+ "testability": {
50
+ "score": 9,
51
+ "reasoning": "The solution includes rigorous performance validation, comprehensive load testing scenarios (soak, spike), continuous profiling, and automated performance regression gates in CI/CD, all of which enhance testability and quality assurance."
52
+ }
53
+ }
54
+ },
55
+ "overall_summary": {
56
+ "strengths": "The solution is exceptionally detailed, addressing all non-functional requirements with concrete, high-quality technical choices (Firecracker, PostgreSQL partitioning, LSH for plagiarism, phased architecture). It demonstrates a strong understanding of security, scalability, and cost optimization.",
57
+ "weaknesses": "The initial monolith phase, while pragmatic, introduces a temporary architectural debt. The complexity of the multi-tier caching and microVM management requires significant operational expertise.",
58
+ "overall_score": 9
59
+ }
60
+ }
61
+ }
62
+ }
@@ -0,0 +1,97 @@
1
+ {
2
+ "evaluation": {
3
+ "functional_completeness": {
4
+ "average_score": 9
5
+ },
6
+ "non_functional": {
7
+ "performance_scalability": {
8
+ "average_score": 10
9
+ },
10
+ "security": {
11
+ "average_score": 9.5
12
+ },
13
+ "maintainability_evolvability": {
14
+ "average_score": 9
15
+ },
16
+ "regulatory_compliance": {
17
+ "average_score": 9
18
+ },
19
+ "testability": {
20
+ "average_score": 8
21
+ }
22
+ }
23
+ },
24
+ "overall_score": 9,
25
+ "agents": {
26
+ "eval-1": {
27
+ "evaluation": {
28
+ "functional_completeness": {
29
+ "score": 9,
30
+ "reasoning": "The solution explicitly addresses all core requirements: student code upload/execution/grading, persistence/auditability (PostgreSQL, audit logging), plagiarism detection (LSH, external submission), and integration with the LMS (implied by the overall system context, though specific integration details are light). The phased approach ensures initial functional delivery."
31
+ },
32
+ "non_functional": {
33
+ "performance_scalability": {
34
+ "score": 10,
35
+ "reasoning": "The solution provides extensive and detailed strategies for performance and scalability, including multi-tier caching (L1/L2), high-performance execution using Firecracker microVMs with pre-warmed pools, dynamic scaling, optimized database design (partitioning, materialized views), and rigorous performance validation (load testing, profiling). It explicitly targets high throughput (150+ submissions/min) and low latency."
36
+ },
37
+ "security": {
38
+ "score": 10,
39
+ "reasoning": "Security is addressed comprehensively, covering execution isolation (Firecracker microVMs), data protection (secure JWT, audit logging), application security (dependency scanning), and regulatory compliance (explicit mention of FERPA, GDPR, SOC 2). This demonstrates a robust security posture."
40
+ },
41
+ "maintainability_evolvability": {
42
+ "score": 9,
43
+ "reasoning": "The solution explicitly adopts a phased implementation starting with a pragmatic monolith and evolving to microservices using the Strangler Fig pattern. This is a strong strategy for maintainability and controlled evolution. Decomposition is planned, and the use of clear components (caching, execution environment) aids maintenance."
44
+ },
45
+ "regulatory_compliance": {
46
+ "score": 9,
47
+ "reasoning": "The solution explicitly mentions compliance with FERPA, GDPR, and SOC 2 frameworks, indicating that regulatory impact on student data privacy and security has been considered in the design process."
48
+ },
49
+ "testability": {
50
+ "score": 8,
51
+ "reasoning": "Testability is addressed through the mention of automated performance regression gates in CI/CD and comprehensive load testing scenarios. The component separation (even in the planned microservices phase) and isolated execution environment (Firecracker) inherently support better unit and integration testing, though explicit strategies for mocking external services (like TurnItIn) or component-level testing are not detailed."
52
+ }
53
+ }
54
+ },
55
+ "overall_summary": {
56
+ "strengths": "Exceptional focus on performance, scalability, and security, utilizing modern technologies like Firecracker microVMs and LSH for plagiarism detection. The phased implementation approach (monolith to microservices via Strangler Fig) is pragmatic and reduces initial risk.",
57
+ "weaknesses": "Specific details on the LMS integration and the initial 'pragmatic monolith' structure are light, which could impact initial implementation clarity.",
58
+ "overall_score": 9
59
+ }
60
+ },
61
+ "eval-2": {
62
+ "evaluation": {
63
+ "functional_completeness": {
64
+ "score": 9,
65
+ "reasoning": "The solution addresses all core functional requirements: code upload, execution/grading, persistence/auditability (PostgreSQL, audit logging), plagiarism detection (LSH, TurnItIn integration), and LMS integration (implied by the overall system context, though not detailed). The phased approach ensures initial functionality is delivered."
66
+ },
67
+ "non_functional": {
68
+ "performance_scalability": {
69
+ "score": 10,
70
+ "reasoning": "Performance and scalability are primary considerations. The solution explicitly details high-performance execution (Firecracker microVMs, pre-warmed pool, 150+ submissions/min burst), multi-tier caching (L1/L2, Redis), optimized database design (partitioning, materialized views), and a clear scaling path (monolith to microservices). This is a robust plan for scale."
71
+ },
72
+ "security": {
73
+ "score": 9,
74
+ "reasoning": "Security is well-addressed, particularly for code execution isolation (Firecracker), data protection (JWT rotation, audit logging), and compliance (FERPA, GDPR, SOC 2 frameworks mentioned). The focus on automated scanning and patching also contributes significantly."
75
+ },
76
+ "maintainability_evolvability": {
77
+ "score": 9,
78
+ "reasoning": "The solution explicitly adopts a phased approach (monolith to microservices via Strangler Fig) which is a strong pattern for evolvability. Decomposition is planned, and the use of clear components (caching, execution environment) enhances maintainability. Continuous profiling aids troubleshooting."
79
+ },
80
+ "regulatory_compliance": {
81
+ "score": 9,
82
+ "reasoning": "The solution explicitly mentions compliance with key frameworks relevant to educational data (FERPA, GDPR, SOC 2), indicating that regulatory requirements have been factored into the security and data handling design."
83
+ },
84
+ "testability": {
85
+ "score": 8,
86
+ "reasoning": "Testability is addressed through comprehensive load testing scenarios, continuous profiling, and automated performance regression gates in CI/CD. The component separation (e.g., Firecracker for execution) suggests isolated testing is possible, though specific strategies for unit/integration testing of the core logic are not detailed."
87
+ }
88
+ }
89
+ },
90
+ "overall_summary": {
91
+ "strengths": "Exceptional focus on non-functional requirements, particularly performance, scalability, and security, using modern, high-confidence technologies (Firecracker, LSH, multi-tier caching). The pragmatic, phased implementation approach (monolith to microservices) minimizes initial risk while ensuring future growth capacity.",
92
+ "weaknesses": "LMS integration details are sparse. The initial 'pragmatic monolith' phase, while good for speed, requires careful management to ensure the Strangler Fig migration path remains viable.",
93
+ "overall_score": 9
94
+ }
95
+ }
96
+ }
97
+ }
@@ -0,0 +1,11 @@
1
+
2
+ Extract all the different scores (each sub-score) from all these files and put them in a markdown table, allowing the results to be compared.
3
+ The table columns should be:
4
+ - file name
5
+ - functional_completeness score
6
+ - performance_scalability score
7
+ - security score
8
+ - maintainability_evolvability score
9
+ - regulatory_compliance score
10
+ - testability score.
11
+ - overall_score (taken from the root object).
@@ -0,0 +1,64 @@
1
+ {
2
+ "agents": [
3
+ {
4
+ "id": "agent-architect",
5
+ "name": "System Architect",
6
+ "role": "architect",
7
+ "model": "google/gemini-2.5-flash-lite",
8
+ "provider": "openrouter",
9
+ "temperature": 0.5,
10
+ "enabled": true
11
+ },
12
+ {
13
+ "id": "agent-performance",
14
+ "name": "Performance Engineer",
15
+ "role": "performance",
16
+ "model": "google/gemini-2.5-flash-lite",
17
+ "provider": "openrouter",
18
+ "temperature": 0.5,
19
+ "enabled": true
20
+ },
21
+ {
22
+ "id": "agent-security",
23
+ "name": "Security Specialist",
24
+ "role": "security",
25
+ "model": "google/gemini-2.5-flash-lite",
26
+ "provider": "openrouter",
27
+ "temperature": 0.5,
28
+ "enabled": true
29
+ },
30
+ {
31
+ "id": "agent-kiss",
32
+ "name": "KISS Advocate",
33
+ "role": "kiss",
34
+ "model": "google/gemini-2.5-flash-lite",
35
+ "provider": "openrouter",
36
+ "temperature": 0.5,
37
+ "enabled": true
38
+ }
39
+ ],
40
+ "judge": {
41
+ "id": "judge-main",
42
+ "name": "Technical Judge",
43
+ "role": "generalist",
44
+ "model": "google/gemini-2.5-flash-lite",
45
+ "provider": "openrouter",
46
+ "temperature": 0.5
47
+ },
48
+ "debate": {
49
+ "rounds": 3,
50
+ "terminationCondition": {
51
+ "type": "fixed"
52
+ },
53
+ "synthesisMethod": "judge",
54
+ "includeFullHistory": true,
55
+ "timeoutPerRound": 300000,
56
+ "summarization": {
57
+ "enabled": true,
58
+ "threshold": 10000,
59
+ "maxLength": 5000,
60
+ "method": "length-based"
61
+ }
62
+ }
63
+ }
64
+
@@ -0,0 +1,25 @@
1
+ {
2
+ "agents": [
3
+ {
4
+ "id": "eval-1",
5
+       "name": "Gemini Flash Lite Evaluator 1",
6
+ "model": "google/gemini-2.5-flash-lite",
7
+ "provider": "openrouter",
8
+ "timeout": 30000,
9
+ "enabled": true,
10
+ "systemPromptPath": "../eval_system.md",
11
+ "userPromptPath": "../eval_user.md"
12
+ },
13
+ {
14
+ "id": "eval-2",
15
+       "name": "Gemini Flash Lite Evaluator 2",
16
+ "model": "google/gemini-2.5-flash-lite",
17
+ "provider": "openrouter",
18
+ "timeout": 30000,
19
+ "enabled": true,
20
+ "systemPromptPath": "../eval_system.md",
21
+ "userPromptPath": "../eval_user.md"
22
+ }
23
+ ]
24
+ }
25
+
@@ -0,0 +1,17 @@
1
+ I’m designing a system where we have a backend (API + admin/back office) and a frontend with active users. The scenario is something like this:
2
+
3
+ We have around 100 daily active users, potentially scaling to 1000+ in the future.
4
+
5
+ From the back office, admins can post notifications or messages (e.g., “maintenance at 12:00”) that should appear in real time on the frontend.
6
+
7
+ Right now, we are using polling from the frontend to check for updates every 30 seconds or so.
8
+
9
+ I’m considering switching to a WebSocket approach, where the backend pushes the message to all connected clients immediately.
10
+
11
+ My questions are:
12
+
13
+ What are the main benefits and trade-offs of using WebSockets vs polling in scenarios like this?
14
+
15
+ Are there specific factors (number of requests, latency, server resources, scaling) that would make you choose one over the other?
16
+
17
+ Any experiences with scaling this kind of system from tens to thousands of users?
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+
3
+ # Base paths
4
+ BASE_DIR="examples/example3"
5
+ OUTPUT_DIR="/mnt/c/tmp/dialectic/example3/rounds_test"
6
+
7
+ # Ensure output directory exists
8
+ mkdir -p "$OUTPUT_DIR"
9
+
10
+ # Run evaluations for all debate outputs
11
+ dialectic eval -c ./$BASE_DIR/eval_config2.json -d $OUTPUT_DIR/all_agents_1R_no_clarify.json -v -o $OUTPUT_DIR/eval2_all_agents_1R_no_clarify.json
12
+ dialectic eval -c ./$BASE_DIR/eval_config2.json -d $OUTPUT_DIR/all_agents_2R_no_clarify.json -v -o $OUTPUT_DIR/eval2_all_agents_2R_no_clarify.json
13
+ dialectic eval -c ./$BASE_DIR/eval_config2.json -d $OUTPUT_DIR/all_agents_3R_no_clarify.json -v -o $OUTPUT_DIR/eval2_all_agents_3R_no_clarify.json
14
+ # dialectic eval -c ./$BASE_DIR/eval_config2.json -d $OUTPUT_DIR/all_agents_4R_no_clarify.json -v -o $OUTPUT_DIR/eval2_all_agents_4R_no_clarify.json
15
+ # dialectic eval -c ./$BASE_DIR/eval_config2.json -d $OUTPUT_DIR/all_agents_5R_no_clarify.json -v -o $OUTPUT_DIR/eval2_all_agents_5R_no_clarify.json
16
+
@@ -0,0 +1,16 @@
1
+ #!/bin/bash
2
+
3
+ # Base paths
4
+ BASE_DIR="examples/example3"
5
+ OUTPUT_DIR="/c/tmp/dialectic/example3/rounds_test"
6
+
7
+ # Ensure output directory exists
8
+ mkdir -p "$OUTPUT_DIR"
9
+
10
+ # Run debates with rounds 1-5
11
+ dialectic debate -r 1 -c "$BASE_DIR/debate-config.json" -o "$OUTPUT_DIR/all_agents_1R_no_clarify.json" -p "$BASE_DIR/problem.md" -v
12
+ dialectic debate -r 2 -c "$BASE_DIR/debate-config.json" -o "$OUTPUT_DIR/all_agents_2R_no_clarify.json" -p "$BASE_DIR/problem.md" -v
13
+ dialectic debate -r 3 -c "$BASE_DIR/debate-config.json" -o "$OUTPUT_DIR/all_agents_3R_no_clarify.json" -p "$BASE_DIR/problem.md" -v
14
+ # dialectic debate -r 4 -c "$BASE_DIR/debate-config.json" -o "$OUTPUT_DIR/all_agents_4R_no_clarify.json" -p "$BASE_DIR/problem.md" -v
15
+ # dialectic debate -r 5 -c "$BASE_DIR/debate-config.json" -o "$OUTPUT_DIR/all_agents_5R_no_clarify.json" -p "$BASE_DIR/problem.md" -v
16
+