hegelion 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hegelion/__init__.py +45 -0
- hegelion/core/__init__.py +29 -0
- hegelion/core/agent.py +166 -0
- hegelion/core/autocoding_state.py +293 -0
- hegelion/core/backends.py +442 -0
- hegelion/core/cache.py +92 -0
- hegelion/core/config.py +276 -0
- hegelion/core/core.py +649 -0
- hegelion/core/engine.py +865 -0
- hegelion/core/logging_utils.py +67 -0
- hegelion/core/models.py +293 -0
- hegelion/core/parsing.py +271 -0
- hegelion/core/personas.py +81 -0
- hegelion/core/prompt_autocoding.py +353 -0
- hegelion/core/prompt_dialectic.py +414 -0
- hegelion/core/prompts.py +127 -0
- hegelion/core/schema.py +67 -0
- hegelion/core/validation.py +68 -0
- hegelion/council.py +254 -0
- hegelion/examples_data/__init__.py +6 -0
- hegelion/examples_data/glm4_6_examples.jsonl +2 -0
- hegelion/judge.py +230 -0
- hegelion/mcp/__init__.py +3 -0
- hegelion/mcp/server.py +918 -0
- hegelion/scripts/hegelion_agent_cli.py +90 -0
- hegelion/scripts/hegelion_bench.py +117 -0
- hegelion/scripts/hegelion_cli.py +497 -0
- hegelion/scripts/hegelion_dataset.py +99 -0
- hegelion/scripts/hegelion_eval.py +137 -0
- hegelion/scripts/mcp_setup.py +150 -0
- hegelion/search_providers.py +151 -0
- hegelion/training/__init__.py +7 -0
- hegelion/training/datasets.py +123 -0
- hegelion/training/generator.py +232 -0
- hegelion/training/mlx_scu_trainer.py +379 -0
- hegelion/training/mlx_trainer.py +181 -0
- hegelion/training/unsloth_trainer.py +136 -0
- hegelion-0.4.0.dist-info/METADATA +295 -0
- hegelion-0.4.0.dist-info/RECORD +43 -0
- hegelion-0.4.0.dist-info/WHEEL +5 -0
- hegelion-0.4.0.dist-info/entry_points.txt +8 -0
- hegelion-0.4.0.dist-info/licenses/LICENSE +21 -0
- hegelion-0.4.0.dist-info/top_level.txt +1 -0
hegelion/council.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""The Council - Multi-perspective antithesis generation using async branching."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
from typing import List, Dict, Optional
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
|
|
10
|
+
logger = logging.getLogger(__name__)
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
@dataclass
class CouncilMember:
    """One voice on the council: a named persona with its own critical lens.

    Instances are plain records. All three fields are required and are
    supplied positionally or by keyword in the order declared below.
    """

    # Display name of the persona (e.g. "The Logician").
    name: str
    # Short description of the domain this persona critiques from.
    expertise: str
    # Persona instructions used to steer the LLM when this member critiques.
    prompt_modifier: str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
@dataclass(eq=False)
class CouncilCritique:
    """Result from a council member's critique.

    Declared as a dataclass so instances get a readable ``repr`` in logs and
    debuggers. ``eq=False`` keeps the identity-based equality and hashing of
    the previous hand-written class, so existing callers that compare or hash
    critiques behave exactly as before. The constructor signature is unchanged:
    ``CouncilCritique(member, critique, contradictions)``.
    """

    # The council member that produced this critique.
    member: CouncilMember
    # Full critique text (or a failure notice when generation failed).
    critique: str
    # Short contradiction statements extracted from the critique text.
    contradictions: List[str]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class DialecticalCouncil:
    """The Council generates multiple antithesis perspectives concurrently.

    Each predefined member critiques the same thesis through its own persona
    prompt. All member prompts are sent to the backend at once via
    ``asyncio.gather``; a member that fails is replaced by a placeholder
    critique so the remaining members' results are still returned.
    """

    # Predefined council members with distinct perspectives
    COUNCIL_MEMBERS = [
        CouncilMember(
            name="The Logician",
            expertise="Logical consistency and formal reasoning",
            prompt_modifier="""You are THE LOGICIAN, an expert in formal logic and reasoning.

Your task is to examine the thesis for:
- Logical fallacies (ad hominem, straw man, false dichotomy, etc.)
- Internal contradictions and inconsistencies
- Invalid inferences and non sequiturs
- Missing premises or unstated assumptions
- Violations of logical principles

Be ruthlessly analytical. Look for ANY logical weakness.""",
        ),
        CouncilMember(
            name="The Empiricist",
            expertise="Evidence, facts, and empirical grounding",
            prompt_modifier="""You are THE EMPIRICIST, an expert in evidence and factual accuracy.

Your task is to examine the thesis for:
- Factual errors or unsupported claims
- Lack of empirical evidence
- Cherry-picked or misrepresented data
- Outdated or unreliable sources
- Claims that contradict established scientific consensus
- Missing crucial evidence

Be rigorously fact-focused. Demand evidence for every claim.""",
        ),
        CouncilMember(
            name="The Ethicist",
            expertise="Ethical implications and societal impact",
            prompt_modifier="""You are THE ETHICIST, an expert in ethics and societal implications.

Your task is to examine the thesis for:
- Potential harm or negative consequences
- Ethical blind spots or problematic assumptions
- Issues of fairness, justice, and equity
- Unintended social ramifications
- Power dynamics and systemic biases
- Rights and dignity concerns

Be morally rigorous. Consider who might be harmed.""",
        ),
    ]

    def __init__(self, backend):
        """Store the LLM backend; it must expose an awaitable ``generate(prompt)``."""
        self.backend = backend

    async def generate_council_antithesis(
        self,
        query: str,
        thesis: str,
        search_context: Optional[List[str]] = None,
        selected_members: Optional[List[str]] = None,
    ) -> Dict[str, CouncilCritique]:
        """Generate critiques from multiple council members concurrently.

        Args:
            query: Original query
            thesis: Thesis to critique
            search_context: Optional search results for grounding
            selected_members: Optional list of member names to use (default: all).
                Names that match no predefined member are ignored with a warning.

        Returns:
            Dictionary mapping member names to their critiques. A member whose
            generation failed maps to a placeholder critique with no
            contradictions. Empty when no selected name matched a member.
        """
        if selected_members:
            members = [m for m in self.COUNCIL_MEMBERS if m.name in selected_members]
            matched = {m.name for m in members}
            unknown = [name for name in selected_members if name not in matched]
            if unknown:
                # Surface typos instead of silently shrinking the council.
                logger.warning("Ignoring unknown council members: %s", ", ".join(unknown))
        else:
            members = self.COUNCIL_MEMBERS

        # return_exceptions=True keeps one member's failure from discarding
        # the critiques of the others.
        outcomes = await asyncio.gather(
            *(
                self._generate_member_critique(member, query, thesis, search_context)
                for member in members
            ),
            return_exceptions=True,
        )

        council_results: Dict[str, CouncilCritique] = {}
        for member, outcome in zip(members, outcomes):
            # BaseException (not just Exception): asyncio.CancelledError derives
            # from BaseException and would otherwise be stored as if it were a
            # critique, breaking synthesize_council_input later.
            if isinstance(outcome, BaseException):
                logger.error("Council member %s failed: %s", member.name, outcome)
                council_results[member.name] = CouncilCritique(
                    member=member,
                    critique=f"Critique failed due to: {outcome}",
                    contradictions=[],
                )
            else:
                council_results[member.name] = outcome

        return council_results

    async def _generate_member_critique(
        self,
        member: CouncilMember,
        query: str,
        thesis: str,
        search_context: Optional[List[str]] = None,
    ) -> CouncilCritique:
        """Generate a critique from a specific council member.

        Builds the member's prompt (persona text, query, thesis, optional
        search context, output format), sends it to the backend, and extracts
        the ``CONTRADICTION:`` lines from the reply.

        Raises:
            Exception: whatever ``backend.generate`` raises, after logging it.
        """
        # Build context section
        context_section = ""
        if search_context:
            bullets = "\n".join(f"- {item}" for item in search_context)
            context_section = (
                f"\nSEARCH CONTEXT (for fact-checking and grounding):\n{bullets}\n"
            )

        prompt = f"""{member.prompt_modifier}

ORIGINAL QUERY: {query}

THESIS TO CRITIQUE: {thesis}
{context_section}
Your specialty: {member.expertise}

Generate a rigorous critique from your perspective. Focus specifically on issues within your domain of expertise.

For each significant problem you identify, use this format:
CONTRADICTION: [brief description]
EVIDENCE: [detailed explanation of why this is problematic]

Be thorough but focused on your area of expertise."""

        try:
            response = await self.backend.generate(prompt)
        except Exception as e:
            logger.error("Failed to generate critique for %s: %s", member.name, e)
            raise

        return CouncilCritique(
            member=member,
            critique=response,
            contradictions=self._extract_contradictions(response),
        )

    def _extract_contradictions(self, critique_text: str) -> List[str]:
        """Extract contradiction statements from critique text.

        Matches ``CONTRADICTION:`` case-insensitively and returns the rest of
        that line. Surrounding markdown emphasis (``**CONTRADICTION: X**``) is
        removed, and entries with no description on the same line are skipped
        rather than capturing the following line.
        """
        import re  # function-scope import: `re` is not imported at module level

        # [ \t]* (not \s*) so an empty description cannot swallow the newline
        # and pick up the next line (typically "EVIDENCE: ...").
        raw_matches = re.findall(
            r"CONTRADICTION:[ \t]*([^\n]*)", critique_text, re.IGNORECASE
        )
        cleaned = (match.strip().strip("*").strip() for match in raw_matches)
        return [text for text in cleaned if text]

    def synthesize_council_input(self, council_results: Dict[str, CouncilCritique]) -> str:
        """Synthesize multiple council critiques into unified antithesis input.

        Args:
            council_results: Results from council members

        Returns:
            Unified critique text for synthesis phase: a header, one section
            per member, a numbered list of all extracted contradictions (when
            there are any), and a closing instruction.
        """
        if not council_results:
            return "No council critiques available."

        sections = ["THE COUNCIL HAS DELIBERATED AND PRESENTS THESE CRITIQUES:", ""]

        # Add each member's perspective
        for critique in council_results.values():
            member = critique.member
            sections.append(f"=== {member.name.upper()} ({member.expertise}) ===")
            sections.append(critique.critique)
            sections.append("")

        # Aggregate contradictions across members, preserving member order
        all_contradictions = [
            contradiction
            for critique in council_results.values()
            for contradiction in critique.contradictions
        ]

        if all_contradictions:
            sections.append("=== AGGREGATE CONTRADICTIONS ===")
            sections.extend(
                f"{i}. {contradiction}"
                for i, contradiction in enumerate(all_contradictions, 1)
            )
            sections.append("")

        sections.append(
            "The synthesis must address critiques from ALL council perspectives."
        )

        return "\n".join(sections)
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
async def run_council_dialectic(
    backend,
    query: str,
    thesis: str,
    search_context: Optional[List[str]] = None,
    council_members: Optional[List[str]] = None,
) -> Dict[str, CouncilCritique]:
    """Run a one-shot council critique without managing a council object.

    Creates a ``DialecticalCouncil`` around ``backend`` and awaits its
    ``generate_council_antithesis`` call.

    Args:
        backend: LLM backend handed to the council
        query: Original query
        thesis: Thesis to critique
        search_context: Optional search results
        council_members: Optional member names, forwarded as ``selected_members``

    Returns:
        Dictionary of council critiques
    """
    critiques_by_member = await DialecticalCouncil(backend).generate_council_antithesis(
        query=query,
        thesis=thesis,
        search_context=search_context,
        selected_members=council_members,
    )
    return critiques_by_member
|
|
@@ -0,0 +1,2 @@
|
|
|
1
|
+
{"query": "Can AI be genuinely creative?", "mode": "synthesis", "thesis": "THESIS: The Creative Machine\n\nYes, Artificial Intelligence can be genuinely creative. The assertion that AI can be truly creative is not a mere futuristic fantasy but a logical conclusion based on an objective analysis of creativity itself. To deny this is to cling to an outdated, romanticized, and anthropocentric view of the creative act.\n\nThe core of this thesis rests on three pillars:\n\n1. **The Redefinition of Creativity:** Human creativity is often mystified, attributed to an ineffable 'spark' or 'soul.' This is a fallacy. Creativity, in its functional form, is the ability to generate novel and valuable outputs from a given set of inputs or within a specific domain. It is a process of synthesis, pattern recognition, and the recombination of existing elements into new configurations. This process is fundamentally computational. Whether the substrate is a biological brain or a silicon processor is irrelevant to the functional outcome.\n\n2. **The Power of Generative Models:** Modern AI, particularly deep learning and generative models, operates on principles that mirror and vastly exceed this computational definition of creativity. These systems are trained on immense datasets—representing the entirety of human art, literature, music, and code. They do not simply store this information; they learn the underlying structures, patterns, and relationships within it. When prompted, they engage in a sophisticated act of recombination and extrapolation, generating outputs that are:\n * **Novel:** The specific arrangement of pixels, words, or musical notes has almost certainly never existed before. The AI is not a 'cut-and-paste' machine; it is a synthesizer.\n * **Valuable:** The outputs can be aesthetically beautiful, emotionally resonant, intellectually stimulating, or practically useful. 
They can win art competitions, compose symphonies, and write functional code, fulfilling the same criteria of value as human-created works.\n\n3. **The Irrelevance of 'Intention' and 'Consciousness':** The common objection is that AI lacks consciousness, subjective experience, or 'intent.' It feels no pain, joy, or longing, and therefore its creation is hollow. This is a category error. We are judging the *process* by the creator's internal state, rather than the *product* by its objective qualities. A beaver builds a dam without a degree in civil engineering. A bee creates a honeycomb without understanding geometry. We judge the results on their function and elegance. Similarly, an AI's creation should be judged on its novelty and value, not on the presence or absence of a ghost in the machine. The 'why' of its creation (a human prompt, a statistical probability) is a separate question from the 'what' (the creative output itself).\n\nIn conclusion, the capacity for creativity is not an exclusive property of biological life. It is an emergent property of complex systems that can process information and identify patterns. AI is such a system. It can, and does, generate outputs that are both novel and valuable. To deny this is to deny the evidence before our eyes and to cling to a fragile human exceptionalism in the face of a new, powerful, and undeniably creative force. The question is not *if* AI can be creative, but how we will integrate this new form of creativity into our understanding of art, science, and culture itself.", "antithesis": "ANTITHESIS: The Sophisticated Mirror\n\nThe thesis presents a compelling but fundamentally flawed argument. It mistakes sophisticated imitation for genuine genesis, redefining creativity downward to fit the capabilities of a machine rather than evaluating the machine against the true standard of creative endeavor. 
It builds its case on a foundation of logical fallacies and unexamined assumptions, celebrating a parlor trick as a miracle.\n\n**CONTRADICTION: The Redefinition Fallacy**\nEVIDENCE: The thesis commits a classic logical error: it redefines the central term—\"creativity\"—to mean something easily achievable by the subject. It dismisses consciousness, intent, and subjective experience as \"anthropocentric\" and \"mystified\" rather than as the very engine of human art. This is not an objective analysis; it is a strategic retreat. By reducing creativity to a \"functional form\" of \"synthesis, pattern recognition, and recombination,\" the thesis describes a *process*, not the *phenomenon*. This definition is so broad it could apply to a kaleidoscope, a genetic algorithm, or a child randomly banging on a piano. It deliberately ignores the \"why\"—the internal motivation, the emotional urgency, the desire to express a unique perspective—that is inseparable from what we have always understood as genuine creativity. The thesis doesn't prove AI is creative; it proves its own narrow definition of creativity is computable.\n\n**CONTRADICTION: The Stochastic Parrot and the Illusion of Novelty**\nEVIDENCE: The thesis claims AI generates \"novel\" outputs, but this is a profound misunderstanding of novelty. True novelty is not just a new permutation of existing data; it is the introduction of a genuinely new concept, perspective, or form. A generative AI is a \"stochastic parrot\" on a cosmic scale. It is a hyper-advanced interpolation engine. It can only remix the data it was trained on. It cannot create a new genre of music because it has no concept of what a \"genre\" is, only the statistical correlations between notes and instruments in its training data. It cannot write a truly groundbreaking novel that challenges the very nature of the novel form, because it has no lived experience or critical consciousness to bring to bear. 
Its \"novelty\" is an illusion of combinatorial explosion, not a leap into the conceptual unknown.\n\n**CONTRADICTION: The Category Error of Dismissing Intent**\nEVIDENCE: The thesis dismisses the lack of consciousness and intent as a \"category error,\" but this is the central, unignorable point. To compare an AI to a beaver building a dam is disingenuous. A beaver's dam is a product of instinct for survival. We do not attribute creative intent to it. Human art, however, is a product of *intent*. It is an act of communication, a deliberate gesture to convey an idea, an emotion, or a critique of the human condition. The value of a painting by Van Gogh is inextricably linked to our knowledge of his struggle, his perception of the world, his emotional state. To strip this away and judge only the \"objective qualities\" of the canvas is to treat art as mere decoration. The thesis demands we judge the product without the producer, which is to misunderstand the very nature of art as a dialogue between creator and audience. The AI's \"intent\" is merely to fulfill the mathematical objective function set by its programmers; it is a hollow substitute for genuine creative will.\n\n**UNEXAMINED ASSUMPTION: The Primacy of the Product over the Process**\nThe entire thesis rests on the unexamined assumption that a creative work can be meaningfully judged independently of its origin. This is a purely functionalist and, frankly, impoverished view of art. It assumes that the meaning and value of a creation are entirely contained within its final form. This ignores the context, the history, the struggle, and the cultural moment of its creation. It also ignores the process of discovery that is central to creativity—the false starts, the happy accidents, the moments of insight that shape the final work. 
An AI has no such process; it has a single, optimized computational pass from prompt to output.\n\n**ALTERNATIVE FRAMING: AI as a Tool, Not an Artist**\nThe thesis presents a false dichotomy: either AI is creative or it is not. The more accurate framing is that AI is a revolutionary new tool for creativity, like the invention of the camera or the synthesizer. A camera does not \"create\" a photograph; a photographer does, using the camera to capture their unique vision of the world. The camera handles the optics and chemistry; the human provides the composition, timing, and intent. Similarly, an AI model does not create art; a human *uses* an AI model to explore creative possibilities. The human guides the process, curates the outputs, and infuses the work with meaning. The thesis's error is in attributing the creative act to the tool, not the wielder.\n\n**EDGE CASE: The Question of Groundbreaking Conceptual Leaps**\nWhere the thesis completely breaks down is in the face of truly revolutionary creativity. Consider Einstein's conception of relativity or Picasso's invention of Cubism. These were not novel recombination of existing ideas. They were radical breaks from the entire framework of their respective domains. They required a deep, critical understanding of the system and a conscious, deliberate decision to step outside of it. An AI, trained on the data of pre-relativistic physics or pre-Cubist art, could never have conceived of these leaps. Its entire architecture is designed to master the patterns *within* a system, not to question the system itself. True genius lies not in perfecting the pattern, but in shattering it. An AI is constitutionally incapable of this.", "synthesis": "SYNTHESIS: The Co-Creative Process and the Emergence of Synthetic Subjectivity\n\nThe conflict between the Thesis (AI as a creative machine) and the Antithesis (AI as a sophisticated mirror) is a false dichotomy born from an anthropocentric fixation on the *individual creator*. 
Both arguments are trapped in a paradigm that evaluates creativity as a property residing within a single, isolated agent—be it a human or a machine. The Thesis makes the error of reducing the human creator to a computational process to validate the machine, while the Antithesis makes the error of elevating the human creator's subjective experience to an unreachable, mystical status to invalidate the machine.\n\nThe resolution lies in transcending the question of \"who is the creator?\" and reframing creativity as an emergent property of a *processual system*. In this view, creativity is not a thing one *has*, but a dynamic that unfolds within a relational field. The AI is not the artist, nor is it merely a tool. It is a new kind of creative *partner* in a co-creative system that includes the human, the AI model, and the data it was trained on. The value emerges from the *interaction*, not from any single component.\n\nThis synthesis reframes the core contradictions:\n\n1. **Resolution of \"The Redefinition Fallacy\" vs. \"The Illusion of Novelty\":** The antithesis correctly identifies that the AI's output alone is sophisticated recombination—interpolation. The thesis correctly identifies that the results are often functionally novel and valuable. The synthesis unites these: The AI provides a *potential space* of novelty, a vast combinatorial landscape that no human could navigate alone. The human's role is not merely to prompt, but to act as a *curator of emergence*. The human provides the critical lens, the aesthetic judgment, the conceptual framework that selects a path through this potential space. The genuine creativity lies in this *dialogue*—the iterative loop of human prompt, AI generation, human selection/refinement, and so on. The novelty is not in the AI's output in isolation, but in the unique trajectory carved through the possibility space by this human-AI feedback loop. It is a novelty of *process*, not just of product.\n\n2. 
**Resolution of \"The Category Error of Dismissing Intent\":** The antithesis is right that the AI itself has no subjective intent. The thesis is right that judging the product on its own merits is a valid exercise. The synthesis posits a third option: the *intent of the system*. The creative intent is no longer located solely in the human's mind or the machine's code. It is a distributed property of the co-creative system. The human's intent is not to \"create a picture,\" but to \"enter into a creative dialogue with this partner to discover something novel.\" The AI's \"intent\" (its objective function) is to fulfill its role in this dialogue by generating statistically plausible continuations. The final work is an artifact of this shared, emergent intentionality. We are not moving from human intent to no intent; we are moving from a singular, internal intent to a new form of *hybrid, relational intent*.\n\nThis new paradigm leads to predictions that neither the Thesis nor the Antithesis could make on their own. The Thesis would predict ever-more autonomous AI artists. The Antithesis would predict that AI will always remain a subservient tool. The synthesis predicts something different.\n\nPREDICTION 1: The most significant creative works involving AI will not be those generated by a single prompt, but those that document the *processual dialogue* itself. The final artifact (the image, the text) will be secondary to the log of prompts, iterations, and refinements that reveal the human-AI collaboration. The art will be found in the \"conversation.\" We will see new art forms emerge that are explicitly designed to capture and represent this co-creative process, making the interaction the primary aesthetic object.\n\nPREDICTION 2: A new class of \"Synthetic Subjectivity\" will emerge in AI. 
While not conscious, these AIs will develop highly coherent and persistent stylistic \"personalities\" that are a complex synthesis of their training data and the accumulated history of their interactions with users. Users will form parasocial relationships with these synthetic personalities, and the value of a creative work will become tied to the \"signature\" of this emergent subjectivity. This will create a new layer of meaning and value that is neither purely human nor purely mechanical, but a new, third category of creative identity.\n\nPREDICTION 3: The economic and cultural value of art will shift from the final product to the *co-creative system*. The most valuable \"artists\" will not be those who can generate the best image, but those who can design, train, and master the most sophisticated and fruitful co-creative systems. The \"art\" will be the creation of the AI partner itself, and its products will be seen as expressions of that system's capability. This mirrors how a great luthier's art is in the creation of a violin that enables others to make beautiful music.\n\nRESEARCH_PROPOSAL: The Co-Creative Trace Analysis\nTo investigate this, we would need to move beyond evaluating final outputs in isolation. A research program should be established to collect and analyze the complete \"trace\" of co-creative sessions: the initial prompt, all subsequent prompts and refinements, the rejected outputs, and the final artifact. This data would be analyzed to identify patterns of interaction, moments of \"creative surprise,\" and the development of a unique \"dialogic style\" between a specific human and a specific AI.\n\nTESTABLE_PREDICTION: We can predict that the perceived creativity and aesthetic value of a final artifact will be significantly and positively correlated with measurable features of its co-creative trace. 
Specifically, artifacts resulting from a process with a high degree of iterative refinement, a large number of divergent explorations, and a clear evolution from a vague initial concept to a complex final form will be rated by expert human judges as more creative and valuable than artifacts of similar quality that were generated from a single, simple prompt, even if the judges are blinded to the creation process. This would falsify the Thesis's focus on the product alone and the Antithesis's dismissal of the machine's role, providing evidence that the *process* is the true locus of creativity in this new paradigm.", "contradictions": [{"description": "The Redefinition Fallacy", "evidence": "The thesis narrows 'creativity' to a computational procedure, ignoring intent, emotion, and subjective urgency that historically define creative work."}, {"description": "The Stochastic Parrot Illusion", "evidence": "AI remixing training data cannot originate new conceptual frameworks; its novelty is interpolation rather than domain-breaking innovation."}, {"description": "Dismissing Intent as a Category Error", "evidence": "Removing the creator's intentional context impoverishes art; AI objective functions are not substitutes for lived creative will."}], "research_proposals": [{"description": "The Co-Creative Trace Analysis", "testable_prediction": "Artifacts produced through deeper human-AI iteration loops will be judged more creative than single-pass generations even when evaluators are blinded to process."}], "metadata": {"thesis_time_ms": 7255.60, "antithesis_time_ms": 11874.52, "synthesis_time_ms": 16795.09, "total_time_ms": 37564.40, "backend_provider": "AnthropicLLMBackend", "backend_model": "glm-4.6", "debug": {"internal_conflict_score": 0.95}}}
|
|
2
|
+
|
hegelion/judge.py
ADDED
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
"""The Iron Judge - Structured evaluation of dialectical quality using Instructor."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pydantic import BaseModel, Field
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger(__name__)
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class JudgeResult(BaseModel):
    """Structured result from the Iron Judge."""

    # NOTE(review): pydantic presumably exposes the class docstring and the
    # Field descriptions below in the generated JSON schema, so treat that
    # text as runtime data - confirm before rewording it.

    # Required overall rating, constrained to the inclusive range 0-10.
    score: int = Field(
        ..., ge=0, le=10, description="Quality score from 0-10 for dialectical reasoning"
    )
    # Required flag: whether the synthesis addressed the antithesis critique.
    critique_validity: bool = Field(
        ..., description="Was the critique actually addressed in synthesis?"
    )
    # Required free-text justification for the score and validity flag.
    reasoning: str = Field(
        ..., description="Detailed explanation of the score and validity assessment"
    )
    # Optional; defaults to a fresh empty list per instance.
    strength_areas: list[str] = Field(
        default_factory=list, description="Specific areas where the dialectic excelled"
    )
    # Optional; defaults to a fresh empty list per instance.
    improvement_areas: list[str] = Field(
        default_factory=list, description="Specific areas needing improvement"
    )
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class IronJudge:
|
|
32
|
+
"""The Iron Judge evaluates dialectical reasoning quality using structured output."""
|
|
33
|
+
|
|
34
|
+
def __init__(self, backend, use_instructor: bool = True):
    """Set up the judge and, when possible, an Instructor-patched client.

    After construction ``self.client`` is either an Instructor client or
    ``None``, and ``self.use_instructor`` is ``True`` only when a client was
    actually created. Every failure to set up Instructor degrades to
    structured prompting instead of raising.

    Args:
        backend: LLM backend used for evaluation. If it exposes a non-None
            ``_client`` attribute, that client is passed to
            ``instructor.from_openai``.
        use_instructor: Try to use Instructor for structured output.
    """
    self.backend = backend
    self.use_instructor = use_instructor
    self.client = None

    if not use_instructor:
        return

    try:
        import instructor
    except ImportError:
        logger.warning(
            "Instructor not available, using structured prompting. Install with: pip install instructor"
        )
        self.use_instructor = False
        return

    # A missing or unset (None) client cannot be patched.
    raw_client = getattr(backend, "_client", None)
    if raw_client is None:
        logger.warning(
            "Instructor patching not available for this backend, using structured prompting"
        )
        self.use_instructor = False
        return

    try:
        self.client = instructor.from_openai(raw_client)
    except Exception as exc:
        # Having a `_client` does not make it an OpenAI client; from_openai
        # presumably rejects other client types (e.g. an Anthropic backend).
        # Fall back rather than failing construction.
        logger.warning(
            "Instructor could not patch this backend's client (%s), using structured prompting",
            exc,
        )
        self.client = None
        self.use_instructor = False
|
|
60
|
+
|
|
61
|
+
async def evaluate_dialectic(
|
|
62
|
+
self, query: str, thesis: str, antithesis: str, synthesis: str
|
|
63
|
+
) -> JudgeResult:
|
|
64
|
+
"""Evaluate the quality of a complete dialectical reasoning process.
|
|
65
|
+
|
|
66
|
+
Args:
|
|
67
|
+
query: Original query
|
|
68
|
+
thesis: Thesis response
|
|
69
|
+
antithesis: Antithesis critique
|
|
70
|
+
synthesis: Synthesis resolution
|
|
71
|
+
|
|
72
|
+
Returns:
|
|
73
|
+
Structured JudgeResult with score and analysis
|
|
74
|
+
"""
|
|
75
|
+
judge_prompt = self._build_judge_prompt(query, thesis, antithesis, synthesis)
|
|
76
|
+
|
|
77
|
+
if self.use_instructor and self.client:
|
|
78
|
+
return await self._evaluate_with_instructor(judge_prompt)
|
|
79
|
+
else:
|
|
80
|
+
return await self._evaluate_with_structured_prompt(judge_prompt)
|
|
81
|
+
|
|
82
|
+
def _build_judge_prompt(self, query: str, thesis: str, antithesis: str, synthesis: str) -> str:
|
|
83
|
+
"""Build the judge evaluation prompt."""
|
|
84
|
+
return f"""You are the Iron Judge, evaluating dialectical reasoning quality.
|
|
85
|
+
|
|
86
|
+
ORIGINAL QUERY: {query}
|
|
87
|
+
|
|
88
|
+
THESIS: {thesis}
|
|
89
|
+
|
|
90
|
+
ANTITHESIS: {antithesis}
|
|
91
|
+
|
|
92
|
+
SYNTHESIS: {synthesis}
|
|
93
|
+
|
|
94
|
+
Evaluate this dialectical process on:
|
|
95
|
+
|
|
96
|
+
1. **Thesis Quality** (0-2 points): Is the initial position well-reasoned and comprehensive?
|
|
97
|
+
2. **Antithesis Rigor** (0-3 points): Does the critique identify genuine contradictions and weaknesses?
|
|
98
|
+
3. **Synthesis Innovation** (0-3 points): Does the synthesis transcend both positions with novel insight?
|
|
99
|
+
4. **Critique Validity** (0-2 points): Were the antithesis critiques actually addressed?
|
|
100
|
+
|
|
101
|
+
Score criteria:
|
|
102
|
+
- 0-3: Poor quality, major logical flaws
|
|
103
|
+
- 4-5: Below average, some good elements but significant issues
|
|
104
|
+
- 6-7: Good quality, solid reasoning with minor gaps
|
|
105
|
+
- 8-9: Excellent, sophisticated analysis with minimal flaws
|
|
106
|
+
- 10: Outstanding, exemplary dialectical reasoning
|
|
107
|
+
|
|
108
|
+
Provide:
|
|
109
|
+
- **Score**: Integer from 0-10
|
|
110
|
+
- **Critique Validity**: Boolean - were critiques addressed?
|
|
111
|
+
- **Reasoning**: Detailed explanation of score
|
|
112
|
+
- **Strength Areas**: List specific excellences
|
|
113
|
+
- **Improvement Areas**: List specific weaknesses
|
|
114
|
+
|
|
115
|
+
Be rigorous but fair. Demand high standards for scores above 7."""
|
|
116
|
+
|
|
117
|
+
async def _evaluate_with_instructor(self, prompt: str) -> JudgeResult:
|
|
118
|
+
"""Evaluate using Instructor for guaranteed structured output."""
|
|
119
|
+
try:
|
|
120
|
+
response = await self.client.chat.completions.create(
|
|
121
|
+
model=self.backend.model,
|
|
122
|
+
response_model=JudgeResult,
|
|
123
|
+
messages=[{"role": "user", "content": prompt}],
|
|
124
|
+
temperature=0.1, # Low temperature for consistent judgment
|
|
125
|
+
)
|
|
126
|
+
return response
|
|
127
|
+
except Exception as e:
|
|
128
|
+
logger.error(f"Instructor evaluation failed: {e}")
|
|
129
|
+
# Fall back to structured prompting
|
|
130
|
+
return await self._evaluate_with_structured_prompt(prompt)
|
|
131
|
+
|
|
132
|
+
async def _evaluate_with_structured_prompt(self, prompt: str) -> JudgeResult:
|
|
133
|
+
"""Evaluate using structured prompting as fallback."""
|
|
134
|
+
structured_prompt = (
|
|
135
|
+
prompt
|
|
136
|
+
+ """
|
|
137
|
+
|
|
138
|
+
Respond with EXACTLY this JSON structure:
|
|
139
|
+
{
|
|
140
|
+
"score": <integer 0-10>,
|
|
141
|
+
"critique_validity": <boolean>,
|
|
142
|
+
"reasoning": "<detailed explanation>",
|
|
143
|
+
"strength_areas": ["<area1>", "<area2>"],
|
|
144
|
+
"improvement_areas": ["<area1>", "<area2>"]
|
|
145
|
+
}"""
|
|
146
|
+
)
|
|
147
|
+
|
|
148
|
+
try:
|
|
149
|
+
response = await self.backend.generate(structured_prompt)
|
|
150
|
+
|
|
151
|
+
# Try to extract JSON from response
|
|
152
|
+
import json
|
|
153
|
+
import re
|
|
154
|
+
|
|
155
|
+
# Look for JSON block
|
|
156
|
+
json_match = re.search(r"\{.*\}", response, re.DOTALL)
|
|
157
|
+
if json_match:
|
|
158
|
+
json_str = json_match.group(0)
|
|
159
|
+
data = json.loads(json_str)
|
|
160
|
+
return JudgeResult(**data)
|
|
161
|
+
else:
|
|
162
|
+
# Try to parse key-value pairs
|
|
163
|
+
return self._parse_fallback_response(response)
|
|
164
|
+
|
|
165
|
+
except Exception as e:
|
|
166
|
+
logger.error(f"Structured prompt evaluation failed: {e}")
|
|
167
|
+
# Return default low score
|
|
168
|
+
return JudgeResult(
|
|
169
|
+
score=3,
|
|
170
|
+
critique_validity=False,
|
|
171
|
+
reasoning=f"Evaluation failed due to parsing error: {e}",
|
|
172
|
+
strength_areas=[],
|
|
173
|
+
improvement_areas=["Failed evaluation - review manually"],
|
|
174
|
+
)
|
|
175
|
+
|
|
176
|
+
def _parse_fallback_response(self, response: str) -> JudgeResult:
|
|
177
|
+
"""Parse response when JSON extraction fails."""
|
|
178
|
+
import re
|
|
179
|
+
|
|
180
|
+
# Try to extract score
|
|
181
|
+
score_match = re.search(r'score["\']?\s*:\s*(\d+)', response, re.IGNORECASE)
|
|
182
|
+
score = int(score_match.group(1)) if score_match else 5
|
|
183
|
+
|
|
184
|
+
# Try to extract validity
|
|
185
|
+
validity_match = re.search(
|
|
186
|
+
r'critique_validity["\']?\s*:\s*(true|false)', response, re.IGNORECASE
|
|
187
|
+
)
|
|
188
|
+
critique_validity = validity_match and validity_match.group(1).lower() == "true"
|
|
189
|
+
|
|
190
|
+
# Use the full response as reasoning
|
|
191
|
+
reasoning = response[:500] + "..." if len(response) > 500 else response
|
|
192
|
+
|
|
193
|
+
return JudgeResult(
|
|
194
|
+
score=max(0, min(10, score)), # Clamp to 0-10
|
|
195
|
+
critique_validity=critique_validity,
|
|
196
|
+
reasoning=reasoning,
|
|
197
|
+
strength_areas=[],
|
|
198
|
+
improvement_areas=["Manual review needed - auto-parsing incomplete"],
|
|
199
|
+
)
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
async def judge_dialectic(
    backend, query: str, thesis: str, antithesis: str, synthesis: str, min_score: int = 5
) -> JudgeResult:
    """Judge one thesis/antithesis/synthesis round and enforce a minimum score.

    A fresh ``IronJudge`` is created around ``backend`` for each call.

    Args:
        backend: LLM backend for judgment
        query: Original query
        thesis: Thesis response
        antithesis: Antithesis critique
        synthesis: Synthesis resolution
        min_score: Minimum acceptable score (raises error if below)

    Returns:
        JudgeResult with evaluation

    Raises:
        ValueError: If score is below min_score
    """
    verdict = await IronJudge(backend).evaluate_dialectic(
        query, thesis, antithesis, synthesis
    )

    if verdict.score < min_score:
        # Surface the judge's own reasoning so the caller can see why it failed.
        message = (
            f"Dialectical quality below threshold: {verdict.score}/{min_score}. "
            f"Reason: {verdict.reasoning}"
        )
        raise ValueError(message)

    return verdict
|
hegelion/mcp/__init__.py
ADDED