vibe-aigc 0.6.0__tar.gz → 0.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {vibe_aigc-0.6.0/vibe_aigc.egg-info → vibe_aigc-0.6.1}/PKG-INFO +1 -1
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/pyproject.toml +1 -1
- vibe_aigc-0.6.1/vibe_aigc/fidelity.py +401 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/vibe_backend.py +75 -28
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/vlm_feedback.py +23 -12
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1/vibe_aigc.egg-info}/PKG-INFO +1 -1
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc.egg-info/SOURCES.txt +1 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/LICENSE +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/README.md +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/setup.cfg +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_adaptive_replanning.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_agents.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_assets.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_auto_checkpoint.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_automatic_checkpoints.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_checkpoint_serialization.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_error_handling.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_executor.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_feedback_system.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_integration.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_knowledge_base.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_metaplanner_resume.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_metaplanner_visualization.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_models.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_parallel_execution.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_planner.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_progress_callbacks.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_tools.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_visualization.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/tests/test_workflow_resume.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/__init__.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/agents.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/assets.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/audio.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/character.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/cli.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/comfyui.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/composer_general.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/discovery.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/executor.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/knowledge.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/llm.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/model_registry.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/models.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/mv_pipeline.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/persistence.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/planner.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/tools.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/tools_multimodal.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/video.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/visualization.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/workflow_backend.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/workflow_composer.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/workflow_executor.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/workflow_registry.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/workflow_strategies.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc/workflows.py +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc.egg-info/dependency_links.txt +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc.egg-info/entry_points.txt +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc.egg-info/requires.txt +0 -0
- {vibe_aigc-0.6.0 → vibe_aigc-0.6.1}/vibe_aigc.egg-info/top_level.txt +0 -0
--- vibe_aigc-0.6.0/pyproject.toml
+++ vibe_aigc-0.6.1/pyproject.toml
@@ -8,7 +8,7 @@ exclude = ["tests*", "docs*", "examples*", "landing*"]
 
 [project]
 name = "vibe-aigc"
-version = "0.6.0"
+version = "0.6.1"
 description = "A New Paradigm for Content Generation via Agentic Orchestration"
 authors = [{name = "Vibe AIGC Contributors"}]
 license = "MIT"
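
The only change to pyproject.toml is the version bump. A quick sanity check that an environment actually picked up the new release could look like the following (illustrative only, not part of the package):

    from importlib.metadata import version

    # Confirm the installed distribution matches the bumped version above.
    assert version("vibe-aigc") == "0.6.1"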
--- /dev/null
+++ vibe_aigc-0.6.1/vibe_aigc/fidelity.py
@@ -0,0 +1,401 @@
+"""Fidelity Measurement — Creative Unit Tests for vibe-aigc.
+
+Paper Section 6: "The Verification Crisis... no universal unit test for a 'cinematic atmosphere'"
+Paper Section 7: "We need 'Creative Unit Tests'"
+
+This module measures how well vibe-aigc achieves user intent:
+1. Intent Alignment: Does output match the vibe?
+2. Consistency: Same prompt → similar results?
+3. Quality Distribution: What's the score spread?
+4. Refinement Efficacy: Does feedback improve scores?
+"""
+
+import asyncio
+import statistics
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+from datetime import datetime
+from pathlib import Path
+import json
+
+from .vibe_backend import VibeBackend, GenerationRequest, GenerationResult
+from .discovery import Capability
+from .vlm_feedback import VLMFeedback, FeedbackResult
+
+
+@dataclass
+class FidelityScore:
+    """Score for a single generation."""
+    prompt: str
+    output_url: str
+    quality_score: float
+    feedback: str
+    strengths: List[str]
+    weaknesses: List[str]
+    attempt_number: int
+    timestamp: str
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "prompt": self.prompt,
+            "output_url": self.output_url,
+            "quality_score": self.quality_score,
+            "feedback": self.feedback,
+            "strengths": self.strengths,
+            "weaknesses": self.weaknesses,
+            "attempt_number": self.attempt_number,
+            "timestamp": self.timestamp
+        }
+
+
+@dataclass
+class FidelityReport:
+    """Complete fidelity report for a prompt."""
+    prompt: str
+    capability: str
+    num_runs: int
+    scores: List[FidelityScore]
+
+    # Statistics
+    mean_score: float = 0.0
+    std_dev: float = 0.0
+    min_score: float = 0.0
+    max_score: float = 0.0
+
+    # Refinement analysis
+    first_attempt_mean: float = 0.0
+    refined_attempts_mean: float = 0.0
+    refinement_improvement: float = 0.0
+
+    # Common patterns
+    common_strengths: List[str] = field(default_factory=list)
+    common_weaknesses: List[str] = field(default_factory=list)
+
+    def compute_statistics(self) -> None:
+        """Compute statistics from scores."""
+        if not self.scores:
+            return
+
+        quality_scores = [s.quality_score for s in self.scores]
+
+        self.mean_score = statistics.mean(quality_scores)
+        self.std_dev = statistics.stdev(quality_scores) if len(quality_scores) > 1 else 0.0
+        self.min_score = min(quality_scores)
+        self.max_score = max(quality_scores)
+
+        # Refinement analysis
+        first_attempts = [s.quality_score for s in self.scores if s.attempt_number == 1]
+        refined_attempts = [s.quality_score for s in self.scores if s.attempt_number > 1]
+
+        if first_attempts:
+            self.first_attempt_mean = statistics.mean(first_attempts)
+        if refined_attempts:
+            self.refined_attempts_mean = statistics.mean(refined_attempts)
+            self.refinement_improvement = self.refined_attempts_mean - self.first_attempt_mean
+
+        # Common patterns
+        all_strengths = []
+        all_weaknesses = []
+        for s in self.scores:
+            all_strengths.extend(s.strengths)
+            all_weaknesses.extend(s.weaknesses)
+
+        # Count frequency
+        from collections import Counter
+        strength_counts = Counter(all_strengths)
+        weakness_counts = Counter(all_weaknesses)
+
+        self.common_strengths = [s for s, _ in strength_counts.most_common(5)]
+        self.common_weaknesses = [w for w, _ in weakness_counts.most_common(5)]
+
+    def summary(self) -> str:
+        """Human-readable summary."""
+        lines = [
+            "=" * 60,
+            "FIDELITY REPORT",
+            "=" * 60,
+            "",
+            f"Prompt: {self.prompt[:50]}...",
+            f"Capability: {self.capability}",
+            f"Runs: {self.num_runs}",
+            "",
+            "QUALITY SCORES:",
+            f"  Mean: {self.mean_score:.2f}/10",
+            f"  Std Dev: {self.std_dev:.2f}",
+            f"  Range: {self.min_score:.1f} - {self.max_score:.1f}",
+            "",
+            "REFINEMENT EFFICACY:",
+            f"  First attempt mean: {self.first_attempt_mean:.2f}",
+            f"  Refined attempts mean: {self.refined_attempts_mean:.2f}",
+            f"  Improvement: {self.refinement_improvement:+.2f}",
+            "",
+            "COMMON STRENGTHS:",
+        ]
+        for s in self.common_strengths[:3]:
+            lines.append(f"  + {s}")
+
+        lines.append("")
+        lines.append("COMMON WEAKNESSES:")
+        for w in self.common_weaknesses[:3]:
+            lines.append(f"  - {w}")
+
+        lines.append("")
+        lines.append("=" * 60)
+
+        # Verdict
+        if self.mean_score >= 7.0:
+            lines.append("VERDICT: HIGH FIDELITY - System achieves intent well")
+        elif self.mean_score >= 5.0:
+            lines.append("VERDICT: MODERATE FIDELITY - Room for improvement")
+        else:
+            lines.append("VERDICT: LOW FIDELITY - Significant gap from intent")
+
+        if self.refinement_improvement > 0.5:
+            lines.append(f"REFINEMENT: EFFECTIVE (+{self.refinement_improvement:.1f} improvement)")
+        elif self.refinement_improvement < -0.5:
+            lines.append(f"REFINEMENT: COUNTERPRODUCTIVE ({self.refinement_improvement:.1f})")
+        else:
+            lines.append("REFINEMENT: MARGINAL EFFECT")
+
+        lines.append("=" * 60)
+
+        return "\n".join(lines)
+
+    def to_dict(self) -> Dict[str, Any]:
+        return {
+            "prompt": self.prompt,
+            "capability": self.capability,
+            "num_runs": self.num_runs,
+            "scores": [s.to_dict() for s in self.scores],
+            "statistics": {
+                "mean": self.mean_score,
+                "std_dev": self.std_dev,
+                "min": self.min_score,
+                "max": self.max_score,
+            },
+            "refinement": {
+                "first_attempt_mean": self.first_attempt_mean,
+                "refined_mean": self.refined_attempts_mean,
+                "improvement": self.refinement_improvement,
+            },
+            "patterns": {
+                "common_strengths": self.common_strengths,
+                "common_weaknesses": self.common_weaknesses,
+            }
+        }
+
+
+class FidelityBenchmark:
+    """Benchmark for measuring vibe-aigc fidelity.
+
+    Usage:
+        benchmark = FidelityBenchmark(comfyui_url="http://192.168.1.143:8188")
+        await benchmark.initialize()
+
+        report = await benchmark.run(
+            prompt="cyberpunk samurai in neon rain",
+            capability=Capability.TEXT_TO_IMAGE,
+            num_runs=5
+        )
+
+        print(report.summary())
+    """
+
+    def __init__(
+        self,
+        comfyui_url: str = "http://127.0.0.1:8188",
+        max_attempts_per_run: int = 2,
+        quality_threshold: float = 7.0
+    ):
+        self.backend = VibeBackend(
+            comfyui_url=comfyui_url,
+            enable_vlm=True,
+            max_attempts=max_attempts_per_run,
+            quality_threshold=quality_threshold
+        )
+        self._initialized = False
+
+    async def initialize(self) -> None:
+        """Initialize the benchmark."""
+        await self.backend.initialize()
+        self._initialized = True
+
+    async def run(
+        self,
+        prompt: str,
+        capability: Capability = Capability.TEXT_TO_IMAGE,
+        num_runs: int = 5,
+        **kwargs
+    ) -> FidelityReport:
+        """Run the fidelity benchmark.
+
+        Args:
+            prompt: The prompt to test
+            capability: What to generate
+            num_runs: How many times to run
+            **kwargs: Additional generation parameters
+
+        Returns:
+            FidelityReport with scores and statistics
+        """
+        if not self._initialized:
+            await self.initialize()
+
+        print(f"Running fidelity benchmark: {num_runs} runs")
+        print(f"Prompt: {prompt[:50]}...")
+        print()
+
+        scores = []
+
+        for i in range(num_runs):
+            print(f"Run {i+1}/{num_runs}...")
+
+            request = GenerationRequest(
+                prompt=prompt,
+                capability=capability,
+                **kwargs
+            )
+
+            result = await self.backend.generate(request)
+
+            if result.success:
+                score = FidelityScore(
+                    prompt=prompt,
+                    output_url=result.output_url or "",
+                    quality_score=result.quality_score or 5.0,
+                    feedback=result.feedback or "",
+                    strengths=result.strengths or [],
+                    weaknesses=result.weaknesses or [],
+                    attempt_number=result.attempts,
+                    timestamp=datetime.now().isoformat()
+                )
+                scores.append(score)
+                print(f"  Score: {score.quality_score}/10 (attempt {score.attempt_number})")
+                if score.strengths:
+                    print(f"  Strengths: {', '.join(score.strengths[:2])}")
+                if score.weaknesses:
+                    print(f"  Weaknesses: {', '.join(score.weaknesses[:2])}")
+            else:
+                print(f"  Failed: {result.error}")
+
+        # Build report
+        report = FidelityReport(
+            prompt=prompt,
+            capability=capability.value,
+            num_runs=num_runs,
+            scores=scores
+        )
+        report.compute_statistics()
+
+        return report
+
+    async def compare_prompts(
+        self,
+        prompts: List[str],
+        capability: Capability = Capability.TEXT_TO_IMAGE,
+        runs_per_prompt: int = 3
+    ) -> List[FidelityReport]:
+        """Compare fidelity across multiple prompts."""
+        reports = []
+
+        for prompt in prompts:
+            report = await self.run(prompt, capability, runs_per_prompt)
+            reports.append(report)
+
+        return reports
+
+    async def test_refinement_efficacy(
+        self,
+        prompt: str,
+        capability: Capability = Capability.TEXT_TO_IMAGE,
+        num_runs: int = 5
+    ) -> Dict[str, Any]:
+        """Specifically test if VLM refinement improves quality.
+
+        Runs with max_attempts=1 (no refinement) vs max_attempts=3 (with refinement)
+        """
+        print("Testing refinement efficacy...")
+        print()
+
+        # Without refinement
+        print("Phase 1: Without refinement (max_attempts=1)")
+        self.backend.max_attempts = 1
+        no_refine_scores = []
+
+        for i in range(num_runs):
+            result = await self.backend.generate(GenerationRequest(
+                prompt=prompt,
+                capability=capability
+            ))
+            if result.success:
+                no_refine_scores.append(result.quality_score or 5.0)
+                print(f"  Run {i+1}: {result.quality_score}/10")
+
+        # With refinement
+        print()
+        print("Phase 2: With refinement (max_attempts=3)")
+        self.backend.max_attempts = 3
+        with_refine_scores = []
+
+        for i in range(num_runs):
+            result = await self.backend.generate(GenerationRequest(
+                prompt=prompt,
+                capability=capability
+            ))
+            if result.success:
+                with_refine_scores.append(result.quality_score or 5.0)
+                print(f"  Run {i+1}: {result.quality_score}/10 (attempts: {result.attempts})")
+
+        # Analysis
+        no_refine_mean = statistics.mean(no_refine_scores) if no_refine_scores else 0
+        with_refine_mean = statistics.mean(with_refine_scores) if with_refine_scores else 0
+        improvement = with_refine_mean - no_refine_mean
+
+        return {
+            "prompt": prompt,
+            "without_refinement": {
+                "scores": no_refine_scores,
+                "mean": no_refine_mean,
+            },
+            "with_refinement": {
+                "scores": with_refine_scores,
+                "mean": with_refine_mean,
+            },
+            "improvement": improvement,
+            "refinement_effective": improvement > 0.5
+        }
+
+
+# =============================================================================
+# CONVENIENCE FUNCTIONS
+# =============================================================================
+
+async def measure_fidelity(
+    prompt: str,
+    comfyui_url: str = "http://127.0.0.1:8188",
+    num_runs: int = 5
+) -> FidelityReport:
+    """Quick fidelity measurement."""
+    benchmark = FidelityBenchmark(comfyui_url=comfyui_url)
+    await benchmark.initialize()
+    return await benchmark.run(prompt, num_runs=num_runs)
+
+
+async def run_creative_unit_test(
+    prompt: str,
+    expected_min_score: float = 6.0,
+    comfyui_url: str = "http://127.0.0.1:8188",
+    num_runs: int = 3
+) -> bool:
+    """Run a creative unit test — does the system achieve minimum quality?
+
+    Returns True if mean score >= expected_min_score
+    """
+    report = await measure_fidelity(prompt, comfyui_url, num_runs)
+    passed = report.mean_score >= expected_min_score
+
+    print(f"Creative Unit Test: {'PASSED' if passed else 'FAILED'}")
+    print(f"  Expected: >= {expected_min_score}")
+    print(f"  Actual: {report.mean_score:.2f}")
+
+    return passed
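
The new module exposes two convenience entry points, measure_fidelity and run_creative_unit_test. A minimal sketch of driving them follows; it assumes a reachable ComfyUI instance and a configured VLM, and the URL and prompt are placeholders rather than values taken from the release:

    import asyncio

    from vibe_aigc.fidelity import measure_fidelity, run_creative_unit_test


    async def main() -> None:
        # Full report with per-run scores, spread, and refinement statistics.
        report = await measure_fidelity(
            "cyberpunk samurai in neon rain",
            comfyui_url="http://127.0.0.1:8188",  # placeholder URL
            num_runs=3,
        )
        print(report.summary())

        # Pass/fail "creative unit test": mean score must reach the threshold.
        ok = await run_creative_unit_test(
            "cyberpunk samurai in neon rain",
            expected_min_score=6.0,
            num_runs=3,
        )
        print("passed" if ok else "failed")


    asyncio.run(main())

Since FidelityBenchmark simply wraps VibeBackend with enable_vlm=True, each run exercises the same feedback loop changed in vibe_backend.py below.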
--- vibe_aigc-0.6.0/vibe_aigc/vibe_backend.py
+++ vibe_aigc-0.6.1/vibe_aigc/vibe_backend.py
@@ -14,7 +14,7 @@ This works with ANY ComfyUI setup — no hardcoded models or patterns.
 import asyncio
 import aiohttp
 from typing import Any, Dict, List, Optional
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
 
 from .discovery import (
@@ -51,6 +51,9 @@ class GenerationResult:
     output_path: Optional[str] = None
     quality_score: float = 0.0
     feedback: Optional[str] = None
+    strengths: List[str] = field(default_factory=list)
+    weaknesses: List[str] = field(default_factory=list)
+    prompt_improvements: List[str] = field(default_factory=list)
     error: Optional[str] = None
     workflow_used: Optional[str] = None
     model_used: Optional[str] = None
@@ -188,16 +191,25 @@ class VibeBackend:
 
         # Compose from available nodes
         print(f"Composing workflow for {request.capability.value}...")
+
+        # Build kwargs based on capability
+        kwargs = {
+            "negative_prompt": request.negative_prompt,
+            "width": request.width,
+            "height": request.height,
+            "steps": request.steps,
+            "cfg": request.cfg,
+            "seed": request.seed
+        }
+
+        # Add frames only for video capabilities
+        if request.capability in [Capability.TEXT_TO_VIDEO, Capability.IMAGE_TO_VIDEO]:
+            kwargs["frames"] = request.frames
+
         return self.composer.compose_for_capability(
             capability=request.capability,
             prompt=request.prompt,
-
-            width=request.width,
-            height=request.height,
-            frames=request.frames,
-            steps=request.steps,
-            cfg=request.cfg,
-            seed=request.seed
+            **kwargs
         )
 
     async def _execute_with_feedback(
@@ -230,30 +242,65 @@ class VibeBackend:
                 return result
 
             # VLM feedback
-            if self.vlm and self.vlm.available and result.
-
-
-
-
-
-
-
-
-
-
-
+            if self.vlm and self.vlm.available and result.output_url:
+                # Download image for VLM analysis
+                feedback = None
+                temp_path = None
+                try:
+                    import tempfile
+                    import os
+                    async with aiohttp.ClientSession() as session:
+                        async with session.get(result.output_url) as resp:
+                            if resp.status == 200:
+                                content = await resp.read()
+                                # Save to temp file (won't auto-delete)
+                                suffix = '.png' if 'png' in result.output_url else '.webp'
+                                fd, temp_path = tempfile.mkstemp(suffix=suffix)
+                                os.write(fd, content)
+                                os.close(fd)
+
+                                feedback = self.vlm.analyze_media(
+                                    Path(temp_path),
+                                    current_prompt
+                                )
+                except Exception as e:
+                    print(f"VLM feedback failed: {e}")
+                    feedback = None
+                finally:
+                    # Clean up temp file (ignore errors on Windows)
+                    if temp_path:
+                        try:
+                            import os
+                            os.unlink(temp_path)
+                        except:
+                            pass  # Windows file locking, will be cleaned up by OS
 
-                if feedback
-
+                if feedback:
+                    result.quality_score = feedback.quality_score
+                    result.feedback = feedback.description
+                    result.strengths = feedback.strengths
+                    result.weaknesses = feedback.weaknesses
+                    result.prompt_improvements = feedback.prompt_improvements
+
+                    if feedback.quality_score > best_score:
+                        best_score = feedback.quality_score
+                        best_result = result
+
+                    if feedback.quality_score >= self.quality_threshold:
+                        print(f"Quality threshold met: {feedback.quality_score}/10")
+                        result.attempts = attempt + 1
+                        return result
+
+                    # Refine prompt for next attempt
+                    if attempt < self.max_attempts - 1:
+                        current_prompt = self.vlm.suggest_improvements(feedback, current_prompt)
+                        print(f"Refined prompt: {current_prompt[:50]}...")
+                else:
+                    # VLM failed, return successful result
                     result.attempts = attempt + 1
                     return result
-
-                # Refine prompt for next attempt
-                if attempt < self.max_attempts - 1:
-                    current_prompt = self.vlm.suggest_improvements(feedback, current_prompt)
-                    print(f"Refined prompt: {current_prompt[:50]}...")
             else:
-                # No VLM, return first successful result
+                # No VLM configured, return first successful result
                 result.attempts = attempt + 1
                 return result
 
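
With the three new GenerationResult fields and the reworked feedback loop, callers can inspect what the VLM said about each output. The sketch below is illustrative only; the URL and prompt are placeholders, and a running ComfyUI instance plus a configured VLM are assumed:

    import asyncio

    from vibe_aigc.vibe_backend import VibeBackend, GenerationRequest
    from vibe_aigc.discovery import Capability


    async def main() -> None:
        backend = VibeBackend(
            comfyui_url="http://127.0.0.1:8188",  # placeholder URL
            enable_vlm=True,
            max_attempts=3,
            quality_threshold=7.0,
        )
        await backend.initialize()

        result = await backend.generate(
            GenerationRequest(
                prompt="cyberpunk samurai in neon rain",
                capability=Capability.TEXT_TO_IMAGE,
            )
        )

        if result.success:
            print(f"score: {result.quality_score}/10 after {result.attempts} attempt(s)")
            # New in 0.6.1: these default to [] and are filled in when VLM feedback ran.
            print("strengths:", result.strengths)
            print("weaknesses:", result.weaknesses)
            print("suggested prompt additions:", result.prompt_improvements)
        else:
            print("generation failed:", result.error)


    asyncio.run(main())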
--- vibe_aigc-0.6.0/vibe_aigc/vlm_feedback.py
+++ vibe_aigc-0.6.1/vibe_aigc/vlm_feedback.py
@@ -118,25 +118,36 @@ class VLMFeedback:
 
         img = Image.open(image_path)
 
-        prompt = f"""You are an AI art director analyzing generated images.
+        prompt = f"""You are an expert AI art director analyzing AI-generated images for quality.
 
-
+Original prompt: {context}
+
+IMPORTANT: You MUST provide specific, actionable prompt improvements.
 
-Analyze this image and respond
+Analyze this image and respond ONLY with valid JSON (no markdown):
 {{
-    "quality_score": <1-10>,
-    "description": "<what you see>",
-    "strengths": ["<
-    "weaknesses": ["<
-    "prompt_improvements": [
+    "quality_score": <1-10 based on: composition, detail, prompt adherence, aesthetic quality>,
+    "description": "<brief description of what you see>",
+    "strengths": ["<specific strength 1>", "<specific strength 2>"],
+    "weaknesses": ["<specific weakness 1>", "<specific weakness 2>"],
+    "prompt_improvements": [
+        "<SPECIFIC phrase to ADD to prompt to fix weakness 1>",
+        "<SPECIFIC phrase to ADD to prompt to fix weakness 2>",
+        "<SPECIFIC quality modifier to add>"
+    ],
     "parameter_changes": {{
-        "cfg": <
-        "steps": <
-        "sampler": "<suggested sampler or null>"
+        "cfg": <suggest higher/lower cfg if needed, or null>,
+        "steps": <suggest more/fewer steps if needed, or null>
    }}
 }}
 
-
+REQUIRED: prompt_improvements must have at least 2 specific suggestions like:
+- "add sharp focus" if blurry
+- "add dramatic shadows" if flat lighting
+- "add intricate details" if lacking detail
+- "add correct anatomy" if distorted
+
+Score guide: 1-3 poor, 4-5 mediocre, 6-7 good, 8-9 excellent, 10 perfect."""
 
         try:
             response = self.vlm.generate_content([prompt, img])
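
The rewritten prompt instructs the VLM to answer with bare JSON. The parsing code itself is outside this diff; the sketch below only illustrates how such a response could be decoded defensively, including stripping a stray Markdown code fence, and is not the library's actual parser:

    import json
    from typing import Any, Dict


    def parse_vlm_response(raw: str) -> Dict[str, Any]:
        """Decode the art-director JSON, falling back to neutral defaults."""
        text = raw.strip()
        if text.startswith("```"):
            # Remove a leading/trailing code fence if the model added one anyway.
            text = text.strip("`").strip()
            if text.lower().startswith("json"):
                text = text[4:]
        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            # Unparseable reply: neutral defaults so the feedback loop can continue.
            return {"quality_score": 5.0, "description": raw, "strengths": [],
                    "weaknesses": [], "prompt_improvements": [], "parameter_changes": {}}
        for key in ("strengths", "weaknesses", "prompt_improvements"):
            data.setdefault(key, [])
        return data


    # Example with a well-formed response:
    sample = ('{"quality_score": 7, "description": "neon-lit samurai", '
              '"strengths": ["strong composition"], "weaknesses": ["soft focus"], '
              '"prompt_improvements": ["add sharp focus"], "parameter_changes": {}}')
    print(parse_vlm_response(sample)["prompt_improvements"])  # ['add sharp focus']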