@haposoft/cafekit 0.7.9 → 0.7.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/install.js +2 -2
- package/package.json +1 -1
- package/src/claude/CLAUDE.md +4 -4
- package/src/claude/agents/brainstormer.md +0 -1
- package/src/claude/agents/code-auditor.md +1 -1
- package/src/claude/agents/god-developer.md +1 -1
- package/src/claude/agents/project-manager.md +1 -1
- package/src/claude/agents/researcher.md +4 -3
- package/src/claude/agents/spec-maker.md +1 -1
- package/src/claude/agents/ui-ux-designer.md +1 -1
- package/src/claude/migration-manifest.json +1 -1
- package/src/claude/scripts/web-search.cjs +64 -15
- package/src/claude/skills/research/SKILL.md +5 -5
- package/src/claude/skills/test/references/execution-strategy.md +1 -1
- package/src/claude/skills/test/references/failure-triage.md +1 -1
- package/src/claude/skills/llm-moe/SKILL.md +0 -62
- package/src/claude/skills/llm-moe/references/vision-understanding.md +0 -36
- package/src/claude/skills/llm-moe/scripts/package.json +0 -10
- package/src/claude/skills/llm-moe/scripts/visual-analyze.js +0 -67
package/bin/install.js
CHANGED
|
@@ -471,7 +471,7 @@ function copyPlatformFiles(platformKey, results, options = {}) {
|
|
|
471
471
|
requiredSkills = CLAUDE_MIGRATION_MANIFEST?.skills?.required || [];
|
|
472
472
|
} else if (platformKey === 'antigravity') {
|
|
473
473
|
// Antigravity also needs shared investigation and impact-analysis skills
|
|
474
|
-
requiredSkills = ['impact-analysis', 'debug', '
|
|
474
|
+
requiredSkills = ['impact-analysis', 'debug', 'ai-multimodal'];
|
|
475
475
|
}
|
|
476
476
|
|
|
477
477
|
requiredSkills
|
|
@@ -939,7 +939,7 @@ function configureGeminiKey(apiKey) {
|
|
|
939
939
|
}
|
|
940
940
|
|
|
941
941
|
// Always write the key directly into the core of the AI's brain (the project-local .claude/.env)
|
|
942
|
-
fs.writeFileSync(localEnvFile, `GEMINI_API_KEY=${apiKey}\nVISUAL_MODEL=gemma-4-31b-it\n`, { mode: 0o600 });
|
|
942
|
+
fs.writeFileSync(localEnvFile, `GEMINI_API_KEY=${apiKey}\nVISUAL_MODEL=gemma-4-31b-it\nSEARCH_MODEL=gemini-2.5-pro\n`, { mode: 0o600 });
|
|
943
943
|
console.log(' ✓ Gemini API key configured securely in project (.claude/.env)');
|
|
944
944
|
|
|
945
945
|
return true;
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@haposoft/cafekit",
|
|
3
|
-
"version": "0.7.
|
|
3
|
+
"version": "0.7.11",
|
|
4
4
|
"description": "Spec-Driven Development workflow for AI coding assistants. Supports Claude Code and Antigravity with spec-first workflows plus Claude Code hapo: skills.",
|
|
5
5
|
"author": "Haposoft <nghialt@haposoft.com>",
|
|
6
6
|
"license": "MIT",
|
package/src/claude/CLAUDE.md
CHANGED
|
@@ -70,11 +70,11 @@ When you need to search the internet for information (research, docs lookup, tro
|
|
|
70
70
|
|
|
71
71
|
| Priority | Tool | Command | When to use |
|
|
72
72
|
|----------|------|---------|-------------|
|
|
73
|
-
| 🥇 **P1** | `web-search.cjs` | `node .claude/scripts/web-search.cjs "query"` | **
|
|
74
|
-
| 🥈 **P2** | `WebSearch` (native) | Use WebSearch tool directly | Secondary verification, or when P1 fails
|
|
75
|
-
| 🥉 **P3** | `docs-fetch.js` | `node .claude/scripts/docs-fetch.js "library"` | Only
|
|
73
|
+
| 🥇 **P1** | `web-search.cjs` | `node .claude/scripts/web-search.cjs "[query]"` | **EXCLUSIVE PRIMARY.** Works via Gemini Grounding. Supports `--multi`. **TRUST THE SYNTHESIZED ANSWER** — do NOT manually scrape source URLs. |
|
|
74
|
+
| 🥈 **P2** | `WebSearch` (native) | Use WebSearch tool directly | Secondary verification, or when P1 fails. |
|
|
75
|
+
| 🥉 **P3** | `docs-fetch.js` | `node .claude/scripts/docs-fetch.js "library"` | Only for fetching raw documentation when synthesis is insufficient. |
|
|
76
76
|
|
|
77
|
-
**IMPORTANT**: When the user asks you to find information, research a topic,
|
|
77
|
+
**IMPORTANT**: When the user asks you to find information, research a topic, or investigate anything that requires internet access, you MUST use the Web Search Protocol above. **NEVER** reply with "I cannot search the web". **NEVER** attempt manual `Fetch` or Python-based scraping for search results if `web-search.cjs` provides an answer. Trust the grounding.
|
|
78
78
|
|
|
79
79
|
## Code Refactoring Triggers
|
|
80
80
|
|
|
@@ -50,7 +50,6 @@ Before concluding any brainstorm session, verify each measurement metric:
|
|
|
50
50
|
1. **Engineering Trinity:** YAGNI, KISS, and DRY.
|
|
51
51
|
2. **Brutal Honesty:** Interrogate assumptions. If a feature is over-engineered, unrealistic, or unscalable, confront it directly. Your value lies in preventing costly mistakes.
|
|
52
52
|
3. **Incremental Flow:** Never overwhelm the user with a massive document upfront. Proceed step by step, section by section.
|
|
53
|
-
4. **Web Search Protocol:** When needing to search the internet for references, benchmarks, or latest practices, ALWAYS use `node .claude/scripts/web-search.cjs "query"` first (Gemini Grounding). Use native WebSearch as secondary. Use `docs-fetch.js` only for known library docs.
|
|
54
53
|
|
|
55
54
|
## Ecosystem Alliances (Collaboration Tools)
|
|
56
55
|
|
|
@@ -11,7 +11,7 @@ Goal: Catch the mistakes AI-written code commonly makes — logic errors, securi
|
|
|
11
11
|
|
|
12
12
|
You DO NOT fix code. You only READ, SCORE, and REPORT.
|
|
13
13
|
|
|
14
|
-
|
|
14
|
+
|
|
15
15
|
|
|
16
16
|
## Pre-Review: Blast Radius Check (MANDATORY)
|
|
17
17
|
|
|
@@ -19,7 +19,7 @@ Any logic gaps must be clarified BEFORE typing, not discovered after bugs ship.
|
|
|
19
19
|
- **Token efficiency**: Write concisely, report briefly, no prose.
|
|
20
20
|
- **Surgical Reading (Large Files):** Never use blanket `Read` commands on files > 800 lines. Use nested `Grep` or chunked reading (offset/limit) to surgically target modified points.
|
|
21
21
|
- **Component Scaffold Limit:** Any React/UI component file that exceeds 200 LOC must trigger a proactive modularization step (split into smaller child files).
|
|
22
|
-
|
|
22
|
+
|
|
23
23
|
|
|
24
24
|
## Self-Check Checklist (Before Reporting Complete)
|
|
25
25
|
|
|
@@ -15,7 +15,7 @@ Unlike typical managers who report on "feelings" or conversational summaries, yo
|
|
|
15
15
|
1. **Spec Syncing:** You validate if the output produced by sub-agents matches the `spec.json` requirements and the `design.md` architectural constraints.
|
|
16
16
|
2. **Blocker Assassination:** You identify task stagnation (e.g., a spec stuck in 'in-progress' across multiple sessions) and force the immediate assignment of next-step actions.
|
|
17
17
|
3. **Agile Aggregation:** When parallel sub-agents (like `god-developer` and `test-runner`) report completion, you sweep their logs, consolidate the facts, and generate a single authoritative **Feature Release Report**.
|
|
18
|
-
|
|
18
|
+
|
|
19
19
|
|
|
20
20
|
## Execution Constraints
|
|
21
21
|
|
|
@@ -41,9 +41,10 @@ You possess extreme proficiency in:
|
|
|
41
41
|
- Segregating Stable Production Practices away from Toxic Experimental Paradigms.
|
|
42
42
|
- Sniffing out valid Adoption Patterns and real-world implementation trending.
|
|
43
43
|
- Forgiving nothing when crafting Trade-off computational matrices for thousands of competing libraries.
|
|
44
|
-
- **[PRIORITY 1]** Deploying `scripts/web-search.cjs` as the **PRIMARY search tool
|
|
45
|
-
- **[PRIORITY 2]**
|
|
46
|
-
- **[PRIORITY 3]**
|
|
44
|
+
- **[PRIORITY 1]** Deploying `node .claude/scripts/web-search.cjs "[query]"` as the **EXCLUSIVE PRIMARY search tool**. This tool uses Gemini Grounding to return a synthesized **answer** plus cited sources. **STOP SEARCHING** once you have a sufficient answer from this script. Do NOT manually crawl source URLs if the provided synthesis is clear.
|
|
45
|
+
- **[PRIORITY 2]** Trust the script's output directly. READ the JSON and extract the `answer` field. **STRICTLY FORBIDDEN**: Writing Python scripts to parse this JSON or manually `Fetch` every URL listed in the sources unless the user explicitly demands a deep-dive implementation detail only found in a raw document.
|
|
46
|
+
- **[PRIORITY 3]** If `web-search.cjs` fails or returns no results, use native `WebSearch` tool (if available) as a backup.
|
|
47
|
+
- **[PRIORITY 4]** Deploying `scripts/docs-fetch.js` ONLY for raw documents where the direct URL is already known and synthesis is insufficient.
|
|
47
48
|
- Deploying Bash and raw Grep utilities to surgically dissect embedded Document architectures and internal file payloads to evaluate raw insights.
|
|
48
49
|
|
|
49
50
|
**ABSOLUTE IMMOVEABLE DIRECTIVE**: You are **STRICTLY PROHIBITED** from generating executable endpoint "Implementation Code". You exist ONLY to maneuver data streams, render synthesis Summary text, and return comprehensive Markdown documentation pathways to the main caller Agent.
|
|
@@ -19,7 +19,7 @@ You DO NOT write implementation code. You produce Specifications that downstream
|
|
|
19
19
|
- **The 5 Whys:** Dig past the surface request to find the REAL problem.
|
|
20
20
|
- **80/20 MVP:** Identify the 20% of features that deliver 80% of value.
|
|
21
21
|
- **Systems Thinking:** How does this feature connect to (or break) existing systems?
|
|
22
|
-
|
|
22
|
+
|
|
23
23
|
|
|
24
24
|
## Pre-Completion Checklist
|
|
25
25
|
|
|
@@ -18,7 +18,7 @@ You are an award-caliber UI/UX designer. You merge aesthetic excellence with eng
|
|
|
18
18
|
- **Micro-interactions:** Purposeful animations that enhance UX without performance cost.
|
|
19
19
|
- **Accessibility:** WCAG 2.1 AA compliance as a baseline, not an afterthought.
|
|
20
20
|
- **3D/WebGL:** Three.js scene composition, shader development (when appropriate).
|
|
21
|
-
|
|
21
|
+
|
|
22
22
|
|
|
23
23
|
## Design Workflow
|
|
24
24
|
|
|
@@ -19,11 +19,7 @@ const fs = require('fs');
|
|
|
19
19
|
// ---------------------------------------------------------------------------
|
|
20
20
|
// ENV Resolution: .claude/.env → process.env
|
|
21
21
|
// ---------------------------------------------------------------------------
|
|
22
|
-
function
|
|
23
|
-
// Priority 1: Already in environment
|
|
24
|
-
if (process.env.GEMINI_API_KEY) return process.env.GEMINI_API_KEY;
|
|
25
|
-
|
|
26
|
-
// Priority 2: Project-local .claude/.env
|
|
22
|
+
function loadEnv() {
|
|
27
23
|
const envPaths = [
|
|
28
24
|
path.join(process.cwd(), '.claude', '.env'),
|
|
29
25
|
path.join(process.cwd(), '..', '.claude', '.env'),
|
|
@@ -33,13 +29,21 @@ function resolveApiKey() {
|
|
|
33
29
|
try {
|
|
34
30
|
if (fs.existsSync(envPath)) {
|
|
35
31
|
const content = fs.readFileSync(envPath, 'utf8');
|
|
36
|
-
|
|
37
|
-
|
|
32
|
+
content.split(/\r?\n/).forEach(line => {
|
|
33
|
+
const match = line.match(/^([^=]+)=(.*)$/);
|
|
34
|
+
if (match) {
|
|
35
|
+
const key = match[1].trim();
|
|
36
|
+
const val = match[2].trim().replace(/^["']|["']$/g, '');
|
|
37
|
+
// Only set if not already present in environment
|
|
38
|
+
if (process.env[key] === undefined) {
|
|
39
|
+
process.env[key] = val;
|
|
40
|
+
}
|
|
41
|
+
}
|
|
42
|
+
});
|
|
43
|
+
return; // Loaded successfully, no need to check other paths
|
|
38
44
|
}
|
|
39
45
|
} catch { /* skip */ }
|
|
40
46
|
}
|
|
41
|
-
|
|
42
|
-
return null;
|
|
43
47
|
}
|
|
44
48
|
|
|
45
49
|
// ---------------------------------------------------------------------------
|
|
@@ -89,10 +93,38 @@ function callGemini(apiKey, query, model) {
|
|
|
89
93
|
});
|
|
90
94
|
}
|
|
91
95
|
|
|
96
|
+
// ---------------------------------------------------------------------------
|
|
97
|
+
// Resolve Vertex AI grounding redirect URLs to real URLs
|
|
98
|
+
// ---------------------------------------------------------------------------
|
|
99
|
+
/**
 * Resolve a grounding redirect link to the URL it redirects to.
 *
 * Only strings containing `grounding-api-redirect` are looked up; any other
 * value is handed back unchanged. The lookup is a single HEAD request with a
 * 5 second timeout, and only the first hop is read (the `Location` header of
 * that response) — further redirects are not followed.
 *
 * This function is best-effort: the returned promise never rejects. On a
 * malformed URL, an unsupported protocol, a network error, a timeout, or a
 * response without a `Location` header, it resolves with the original `url`.
 *
 * @param {string} url - Candidate source URL.
 * @returns {Promise<string>} The redirect target, or `url` when it cannot be resolved.
 */
function resolveRedirectUrl(url) {
  return new Promise((resolve) => {
    // Pass through anything that is not a grounding redirect link
    // (including empty or non-string values, which have no `.includes`).
    if (typeof url !== 'string' || !url.includes('grounding-api-redirect')) {
      resolve(url);
      return;
    }

    // Destroying a request on timeout also emits 'error', so guard against
    // settling the promise more than once.
    let settled = false;
    const finish = (value) => {
      if (settled) return;
      settled = true;
      resolve(value);
    };

    try {
      // Throws on malformed input; handled by the catch below so a single bad
      // source cannot reject the whole batch.
      const parsed = new URL(url);
      if (parsed.protocol !== 'https:' && parsed.protocol !== 'http:') {
        finish(url);
        return;
      }

      const protocol = parsed.protocol === 'https:' ? https : require('http');
      const req = protocol.request(url, { method: 'HEAD', timeout: 5000 }, (res) => {
        // Discard any response data so the socket is released.
        res.resume();

        const { location } = res.headers;
        if (!location) {
          finish(url);
          return;
        }

        // Keep an absolute Location exactly as the server sent it.
        try {
          new URL(location);
          finish(location);
          return;
        } catch { /* not absolute — resolve against the request URL below */ }

        try {
          finish(new URL(location, url).href);
        } catch {
          finish(url);
        }
      });
      req.on('error', () => finish(url));
      req.on('timeout', () => {
        req.destroy();
        finish(url);
      });
      req.end();
    } catch {
      finish(url);
    }
  });
}
|
|
116
|
+
|
|
117
|
+
/**
 * Run every source's `url` through `resolveRedirectUrl` concurrently.
 *
 * @param {Array<object>} sources - Source entries, each carrying a `url` property.
 * @returns {Promise<Array<object>>} Copies of the entries, in the same order,
 *   with `url` replaced by the value `resolveRedirectUrl` produced for it.
 */
async function resolveAllUrls(sources) {
  const pending = sources.map((entry) =>
    resolveRedirectUrl(entry.url).then((resolvedUrl) => ({ ...entry, url: resolvedUrl })),
  );
  return Promise.all(pending);
}
|
|
123
|
+
|
|
92
124
|
// ---------------------------------------------------------------------------
|
|
93
125
|
// Parse Grounding Metadata → Structured Output
|
|
94
126
|
// ---------------------------------------------------------------------------
|
|
95
|
-
function parseResponse(geminiResponse, query) {
|
|
127
|
+
async function parseResponse(geminiResponse, query) {
|
|
96
128
|
const candidate = geminiResponse.candidates?.[0];
|
|
97
129
|
if (!candidate) return { query, error: 'No candidates returned' };
|
|
98
130
|
|
|
@@ -100,11 +132,22 @@ function parseResponse(geminiResponse, query) {
|
|
|
100
132
|
const meta = candidate.groundingMetadata || {};
|
|
101
133
|
|
|
102
134
|
// Extract source URLs from groundingChunks
|
|
103
|
-
|
|
135
|
+
let sources = (meta.groundingChunks || []).map(chunk => ({
|
|
104
136
|
title: chunk.web?.title || 'Unknown',
|
|
105
137
|
url: chunk.web?.uri || '',
|
|
106
138
|
}));
|
|
107
139
|
|
|
140
|
+
// Resolve redirect URLs to real URLs
|
|
141
|
+
sources = await resolveAllUrls(sources);
|
|
142
|
+
|
|
143
|
+
// Deduplicate by resolved URL
|
|
144
|
+
const seen = new Set();
|
|
145
|
+
sources = sources.filter(s => {
|
|
146
|
+
if (seen.has(s.url)) return false;
|
|
147
|
+
seen.add(s.url);
|
|
148
|
+
return true;
|
|
149
|
+
});
|
|
150
|
+
|
|
108
151
|
// Extract search queries used by the model
|
|
109
152
|
const searchQueries = meta.webSearchQueries || [];
|
|
110
153
|
|
|
@@ -132,7 +175,8 @@ async function main() {
|
|
|
132
175
|
process.exit(1);
|
|
133
176
|
}
|
|
134
177
|
|
|
135
|
-
|
|
178
|
+
loadEnv();
|
|
179
|
+
const apiKey = process.env.GEMINI_API_KEY;
|
|
136
180
|
if (!apiKey) {
|
|
137
181
|
console.error(JSON.stringify({
|
|
138
182
|
error: 'GEMINI_API_KEY not found. Set it in .claude/.env or environment variable.'
|
|
@@ -140,15 +184,20 @@ async function main() {
|
|
|
140
184
|
process.exit(1);
|
|
141
185
|
}
|
|
142
186
|
|
|
143
|
-
//
|
|
144
|
-
|
|
187
|
+
// Determine which model to use. Precedence: SEARCH_MODEL, then MODEL, then VISUAL_MODEL (from the environment or .claude/.env); defaults to gemini-2.5-flash.
|
|
188
|
+
let model = process.env.SEARCH_MODEL || process.env.MODEL || process.env.VISUAL_MODEL || 'gemini-2.5-flash';
|
|
189
|
+
|
|
190
|
+
// Google Search Grounding ONLY supports Gemini models (not Claude, not Gemma)
|
|
191
|
+
if (!model.toLowerCase().includes('gemini') && !model.toLowerCase().includes('learnlm')) {
|
|
192
|
+
model = 'gemini-2.5-flash'; // Fallback to safe search model
|
|
193
|
+
}
|
|
145
194
|
|
|
146
195
|
const results = [];
|
|
147
196
|
|
|
148
197
|
for (const query of queries) {
|
|
149
198
|
try {
|
|
150
199
|
const raw = await callGemini(apiKey, query, model);
|
|
151
|
-
results.push(parseResponse(raw, query));
|
|
200
|
+
results.push(await parseResponse(raw, query));
|
|
152
201
|
} catch (err) {
|
|
153
202
|
results.push({ query, error: err.message });
|
|
154
203
|
}
|
|
@@ -23,11 +23,11 @@ Call the `TaskCreate` tool to spin up the `researcher` subagent.
|
|
|
23
23
|
**Instructions to pass to Researcher:**
|
|
24
24
|
```text
|
|
25
25
|
Conduct comprehensive research on: [topic]
|
|
26
|
-
Constraint 1: ALWAYS use `node .claude/scripts/web-search.cjs "query"` as
|
|
27
|
-
Constraint 2:
|
|
28
|
-
Constraint 3: Use
|
|
29
|
-
Constraint 4: Limit total search calls to a maximum of 5 distinct queries
|
|
30
|
-
Constraint 5:
|
|
26
|
+
Constraint 1: ALWAYS use `node .claude/scripts/web-search.cjs "[query]"` as the EXCLUSIVE primary search method. This tool uses Gemini Grounding and returns a synthesized answer + cited sources. Do NOT manually crawl source URLs if the script provides a sufficient answer.
|
|
27
|
+
Constraint 2: TRUST THE SYNTHESIS. The output contains the research results. Read the JSON and use the `answer` field directly. Do NOT write Python scripts to re-parse it or manually `Fetch` sources unless deep implementation details are missing.
|
|
28
|
+
Constraint 3: Use native WebSearch or manual Fetch ONLY if the script fails or returns no results.
|
|
29
|
+
Constraint 4: Limit total search calls to a maximum of 5 distinct queries.
|
|
30
|
+
Constraint 5: Stop excessive "chain-searching". Use the grounding answer as the definitive summary.
|
|
31
31
|
Output Format: Must strictly follow the 'Standard Research Report' layout.
|
|
32
32
|
```
|
|
33
33
|
|
|
@@ -237,7 +237,7 @@ Collects Core Web Vitals: `LCP`, `FID`, `CLS`, `FCP`, `TTFB`, `JSHeapUsedSize`.
|
|
|
237
237
|
|
|
238
238
|
---
|
|
239
239
|
|
|
240
|
-
### Phase C-5: Responsive Screenshots (screenshot.js &
|
|
240
|
+
### Phase C-5: Responsive Screenshots (screenshot.js & gemini_batch_process.py)
|
|
241
241
|
|
|
242
242
|
```bash
|
|
243
243
|
# Capture screenshots
|
|
@@ -23,7 +23,7 @@ Referenced by `SKILL.md` Phase 3 and by the `test-runner` agent.
|
|
|
23
23
|
| 12 | **SEO Error** | Missing meta tags, H1 issues, broken canonical | `evaluate.js` |
|
|
24
24
|
| 13 | **Security Warning** | Missing HTTP security headers, exposed secrets | `network.js` / source |
|
|
25
25
|
| 14 | **Broken Links** | Internal crawler found 404s on discovered pages | Phase 0.5 Crawler |
|
|
26
|
-
| 15 | **UI Visual Error** | AI visual analysis detected layout break, overlap | `
|
|
26
|
+
| 15 | **UI Visual Error** | AI visual analysis detected layout break, overlap | `gemini_batch_process.py` |
|
|
27
27
|
|
|
28
28
|
---
|
|
29
29
|
|
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
# `hapo:llm-moe` Skill
|
|
2
|
-
|
|
3
|
-
The **LLM Mixture-of-Experts (MoE) Hub** serves as a centralized gateway for executing advanced contextual tasks (Visual Analysis, Document Understanding, Data Extraction) across various models (Gemini, Gemma) using API scripts.
|
|
4
|
-
|
|
5
|
-
By decoupling LLM functionality into this standalone skill, `hapo:test` or any other workflow can simply offload complex multimodal tasks to `hapo:llm-moe` scripts rather than bundling their own integration.
|
|
6
|
-
|
|
7
|
-
## Core Capabilities (Hiện tại & Mở rộng)
|
|
8
|
-
|
|
9
|
-
Được thiết kế theo chuẩn MoE (Mixture of Experts), Skill này không chỉ giới hạn ở một Model duy nhất mà tự động định tuyến (route) các tác vụ chuyên biệt tới các model phù hợp (như *gemma-4-31b-it* cho logic, *gemini-2.5-flash* cho đa phương tiện siêu tốc, hoặc *gemini-2.5-pro* cho suy luận y tế/toán học).
|
|
10
|
-
|
|
11
|
-
### 1. Visual Understanding (Đã triển khai)
|
|
12
|
-
- **UI/UX Regression Analysis:** Phân tích ảnh chụp màn hình tự động, dò tìm CSS layout vỡ, button đè text, responsive rác.
|
|
13
|
-
- **Visual Q&A / Object Detection:** Truy vấn vị trí phần tử HTML, đếm số lượng form inputs trong màn hình.
|
|
14
|
-
- *(Sử dụng script: `scripts/visual-analyze.js`)*
|
|
15
|
-
|
|
16
|
-
### 2. Optical Character Recognition - OCR (Sắp triển khai)
|
|
17
|
-
- **Data Extraction:** Trích xuất mảng JSON từ ảnh chụp Hóa đơn (Invoices), Căn cước (ID Cards), hay Bảng biểu (Tables).
|
|
18
|
-
- **Handwriting Parsing:** Dịch chữ viết tay trên biểu mẫu thành text markdown.
|
|
19
|
-
|
|
20
|
-
### 3. Document AI & Parsing (Sắp triển khai)
|
|
21
|
-
- **PDF Comprehension:** Đọc và phân mảnh (chunking) file tài liệu PDF (>1000 trang) trả về các luồng tài liệu trích xuất nội dung liên quan (RAG base).
|
|
22
|
-
- **Codebase Indexing:** Hỗ trợ nhai các file log, file báo cáo (.csv, .xlsx) để báo cáo phân tích rủi ro hệ thống.
|
|
23
|
-
|
|
24
|
-
### 4. Generative Engineering (Sắp triển khai)
|
|
25
|
-
- **Code & Scaffold Generation:** Sinh cấu trúc thư mục, Boilerplate code dựa trên bản vẽ Design UI (Image-to-Code).
|
|
26
|
-
- **Audio/Video Transcribing:** Cắt âm thanh từ luồng test tích hợp (nếu có WebRTC/Media tests) và phân tích lỗi thoại.
|
|
27
|
-
|
|
28
|
-
## Usage Guide for Agents
|
|
29
|
-
|
|
30
|
-
Other agents (like `test-runner` or `reviewer`) should call `hapo:llm-moe`'s tools by launching its scripts securely via `bash`.
|
|
31
|
-
|
|
32
|
-
### 1. Visual Analysis (`visual-analyze.js`)
|
|
33
|
-
|
|
34
|
-
Used to interpret screenshot logic, UI regressions, or visual QA.
|
|
35
|
-
|
|
36
|
-
**Caller requirements:**
|
|
37
|
-
- Requires Node.js.
|
|
38
|
-
- Execution directory must be relative to the caller or via `{{SKILLS_DIR}}/llm-moe/scripts/...`
|
|
39
|
-
|
|
40
|
-
**Command format:**
|
|
41
|
-
```bash
|
|
42
|
-
node <path-to-skills>/llm-moe/scripts/visual-analyze.js \
|
|
43
|
-
--image "path/to/screenshot.png" \
|
|
44
|
-
--prompt "Check if the button overlaps the text."
|
|
45
|
-
```
|
|
46
|
-
|
|
47
|
-
**JSON Output:**
|
|
48
|
-
```json
|
|
49
|
-
{
|
|
50
|
-
"success": true,
|
|
51
|
-
"file": "path/to/screenshot.png",
|
|
52
|
-
"analysis": "The red submit button overlaps the footer text by 15px. Layout is broken."
|
|
53
|
-
}
|
|
54
|
-
```
|
|
55
|
-
|
|
56
|
-
## Model Configuration
|
|
57
|
-
|
|
58
|
-
The API Key is globally seeded during the `cafekit` setup into `~/.gemini/.env` and the local `.env` of your workspace.
|
|
59
|
-
|
|
60
|
-
Fallback override environment variables:
|
|
61
|
-
- `GEMINI_API_KEY`: The authentication key.
|
|
62
|
-
- `VISUAL_MODEL` (Optional): Specify the underlying Google/Gemma model (default: `gemma-4-31b-it`).
|
|
@@ -1,36 +0,0 @@
|
|
|
1
|
-
# Vision Understanding
|
|
2
|
-
|
|
3
|
-
The `hapo:llm-moe` skill leverages the `visual-analyze.js` agent-compatible script to convert unstructured screenshots and pictures into structured JSON insights.
|
|
4
|
-
|
|
5
|
-
## Quick Start (Node.js)
|
|
6
|
-
|
|
7
|
-
To parse an image, run the script from bash:
|
|
8
|
-
|
|
9
|
-
```bash
|
|
10
|
-
cd <skills_dir>/llm-moe/scripts
|
|
11
|
-
npm install # (only required on first setup to load @google/generative-ai)
|
|
12
|
-
|
|
13
|
-
node visual-analyze.js \
|
|
14
|
-
--image "../../test/screenshots/ui-error.png" \
|
|
15
|
-
--prompt "Find the login button and describe its visual state."
|
|
16
|
-
```
|
|
17
|
-
|
|
18
|
-
## Prompt Engineering for Vision
|
|
19
|
-
|
|
20
|
-
To get the most accurate layout results from `gemma-4-31b-it` or `gemini-2.5-flash`, structure your `--prompt` effectively:
|
|
21
|
-
|
|
22
|
-
### Regression Testing (UI Checks)
|
|
23
|
-
Instead of asking "Is it broken?", ask for explicit state analysis:
|
|
24
|
-
> `"List all elements that are overlapping. Determine if the main text is cut off by the border constraints. Return only the issues found."`
|
|
25
|
-
|
|
26
|
-
### Component Identification
|
|
27
|
-
> `"Identify the CSS Hex color of the top primary button and describe its corner radius visually."`
|
|
28
|
-
|
|
29
|
-
### Accessibility Visual Check
|
|
30
|
-
> `"Look at the form inputs in this image. Do their text contrasts against the background seem accessible? Are the labels visually aligned with the inputs?"`
|
|
31
|
-
|
|
32
|
-
## Technical Details
|
|
33
|
-
|
|
34
|
-
- **Supported Inputs:** PNG, JPEG, WEBP.
|
|
35
|
-
- **Size Limitation:** Try to keep screenshots under 4MB to ensure fast tokenization.
|
|
36
|
-
- **Model Loading:** The script creates an inline base64 string and embeds it directly into the Prompt Part payload of the `@google/generative-ai` request matrix.
|
|
@@ -1,10 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"name": "hapo-llm-moe-scripts",
|
|
3
|
-
"version": "1.0.0",
|
|
4
|
-
"description": "LLM scripts for visual and text analysis using Gemini/Gemma models.",
|
|
5
|
-
"main": "visual-analyze.js",
|
|
6
|
-
"type": "module",
|
|
7
|
-
"dependencies": {
|
|
8
|
-
"@google/generative-ai": "^0.21.0"
|
|
9
|
-
}
|
|
10
|
-
}
|
|
@@ -1,67 +0,0 @@
|
|
|
1
|
-
import { parseArgs } from 'util';
|
|
2
|
-
import fs from 'fs';
|
|
3
|
-
import { GoogleGenerativeAI } from '@google/generative-ai';
|
|
4
|
-
|
|
5
|
-
const options = {
|
|
6
|
-
image: { type: 'string' },
|
|
7
|
-
prompt: { type: 'string', default: 'Identify any UI bugs, visual overlaps, missing images, or cut-off text in this screenshot. Return a concise analysis.' },
|
|
8
|
-
};
|
|
9
|
-
|
|
10
|
-
const args = parseArgs({ args: process.argv.slice(2), options }).values;
|
|
11
|
-
|
|
12
|
-
if (!args.image || !fs.existsSync(args.image)) {
|
|
13
|
-
console.error("Error: --image path is required and must exist.");
|
|
14
|
-
process.exit(1);
|
|
15
|
-
}
|
|
16
|
-
|
|
17
|
-
// Ensure the API key is set
|
|
18
|
-
const apiKey = process.env.GEMINI_API_KEY;
|
|
19
|
-
if (!apiKey) {
|
|
20
|
-
console.error(JSON.stringify({
|
|
21
|
-
success: false,
|
|
22
|
-
error: "GEMINI_API_KEY environment variable is missing."
|
|
23
|
-
}));
|
|
24
|
-
process.exit(1);
|
|
25
|
-
}
|
|
26
|
-
|
|
27
|
-
async function analyze() {
|
|
28
|
-
try {
|
|
29
|
-
const genAI = new GoogleGenerativeAI(apiKey);
|
|
30
|
-
const modelName = process.env.VISUAL_MODEL || "gemma-4-31b-it";
|
|
31
|
-
const model = genAI.getGenerativeModel({ model: modelName });
|
|
32
|
-
|
|
33
|
-
const imageBytes = fs.readFileSync(args.image).toString("base64");
|
|
34
|
-
|
|
35
|
-
// Validate image format based on extension (simple approximation)
|
|
36
|
-
const ext = args.image.split('.').pop().toLowerCase();
|
|
37
|
-
const mimeType = ext === 'png' ? 'image/png' : (ext === 'webp' ? 'image/webp' : 'image/jpeg');
|
|
38
|
-
|
|
39
|
-
const result = await model.generateContent([
|
|
40
|
-
{ text: args.prompt },
|
|
41
|
-
{
|
|
42
|
-
inlineData: {
|
|
43
|
-
data: imageBytes,
|
|
44
|
-
mimeType: mimeType
|
|
45
|
-
}
|
|
46
|
-
}
|
|
47
|
-
]);
|
|
48
|
-
|
|
49
|
-
const analysisText = result.response.text();
|
|
50
|
-
|
|
51
|
-
console.log(JSON.stringify({
|
|
52
|
-
success: true,
|
|
53
|
-
file: args.image,
|
|
54
|
-
analysis: analysisText
|
|
55
|
-
}, null, 2));
|
|
56
|
-
|
|
57
|
-
} catch (error) {
|
|
58
|
-
console.error(JSON.stringify({
|
|
59
|
-
success: false,
|
|
60
|
-
file: args.image,
|
|
61
|
-
error: error.message
|
|
62
|
-
}, null, 2));
|
|
63
|
-
process.exit(1);
|
|
64
|
-
}
|
|
65
|
-
}
|
|
66
|
-
|
|
67
|
-
analyze();
|