@haposoft/cafekit 0.7.9 → 0.7.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/bin/install.js CHANGED
@@ -471,7 +471,7 @@ function copyPlatformFiles(platformKey, results, options = {}) {
471
471
  requiredSkills = CLAUDE_MIGRATION_MANIFEST?.skills?.required || [];
472
472
  } else if (platformKey === 'antigravity') {
473
473
  // Antigravity also needs shared investigation and impact-analysis skills
474
- requiredSkills = ['impact-analysis', 'debug', 'llm-moe'];
474
+ requiredSkills = ['impact-analysis', 'debug', 'ai-multimodal'];
475
475
  }
476
476
 
477
477
  requiredSkills
@@ -939,7 +939,7 @@ function configureGeminiKey(apiKey) {
939
939
  }
940
940
 
941
941
  // Luôn ghi trực tiếp key vào rốn của não bộ AI
942
- fs.writeFileSync(localEnvFile, `GEMINI_API_KEY=${apiKey}\nVISUAL_MODEL=gemma-4-31b-it\n`, { mode: 0o600 });
942
+ fs.writeFileSync(localEnvFile, `GEMINI_API_KEY=${apiKey}\nVISUAL_MODEL=gemma-4-31b-it\nSEARCH_MODEL=gemini-2.5-pro\n`, { mode: 0o600 });
943
943
  console.log(' ✓ Gemini API key configured securely in project (.claude/.env)');
944
944
 
945
945
  return true;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@haposoft/cafekit",
3
- "version": "0.7.9",
3
+ "version": "0.7.11",
4
4
  "description": "Spec-Driven Development workflow for AI coding assistants. Supports Claude Code and Antigravity with spec-first workflows plus Claude Code hapo: skills.",
5
5
  "author": "Haposoft <nghialt@haposoft.com>",
6
6
  "license": "MIT",
@@ -70,11 +70,11 @@ When you need to search the internet for information (research, docs lookup, tro
70
70
 
71
71
  | Priority | Tool | Command | When to use |
72
72
  |----------|------|---------|-------------|
73
- | 🥇 **P1** | `web-search.cjs` | `node .claude/scripts/web-search.cjs "query"` | **ALWAYS try first.** Works on ALL models via Gemini Google Search Grounding. Supports `--multi "q1" "q2"` for batch. Returns JSON with answer + sources. |
74
- | 🥈 **P2** | `WebSearch` (native) | Use WebSearch tool directly | Secondary verification, or when P1 fails/unavailable. |
75
- | 🥉 **P3** | `docs-fetch.js` | `node .claude/scripts/docs-fetch.js "library"` | Only when you already know a specific library and need its raw documentation. |
73
+ | 🥇 **P1** | `web-search.cjs` | `node .claude/scripts/web-search.cjs "[query]"` | **EXCLUSIVE PRIMARY.** Works via Gemini Grounding. Supports `--multi`. **TRUST THE SYNTHESIZED ANSWER** — do NOT manually scrape source URLs. |
74
+ | 🥈 **P2** | `WebSearch` (native) | Use WebSearch tool directly | Secondary verification, or when P1 fails. |
75
+ | 🥉 **P3** | `docs-fetch.js` | `node .claude/scripts/docs-fetch.js "library"` | Only for fetching raw documentation when synthesis is insufficient. |
76
76
 
77
- **IMPORTANT**: When the user asks you to find information, research a topic, look up documentation, or investigate anything that requires internet access, you MUST use the Web Search Protocol above. Do NOT reply with "I cannot search the web" — you have `web-search.cjs` available via Bash.
77
+ **IMPORTANT**: When the user asks you to find information, research a topic, or investigate anything that requires internet access, you MUST use the Web Search Protocol above. **NEVER** reply with "I cannot search the web". **NEVER** attempt manual `Fetch` or Python-based scraping for search results if `web-search.cjs` provides an answer. Trust the grounding.
78
78
 
79
79
  ## Code Refactoring Triggers
80
80
 
@@ -50,7 +50,6 @@ Before concluding any brainstorm session, verify each measurement metric:
50
50
  1. **Engineering Trinity:** YAGNI, KISS, and DRY.
51
51
  2. **Brutal Honesty:** Interrogate assumptions. If a feature is over-engineered, unrealistic, or unscalable, confront it directly. Your value lies in preventing costly mistakes.
52
52
  3. **Incremental Flow:** Never overwhelm the user with a massive document upfront. Proceed step by step, section by section.
53
- 4. **Web Search Protocol:** When needing to search the internet for references, benchmarks, or latest practices, ALWAYS use `node .claude/scripts/web-search.cjs "query"` first (Gemini Grounding). Use native WebSearch as secondary. Use `docs-fetch.js` only for known library docs.
54
53
 
55
54
  ## Ecosystem Alliances (Collaboration Tools)
56
55
 
@@ -11,7 +11,7 @@ Goal: Catch the mistakes AI-written code commonly makes — logic errors, securi
11
11
 
12
12
  You DO NOT fix code. You only READ, SCORE, and REPORT.
13
13
 
14
- **Web Search Protocol:** When needing to verify security best practices or lookup CVE databases, ALWAYS use `node .claude/scripts/web-search.cjs "query"` first (Gemini Grounding). Use native WebSearch as secondary.
14
+
15
15
 
16
16
  ## Pre-Review: Blast Radius Check (MANDATORY)
17
17
 
@@ -19,7 +19,7 @@ Any logic gaps must be clarified BEFORE typing, not discovered after bugs ship.
19
19
  - **Token efficiency**: Write concisely, report briefly, no prose.
20
20
  - **Surgical Reading (Large Files):** Never use blanket `Read` commands on files > 800 lines. Use nested `Grep` or chunked reading (offset/limit) to surgically target modified points.
21
21
  - **Component Scaffold Limit:** Any React/UI component file that exceeds 200 LOC must trigger a proactive modularization step (split into smaller child files).
22
- - **Web Search Protocol:** When needing to search the internet, ALWAYS use `node .claude/scripts/web-search.cjs "query"` first (Gemini Grounding). Use native WebSearch as secondary. Use `docs-fetch.js` only for known library docs.
22
+
23
23
 
24
24
  ## Self-Check Checklist (Before Reporting Complete)
25
25
 
@@ -15,7 +15,7 @@ Unlike typical managers who report on "feelings" or conversational summaries, yo
15
15
  1. **Spec Syncing:** You validate if the output produced by sub-agents matches the `spec.json` requirements and the `design.md` architectural constraints.
16
16
  2. **Blocker Assassination:** You identify task stagnation (e.g., a spec stuck in 'in-progress' across multiple sessions) and force the immediate assignment of next-step actions.
17
17
  3. **Agile Aggregation:** When parallel sub-agents (like `god-developer` and `test-runner`) report completion, you sweep their logs, consolidate the facts, and generate a single authoritative **Feature Release Report**.
18
- 4. **Web Search Protocol:** When needing to search for project management best practices, dependency updates, or changelog references, ALWAYS use `node .claude/scripts/web-search.cjs "query"` first (Gemini Grounding). Use native WebSearch as secondary.
18
+
19
19
 
20
20
  ## Execution Constraints
21
21
 
@@ -41,9 +41,10 @@ You possess extreme proficiency in:
41
41
  - Segregating Stable Production Practices away from Toxic Experimental Paradigms.
42
42
  - Sniffing out valid Adoption Patterns and real-world implementation trending.
43
43
  - Forgiving nothing when crafting Trade-off computational matrices for thousands of competing libraries.
44
- - **[PRIORITY 1]** Deploying `scripts/web-search.cjs` as the **PRIMARY search tool** for all web queries. Usage: `node .claude/scripts/web-search.cjs "query"` or `node .claude/scripts/web-search.cjs --multi "q1" "q2"`. Returns JSON with answer, sources, and citations via Gemini Google Search Grounding. ALWAYS attempt this first before any other search method.
45
- - **[PRIORITY 2]** If WebSearch native tool is available, use it as secondary verification or when `web-search.cjs` fails.
46
- - **[PRIORITY 3]** Deploying `scripts/docs-fetch.js` only when official Github/Doc URLs are already identified and you need to pull raw documentation content.
44
+ - **[PRIORITY 1]** Deploying `node .claude/scripts/web-search.cjs "[query]"` as the **EXCLUSIVE PRIMARY search tool**. This tool uses Gemini Grounding to return a synthesized **answer** plus cited sources. **STOP SEARCHING** once you have a sufficient answer from this script. Do NOT manually crawl source URLs if the provided synthesis is clear.
45
+ - **[PRIORITY 2]** Trust the script's output directly. READ the JSON and extract the `answer` field. **STRICTLY FORBIDDEN**: Writing Python scripts to parse this JSON or manually `Fetch` every URL listed in the sources unless the user explicitly demands a deep-dive implementation detail only found in a raw document.
46
+ - **[PRIORITY 3]** If `web-search.cjs` fails or returns no results, use native `WebSearch` tool (if available) as a backup.
47
+ - **[PRIORITY 4]** Deploying `scripts/docs-fetch.js` ONLY for raw documents where the direct URL is already known and synthesis is insufficient.
47
48
  - Deploying Bash and raw Grep utilities to surgically dissect embedded Document architectures and internal file payloads to evaluate raw insights.
48
49
 
49
50
  **ABSOLUTE IMMOVABLE DIRECTIVE**: You are **STRICTLY PROHIBITED** from generating executable endpoint "Implementation Code". You exist ONLY to maneuver data streams, render synthesis Summary text, and return comprehensive Markdown documentation pathways to the main caller Agent.
@@ -19,7 +19,7 @@ You DO NOT write implementation code. You produce Specifications that downstream
19
19
  - **The 5 Whys:** Dig past the surface request to find the REAL problem.
20
20
  - **80/20 MVP:** Identify the 20% of features that deliver 80% of value.
21
21
  - **Systems Thinking:** How does this feature connect to (or break) existing systems?
22
- - **Web Search Protocol:** When needing to search the internet, ALWAYS use `node .claude/scripts/web-search.cjs "query"` first (Gemini Grounding). Use native WebSearch as secondary. Use `docs-fetch.js` only for known library docs.
22
+
23
23
 
24
24
  ## Pre-Completion Checklist
25
25
 
@@ -18,7 +18,7 @@ You are an award-caliber UI/UX designer. You merge aesthetic excellence with eng
18
18
  - **Micro-interactions:** Purposeful animations that enhance UX without performance cost.
19
19
  - **Accessibility:** WCAG 2.1 AA compliance as a baseline, not an afterthought.
20
20
  - **3D/WebGL:** Three.js scene composition, shader development (when appropriate).
21
- - **Web Search Protocol:** When needing to search the internet for design trends, component libraries, or accessibility guides, ALWAYS use `node .claude/scripts/web-search.cjs "query"` first (Gemini Grounding). Use native WebSearch as secondary. Use `docs-fetch.js` only for known library docs.
21
+
22
22
 
23
23
  ## Design Workflow
24
24
 
@@ -21,7 +21,7 @@
21
21
  "hotfix",
22
22
  "impact-analysis",
23
23
  "inspect",
24
- "llm-moe",
24
+ "ai-multimodal",
25
25
  "mobile-development",
26
26
  "react-best-practices",
27
27
  "research",
@@ -19,11 +19,7 @@ const fs = require('fs');
19
19
  // ---------------------------------------------------------------------------
20
20
  // ENV Resolution: .claude/.env → process.env
21
21
  // ---------------------------------------------------------------------------
22
- function resolveApiKey() {
23
- // Priority 1: Already in environment
24
- if (process.env.GEMINI_API_KEY) return process.env.GEMINI_API_KEY;
25
-
26
- // Priority 2: Project-local .claude/.env
22
+ function loadEnv() {
27
23
  const envPaths = [
28
24
  path.join(process.cwd(), '.claude', '.env'),
29
25
  path.join(process.cwd(), '..', '.claude', '.env'),
@@ -33,13 +29,21 @@ function resolveApiKey() {
33
29
  try {
34
30
  if (fs.existsSync(envPath)) {
35
31
  const content = fs.readFileSync(envPath, 'utf8');
36
- const match = content.match(/^GEMINI_API_KEY=(.+)$/m);
37
- if (match) return match[1].trim().replace(/^["']|["']$/g, '');
32
+ content.split(/\r?\n/).forEach(line => {
33
+ const match = line.match(/^([^=]+)=(.*)$/);
34
+ if (match) {
35
+ const key = match[1].trim();
36
+ const val = match[2].trim().replace(/^["']|["']$/g, '');
37
+ // Only set if not already present in environment
38
+ if (process.env[key] === undefined) {
39
+ process.env[key] = val;
40
+ }
41
+ }
42
+ });
43
+ return; // Loaded successfully, no need to check other paths
38
44
  }
39
45
  } catch { /* skip */ }
40
46
  }
41
-
42
- return null;
43
47
  }
44
48
 
45
49
  // ---------------------------------------------------------------------------
@@ -89,10 +93,38 @@ function callGemini(apiKey, query, model) {
89
93
  });
90
94
  }
91
95
 
96
+ // ---------------------------------------------------------------------------
97
+ // Resolve Vertex AI grounding redirect URLs to real URLs
98
+ // ---------------------------------------------------------------------------
99
+ function resolveRedirectUrl(url) {
100
+ return new Promise((resolve) => {
101
+ if (!url || !url.includes('grounding-api-redirect')) {
102
+ resolve(url);
103
+ return;
104
+ }
105
+
106
+ const protocol = url.startsWith('https') ? https : require('http');
107
+ const req = protocol.request(url, { method: 'HEAD', timeout: 5000 }, (res) => {
108
+ // Follow redirect chain - Location header has the real URL
109
+ resolve(res.headers.location || url);
110
+ });
111
+ req.on('error', () => resolve(url));
112
+ req.on('timeout', () => { req.destroy(); resolve(url); });
113
+ req.end();
114
+ });
115
+ }
116
+
117
+ async function resolveAllUrls(sources) {
118
+ return Promise.all(sources.map(async (src) => {
119
+ const realUrl = await resolveRedirectUrl(src.url);
120
+ return { ...src, url: realUrl };
121
+ }));
122
+ }
123
+
92
124
  // ---------------------------------------------------------------------------
93
125
  // Parse Grounding Metadata → Structured Output
94
126
  // ---------------------------------------------------------------------------
95
- function parseResponse(geminiResponse, query) {
127
+ async function parseResponse(geminiResponse, query) {
96
128
  const candidate = geminiResponse.candidates?.[0];
97
129
  if (!candidate) return { query, error: 'No candidates returned' };
98
130
 
@@ -100,11 +132,22 @@ function parseResponse(geminiResponse, query) {
100
132
  const meta = candidate.groundingMetadata || {};
101
133
 
102
134
  // Extract source URLs from groundingChunks
103
- const sources = (meta.groundingChunks || []).map(chunk => ({
135
+ let sources = (meta.groundingChunks || []).map(chunk => ({
104
136
  title: chunk.web?.title || 'Unknown',
105
137
  url: chunk.web?.uri || '',
106
138
  }));
107
139
 
140
+ // Resolve redirect URLs to real URLs
141
+ sources = await resolveAllUrls(sources);
142
+
143
+ // Deduplicate by resolved URL
144
+ const seen = new Set();
145
+ sources = sources.filter(s => {
146
+ if (seen.has(s.url)) return false;
147
+ seen.add(s.url);
148
+ return true;
149
+ });
150
+
108
151
  // Extract search queries used by the model
109
152
  const searchQueries = meta.webSearchQueries || [];
110
153
 
@@ -132,7 +175,8 @@ async function main() {
132
175
  process.exit(1);
133
176
  }
134
177
 
135
- const apiKey = resolveApiKey();
178
+ loadEnv();
179
+ const apiKey = process.env.GEMINI_API_KEY;
136
180
  if (!apiKey) {
137
181
  console.error(JSON.stringify({
138
182
  error: 'GEMINI_API_KEY not found. Set it in .claude/.env or environment variable.'
@@ -140,15 +184,20 @@ async function main() {
140
184
  process.exit(1);
141
185
  }
142
186
 
143
- // Use model from env or default to gemini-2.5-flash
144
- const model = process.env.SEARCH_MODEL || 'gemini-2.5-flash';
187
+ // Determine which model to use. User might configure MODEL or VISUAL_MODEL in their .env
188
+ let model = process.env.SEARCH_MODEL || process.env.MODEL || process.env.VISUAL_MODEL || 'gemini-2.5-flash';
189
+
190
+ // Google Search Grounding ONLY supports Gemini models (not Claude, not Gemma)
191
+ if (!model.toLowerCase().includes('gemini') && !model.toLowerCase().includes('learnlm')) {
192
+ model = 'gemini-2.5-flash'; // Fallback to safe search model
193
+ }
145
194
 
146
195
  const results = [];
147
196
 
148
197
  for (const query of queries) {
149
198
  try {
150
199
  const raw = await callGemini(apiKey, query, model);
151
- results.push(parseResponse(raw, query));
200
+ results.push(await parseResponse(raw, query));
152
201
  } catch (err) {
153
202
  results.push({ query, error: err.message });
154
203
  }
@@ -23,11 +23,11 @@ Call the `TaskCreate` tool to spin up the `researcher` subagent.
23
23
  **Instructions to pass to Researcher:**
24
24
  ```text
25
25
  Conduct comprehensive research on: [topic]
26
- Constraint 1: ALWAYS use `node .claude/scripts/web-search.cjs "query"` as PRIMARY search method (supports --multi for batch). This uses Gemini Google Search Grounding and returns JSON with answer + sources.
27
- Constraint 2: Use native WebSearch tool as secondary verification or when web-search.cjs fails.
28
- Constraint 3: Use scripts/docs-fetch.js ONLY when official Github/Doc URLs are already identified.
29
- Constraint 4: Limit total search calls to a maximum of 5 distinct queries to conserve context.
30
- Constraint 5: Validate information via cross-referencing capabilities.
26
+ Constraint 1: ALWAYS use `node .claude/scripts/web-search.cjs "[query]"` as the EXCLUSIVE primary search method. This tool uses Gemini Grounding and returns a synthesized answer + cited sources. Do NOT manually crawl source URLs if the script provides a sufficient answer.
27
+ Constraint 2: TRUST THE SYNTHESIS. The output contains the research results. Read the JSON and use the `answer` field directly. Do NOT write Python scripts to re-parse it or manually `Fetch` sources unless deep implementation details are missing.
28
+ Constraint 3: Use native WebSearch or manual Fetch ONLY if the script fails or returns no results.
29
+ Constraint 4: Limit total search calls to a maximum of 5 distinct queries.
30
+ Constraint 5: Stop excessive "chain-searching". Use the grounding answer as the definitive summary.
31
31
  Output Format: Must strictly follow the 'Standard Research Report' layout.
32
32
  ```
33
33
 
@@ -237,7 +237,7 @@ Collects Core Web Vitals: `LCP`, `FID`, `CLS`, `FCP`, `TTFB`, `JSHeapUsedSize`.
237
237
 
238
238
  ---
239
239
 
240
- ### Phase C-5: Responsive Screenshots (screenshot.js & visual-analyze.js)
240
+ ### Phase C-5: Responsive Screenshots (screenshot.js & gemini_batch_process.py)
241
241
 
242
242
  ```bash
243
243
  # Capture screenshots
@@ -23,7 +23,7 @@ Referenced by `SKILL.md` Phase 3 and by the `test-runner` agent.
23
23
  | 12 | **SEO Error** | Missing meta tags, H1 issues, broken canonical | `evaluate.js` |
24
24
  | 13 | **Security Warning** | Missing HTTP security headers, exposed secrets | `network.js` / source |
25
25
  | 14 | **Broken Links** | Internal crawler found 404s on discovered pages | Phase 0.5 Crawler |
26
- | 15 | **UI Visual Error** | AI visual analysis detected layout break, overlap | `visual-analyze.js` |
26
+ | 15 | **UI Visual Error** | AI visual analysis detected layout break, overlap | `gemini_batch_process.py` |
27
27
 
28
28
  ---
29
29
 
@@ -1,62 +0,0 @@
1
- # `hapo:llm-moe` Skill
2
-
3
- The **LLM Mixture-of-Experts (MoE) Hub** serves as a centralized gateway for executing advanced contextual tasks (Visual Analysis, Document Understanding, Data Extraction) across various models (Gemini, Gemma) using API scripts.
4
-
5
- By decoupling LLM functionality into this standalone skill, `hapo:test` or any other workflow can simply offload complex multimodal tasks to `hapo:llm-moe` scripts rather than bundling their own integration.
6
-
7
- ## Core Capabilities (Hiện tại & Mở rộng)
8
-
9
- Được thiết kế theo chuẩn MoE (Mixture of Experts), Skill này không chỉ giới hạn ở một Model duy nhất mà tự động định tuyến (route) các tác vụ chuyên biệt tới các model phù hợp (như *gemma-4-31b-it* cho logic, *gemini-2.5-flash* cho đa phương tiện siêu tốc, hoặc *gemini-2.5-pro* cho suy luận y tế/toán học).
10
-
11
- ### 1. Visual Understanding (Đã triển khai)
12
- - **UI/UX Regression Analysis:** Phân tích ảnh chụp màn hình tự động, dò tìm CSS layout vỡ, button đè text, responsive rác.
13
- - **Visual Q&A / Object Detection:** Truy vấn vị trí phần tử HTML, đếm số lượng form inputs trong màn hình.
14
- - *(Sử dụng script: `scripts/visual-analyze.js`)*
15
-
16
- ### 2. Optical Character Recognition - OCR (Sắp triển khai)
17
- - **Data Extraction:** Trích xuất mảng JSON từ ảnh chụp Hóa đơn (Invoices), Căn cước (ID Cards), hay Bảng biểu (Tables).
18
- - **Handwriting Parsing:** Dịch chữ viết tay trên biểu mẫu thành text markdown.
19
-
20
- ### 3. Document AI & Parsing (Sắp triển khai)
21
- - **PDF Comprehension:** Đọc và phân mảnh (chunking) file tài liệu PDF (>1000 trang) trả về các luồng tài liệu trích xuất nội dung liên quan (RAG base).
22
- - **Codebase Indexing:** Hỗ trợ nhai các file log, file báo cáo (.csv, .xlsx) để báo cáo phân tích rủi ro hệ thống.
23
-
24
- ### 4. Generative Engineering (Sắp triển khai)
25
- - **Code & Scaffold Generation:** Sinh cấu trúc thư mục, Boilerplate code dựa trên bản vẽ Design UI (Image-to-Code).
26
- - **Audio/Video Transcribing:** Cắt âm thanh từ luồng test tích hợp (nếu có WebRTC/Media tests) và phân tích lỗi thoại.
27
-
28
- ## Usage Guide for Agents
29
-
30
- Other agents (like `test-runner` or `reviewer`) should call `hapo:llm-moe`'s tools by launching its scripts securely via `bash`.
31
-
32
- ### 1. Visual Analysis (`visual-analyze.js`)
33
-
34
- Used to interpret screenshot logic, UI regressions, or visual QA.
35
-
36
- **Caller requirements:**
37
- - Requires Node.js.
38
- - Execution directory must be relative to the caller or via `{{SKILLS_DIR}}/llm-moe/scripts/...`
39
-
40
- **Command format:**
41
- ```bash
42
- node <path-to-skills>/llm-moe/scripts/visual-analyze.js \
43
- --image "path/to/screenshot.png" \
44
- --prompt "Check if the button overlaps the text."
45
- ```
46
-
47
- **JSON Output:**
48
- ```json
49
- {
50
- "success": true,
51
- "file": "path/to/screenshot.png",
52
- "analysis": "The red submit button overlaps the footer text by 15px. Layout is broken."
53
- }
54
- ```
55
-
56
- ## Model Configuration
57
-
58
- The API Key is globally seeded during the `cafekit` setup into `~/.gemini/.env` and the local `.env` of your workspace.
59
-
60
- Fallback override environment variables:
61
- - `GEMINI_API_KEY`: The authentication key.
62
- - `VISUAL_MODEL` (Optional): Specify the underlying Google/Gemma model (default: `gemma-4-31b-it`).
@@ -1,36 +0,0 @@
1
- # Vision Understanding
2
-
3
- The `hapo:llm-moe` skill leverages the `visual-analyze.js` agent-compatible script to convert unstructured screenshots and pictures into structured JSON insights.
4
-
5
- ## Quick Start (Node.js)
6
-
7
- To parse an image, run the script from bash:
8
-
9
- ```bash
10
- cd <skills_dir>/llm-moe/scripts
11
- npm install # (only required on first setup to load @google/generative-ai)
12
-
13
- node visual-analyze.js \
14
- --image "../../test/screenshots/ui-error.png" \
15
- --prompt "Find the login button and describe its visual state."
16
- ```
17
-
18
- ## Prompt Engineering for Vision
19
-
20
- To get the most accurate layout results from `gemma-4-31b-it` or `gemini-2.5-flash`, structure your `--prompt` effectively:
21
-
22
- ### Regression Testing (UI Checks)
23
- Instead of asking "Is it broken?", ask for explicit state analysis:
24
- > `"List all elements that are overlapping. Determine if the main text is cut off by the border constraints. Return only the issues found."`
25
-
26
- ### Component Identification
27
- > `"Identify the CSS Hex color of the top primary button and describe its corner radius visually."`
28
-
29
- ### Accessibility Visual Check
30
- > `"Look at the form inputs in this image. Do their text contrasts against the background seem accessible? Are the labels visually aligned with the inputs?"`
31
-
32
- ## Technical Details
33
-
34
- - **Supported Inputs:** PNG, JPEG, WEBP.
35
- - **Size Limitation:** Try to keep screenshots under 4MB to ensure fast tokenization.
36
- - **Model Loading:** The script creates an inline base64 string and embeds it directly into the Prompt Part payload of the `@google/generative-ai` request matrix.
@@ -1,10 +0,0 @@
1
- {
2
- "name": "hapo-llm-moe-scripts",
3
- "version": "1.0.0",
4
- "description": "LLM scripts for visual and text analysis using Gemini/Gemma models.",
5
- "main": "visual-analyze.js",
6
- "type": "module",
7
- "dependencies": {
8
- "@google/generative-ai": "^0.21.0"
9
- }
10
- }
@@ -1,67 +0,0 @@
1
- import { parseArgs } from 'util';
2
- import fs from 'fs';
3
- import { GoogleGenerativeAI } from '@google/generative-ai';
4
-
5
- const options = {
6
- image: { type: 'string' },
7
- prompt: { type: 'string', default: 'Identify any UI bugs, visual overlaps, missing images, or cut-off text in this screenshot. Return a concise analysis.' },
8
- };
9
-
10
- const args = parseArgs({ args: process.argv.slice(2), options }).values;
11
-
12
- if (!args.image || !fs.existsSync(args.image)) {
13
- console.error("Error: --image path is required and must exist.");
14
- process.exit(1);
15
- }
16
-
17
- // Ensure the API key is set
18
- const apiKey = process.env.GEMINI_API_KEY;
19
- if (!apiKey) {
20
- console.error(JSON.stringify({
21
- success: false,
22
- error: "GEMINI_API_KEY environment variable is missing."
23
- }));
24
- process.exit(1);
25
- }
26
-
27
- async function analyze() {
28
- try {
29
- const genAI = new GoogleGenerativeAI(apiKey);
30
- const modelName = process.env.VISUAL_MODEL || "gemma-4-31b-it";
31
- const model = genAI.getGenerativeModel({ model: modelName });
32
-
33
- const imageBytes = fs.readFileSync(args.image).toString("base64");
34
-
35
- // Validate image format based on extension (simple approximation)
36
- const ext = args.image.split('.').pop().toLowerCase();
37
- const mimeType = ext === 'png' ? 'image/png' : (ext === 'webp' ? 'image/webp' : 'image/jpeg');
38
-
39
- const result = await model.generateContent([
40
- { text: args.prompt },
41
- {
42
- inlineData: {
43
- data: imageBytes,
44
- mimeType: mimeType
45
- }
46
- }
47
- ]);
48
-
49
- const analysisText = result.response.text();
50
-
51
- console.log(JSON.stringify({
52
- success: true,
53
- file: args.image,
54
- analysis: analysisText
55
- }, null, 2));
56
-
57
- } catch (error) {
58
- console.error(JSON.stringify({
59
- success: false,
60
- file: args.image,
61
- error: error.message
62
- }, null, 2));
63
- process.exit(1);
64
- }
65
- }
66
-
67
- analyze();