@goonnguyen/human-mcp 1.3.0 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +261 -19
- package/bin/human-mcp.js +2 -0
- package/dist/index.js +65180 -1698
- package/package.json +19 -2
- package/.claude/agents/code-reviewer.md +0 -140
- package/.claude/agents/database-admin.md +0 -86
- package/.claude/agents/debugger.md +0 -119
- package/.claude/agents/docs-manager.md +0 -113
- package/.claude/agents/git-manager.md +0 -59
- package/.claude/agents/planner-researcher.md +0 -97
- package/.claude/agents/project-manager.md +0 -113
- package/.claude/agents/tester.md +0 -95
- package/.claude/commands/cook.md +0 -7
- package/.claude/commands/debug.md +0 -10
- package/.claude/commands/docs/init.md +0 -11
- package/.claude/commands/docs/update.md +0 -11
- package/.claude/commands/fix/ci.md +0 -8
- package/.claude/commands/fix/fast.md +0 -5
- package/.claude/commands/fix/hard.md +0 -7
- package/.claude/commands/fix/test.md +0 -16
- package/.claude/commands/git/cm.md +0 -5
- package/.claude/commands/git/cp.md +0 -4
- package/.claude/commands/plan/ci.md +0 -12
- package/.claude/commands/plan/two.md +0 -13
- package/.claude/commands/plan.md +0 -10
- package/.claude/commands/test.md +0 -7
- package/.claude/commands/watzup.md +0 -8
- package/.claude/hooks/telegram_notify.sh +0 -136
- package/.claude/send-discord.sh +0 -64
- package/.claude/settings.json +0 -7
- package/.claude/statusline.sh +0 -143
- package/.dockerignore +0 -81
- package/.env.example +0 -44
- package/.github/workflows/publish.yml +0 -88
- package/.opencode/agent/code-reviewer.md +0 -142
- package/.opencode/agent/debugger.md +0 -74
- package/.opencode/agent/docs-manager.md +0 -119
- package/.opencode/agent/git-manager.md +0 -60
- package/.opencode/agent/planner-researcher.md +0 -100
- package/.opencode/agent/project-manager.md +0 -113
- package/.opencode/agent/system-architecture.md +0 -200
- package/.opencode/agent/tester.md +0 -96
- package/.opencode/agent/ui-ux-developer.md +0 -97
- package/.opencode/command/cook.md +0 -7
- package/.opencode/command/debug.md +0 -10
- package/.opencode/command/fix/ci.md +0 -8
- package/.opencode/command/fix/fast.md +0 -5
- package/.opencode/command/fix/hard.md +0 -7
- package/.opencode/command/fix/test.md +0 -16
- package/.opencode/command/git/cm.md +0 -5
- package/.opencode/command/git/cp.md +0 -4
- package/.opencode/command/plan/ci.md +0 -12
- package/.opencode/command/plan/two.md +0 -13
- package/.opencode/command/plan.md +0 -10
- package/.opencode/command/test.md +0 -7
- package/.opencode/command/watzup.md +0 -8
- package/.releaserc.json +0 -26
- package/.serena/project.yml +0 -68
- package/CHANGELOG.md +0 -62
- package/CLAUDE.md +0 -141
- package/DEPLOYMENT.md +0 -329
- package/Dockerfile +0 -52
- package/QUICKSTART.md +0 -97
- package/bun.lock +0 -1872
- package/bunfig.toml +0 -15
- package/docker-compose.yaml +0 -128
- package/docs/README.md +0 -51
- package/docs/codebase-structure-architecture-code-standards.md +0 -428
- package/docs/codebase-summary.md +0 -321
- package/docs/project-overview-pdr.md +0 -286
- package/docs/project-roadmap.md +0 -494
- package/examples/debugging-session.ts +0 -96
- package/human-mcp.png +0 -0
- package/inspector-wrapper.mjs +0 -33
- package/plans/001-streamable-http-transport-plan.md +0 -905
- package/plans/002-sse-fallback-http-transport-plan.md +0 -161
- package/plans/003-fix-test-infrastructure-and-ci-plan.md +0 -699
- package/plans/003-http-transport-local-file-access-plan.md +0 -880
- package/plans/004-fix-typescript-compilation-errors-plan.md +0 -388
- package/plans/005-comprehensive-test-infrastructure-fix-plan.md +0 -854
- package/plans/templates/bug-fix-template.md +0 -69
- package/plans/templates/feature-implementation-template.md +0 -84
- package/plans/templates/refactor-template.md +0 -82
- package/plans/templates/template-usage-guide.md +0 -58
- package/src/index.ts +0 -49
- package/src/prompts/debugging-prompts.ts +0 -149
- package/src/prompts/index.ts +0 -55
- package/src/resources/documentation.ts +0 -316
- package/src/resources/index.ts +0 -49
- package/src/server.ts +0 -36
- package/src/tools/eyes/index.ts +0 -225
- package/src/tools/eyes/processors/gif.ts +0 -137
- package/src/tools/eyes/processors/image.ts +0 -213
- package/src/tools/eyes/processors/video.ts +0 -135
- package/src/tools/eyes/schemas.ts +0 -51
- package/src/tools/eyes/utils/formatters.ts +0 -126
- package/src/tools/eyes/utils/gemini-client.ts +0 -73
- package/src/transports/http/file-interceptor.ts +0 -134
- package/src/transports/http/middleware.ts +0 -46
- package/src/transports/http/routes.ts +0 -297
- package/src/transports/http/server.ts +0 -116
- package/src/transports/http/session.ts +0 -93
- package/src/transports/http/sse-routes.ts +0 -210
- package/src/transports/index.ts +0 -36
- package/src/transports/stdio.ts +0 -7
- package/src/transports/types.ts +0 -50
- package/src/types/index.ts +0 -41
- package/src/utils/cloudflare-r2.ts +0 -107
- package/src/utils/config.ts +0 -123
- package/src/utils/errors.ts +0 -40
- package/src/utils/logger.ts +0 -49
- package/tests/integration/http-transport-files.test.ts +0 -190
- package/tests/integration/server.test.ts +0 -27
- package/tests/integration/sse-transport.test.ts +0 -142
- package/tests/setup.ts +0 -55
- package/tests/types/api-responses.ts +0 -35
- package/tests/types/test-types.ts +0 -105
- package/tests/unit/cloudflare-r2.test.ts +0 -118
- package/tests/unit/config.test.ts +0 -40
- package/tests/unit/eyes-analyze.test.ts +0 -150
- package/tests/unit/formatters.test.ts +0 -85
- package/tests/unit/sse-routes.test.ts +0 -92
- package/tests/utils/error-scenarios.ts +0 -198
- package/tests/utils/index.ts +0 -3
- package/tests/utils/mock-helpers.ts +0 -99
- package/tests/utils/test-data-generators.ts +0 -217
- package/tests/utils/test-server-manager.ts +0 -172
- package/tsconfig.json +0 -26
package/docs/project-roadmap.md
DELETED
|
@@ -1,494 +0,0 @@
|
|
|
1
|
-
# Human MCP - Project Roadmap
|
|
2
|
-
|
|
3
|
-
## Project Vision
|
|
4
|
-
|
|
5
|
-
**Human MCP: Bringing Human Capabilities to Coding Agents**
|
|
6
|
-
|
|
7
|
-
Transform AI coding agents with human-like sensory capabilities by providing sophisticated multimodal analysis tools through the Model Context Protocol. Our mission is to bridge the gap between AI agents and human perception, enabling comprehensive debugging, analysis, and content understanding workflows.
|
|
8
|
-
|
|
9
|
-
## Executive Summary
|
|
10
|
-
|
|
11
|
-
Human MCP is a Model Context Protocol server that empowers AI coding agents with advanced multimodal capabilities. Currently focused on visual analysis (Eyes), the project roadmap extends to encompass complete human-like sensory capabilities including document understanding, audio processing, speech generation, and content creation.
|
|
12
|
-
|
|
13
|
-
**Current Status**: Version 1.2.1 - Visual Analysis Foundation Complete
|
|
14
|
-
**Next Milestone**: Document Understanding (Eyes Extension)
|
|
15
|
-
**Target Completion**: Q4 2025 for full human capabilities suite
|
|
16
|
-
|
|
17
|
-
## Current Capabilities (Phase 1 - COMPLETE)
|
|
18
|
-
|
|
19
|
-
### Eyes: Visual Analysis - 100% Complete ✅
|
|
20
|
-
|
|
21
|
-
**Status**: Production Ready (v1.2.1)
|
|
22
|
-
**Completion Date**: September 08, 2025
|
|
23
|
-
|
|
24
|
-
#### Current Features
|
|
25
|
-
- **Image Analysis**: PNG, JPEG, WebP, GIF static image processing
|
|
26
|
-
- **Video Analysis**: MP4, WebM, MOV, AVI video processing with frame extraction
|
|
27
|
-
- **GIF Analysis**: Animated GIF frame-by-frame analysis
|
|
28
|
-
- **Image Comparison**: Pixel, structural, and semantic comparison capabilities
|
|
29
|
-
- **Analysis Types**: UI debugging, error detection, accessibility, performance, layout analysis
|
|
30
|
-
- **Detail Levels**: Quick (< 10s) and detailed (< 30s) analysis modes
|
|
31
|
-
- **Input Sources**: File paths, URLs, and base64 data URIs
|
|
32
|
-
|
|
33
|
-
#### Technical Implementation
|
|
34
|
-
```typescript
|
|
35
|
-
// Current Tools Available
|
|
36
|
-
- eyes_analyze: Primary visual analysis tool
|
|
37
|
-
- eyes_compare: Image comparison and difference detection
|
|
38
|
-
|
|
39
|
-
// Architecture Components
|
|
40
|
-
- Gemini API integration with configurable models
|
|
41
|
-
- ffmpeg-based video processing
|
|
42
|
-
- Sharp library for GIF frame extraction
|
|
43
|
-
- Comprehensive error handling and logging
|
|
44
|
-
- MCP protocol compliant server implementation
|
|
45
|
-
```
|
|
46
|
-
|
|
47
|
-
#### Performance Metrics (Current)
|
|
48
|
-
- **Image Processing**: < 10s (quick) / < 30s (detailed)
|
|
49
|
-
- **Video Processing**: < 2 minutes for 30-second clips
|
|
50
|
-
- **Success Rate**: 98.5% for supported formats
|
|
51
|
-
- **Memory Usage**: < 100MB for typical operations
|
|
52
|
-
- **API Response Time**: 95th percentile < 30 seconds
|
|
53
|
-
|
|
54
|
-
## Development Phases & Roadmap
|
|
55
|
-
|
|
56
|
-
### Phase 2: Document Understanding (Q4 2025)
|
|
57
|
-
**Priority**: High | **Status**: Planning | **Progress**: 0%
|
|
58
|
-
|
|
59
|
-
#### Objectives
|
|
60
|
-
Extend Eyes capability to read and understand documentation formats including PDFs, Word documents, Excel files, and other structured documents using Gemini's Document Understanding API.
|
|
61
|
-
|
|
62
|
-
#### Technical Implementation Plan
|
|
63
|
-
```typescript
|
|
64
|
-
// New Tools to Implement
|
|
65
|
-
- eyes_read_document: Document analysis and extraction
|
|
66
|
-
- eyes_extract_data: Structured data extraction from documents
|
|
67
|
-
- eyes_summarize: Document summarization and key insights
|
|
68
|
-
|
|
69
|
-
// Required Dependencies
|
|
70
|
-
- pdf-parse: PDF text extraction
|
|
71
|
-
- mammoth: Word document processing
|
|
72
|
-
- xlsx: Excel spreadsheet handling
|
|
73
|
-
- @google/generative-ai: Document Understanding API
|
|
74
|
-
|
|
75
|
-
// Architecture Extensions
|
|
76
|
-
src/tools/eyes/processors/
|
|
77
|
-
├── document.ts # PDF, DOCX document processing
|
|
78
|
-
├── spreadsheet.ts # Excel, CSV data processing
|
|
79
|
-
├── presentation.ts # PowerPoint slide analysis
|
|
80
|
-
└── text.ts # Plain text and markdown processing
|
|
81
|
-
```
|
|
82
|
-
|
|
83
|
-
#### Deliverables
|
|
84
|
-
- [ ] PDF document analysis with text extraction and understanding
|
|
85
|
-
- [ ] Word document processing with formatting preservation
|
|
86
|
-
- [ ] Excel spreadsheet data analysis and insights
|
|
87
|
-
- [ ] PowerPoint presentation content analysis
|
|
88
|
-
- [ ] Multi-format document comparison capabilities
|
|
89
|
-
- [ ] Comprehensive documentation and examples
|
|
90
|
-
|
|
91
|
-
#### Success Metrics
|
|
92
|
-
- Support for PDF, DOCX, XLSX, PPTX, TXT, MD formats
|
|
93
|
-
- Text extraction accuracy > 95%
|
|
94
|
-
- Processing time < 60 seconds for typical documents
|
|
95
|
-
- Structured data extraction with schema validation
|
|
96
|
-
- Cross-document comparison and analysis capabilities
|
|
97
|
-
|
|
98
|
-
#### Timeline: January 2025 - March 2025
|
|
99
|
-
- **Week 1-2**: Document processing architecture design
|
|
100
|
-
- **Week 3-6**: PDF and Word document processor implementation
|
|
101
|
-
- **Week 7-10**: Excel and PowerPoint processor development
|
|
102
|
-
- **Week 11-12**: Testing, optimization, and documentation
|
|
103
|
-
|
|
104
|
-
### Phase 3: Audio Processing - Ears (Q4 2025)
|
|
105
|
-
**Priority**: High | **Status**: Not Started | **Progress**: 0%
|
|
106
|
-
|
|
107
|
-
#### Objectives
|
|
108
|
-
Implement comprehensive audio analysis capabilities using Gemini's Audio Understanding API, enabling speech-to-text, audio content analysis, and debugging of audio-related issues.
|
|
109
|
-
|
|
110
|
-
#### Technical Implementation Plan
|
|
111
|
-
```typescript
|
|
112
|
-
// New Tools to Implement
|
|
113
|
-
- ears_transcribe: Speech-to-text conversion
|
|
114
|
-
- ears_analyze: Audio content analysis and insights
|
|
115
|
-
- ears_compare: Audio comparison and difference detection
|
|
116
|
-
- ears_extract: Audio feature extraction and metadata
|
|
117
|
-
|
|
118
|
-
// Required Dependencies
|
|
119
|
-
- fluent-ffmpeg: Audio format conversion and processing
|
|
120
|
-
- audio-context: Web Audio API compatibility
|
|
121
|
-
- wav-file-info: Audio file metadata extraction
|
|
122
|
-
|
|
123
|
-
// Architecture Design
|
|
124
|
-
src/tools/ears/
|
|
125
|
-
├── index.ts # Tool registration and orchestration
|
|
126
|
-
├── schemas.ts # Audio input validation schemas
|
|
127
|
-
├── processors/
|
|
128
|
-
│   ├── speech.ts # Speech-to-text processing
|
|
129
|
-
│   ├── music.ts # Music analysis and classification
|
|
130
|
-
│   ├── effects.ts # Audio effects and quality analysis
|
|
131
|
-
│   └── comparison.ts # Audio comparison utilities
|
|
132
|
-
└── utils/
|
|
133
|
-
    ├── audio-client.ts # Gemini Audio API client
|
|
134
|
-
    ├── converters.ts # Audio format conversion
|
|
135
|
-
    └── analyzers.ts # Audio analysis utilities
|
|
136
|
-
```
|
|
137
|
-
|
|
138
|
-
#### Deliverables
|
|
139
|
-
- [ ] Speech-to-text transcription with speaker identification
|
|
140
|
-
- [ ] Audio content analysis (music, speech, noise classification)
|
|
141
|
-
- [ ] Audio quality assessment and debugging capabilities
|
|
142
|
-
- [ ] Audio comparison for A/B testing and regression detection
|
|
143
|
-
- [ ] Multi-format audio support (WAV, MP3, AAC, OGG, FLAC)
|
|
144
|
-
- [ ] Real-time audio processing capabilities (future)
|
|
145
|
-
|
|
146
|
-
#### Success Metrics
|
|
147
|
-
- Transcription accuracy > 95% for clear speech
|
|
148
|
-
- Support for 20+ audio formats
|
|
149
|
-
- Processing time < file duration + 30 seconds
|
|
150
|
-
- Speaker identification accuracy > 90%
|
|
151
|
-
- Audio quality assessment with detailed metrics
|
|
152
|
-
|
|
153
|
-
#### Timeline: April 2025 - June 2025
|
|
154
|
-
- **Month 1**: Core audio processing infrastructure
|
|
155
|
-
- **Month 2**: Speech-to-text and content analysis implementation
|
|
156
|
-
- **Month 3**: Testing, optimization, and advanced features
|
|
157
|
-
|
|
158
|
-
### Phase 4: Speech Generation - Mouth (Q4 2025)
|
|
159
|
-
**Priority**: Medium | **Status**: Not Started | **Progress**: 0%
|
|
160
|
-
|
|
161
|
-
#### Objectives
|
|
162
|
-
Implement text-to-speech capabilities using Gemini's Speech Generation API, enabling AI agents to provide audio feedback, generate spoken explanations, and create audio content.
|
|
163
|
-
|
|
164
|
-
#### Technical Implementation Plan
|
|
165
|
-
```typescript
|
|
166
|
-
// New Tools to Implement
|
|
167
|
-
- mouth_speak: Text-to-speech generation
|
|
168
|
-
- mouth_narrate: Long-form content narration
|
|
169
|
-
- mouth_explain: Code explanation with speech
|
|
170
|
-
- mouth_customize: Voice customization and tuning
|
|
171
|
-
|
|
172
|
-
// Architecture Design
|
|
173
|
-
src/tools/mouth/
|
|
174
|
-
├── index.ts # Tool registration
|
|
175
|
-
├── schemas.ts # Speech generation schemas
|
|
176
|
-
├── processors/
|
|
177
|
-
│   ├── synthesis.ts # Core text-to-speech
|
|
178
|
-
│   ├── narration.ts # Long-form content
|
|
179
|
-
│   ├── explanation.ts # Technical content speech
|
|
180
|
-
│   └── effects.ts # Voice effects and modulation
|
|
181
|
-
└── utils/
|
|
182
|
-
    ├── speech-client.ts # Gemini Speech API client
|
|
183
|
-
    ├── voice-profiles.ts # Voice customization
|
|
184
|
-
    └── audio-export.ts # Audio file generation
|
|
185
|
-
```
|
|
186
|
-
|
|
187
|
-
#### Deliverables
|
|
188
|
-
- [ ] High-quality text-to-speech with multiple voice options
|
|
189
|
-
- [ ] Code explanation and technical content narration
|
|
190
|
-
- [ ] Customizable voice parameters (speed, pitch, tone)
|
|
191
|
-
- [ ] Long-form content narration with chapter breaks
|
|
192
|
-
- [ ] Multi-language speech generation support
|
|
193
|
-
- [ ] Audio export in multiple formats (MP3, WAV, OGG)
|
|
194
|
-
|
|
195
|
-
#### Success Metrics
|
|
196
|
-
- Natural-sounding speech with < 2% word error rate
|
|
197
|
-
- Response time < 10 seconds for typical text inputs
|
|
198
|
-
- Support for 10+ languages
|
|
199
|
-
- Voice customization with 5+ parameters
|
|
200
|
-
- Audio quality suitable for professional use
|
|
201
|
-
|
|
202
|
-
#### Timeline: September 2025 - October 2025
|
|
203
|
-
- **Month 1**: Speech synthesis core implementation
|
|
204
|
-
- **Month 2**: Voice customization and multi-language support
|
|
205
|
-
- **Month 3**: Advanced features and integration testing
|
|
206
|
-
|
|
207
|
-
### Phase 5: Content Generation - Hands (Q4 2025)
|
|
208
|
-
**Priority**: Medium | **Status**: Not Started | **Progress**: 0%
|
|
209
|
-
|
|
210
|
-
#### Objectives
|
|
211
|
-
Implement visual and video content generation capabilities using Google's Imagen (Nano Banana) and Veo3 APIs, enabling AI agents to create images, edit visuals, and generate videos.
|
|
212
|
-
|
|
213
|
-
#### Technical Implementation Plan
|
|
214
|
-
```typescript
|
|
215
|
-
// New Tools to Implement
|
|
216
|
-
- hands_draw: Image generation from text prompts
|
|
217
|
-
- hands_edit: Image editing and modification
|
|
218
|
-
- hands_create_video: Video generation from text/images
|
|
219
|
-
- hands_animate: Animation creation and motion graphics
|
|
220
|
-
|
|
221
|
-
// Architecture Design
|
|
222
|
-
src/tools/hands/
|
|
223
|
-
├── index.ts # Tool registration
|
|
224
|
-
├── schemas.ts # Content generation schemas
|
|
225
|
-
├── processors/
|
|
226
|
-
│   ├── image-gen.ts # Imagen API integration
|
|
227
|
-
│   ├── image-edit.ts # Image editing capabilities
|
|
228
|
-
│   ├── video-gen.ts # Veo3 video generation
|
|
229
|
-
│   └── animation.ts # Animation and motion graphics
|
|
230
|
-
└── utils/
|
|
231
|
-
    ├── imagen-client.ts # Google Imagen client
|
|
232
|
-
    ├── veo-client.ts # Google Veo3 client
|
|
233
|
-
    └── content-utils.ts # Content processing utilities
|
|
234
|
-
```
|
|
235
|
-
|
|
236
|
-
#### Deliverables
|
|
237
|
-
- [ ] High-quality image generation from text descriptions
|
|
238
|
-
- [ ] Image editing capabilities (inpainting, style transfer, enhancement)
|
|
239
|
-
- [ ] Video generation from text prompts and image sequences
|
|
240
|
-
- [ ] Animation creation with motion graphics
|
|
241
|
-
- [ ] Batch content generation for workflow automation
|
|
242
|
-
- [ ] Content customization with style and parameter controls
|
|
243
|
-
|
|
244
|
-
#### Success Metrics
|
|
245
|
-
- Image generation quality score > 8/10 (human evaluation)
|
|
246
|
-
- Video generation up to 30 seconds duration
|
|
247
|
-
- Processing time < 5 minutes for typical requests
|
|
248
|
-
- Support for multiple artistic styles and formats
|
|
249
|
-
- Batch processing capabilities for efficiency
|
|
250
|
-
|
|
251
|
-
#### Timeline: October 2025 - December 2025
|
|
252
|
-
- **Month 1**: Image generation and editing implementation
|
|
253
|
-
- **Month 2**: Video generation with Veo3 integration
|
|
254
|
-
- **Month 3**: Advanced features, optimization, and testing
|
|
255
|
-
|
|
256
|
-
## Technical Architecture Evolution
|
|
257
|
-
|
|
258
|
-
### Current Architecture (v1.2.1)
|
|
259
|
-
```
|
|
260
|
-
āāāāāāāāāāāāāāāāāāā āāāāāāāāāāāāāāāāāāāā āāāāāāāāāāāāāāāāāāā
|
|
261
|
-
ā MCP Client āāāāāŗā Human MCP āāāāāŗā Google Gemini ā
|
|
262
|
-
ā (AI Agent) ā ā Server ā ā Vision API ā
|
|
263
|
-
āāāāāāāāāāāāāāāāāāā āāāāāāāāāāāāāāāāāāāā āāāāāāāāāāāāāāāāāāā
|
|
264
|
-
ā
|
|
265
|
-
ā¼
|
|
266
|
-
āāāāāāāāāāāāāāāāāāāā
|
|
267
|
-
ā Eyes Processors ā
|
|
268
|
-
ā(Image/Video/GIF) ā
|
|
269
|
-
āāāāāāāāāāāāāāāāāāāā
|
|
270
|
-
```
|
|
271
|
-
|
|
272
|
-
### Target Architecture (v2.0.0 - End 2025)
|
|
273
|
-
```
|
|
274
|
-
āāāāāāāāāāāāāāāāāāā āāāāāāāāāāāāāāāāāāāāāāāā āāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
275
|
-
ā MCP Client āāāāāŗā Human MCP āāāāāŗā Google AI Services ā
|
|
276
|
-
ā (AI Agent) ā ā Server ā ā āāāāāāāāāāāāāāāāāāāāāāā ā
|
|
277
|
-
āāāāāāāāāāāāāāāāāāā ā ā ā ā Gemini Vision API ā ā
|
|
278
|
-
ā āāāāāāāāāāāāāāāāāāā ā ā ā Gemini Audio API ā ā
|
|
279
|
-
ā ā Eyes (Vision) ā ā ā ā Gemini Speech API ā ā
|
|
280
|
-
ā ā ⢠Images/Video ā ā ā ā Imagen API ā ā
|
|
281
|
-
ā ā ⢠Documents ā ā ā ā Veo3 Video API ā ā
|
|
282
|
-
ā āāāāāāāāāāāāāāāāāāā ā ā āāāāāāāāāāāāāāāāāāāāāāā ā
|
|
283
|
-
ā āāāāāāāāāāāāāāāāāāā ā āāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
284
|
-
ā ā Ears (Audio) ā ā ā
|
|
285
|
-
ā ā ⢠Speech-to-Textā ā ā
|
|
286
|
-
ā ā ⢠Audio Analysisā ā ā¼
|
|
287
|
-
ā āāāāāāāāāāāāāāāāāāā ā āāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
288
|
-
ā āāāāāāāāāāāāāāāāāāā ā ā System Dependencies ā
|
|
289
|
-
ā ā Mouth (Speech) ā ā ā āāāāāāāāāāāāāāāāāāāāāāā ā
|
|
290
|
-
ā ā ⢠Text-to-Speechā ā ā ā ffmpeg (A/V proc) ā ā
|
|
291
|
-
ā ā ⢠Narration ā ā ā ā Sharp (Images) ā ā
|
|
292
|
-
ā āāāāāāāāāāāāāāāāāāā ā ā ā pdf-parse (Docs) ā ā
|
|
293
|
-
ā āāāāāāāāāāāāāāāāāāā ā ā ā Audio libraries ā ā
|
|
294
|
-
ā ā Hands (Creation)ā ā ā āāāāāāāāāāāāāāāāāāāāāāā ā
|
|
295
|
-
ā ā ⢠Image Gen ā ā āāāāāāāāāāāāāāāāāāāāāāāāāāā
|
|
296
|
-
ā ā ⢠Video Gen ā ā
|
|
297
|
-
ā āāāāāāāāāāāāāāāāāāā ā
|
|
298
|
-
āāāāāāāāāāāāāāāāāāāāāāāā
|
|
299
|
-
```
|
|
300
|
-
|
|
301
|
-
## Resource Requirements & Dependencies
|
|
302
|
-
|
|
303
|
-
### Development Resources
|
|
304
|
-
- **Timeline**: 3 months (September 2025 - December 2025)
|
|
305
|
-
|
|
306
|
-
### Technical Dependencies
|
|
307
|
-
```json
|
|
308
|
-
{
|
|
309
|
-
"current": [
|
|
310
|
-
"@google/generative-ai": "Gemini Vision API",
|
|
311
|
-
"ffmpeg": "Video processing",
|
|
312
|
-
"sharp": "Image processing",
|
|
313
|
-
"@modelcontextprotocol/sdk": "MCP protocol"
|
|
314
|
-
],
|
|
315
|
-
"phase2": [
|
|
316
|
-
"pdf-parse": "PDF document processing",
|
|
317
|
-
"mammoth": "Word document handling",
|
|
318
|
-
"xlsx": "Excel spreadsheet processing"
|
|
319
|
-
],
|
|
320
|
-
"phase3": [
|
|
321
|
-
"fluent-ffmpeg": "Enhanced audio processing",
|
|
322
|
-
"audio-context": "Web Audio API",
|
|
323
|
-
"wav-file-info": "Audio metadata"
|
|
324
|
-
],
|
|
325
|
-
"phase4": [
|
|
326
|
-
"@google/speech-api": "Text-to-speech synthesis",
|
|
327
|
-
"voice-processing": "Audio effects"
|
|
328
|
-
],
|
|
329
|
-
"phase5": [
|
|
330
|
-
"@google/imagen-api": "Image generation",
|
|
331
|
-
"@google/veo3-api": "Video generation"
|
|
332
|
-
]
|
|
333
|
-
}
|
|
334
|
-
```
|
|
335
|
-
|
|
336
|
-
### Infrastructure Requirements
|
|
337
|
-
- **API Access**: Google AI services (Gemini, Imagen, Veo3)
|
|
338
|
-
- **Computing**: Development machines with sufficient RAM (16GB+)
|
|
339
|
-
- **Storage**: Temporary file processing space (10GB+)
|
|
340
|
-
- **Network**: High-bandwidth internet for API calls
|
|
341
|
-
|
|
342
|
-
## Success Metrics & KPIs
|
|
343
|
-
|
|
344
|
-
### Technical Metrics
|
|
345
|
-
| Metric | Current (Phase 1) | Target (Phase 5) |
|
|
346
|
-
|--------|------------------|------------------|
|
|
347
|
-
| Processing Speed | < 30s (images) | < 60s (any content) |
|
|
348
|
-
| Success Rate | 98.5% | 99%+ |
|
|
349
|
-
| Format Support | 8 formats | 50+ formats |
|
|
350
|
-
| Memory Usage | < 100MB | < 200MB |
|
|
351
|
-
| API Response Time | 95th %ile < 30s | 95th %ile < 45s |
|
|
352
|
-
|
|
353
|
-
### Business Metrics
|
|
354
|
-
- **Adoption Rate**: Target 1000+ MCP client integrations by end of 2025
|
|
355
|
-
- **API Usage**: Target 100K+ API calls per month
|
|
356
|
-
- **Community Growth**: Target 500+ GitHub stars, 50+ contributors
|
|
357
|
-
- **Documentation Quality**: 100% API coverage, comprehensive examples
|
|
358
|
-
|
|
359
|
-
### Quality Metrics
|
|
360
|
-
- **Test Coverage**: Maintain > 85% code coverage
|
|
361
|
-
- **Bug Rate**: < 5 bugs per 1000 lines of code
|
|
362
|
-
- **Performance**: No regression in processing times
|
|
363
|
-
- **User Satisfaction**: > 4.5/5 star rating in feedback
|
|
364
|
-
|
|
365
|
-
## Risk Assessment & Mitigation
|
|
366
|
-
|
|
367
|
-
### High-Risk Items
|
|
368
|
-
|
|
369
|
-
#### 1. Google API Dependency Risk
|
|
370
|
-
**Risk**: Changes to Google AI APIs or pricing models
|
|
371
|
-
**Impact**: High - Could break functionality or increase costs significantly
|
|
372
|
-
**Mitigation**:
|
|
373
|
-
- Implement adapter pattern for easy API switching
|
|
374
|
-
- Monitor Google AI roadmaps and announcements
|
|
375
|
-
- Develop fallback strategies with alternative providers
|
|
376
|
-
- Maintain API version compatibility layers
|
|
377
|
-
|
|
378
|
-
#### 2. Performance Scalability Risk
|
|
379
|
-
**Risk**: Processing large files or high request volumes
|
|
380
|
-
**Impact**: Medium - Could impact user experience
|
|
381
|
-
**Mitigation**:
|
|
382
|
-
- Implement streaming for large files
|
|
383
|
-
- Add request queuing and rate limiting
|
|
384
|
-
- Optimize memory usage and cleanup
|
|
385
|
-
- Provide performance monitoring and alerting
|
|
386
|
-
|
|
387
|
-
#### 3. Format Compatibility Risk
|
|
388
|
-
**Risk**: Unsupported media formats or edge cases
|
|
389
|
-
**Impact**: Medium - Limited functionality for some users
|
|
390
|
-
**Mitigation**:
|
|
391
|
-
- Comprehensive format testing matrix
|
|
392
|
-
- Graceful error handling for unsupported formats
|
|
393
|
-
- Clear documentation of supported formats
|
|
394
|
-
- Community feedback loop for new format requests
|
|
395
|
-
|
|
396
|
-
### Medium-Risk Items
|
|
397
|
-
|
|
398
|
-
#### 4. Development Timeline Risk
|
|
399
|
-
**Risk**: Features taking longer than estimated
|
|
400
|
-
**Impact**: Medium - Delayed roadmap execution
|
|
401
|
-
**Mitigation**:
|
|
402
|
-
- Agile development with monthly milestones
|
|
403
|
-
- Regular progress reviews and timeline adjustments
|
|
404
|
-
- Parallel development tracks where possible
|
|
405
|
-
- MVP approach for each phase
|
|
406
|
-
|
|
407
|
-
#### 5. API Cost Management Risk
|
|
408
|
-
**Risk**: Unexpected increase in API usage costs
|
|
409
|
-
**Impact**: Medium - Budget overrun
|
|
410
|
-
**Mitigation**:
|
|
411
|
-
- Implement usage monitoring and alerting
|
|
412
|
-
- Provide cost estimation tools for users
|
|
413
|
-
- Offer different processing tiers (quick vs. detailed)
|
|
414
|
-
- Cache results where appropriate
|
|
415
|
-
|
|
416
|
-
### Low-Risk Items
|
|
417
|
-
|
|
418
|
-
#### 6. Community Adoption Risk
|
|
419
|
-
**Risk**: Low adoption of new features
|
|
420
|
-
**Impact**: Low - Feature may not justify development cost
|
|
421
|
-
**Mitigation**:
|
|
422
|
-
- User research and feedback collection
|
|
423
|
-
- Beta testing with key integrators
|
|
424
|
-
- Comprehensive documentation and examples
|
|
425
|
-
- Active community engagement
|
|
426
|
-
|
|
427
|
-
## Development Methodology
|
|
428
|
-
|
|
429
|
-
### Agile Approach
|
|
430
|
-
- **Sprint Duration**: 2-week sprints
|
|
431
|
-
- **Planning**: Monthly planning sessions for each phase
|
|
432
|
-
- **Reviews**: Weekly progress reviews with stakeholders
|
|
433
|
-
- **Retrospectives**: End-of-phase retrospectives for improvement
|
|
434
|
-
|
|
435
|
-
### Quality Assurance
|
|
436
|
-
- **Testing Strategy**: Unit tests, integration tests, manual testing
|
|
437
|
-
- **Code Review**: All code reviewed by team lead
|
|
438
|
-
- **Performance Testing**: Automated performance regression testing
|
|
439
|
-
- **Security Review**: Security audit for each major release
|
|
440
|
-
|
|
441
|
-
### Release Strategy
|
|
442
|
-
- **Versioning**: Semantic versioning (MAJOR.MINOR.PATCH)
|
|
443
|
-
- **Release Schedule**: Monthly minor releases, quarterly major releases
|
|
444
|
-
- **Beta Testing**: 2-week beta period for major features
|
|
445
|
-
- **Rollback Plan**: Ability to rollback releases if issues discovered
|
|
446
|
-
|
|
447
|
-
## Integration Strategy
|
|
448
|
-
|
|
449
|
-
### MCP Ecosystem Integration
|
|
450
|
-
- **Client Compatibility**: Ensure compatibility with major MCP clients
|
|
451
|
-
- **Protocol Updates**: Stay current with MCP protocol evolution
|
|
452
|
-
- **Community Tools**: Integration with popular development tools
|
|
453
|
-
- **Documentation**: Comprehensive integration guides
|
|
454
|
-
|
|
455
|
-
### External Service Integration
|
|
456
|
-
- **Google AI Services**: Primary integration with Google's AI ecosystem
|
|
457
|
-
- **Alternative Providers**: Future integration with OpenAI, Anthropic, etc.
|
|
458
|
-
- **Local Models**: Support for local AI model deployment
|
|
459
|
-
- **Caching Layer**: Intelligent caching to reduce API calls
|
|
460
|
-
|
|
461
|
-
## Future Vision (Beyond 2025)
|
|
462
|
-
|
|
463
|
-
### Advanced Capabilities
|
|
464
|
-
- **Real-time Processing**: Live screen capture and analysis
|
|
465
|
-
- **Interactive Debugging**: Conversational debugging workflows
|
|
466
|
-
- **Multi-modal Fusion**: Combined analysis across all sensory modalities
|
|
467
|
-
- **Custom Model Training**: Domain-specific model fine-tuning
|
|
468
|
-
|
|
469
|
-
### Enterprise Features
|
|
470
|
-
- **On-premises Deployment**: Air-gapped enterprise installations
|
|
471
|
-
- **SSO Integration**: Enterprise authentication and authorization
|
|
472
|
-
- **Audit Logging**: Comprehensive audit trails for compliance
|
|
473
|
-
- **Scalability**: Horizontal scaling for high-volume usage
|
|
474
|
-
|
|
475
|
-
### Research & Development
|
|
476
|
-
- **New AI Models**: Integration with cutting-edge AI research
|
|
477
|
-
- **Performance Optimization**: Advanced caching and preprocessing
|
|
478
|
-
- **Privacy Enhancement**: Local processing capabilities
|
|
479
|
-
- **Accessibility**: Enhanced accessibility features and compliance
|
|
480
|
-
|
|
481
|
-
## Conclusion
|
|
482
|
-
|
|
483
|
-
The Human MCP project represents a significant advancement in AI-agent capabilities, providing comprehensive human-like sensory analysis through the Model Context Protocol. With the visual analysis foundation complete, the roadmap focuses on expanding to document understanding, audio processing, speech generation, and content creation.
|
|
484
|
-
|
|
485
|
-
The phased approach ensures steady progress while maintaining high quality and reliability. Success depends on careful API integration, performance optimization, and active community engagement. By the end of 2025, Human MCP will provide AI agents with a complete suite of human-like capabilities, fundamentally changing how AI systems interact with and understand multimodal content.
|
|
486
|
-
|
|
487
|
-
**Key Success Factors**:
|
|
488
|
-
- Maintaining high performance and reliability standards
|
|
489
|
-
- Building strong community adoption and feedback loops
|
|
490
|
-
- Staying ahead of Google AI API evolution
|
|
491
|
-
- Delivering practical value to AI agent developers
|
|
492
|
-
- Comprehensive documentation and developer experience
|
|
493
|
-
|
|
494
|
-
The project positions Human MCP as the definitive multimodal analysis solution for AI agents, enabling sophisticated debugging, content analysis, and creation workflows that bridge the gap between artificial and human intelligence.
|
|
@@ -1,96 +0,0 @@
|
|
|
1
|
-
/**
|
|
2
|
-
* Example: Complete debugging session with Human MCP
|
|
3
|
-
*
|
|
4
|
-
* This demonstrates a typical workflow for debugging UI issues
|
|
5
|
-
* using the Human MCP server's visual analysis capabilities.
|
|
6
|
-
*/
|
|
7
|
-
|
|
8
|
-
import { createServer } from "../src/server.js";
|
|
9
|
-
|
|
10
|
-
async function debuggingSession() {
|
|
11
|
-
console.log("š Starting Human MCP debugging session...\n");
|
|
12
|
-
|
|
13
|
-
const server = await createServer();
|
|
14
|
-
|
|
15
|
-
// Example 1: Analyze a UI screenshot for layout issues
|
|
16
|
-
console.log("1ļøā£ Analyzing UI screenshot for layout issues...");
|
|
17
|
-
|
|
18
|
-
const uiAnalysis = await server.callTool("eyes.analyze", {
|
|
19
|
-
source: "/path/to/broken-ui.png",
|
|
20
|
-
type: "image",
|
|
21
|
-
analysis_type: "ui_debug",
|
|
22
|
-
detail_level: "detailed",
|
|
23
|
-
specific_focus: "navigation menu alignment and button states"
|
|
24
|
-
});
|
|
25
|
-
|
|
26
|
-
console.log("š UI Analysis Results:");
|
|
27
|
-
console.log(uiAnalysis.content[0].text);
|
|
28
|
-
console.log("\n" + "=".repeat(50) + "\n");
|
|
29
|
-
|
|
30
|
-
// Example 2: Investigate error in screen recording
|
|
31
|
-
console.log("2️⃣ Investigating error sequence in recording...");
|
|
32
|
-
|
|
33
|
-
const errorAnalysis = await server.callTool("eyes.analyze", {
|
|
34
|
-
source: "/path/to/error-recording.mp4",
|
|
35
|
-
type: "video",
|
|
36
|
-
analysis_type: "error_detection",
|
|
37
|
-
detail_level: "detailed",
|
|
38
|
-
specific_focus: "form submission failure and user feedback"
|
|
39
|
-
});
|
|
40
|
-
|
|
41
|
-
console.log("🚨 Error Analysis Results:");
|
|
42
|
-
console.log(errorAnalysis.content[0].text);
|
|
43
|
-
console.log("\n" + "=".repeat(50) + "\n");
|
|
44
|
-
|
|
45
|
-
// Example 3: Compare before/after layouts
|
|
46
|
-
console.log("3️⃣ Comparing layouts before and after changes...");
|
|
47
|
-
|
|
48
|
-
const comparison = await server.callTool("eyes.compare", {
|
|
49
|
-
source1: "/path/to/before-fix.png",
|
|
50
|
-
source2: "/path/to/after-fix.png",
|
|
51
|
-
comparison_type: "structural"
|
|
52
|
-
});
|
|
53
|
-
|
|
54
|
-
console.log("š Layout Comparison Results:");
|
|
55
|
-
console.log(comparison.content[0].text);
|
|
56
|
-
console.log("\n" + "=".repeat(50) + "\n");
|
|
57
|
-
|
|
58
|
-
// Example 4: Accessibility audit
|
|
59
|
-
console.log("4️⃣ Performing accessibility audit...");
|
|
60
|
-
|
|
61
|
-
const a11yAnalysis = await server.callTool("eyes.analyze", {
|
|
62
|
-
source: "/path/to/page-screenshot.png",
|
|
63
|
-
type: "image",
|
|
64
|
-
analysis_type: "accessibility",
|
|
65
|
-
detail_level: "detailed",
|
|
66
|
-
check_accessibility: true,
|
|
67
|
-
specific_focus: "color contrast and focus indicators"
|
|
68
|
-
});
|
|
69
|
-
|
|
70
|
-
console.log("♿ Accessibility Analysis Results:");
|
|
71
|
-
console.log(a11yAnalysis.content[0].text);
|
|
72
|
-
console.log("\n" + "=".repeat(50) + "\n");
|
|
73
|
-
|
|
74
|
-
// Example 5: Performance analysis of loading animation
|
|
75
|
-
console.log("5️⃣ Analyzing loading animation performance...");
|
|
76
|
-
|
|
77
|
-
const perfAnalysis = await server.callTool("eyes.analyze", {
|
|
78
|
-
source: "/path/to/loading-animation.gif",
|
|
79
|
-
type: "gif",
|
|
80
|
-
analysis_type: "performance",
|
|
81
|
-
detail_level: "detailed",
|
|
82
|
-
specific_focus: "loading indicators and user feedback timing"
|
|
83
|
-
});
|
|
84
|
-
|
|
85
|
-
console.log("⚡ Performance Analysis Results:");
|
|
86
|
-
console.log(perfAnalysis.content[0].text);
|
|
87
|
-
|
|
88
|
-
console.log("\n✅ Debugging session complete!");
|
|
89
|
-
}
|
|
90
|
-
|
|
91
|
-
// Run the example if called directly
|
|
92
|
-
if (import.meta.main) {
|
|
93
|
-
debuggingSession().catch(console.error);
|
|
94
|
-
}
|
|
95
|
-
|
|
96
|
-
export { debuggingSession };
|
package/human-mcp.png
DELETED
|
Binary file
|
package/inspector-wrapper.mjs
DELETED
|
@@ -1,33 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
// Workaround for eventsource ESM import issues
|
|
4
|
-
import { createRequire } from 'module';
|
|
5
|
-
import { spawn } from 'child_process';
|
|
6
|
-
|
|
7
|
-
const require = createRequire(import.meta.url);
|
|
8
|
-
|
|
9
|
-
// Try to fix the eventsource import by patching the module resolution
|
|
10
|
-
const Module = require('module');
|
|
11
|
-
const originalResolveFilename = Module._resolveFilename;
|
|
12
|
-
|
|
13
|
-
Module._resolveFilename = function (request, parent, isMain, options) {
|
|
14
|
-
if (request === 'eventsource' && parent?.filename?.includes('@modelcontextprotocol/inspector')) {
|
|
15
|
-
// Force CommonJS resolution for eventsource
|
|
16
|
-
return require.resolve('eventsource');
|
|
17
|
-
}
|
|
18
|
-
return originalResolveFilename.call(this, request, parent, isMain, options);
|
|
19
|
-
};
|
|
20
|
-
|
|
21
|
-
// Run the inspector with the command line args
|
|
22
|
-
const args = process.argv.slice(2);
|
|
23
|
-
const child = spawn('npx', ['@modelcontextprotocol/inspector', ...args], {
|
|
24
|
-
stdio: 'inherit',
|
|
25
|
-
env: {
|
|
26
|
-
...process.env,
|
|
27
|
-
NODE_OPTIONS: '--loader ./inspector-loader.mjs'
|
|
28
|
-
}
|
|
29
|
-
});
|
|
30
|
-
|
|
31
|
-
child.on('close', (code) => {
|
|
32
|
-
process.exit(code);
|
|
33
|
-
});
|