@goonnguyen/human-mcp 1.3.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. package/README.md +261 -19
  2. package/bin/human-mcp.js +2 -0
  3. package/dist/index.js +65180 -1698
  4. package/package.json +19 -2
  5. package/.claude/agents/code-reviewer.md +0 -140
  6. package/.claude/agents/database-admin.md +0 -86
  7. package/.claude/agents/debugger.md +0 -119
  8. package/.claude/agents/docs-manager.md +0 -113
  9. package/.claude/agents/git-manager.md +0 -59
  10. package/.claude/agents/planner-researcher.md +0 -97
  11. package/.claude/agents/project-manager.md +0 -113
  12. package/.claude/agents/tester.md +0 -95
  13. package/.claude/commands/cook.md +0 -7
  14. package/.claude/commands/debug.md +0 -10
  15. package/.claude/commands/docs/init.md +0 -11
  16. package/.claude/commands/docs/update.md +0 -11
  17. package/.claude/commands/fix/ci.md +0 -8
  18. package/.claude/commands/fix/fast.md +0 -5
  19. package/.claude/commands/fix/hard.md +0 -7
  20. package/.claude/commands/fix/test.md +0 -16
  21. package/.claude/commands/git/cm.md +0 -5
  22. package/.claude/commands/git/cp.md +0 -4
  23. package/.claude/commands/plan/ci.md +0 -12
  24. package/.claude/commands/plan/two.md +0 -13
  25. package/.claude/commands/plan.md +0 -10
  26. package/.claude/commands/test.md +0 -7
  27. package/.claude/commands/watzup.md +0 -8
  28. package/.claude/hooks/telegram_notify.sh +0 -136
  29. package/.claude/send-discord.sh +0 -64
  30. package/.claude/settings.json +0 -7
  31. package/.claude/statusline.sh +0 -143
  32. package/.dockerignore +0 -81
  33. package/.env.example +0 -44
  34. package/.github/workflows/publish.yml +0 -88
  35. package/.opencode/agent/code-reviewer.md +0 -142
  36. package/.opencode/agent/debugger.md +0 -74
  37. package/.opencode/agent/docs-manager.md +0 -119
  38. package/.opencode/agent/git-manager.md +0 -60
  39. package/.opencode/agent/planner-researcher.md +0 -100
  40. package/.opencode/agent/project-manager.md +0 -113
  41. package/.opencode/agent/system-architecture.md +0 -200
  42. package/.opencode/agent/tester.md +0 -96
  43. package/.opencode/agent/ui-ux-developer.md +0 -97
  44. package/.opencode/command/cook.md +0 -7
  45. package/.opencode/command/debug.md +0 -10
  46. package/.opencode/command/fix/ci.md +0 -8
  47. package/.opencode/command/fix/fast.md +0 -5
  48. package/.opencode/command/fix/hard.md +0 -7
  49. package/.opencode/command/fix/test.md +0 -16
  50. package/.opencode/command/git/cm.md +0 -5
  51. package/.opencode/command/git/cp.md +0 -4
  52. package/.opencode/command/plan/ci.md +0 -12
  53. package/.opencode/command/plan/two.md +0 -13
  54. package/.opencode/command/plan.md +0 -10
  55. package/.opencode/command/test.md +0 -7
  56. package/.opencode/command/watzup.md +0 -8
  57. package/.releaserc.json +0 -26
  58. package/.serena/project.yml +0 -68
  59. package/CHANGELOG.md +0 -62
  60. package/CLAUDE.md +0 -141
  61. package/DEPLOYMENT.md +0 -329
  62. package/Dockerfile +0 -52
  63. package/QUICKSTART.md +0 -97
  64. package/bun.lock +0 -1872
  65. package/bunfig.toml +0 -15
  66. package/docker-compose.yaml +0 -128
  67. package/docs/README.md +0 -51
  68. package/docs/codebase-structure-architecture-code-standards.md +0 -428
  69. package/docs/codebase-summary.md +0 -321
  70. package/docs/project-overview-pdr.md +0 -286
  71. package/docs/project-roadmap.md +0 -494
  72. package/examples/debugging-session.ts +0 -96
  73. package/human-mcp.png +0 -0
  74. package/inspector-wrapper.mjs +0 -33
  75. package/plans/001-streamable-http-transport-plan.md +0 -905
  76. package/plans/002-sse-fallback-http-transport-plan.md +0 -161
  77. package/plans/003-fix-test-infrastructure-and-ci-plan.md +0 -699
  78. package/plans/003-http-transport-local-file-access-plan.md +0 -880
  79. package/plans/004-fix-typescript-compilation-errors-plan.md +0 -388
  80. package/plans/005-comprehensive-test-infrastructure-fix-plan.md +0 -854
  81. package/plans/templates/bug-fix-template.md +0 -69
  82. package/plans/templates/feature-implementation-template.md +0 -84
  83. package/plans/templates/refactor-template.md +0 -82
  84. package/plans/templates/template-usage-guide.md +0 -58
  85. package/src/index.ts +0 -49
  86. package/src/prompts/debugging-prompts.ts +0 -149
  87. package/src/prompts/index.ts +0 -55
  88. package/src/resources/documentation.ts +0 -316
  89. package/src/resources/index.ts +0 -49
  90. package/src/server.ts +0 -36
  91. package/src/tools/eyes/index.ts +0 -225
  92. package/src/tools/eyes/processors/gif.ts +0 -137
  93. package/src/tools/eyes/processors/image.ts +0 -213
  94. package/src/tools/eyes/processors/video.ts +0 -135
  95. package/src/tools/eyes/schemas.ts +0 -51
  96. package/src/tools/eyes/utils/formatters.ts +0 -126
  97. package/src/tools/eyes/utils/gemini-client.ts +0 -73
  98. package/src/transports/http/file-interceptor.ts +0 -134
  99. package/src/transports/http/middleware.ts +0 -46
  100. package/src/transports/http/routes.ts +0 -297
  101. package/src/transports/http/server.ts +0 -116
  102. package/src/transports/http/session.ts +0 -93
  103. package/src/transports/http/sse-routes.ts +0 -210
  104. package/src/transports/index.ts +0 -36
  105. package/src/transports/stdio.ts +0 -7
  106. package/src/transports/types.ts +0 -50
  107. package/src/types/index.ts +0 -41
  108. package/src/utils/cloudflare-r2.ts +0 -107
  109. package/src/utils/config.ts +0 -123
  110. package/src/utils/errors.ts +0 -40
  111. package/src/utils/logger.ts +0 -49
  112. package/tests/integration/http-transport-files.test.ts +0 -190
  113. package/tests/integration/server.test.ts +0 -27
  114. package/tests/integration/sse-transport.test.ts +0 -142
  115. package/tests/setup.ts +0 -55
  116. package/tests/types/api-responses.ts +0 -35
  117. package/tests/types/test-types.ts +0 -105
  118. package/tests/unit/cloudflare-r2.test.ts +0 -118
  119. package/tests/unit/config.test.ts +0 -40
  120. package/tests/unit/eyes-analyze.test.ts +0 -150
  121. package/tests/unit/formatters.test.ts +0 -85
  122. package/tests/unit/sse-routes.test.ts +0 -92
  123. package/tests/utils/error-scenarios.ts +0 -198
  124. package/tests/utils/index.ts +0 -3
  125. package/tests/utils/mock-helpers.ts +0 -99
  126. package/tests/utils/test-data-generators.ts +0 -217
  127. package/tests/utils/test-server-manager.ts +0 -172
  128. package/tsconfig.json +0 -26
package/README.md CHANGED
@@ -21,12 +21,39 @@ Human MCP is a Model Context Protocol server that provides AI coding agents with
21
21
  - **Performance**: Loading states, visual performance indicators
22
22
  - **Layout**: Responsive design, positioning, visual hierarchy
23
23
 
24
+ 🎨 **Content Generation**
25
+ - Generate high-quality images from text descriptions
26
+ - Multiple artistic styles: photorealistic, artistic, cartoon, sketch, digital art
27
+ - Flexible aspect ratios and output formats
28
+ - Advanced prompt engineering and negative prompts
29
+
30
+ 🗣️ **Speech Generation**
31
+ - Convert text to natural-sounding speech with 30+ voice options
32
+ - Long-form content narration with chapter breaks
33
+ - Technical code explanation with spoken analysis
34
+ - Voice customization and style control
35
+ - Multi-language support (24 languages)
36
+ - Professional audio export in WAV format
37
+
24
38
  🤖 **AI-Powered**
25
39
  - Uses Google Gemini 2.5 Flash for fast, accurate analysis
40
+ - Advanced Imagen API for high-quality image generation
41
+ - Gemini Speech Generation API for natural voice synthesis
26
42
  - Detailed technical insights for developers
27
43
  - Actionable recommendations for fixing issues
28
44
  - Structured output with detected elements and coordinates
29
45
 
46
+ ### Google Gemini Documentation
47
+ - [Gemini API](https://ai.google.dev/gemini-api/docs?hl=en)
48
+ - [Gemini Models](https://ai.google.dev/gemini-api/docs/models)
49
+ - [Video Understanding](https://ai.google.dev/gemini-api/docs/video-understanding?hl=en)
50
+ - [Image Understanding](https://ai.google.dev/gemini-api/docs/image-understanding)
51
+ - [Document Understanding](https://ai.google.dev/gemini-api/docs/document-processing)
52
+ - [Audio Understanding](https://ai.google.dev/gemini-api/docs/audio)
53
+ - [Speech Generation](https://ai.google.dev/gemini-api/docs/speech-generation)
54
+ - [Image Generation](https://ai.google.dev/gemini-api/docs/image-generation)
55
+ - [Video Generation](https://ai.google.dev/gemini-api/docs/video)
56
+
30
57
  ## Quick Start
31
58
 
32
59
  ### Getting Your Google Gemini API Key
@@ -400,6 +427,70 @@ Or via JSON configuration:
400
427
  }
401
428
  ```
402
429
 
430
+ #### OpenCode
431
+
432
+ OpenCode is a powerful AI coding agent that supports MCP servers for enhanced capabilities. Use Human MCP to add visual analysis tools to your OpenCode workflow.
433
+
434
+ **Configuration Location:**
435
+ - **Global**: `~/.config/opencode/opencode.json`
436
+ - **Project**: `./opencode.json` in your project root
437
+
438
+ **Configuration Example (STDIO - Recommended):**
439
+
440
+ ```json
441
+ {
442
+ "$schema": "https://opencode.ai/config.json",
443
+ "mcp": {
444
+ "human": {
445
+ "type": "local",
446
+ "command": ["npx", "@goonnguyen/human-mcp"],
447
+ "enabled": true,
448
+ "environment": {
449
+ "GOOGLE_GEMINI_API_KEY": "your_gemini_api_key_here",
450
+ "TRANSPORT_TYPE": "stdio",
451
+ "LOG_LEVEL": "info"
452
+ }
453
+ }
454
+ }
455
+ }
456
+ ```
457
+
458
+ **Alternative Configuration (if globally installed):**
459
+
460
+ ```json
461
+ {
462
+ "$schema": "https://opencode.ai/config.json",
463
+ "mcp": {
464
+ "human": {
465
+ "type": "local",
466
+ "command": ["human-mcp"],
467
+ "enabled": true,
468
+ "environment": {
469
+ "GOOGLE_GEMINI_API_KEY": "your_gemini_api_key_here",
470
+ "TRANSPORT_TYPE": "stdio"
471
+ }
472
+ }
473
+ }
474
+ }
475
+ ```
476
+
477
+ **Setup Steps:**
478
+ 1. (Optional) Install Human MCP globally: `npm install -g @goonnguyen/human-mcp` (not required when using the `npx` configuration)
479
+ 2. Create or edit your OpenCode configuration file
480
+ 3. Add the Human MCP server configuration (use `npx` version for reliability)
481
+ 4. Set your Google Gemini API key in environment variables or the config
482
+ 5. Restart OpenCode
483
+
484
+ **Important Notes:**
485
+ - **STDIO Mode**: Human MCP uses stdio transport by default, which provides the best compatibility with OpenCode
486
+ - **No R2 Uploads**: In stdio mode, all images and videos are processed locally and sent to Gemini using inline base64 - no Cloudflare R2 uploads occur
487
+ - **Security**: Never commit API keys to version control. Use environment variables or secure credential storage
488
+
489
+ **Verification:**
490
+ - Check OpenCode logs for successful MCP connection
491
+ - Try using the `eyes_analyze` tool: "Analyze this screenshot for UI issues"
492
+ - Verify that no external network calls are made to Cloudflare R2 in stdio mode
493
+
403
494
  #### Gemini CLI
404
495
 
405
496
  While Gemini CLI doesn't directly support MCP, you can use Human MCP as a bridge to access visual analysis capabilities.
@@ -855,7 +946,74 @@ Compare two images to identify visual differences.
855
946
  {
856
947
  "source1": "/path/to/before.png",
857
948
  "source2": "/path/to/after.png",
858
- "comparison_type": "structural"
949
+ "comparison_type": "structural"
950
+ }
951
+ ```
952
+
953
+ ### gemini_gen_image
954
+
955
+ Generate high-quality images from text descriptions using Gemini Imagen API.
956
+
957
+ ```json
958
+ {
959
+ "prompt": "A modern minimalist login form with clean typography",
960
+ "style": "digital_art",
961
+ "aspect_ratio": "16:9",
962
+ "negative_prompt": "cluttered, low quality, blurry"
963
+ }
964
+ ```
965
+
966
+ ### mouth_speak
967
+
968
+ Convert text to natural-sounding speech with voice customization.
969
+
970
+ ```json
971
+ {
972
+ "text": "Welcome to our application. Let me guide you through the interface.",
973
+ "voice": "Zephyr",
974
+ "language": "en-US",
975
+ "style_prompt": "Speak in a friendly, welcoming tone"
976
+ }
977
+ ```
978
+
979
+ ### mouth_narrate
980
+
981
+ Generate narration for long-form content with chapter breaks and style control.
982
+
983
+ ```json
984
+ {
985
+ "content": "Chapter 1: Introduction to React...",
986
+ "voice": "Sage",
987
+ "narration_style": "educational",
988
+ "chapter_breaks": true,
989
+ "max_chunk_size": 8000
990
+ }
991
+ ```
992
+
993
+ ### mouth_explain
994
+
995
+ Generate spoken explanations of code with technical analysis.
996
+
997
+ ```json
998
+ {
999
+ "code": "function factorial(n) { return n <= 1 ? 1 : n * factorial(n-1); }",
1000
+ "programming_language": "javascript",
1001
+ "voice": "Apollo",
1002
+ "explanation_level": "intermediate",
1003
+ "include_examples": true
1004
+ }
1005
+ ```
1006
+
1007
+ ### mouth_customize
1008
+
1009
+ Test different voices and styles to find the best fit for your content.
1010
+
1011
+ ```json
1012
+ {
1013
+ "text": "Hello, this is a voice test sample.",
1014
+ "voice": "Charon",
1015
+ "style_variations": ["professional", "casual", "energetic"],
1016
+ "compare_voices": ["Puck", "Sage", "Apollo"]
859
1017
  }
860
1018
  ```
861
1019
 
@@ -886,12 +1044,67 @@ Compare two images to identify visual differences.
886
1044
  # Check accessibility compliance
887
1045
  {
888
1046
  "source": "page-screenshot.png",
889
- "type": "image",
1047
+ "type": "image",
890
1048
  "analysis_type": "accessibility",
891
1049
  "check_accessibility": true
892
1050
  }
893
1051
  ```
894
1052
 
1053
+ ### Image Generation for Design
1054
+ ```bash
1055
+ # Generate UI mockups and design elements
1056
+ {
1057
+ "prompt": "Professional dashboard interface with data visualization charts",
1058
+ "style": "digital_art",
1059
+ "aspect_ratio": "16:9"
1060
+ }
1061
+ ```
1062
+
1063
+ ### Prototype Creation
1064
+ ```bash
1065
+ # Create visual prototypes for development
1066
+ {
1067
+ "prompt": "Mobile app login screen with modern design, dark theme",
1068
+ "style": "photorealistic",
1069
+ "aspect_ratio": "9:16",
1070
+ "negative_prompt": "old-fashioned, bright colors"
1071
+ }
1072
+ ```
1073
+
1074
+ ### Code Explanation Audio
1075
+ ```bash
1076
+ # Generate spoken explanations for code reviews
1077
+ {
1078
+ "code": "const useAuth = () => { const [user, setUser] = useState(null); return { user, login: setUser }; }",
1079
+ "programming_language": "javascript",
1080
+ "voice": "Apollo",
1081
+ "explanation_level": "advanced",
1082
+ "include_examples": true
1083
+ }
1084
+ ```
1085
+
1086
+ ### Documentation Narration
1087
+ ```bash
1088
+ # Convert technical documentation to audio
1089
+ {
1090
+ "content": "This API endpoint handles user authentication and returns a JWT token...",
1091
+ "voice": "Sage",
1092
+ "narration_style": "professional",
1093
+ "chapter_breaks": true
1094
+ }
1095
+ ```
1096
+
1097
+ ### User Interface Voice Feedback
1098
+ ```bash
1099
+ # Generate voice responses for applications
1100
+ {
1101
+ "text": "File uploaded successfully. Processing will complete in approximately 30 seconds.",
1102
+ "voice": "Kore",
1103
+ "language": "en-US",
1104
+ "style_prompt": "Speak in a helpful, reassuring tone"
1105
+ }
1106
+ ```
1107
+
895
1108
  ## Prompts
896
1109
 
897
1110
  Human MCP includes pre-built prompts for common debugging scenarios:
@@ -977,9 +1190,19 @@ HTTP_ENABLE_RATE_LIMITING=false
977
1190
  Human MCP Server
978
1191
  ├── Eyes Tool (Vision Understanding)
979
1192
  │ ├── Image Analysis
980
- │ ├── Video Processing
1193
+ │ ├── Video Processing
981
1194
  │ ├── GIF Frame Extraction
982
1195
  │ └── Visual Comparison
1196
+ ├── Hands Tool (Content Generation)
1197
+ │ ├── Image Generation
1198
+ │ ├── Style Customization
1199
+ │ ├── Aspect Ratio Control
1200
+ │ └── Prompt Engineering
1201
+ ├── Mouth Tool (Speech Generation)
1202
+ │ ├── Text-to-Speech Synthesis
1203
+ │ ├── Long-form Narration
1204
+ │ ├── Code Explanation
1205
+ │ └── Voice Customization
983
1206
  ├── Debugging Prompts
984
1207
  └── Documentation Resources
985
1208
  ```
@@ -992,7 +1215,7 @@ For detailed architecture information and future development plans, see:
992
1215
 
993
1216
  **Mission**: Transform AI coding agents with complete human-like sensory capabilities, bridging the gap between artificial and human intelligence through sophisticated multimodal analysis.
994
1217
 
995
- ### Current Status: Phase 1 Complete ✅
1218
+ ### Current Status: Phase 1 Complete ✅ | Phase 4 Complete ✅ | Phase 5 Complete ✅
996
1219
 
997
1220
  **Eyes (Visual Analysis)** - Production Ready (v1.2.1)
998
1221
  - Advanced image, video, and GIF analysis capabilities
@@ -1001,6 +1224,21 @@ For detailed architecture information and future development plans, see:
1001
1224
  - Processing 20+ visual formats with 98.5% success rate
1002
1225
  - Sub-30 second response times for detailed analysis
1003
1226
 
1227
+ **Hands (Content Generation)** - Production Ready (v1.2.2)
1228
+ - High-quality image generation using Gemini Imagen API
1229
+ - Multiple artistic styles and aspect ratios
1230
+ - Advanced prompt engineering with negative prompts
1231
+ - Comprehensive validation and error handling
1232
+ - Fast generation times with reliable output
1233
+
1234
+ **Mouth (Speech Generation)** - Production Ready (v1.3.0)
1235
+ - Natural text-to-speech with 30+ voice options
1236
+ - Long-form content narration with chapter breaks
1237
+ - Technical code explanation with spoken analysis
1238
+ - Voice customization and style control
1239
+ - Multi-language support (24 languages)
1240
+ - Professional audio export in WAV format
1241
+
1004
1242
  ### Upcoming Development Phases
1005
1243
 
1006
1244
  #### Phase 2: Document Understanding (Q4 2025)
@@ -1019,21 +1257,25 @@ For detailed architecture information and future development plans, see:
1019
1257
  - Support for 20+ audio formats (WAV, MP3, AAC, OGG, FLAC)
1020
1258
  - Real-time audio processing capabilities
1021
1259
 
1022
- #### Phase 4: Speech Generation - Mouth (Q4 2025)
1023
- **AI Voice Capabilities**
1024
- - High-quality text-to-speech with customizable voice parameters
1025
- - Code explanation and technical content narration
1026
- - Multi-language speech generation (10+ languages)
1027
- - Long-form content narration with natural pacing
1028
- - Professional-quality audio export in multiple formats
1029
-
1030
- #### Phase 5: Content Generation - Hands (Q4 2025)
1031
- **Creative Content Creation**
1032
- - Image generation from text descriptions using Imagen API
1033
- - Advanced image editing (inpainting, style transfer, enhancement)
1034
- - Video generation up to 30 seconds using Veo3 API
1035
- - Animation creation with motion graphics
1036
- - Batch content generation for workflow automation
1260
+ #### Phase 4: Speech Generation - Mouth ✅ COMPLETE
1261
+ **AI Voice Capabilities** - Production Ready (v1.3.0)
1262
+ - High-quality text-to-speech with 30+ voice options using Gemini Speech API
1263
+ - Code explanation and technical content narration
1264
+ - Multi-language speech generation (24 languages supported)
1265
+ - Long-form content narration with chapter breaks and natural pacing
1266
+ - Professional-quality audio export in WAV format
1267
+ - Voice customization with style prompts and voice comparison
1268
+
1269
+ #### Phase 5: Content Generation - Hands ✅ COMPLETE
1270
+ **Creative Content Creation** - Production Ready (v1.2.2)
1271
+ - Image generation from text descriptions using Imagen API
1272
+ - Multiple artistic styles: photorealistic, artistic, cartoon, sketch, digital_art
1273
+ - Flexible aspect ratios: 1:1, 16:9, 9:16, 4:3, 3:4
1274
+ - Advanced prompt engineering with negative prompts
1275
+ - Comprehensive error handling and validation
1276
+ - Future: Advanced image editing (inpainting, style transfer, enhancement)
1277
+ - Future: Video generation up to 30 seconds using Veo3 API
1278
+ - Future: Animation creation with motion graphics
1037
1279
 
1038
1280
  ### Target Architecture (End 2025)
1039
1281
 
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env node
2
+ import('../dist/index.js');