openclacky 1.2.8 → 1.2.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/lib/clacky/agent_config.rb +91 -7
- data/lib/clacky/client.rb +6 -2
- data/lib/clacky/default_skills/channel-manager/SKILL.md +33 -110
- data/lib/clacky/default_skills/media-gen/SKILL.md +128 -0
- data/lib/clacky/media/base.rb +68 -0
- data/lib/clacky/media/gemini.rb +36 -0
- data/lib/clacky/media/generator.rb +78 -0
- data/lib/clacky/media/openai_compat.rb +168 -0
- data/lib/clacky/providers.rb +82 -0
- data/lib/clacky/server/http_server.rb +210 -20
- data/lib/clacky/telemetry.rb +11 -5
- data/lib/clacky/version.rb +1 -1
- data/lib/clacky/web/app.css +172 -12
- data/lib/clacky/web/i18n.js +58 -0
- data/lib/clacky/web/index.html +14 -2
- data/lib/clacky/web/model-tester.js +58 -0
- data/lib/clacky/web/onboard.js +17 -30
- data/lib/clacky/web/settings.js +322 -97
- data/lib/clacky.rb +3 -0
- data/scripts/build/lib/network.sh +61 -30
- data/scripts/install.sh +61 -30
- data/scripts/install_browser.sh +61 -30
- data/scripts/install_full.sh +61 -30
- data/scripts/install_rails_deps.sh +61 -30
- data/scripts/install_system_deps.sh +61 -30
- metadata +7 -2
- data/lib/clacky/default_skills/channel-manager/feishu_setup.rb +0 -574
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: b2060d694267d0947681785d2e2ffe730d0b241a9ac2ec68e218eb037478bf27
|
|
4
|
+
data.tar.gz: 0b3e301010e16752da0bd64a9603b06010c2a23c5bd81884c7719d74f8f1bd67
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 386e2359d904b9a6bc81e56429cd585aaef59b7174f5dbc93e51cdd2bf97df703fd8145910759f50427632fcc90a307ed3ec6b19e48f0230d7e0c39987d3e7a8
|
|
7
|
+
data.tar.gz: 15413b83259ef7a39acac101597149cbf2144473da691d885f14d3b271076399da14c924db19201a1b99d8bf264542b56f3619b6cb6b903dfdac462e46f15a97
|
data/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,18 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [1.2.9] - 2026-06-01
|
|
9
|
+
|
|
10
|
+
### Added
|
|
11
|
+
- Image generation support via model tool calls
|
|
12
|
+
- Startup telemetry now reports launch source for better usage analytics
|
|
13
|
+
|
|
14
|
+
### Improved
|
|
15
|
+
- Feishu channel setup simplified with Agent App flow — fewer manual steps and no redirect URL config needed
|
|
16
|
+
|
|
17
|
+
### Fixed
|
|
18
|
+
- Network region detection hardened with CDN fallback to handle edge cases and improve reliability
|
|
19
|
+
|
|
8
20
|
## [1.2.8] - 2026-06-01
|
|
9
21
|
|
|
10
22
|
### Added
|
data/lib/clacky/agent_config.rb
CHANGED
|
@@ -318,7 +318,9 @@ module Clacky
|
|
|
318
318
|
end
|
|
319
319
|
end
|
|
320
320
|
|
|
321
|
-
new(**constructor_args)
|
|
321
|
+
instance = new(**constructor_args)
|
|
322
|
+
instance.derive_media_models!
|
|
323
|
+
instance
|
|
322
324
|
end
|
|
323
325
|
|
|
324
326
|
# Auto-injection of provider-preset lite models into @models has been
|
|
@@ -585,12 +587,94 @@ module Clacky
|
|
|
585
587
|
}.compact
|
|
586
588
|
end
|
|
587
589
|
|
|
588
|
-
# Find model by type (default or lite)
|
|
589
|
-
# Returns the model hash or nil if not found
|
|
590
|
+
# Find model by type (default or lite or media kind)
|
|
591
|
+
# Returns the model hash or nil if not found.
|
|
592
|
+
# For media kinds (image/video/audio): explicit user-configured (custom)
|
|
593
|
+
# entries win; otherwise an auto-derived virtual entry is returned
|
|
594
|
+
# based on the default model's provider — mirroring how lite is
|
|
595
|
+
# virtually derived via #lite_model_config_for_current.
|
|
590
596
|
def find_model_by_type(type)
|
|
597
|
+
kind = type.to_s
|
|
598
|
+
if Clacky::Providers::MEDIA_KINDS.include?(kind)
|
|
599
|
+
custom = @models.find { |m| m["type"] == kind }
|
|
600
|
+
return custom if custom
|
|
601
|
+
return derive_media_model(kind)
|
|
602
|
+
end
|
|
591
603
|
@models.find { |m| m["type"] == type }
|
|
592
604
|
end
|
|
593
605
|
|
|
606
|
+
private def derive_media_model(kind)
|
|
607
|
+
default = find_model_by_type("default")
|
|
608
|
+
return nil unless default
|
|
609
|
+
|
|
610
|
+
provider_id = Clacky::Providers.resolve_provider(
|
|
611
|
+
base_url: default["base_url"],
|
|
612
|
+
api_key: default["api_key"]
|
|
613
|
+
)
|
|
614
|
+
return nil unless provider_id
|
|
615
|
+
|
|
616
|
+
model_name = Clacky::Providers.default_media_model(provider_id, kind)
|
|
617
|
+
return nil if model_name.nil? || model_name.to_s.empty?
|
|
618
|
+
|
|
619
|
+
{
|
|
620
|
+
"model" => model_name,
|
|
621
|
+
"base_url" => default["base_url"],
|
|
622
|
+
"api_key" => default["api_key"],
|
|
623
|
+
"type" => kind,
|
|
624
|
+
"auto_injected" => true
|
|
625
|
+
}
|
|
626
|
+
end
|
|
627
|
+
|
|
628
|
+
# Kept for backward compatibility, but not a pure no-op: it strips any previously materialized auto-injected media entries from @models. Media auto entries are
|
|
629
|
+
# now derived virtually on read; nothing is materialized into @models.
|
|
630
|
+
def derive_media_models!
|
|
631
|
+
@models.reject! { |m| m["auto_injected"] && Clacky::Providers::MEDIA_KINDS.include?(m["type"].to_s) }
|
|
632
|
+
end
|
|
633
|
+
|
|
634
|
+
# Returns the configured/derived media model entry for `kind`, plus a
|
|
635
|
+
# hint about its source. UI uses this to render the tri-state control.
|
|
636
|
+
# @param kind [String] one of "image" / "video" / "audio"
|
|
637
|
+
# @return [Hash{String=>Object}] keys:
|
|
638
|
+
# "configured" [Boolean] — anything available?
|
|
639
|
+
# "source" [String] — "off" | "auto" | "custom"
|
|
640
|
+
# "model" [String, nil]
|
|
641
|
+
# "base_url" [String, nil]
|
|
642
|
+
# "provider" [String, nil] — provider id
|
|
643
|
+
# "available" [Array<String>] — auto-source candidates from preset
|
|
644
|
+
def media_state(kind)
|
|
645
|
+
kind = kind.to_s
|
|
646
|
+
custom = @models.find { |m| m["type"] == kind }
|
|
647
|
+
auto = custom ? nil : derive_media_model(kind)
|
|
648
|
+
entry = custom || auto
|
|
649
|
+
|
|
650
|
+
provider_id = if entry
|
|
651
|
+
Clacky::Providers.resolve_provider(
|
|
652
|
+
base_url: entry["base_url"],
|
|
653
|
+
api_key: entry["api_key"]
|
|
654
|
+
)
|
|
655
|
+
end
|
|
656
|
+
|
|
657
|
+
available_provider_id = if custom
|
|
658
|
+
provider_id
|
|
659
|
+
else
|
|
660
|
+
default = find_model_by_type("default")
|
|
661
|
+
default && Clacky::Providers.resolve_provider(
|
|
662
|
+
base_url: default["base_url"],
|
|
663
|
+
api_key: default["api_key"]
|
|
664
|
+
)
|
|
665
|
+
end
|
|
666
|
+
available = available_provider_id ? Clacky::Providers.media_models(available_provider_id, kind) : []
|
|
667
|
+
|
|
668
|
+
{
|
|
669
|
+
"configured" => !entry.nil?,
|
|
670
|
+
"source" => custom ? "custom" : (auto ? "auto" : "off"),
|
|
671
|
+
"model" => entry && entry["model"],
|
|
672
|
+
"base_url" => entry && entry["base_url"],
|
|
673
|
+
"provider" => provider_id,
|
|
674
|
+
"available" => available
|
|
675
|
+
}
|
|
676
|
+
end
|
|
677
|
+
|
|
594
678
|
# Find model by composite key (model name + base_url).
|
|
595
679
|
# Used when restoring a session to match its original model without relying
|
|
596
680
|
# on the runtime-only id (which changes on every process restart).
|
|
@@ -896,14 +980,14 @@ module Clacky
|
|
|
896
980
|
Clacky::Providers.supports?(provider_id, capability, model_name: m["model"])
|
|
897
981
|
end
|
|
898
982
|
|
|
899
|
-
# Set a model's type (default or
|
|
900
|
-
#
|
|
983
|
+
# Set a model's type (default, lite, image, video, or audio).
|
|
984
|
+
# At most one model carries each type at a time.
|
|
901
985
|
# @param index [Integer] the model index
|
|
902
|
-
# @param type [String, nil]
|
|
986
|
+
# @param type [String, nil] type tag, or nil to clear
|
|
903
987
|
# Returns true if successful
|
|
904
988
|
def set_model_type(index, type)
|
|
905
989
|
return false if index < 0 || index >= @models.length
|
|
906
|
-
return false unless ["default", "lite", nil].include?(type)
|
|
990
|
+
return false unless ["default", "lite", "image", "video", "audio", nil].include?(type)
|
|
907
991
|
|
|
908
992
|
if type
|
|
909
993
|
# Remove type from any other model that has it
|
data/lib/clacky/client.rb
CHANGED
|
@@ -561,10 +561,14 @@ module Clacky
|
|
|
561
561
|
# ── Error handling ────────────────────────────────────────────────────────
|
|
562
562
|
|
|
563
563
|
def handle_test_response(response)
|
|
564
|
-
return { success: true } if response.status == 200
|
|
564
|
+
return { success: true, status: response.status } if response.status == 200
|
|
565
565
|
|
|
566
566
|
error_body = JSON.parse(response.body) rescue nil
|
|
567
|
-
{
|
|
567
|
+
{
|
|
568
|
+
success: false,
|
|
569
|
+
status: response.status,
|
|
570
|
+
error: extract_error_message(error_body, response.body)
|
|
571
|
+
}
|
|
568
572
|
end
|
|
569
573
|
|
|
570
574
|
def raise_error(response)
|
|
@@ -99,128 +99,51 @@ Ask:
|
|
|
99
99
|
|
|
100
100
|
### Feishu setup
|
|
101
101
|
|
|
102
|
-
|
|
102
|
+
Feishu now offers a one-click **Agent App** (智能体应用) that auto-configures all
|
|
103
|
+
required permissions, events, and publishing for you — no Bot capability toggle,
|
|
104
|
+
no permission JSON, no event subscription, no version/release steps. Just create
|
|
105
|
+
the app and copy the credentials. The connection mode is unchanged (long
|
|
106
|
+
connection / WebSocket), handled entirely by the server.
|
|
103
107
|
|
|
104
|
-
|
|
105
|
-
```bash
|
|
106
|
-
ruby "SKILL_DIR/feishu_setup.rb"
|
|
107
|
-
```
|
|
108
|
-
**Important**: call `terminal` with `timeout: 180` — the script may wait up to 90s for a WebSocket connection in Phase 4.
|
|
109
|
-
|
|
110
|
-
**If exit code is 0:**
|
|
111
|
-
- The script completed successfully.
|
|
112
|
-
- Config is already written to `~/.clacky/channels.yml`.
|
|
113
|
-
- Tell the user: "✅ Feishu channel configured automatically! The channel is ready."
|
|
114
|
-
- **Skip Step 2 (manual fallback) and continue to Step 3.**
|
|
115
|
-
|
|
116
|
-
**If exit code is non-0:**
|
|
117
|
-
- Check stdout for the error message.
|
|
118
|
-
- **If the error contains "Browser not configured" or "browser tool":**
|
|
119
|
-
- Tell the user: "The browser tool is not configured yet. Let me help you set it up first..."
|
|
120
|
-
- Invoke the `browser-setup` skill: `invoke_skill("browser-setup", "setup")`.
|
|
121
|
-
- After browser-setup completes, tell the user: "Browser is ready! Let me retry the Feishu setup..."
|
|
122
|
-
- **Retry the script** (same command, same timeout). If it succeeds this time, stop. If it fails again, check the new error and proceed accordingly.
|
|
123
|
-
- **If the error contains "No cookies found" or "Please log in":**
|
|
124
|
-
- Open Feishu login page using browser tool:
|
|
125
|
-
```
|
|
126
|
-
browser(action="navigate", url="https://open.feishu.cn/app")
|
|
127
|
-
```
|
|
128
|
-
- Tell the user: "I've opened Feishu in your browser. Please log in, then reply 'done'."
|
|
129
|
-
- Wait for "done".
|
|
130
|
-
- **Retry the script** (same command, same timeout). Repeat this login-wait-retry loop up to **3 times total**.
|
|
131
|
-
- If any attempt succeeds (exit code 0), stop — setup is complete.
|
|
132
|
-
- If an attempt fails with a **different** error (not a login error), break out of the loop and continue to Step 2.
|
|
133
|
-
- If all 3 attempts fail with login errors, tell the user: "Automated setup was unable to detect a Feishu login after 3 attempts. Switching to guided setup..." and continue to Step 2.
|
|
134
|
-
- **Otherwise (non-login, non-browser error):**
|
|
135
|
-
- Tell the user: "Automated setup encountered an issue: `<error message>`. Switching to guided setup..."
|
|
136
|
-
- Continue to Step 2 (manual flow) below.
|
|
137
|
-
|
|
138
|
-
---
|
|
139
|
-
|
|
140
|
-
#### Step 2 — Manual guided setup (fallback)
|
|
141
|
-
|
|
142
|
-
Only reach here if the automated script failed.
|
|
143
|
-
|
|
144
|
-
##### Phase 1 — Open Feishu Open Platform
|
|
145
|
-
|
|
146
|
-
1. Navigate: `open https://open.feishu.cn/app`. Pass `isolated: true`.
|
|
147
|
-
2. If a login page or QR code is shown, tell the user to log in and wait for "done".
|
|
148
|
-
3. Confirm the app list is visible.
|
|
149
|
-
|
|
150
|
-
##### Phase 2 — Create a new app
|
|
151
|
-
|
|
152
|
-
4. **Always create a new app** — do NOT reuse existing apps. Guide the user: "Click 'Create Enterprise Self-Built App', fill in name (e.g. Open Clacky) and description (e.g. AI assistant powered by openclacky), then submit. Reply done." Wait for "done".
|
|
153
|
-
|
|
154
|
-
##### Phase 3 — Enable Bot capability
|
|
155
|
-
|
|
156
|
-
5. Feishu opens Add App Capabilities by default after creating an app. Guide the user: "Find the Bot capability card and click the Add button next to it, then reply done." Wait for "done".
|
|
157
|
-
|
|
158
|
-
##### Phase 4 — Get credentials
|
|
159
|
-
|
|
160
|
-
6. Navigate to Credentials & Basic Info in the left menu.
|
|
161
|
-
7. Guide the user: "Copy App ID and App Secret, then paste here. Reply with: App ID: xxx, App Secret: xxx" Wait for the reply. Parse `app_id` and `app_secret`.
|
|
108
|
+
#### Step 1 — Open the Agent App creation page
|
|
162
109
|
|
|
163
|
-
|
|
110
|
+
1. Navigate: `open https://open.feishu.cn/page/launcher?from=backend_oneclick`. Pass `isolated: true`. If the browser is not configured (the `open` call fails), just give the user the URL and ask them to open it manually in any browser — the rest of the flow is fully manual and does not need browser automation.
|
|
111
|
+
2. If a login page or QR code is shown, tell the user to scan/log in and wait for "done".
|
|
164
112
|
|
|
165
|
-
|
|
166
|
-
9. Guide the user: "In the bulk import dialog, clear the existing example first (select all, delete), then paste the following JSON. Reply done." Wait for "done". Do NOT try to clear or edit via browser — user does it.
|
|
113
|
+
#### Step 2 — Create the Agent App
|
|
167
114
|
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
"im:message",
|
|
173
|
-
"im:message.p2p_msg:readonly",
|
|
174
|
-
"im:message:send_as_bot"
|
|
175
|
-
],
|
|
176
|
-
"user": []
|
|
177
|
-
}
|
|
178
|
-
}
|
|
179
|
-
```
|
|
115
|
+
3. After login, the page lands on **创建飞书智能体应用 (Create Feishu Agent App)**.
|
|
116
|
+
Guide the user: "Enter an app name (e.g. Open Clacky), then click **立即创建 (Create Now)**. Reply done."
|
|
117
|
+
(The avatar is auto-assigned at random and can be changed anytime — it does not affect setup.)
|
|
118
|
+
Wait for "done".
|
|
180
119
|
|
|
181
|
-
|
|
120
|
+
#### Step 3 — Copy credentials
|
|
182
121
|
|
|
183
|
-
|
|
122
|
+
4. The page jumps to **创建成功 (Created Successfully)**, showing `App ID` and `App Secret`.
|
|
123
|
+
The Secret is masked by default. Guide the user: "Click the eye icon next to **App Secret** to reveal it,
|
|
124
|
+
then copy both values and paste here. Reply with: App ID: xxx, App Secret: xxx"
|
|
125
|
+
Wait for the reply. Parse `app_id` (starts with `cli_`) and `app_secret`. Trim whitespace and
|
|
126
|
+
make sure the two values are not swapped.
|
|
184
127
|
|
|
185
|
-
|
|
186
|
-
```bash
|
|
187
|
-
curl -X POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/channels/feishu \
|
|
188
|
-
-H "Content-Type: application/json" \
|
|
189
|
-
-d '{"app_id":"<APP_ID>","app_secret":"<APP_SECRET>","domain":"https://open.feishu.cn"}'
|
|
190
|
-
```
|
|
191
|
-
**CRITICAL: This curl call is the ONLY way to save credentials. NEVER write `~/.clacky/channels.yml` or any file under `~/.clacky/channels/` directly. The server API handles persistence and hot-reload.**
|
|
192
|
-
11. **Wait for connection** — Poll until log shows `[feishu-ws] WebSocket connected ✅`:
|
|
193
|
-
```bash
|
|
194
|
-
for i in $(seq 1 20); do
|
|
195
|
-
grep -q "\[feishu-ws\] WebSocket connected" ~/.clacky/logger/clacky-$(date +%Y-%m-%d).log 2>/dev/null && echo "CONNECTED" && break
|
|
196
|
-
sleep 1
|
|
197
|
-
done
|
|
198
|
-
```
|
|
199
|
-
12. **Configure events** — Guide the user: "In Events & Callbacks, select 'Long Connection' mode. Click Save. Then click Add Event, search `im.message.receive_v1`, select it, click Add. Reply done." Wait for "done".
|
|
128
|
+
#### Step 4 — Save credentials
|
|
200
129
|
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
-d '{"app_id":"<APP_ID>","app_secret":"<APP_SECRET>"}'
|
|
211
|
-
```
|
|
212
|
-
|
|
213
|
-
Check for `"code":0`. On success: continue to Step 3 (below).
|
|
214
|
-
|
|
215
|
-
##### Phase 9 — done
|
|
130
|
+
5. Run:
|
|
131
|
+
```bash
|
|
132
|
+
curl -X POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/channels/feishu \
|
|
133
|
+
-H "Content-Type: application/json" \
|
|
134
|
+
-d '{"app_id":"<APP_ID>","app_secret":"<APP_SECRET>","domain":"https://open.feishu.cn"}'
|
|
135
|
+
```
|
|
136
|
+
**CRITICAL: This curl call is the ONLY way to save credentials. NEVER write `~/.clacky/channels.yml`
|
|
137
|
+
or any file under `~/.clacky/channels/` directly. The server API handles persistence, hot-reload,
|
|
138
|
+
and establishing the long connection.**
|
|
216
139
|
|
|
217
|
-
|
|
140
|
+
On success: tell the user "✅ Feishu channel configured!" and **continue to Step 5 (Feishu CLI)**.
|
|
218
141
|
|
|
219
142
|
---
|
|
220
143
|
|
|
221
|
-
#### Step
|
|
144
|
+
#### Step 5 — Optional: install Feishu CLI
|
|
222
145
|
|
|
223
|
-
Reach here
|
|
146
|
+
Reach here after the channel is configured (Step 4 succeeded). Read `app_id` and `app_secret` from `~/.clacky/channels.yml` (under `channels.feishu`) for the install commands below.
|
|
224
147
|
|
|
225
148
|
Call `request_user_feedback`:
|
|
226
149
|
|
|
@@ -269,7 +192,7 @@ When `lark-cli auth login` returns successfully, tell the user:
|
|
|
269
192
|
|
|
270
193
|
### WeCom setup
|
|
271
194
|
|
|
272
|
-
1. Navigate: `open https://work.weixin.qq.com/wework_admin/frame#/aiHelper/create`. Pass `isolated: true`.
|
|
195
|
+
1. Navigate: `open https://work.weixin.qq.com/wework_admin/frame#/aiHelper/create`. Pass `isolated: true`. If the browser is not configured (the `open` call fails), just give the user the URL and ask them to open it manually in any browser — the rest of the flow is fully manual and does not need browser automation.
|
|
273
196
|
2. If a login page or QR code is shown, tell the user to log in and wait for "done".
|
|
274
197
|
3. Guide the user: "Scroll to the bottom of the right panel and click 'API mode creation'. Reply done." Wait for "done".
|
|
275
198
|
4. Guide the user: "Click 'Add' next to 'Visible Range'. Select the top-level company node. Click Confirm. Reply done." Wait for "done".
|
|
@@ -0,0 +1,128 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: media-gen
|
|
3
|
+
description: 'Generate images (and later videos / audio) inside the current task. Use this skill whenever the user asks to create, generate, or produce a picture / image / illustration / cover / poster / icon / artwork — including phrases like 生成图片, 画一张, 做封面, 来张配图, generate image, make a picture, draw, create artwork, design a cover. Also use when building documents (slides, PPT, posters, marketing pages, README hero shots) where an image is needed inline. Routes calls through the local Clacky HTTP server, which uses the user-configured `type=image` model — you do NOT need to know which provider; the server handles it.'
|
|
4
|
+
disable-model-invocation: false
|
|
5
|
+
user-invocable: true
|
|
6
|
+
---
|
|
7
|
+
|
|
8
|
+
# media-gen
|
|
9
|
+
|
|
10
|
+
Generate images on demand by calling the local Clacky HTTP server, which dispatches to whichever image-generation model the user configured (`type=image` in their model settings).
|
|
11
|
+
|
|
12
|
+
## Endpoint
|
|
13
|
+
|
|
14
|
+
```
|
|
15
|
+
POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/image
|
|
16
|
+
GET http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/types
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
## Step 1 — Verify a backend is configured
|
|
20
|
+
|
|
21
|
+
Before generating anything, confirm the user has a `type=image` model set up:
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
curl -s http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/types
|
|
25
|
+
```
|
|
26
|
+
|
|
27
|
+
If the response shows `image.configured = false`, stop and tell the user:
|
|
28
|
+
|
|
29
|
+
> 还没有配置生图模型。请打开 Clacky 设置页 → 添加模型 → 类型选 `image`(推荐 `or-gemini-3-pro-image` 或 `or-gpt-image-1`)。配好后再让我生图。
|
|
30
|
+
|
|
31
|
+
Do NOT try to fall back to `terminal` + a hand-written `curl https://api.openai.com/...` — that bypasses the user's configured backend and won't be billed correctly.
|
|
32
|
+
|
|
33
|
+
## Step 2 — Generate the image
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
curl -s -X POST http://${CLACKY_SERVER_HOST}:${CLACKY_SERVER_PORT}/api/media/image \
|
|
37
|
+
-H "Content-Type: application/json" \
|
|
38
|
+
-d '{
|
|
39
|
+
"prompt": "A clean, modern hero illustration for a tech startup landing page. Soft gradient background, abstract geometric shapes in blue and purple, minimal style, 4K quality.",
|
|
40
|
+
"aspect_ratio": "landscape"
|
|
41
|
+
}'
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
### Request fields
|
|
45
|
+
|
|
46
|
+
| Field | Required | Values | Notes |
|
|
47
|
+
|----------------|----------|-------------------------------------|-------|
|
|
48
|
+
| `prompt` | yes | string | Be detailed and concrete. See prompt tips below. |
|
|
49
|
+
| `aspect_ratio` | no | `landscape` / `square` / `portrait` | Defaults to `landscape`. |
|
|
50
|
+
| `output_dir` | no | absolute path | Defaults to the current working directory. The image is saved under `<output_dir>/assets/generated/`. |
|
|
51
|
+
|
|
52
|
+
### Response shape (success)
|
|
53
|
+
|
|
54
|
+
```json
|
|
55
|
+
{
|
|
56
|
+
"success": true,
|
|
57
|
+
"image": "/abs/path/to/working_dir/assets/generated/img_20260525_011820_a1b2c3d4.png",
|
|
58
|
+
"model": "or-gemini-3-pro-image",
|
|
59
|
+
"provider": "openclacky",
|
|
60
|
+
"prompt": "A clean, modern hero illustration ...",
|
|
61
|
+
"aspect_ratio": "landscape",
|
|
62
|
+
"size": "1536x1024",
|
|
63
|
+
"usage": {
|
|
64
|
+
"prompt_tokens": 50,
|
|
65
|
+
"completion_tokens": 4500,
|
|
66
|
+
"cache_read_tokens": 0,
|
|
67
|
+
"cache_write_tokens": 0,
|
|
68
|
+
"total_tokens": 4550
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
The `image` field is an absolute path on disk. To embed it in markdown, slides, or HTML, convert it to a path relative to the document you're writing.
|
|
74
|
+
|
|
75
|
+
`usage` may be absent when the configured backend doesn't return token counts. Treat it as optional.
|
|
76
|
+
|
|
77
|
+
### Response shape (failure)
|
|
78
|
+
|
|
79
|
+
```json
|
|
80
|
+
{
|
|
81
|
+
"success": false,
|
|
82
|
+
"image": null,
|
|
83
|
+
"error": "Upstream 401: Invalid API key",
|
|
84
|
+
"error_type": "api_error",
|
|
85
|
+
"model": "...",
|
|
86
|
+
"provider": "..."
|
|
87
|
+
}
|
|
88
|
+
```
|
|
89
|
+
|
|
90
|
+
Common `error_type` values: `not_configured`, `auth_required`, `network_error`, `api_error`, `empty_response`. Tell the user the error plainly; if it's `auth_required`, or an `api_error` whose message reports an upstream 401/403, point them at settings to fix the api_key.
|
|
91
|
+
|
|
92
|
+
## Step 3 — Show the image
|
|
93
|
+
|
|
94
|
+
`Read` does NOT show the image to the user — it only feeds it into your own context. To make the user actually see it, write a markdown tag in your reply:
|
|
95
|
+
|
|
96
|
+
```markdown
|
|
97
|
+

|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
Take the `image` field from the response and prefix it with `file://`. Because the path is absolute and already starts with `/`, the resulting URL has three slashes (`file:///abs/path/...`).
|
|
101
|
+
|
|
102
|
+
If you're also embedding it in a document (README, PPT, etc.), use a relative path: ``.
|
|
103
|
+
|
|
104
|
+
## Prompt writing tips
|
|
105
|
+
|
|
106
|
+
A good image prompt has 4 layers, in this order:
|
|
107
|
+
|
|
108
|
+
1. **Subject** — what is in the image, concretely. ("a golden retriever puppy", "a stylized icon of a rocket")
|
|
109
|
+
2. **Style / medium** — photo / illustration / 3D render / watercolor / flat vector / line art
|
|
110
|
+
3. **Composition / lighting** — close-up / wide shot / overhead / soft natural light / dramatic backlight
|
|
111
|
+
4. **Mood / palette** — minimal / playful / corporate / pastel / high-contrast monochrome
|
|
112
|
+
|
|
113
|
+
For PPT / slide decks specifically:
|
|
114
|
+
- Hero / cover slides: `aspect_ratio: landscape`, prompt should emphasise "clean", "minimal", "negative space" so text overlays well
|
|
115
|
+
- Section dividers: `aspect_ratio: landscape`, abstract or pattern-style works better than literal subjects
|
|
116
|
+
- Inline figures: `aspect_ratio: square` or `portrait`, more literal subject is fine
|
|
117
|
+
|
|
118
|
+
When the user gives a vague request like "给我配张图", ask one clarifying question (subject? style?) before calling the API — each generated image costs real money.
|
|
119
|
+
|
|
120
|
+
## When NOT to use this skill
|
|
121
|
+
|
|
122
|
+
- The user asks to **edit** an existing image (this skill is text-to-image only today)
|
|
123
|
+
- The user wants a **diagram / chart** with specific data — use a charting library (matplotlib, mermaid, etc.) instead; image gen is for illustrations, not data viz
|
|
124
|
+
- The user asks for **screenshots** of real software — use the browser tool
|
|
125
|
+
|
|
126
|
+
## Future modalities
|
|
127
|
+
|
|
128
|
+
The same `/api/media/` namespace will gain `video` and `audio` endpoints. The pattern is identical: the user configures `type=video` / `type=audio` models in settings, and this skill (or its successor) calls the matching endpoint.
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "fileutils"
|
|
4
|
+
require "base64"
|
|
5
|
+
require "securerandom"
|
|
6
|
+
|
|
7
|
+
module Clacky
|
|
8
|
+
module Media
|
|
9
|
+
# Abstract base for media (image / video / audio) generation providers.
|
|
10
|
+
#
|
|
11
|
+
# Subclasses implement #generate_image (and later #generate_video,
|
|
12
|
+
# #generate_audio). The base class supplies the uniform success/error
|
|
13
|
+
# response shape and the on-disk persistence helper, mirroring the
|
|
14
|
+
# design used by Hermes' image_gen_provider so the surface stays
|
|
15
|
+
# learnable across modalities.
|
|
16
|
+
class Base
|
|
17
|
+
# @param model_entry [Hash] one entry from AgentConfig#models — must
|
|
18
|
+
# include "model", "base_url", "api_key" keys.
|
|
19
|
+
def initialize(model_entry)
|
|
20
|
+
@model_entry = model_entry
|
|
21
|
+
@model = model_entry["model"]
|
|
22
|
+
@base_url = model_entry["base_url"]
|
|
23
|
+
@api_key = model_entry["api_key"]
|
|
24
|
+
end
|
|
25
|
+
|
|
26
|
+
# @return [Hash] either success_response(...) or error_response(...)
|
|
27
|
+
def generate_image(prompt:, aspect_ratio: "landscape", output_dir: nil, **_kwargs)
|
|
28
|
+
raise NotImplementedError, "#{self.class.name} must implement #generate_image"
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
# Persist a base64-encoded image under <output_dir>/assets/generated/.
|
|
32
|
+
# Returns the absolute path on disk.
|
|
33
|
+
private def save_b64_image(b64_data, output_dir:, prefix: "img", extension: "png")
|
|
34
|
+
target_dir = File.join(output_dir, "assets", "generated")
|
|
35
|
+
FileUtils.mkdir_p(target_dir)
|
|
36
|
+
ts = Time.now.strftime("%Y%m%d_%H%M%S")
|
|
37
|
+
short = SecureRandom.hex(4)
|
|
38
|
+
path = File.join(target_dir, "#{prefix}_#{ts}_#{short}.#{extension}")
|
|
39
|
+
File.binwrite(path, Base64.decode64(b64_data))
|
|
40
|
+
path
|
|
41
|
+
end
|
|
42
|
+
|
|
43
|
+
private def success_response(image:, prompt:, aspect_ratio:, provider:, extra: {})
|
|
44
|
+
{
|
|
45
|
+
"success" => true,
|
|
46
|
+
"image" => image,
|
|
47
|
+
"model" => @model,
|
|
48
|
+
"prompt" => prompt,
|
|
49
|
+
"aspect_ratio" => aspect_ratio,
|
|
50
|
+
"provider" => provider
|
|
51
|
+
}.merge(extra)
|
|
52
|
+
end
|
|
53
|
+
|
|
54
|
+
private def error_response(error:, error_type: "provider_error", provider: "", prompt: "", aspect_ratio: "landscape")
|
|
55
|
+
{
|
|
56
|
+
"success" => false,
|
|
57
|
+
"image" => nil,
|
|
58
|
+
"error" => error,
|
|
59
|
+
"error_type" => error_type,
|
|
60
|
+
"model" => @model,
|
|
61
|
+
"prompt" => prompt,
|
|
62
|
+
"aspect_ratio" => aspect_ratio,
|
|
63
|
+
"provider" => provider
|
|
64
|
+
}
|
|
65
|
+
end
|
|
66
|
+
end
|
|
67
|
+
end
|
|
68
|
+
end
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require "faraday"
|
|
4
|
+
require "json"
|
|
5
|
+
require_relative "base"
|
|
6
|
+
|
|
7
|
+
module Clacky
|
|
8
|
+
module Media
|
|
9
|
+
# Native Google Gemini image generation adapter.
|
|
10
|
+
#
|
|
11
|
+
# Reserved for users who configure a direct Google AI Studio base_url
|
|
12
|
+
# (e.g. https://generativelanguage.googleapis.com) with a raw Google API
|
|
13
|
+
# key. The official endpoints are:
|
|
14
|
+
# POST /v1beta/models/<model>:generateContent — image-out via Gemini
|
|
15
|
+
# POST /v1beta/models/<model>:predict — Imagen
|
|
16
|
+
# with x-goog-api-key auth, contents[].parts[] request schema, and
|
|
17
|
+
# candidates[].content.parts[].inlineData response schema. Completely
|
|
18
|
+
# different from the OpenAI /v1/images/generations contract.
|
|
19
|
+
#
|
|
20
|
+
# Today every shipping path (openclacky gateway, OpenRouter) wraps Gemini
|
|
21
|
+
# behind an OpenAI-compatible facade, so OpenAICompat handles them and
|
|
22
|
+
# this class is intentionally a stub. We surface a clear error rather
|
|
23
|
+
# than silently 404 against Google's actual host.
|
|
24
|
+
class Gemini < Base
|
|
25
|
+
def generate_image(prompt:, aspect_ratio: "landscape", output_dir: nil, **_kwargs)
|
|
26
|
+
error_response(
|
|
27
|
+
error: "Direct Google AI Studio (generativelanguage.googleapis.com) image generation is not yet supported. Use the openclacky or OpenRouter gateway instead — set base_url to https://api.openclacky.com or https://openrouter.ai/api/v1 and pick a Gemini image model (e.g. or-gemini-3-pro-image, google/gemini-3-pro-image-preview).",
|
|
28
|
+
error_type: "not_implemented",
|
|
29
|
+
provider: "gemini-direct",
|
|
30
|
+
prompt: prompt,
|
|
31
|
+
aspect_ratio: aspect_ratio
|
|
32
|
+
)
|
|
33
|
+
end
|
|
34
|
+
end
|
|
35
|
+
end
|
|
36
|
+
end
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require_relative "openai_compat"
|
|
4
|
+
require_relative "gemini"
|
|
5
|
+
|
|
6
|
+
module Clacky
|
|
7
|
+
module Media
|
|
8
|
+
# Top-level dispatcher: takes an AgentConfig and a request, picks the
|
|
9
|
+
# right provider class based on the configured image model's base_url,
|
|
10
|
+
# and delegates.
|
|
11
|
+
#
|
|
12
|
+
# Adding a new modality (video / audio) means:
|
|
13
|
+
# 1. add a generate_<modality> method here that resolves the correct
|
|
14
|
+
# type=<modality> entry and class
|
|
15
|
+
# 2. add a provider class under lib/clacky/media/ implementing the call
|
|
16
|
+
class Generator
|
|
17
|
+
# Hosts that speak the native Google AI Studio API instead of an
|
|
18
|
+
# OpenAI-compatible facade. Matched as a substring against the
|
|
19
|
+
# configured base_url so any regional / staging variant is caught.
|
|
20
|
+
GOOGLE_NATIVE_HOSTS = [
|
|
21
|
+
"generativelanguage.googleapis.com",
|
|
22
|
+
"aiplatform.googleapis.com"
|
|
23
|
+
].freeze
|
|
24
|
+
|
|
25
|
+
# @param agent_config [Clacky::AgentConfig]
|
|
26
|
+
def initialize(agent_config)
|
|
27
|
+
@agent_config = agent_config
|
|
28
|
+
end
|
|
29
|
+
|
|
30
|
+
# @return [Hash, nil] the custom or auto-derived type=image model entry, or nil if none is available
|
|
31
|
+
def image_model_entry
|
|
32
|
+
@agent_config.find_model_by_type("image")
|
|
33
|
+
end
|
|
34
|
+
|
|
35
|
+
def generate_image(prompt:, aspect_ratio: "landscape", output_dir: nil, **kwargs)
|
|
36
|
+
entry = image_model_entry
|
|
37
|
+
if entry.nil?
|
|
38
|
+
return {
|
|
39
|
+
"success" => false,
|
|
40
|
+
"image" => nil,
|
|
41
|
+
"error" => "No image model configured. Add a model with type=image in settings.",
|
|
42
|
+
"error_type" => "not_configured",
|
|
43
|
+
"provider" => "",
|
|
44
|
+
"model" => "",
|
|
45
|
+
"prompt" => prompt
|
|
46
|
+
}
|
|
47
|
+
end
|
|
48
|
+
|
|
49
|
+
provider = build_provider_for(entry)
|
|
50
|
+
provider.generate_image(
|
|
51
|
+
prompt: prompt,
|
|
52
|
+
aspect_ratio: aspect_ratio,
|
|
53
|
+
output_dir: output_dir,
|
|
54
|
+
**kwargs
|
|
55
|
+
)
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
# Pick the adapter class for a media model entry.
|
|
59
|
+
#
|
|
60
|
+
# Routing rules:
|
|
61
|
+
# • base_url points directly at a Google AI Studio host → Gemini
|
|
62
|
+
# (native /v1beta/models/<m>:generateContent schema).
|
|
63
|
+
# • everything else → OpenAICompat. This covers OpenAI itself, the
|
|
64
|
+
# openclacky gateway, OpenRouter, and any third-party proxy that
|
|
65
|
+
# re-exposes Gemini / Imagen / DALL-E behind /v1/images/generations.
|
|
66
|
+
# OpenAICompat#generate_image branches internally on model id to
|
|
67
|
+
# drop OpenAI-only params (size) when talking to Gemini families.
|
|
68
|
+
private def build_provider_for(entry)
|
|
69
|
+
url = entry["base_url"].to_s
|
|
70
|
+
if GOOGLE_NATIVE_HOSTS.any? { |host| url.include?(host) }
|
|
71
|
+
Gemini.new(entry)
|
|
72
|
+
else
|
|
73
|
+
OpenAICompat.new(entry)
|
|
74
|
+
end
|
|
75
|
+
end
|
|
76
|
+
end
|
|
77
|
+
end
|
|
78
|
+
end
|