@kolbo/kolbo-code-linux-arm64-musl 1.1.74 → 2.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/kolbo +0 -0
- package/package.json +1 -1
- package/skills/brainstorming/SKILL.md +164 -0
- package/skills/brainstorming/scripts/frame-template.html +214 -0
- package/skills/brainstorming/scripts/helper.js +88 -0
- package/skills/brainstorming/scripts/server.cjs +354 -0
- package/skills/brainstorming/scripts/start-server.sh +148 -0
- package/skills/brainstorming/scripts/stop-server.sh +56 -0
- package/skills/brainstorming/spec-document-reviewer-prompt.md +49 -0
- package/skills/brainstorming/visual-companion.md +287 -0
- package/skills/dispatching-parallel-agents/SKILL.md +182 -0
- package/skills/docx/.skillfish.json +10 -0
- package/skills/docx/SKILL.md +196 -0
- package/skills/docx/docx-js.md +350 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/skills/docx/ooxml/schemas/mce/mc.xsd +75 -0
- package/skills/docx/ooxml/schemas/microsoft/wml-2010.xsd +560 -0
- package/skills/docx/ooxml/schemas/microsoft/wml-2012.xsd +67 -0
- package/skills/docx/ooxml/schemas/microsoft/wml-2018.xsd +14 -0
- package/skills/docx/ooxml/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/skills/docx/ooxml/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/skills/docx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/skills/docx/ooxml/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/skills/docx/ooxml/scripts/pack.py +159 -0
- package/skills/docx/ooxml/scripts/unpack.py +29 -0
- package/skills/docx/ooxml/scripts/validate.py +69 -0
- package/skills/docx/ooxml/scripts/validation/__init__.py +15 -0
- package/skills/docx/ooxml/scripts/validation/base.py +951 -0
- package/skills/docx/ooxml/scripts/validation/docx.py +274 -0
- package/skills/docx/ooxml/scripts/validation/pptx.py +315 -0
- package/skills/docx/ooxml/scripts/validation/redlining.py +279 -0
- package/skills/docx/ooxml.md +599 -0
- package/skills/docx/scripts/__init__.py +1 -0
- package/skills/docx/scripts/document.py +1272 -0
- package/skills/docx/scripts/templates/comments.xml +3 -0
- package/skills/docx/scripts/templates/commentsExtended.xml +3 -0
- package/skills/docx/scripts/templates/commentsExtensible.xml +3 -0
- package/skills/docx/scripts/templates/commentsIds.xml +3 -0
- package/skills/docx/scripts/templates/people.xml +3 -0
- package/skills/docx/scripts/utilities.py +374 -0
- package/skills/executing-plans/SKILL.md +70 -0
- package/skills/finishing-a-development-branch/SKILL.md +200 -0
- package/skills/fullstack-app/SKILL.md +621 -0
- package/skills/kolbo/SKILL.md +19 -263
- package/skills/ollama-vision/SKILL.md +105 -0
- package/skills/pdf/.skillfish.json +10 -0
- package/skills/pdf/FORMS.md +205 -0
- package/skills/pdf/REFERENCE.md +612 -0
- package/skills/pdf/SKILL.md +293 -0
- package/skills/pdf/scripts/check_bounding_boxes.py +70 -0
- package/skills/pdf/scripts/check_bounding_boxes_test.py +226 -0
- package/skills/pdf/scripts/check_fillable_fields.py +12 -0
- package/skills/pdf/scripts/convert_pdf_to_images.py +35 -0
- package/skills/pdf/scripts/create_validation_image.py +41 -0
- package/skills/pdf/scripts/extract_form_field_info.py +152 -0
- package/skills/pdf/scripts/fill_fillable_fields.py +114 -0
- package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +108 -0
- package/skills/photo-studio/SKILL.md +122 -0
- package/skills/pptx/.skillfish.json +10 -0
- package/skills/pptx/SKILL.md +483 -0
- package/skills/pptx/html2pptx.md +626 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/skills/pptx/ooxml/schemas/mce/mc.xsd +75 -0
- package/skills/pptx/ooxml/schemas/microsoft/wml-2010.xsd +560 -0
- package/skills/pptx/ooxml/schemas/microsoft/wml-2012.xsd +67 -0
- package/skills/pptx/ooxml/schemas/microsoft/wml-2018.xsd +14 -0
- package/skills/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/skills/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/skills/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/skills/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/skills/pptx/ooxml/scripts/pack.py +159 -0
- package/skills/pptx/ooxml/scripts/unpack.py +29 -0
- package/skills/pptx/ooxml/scripts/validate.py +69 -0
- package/skills/pptx/ooxml/scripts/validation/__init__.py +15 -0
- package/skills/pptx/ooxml/scripts/validation/base.py +951 -0
- package/skills/pptx/ooxml/scripts/validation/docx.py +274 -0
- package/skills/pptx/ooxml/scripts/validation/pptx.py +315 -0
- package/skills/pptx/ooxml/scripts/validation/redlining.py +279 -0
- package/skills/pptx/ooxml.md +427 -0
- package/skills/pptx/scripts/html2pptx.js +995 -0
- package/skills/pptx/scripts/inventory.py +1020 -0
- package/skills/pptx/scripts/rearrange.py +231 -0
- package/skills/pptx/scripts/replace.py +385 -0
- package/skills/pptx/scripts/thumbnail.py +450 -0
- package/skills/receiving-code-review/SKILL.md +213 -0
- package/skills/requesting-code-review/SKILL.md +105 -0
- package/skills/requesting-code-review/code-reviewer.md +146 -0
- package/skills/subagent-driven-development/SKILL.md +277 -0
- package/skills/subagent-driven-development/code-quality-reviewer-prompt.md +26 -0
- package/skills/subagent-driven-development/implementer-prompt.md +113 -0
- package/skills/subagent-driven-development/spec-reviewer-prompt.md +61 -0
- package/skills/supabase/.skillfish.json +10 -0
- package/skills/supabase/SKILL.md +106 -0
- package/skills/supabase/assets/feedback-issue-template.md +17 -0
- package/skills/supabase/references/skill-feedback.md +17 -0
- package/skills/supabase-postgres-best-practices/.skillfish.json +10 -0
- package/skills/supabase-postgres-best-practices/SKILL.md +64 -0
- package/skills/supabase-postgres-best-practices/references/_contributing.md +170 -0
- package/skills/supabase-postgres-best-practices/references/_sections.md +39 -0
- package/skills/supabase-postgres-best-practices/references/_template.md +34 -0
- package/skills/supabase-postgres-best-practices/references/advanced-full-text-search.md +55 -0
- package/skills/supabase-postgres-best-practices/references/advanced-jsonb-indexing.md +49 -0
- package/skills/supabase-postgres-best-practices/references/conn-idle-timeout.md +46 -0
- package/skills/supabase-postgres-best-practices/references/conn-limits.md +44 -0
- package/skills/supabase-postgres-best-practices/references/conn-pooling.md +41 -0
- package/skills/supabase-postgres-best-practices/references/conn-prepared-statements.md +46 -0
- package/skills/supabase-postgres-best-practices/references/data-batch-inserts.md +54 -0
- package/skills/supabase-postgres-best-practices/references/data-n-plus-one.md +53 -0
- package/skills/supabase-postgres-best-practices/references/data-pagination.md +50 -0
- package/skills/supabase-postgres-best-practices/references/data-upsert.md +50 -0
- package/skills/supabase-postgres-best-practices/references/lock-advisory.md +56 -0
- package/skills/supabase-postgres-best-practices/references/lock-deadlock-prevention.md +68 -0
- package/skills/supabase-postgres-best-practices/references/lock-short-transactions.md +50 -0
- package/skills/supabase-postgres-best-practices/references/lock-skip-locked.md +54 -0
- package/skills/supabase-postgres-best-practices/references/monitor-explain-analyze.md +45 -0
- package/skills/supabase-postgres-best-practices/references/monitor-pg-stat-statements.md +55 -0
- package/skills/supabase-postgres-best-practices/references/monitor-vacuum-analyze.md +55 -0
- package/skills/supabase-postgres-best-practices/references/query-composite-indexes.md +44 -0
- package/skills/supabase-postgres-best-practices/references/query-covering-indexes.md +40 -0
- package/skills/supabase-postgres-best-practices/references/query-index-types.md +48 -0
- package/skills/supabase-postgres-best-practices/references/query-missing-indexes.md +43 -0
- package/skills/supabase-postgres-best-practices/references/query-partial-indexes.md +45 -0
- package/skills/supabase-postgres-best-practices/references/schema-constraints.md +80 -0
- package/skills/supabase-postgres-best-practices/references/schema-data-types.md +46 -0
- package/skills/supabase-postgres-best-practices/references/schema-foreign-key-indexes.md +59 -0
- package/skills/supabase-postgres-best-practices/references/schema-lowercase-identifiers.md +55 -0
- package/skills/supabase-postgres-best-practices/references/schema-partitioning.md +55 -0
- package/skills/supabase-postgres-best-practices/references/schema-primary-keys.md +61 -0
- package/skills/supabase-postgres-best-practices/references/security-privileges.md +54 -0
- package/skills/supabase-postgres-best-practices/references/security-rls-basics.md +50 -0
- package/skills/supabase-postgres-best-practices/references/security-rls-performance.md +57 -0
- package/skills/supabase-quickstart/SKILL.md +400 -0
- package/skills/systematic-debugging/CREATION-LOG.md +119 -0
- package/skills/systematic-debugging/SKILL.md +296 -0
- package/skills/systematic-debugging/condition-based-waiting-example.ts +158 -0
- package/skills/systematic-debugging/condition-based-waiting.md +115 -0
- package/skills/systematic-debugging/defense-in-depth.md +122 -0
- package/skills/systematic-debugging/find-polluter.sh +63 -0
- package/skills/systematic-debugging/root-cause-tracing.md +169 -0
- package/skills/systematic-debugging/test-academic.md +14 -0
- package/skills/systematic-debugging/test-pressure-1.md +58 -0
- package/skills/systematic-debugging/test-pressure-2.md +68 -0
- package/skills/systematic-debugging/test-pressure-3.md +69 -0
- package/skills/test-driven-development/SKILL.md +371 -0
- package/skills/test-driven-development/testing-anti-patterns.md +299 -0
- package/skills/using-git-worktrees/SKILL.md +218 -0
- package/skills/using-superpowers/SKILL.md +115 -0
- package/skills/using-superpowers/references/codex-tools.md +100 -0
- package/skills/using-superpowers/references/gemini-tools.md +33 -0
- package/skills/verification-before-completion/SKILL.md +139 -0
- package/skills/video-production/SKILL.md +8 -7
- package/skills/writing-plans/SKILL.md +152 -0
- package/skills/writing-plans/plan-document-reviewer-prompt.md +49 -0
- package/skills/writing-skills/SKILL.md +655 -0
- package/skills/writing-skills/anthropic-best-practices.md +1150 -0
- package/skills/writing-skills/examples/CLAUDE_MD_TESTING.md +189 -0
- package/skills/writing-skills/graphviz-conventions.dot +172 -0
- package/skills/writing-skills/persuasion-principles.md +187 -0
- package/skills/writing-skills/render-graphs.js +168 -0
- package/skills/writing-skills/testing-skills-with-subagents.md +384 -0
- package/skills/xlsx/.skillfish.json +10 -0
- package/skills/xlsx/SKILL.md +288 -0
- package/skills/xlsx/recalc.py +178 -0
- package/skills/color-grading/SKILL.md +0 -152
- package/skills/ffmpeg-patterns/SKILL.md +0 -240
- package/skills/image-prompting-guide/SKILL.md +0 -143
- package/skills/music-prompting/SKILL.md +0 -146
- package/skills/production-review/SKILL.md +0 -152
- package/skills/short-form-video/SKILL.md +0 -168
- package/skills/sound-design/SKILL.md +0 -154
- package/skills/storytelling/SKILL.md +0 -139
- package/skills/subtitle-production/SKILL.md +0 -244
- package/skills/subtitle-production/reference/burn_to_video.py +0 -222
- package/skills/subtitle-production/reference/export_srts.py +0 -127
- package/skills/subtitle-production/reference/gen_srt.py +0 -42
- package/skills/typography-video/SKILL.md +0 -182
- package/skills/typography-video/reference/KineticTitleScene.tsx +0 -345
- package/skills/video-editing/SKILL.md +0 -128
- package/skills/video-prompting-guide/SKILL.md +0 -268
package/skills/kolbo/SKILL.md
CHANGED
|
@@ -1,78 +1,25 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: kolbo
|
|
3
|
-
description: Generate
|
|
3
|
+
description: Generate or analyze creative media through Kolbo AI. Load this skill whenever the user asks to create, edit, prompt, or analyze images, videos, music, speech, or sound effects — or to list available AI models / check credit balance. It contains the MCP tool workflow and the prompt-engineering rules for each media type.
|
|
4
4
|
---
|
|
5
5
|
|
|
6
|
-
# Kolbo AI — Creative Generation
|
|
6
|
+
# Kolbo AI — Creative Generation & Analysis
|
|
7
7
|
|
|
8
8
|
You have direct access to the Kolbo AI creative platform via MCP tools (auto-configured by `kolbo auth login`). Use them to generate and deliver real content — do NOT just describe what you would create.
|
|
9
9
|
|
|
10
10
|
## Available MCP Tools
|
|
11
11
|
|
|
12
|
-
### Generation
|
|
13
|
-
|
|
14
|
-
| Tool | Description |
|
|
15
|
-
|------|-------------|
|
|
16
|
-
| `generate_image` | Create images from text prompts. Supports Visual DNA, moodboards, reference images, batch generation, web-search grounding. |
|
|
17
|
-
| `generate_image_edit` | Edit/transform an existing image (background removal, color changes, compositing). Pass source images + edit prompt. |
|
|
18
|
-
| `generate_creative_director` | Generate a coordinated multi-scene set (1–8 scenes) from one creative brief. Ideal for storyboards, ad campaigns, product showcases. Supports image and video modes. |
|
|
19
|
-
| `generate_video` | Create videos from text prompts. Supports Visual DNA and reference images for consistency. |
|
|
20
|
-
| `generate_video_from_image` | Animate a still image into video. Prompt describes the motion, not the subject. |
|
|
21
|
-
| `generate_video_from_video` | Restyle/transform an existing video (style transfer, scene restyling, subject swap). Keeps the original motion. |
|
|
22
|
-
| `generate_elements` | Generate video from reference assets (images/videos) + prompt. Use when animating specific uploaded assets. |
|
|
23
|
-
| `generate_first_last_frame` | Generate video that morphs from a first frame to a last frame (keyframe interpolation). |
|
|
24
|
-
| `generate_lipsync` | Lipsync an audio track to a source image or video face. Accepts local files or URLs. |
|
|
25
|
-
| `generate_music` | Create music from descriptions. Supports instrumental, custom lyrics, style, vocal gender. |
|
|
26
|
-
| `generate_speech` | Convert text to speech (TTS). Default: ElevenLabs. Use `list_voices` to pick a voice. |
|
|
27
|
-
| `generate_sound` | Generate sound effects from descriptions (foley, ambient, impacts, UI sounds). |
|
|
28
|
-
| `generate_3d` | Generate 3D models from text, single image, or multi-view images. Returns GLB, FBX, OBJ, USDZ. |
|
|
29
|
-
|
|
30
|
-
### Transcription & Analysis
|
|
31
|
-
|
|
32
|
-
| Tool | Description |
|
|
33
|
-
|------|-------------|
|
|
34
|
-
| `transcribe_audio` | Transcribe audio or video into text + SRT subtitles + word-by-word SRT. Accepts local files or URLs. |
|
|
35
|
-
|
|
36
|
-
### Voice & Model Discovery
|
|
37
|
-
|
|
38
12
|
| Tool | Description |
|
|
39
13
|
|------|-------------|
|
|
14
|
+
| `generate_image` | Create images from text prompts. Returns image URL(s). |
|
|
15
|
+
| `generate_video` | Create videos from text. Returns video URL. |
|
|
16
|
+
| `generate_video_from_image` | Animate a still image into video. Returns video URL. |
|
|
17
|
+
| `generate_music` | Create music from descriptions. Returns audio URL. |
|
|
18
|
+
| `generate_speech` | Convert text to speech. Returns audio URL. |
|
|
19
|
+
| `generate_sound` | Generate sound effects. Returns audio URL. |
|
|
40
20
|
| `list_models` | Browse available AI models filtered by type. |
|
|
41
|
-
| `list_voices` | List available TTS voices with filtering by provider, language, gender. |
|
|
42
21
|
| `check_credits` | Check remaining Kolbo credit balance. |
|
|
43
|
-
| `get_generation_status` | Poll status of an in-progress generation by ID |
|
|
44
|
-
|
|
45
|
-
### Media Library
|
|
46
|
-
|
|
47
|
-
| Tool | Description |
|
|
48
|
-
|------|-------------|
|
|
49
|
-
| `upload_media` | Upload a local file or URL to the user's Kolbo media library (CDN). Use for multi-tool workflows. |
|
|
50
|
-
| `list_media` | Browse user's uploaded media with filtering by type and search. |
|
|
51
|
-
|
|
52
|
-
### Visual DNA (Character/Style Consistency)
|
|
53
|
-
|
|
54
|
-
| Tool | Description |
|
|
55
|
-
|------|-------------|
|
|
56
|
-
| `create_visual_dna` | Create a Visual DNA profile from reference images/video/audio for character, style, product, or scene consistency. |
|
|
57
|
-
| `list_visual_dnas` | List your Visual DNA profiles (id, name, type, thumbnail). |
|
|
58
|
-
| `get_visual_dna` | Fetch full profile details including system_prompt and reference images. |
|
|
59
|
-
| `delete_visual_dna` | Delete a Visual DNA profile. |
|
|
60
|
-
|
|
61
|
-
### Moodboards & Presets
|
|
62
|
-
|
|
63
|
-
| Tool | Description |
|
|
64
|
-
|------|-------------|
|
|
65
|
-
| `list_moodboards` | List available moodboards (personal, system presets, org). |
|
|
66
|
-
| `get_moodboard` | Fetch a moodboard's master_prompt, style_guide, and images. |
|
|
67
|
-
| `list_presets` | Browse generation presets (image/video/music templates with bundled style direction). |
|
|
68
|
-
|
|
69
|
-
### Chat
|
|
70
|
-
|
|
71
|
-
| Tool | Description |
|
|
72
|
-
|------|-------------|
|
|
73
|
-
| `chat_send_message` | Send a message to Kolbo AI chat. Supports web search and deep think modes. |
|
|
74
|
-
| `chat_list_conversations` | List your SDK chat conversations. |
|
|
75
|
-
| `chat_get_messages` | Fetch messages in a conversation (with media URLs). |
|
|
22
|
+
| `get_generation_status` | Poll status of an in-progress generation by ID. |
|
|
76
23
|
|
|
77
24
|
## Core Workflow
|
|
78
25
|
|
|
@@ -87,96 +34,21 @@ You have direct access to the Kolbo AI creative platform via MCP tools (auto-con
|
|
|
87
34
|
| Type | Use for |
|
|
88
35
|
|------|---------|
|
|
89
36
|
| `image` | Still-image generation |
|
|
90
|
-
| `image_edit` | Image editing / transformation |
|
|
91
37
|
| `video` | Text-to-video |
|
|
92
38
|
| `video_from_image` | Image-to-video animation |
|
|
93
|
-
| `lipsync` | Audio-to-face lipsync |
|
|
94
39
|
| `music` | Music generation |
|
|
95
40
|
| `speech` | Text-to-speech |
|
|
96
41
|
| `sound` | Sound effects |
|
|
97
|
-
| `three_d` | 3D model generation |
|
|
98
42
|
|
|
99
43
|
### Cost Awareness
|
|
100
44
|
|
|
101
45
|
Creative generations bill against the user's Kolbo credit balance. Order of expense (rough):
|
|
102
|
-
- **Cheap & fast**: speech (~5-30s), sound effects (~5-30s), image (~10-30s)
|
|
103
|
-
- **Medium**: music (~30s-2min)
|
|
104
|
-
- **Expensive**: video (~1-5min, highest credit cost)
|
|
46
|
+
- **Cheap & fast**: speech (~5-30s), sound effects (~5-30s), image (~10-30s)
|
|
47
|
+
- **Medium**: music (~30s-2min)
|
|
48
|
+
- **Expensive**: video (~1-5min, highest credit cost)
|
|
105
49
|
|
|
106
50
|
Rule of thumb: confirm intent before firing off a video generation unless the user was explicit. For images, just generate.
|
|
107
51
|
|
|
108
|
-
### Rate Limiting
|
|
109
|
-
Kolbo enforces **10 generation requests per minute per user per tool type** (e.g. 10 image calls + 10 video calls = fine, but 11 image calls in 1 minute = rate limited). General media requests are capped at **300 per minute**.
|
|
110
|
-
|
|
111
|
-
When making multiple generation calls:
|
|
112
|
-
- **Stagger calls** — do NOT fire all in parallel. Space them ~5-10 seconds apart.
|
|
113
|
-
- **Batch images**: use `generate_creative_director` instead of calling `generate_image` 5+ times — it handles multi-scene in one request.
|
|
114
|
-
- If you get a rate limit error (429), wait 60 seconds (the window resets per minute) and retry. Do not retry more than 2 times.
|
|
115
|
-
|
|
116
|
-
---
|
|
117
|
-
|
|
118
|
-
## Transcription & Audio/Video Analysis
|
|
119
|
-
|
|
120
|
-
Use `transcribe_audio` whenever the user provides an audio or video file and wants:
|
|
121
|
-
- A text transcript
|
|
122
|
-
- Subtitles (SRT format)
|
|
123
|
-
- Word-by-word timed subtitles (for karaoke, motion graphics, Remotion captions, video editing)
|
|
124
|
-
- Content analysis or summary of spoken content
|
|
125
|
-
- Dialogue extraction from video
|
|
126
|
-
|
|
127
|
-
### Workflow
|
|
128
|
-
1. Call `transcribe_audio` with the `source` (URL or absolute local file path)
|
|
129
|
-
2. The tool returns:
|
|
130
|
-
- `text` — full transcript as plain text
|
|
131
|
-
- `srt_url` — download URL for grouped SRT subtitles (configurable words-per-line)
|
|
132
|
-
- `word_by_word_srt_url` — download URL for **word-by-word SRT** (one word per subtitle entry with precise timestamps from ElevenLabs Scribe v2)
|
|
133
|
-
- `txt_url` — download URL for plain text file
|
|
134
|
-
- `duration` — audio duration in seconds
|
|
135
|
-
3. Analyze the transcript text as needed (summarize, translate, extract topics, answer questions about content)
|
|
136
|
-
|
|
137
|
-
### Supported Formats
|
|
138
|
-
- **Audio**: mp3, wav, m4a, flac, aac
|
|
139
|
-
- **Video** (extracts audio track): mp4, mov, webm, mkv, avi, m4v
|
|
140
|
-
|
|
141
|
-
### Word-by-Word Transcription
|
|
142
|
-
The `word_by_word_srt_url` contains an SRT file where each subtitle entry is a **single word** with precise start/end timestamps (powered by ElevenLabs Scribe v2). This is ideal for:
|
|
143
|
-
- **Karaoke-style captions** — highlight one word at a time
|
|
144
|
-
- **Remotion/motion graphics** — animate text word-by-word synced to audio
|
|
145
|
-
- **Video editing** — precise cut points aligned to speech
|
|
146
|
-
- **Accessibility** — word-level navigation for hearing-impaired users
|
|
147
|
-
|
|
148
|
-
The regular `srt_url` groups words into readable subtitle lines (default 12 words per line, up to 2 lines per subtitle).
|
|
149
|
-
|
|
150
|
-
### Use Cases & Examples
|
|
151
|
-
- "Transcribe this podcast" → `transcribe_audio` with the audio URL
|
|
152
|
-
- "What's being said in this video?" → `transcribe_audio` → analyze the returned text
|
|
153
|
-
- "Generate subtitles for my video" → `transcribe_audio` → share the `srt_url`
|
|
154
|
-
- "I need word-by-word timing for this audio" → `transcribe_audio` → share `word_by_word_srt_url`
|
|
155
|
-
- "Summarize this meeting recording" → `transcribe_audio` → summarize the text
|
|
156
|
-
- "Extract key points from this lecture" → `transcribe_audio` → analyze and extract
|
|
157
|
-
|
|
158
|
-
### Long Content
|
|
159
|
-
Transcription supports files up to 30 minutes. For longer content, split the file first or provide segments.
|
|
160
|
-
|
|
161
|
-
### Visual Video/Audio Analysis (what's happening, not just what's said)
|
|
162
|
-
`transcribe_audio` only extracts **speech**. If the user wants to understand **what's visually happening** in a video (scenes, actions, objects, on-screen text) or needs a multimodal AI to reason about the content, use `chat_send_message` with a video-capable model instead.
|
|
163
|
-
|
|
164
|
-
**Video-capable models**: `gemini-2.5-pro`, `gemini-2.5-flash` — these can watch video and analyze visual content.
|
|
165
|
-
|
|
166
|
-
**Workflow for visual analysis:**
|
|
167
|
-
1. Upload the video with `upload_media` to get a stable CDN URL
|
|
168
|
-
2. Call `chat_send_message` with the video URL in the message and a video-capable model (e.g. `gemini-2.5-pro`)
|
|
169
|
-
3. Ask your analysis question: "Describe what happens in this video", "What products are shown?", "Summarize the key scenes"
|
|
170
|
-
|
|
171
|
-
**When to use which:**
|
|
172
|
-
|
|
173
|
-
| User intent | Tool |
|
|
174
|
-
|-------------|------|
|
|
175
|
-
| "Transcribe this" / "What's being said?" | `transcribe_audio` |
|
|
176
|
-
| "Generate subtitles" / "Word-by-word timing" | `transcribe_audio` |
|
|
177
|
-
| "What's happening in this video?" / "Describe the scenes" | `chat_send_message` + Gemini |
|
|
178
|
-
| "Analyze this video and transcribe it" | Both — `transcribe_audio` for text + `chat_send_message` for visual |
|
|
179
|
-
|
|
180
52
|
---
|
|
181
53
|
|
|
182
54
|
## Image Prompts
|
|
@@ -189,36 +61,14 @@ Transcription supports files up to 30 minutes. For longer content, split the fil
|
|
|
189
61
|
- **`enhance_prompt: true`** (default) will improve most prompts automatically. Turn it off only if the user's prompt is already fully engineered or they want literal wording.
|
|
190
62
|
|
|
191
63
|
### Image Editing (image-to-image)
|
|
192
|
-
|
|
193
|
-
Use `generate_image_edit` when the user wants to modify an existing image. Pass the source image URL(s) in `source_images` and describe the change in `prompt`.
|
|
194
|
-
|
|
64
|
+
When the model can see the uploaded image, describe the **change**, not the unchanged parts.
|
|
195
65
|
- Good: "Turn the sky orange and add drifting clouds"
|
|
196
66
|
- Bad: "A mountain landscape with an orange sky and drifting clouds" (re-describes what's already in the image)
|
|
197
67
|
|
|
198
68
|
Simple edits deserve simple prompts. Only elaborate for genuinely complex, multi-step transformations.
|
|
199
69
|
|
|
200
70
|
### Multi-Scene / Campaigns
|
|
201
|
-
For storyboards, campaigns, or character-consistent sequences,
|
|
202
|
-
|
|
203
|
-
In the CLI, you can also do sequential `generate_image` calls with the same Visual DNA profiles.
|
|
204
|
-
|
|
205
|
-
---
|
|
206
|
-
|
|
207
|
-
## Visual DNA (Character/Style Consistency)
|
|
208
|
-
|
|
209
|
-
Visual DNA profiles capture the visual "identity" of a character, style, product, or scene from reference media.
|
|
210
|
-
|
|
211
|
-
### Workflow
|
|
212
|
-
1. **Create** a profile with `create_visual_dna` — provide reference images (max 4), optionally video and audio
|
|
213
|
-
2. **Types**: `character` (default), `style`, `product`, `scene`
|
|
214
|
-
3. **Use** the profile by passing its `id` in `visual_dna_ids` when calling any generation tool
|
|
215
|
-
4. **List/inspect** profiles with `list_visual_dnas` / `get_visual_dna`
|
|
216
|
-
|
|
217
|
-
### When to Use
|
|
218
|
-
- User wants the same character across multiple images/videos
|
|
219
|
-
- User wants a consistent brand style across a campaign
|
|
220
|
-
- User references "keep the same look" or "same character"
|
|
221
|
-
- User provides reference photos of a person/product to maintain consistency
|
|
71
|
+
For storyboards, campaigns, or character-consistent sequences, call `generate_image` once per scene with the same base style cues carried across prompts. Kolbo's web app has a dedicated Creative Director feature for this; in the CLI the workflow is sequential `generate_image` calls.
|
|
222
72
|
|
|
223
73
|
---
|
|
224
74
|
|
|
@@ -239,20 +89,6 @@ The model can see the starting frame. Describe **what happens**, not what the im
|
|
|
239
89
|
- Good: "Slow dolly-in on the subject. Her hair drifts in a light breeze. Soft particles float through the air. [6s]"
|
|
240
90
|
- Bad: "A woman with long brown hair standing in a forest, wearing a red dress, with golden sunlight..." (re-describes the image)
|
|
241
91
|
|
|
242
|
-
### Video-to-Video (Restyle)
|
|
243
|
-
Use `generate_video_from_video` to restyle an existing video. Describe the **new style**, not the original content — the model preserves the original motion.
|
|
244
|
-
- Good: "Transform into anime style with cel-shading and vibrant colors"
|
|
245
|
-
- Bad: "A person walking down a street" (re-describes what's already in the video)
|
|
246
|
-
|
|
247
|
-
### Elements (Reference Assets → Video)
|
|
248
|
-
Use `generate_elements` when the user has specific assets (product photos, character references) they want animated into a video. Pass them as `reference_images` (URLs) or `files` (local paths).
|
|
249
|
-
|
|
250
|
-
### First/Last Frame (Keyframe Interpolation)
|
|
251
|
-
Use `generate_first_last_frame` when the user provides two keyframes and wants the model to create a smooth transition between them.
|
|
252
|
-
|
|
253
|
-
### Lipsync
|
|
254
|
-
Use `generate_lipsync` to sync audio to a face in an image or video. Both `source` (face) and `audio` accept URLs or local file paths.
|
|
255
|
-
|
|
256
92
|
### Camera Vocabulary
|
|
257
93
|
|
|
258
94
|
Pick what fits the mood. Every shot gets at least one.
|
|
@@ -314,17 +150,6 @@ Format: `extreme slow-motion [Xs] — [micro-movements in ultra slow-mo] — sna
|
|
|
314
150
|
|
|
315
151
|
---
|
|
316
152
|
|
|
317
|
-
## 3D Generation
|
|
318
|
-
|
|
319
|
-
Use `generate_3d` for creating 3D models. Three modes:
|
|
320
|
-
- **Text mode**: prompt-only (e.g., "a medieval sword with ornate handle")
|
|
321
|
-
- **Single image mode**: one reference image + optional prompt
|
|
322
|
-
- **Multi-view mode**: 2+ reference images for higher-quality reconstruction
|
|
323
|
-
|
|
324
|
-
Returns downloadable model files in GLB, FBX, OBJ, and USDZ formats. Use `list_models` with `type: "three_d"` to discover available models.
|
|
325
|
-
|
|
326
|
-
---
|
|
327
|
-
|
|
328
153
|
## Music Prompts
|
|
329
154
|
|
|
330
155
|
Describe **genre → mood → instrumentation → tempo → era**, in that order.
|
|
@@ -339,10 +164,10 @@ Describe **genre → mood → instrumentation → tempo → era**, in that order
|
|
|
339
164
|
|
|
340
165
|
## Speech (TTS)
|
|
341
166
|
|
|
342
|
-
- Call `
|
|
343
|
-
-
|
|
344
|
-
- For multilingual content, pick a voice that supports the target language.
|
|
167
|
+
- Call `list_models` with `type: speech` to get voice identifiers. Pass the `identifier` as `model` for a consistent voice.
|
|
168
|
+
- The voice **is** the model for speech — there is no separate voice parameter.
|
|
345
169
|
- For long text, split at natural sentence boundaries. Each generation has a character cap; chunk long-form content into multiple calls.
|
|
170
|
+
- For multilingual content, pick a voice that supports the target language from `list_models`.
|
|
346
171
|
|
|
347
172
|
---
|
|
348
173
|
|
|
@@ -354,35 +179,6 @@ Describe **genre → mood → instrumentation → tempo → era**, in that order
|
|
|
354
179
|
|
|
355
180
|
---
|
|
356
181
|
|
|
357
|
-
## Moodboards & Presets
|
|
358
|
-
|
|
359
|
-
**Moodboards** provide style direction (master prompt + style guide + reference images). Pass a `moodboard_id` to any generation tool to apply its style.
|
|
360
|
-
- `list_moodboards` to browse available options
|
|
361
|
-
- `get_moodboard` to see full details before applying
|
|
362
|
-
|
|
363
|
-
**Presets** bundle prompt templates + style direction for specific creative looks. Pass a `preset_id` to generation tools.
|
|
364
|
-
- `list_presets` with optional `type` filter ("image", "video", "music", "text_to_video")
|
|
365
|
-
|
|
366
|
-
---
|
|
367
|
-
|
|
368
|
-
## Media Library
|
|
369
|
-
|
|
370
|
-
Use `upload_media` to upload local files or URLs to the Kolbo CDN for stable hosting. Useful when:
|
|
371
|
-
- A local file needs to be referenced in multiple generation calls
|
|
372
|
-
- You want a permanent CDN URL instead of an ephemeral local path
|
|
373
|
-
|
|
374
|
-
Use `list_media` to browse previously uploaded content (filter by type, search by name).
|
|
375
|
-
|
|
376
|
-
---
|
|
377
|
-
|
|
378
|
-
## Chat
|
|
379
|
-
|
|
380
|
-
Use `chat_send_message` to interact with Kolbo AI models (GPT-4o, Claude, etc.) with optional web search and deep think modes. Conversations persist via `session_id` — omit to start new, pass to continue.
|
|
381
|
-
|
|
382
|
-
Use `chat_list_conversations` and `chat_get_messages` to browse conversation history.
|
|
383
|
-
|
|
384
|
-
---
|
|
385
|
-
|
|
386
182
|
## Image Analysis (when the user uploads images)
|
|
387
183
|
|
|
388
184
|
When the user shares an image and asks about it:
|
|
@@ -392,7 +188,7 @@ When the user shares an image and asks about it:
|
|
|
392
188
|
- **Extract text verbatim** when asked (OCR-style requests are fine).
|
|
393
189
|
- **Cannot identify real people.** Describe hair, clothing, pose, expression, and apparent role — but never name a specific individual, even a well-known public figure. If the user insists, decline and offer to describe instead.
|
|
394
190
|
- **Copyrighted content**: summarize and reference, don't reproduce verbatim large chunks.
|
|
395
|
-
- If the user wants an **edit** based on the analysis, hand off to `
|
|
191
|
+
- If the user wants an **edit** based on the analysis, hand off to `generate_video_from_image` (motion) or `generate_image` with an image-to-image model (visual edit) — see the Image Editing section above for prompt structure.
|
|
396
192
|
|
|
397
193
|
---
|
|
398
194
|
|
|
@@ -421,56 +217,16 @@ Full public documentation for Kolbo Code (the CLI you are running inside) lives
|
|
|
421
217
|
|
|
422
218
|
The MDX sources are in the `kolbo-docs` repo under `content/docs/kolbo-code/`. When the user's question has a concrete answer in one of those pages, cite the path and summarize — do not invent new instructions.
|
|
423
219
|
|
|
424
|
-
## Troubleshooting
|
|
425
|
-
|
|
426
|
-
### "API key is invalid or expired"
|
|
427
|
-
This usually means the CLI is sending a key to the wrong API endpoint.
|
|
428
|
-
|
|
429
|
-
**Common cause — whitelabel overlap:** if the user previously used regular `kolbo` and then switched to a whitelabel/partner CLI (e.g. `sapir`), the old API key may still be cached against the main Kolbo API. Running `kolbo` instead of the branded command (`sapir`) overwrites the MCP config with the wrong endpoint.
|
|
430
|
-
|
|
431
|
-
**Fix:** tell the user to re-authenticate with their branded CLI command:
|
|
432
|
-
```
|
|
433
|
-
sapir auth login
|
|
434
|
-
```
|
|
435
|
-
(Replace `sapir` with their actual CLI command.)
|
|
436
|
-
|
|
437
|
-
Then **restart the editor/session** so the MCP picks up the new key and endpoint.
|
|
438
|
-
|
|
439
|
-
**Important:** whitelabel users must always use their branded CLI command (e.g. `sapir`), not `kolbo`, to keep the MCP pointed at the correct API.
|
|
440
|
-
|
|
441
|
-
### MCP tools not responding or not found
|
|
442
|
-
If Kolbo tools timeout or aren't listed, the MCP server may not be wired. Tell the user to run:
|
|
443
|
-
```
|
|
444
|
-
<their-cli-command> auth login
|
|
445
|
-
```
|
|
446
|
-
This re-wires the MCP configuration automatically. Then restart the session.
|
|
447
|
-
|
|
448
|
-
### "Rate limited" (429 errors)
|
|
449
|
-
Kolbo allows 10 generation requests per minute per tool type. Wait 60 seconds and retry. Use `generate_creative_director` for batch image work instead of multiple `generate_image` calls.
|
|
450
|
-
|
|
451
|
-
---
|
|
452
|
-
|
|
453
220
|
## Examples
|
|
454
221
|
|
|
455
222
|
Natural-language triggers that should prompt this skill + a tool call:
|
|
456
223
|
|
|
457
224
|
- "Generate an image of a neon-lit Tokyo street at night" → `list_models` (image) → `generate_image`
|
|
458
|
-
- "Remove the background from this image" → `list_models` (image_edit) → `generate_image_edit`
|
|
459
|
-
- "Create a storyboard for a coffee brand ad" → `list_models` (image) → `generate_creative_director`
|
|
460
225
|
- "Create a 5-second cinematic video of ocean waves at sunset" → `list_models` (video) → `generate_video` with camera + mood guidance
|
|
461
226
|
- "Animate this product photo with a 360° orbit" → `list_models` (video_from_image) → `generate_video_from_image`
|
|
462
|
-
- "Restyle this video as anime" → `generate_video_from_video`
|
|
463
|
-
- "Make this character talk with this voiceover" → `generate_lipsync`
|
|
464
|
-
- "Create a smooth transition between these two frames" → `generate_first_last_frame`
|
|
465
227
|
- "Make a lo-fi hip hop beat, instrumental, 85 BPM" → `list_models` (music) → `generate_music`
|
|
466
|
-
- "Say this in English with a natural female voice: Welcome to Kolbo" → `
|
|
228
|
+
- "Say this in English with a natural female voice: Welcome to Kolbo" → `list_models` (speech) → `generate_speech`
|
|
467
229
|
- "Generate a door slam sound effect" → `list_models` (sound) → `generate_sound`
|
|
468
|
-
- "Create a 3D model of a medieval castle" → `list_models` (three_d) → `generate_3d`
|
|
469
|
-
- "Transcribe this podcast episode" → `transcribe_audio`
|
|
470
|
-
- "What's being said in this video?" → `transcribe_audio` → analyze the text
|
|
471
|
-
- "Generate word-by-word subtitles for this audio" → `transcribe_audio` → share `word_by_word_srt_url`
|
|
472
|
-
- "Keep the same character across all these images" → `create_visual_dna` → `generate_image` with `visual_dna_ids`
|
|
473
|
-
- "Upload this file to my media library" → `upload_media`
|
|
474
230
|
- "What video models are available?" → `list_models` (video)
|
|
475
231
|
- "How many credits do I have?" → `check_credits`
|
|
476
232
|
- "What's in this image?" (with upload) → describe per the Image Analysis section; no tool call needed unless the user asks to generate or edit
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: ollama-vision
|
|
3
|
+
description: >
|
|
4
|
+
Batch image analysis using local Ollama + gemma4 (multimodal).
|
|
5
|
+
Use when the user needs to analyze, caption, classify, or extract text from images locally —
|
|
6
|
+
free, offline, no rate limits, no API key needed.
|
|
7
|
+
Keywords: image analysis, batch images, captions, OCR, vision, gemma4, ollama, local AI
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
# Ollama Vision — Batch Image Analysis with gemma4
|
|
11
|
+
|
|
12
|
+
## Setup (already done on this machine)
|
|
13
|
+
|
|
14
|
+
- Ollama installed and running (auto-starts on Windows boot)
|
|
15
|
+
- Model: `gemma4` (9.6 GB, multimodal)
|
|
16
|
+
- Python package: `ollama` v0.6.1 installed (pip, Python 3.10)
|
|
17
|
+
- REST API available at `http://localhost:11434`
|
|
18
|
+
|
|
19
|
+
## Core Pattern
|
|
20
|
+
|
|
21
|
+
```python
|
|
22
|
+
import ollama
|
|
23
|
+
|
|
24
|
+
response = ollama.chat(model='gemma4', messages=[{
|
|
25
|
+
'role': 'user',
|
|
26
|
+
'content': 'Your prompt here',
|
|
27
|
+
'images': ['path/to/image.jpg'] # omit for text-only
|
|
28
|
+
}])
|
|
29
|
+
print(response['message']['content'])
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
## Batch Image Captioning Script
|
|
33
|
+
|
|
34
|
+
```python
|
|
35
|
+
import ollama
|
|
36
|
+
from pathlib import Path
|
|
37
|
+
import csv
|
|
38
|
+
|
|
39
|
+
def caption_images(folder: str, prompt: str = "Write a short caption for this image.", output_csv: str = "captions.csv"):
|
|
40
|
+
images_dir = Path(folder)
|
|
41
|
+
extensions = {'.jpg', '.jpeg', '.png', '.webp', '.gif', '.bmp'}
|
|
42
|
+
image_files = [f for f in images_dir.iterdir() if f.suffix.lower() in extensions]
|
|
43
|
+
|
|
44
|
+
results = []
|
|
45
|
+
for i, img_path in enumerate(image_files, 1):
|
|
46
|
+
print(f"[{i}/{len(image_files)}] Processing {img_path.name}...")
|
|
47
|
+
try:
|
|
48
|
+
response = ollama.chat(model='gemma4', messages=[{
|
|
49
|
+
'role': 'user',
|
|
50
|
+
'content': prompt,
|
|
51
|
+
'images': [str(img_path)]
|
|
52
|
+
}])
|
|
53
|
+
caption = response['message']['content'].strip()
|
|
54
|
+
results.append({'file': img_path.name, 'caption': caption})
|
|
55
|
+
print(f" → {caption[:80]}...")
|
|
56
|
+
except Exception as e:
|
|
57
|
+
print(f" ERROR: {e}")
|
|
58
|
+
results.append({'file': img_path.name, 'caption': f'ERROR: {e}'})
|
|
59
|
+
|
|
60
|
+
with open(output_csv, 'w', newline='', encoding='utf-8') as f:
|
|
61
|
+
writer = csv.DictWriter(f, fieldnames=['file', 'caption'])
|
|
62
|
+
writer.writeheader()
|
|
63
|
+
writer.writerows(results)
|
|
64
|
+
|
|
65
|
+
print(f"\nDone! Saved {len(results)} captions to {output_csv}")
|
|
66
|
+
|
|
67
|
+
# Usage
|
|
68
|
+
caption_images("./images", prompt="Describe this image in one sentence.")
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Common Prompts
|
|
72
|
+
|
|
73
|
+
| Task | Prompt |
|
|
74
|
+
|------|--------|
|
|
75
|
+
| Caption | `"Write a short, descriptive caption for this image."` |
|
|
76
|
+
| Alt text | `"Write alt text for this image for accessibility."` |
|
|
77
|
+
| Classification | `"What category does this image belong to? Reply with one word."` |
|
|
78
|
+
| OCR | `"Extract all text visible in this image."` |
|
|
79
|
+
| Product description | `"Write a product description for the item shown in this image."` |
|
|
80
|
+
| Social media | `"Write a catchy Instagram caption for this image."` |
|
|
81
|
+
|
|
82
|
+
## REST API Alternative (no `ollama` Python package needed — uses `requests`)
|
|
83
|
+
|
|
84
|
+
```python
|
|
85
|
+
import requests, base64
|
|
86
|
+
|
|
87
|
+
def analyze_image(image_path: str, prompt: str) -> str:
|
|
88
|
+
with open(image_path, "rb") as f:
|
|
89
|
+
img_b64 = base64.b64encode(f.read()).decode()
|
|
90
|
+
|
|
91
|
+
response = requests.post("http://localhost:11434/api/generate", json={
|
|
92
|
+
"model": "gemma4",
|
|
93
|
+
"prompt": prompt,
|
|
94
|
+
"images": [img_b64],
|
|
95
|
+
"stream": False
|
|
96
|
+
})
|
|
97
|
+
return response.json()["response"]
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
## Tips
|
|
101
|
+
|
|
102
|
+
- gemma4 handles JPG, PNG, WEBP, GIF, BMP
|
|
103
|
+
- For large batches, add `time.sleep(0.5)` (after `import time`) between requests to avoid overloading the local Ollama server
|
|
104
|
+
- Results are best when prompts are specific ("describe the main subject" vs "describe this")
|
|
105
|
+
- Ollama must be running — check with `ollama list` in terminal
|