EvoScientist 0.0.1.dev4__py3-none-any.whl → 0.1.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- EvoScientist/EvoScientist.py +25 -61
- EvoScientist/__init__.py +0 -19
- EvoScientist/backends.py +0 -26
- EvoScientist/cli.py +1365 -480
- EvoScientist/middleware.py +7 -56
- EvoScientist/skills/clip/SKILL.md +253 -0
- EvoScientist/skills/clip/references/applications.md +207 -0
- EvoScientist/skills/langgraph-docs/SKILL.md +36 -0
- EvoScientist/skills/tensorboard/SKILL.md +629 -0
- EvoScientist/skills/tensorboard/references/integrations.md +638 -0
- EvoScientist/skills/tensorboard/references/profiling.md +545 -0
- EvoScientist/skills/tensorboard/references/visualization.md +620 -0
- EvoScientist/skills/vllm/SKILL.md +364 -0
- EvoScientist/skills/vllm/references/optimization.md +226 -0
- EvoScientist/skills/vllm/references/quantization.md +284 -0
- EvoScientist/skills/vllm/references/server-deployment.md +255 -0
- EvoScientist/skills/vllm/references/troubleshooting.md +447 -0
- EvoScientist/stream/__init__.py +0 -25
- EvoScientist/stream/utils.py +16 -23
- EvoScientist/tools.py +2 -75
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc2.dist-info}/METADATA +8 -153
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc2.dist-info}/RECORD +26 -24
- evoscientist-0.1.0rc2.dist-info/entry_points.txt +2 -0
- EvoScientist/config.py +0 -274
- EvoScientist/llm/__init__.py +0 -21
- EvoScientist/llm/models.py +0 -99
- EvoScientist/memory.py +0 -715
- EvoScientist/onboard.py +0 -725
- EvoScientist/paths.py +0 -44
- EvoScientist/skills_manager.py +0 -391
- EvoScientist/stream/display.py +0 -604
- EvoScientist/stream/events.py +0 -415
- EvoScientist/stream/state.py +0 -343
- evoscientist-0.0.1.dev4.dist-info/entry_points.txt +0 -5
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc2.dist-info}/WHEEL +0 -0
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc2.dist-info}/licenses/LICENSE +0 -0
- {evoscientist-0.0.1.dev4.dist-info → evoscientist-0.1.0rc2.dist-info}/top_level.txt +0 -0
EvoScientist/middleware.py
CHANGED
```diff
@@ -1,80 +1,31 @@
 """Middleware configuration for the EvoScientist agent."""
 
-from __future__ import annotations
-
 from pathlib import Path
-from typing import TYPE_CHECKING
 
 from deepagents.backends import FilesystemBackend
 from deepagents.middleware.skills import SkillsMiddleware
 
-from .backends import MergedReadOnlyBackend
-from .memory import EvoMemoryMiddleware
-from .paths import MEMORY_DIR as _DEFAULT_MEMORY_DIR
-
-if TYPE_CHECKING:
-    from langchain.chat_models import BaseChatModel
-
 _DEFAULT_SKILLS_DIR = str(Path(__file__).parent / "skills")
 
 
 def create_skills_middleware(
     skills_dir: str = _DEFAULT_SKILLS_DIR,
-    workspace_dir: str = "
-    user_skills_dir: str | None = None,
+    workspace_dir: str = "./workspace/",
 ) -> SkillsMiddleware:
     """Create a SkillsMiddleware that loads skills.
 
-
-
+    All skills (system and user-installed) live in ./skills/.
+    The --user flag in install_skill.py also installs to ./skills/.
 
     Args:
-        skills_dir: Path to the
-        workspace_dir:
-        user_skills_dir: Optional explicit path for user-installed skills. If set,
-            this path is used directly instead of {workspace_dir}/skills.
+        skills_dir: Path to the skills directory
+        workspace_dir: Unused, kept for API compatibility
 
     Returns:
         Configured SkillsMiddleware instance
     """
-
-    user_skills_dir = str(Path(workspace_dir) / "skills")
-    merged = MergedReadOnlyBackend(
-        primary_dir=user_skills_dir,
-        secondary_dir=skills_dir,
-    )
+    skills_backend = FilesystemBackend(root_dir=skills_dir, virtual_mode=True)
     return SkillsMiddleware(
-        backend=
+        backend=skills_backend,
         sources=["/"],
     )
-
-
-def create_memory_middleware(
-    memory_dir: str = str(_DEFAULT_MEMORY_DIR),
-    extraction_model: BaseChatModel | None = None,
-    trigger: tuple[str, int] = ("messages", 20),
-) -> EvoMemoryMiddleware:
-    """Create an EvoMemoryMiddleware for long-term memory.
-
-    Uses a FilesystemBackend rooted at ``memory_dir`` so that memory
-    persists across threads and sessions.
-
-    Args:
-        memory_dir: Path to the shared memory directory (not per-session).
-        extraction_model: Chat model for auto-extraction (optional; if None,
-            only prompt-guided manual memory updates via edit_file will work).
-        trigger: When to auto-extract. Default: every 20 human messages.
-
-    Returns:
-        Configured EvoMemoryMiddleware instance.
-    """
-    memory_backend = FilesystemBackend(
-        root_dir=memory_dir,
-        virtual_mode=True,
-    )
-    return EvoMemoryMiddleware(
-        backend=memory_backend,
-        memory_path="/MEMORY.md",
-        extraction_model=extraction_model,
-        trigger=trigger,
-    )
```
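
For orientation, here is a minimal usage sketch of the simplified `create_skills_middleware`, based only on the signature shown in this diff; the example path is hypothetical, and how the returned `SkillsMiddleware` is wired into an agent is not shown in this file.

```python
# Minimal usage sketch based only on the signature in the diff above.
from EvoScientist.middleware import create_skills_middleware

# Default: serve the skills bundled under EvoScientist/skills/.
middleware = create_skills_middleware()

# An explicit skills directory can still be supplied; workspace_dir is now
# unused and kept only for API compatibility per the new docstring.
custom = create_skills_middleware(skills_dir="/opt/evoscientist/skills")  # hypothetical path
```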
EvoScientist/skills/clip/SKILL.md
ADDED
@@ -0,0 +1,253 @@
---
name: clip
description: OpenAI's model connecting vision and language. Enables zero-shot image classification, image-text matching, and cross-modal retrieval. Trained on 400M image-text pairs. Use for image search, content moderation, or vision-language tasks without fine-tuning. Best for general-purpose image understanding.
version: 1.0.0
author: Orchestra Research
license: MIT
tags: [Multimodal, CLIP, Vision-Language, Zero-Shot, Image Classification, OpenAI, Image Search, Cross-Modal Retrieval, Content Moderation]
dependencies: [transformers, torch, pillow]
---

# CLIP - Contrastive Language-Image Pre-Training

OpenAI's model that matches images against natural-language descriptions.

## When to use CLIP

**Use when:**
- Zero-shot image classification (no training data needed)
- Image-text similarity/matching
- Semantic image search
- Content moderation (detect NSFW, violence)
- Visual question answering
- Cross-modal retrieval (image→text, text→image)

**Metrics**:
- **25,300+ GitHub stars**
- Trained on 400M image-text pairs
- Zero-shot ImageNet accuracy comparable to a supervised ResNet-50
- MIT License

**Use alternatives instead**:
- **BLIP-2**: Better captioning
- **LLaVA**: Vision-language chat
- **Segment Anything**: Image segmentation

## Quick start

### Installation

```bash
pip install git+https://github.com/openai/CLIP.git
pip install torch torchvision ftfy regex tqdm
```

### Zero-shot classification

```python
import torch
import clip
from PIL import Image

# Load model
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Load image
image = preprocess(Image.open("photo.jpg")).unsqueeze(0).to(device)

# Define possible labels
labels = ["a dog", "a cat", "a bird", "a car"]
text = clip.tokenize(labels).to(device)

# Compute similarity
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Cosine similarity (as scaled logits)
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

# Print results
for label, prob in zip(labels, probs[0]):
    print(f"{label}: {prob:.2%}")
```

## Available models

```python
# Models (sorted by size)
models = [
    "RN50",      # ResNet-50
    "RN101",     # ResNet-101
    "ViT-B/32",  # Vision Transformer (recommended)
    "ViT-B/16",  # Better quality, slower
    "ViT-L/14",  # Best quality, slowest
]

model, preprocess = clip.load("ViT-B/32")
```

| Model | Parameters | Speed | Quality |
|-------|------------|-------|---------|
| RN50 | 102M | Fast | Good |
| ViT-B/32 | 151M | Medium | Better |
| ViT-L/14 | 428M | Slow | Best |

## Image-text similarity

```python
# Compute embeddings (assumes a single image and a single text prompt)
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    # Normalize
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Cosine similarity
similarity = (image_features @ text_features.T).item()
print(f"Similarity: {similarity:.4f}")
```

## Semantic image search

```python
# Index images
image_paths = ["img1.jpg", "img2.jpg", "img3.jpg"]
image_embeddings = []

for img_path in image_paths:
    image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = model.encode_image(image)
        embedding /= embedding.norm(dim=-1, keepdim=True)
    image_embeddings.append(embedding)

image_embeddings = torch.cat(image_embeddings)

# Search with text query
query = "a sunset over the ocean"
text_input = clip.tokenize([query]).to(device)
with torch.no_grad():
    text_embedding = model.encode_text(text_input)
    text_embedding /= text_embedding.norm(dim=-1, keepdim=True)

# Find most similar images
similarities = (text_embedding @ image_embeddings.T).squeeze(0)
top_k = similarities.topk(3)

for idx, score in zip(top_k.indices, top_k.values):
    print(f"{image_paths[idx]}: {score:.3f}")
```

## Content moderation

```python
# Define categories
categories = [
    "safe for work",
    "not safe for work",
    "violent content",
    "graphic content"
]

text = clip.tokenize(categories).to(device)

# Check image
with torch.no_grad():
    logits_per_image, _ = model(image, text)
    probs = logits_per_image.softmax(dim=-1)

# Get classification
max_idx = probs.argmax().item()
max_prob = probs[0, max_idx].item()

print(f"Category: {categories[max_idx]} ({max_prob:.2%})")
```

## Batch processing

```python
# Process multiple images
images = [preprocess(Image.open(f"img{i}.jpg")) for i in range(10)]
images = torch.stack(images).to(device)

with torch.no_grad():
    image_features = model.encode_image(images)
    image_features /= image_features.norm(dim=-1, keepdim=True)

# Batch text
texts = ["a dog", "a cat", "a bird"]
text_tokens = clip.tokenize(texts).to(device)

with torch.no_grad():
    text_features = model.encode_text(text_tokens)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Similarity matrix (10 images × 3 texts)
similarities = image_features @ text_features.T
print(similarities.shape)  # (10, 3)
```
|
|
195
|
+
|
|
196
|
+
```python
|
|
197
|
+
# Store CLIP embeddings in Chroma/FAISS
|
|
198
|
+
import chromadb
|
|
199
|
+
|
|
200
|
+
client = chromadb.Client()
|
|
201
|
+
collection = client.create_collection("image_embeddings")
|
|
202
|
+
|
|
203
|
+
# Add image embeddings
|
|
204
|
+
for img_path, embedding in zip(image_paths, image_embeddings):
|
|
205
|
+
collection.add(
|
|
206
|
+
embeddings=[embedding.cpu().numpy().tolist()],
|
|
207
|
+
metadatas=[{"path": img_path}],
|
|
208
|
+
ids=[img_path]
|
|
209
|
+
)
|
|
210
|
+
|
|
211
|
+
# Query with text
|
|
212
|
+
query = "a sunset"
|
|
213
|
+
text_embedding = model.encode_text(clip.tokenize([query]))
|
|
214
|
+
results = collection.query(
|
|
215
|
+
query_embeddings=[text_embedding.cpu().numpy().tolist()],
|
|
216
|
+
n_results=5
|
|
217
|
+
)
|
|
218
|
+
```
|
|
219
|
+
|
|
220
|
+

## Best practices

1. **Use ViT-B/32 for most cases** - Good balance
2. **Normalize embeddings** - Required for cosine similarity
3. **Batch processing** - More efficient
4. **Cache embeddings** - Expensive to recompute (see the sketch below)
5. **Use descriptive labels** - Better zero-shot performance
6. **GPU recommended** - 10-50× faster
7. **Preprocess images** - Use provided preprocess function
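
A minimal sketch of practice 4 (caching embeddings), assuming `model`, `preprocess`, and `device` are already set up as in the quick start; the cache directory and helper name are illustrative, not part of CLIP.

```python
import hashlib
from pathlib import Path

import torch
from PIL import Image

CACHE_DIR = Path("clip_cache")  # illustrative location, not mandated by CLIP
CACHE_DIR.mkdir(exist_ok=True)

def cached_image_embedding(img_path: str) -> torch.Tensor:
    """Return a normalized CLIP image embedding, reusing a cached copy if present."""
    key = hashlib.sha1(img_path.encode()).hexdigest()
    cache_file = CACHE_DIR / f"{key}.pt"
    if cache_file.exists():
        return torch.load(cache_file)
    image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
    with torch.no_grad():
        emb = model.encode_image(image)
        emb /= emb.norm(dim=-1, keepdim=True)
    torch.save(emb.cpu(), cache_file)
    return emb.cpu()
```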

## Performance

| Operation | CPU | GPU (V100) |
|-----------|-----|------------|
| Image encoding | ~200ms | ~20ms |
| Text encoding | ~50ms | ~5ms |
| Similarity compute | <1ms | <1ms |
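
The figures above are indicative; a rough timing sketch like the one below can reproduce them on your own hardware, assuming `model`, `image`, and `text` from the quick start are in scope.

```python
import time
import torch

def time_encode(fn, *args, n_runs: int = 20) -> float:
    # Warm-up run so one-time initialization is not measured.
    with torch.no_grad():
        fn(*args)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.perf_counter()
    with torch.no_grad():
        for _ in range(n_runs):
            fn(*args)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / n_runs * 1000  # ms per call

print(f"Image encoding: {time_encode(model.encode_image, image):.1f} ms")
print(f"Text encoding:  {time_encode(model.encode_text, text):.1f} ms")
```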

## Limitations

1. **Not for fine-grained tasks** - Best for broad categories
2. **Requires descriptive text** - Vague labels perform poorly
3. **Biased on web data** - May have dataset biases
4. **No bounding boxes** - Whole image only
5. **Limited spatial understanding** - Position/counting weak

## Resources

- **GitHub**: https://github.com/openai/CLIP ⭐ 25,300+
- **Paper**: https://arxiv.org/abs/2103.00020
- **Colab**: https://colab.research.google.com/github/openai/clip/
- **License**: MIT
EvoScientist/skills/clip/references/applications.md
ADDED
@@ -0,0 +1,207 @@
# CLIP Applications Guide

Practical applications and use cases for CLIP.

## Zero-shot image classification

```python
import torch
import clip
from PIL import Image

model, preprocess = clip.load("ViT-B/32")

# Define categories
categories = [
    "a photo of a dog",
    "a photo of a cat",
    "a photo of a bird",
    "a photo of a car",
    "a photo of a person"
]

# Prepare image
image = preprocess(Image.open("photo.jpg")).unsqueeze(0)
text = clip.tokenize(categories)

# Classify
with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)

    logits_per_image, _ = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

# Print results
for category, prob in zip(categories, probs[0]):
    print(f"{category}: {prob:.2%}")
```

## Semantic image search

```python
# Index images
image_database = []
image_paths = ["img1.jpg", "img2.jpg", "img3.jpg"]

for img_path in image_paths:
    image = preprocess(Image.open(img_path)).unsqueeze(0)
    with torch.no_grad():
        features = model.encode_image(image)
        features /= features.norm(dim=-1, keepdim=True)
    image_database.append((img_path, features))

# Search with text
query = "a sunset over mountains"
text_input = clip.tokenize([query])

with torch.no_grad():
    text_features = model.encode_text(text_input)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Find matches
similarities = []
for img_path, img_features in image_database:
    similarity = (text_features @ img_features.T).item()
    similarities.append((img_path, similarity))

# Sort by similarity
similarities.sort(key=lambda x: x[1], reverse=True)
for img_path, score in similarities[:3]:
    print(f"{img_path}: {score:.3f}")
```

## Content moderation

```python
# Define safety categories
categories = [
    "safe for work content",
    "not safe for work content",
    "violent or graphic content",
    "hate speech or offensive content",
    "spam or misleading content"
]

text = clip.tokenize(categories)

# Check image
with torch.no_grad():
    logits, _ = model(image, text)
    probs = logits.softmax(dim=-1)

# Get classification
max_idx = probs.argmax().item()
confidence = probs[0, max_idx].item()

if confidence > 0.7:
    print(f"Classified as: {categories[max_idx]} ({confidence:.2%})")
else:
    print(f"Uncertain classification (confidence: {confidence:.2%})")
```

## Image-to-text retrieval

```python
# Text database
captions = [
    "A beautiful sunset over the ocean",
    "A cute dog playing in the park",
    "A modern city skyline at night",
    "A delicious pizza with toppings"
]

# Encode captions
caption_features = []
for caption in captions:
    text = clip.tokenize([caption])
    with torch.no_grad():
        features = model.encode_text(text)
        features /= features.norm(dim=-1, keepdim=True)
    caption_features.append(features)

caption_features = torch.cat(caption_features)

# Find matching captions for image
with torch.no_grad():
    image_features = model.encode_image(image)
    image_features /= image_features.norm(dim=-1, keepdim=True)

similarities = (image_features @ caption_features.T).squeeze(0)
top_k = similarities.topk(3)

for idx, score in zip(top_k.indices, top_k.values):
    print(f"{captions[idx]}: {score:.3f}")
```

## Visual question answering

```python
# Create yes/no questions
image = preprocess(Image.open("photo.jpg")).unsqueeze(0)

questions = [
    "a photo showing people",
    "a photo showing animals",
    "a photo taken indoors",
    "a photo taken outdoors",
    "a photo taken during daytime",
    "a photo taken at night"
]

text = clip.tokenize(questions)

with torch.no_grad():
    logits, _ = model(image, text)
    probs = logits.softmax(dim=-1)

# Answer questions (note: probabilities are relative to the other prompts)
for question, prob in zip(questions, probs[0]):
    answer = "Yes" if prob > 0.5 else "No"
    print(f"{question}: {answer} ({prob:.2%})")
```

## Image deduplication

```python
# Detect duplicate/similar images
def compute_similarity(img1_path, img2_path):
    img1 = preprocess(Image.open(img1_path)).unsqueeze(0)
    img2 = preprocess(Image.open(img2_path)).unsqueeze(0)

    with torch.no_grad():
        feat1 = model.encode_image(img1)
        feat2 = model.encode_image(img2)

        feat1 /= feat1.norm(dim=-1, keepdim=True)
        feat2 /= feat2.norm(dim=-1, keepdim=True)

    similarity = (feat1 @ feat2.T).item()

    return similarity

# Check for duplicates
threshold = 0.95
image_pairs = [("img1.jpg", "img2.jpg"), ("img1.jpg", "img3.jpg")]

for img1, img2 in image_pairs:
    sim = compute_similarity(img1, img2)
    if sim > threshold:
        print(f"{img1} and {img2} are duplicates (similarity: {sim:.3f})")
```

## Best practices

1. **Use descriptive labels** - "a photo of X" works better than just "X" (see the sketch below)
2. **Normalize embeddings** - Always normalize for cosine similarity
3. **Batch processing** - Process multiple images/texts together
4. **Cache embeddings** - Expensive to recompute
5. **Set appropriate thresholds** - Test on validation data
6. **Use GPU** - 10-50× faster than CPU
7. **Consider model size** - ViT-B/32 good default, ViT-L/14 for best quality
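
A minimal sketch of practice 1 (descriptive labels), assuming `model` is loaded as in the zero-shot example above; the prompt templates and the simple averaging step are illustrative choices, not requirements of CLIP.

```python
import torch
import clip
# model is assumed to be loaded as in the zero-shot example: clip.load("ViT-B/32")

# Wrap bare class names in descriptive templates before tokenizing.
class_names = ["dog", "cat", "bird"]
templates = ["a photo of a {}", "a close-up photo of a {}"]

prompts = [t.format(name) for name in class_names for t in templates]
text = clip.tokenize(prompts)

with torch.no_grad():
    text_features = model.encode_text(text)
    text_features /= text_features.norm(dim=-1, keepdim=True)

# Average the per-template embeddings for each class (simple prompt ensembling),
# then renormalize so cosine similarity still applies.
text_features = text_features.reshape(len(class_names), len(templates), -1).mean(dim=1)
text_features /= text_features.norm(dim=-1, keepdim=True)
```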

## Resources

- **Paper**: https://arxiv.org/abs/2103.00020
- **GitHub**: https://github.com/openai/CLIP
- **Colab**: https://colab.research.google.com/github/openai/clip/
EvoScientist/skills/langgraph-docs/SKILL.md
ADDED
@@ -0,0 +1,36 @@
---
name: langgraph-docs
description: Use this skill for requests related to LangGraph in order to fetch relevant documentation to provide accurate, up-to-date guidance.
---

# langgraph-docs

## Overview

This skill explains how to access LangGraph Python documentation to help answer questions and guide implementation.

## Instructions

### 1. Fetch the Documentation Index

Use the fetch_url tool to read the following URL:
https://docs.langchain.com/llms.txt

This provides a structured list of all available documentation with descriptions.

### 2. Select Relevant Documentation

Based on the question, identify the 2-4 most relevant documentation URLs from the index. Prioritize:

- Specific how-to guides for implementation questions
- Core concept pages for understanding questions
- Tutorials for end-to-end examples
- Reference docs for API details

### 3. Fetch Selected Documentation

Use the fetch_url tool to read the selected documentation URLs.

### 4. Provide Accurate Guidance

After reading the documentation, complete the user's request.
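
For reference, a minimal stand-in for the fetch_url tool described above, sketched with the requests library; the helper name, the use of requests, and the naive URL filtering are assumptions for illustration, not part of this skill.

```python
import re
import requests

def fetch_url(url: str, timeout: float = 30.0) -> str:
    """Fetch a documentation page and return its text content."""
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text

# Step 1: fetch the documentation index.
index = fetch_url("https://docs.langchain.com/llms.txt")

# Step 2: pull URLs out of the index and keep LangGraph-related ones (naive filter).
urls = [u.rstrip(").,") for u in re.findall(r"https?://\S+", index)]
candidates = [u for u in urls if "langgraph" in u.lower()]

# Step 3: fetch a handful of the selected pages before answering.
pages = [fetch_url(u) for u in candidates[:3]]
```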