pictsure-0.2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pictsure-0.2/LICENSE +21 -0
- pictsure-0.2/PKG-INFO +117 -0
- pictsure-0.2/PictSure/Embeddings/ViT.py +162 -0
- pictsure-0.2/PictSure/Embeddings/__init__.py +0 -0
- pictsure-0.2/PictSure/__init__.py +26 -0
- pictsure-0.2/PictSure/model_PictSure.py +844 -0
- pictsure-0.2/PictSure/model_embeddings.py +248 -0
- pictsure-0.2/PictSure/normalization.py +231 -0
- pictsure-0.2/PictSure.egg-info/PKG-INFO +117 -0
- pictsure-0.2/PictSure.egg-info/SOURCES.txt +15 -0
- pictsure-0.2/PictSure.egg-info/dependency_links.txt +1 -0
- pictsure-0.2/PictSure.egg-info/entry_points.txt +2 -0
- pictsure-0.2/PictSure.egg-info/requires.txt +9 -0
- pictsure-0.2/PictSure.egg-info/top_level.txt +4 -0
- pictsure-0.2/README.md +73 -0
- pictsure-0.2/pyproject.toml +40 -0
- pictsure-0.2/setup.cfg +4 -0
pictsure-0.2/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Cornelius Wolff; Lukas Schiesser

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
pictsure-0.2/PKG-INFO
ADDED
@@ -0,0 +1,117 @@
Metadata-Version: 2.4
Name: PictSure
Version: 0.2
Summary: A package for generalized image classification using in-context learning with PyTorch.
Author-email: Cornelius Wolff <cornelius.wolff@cwi.nl>, Lukas Schiesser <lukas.schiesser@dfki.de>
License: MIT License

Copyright (c) 2025 Cornelius Wolff; Lukas Schiesser

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.9
Description-Content-Type: text/markdown
License-File: LICENSE
Requires-Dist: torch>=2.7.0
Requires-Dist: torchvision>=0.22.0
Requires-Dist: numpy>=1.26.4
Requires-Dist: Pillow
Requires-Dist: click>=8.1.7
Requires-Dist: tqdm>=4.66.4
Requires-Dist: requests>=2.32.3
Requires-Dist: huggingface-hub>=0.33.1
Requires-Dist: safetensors>=0.5.3
Dynamic: license-file

# PictSure: In-Context Learning for Image Classification

[](https://pepy.tech/projects/pictsure) [](https://www.alphaxiv.org/abs/2506.14842)

PictSure is a deep learning library designed for **in-context learning** with images and labels. Users provide a set of labeled reference images, and the model predicts labels for new images based on those references. This approach eliminates the need for traditional training, making it highly adaptable to various classification tasks.

<p align="center">
  <img src="images/Flow-Chart.png" alt="The classification process" width="90%" />
</p>

## Features
- **In-Context Learning**: Predict labels for new images from a set of reference images, without traditional model training.
- **Multiple Model Architectures**: Choose between ResNet- and ViT-based models for your specific needs.
- **Pretrained Models**: Use our pretrained models or train your own.
- **Torch Compatibility**: Fully integrated with PyTorch, supporting both CPU and GPU.
- **Easy-to-use CLI**: Manage models and weights through a simple command-line interface.

## Installation
```bash
pip install PictSure
```

## Quick Start
```python
from PictSure import PictSure
import torch

DEVICE = "cpu"  # or "cuda", "mps"

model = PictSure.from_pretrained("pictsure/pictsure-vit")
model = model.to(DEVICE)

# Set your reference images and labels
model.set_context_images(reference_images, reference_labels)

# Make predictions on new images
predictions = model.predict(new_images)
```
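
The snippet above assumes `reference_images`, `reference_labels`, and `new_images` are already defined. A minimal sketch of one way to prepare them, assuming `set_context_images` and `predict` accept batched image tensors and integer class labels; the expected input format and preprocessing are not documented in this diff, so the folder names, resize target, and label layout below are illustrative only:

```python
from pathlib import Path

import torch
from PIL import Image
from torchvision import transforms

# Illustrative preprocessing; the resize target and absence of
# normalization are assumptions, not PictSure requirements.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def load_folder(folder):
    """Stack all JPEGs in `folder` into one [N, 3, 224, 224] tensor."""
    paths = sorted(Path(folder).glob("*.jpg"))
    return torch.stack([preprocess(Image.open(p).convert("RGB")) for p in paths])

reference_images = load_folder("refs")                # a few labeled examples per class
reference_labels = torch.tensor([0, 0, 1, 1, 2, 2])   # one integer label per reference image
new_images = load_folder("queries")                   # images to classify
```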

## Examples
For a complete working example, check out the Jupyter notebook in the Examples directory:
```bash
Examples/example.ipynb
```
This notebook demonstrates:
- Model initialization
- Loading and preprocessing images
- Setting up reference images
- Making predictions
- Visualizing results

## Citation

If you use this work, please cite it using the following BibTeX entry:

```bibtex
@article{schiesser2025pictsure,
  title={PictSure: Pretraining Embeddings Matters for In-Context Learning Image Classifiers},
  author={Schiesser, Lukas and Wolff, Cornelius and Haas, Sophie and Pukrop, Simon},
  journal={arXiv preprint arXiv:2506.14842},
  year={2025}
}
```

## License
This project is open-source under the MIT License.

## Contributing
Contributions and suggestions are welcome! Open an issue or submit a pull request.

## Contact
For questions or support, open an issue on GitHub.
pictsure-0.2/PictSure/Embeddings/ViT.py
ADDED
@@ -0,0 +1,162 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms

class PatchEmbed(nn.Module):
    def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = img_size // patch_size  # e.g. 14 for 224 // 16
        self.num_patches = self.grid_size ** 2
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)

    def forward(self, x):
        # x: [B, 3, H, W]
        # project to embeddings with shape [B, D, #patches_row, #patches_col]
        x = self.proj(x)       # -> [B, embed_dim, grid_size, grid_size]
        # flatten the spatial dims
        x = x.flatten(2)       # -> [B, embed_dim, grid_size*grid_size]
        x = x.transpose(1, 2)  # -> [B, #patches, embed_dim]
        return x

class Attention(nn.Module):
    def __init__(self, dim, num_heads=8, qkv_bias=True, attn_drop=0.0, proj_drop=0.0):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x)  # -> [B, N, 3*C]
        qkv = qkv.reshape(B, N, 3, self.num_heads, C // self.num_heads)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # -> [3, B, heads, N, C//heads]
        q, k, v = qkv[0], qkv[1], qkv[2]

        # scaled dot product
        attn = (q @ k.transpose(-2, -1)) * self.scale  # [B, heads, N, N]
        attn = attn.softmax(dim=-1)
        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x

class MLP(nn.Module):
    def __init__(self, in_features, hidden_features=None, out_features=None, drop=0.0):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x

class Block(nn.Module):
    def __init__(self, dim, num_heads, mlp_ratio=4.0, qkv_bias=True,
                 drop=0.0, attn_drop=0.0):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.attn = Attention(
            dim, num_heads=num_heads, qkv_bias=qkv_bias,
            attn_drop=attn_drop, proj_drop=drop
        )
        self.norm2 = nn.LayerNorm(dim)
        self.mlp = MLP(
            in_features=dim, hidden_features=int(dim * mlp_ratio),
            out_features=dim, drop=drop
        )

    def forward(self, x):
        # pre-norm transformer block with residual connections
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))
        return x

class VisionTransformer(nn.Module):
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        num_classes=1000,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4.0,
        qkv_bias=True,
        drop_rate=0.0,
        attn_drop_rate=0.0
    ):
        super().__init__()
        self.num_classes = num_classes
        self.embed_dim = embed_dim
        self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, embed_dim)
        self.num_patches = self.patch_embed.num_patches

        # CLS token
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        # 1D positional embedding
        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches + 1, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)

        # Transformer blocks
        self.blocks = nn.ModuleList([
            Block(embed_dim, num_heads, mlp_ratio,
                  qkv_bias, drop_rate, attn_drop_rate)
            for _ in range(depth)
        ])
        self.norm = nn.LayerNorm(embed_dim)

        # Classifier head
        self.head = nn.Linear(embed_dim, num_classes)

        # Weight initialization
        self._init_weights()

    def _init_weights(self):
        # simple initialization
        torch.nn.init.normal_(self.pos_embed, std=0.02)
        torch.nn.init.normal_(self.cls_token, std=0.02)
        torch.nn.init.xavier_uniform_(self.head.weight)
        torch.nn.init.normal_(self.head.bias, std=1e-6)

    def forward(self, x):
        # x shape: [B, 3, H, W]
        B = x.shape[0]
        x = self.patch_embed(x)  # -> [B, N, D]
        cls_tokens = self.cls_token.expand(B, -1, -1)  # -> [B, 1, D]
        x = torch.cat((cls_tokens, x), dim=1)  # -> [B, N+1, D]

        x = x + self.pos_embed[:, :x.size(1), :]
        x = self.pos_drop(x)

        for blk in self.blocks:
            x = blk(x)

        x = self.norm(x)
        # extract CLS token
        cls_token_final = x[:, 0]
        # classification
        logits = self.head(cls_token_final)

        return logits, cls_token_final
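
As a quick sanity check of the module above, a minimal sketch that runs the transformer on random input. This is hypothetical usage, not part of the package; the import path assumes the layout shown in the file list, and the small configuration is chosen only so the check runs quickly on CPU:

```python
import torch
from PictSure.Embeddings.ViT import VisionTransformer

# Tiny configuration: 2 blocks, 192-dim embeddings, 3 heads (head_dim = 64).
model = VisionTransformer(img_size=224, patch_size=16, embed_dim=192,
                          depth=2, num_heads=3, num_classes=10)

images = torch.randn(4, 3, 224, 224)   # batch of 4 RGB images
logits, cls_embedding = model(images)  # forward returns (logits, CLS embedding)

print(logits.shape)         # torch.Size([4, 10])
print(cls_embedding.shape)  # torch.Size([4, 192])
```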
pictsure-0.2/PictSure/Embeddings/__init__.py
ADDED
File without changes
pictsure-0.2/PictSure/__init__.py
ADDED
@@ -0,0 +1,26 @@
"""
PictSure - Few-shot image classification library.

This module provides the main interface for few-shot image classification
using various vision encoders (ResNet, ViT, DINOv2, CLIP).
"""

from .model_PictSure import PictSure
from .model_embeddings import (
    ResNetWrapper,
    VitNetWrapper,
    DINOV2Wrapper,
    CLIPWrapper,
    get_encoder,
    load_encoder,
)

__all__ = [
    'PictSure',
    'ResNetWrapper',
    'VitNetWrapper',
    'DINOV2Wrapper',
    'CLIPWrapper',
    'get_encoder',
    'load_encoder',
]
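
With these exports, the public API is importable directly from the top-level package. A minimal sketch tying this back to the README's Quick Start; the wrappers' constructor signatures are not shown in this diff, so only the documented `from_pretrained` entry point is exercised:

```python
# Everything in __all__ is importable from the top-level package.
from PictSure import PictSure, ResNetWrapper, VitNetWrapper, get_encoder

# The pretrained entry point from the README's Quick Start:
model = PictSure.from_pretrained("pictsure/pictsure-vit")
```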