timeline-vlm 1.0.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- timeline_vlm-1.0.2/LICENSE +21 -0
- timeline_vlm-1.0.2/PKG-INFO +323 -0
- timeline_vlm-1.0.2/README.md +271 -0
- timeline_vlm-1.0.2/pyproject.toml +79 -0
- timeline_vlm-1.0.2/setup.cfg +4 -0
- timeline_vlm-1.0.2/timeline_vlm/__init__.py +53 -0
- timeline_vlm-1.0.2/timeline_vlm/cli.py +171 -0
- timeline_vlm-1.0.2/timeline_vlm/data/README.md +22 -0
- timeline_vlm-1.0.2/timeline_vlm/data/__init__.py +1 -0
- timeline_vlm-1.0.2/timeline_vlm/data/dataset.py +234 -0
- timeline_vlm-1.0.2/timeline_vlm/data/download.py +234 -0
- timeline_vlm-1.0.2/timeline_vlm/evaluation/__init__.py +10 -0
- timeline_vlm-1.0.2/timeline_vlm/evaluation/embedding_space.py +228 -0
- timeline_vlm-1.0.2/timeline_vlm/evaluation/embeddings.py +182 -0
- timeline_vlm-1.0.2/timeline_vlm/evaluation/time_probing.py +265 -0
- timeline_vlm-1.0.2/timeline_vlm/evaluation/timeline_bezier.py +399 -0
- timeline_vlm-1.0.2/timeline_vlm/evaluation/timeline_umap.py +236 -0
- timeline_vlm-1.0.2/timeline_vlm/models/__init__.py +8 -0
- timeline_vlm-1.0.2/timeline_vlm/models/model_loader.py +380 -0
- timeline_vlm-1.0.2/timeline_vlm/predictor.py +371 -0
- timeline_vlm-1.0.2/timeline_vlm/py.typed +0 -0
- timeline_vlm-1.0.2/timeline_vlm/utils/__init__.py +2 -0
- timeline_vlm-1.0.2/timeline_vlm/utils/metrics.py +248 -0
- timeline_vlm-1.0.2/timeline_vlm/utils/prompts.py +74 -0
- timeline_vlm-1.0.2/timeline_vlm/visualization.py +232 -0
- timeline_vlm-1.0.2/timeline_vlm.egg-info/PKG-INFO +323 -0
- timeline_vlm-1.0.2/timeline_vlm.egg-info/SOURCES.txt +29 -0
- timeline_vlm-1.0.2/timeline_vlm.egg-info/dependency_links.txt +1 -0
- timeline_vlm-1.0.2/timeline_vlm.egg-info/entry_points.txt +2 -0
- timeline_vlm-1.0.2/timeline_vlm.egg-info/requires.txt +25 -0
- timeline_vlm-1.0.2/timeline_vlm.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Nidham Tekaya
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,323 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: timeline-vlm
|
|
3
|
+
Version: 1.0.2
|
|
4
|
+
Summary: Temporal inference with Vision-Language Models — predict when an image was taken from its visual content.
|
|
5
|
+
Author: Manuela Waldner, Matthias Zeppelzauer
|
|
6
|
+
Author-email: Nidham Tekaya <nidham.tekaya@fhstp.ac.at>
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/tekayanidham/timeline-vlm
|
|
9
|
+
Project-URL: Documentation, https://github.com/tekayanidham/timeline-vlm#readme
|
|
10
|
+
Project-URL: Repository, https://github.com/tekayanidham/timeline-vlm
|
|
11
|
+
Project-URL: Bug Tracker, https://github.com/tekayanidham/timeline-vlm/issues
|
|
12
|
+
Project-URL: Paper (ACM), https://dl.acm.org/doi/10.1145/3746027.3758163
|
|
13
|
+
Project-URL: arXiv, https://arxiv.org/abs/2510.19559
|
|
14
|
+
Project-URL: Dataset (TIME10k), https://osf.io/4th79/?view_only=560f540a7bac4d489faf164b16109642
|
|
15
|
+
Keywords: vision-language-models,temporal-reasoning,CLIP,time-estimation,embedding-analysis,bezier-curve,computer-vision,deep-learning
|
|
16
|
+
Classifier: Development Status :: 4 - Beta
|
|
17
|
+
Classifier: Intended Audience :: Science/Research
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
24
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
25
|
+
Classifier: Topic :: Scientific/Engineering :: Image Recognition
|
|
26
|
+
Requires-Python: >=3.8
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: torch>=1.10.0
|
|
30
|
+
Requires-Dist: torchvision>=0.11.0
|
|
31
|
+
Requires-Dist: numpy>=1.19.0
|
|
32
|
+
Requires-Dist: scipy>=1.5.0
|
|
33
|
+
Requires-Dist: scikit-learn>=0.24.0
|
|
34
|
+
Requires-Dist: Pillow>=8.0.0
|
|
35
|
+
Requires-Dist: tqdm>=4.62.0
|
|
36
|
+
Requires-Dist: umap-learn>=0.5.0
|
|
37
|
+
Requires-Dist: matplotlib>=3.3.0
|
|
38
|
+
Requires-Dist: tabulate>=0.8.9
|
|
39
|
+
Provides-Extra: openclip
|
|
40
|
+
Requires-Dist: open-clip-torch>=2.24.0; extra == "openclip"
|
|
41
|
+
Provides-Extra: all
|
|
42
|
+
Requires-Dist: open-clip-torch>=2.24.0; extra == "all"
|
|
43
|
+
Requires-Dist: optuna>=3.0.0; extra == "all"
|
|
44
|
+
Requires-Dist: seaborn>=0.11.0; extra == "all"
|
|
45
|
+
Requires-Dist: pandas>=1.3.0; extra == "all"
|
|
46
|
+
Requires-Dist: requests>=2.25.0; extra == "all"
|
|
47
|
+
Requires-Dist: pyyaml>=5.4.0; extra == "all"
|
|
48
|
+
Provides-Extra: dev
|
|
49
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
50
|
+
Requires-Dist: pytest-cov>=4.0; extra == "dev"
|
|
51
|
+
Dynamic: license-file
|
|
52
|
+
|
|
53
|
+
# A Matter of Time: Revealing the Structure of Time in Vision-Language Models
|
|
54
|
+
|
|
55
|
+
[![Python 3.8+](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/downloads/)
|
|
56
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
|
|
57
|
+
<a href="https://arxiv.org/pdf/2510.19559" target="_blank"><img src="https://img.shields.io/badge/arXiv-2510.19559-red.svg" alt="arXiv"></a>
|
|
58
|
+
<a href="https://dl.acm.org/doi/10.1145/3746027.3758163" target="_blank"><img src="https://img.shields.io/badge/paper-ACM-blue.svg" alt="Paper"></a>
|
|
59
|
+
<a href="https://osf.io/4th79/?view_only=560f540a7bac4d489faf164b16109642" target="_blank"><img src="https://img.shields.io/badge/dataset-TIME10k-orange.svg" alt="Dataset"></a>
|
|
60
|
+
<a href="https://huggingface.co/spaces/Nidhamtek/timeline-vlm" target="_blank"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-yellow.svg" alt="Demo"></a>
|
|
61
|
+
|
|
62
|
+

|
|
63
|
+
|
|
64
|
+
Official implementation of **"A Matter of Time: Revealing the Structure of Time in Vision-Language Models"**, published at ACM Multimedia 2025 (MM '25).
|
|
65
|
+
|
|
66
|
+
> We investigate the temporal awareness of VLMs, assessing their ability to position visual content in time. We introduce **TIME10k**, a benchmark of over 10,000 images with temporal ground truth, and evaluate **37 VLMs**. We reveal that temporal information is structured along a low-dimensional, non-linear manifold in the VLM embedding space. We propose methods to derive an explicit "timeline" representation using **UMAP** and **Bezier curve** approximation, achieving competitive to superior accuracy while being computationally efficient.
|
|
67
|
+
|
|
68
|
+

|
|
69
|
+
|
|
70
|
+
**Try it now:** A live demo is available on [Hugging Face Spaces](https://huggingface.co/spaces/Nidhamtek/timeline-vlm).
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Quick Start
|
|
75
|
+
|
|
76
|
+
**Predict the year of any image in 3 lines:**
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
from timeline_vlm import TimelinePredictor
|
|
80
|
+
|
|
81
|
+
predictor = TimelinePredictor('clip-vit-b32').fit_from_precomputed('encodings')
|
|
82
|
+
print(predictor.predict('photo.jpg')) # -> 1972
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
**Or from the command line:**
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
python predict.py --image photo.jpg
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
No GPU required — precomputed embeddings for CLIP and EVA-CLIP are included.
|
|
92
|
+
|
|
93
|
+
---
|
|
94
|
+
|
|
95
|
+
## Installation
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
git clone https://github.com/tekayanidham/timeline-vlm.git
|
|
99
|
+
cd timeline-vlm
|
|
100
|
+
pip install -r requirements.txt
|
|
101
|
+
pip install git+https://github.com/openai/CLIP.git
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
To install support for all 37 models (including EVA-CLIP, ImageBind, and ViT-Lens), run:
|
|
105
|
+
```bash
|
|
106
|
+
bash install_models.sh
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
Verify the installation:
|
|
110
|
+
```bash
|
|
111
|
+
python scripts/run_experiments.py --config configs/lightweight_test.yaml --device cpu
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
---
|
|
115
|
+
|
|
116
|
+
## Repository Structure
|
|
117
|
+
|
|
118
|
+
```
|
|
119
|
+
timeline-vlm/
|
|
120
|
+
│
|
|
121
|
+
│ # ── Use the Framework ─────────────────────────────────────────
|
|
122
|
+
├── predict.py # Predict year for images (CLI)
|
|
123
|
+
├── timeline_vlm.py # Python API for your pipelines
|
|
124
|
+
├── visualize.py # Visualize timelines and embeddings
|
|
125
|
+
│
|
|
126
|
+
│ # ── Core Library ──────────────────────────────────────────────
|
|
127
|
+
├── evaluation/ # Temporal inference methods
|
|
128
|
+
│ ├── time_probing.py # Baseline: dot-product similarity (Sec. 3.1)
|
|
129
|
+
│ ├── timeline_umap.py # UMAP 1D timeline (Sec. 3.3.1)
|
|
130
|
+
│ ├── timeline_bezier.py # Bezier curve timeline (Sec. 3.3.2)
|
|
131
|
+
│ ├── embedding_space.py # Embedding analysis (Sec. 3.2)
|
|
132
|
+
│ └── embeddings.py # Embedding generation & caching
|
|
133
|
+
├── models/ # Unified loader for 37 VLMs
|
|
134
|
+
│ └── model_loader.py
|
|
135
|
+
├── utils/ # TAI, MAE, ranking metrics, prompts
|
|
136
|
+
│ ├── metrics.py
|
|
137
|
+
│ └── prompts.py
|
|
138
|
+
├── data/ # TIME10k dataset loader & downloader
|
|
139
|
+
│ ├── dataset.py
|
|
140
|
+
│ ├── download.py
|
|
141
|
+
│ └── time10k.csv
|
|
142
|
+
│
|
|
143
|
+
│ # ── Paper Reproduction ────────────────────────────────────────
|
|
144
|
+
├── scripts/ # Benchmark & reproduction scripts
|
|
145
|
+
│ ├── reproduce_results.py # Per-table: --table 1 2 3 4 5 --figure 6
|
|
146
|
+
│ └── run_experiments.py # Full YAML-driven experiment pipeline
|
|
147
|
+
├── configs/ # Experiment configurations
|
|
148
|
+
│ ├── full_evaluation.yaml # All 37 models (GPU)
|
|
149
|
+
│ └── lightweight_test.yaml # Quick CPU test
|
|
150
|
+
├── docs/ # Extended documentation
|
|
151
|
+
│ ├── reproducing_results.md # Step-by-step reproduction guide
|
|
152
|
+
│ ├── methods.md # Detailed method descriptions
|
|
153
|
+
│ ├── models.md # All 37 VLMs documented
|
|
154
|
+
│ └── dataset.md # TIME10k dataset details
|
|
155
|
+
│
|
|
156
|
+
│ # ── Data ──────────────────────────────────────────────────────
|
|
157
|
+
├── encodings/ # Precomputed embeddings (CLIP, EVA-CLIP)
|
|
158
|
+
└── results/ # Output directory
|
|
159
|
+
```
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
## Which Script Should I Use?
|
|
164
|
+
|
|
165
|
+
| I want to... | Use this | Example |
|
|
166
|
+
|---|---|---|
|
|
167
|
+
| **Predict the year of an image** | `predict.py` | `python predict.py --image photo.jpg` |
|
|
168
|
+
| **Use this in my Python code** | `timeline_vlm.py` | `from timeline_vlm import TimelinePredictor` |
|
|
169
|
+
| **Visualize timelines or embeddings** | `visualize.py` | `python visualize.py timeline` |
|
|
170
|
+
| **Reproduce a specific paper table** | `scripts/` | `python scripts/reproduce_results.py --table 5` |
|
|
171
|
+
| **Run full benchmark** | `scripts/` | `python scripts/run_experiments.py --config configs/full_evaluation.yaml` |
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Predicting Year of First Appearance
|
|
176
|
+
|
|
177
|
+
### Command Line
|
|
178
|
+
|
|
179
|
+
```bash
|
|
180
|
+
# Default: CLIP ViT-B/32, Bezier R^S method
|
|
181
|
+
python predict.py --image photo.jpg
|
|
182
|
+
|
|
183
|
+
# Choose model and method
|
|
184
|
+
python predict.py --image photo.jpg --model eva-clip-l14-336 --method bezier
|
|
185
|
+
|
|
186
|
+
# Time probing (direct similarity matching)
|
|
187
|
+
python predict.py --image photo.jpg --method time_probing --prompt P7
|
|
188
|
+
|
|
189
|
+
# Batch prediction on a directory
|
|
190
|
+
python predict.py --image_dir my_photos/ --output json
|
|
191
|
+
|
|
192
|
+
# UMAP timeline method
|
|
193
|
+
python predict.py --image photo.jpg --method umap
|
|
194
|
+
|
|
195
|
+
# Save results
|
|
196
|
+
python predict.py --image_dir photos/ --output csv --save results.csv
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Python API
|
|
200
|
+
|
|
201
|
+
```python
|
|
202
|
+
from timeline_vlm import TimelinePredictor
|
|
203
|
+
|
|
204
|
+
# Initialize and fit
|
|
205
|
+
predictor = TimelinePredictor(
|
|
206
|
+
model='clip-vit-b32', # Any of the 37 supported models
|
|
207
|
+
method='bezier', # 'time_probing', 'umap', or 'bezier'
|
|
208
|
+
reduce_dim=13, # KPCA dimensions (Bezier only)
|
|
209
|
+
bezier_method='interpolation',
|
|
210
|
+
)
|
|
211
|
+
predictor.fit_from_precomputed('encodings')
|
|
212
|
+
|
|
213
|
+
# Single prediction
|
|
214
|
+
year = predictor.predict('photo.jpg')
|
|
215
|
+
|
|
216
|
+
# Batch prediction
|
|
217
|
+
years = predictor.predict_batch(['img1.jpg', 'img2.jpg', 'img3.jpg'])
|
|
218
|
+
|
|
219
|
+
# Detailed prediction with confidence
|
|
220
|
+
details = predictor.predict_with_details('photo.jpg')
|
|
221
|
+
|
|
222
|
+
# Evaluate on your own data
|
|
223
|
+
results = predictor.evaluate(image_embeddings, ground_truth_years)
|
|
224
|
+
print(f"MAE: {results['mae']:.2f}, TAI: {results['tai']:.3f}")
|
|
225
|
+
```
|
|
226
|
+
|
|
227
|
+
---
|
|
228
|
+
|
|
229
|
+
## Reproducing Paper Results
|
|
230
|
+
|
|
231
|
+
Reproduction scripts and documentation are separate from the core framework.
|
|
232
|
+
|
|
233
|
+
```bash
|
|
234
|
+
python scripts/reproduce_results.py --table 5 # Single table
|
|
235
|
+
python scripts/reproduce_results.py --table 4 5 # Multiple tables
|
|
236
|
+
python scripts/reproduce_results.py --figure 6 # Figure 6
|
|
237
|
+
python scripts/reproduce_results.py --all # Everything
|
|
238
|
+
python scripts/run_experiments.py --config configs/full_evaluation.yaml # Full benchmark
|
|
239
|
+
```
|
|
240
|
+
|
|
241
|
+
| Flag | What it reproduces |
|
|
242
|
+
|---|---|
|
|
243
|
+
| `--table 1` | Time probing MAE & TAI for 37 VLMs (P7) |
|
|
244
|
+
| `--table 2` | Prompt sensitivity P1-P9 |
|
|
245
|
+
| `--table 3` | Class-wise temporal awareness |
|
|
246
|
+
| `--table 4` | Chronological ordering quality (KPCA vs UMAP) |
|
|
247
|
+
| `--table 5` | Method comparison: Time Probing vs UMAP vs 4 Bezier variants |
|
|
248
|
+
| `--figure 6` | MAE per KPCA dimension (optimal S=13) |
|
|
249
|
+
|
|
250
|
+
See [`docs/reproducing_results.md`](docs/reproducing_results.md) for the full step-by-step guide.
|
|
251
|
+
|
|
252
|
+
---
|
|
253
|
+
|
|
254
|
+
## Visualizations
|
|
255
|
+
|
|
256
|
+
```bash
|
|
257
|
+
python visualize.py manifold --model clip-vit-b32 --save manifold.png # 2D/3D embedding manifold
|
|
258
|
+
python visualize.py timeline --model clip-vit-b32 --save timeline.png # 1D KPCA vs UMAP
|
|
259
|
+
python visualize.py bezier --model clip-vit-b32 --save bezier.png # 3D Bezier curve
|
|
260
|
+
python visualize.py dimension_sweep --model clip-vit-b32 --save sweep.png # MAE per dimension
|
|
261
|
+
python visualize.py distribution --model clip-vit-b32 --save dist.png # Year distribution
|
|
262
|
+
```
|
|
263
|
+
|
|
264
|
+
---
|
|
265
|
+
|
|
266
|
+
## Methods
|
|
267
|
+
|
|
268
|
+
The framework provides three temporal inference approaches, each described in detail in [`docs/methods.md`](docs/methods.md):
|
|
269
|
+
|
|
270
|
+
| Method | Paper | CLIP MAE | Description |
|
|
271
|
+
|---|---|---|---|
|
|
272
|
+
| Time Probing | Sec. 3.1 | 9.24 | Dot-product similarity baseline |
|
|
273
|
+
| UMAP Timeline | Sec. 3.3.1 | 13.01 | 1D manifold projection |
|
|
274
|
+
| **Bezier(R^S, Int)** | **Sec. 3.3.2** | **8.80** | **Bezier curve in KPCA subspace (best)** |
|
|
275
|
+
|
|
276
|
+
---
|
|
277
|
+
|
|
278
|
+
## Supported Models (37 VLMs)
|
|
279
|
+
|
|
280
|
+
| Family | Count | Backend |
|
|
281
|
+
|---|---|---|
|
|
282
|
+
| CLIP | 9 | `openai/CLIP` |
|
|
283
|
+
| EVA-CLIP | 8 | `eva_clip` (BAAI) |
|
|
284
|
+
| OpenCLIP | 10 | `open_clip` |
|
|
285
|
+
| SigLIP | 3 | `open_clip` |
|
|
286
|
+
| Others (CoCa, MobileCLIP, ViTamin, CLIPA, ImageBind, ViT-Lens) | 7 | various |
|
|
287
|
+
|
|
288
|
+
See [`docs/models.md`](docs/models.md) for the full list with model keys and installation instructions.
|
|
289
|
+
|
|
290
|
+
---
|
|
291
|
+
|
|
292
|
+
## Citation
|
|
293
|
+
|
|
294
|
+
```bibtex
|
|
295
|
+
@inproceedings{10.1145/3746027.3758163,
|
|
296
|
+
author = {Tekaya, Nidham and Waldner, Manuela and Zeppelzauer, Matthias},
|
|
297
|
+
title = {A Matter of Time: Revealing the Structure of Time in Vision-Language Models},
|
|
298
|
+
year = {2025},
|
|
299
|
+
isbn = {9798400720352},
|
|
300
|
+
publisher = {Association for Computing Machinery},
|
|
301
|
+
address = {New York, NY, USA},
|
|
302
|
+
url = {https://doi.org/10.1145/3746027.3758163},
|
|
303
|
+
doi = {10.1145/3746027.3758163},
|
|
304
|
+
booktitle = {Proceedings of the 33rd ACM International Conference on Multimedia},
|
|
305
|
+
pages = {12371--12380},
|
|
306
|
+
numpages = {10},
|
|
307
|
+
keywords = {benchmark dataset, multimodal representations, time estimation, time modeling, time reasoning, vision-language models},
|
|
308
|
+
location = {Dublin, Ireland},
|
|
309
|
+
series = {MM '25}
|
|
310
|
+
}
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
## Links
|
|
314
|
+
|
|
315
|
+
- [Paper (ACM Digital Library)](https://dl.acm.org/doi/10.1145/3746027.3758163)
|
|
316
|
+
- [arXiv Preprint](https://arxiv.org/pdf/2510.19559)
|
|
317
|
+
- [Project Page](https://tekayanidham.github.io/timeline-page/)
|
|
318
|
+
- [TIME10k Dataset](https://osf.io/4th79/?view_only=560f540a7bac4d489faf164b16109642)
|
|
319
|
+
- [Hugging Face Demo](https://huggingface.co/spaces/Nidhamtek/timeline-vlm)
|
|
320
|
+
|
|
321
|
+
## License
|
|
322
|
+
|
|
323
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
|
|
@@ -0,0 +1,271 @@
|
|
|
1
|
+
# A Matter of Time: Revealing the Structure of Time in Vision-Language Models
|
|
2
|
+
|
|
3
|
+
[![Python 3.8+](https://img.shields.io/badge/python-3.8%2B-blue.svg)](https://www.python.org/downloads/)
|
|
4
|
+
[![License: MIT](https://img.shields.io/badge/license-MIT-green.svg)](LICENSE)
|
|
5
|
+
<a href="https://arxiv.org/pdf/2510.19559" target="_blank"><img src="https://img.shields.io/badge/arXiv-2510.19559-red.svg" alt="arXiv"></a>
|
|
6
|
+
<a href="https://dl.acm.org/doi/10.1145/3746027.3758163" target="_blank"><img src="https://img.shields.io/badge/paper-ACM-blue.svg" alt="Paper"></a>
|
|
7
|
+
<a href="https://osf.io/4th79/?view_only=560f540a7bac4d489faf164b16109642" target="_blank"><img src="https://img.shields.io/badge/dataset-TIME10k-orange.svg" alt="Dataset"></a>
|
|
8
|
+
<a href="https://huggingface.co/spaces/Nidhamtek/timeline-vlm" target="_blank"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Demo-yellow.svg" alt="Demo"></a>
|
|
9
|
+
|
|
10
|
+

|
|
11
|
+
|
|
12
|
+
Official implementation of **"A Matter of Time: Revealing the Structure of Time in Vision-Language Models"**, published at ACM Multimedia 2025 (MM '25).
|
|
13
|
+
|
|
14
|
+
> We investigate the temporal awareness of VLMs, assessing their ability to position visual content in time. We introduce **TIME10k**, a benchmark of over 10,000 images with temporal ground truth, and evaluate **37 VLMs**. We reveal that temporal information is structured along a low-dimensional, non-linear manifold in the VLM embedding space. We propose methods to derive an explicit "timeline" representation using **UMAP** and **Bezier curve** approximation, achieving competitive to superior accuracy while being computationally efficient.
|
|
15
|
+
|
|
16
|
+

|
|
17
|
+
|
|
18
|
+
**Try it now:** A live demo is available on [Hugging Face Spaces](https://huggingface.co/spaces/Nidhamtek/timeline-vlm).
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## Quick Start
|
|
23
|
+
|
|
24
|
+
**Predict the year of any image in 3 lines:**
|
|
25
|
+
|
|
26
|
+
```python
|
|
27
|
+
from timeline_vlm import TimelinePredictor
|
|
28
|
+
|
|
29
|
+
predictor = TimelinePredictor('clip-vit-b32').fit_from_precomputed('encodings')
|
|
30
|
+
print(predictor.predict('photo.jpg')) # -> 1972
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
**Or from the command line:**
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
python predict.py --image photo.jpg
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
No GPU required — precomputed embeddings for CLIP and EVA-CLIP are included.
|
|
40
|
+
|
|
41
|
+
---
|
|
42
|
+
|
|
43
|
+
## Installation
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
git clone https://github.com/tekayanidham/timeline-vlm.git
|
|
47
|
+
cd timeline-vlm
|
|
48
|
+
pip install -r requirements.txt
|
|
49
|
+
pip install git+https://github.com/openai/CLIP.git
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
To install support for all 37 models (including EVA-CLIP, ImageBind, and ViT-Lens), run:
|
|
53
|
+
```bash
|
|
54
|
+
bash install_models.sh
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Verify the installation:
|
|
58
|
+
```bash
|
|
59
|
+
python scripts/run_experiments.py --config configs/lightweight_test.yaml --device cpu
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
---
|
|
63
|
+
|
|
64
|
+
## Repository Structure
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
timeline-vlm/
|
|
68
|
+
│
|
|
69
|
+
│ # ── Use the Framework ─────────────────────────────────────────
|
|
70
|
+
├── predict.py # Predict year for images (CLI)
|
|
71
|
+
├── timeline_vlm.py # Python API for your pipelines
|
|
72
|
+
├── visualize.py # Visualize timelines and embeddings
|
|
73
|
+
│
|
|
74
|
+
│ # ── Core Library ──────────────────────────────────────────────
|
|
75
|
+
├── evaluation/ # Temporal inference methods
|
|
76
|
+
│ ├── time_probing.py # Baseline: dot-product similarity (Sec. 3.1)
|
|
77
|
+
│ ├── timeline_umap.py # UMAP 1D timeline (Sec. 3.3.1)
|
|
78
|
+
│ ├── timeline_bezier.py # Bezier curve timeline (Sec. 3.3.2)
|
|
79
|
+
│ ├── embedding_space.py # Embedding analysis (Sec. 3.2)
|
|
80
|
+
│ └── embeddings.py # Embedding generation & caching
|
|
81
|
+
├── models/ # Unified loader for 37 VLMs
|
|
82
|
+
│ └── model_loader.py
|
|
83
|
+
├── utils/ # TAI, MAE, ranking metrics, prompts
|
|
84
|
+
│ ├── metrics.py
|
|
85
|
+
│ └── prompts.py
|
|
86
|
+
├── data/ # TIME10k dataset loader & downloader
|
|
87
|
+
│ ├── dataset.py
|
|
88
|
+
│ ├── download.py
|
|
89
|
+
│ └── time10k.csv
|
|
90
|
+
│
|
|
91
|
+
│ # ── Paper Reproduction ────────────────────────────────────────
|
|
92
|
+
├── scripts/ # Benchmark & reproduction scripts
|
|
93
|
+
│ ├── reproduce_results.py # Per-table: --table 1 2 3 4 5 --figure 6
|
|
94
|
+
│ └── run_experiments.py # Full YAML-driven experiment pipeline
|
|
95
|
+
├── configs/ # Experiment configurations
|
|
96
|
+
│ ├── full_evaluation.yaml # All 37 models (GPU)
|
|
97
|
+
│ └── lightweight_test.yaml # Quick CPU test
|
|
98
|
+
├── docs/ # Extended documentation
|
|
99
|
+
│ ├── reproducing_results.md # Step-by-step reproduction guide
|
|
100
|
+
│ ├── methods.md # Detailed method descriptions
|
|
101
|
+
│ ├── models.md # All 37 VLMs documented
|
|
102
|
+
│ └── dataset.md # TIME10k dataset details
|
|
103
|
+
│
|
|
104
|
+
│ # ── Data ──────────────────────────────────────────────────────
|
|
105
|
+
├── encodings/ # Precomputed embeddings (CLIP, EVA-CLIP)
|
|
106
|
+
└── results/ # Output directory
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
---
|
|
110
|
+
|
|
111
|
+
## Which Script Should I Use?
|
|
112
|
+
|
|
113
|
+
| I want to... | Use this | Example |
|
|
114
|
+
|---|---|---|
|
|
115
|
+
| **Predict the year of an image** | `predict.py` | `python predict.py --image photo.jpg` |
|
|
116
|
+
| **Use this in my Python code** | `timeline_vlm.py` | `from timeline_vlm import TimelinePredictor` |
|
|
117
|
+
| **Visualize timelines or embeddings** | `visualize.py` | `python visualize.py timeline` |
|
|
118
|
+
| **Reproduce a specific paper table** | `scripts/` | `python scripts/reproduce_results.py --table 5` |
|
|
119
|
+
| **Run full benchmark** | `scripts/` | `python scripts/run_experiments.py --config configs/full_evaluation.yaml` |
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## Predicting Year of First Appearance
|
|
124
|
+
|
|
125
|
+
### Command Line
|
|
126
|
+
|
|
127
|
+
```bash
|
|
128
|
+
# Default: CLIP ViT-B/32, Bezier R^S method
|
|
129
|
+
python predict.py --image photo.jpg
|
|
130
|
+
|
|
131
|
+
# Choose model and method
|
|
132
|
+
python predict.py --image photo.jpg --model eva-clip-l14-336 --method bezier
|
|
133
|
+
|
|
134
|
+
# Time probing (direct similarity matching)
|
|
135
|
+
python predict.py --image photo.jpg --method time_probing --prompt P7
|
|
136
|
+
|
|
137
|
+
# Batch prediction on a directory
|
|
138
|
+
python predict.py --image_dir my_photos/ --output json
|
|
139
|
+
|
|
140
|
+
# UMAP timeline method
|
|
141
|
+
python predict.py --image photo.jpg --method umap
|
|
142
|
+
|
|
143
|
+
# Save results
|
|
144
|
+
python predict.py --image_dir photos/ --output csv --save results.csv
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
### Python API
|
|
148
|
+
|
|
149
|
+
```python
|
|
150
|
+
from timeline_vlm import TimelinePredictor
|
|
151
|
+
|
|
152
|
+
# Initialize and fit
|
|
153
|
+
predictor = TimelinePredictor(
|
|
154
|
+
model='clip-vit-b32', # Any of the 37 supported models
|
|
155
|
+
method='bezier', # 'time_probing', 'umap', or 'bezier'
|
|
156
|
+
reduce_dim=13, # KPCA dimensions (Bezier only)
|
|
157
|
+
bezier_method='interpolation',
|
|
158
|
+
)
|
|
159
|
+
predictor.fit_from_precomputed('encodings')
|
|
160
|
+
|
|
161
|
+
# Single prediction
|
|
162
|
+
year = predictor.predict('photo.jpg')
|
|
163
|
+
|
|
164
|
+
# Batch prediction
|
|
165
|
+
years = predictor.predict_batch(['img1.jpg', 'img2.jpg', 'img3.jpg'])
|
|
166
|
+
|
|
167
|
+
# Detailed prediction with confidence
|
|
168
|
+
details = predictor.predict_with_details('photo.jpg')
|
|
169
|
+
|
|
170
|
+
# Evaluate on your own data
|
|
171
|
+
results = predictor.evaluate(image_embeddings, ground_truth_years)
|
|
172
|
+
print(f"MAE: {results['mae']:.2f}, TAI: {results['tai']:.3f}")
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Reproducing Paper Results
|
|
178
|
+
|
|
179
|
+
Reproduction scripts and documentation are separate from the core framework.
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
python scripts/reproduce_results.py --table 5 # Single table
|
|
183
|
+
python scripts/reproduce_results.py --table 4 5 # Multiple tables
|
|
184
|
+
python scripts/reproduce_results.py --figure 6 # Figure 6
|
|
185
|
+
python scripts/reproduce_results.py --all # Everything
|
|
186
|
+
python scripts/run_experiments.py --config configs/full_evaluation.yaml # Full benchmark
|
|
187
|
+
```
|
|
188
|
+
|
|
189
|
+
| Flag | What it reproduces |
|
|
190
|
+
|---|---|
|
|
191
|
+
| `--table 1` | Time probing MAE & TAI for 37 VLMs (P7) |
|
|
192
|
+
| `--table 2` | Prompt sensitivity P1-P9 |
|
|
193
|
+
| `--table 3` | Class-wise temporal awareness |
|
|
194
|
+
| `--table 4` | Chronological ordering quality (KPCA vs UMAP) |
|
|
195
|
+
| `--table 5` | Method comparison: Time Probing vs UMAP vs 4 Bezier variants |
|
|
196
|
+
| `--figure 6` | MAE per KPCA dimension (optimal S=13) |
|
|
197
|
+
|
|
198
|
+
See [`docs/reproducing_results.md`](docs/reproducing_results.md) for the full step-by-step guide.
|
|
199
|
+
|
|
200
|
+
---
|
|
201
|
+
|
|
202
|
+
## Visualizations
|
|
203
|
+
|
|
204
|
+
```bash
|
|
205
|
+
python visualize.py manifold --model clip-vit-b32 --save manifold.png # 2D/3D embedding manifold
|
|
206
|
+
python visualize.py timeline --model clip-vit-b32 --save timeline.png # 1D KPCA vs UMAP
|
|
207
|
+
python visualize.py bezier --model clip-vit-b32 --save bezier.png # 3D Bezier curve
|
|
208
|
+
python visualize.py dimension_sweep --model clip-vit-b32 --save sweep.png # MAE per dimension
|
|
209
|
+
python visualize.py distribution --model clip-vit-b32 --save dist.png # Year distribution
|
|
210
|
+
```
|
|
211
|
+
|
|
212
|
+
---
|
|
213
|
+
|
|
214
|
+
## Methods
|
|
215
|
+
|
|
216
|
+
The framework provides three temporal inference approaches, each described in detail in [`docs/methods.md`](docs/methods.md):
|
|
217
|
+
|
|
218
|
+
| Method | Paper | CLIP MAE | Description |
|
|
219
|
+
|---|---|---|---|
|
|
220
|
+
| Time Probing | Sec. 3.1 | 9.24 | Dot-product similarity baseline |
|
|
221
|
+
| UMAP Timeline | Sec. 3.3.1 | 13.01 | 1D manifold projection |
|
|
222
|
+
| **Bezier(R^S, Int)** | **Sec. 3.3.2** | **8.80** | **Bezier curve in KPCA subspace (best)** |
|
|
223
|
+
|
|
224
|
+
---
|
|
225
|
+
|
|
226
|
+
## Supported Models (37 VLMs)
|
|
227
|
+
|
|
228
|
+
| Family | Count | Backend |
|
|
229
|
+
|---|---|---|
|
|
230
|
+
| CLIP | 9 | `openai/CLIP` |
|
|
231
|
+
| EVA-CLIP | 8 | `eva_clip` (BAAI) |
|
|
232
|
+
| OpenCLIP | 10 | `open_clip` |
|
|
233
|
+
| SigLIP | 3 | `open_clip` |
|
|
234
|
+
| Others (CoCa, MobileCLIP, ViTamin, CLIPA, ImageBind, ViT-Lens) | 7 | various |
|
|
235
|
+
|
|
236
|
+
See [`docs/models.md`](docs/models.md) for the full list with model keys and installation instructions.
|
|
237
|
+
|
|
238
|
+
---
|
|
239
|
+
|
|
240
|
+
## Citation
|
|
241
|
+
|
|
242
|
+
```bibtex
|
|
243
|
+
@inproceedings{10.1145/3746027.3758163,
|
|
244
|
+
author = {Tekaya, Nidham and Waldner, Manuela and Zeppelzauer, Matthias},
|
|
245
|
+
title = {A Matter of Time: Revealing the Structure of Time in Vision-Language Models},
|
|
246
|
+
year = {2025},
|
|
247
|
+
isbn = {9798400720352},
|
|
248
|
+
publisher = {Association for Computing Machinery},
|
|
249
|
+
address = {New York, NY, USA},
|
|
250
|
+
url = {https://doi.org/10.1145/3746027.3758163},
|
|
251
|
+
doi = {10.1145/3746027.3758163},
|
|
252
|
+
booktitle = {Proceedings of the 33rd ACM International Conference on Multimedia},
|
|
253
|
+
pages = {12371--12380},
|
|
254
|
+
numpages = {10},
|
|
255
|
+
keywords = {benchmark dataset, multimodal representations, time estimation, time modeling, time reasoning, vision-language models},
|
|
256
|
+
location = {Dublin, Ireland},
|
|
257
|
+
series = {MM '25}
|
|
258
|
+
}
|
|
259
|
+
```
|
|
260
|
+
|
|
261
|
+
## Links
|
|
262
|
+
|
|
263
|
+
- [Paper (ACM Digital Library)](https://dl.acm.org/doi/10.1145/3746027.3758163)
|
|
264
|
+
- [arXiv Preprint](https://arxiv.org/pdf/2510.19559)
|
|
265
|
+
- [Project Page](https://tekayanidham.github.io/timeline-page/)
|
|
266
|
+
- [TIME10k Dataset](https://osf.io/4th79/?view_only=560f540a7bac4d489faf164b16109642)
|
|
267
|
+
- [Hugging Face Demo](https://huggingface.co/spaces/Nidhamtek/timeline-vlm)
|
|
268
|
+
|
|
269
|
+
## License
|
|
270
|
+
|
|
271
|
+
This project is licensed under the MIT License — see the [LICENSE](LICENSE) file for details.
|