event2vector 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- event2vector-0.1.0/MANIFEST.in +4 -0
- event2vector-0.1.0/PKG-INFO +262 -0
- event2vector-0.1.0/README.md +240 -0
- event2vector-0.1.0/event2vector/__init__.py +4 -0
- event2vector-0.1.0/event2vector/data.py +52 -0
- event2vector-0.1.0/event2vector/models.py +120 -0
- event2vector-0.1.0/event2vector.egg-info/PKG-INFO +262 -0
- event2vector-0.1.0/event2vector.egg-info/SOURCES.txt +12 -0
- event2vector-0.1.0/event2vector.egg-info/dependency_links.txt +1 -0
- event2vector-0.1.0/event2vector.egg-info/requires.txt +8 -0
- event2vector-0.1.0/event2vector.egg-info/top_level.txt +1 -0
- event2vector-0.1.0/pyproject.toml +33 -0
- event2vector-0.1.0/setup.cfg +4 -0
- event2vector-0.1.0/setup.py +28 -0
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: event2vector
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A geometric approach to learning composable representations of event sequences.
|
|
5
|
+
Author: Antonin Sulc
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/sulcantonin/event2vec_public
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.6
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: torch
|
|
14
|
+
Requires-Dist: numpy
|
|
15
|
+
Requires-Dist: tqdm
|
|
16
|
+
Requires-Dist: matplotlib
|
|
17
|
+
Requires-Dist: scikit-learn
|
|
18
|
+
Requires-Dist: openTSNE
|
|
19
|
+
Requires-Dist: gensim
|
|
20
|
+
Requires-Dist: seaborn
|
|
21
|
+
Dynamic: requires-python
|
|
22
|
+
|
|
23
|
+
<div align="center">
|
|
24
|
+
|
|
25
|
+
# Event2Vector
|
|
26
|
+
## A Geometric Approach to Learning Composable Representations of Event Sequences
|
|
27
|
+
|
|
28
|
+
[](https://badge.fury.io/py/event2vector)
|
|
29
|
+
[](https://opensource.org/licenses/MIT)
|
|
30
|
+
[](https://www.python.org/downloads/)
|
|
31
|
+
[](https://arxiv.org/abs/2509.12188)
|
|
32
|
+
|
|
33
|
+

|
|
34
|
+
|
|
35
|
+
</div>
|
|
36
|
+
|
|
37
|
+
## Overview
|
|
38
|
+
|
|
39
|
+
**Event2Vector** is a framework for learning representations of discrete event sequences. Inspired by the geometric structures found in neural representations, this model uses a simple, additive recurrent structure to create composable and interpretable embeddings.
|
|
40
|
+
|
|
41
|
+
## Key Concepts
|
|
42
|
+
* **Linear Additive Hypothesis**: The core idea behind Event2Vector is that the representation of an event sequence can be modeled as the vector sum of the embeddings of its individual events. This allows for intuitive vector arithmetic, enabling the composition and decomposition of event trajectories.
|
|
43
|
+
* **Euclidean and Hyperbolic Models**: Event2Vector is offered in two geometric variants:
|
|
44
|
+
* **Euclidean model**: Uses standard vector addition, providing a straightforward, flat geometry for event trajectories.
|
|
45
|
+
* **Hyperbolic model**: Employs Möbius addition, which is better suited for hierarchical data structures, as it can embed tree-like patterns with less distortion.
|
|
46
|
+
|
|
47
|
+
For more details, check *Sulc A., Event2Vector: A Geometric Approach to Learning Composable Representations of Event Sequences*
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
Install the package directly from PyPI:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install event2vector
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Or install from source:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git clone https://github.com/sulcantonin/event2vec_public.git
|
|
61
|
+
cd event2vec_public
|
|
62
|
+
pip install .
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
## Brown Example
|
|
67
|
+
After installation, you can run the Brown Part-of-Speech tagging example from the paper.
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
python3 -m experiments.prepare_brown_data
|
|
71
|
+
python3 -m experiments.train_brown_data
|
|
72
|
+
python3 -m experiments.visualize_brown_corpus
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Quickstart (tiny synthetic dataset)
|
|
76
|
+
|
|
77
|
+
The snippet below trains a small `EuclideanModel` on a toy event graph for a few epochs and prints an embedding for a short sequence. It runs in seconds on CPU.
|
|
78
|
+
|
|
79
|
+
We have 5 events: `START, A, B, C, END`, and we test whether `START + A + C ≈ C`. The example is self-contained for educational purposes, as the main interest is the loss function.
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
import random
|
|
83
|
+
import torch
|
|
84
|
+
import numpy as np
|
|
85
|
+
from sklearn.decomposition import PCA
|
|
86
|
+
import matplotlib.pyplot as plt
|
|
87
|
+
|
|
88
|
+
# Reproducibility: fix all relevant seeds so results are stable across runs
|
|
89
|
+
SEED = 42
|
|
90
|
+
random.seed(SEED) # Python's RNG (used by get_sequences and shuffling)
|
|
91
|
+
np.random.seed(SEED) # NumPy RNG (used by PCA and any NumPy ops)
|
|
92
|
+
torch.manual_seed(SEED) # Torch CPU RNG
|
|
93
|
+
if torch.cuda.is_available():
|
|
94
|
+
torch.cuda.manual_seed_all(SEED) # Torch CUDA RNG (all devices)
|
|
95
|
+
|
|
96
|
+
# Optional: prefer deterministic behavior in cuDNN (may have performance impact)
|
|
97
|
+
torch.backends.cudnn.deterministic = True
|
|
98
|
+
torch.backends.cudnn.benchmark = False
|
|
99
|
+
|
|
100
|
+
# Expected results (for the tiny toy graph START→A/B→C→END):
|
|
101
|
+
# - Training loss should decrease over epochs (with minor wobble due to randomness/dropout).
|
|
102
|
+
# - The printed sequence embedding for [START, A, C] is a single 8-D vector (shape (1, 8)).
|
|
103
|
+
# - Nearest tokens by cosine similarity to that sequence embedding should rank 'C' highest
|
|
104
|
+
# (since the encoder is additive and the last token is C), with 'END' somewhat aligned.
|
|
105
|
+
# - The decoder's top-1 next-event probability from that state should be 'END' with high
|
|
106
|
+
# confidence (≈0.9+) because the toy transitions force C→END.
|
|
107
|
+
# Visualization (below): a 2D PCA of token embeddings plus the sequence embedding should show
|
|
108
|
+
# 'SEQ(START-A-C)' lying near the point labeled 'C'.
|
|
109
|
+
|
|
110
|
+
from event2vector import EuclideanModel
|
|
111
|
+
from event2vector.data import get_sequences
|
|
112
|
+
|
|
113
|
+
# 1) Define a tiny state-transition toy dataset
|
|
114
|
+
# We model a simple Markovian process: START → (A or B) → C → END
|
|
115
|
+
# The model will learn to predict the next token and produce a sequence embedding
|
|
116
|
+
# by additive composition of token embeddings.
|
|
117
|
+
event_types = ['START', 'A', 'B', 'C', 'END']
|
|
118
|
+
event_transitions = {
|
|
119
|
+
'START': [('A', 0.5), ('B', 0.5)],
|
|
120
|
+
'A': [('C', 0.6), ('B', 0.4)],
|
|
121
|
+
'B': [('C', 0.7), ('A', 0.3)],
|
|
122
|
+
'C': [('END', 1.0)],
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
# 2) Generate synthetic sequences (reproducible thanks to the fixed seeds above)
|
|
126
|
+
# get_sequences returns: raw sequences, tensorized sequences, and vocab mappings.
|
|
127
|
+
_, processed_sequences, event_2_idx, _ = get_sequences(
|
|
128
|
+
event_types=event_types,
|
|
129
|
+
event_transitions=event_transitions,
|
|
130
|
+
initial='START',
|
|
131
|
+
terminal='END',
|
|
132
|
+
num_seq=200, # generate 200 short sequences
|
|
133
|
+
max_seq_length=6, # keep them small for speed
|
|
134
|
+
generate_new=True,
|
|
135
|
+
prefix='tiny_quickstart'
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
# 3) Initialize model and optimizer
|
|
139
|
+
# EuclideanModel composes sequence representations via additive updates.
|
|
140
|
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
|
141
|
+
embedding_dim = 8
|
|
142
|
+
model = EuclideanModel(num_event_types=len(event_types), embedding_dim=embedding_dim, dropout_p=0.1).to(device)
|
|
143
|
+
loss_fn = torch.nn.CrossEntropyLoss() # prediction loss (next-token)
|
|
144
|
+
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)
|
|
145
|
+
lambda_reconstruction = 0.2 # weight for reconstruction loss (h1 - e ≈ h_old)
|
|
146
|
+
lambda_consistency = 0.2 # weight for consistency loss (h1 ≈ h2)
|
|
147
|
+
mse_loss = torch.nn.MSELoss()
|
|
148
|
+
|
|
149
|
+
# 4) Minimal training loop
|
|
150
|
+
# Objective: next-token prediction with cross-entropy.
|
|
151
|
+
# We iterate tokens in each sequence and accumulate loss over (t → t+1) pairs.
|
|
152
|
+
model.train()
|
|
153
|
+
for epoch in range(128):
|
|
154
|
+
random.shuffle(processed_sequences) # shuffling is deterministic given the fixed Python seed
|
|
155
|
+
total_loss = 0.0
|
|
156
|
+
for seq_tensor in processed_sequences:
|
|
157
|
+
if len(seq_tensor) < 2:
|
|
158
|
+
continue
|
|
159
|
+
# h is the running sequence representation; encoder adds current token embedding
|
|
160
|
+
h = torch.zeros((1, embedding_dim), device=device)
|
|
161
|
+
seq_tensor = seq_tensor.to(device)
|
|
162
|
+
|
|
163
|
+
sequence_loss = 0.0
|
|
164
|
+
for t in range(len(seq_tensor) - 1):
|
|
165
|
+
x = seq_tensor[t].unsqueeze(0)
|
|
166
|
+
target = seq_tensor[t + 1].unsqueeze(0)
|
|
167
|
+
# Brown pipeline: two forward passes from the same h_old to compute a consistency loss
|
|
168
|
+
h_old = h.detach()
|
|
169
|
+
y1, h1, e_curr1 = model(x, h_old)
|
|
170
|
+
y2, h2, e_curr2 = model(x, h_old)
|
|
171
|
+
|
|
172
|
+
# Prediction: next-token cross-entropy on the first pass
|
|
173
|
+
prediction_loss = loss_fn(y1.view(1, -1), target)
|
|
174
|
+
|
|
175
|
+
# Reconstruction: undo the additive step (h1 - e ≈ h_old)
|
|
176
|
+
h_reconstructed = h1 - e_curr1
|
|
177
|
+
reconstruction_loss = mse_loss(h_reconstructed, h_old)
|
|
178
|
+
|
|
179
|
+
# Consistency: two identical passes should produce similar states
|
|
180
|
+
consistency_loss = mse_loss(h1, h2)
|
|
181
|
+
|
|
182
|
+
combined = (prediction_loss +
|
|
183
|
+
lambda_reconstruction * reconstruction_loss +
|
|
184
|
+
lambda_consistency * consistency_loss)
|
|
185
|
+
sequence_loss = sequence_loss + combined
|
|
186
|
+
|
|
187
|
+
# Advance hidden state using the first pass
|
|
188
|
+
h = h1.detach()
|
|
189
|
+
|
|
190
|
+
if (len(seq_tensor) - 1) > 0:
|
|
191
|
+
avg_seq_loss = sequence_loss / (len(seq_tensor) - 1)
|
|
192
|
+
optimizer.zero_grad()
|
|
193
|
+
avg_seq_loss.backward()
|
|
194
|
+
optimizer.step()
|
|
195
|
+
total_loss += avg_seq_loss.item()
|
|
196
|
+
|
|
197
|
+
print(f"epoch {epoch + 1}: loss={total_loss / max(1, len(processed_sequences)):.4f}")
|
|
198
|
+
|
|
199
|
+
# 5) Use the learned representation for a short sequence
|
|
200
|
+
# We encode [START, A, C] step-by-step; the final h is the sequence embedding.
|
|
201
|
+
model.eval()
|
|
202
|
+
with torch.no_grad():
|
|
203
|
+
seq = torch.tensor([
|
|
204
|
+
event_2_idx['START'], event_2_idx['A'], event_2_idx['C']
|
|
205
|
+
], device=device)
|
|
206
|
+
h = torch.zeros((1, embedding_dim), device=device)
|
|
207
|
+
for idx in seq:
|
|
208
|
+
_, h, _ = model(idx.unsqueeze(0), h)
|
|
209
|
+
print('Sequence embedding (START-A-C):', h.cpu().numpy())
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
with torch.no_grad():
|
|
213
|
+
# 6) Qualitative check: nearest tokens to the sequence embedding by cosine similarity
|
|
214
|
+
# Expect 'C' to be nearest (last consumed token) and 'END' reasonably aligned.
|
|
215
|
+
emb = model.embedding.weight.detach() # [V, 8]
|
|
216
|
+
h_norm = torch.nn.functional.normalize(h, dim=1)
|
|
217
|
+
emb_norm = torch.nn.functional.normalize(emb, dim=1)
|
|
218
|
+
sims = (emb_norm @ h_norm.squeeze(0))
|
|
219
|
+
top_sim = torch.topk(sims, k=3)
|
|
220
|
+
print('Nearest tokens by cosine:', [ (list(event_2_idx.keys())[i], float(sims[i])) for i in top_sim.indices ])
|
|
221
|
+
|
|
222
|
+
with torch.no_grad():
|
|
223
|
+
# 7) Next-event distribution from the current state
|
|
224
|
+
# Expect 'END' to be top-1 with high probability because C→END with p=1.0
|
|
225
|
+
logits = model.decoder(h) # [1, V]
|
|
226
|
+
probs = torch.softmax(logits, dim=-1).squeeze(0)
|
|
227
|
+
top = torch.topk(probs, k=3)
|
|
228
|
+
inv = {v:k for k,v in event_2_idx.items()}
|
|
229
|
+
print('Top-3 next events:', [ (inv[i.item()], float(probs[i])) for i in top.indices ])
|
|
230
|
+
|
|
231
|
+
# 8) Visualization: PCA of token embeddings + sequence embedding
|
|
232
|
+
# Expect the red star (sequence) to lie near the point labeled 'C'.
|
|
233
|
+
with torch.no_grad():
|
|
234
|
+
token_emb = model.embedding.weight.detach().cpu().numpy() # [V, 8]
|
|
235
|
+
seq_emb = h.detach().cpu().numpy() # [1, 8]
|
|
236
|
+
X = np.vstack([token_emb, seq_emb])
|
|
237
|
+
pca = PCA(n_components=2, random_state=SEED)
|
|
238
|
+
X2 = pca.fit_transform(X)
|
|
239
|
+
tokens2, seq2 = X2[:-1], X2[-1]
|
|
240
|
+
|
|
241
|
+
inv = {v:k for k,v in event_2_idx.items()}
|
|
242
|
+
plt.figure(figsize=(6, 6))
|
|
243
|
+
plt.scatter(tokens2[:, 0], tokens2[:, 1], c='gray', label='tokens')
|
|
244
|
+
for i, (x, y) in enumerate(tokens2):
|
|
245
|
+
plt.text(x + 0.02, y + 0.02, inv[i], fontsize=9)
|
|
246
|
+
plt.scatter([seq2[0]], [seq2[1]], c='red', marker='*', s=160, label='SEQ(START-A-C)')
|
|
247
|
+
plt.title('PCA: token embeddings + sequence embedding (expect SEQ near C)')
|
|
248
|
+
plt.legend(loc='best')
|
|
249
|
+
plt.tight_layout()
|
|
250
|
+
plt.show()
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
## References
|
|
254
|
+
For citations, please use the following BibTeX entry.
|
|
255
|
+
```bibtex
|
|
256
|
+
@article{sulc2025event2vec,
|
|
257
|
+
title={Event2Vector: A Geometric Approach to Learning Composable Representations of Event Sequences},
|
|
258
|
+
author={Sulc, Antonin},
|
|
259
|
+
journal={arXiv preprint arXiv:2509.12188},
|
|
260
|
+
year={2025}
|
|
261
|
+
}
|
|
262
|
+
```
|
|
@@ -0,0 +1,240 @@
|
|
|
1
|
+
<div align="center">
|
|
2
|
+
|
|
3
|
+
# Event2Vector
|
|
4
|
+
## A Geometric Approach to Learning Composable Representations of Event Sequences
|
|
5
|
+
|
|
6
|
+
[](https://badge.fury.io/py/event2vector)
|
|
7
|
+
[](https://opensource.org/licenses/MIT)
|
|
8
|
+
[](https://www.python.org/downloads/)
|
|
9
|
+
[](https://arxiv.org/abs/2509.12188)
|
|
10
|
+
|
|
11
|
+

|
|
12
|
+
|
|
13
|
+
</div>
|
|
14
|
+
|
|
15
|
+
## Overview
|
|
16
|
+
|
|
17
|
+
**Event2Vector** is a framework for learning representations of discrete event sequences. Inspired by the geometric structures found in neural representations, this model uses a simple, additive recurrent structure to create composable and interpretable embeddings.
|
|
18
|
+
|
|
19
|
+
## Key Concepts
|
|
20
|
+
* **Linear Additive Hypothesis**: The core idea behind Event2Vector is that the representation of an event sequence can be modeled as the vector sum of the embeddings of its individual events. This allows for intuitive vector arithmetic, enabling the composition and decomposition of event trajectories.
|
|
21
|
+
* **Euclidean and Hyperbolic Models**: Event2Vector is offered in two geometric variants:
|
|
22
|
+
* **Euclidean model**: Uses standard vector addition, providing a straightforward, flat geometry for event trajectories.
|
|
23
|
+
* **Hyperbolic model**: Employs Möbius addition, which is better suited for hierarchical data structures, as it can embed tree-like patterns with less distortion.
|
|
24
|
+
|
|
25
|
+
For more details, check *Sulc A., Event2Vector: A Geometric Approach to Learning Composable Representations of Event Sequences*
|
|
26
|
+
|
|
27
|
+
## Installation
|
|
28
|
+
|
|
29
|
+
Install the package directly from PyPI:
|
|
30
|
+
|
|
31
|
+
```bash
|
|
32
|
+
pip install event2vector
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
Or install from source:
|
|
36
|
+
|
|
37
|
+
```bash
|
|
38
|
+
git clone https://github.com/sulcantonin/event2vec_public.git
|
|
39
|
+
cd event2vec_public
|
|
40
|
+
pip install .
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
## Brown Example
|
|
45
|
+
After installation, you can run the Brown Part-of-Speech tagging example from the paper.
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
python3 -m experiments.prepare_brown_data
|
|
49
|
+
python3 -m experiments.train_brown_data
|
|
50
|
+
python3 -m experiments.visualize_brown_corpus
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
## Quickstart (tiny synthetic dataset)
|
|
54
|
+
|
|
55
|
+
The snippet below trains a small `EuclideanModel` on a toy event graph for a few epochs and prints an embedding for a short sequence. It runs in seconds on CPU.
|
|
56
|
+
|
|
57
|
+
We have 5 events: `START, A, B, C, END`, and we test whether `START + A + C ≈ C`. The example is self-contained for educational purposes, as the main interest is the loss function.
|
|
58
|
+
|
|
59
|
+
```python
|
|
60
|
+
import random
|
|
61
|
+
import torch
|
|
62
|
+
import numpy as np
|
|
63
|
+
from sklearn.decomposition import PCA
|
|
64
|
+
import matplotlib.pyplot as plt
|
|
65
|
+
|
|
66
|
+
# Reproducibility: fix all relevant seeds so results are stable across runs
|
|
67
|
+
SEED = 42
|
|
68
|
+
random.seed(SEED) # Python's RNG (used by get_sequences and shuffling)
|
|
69
|
+
np.random.seed(SEED) # NumPy RNG (used by PCA and any NumPy ops)
|
|
70
|
+
torch.manual_seed(SEED) # Torch CPU RNG
|
|
71
|
+
if torch.cuda.is_available():
|
|
72
|
+
torch.cuda.manual_seed_all(SEED) # Torch CUDA RNG (all devices)
|
|
73
|
+
|
|
74
|
+
# Optional: prefer deterministic behavior in cuDNN (may have performance impact)
|
|
75
|
+
torch.backends.cudnn.deterministic = True
|
|
76
|
+
torch.backends.cudnn.benchmark = False
|
|
77
|
+
|
|
78
|
+
# Expected results (for the tiny toy graph START→A/B→C→END):
|
|
79
|
+
# - Training loss should decrease over epochs (with minor wobble due to randomness/dropout).
|
|
80
|
+
# - The printed sequence embedding for [START, A, C] is a single 8-D vector (shape (1, 8)).
|
|
81
|
+
# - Nearest tokens by cosine similarity to that sequence embedding should rank 'C' highest
|
|
82
|
+
# (since the encoder is additive and the last token is C), with 'END' somewhat aligned.
|
|
83
|
+
# - The decoder's top-1 next-event probability from that state should be 'END' with high
|
|
84
|
+
# confidence (≈0.9+) because the toy transitions force C→END.
|
|
85
|
+
# Visualization (below): a 2D PCA of token embeddings plus the sequence embedding should show
|
|
86
|
+
# 'SEQ(START-A-C)' lying near the point labeled 'C'.
|
|
87
|
+
|
|
88
|
+
from event2vector import EuclideanModel
|
|
89
|
+
from event2vector.data import get_sequences
|
|
90
|
+
|
|
91
|
+
# 1) Define a tiny state-transition toy dataset
|
|
92
|
+
# We model a simple Markovian process: START → (A or B) → C → END
|
|
93
|
+
# The model will learn to predict the next token and produce a sequence embedding
|
|
94
|
+
# by additive composition of token embeddings.
|
|
95
|
+
event_types = ['START', 'A', 'B', 'C', 'END']
|
|
96
|
+
event_transitions = {
|
|
97
|
+
'START': [('A', 0.5), ('B', 0.5)],
|
|
98
|
+
'A': [('C', 0.6), ('B', 0.4)],
|
|
99
|
+
'B': [('C', 0.7), ('A', 0.3)],
|
|
100
|
+
'C': [('END', 1.0)],
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
# 2) Generate synthetic sequences (reproducible thanks to the fixed seeds above)
|
|
104
|
+
# get_sequences returns: raw sequences, tensorized sequences, and vocab mappings.
|
|
105
|
+
_, processed_sequences, event_2_idx, _ = get_sequences(
|
|
106
|
+
event_types=event_types,
|
|
107
|
+
event_transitions=event_transitions,
|
|
108
|
+
initial='START',
|
|
109
|
+
terminal='END',
|
|
110
|
+
num_seq=200, # generate 200 short sequences
|
|
111
|
+
max_seq_length=6, # keep them small for speed
|
|
112
|
+
generate_new=True,
|
|
113
|
+
prefix='tiny_quickstart'
|
|
114
|
+
)
|
|
115
|
+
|
|
116
|
+
# 3) Initialize model and optimizer
|
|
117
|
+
# EuclideanModel composes sequence representations via additive updates.
|
|
118
|
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
|
119
|
+
embedding_dim = 8
|
|
120
|
+
model = EuclideanModel(num_event_types=len(event_types), embedding_dim=embedding_dim, dropout_p=0.1).to(device)
|
|
121
|
+
loss_fn = torch.nn.CrossEntropyLoss() # prediction loss (next-token)
|
|
122
|
+
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)
|
|
123
|
+
lambda_reconstruction = 0.2 # weight for reconstruction loss (h1 - e ≈ h_old)
|
|
124
|
+
lambda_consistency = 0.2 # weight for consistency loss (h1 ≈ h2)
|
|
125
|
+
mse_loss = torch.nn.MSELoss()
|
|
126
|
+
|
|
127
|
+
# 4) Minimal training loop
|
|
128
|
+
# Objective: next-token prediction with cross-entropy.
|
|
129
|
+
# We iterate tokens in each sequence and accumulate loss over (t → t+1) pairs.
|
|
130
|
+
model.train()
|
|
131
|
+
for epoch in range(128):
|
|
132
|
+
random.shuffle(processed_sequences) # shuffling is deterministic given the fixed Python seed
|
|
133
|
+
total_loss = 0.0
|
|
134
|
+
for seq_tensor in processed_sequences:
|
|
135
|
+
if len(seq_tensor) < 2:
|
|
136
|
+
continue
|
|
137
|
+
# h is the running sequence representation; encoder adds current token embedding
|
|
138
|
+
h = torch.zeros((1, embedding_dim), device=device)
|
|
139
|
+
seq_tensor = seq_tensor.to(device)
|
|
140
|
+
|
|
141
|
+
sequence_loss = 0.0
|
|
142
|
+
for t in range(len(seq_tensor) - 1):
|
|
143
|
+
x = seq_tensor[t].unsqueeze(0)
|
|
144
|
+
target = seq_tensor[t + 1].unsqueeze(0)
|
|
145
|
+
# Brown pipeline: two forward passes from the same h_old to compute a consistency loss
|
|
146
|
+
h_old = h.detach()
|
|
147
|
+
y1, h1, e_curr1 = model(x, h_old)
|
|
148
|
+
y2, h2, e_curr2 = model(x, h_old)
|
|
149
|
+
|
|
150
|
+
# Prediction: next-token cross-entropy on the first pass
|
|
151
|
+
prediction_loss = loss_fn(y1.view(1, -1), target)
|
|
152
|
+
|
|
153
|
+
# Reconstruction: undo the additive step (h1 - e ≈ h_old)
|
|
154
|
+
h_reconstructed = h1 - e_curr1
|
|
155
|
+
reconstruction_loss = mse_loss(h_reconstructed, h_old)
|
|
156
|
+
|
|
157
|
+
# Consistency: two identical passes should produce similar states
|
|
158
|
+
consistency_loss = mse_loss(h1, h2)
|
|
159
|
+
|
|
160
|
+
combined = (prediction_loss +
|
|
161
|
+
lambda_reconstruction * reconstruction_loss +
|
|
162
|
+
lambda_consistency * consistency_loss)
|
|
163
|
+
sequence_loss = sequence_loss + combined
|
|
164
|
+
|
|
165
|
+
# Advance hidden state using the first pass
|
|
166
|
+
h = h1.detach()
|
|
167
|
+
|
|
168
|
+
if (len(seq_tensor) - 1) > 0:
|
|
169
|
+
avg_seq_loss = sequence_loss / (len(seq_tensor) - 1)
|
|
170
|
+
optimizer.zero_grad()
|
|
171
|
+
avg_seq_loss.backward()
|
|
172
|
+
optimizer.step()
|
|
173
|
+
total_loss += avg_seq_loss.item()
|
|
174
|
+
|
|
175
|
+
print(f"epoch {epoch + 1}: loss={total_loss / max(1, len(processed_sequences)):.4f}")
|
|
176
|
+
|
|
177
|
+
# 5) Use the learned representation for a short sequence
|
|
178
|
+
# We encode [START, A, C] step-by-step; the final h is the sequence embedding.
|
|
179
|
+
model.eval()
|
|
180
|
+
with torch.no_grad():
|
|
181
|
+
seq = torch.tensor([
|
|
182
|
+
event_2_idx['START'], event_2_idx['A'], event_2_idx['C']
|
|
183
|
+
], device=device)
|
|
184
|
+
h = torch.zeros((1, embedding_dim), device=device)
|
|
185
|
+
for idx in seq:
|
|
186
|
+
_, h, _ = model(idx.unsqueeze(0), h)
|
|
187
|
+
print('Sequence embedding (START-A-C):', h.cpu().numpy())
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
with torch.no_grad():
|
|
191
|
+
# 6) Qualitative check: nearest tokens to the sequence embedding by cosine similarity
|
|
192
|
+
# Expect 'C' to be nearest (last consumed token) and 'END' reasonably aligned.
|
|
193
|
+
emb = model.embedding.weight.detach() # [V, 8]
|
|
194
|
+
h_norm = torch.nn.functional.normalize(h, dim=1)
|
|
195
|
+
emb_norm = torch.nn.functional.normalize(emb, dim=1)
|
|
196
|
+
sims = (emb_norm @ h_norm.squeeze(0))
|
|
197
|
+
top_sim = torch.topk(sims, k=3)
|
|
198
|
+
print('Nearest tokens by cosine:', [ (list(event_2_idx.keys())[i], float(sims[i])) for i in top_sim.indices ])
|
|
199
|
+
|
|
200
|
+
with torch.no_grad():
|
|
201
|
+
# 7) Next-event distribution from the current state
|
|
202
|
+
# Expect 'END' to be top-1 with high probability because C→END with p=1.0
|
|
203
|
+
logits = model.decoder(h) # [1, V]
|
|
204
|
+
probs = torch.softmax(logits, dim=-1).squeeze(0)
|
|
205
|
+
top = torch.topk(probs, k=3)
|
|
206
|
+
inv = {v:k for k,v in event_2_idx.items()}
|
|
207
|
+
print('Top-3 next events:', [ (inv[i.item()], float(probs[i])) for i in top.indices ])
|
|
208
|
+
|
|
209
|
+
# 8) Visualization: PCA of token embeddings + sequence embedding
|
|
210
|
+
# Expect the red star (sequence) to lie near the point labeled 'C'.
|
|
211
|
+
with torch.no_grad():
|
|
212
|
+
token_emb = model.embedding.weight.detach().cpu().numpy() # [V, 8]
|
|
213
|
+
seq_emb = h.detach().cpu().numpy() # [1, 8]
|
|
214
|
+
X = np.vstack([token_emb, seq_emb])
|
|
215
|
+
pca = PCA(n_components=2, random_state=SEED)
|
|
216
|
+
X2 = pca.fit_transform(X)
|
|
217
|
+
tokens2, seq2 = X2[:-1], X2[-1]
|
|
218
|
+
|
|
219
|
+
inv = {v:k for k,v in event_2_idx.items()}
|
|
220
|
+
plt.figure(figsize=(6, 6))
|
|
221
|
+
plt.scatter(tokens2[:, 0], tokens2[:, 1], c='gray', label='tokens')
|
|
222
|
+
for i, (x, y) in enumerate(tokens2):
|
|
223
|
+
plt.text(x + 0.02, y + 0.02, inv[i], fontsize=9)
|
|
224
|
+
plt.scatter([seq2[0]], [seq2[1]], c='red', marker='*', s=160, label='SEQ(START-A-C)')
|
|
225
|
+
plt.title('PCA: token embeddings + sequence embedding (expect SEQ near C)')
|
|
226
|
+
plt.legend(loc='best')
|
|
227
|
+
plt.tight_layout()
|
|
228
|
+
plt.show()
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
## References
|
|
232
|
+
For citations, please use the following BibTeX entry.
|
|
233
|
+
```bibtex
|
|
234
|
+
@article{sulc2025event2vec,
|
|
235
|
+
title={Event2Vector: A Geometric Approach to Learning Composable Representations of Event Sequences},
|
|
236
|
+
author={Sulc, Antonin},
|
|
237
|
+
journal={arXiv preprint arXiv:2509.12188},
|
|
238
|
+
year={2025}
|
|
239
|
+
}
|
|
240
|
+
```
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import random
|
|
2
|
+
import torch
|
|
3
|
+
import pickle
|
|
4
|
+
|
|
5
|
+
def generate_sequences(event_types, event_transitions, initial, terminal, num_seq, max_length):
    """Generate synthetic event sequences via a guided random walk.

    Starting from ``initial``, each step either follows the weighted
    transitions in ``event_transitions`` or, with probability 0.1 (and
    always when the current event has no outgoing transitions), jumps
    uniformly to a different event. Every returned sequence ends with
    ``terminal``; walks that reach ``max_length`` without hitting it get
    the terminal appended, so a sequence may be ``max_length + 1`` long.
    """
    all_walks = []
    for _ in range(num_seq):
        walk = [initial]
        while walk[-1] != terminal and len(walk) < max_length:
            state = walk[-1]
            outgoing = event_transitions.get(state, [])
            # 10% exploration noise; also the only option at a dead-end state.
            if not outgoing or random.random() < 0.1:
                step = random.choice([e for e in event_types if e != state])
            else:
                candidates, weights = zip(*outgoing)
                step = random.choices(candidates, weights=weights, k=1)[0]
            walk.append(step)
        if walk[-1] != terminal:
            walk.append(terminal)
        all_walks.append(walk)
    return all_walks
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def get_sequences(event_types, event_transitions, initial, terminal, num_seq, max_seq_length, generate_new, prefix):
    """Build (or reload) a synthetic training corpus.

    When ``generate_new`` is true, fresh sequences are sampled with
    ``generate_sequences``, tensorised using a vocabulary built from
    ``event_types``, and cached to ``{prefix}_training_data.pkl``.
    Otherwise the cached pickle is loaded instead.

    NOTE(review): ``pickle.load`` can execute arbitrary code — only
    reload cache files this function itself produced.

    Returns ``(sequences, processed_sequences, event_2_idx, idx_2_event)``.
    """
    cache_path = f'{prefix}_training_data.pkl'
    if generate_new:
        event_2_idx = {event: idx for idx, event in enumerate(event_types)}
        idx_2_event = dict(enumerate(event_types))
        sequences = generate_sequences(event_types, event_transitions, initial, terminal, num_seq, max_seq_length)
        processed_sequences = [
            torch.tensor([event_2_idx[event] for event in seq], dtype=torch.long)
            for seq in sequences
        ]
        payload = {
            'sequences': sequences,
            'processed_sequences': processed_sequences,
            'event_2_idx': event_2_idx,
            'idx_2_event': idx_2_event,
        }
        with open(cache_path, 'wb') as fh:
            pickle.dump(payload, fh)
    else:
        with open(cache_path, 'rb') as fh:
            payload = pickle.load(fh)
        sequences = payload['sequences']
        processed_sequences = payload['processed_sequences']
        event_2_idx = payload['event_2_idx']
        idx_2_event = payload['idx_2_event']
    return sequences, processed_sequences, event_2_idx, idx_2_event
|
|
52
|
+
|
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
import torch
|
|
2
|
+
import torch.nn as nn
|
|
3
|
+
|
|
4
|
+
EPS = 1e-5 # Small constant for numerical stability of hyperbolic space
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class EuclideanModel(nn.Module):
    """Recurrent event-sequence encoder operating in flat Euclidean space.

    The hidden state is updated by plain vector addition, so (up to
    dropout and norm clipping) a sequence representation is the sum of
    its constituent event embeddings.
    """

    def __init__(self, num_event_types, embedding_dim, dropout_p=0.2, max_norm=10.0):
        super(EuclideanModel, self).__init__()
        self.embedding_dim = embedding_dim
        self.max_norm = max_norm
        self.embedding = nn.Embedding(num_event_types, embedding_dim)
        self.dropout = nn.Dropout(p=dropout_p)
        self.linear_dec = nn.Linear(embedding_dim, num_event_types)

    def encoder(self, x, h=None, clipping=True):
        """Add the embedding of ``x`` onto state ``h``; optionally clip the L2 norm."""
        if h is None:
            h = torch.zeros((x.shape[0], self.embedding_dim), device=x.device)
        emb = self.embedding(x)
        state = self.dropout(emb + h)
        if clipping:
            # Rescale any state whose L2 norm exceeds max_norm back to the ball surface.
            state_norm = torch.norm(state, p=2, dim=1, keepdim=True)
            scale = (self.max_norm / (state_norm + 1e-6)).clamp(max=1.0)
            state = state * scale
        return state, emb

    def decoder(self, h):
        """Map a hidden state to next-event logits."""
        return self.linear_dec(h)

    def forward(self, x, h=None):
        """Run one step; returns ``(logits, new_state, event_embedding)``."""
        new_state, emb = self.encoder(x, h)
        return self.decoder(new_state), new_state, emb
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
class HyperbolicUtils:
    """Static helpers for operations in the Poincaré ball model.

    All methods take the curvature ``c`` explicitly; the module-level
    ``EPS`` constant guards divisions and the open-ball boundary.
    """

    @staticmethod
    def mobius_add(x, y, c):
        """Möbius addition x ⊕_c y, the hyperbolic analogue of vector addition."""
        xx = torch.sum(x * x, dim=-1, keepdim=True)
        yy = torch.sum(y * y, dim=-1, keepdim=True)
        xy = torch.sum(x * y, dim=-1, keepdim=True)
        numerator = (1 + 2 * c * xy + c * yy) * x + (1 - c * xx) * y
        denominator = 1 + 2 * c * xy + c**2 * xx * yy
        return numerator / (denominator + EPS)

    @staticmethod
    def log_map_origin(y, c):
        """Logarithmic map at the origin: pull ``y`` into the tangent space."""
        root_c = c**0.5
        norm_y = torch.norm(y, p=2, dim=-1, keepdim=True).clamp_min(EPS)
        z = (root_c * norm_y).clamp(max=1.0 - EPS)
        # artanh(z) written out explicitly, with z clamped below 1 for stability.
        artanh = 0.5 * torch.log((1 + z) / (1 - z))
        return (1. / root_c) * artanh * (y / norm_y)

    @staticmethod
    def poincare_dist_sq(x, y, c):
        """Squared Poincaré distance between ``x`` and ``y``."""
        root_c = c**0.5
        diff = HyperbolicUtils.mobius_add(-x, y, c)
        diff_norm = torch.norm(diff, p=2, dim=-1, keepdim=True).clamp_min(EPS)
        z = (root_c * diff_norm).clamp(max=1.0 - EPS)
        artanh = 0.5 * torch.log((1 + z) / (1 - z))
        return ((2. / root_c) * artanh).pow(2)

    @staticmethod
    def project_to_ball(x, c):
        """Clamp points back inside the open ball of radius 1/sqrt(c)."""
        radius = 1.0 / (c**0.5)
        x_norm = torch.norm(x, p=2, dim=-1, keepdim=True)
        outside = x_norm >= radius
        return torch.where(outside, x / (x_norm + EPS) * (radius - EPS), x)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
class HyperbolicModel(nn.Module):
    """
    A recurrent model for learning event sequence representations in Hyperbolic space.

    The hidden state lives on the Poincaré ball of curvature ``-c`` and is
    updated by Möbius-adding each event embedding. This model is well-suited
    for hierarchical data, as the geometry of hyperbolic space can embed
    tree-like structures with low distortion.
    """

    def __init__(self, num_event_types, embedding_dim, dropout_p, c=1.0):
        """
        Args:
            num_event_types: size of the event vocabulary.
            embedding_dim: dimensionality of event embeddings and the state.
            dropout_p: dropout probability applied to event embeddings.
            c: curvature parameter of the Poincaré ball (default 1.0).
        """
        # Python-3 zero-argument super(); equivalent to the legacy
        # super(HyperbolicModel, self) form but not copy-paste fragile.
        super().__init__()
        self.embedding_dim = embedding_dim
        self.c = c
        self.embedding = nn.Embedding(num_event_types, embedding_dim)
        # Tiny uniform init keeps the initial embeddings well inside the ball.
        self.embedding.weight.data.uniform_(-0.001, 0.001)
        self.dropout = nn.Dropout(p=dropout_p)
        # Decodes a tangent-space state vector into next-event logits.
        self.linear_dec = nn.Linear(embedding_dim, num_event_types)

    def project_embeddings(self):
        """Clip embedding weights back inside the ball.

        Intended to be called after optimizer steps, which are unaware of the
        ball constraint and may push weights onto or past the boundary.
        """
        with torch.no_grad():
            self.embedding.weight.data = HyperbolicUtils.project_to_ball(
                self.embedding.weight.data, self.c
            )

    def encoder(self, x, h):
        """Möbius-add the (dropout-regularized) embedding of ``x`` onto ``h``.

        Returns:
            Tuple of (projected next state, raw pre-dropout embedding).
        """
        e = self.embedding(x)
        e_dropped = self.dropout(e)
        h_next = HyperbolicUtils.mobius_add(h, e_dropped, self.c)
        # Numerical drift can land the state on/outside the ball; project back.
        h_next = HyperbolicUtils.project_to_ball(h_next, self.c)
        return h_next, e

    def decoder(self, h):
        """Map state ``h`` to next-event logits via the tangent space at the origin."""
        h_tangent = HyperbolicUtils.log_map_origin(h, self.c)
        return self.linear_dec(h_tangent)

    def forward(self, x, h=None):
        """One recurrent step from state ``h`` (the origin when ``h`` is None).

        Args:
            x: batch of event indices, shape (batch,).
            h: previous state of shape (batch, embedding_dim), or None.

        Returns:
            Tuple of (next-event logits, updated state, event embedding).
        """
        if h is None:
            h = torch.zeros((x.shape[0], self.embedding_dim), device=x.device)
        h_next, e = self.encoder(x, h)
        return self.decoder(h_next), h_next, e
|
|
120
|
+
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: event2vector
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A geometric approach to learning composable representations of event sequences.
|
|
5
|
+
Author: Antonin Sulc
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/sulcantonin/event2vec_public
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.6
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
Requires-Dist: torch
|
|
14
|
+
Requires-Dist: numpy
|
|
15
|
+
Requires-Dist: tqdm
|
|
16
|
+
Requires-Dist: matplotlib
|
|
17
|
+
Requires-Dist: scikit-learn
|
|
18
|
+
Requires-Dist: openTSNE
|
|
19
|
+
Requires-Dist: gensim
|
|
20
|
+
Requires-Dist: seaborn
|
|
21
|
+
Dynamic: requires-python
|
|
22
|
+
|
|
23
|
+
<div align="center">
|
|
24
|
+
|
|
25
|
+
# Event2Vector
|
|
26
|
+
## A Geometric Approach to Learning Composable Representations of Event Sequences
|
|
27
|
+
|
|
28
|
+
[](https://badge.fury.io/py/event2vector)
|
|
29
|
+
[](https://opensource.org/licenses/MIT)
|
|
30
|
+
[](https://www.python.org/downloads/)
|
|
31
|
+
[](https://arxiv.org/abs/2509.12188)
|
|
32
|
+
|
|
33
|
+

|
|
34
|
+
|
|
35
|
+
</div>
|
|
36
|
+
|
|
37
|
+
## Overview
|
|
38
|
+
|
|
39
|
+
**Event2Vector** is a framework for learning representations of discrete event sequences. Inspired by the geometric structures found in neural representations, this model uses a simple, additive recurrent structure to create composable and interpretable embeddings.
|
|
40
|
+
|
|
41
|
+
## Key Concepts
|
|
42
|
+
* **Linear Additive Hypothesis**: The core idea behind Event2Vector is that the representation of an event sequence can be modeled as the vector sum of the embeddings of its individual events. This allows for intuitive vector arithmetic, enabling the composition and decomposition of event trajectories.
|
|
43
|
+
* **Euclidean and Hyperbolic Models**: Event2Vector is offered in two geometric variants:
|
|
44
|
+
* **Euclidean model**: Uses standard vector addition, providing a straightforward, flat geometry for event trajectories.
|
|
45
|
+
* **Hyperbolic model**: Employs Möbius addition, which is better suited for hierarchical data structures, as it can embed tree-like patterns with less distortion.
|
|
46
|
+
|
|
47
|
+
For more details, check *Sulc A., Event2Vector: A Geometric Approach to Learning Composable Representations of Event Sequences*
|
|
48
|
+
|
|
49
|
+
## Installation
|
|
50
|
+
|
|
51
|
+
Install the package directly from PyPI:
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install event2vector
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
Or install from source:
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
git clone https://github.com/sulcantonin/event2vec_public.git
|
|
61
|
+
cd event2vec_public
|
|
62
|
+
pip install .
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
## Brown Example
|
|
67
|
+
After installation, you can run the Brown Part-of-Speech tagging example from the paper.
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
python3 -m experiments.prepare_brown_data
|
|
71
|
+
python3 -m experiments.train_brown_data
|
|
72
|
+
python3 -m experiments.visualize_brown_corpus
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
## Quickstart (tiny synthetic dataset)
|
|
76
|
+
|
|
77
|
+
The snippet below trains a small `EuclideanModel` on a toy event graph for a few epochs and prints an embedding for a short sequence. It runs in seconds on CPU.
|
|
78
|
+
|
|
79
|
+
We have 5 events: `START, A, B, C, END`, and we test whether the additive composition `START + A + C` lands close to `C`. The example is self-contained for educational purposes, as the main point of interest is the loss function.
|
|
80
|
+
|
|
81
|
+
```python
|
|
82
|
+
import random
|
|
83
|
+
import torch
|
|
84
|
+
import numpy as np
|
|
85
|
+
from sklearn.decomposition import PCA
|
|
86
|
+
import matplotlib.pyplot as plt
|
|
87
|
+
|
|
88
|
+
# Reproducibility: fix all relevant seeds so results are stable across runs
|
|
89
|
+
SEED = 42
|
|
90
|
+
random.seed(SEED) # Python's RNG (used by get_sequences and shuffling)
|
|
91
|
+
np.random.seed(SEED) # NumPy RNG (used by PCA and any NumPy ops)
|
|
92
|
+
torch.manual_seed(SEED) # Torch CPU RNG
|
|
93
|
+
if torch.cuda.is_available():
|
|
94
|
+
torch.cuda.manual_seed_all(SEED) # Torch CUDA RNG (all devices)
|
|
95
|
+
|
|
96
|
+
# Optional: prefer deterministic behavior in cuDNN (may have performance impact)
|
|
97
|
+
torch.backends.cudnn.deterministic = True
|
|
98
|
+
torch.backends.cudnn.benchmark = False
|
|
99
|
+
|
|
100
|
+
# Expected results (for the tiny toy graph START→A/B→C→END):
|
|
101
|
+
# - Training loss should decrease over epochs (with minor wobble due to randomness/dropout).
|
|
102
|
+
# - The printed sequence embedding for [START, A, C] is a single 8-D vector (shape (1, 8)).
|
|
103
|
+
# - Nearest tokens by cosine similarity to that sequence embedding should rank 'C' highest
|
|
104
|
+
# (since the encoder is additive and the last token is C), with 'END' somewhat aligned.
|
|
105
|
+
# - The decoder's top-1 next-event probability from that state should be 'END' with high
|
|
106
|
+
# confidence (≈0.9+) because the toy transitions force C→END.
|
|
107
|
+
# Visualization (below): a 2D PCA of token embeddings plus the sequence embedding should show
|
|
108
|
+
# 'SEQ(START-A-C)' lying near the point labeled 'C'.
|
|
109
|
+
|
|
110
|
+
from event2vector import EuclideanModel
|
|
111
|
+
from event2vector.data import get_sequences
|
|
112
|
+
|
|
113
|
+
# 1) Define a tiny state-transition toy dataset
|
|
114
|
+
# We model a simple Markovian process: START → (A or B) → C → END
|
|
115
|
+
# The model will learn to predict the next token and produce a sequence embedding
|
|
116
|
+
# by additive composition of token embeddings.
|
|
117
|
+
event_types = ['START', 'A', 'B', 'C', 'END']
|
|
118
|
+
event_transitions = {
|
|
119
|
+
'START': [('A', 0.5), ('B', 0.5)],
|
|
120
|
+
'A': [('C', 0.6), ('B', 0.4)],
|
|
121
|
+
'B': [('C', 0.7), ('A', 0.3)],
|
|
122
|
+
'C': [('END', 1.0)],
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
# 2) Generate synthetic sequences (reproducible thanks to the fixed seeds above)
|
|
126
|
+
# get_sequences returns: raw sequences, tensorized sequences, and vocab mappings.
|
|
127
|
+
_, processed_sequences, event_2_idx, _ = get_sequences(
|
|
128
|
+
event_types=event_types,
|
|
129
|
+
event_transitions=event_transitions,
|
|
130
|
+
initial='START',
|
|
131
|
+
terminal='END',
|
|
132
|
+
num_seq=200, # generate 200 short sequences
|
|
133
|
+
max_seq_length=6, # keep them small for speed
|
|
134
|
+
generate_new=True,
|
|
135
|
+
prefix='tiny_quickstart'
|
|
136
|
+
)
|
|
137
|
+
|
|
138
|
+
# 3) Initialize model and optimizer
|
|
139
|
+
# EuclideanModel composes sequence representations via additive updates.
|
|
140
|
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
|
141
|
+
embedding_dim = 8
|
|
142
|
+
model = EuclideanModel(num_event_types=len(event_types), embedding_dim=embedding_dim, dropout_p=0.1).to(device)
|
|
143
|
+
loss_fn = torch.nn.CrossEntropyLoss() # prediction loss (next-token)
|
|
144
|
+
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)
|
|
145
|
+
lambda_reconstruction = 0.2 # weight for reconstruction loss (h1 - e ≈ h_old)
|
|
146
|
+
lambda_consistency = 0.2 # weight for consistency loss (h1 ≈ h2)
|
|
147
|
+
mse_loss = torch.nn.MSELoss()
|
|
148
|
+
|
|
149
|
+
# 4) Minimal training loop
|
|
150
|
+
# Objective: next-token prediction with cross-entropy.
|
|
151
|
+
# We iterate tokens in each sequence and accumulate loss over (t → t+1) pairs.
|
|
152
|
+
model.train()
|
|
153
|
+
for epoch in range(128):
|
|
154
|
+
random.shuffle(processed_sequences) # shuffling is deterministic given the fixed Python seed
|
|
155
|
+
total_loss = 0.0
|
|
156
|
+
for seq_tensor in processed_sequences:
|
|
157
|
+
if len(seq_tensor) < 2:
|
|
158
|
+
continue
|
|
159
|
+
# h is the running sequence representation; encoder adds current token embedding
|
|
160
|
+
h = torch.zeros((1, embedding_dim), device=device)
|
|
161
|
+
seq_tensor = seq_tensor.to(device)
|
|
162
|
+
|
|
163
|
+
sequence_loss = 0.0
|
|
164
|
+
for t in range(len(seq_tensor) - 1):
|
|
165
|
+
x = seq_tensor[t].unsqueeze(0)
|
|
166
|
+
target = seq_tensor[t + 1].unsqueeze(0)
|
|
167
|
+
# Brown pipeline: two forward passes from the same h_old to compute a consistency loss
|
|
168
|
+
h_old = h.detach()
|
|
169
|
+
y1, h1, e_curr1 = model(x, h_old)
|
|
170
|
+
y2, h2, e_curr2 = model(x, h_old)
|
|
171
|
+
|
|
172
|
+
# Prediction: next-token cross-entropy on the first pass
|
|
173
|
+
prediction_loss = loss_fn(y1.view(1, -1), target)
|
|
174
|
+
|
|
175
|
+
# Reconstruction: undo the additive step (h1 - e ≈ h_old)
|
|
176
|
+
h_reconstructed = h1 - e_curr1
|
|
177
|
+
reconstruction_loss = mse_loss(h_reconstructed, h_old)
|
|
178
|
+
|
|
179
|
+
# Consistency: two identical passes should produce similar states
|
|
180
|
+
consistency_loss = mse_loss(h1, h2)
|
|
181
|
+
|
|
182
|
+
combined = (prediction_loss +
|
|
183
|
+
lambda_reconstruction * reconstruction_loss +
|
|
184
|
+
lambda_consistency * consistency_loss)
|
|
185
|
+
sequence_loss = sequence_loss + combined
|
|
186
|
+
|
|
187
|
+
# Advance hidden state using the first pass
|
|
188
|
+
h = h1.detach()
|
|
189
|
+
|
|
190
|
+
if (len(seq_tensor) - 1) > 0:
|
|
191
|
+
avg_seq_loss = sequence_loss / (len(seq_tensor) - 1)
|
|
192
|
+
optimizer.zero_grad()
|
|
193
|
+
avg_seq_loss.backward()
|
|
194
|
+
optimizer.step()
|
|
195
|
+
total_loss += avg_seq_loss.item()
|
|
196
|
+
|
|
197
|
+
print(f"epoch {epoch + 1}: loss={total_loss / max(1, len(processed_sequences)):.4f}")
|
|
198
|
+
|
|
199
|
+
# 5) Use the learned representation for a short sequence
|
|
200
|
+
# We encode [START, A, C] step-by-step; the final h is the sequence embedding.
|
|
201
|
+
model.eval()
|
|
202
|
+
with torch.no_grad():
|
|
203
|
+
seq = torch.tensor([
|
|
204
|
+
event_2_idx['START'], event_2_idx['A'], event_2_idx['C']
|
|
205
|
+
], device=device)
|
|
206
|
+
h = torch.zeros((1, embedding_dim), device=device)
|
|
207
|
+
for idx in seq:
|
|
208
|
+
_, h, _ = model(idx.unsqueeze(0), h)
|
|
209
|
+
print('Sequence embedding (START-A-C):', h.cpu().numpy())
|
|
210
|
+
|
|
211
|
+
|
|
212
|
+
with torch.no_grad():
|
|
213
|
+
# 6) Qualitative check: nearest tokens to the sequence embedding by cosine similarity
|
|
214
|
+
# Expect 'C' to be nearest (last consumed token) and 'END' reasonably aligned.
|
|
215
|
+
emb = model.embedding.weight.detach() # [V, 8]
|
|
216
|
+
h_norm = torch.nn.functional.normalize(h, dim=1)
|
|
217
|
+
emb_norm = torch.nn.functional.normalize(emb, dim=1)
|
|
218
|
+
sims = (emb_norm @ h_norm.squeeze(0))
|
|
219
|
+
top_sim = torch.topk(sims, k=3)
|
|
220
|
+
print('Nearest tokens by cosine:', [ (list(event_2_idx.keys())[i], float(sims[i])) for i in top_sim.indices ])
|
|
221
|
+
|
|
222
|
+
with torch.no_grad():
|
|
223
|
+
# 7) Next-event distribution from the current state
|
|
224
|
+
# Expect 'END' to be top-1 with high probability because C→END with p=1.0
|
|
225
|
+
logits = model.decoder(h) # [1, V]
|
|
226
|
+
probs = torch.softmax(logits, dim=-1).squeeze(0)
|
|
227
|
+
top = torch.topk(probs, k=3)
|
|
228
|
+
inv = {v:k for k,v in event_2_idx.items()}
|
|
229
|
+
print('Top-3 next events:', [ (inv[i.item()], float(probs[i])) for i in top.indices ])
|
|
230
|
+
|
|
231
|
+
# 8) Visualization: PCA of token embeddings + sequence embedding
|
|
232
|
+
# Expect the red star (sequence) to lie near the point labeled 'C'.
|
|
233
|
+
with torch.no_grad():
|
|
234
|
+
token_emb = model.embedding.weight.detach().cpu().numpy() # [V, 8]
|
|
235
|
+
seq_emb = h.detach().cpu().numpy() # [1, 8]
|
|
236
|
+
X = np.vstack([token_emb, seq_emb])
|
|
237
|
+
pca = PCA(n_components=2, random_state=SEED)
|
|
238
|
+
X2 = pca.fit_transform(X)
|
|
239
|
+
tokens2, seq2 = X2[:-1], X2[-1]
|
|
240
|
+
|
|
241
|
+
inv = {v:k for k,v in event_2_idx.items()}
|
|
242
|
+
plt.figure(figsize=(6, 6))
|
|
243
|
+
plt.scatter(tokens2[:, 0], tokens2[:, 1], c='gray', label='tokens')
|
|
244
|
+
for i, (x, y) in enumerate(tokens2):
|
|
245
|
+
plt.text(x + 0.02, y + 0.02, inv[i], fontsize=9)
|
|
246
|
+
plt.scatter([seq2[0]], [seq2[1]], c='red', marker='*', s=160, label='SEQ(START-A-C)')
|
|
247
|
+
plt.title('PCA: token embeddings + sequence embedding (expect SEQ near C)')
|
|
248
|
+
plt.legend(loc='best')
|
|
249
|
+
plt.tight_layout()
|
|
250
|
+
plt.show()
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
## References
|
|
254
|
+
For citations, please use the following BibTeX.
|
|
255
|
+
```bibtex
|
|
256
|
+
@article{sulc2025event2vec,
|
|
257
|
+
title={Event2Vector: A Geometric Approach to Learning Composable Representations of Event Sequences},
|
|
258
|
+
author={Sulc, Antonin},
|
|
259
|
+
journal={arXiv preprint arXiv:2509.12188},
|
|
260
|
+
year={2025}
|
|
261
|
+
}
|
|
262
|
+
```
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
MANIFEST.in
|
|
2
|
+
README.md
|
|
3
|
+
pyproject.toml
|
|
4
|
+
setup.py
|
|
5
|
+
event2vector/__init__.py
|
|
6
|
+
event2vector/data.py
|
|
7
|
+
event2vector/models.py
|
|
8
|
+
event2vector.egg-info/PKG-INFO
|
|
9
|
+
event2vector.egg-info/SOURCES.txt
|
|
10
|
+
event2vector.egg-info/dependency_links.txt
|
|
11
|
+
event2vector.egg-info/requires.txt
|
|
12
|
+
event2vector.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
event2vector
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "event2vector"
|
|
7
|
+
|
|
8
|
+
version = "0.1.0"
|
|
9
|
+
description = "A geometric approach to learning composable representations of event sequences."
|
|
10
|
+
readme = "README.md"
|
|
11
|
+
authors = [
|
|
12
|
+
{ name = "Antonin Sulc" },
|
|
13
|
+
]
|
|
14
|
+
license = { text = "MIT" }
|
|
15
|
+
classifiers = [
|
|
16
|
+
"Programming Language :: Python :: 3",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: OS Independent",
|
|
19
|
+
]
|
|
20
|
+
requires-python = ">=3.6"
|
|
21
|
+
dependencies = [
|
|
22
|
+
"torch",
|
|
23
|
+
"numpy",
|
|
24
|
+
"tqdm",
|
|
25
|
+
"matplotlib",
|
|
26
|
+
"scikit-learn",
|
|
27
|
+
"openTSNE",
|
|
28
|
+
"gensim",
|
|
29
|
+
"seaborn",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.urls]
|
|
33
|
+
"Homepage" = "https://github.com/sulcantonin/event2vec_public"
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
from setuptools import setup, find_packages

# Read the long description with an explicit encoding and a context manager:
# a bare open('README.md').read() leaks the file handle and relies on the
# platform default encoding, which breaks on non-UTF-8 locales.
with open('README.md', encoding='utf-8') as readme:
    long_description = readme.read()

setup(
    name='event2vector',
    version='0.1.0',
    author='Antonin Sulc',
    description='A geometric approach to learning composable representations of event sequences.',
    long_description=long_description,
    long_description_content_type='text/markdown',
    packages=find_packages(),
    install_requires=[
        'torch',
        'numpy',
        'tqdm',
        'matplotlib',
        'scikit-learn',
        'openTSNE',
        'gensim',
        'seaborn'
    ],
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.6',
)
|
|
28
|
+
|