event2vector 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,4 @@
1
+ include README.md
2
+ exclude *.pkl
3
+ exclude *.model
4
+ exclude data/*
@@ -0,0 +1,262 @@
1
+ Metadata-Version: 2.4
2
+ Name: event2vector
3
+ Version: 0.1.0
4
+ Summary: A geometric approach to learning composable representations of event sequences.
5
+ Author: Antonin Sulc
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/sulcantonin/event2vec_public
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.6
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: torch
14
+ Requires-Dist: numpy
15
+ Requires-Dist: tqdm
16
+ Requires-Dist: matplotlib
17
+ Requires-Dist: scikit-learn
18
+ Requires-Dist: openTSNE
19
+ Requires-Dist: gensim
20
+ Requires-Dist: seaborn
21
+ Dynamic: requires-python
22
+
23
+ <div align="center">
24
+
25
+ # Event2Vector
26
+ ## A Geometric Approach to Learning Composable Representations of Event Sequences
27
+
28
+ [![PyPI version](https://badge.fury.io/py/event2vector.svg)](https://badge.fury.io/py/event2vector)
29
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
30
+ [![Python 3.6+](https://img.shields.io/badge/python-3.6+-blue.svg)](https://www.python.org/downloads/)
31
+ [![arXiv](https://img.shields.io/badge/arXiv-2509.12188-b31b1b.svg)](https://arxiv.org/abs/2509.12188)
32
+
33
+ ![Teaser](./images/teaser.png)
34
+
35
+ </div>
36
+
37
+ ## Overview
38
+
39
+ **Event2Vector** is a framework for learning representations of discrete event sequences. Inspired by the geometric structures found in neural representations, this model uses a simple, additive recurrent structure to create composable and interpretable embeddings.
40
+
41
+ ## Key Concepts
42
+ * **Linear Additive Hypothesis**: The core idea behind Event2Vector is that the representation of an event sequence can be modeled as the vector sum of the embeddings of its individual events. This allows for intuitive vector arithmetic, enabling the composition and decomposition of event trajectories.
43
+ * **Euclidean and Hyperbolic Models**: Event2Vector is offered in two geometric variants:
44
+ * **Euclidean model**: Uses standard vector addition, providing a straightforward, flat geometry for event trajectories.
45
+ * **Hyperbolic model**: Employs Möbius addition, which is better suited for hierarchical data structures, as it can embed tree-like patterns with less distortion.
46
+
47
+ For more details, check *Sulc A., Event2Vector: A Geometric Approach to Learning Composable Representations of Event Sequences*
48
+
49
+ ## Installation
50
+
51
+ Install the package directly from PyPI:
52
+
53
+ ```bash
54
+ pip install event2vector
55
+ ```
56
+
57
+ Or install from source:
58
+
59
+ ```bash
60
+ git clone https://github.com/sulcantonin/event2vec_public.git
61
+ cd event2vec_public
62
+ pip install .
63
+ ```
64
+
65
+
66
+ ## Brown Example
67
+ After installation, you can try to run the Brown Part-of-Speech tagging example from the paper.
68
+
69
+ ```bash
70
+ python3 -m experiments.prepare_brown_data
71
+ python3 -m experiments.train_brown_data
72
+ python3 -m experiments.visualize_brown_corpus
73
+ ```
74
+
75
+ ## Quickstart (tiny synthetic dataset)
76
+
77
+ The snippet below trains a small `EuclideanModel` on a toy event graph for a few epochs and prints an embedding for a short sequence. It runs in seconds on CPU.
78
+
79
+ We have 5 events: `START, A, B, C, END`, and we test whether the additive sequence embedding of `START + A + C` lands near the embedding of `C`. The example should be self-contained for educational purposes, as the main interest is the loss function.
80
+
81
+ ```python
82
+ import random
83
+ import torch
84
+ import numpy as np
85
+ from sklearn.decomposition import PCA
86
+ import matplotlib.pyplot as plt
87
+
88
+ # Reproducibility: fix all relevant seeds so results are stable across runs
89
+ SEED = 42
90
+ random.seed(SEED) # Python's RNG (used by get_sequences and shuffling)
91
+ np.random.seed(SEED) # NumPy RNG (used by PCA and any NumPy ops)
92
+ torch.manual_seed(SEED) # Torch CPU RNG
93
+ if torch.cuda.is_available():
94
+ torch.cuda.manual_seed_all(SEED) # Torch CUDA RNG (all devices)
95
+
96
+ # Optional: prefer deterministic behavior in cuDNN (may have performance impact)
97
+ torch.backends.cudnn.deterministic = True
98
+ torch.backends.cudnn.benchmark = False
99
+
100
+ # Expected results (for the tiny toy graph START→A/B→C→END):
101
+ # - Training loss should decrease over epochs (with minor wobble due to randomness/dropout).
102
+ # - The printed sequence embedding for [START, A, C] is a single 8-D vector (shape (1, 8)).
103
+ # - Nearest tokens by cosine similarity to that sequence embedding should rank 'C' highest
104
+ # (since the encoder is additive and the last token is C), with 'END' somewhat aligned.
105
+ # - The decoder's top-1 next-event probability from that state should be 'END' with high
106
+ # confidence (≈0.9+) because the toy transitions force C→END.
107
+ # Visualization (below): a 2D PCA of token embeddings plus the sequence embedding should show
108
+ # 'SEQ(START-A-C)' lying near the point labeled 'C'.
109
+
110
+ from event2vector import EuclideanModel
111
+ from event2vector.data import get_sequences
112
+
113
+ # 1) Define a tiny state-transition toy dataset
114
+ # We model a simple Markovian process: START → (A or B) → C → END
115
+ # The model will learn to predict the next token and produce a sequence embedding
116
+ # by additive composition of token embeddings.
117
+ event_types = ['START', 'A', 'B', 'C', 'END']
118
+ event_transitions = {
119
+ 'START': [('A', 0.5), ('B', 0.5)],
120
+ 'A': [('C', 0.6), ('B', 0.4)],
121
+ 'B': [('C', 0.7), ('A', 0.3)],
122
+ 'C': [('END', 1.0)],
123
+ }
124
+
125
+ # 2) Generate synthetic sequences (reproducible thanks to the fixed seeds above)
126
+ # get_sequences returns: raw sequences, tensorized sequences, and vocab mappings.
127
+ _, processed_sequences, event_2_idx, _ = get_sequences(
128
+ event_types=event_types,
129
+ event_transitions=event_transitions,
130
+ initial='START',
131
+ terminal='END',
132
+ num_seq=200, # generate 200 short sequences
133
+ max_seq_length=6, # keep them small for speed
134
+ generate_new=True,
135
+ prefix='tiny_quickstart'
136
+ )
137
+
138
+ # 3) Initialize model and optimizer
139
+ # EuclideanModel composes sequence representations via additive updates.
140
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
141
+ embedding_dim = 8
142
+ model = EuclideanModel(num_event_types=len(event_types), embedding_dim=embedding_dim, dropout_p=0.1).to(device)
143
+ loss_fn = torch.nn.CrossEntropyLoss() # prediction loss (next-token)
144
+ optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)
145
+ lambda_reconstruction = 0.2 # weight for reconstruction loss (h1 - e ≈ h_old)
146
+ lambda_consistency = 0.2 # weight for consistency loss (h1 ≈ h2)
147
+ mse_loss = torch.nn.MSELoss()
148
+
149
+ # 4) Minimal training loop
150
+ # Objective: next-token prediction with cross-entropy.
151
+ # We iterate tokens in each sequence and accumulate loss over (t → t+1) pairs.
152
+ model.train()
153
+ for epoch in range(128):
154
+ random.shuffle(processed_sequences) # shuffling is deterministic given the fixed Python seed
155
+ total_loss = 0.0
156
+ for seq_tensor in processed_sequences:
157
+ if len(seq_tensor) < 2:
158
+ continue
159
+ # h is the running sequence representation; encoder adds current token embedding
160
+ h = torch.zeros((1, embedding_dim), device=device)
161
+ seq_tensor = seq_tensor.to(device)
162
+
163
+ sequence_loss = 0.0
164
+ for t in range(len(seq_tensor) - 1):
165
+ x = seq_tensor[t].unsqueeze(0)
166
+ target = seq_tensor[t + 1].unsqueeze(0)
167
+ # Brown pipeline: two forward passes from the same h_old to compute a consistency loss
168
+ h_old = h.detach()
169
+ y1, h1, e_curr1 = model(x, h_old)
170
+ y2, h2, e_curr2 = model(x, h_old)
171
+
172
+ # Prediction: next-token cross-entropy on the first pass
173
+ prediction_loss = loss_fn(y1.view(1, -1), target)
174
+
175
+ # Reconstruction: undo the additive step (h1 - e ≈ h_old)
176
+ h_reconstructed = h1 - e_curr1
177
+ reconstruction_loss = mse_loss(h_reconstructed, h_old)
178
+
179
+ # Consistency: two identical passes should produce similar states
180
+ consistency_loss = mse_loss(h1, h2)
181
+
182
+ combined = (prediction_loss +
183
+ lambda_reconstruction * reconstruction_loss +
184
+ lambda_consistency * consistency_loss)
185
+ sequence_loss = sequence_loss + combined
186
+
187
+ # Advance hidden state using the first pass
188
+ h = h1.detach()
189
+
190
+ if (len(seq_tensor) - 1) > 0:
191
+ avg_seq_loss = sequence_loss / (len(seq_tensor) - 1)
192
+ optimizer.zero_grad()
193
+ avg_seq_loss.backward()
194
+ optimizer.step()
195
+ total_loss += avg_seq_loss.item()
196
+
197
+ print(f"epoch {epoch + 1}: loss={total_loss / max(1, len(processed_sequences)):.4f}")
198
+
199
+ # 5) Use the learned representation for a short sequence
200
+ # We encode [START, A, C] step-by-step; the final h is the sequence embedding.
201
+ model.eval()
202
+ with torch.no_grad():
203
+ seq = torch.tensor([
204
+ event_2_idx['START'], event_2_idx['A'], event_2_idx['C']
205
+ ], device=device)
206
+ h = torch.zeros((1, embedding_dim), device=device)
207
+ for idx in seq:
208
+ _, h, _ = model(idx.unsqueeze(0), h)
209
+ print('Sequence embedding (START-A-C):', h.cpu().numpy())
210
+
211
+
212
+ with torch.no_grad():
213
+ # 6) Qualitative check: nearest tokens to the sequence embedding by cosine similarity
214
+ # Expect 'C' to be nearest (last consumed token) and 'END' reasonably aligned.
215
+ emb = model.embedding.weight.detach() # [V, 8]
216
+ h_norm = torch.nn.functional.normalize(h, dim=1)
217
+ emb_norm = torch.nn.functional.normalize(emb, dim=1)
218
+ sims = (emb_norm @ h_norm.squeeze(0))
219
+ top_sim = torch.topk(sims, k=3)
220
+ print('Nearest tokens by cosine:', [ (list(event_2_idx.keys())[i], float(sims[i])) for i in top_sim.indices ])
221
+
222
+ with torch.no_grad():
223
+ # 7) Next-event distribution from the current state
224
+ # Expect 'END' to be top-1 with high probability because C→END with p=1.0
225
+ logits = model.decoder(h) # [1, V]
226
+ probs = torch.softmax(logits, dim=-1).squeeze(0)
227
+ top = torch.topk(probs, k=3)
228
+ inv = {v:k for k,v in event_2_idx.items()}
229
+ print('Top-3 next events:', [ (inv[i.item()], float(probs[i])) for i in top.indices ])
230
+
231
+ # 8) Visualization: PCA of token embeddings + sequence embedding
232
+ # Expect the red star (sequence) to lie near the point labeled 'C'.
233
+ with torch.no_grad():
234
+ token_emb = model.embedding.weight.detach().cpu().numpy() # [V, 8]
235
+ seq_emb = h.detach().cpu().numpy() # [1, 8]
236
+ X = np.vstack([token_emb, seq_emb])
237
+ pca = PCA(n_components=2, random_state=SEED)
238
+ X2 = pca.fit_transform(X)
239
+ tokens2, seq2 = X2[:-1], X2[-1]
240
+
241
+ inv = {v:k for k,v in event_2_idx.items()}
242
+ plt.figure(figsize=(6, 6))
243
+ plt.scatter(tokens2[:, 0], tokens2[:, 1], c='gray', label='tokens')
244
+ for i, (x, y) in enumerate(tokens2):
245
+ plt.text(x + 0.02, y + 0.02, inv[i], fontsize=9)
246
+ plt.scatter([seq2[0]], [seq2[1]], c='red', marker='*', s=160, label='SEQ(START-A-C)')
247
+ plt.title('PCA: token embeddings + sequence embedding (expect SEQ near C)')
248
+ plt.legend(loc='best')
249
+ plt.tight_layout()
250
+ plt.show()
251
+ ```
252
+
253
+ ## References
254
+ For citations please use following Bibtex.
255
+ ```bibtex
256
+ @article{sulc2025event2vec,
257
+ title={Event2Vector: A Geometric Approach to Learning Composable Representations of Event Sequences},
258
+ author={Sulc, Antonin},
259
+ journal={arXiv preprint arXiv:2509.12188},
260
+ year={2025}
261
+ }
262
+ ```
@@ -0,0 +1,240 @@
1
+ <div align="center">
2
+
3
+ # Event2Vector
4
+ ## A Geometric Approach to Learning Composable Representations of Event Sequences
5
+
6
+ [![PyPI version](https://badge.fury.io/py/event2vector.svg)](https://badge.fury.io/py/event2vector)
7
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
8
+ [![Python 3.6+](https://img.shields.io/badge/python-3.6+-blue.svg)](https://www.python.org/downloads/)
9
+ [![arXiv](https://img.shields.io/badge/arXiv-2509.12188-b31b1b.svg)](https://arxiv.org/abs/2509.12188)
10
+
11
+ ![Teaser](./images/teaser.png)
12
+
13
+ </div>
14
+
15
+ ## Overview
16
+
17
+ **Event2Vector** is a framework for learning representations of discrete event sequences. Inspired by the geometric structures found in neural representations, this model uses a simple, additive recurrent structure to create composable and interpretable embeddings.
18
+
19
+ ## Key Concepts
20
+ * **Linear Additive Hypothesis**: The core idea behind Event2Vector is that the representation of an event sequence can be modeled as the vector sum of the embeddings of its individual events. This allows for intuitive vector arithmetic, enabling the composition and decomposition of event trajectories.
21
+ * **Euclidean and Hyperbolic Models**: Event2Vector is offered in two geometric variants:
22
+ * **Euclidean model**: Uses standard vector addition, providing a straightforward, flat geometry for event trajectories.
23
+ * **Hyperbolic model**: Employs Möbius addition, which is better suited for hierarchical data structures, as it can embed tree-like patterns with less distortion.
24
+
25
+ For more details, check *Sulc A., Event2Vector: A Geometric Approach to Learning Composable Representations of Event Sequences*
26
+
27
+ ## Installation
28
+
29
+ Install the package directly from PyPI:
30
+
31
+ ```bash
32
+ pip install event2vector
33
+ ```
34
+
35
+ Or install from source:
36
+
37
+ ```bash
38
+ git clone https://github.com/sulcantonin/event2vec_public.git
39
+ cd event2vec_public
40
+ pip install .
41
+ ```
42
+
43
+
44
+ ## Brown Example
45
+ After installation, you can try to run the Brown Part-of-Speech tagging example from the paper.
46
+
47
+ ```bash
48
+ python3 -m experiments.prepare_brown_data
49
+ python3 -m experiments.train_brown_data
50
+ python3 -m experiments.visualize_brown_corpus
51
+ ```
52
+
53
+ ## Quickstart (tiny synthetic dataset)
54
+
55
+ The snippet below trains a small `EuclideanModel` on a toy event graph for a few epochs and prints an embedding for a short sequence. It runs in seconds on CPU.
56
+
57
+ We have 5 events: `START, A, B, C, END`, and we test whether the additive sequence embedding of `START + A + C` lands near the embedding of `C`. The example should be self-contained for educational purposes, as the main interest is the loss function.
58
+
59
+ ```python
60
+ import random
61
+ import torch
62
+ import numpy as np
63
+ from sklearn.decomposition import PCA
64
+ import matplotlib.pyplot as plt
65
+
66
+ # Reproducibility: fix all relevant seeds so results are stable across runs
67
+ SEED = 42
68
+ random.seed(SEED) # Python's RNG (used by get_sequences and shuffling)
69
+ np.random.seed(SEED) # NumPy RNG (used by PCA and any NumPy ops)
70
+ torch.manual_seed(SEED) # Torch CPU RNG
71
+ if torch.cuda.is_available():
72
+ torch.cuda.manual_seed_all(SEED) # Torch CUDA RNG (all devices)
73
+
74
+ # Optional: prefer deterministic behavior in cuDNN (may have performance impact)
75
+ torch.backends.cudnn.deterministic = True
76
+ torch.backends.cudnn.benchmark = False
77
+
78
+ # Expected results (for the tiny toy graph START→A/B→C→END):
79
+ # - Training loss should decrease over epochs (with minor wobble due to randomness/dropout).
80
+ # - The printed sequence embedding for [START, A, C] is a single 8-D vector (shape (1, 8)).
81
+ # - Nearest tokens by cosine similarity to that sequence embedding should rank 'C' highest
82
+ # (since the encoder is additive and the last token is C), with 'END' somewhat aligned.
83
+ # - The decoder's top-1 next-event probability from that state should be 'END' with high
84
+ # confidence (≈0.9+) because the toy transitions force C→END.
85
+ # Visualization (below): a 2D PCA of token embeddings plus the sequence embedding should show
86
+ # 'SEQ(START-A-C)' lying near the point labeled 'C'.
87
+
88
+ from event2vector import EuclideanModel
89
+ from event2vector.data import get_sequences
90
+
91
+ # 1) Define a tiny state-transition toy dataset
92
+ # We model a simple Markovian process: START → (A or B) → C → END
93
+ # The model will learn to predict the next token and produce a sequence embedding
94
+ # by additive composition of token embeddings.
95
+ event_types = ['START', 'A', 'B', 'C', 'END']
96
+ event_transitions = {
97
+ 'START': [('A', 0.5), ('B', 0.5)],
98
+ 'A': [('C', 0.6), ('B', 0.4)],
99
+ 'B': [('C', 0.7), ('A', 0.3)],
100
+ 'C': [('END', 1.0)],
101
+ }
102
+
103
+ # 2) Generate synthetic sequences (reproducible thanks to the fixed seeds above)
104
+ # get_sequences returns: raw sequences, tensorized sequences, and vocab mappings.
105
+ _, processed_sequences, event_2_idx, _ = get_sequences(
106
+ event_types=event_types,
107
+ event_transitions=event_transitions,
108
+ initial='START',
109
+ terminal='END',
110
+ num_seq=200, # generate 200 short sequences
111
+ max_seq_length=6, # keep them small for speed
112
+ generate_new=True,
113
+ prefix='tiny_quickstart'
114
+ )
115
+
116
+ # 3) Initialize model and optimizer
117
+ # EuclideanModel composes sequence representations via additive updates.
118
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
119
+ embedding_dim = 8
120
+ model = EuclideanModel(num_event_types=len(event_types), embedding_dim=embedding_dim, dropout_p=0.1).to(device)
121
+ loss_fn = torch.nn.CrossEntropyLoss() # prediction loss (next-token)
122
+ optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)
123
+ lambda_reconstruction = 0.2 # weight for reconstruction loss (h1 - e ≈ h_old)
124
+ lambda_consistency = 0.2 # weight for consistency loss (h1 ≈ h2)
125
+ mse_loss = torch.nn.MSELoss()
126
+
127
+ # 4) Minimal training loop
128
+ # Objective: next-token prediction with cross-entropy.
129
+ # We iterate tokens in each sequence and accumulate loss over (t → t+1) pairs.
130
+ model.train()
131
+ for epoch in range(128):
132
+ random.shuffle(processed_sequences) # shuffling is deterministic given the fixed Python seed
133
+ total_loss = 0.0
134
+ for seq_tensor in processed_sequences:
135
+ if len(seq_tensor) < 2:
136
+ continue
137
+ # h is the running sequence representation; encoder adds current token embedding
138
+ h = torch.zeros((1, embedding_dim), device=device)
139
+ seq_tensor = seq_tensor.to(device)
140
+
141
+ sequence_loss = 0.0
142
+ for t in range(len(seq_tensor) - 1):
143
+ x = seq_tensor[t].unsqueeze(0)
144
+ target = seq_tensor[t + 1].unsqueeze(0)
145
+ # Brown pipeline: two forward passes from the same h_old to compute a consistency loss
146
+ h_old = h.detach()
147
+ y1, h1, e_curr1 = model(x, h_old)
148
+ y2, h2, e_curr2 = model(x, h_old)
149
+
150
+ # Prediction: next-token cross-entropy on the first pass
151
+ prediction_loss = loss_fn(y1.view(1, -1), target)
152
+
153
+ # Reconstruction: undo the additive step (h1 - e ≈ h_old)
154
+ h_reconstructed = h1 - e_curr1
155
+ reconstruction_loss = mse_loss(h_reconstructed, h_old)
156
+
157
+ # Consistency: two identical passes should produce similar states
158
+ consistency_loss = mse_loss(h1, h2)
159
+
160
+ combined = (prediction_loss +
161
+ lambda_reconstruction * reconstruction_loss +
162
+ lambda_consistency * consistency_loss)
163
+ sequence_loss = sequence_loss + combined
164
+
165
+ # Advance hidden state using the first pass
166
+ h = h1.detach()
167
+
168
+ if (len(seq_tensor) - 1) > 0:
169
+ avg_seq_loss = sequence_loss / (len(seq_tensor) - 1)
170
+ optimizer.zero_grad()
171
+ avg_seq_loss.backward()
172
+ optimizer.step()
173
+ total_loss += avg_seq_loss.item()
174
+
175
+ print(f"epoch {epoch + 1}: loss={total_loss / max(1, len(processed_sequences)):.4f}")
176
+
177
+ # 5) Use the learned representation for a short sequence
178
+ # We encode [START, A, C] step-by-step; the final h is the sequence embedding.
179
+ model.eval()
180
+ with torch.no_grad():
181
+ seq = torch.tensor([
182
+ event_2_idx['START'], event_2_idx['A'], event_2_idx['C']
183
+ ], device=device)
184
+ h = torch.zeros((1, embedding_dim), device=device)
185
+ for idx in seq:
186
+ _, h, _ = model(idx.unsqueeze(0), h)
187
+ print('Sequence embedding (START-A-C):', h.cpu().numpy())
188
+
189
+
190
+ with torch.no_grad():
191
+ # 6) Qualitative check: nearest tokens to the sequence embedding by cosine similarity
192
+ # Expect 'C' to be nearest (last consumed token) and 'END' reasonably aligned.
193
+ emb = model.embedding.weight.detach() # [V, 8]
194
+ h_norm = torch.nn.functional.normalize(h, dim=1)
195
+ emb_norm = torch.nn.functional.normalize(emb, dim=1)
196
+ sims = (emb_norm @ h_norm.squeeze(0))
197
+ top_sim = torch.topk(sims, k=3)
198
+ print('Nearest tokens by cosine:', [ (list(event_2_idx.keys())[i], float(sims[i])) for i in top_sim.indices ])
199
+
200
+ with torch.no_grad():
201
+ # 7) Next-event distribution from the current state
202
+ # Expect 'END' to be top-1 with high probability because C→END with p=1.0
203
+ logits = model.decoder(h) # [1, V]
204
+ probs = torch.softmax(logits, dim=-1).squeeze(0)
205
+ top = torch.topk(probs, k=3)
206
+ inv = {v:k for k,v in event_2_idx.items()}
207
+ print('Top-3 next events:', [ (inv[i.item()], float(probs[i])) for i in top.indices ])
208
+
209
+ # 8) Visualization: PCA of token embeddings + sequence embedding
210
+ # Expect the red star (sequence) to lie near the point labeled 'C'.
211
+ with torch.no_grad():
212
+ token_emb = model.embedding.weight.detach().cpu().numpy() # [V, 8]
213
+ seq_emb = h.detach().cpu().numpy() # [1, 8]
214
+ X = np.vstack([token_emb, seq_emb])
215
+ pca = PCA(n_components=2, random_state=SEED)
216
+ X2 = pca.fit_transform(X)
217
+ tokens2, seq2 = X2[:-1], X2[-1]
218
+
219
+ inv = {v:k for k,v in event_2_idx.items()}
220
+ plt.figure(figsize=(6, 6))
221
+ plt.scatter(tokens2[:, 0], tokens2[:, 1], c='gray', label='tokens')
222
+ for i, (x, y) in enumerate(tokens2):
223
+ plt.text(x + 0.02, y + 0.02, inv[i], fontsize=9)
224
+ plt.scatter([seq2[0]], [seq2[1]], c='red', marker='*', s=160, label='SEQ(START-A-C)')
225
+ plt.title('PCA: token embeddings + sequence embedding (expect SEQ near C)')
226
+ plt.legend(loc='best')
227
+ plt.tight_layout()
228
+ plt.show()
229
+ ```
230
+
231
+ ## References
232
+ For citations please use following Bibtex.
233
+ ```bibtex
234
+ @article{sulc2025event2vec,
235
+ title={Event2Vector: A Geometric Approach to Learning Composable Representations of Event Sequences},
236
+ author={Sulc, Antonin},
237
+ journal={arXiv preprint arXiv:2509.12188},
238
+ year={2025}
239
+ }
240
+ ```
@@ -0,0 +1,4 @@
1
+ from .models import EuclideanModel, HyperbolicModel, HyperbolicUtils
2
+ from .data import generate_sequences, get_sequences
3
+
4
+
@@ -0,0 +1,52 @@
1
+ import random
2
+ import torch
3
+ import pickle
4
+
5
def generate_sequences(event_types, event_transitions, initial, terminal, num_seq, max_length):
    """
    Generate synthetic event sequences via a guided random walk.

    Starting from ``initial``, each step follows the weighted transitions in
    ``event_transitions``; with 10% probability (or whenever the current state
    has no outgoing transitions) a uniformly random *different* event is taken
    instead. A walk stops when it reaches ``terminal`` or hits ``max_length``
    events; ``terminal`` is appended if the walk was truncated before reaching
    it, so every returned sequence ends at ``terminal``.
    """
    all_walks = []
    for _ in range(num_seq):
        walk = [initial]
        while walk[-1] != terminal and len(walk) < max_length:
            state = walk[-1]
            outgoing = event_transitions.get(state, [])
            # NOTE: RNG calls happen in the same order as the canonical
            # weighted walk with a 10% random jump, so results are
            # reproducible under a fixed seed.
            if not outgoing or random.random() < 0.1:
                step = random.choice([e for e in event_types if e != state])
            else:
                choices, weights = zip(*outgoing)
                step = random.choices(choices, weights=weights, k=1)[0]
            walk.append(step)
        if walk[-1] != terminal:
            walk.append(terminal)  # force truncated walks to end at terminal
        all_walks.append(walk)
    return all_walks
27
+
28
+
29
def get_sequences(event_types, event_transitions, initial, terminal, num_seq, max_seq_length, generate_new, prefix):
    """
    Build (or reload) a synthetic dataset of event sequences.

    When ``generate_new`` is true, sequences are sampled with
    ``generate_sequences``, tensorized to ``torch.long`` index tensors, and
    cached to ``{prefix}_training_data.pkl``; otherwise that cached pickle is
    loaded instead.

    Returns ``(sequences, processed_sequences, event_2_idx, idx_2_event)``.
    """
    cache_path = f'{prefix}_training_data.pkl'
    if generate_new:
        event_2_idx = {name: i for i, name in enumerate(event_types)}
        idx_2_event = {i: name for i, name in enumerate(event_types)}
        sequences = generate_sequences(event_types, event_transitions, initial, terminal, num_seq, max_seq_length)
        processed_sequences = [
            torch.tensor([event_2_idx[e] for e in seq], dtype=torch.long)
            for seq in sequences
        ]
        payload = {
            'sequences': sequences,
            'processed_sequences': processed_sequences,
            'event_2_idx': event_2_idx,
            'idx_2_event': idx_2_event,
        }
        with open(cache_path, 'wb') as f:
            pickle.dump(payload, f)
    else:
        # NOTE(review): pickle.load can execute arbitrary code from an
        # untrusted file -- only load caches written by a trusted run.
        with open(cache_path, 'rb') as f:
            data = pickle.load(f)
        sequences = data['sequences']
        processed_sequences = data['processed_sequences']
        event_2_idx = data['event_2_idx']
        idx_2_event = data['idx_2_event']
    return sequences, processed_sequences, event_2_idx, idx_2_event
+
@@ -0,0 +1,120 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ EPS = 1e-5 # Small constant for numerical stability of hyperbolic space
5
+
6
+
7
class EuclideanModel(nn.Module):
    """
    Recurrent sequence-representation model in Euclidean space.

    The state of a sequence is the running sum of its event embeddings;
    after each additive step the state may be rescaled so its L2 norm never
    exceeds ``max_norm``. A linear decoder maps the state to next-event
    logits.
    """

    def __init__(self, num_event_types, embedding_dim, dropout_p=0.2, max_norm=10.0):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.max_norm = max_norm  # cap on the hidden-state L2 norm
        self.embedding = nn.Embedding(num_event_types, embedding_dim)
        self.dropout = nn.Dropout(p=dropout_p)
        self.linear_dec = nn.Linear(embedding_dim, num_event_types)

    def encoder(self, x, h=None, clipping=True):
        """Add the embedding of event ``x`` onto state ``h`` (zeros if None)."""
        if h is None:
            h = torch.zeros((x.shape[0], self.embedding_dim), device=x.device)
        e = self.embedding(x)
        h_next = self.dropout(e + h)

        if clipping:
            # Rescale only states whose norm exceeds max_norm; leave the
            # rest untouched (scale factor is capped at 1).
            norm = torch.norm(h_next, p=2, dim=1, keepdim=True)
            scale = torch.min(self.max_norm / (norm + 1e-6), torch.ones_like(norm))
            h_next = h_next * scale

        return h_next, e

    def decoder(self, h):
        """Map a hidden state to next-event logits."""
        return self.linear_dec(h)

    def forward(self, x, h=None):
        """Run one step; returns (logits, new_state, event_embedding)."""
        h, e = self.encoder(x, h)
        return self.decoder(h), h, e
42
+
43
+
44
class HyperbolicUtils:
    """Static helpers for working in the Poincaré ball model (curvature -c)."""

    @staticmethod
    def mobius_add(x, y, c):
        """Möbius addition of ``x`` and ``y`` in the ball of curvature ``-c``."""
        xx = (x * x).sum(dim=-1, keepdim=True)
        yy = (y * y).sum(dim=-1, keepdim=True)
        xy = (x * y).sum(dim=-1, keepdim=True)
        numer = (1 + 2 * c * xy + c * yy) * x + (1 - c * xx) * y
        denom = 1 + 2 * c * xy + c**2 * xx * yy
        return numer / (denom + EPS)

    @staticmethod
    def log_map_origin(y, c):
        """Map a ball point ``y`` to the tangent space at the origin."""
        sqrt_c = c**0.5
        y_norm = torch.norm(y, p=2, dim=-1, keepdim=True).clamp_min(EPS)
        # artanh(z) = 0.5 * log((1 + z) / (1 - z)); clamp keeps z < 1.
        z = (sqrt_c * y_norm).clamp(max=1.0 - EPS)
        return (1. / sqrt_c) * (0.5 * torch.log((1 + z) / (1 - z))) * (y / y_norm)

    @staticmethod
    def poincare_dist_sq(x, y, c):
        """Squared Poincaré distance between ball points ``x`` and ``y``."""
        sqrt_c = c**0.5
        diff = HyperbolicUtils.mobius_add(-x, y, c)
        diff_norm = torch.norm(diff, p=2, dim=-1, keepdim=True).clamp_min(EPS)
        z = (sqrt_c * diff_norm).clamp(max=1.0 - EPS)
        dist = (2. / sqrt_c) * (0.5 * torch.log((1 + z) / (1 - z)))
        return dist.pow(2)

    @staticmethod
    def project_to_ball(x, c):
        """Clamp points lying on/outside the ball boundary back inside it."""
        radius = 1.0 / (c**0.5)
        norm = torch.norm(x, p=2, dim=-1, keepdim=True)
        outside = norm >= radius
        return torch.where(outside, x / (norm + EPS) * (radius - EPS), x)
80
+
81
+
82
class HyperbolicModel(nn.Module):
    """
    Recurrent sequence-representation model in the Poincaré ball.

    States are composed with Möbius addition instead of Euclidean addition,
    which suits hierarchical (tree-like) event structure. The decoder maps
    the state to the tangent space at the origin before applying a linear
    layer that produces next-event logits.
    """

    def __init__(self, num_event_types, embedding_dim, dropout_p, c=1.0):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.c = c  # curvature magnitude of the ball
        self.embedding = nn.Embedding(num_event_types, embedding_dim)
        # Tiny uniform init keeps embeddings well inside the unit ball.
        self.embedding.weight.data.uniform_(-0.001, 0.001)
        self.dropout = nn.Dropout(p=dropout_p)
        self.linear_dec = nn.Linear(embedding_dim, num_event_types)

    def project_embeddings(self):
        """Clamp every event embedding back inside the Poincaré ball."""
        with torch.no_grad():
            self.embedding.weight.data = HyperbolicUtils.project_to_ball(
                self.embedding.weight.data, self.c
            )

    def encoder(self, x, h):
        """Möbius-add the (dropped-out) embedding of ``x`` onto state ``h``."""
        e = self.embedding(x)
        h_next = HyperbolicUtils.mobius_add(h, self.dropout(e), self.c)
        h_next = HyperbolicUtils.project_to_ball(h_next, self.c)
        return h_next, e

    def decoder(self, h):
        """Decode a ball point to next-event logits via the tangent space."""
        return self.linear_dec(HyperbolicUtils.log_map_origin(h, self.c))

    def forward(self, x, h=None):
        """Run one step; ``h`` defaults to the origin. Returns (logits, state, embedding)."""
        if h is None:
            h = torch.zeros((x.shape[0], self.embedding_dim), device=x.device)
        h_next, e = self.encoder(x, h)
        return self.decoder(h_next), h_next, e
+
@@ -0,0 +1,262 @@
1
+ Metadata-Version: 2.4
2
+ Name: event2vector
3
+ Version: 0.1.0
4
+ Summary: A geometric approach to learning composable representations of event sequences.
5
+ Author: Antonin Sulc
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/sulcantonin/event2vec_public
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.6
12
+ Description-Content-Type: text/markdown
13
+ Requires-Dist: torch
14
+ Requires-Dist: numpy
15
+ Requires-Dist: tqdm
16
+ Requires-Dist: matplotlib
17
+ Requires-Dist: scikit-learn
18
+ Requires-Dist: openTSNE
19
+ Requires-Dist: gensim
20
+ Requires-Dist: seaborn
21
+ Dynamic: requires-python
22
+
23
+ <div align="center">
24
+
25
+ # Event2Vector
26
+ ## A Geometric Approach to Learning Composable Representations of Event Sequences
27
+
28
+ [![PyPI version](https://badge.fury.io/py/event2vector.svg)](https://badge.fury.io/py/event2vector)
29
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
30
+ [![Python 3.6+](https://img.shields.io/badge/python-3.6+-blue.svg)](https://www.python.org/downloads/)
31
+ [![arXiv](https://img.shields.io/badge/arXiv-2509.12188-b31b1b.svg)](https://arxiv.org/abs/2509.12188)
32
+
33
+ ![Teaser](./images/teaser.png)
34
+
35
+ </div>
36
+
37
+ ## Overview
38
+
39
+ **Event2Vector** is a framework for learning representations of discrete event sequences. Inspired by the geometric structures found in neural representations, this model uses a simple, additive recurrent structure to create composable and interpretable embeddings.
40
+
41
+ ## Key Concepts
42
+ * **Linear Additive Hypothesis**: The core idea behind Event2Vector is that the representation of an event sequence can be modeled as the vector sum of the embeddings of its individual events. This allows for intuitive vector arithmetic, enabling the composition and decomposition of event trajectories.
43
+ * **Euclidean and Hyperbolic Models**: Event2Vector is offered in two geometric variants:
44
+ * **Euclidean model**: Uses standard vector addition, providing a straightforward, flat geometry for event trajectories.
45
+ * **Hyperbolic model**: Employs Möbius addition, which is better suited for hierarchical data structures, as it can embed tree-like patterns with less distortion.
46
+
47
+ For more details, check *Sulc A., Event2Vector: A Geometric Approach to Learning Composable Representations of Event Sequences*
48
+
49
+ ## Installation
50
+
51
+ Install the package directly from PyPI:
52
+
53
+ ```bash
54
+ pip install event2vector
55
+ ```
56
+
57
+ Or install from source:
58
+
59
+ ```bash
60
+ git clone https://github.com/sulcantonin/event2vec_public.git
61
+ cd event2vec_public
62
+ pip install .
63
+ ```
64
+
65
+
66
+ ## Brown Example
67
+ After installation, you can try to run the Brown Part-of-Speech tagging example from the paper.
68
+
69
+ ```bash
70
+ python3 -m experiments.prepare_brown_data
71
+ python3 -m experiments.train_brown_data
72
+ python3 -m experiments.visualize_brown_corpus
73
+ ```
74
+
75
+ ## Quickstart (tiny synthetic dataset)
76
+
77
+ The snippet below trains a small `EuclideanModel` on a toy event graph for a few epochs and prints an embedding for a short sequence. It runs in seconds on CPU.
78
+
79
+ We have 5 events: `START, A, B, C, END` and we test if we add `START + A + C ~ C`. The example should be self contained for educational purposes, as the main interest is the loss function.
80
+
81
+ ```python
82
+ import random
83
+ import torch
84
+ import numpy as np
85
+ from sklearn.decomposition import PCA
86
+ import matplotlib.pyplot as plt
87
+
88
+ # Reproducibility: fix all relevant seeds so results are stable across runs
89
+ SEED = 42
90
+ random.seed(SEED) # Python's RNG (used by get_sequences and shuffling)
91
+ np.random.seed(SEED) # NumPy RNG (used by PCA and any NumPy ops)
92
+ torch.manual_seed(SEED) # Torch CPU RNG
93
+ if torch.cuda.is_available():
94
+ torch.cuda.manual_seed_all(SEED) # Torch CUDA RNG (all devices)
95
+
96
+ # Optional: prefer deterministic behavior in cuDNN (may have performance impact)
97
+ torch.backends.cudnn.deterministic = True
98
+ torch.backends.cudnn.benchmark = False
99
+
100
+ # Expected results (for the tiny toy graph START→A/B→C→END):
101
+ # - Training loss should decrease over epochs (with minor wobble due to randomness/dropout).
102
+ # - The printed sequence embedding for [START, A, C] is a single 8-D vector (shape (1, 8)).
103
+ # - Nearest tokens by cosine similarity to that sequence embedding should rank 'C' highest
104
+ # (since the encoder is additive and the last token is C), with 'END' somewhat aligned.
105
+ # - The decoder's top-1 next-event probability from that state should be 'END' with high
106
+ # confidence (≈0.9+) because the toy transitions force C→END.
107
+ # Visualization (below): a 2D PCA of token embeddings plus the sequence embedding should show
108
+ # 'SEQ(START-A-C)' lying near the point labeled 'C'.
109
+
110
+ from event2vector import EuclideanModel
111
+ from event2vector.data import get_sequences
112
+
113
+ # 1) Define a tiny state-transition toy dataset
114
+ # We model a simple Markovian process: START → (A or B) → C → END
115
+ # The model will learn to predict the next token and produce a sequence embedding
116
+ # by additive composition of token embeddings.
117
+ event_types = ['START', 'A', 'B', 'C', 'END']
118
+ event_transitions = {
119
+ 'START': [('A', 0.5), ('B', 0.5)],
120
+ 'A': [('C', 0.6), ('B', 0.4)],
121
+ 'B': [('C', 0.7), ('A', 0.3)],
122
+ 'C': [('END', 1.0)],
123
+ }
124
+
125
+ # 2) Generate synthetic sequences (reproducible thanks to the fixed seeds above)
126
+ # get_sequences returns: raw sequences, tensorized sequences, and vocab mappings.
127
+ _, processed_sequences, event_2_idx, _ = get_sequences(
128
+ event_types=event_types,
129
+ event_transitions=event_transitions,
130
+ initial='START',
131
+ terminal='END',
132
+ num_seq=200, # generate 200 short sequences
133
+ max_seq_length=6, # keep them small for speed
134
+ generate_new=True,
135
+ prefix='tiny_quickstart'
136
+ )
137
+
138
+ # 3) Initialize model and optimizer
139
+ # EuclideanModel composes sequence representations via additive updates.
140
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
141
+ embedding_dim = 8
142
+ model = EuclideanModel(num_event_types=len(event_types), embedding_dim=embedding_dim, dropout_p=0.1).to(device)
143
+ loss_fn = torch.nn.CrossEntropyLoss() # prediction loss (next-token)
144
+ optimizer = torch.optim.AdamW(model.parameters(), lr=5e-3)
145
+ lambda_reconstruction = 0.2 # weight for reconstruction loss (h1 - e ≈ h_old)
146
+ lambda_consistency = 0.2 # weight for consistency loss (h1 ≈ h2)
147
+ mse_loss = torch.nn.MSELoss()
148
+
149
+ # 4) Minimal training loop
150
+ # Objective: next-token prediction with cross-entropy.
151
+ # We iterate tokens in each sequence and accumulate loss over (t → t+1) pairs.
152
+ model.train()
153
+ for epoch in range(128):
154
+ random.shuffle(processed_sequences) # shuffling is deterministic given the fixed Python seed
155
+ total_loss = 0.0
156
+ for seq_tensor in processed_sequences:
157
+ if len(seq_tensor) < 2:
158
+ continue
159
+ # h is the running sequence representation; encoder adds current token embedding
160
+ h = torch.zeros((1, embedding_dim), device=device)
161
+ seq_tensor = seq_tensor.to(device)
162
+
163
+ sequence_loss = 0.0
164
+ for t in range(len(seq_tensor) - 1):
165
+ x = seq_tensor[t].unsqueeze(0)
166
+ target = seq_tensor[t + 1].unsqueeze(0)
167
+ # Brown pipeline: two forward passes from the same h_old to compute a consistency loss
168
+ h_old = h.detach()
169
+ y1, h1, e_curr1 = model(x, h_old)
170
+ y2, h2, e_curr2 = model(x, h_old)
171
+
172
+ # Prediction: next-token cross-entropy on the first pass
173
+ prediction_loss = loss_fn(y1.view(1, -1), target)
174
+
175
+ # Reconstruction: undo the additive step (h1 - e ≈ h_old)
176
+ h_reconstructed = h1 - e_curr1
177
+ reconstruction_loss = mse_loss(h_reconstructed, h_old)
178
+
179
+ # Consistency: two identical passes should produce similar states
180
+ consistency_loss = mse_loss(h1, h2)
181
+
182
+ combined = (prediction_loss +
183
+ lambda_reconstruction * reconstruction_loss +
184
+ lambda_consistency * consistency_loss)
185
+ sequence_loss = sequence_loss + combined
186
+
187
+ # Advance hidden state using the first pass
188
+ h = h1.detach()
189
+
190
+ if (len(seq_tensor) - 1) > 0:
191
+ avg_seq_loss = sequence_loss / (len(seq_tensor) - 1)
192
+ optimizer.zero_grad()
193
+ avg_seq_loss.backward()
194
+ optimizer.step()
195
+ total_loss += avg_seq_loss.item()
196
+
197
+ print(f"epoch {epoch + 1}: loss={total_loss / max(1, len(processed_sequences)):.4f}")
198
+
199
+ # 5) Use the learned representation for a short sequence
200
+ # We encode [START, A, C] step-by-step; the final h is the sequence embedding.
201
+ model.eval()
202
+ with torch.no_grad():
203
+ seq = torch.tensor([
204
+ event_2_idx['START'], event_2_idx['A'], event_2_idx['C']
205
+ ], device=device)
206
+ h = torch.zeros((1, embedding_dim), device=device)
207
+ for idx in seq:
208
+ _, h, _ = model(idx.unsqueeze(0), h)
209
+ print('Sequence embedding (START-A-C):', h.cpu().numpy())
210
+
211
+
212
+ with torch.no_grad():
213
+ # 6) Qualitative check: nearest tokens to the sequence embedding by cosine similarity
214
+ # Expect 'C' to be nearest (last consumed token) and 'END' reasonably aligned.
215
+ emb = model.embedding.weight.detach() # [V, 8]
216
+ h_norm = torch.nn.functional.normalize(h, dim=1)
217
+ emb_norm = torch.nn.functional.normalize(emb, dim=1)
218
+ sims = (emb_norm @ h_norm.squeeze(0))
219
+ top_sim = torch.topk(sims, k=3)
220
+ print('Nearest tokens by cosine:', [ (list(event_2_idx.keys())[i], float(sims[i])) for i in top_sim.indices ])
221
+
222
+ with torch.no_grad():
223
+ # 7) Next-event distribution from the current state
224
+ # Expect 'END' to be top-1 with high probability because C→END with p=1.0
225
+ logits = model.decoder(h) # [1, V]
226
+ probs = torch.softmax(logits, dim=-1).squeeze(0)
227
+ top = torch.topk(probs, k=3)
228
+ inv = {v:k for k,v in event_2_idx.items()}
229
+ print('Top-3 next events:', [ (inv[i.item()], float(probs[i])) for i in top.indices ])
230
+
231
+ # 8) Visualization: PCA of token embeddings + sequence embedding
232
+ # Expect the red star (sequence) to lie near the point labeled 'C'.
233
+ with torch.no_grad():
234
+ token_emb = model.embedding.weight.detach().cpu().numpy() # [V, 8]
235
+ seq_emb = h.detach().cpu().numpy() # [1, 8]
236
+ X = np.vstack([token_emb, seq_emb])
237
+ pca = PCA(n_components=2, random_state=SEED)
238
+ X2 = pca.fit_transform(X)
239
+ tokens2, seq2 = X2[:-1], X2[-1]
240
+
241
+ inv = {v:k for k,v in event_2_idx.items()}
242
+ plt.figure(figsize=(6, 6))
243
+ plt.scatter(tokens2[:, 0], tokens2[:, 1], c='gray', label='tokens')
244
+ for i, (x, y) in enumerate(tokens2):
245
+ plt.text(x + 0.02, y + 0.02, inv[i], fontsize=9)
246
+ plt.scatter([seq2[0]], [seq2[1]], c='red', marker='*', s=160, label='SEQ(START-A-C)')
247
+ plt.title('PCA: token embeddings + sequence embedding (expect SEQ near C)')
248
+ plt.legend(loc='best')
249
+ plt.tight_layout()
250
+ plt.show()
251
+ ```
252
+
253
+ ## References
254
+ For citations, please use the following BibTeX entry.
255
+ ```bibtex
256
+ @article{sulc2025event2vec,
257
+ title={Event2Vector: A Geometric Approach to Learning Composable Representations of Event Sequences},
258
+ author={Sulc, Antonin},
259
+ journal={arXiv preprint arXiv:2509.12188},
260
+ year={2025}
261
+ }
262
+ ```
@@ -0,0 +1,12 @@
1
+ MANIFEST.in
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ event2vector/__init__.py
6
+ event2vector/data.py
7
+ event2vector/models.py
8
+ event2vector.egg-info/PKG-INFO
9
+ event2vector.egg-info/SOURCES.txt
10
+ event2vector.egg-info/dependency_links.txt
11
+ event2vector.egg-info/requires.txt
12
+ event2vector.egg-info/top_level.txt
@@ -0,0 +1,8 @@
1
+ torch
2
+ numpy
3
+ tqdm
4
+ matplotlib
5
+ scikit-learn
6
+ openTSNE
7
+ gensim
8
+ seaborn
@@ -0,0 +1 @@
1
+ event2vector
@@ -0,0 +1,33 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "event2vector"
7
+
8
+ version = "0.1.0"
9
+ description = "A geometric approach to learning composable representations of event sequences."
10
+ readme = "README.md"
11
+ authors = [
12
+ { name = "Antonin Sulc" },
13
+ ]
14
+ license = { text = "MIT" }
15
+ classifiers = [
16
+ "Programming Language :: Python :: 3",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: OS Independent",
19
+ ]
20
+ requires-python = ">=3.6"
21
+ dependencies = [
22
+ "torch",
23
+ "numpy",
24
+ "tqdm",
25
+ "matplotlib",
26
+ "scikit-learn",
27
+ "openTSNE",
28
+ "gensim",
29
+ "seaborn",
30
+ ]
31
+
32
+ [project.urls]
33
+ "Homepage" = "https://github.com/sulcantonin/event2vec_public"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,28 @@
1
+ from setuptools import setup, find_packages
2
+
3
+ setup(
4
+ name='event2vector',
5
+ version='0.1.0',
6
+ author='Antonin Sulc',
7
+ description='A geometric approach to learning composable representations of event sequences.',
8
+ long_description=open('README.md').read(),
9
+ long_description_content_type='text/markdown',
10
+ packages=find_packages(),
11
+ install_requires=[
12
+ 'torch',
13
+ 'numpy',
14
+ 'tqdm',
15
+ 'matplotlib',
16
+ 'scikit-learn',
17
+ 'openTSNE',
18
+ 'gensim',
19
+ 'seaborn'
20
+ ],
21
+ classifiers=[
22
+ 'Programming Language :: Python :: 3',
23
+ 'License :: OSI Approved :: MIT License',
24
+ 'Operating System :: OS Independent',
25
+ ],
26
+ python_requires='>=3.6',
27
+ )
28
+