easy-detm 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- easy_detm-0.1.1/LICENSE +22 -0
- easy_detm-0.1.1/PKG-INFO +192 -0
- easy_detm-0.1.1/README.md +156 -0
- easy_detm-0.1.1/easy_detm/__init__.py +40 -0
- easy_detm-0.1.1/easy_detm/data/__init__.py +29 -0
- easy_detm-0.1.1/easy_detm/data/loaders.py +294 -0
- easy_detm-0.1.1/easy_detm/data/preprocessor.py +146 -0
- easy_detm-0.1.1/easy_detm/detm_wrapper.py +695 -0
- easy_detm-0.1.1/easy_detm/model.py +215 -0
- easy_detm-0.1.1/easy_detm/utils.py +263 -0
- easy_detm-0.1.1/easy_detm/visualization.py +1054 -0
- easy_detm-0.1.1/easy_detm.egg-info/PKG-INFO +192 -0
- easy_detm-0.1.1/easy_detm.egg-info/SOURCES.txt +17 -0
- easy_detm-0.1.1/easy_detm.egg-info/dependency_links.txt +1 -0
- easy_detm-0.1.1/easy_detm.egg-info/requires.txt +16 -0
- easy_detm-0.1.1/easy_detm.egg-info/top_level.txt +1 -0
- easy_detm-0.1.1/pyproject.toml +3 -0
- easy_detm-0.1.1/setup.cfg +4 -0
- easy_detm-0.1.1/setup.py +51 -0
easy_detm-0.1.1/LICENSE
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Jm Su
|
|
4
|
+
Portions copyright (c) 2021 Adji Bousso Dieng, Francisco Ruiz, David Blei
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
|
+
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
|
+
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|
easy_detm-0.1.1/PKG-INFO
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: easy-detm
|
|
3
|
+
Version: 0.1.1
|
|
4
|
+
Summary: A simple, easy-to-use toolkit for Dynamic Embedded Topic Models on temporal document collections.
|
|
5
|
+
Author: Jm Su
|
|
6
|
+
License: MIT
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Science/Research
|
|
9
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
10
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
11
|
+
Classifier: Programming Language :: Python :: 3
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.7
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
18
|
+
Requires-Python: >=3.7
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: torch>=1.7.0
|
|
22
|
+
Requires-Dist: numpy>=1.19.0
|
|
23
|
+
Requires-Dist: scipy>=1.5.0
|
|
24
|
+
Requires-Dist: pandas>=1.1.0
|
|
25
|
+
Requires-Dist: scikit-learn>=0.23.0
|
|
26
|
+
Requires-Dist: matplotlib>=3.3.0
|
|
27
|
+
Requires-Dist: seaborn>=0.11.0
|
|
28
|
+
Requires-Dist: umap-learn>=0.5.0
|
|
29
|
+
Requires-Dist: plotly>=5.0.0
|
|
30
|
+
Requires-Dist: scienceplots>=2.0.0
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=6.0; extra == "dev"
|
|
33
|
+
Requires-Dist: pytest-cov>=2.0; extra == "dev"
|
|
34
|
+
Requires-Dist: black>=20.0; extra == "dev"
|
|
35
|
+
Requires-Dist: flake8>=3.8; extra == "dev"
|
|
36
|
+
|
|
37
|
+
# easy-detm Package Document
|
|
38
|
+
|
|
39
|
+
## Package Scope
|
|
40
|
+
|
|
41
|
+
This package provides a Python interface for training and visualizing the Dynamic
|
|
42
|
+
Embedded Topic Model (DETM).
|
|
43
|
+
|
|
44
|
+
The current data API is intentionally simple:
|
|
45
|
+
|
|
46
|
+
```python
|
|
47
|
+
documents: List[str]
|
|
48
|
+
timestamps: List[int]
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
## Main API
|
|
53
|
+
|
|
54
|
+
### Model
|
|
55
|
+
|
|
56
|
+
```python
|
|
57
|
+
from easy_detm import DETMModel
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
`DETMModel` is the high-level class for:
|
|
61
|
+
|
|
62
|
+
- creating the DETM model,
|
|
63
|
+
- fitting it to temporal documents,
|
|
64
|
+
- extracting topics,
|
|
65
|
+
- inferring document-topic distributions,
|
|
66
|
+
- saving and loading checkpoints,
|
|
67
|
+
- evaluating topic coherence and topic diversity.
|
|
68
|
+
|
|
69
|
+
### Data
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from easy_detm.data import create_dataset_from_list, DocumentCorpus
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Use `create_dataset_from_list()` for most workflows. Use `DocumentCorpus` only
|
|
76
|
+
when you need to manually control train/validation/test splits.
|
|
77
|
+
|
|
78
|
+
### Visualization
|
|
79
|
+
|
|
80
|
+
```python
|
|
81
|
+
from easy_detm import (
|
|
82
|
+
configure_cjk_fonts,
|
|
83
|
+
visualize_embeddings,
|
|
84
|
+
visualize_embeddings_over_time,
|
|
85
|
+
visualize_topic_evolution,
|
|
86
|
+
)
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Visualization functions use the learned model parameters. They do not retrain or
|
|
90
|
+
modify the model. `configure_cjk_fonts()` is called automatically by the
|
|
91
|
+
visualization module and can also be called manually to inspect or reset CJK font
|
|
92
|
+
support for Korean, Japanese, Chinese, and English labels.
|
|
93
|
+
|
|
94
|
+
### Topic Metrics
|
|
95
|
+
|
|
96
|
+
```python
|
|
97
|
+
diversity = model.get_topic_diversity(num_words=10)
|
|
98
|
+
coherence = model.get_topic_coherence(data=train, num_words=10)
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
`get_topic_diversity()` uses only the trained topic-word distributions.
|
|
102
|
+
`get_topic_coherence()` also needs a reference corpus in DETM format, usually
|
|
103
|
+
the training split. If you call it on a model restored with `load()`, pass
|
|
104
|
+
`data=train` because checkpoints store model parameters and vocabulary, not the
|
|
105
|
+
original corpus.
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
## Input Requirements
|
|
109
|
+
|
|
110
|
+
### Documents
|
|
111
|
+
|
|
112
|
+
Documents should be strings where tokens are separated by whitespace:
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
documents = [
|
|
116
|
+
"climate carbon emissions",
|
|
117
|
+
"trade market finance",
|
|
118
|
+
]
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
The current package does not perform advanced NLP preprocessing. Recommended
|
|
122
|
+
preprocessing before calling the package:
|
|
123
|
+
|
|
124
|
+
- lowercase text,
|
|
125
|
+
- remove or normalize punctuation,
|
|
126
|
+
- remove domain-specific noise,
|
|
127
|
+
- tokenize consistently,
|
|
128
|
+
- optionally remove stopwords,
|
|
129
|
+
- optionally lemmatize or stem terms.
|
|
130
|
+
|
|
131
|
+
### Timestamps
|
|
132
|
+
|
|
133
|
+
Timestamps should be integers:
|
|
134
|
+
|
|
135
|
+
```python
|
|
136
|
+
timestamps = [0, 0, 1, 1, 2, 2]
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
Recommended convention:
|
|
140
|
+
|
|
141
|
+
- use zero-based indices,
|
|
142
|
+
- keep time IDs contiguous,
|
|
143
|
+
- make sure every document has one timestamp.
|
|
144
|
+
|
|
145
|
+
## Hyperparameter Notes
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
Important parameters:
|
|
149
|
+
|
|
150
|
+
- `num_topics`: number of topics.
|
|
151
|
+
- `num_times`: number of time periods.
|
|
152
|
+
- `rho_size`: topic embedding dimension.
|
|
153
|
+
- `emb_size`: word embedding dimension.
|
|
154
|
+
- `t_hidden_size`: hidden size for the theta encoder.
|
|
155
|
+
- `eta_hidden_size`: hidden size for the eta LSTM.
|
|
156
|
+
- `eta_nlayers`: number of LSTM layers for eta.
|
|
157
|
+
- `delta`: random-walk prior variance used by the original DETM implementation.
|
|
158
|
+
- `enc_drop`: dropout in the theta encoder.
|
|
159
|
+
- `batch_size`: minibatch size.
|
|
160
|
+
- `learning_rate`: optimizer learning rate.
|
|
161
|
+
|
|
162
|
+
## Output Interpretation
|
|
163
|
+
|
|
164
|
+
### Topics
|
|
165
|
+
|
|
166
|
+
`model.get_topics()` returns top words from the learned topic-word distributions.
|
|
167
|
+
Because DETM is dynamic, each topic can have different top words at different
|
|
168
|
+
time points.
|
|
169
|
+
|
|
170
|
+
### Document-Topic Matrix
|
|
171
|
+
|
|
172
|
+
`model.get_document_topics()` returns an array with shape:
|
|
173
|
+
|
|
174
|
+
```text
|
|
175
|
+
num_documents x num_topics
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
Each row is a topic-proportion vector for one input document.
|
|
179
|
+
|
|
180
|
+
### Visualizations
|
|
181
|
+
|
|
182
|
+
- Embedding plots show topics and words in a shared 2D projection.
|
|
183
|
+
- Topic evolution plots show word probability changes over time for one topic.
|
|
184
|
+
|
|
185
|
+
## Acknowledgements
|
|
186
|
+
|
|
187
|
+
The core DETM model implementation is adapted from the original DETM code by
|
|
188
|
+
Adji Bousso Dieng, Francisco J. R. Ruiz, and David M. Blei:
|
|
189
|
+
https://github.com/adjidieng/DETM
|
|
190
|
+
|
|
191
|
+
Please cite the original paper when using the DETM model:
|
|
192
|
+
"The Dynamic Embedded Topic Model" (Dieng, Ruiz, and Blei, 2019).
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
# easy-detm Package Document
|
|
2
|
+
|
|
3
|
+
## Package Scope
|
|
4
|
+
|
|
5
|
+
This package provides a Python interface for training and visualizing the Dynamic
|
|
6
|
+
Embedded Topic Model (DETM).
|
|
7
|
+
|
|
8
|
+
The current data API is intentionally simple:
|
|
9
|
+
|
|
10
|
+
```python
|
|
11
|
+
documents: List[str]
|
|
12
|
+
timestamps: List[int]
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
## Main API
|
|
17
|
+
|
|
18
|
+
### Model
|
|
19
|
+
|
|
20
|
+
```python
|
|
21
|
+
from easy_detm import DETMModel
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
`DETMModel` is the high-level class for:
|
|
25
|
+
|
|
26
|
+
- creating the DETM model,
|
|
27
|
+
- fitting it to temporal documents,
|
|
28
|
+
- extracting topics,
|
|
29
|
+
- inferring document-topic distributions,
|
|
30
|
+
- saving and loading checkpoints,
|
|
31
|
+
- evaluating topic coherence and topic diversity.
|
|
32
|
+
|
|
33
|
+
### Data
|
|
34
|
+
|
|
35
|
+
```python
|
|
36
|
+
from easy_detm.data import create_dataset_from_list, DocumentCorpus
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
Use `create_dataset_from_list()` for most workflows. Use `DocumentCorpus` only
|
|
40
|
+
when you need to manually control train/validation/test splits.
|
|
41
|
+
|
|
42
|
+
### Visualization
|
|
43
|
+
|
|
44
|
+
```python
|
|
45
|
+
from easy_detm import (
|
|
46
|
+
configure_cjk_fonts,
|
|
47
|
+
visualize_embeddings,
|
|
48
|
+
visualize_embeddings_over_time,
|
|
49
|
+
visualize_topic_evolution,
|
|
50
|
+
)
|
|
51
|
+
```
|
|
52
|
+
|
|
53
|
+
Visualization functions use the learned model parameters. They do not retrain or
|
|
54
|
+
modify the model. `configure_cjk_fonts()` is called automatically by the
|
|
55
|
+
visualization module and can also be called manually to inspect or reset CJK font
|
|
56
|
+
support for Korean, Japanese, Chinese, and English labels.
|
|
57
|
+
|
|
58
|
+
### Topic Metrics
|
|
59
|
+
|
|
60
|
+
```python
|
|
61
|
+
diversity = model.get_topic_diversity(num_words=10)
|
|
62
|
+
coherence = model.get_topic_coherence(data=train, num_words=10)
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
`get_topic_diversity()` uses only the trained topic-word distributions.
|
|
66
|
+
`get_topic_coherence()` also needs a reference corpus in DETM format, usually
|
|
67
|
+
the training split. If you call it on a model restored with `load()`, pass
|
|
68
|
+
`data=train` because checkpoints store model parameters and vocabulary, not the
|
|
69
|
+
original corpus.
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
## Input Requirements
|
|
73
|
+
|
|
74
|
+
### Documents
|
|
75
|
+
|
|
76
|
+
Documents should be strings where tokens are separated by whitespace:
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
documents = [
|
|
80
|
+
"climate carbon emissions",
|
|
81
|
+
"trade market finance",
|
|
82
|
+
]
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
The current package does not perform advanced NLP preprocessing. Recommended
|
|
86
|
+
preprocessing before calling the package:
|
|
87
|
+
|
|
88
|
+
- lowercase text,
|
|
89
|
+
- remove or normalize punctuation,
|
|
90
|
+
- remove domain-specific noise,
|
|
91
|
+
- tokenize consistently,
|
|
92
|
+
- optionally remove stopwords,
|
|
93
|
+
- optionally lemmatize or stem terms.
|
|
94
|
+
|
|
95
|
+
### Timestamps
|
|
96
|
+
|
|
97
|
+
Timestamps should be integers:
|
|
98
|
+
|
|
99
|
+
```python
|
|
100
|
+
timestamps = [0, 0, 1, 1, 2, 2]
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
Recommended convention:
|
|
104
|
+
|
|
105
|
+
- use zero-based indices,
|
|
106
|
+
- keep time IDs contiguous,
|
|
107
|
+
- make sure every document has one timestamp.
|
|
108
|
+
|
|
109
|
+
## Hyperparameter Notes
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
Important parameters:
|
|
113
|
+
|
|
114
|
+
- `num_topics`: number of topics.
|
|
115
|
+
- `num_times`: number of time periods.
|
|
116
|
+
- `rho_size`: topic embedding dimension.
|
|
117
|
+
- `emb_size`: word embedding dimension.
|
|
118
|
+
- `t_hidden_size`: hidden size for the theta encoder.
|
|
119
|
+
- `eta_hidden_size`: hidden size for the eta LSTM.
|
|
120
|
+
- `eta_nlayers`: number of LSTM layers for eta.
|
|
121
|
+
- `delta`: random-walk prior variance used by the original DETM implementation.
|
|
122
|
+
- `enc_drop`: dropout in the theta encoder.
|
|
123
|
+
- `batch_size`: minibatch size.
|
|
124
|
+
- `learning_rate`: optimizer learning rate.
|
|
125
|
+
|
|
126
|
+
## Output Interpretation
|
|
127
|
+
|
|
128
|
+
### Topics
|
|
129
|
+
|
|
130
|
+
`model.get_topics()` returns top words from the learned topic-word distributions.
|
|
131
|
+
Because DETM is dynamic, each topic can have different top words at different
|
|
132
|
+
time points.
|
|
133
|
+
|
|
134
|
+
### Document-Topic Matrix
|
|
135
|
+
|
|
136
|
+
`model.get_document_topics()` returns an array with shape:
|
|
137
|
+
|
|
138
|
+
```text
|
|
139
|
+
num_documents x num_topics
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
Each row is a topic-proportion vector for one input document.
|
|
143
|
+
|
|
144
|
+
### Visualizations
|
|
145
|
+
|
|
146
|
+
- Embedding plots show topics and words in a shared 2D projection.
|
|
147
|
+
- Topic evolution plots show word probability changes over time for one topic.
|
|
148
|
+
|
|
149
|
+
## Acknowledgements
|
|
150
|
+
|
|
151
|
+
The core DETM model implementation is adapted from the original DETM code by
|
|
152
|
+
Adji Bousso Dieng, Francisco J. R. Ruiz, and David M. Blei:
|
|
153
|
+
https://github.com/adjidieng/DETM
|
|
154
|
+
|
|
155
|
+
Please cite the original paper when using the DETM model:
|
|
156
|
+
"The Dynamic Embedded Topic Model" (Dieng, Ruiz, and Blei, 2019).
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Dynamic Embedded Topic Model (DETM) package.
|
|
2
|
+
|
|
3
|
+
This package provides a high-level API for training and using Dynamic Embedded
|
|
4
|
+
Topic Models on temporal document collections.
|
|
5
|
+
|
|
6
|
+
Example:
|
|
7
|
+
>>> from easy_detm import DETMModel
|
|
8
|
+
>>> from easy_detm.data import create_dataset_from_list
|
|
9
|
+
>>>
|
|
10
|
+
>>> vocab, train, valid, test = create_dataset_from_list(documents, timestamps)
|
|
11
|
+
>>> model = DETMModel(num_topics=50, vocab=vocab, num_times=len(set(timestamps)))
|
|
12
|
+
>>> model.fit(train, valid, test_data=test, epochs=100)
|
|
13
|
+
>>> topics = model.get_topics(num_words=10)
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from .detm_wrapper import DETMModel
|
|
17
|
+
from .model import DETM
|
|
18
|
+
from .utils import get_topic_coherence, get_topic_diversity
|
|
19
|
+
from .visualization import (
|
|
20
|
+
configure_cjk_fonts,
|
|
21
|
+
visualize_embeddings,
|
|
22
|
+
visualize_embeddings_over_time,
|
|
23
|
+
visualize_topic_evolution,
|
|
24
|
+
plot_topic_sankey,
|
|
25
|
+
plot_topic_sankey_individual
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
__version__ = '0.1.1'
|
|
29
|
+
__all__ = [
|
|
30
|
+
'DETMModel',
|
|
31
|
+
'DETM',
|
|
32
|
+
'get_topic_coherence',
|
|
33
|
+
'get_topic_diversity',
|
|
34
|
+
'configure_cjk_fonts',
|
|
35
|
+
'visualize_embeddings',
|
|
36
|
+
'visualize_embeddings_over_time',
|
|
37
|
+
'visualize_topic_evolution',
|
|
38
|
+
'plot_topic_sankey',
|
|
39
|
+
'plot_topic_sankey_individual'
|
|
40
|
+
]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Simplified data loading module for DETM.
|
|
2
|
+
|
|
3
|
+
This module provides a simple interface for loading data from Python lists.
|
|
4
|
+
|
|
5
|
+
Main entry point:
|
|
6
|
+
create_dataset_from_list() - Create dataset from list of documents and timestamps
|
|
7
|
+
|
|
8
|
+
Example:
|
|
9
|
+
>>> from easy_detm.data import create_dataset_from_list
|
|
10
|
+
>>>
|
|
11
|
+
>>> # Your raw data
|
|
12
|
+
>>> documents = ["doc 1", "doc 2", "doc 3", ...]
|
|
13
|
+
>>> timestamps = [0, 0, 1, 1, 2, ...]
|
|
14
|
+
>>>
|
|
15
|
+
>>> # Create dataset (auto-splits and builds vocab)
|
|
16
|
+
>>> vocab, train, valid, test = create_dataset_from_list(documents, timestamps)
|
|
17
|
+
>>>
|
|
18
|
+
>>> # Use with model
|
|
19
|
+
>>> from easy_detm import DETMModel
|
|
20
|
+
>>> model = DETMModel(num_topics=10, vocab=vocab, num_times=len(set(timestamps)))
|
|
21
|
+
>>> model.fit(train, valid, epochs=100)
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from .loaders import create_dataset_from_list, DocumentCorpus
|
|
25
|
+
|
|
26
|
+
__all__ = [
|
|
27
|
+
'create_dataset_from_list',
|
|
28
|
+
'DocumentCorpus',
|
|
29
|
+
]
|