easy-detm 0.1.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,22 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Jm Su
4
+ Portions copyright (c) 2021 Adji Bousso Dieng, Francisco Ruiz, David Blei
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
@@ -0,0 +1,192 @@
1
+ Metadata-Version: 2.1
2
+ Name: easy-detm
3
+ Version: 0.1.1
4
+ Summary: A simple, easy-to-use toolkit for Dynamic Embedded Topic Models on temporal document collections.
5
+ Author: Jm Su
6
+ License: MIT
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Science/Research
9
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
10
+ Classifier: License :: OSI Approved :: MIT License
11
+ Classifier: Programming Language :: Python :: 3
12
+ Classifier: Programming Language :: Python :: 3.7
13
+ Classifier: Programming Language :: Python :: 3.8
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Requires-Python: >=3.7
19
+ Description-Content-Type: text/markdown
20
+ License-File: LICENSE
21
+ Requires-Dist: torch>=1.7.0
22
+ Requires-Dist: numpy>=1.19.0
23
+ Requires-Dist: scipy>=1.5.0
24
+ Requires-Dist: pandas>=1.1.0
25
+ Requires-Dist: scikit-learn>=0.23.0
26
+ Requires-Dist: matplotlib>=3.3.0
27
+ Requires-Dist: seaborn>=0.11.0
28
+ Requires-Dist: umap-learn>=0.5.0
29
+ Requires-Dist: plotly>=5.0.0
30
+ Requires-Dist: scienceplots>=2.0.0
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=6.0; extra == "dev"
33
+ Requires-Dist: pytest-cov>=2.0; extra == "dev"
34
+ Requires-Dist: black>=20.0; extra == "dev"
35
+ Requires-Dist: flake8>=3.8; extra == "dev"
36
+
37
+ # easy-detm Package Document
38
+
39
+ ## Package Scope
40
+
41
+ This package provides a Python interface for training and visualizing the Dynamic
42
+ Embedded Topic Model (DETM).
43
+
44
+ The current data API is intentionally simple:
45
+
46
+ ```python
47
+ documents: List[str]
48
+ timestamps: List[int]
49
+ ```
50
+
51
+
52
+ ## Main API
53
+
54
+ ### Model
55
+
56
+ ```python
57
+ from easy_detm import DETMModel
58
+ ```
59
+
60
+ `DETMModel` is the high-level class for:
61
+
62
+ - creating the DETM model,
63
+ - fitting it to temporal documents,
64
+ - extracting topics,
65
+ - inferring document-topic distributions,
66
+ - saving and loading checkpoints,
67
+ - evaluating topic coherence and topic diversity.
68
+
69
+ ### Data
70
+
71
+ ```python
72
+ from easy_detm.data import create_dataset_from_list, DocumentCorpus
73
+ ```
74
+
75
+ Use `create_dataset_from_list()` for most workflows. Use `DocumentCorpus` only
76
+ when you need to manually control train/validation/test splits.
77
+
78
+ ### Visualization
79
+
80
+ ```python
81
+ from easy_detm import (
82
+ configure_cjk_fonts,
83
+ visualize_embeddings,
84
+ visualize_embeddings_over_time,
85
+ visualize_topic_evolution,
86
+ )
87
+ ```
88
+
89
+ Visualization functions use the learned model parameters. They do not retrain or
90
+ modify the model. `configure_cjk_fonts()` is called automatically by the
91
+ visualization module; call it manually to inspect or reset the font setup used
92
+ to render Chinese, Japanese, Korean, and English labels.
93
+
94
+ ### Topic Metrics
95
+
96
+ ```python
97
+ diversity = model.get_topic_diversity(num_words=10)
98
+ coherence = model.get_topic_coherence(data=train, num_words=10)
99
+ ```
100
+
101
+ `get_topic_diversity()` uses only the trained topic-word distributions.
102
+ `get_topic_coherence()` also needs a reference corpus in DETM format, usually
103
+ the training split. If you call it on a model restored with `load()`, pass
104
+ `data=train` because checkpoints store model parameters and vocabulary, not the
105
+ original corpus.
106
+
107
+
108
+ ## Input Requirements
109
+
110
+ ### Documents
111
+
112
+ Documents should be strings where tokens are separated by whitespace:
113
+
114
+ ```python
115
+ documents = [
116
+ "climate carbon emissions",
117
+ "trade market finance",
118
+ ]
119
+ ```
120
+
121
+ The current package does not perform advanced NLP preprocessing. Recommended
122
+ preprocessing before calling the package:
123
+
124
+ - lowercase text,
125
+ - remove or normalize punctuation,
126
+ - remove domain-specific noise,
127
+ - tokenize consistently,
128
+ - optionally remove stopwords,
129
+ - optionally lemmatize or stem terms.
130
+
131
+ ### Timestamps
132
+
133
+ Timestamps should be integers:
134
+
135
+ ```python
136
+ timestamps = [0, 0, 1, 1, 2, 2]
137
+ ```
138
+
139
+ Recommended convention:
140
+
141
+ - use zero-based indices,
142
+ - keep time IDs contiguous (`0, 1, ..., num_times - 1`),
143
+ - make sure every document has exactly one timestamp (both lists have equal length).
144
+
145
+ ## Hyperparameter Notes
146
+
147
+
148
+ Important parameters:
149
+
150
+ - `num_topics`: number of topics.
151
+ - `num_times`: number of time periods.
152
+ - `rho_size`: topic embedding dimension.
153
+ - `emb_size`: word embedding dimension.
154
+ - `t_hidden_size`: hidden size for the theta encoder.
155
+ - `eta_hidden_size`: hidden size for the eta LSTM.
156
+ - `eta_nlayers`: number of LSTM layers for eta.
157
+ - `delta`: random-walk prior variance used by the original DETM implementation.
158
+ - `enc_drop`: dropout in the theta encoder.
159
+ - `batch_size`: minibatch size.
160
+ - `learning_rate`: optimizer learning rate.
161
+
162
+ ## Output Interpretation
163
+
164
+ ### Topics
165
+
166
+ `model.get_topics()` returns top words from the learned topic-word distributions.
167
+ Because DETM is dynamic, each topic can have different top words at different
168
+ time points.
169
+
170
+ ### Document-Topic Matrix
171
+
172
+ `model.get_document_topics()` returns an array with shape:
173
+
174
+ ```text
175
+ num_documents x num_topics
176
+ ```
177
+
178
+ Each row is a topic-proportion vector for one input document.
179
+
180
+ ### Visualizations
181
+
182
+ - Embedding plots show topics and words in a shared 2D projection.
183
+ - Topic evolution plots show word probability changes over time for one topic.
184
+
185
+ ## Acknowledgements
186
+
187
+ The core DETM model implementation is adapted from the original DETM code by
188
+ Adji Bousso Dieng, Francisco J. R. Ruiz, and David M. Blei:
189
+ https://github.com/adjidieng/DETM
190
+
191
+ Please cite the original paper when using the DETM model:
192
+ "The Dynamic Embedded Topic Model" (Dieng, Ruiz, and Blei, 2019).
@@ -0,0 +1,156 @@
1
+ # easy-detm Package Document
2
+
3
+ ## Package Scope
4
+
5
+ This package provides a Python interface for training and visualizing the Dynamic
6
+ Embedded Topic Model (DETM).
7
+
8
+ The current data API is intentionally simple:
9
+
10
+ ```python
11
+ documents: List[str]
12
+ timestamps: List[int]
13
+ ```
14
+
15
+
16
+ ## Main API
17
+
18
+ ### Model
19
+
20
+ ```python
21
+ from easy_detm import DETMModel
22
+ ```
23
+
24
+ `DETMModel` is the high-level class for:
25
+
26
+ - creating the DETM model,
27
+ - fitting it to temporal documents,
28
+ - extracting topics,
29
+ - inferring document-topic distributions,
30
+ - saving and loading checkpoints,
31
+ - evaluating topic coherence and topic diversity.
32
+
33
+ ### Data
34
+
35
+ ```python
36
+ from easy_detm.data import create_dataset_from_list, DocumentCorpus
37
+ ```
38
+
39
+ Use `create_dataset_from_list()` for most workflows. Use `DocumentCorpus` only
40
+ when you need to manually control train/validation/test splits.
41
+
42
+ ### Visualization
43
+
44
+ ```python
45
+ from easy_detm import (
46
+ configure_cjk_fonts,
47
+ visualize_embeddings,
48
+ visualize_embeddings_over_time,
49
+ visualize_topic_evolution,
50
+ )
51
+ ```
52
+
53
+ Visualization functions use the learned model parameters. They do not retrain or
54
+ modify the model. `configure_cjk_fonts()` is called automatically by the
55
+ visualization module; call it manually to inspect or reset the font setup used
56
+ to render Chinese, Japanese, Korean, and English labels.
57
+
58
+ ### Topic Metrics
59
+
60
+ ```python
61
+ diversity = model.get_topic_diversity(num_words=10)
62
+ coherence = model.get_topic_coherence(data=train, num_words=10)
63
+ ```
64
+
65
+ `get_topic_diversity()` uses only the trained topic-word distributions.
66
+ `get_topic_coherence()` also needs a reference corpus in DETM format, usually
67
+ the training split. If you call it on a model restored with `load()`, pass
68
+ `data=train` because checkpoints store model parameters and vocabulary, not the
69
+ original corpus.
70
+
71
+
72
+ ## Input Requirements
73
+
74
+ ### Documents
75
+
76
+ Documents should be strings where tokens are separated by whitespace:
77
+
78
+ ```python
79
+ documents = [
80
+ "climate carbon emissions",
81
+ "trade market finance",
82
+ ]
83
+ ```
84
+
85
+ The current package does not perform advanced NLP preprocessing. Recommended
86
+ preprocessing before calling the package:
87
+
88
+ - lowercase text,
89
+ - remove or normalize punctuation,
90
+ - remove domain-specific noise,
91
+ - tokenize consistently,
92
+ - optionally remove stopwords,
93
+ - optionally lemmatize or stem terms.
94
+
95
+ ### Timestamps
96
+
97
+ Timestamps should be integers:
98
+
99
+ ```python
100
+ timestamps = [0, 0, 1, 1, 2, 2]
101
+ ```
102
+
103
+ Recommended convention:
104
+
105
+ - use zero-based indices,
106
+ - keep time IDs contiguous (`0, 1, ..., num_times - 1`),
107
+ - make sure every document has exactly one timestamp (both lists have equal length).
108
+
109
+ ## Hyperparameter Notes
110
+
111
+
112
+ Important parameters:
113
+
114
+ - `num_topics`: number of topics.
115
+ - `num_times`: number of time periods.
116
+ - `rho_size`: topic embedding dimension.
117
+ - `emb_size`: word embedding dimension.
118
+ - `t_hidden_size`: hidden size for the theta encoder.
119
+ - `eta_hidden_size`: hidden size for the eta LSTM.
120
+ - `eta_nlayers`: number of LSTM layers for eta.
121
+ - `delta`: random-walk prior variance used by the original DETM implementation.
122
+ - `enc_drop`: dropout in the theta encoder.
123
+ - `batch_size`: minibatch size.
124
+ - `learning_rate`: optimizer learning rate.
125
+
126
+ ## Output Interpretation
127
+
128
+ ### Topics
129
+
130
+ `model.get_topics()` returns top words from the learned topic-word distributions.
131
+ Because DETM is dynamic, each topic can have different top words at different
132
+ time points.
133
+
134
+ ### Document-Topic Matrix
135
+
136
+ `model.get_document_topics()` returns an array with shape:
137
+
138
+ ```text
139
+ num_documents x num_topics
140
+ ```
141
+
142
+ Each row is a topic-proportion vector for one input document.
143
+
144
+ ### Visualizations
145
+
146
+ - Embedding plots show topics and words in a shared 2D projection.
147
+ - Topic evolution plots show word probability changes over time for one topic.
148
+
149
+ ## Acknowledgements
150
+
151
+ The core DETM model implementation is adapted from the original DETM code by
152
+ Adji Bousso Dieng, Francisco J. R. Ruiz, and David M. Blei:
153
+ https://github.com/adjidieng/DETM
154
+
155
+ Please cite the original paper when using the DETM model:
156
+ "The Dynamic Embedded Topic Model" (Dieng, Ruiz, and Blei, 2019).
@@ -0,0 +1,40 @@
1
+ """Dynamic Embedded Topic Model (DETM) package.
2
+
3
+ This package provides a high-level API for training and using Dynamic Embedded
4
+ Topic Models on temporal document collections.
5
+
6
+ Example:
7
+ >>> from easy_detm import DETMModel
8
+ >>> from easy_detm.data import create_dataset_from_list
9
+ >>>
10
+ >>> vocab, train, valid, test = create_dataset_from_list(documents, timestamps)
11
+ >>> model = DETMModel(num_topics=50, vocab=vocab, num_times=len(set(timestamps)))
12
+ >>> model.fit(train, valid, test_data=test, epochs=100)
13
+ >>> topics = model.get_topics(num_words=10)
14
+ """
15
+
16
+ from .detm_wrapper import DETMModel
17
+ from .model import DETM
18
+ from .utils import get_topic_coherence, get_topic_diversity
19
+ from .visualization import (
20
+ configure_cjk_fonts,
21
+ visualize_embeddings,
22
+ visualize_embeddings_over_time,
23
+ visualize_topic_evolution,
24
+ plot_topic_sankey,
25
+ plot_topic_sankey_individual
26
+ )
27
+
28
+ __version__ = '0.1.1'
29
+ __all__ = [
30
+ 'DETMModel',
31
+ 'DETM',
32
+ 'get_topic_coherence',
33
+ 'get_topic_diversity',
34
+ 'configure_cjk_fonts',
35
+ 'visualize_embeddings',
36
+ 'visualize_embeddings_over_time',
37
+ 'visualize_topic_evolution',
38
+ 'plot_topic_sankey',
39
+ 'plot_topic_sankey_individual'
40
+ ]
@@ -0,0 +1,29 @@
1
+ """Simplified data loading module for DETM.
2
+
3
+ This module provides a simple interface for loading data from Python lists.
4
+
5
+ Main entry point:
6
+ create_dataset_from_list() - Create dataset from list of documents and timestamps
7
+
8
+ Example:
9
+ >>> from easy_detm.data import create_dataset_from_list
10
+ >>>
11
+ >>> # Your raw data
12
+ >>> documents = ["doc 1", "doc 2", "doc 3", ...]
13
+ >>> timestamps = [0, 0, 1, 1, 2, ...]
14
+ >>>
15
+ >>> # Create dataset (auto-splits and builds vocab)
16
+ >>> vocab, train, valid, test = create_dataset_from_list(documents, timestamps)
17
+ >>>
18
+ >>> # Use with model
19
+ >>> from easy_detm import DETMModel
20
+ >>> model = DETMModel(num_topics=10, vocab=vocab, num_times=len(set(timestamps)))
21
+ >>> model.fit(train, valid, epochs=100)
22
+ """
23
+
24
+ from .loaders import create_dataset_from_list, DocumentCorpus
25
+
26
+ __all__ = [
27
+ 'create_dataset_from_list',
28
+ 'DocumentCorpus',
29
+ ]