tactik 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tactik-0.1.3/PKG-INFO +269 -0
- tactik-0.1.3/README.md +238 -0
- tactik-0.1.3/license.txt +21 -0
- tactik-0.1.3/pyproject.toml +43 -0
- tactik-0.1.3/setup.cfg +4 -0
- tactik-0.1.3/src/tactik/__init__.py +19 -0
- tactik-0.1.3/src/tactik/clustering_pipeline.py +847 -0
- tactik-0.1.3/src/tactik/clustering_tuning.py +750 -0
- tactik-0.1.3/src/tactik/embeddings.py +1632 -0
- tactik-0.1.3/src/tactik/preprocessing.py +970 -0
- tactik-0.1.3/src/tactik/topic_extraction.py +594 -0
- tactik-0.1.3/src/tactik/utilities.py +1045 -0
- tactik-0.1.3/src/tactik/visualization.py +484 -0
- tactik-0.1.3/src/tactik.egg-info/PKG-INFO +269 -0
- tactik-0.1.3/src/tactik.egg-info/SOURCES.txt +23 -0
- tactik-0.1.3/src/tactik.egg-info/dependency_links.txt +1 -0
- tactik-0.1.3/src/tactik.egg-info/requires.txt +16 -0
- tactik-0.1.3/src/tactik.egg-info/top_level.txt +1 -0
- tactik-0.1.3/tests/test_clustering.py +0 -0
- tactik-0.1.3/tests/test_clustering_pipeline.py +1283 -0
- tactik-0.1.3/tests/test_clustering_tuning.py +996 -0
- tactik-0.1.3/tests/test_embeddings.py +1075 -0
- tactik-0.1.3/tests/test_preprocessing.py +902 -0
- tactik-0.1.3/tests/test_topic_extraction.py +627 -0
- tactik-0.1.3/tests/test_utilities.py +1102 -0
tactik-0.1.3/PKG-INFO
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tactik
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: A Python library for topic modeling, clustering, and NLP analysis.
|
|
5
|
+
Author-email: "Niklas P. Schulmeyer" <nps0027@auburn.edu>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/npsAub/tactik
|
|
8
|
+
Project-URL: Documentation, https://yourusername.github.io/your-package
|
|
9
|
+
Project-URL: Source, https://github.com/npsAub/tactik
|
|
10
|
+
Project-URL: Issues, https://github.com/npsAub/tactik/issues
|
|
11
|
+
Requires-Python: >=3.8
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: license.txt
|
|
14
|
+
Requires-Dist: matplotlib==3.8.0
|
|
15
|
+
Requires-Dist: numpy==1.24.4
|
|
16
|
+
Requires-Dist: pandas==1.5.3
|
|
17
|
+
Requires-Dist: scikit-learn==1.2.2
|
|
18
|
+
Requires-Dist: scipy==1.10.1
|
|
19
|
+
Requires-Dist: seaborn==0.12.2
|
|
20
|
+
Requires-Dist: tqdm==4.65.0
|
|
21
|
+
Requires-Dist: gensim==4.3.0
|
|
22
|
+
Requires-Dist: hdbscan==0.8.40
|
|
23
|
+
Requires-Dist: nltk
|
|
24
|
+
Requires-Dist: sentence-transformers==4.1.0
|
|
25
|
+
Requires-Dist: spacy==3.8.7
|
|
26
|
+
Requires-Dist: torch==2.3.0
|
|
27
|
+
Requires-Dist: transformers==4.52.3
|
|
28
|
+
Requires-Dist: umap-learn==0.5.7
|
|
29
|
+
Requires-Dist: yake==0.4.8
|
|
30
|
+
Dynamic: license-file
|
|
31
|
+
|
|
32
|
+
<img src="tactik_header.png" alt="TACTIK Header" width="800" style="display:block;margin:auto;"/>
|
|
33
|
+
|
|
34
|
+
# TACTIK
|
|
35
|
+
## Text Analysis, Clustering, Tuning, Information and Keyword Extraction
|
|
36
|
+
|
|
37
|
+
Tactik started as a side project to streamline clustering of aviation-related reports. The pipeline initially faced long processing times and became a bottleneck for analysis. These issues were addressed, and further functionality was added to enable intuitive topic extraction. The pipeline was adapted to work domain-agnostically while keeping the core use case in mind. With this functionality, we decided to release the package publicly so other researchers can contribute to it, build on it, or benefit from the included tools. Thank you for using TACTIK — we hope you find it as useful as we did in our research!
|
|
38
|
+
|
|
39
|
+
## Features
|
|
40
|
+
|
|
41
|
+
- **End-to-End Clustering Pipeline**: Automated workflow from preprocessing to cluster analysis
|
|
42
|
+
- **Modular Design**: Use the different components as standalone modules or full pipelines
|
|
43
|
+
- **Layered Effective Methods**: UMAP dimensionality reduction + HDBSCAN clustering
|
|
44
|
+
- **Hyperparameter Tuning**: Automated parameter optimization using random search
|
|
45
|
+
- **Keyword Extraction**: Multiple methods including TF, TF-IDF, TF-DF, and YAKE
|
|
46
|
+
- **Topic Modeling**: LDA-based topic discovery with BERT-powered semantic matching *(still in development)*
|
|
47
|
+
- **Rich Visualizations**: t-SNE plots with customizable styling and annotations
|
|
48
|
+
- **Memory Efficient**: Optimized for large datasets with lazy evaluation and caching
|
|
49
|
+
|
|
50
|
+
## Installation
|
|
51
|
+
|
|
52
|
+
```bash
|
|
53
|
+
# Install tactik
|
|
54
|
+
pip install tactik
|
|
55
|
+
|
|
56
|
+
# Or install from source
|
|
57
|
+
git clone https://github.com/npsAub/tactik.git
|
|
58
|
+
cd tactik
|
|
59
|
+
pip install -e .
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
### Dependencies
|
|
63
|
+
|
|
64
|
+
```bash
|
|
65
|
+
# Core dependencies
|
|
66
|
+
pip install pandas numpy matplotlib seaborn scikit-learn
|
|
67
|
+
pip install umap-learn hdbscan gensim nltk yake
|
|
68
|
+
pip install transformers torch
|
|
69
|
+
```
|
|
70
|
+
|
|
71
|
+
## Core Components
|
|
72
|
+
|
|
73
|
+
### 1. ClusteringPipeline
|
|
74
|
+
Main orchestrator class that coordinates the entire analysis workflow.
|
|
75
|
+
|
|
76
|
+
**Key Methods:**
|
|
77
|
+
- `preprocess_data()` - Text cleaning and stopword removal
|
|
78
|
+
- `cluster_data()` - Clustering with fixed parameters
|
|
79
|
+
- `tune_and_cluster()` - Clustering with hyperparameter tuning
|
|
80
|
+
- `cluster_and_extract_keywords()` - Integrated clustering + keyword extraction
|
|
81
|
+
- `cluster_and_analyze_topics()` - Integrated clustering + topic modeling
|
|
82
|
+
- `visualize_clusters()` - Create cluster visualizations
|
|
83
|
+
- `get_cluster_summary()` - Generate cluster statistics
|
|
84
|
+
|
|
85
|
+
### 2. Clustering & Tuning
|
|
86
|
+
Low-level clustering functions with hyperparameter optimization.
|
|
87
|
+
|
|
88
|
+
**Key Functions:**
|
|
89
|
+
- `tune_clustering_hyperparameters()` - Random search optimization
|
|
90
|
+
- `apply_best_clustering()` - Apply optimized parameters
|
|
91
|
+
- `full_clustering_pipeline()` - Complete pipeline with tuning
|
|
92
|
+
- `full_clustering_pipeline_fixed_params()` - Pipeline with fixed parameters
|
|
93
|
+
|
|
94
|
+
**Supported Metrics:**
|
|
95
|
+
- `davies_bouldin`: Lower is better (measures cluster separation)
|
|
96
|
+
- `calinski_harabasz`: Higher is better (ratio of between/within cluster dispersion)
|
|
97
|
+
|
|
98
|
+
### 3. Keyword Extraction
|
|
99
|
+
Extract representative keywords from each cluster using multiple methods.
|
|
100
|
+
|
|
101
|
+
**KeywordExtractor Class:**
|
|
102
|
+
- `extract_keywords_per_cluster()` - Extract keywords using multiple methods
|
|
103
|
+
- `save_keywords()` - Save results to CSV
|
|
104
|
+
|
|
105
|
+
**Extraction Methods:**
|
|
106
|
+
- **TF**: Term Frequency
|
|
107
|
+
- **TF-IDF**: Term Frequency–Inverse Document Frequency
|
|
108
|
+
- **TF-DF**: Term Frequency–Document Frequency
|
|
109
|
+
- **YAKE**: Yet Another Keyword Extractor (long and short narratives)
|
|
110
|
+
|
|
111
|
+
### 4. Topic Modeling
|
|
112
|
+
Discover latent topics using LDA and match them to predefined designators.
|
|
113
|
+
|
|
114
|
+
**TopicModeler Class:**
|
|
115
|
+
- `train_lda()` - Train Latent Dirichlet Allocation model
|
|
116
|
+
- `get_cluster_topics()` - Get top topics per cluster
|
|
117
|
+
- `match_designators_to_topics()` - Match topics to designators using BERT embeddings
|
|
118
|
+
- `get_bert_embedding()` - Compute BERT embeddings for semantic matching
|
|
119
|
+
|
|
120
|
+
**Default Aviation Safety Designators:**
|
|
121
|
+
- Inadequate or inaccurate knowledge
|
|
122
|
+
- Poor judgment and decision-making
|
|
123
|
+
- Failure to follow procedures
|
|
124
|
+
- Poor communication
|
|
125
|
+
- Inadequate monitoring or vigilance
|
|
126
|
+
- Task management and prioritization
|
|
127
|
+
- Stress and psychological factors
|
|
128
|
+
- Physical or physiological factors
|
|
129
|
+
- Technical or system failures
|
|
130
|
+
- Environmental factors
|
|
131
|
+
|
|
132
|
+
### 5. Visualization
|
|
133
|
+
Create publication-ready visualizations of clustering results.
|
|
134
|
+
|
|
135
|
+
**Visualization Functions:**
|
|
136
|
+
- `plot_clusters()` - Basic cluster scatter plot
|
|
137
|
+
- `plot_clusters_with_annotations()` - Plot with category annotations
|
|
138
|
+
- `plot_cluster_comparison()` - Side-by-side comparison plots
|
|
139
|
+
- `set_visualization_style()` - Configure plot styling
|
|
140
|
+
- `get_cluster_palette()` - Generate color palettes
|
|
141
|
+
- `get_cluster_markers()` - Generate marker styles
|
|
142
|
+
|
|
143
|
+
## Pipeline Architecture
|
|
144
|
+
|
|
145
|
+
```
|
|
146
|
+
Input Data (DataFrame)
|
|
147
|
+
↓
|
|
148
|
+
Preprocessing
|
|
149
|
+
├── Text cleaning
|
|
150
|
+
├── Stopword removal
|
|
151
|
+
└── Tokenization
|
|
152
|
+
↓
|
|
153
|
+
Vectorization (TF-IDF)
|
|
154
|
+
↓
|
|
155
|
+
Dimensionality Reduction (UMAP)
|
|
156
|
+
↓
|
|
157
|
+
Clustering (HDBSCAN)
|
|
158
|
+
↓
|
|
159
|
+
Visualization (t-SNE)
|
|
160
|
+
↓
|
|
161
|
+
Analysis
|
|
162
|
+
├── Keyword Extraction
|
|
163
|
+
└── Topic Modeling (LDA + BERT)
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
## Performance Considerations
|
|
167
|
+
|
|
168
|
+
### Memory Optimization
|
|
169
|
+
- DataFrame lazy copying
|
|
170
|
+
- Vectorization caching
|
|
171
|
+
- BERT embedding cache with `clear_cache()` method
|
|
172
|
+
- Incremental topic probability calculations
|
|
173
|
+
|
|
174
|
+
### GPU Acceleration
|
|
175
|
+
GPU acceleration is available for BERT computations when initializing TopicModeler with `use_gpu=True`.
|
|
176
|
+
|
|
177
|
+
### Large Datasets
|
|
178
|
+
For large datasets, consider:
|
|
179
|
+
- Disabling t-SNE computation with `compute_tsne=False`
|
|
180
|
+
- Using fixed parameters instead of hyperparameter tuning
|
|
181
|
+
- Clearing BERT embedding cache periodically
|
|
182
|
+
|
|
183
|
+
## Evaluation Metrics
|
|
184
|
+
|
|
185
|
+
- **Davies-Bouldin Score**: Measures average similarity between clusters (lower is better)
|
|
186
|
+
- **Calinski-Harabasz Score**: Ratio of between-cluster to within-cluster variance (higher is better)
|
|
187
|
+
- **Cluster Count**: Number of discovered clusters
|
|
188
|
+
- **Noise Ratio**: Proportion of outlier points
|
|
189
|
+
|
|
190
|
+
## Output Formats
|
|
191
|
+
|
|
192
|
+
### Cluster Summary
|
|
193
|
+
DataFrame with columns: Cluster ID, Size, Percentage
|
|
194
|
+
|
|
195
|
+
### Keywords DataFrame
|
|
196
|
+
DataFrame with columns: cluster, Yake Long, Yake Short, TF, TFIDF, TFDF
|
|
197
|
+
|
|
198
|
+
### Topic Analysis
|
|
199
|
+
Dictionary containing:
|
|
200
|
+
- `cluster_topics`: Mapping of clusters to top topics
|
|
201
|
+
- `topic_designators`: Matching of topics to designators
|
|
202
|
+
- `model`: TopicModeler instance
|
|
203
|
+
|
|
204
|
+
## Dependencies
|
|
205
|
+
|
|
206
|
+
- **Core**: pandas, numpy, scikit-learn
|
|
207
|
+
- **Clustering**: umap-learn, hdbscan
|
|
208
|
+
- **Visualization**: matplotlib, seaborn
|
|
209
|
+
- **NLP**: nltk, gensim, yake
|
|
210
|
+
- **Deep Learning**: transformers, torch
|
|
211
|
+
|
|
212
|
+
## Contributing
|
|
213
|
+
|
|
214
|
+
Contributions are welcome! We encourage you to:
|
|
215
|
+
|
|
216
|
+
- **Report Issues**: Found a bug or have a feature request? Open an issue on GitHub
|
|
217
|
+
- **Submit Pull Requests**: Improvements to code, documentation, or tests are appreciated
|
|
218
|
+
- **Share Use Cases**: Let us know how you're using tactik
|
|
219
|
+
- **Improve Documentation**: Help us make TACTIK more accessible
|
|
220
|
+
|
|
221
|
+
### Development Setup
|
|
222
|
+
|
|
223
|
+
```bash
|
|
224
|
+
# Clone the repository
|
|
225
|
+
git clone https://github.com/npsAub/tactik.git
|
|
226
|
+
cd tactik
|
|
227
|
+
|
|
228
|
+
# Install in development mode with dev dependencies
|
|
229
|
+
pip install -e ".[dev]"
|
|
230
|
+
|
|
231
|
+
# Run tests
|
|
232
|
+
python -m unittest discover -s tests
|
|
233
|
+
|
|
234
|
+
# Run linting
|
|
235
|
+
flake8 src/tactik/
|
|
236
|
+
black src/tactik/
|
|
237
|
+
```
|
|
238
|
+
|
|
239
|
+
### Contribution Guidelines
|
|
240
|
+
|
|
241
|
+
1. Fork the repository and create a feature branch
|
|
242
|
+
2. Write clear, documented code following the existing style
|
|
243
|
+
3. Add tests for new functionality
|
|
244
|
+
4. Update documentation as needed
|
|
245
|
+
5. Submit a pull request with a clear description
|
|
246
|
+
|
|
247
|
+
For major changes, please open an issue first to discuss your proposal.
|
|
248
|
+
|
|
249
|
+
## Citation
|
|
250
|
+
|
|
251
|
+
If you use TACTIK in your research, please cite:
|
|
252
|
+
|
|
253
|
+
```bibtex
|
|
254
|
+
@software{tactik,
|
|
255
|
+
title={tactik: Text Analysis, Clustering, Tuning, Information and Keyword Extraction},
|
|
256
|
+
author={Niklas P. Schulmeyer and Nicoletta Fala},
|
|
257
|
+
year={2025},
|
|
258
|
+
url={https://github.com/npsAub/tactik}
|
|
259
|
+
}
|
|
260
|
+
```
|
|
261
|
+
|
|
262
|
+
## License
|
|
263
|
+
|
|
264
|
+
MIT License — See `license.txt` for details
|
|
265
|
+
|
|
266
|
+
## Contact
|
|
267
|
+
|
|
268
|
+
For questions or support, please open an issue on GitHub or contact [nps0027@auburn.edu](mailto:nps0027@auburn.edu).
|
|
269
|
+
|
tactik-0.1.3/README.md
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
<img src="tactik_header.png" alt="TACTIK Header" width="800" style="display:block;margin:auto;"/>
|
|
2
|
+
|
|
3
|
+
# TACTIK
|
|
4
|
+
## Text Analysis, Clustering, Tuning, Information and Keyword Extraction
|
|
5
|
+
|
|
6
|
+
Tactik started as a side project to streamline clustering of aviation-related reports. The pipeline initially faced long processing times and became a bottleneck for analysis. These issues were addressed, and further functionality was added to enable intuitive topic extraction. The pipeline was adapted to work domain-agnostically while keeping the core use case in mind. With this functionality, we decided to release the package publicly so other researchers can contribute to it, build on it, or benefit from the included tools. Thank you for using TACTIK — we hope you find it as useful as we did in our research!
|
|
7
|
+
|
|
8
|
+
## Features
|
|
9
|
+
|
|
10
|
+
- **End-to-End Clustering Pipeline**: Automated workflow from preprocessing to cluster analysis
|
|
11
|
+
- **Modular Design**: Use the different components as standalone modules or full pipelines
|
|
12
|
+
- **Layered Effective Methods**: UMAP dimensionality reduction + HDBSCAN clustering
|
|
13
|
+
- **Hyperparameter Tuning**: Automated parameter optimization using random search
|
|
14
|
+
- **Keyword Extraction**: Multiple methods including TF, TF-IDF, TF-DF, and YAKE
|
|
15
|
+
- **Topic Modeling**: LDA-based topic discovery with BERT-powered semantic matching *(still in development)*
|
|
16
|
+
- **Rich Visualizations**: t-SNE plots with customizable styling and annotations
|
|
17
|
+
- **Memory Efficient**: Optimized for large datasets with lazy evaluation and caching
|
|
18
|
+
|
|
19
|
+
## Installation
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
# Install tactik
|
|
23
|
+
pip install tactik
|
|
24
|
+
|
|
25
|
+
# Or install from source
|
|
26
|
+
git clone https://github.com/npsAub/tactik.git
|
|
27
|
+
cd tactik
|
|
28
|
+
pip install -e .
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### Dependencies
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
# Core dependencies
|
|
35
|
+
pip install pandas numpy matplotlib seaborn scikit-learn
|
|
36
|
+
pip install umap-learn hdbscan gensim nltk yake
|
|
37
|
+
pip install transformers torch
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
## Core Components
|
|
41
|
+
|
|
42
|
+
### 1. ClusteringPipeline
|
|
43
|
+
Main orchestrator class that coordinates the entire analysis workflow.
|
|
44
|
+
|
|
45
|
+
**Key Methods:**
|
|
46
|
+
- `preprocess_data()` - Text cleaning and stopword removal
|
|
47
|
+
- `cluster_data()` - Clustering with fixed parameters
|
|
48
|
+
- `tune_and_cluster()` - Clustering with hyperparameter tuning
|
|
49
|
+
- `cluster_and_extract_keywords()` - Integrated clustering + keyword extraction
|
|
50
|
+
- `cluster_and_analyze_topics()` - Integrated clustering + topic modeling
|
|
51
|
+
- `visualize_clusters()` - Create cluster visualizations
|
|
52
|
+
- `get_cluster_summary()` - Generate cluster statistics
|
|
53
|
+
|
|
54
|
+
### 2. Clustering & Tuning
|
|
55
|
+
Low-level clustering functions with hyperparameter optimization.
|
|
56
|
+
|
|
57
|
+
**Key Functions:**
|
|
58
|
+
- `tune_clustering_hyperparameters()` - Random search optimization
|
|
59
|
+
- `apply_best_clustering()` - Apply optimized parameters
|
|
60
|
+
- `full_clustering_pipeline()` - Complete pipeline with tuning
|
|
61
|
+
- `full_clustering_pipeline_fixed_params()` - Pipeline with fixed parameters
|
|
62
|
+
|
|
63
|
+
**Supported Metrics:**
|
|
64
|
+
- `davies_bouldin`: Lower is better (measures cluster separation)
|
|
65
|
+
- `calinski_harabasz`: Higher is better (ratio of between/within cluster dispersion)
|
|
66
|
+
|
|
67
|
+
### 3. Keyword Extraction
|
|
68
|
+
Extract representative keywords from each cluster using multiple methods.
|
|
69
|
+
|
|
70
|
+
**KeywordExtractor Class:**
|
|
71
|
+
- `extract_keywords_per_cluster()` - Extract keywords using multiple methods
|
|
72
|
+
- `save_keywords()` - Save results to CSV
|
|
73
|
+
|
|
74
|
+
**Extraction Methods:**
|
|
75
|
+
- **TF**: Term Frequency
|
|
76
|
+
- **TF-IDF**: Term Frequency–Inverse Document Frequency
|
|
77
|
+
- **TF-DF**: Term Frequency–Document Frequency
|
|
78
|
+
- **YAKE**: Yet Another Keyword Extractor (long and short narratives)
|
|
79
|
+
|
|
80
|
+
### 4. Topic Modeling
|
|
81
|
+
Discover latent topics using LDA and match them to predefined designators.
|
|
82
|
+
|
|
83
|
+
**TopicModeler Class:**
|
|
84
|
+
- `train_lda()` - Train Latent Dirichlet Allocation model
|
|
85
|
+
- `get_cluster_topics()` - Get top topics per cluster
|
|
86
|
+
- `match_designators_to_topics()` - Match topics to designators using BERT embeddings
|
|
87
|
+
- `get_bert_embedding()` - Compute BERT embeddings for semantic matching
|
|
88
|
+
|
|
89
|
+
**Default Aviation Safety Designators:**
|
|
90
|
+
- Inadequate or inaccurate knowledge
|
|
91
|
+
- Poor judgment and decision-making
|
|
92
|
+
- Failure to follow procedures
|
|
93
|
+
- Poor communication
|
|
94
|
+
- Inadequate monitoring or vigilance
|
|
95
|
+
- Task management and prioritization
|
|
96
|
+
- Stress and psychological factors
|
|
97
|
+
- Physical or physiological factors
|
|
98
|
+
- Technical or system failures
|
|
99
|
+
- Environmental factors
|
|
100
|
+
|
|
101
|
+
### 5. Visualization
|
|
102
|
+
Create publication-ready visualizations of clustering results.
|
|
103
|
+
|
|
104
|
+
**Visualization Functions:**
|
|
105
|
+
- `plot_clusters()` - Basic cluster scatter plot
|
|
106
|
+
- `plot_clusters_with_annotations()` - Plot with category annotations
|
|
107
|
+
- `plot_cluster_comparison()` - Side-by-side comparison plots
|
|
108
|
+
- `set_visualization_style()` - Configure plot styling
|
|
109
|
+
- `get_cluster_palette()` - Generate color palettes
|
|
110
|
+
- `get_cluster_markers()` - Generate marker styles
|
|
111
|
+
|
|
112
|
+
## Pipeline Architecture
|
|
113
|
+
|
|
114
|
+
```
|
|
115
|
+
Input Data (DataFrame)
|
|
116
|
+
↓
|
|
117
|
+
Preprocessing
|
|
118
|
+
├── Text cleaning
|
|
119
|
+
├── Stopword removal
|
|
120
|
+
└── Tokenization
|
|
121
|
+
↓
|
|
122
|
+
Vectorization (TF-IDF)
|
|
123
|
+
↓
|
|
124
|
+
Dimensionality Reduction (UMAP)
|
|
125
|
+
↓
|
|
126
|
+
Clustering (HDBSCAN)
|
|
127
|
+
↓
|
|
128
|
+
Visualization (t-SNE)
|
|
129
|
+
↓
|
|
130
|
+
Analysis
|
|
131
|
+
├── Keyword Extraction
|
|
132
|
+
└── Topic Modeling (LDA + BERT)
|
|
133
|
+
```
|
|
134
|
+
|
|
135
|
+
## Performance Considerations
|
|
136
|
+
|
|
137
|
+
### Memory Optimization
|
|
138
|
+
- DataFrame lazy copying
|
|
139
|
+
- Vectorization caching
|
|
140
|
+
- BERT embedding cache with `clear_cache()` method
|
|
141
|
+
- Incremental topic probability calculations
|
|
142
|
+
|
|
143
|
+
### GPU Acceleration
|
|
144
|
+
GPU acceleration is available for BERT computations when initializing TopicModeler with `use_gpu=True`.
|
|
145
|
+
|
|
146
|
+
### Large Datasets
|
|
147
|
+
For large datasets, consider:
|
|
148
|
+
- Disabling t-SNE computation with `compute_tsne=False`
|
|
149
|
+
- Using fixed parameters instead of hyperparameter tuning
|
|
150
|
+
- Clearing BERT embedding cache periodically
|
|
151
|
+
|
|
152
|
+
## Evaluation Metrics
|
|
153
|
+
|
|
154
|
+
- **Davies-Bouldin Score**: Measures average similarity between clusters (lower is better)
|
|
155
|
+
- **Calinski-Harabasz Score**: Ratio of between-cluster to within-cluster variance (higher is better)
|
|
156
|
+
- **Cluster Count**: Number of discovered clusters
|
|
157
|
+
- **Noise Ratio**: Proportion of outlier points
|
|
158
|
+
|
|
159
|
+
## Output Formats
|
|
160
|
+
|
|
161
|
+
### Cluster Summary
|
|
162
|
+
DataFrame with columns: Cluster ID, Size, Percentage
|
|
163
|
+
|
|
164
|
+
### Keywords DataFrame
|
|
165
|
+
DataFrame with columns: cluster, Yake Long, Yake Short, TF, TFIDF, TFDF
|
|
166
|
+
|
|
167
|
+
### Topic Analysis
|
|
168
|
+
Dictionary containing:
|
|
169
|
+
- `cluster_topics`: Mapping of clusters to top topics
|
|
170
|
+
- `topic_designators`: Matching of topics to designators
|
|
171
|
+
- `model`: TopicModeler instance
|
|
172
|
+
|
|
173
|
+
## Dependencies
|
|
174
|
+
|
|
175
|
+
- **Core**: pandas, numpy, scikit-learn
|
|
176
|
+
- **Clustering**: umap-learn, hdbscan
|
|
177
|
+
- **Visualization**: matplotlib, seaborn
|
|
178
|
+
- **NLP**: nltk, gensim, yake
|
|
179
|
+
- **Deep Learning**: transformers, torch
|
|
180
|
+
|
|
181
|
+
## Contributing
|
|
182
|
+
|
|
183
|
+
Contributions are welcome! We encourage you to:
|
|
184
|
+
|
|
185
|
+
- **Report Issues**: Found a bug or have a feature request? Open an issue on GitHub
|
|
186
|
+
- **Submit Pull Requests**: Improvements to code, documentation, or tests are appreciated
|
|
187
|
+
- **Share Use Cases**: Let us know how you're using tactik
|
|
188
|
+
- **Improve Documentation**: Help us make TACTIK more accessible
|
|
189
|
+
|
|
190
|
+
### Development Setup
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
# Clone the repository
|
|
194
|
+
git clone https://github.com/npsAub/tactik.git
|
|
195
|
+
cd tactik
|
|
196
|
+
|
|
197
|
+
# Install in development mode with dev dependencies
|
|
198
|
+
pip install -e ".[dev]"
|
|
199
|
+
|
|
200
|
+
# Run tests
|
|
201
|
+
python -m unittest discover -s tests
|
|
202
|
+
|
|
203
|
+
# Run linting
|
|
204
|
+
flake8 src/tactik/
|
|
205
|
+
black src/tactik/
|
|
206
|
+
```
|
|
207
|
+
|
|
208
|
+
### Contribution Guidelines
|
|
209
|
+
|
|
210
|
+
1. Fork the repository and create a feature branch
|
|
211
|
+
2. Write clear, documented code following the existing style
|
|
212
|
+
3. Add tests for new functionality
|
|
213
|
+
4. Update documentation as needed
|
|
214
|
+
5. Submit a pull request with a clear description
|
|
215
|
+
|
|
216
|
+
For major changes, please open an issue first to discuss your proposal.
|
|
217
|
+
|
|
218
|
+
## Citation
|
|
219
|
+
|
|
220
|
+
If you use TACTIK in your research, please cite:
|
|
221
|
+
|
|
222
|
+
```bibtex
|
|
223
|
+
@software{tactik,
|
|
224
|
+
title={tactik: Text Analysis, Clustering, Tuning, Information and Keyword Extraction},
|
|
225
|
+
author={Niklas P. Schulmeyer and Nicoletta Fala},
|
|
226
|
+
year={2025},
|
|
227
|
+
url={https://github.com/npsAub/tactik}
|
|
228
|
+
}
|
|
229
|
+
```
|
|
230
|
+
|
|
231
|
+
## License
|
|
232
|
+
|
|
233
|
+
MIT License — See `license.txt` for details
|
|
234
|
+
|
|
235
|
+
## Contact
|
|
236
|
+
|
|
237
|
+
For questions or support, please open an issue on GitHub or contact [nps0027@auburn.edu](mailto:nps0027@auburn.edu).
|
|
238
|
+
|
tactik-0.1.3/license.txt
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 TACTIK Contributors
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tactik"
|
|
7
|
+
version = "0.1.3"
|
|
8
|
+
description = "A Python library for topic modeling, clustering, and NLP analysis."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
|
|
13
|
+
authors = [
|
|
14
|
+
{ name = "Niklas P. Schulmeyer", email = "nps0027@auburn.edu" }
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
dependencies = [
|
|
18
|
+
"matplotlib==3.8.0",
|
|
19
|
+
"numpy==1.24.4",
|
|
20
|
+
"pandas==1.5.3",
|
|
21
|
+
"scikit-learn==1.2.2",
|
|
22
|
+
"scipy==1.10.1",
|
|
23
|
+
"seaborn==0.12.2",
|
|
24
|
+
"tqdm==4.65.0",
|
|
25
|
+
"gensim==4.3.0",
|
|
26
|
+
"hdbscan==0.8.40",
|
|
27
|
+
"nltk",
|
|
28
|
+
"sentence-transformers==4.1.0",
|
|
29
|
+
"spacy==3.8.7",
|
|
30
|
+
"torch==2.3.0",
|
|
31
|
+
"transformers==4.52.3",
|
|
32
|
+
"umap-learn==0.5.7",
|
|
33
|
+
"yake==0.4.8"
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
[project.urls]
|
|
37
|
+
Homepage = "https://github.com/npsAub/tactik"
|
|
38
|
+
Documentation = "https://yourusername.github.io/your-package"
|
|
39
|
+
Source = "https://github.com/npsAub/tactik"
|
|
40
|
+
Issues = "https://github.com/npsAub/tactik/issues"
|
|
41
|
+
|
|
42
|
+
[tool.setuptools.packages.find]
|
|
43
|
+
where = ["src"]
|
tactik-0.1.3/setup.cfg
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
|
|
2
|
+
"""
|
|
3
|
+
TACTIK Package
|
|
4
|
+
Text Analysis, Clustering, Tuning, Information and Keyword Extraction
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
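# Package version -- NOTE(review): pyproject.toml declares 0.1.3 but the value below is 0.1.4; confirm and keep them in sync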
|
|
8
|
+
__version__ = "0.1.4"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
from . import clustering_pipeline
|
|
12
|
+
from . import clustering_tuning
|
|
13
|
+
from . import embeddings
|
|
14
|
+
from . import preprocessing
|
|
15
|
+
from . import topic_extraction
|
|
16
|
+
from . import utilities
|
|
17
|
+
from . import visualization
|
|
18
|
+
|
|
19
|
+
|