pyhercules 1.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pyhercules-1.0.0/LICENSE +21 -0
- pyhercules-1.0.0/MANIFEST.in +5 -0
- pyhercules-1.0.0/PKG-INFO +180 -0
- pyhercules-1.0.0/README.md +123 -0
- pyhercules-1.0.0/pyhercules.egg-info/PKG-INFO +180 -0
- pyhercules-1.0.0/pyhercules.egg-info/SOURCES.txt +16 -0
- pyhercules-1.0.0/pyhercules.egg-info/dependency_links.txt +1 -0
- pyhercules-1.0.0/pyhercules.egg-info/entry_points.txt +2 -0
- pyhercules-1.0.0/pyhercules.egg-info/requires.txt +27 -0
- pyhercules-1.0.0/pyhercules.egg-info/top_level.txt +3 -0
- pyhercules-1.0.0/pyhercules.py +3926 -0
- pyhercules-1.0.0/pyhercules_app.py +1734 -0
- pyhercules-1.0.0/pyhercules_functions.py +656 -0
- pyhercules-1.0.0/requirements-app.txt +7 -0
- pyhercules-1.0.0/requirements-core.txt +6 -0
- pyhercules-1.0.0/requirements-models.txt +14 -0
- pyhercules-1.0.0/setup.cfg +4 -0
- pyhercules-1.0.0/setup.py +77 -0
pyhercules-1.0.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Bandee
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyhercules
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A flexible framework for hierarchical clustering of text, numeric, or image data using LLMs.
|
|
5
|
+
Home-page: https://github.com/bandeerun/pyhercules
|
|
6
|
+
Author: Bandee
|
|
7
|
+
Author-email: bandeerun@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: numpy
|
|
22
|
+
Requires-Dist: pandas
|
|
23
|
+
Requires-Dist: scikit-learn
|
|
24
|
+
Requires-Dist: Pillow
|
|
25
|
+
Provides-Extra: models
|
|
26
|
+
Requires-Dist: accelerate; extra == "models"
|
|
27
|
+
Requires-Dist: google-generativeai; extra == "models"
|
|
28
|
+
Requires-Dist: huggingface_hub; extra == "models"
|
|
29
|
+
Requires-Dist: sentence-transformers; extra == "models"
|
|
30
|
+
Requires-Dist: torch; extra == "models"
|
|
31
|
+
Requires-Dist: transformers; extra == "models"
|
|
32
|
+
Requires-Dist: requests; extra == "models"
|
|
33
|
+
Requires-Dist: python-dotenv; extra == "models"
|
|
34
|
+
Provides-Extra: app
|
|
35
|
+
Requires-Dist: accelerate; extra == "app"
|
|
36
|
+
Requires-Dist: google-generativeai; extra == "app"
|
|
37
|
+
Requires-Dist: huggingface_hub; extra == "app"
|
|
38
|
+
Requires-Dist: sentence-transformers; extra == "app"
|
|
39
|
+
Requires-Dist: torch; extra == "app"
|
|
40
|
+
Requires-Dist: transformers; extra == "app"
|
|
41
|
+
Requires-Dist: requests; extra == "app"
|
|
42
|
+
Requires-Dist: python-dotenv; extra == "app"
|
|
43
|
+
Requires-Dist: dash; extra == "app"
|
|
44
|
+
Requires-Dist: dash-bootstrap-components; extra == "app"
|
|
45
|
+
Requires-Dist: plotly; extra == "app"
|
|
46
|
+
Dynamic: author
|
|
47
|
+
Dynamic: author-email
|
|
48
|
+
Dynamic: classifier
|
|
49
|
+
Dynamic: description
|
|
50
|
+
Dynamic: description-content-type
|
|
51
|
+
Dynamic: home-page
|
|
52
|
+
Dynamic: license-file
|
|
53
|
+
Dynamic: provides-extra
|
|
54
|
+
Dynamic: requires-dist
|
|
55
|
+
Dynamic: requires-python
|
|
56
|
+
Dynamic: summary
|
|
57
|
+
|
|
58
|
+
# pyHercules
|
|
59
|
+
|
|
60
|
+
[PyPI version](https://badge.fury.io/py/pyhercules-clustering)
|
|
61
|
+
[Python versions](https://www.python.org/downloads/)
|
|
62
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
63
|
+
|
|
64
|
+
**pyHercules** is a flexible Python framework for hierarchical clustering of text, numeric, or image data. The core algorithm, **Hercules**, uses recursive k-means and leverages Large Language Models (LLMs) for efficient and meaningful summarization of clusters at each level of the hierarchy. The project includes the core library (`pyhercules`), a set of "batteries-included" model functions, and a powerful Dash web application for interactive exploration.
|
|
65
|
+
|
|
66
|
+
### Key Features
|
|
67
|
+
|
|
68
|
+
- **Hierarchical Clustering:** Automatically builds a tree of clusters from your data.
|
|
69
|
+
- **Multi-Modal:** Natively handles text, numeric (NumPy, Pandas), and image data (file paths, URLs, PIL Images). (One modality at a time.)
|
|
70
|
+
- **LLM-Powered Summarization:** Uses Large Language Models (LLMs) to generate human-readable titles and descriptions for each cluster.
|
|
71
|
+
- **Flexible Representation:** Choose between `direct` mode (using original data embeddings) or `description` mode (using LLM-generated summary embeddings) for clustering at higher levels.
|
|
72
|
+
- **Interactive Web App:** An included Dash application (`pyhercules_app.py`) allows for easy data upload, parameter configuration, and visualization of clustering results.
|
|
73
|
+
- **Extensible:** The core library is dependency-light. Bring your own model functions or use the provided ones in `pyhercules_functions.py`.
|
|
74
|
+
|
|
75
|
+
### Project Structure
|
|
76
|
+
|
|
77
|
+
- `pyhercules.py`: The core clustering library. Contains the `Hercules` and `Cluster` classes.
|
|
78
|
+
- `pyhercules_functions.py`: A collection of ready-to-use functions for embedding, captioning, and LLM calls (using Hugging Face, Google Gemini, etc.).
|
|
79
|
+
- `pyhercules_app.py`: A comprehensive Dash web application for interactive clustering and visualization.
|
|
80
|
+
- `examples.ipynb`: A Jupyter Notebook demonstrating various use cases of the library.
|
|
81
|
+
- `requirements-*.txt`: Dependency files for different use cases (for reference).
|
|
82
|
+
- `setup.py`: The packaging configuration script.
|
|
83
|
+
|
|
84
|
+
### Installation
|
|
85
|
+
|
|
86
|
+
You can install `pyhercules-clustering` directly from PyPI. Several installation options are available depending on your needs.
|
|
87
|
+
|
|
88
|
+
**1. Core Library Only**
|
|
89
|
+
|
|
90
|
+
For using the `Hercules` class with your own model client functions. This is a minimal, lightweight installation.
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pip install pyhercules-clustering
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
**2. Library with Model Functions**
|
|
97
|
+
|
|
98
|
+
To use the pre-built functions in `pyhercules_functions.py` (e.g., for running the `examples.ipynb` notebook).
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
pip install "pyhercules-clustering[models]"
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**3. Full Web Application**
|
|
105
|
+
|
|
106
|
+
Installs everything needed to run the interactive Dash application, including all of the model dependencies.
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
pip install "pyhercules-clustering[app]"
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Configuration: API Keys
|
|
113
|
+
|
|
114
|
+
To use models from Google or gated models from Hugging Face (like Gemma), you must configure your API keys. The recommended way is to create a `.env` file in your project's working directory:
|
|
115
|
+
|
|
116
|
+
```env
|
|
117
|
+
# .env
|
|
118
|
+
GOOGLE_API_KEY="your-google-api-key-here"
|
|
119
|
+
HUGGINGFACE_HUB_TOKEN="your-hugging-face-token-for-gated-models"
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
The library will automatically load these variables. Alternatively, you can set them as system environment variables.
|
|
123
|
+
|
|
124
|
+
### Usage
|
|
125
|
+
|
|
126
|
+
#### 1. Running the Dash Web Application (Recommended)
|
|
127
|
+
|
|
128
|
+
The easiest way to get started is with the interactive app.
|
|
129
|
+
|
|
130
|
+
1. **Install dependencies:**
|
|
131
|
+
```bash
|
|
132
|
+
pip install "pyhercules-clustering[app]"
|
|
133
|
+
```
|
|
134
|
+
2. **Set API keys:** Create a `.env` file as described in the Configuration section.
|
|
135
|
+
3. **Run the app:**
|
|
136
|
+
```bash
|
|
137
|
+
pyhercules-app
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Then, open your web browser to `http://127.0.0.1:8050`.
|
|
141
|
+
|
|
142
|
+
#### 2. Using the Core Library in Python
|
|
143
|
+
|
|
144
|
+
You can use the `Hercules` class directly in your scripts. See `examples.ipynb` for more detailed use cases.
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
from pyhercules import Hercules
|
|
148
|
+
from pyhercules_functions import local_minilm_l6_v2_embedding, local_gemma_3_4b_it_llm
|
|
149
|
+
|
|
150
|
+
# 1. Sample data
|
|
151
|
+
sample_texts = [
|
|
152
|
+
"Introduction to machine learning concepts.",
|
|
153
|
+
"Advanced techniques in deep neural networks.",
|
|
154
|
+
"A guide to Python programming for beginners.",
|
|
155
|
+
"Web development using Flask and Jinja.",
|
|
156
|
+
"Understanding gradient descent and backpropagation.",
|
|
157
|
+
]
|
|
158
|
+
|
|
159
|
+
# 2. Instantiate Hercules with your chosen model clients
|
|
160
|
+
# Ensure you have set up your HUGGINGFACE_HUB_TOKEN in a .env file for Gemma
|
|
161
|
+
hercules = Hercules(
|
|
162
|
+
level_cluster_counts=[3, 2], # Desired hierarchy: 3 top-level, then subdivide
|
|
163
|
+
representation_mode="direct",
|
|
164
|
+
text_embedding_client=local_minilm_l6_v2_embedding,
|
|
165
|
+
llm_client=local_gemma_3_4b_it_llm,
|
|
166
|
+
verbose=1
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
# 3. Run clustering
|
|
170
|
+
top_clusters = hercules.cluster(sample_texts, topic_seed="computer science topics")
|
|
171
|
+
|
|
172
|
+
# 4. Print results
|
|
173
|
+
if top_clusters:
|
|
174
|
+
for cluster in top_clusters:
|
|
175
|
+
cluster.print_hierarchy(indent_increment=2, print_level_0=False)
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### License
|
|
179
|
+
|
|
180
|
+
This project is licensed under the MIT License. See the `LICENSE` file for details.
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
# pyHercules
|
|
2
|
+
|
|
3
|
+
[PyPI version](https://badge.fury.io/py/pyhercules-clustering)
|
|
4
|
+
[Python versions](https://www.python.org/downloads/)
|
|
5
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
6
|
+
|
|
7
|
+
**pyHercules** is a flexible Python framework for hierarchical clustering of text, numeric, or image data. The core algorithm, **Hercules**, uses recursive k-means and leverages Large Language Models (LLMs) for efficient and meaningful summarization of clusters at each level of the hierarchy. The project includes the core library (`pyhercules`), a set of "batteries-included" model functions, and a powerful Dash web application for interactive exploration.
|
|
8
|
+
|
|
9
|
+
### Key Features
|
|
10
|
+
|
|
11
|
+
- **Hierarchical Clustering:** Automatically builds a tree of clusters from your data.
|
|
12
|
+
- **Multi-Modal:** Natively handles text, numeric (NumPy, Pandas), and image data (file paths, URLs, PIL Images). (One modality at a time.)
|
|
13
|
+
- **LLM-Powered Summarization:** Uses Large Language Models (LLMs) to generate human-readable titles and descriptions for each cluster.
|
|
14
|
+
- **Flexible Representation:** Choose between `direct` mode (using original data embeddings) or `description` mode (using LLM-generated summary embeddings) for clustering at higher levels.
|
|
15
|
+
- **Interactive Web App:** An included Dash application (`pyhercules_app.py`) allows for easy data upload, parameter configuration, and visualization of clustering results.
|
|
16
|
+
- **Extensible:** The core library is dependency-light. Bring your own model functions or use the provided ones in `pyhercules_functions.py`.
|
|
17
|
+
|
|
18
|
+
### Project Structure
|
|
19
|
+
|
|
20
|
+
- `pyhercules.py`: The core clustering library. Contains the `Hercules` and `Cluster` classes.
|
|
21
|
+
- `pyhercules_functions.py`: A collection of ready-to-use functions for embedding, captioning, and LLM calls (using Hugging Face, Google Gemini, etc.).
|
|
22
|
+
- `pyhercules_app.py`: A comprehensive Dash web application for interactive clustering and visualization.
|
|
23
|
+
- `examples.ipynb`: A Jupyter Notebook demonstrating various use cases of the library.
|
|
24
|
+
- `requirements-*.txt`: Dependency files for different use cases (for reference).
|
|
25
|
+
- `setup.py`: The packaging configuration script.
|
|
26
|
+
|
|
27
|
+
### Installation
|
|
28
|
+
|
|
29
|
+
You can install `pyhercules-clustering` directly from PyPI. Several installation options are available depending on your needs.
|
|
30
|
+
|
|
31
|
+
**1. Core Library Only**
|
|
32
|
+
|
|
33
|
+
For using the `Hercules` class with your own model client functions. This is a minimal, lightweight installation.
|
|
34
|
+
|
|
35
|
+
```bash
|
|
36
|
+
pip install pyhercules-clustering
|
|
37
|
+
```
|
|
38
|
+
|
|
39
|
+
**2. Library with Model Functions**
|
|
40
|
+
|
|
41
|
+
To use the pre-built functions in `pyhercules_functions.py` (e.g., for running the `examples.ipynb` notebook).
|
|
42
|
+
|
|
43
|
+
```bash
|
|
44
|
+
pip install "pyhercules-clustering[models]"
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
**3. Full Web Application**
|
|
48
|
+
|
|
49
|
+
Installs everything needed to run the interactive Dash application, including all of the model dependencies.
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
pip install "pyhercules-clustering[app]"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Configuration: API Keys
|
|
56
|
+
|
|
57
|
+
To use models from Google or gated models from Hugging Face (like Gemma), you must configure your API keys. The recommended way is to create a `.env` file in your project's working directory:
|
|
58
|
+
|
|
59
|
+
```env
|
|
60
|
+
# .env
|
|
61
|
+
GOOGLE_API_KEY="your-google-api-key-here"
|
|
62
|
+
HUGGINGFACE_HUB_TOKEN="your-hugging-face-token-for-gated-models"
|
|
63
|
+
```
|
|
64
|
+
|
|
65
|
+
The library will automatically load these variables. Alternatively, you can set them as system environment variables.
|
|
66
|
+
|
|
67
|
+
### Usage
|
|
68
|
+
|
|
69
|
+
#### 1. Running the Dash Web Application (Recommended)
|
|
70
|
+
|
|
71
|
+
The easiest way to get started is with the interactive app.
|
|
72
|
+
|
|
73
|
+
1. **Install dependencies:**
|
|
74
|
+
```bash
|
|
75
|
+
pip install "pyhercules-clustering[app]"
|
|
76
|
+
```
|
|
77
|
+
2. **Set API keys:** Create a `.env` file as described in the Configuration section.
|
|
78
|
+
3. **Run the app:**
|
|
79
|
+
```bash
|
|
80
|
+
pyhercules-app
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Then, open your web browser to `http://127.0.0.1:8050`.
|
|
84
|
+
|
|
85
|
+
#### 2. Using the Core Library in Python
|
|
86
|
+
|
|
87
|
+
You can use the `Hercules` class directly in your scripts. See `examples.ipynb` for more detailed use cases.
|
|
88
|
+
|
|
89
|
+
```python
|
|
90
|
+
from pyhercules import Hercules
|
|
91
|
+
from pyhercules_functions import local_minilm_l6_v2_embedding, local_gemma_3_4b_it_llm
|
|
92
|
+
|
|
93
|
+
# 1. Sample data
|
|
94
|
+
sample_texts = [
|
|
95
|
+
"Introduction to machine learning concepts.",
|
|
96
|
+
"Advanced techniques in deep neural networks.",
|
|
97
|
+
"A guide to Python programming for beginners.",
|
|
98
|
+
"Web development using Flask and Jinja.",
|
|
99
|
+
"Understanding gradient descent and backpropagation.",
|
|
100
|
+
]
|
|
101
|
+
|
|
102
|
+
# 2. Instantiate Hercules with your chosen model clients
|
|
103
|
+
# Ensure you have set up your HUGGINGFACE_HUB_TOKEN in a .env file for Gemma
|
|
104
|
+
hercules = Hercules(
|
|
105
|
+
level_cluster_counts=[3, 2], # Desired hierarchy: 3 top-level, then subdivide
|
|
106
|
+
representation_mode="direct",
|
|
107
|
+
text_embedding_client=local_minilm_l6_v2_embedding,
|
|
108
|
+
llm_client=local_gemma_3_4b_it_llm,
|
|
109
|
+
verbose=1
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
# 3. Run clustering
|
|
113
|
+
top_clusters = hercules.cluster(sample_texts, topic_seed="computer science topics")
|
|
114
|
+
|
|
115
|
+
# 4. Print results
|
|
116
|
+
if top_clusters:
|
|
117
|
+
for cluster in top_clusters:
|
|
118
|
+
cluster.print_hierarchy(indent_increment=2, print_level_0=False)
|
|
119
|
+
```
|
|
120
|
+
|
|
121
|
+
### License
|
|
122
|
+
|
|
123
|
+
This project is licensed under the MIT License. See the `LICENSE` file for details.
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: pyhercules
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: A flexible framework for hierarchical clustering of text, numeric, or image data using LLMs.
|
|
5
|
+
Home-page: https://github.com/bandeerun/pyhercules
|
|
6
|
+
Author: Bandee
|
|
7
|
+
Author-email: bandeerun@gmail.com
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Operating System :: OS Independent
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
18
|
+
Requires-Python: >=3.9
|
|
19
|
+
Description-Content-Type: text/markdown
|
|
20
|
+
License-File: LICENSE
|
|
21
|
+
Requires-Dist: numpy
|
|
22
|
+
Requires-Dist: pandas
|
|
23
|
+
Requires-Dist: scikit-learn
|
|
24
|
+
Requires-Dist: Pillow
|
|
25
|
+
Provides-Extra: models
|
|
26
|
+
Requires-Dist: accelerate; extra == "models"
|
|
27
|
+
Requires-Dist: google-generativeai; extra == "models"
|
|
28
|
+
Requires-Dist: huggingface_hub; extra == "models"
|
|
29
|
+
Requires-Dist: sentence-transformers; extra == "models"
|
|
30
|
+
Requires-Dist: torch; extra == "models"
|
|
31
|
+
Requires-Dist: transformers; extra == "models"
|
|
32
|
+
Requires-Dist: requests; extra == "models"
|
|
33
|
+
Requires-Dist: python-dotenv; extra == "models"
|
|
34
|
+
Provides-Extra: app
|
|
35
|
+
Requires-Dist: accelerate; extra == "app"
|
|
36
|
+
Requires-Dist: google-generativeai; extra == "app"
|
|
37
|
+
Requires-Dist: huggingface_hub; extra == "app"
|
|
38
|
+
Requires-Dist: sentence-transformers; extra == "app"
|
|
39
|
+
Requires-Dist: torch; extra == "app"
|
|
40
|
+
Requires-Dist: transformers; extra == "app"
|
|
41
|
+
Requires-Dist: requests; extra == "app"
|
|
42
|
+
Requires-Dist: python-dotenv; extra == "app"
|
|
43
|
+
Requires-Dist: dash; extra == "app"
|
|
44
|
+
Requires-Dist: dash-bootstrap-components; extra == "app"
|
|
45
|
+
Requires-Dist: plotly; extra == "app"
|
|
46
|
+
Dynamic: author
|
|
47
|
+
Dynamic: author-email
|
|
48
|
+
Dynamic: classifier
|
|
49
|
+
Dynamic: description
|
|
50
|
+
Dynamic: description-content-type
|
|
51
|
+
Dynamic: home-page
|
|
52
|
+
Dynamic: license-file
|
|
53
|
+
Dynamic: provides-extra
|
|
54
|
+
Dynamic: requires-dist
|
|
55
|
+
Dynamic: requires-python
|
|
56
|
+
Dynamic: summary
|
|
57
|
+
|
|
58
|
+
# pyHercules
|
|
59
|
+
|
|
60
|
+
[PyPI version](https://badge.fury.io/py/pyhercules-clustering)
|
|
61
|
+
[Python versions](https://www.python.org/downloads/)
|
|
62
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
63
|
+
|
|
64
|
+
**pyHercules** is a flexible Python framework for hierarchical clustering of text, numeric, or image data. The core algorithm, **Hercules**, uses recursive k-means and leverages Large Language Models (LLMs) for efficient and meaningful summarization of clusters at each level of the hierarchy. The project includes the core library (`pyhercules`), a set of "batteries-included" model functions, and a powerful Dash web application for interactive exploration.
|
|
65
|
+
|
|
66
|
+
### Key Features
|
|
67
|
+
|
|
68
|
+
- **Hierarchical Clustering:** Automatically builds a tree of clusters from your data.
|
|
69
|
+
- **Multi-Modal:** Natively handles text, numeric (NumPy, Pandas), and image data (file paths, URLs, PIL Images). (One modality at a time.)
|
|
70
|
+
- **LLM-Powered Summarization:** Uses Large Language Models (LLMs) to generate human-readable titles and descriptions for each cluster.
|
|
71
|
+
- **Flexible Representation:** Choose between `direct` mode (using original data embeddings) or `description` mode (using LLM-generated summary embeddings) for clustering at higher levels.
|
|
72
|
+
- **Interactive Web App:** An included Dash application (`pyhercules_app.py`) allows for easy data upload, parameter configuration, and visualization of clustering results.
|
|
73
|
+
- **Extensible:** The core library is dependency-light. Bring your own model functions or use the provided ones in `pyhercules_functions.py`.
|
|
74
|
+
|
|
75
|
+
### Project Structure
|
|
76
|
+
|
|
77
|
+
- `pyhercules.py`: The core clustering library. Contains the `Hercules` and `Cluster` classes.
|
|
78
|
+
- `pyhercules_functions.py`: A collection of ready-to-use functions for embedding, captioning, and LLM calls (using Hugging Face, Google Gemini, etc.).
|
|
79
|
+
- `pyhercules_app.py`: A comprehensive Dash web application for interactive clustering and visualization.
|
|
80
|
+
- `examples.ipynb`: A Jupyter Notebook demonstrating various use cases of the library.
|
|
81
|
+
- `requirements-*.txt`: Dependency files for different use cases (for reference).
|
|
82
|
+
- `setup.py`: The packaging configuration script.
|
|
83
|
+
|
|
84
|
+
### Installation
|
|
85
|
+
|
|
86
|
+
You can install `pyhercules-clustering` directly from PyPI. Several installation options are available depending on your needs.
|
|
87
|
+
|
|
88
|
+
**1. Core Library Only**
|
|
89
|
+
|
|
90
|
+
For using the `Hercules` class with your own model client functions. This is a minimal, lightweight installation.
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pip install pyhercules-clustering
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
**2. Library with Model Functions**
|
|
97
|
+
|
|
98
|
+
To use the pre-built functions in `pyhercules_functions.py` (e.g., for running the `examples.ipynb` notebook).
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
pip install "pyhercules-clustering[models]"
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
**3. Full Web Application**
|
|
105
|
+
|
|
106
|
+
Installs everything needed to run the interactive Dash application, including all of the model dependencies.
|
|
107
|
+
|
|
108
|
+
```bash
|
|
109
|
+
pip install "pyhercules-clustering[app]"
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Configuration: API Keys
|
|
113
|
+
|
|
114
|
+
To use models from Google or gated models from Hugging Face (like Gemma), you must configure your API keys. The recommended way is to create a `.env` file in your project's working directory:
|
|
115
|
+
|
|
116
|
+
```env
|
|
117
|
+
# .env
|
|
118
|
+
GOOGLE_API_KEY="your-google-api-key-here"
|
|
119
|
+
HUGGINGFACE_HUB_TOKEN="your-hugging-face-token-for-gated-models"
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
The library will automatically load these variables. Alternatively, you can set them as system environment variables.
|
|
123
|
+
|
|
124
|
+
### Usage
|
|
125
|
+
|
|
126
|
+
#### 1. Running the Dash Web Application (Recommended)
|
|
127
|
+
|
|
128
|
+
The easiest way to get started is with the interactive app.
|
|
129
|
+
|
|
130
|
+
1. **Install dependencies:**
|
|
131
|
+
```bash
|
|
132
|
+
pip install "pyhercules-clustering[app]"
|
|
133
|
+
```
|
|
134
|
+
2. **Set API keys:** Create a `.env` file as described in the Configuration section.
|
|
135
|
+
3. **Run the app:**
|
|
136
|
+
```bash
|
|
137
|
+
pyhercules-app
|
|
138
|
+
```
|
|
139
|
+
|
|
140
|
+
Then, open your web browser to `http://127.0.0.1:8050`.
|
|
141
|
+
|
|
142
|
+
#### 2. Using the Core Library in Python
|
|
143
|
+
|
|
144
|
+
You can use the `Hercules` class directly in your scripts. See `examples.ipynb` for more detailed use cases.
|
|
145
|
+
|
|
146
|
+
```python
|
|
147
|
+
from pyhercules import Hercules
|
|
148
|
+
from pyhercules_functions import local_minilm_l6_v2_embedding, local_gemma_3_4b_it_llm
|
|
149
|
+
|
|
150
|
+
# 1. Sample data
|
|
151
|
+
sample_texts = [
|
|
152
|
+
"Introduction to machine learning concepts.",
|
|
153
|
+
"Advanced techniques in deep neural networks.",
|
|
154
|
+
"A guide to Python programming for beginners.",
|
|
155
|
+
"Web development using Flask and Jinja.",
|
|
156
|
+
"Understanding gradient descent and backpropagation.",
|
|
157
|
+
]
|
|
158
|
+
|
|
159
|
+
# 2. Instantiate Hercules with your chosen model clients
|
|
160
|
+
# Ensure you have set up your HUGGINGFACE_HUB_TOKEN in a .env file for Gemma
|
|
161
|
+
hercules = Hercules(
|
|
162
|
+
level_cluster_counts=[3, 2], # Desired hierarchy: 3 top-level, then subdivide
|
|
163
|
+
representation_mode="direct",
|
|
164
|
+
text_embedding_client=local_minilm_l6_v2_embedding,
|
|
165
|
+
llm_client=local_gemma_3_4b_it_llm,
|
|
166
|
+
verbose=1
|
|
167
|
+
)
|
|
168
|
+
|
|
169
|
+
# 3. Run clustering
|
|
170
|
+
top_clusters = hercules.cluster(sample_texts, topic_seed="computer science topics")
|
|
171
|
+
|
|
172
|
+
# 4. Print results
|
|
173
|
+
if top_clusters:
|
|
174
|
+
for cluster in top_clusters:
|
|
175
|
+
cluster.print_hierarchy(indent_increment=2, print_level_0=False)
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### License
|
|
179
|
+
|
|
180
|
+
This project is licensed under the MIT License. See the `LICENSE` file for details.
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
pyhercules.py
|
|
5
|
+
pyhercules_app.py
|
|
6
|
+
pyhercules_functions.py
|
|
7
|
+
requirements-app.txt
|
|
8
|
+
requirements-core.txt
|
|
9
|
+
requirements-models.txt
|
|
10
|
+
setup.py
|
|
11
|
+
pyhercules.egg-info/PKG-INFO
|
|
12
|
+
pyhercules.egg-info/SOURCES.txt
|
|
13
|
+
pyhercules.egg-info/dependency_links.txt
|
|
14
|
+
pyhercules.egg-info/entry_points.txt
|
|
15
|
+
pyhercules.egg-info/requires.txt
|
|
16
|
+
pyhercules.egg-info/top_level.txt
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
numpy
|
|
2
|
+
pandas
|
|
3
|
+
scikit-learn
|
|
4
|
+
Pillow
|
|
5
|
+
|
|
6
|
+
[app]
|
|
7
|
+
accelerate
|
|
8
|
+
google-generativeai
|
|
9
|
+
huggingface_hub
|
|
10
|
+
sentence-transformers
|
|
11
|
+
torch
|
|
12
|
+
transformers
|
|
13
|
+
requests
|
|
14
|
+
python-dotenv
|
|
15
|
+
dash
|
|
16
|
+
dash-bootstrap-components
|
|
17
|
+
plotly
|
|
18
|
+
|
|
19
|
+
[models]
|
|
20
|
+
accelerate
|
|
21
|
+
google-generativeai
|
|
22
|
+
huggingface_hub
|
|
23
|
+
sentence-transformers
|
|
24
|
+
torch
|
|
25
|
+
transformers
|
|
26
|
+
requests
|
|
27
|
+
python-dotenv
|