dustclust 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- dustclust-0.1.0/PKG-INFO +152 -0
- dustclust-0.1.0/README.md +135 -0
- dustclust-0.1.0/pyproject.toml +36 -0
- dustclust-0.1.0/setup.cfg +4 -0
- dustclust-0.1.0/src/DustClust/__init__.py +3 -0
- dustclust-0.1.0/src/DustClust/__main__.py +4 -0
- dustclust-0.1.0/src/DustClust/cluster_hardware.py +259 -0
- dustclust-0.1.0/src/DustClust/cluster_viz/app.js +546 -0
- dustclust-0.1.0/src/DustClust/cluster_viz/data.json +1 -0
- dustclust-0.1.0/src/DustClust/cluster_viz/index.html +130 -0
- dustclust-0.1.0/src/DustClust/cluster_viz/styles.css +510 -0
- dustclust-0.1.0/src/DustClust/custom_devices.py +292 -0
- dustclust-0.1.0/src/DustClust/data/dirty_hardware_data_40k.csv +40001 -0
- dustclust-0.1.0/src/DustClust/ifixit_devices.json +284727 -0
- dustclust-0.1.0/src/DustClust/paths.py +46 -0
- dustclust-0.1.0/src/DustClust/server.py +645 -0
- dustclust-0.1.0/src/dustclust.egg-info/PKG-INFO +152 -0
- dustclust-0.1.0/src/dustclust.egg-info/SOURCES.txt +20 -0
- dustclust-0.1.0/src/dustclust.egg-info/dependency_links.txt +1 -0
- dustclust-0.1.0/src/dustclust.egg-info/entry_points.txt +3 -0
- dustclust-0.1.0/src/dustclust.egg-info/requires.txt +8 -0
- dustclust-0.1.0/src/dustclust.egg-info/top_level.txt +1 -0
dustclust-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: dustclust
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: DustClust: interactive hardware inventory clustering with embeddings and a web visualization
|
|
5
|
+
Author: DustClust contributors
|
|
6
|
+
License: MIT
|
|
7
|
+
Requires-Python: >=3.10
|
|
8
|
+
Description-Content-Type: text/markdown
|
|
9
|
+
Requires-Dist: fastapi
|
|
10
|
+
Requires-Dist: uvicorn[standard]
|
|
11
|
+
Requires-Dist: python-multipart
|
|
12
|
+
Requires-Dist: pandas
|
|
13
|
+
Requires-Dist: openpyxl
|
|
14
|
+
Requires-Dist: numpy
|
|
15
|
+
Requires-Dist: sentence-transformers
|
|
16
|
+
Requires-Dist: scikit-learn
|
|
17
|
+
|
|
18
|
+
# DustClust
|
|
19
|
+
|
|
20
|
+
**DustClust** (`import DustClust`) is an interactive web application that clusters dirty hardware inventory data using semantic embeddings and visualizes the results in a D3.js radial network graph. The backend runs live clustering with configurable thresholds and matches clusters to real-world devices from the iFixit catalog plus a custom device list.
|
|
21
|
+
|
|
22
|
+
Install from PyPI as **`dustclust`** (`pip install dustclust`).
|
|
23
|
+
|
|
24
|
+
## Features
|
|
25
|
+
|
|
26
|
+
### GUI
|
|
27
|
+
|
|
28
|
+
- **Interactive Network Graph**: Zoom, pan, and drag nodes. Hover for quick tooltips; click to open the detail panel.
|
|
29
|
+
- **Search Clusters**: Real-time search by category, subcategory, or record content (e.g., Router, iPhone, 4331).
|
|
30
|
+
- **Clustering Threshold**: Adjust the similarity threshold (0.10–0.90) and click **Recalculate Clusters** to re-run clustering.
|
|
31
|
+
- **Min Cluster Size**: Filter out small clusters with a slider.
|
|
32
|
+
- **Upload Dataset**: Upload your own `.csv` or `.xlsx` file. The server generates embeddings and clusters on the fly.
|
|
33
|
+
- **Reset View**: Reset zoom and clear search/filters.
|
|
34
|
+
- **Detail Panel**: Click any node to see:
|
|
35
|
+
- **Category / Subcategory** (e.g., Computer Hardware / Mouse)
|
|
36
|
+
- Sample records
|
|
37
|
+
- **Matched iFixit Device** (when a cluster matches a known device, with link to iFixit)
|
|
38
|
+
|
|
39
|
+
### Backend
|
|
40
|
+
|
|
41
|
+
- **Embedding-based clustering**: Uses sentence-transformers with `paraphrase-multilingual-MiniLM-L12-v2` (or another HuggingFace model ID) for semantic similarity.
|
|
42
|
+
- **Hebrew & multilingual support**: The model and tokenization support Hebrew and other Unicode scripts. Category inference includes Hebrew keywords (e.g. מסך, מקלדת, עכבר).
|
|
43
|
+
- **Dynamic category inference**: Token-based matching against iFixit categories and `custom_devices.py`.
|
|
44
|
+
- **Subcategory inference**: Finer-grained labels (e.g., Keyboard, Monitor, Printer) derived from the device list.
|
|
45
|
+
- **Device matching**: IDF-scored matching of cluster records to iFixit devices and custom entries.
|
|
46
|
+
|
|
47
|
+
## Device List
|
|
48
|
+
|
|
49
|
+
The app uses two device sources:
|
|
50
|
+
|
|
51
|
+
1. **iFixit catalog** (`ifixit_devices.json`): Run `fetch_ifixit_devices.py` to populate.
|
|
52
|
+
2. **Custom devices** (module `DustClust.custom_devices`): Generic hardware that supplements iFixit for category/subcategory inference and device matching.
|
|
53
|
+
|
|
54
|
+
### Custom Device Categories
|
|
55
|
+
|
|
56
|
+
| Category | Subcategories / Types |
|
|
57
|
+
|---------|------------------------|
|
|
58
|
+
| **SIM Card** | Nano, Micro, Standard, eSIM |
|
|
59
|
+
| **Cable** | USB-C, Lightning, HDMI, DisplayPort, Ethernet, VGA, DVI, Thunderbolt, SATA, etc. |
|
|
60
|
+
| **Adapter** | USB hubs, HDMI/DisplayPort adapters |
|
|
61
|
+
| **Storage** | MicroSD, SD, Flash Drive, HDD, SSD, NAS, CD/DVD/Blu-ray drives |
|
|
62
|
+
| **PC Component** | RAM, CPU, GPU, Motherboard, PSU, SSD, Cooling, Thermal paste |
|
|
63
|
+
| **Computer Hardware** | Keyboard, Mouse, Laptop, Monitor, Webcam, Dashcam, Projector, Printer |
|
|
64
|
+
| **Audio** | Headphones, Earbuds, Microphone |
|
|
65
|
+
| **Telecom** | Router, Modem, Network Switch, Access Point, enterprise gear |
|
|
66
|
+
|
|
67
|
+
Edit `src/DustClust/custom_devices.py` (or the installed package’s `custom_devices.py`) to add or adjust devices. Each entry has `name`, `category`, `subcategory`, and optional `url`.
|
|
68
|
+
|
|
69
|
+
### Hebrew / Multilingual Datasets
|
|
70
|
+
|
|
71
|
+
The app works with Hebrew and other Unicode datasets. The default model (`sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`) supports 50+ languages. Category inference includes Hebrew boost keywords (מסך, מקלדת, עכבר, כבל, etc.), and `custom_devices.py` has Hebrew device entries for matching. Upload a Hebrew CSV and the clustering and categorization will work out of the box.
|
|
72
|
+
|
|
73
|
+
## Getting Started
|
|
74
|
+
|
|
75
|
+
### Install with pip
|
|
76
|
+
|
|
77
|
+
From the repository root (editable install for development):
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
pip install -e .
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
Or from an sdist/wheel once published:
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
pip install dustclust
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
This installs the **`dustclust`** CLI and the importable package **`DustClust`**.
|
|
90
|
+
|
|
91
|
+
### Prerequisites
|
|
92
|
+
|
|
93
|
+
- Python 3.10+
|
|
94
|
+
- Dependencies are declared in `pyproject.toml` (installed automatically with `pip install`).
|
|
95
|
+
|
|
96
|
+
### Optional: conda and iFixit cache
|
|
97
|
+
|
|
98
|
+
```bash
|
|
99
|
+
conda create -n dustclust python=3.11
|
|
100
|
+
conda activate dustclust
|
|
101
|
+
pip install -e .
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
Fetch / refresh the iFixit device catalog into the current directory (optional; a copy may already be bundled in the package):
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
python fetch_ifixit_devices.py
|
|
108
|
+
```
|
|
109
|
+
|
|
110
|
+
### Running the App
|
|
111
|
+
|
|
112
|
+
1. Start the server (any of these):
|
|
113
|
+
|
|
114
|
+
```bash
|
|
115
|
+
dustclust
|
|
116
|
+
# or
|
|
117
|
+
python -m DustClust
|
|
118
|
+
# or, from a clone without installing
|
|
119
|
+
python server.py
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
2. Open [http://localhost:8001](http://localhost:8001) in your browser.
|
|
123
|
+
|
|
124
|
+
**Command-line options** (run `dustclust --help` for details):
|
|
125
|
+
|
|
126
|
+
| Option | Default | Description |
|
|
127
|
+
|--------|---------|-------------|
|
|
128
|
+
| `--model` | `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` | Path to local model folder or HuggingFace ID |
|
|
129
|
+
| `--device` | `cpu` | Device for embeddings: `cpu` or `cuda` |
|
|
130
|
+
| `--threshold` | `0.3` | Default clustering threshold (0.1–0.9) |
|
|
131
|
+
| `--batch-size` | `512` | Batch size for embedding generation |
|
|
132
|
+
| `--data` | *(bundled sample in the package)* | Path to CSV dataset; omit to use the bundled sample |
|
|
133
|
+
| `--host` | `localhost` | Host to bind |
|
|
134
|
+
| `--port` | `8001` | Port to run on |
|
|
135
|
+
| `--no-reload` | — | Disable auto-reload on file changes |
|
|
136
|
+
|
|
137
|
+
The server loads the default dataset (`data/dirty_hardware_data_40k.csv`) and **serves the GUI immediately**; initial embeddings run in the background (the graph appears when they finish—see `/api/status`). Use **Recalculate Clusters** or **Upload Dataset** to change the data or clustering.
|
|
138
|
+
|
|
139
|
+
## Model
|
|
140
|
+
|
|
141
|
+
The default embedding model is `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`. Use `--model` to pass a local model folder or another HuggingFace model ID.
|
|
142
|
+
|
|
143
|
+
## Data
|
|
144
|
+
|
|
145
|
+
CSV files live in the `data/` folder. The default dataset is `data/dirty_hardware_data_40k.csv`. Use `--data` or `--input`/`--output` to point scripts at other paths.
|
|
146
|
+
|
|
147
|
+
## Data Processing Workflow
|
|
148
|
+
|
|
149
|
+
- `dataset_generator.py`: Generates raw `data/dirty_hardware_data_40k.csv`.
|
|
150
|
+
- `cluster_hardware` / CLI **`cluster-hardware`**: Produces `data/clustered_output.csv` from embeddings and clustering.
|
|
151
|
+
- `prepare_viz_data.py`: Builds static `src/DustClust/cluster_viz/data.json` for the legacy static workflow.
|
|
152
|
+
- `DustClust.server` / CLI **`dustclust`**: Serves the live app with on-demand clustering and device matching.
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# DustClust
|
|
2
|
+
|
|
3
|
+
**DustClust** (`import DustClust`) is an interactive web application that clusters dirty hardware inventory data using semantic embeddings and visualizes the results in a D3.js radial network graph. The backend runs live clustering with configurable thresholds and matches clusters to real-world devices from the iFixit catalog plus a custom device list.
|
|
4
|
+
|
|
5
|
+
Install from PyPI as **`dustclust`** (`pip install dustclust`).
|
|
6
|
+
|
|
7
|
+
## Features
|
|
8
|
+
|
|
9
|
+
### GUI
|
|
10
|
+
|
|
11
|
+
- **Interactive Network Graph**: Zoom, pan, and drag nodes. Hover for quick tooltips; click to open the detail panel.
|
|
12
|
+
- **Search Clusters**: Real-time search by category, subcategory, or record content (e.g., Router, iPhone, 4331).
|
|
13
|
+
- **Clustering Threshold**: Adjust the similarity threshold (0.10–0.90) and click **Recalculate Clusters** to re-run clustering.
|
|
14
|
+
- **Min Cluster Size**: Filter out small clusters with a slider.
|
|
15
|
+
- **Upload Dataset**: Upload your own `.csv` or `.xlsx` file. The server generates embeddings and clusters on the fly.
|
|
16
|
+
- **Reset View**: Reset zoom and clear search/filters.
|
|
17
|
+
- **Detail Panel**: Click any node to see:
|
|
18
|
+
- **Category / Subcategory** (e.g., Computer Hardware / Mouse)
|
|
19
|
+
- Sample records
|
|
20
|
+
- **Matched iFixit Device** (when a cluster matches a known device, with link to iFixit)
|
|
21
|
+
|
|
22
|
+
### Backend
|
|
23
|
+
|
|
24
|
+
- **Embedding-based clustering**: Uses sentence-transformers with `paraphrase-multilingual-MiniLM-L12-v2` (or another HuggingFace model ID) for semantic similarity.
|
|
25
|
+
- **Hebrew & multilingual support**: The model and tokenization support Hebrew and other Unicode scripts. Category inference includes Hebrew keywords (e.g. מסך, מקלדת, עכבר).
|
|
26
|
+
- **Dynamic category inference**: Token-based matching against iFixit categories and `custom_devices.py`.
|
|
27
|
+
- **Subcategory inference**: Finer-grained labels (e.g., Keyboard, Monitor, Printer) derived from the device list.
|
|
28
|
+
- **Device matching**: IDF-scored matching of cluster records to iFixit devices and custom entries.
|
|
29
|
+
|
|
30
|
+
## Device List
|
|
31
|
+
|
|
32
|
+
The app uses two device sources:
|
|
33
|
+
|
|
34
|
+
1. **iFixit catalog** (`ifixit_devices.json`): Run `fetch_ifixit_devices.py` to populate.
|
|
35
|
+
2. **Custom devices** (module `DustClust.custom_devices`): Generic hardware that supplements iFixit for category/subcategory inference and device matching.
|
|
36
|
+
|
|
37
|
+
### Custom Device Categories
|
|
38
|
+
|
|
39
|
+
| Category | Subcategories / Types |
|
|
40
|
+
|---------|------------------------|
|
|
41
|
+
| **SIM Card** | Nano, Micro, Standard, eSIM |
|
|
42
|
+
| **Cable** | USB-C, Lightning, HDMI, DisplayPort, Ethernet, VGA, DVI, Thunderbolt, SATA, etc. |
|
|
43
|
+
| **Adapter** | USB hubs, HDMI/DisplayPort adapters |
|
|
44
|
+
| **Storage** | MicroSD, SD, Flash Drive, HDD, SSD, NAS, CD/DVD/Blu-ray drives |
|
|
45
|
+
| **PC Component** | RAM, CPU, GPU, Motherboard, PSU, SSD, Cooling, Thermal paste |
|
|
46
|
+
| **Computer Hardware** | Keyboard, Mouse, Laptop, Monitor, Webcam, Dashcam, Projector, Printer |
|
|
47
|
+
| **Audio** | Headphones, Earbuds, Microphone |
|
|
48
|
+
| **Telecom** | Router, Modem, Network Switch, Access Point, enterprise gear |
|
|
49
|
+
|
|
50
|
+
Edit `src/DustClust/custom_devices.py` (or the installed package’s `custom_devices.py`) to add or adjust devices. Each entry has `name`, `category`, `subcategory`, and optional `url`.
|
|
51
|
+
|
|
52
|
+
### Hebrew / Multilingual Datasets
|
|
53
|
+
|
|
54
|
+
The app works with Hebrew and other Unicode datasets. The default model (`sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`) supports 50+ languages. Category inference includes Hebrew boost keywords (מסך, מקלדת, עכבר, כבל, etc.), and `custom_devices.py` has Hebrew device entries for matching. Upload a Hebrew CSV and the clustering and categorization will work out of the box.
|
|
55
|
+
|
|
56
|
+
## Getting Started
|
|
57
|
+
|
|
58
|
+
### Install with pip
|
|
59
|
+
|
|
60
|
+
From the repository root (editable install for development):
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
pip install -e .
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
Or from an sdist/wheel once published:
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
pip install dustclust
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
This installs the **`dustclust`** CLI and the importable package **`DustClust`**.
|
|
73
|
+
|
|
74
|
+
### Prerequisites
|
|
75
|
+
|
|
76
|
+
- Python 3.10+
|
|
77
|
+
- Dependencies are declared in `pyproject.toml` (installed automatically with `pip install`).
|
|
78
|
+
|
|
79
|
+
### Optional: conda and iFixit cache
|
|
80
|
+
|
|
81
|
+
```bash
|
|
82
|
+
conda create -n dustclust python=3.11
|
|
83
|
+
conda activate dustclust
|
|
84
|
+
pip install -e .
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
Fetch / refresh the iFixit device catalog into the current directory (optional; a copy may already be bundled in the package):
|
|
88
|
+
|
|
89
|
+
```bash
|
|
90
|
+
python fetch_ifixit_devices.py
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
### Running the App
|
|
94
|
+
|
|
95
|
+
1. Start the server (any of these):
|
|
96
|
+
|
|
97
|
+
```bash
|
|
98
|
+
dustclust
|
|
99
|
+
# or
|
|
100
|
+
python -m DustClust
|
|
101
|
+
# or, from a clone without installing
|
|
102
|
+
python server.py
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
2. Open [http://localhost:8001](http://localhost:8001) in your browser.
|
|
106
|
+
|
|
107
|
+
**Command-line options** (run `dustclust --help` for details):
|
|
108
|
+
|
|
109
|
+
| Option | Default | Description |
|
|
110
|
+
|--------|---------|-------------|
|
|
111
|
+
| `--model` | `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2` | Path to local model folder or HuggingFace ID |
|
|
112
|
+
| `--device` | `cpu` | Device for embeddings: `cpu` or `cuda` |
|
|
113
|
+
| `--threshold` | `0.3` | Default clustering threshold (0.1–0.9) |
|
|
114
|
+
| `--batch-size` | `512` | Batch size for embedding generation |
|
|
115
|
+
| `--data` | *(bundled sample in the package)* | Path to CSV dataset; omit to use the bundled sample |
|
|
116
|
+
| `--host` | `localhost` | Host to bind |
|
|
117
|
+
| `--port` | `8001` | Port to run on |
|
|
118
|
+
| `--no-reload` | — | Disable auto-reload on file changes |
|
|
119
|
+
|
|
120
|
+
The server loads the default dataset (`data/dirty_hardware_data_40k.csv`) and **serves the GUI immediately**; initial embeddings run in the background (the graph appears when they finish—see `/api/status`). Use **Recalculate Clusters** or **Upload Dataset** to change the data or clustering.
|
|
121
|
+
|
|
122
|
+
## Model
|
|
123
|
+
|
|
124
|
+
The default embedding model is `sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2`. Use `--model` to pass a local model folder or another HuggingFace model ID.
|
|
125
|
+
|
|
126
|
+
## Data
|
|
127
|
+
|
|
128
|
+
CSV files live in the `data/` folder. The default dataset is `data/dirty_hardware_data_40k.csv`. Use `--data` or `--input`/`--output` to point scripts at other paths.
|
|
129
|
+
|
|
130
|
+
## Data Processing Workflow
|
|
131
|
+
|
|
132
|
+
- `dataset_generator.py`: Generates raw `data/dirty_hardware_data_40k.csv`.
|
|
133
|
+
- `cluster_hardware` / CLI **`cluster-hardware`**: Produces `data/clustered_output.csv` from embeddings and clustering.
|
|
134
|
+
- `prepare_viz_data.py`: Builds static `src/DustClust/cluster_viz/data.json` for the legacy static workflow.
|
|
135
|
+
- `DustClust.server` / CLI **`dustclust`**: Serves the live app with on-demand clustering and device matching.
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=69", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "dustclust"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "DustClust: interactive hardware inventory clustering with embeddings and a web visualization"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.10"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "DustClust contributors" }]
|
|
13
|
+
dependencies = [
|
|
14
|
+
"fastapi",
|
|
15
|
+
"uvicorn[standard]",
|
|
16
|
+
"python-multipart",
|
|
17
|
+
"pandas",
|
|
18
|
+
"openpyxl",
|
|
19
|
+
"numpy",
|
|
20
|
+
"sentence-transformers",
|
|
21
|
+
"scikit-learn",
|
|
22
|
+
]
|
|
23
|
+
|
|
24
|
+
[project.scripts]
|
|
25
|
+
dustclust = "DustClust.server:main"
|
|
26
|
+
cluster-hardware = "DustClust.cluster_hardware:main"
|
|
27
|
+
|
|
28
|
+
[tool.setuptools.packages.find]
|
|
29
|
+
where = ["src"]
|
|
30
|
+
|
|
31
|
+
[tool.setuptools.package-data]
|
|
32
|
+
DustClust = [
|
|
33
|
+
"cluster_viz/**/*",
|
|
34
|
+
"data/**/*",
|
|
35
|
+
"ifixit_devices.json",
|
|
36
|
+
]
|
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Cluster dirty hardware records using text embeddings + agglomerative hierarchical clustering.
|
|
4
|
+
|
|
5
|
+
For large datasets (>10K rows), uses a two-phase approach:
|
|
6
|
+
Phase 1: Pre-group rows with MiniBatchKMeans into manageable chunks
|
|
7
|
+
Phase 2: Run agglomerative clustering within each chunk, then merge labels
|
|
8
|
+
|
|
9
|
+
Usage:
|
|
10
|
+
python cluster_hardware.py --input data/dirty_hardware_data_40k.csv --threshold 0.3
|
|
11
|
+
python cluster_hardware.py --input data/dirty_hardware_data_40k.csv --threshold 0.2 --sample-size 5000
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import os
|
|
16
|
+
import time
|
|
17
|
+
import sys
|
|
18
|
+
|
|
19
|
+
import pandas as pd
|
|
20
|
+
import numpy as np
|
|
21
|
+
from sentence_transformers import SentenceTransformer
|
|
22
|
+
from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
# Maximum rows for direct agglomerative clustering (above this, use two-phase approach).
# Direct AgglomerativeClustering materializes a full pairwise distance matrix,
# which grows O(n^2) in memory/time and becomes impractical past this size.
DIRECT_CLUSTERING_LIMIT = 15_000
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def parse_args(argv=None):
    """Parse command-line options for the clustering CLI.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]`` as before, so existing callers are
            unaffected; passing an explicit list makes the parser usable from
            tests and other code without touching process-level state.

    Returns:
        argparse.Namespace with the parsed options (input, output, threshold,
        batch_size, sample_size, model, device, pre_clusters).
    """
    parser = argparse.ArgumentParser(
        description="Cluster dirty hardware CSV records using embeddings + agglomerative clustering."
    )
    parser.add_argument(
        "--input", default="data/dirty_hardware_data_40k.csv",
        help="Path to the input CSV file (default: data/dirty_hardware_data_40k.csv)"
    )
    parser.add_argument(
        "--output", default="data/clustered_output.csv",
        help="Path to the output CSV file (default: data/clustered_output.csv)"
    )
    parser.add_argument(
        "--threshold", type=float, default=0.3,
        help="Distance threshold for clustering. Lower = tighter/more clusters, higher = looser/fewer clusters. "
             "Range 0.0–2.0 for cosine distance (default: 0.3)"
    )
    parser.add_argument(
        "--batch-size", type=int, default=512,
        help="Batch size for embedding generation (default: 512)"
    )
    parser.add_argument(
        "--sample-size", type=int, default=None,
        help="Use only the first N rows for quick testing (default: use all rows)"
    )
    parser.add_argument(
        "--model", default="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
        help="Path to local model folder or HuggingFace model ID (default: paraphrase-multilingual-MiniLM-L12-v2)"
    )
    parser.add_argument(
        "--device", default="cpu",
        help="Device for embedding model: 'cpu' or 'cuda' (default: cpu, avoids CUDA index errors with some models)"
    )
    parser.add_argument(
        "--pre-clusters", type=int, default=None,
        help="Number of KMeans pre-clusters for large datasets. Auto-calculated if not set."
    )
    return parser.parse_args(argv)
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def load_data(path, sample_size=None):
    """Read a CSV as strings, replace missing cells with "", optionally truncate.

    Args:
        path: Path to the input CSV file.
        sample_size: If given, keep only the first N rows (quick-test mode).

    Returns:
        pandas.DataFrame of strings with no NaN values.
    """
    print(f"📂 Loading data from {path}...")
    frame = pd.read_csv(path, dtype=str).fillna("")
    frame = frame if sample_size is None else frame.head(sample_size)
    print(f"   Loaded {len(frame):,} rows × {len(frame.columns)} columns")
    return frame
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def build_text_representations(df):
    """Concatenate all columns of each row into a single " | "-joined string.

    Args:
        df: DataFrame whose cells are strings (as produced by load_data).

    Returns:
        List of one text string per row, in row order; empty list for an
        empty DataFrame.
    """
    print("📝 Building text representations...")
    # Guard the empty case explicitly: df.apply(..., axis=1) on a zero-row
    # frame does not yield a Series (so .tolist() fails), and the example
    # print below would raise IndexError on texts[0].
    if df.empty:
        return []
    texts = df.apply(lambda row: " | ".join(row.values), axis=1).tolist()
    print(f"   Example: {texts[0]!r}")
    return texts
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def generate_embeddings(texts, model_name, batch_size, device="cpu"):
    """Encode texts into unit-normalized embeddings with sentence-transformers.

    Args:
        texts: List of strings to embed.
        model_name: Local model folder or HuggingFace model ID.
        batch_size: Encoding batch size.
        device: Torch device string, "cpu" or "cuda".

    Returns:
        Array of shape (len(texts), dim) with L2-normalized rows, so cosine
        similarity reduces to a dot product downstream.
    """
    # A name without "/" (or one that exists on disk) is treated as a local
    # path and resolved to an absolute path so the current working directory
    # doesn't matter; anything else is assumed to be a HuggingFace model ID.
    looks_local = "/" not in model_name or os.path.exists(model_name)
    model_path = os.path.abspath(model_name) if looks_local else model_name

    print(f"🤖 Loading model '{model_path}' on {device}...")
    encoder = SentenceTransformer(model_path, device=device)

    print(f"⚡ Generating embeddings for {len(texts):,} texts (batch_size={batch_size})...")
    start = time.time()
    vectors = encoder.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        normalize_embeddings=True,  # Pre-normalize for cosine similarity
    )
    print(f"   Done in {time.time() - start:.1f}s — shape: {vectors.shape}")
    return vectors
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def cluster_direct(embeddings, threshold):
    """Agglomerative clustering over the whole dataset at once (small inputs).

    Args:
        embeddings: (n, dim) array of unit-normalized embeddings.
        threshold: Cosine-distance cutoff below which clusters are merged.

    Returns:
        Integer cluster-label array of length n.
    """
    print(f"🔗 Clustering {len(embeddings):,} rows with distance_threshold={threshold}...")
    start = time.time()
    # n_clusters=None + distance_threshold lets the cut level, not a fixed
    # cluster count, determine how many clusters come out.
    model = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=threshold,
        metric="cosine",
        linkage="average",
    )
    labels = model.fit_predict(embeddings)
    print(f"   Found {len(set(labels)):,} clusters in {time.time() - start:.1f}s")
    return labels
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def cluster_twophase(embeddings, threshold, n_pre_clusters=None):
    """Two-phase clustering for datasets too large for direct agglomerative.

    Phase 1 pre-groups rows with MiniBatchKMeans into chunks small enough to
    cluster individually; phase 2 runs agglomerative clustering inside each
    chunk and stitches the per-chunk labels into one global label space.

    Args:
        embeddings: (n, dim) array of unit-normalized embeddings.
        threshold: Cosine-distance cutoff for the agglomerative phase.
        n_pre_clusters: Number of KMeans pre-groups; when None, sized so each
            group holds roughly 5000 rows (never more groups than rows).

    Returns:
        Integer cluster-label array of length n.
    """
    n = len(embeddings)
    if n_pre_clusters is None:
        # Aim for pre-groups of ~5000 rows each
        n_pre_clusters = max(10, n // 5000)
    n_pre_clusters = min(n_pre_clusters, n)

    print(f"🔗 Phase 1: Pre-grouping {n:,} rows into {n_pre_clusters} groups with KMeans...")
    start = time.time()
    pre_labels = MiniBatchKMeans(
        n_clusters=n_pre_clusters,
        batch_size=min(4096, n),
        random_state=42,
        n_init=3,
    ).fit_predict(embeddings)
    print(f"   Pre-grouping done in {time.time() - start:.1f}s")

    print(f"🔗 Phase 2: Agglomerative clustering within each group (threshold={threshold})...")
    start = time.time()
    final_labels = np.zeros(n, dtype=int)
    next_global_id = 0

    for group_id in range(n_pre_clusters):
        member_mask = pre_labels == group_id
        group_vectors = embeddings[member_mask]

        if len(group_vectors) <= 1:
            # Singleton (or empty) group: assign one global ID, no sub-clustering.
            final_labels[member_mask] = next_global_id
            next_global_id += 1
            continue

        local_labels = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=threshold,
            metric="cosine",
            linkage="average",
        ).fit_predict(group_vectors)

        # Shift this group's local 0..k-1 IDs into the global label space.
        final_labels[member_mask] = local_labels + next_global_id
        next_global_id += len(set(local_labels))

    print(f"   Found {len(set(final_labels)):,} clusters in {time.time() - start:.1f}s")
    return final_labels
|
|
184
|
+
|
|
185
|
+
|
|
186
|
+
def cluster_embeddings(embeddings, threshold, n_pre_clusters=None):
    """Pick a clustering strategy by dataset size.

    Small datasets (<= DIRECT_CLUSTERING_LIMIT rows) get direct agglomerative
    clustering; larger ones go through the two-phase KMeans + agglomerative
    pipeline.

    Args:
        embeddings: (n, dim) array of unit-normalized embeddings.
        threshold: Cosine-distance cutoff for merging.
        n_pre_clusters: Optional pre-group count, forwarded to the two-phase path.

    Returns:
        Integer cluster-label array of length n.
    """
    n = len(embeddings)
    if n > DIRECT_CLUSTERING_LIMIT:
        print(f"ℹ️ Dataset has {n:,} rows (>{DIRECT_CLUSTERING_LIMIT:,}), using two-phase clustering")
        return cluster_twophase(embeddings, threshold, n_pre_clusters)
    return cluster_direct(embeddings, threshold)
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def print_summary(df, max_clusters=30, samples_per_cluster=3):
    """Print cluster-size statistics plus sample rows for the largest clusters.

    Args:
        df: DataFrame that already carries a "cluster_id" column.
        max_clusters: Cap on how many of the largest clusters to display.
        samples_per_cluster: Sample rows printed per displayed cluster.
    """
    sizes = df.groupby("cluster_id").size().sort_values(ascending=False)
    total_clusters = len(sizes)

    separator = "=" * 80
    print("\n" + separator)
    print("📊 CLUSTERING SUMMARY")
    print(f"   Total rows: {len(df):,}")
    print(f"   Total clusters: {total_clusters:,}")
    print(f"   Largest: {sizes.iloc[0]:,} rows")
    print(f"   Smallest: {sizes.iloc[-1]:,} rows")
    print(f"   Median size: {int(sizes.median()):,} rows")
    print(separator)

    shown = min(max_clusters, total_clusters)
    print(f"\n🔍 Top {shown} clusters by size:\n")

    # cluster_id is omitted from the sample output for readability; the
    # column list is loop-invariant, so compute it once.
    sample_cols = [c for c in df.columns if c != "cluster_id"]
    for cid, size in sizes.head(shown).items():
        members = df[df["cluster_id"] == cid]
        print(f"   Cluster {cid} ({size:,} rows):")
        for _, record in members[sample_cols].head(samples_per_cluster).iterrows():
            print(f"     → {' | '.join(record.values)}")
        print()
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
def main():
    """CLI entry point: load data, embed, cluster, save CSV, print summary."""
    args = parse_args()

    banner = "=" * 80
    print(f"\n{banner}")
    print("  Hardware Record Clustering")
    print(f"  Threshold: {args.threshold} | Model: {args.model}")
    print(f"{banner}\n")

    # Pipeline: raw CSV -> row texts -> embeddings -> cluster labels.
    df = load_data(args.input, args.sample_size)
    texts = build_text_representations(df)
    embeddings = generate_embeddings(texts, args.model, args.batch_size, device=args.device)
    labels = cluster_embeddings(embeddings, args.threshold, args.pre_clusters)

    # Attach labels and sort so each cluster is contiguous in the output file.
    df["cluster_id"] = labels
    df = df.sort_values("cluster_id").reset_index(drop=True)

    df.to_csv(args.output, index=False)
    print(f"\n💾 Saved clustered output to {args.output}")

    print_summary(df)
|
|
256
|
+
|
|
257
|
+
|
|
258
|
+
# Run the CLI only when executed as a script (python cluster_hardware.py ...),
# not when imported as a module.
if __name__ == "__main__":
    main()
|