Vortiq 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- vortiq-0.1.0/PKG-INFO +316 -0
- vortiq-0.1.0/README.md +301 -0
- vortiq-0.1.0/pyproject.toml +27 -0
- vortiq-0.1.0/setup.cfg +4 -0
- vortiq-0.1.0/src/Vortiq.egg-info/PKG-INFO +316 -0
- vortiq-0.1.0/src/Vortiq.egg-info/SOURCES.txt +17 -0
- vortiq-0.1.0/src/Vortiq.egg-info/dependency_links.txt +1 -0
- vortiq-0.1.0/src/Vortiq.egg-info/entry_points.txt +2 -0
- vortiq-0.1.0/src/Vortiq.egg-info/requires.txt +7 -0
- vortiq-0.1.0/src/Vortiq.egg-info/top_level.txt +1 -0
- vortiq-0.1.0/src/vortiq/__init__.py +0 -0
- vortiq-0.1.0/src/vortiq/clustering/dbscanModel.py +93 -0
- vortiq-0.1.0/src/vortiq/clustering/dirNaming.py +62 -0
- vortiq-0.1.0/src/vortiq/clustering/embeddings.py +68 -0
- vortiq-0.1.0/src/vortiq/clustering/fileContent.py +37 -0
- vortiq-0.1.0/src/vortiq/dedup/dirTraversal.py +31 -0
- vortiq-0.1.0/src/vortiq/dedup/fileHash.py +29 -0
- vortiq-0.1.0/src/vortiq/dedup/fileStats.py +23 -0
- vortiq-0.1.0/src/vortiq/main.py +192 -0
vortiq-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,316 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: Vortiq
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A lightning-fast, zero-cloud CLI utility that cleans your Windows directories by parsing file contents and visual data — automatically destroying clones and generating intelligently named folder structures.
|
|
5
|
+
Author-email: Om Patil <44ompatil@gmail.com>
|
|
6
|
+
Requires-Python: >=3.12
|
|
7
|
+
Description-Content-Type: text/markdown
|
|
8
|
+
Requires-Dist: fastembed>=0.8.0
|
|
9
|
+
Requires-Dist: numpy>=2.4.6
|
|
10
|
+
Requires-Dist: pillow>=12.2.0
|
|
11
|
+
Requires-Dist: pymupdf>=1.27.2.3
|
|
12
|
+
Requires-Dist: pytesseract>=0.3.13
|
|
13
|
+
Requires-Dist: qdrant-client[fastembed]>=1.18.0
|
|
14
|
+
Requires-Dist: scikit-learn>=1.8.0
|
|
15
|
+
|
|
16
|
+
<p align="center">
|
|
17
|
+
<h1 align="center">🌀 Vortex</h1>
|
|
18
|
+
<p align="center">
|
|
19
|
+
<em>A lightning-fast, zero-cloud CLI utility that cleans your Windows directories by parsing file contents and visual data — automatically destroying clones and generating intelligently named folder structures.</em>
|
|
20
|
+
</p>
|
|
21
|
+
<p align="center">
|
|
22
|
+
<img src="https://img.shields.io/badge/python-%3E%3D3.12-3776AB?logo=python&logoColor=white" alt="Python">
|
|
23
|
+
<img src="https://img.shields.io/badge/package_manager-uv-DE5FE9?logo=uv&logoColor=white" alt="uv">
|
|
24
|
+
<img src="https://img.shields.io/badge/ML-scikit--learn-F7931E?logo=scikit-learn&logoColor=white" alt="scikit-learn">
|
|
25
|
+
<img src="https://img.shields.io/badge/vector_db-Qdrant-DC382D?logo=qdrant&logoColor=white" alt="Qdrant">
|
|
26
|
+
<img src="https://img.shields.io/badge/platform-Windows-0078D6?logo=windows&logoColor=white" alt="Windows">
|
|
27
|
+
<img src="https://img.shields.io/badge/version-0.1.0-brightgreen" alt="Version">
|
|
28
|
+
<img src="https://img.shields.io/badge/license-MIT-blue" alt="License">
|
|
29
|
+
</p>
|
|
30
|
+
</p>
|
|
31
|
+
|
|
32
|
+
---
|
|
33
|
+
|
|
34
|
+
## Table of Contents
|
|
35
|
+
|
|
36
|
+
- [Overview](#overview)
|
|
37
|
+
- [Features](#features)
|
|
38
|
+
- [Architecture](#architecture)
|
|
39
|
+
- [Project Structure](#project-structure)
|
|
40
|
+
- [Tech Stack](#tech-stack)
|
|
41
|
+
- [Supported File Formats](#supported-file-formats)
|
|
42
|
+
- [Prerequisites](#prerequisites)
|
|
43
|
+
- [Installation](#installation)
|
|
44
|
+
- [Usage](#usage)
|
|
45
|
+
- [How It Works](#how-it-works)
|
|
46
|
+
- [Configuration](#configuration)
|
|
47
|
+
- [Roadmap](#roadmap)
|
|
48
|
+
- [Contributing](#contributing)
|
|
49
|
+
- [License](#license)
|
|
50
|
+
|
|
51
|
+
---
|
|
52
|
+
|
|
53
|
+
## Overview
|
|
54
|
+
|
|
55
|
+
Vortex uses state-of-the-art embedding models to intelligently group your files. Whether you have text documents, code, PDFs, or images — it semantically analyzes their content **entirely locally**. Your data never leaves your machine.
|
|
56
|
+
|
|
57
|
+
**The Problem:** Over time, directories accumulate duplicate files and an unstructured mess of documents, images, and code. Manually organizing them is tedious and error-prone.
|
|
58
|
+
|
|
59
|
+
**The Solution:** Vortex automates the entire process in a single command — deduplicating exact clones via SHA-256, then semantically clustering the remaining files using vector embeddings and DBSCAN, and finally reorganizing them into auto-named folders derived from their content.
|
|
60
|
+
|
|
61
|
+
---
|
|
62
|
+
|
|
63
|
+
## Features
|
|
64
|
+
|
|
65
|
+
### 🔍 Intelligent Deduplication
|
|
66
|
+
- **Recursive Directory Traversal** — Walks the entire target directory tree, resolving all nested files.
|
|
67
|
+
- **SHA-256 Chunked Hashing** — Identifies exact binary duplicates efficiently, even for very large files, using 4 MB chunked reads.
|
|
68
|
+
- **Storage Statistics** — Gathers file size metadata before acting.
|
|
69
|
+
|
|
70
|
+
### 🧠 Semantic Clustering
|
|
71
|
+
- **Multi-Format Content Extraction** — Pulls text from plain text files, code, PDFs (via PyMuPDF), and images (via Tesseract OCR).
|
|
72
|
+
- **Local Embedding Generation** — Generates text embeddings with `BAAI/bge-small-en-v1.5` and image embeddings with `Qdrant/clip-ViT-B-32-vision`, all via `fastembed`.
|
|
73
|
+
- **Vector Database** — Manages semantic indexes in a local Qdrant instance (on-disk, no server needed).
|
|
74
|
+
- **DBSCAN Clustering** — Groups files by semantic proximity using cosine-distance DBSCAN with tuned `eps` and `min_samples` per modality.
|
|
75
|
+
- **TF-IDF Directory Naming** — Automatically generates human-readable folder names by extracting the most relevant terms from each cluster's combined content.
|
|
76
|
+
|
|
77
|
+
### 🛡️ Human-in-the-Loop (HITL)
|
|
78
|
+
> *Implementation in progress.*
|
|
79
|
+
|
|
80
|
+
An interactive review phase that will let you verify the proposed folder structure and directory names before Vortex commits changes to your filesystem.
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Architecture
|
|
85
|
+
|
|
86
|
+
```mermaid
|
|
87
|
+
flowchart TD
|
|
88
|
+
A["🗂️ Target Directory"] --> B["Phase 1: Deduplication"]
|
|
89
|
+
|
|
90
|
+
subgraph DEDUP ["src/dedup"]
|
|
91
|
+
B --> B1["dirTraversal — Recursive file discovery"]
|
|
92
|
+
B1 --> B2["fileHash — SHA-256 chunked hashing"]
|
|
93
|
+
B2 --> B3["fileStats — File size metadata"]
|
|
94
|
+
B3 --> B4{"Duplicates found?"}
|
|
95
|
+
B4 -- Yes --> B5["Remove exact clones"]
|
|
96
|
+
B4 -- No --> C
|
|
97
|
+
B5 --> C
|
|
98
|
+
end
|
|
99
|
+
|
|
100
|
+
C["Unique Files"] --> D["Phase 2: Clustering"]
|
|
101
|
+
|
|
102
|
+
subgraph CLUSTER ["src/clustering"]
|
|
103
|
+
D --> D1["fileContent — Multi-format text extraction"]
|
|
104
|
+
D1 --> D2["embeddings — Vector generation"]
|
|
105
|
+
D2 --> D2a["Text: BAAI/bge-small-en-v1.5 → 384d"]
|
|
106
|
+
D2 --> D2b["Image: Qdrant/clip-ViT-B-32-vision → 512d"]
|
|
107
|
+
D2a --> D3["Qdrant Vector DB — Local on-disk storage"]
|
|
108
|
+
D2b --> D3
|
|
109
|
+
D3 --> D4["dbscanModel — Cosine DBSCAN clustering"]
|
|
110
|
+
D4 --> D5["dirNaming — TF-IDF folder name generation"]
|
|
111
|
+
end
|
|
112
|
+
|
|
113
|
+
D5 --> E["Phase 3: HITL Review"]
|
|
114
|
+
E --> F["📁 Organized Directory"]
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Project Structure
|
|
120
|
+
|
|
121
|
+
```
|
|
122
|
+
Vortex/
|
|
123
|
+
├── main.py # CLI entry point (Typer + Rich)
|
|
124
|
+
├── pyproject.toml # Project metadata & dependencies
|
|
125
|
+
├── uv.lock # Locked dependency versions
|
|
126
|
+
├── .python-version # Python 3.12
|
|
127
|
+
│
|
|
128
|
+
├── src/
|
|
129
|
+
│ ├── dedup/ # Phase 1 — Deduplication
|
|
130
|
+
│ │ ├── dirTraversal.py # Recursive directory walker
|
|
131
|
+
│ │ ├── fileHash.py # SHA-256 chunked file hashing
|
|
132
|
+
│ │ └── fileStats.py # File size statistics
|
|
133
|
+
│ │
|
|
134
|
+
│ ├── clustering/ # Phase 2 — Semantic Clustering
|
|
135
|
+
│ │ ├── fileContent.py # Multi-format content extraction
|
|
136
|
+
│ │ ├── embeddings.py # Text & image embedding generation + Qdrant storage
|
|
137
|
+
│ │ ├── dbscanModel.py # DBSCAN clustering with data retrieval from Qdrant
|
|
138
|
+
│ │ └── dirNaming.py # TF-IDF based directory name generation
|
|
139
|
+
│ │
|
|
140
|
+
│ └── hitl/ # Phase 3 — Human-in-the-Loop (WIP)
|
|
141
|
+
│
|
|
142
|
+
└── docs/ # Documentation (WIP)
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
---
|
|
146
|
+
|
|
147
|
+
## Tech Stack
|
|
148
|
+
|
|
149
|
+
| Category | Technology | Purpose |
|
|
150
|
+
|---|---|---|
|
|
151
|
+
| **Language** | Python ≥ 3.12 | Core runtime |
|
|
152
|
+
| **CLI Framework** | [Typer](https://typer.tiangolo.com/) + [Rich](https://rich.readthedocs.io/) | Command-line interface with styled output |
|
|
153
|
+
| **Text Embeddings** | [FastEmbed](https://github.com/qdrant/fastembed) (`BAAI/bge-small-en-v1.5`) | 384-dim text vectors |
|
|
154
|
+
| **Image Embeddings** | [FastEmbed](https://github.com/qdrant/fastembed) (`Qdrant/clip-ViT-B-32-vision`) | 512-dim image vectors |
|
|
155
|
+
| **Vector Database** | [Qdrant](https://qdrant.tech/) (local on-disk mode) | Semantic index storage |
|
|
156
|
+
| **Clustering** | [scikit-learn](https://scikit-learn.org/) (DBSCAN) | Density-based grouping |
|
|
157
|
+
| **PDF Parsing** | [PyMuPDF](https://pymupdf.readthedocs.io/) | Text extraction from PDFs |
|
|
158
|
+
| **OCR** | [Tesseract](https://github.com/tesseract-ocr/tesseract) + [pytesseract](https://github.com/madmaze/pytesseract) | Image-to-text extraction |
|
|
159
|
+
| **Image Processing** | [Pillow](https://pillow.readthedocs.io/) | Image loading for OCR |
|
|
160
|
+
| **TF-IDF** | [scikit-learn](https://scikit-learn.org/) (TfidfVectorizer) | Directory name generation |
|
|
161
|
+
| **Package Manager** | [uv](https://docs.astral.sh/uv/) | Fast dependency management |
|
|
162
|
+
|
|
163
|
+
---
|
|
164
|
+
|
|
165
|
+
## Supported File Formats
|
|
166
|
+
|
|
167
|
+
| Category | Extensions | Extraction Method |
|
|
168
|
+
|---|---|---|
|
|
169
|
+
| **Plain Text / Code** | `.txt`, `.md`, `.csv`, `.json`, `.py`, `.js`, `.html` | Direct UTF-8 read |
|
|
170
|
+
| **Documents** | `.pdf` | PyMuPDF text extraction |
|
|
171
|
+
| **Images** | `.png`, `.jpg`, `.jpeg` | CLIP embeddings (visual) + Tesseract OCR (textual) |
|
|
172
|
+
|
|
173
|
+
> **Note:** Images are embedded using CLIP for visual similarity clustering. OCR is used separately when text content is needed (e.g., for directory naming).
|
|
174
|
+
|
|
175
|
+
---
|
|
176
|
+
|
|
177
|
+
## Prerequisites
|
|
178
|
+
|
|
179
|
+
1. **Python 3.12+** — [Download](https://www.python.org/downloads/)
|
|
180
|
+
2. **uv** (recommended) — [Install](https://docs.astral.sh/uv/getting-started/installation/)
|
|
181
|
+
3. **Tesseract OCR** — Required for image text extraction.
|
|
182
|
+
- Download the Windows installer from [UB Mannheim](https://github.com/UB-Mannheim/tesseract/wiki).
|
|
183
|
+
- Ensure `tesseract.exe` is accessible via your system `PATH`.
|
|
184
|
+
|
|
185
|
+
---
|
|
186
|
+
|
|
187
|
+
## Installation
|
|
188
|
+
|
|
189
|
+
```bash
|
|
190
|
+
# Clone the repository
|
|
191
|
+
git clone https://github.com/44ompatil/Vortex.git
|
|
192
|
+
cd Vortex
|
|
193
|
+
|
|
194
|
+
# Install dependencies (uv recommended)
|
|
195
|
+
uv sync
|
|
196
|
+
|
|
197
|
+
# Or, using pip
|
|
198
|
+
pip install -e .
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
---
|
|
202
|
+
|
|
203
|
+
## Usage
|
|
204
|
+
|
|
205
|
+
Vortex exposes a `sort` command that orchestrates the full pipeline:
|
|
206
|
+
|
|
207
|
+
```bash
|
|
208
|
+
# Sort a directory
|
|
209
|
+
vortiq sort <target-directory>
|
|
210
|
+
|
|
211
|
+
# Example
|
|
212
|
+
vortiq sort "C:\Users\you\Downloads"
|
|
213
|
+
```
|
|
214
|
+
|
|
215
|
+
### Available Commands
|
|
216
|
+
|
|
217
|
+
| Command | Description |
|
|
218
|
+
|---|---|
|
|
219
|
+
| `sort <directory>` | Run the full dedup → cluster → organize pipeline on the target directory |
|
|
220
|
+
| `help` | Display available commands and usage information |
|
|
221
|
+
|
|
222
|
+
### What Happens When You Run `sort`
|
|
223
|
+
|
|
224
|
+
1. **Scan** — Recursively discovers all files in the target directory.
|
|
225
|
+
2. **Dedup** — Identifies and removes exact binary duplicates (SHA-256).
|
|
226
|
+
3. **Extract** — Pulls text/visual content from each unique file.
|
|
227
|
+
4. **Embed** — Generates vector embeddings (text: 384d, image: 512d).
|
|
228
|
+
5. **Cluster** — Groups semantically similar files using DBSCAN.
|
|
229
|
+
6. **Name** — Generates descriptive folder names via TF-IDF.
|
|
230
|
+
7. **Organize** — Moves files into their newly created, named folders.
|
|
231
|
+
|
|
232
|
+
---
|
|
233
|
+
|
|
234
|
+
## How It Works
|
|
235
|
+
|
|
236
|
+
### Phase 1: Deduplication (`src/dedup/`)
|
|
237
|
+
|
|
238
|
+
Files are recursively discovered via `os.walk`. Each file is hashed using **SHA-256** with 4 MB chunked reads to handle large files efficiently. Files sharing the same hash are identified as exact duplicates — only one copy is kept, the rest are deleted.
|
|
239
|
+
|
|
240
|
+
### Phase 2: Semantic Clustering (`src/clustering/`)
|
|
241
|
+
|
|
242
|
+
Remaining unique files have their content extracted based on file type. Text content is embedded into 384-dimensional vectors using `BAAI/bge-small-en-v1.5`, while images are embedded into 512-dimensional vectors using `Qdrant/clip-ViT-B-32-vision`. All vectors are stored in a local Qdrant database.
|
|
243
|
+
|
|
244
|
+
**DBSCAN** (Density-Based Spatial Clustering of Applications with Noise) then groups files by semantic similarity using cosine distance. The algorithm parameters are tuned separately for each modality:
|
|
245
|
+
|
|
246
|
+
| Modality | `eps` | `min_samples` |
|
|
247
|
+
|---|---|---|
|
|
248
|
+
| Text | 0.20 | 2 |
|
|
249
|
+
| Image | 0.45 | 2 |
|
|
250
|
+
|
|
251
|
+
Each cluster is assigned a human-readable name generated by running **TF-IDF** on the combined text content of the cluster's files, extracting the top-2 most distinctive terms.
|
|
252
|
+
|
|
253
|
+
### Phase 3: Human-in-the-Loop (`src/hitl/`)
|
|
254
|
+
|
|
255
|
+
> *Coming soon.* This phase will present the proposed folder structure for interactive review before files are moved.
|
|
256
|
+
|
|
257
|
+
---
|
|
258
|
+
|
|
259
|
+
## Configuration
|
|
260
|
+
|
|
261
|
+
Currently, model parameters are configured in-code:
|
|
262
|
+
|
|
263
|
+
| Parameter | Location | Default | Description |
|
|
264
|
+
|---|---|---|---|
|
|
265
|
+
| `chunkSize` | `fileHash.py` | `4194304` (4 MB) | Byte chunk size for SHA-256 hashing |
|
|
266
|
+
| `txtEps` | `dbscanModel.py` | `0.20` | DBSCAN epsilon for text clusters |
|
|
267
|
+
| `txtMinPts` | `dbscanModel.py` | `2` | DBSCAN minimum points for text clusters |
|
|
268
|
+
| `imgEps` | `dbscanModel.py` | `0.45` | DBSCAN epsilon for image clusters |
|
|
269
|
+
| `imgMinPts` | `dbscanModel.py` | `2` | DBSCAN minimum points for image clusters |
|
|
270
|
+
| `top_n_words` | `dirNaming.py` | `2` | Number of TF-IDF terms used in folder names |
|
|
271
|
+
| `PageSize` | `dbscanModel.py` | `30` | Qdrant scroll page size |
|
|
272
|
+
|
|
273
|
+
---
|
|
274
|
+
|
|
275
|
+
## Roadmap
|
|
276
|
+
|
|
277
|
+
- [x] Recursive directory traversal
|
|
278
|
+
- [x] SHA-256 chunked deduplication
|
|
279
|
+
- [x] Multi-format content extraction (text, PDF, images)
|
|
280
|
+
- [x] Local text & image embedding generation
|
|
281
|
+
- [x] Qdrant vector storage
|
|
282
|
+
- [x] DBSCAN semantic clustering
|
|
283
|
+
- [x] TF-IDF auto-naming for directories
|
|
284
|
+
- [x] Typer CLI with Rich output
|
|
285
|
+
- [ ] Human-in-the-Loop interactive review
|
|
286
|
+
- [ ] Config file support (YAML/TOML)
|
|
287
|
+
- [ ] Cross-modal clustering (text + image in unified space)
|
|
288
|
+
- [ ] Undo / dry-run mode
|
|
289
|
+
- [ ] Progress bars and summary statistics
|
|
290
|
+
- [ ] Additional file format support (`.docx`, `.xlsx`, `.pptx`)
|
|
291
|
+
|
|
292
|
+
---
|
|
293
|
+
|
|
294
|
+
## Contributing
|
|
295
|
+
|
|
296
|
+
Contributions are welcome! Here's how to get started:
|
|
297
|
+
|
|
298
|
+
1. **Fork** the repository.
|
|
299
|
+
2. **Create a feature branch** — `git checkout -b feature/your-feature`
|
|
300
|
+
3. **Commit your changes** — `git commit -m "Add your feature"`
|
|
301
|
+
4. **Push to the branch** — `git push origin feature/your-feature`
|
|
302
|
+
5. **Open a Pull Request.**
|
|
303
|
+
|
|
304
|
+
Please ensure your code follows the existing style and includes appropriate documentation.
|
|
305
|
+
|
|
306
|
+
---
|
|
307
|
+
|
|
308
|
+
## License
|
|
309
|
+
|
|
310
|
+
This project is licensed under the **MIT License** — see the [LICENSE](LICENSE) file for details.
|
|
311
|
+
|
|
312
|
+
---
|
|
313
|
+
|
|
314
|
+
<p align="center">
|
|
315
|
+
<sub>Built with ❤️ for anyone drowning in unorganized files.</sub>
|
|
316
|
+
</p>
|
vortiq-0.1.0/README.md
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<h1 align="center">🌀 Vortex</h1>
|
|
3
|
+
<p align="center">
|
|
4
|
+
<em>A lightning-fast, zero-cloud CLI utility that cleans your Windows directories by parsing file contents and visual data — automatically destroying clones and generating intelligently named folder structures.</em>
|
|
5
|
+
</p>
|
|
6
|
+
<p align="center">
|
|
7
|
+
<img src="https://img.shields.io/badge/python-%3E%3D3.12-3776AB?logo=python&logoColor=white" alt="Python">
|
|
8
|
+
<img src="https://img.shields.io/badge/package_manager-uv-DE5FE9?logo=uv&logoColor=white" alt="uv">
|
|
9
|
+
<img src="https://img.shields.io/badge/ML-scikit--learn-F7931E?logo=scikit-learn&logoColor=white" alt="scikit-learn">
|
|
10
|
+
<img src="https://img.shields.io/badge/vector_db-Qdrant-DC382D?logo=qdrant&logoColor=white" alt="Qdrant">
|
|
11
|
+
<img src="https://img.shields.io/badge/platform-Windows-0078D6?logo=windows&logoColor=white" alt="Windows">
|
|
12
|
+
<img src="https://img.shields.io/badge/version-0.1.0-brightgreen" alt="Version">
|
|
13
|
+
<img src="https://img.shields.io/badge/license-MIT-blue" alt="License">
|
|
14
|
+
</p>
|
|
15
|
+
</p>
|
|
16
|
+
|
|
17
|
+
---
|
|
18
|
+
|
|
19
|
+
## Table of Contents
|
|
20
|
+
|
|
21
|
+
- [Overview](#overview)
|
|
22
|
+
- [Features](#features)
|
|
23
|
+
- [Architecture](#architecture)
|
|
24
|
+
- [Project Structure](#project-structure)
|
|
25
|
+
- [Tech Stack](#tech-stack)
|
|
26
|
+
- [Supported File Formats](#supported-file-formats)
|
|
27
|
+
- [Prerequisites](#prerequisites)
|
|
28
|
+
- [Installation](#installation)
|
|
29
|
+
- [Usage](#usage)
|
|
30
|
+
- [How It Works](#how-it-works)
|
|
31
|
+
- [Configuration](#configuration)
|
|
32
|
+
- [Roadmap](#roadmap)
|
|
33
|
+
- [Contributing](#contributing)
|
|
34
|
+
- [License](#license)
|
|
35
|
+
|
|
36
|
+
---
|
|
37
|
+
|
|
38
|
+
## Overview
|
|
39
|
+
|
|
40
|
+
Vortex uses state-of-the-art embedding models to intelligently group your files. Whether you have text documents, code, PDFs, or images — it semantically analyzes their content **entirely locally**. Your data never leaves your machine.
|
|
41
|
+
|
|
42
|
+
**The Problem:** Over time, directories accumulate duplicate files and an unstructured mess of documents, images, and code. Manually organizing them is tedious and error-prone.
|
|
43
|
+
|
|
44
|
+
**The Solution:** Vortex automates the entire process in a single command — deduplicating exact clones via SHA-256, then semantically clustering the remaining files using vector embeddings and DBSCAN, and finally reorganizing them into auto-named folders derived from their content.
|
|
45
|
+
|
|
46
|
+
---
|
|
47
|
+
|
|
48
|
+
## Features
|
|
49
|
+
|
|
50
|
+
### 🔍 Intelligent Deduplication
|
|
51
|
+
- **Recursive Directory Traversal** — Walks the entire target directory tree, resolving all nested files.
|
|
52
|
+
- **SHA-256 Chunked Hashing** — Identifies exact binary duplicates efficiently, even for very large files, using 4 MB chunked reads.
|
|
53
|
+
- **Storage Statistics** — Gathers file size metadata before acting.
|
|
54
|
+
|
|
55
|
+
### 🧠 Semantic Clustering
|
|
56
|
+
- **Multi-Format Content Extraction** — Pulls text from plain text files, code, PDFs (via PyMuPDF), and images (via Tesseract OCR).
|
|
57
|
+
- **Local Embedding Generation** — Generates text embeddings with `BAAI/bge-small-en-v1.5` and image embeddings with `Qdrant/clip-ViT-B-32-vision`, all via `fastembed`.
|
|
58
|
+
- **Vector Database** — Manages semantic indexes in a local Qdrant instance (on-disk, no server needed).
|
|
59
|
+
- **DBSCAN Clustering** — Groups files by semantic proximity using cosine-distance DBSCAN with tuned `eps` and `min_samples` per modality.
|
|
60
|
+
- **TF-IDF Directory Naming** — Automatically generates human-readable folder names by extracting the most relevant terms from each cluster's combined content.
|
|
61
|
+
|
|
62
|
+
### 🛡️ Human-in-the-Loop (HITL)
|
|
63
|
+
> *Implementation in progress.*
|
|
64
|
+
|
|
65
|
+
An interactive review phase that will let you verify the proposed folder structure and directory names before Vortex commits changes to your filesystem.
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Architecture
|
|
70
|
+
|
|
71
|
+
```mermaid
|
|
72
|
+
flowchart TD
|
|
73
|
+
A["🗂️ Target Directory"] --> B["Phase 1: Deduplication"]
|
|
74
|
+
|
|
75
|
+
subgraph DEDUP ["src/dedup"]
|
|
76
|
+
B --> B1["dirTraversal — Recursive file discovery"]
|
|
77
|
+
B1 --> B2["fileHash — SHA-256 chunked hashing"]
|
|
78
|
+
B2 --> B3["fileStats — File size metadata"]
|
|
79
|
+
B3 --> B4{"Duplicates found?"}
|
|
80
|
+
B4 -- Yes --> B5["Remove exact clones"]
|
|
81
|
+
B4 -- No --> C
|
|
82
|
+
B5 --> C
|
|
83
|
+
end
|
|
84
|
+
|
|
85
|
+
C["Unique Files"] --> D["Phase 2: Clustering"]
|
|
86
|
+
|
|
87
|
+
subgraph CLUSTER ["src/clustering"]
|
|
88
|
+
D --> D1["fileContent — Multi-format text extraction"]
|
|
89
|
+
D1 --> D2["embeddings — Vector generation"]
|
|
90
|
+
D2 --> D2a["Text: BAAI/bge-small-en-v1.5 → 384d"]
|
|
91
|
+
D2 --> D2b["Image: Qdrant/clip-ViT-B-32-vision → 512d"]
|
|
92
|
+
D2a --> D3["Qdrant Vector DB — Local on-disk storage"]
|
|
93
|
+
D2b --> D3
|
|
94
|
+
D3 --> D4["dbscanModel — Cosine DBSCAN clustering"]
|
|
95
|
+
D4 --> D5["dirNaming — TF-IDF folder name generation"]
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
D5 --> E["Phase 3: HITL Review"]
|
|
99
|
+
E --> F["📁 Organized Directory"]
|
|
100
|
+
```
|
|
101
|
+
|
|
102
|
+
---
|
|
103
|
+
|
|
104
|
+
## Project Structure
|
|
105
|
+
|
|
106
|
+
```
|
|
107
|
+
Vortex/
|
|
108
|
+
├── main.py # CLI entry point (Typer + Rich)
|
|
109
|
+
├── pyproject.toml # Project metadata & dependencies
|
|
110
|
+
├── uv.lock # Locked dependency versions
|
|
111
|
+
├── .python-version # Python 3.12
|
|
112
|
+
│
|
|
113
|
+
├── src/
|
|
114
|
+
│ ├── dedup/ # Phase 1 — Deduplication
|
|
115
|
+
│ │ ├── dirTraversal.py # Recursive directory walker
|
|
116
|
+
│ │ ├── fileHash.py # SHA-256 chunked file hashing
|
|
117
|
+
│ │ └── fileStats.py # File size statistics
|
|
118
|
+
│ │
|
|
119
|
+
│ ├── clustering/ # Phase 2 — Semantic Clustering
|
|
120
|
+
│ │ ├── fileContent.py # Multi-format content extraction
|
|
121
|
+
│ │ ├── embeddings.py # Text & image embedding generation + Qdrant storage
|
|
122
|
+
│ │ ├── dbscanModel.py # DBSCAN clustering with data retrieval from Qdrant
|
|
123
|
+
│ │ └── dirNaming.py # TF-IDF based directory name generation
|
|
124
|
+
│ │
|
|
125
|
+
│ └── hitl/ # Phase 3 — Human-in-the-Loop (WIP)
|
|
126
|
+
│
|
|
127
|
+
└── docs/ # Documentation (WIP)
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
---
|
|
131
|
+
|
|
132
|
+
## Tech Stack
|
|
133
|
+
|
|
134
|
+
| Category | Technology | Purpose |
|
|
135
|
+
|---|---|---|
|
|
136
|
+
| **Language** | Python ≥ 3.12 | Core runtime |
|
|
137
|
+
| **CLI Framework** | [Typer](https://typer.tiangolo.com/) + [Rich](https://rich.readthedocs.io/) | Command-line interface with styled output |
|
|
138
|
+
| **Text Embeddings** | [FastEmbed](https://github.com/qdrant/fastembed) (`BAAI/bge-small-en-v1.5`) | 384-dim text vectors |
|
|
139
|
+
| **Image Embeddings** | [FastEmbed](https://github.com/qdrant/fastembed) (`Qdrant/clip-ViT-B-32-vision`) | 512-dim image vectors |
|
|
140
|
+
| **Vector Database** | [Qdrant](https://qdrant.tech/) (local on-disk mode) | Semantic index storage |
|
|
141
|
+
| **Clustering** | [scikit-learn](https://scikit-learn.org/) (DBSCAN) | Density-based grouping |
|
|
142
|
+
| **PDF Parsing** | [PyMuPDF](https://pymupdf.readthedocs.io/) | Text extraction from PDFs |
|
|
143
|
+
| **OCR** | [Tesseract](https://github.com/tesseract-ocr/tesseract) + [pytesseract](https://github.com/madmaze/pytesseract) | Image-to-text extraction |
|
|
144
|
+
| **Image Processing** | [Pillow](https://pillow.readthedocs.io/) | Image loading for OCR |
|
|
145
|
+
| **TF-IDF** | [scikit-learn](https://scikit-learn.org/) (TfidfVectorizer) | Directory name generation |
|
|
146
|
+
| **Package Manager** | [uv](https://docs.astral.sh/uv/) | Fast dependency management |
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Supported File Formats
|
|
151
|
+
|
|
152
|
+
| Category | Extensions | Extraction Method |
|
|
153
|
+
|---|---|---|
|
|
154
|
+
| **Plain Text / Code** | `.txt`, `.md`, `.csv`, `.json`, `.py`, `.js`, `.html` | Direct UTF-8 read |
|
|
155
|
+
| **Documents** | `.pdf` | PyMuPDF text extraction |
|
|
156
|
+
| **Images** | `.png`, `.jpg`, `.jpeg` | CLIP embeddings (visual) + Tesseract OCR (textual) |
|
|
157
|
+
|
|
158
|
+
> **Note:** Images are embedded using CLIP for visual similarity clustering. OCR is used separately when text content is needed (e.g., for directory naming).
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## Prerequisites
|
|
163
|
+
|
|
164
|
+
1. **Python 3.12+** — [Download](https://www.python.org/downloads/)
|
|
165
|
+
2. **uv** (recommended) — [Install](https://docs.astral.sh/uv/getting-started/installation/)
|
|
166
|
+
3. **Tesseract OCR** — Required for image text extraction.
|
|
167
|
+
- Download the Windows installer from [UB Mannheim](https://github.com/UB-Mannheim/tesseract/wiki).
|
|
168
|
+
- Ensure `tesseract.exe` is accessible via your system `PATH`.
|
|
169
|
+
|
|
170
|
+
---
|
|
171
|
+
|
|
172
|
+
## Installation
|
|
173
|
+
|
|
174
|
+
```bash
|
|
175
|
+
# Clone the repository
|
|
176
|
+
git clone https://github.com/44ompatil/Vortex.git
|
|
177
|
+
cd Vortex
|
|
178
|
+
|
|
179
|
+
# Install dependencies (uv recommended)
|
|
180
|
+
uv sync
|
|
181
|
+
|
|
182
|
+
# Or, using pip
|
|
183
|
+
pip install -e .
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
---
|
|
187
|
+
|
|
188
|
+
## Usage
|
|
189
|
+
|
|
190
|
+
Vortex exposes a `sort` command that orchestrates the full pipeline:
|
|
191
|
+
|
|
192
|
+
```bash
|
|
193
|
+
# Sort a directory
|
|
194
|
+
vortiq sort <target-directory>
|
|
195
|
+
|
|
196
|
+
# Example
|
|
197
|
+
vortiq sort "C:\Users\you\Downloads"
|
|
198
|
+
```
|
|
199
|
+
|
|
200
|
+
### Available Commands
|
|
201
|
+
|
|
202
|
+
| Command | Description |
|
|
203
|
+
|---|---|
|
|
204
|
+
| `sort <directory>` | Run the full dedup → cluster → organize pipeline on the target directory |
|
|
205
|
+
| `help` | Display available commands and usage information |
|
|
206
|
+
|
|
207
|
+
### What Happens When You Run `sort`
|
|
208
|
+
|
|
209
|
+
1. **Scan** — Recursively discovers all files in the target directory.
|
|
210
|
+
2. **Dedup** — Identifies and removes exact binary duplicates (SHA-256).
|
|
211
|
+
3. **Extract** — Pulls text/visual content from each unique file.
|
|
212
|
+
4. **Embed** — Generates vector embeddings (text: 384d, image: 512d).
|
|
213
|
+
5. **Cluster** — Groups semantically similar files using DBSCAN.
|
|
214
|
+
6. **Name** — Generates descriptive folder names via TF-IDF.
|
|
215
|
+
7. **Organize** — Moves files into their newly created, named folders.
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## How It Works
|
|
220
|
+
|
|
221
|
+
### Phase 1: Deduplication (`src/dedup/`)
|
|
222
|
+
|
|
223
|
+
Files are recursively discovered via `os.walk`. Each file is hashed using **SHA-256** with 4 MB chunked reads to handle large files efficiently. Files sharing the same hash are identified as exact duplicates — only one copy is kept, the rest are deleted.
|
|
224
|
+
|
|
225
|
+
### Phase 2: Semantic Clustering (`src/clustering/`)
|
|
226
|
+
|
|
227
|
+
Remaining unique files have their content extracted based on file type. Text content is embedded into 384-dimensional vectors using `BAAI/bge-small-en-v1.5`, while images are embedded into 512-dimensional vectors using `Qdrant/clip-ViT-B-32-vision`. All vectors are stored in a local Qdrant database.
|
|
228
|
+
|
|
229
|
+
**DBSCAN** (Density-Based Spatial Clustering of Applications with Noise) then groups files by semantic similarity using cosine distance. The algorithm parameters are tuned separately for each modality:
|
|
230
|
+
|
|
231
|
+
| Modality | `eps` | `min_samples` |
|
|
232
|
+
|---|---|---|
|
|
233
|
+
| Text | 0.20 | 2 |
|
|
234
|
+
| Image | 0.45 | 2 |
|
|
235
|
+
|
|
236
|
+
Each cluster is assigned a human-readable name generated by running **TF-IDF** on the combined text content of the cluster's files, extracting the top-2 most distinctive terms.
|
|
237
|
+
|
|
238
|
+
### Phase 3: Human-in-the-Loop (`src/hitl/`)
|
|
239
|
+
|
|
240
|
+
> *Coming soon.* This phase will present the proposed folder structure for interactive review before files are moved.
|
|
241
|
+
|
|
242
|
+
---
|
|
243
|
+
|
|
244
|
+
## Configuration
|
|
245
|
+
|
|
246
|
+
Currently, model parameters are configured in-code:
|
|
247
|
+
|
|
248
|
+
| Parameter | Location | Default | Description |
|
|
249
|
+
|---|---|---|---|
|
|
250
|
+
| `chunkSize` | `fileHash.py` | `4194304` (4 MB) | Byte chunk size for SHA-256 hashing |
|
|
251
|
+
| `txtEps` | `dbscanModel.py` | `0.20` | DBSCAN epsilon for text clusters |
|
|
252
|
+
| `txtMinPts` | `dbscanModel.py` | `2` | DBSCAN minimum points for text clusters |
|
|
253
|
+
| `imgEps` | `dbscanModel.py` | `0.45` | DBSCAN epsilon for image clusters |
|
|
254
|
+
| `imgMinPts` | `dbscanModel.py` | `2` | DBSCAN minimum points for image clusters |
|
|
255
|
+
| `top_n_words` | `dirNaming.py` | `2` | Number of TF-IDF terms used in folder names |
|
|
256
|
+
| `PageSize` | `dbscanModel.py` | `30` | Qdrant scroll page size |
|
|
257
|
+
|
|
258
|
+
---
|
|
259
|
+
|
|
260
|
+
## Roadmap
|
|
261
|
+
|
|
262
|
+
- [x] Recursive directory traversal
|
|
263
|
+
- [x] SHA-256 chunked deduplication
|
|
264
|
+
- [x] Multi-format content extraction (text, PDF, images)
|
|
265
|
+
- [x] Local text & image embedding generation
|
|
266
|
+
- [x] Qdrant vector storage
|
|
267
|
+
- [x] DBSCAN semantic clustering
|
|
268
|
+
- [x] TF-IDF auto-naming for directories
|
|
269
|
+
- [x] Typer CLI with Rich output
|
|
270
|
+
- [ ] Human-in-the-Loop interactive review
|
|
271
|
+
- [ ] Config file support (YAML/TOML)
|
|
272
|
+
- [ ] Cross-modal clustering (text + image in unified space)
|
|
273
|
+
- [ ] Undo / dry-run mode
|
|
274
|
+
- [ ] Progress bars and summary statistics
|
|
275
|
+
- [ ] Additional file format support (`.docx`, `.xlsx`, `.pptx`)
|
|
276
|
+
|
|
277
|
+
---
|
|
278
|
+
|
|
279
|
+
## Contributing
|
|
280
|
+
|
|
281
|
+
Contributions are welcome! Here's how to get started:
|
|
282
|
+
|
|
283
|
+
1. **Fork** the repository.
|
|
284
|
+
2. **Create a feature branch** — `git checkout -b feature/your-feature`
|
|
285
|
+
3. **Commit your changes** — `git commit -m "Add your feature"`
|
|
286
|
+
4. **Push to the branch** — `git push origin feature/your-feature`
|
|
287
|
+
5. **Open a Pull Request.**
|
|
288
|
+
|
|
289
|
+
Please ensure your code follows the existing style and includes appropriate documentation.
|
|
290
|
+
|
|
291
|
+
---
|
|
292
|
+
|
|
293
|
+
## License
|
|
294
|
+
|
|
295
|
+
This project is licensed under the **MIT License** — see the [LICENSE](LICENSE) file for details.
|
|
296
|
+
|
|
297
|
+
---
|
|
298
|
+
|
|
299
|
+
<p align="center">
|
|
300
|
+
<sub>Built with ❤️ for anyone drowning in unorganized files.</sub>
|
|
301
|
+
</p>
|
vortiq-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "Vortiq"
|
|
3
|
+
version = "0.1.0"
|
|
4
|
+
description = "A lightning-fast, zero-cloud CLI utility that cleans your Windows directories by parsing file contents and visual data — automatically destroying clones and generating intelligently named folder structures."
|
|
5
|
+
readme = "README.md"
|
|
6
|
+
requires-python = ">=3.12"
|
|
7
|
+
dependencies = [
|
|
8
|
+
"fastembed>=0.8.0",
|
|
9
|
+
"numpy>=2.4.6",
|
|
10
|
+
"pillow>=12.2.0",
|
|
11
|
+
"pymupdf>=1.27.2.3",
|
|
12
|
+
"pytesseract>=0.3.13",
|
|
13
|
+
"qdrant-client[fastembed]>=1.18.0",
|
|
14
|
+
"scikit-learn>=1.8.0",
|
|
15
|
+
]
|
|
16
|
+
authors = [
|
|
17
|
+
{name = "Om Patil", email = "44ompatil@gmail.com"}
|
|
18
|
+
]
|
|
19
|
+
|
|
20
|
+
[project.scripts]
|
|
21
|
+
vortiq = "vortiq.main:app"
|
|
22
|
+
|
|
23
|
+
[tool.setuptools]
|
|
24
|
+
package-dir = {"" = "src"}
|
|
25
|
+
|
|
26
|
+
[tool.setuptools.packages.find]
|
|
27
|
+
where = ["src"]
|
vortiq-0.1.0/setup.cfg
ADDED