hecvec 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hecvec-0.1.0/.devcontainer/Dockerfile +14 -0
- hecvec-0.1.0/.devcontainer/devcontainer.json +11 -0
- hecvec-0.1.0/.devcontainer/docker-compose.yml +39 -0
- hecvec-0.1.0/.gitignore +25 -0
- hecvec-0.1.0/PKG-INFO +182 -0
- hecvec-0.1.0/README.md +148 -0
- hecvec-0.1.0/pyproject.toml +53 -0
- hecvec-0.1.0/scripts/test_slice.py +70 -0
- hecvec-0.1.0/src/hecvec/__init__.py +34 -0
- hecvec-0.1.0/src/hecvec/chroma_client.py +58 -0
- hecvec-0.1.0/src/hecvec/chroma_list.py +27 -0
- hecvec-0.1.0/src/hecvec/chunking.py +69 -0
- hecvec-0.1.0/src/hecvec/cli.py +27 -0
- hecvec-0.1.0/src/hecvec/embeddings.py +39 -0
- hecvec-0.1.0/src/hecvec/env.py +26 -0
- hecvec-0.1.0/src/hecvec/hecvec.py +6 -0
- hecvec-0.1.0/src/hecvec/listdir.py +126 -0
- hecvec-0.1.0/src/hecvec/pipeline.py +187 -0
- hecvec-0.1.0/src/hecvec/reading.py +59 -0
- hecvec-0.1.0/src/hecvec/token_splitter.py +67 -0
- hecvec-0.1.0/tests/conftest.py +8 -0
- hecvec-0.1.0/tests/test_env.py +24 -0
- hecvec-0.1.0/tests/test_listdir.py +53 -0
- hecvec-0.1.0/tests/test_reading.py +27 -0
- hecvec-0.1.0/uv.lock +3612 -0
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
FROM mcr.microsoft.com/devcontainers/base:noble
|
|
2
|
+
|
|
3
|
+
# Instalar Python y dependencias
|
|
4
|
+
RUN apt-get update && apt-get install -y \
|
|
5
|
+
python3 \
|
|
6
|
+
python3-pip \
|
|
7
|
+
python3-venv \
|
|
8
|
+
&& rm -rf /var/lib/apt/lists/*
|
|
9
|
+
|
|
10
|
+
# Instalar boto3, chromadb-client y uv
|
|
11
|
+
RUN pip3 install --break-system-packages boto3 chromadb-client uv
|
|
12
|
+
|
|
13
|
+
# Crear directorio de trabajo
|
|
14
|
+
WORKDIR /workspace
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "Ubuntu",
|
|
3
|
+
"image": "mcr.microsoft.com/devcontainers/base:noble",
|
|
4
|
+
"features": {
|
|
5
|
+
"ghcr.io/devcontainers/features/aws-cli:1": {},
|
|
6
|
+
"ghcr.io/devcontainers/features/node:1": {},
|
|
7
|
+
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
|
|
8
|
+
"ghcr.io/va-h/devcontainers-features/uv:1": {}
|
|
9
|
+
},
|
|
10
|
+
"postCreateCommand": "uv sync --extra chroma"
|
|
11
|
+
}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
version: '3.8'
|
|
2
|
+
|
|
3
|
+
services:
|
|
4
|
+
app:
|
|
5
|
+
build:
|
|
6
|
+
context: .
|
|
7
|
+
dockerfile: Dockerfile
|
|
8
|
+
|
|
9
|
+
volumes:
|
|
10
|
+
- ../:/workspace:cached
|
|
11
|
+
- /Users/toro/.ssh:/home/vscode/.ssh-host:ro
|
|
12
|
+
- /Users/toro/.aws:/home/vscode/.aws:cached
|
|
13
|
+
|
|
14
|
+
command: sleep infinity
|
|
15
|
+
|
|
16
|
+
environment:
|
|
17
|
+
AWS_PROFILE: default
|
|
18
|
+
AWS_DEFAULT_REGION: us-east-2
|
|
19
|
+
|
|
20
|
+
depends_on:
|
|
21
|
+
- chromadb
|
|
22
|
+
|
|
23
|
+
network_mode: service:chromadb
|
|
24
|
+
|
|
25
|
+
chromadb:
|
|
26
|
+
image: chromadb/chroma:latest
|
|
27
|
+
ports:
|
|
28
|
+
- "8000:8000"
|
|
29
|
+
|
|
30
|
+
volumes:
|
|
31
|
+
- chroma-data:/chroma/data
|
|
32
|
+
|
|
33
|
+
environment:
|
|
34
|
+
- IS_PERSISTENT=TRUE
|
|
35
|
+
- ANONYMIZED_TELEMETRY=FALSE
|
|
36
|
+
|
|
37
|
+
volumes:
|
|
38
|
+
chroma-data:
|
|
39
|
+
driver: local
|
hecvec-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
# Environment and secrets
|
|
2
|
+
.env
|
|
3
|
+
.venv/
|
|
4
|
+
venv/
|
|
5
|
+
env/
|
|
6
|
+
|
|
7
|
+
# Build and publish
|
|
8
|
+
dist/
|
|
9
|
+
build/
|
|
10
|
+
*.egg-info/
|
|
11
|
+
*.egg
|
|
12
|
+
|
|
13
|
+
# IDE and OS
|
|
14
|
+
.idea/
|
|
15
|
+
.vscode/
|
|
16
|
+
*.swp
|
|
17
|
+
.DS_Store
|
|
18
|
+
|
|
19
|
+
# pytest
|
|
20
|
+
.pytest_cache/
|
|
21
|
+
.coverage
|
|
22
|
+
htmlcov/
|
|
23
|
+
|
|
24
|
+
# uv
|
|
25
|
+
.uv/
|
hecvec-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hecvec
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Keywords: chunking,document-pipeline,listdir,text-files
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Requires-Python: <3.14,>=3.9
|
|
18
|
+
Requires-Dist: chromadb>=0.4.0
|
|
19
|
+
Requires-Dist: langchain-text-splitters>=0.2.0
|
|
20
|
+
Requires-Dist: openai>=1.0.0
|
|
21
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
22
|
+
Requires-Dist: tiktoken>=0.5.0
|
|
23
|
+
Provides-Extra: chroma
|
|
24
|
+
Requires-Dist: chromadb>=0.4.0; extra == 'chroma'
|
|
25
|
+
Requires-Dist: langchain-text-splitters>=0.2.0; extra == 'chroma'
|
|
26
|
+
Requires-Dist: openai>=1.0.0; extra == 'chroma'
|
|
27
|
+
Requires-Dist: python-dotenv>=1.0.0; extra == 'chroma'
|
|
28
|
+
Requires-Dist: tiktoken>=0.5.0; extra == 'chroma'
|
|
29
|
+
Provides-Extra: chunk
|
|
30
|
+
Requires-Dist: langchain-text-splitters>=0.2.0; extra == 'chunk'
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# HecVec
|
|
36
|
+
|
|
37
|
+
List directories with a safe root, filter `.txt`/`.md` files, read them as text, and optionally chunk and push to Chroma — **library only, no API**.
|
|
38
|
+
|
|
39
|
+
## Install
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install hecvec
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
One-call pipeline (list → filter → token-chunk → Chroma):
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install hecvec[chroma]
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Optional chunking only (no Chroma):
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install hecvec[chunk]
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Usage
|
|
58
|
+
|
|
59
|
+
### One-call pipeline (list → filter → chunk → Chroma)
|
|
60
|
+
|
|
61
|
+
Runs entirely in the library (no API). You need Chroma running (e.g. `docker run -p 8000:8000 chromadb/chroma`) and `OPENAI_API_KEY` set (in the environment or in a `.env` file; the library loads `.env` via python-dotenv when you use `hecvec[chroma]`).
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import hecvec
|
|
65
|
+
|
|
66
|
+
# Class-style: use defaults, then slice
|
|
67
|
+
test = hecvec.HecVec()
|
|
68
|
+
result = test.slice(path="/path/to/folder")
|
|
69
|
+
# → {"files": N, "chunks": M, "collection": "hecvec"}
|
|
70
|
+
|
|
71
|
+
# Or call slice on the class (same flow)
|
|
72
|
+
result = hecvec.HecVec.slice(path="/path/to/folder")
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Flow: resolve path → listdir → filter `.txt`/`.md` → token-chunk (200 tokens, `cl100k_base`) → embed with OpenAI → push to Chroma.
|
|
76
|
+
|
|
77
|
+
Optional config (instance or `HecVec.slice(..., key=value)`):
|
|
78
|
+
|
|
79
|
+
- `root`, `collection_name`, `chroma_host`, `chroma_port`
|
|
80
|
+
- `embedding_model`, `chunk_size`, `chunk_overlap`, `encoding_name`, `batch_size`
|
|
81
|
+
- `openai_api_key` (or set `OPENAI_API_KEY` in the environment or in a `.env` file; optional `dotenv_path` to point to a specific `.env`)
|
|
82
|
+
|
|
83
|
+
### Low-level building blocks
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from pathlib import Path
|
|
87
|
+
from hecvec import ListDir, ListDirTextFiles, ReadText
|
|
88
|
+
|
|
89
|
+
root = Path("/path/to/repo")
|
|
90
|
+
|
|
91
|
+
# List all entries under a path (restricted to root)
|
|
92
|
+
lister = ListDir(root=root)
|
|
93
|
+
for rel in lister.listdir("."):
|
|
94
|
+
print(rel)
|
|
95
|
+
|
|
96
|
+
# Only .txt and .md files, recursively
|
|
97
|
+
text_lister = ListDirTextFiles(root=root)
|
|
98
|
+
paths = text_lister.listdir_recursive_txt_md("docs")
|
|
99
|
+
|
|
100
|
+
# Read each file as text
|
|
101
|
+
reader = ReadText(paths)
|
|
102
|
+
for path, text in reader:
|
|
103
|
+
print(path, len(text))
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Chunking (optional)
|
|
107
|
+
|
|
108
|
+
With `pip install hecvec[chunk]`:
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
from hecvec import ListDirTextFiles, ReadText
|
|
112
|
+
from hecvec.chunking import chunk_documents
|
|
113
|
+
|
|
114
|
+
lister = ListDirTextFiles(root=root)
|
|
115
|
+
paths = lister.listdir_recursive_txt_md(".")
|
|
116
|
+
reader = ReadText(paths)
|
|
117
|
+
path_and_text = reader.read_all()
|
|
118
|
+
chunks = chunk_documents(path_and_text)
|
|
119
|
+
# list of {"path": "...", "chunk_index": 0, "content": "..."}
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### CLI
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
hecvec-listdir [path] [root]
|
|
126
|
+
# or
|
|
127
|
+
python -m hecvec.cli [path] [root]
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Test the full pipeline (the method that does everything)
|
|
131
|
+
|
|
132
|
+
From the project root, with Chroma running and `OPENAI_API_KEY` set (e.g. in `.env`):
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
# Start Chroma (one terminal)
|
|
136
|
+
docker run -p 8000:8000 chromadb/chroma
|
|
137
|
+
|
|
138
|
+
# Run the test script (another terminal)
|
|
139
|
+
uv run python scripts/test_slice.py
|
|
140
|
+
# or: python scripts/test_slice.py
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
The script runs `HecVec.slice(path=...)` on the path you pass (default: the project root), logs each step, and prints the result (`files`, `chunks`, `collection`), exiting non-zero if the path is invalid.
|
|
144
|
+
|
|
145
|
+
### Modular layout (easy to study)
|
|
146
|
+
|
|
147
|
+
Each step of the pipeline lives in its own module:
|
|
148
|
+
|
|
149
|
+
| Module | Responsibility |
|
|
150
|
+
|--------|-----------------|
|
|
151
|
+
| `hecvec.env` | Load `.env` and `OPENAI_API_KEY` |
|
|
152
|
+
| `hecvec.listdir` | List dirs under a safe root; filter by extension (`.txt`/`.md`) |
|
|
153
|
+
| `hecvec.reading` | Read files as text (UTF-8 / latin-1 / cp1252 fallback) |
|
|
154
|
+
| `hecvec.token_splitter` | Token-based chunking (TokenTextSplitter) |
|
|
155
|
+
| `hecvec.chunking` | Recursive-character chunking (RecursiveCharacterTextSplitter) |
|
|
156
|
+
| `hecvec.embeddings` | OpenAI embeddings (`embed_texts`) |
|
|
157
|
+
| `hecvec.chroma_client` | Chroma client, get/create collection, add documents |
|
|
158
|
+
| `hecvec.chroma_list` | List Chroma collections and counts |
|
|
159
|
+
| `hecvec.pipeline` | Orchestrator: `HecVec` and `slice(path=...)` |
|
|
160
|
+
|
|
161
|
+
Example: use one step on its own:
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from hecvec import embed_texts, token_chunk_text, list_collections
|
|
165
|
+
|
|
166
|
+
chunks = token_chunk_text("Some long document...", chunk_size=200)
|
|
167
|
+
vecs = embed_texts(chunks, api_key="sk-...")
|
|
168
|
+
names_and_counts = list_collections(host="localhost", port=8000)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Development
|
|
172
|
+
|
|
173
|
+
From the repo root:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
uv sync
|
|
177
|
+
uv run python -c "from hecvec import ListDir; print(ListDir('.').listdir('.'))"
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## License
|
|
181
|
+
|
|
182
|
+
MIT
|
hecvec-0.1.0/README.md
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# HecVec
|
|
2
|
+
|
|
3
|
+
List directories with a safe root, filter `.txt`/`.md` files, read them as text, and optionally chunk and push to Chroma — **library only, no API**.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install hecvec
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
One-call pipeline (list → filter → token-chunk → Chroma):
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install hecvec[chroma]
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
Optional chunking only (no Chroma):
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
pip install hecvec[chunk]
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Usage
|
|
24
|
+
|
|
25
|
+
### One-call pipeline (list → filter → chunk → Chroma)
|
|
26
|
+
|
|
27
|
+
Runs entirely in the library (no API). You need Chroma running (e.g. `docker run -p 8000:8000 chromadb/chroma`) and `OPENAI_API_KEY` set (in the environment or in a `.env` file; the library loads `.env` via python-dotenv when you use `hecvec[chroma]`).
|
|
28
|
+
|
|
29
|
+
```python
|
|
30
|
+
import hecvec
|
|
31
|
+
|
|
32
|
+
# Class-style: use defaults, then slice
|
|
33
|
+
test = hecvec.HecVec()
|
|
34
|
+
result = test.slice(path="/path/to/folder")
|
|
35
|
+
# → {"files": N, "chunks": M, "collection": "hecvec"}
|
|
36
|
+
|
|
37
|
+
# Or call slice on the class (same flow)
|
|
38
|
+
result = hecvec.HecVec.slice(path="/path/to/folder")
|
|
39
|
+
```
|
|
40
|
+
|
|
41
|
+
Flow: resolve path → listdir → filter `.txt`/`.md` → token-chunk (200 tokens, `cl100k_base`) → embed with OpenAI → push to Chroma.
|
|
42
|
+
|
|
43
|
+
Optional config (instance or `HecVec.slice(..., key=value)`):
|
|
44
|
+
|
|
45
|
+
- `root`, `collection_name`, `chroma_host`, `chroma_port`
|
|
46
|
+
- `embedding_model`, `chunk_size`, `chunk_overlap`, `encoding_name`, `batch_size`
|
|
47
|
+
- `openai_api_key` (or set `OPENAI_API_KEY` in the environment or in a `.env` file; optional `dotenv_path` to point to a specific `.env`)
|
|
48
|
+
|
|
49
|
+
### Low-level building blocks
|
|
50
|
+
|
|
51
|
+
```python
|
|
52
|
+
from pathlib import Path
|
|
53
|
+
from hecvec import ListDir, ListDirTextFiles, ReadText
|
|
54
|
+
|
|
55
|
+
root = Path("/path/to/repo")
|
|
56
|
+
|
|
57
|
+
# List all entries under a path (restricted to root)
|
|
58
|
+
lister = ListDir(root=root)
|
|
59
|
+
for rel in lister.listdir("."):
|
|
60
|
+
print(rel)
|
|
61
|
+
|
|
62
|
+
# Only .txt and .md files, recursively
|
|
63
|
+
text_lister = ListDirTextFiles(root=root)
|
|
64
|
+
paths = text_lister.listdir_recursive_txt_md("docs")
|
|
65
|
+
|
|
66
|
+
# Read each file as text
|
|
67
|
+
reader = ReadText(paths)
|
|
68
|
+
for path, text in reader:
|
|
69
|
+
print(path, len(text))
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
### Chunking (optional)
|
|
73
|
+
|
|
74
|
+
With `pip install hecvec[chunk]`:
|
|
75
|
+
|
|
76
|
+
```python
|
|
77
|
+
from hecvec import ListDirTextFiles, ReadText
|
|
78
|
+
from hecvec.chunking import chunk_documents
|
|
79
|
+
|
|
80
|
+
lister = ListDirTextFiles(root=root)
|
|
81
|
+
paths = lister.listdir_recursive_txt_md(".")
|
|
82
|
+
reader = ReadText(paths)
|
|
83
|
+
path_and_text = reader.read_all()
|
|
84
|
+
chunks = chunk_documents(path_and_text)
|
|
85
|
+
# list of {"path": "...", "chunk_index": 0, "content": "..."}
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
### CLI
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
hecvec-listdir [path] [root]
|
|
92
|
+
# or
|
|
93
|
+
python -m hecvec.cli [path] [root]
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
### Test the full pipeline (the method that does everything)
|
|
97
|
+
|
|
98
|
+
From the project root, with Chroma running and `OPENAI_API_KEY` set (e.g. in `.env`):
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
# Start Chroma (one terminal)
|
|
102
|
+
docker run -p 8000:8000 chromadb/chroma
|
|
103
|
+
|
|
104
|
+
# Run the test script (another terminal)
|
|
105
|
+
uv run python scripts/test_slice.py
|
|
106
|
+
# or: python scripts/test_slice.py
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
The script runs `HecVec.slice(path=...)` on the path you pass (default: the project root), logs each step, and prints the result (`files`, `chunks`, `collection`), exiting non-zero if the path is invalid.
|
|
110
|
+
|
|
111
|
+
### Modular layout (easy to study)
|
|
112
|
+
|
|
113
|
+
Each step of the pipeline lives in its own module:
|
|
114
|
+
|
|
115
|
+
| Module | Responsibility |
|
|
116
|
+
|--------|-----------------|
|
|
117
|
+
| `hecvec.env` | Load `.env` and `OPENAI_API_KEY` |
|
|
118
|
+
| `hecvec.listdir` | List dirs under a safe root; filter by extension (`.txt`/`.md`) |
|
|
119
|
+
| `hecvec.reading` | Read files as text (UTF-8 / latin-1 / cp1252 fallback) |
|
|
120
|
+
| `hecvec.token_splitter` | Token-based chunking (TokenTextSplitter) |
|
|
121
|
+
| `hecvec.chunking` | Recursive-character chunking (RecursiveCharacterTextSplitter) |
|
|
122
|
+
| `hecvec.embeddings` | OpenAI embeddings (`embed_texts`) |
|
|
123
|
+
| `hecvec.chroma_client` | Chroma client, get/create collection, add documents |
|
|
124
|
+
| `hecvec.chroma_list` | List Chroma collections and counts |
|
|
125
|
+
| `hecvec.pipeline` | Orchestrator: `HecVec` and `slice(path=...)` |
|
|
126
|
+
|
|
127
|
+
Example: use one step on its own:
|
|
128
|
+
|
|
129
|
+
```python
|
|
130
|
+
from hecvec import embed_texts, token_chunk_text, list_collections
|
|
131
|
+
|
|
132
|
+
chunks = token_chunk_text("Some long document...", chunk_size=200)
|
|
133
|
+
vecs = embed_texts(chunks, api_key="sk-...")
|
|
134
|
+
names_and_counts = list_collections(host="localhost", port=8000)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
## Development
|
|
138
|
+
|
|
139
|
+
From the repo root:
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
uv sync
|
|
143
|
+
uv run python -c "from hecvec import ListDir; print(ListDir('.').listdir('.'))"
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
## License
|
|
147
|
+
|
|
148
|
+
MIT
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "hecvec"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9,<3.14"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
authors = []
|
|
13
|
+
keywords = ["listdir", "text-files", "chunking", "document-pipeline"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 3 - Alpha",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Programming Language :: Python :: 3",
|
|
19
|
+
"Programming Language :: Python :: 3.9",
|
|
20
|
+
"Programming Language :: Python :: 3.10",
|
|
21
|
+
"Programming Language :: Python :: 3.11",
|
|
22
|
+
"Programming Language :: Python :: 3.12",
|
|
23
|
+
"Programming Language :: Python :: 3.13",
|
|
24
|
+
"Topic :: Software Development :: Libraries :: Python Modules",
|
|
25
|
+
]
|
|
26
|
+
dependencies = [
|
|
27
|
+
"chromadb>=0.4.0",
|
|
28
|
+
"langchain-text-splitters>=0.2.0",
|
|
29
|
+
"openai>=1.0.0",
|
|
30
|
+
"python-dotenv>=1.0.0",
|
|
31
|
+
"tiktoken>=0.5.0",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
[project.optional-dependencies]
|
|
35
|
+
chunk = ["langchain-text-splitters>=0.2.0"]
|
|
36
|
+
chroma = [
|
|
37
|
+
"chromadb>=0.4.0",
|
|
38
|
+
"langchain-text-splitters>=0.2.0",
|
|
39
|
+
"openai>=1.0.0",
|
|
40
|
+
"python-dotenv>=1.0.0",
|
|
41
|
+
"tiktoken>=0.5.0",
|
|
42
|
+
]
|
|
43
|
+
dev = ["pytest>=7.0.0"]
|
|
44
|
+
|
|
45
|
+
[tool.pytest.ini_options]
|
|
46
|
+
testpaths = ["tests"]
|
|
47
|
+
pythonpath = ["src"]
|
|
48
|
+
|
|
49
|
+
[project.scripts]
|
|
50
|
+
hecvec-listdir = "hecvec.cli:main"
|
|
51
|
+
|
|
52
|
+
[tool.hatch.build.targets.wheel]
|
|
53
|
+
packages = ["src/hecvec"]
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
#!/usr/bin/env python3
"""
Smoke-test the all-in-one HecVec pipeline on a path of your choosing.

listdir → filter .txt/.md → token-chunk → embed → Chroma

Requirements:
- pip install hecvec[chroma]
- Chroma running: docker run -p 8000:8000 chromadb/chroma
- OPENAI_API_KEY set (env or .env in project root)

Usage (from project root):
    uv run python scripts/test_slice.py /path/to/folder
    uv run python scripts/test_slice.py .
    python scripts/test_slice.py /path/to/folder
"""
import logging
import sys
from pathlib import Path

# Make src/ importable when running straight from a checkout (no install needed).
ROOT = Path(__file__).resolve().parent.parent
SRC = ROOT / "src"
if str(SRC) not in sys.path:
    sys.path.insert(0, str(SRC))

# Fallback path to slice when no CLI argument is given: the project root itself.
DEFAULT_PATH = ROOT


def main() -> int:
    """Run ``HecVec.slice()`` on argv[1] (or DEFAULT_PATH) and report the result.

    Returns 0 on success, 1 when the path is missing, not a file/directory,
    or a file with an unsupported extension.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )
    logger = logging.getLogger(__name__)

    # Imported after the sys.path tweak above so the in-repo package is found.
    from hecvec import HecVec

    if len(sys.argv) >= 2:
        target = Path(sys.argv[1]).expanduser().resolve()
        logger.info("Path given: %s", target)
    else:
        target = DEFAULT_PATH
        logger.info("No path given, using default: %s", target)

    # Guard clauses: validate the target before touching the pipeline.
    if not target.exists():
        logger.error("Path does not exist: %s", target)
        print(f"Error: path does not exist: {target}")
        return 1
    if not target.is_dir() and not target.is_file():
        logger.error("Not a file or directory: %s", target)
        print(f"Error: not a file or directory: {target}")
        return 1
    if target.is_file() and target.suffix.lower() not in (".txt", ".md"):
        logger.error("File must be .txt or .md: %s", target)
        print(f"Error: file must be .txt or .md: {target}")
        return 1

    test = HecVec()
    result = test.slice(path=target)

    logger.info("Result: %s", result)
    print("Result:", result)
    return 0


if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
HecVec: modular library — listdir, read, chunk (token/recursive), embed, Chroma. No API.
"""
# Re-export the public surface of each submodule at package level so users can
# write `from hecvec import X` without knowing the internal module layout.
from hecvec.chunking import chunk_documents, chunk_text
from hecvec.chroma_client import add_documents, get_client, get_or_create_collection
from hecvec.chroma_list import list_collections
from hecvec.embeddings import embed_texts
from hecvec.env import load_dotenv_if_available, load_openai_key
from hecvec.listdir import ALLOWED_EXTENSIONS, ListDir, ListDirTextFiles
from hecvec.pipeline import HecVec
from hecvec.reading import ReadText
from hecvec.token_splitter import token_chunk_documents, token_chunk_text

# Explicit public API (alphabetical); keep in sync with the imports above.
__all__ = [
    "ALLOWED_EXTENSIONS",
    "HecVec",
    "ListDir",
    "ListDirTextFiles",
    "ReadText",
    "add_documents",
    "chunk_documents",
    "chunk_text",
    "embed_texts",
    "get_client",
    "get_or_create_collection",
    "list_collections",
    "load_dotenv_if_available",
    "load_openai_key",
    "token_chunk_documents",
    "token_chunk_text",
    "__version__",
]

# NOTE: keep in sync with [project].version in pyproject.toml.
__version__ = "0.1.0"
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Chroma client and collection operations. One module = one responsibility: connect and add documents.
|
|
3
|
+
Requires: pip install hecvec[chroma] (chromadb).
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
import chromadb
|
|
11
|
+
|
|
12
|
+
DEFAULT_HOST = "localhost"
DEFAULT_PORT = 8000


def get_client(host: str = DEFAULT_HOST, port: int = DEFAULT_PORT):
    """Return a Chroma HTTP client for ``host:port``.

    If the server cannot be reached, degrade gracefully to an in-memory
    (ephemeral) client so callers always receive a usable client object.
    Note that data written to the ephemeral fallback is not persisted.
    """
    import chromadb  # deferred: chromadb ships with the optional [chroma] extra

    try:
        client = chromadb.HttpClient(host=host, port=port)
    except Exception:
        # Server unreachable — fall back to a process-local client.
        client = chromadb.EphemeralClient()
    return client
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_or_create_collection(
    client: "chromadb.HttpClient",
    name: str,
    metadata: dict | None = None,
):
    """Fetch collection *name* from *client*, creating it if absent.

    When *metadata* is omitted, the collection is configured for cosine
    similarity via the ``hnsw:space`` setting.
    """
    effective_metadata = metadata if metadata is not None else {"hnsw:space": "cosine"}
    return client.get_or_create_collection(name=name, metadata=effective_metadata)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def add_documents(
    client: "chromadb.HttpClient",
    collection_name: str,
    ids: list[str],
    embeddings: list[list[float]],
    documents: list[str],
) -> None:
    """Insert documents with precomputed embeddings into *collection_name*.

    If the insert fails because the embedding dimension no longer matches the
    collection's existing dimension (e.g. the embedding model changed), the
    collection is dropped, recreated with cosine similarity, and the insert is
    retried once. NOTE: that recreate path discards previously stored data.
    """
    import chromadb  # deferred: chromadb ships with the optional [chroma] extra

    collection = get_or_create_collection(client, collection_name)
    try:
        collection.add(ids=ids, embeddings=embeddings, documents=documents)
    except chromadb.errors.InvalidArgumentError as exc:
        if "dimension" not in str(exc).lower():
            # Not a dimension mismatch — surface the original error.
            raise
        client.delete_collection(name=collection_name)
        fresh = client.create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"},
        )
        fresh.add(ids=ids, embeddings=embeddings, documents=documents)
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
List Chroma collections. One module = one responsibility: inspect collections on a Chroma server.
|
|
3
|
+
Requires: pip install hecvec[chroma] (chromadb).
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
import chromadb
|
|
11
|
+
|
|
12
|
+
DEFAULT_HOST = "localhost"
DEFAULT_PORT = 8000


def list_collections(
    host: str = DEFAULT_HOST,
    port: int = DEFAULT_PORT,
) -> list[tuple[str, int]]:
    """
    List all collection names and their document counts on a Chroma server.

    Returns [(name, count), ...].

    Compatible with both chromadb client APIs: before 0.6,
    ``Client.list_collections()`` returns Collection objects; in 0.6.x it
    returns plain name strings, which previously made ``c.name`` raise
    ``AttributeError`` here. String entries are resolved via
    ``get_collection`` before counting.
    """
    import chromadb  # deferred: chromadb ships with the optional [chroma] extra

    client = chromadb.HttpClient(host=host, port=port)
    result: list[tuple[str, int]] = []
    for item in client.list_collections():
        if isinstance(item, str):
            # chromadb 0.6.x: list_collections() yields names only.
            coll = client.get_collection(name=item)
        else:
            coll = item
        result.append((coll.name, coll.count()))
    return result
|