hecvec 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ FROM mcr.microsoft.com/devcontainers/base:noble
2
+
3
+ # Install Python and dependencies
4
+ RUN apt-get update && apt-get install -y \
5
+ python3 \
6
+ python3-pip \
7
+ python3-venv \
8
+ && rm -rf /var/lib/apt/lists/*
9
+
10
+ # Install boto3, chromadb-client and uv
11
+ RUN pip3 install --break-system-packages boto3 chromadb-client uv
12
+
13
+ # Set (and create) the working directory
14
+ WORKDIR /workspace
@@ -0,0 +1,11 @@
1
+ {
2
+ "name": "Ubuntu",
3
+ "image": "mcr.microsoft.com/devcontainers/base:noble",
4
+ "features": {
5
+ "ghcr.io/devcontainers/features/aws-cli:1": {},
6
+ "ghcr.io/devcontainers/features/node:1": {},
7
+ "ghcr.io/devcontainers/features/docker-in-docker:2": {},
8
+ "ghcr.io/va-h/devcontainers-features/uv:1": {}
9
+ },
10
+ "postCreateCommand": "uv sync --extra chroma"
11
+ }
@@ -0,0 +1,39 @@
1
+ version: '3.8'
2
+
3
+ services:
4
+ app:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+
9
+ volumes:
10
+ - ../:/workspace:cached
11
+ - /Users/toro/.ssh:/home/vscode/.ssh-host:ro
12
+ - /Users/toro/.aws:/home/vscode/.aws:cached
13
+
14
+ command: sleep infinity
15
+
16
+ environment:
17
+ AWS_PROFILE: default
18
+ AWS_DEFAULT_REGION: us-east-2
19
+
20
+ depends_on:
21
+ - chromadb
22
+
23
+ network_mode: service:chromadb
24
+
25
+ chromadb:
26
+ image: chromadb/chroma:latest
27
+ ports:
28
+ - "8000:8000"
29
+
30
+ volumes:
31
+ - chroma-data:/chroma/data
32
+
33
+ environment:
34
+ - IS_PERSISTENT=TRUE
35
+ - ANONYMIZED_TELEMETRY=FALSE
36
+
37
+ volumes:
38
+ chroma-data:
39
+ driver: local
@@ -0,0 +1,25 @@
1
+ # Environment and secrets
2
+ .env
3
+ .venv/
4
+ venv/
5
+ env/
6
+
7
+ # Build and publish
8
+ dist/
9
+ build/
10
+ *.egg-info/
11
+ *.egg
12
+
13
+ # IDE and OS
14
+ .idea/
15
+ .vscode/
16
+ *.swp
17
+ .DS_Store
18
+
19
+ # pytest
20
+ .pytest_cache/
21
+ .coverage
22
+ htmlcov/
23
+
24
+ # uv
25
+ .uv/
hecvec-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,182 @@
1
+ Metadata-Version: 2.4
2
+ Name: hecvec
3
+ Version: 0.1.0
4
+ Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
5
+ License-Expression: MIT
6
+ Keywords: chunking,document-pipeline,listdir,text-files
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Requires-Python: <3.14,>=3.9
18
+ Requires-Dist: chromadb>=0.4.0
19
+ Requires-Dist: langchain-text-splitters>=0.2.0
20
+ Requires-Dist: openai>=1.0.0
21
+ Requires-Dist: python-dotenv>=1.0.0
22
+ Requires-Dist: tiktoken>=0.5.0
23
+ Provides-Extra: chroma
24
+ Requires-Dist: chromadb>=0.4.0; extra == 'chroma'
25
+ Requires-Dist: langchain-text-splitters>=0.2.0; extra == 'chroma'
26
+ Requires-Dist: openai>=1.0.0; extra == 'chroma'
27
+ Requires-Dist: python-dotenv>=1.0.0; extra == 'chroma'
28
+ Requires-Dist: tiktoken>=0.5.0; extra == 'chroma'
29
+ Provides-Extra: chunk
30
+ Requires-Dist: langchain-text-splitters>=0.2.0; extra == 'chunk'
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
33
+ Description-Content-Type: text/markdown
34
+
35
+ # HecVec
36
+
37
+ List directories with a safe root, filter `.txt`/`.md` files, read them as text, and optionally chunk and push to Chroma — **library only, no API**.
38
+
39
+ ## Install
40
+
41
+ ```bash
42
+ pip install hecvec
43
+ ```
44
+
45
+ One-call pipeline (list → filter → token-chunk → Chroma):
46
+
47
+ ```bash
48
+ pip install hecvec[chroma]
49
+ ```
50
+
51
+ Optional chunking only (no Chroma):
52
+
53
+ ```bash
54
+ pip install hecvec[chunk]
55
+ ```
56
+
57
+ ## Usage
58
+
59
+ ### One-call pipeline (list → filter → chunk → Chroma)
60
+
61
+ Runs entirely in the library (no API). You need Chroma running (e.g. `docker run -p 8000:8000 chromadb/chroma`) and `OPENAI_API_KEY` set (in the environment or in a `.env` file; the library loads `.env` via python-dotenv when you use `hecvec[chroma]`).
62
+
63
+ ```python
64
+ import hecvec
65
+
66
+ # Class-style: use defaults, then slice
67
+ test = hecvec.HecVec()
68
+ result = test.slice(path="/path/to/folder")
69
+ # → {"files": N, "chunks": M, "collection": "hecvec"}
70
+
71
+ # Or call slice on the class (same flow)
72
+ result = hecvec.HecVec.slice(path="/path/to/folder")
73
+ ```
74
+
75
+ Flow: resolve path → listdir → filter `.txt`/`.md` → token-chunk (200 tokens, `cl100k_base`) → embed with OpenAI → push to Chroma.
76
+
77
+ Optional config (instance or `HecVec.slice(..., key=value)`):
78
+
79
+ - `root`, `collection_name`, `chroma_host`, `chroma_port`
80
+ - `embedding_model`, `chunk_size`, `chunk_overlap`, `encoding_name`, `batch_size`
81
+ - `openai_api_key` (or set `OPENAI_API_KEY` in the environment or in a `.env` file; optional `dotenv_path` to point to a specific `.env`)
82
+
83
+ ### Low-level building blocks
84
+
85
+ ```python
86
+ from pathlib import Path
87
+ from hecvec import ListDir, ListDirTextFiles, ReadText
88
+
89
+ root = Path("/path/to/repo")
90
+
91
+ # List all entries under a path (restricted to root)
92
+ lister = ListDir(root=root)
93
+ for rel in lister.listdir("."):
94
+ print(rel)
95
+
96
+ # Only .txt and .md files, recursively
97
+ text_lister = ListDirTextFiles(root=root)
98
+ paths = text_lister.listdir_recursive_txt_md("docs")
99
+
100
+ # Read each file as text
101
+ reader = ReadText(paths)
102
+ for path, text in reader:
103
+ print(path, len(text))
104
+ ```
105
+
106
+ ### Chunking (optional)
107
+
108
+ With `pip install hecvec[chunk]`:
109
+
110
+ ```python
111
+ from hecvec import ListDirTextFiles, ReadText
112
+ from hecvec.chunking import chunk_documents
113
+
114
+ lister = ListDirTextFiles(root=root)
115
+ paths = lister.listdir_recursive_txt_md(".")
116
+ reader = ReadText(paths)
117
+ path_and_text = reader.read_all()
118
+ chunks = chunk_documents(path_and_text)
119
+ # list of {"path": "...", "chunk_index": 0, "content": "..."}
120
+ ```
121
+
122
+ ### CLI
123
+
124
+ ```bash
125
+ hecvec-listdir [path] [root]
126
+ # or
127
+ python -m hecvec.cli [path] [root]
128
+ ```
129
+
130
+ ### Test the full pipeline (the method that does everything)
131
+
132
+ From the project root, with Chroma running and `OPENAI_API_KEY` set (e.g. in `.env`):
133
+
134
+ ```bash
135
+ # Start Chroma (one terminal)
136
+ docker run -p 8000:8000 chromadb/chroma
137
+
138
+ # Run the test script (another terminal)
139
+ uv run python scripts/test_slice.py
140
+ # or: python scripts/test_slice.py
141
+ ```
142
+
143
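+ The script runs `HecVec().slice(path=...)` on the folder (or single `.txt`/`.md` file) passed as its first argument, defaulting to the project root when no argument is given, and prints `Result:` followed by the returned summary (`files`, `chunks`, `collection`). It exits with status 1 if the path does not exist or is a file with another extension.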
144
+
145
+ ### Modular layout (easy to study)
146
+
147
+ Each step of the pipeline lives in its own module:
148
+
149
+ | Module | Responsibility |
150
+ |--------|-----------------|
151
+ | `hecvec.env` | Load `.env` and `OPENAI_API_KEY` |
152
+ | `hecvec.listdir` | List dirs under a safe root; filter by extension (`.txt`/`.md`) |
153
+ | `hecvec.reading` | Read files as text (UTF-8 / latin-1 / cp1252 fallback) |
154
+ | `hecvec.token_splitter` | Token-based chunking (TokenTextSplitter) |
155
+ | `hecvec.chunking` | Recursive-character chunking (RecursiveCharacterTextSplitter) |
156
+ | `hecvec.embeddings` | OpenAI embeddings (`embed_texts`) |
157
+ | `hecvec.chroma_client` | Chroma client, get/create collection, add documents |
158
+ | `hecvec.chroma_list` | List Chroma collections and counts |
159
+ | `hecvec.pipeline` | Orchestrator: `HecVec` and `slice(path=...)` |
160
+
161
+ Example: use one step on its own:
162
+
163
+ ```python
164
+ from hecvec import embed_texts, token_chunk_text, list_collections
165
+
166
+ chunks = token_chunk_text("Some long document...", chunk_size=200)
167
+ vecs = embed_texts(chunks, api_key="sk-...")
168
+ names_and_counts = list_collections(host="localhost", port=8000)
169
+ ```
170
+
171
+ ## Development
172
+
173
+ From the repo root:
174
+
175
+ ```bash
176
+ uv sync
177
+ uv run python -c "from hecvec import ListDir; print(ListDir('.').listdir('.'))"
178
+ ```
179
+
180
+ ## License
181
+
182
+ MIT
hecvec-0.1.0/README.md ADDED
@@ -0,0 +1,148 @@
1
+ # HecVec
2
+
3
+ List directories with a safe root, filter `.txt`/`.md` files, read them as text, and optionally chunk and push to Chroma — **library only, no API**.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install hecvec
9
+ ```
10
+
11
+ One-call pipeline (list → filter → token-chunk → Chroma):
12
+
13
+ ```bash
14
+ pip install hecvec[chroma]
15
+ ```
16
+
17
+ Optional chunking only (no Chroma):
18
+
19
+ ```bash
20
+ pip install hecvec[chunk]
21
+ ```
22
+
23
+ ## Usage
24
+
25
+ ### One-call pipeline (list → filter → chunk → Chroma)
26
+
27
+ Runs entirely in the library (no API). You need Chroma running (e.g. `docker run -p 8000:8000 chromadb/chroma`) and `OPENAI_API_KEY` set (in the environment or in a `.env` file; the library loads `.env` via python-dotenv when you use `hecvec[chroma]`).
28
+
29
+ ```python
30
+ import hecvec
31
+
32
+ # Class-style: use defaults, then slice
33
+ test = hecvec.HecVec()
34
+ result = test.slice(path="/path/to/folder")
35
+ # → {"files": N, "chunks": M, "collection": "hecvec"}
36
+
37
+ # Or call slice on the class (same flow)
38
+ result = hecvec.HecVec.slice(path="/path/to/folder")
39
+ ```
40
+
41
+ Flow: resolve path → listdir → filter `.txt`/`.md` → token-chunk (200 tokens, `cl100k_base`) → embed with OpenAI → push to Chroma.
42
+
43
+ Optional config (instance or `HecVec.slice(..., key=value)`):
44
+
45
+ - `root`, `collection_name`, `chroma_host`, `chroma_port`
46
+ - `embedding_model`, `chunk_size`, `chunk_overlap`, `encoding_name`, `batch_size`
47
+ - `openai_api_key` (or set `OPENAI_API_KEY` in the environment or in a `.env` file; optional `dotenv_path` to point to a specific `.env`)
48
+
49
+ ### Low-level building blocks
50
+
51
+ ```python
52
+ from pathlib import Path
53
+ from hecvec import ListDir, ListDirTextFiles, ReadText
54
+
55
+ root = Path("/path/to/repo")
56
+
57
+ # List all entries under a path (restricted to root)
58
+ lister = ListDir(root=root)
59
+ for rel in lister.listdir("."):
60
+ print(rel)
61
+
62
+ # Only .txt and .md files, recursively
63
+ text_lister = ListDirTextFiles(root=root)
64
+ paths = text_lister.listdir_recursive_txt_md("docs")
65
+
66
+ # Read each file as text
67
+ reader = ReadText(paths)
68
+ for path, text in reader:
69
+ print(path, len(text))
70
+ ```
71
+
72
+ ### Chunking (optional)
73
+
74
+ With `pip install hecvec[chunk]`:
75
+
76
+ ```python
77
+ from hecvec import ListDirTextFiles, ReadText
78
+ from hecvec.chunking import chunk_documents
79
+
80
+ lister = ListDirTextFiles(root=root)
81
+ paths = lister.listdir_recursive_txt_md(".")
82
+ reader = ReadText(paths)
83
+ path_and_text = reader.read_all()
84
+ chunks = chunk_documents(path_and_text)
85
+ # list of {"path": "...", "chunk_index": 0, "content": "..."}
86
+ ```
87
+
88
+ ### CLI
89
+
90
+ ```bash
91
+ hecvec-listdir [path] [root]
92
+ # or
93
+ python -m hecvec.cli [path] [root]
94
+ ```
95
+
96
+ ### Test the full pipeline (the method that does everything)
97
+
98
+ From the project root, with Chroma running and `OPENAI_API_KEY` set (e.g. in `.env`):
99
+
100
+ ```bash
101
+ # Start Chroma (one terminal)
102
+ docker run -p 8000:8000 chromadb/chroma
103
+
104
+ # Run the test script (another terminal)
105
+ uv run python scripts/test_slice.py
106
+ # or: python scripts/test_slice.py
107
+ ```
108
+
109
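+ The script runs `HecVec().slice(path=...)` on the folder (or single `.txt`/`.md` file) passed as its first argument, defaulting to the project root when no argument is given, and prints `Result:` followed by the returned summary (`files`, `chunks`, `collection`). It exits with status 1 if the path does not exist or is a file with another extension.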
110
+
111
+ ### Modular layout (easy to study)
112
+
113
+ Each step of the pipeline lives in its own module:
114
+
115
+ | Module | Responsibility |
116
+ |--------|-----------------|
117
+ | `hecvec.env` | Load `.env` and `OPENAI_API_KEY` |
118
+ | `hecvec.listdir` | List dirs under a safe root; filter by extension (`.txt`/`.md`) |
119
+ | `hecvec.reading` | Read files as text (UTF-8 / latin-1 / cp1252 fallback) |
120
+ | `hecvec.token_splitter` | Token-based chunking (TokenTextSplitter) |
121
+ | `hecvec.chunking` | Recursive-character chunking (RecursiveCharacterTextSplitter) |
122
+ | `hecvec.embeddings` | OpenAI embeddings (`embed_texts`) |
123
+ | `hecvec.chroma_client` | Chroma client, get/create collection, add documents |
124
+ | `hecvec.chroma_list` | List Chroma collections and counts |
125
+ | `hecvec.pipeline` | Orchestrator: `HecVec` and `slice(path=...)` |
126
+
127
+ Example: use one step on its own:
128
+
129
+ ```python
130
+ from hecvec import embed_texts, token_chunk_text, list_collections
131
+
132
+ chunks = token_chunk_text("Some long document...", chunk_size=200)
133
+ vecs = embed_texts(chunks, api_key="sk-...")
134
+ names_and_counts = list_collections(host="localhost", port=8000)
135
+ ```
136
+
137
+ ## Development
138
+
139
+ From the repo root:
140
+
141
+ ```bash
142
+ uv sync
143
+ uv run python -c "from hecvec import ListDir; print(ListDir('.').listdir('.'))"
144
+ ```
145
+
146
+ ## License
147
+
148
+ MIT
@@ -0,0 +1,53 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "hecvec"
7
+ version = "0.1.0"
8
+ description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9,<3.14"
11
+ license = "MIT"
12
+ authors = []
13
+ keywords = ["listdir", "text-files", "chunking", "document-pipeline"]
14
+ classifiers = [
15
+ "Development Status :: 3 - Alpha",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Programming Language :: Python :: 3",
19
+ "Programming Language :: Python :: 3.9",
20
+ "Programming Language :: Python :: 3.10",
21
+ "Programming Language :: Python :: 3.11",
22
+ "Programming Language :: Python :: 3.12",
23
+ "Programming Language :: Python :: 3.13",
24
+ "Topic :: Software Development :: Libraries :: Python Modules",
25
+ ]
26
+ dependencies = [
27
+ "chromadb>=0.4.0",
28
+ "langchain-text-splitters>=0.2.0",
29
+ "openai>=1.0.0",
30
+ "python-dotenv>=1.0.0",
31
+ "tiktoken>=0.5.0",
32
+ ]
33
+
34
+ [project.optional-dependencies]
35
+ chunk = ["langchain-text-splitters>=0.2.0"]
36
+ chroma = [
37
+ "chromadb>=0.4.0",
38
+ "langchain-text-splitters>=0.2.0",
39
+ "openai>=1.0.0",
40
+ "python-dotenv>=1.0.0",
41
+ "tiktoken>=0.5.0",
42
+ ]
43
+ dev = ["pytest>=7.0.0"]
44
+
45
+ [tool.pytest.ini_options]
46
+ testpaths = ["tests"]
47
+ pythonpath = ["src"]
48
+
49
+ [project.scripts]
50
+ hecvec-listdir = "hecvec.cli:main"
51
+
52
+ [tool.hatch.build.targets.wheel]
53
+ packages = ["src/hecvec"]
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Create a HecVec object and test the method that does everything by passing your path.
4
+
5
+ listdir → filter .txt/.md → token-chunk → embed → Chroma
6
+
7
+ Requirements:
8
+ - pip install hecvec[chroma]
9
+ - Chroma running: docker run -p 8000:8000 chromadb/chroma
10
+ - OPENAI_API_KEY set (env or .env in project root)
11
+
12
+ Usage (from project root):
13
+ uv run python scripts/test_slice.py /path/to/folder
14
+ uv run python scripts/test_slice.py .
15
+ python scripts/test_slice.py /path/to/folder
16
+ """
17
+ import logging
18
+ import sys
19
+ from pathlib import Path
20
+
21
+ # Project root and src
22
+ ROOT = Path(__file__).resolve().parent.parent
23
+ SRC = ROOT / "src"
24
+ if str(SRC) not in sys.path:
25
+ sys.path.insert(0, str(SRC))
26
+
27
+ # Default path to slice: project root (works in devcontainer; use first arg to override)
28
+ DEFAULT_PATH = ROOT
29
+
30
+
31
def main() -> int:
    """Slice the path given on the command line (or ``DEFAULT_PATH``) with HecVec.

    The first command-line argument, when present, is expanded and resolved
    to an absolute path; otherwise ``DEFAULT_PATH`` is used.

    Returns:
        0 once ``HecVec().slice(path=...)`` has returned and its result has
        been logged and printed; 1 when the path does not exist, is neither
        a file nor a directory, or is a file whose suffix is not ``.txt`` or
        ``.md``.
    """
    logging.basicConfig(
        datefmt="%H:%M:%S",
        format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
        level=logging.INFO,
    )
    logger = logging.getLogger(__name__)

    # Deferred import: executed only after logging has been configured.
    from hecvec import HecVec

    cli_args = sys.argv[1:]
    if cli_args:
        path = Path(cli_args[0]).expanduser().resolve()
        logger.info("Path given: %s", path)
    else:
        path = DEFAULT_PATH
        logger.info("No path given, using default: %s", path)

    # Each failure pairs the logger format string with the console message.
    failure = None
    if not path.exists():
        failure = (
            "Path does not exist: %s",
            f"Error: path does not exist: {path}",
        )
    elif not (path.is_dir() or path.is_file()):
        failure = (
            "Not a file or directory: %s",
            f"Error: not a file or directory: {path}",
        )
    elif path.is_file() and path.suffix.lower() not in (".txt", ".md"):
        failure = (
            "File must be .txt or .md: %s",
            f"Error: file must be .txt or .md: {path}",
        )

    if failure is not None:
        log_template, console_text = failure
        logger.error(log_template, path)
        print(console_text)
        return 1

    result = HecVec().slice(path=path)

    logger.info("Result: %s", result)
    print("Result:", result)
    return 0
67
+
68
+
69
+ if __name__ == "__main__":
70
+ sys.exit(main())
@@ -0,0 +1,34 @@
1
+ """
2
+ HecVec: modular library — listdir, read, chunk (token/recursive), embed, Chroma. No API.
3
+ """
4
+ from hecvec.chunking import chunk_documents, chunk_text
5
+ from hecvec.chroma_client import add_documents, get_client, get_or_create_collection
6
+ from hecvec.chroma_list import list_collections
7
+ from hecvec.embeddings import embed_texts
8
+ from hecvec.env import load_dotenv_if_available, load_openai_key
9
+ from hecvec.listdir import ALLOWED_EXTENSIONS, ListDir, ListDirTextFiles
10
+ from hecvec.pipeline import HecVec
11
+ from hecvec.reading import ReadText
12
+ from hecvec.token_splitter import token_chunk_documents, token_chunk_text
13
+
14
+ __all__ = [
15
+ "ALLOWED_EXTENSIONS",
16
+ "HecVec",
17
+ "ListDir",
18
+ "ListDirTextFiles",
19
+ "ReadText",
20
+ "add_documents",
21
+ "chunk_documents",
22
+ "chunk_text",
23
+ "embed_texts",
24
+ "get_client",
25
+ "get_or_create_collection",
26
+ "list_collections",
27
+ "load_dotenv_if_available",
28
+ "load_openai_key",
29
+ "token_chunk_documents",
30
+ "token_chunk_text",
31
+ "__version__",
32
+ ]
33
+
34
+ __version__ = "0.1.0"
@@ -0,0 +1,58 @@
1
+ """
2
+ Chroma client and collection operations. One module = one responsibility: connect and add documents.
3
+ Requires: pip install hecvec[chroma] (chromadb).
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ import chromadb
11
+
12
+ DEFAULT_HOST = "localhost"
13
+ DEFAULT_PORT = 8000
14
+
15
+
16
def get_client(host: str = DEFAULT_HOST, port: int = DEFAULT_PORT):
    """Return a Chroma HTTP client, or an in-memory client as a fallback.

    Args:
        host: Hostname of the Chroma server.
        port: TCP port of the Chroma server.

    Returns:
        ``chromadb.HttpClient(host=host, port=port)`` when constructing it
        succeeds; otherwise a ``chromadb.EphemeralClient()``.

    The fallback is a deliberate best-effort behavior and is kept, but it is
    no longer silent: a warning is logged so callers can tell that their
    documents are going to an in-memory client rather than to the server.
    """
    import logging

    import chromadb

    try:
        return chromadb.HttpClient(host=host, port=port)
    except Exception as exc:
        # Any failure to build the HTTP client triggers the fallback; the
        # reason is reported instead of being swallowed.
        logging.getLogger(__name__).warning(
            "Could not create a Chroma HTTP client for %s:%s (%s); "
            "falling back to an in-memory EphemeralClient.",
            host,
            port,
            exc,
        )
        return chromadb.EphemeralClient()
23
+
24
+
25
def get_or_create_collection(
    client: "chromadb.HttpClient",
    name: str,
    metadata: dict | None = None,
):
    """Fetch the collection called *name* from *client*, creating it if needed.

    Args:
        client: Object exposing ``get_or_create_collection(name=, metadata=)``.
        name: Collection name.
        metadata: Collection metadata. When ``None``, ``{"hnsw:space":
            "cosine"}`` is used; any other value (including an empty dict)
            is passed through unchanged.

    Returns:
        Whatever ``client.get_or_create_collection`` returns.
    """
    effective_metadata = {"hnsw:space": "cosine"} if metadata is None else metadata
    return client.get_or_create_collection(name=name, metadata=effective_metadata)
34
+
35
+
36
def add_documents(
    client: "chromadb.HttpClient",
    collection_name: str,
    ids: list[str],
    embeddings: list[list[float]],
    documents: list[str],
) -> None:
    """Add documents to a collection, recreating it on a dimension mismatch.

    Args:
        client: Chroma client used to look up, delete and create collections.
        collection_name: Name of the target collection.
        ids: Document ids passed to ``collection.add``.
        embeddings: Embedding vectors passed to ``collection.add``.
        documents: Document texts passed to ``collection.add``.

    Raises:
        Any Chroma error raised by ``collection.add`` whose message does not
        mention "dimension" is re-raised unchanged.

    If ``add`` fails with a Chroma error whose message mentions "dimension",
    the collection is deleted, recreated with cosine similarity, and the add
    is retried once. Everything previously stored in it is discarded.
    """
    import logging

    import chromadb.errors as chroma_errors

    # The exception class used for a dimension mismatch differs between
    # chromadb releases, so resolve the candidates that actually exist
    # instead of naming one class that may be missing (which would raise
    # AttributeError while the ``except`` clause is being evaluated).
    mismatch_errors = tuple(
        exc_type
        for exc_type in (
            getattr(chroma_errors, "InvalidArgumentError", None),
            getattr(chroma_errors, "InvalidDimensionException", None),
        )
        if exc_type is not None
    )

    coll = get_or_create_collection(client, collection_name)
    try:
        coll.add(ids=ids, embeddings=embeddings, documents=documents)
    except mismatch_errors as e:
        if "dimension" not in str(e).lower():
            raise
        # Destructive recovery: make it visible in the logs.
        logging.getLogger(__name__).warning(
            "Embedding dimension mismatch in collection %r (%s); "
            "deleting and recreating the collection.",
            collection_name,
            e,
        )
        client.delete_collection(name=collection_name)
        coll = client.create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine"},
        )
        coll.add(ids=ids, embeddings=embeddings, documents=documents)
@@ -0,0 +1,27 @@
1
+ """
2
+ List Chroma collections. One module = one responsibility: inspect collections on a Chroma server.
3
+ Requires: pip install hecvec[chroma] (chromadb).
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ import chromadb
11
+
12
+ DEFAULT_HOST = "localhost"
13
+ DEFAULT_PORT = 8000
14
+
15
+
16
def list_collections(
    host: str = DEFAULT_HOST,
    port: int = DEFAULT_PORT,
) -> list[tuple[str, int]]:
    """List all collection names and their document counts on a Chroma server.

    Args:
        host: Hostname of the Chroma server.
        port: TCP port of the Chroma server.

    Returns:
        ``[(name, count), ...]``, one entry per collection, in the order
        returned by the server.

    ``Client.list_collections()`` yields collection objects in some chromadb
    releases and bare collection names in others; both shapes are handled.
    """
    import chromadb

    client = chromadb.HttpClient(host=host, port=port)
    summary: list[tuple[str, int]] = []
    for entry in client.list_collections():
        # A name-only entry must be resolved to a collection before counting.
        if isinstance(entry, str):
            collection = client.get_collection(name=entry)
        else:
            collection = entry
        summary.append((collection.name, collection.count()))
    return summary