hecvec 0.4.3__tar.gz → 0.4.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hecvec-0.4.3 → hecvec-0.4.4}/PKG-INFO +1 -1
- {hecvec-0.4.3 → hecvec-0.4.4}/pyproject.toml +1 -1
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/__init__.py +3 -2
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/chroma_client.py +15 -6
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/pipeline.py +17 -3
- {hecvec-0.4.3 → hecvec-0.4.4}/.gitignore +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/README.md +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/scripts/test_slice.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/_recursive_chunking.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/chroma_list.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/chunkers.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/chunking.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/cli.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/embeddings.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/env.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/hecvec.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/listdir.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/reading.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/run_llm_chunk.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/run_semantic_chunk.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/token_splitter.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/tests/conftest.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/tests/test_env.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/tests/test_listdir.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/tests/test_reading.py +0 -0
- {hecvec-0.4.3 → hecvec-0.4.4}/uv.lock +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "hecvec"
|
|
7
|
-
version = "0.4.3"
|
|
7
|
+
version = "0.4.4"
|
|
8
8
|
description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9,<3.14"
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
hecvec: modular library — listdir, read, chunk (token/recursive), embed, Chroma. No API.
|
|
3
3
|
"""
|
|
4
4
|
from hecvec.chunking import chunk_documents, chunk_text
|
|
5
|
-
from hecvec.chroma_client import add_documents, get_client, get_or_create_collection
|
|
5
|
+
from hecvec.chroma_client import add_documents, get_client, get_or_create_collection, ChromaMode
|
|
6
6
|
from hecvec.chunkers import ChunkingMethod, chunk_documents as chunk_documents_by_method
|
|
7
7
|
from hecvec.chroma_list import list_collections
|
|
8
8
|
from hecvec.embeddings import embed_texts
|
|
@@ -21,6 +21,7 @@ __all__ = [
|
|
|
21
21
|
"ReadText",
|
|
22
22
|
"add_documents",
|
|
23
23
|
"chunk_documents",
|
|
24
|
+
"ChromaMode",
|
|
24
25
|
"chunk_documents_by_method",
|
|
25
26
|
"chunk_text",
|
|
26
27
|
"embed_texts",
|
|
@@ -34,4 +35,4 @@ __all__ = [
|
|
|
34
35
|
"__version__",
|
|
35
36
|
]
|
|
36
37
|
|
|
37
|
-
__version__ = "0.4.3"
|
|
38
|
+
__version__ = "0.4.4"
|
|
@@ -4,7 +4,7 @@ Requires: pip install hecvec[chroma] (chromadb).
|
|
|
4
4
|
"""
|
|
5
5
|
from __future__ import annotations
|
|
6
6
|
|
|
7
|
-
from typing import TYPE_CHECKING
|
|
7
|
+
from typing import TYPE_CHECKING, Literal
|
|
8
8
|
|
|
9
9
|
if TYPE_CHECKING:
|
|
10
10
|
import chromadb
|
|
@@ -12,24 +12,33 @@ if TYPE_CHECKING:
|
|
|
12
12
|
DEFAULT_HOST = "localhost"
|
|
13
13
|
DEFAULT_PORT = 8000
|
|
14
14
|
|
|
15
|
+
ChromaMode = Literal["auto", "server", "ephemeral"]
|
|
16
|
+
|
|
15
17
|
|
|
16
18
|
def get_client(
|
|
17
19
|
host: str = DEFAULT_HOST,
|
|
18
20
|
port: int = DEFAULT_PORT,
|
|
19
21
|
persist_path: str | None = None,
|
|
22
|
+
mode: ChromaMode = "auto",
|
|
20
23
|
):
|
|
21
24
|
"""
|
|
22
|
-
Return a Chroma client and mode.
|
|
25
|
+
Return a Chroma client and mode. All backends use the same Chroma API (same add/query methods).
|
|
23
26
|
|
|
24
|
-
-
|
|
25
|
-
|
|
26
|
-
-
|
|
27
|
-
|
|
27
|
+
- persist_path set: use PersistentClient(path). Data on disk. Returns (client, "persistent").
|
|
28
|
+
- mode "ephemeral": use EphemeralClient(). In-memory, no persistence. Returns (client, "ephemeral").
|
|
29
|
+
- mode "server": use HttpClient(host, port) only. Raises if server unreachable. Returns (client, "server").
|
|
30
|
+
- mode "auto": try server, then fall back to ephemeral. Returns (client, "server") or (client, "ephemeral").
|
|
28
31
|
"""
|
|
29
32
|
import chromadb
|
|
30
33
|
if persist_path is not None:
|
|
31
34
|
client = chromadb.PersistentClient(path=persist_path)
|
|
32
35
|
return client, "persistent"
|
|
36
|
+
if mode == "ephemeral":
|
|
37
|
+
return chromadb.EphemeralClient(), "ephemeral"
|
|
38
|
+
if mode == "server":
|
|
39
|
+
client = chromadb.HttpClient(host=host, port=port)
|
|
40
|
+
return client, "server"
|
|
41
|
+
# auto: try server, fall back to ephemeral
|
|
33
42
|
try:
|
|
34
43
|
client = chromadb.HttpClient(host=host, port=port)
|
|
35
44
|
return client, "server"
|
|
@@ -11,6 +11,7 @@ from time import perf_counter
|
|
|
11
11
|
from typing import Any
|
|
12
12
|
|
|
13
13
|
from hecvec.chroma_client import add_documents, get_client
|
|
14
|
+
from hecvec.chroma_client import ChromaMode
|
|
14
15
|
from hecvec.chunkers import ChunkingMethod, chunk_documents as chunk_documents_by_method
|
|
15
16
|
from hecvec.embeddings import embed_texts
|
|
16
17
|
from hecvec.env import load_openai_key
|
|
@@ -68,6 +69,8 @@ class Slicer:
|
|
|
68
69
|
self.collection_name = collection_name
|
|
69
70
|
self.chroma_host = chroma_host
|
|
70
71
|
self.chroma_port = chroma_port
|
|
72
|
+
self.chroma_mode = chroma_mode
|
|
73
|
+
self.chroma_persist_path = chroma_persist_path
|
|
71
74
|
self.embedding_model = embedding_model
|
|
72
75
|
self.chunk_size = chunk_size
|
|
73
76
|
self.chunk_overlap = chunk_overlap
|
|
@@ -88,6 +91,8 @@ class Slicer:
|
|
|
88
91
|
collection_name=kwargs.pop("collection_name", self.collection_name),
|
|
89
92
|
chroma_host=kwargs.pop("chroma_host", self.chroma_host),
|
|
90
93
|
chroma_port=kwargs.pop("chroma_port", self.chroma_port),
|
|
94
|
+
chroma_mode=kwargs.pop("chroma_mode", self.chroma_mode),
|
|
95
|
+
chroma_persist_path=kwargs.pop("chroma_persist_path", self.chroma_persist_path),
|
|
91
96
|
embedding_model=kwargs.pop("embedding_model", self.embedding_model),
|
|
92
97
|
chunk_size=kwargs.pop("chunk_size", self.chunk_size),
|
|
93
98
|
chunk_overlap=kwargs.pop("chunk_overlap", self.chunk_overlap),
|
|
@@ -108,6 +113,8 @@ class Slicer:
|
|
|
108
113
|
collection_name: str = "hecvec",
|
|
109
114
|
chroma_host: str = "localhost",
|
|
110
115
|
chroma_port: int = 8000,
|
|
116
|
+
chroma_mode: ChromaMode = "auto",
|
|
117
|
+
chroma_persist_path: str | Path | None = None,
|
|
111
118
|
embedding_model: str = "text-embedding-3-small",
|
|
112
119
|
chunk_size: int = 200,
|
|
113
120
|
chunk_overlap: int = 0,
|
|
@@ -209,11 +216,18 @@ class Slicer:
|
|
|
209
216
|
# 5/5 Push to Chroma
|
|
210
217
|
stage_start = perf_counter()
|
|
211
218
|
logger.info("[5/5] Writing to Chroma | host=%s | port=%s | collection=%s", chroma_host, chroma_port, collection_name)
|
|
212
|
-
client,
|
|
213
|
-
|
|
219
|
+
client, chroma_mode_used = get_client(
|
|
220
|
+
host=chroma_host,
|
|
221
|
+
port=chroma_port,
|
|
222
|
+
persist_path=str(chroma_persist_path) if chroma_persist_path is not None else None,
|
|
223
|
+
mode=chroma_mode,
|
|
224
|
+
)
|
|
225
|
+
if chroma_mode_used == "server":
|
|
214
226
|
logger.info("[5/5] Using Chroma server at %s:%s", chroma_host, chroma_port)
|
|
227
|
+
elif chroma_mode_used == "persistent":
|
|
228
|
+
logger.info("[5/5] Using Chroma persistent storage at %s", chroma_persist_path)
|
|
215
229
|
else:
|
|
216
|
-
logger.info("[5/5] Chroma
|
|
230
|
+
logger.info("[5/5] Using Chroma in-memory (ephemeral) client")
|
|
217
231
|
logger.info("[5/5] Adding %d document chunk(s) to collection...", len(documents))
|
|
218
232
|
add_result = add_documents(client, collection_name, ids, embeddings, documents)
|
|
219
233
|
if add_result["collection_existed"]:
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|