hecvec 0.4.2__tar.gz → 0.4.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hecvec-0.4.2 → hecvec-0.4.3}/PKG-INFO +1 -1
- {hecvec-0.4.2 → hecvec-0.4.3}/pyproject.toml +1 -1
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/__init__.py +1 -1
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/chroma_client.py +25 -5
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/pipeline.py +10 -2
- {hecvec-0.4.2 → hecvec-0.4.3}/uv.lock +1 -1
- {hecvec-0.4.2 → hecvec-0.4.3}/.gitignore +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/README.md +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/scripts/test_slice.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/_recursive_chunking.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/chroma_list.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/chunkers.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/chunking.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/cli.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/embeddings.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/env.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/hecvec.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/listdir.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/reading.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/run_llm_chunk.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/run_semantic_chunk.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/token_splitter.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/tests/conftest.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/tests/test_env.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/tests/test_listdir.py +0 -0
- {hecvec-0.4.2 → hecvec-0.4.3}/tests/test_reading.py +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "hecvec"
|
|
7
|
-
version = "0.4.2"
|
|
7
|
+
version = "0.4.3"
|
|
8
8
|
description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9,<3.14"
|
|
@@ -13,13 +13,28 @@ DEFAULT_HOST = "localhost"
|
|
|
13
13
|
DEFAULT_PORT = 8000
|
|
14
14
|
|
|
15
15
|
|
|
16
|
-
def get_client(
|
|
17
|
-
|
|
16
|
+
def get_client(
|
|
17
|
+
host: str = DEFAULT_HOST,
|
|
18
|
+
port: int = DEFAULT_PORT,
|
|
19
|
+
persist_path: str | None = None,
|
|
20
|
+
):
|
|
21
|
+
"""
|
|
22
|
+
Return a Chroma client and mode.
|
|
23
|
+
|
|
24
|
+
- If persist_path is set: use PersistentClient(path). Data is stored on disk.
|
|
25
|
+
Returns (client, "persistent"). host/port are ignored.
|
|
26
|
+
- Else: try HttpClient(host, port). If that fails, use EphemeralClient().
|
|
27
|
+
Returns (client, "server") or (client, "ephemeral").
|
|
28
|
+
"""
|
|
18
29
|
import chromadb
|
|
30
|
+
if persist_path is not None:
|
|
31
|
+
client = chromadb.PersistentClient(path=persist_path)
|
|
32
|
+
return client, "persistent"
|
|
19
33
|
try:
|
|
20
|
-
|
|
34
|
+
client = chromadb.HttpClient(host=host, port=port)
|
|
35
|
+
return client, "server"
|
|
21
36
|
except Exception:
|
|
22
|
-
return chromadb.EphemeralClient()
|
|
37
|
+
return chromadb.EphemeralClient(), "ephemeral"
|
|
23
38
|
|
|
24
39
|
|
|
25
40
|
def get_or_create_collection(
|
|
@@ -39,11 +54,14 @@ def add_documents(
|
|
|
39
54
|
ids: list[str],
|
|
40
55
|
embeddings: list[list[float]],
|
|
41
56
|
documents: list[str],
|
|
42
|
-
) -> None:
|
|
57
|
+
) -> dict:
|
|
43
58
|
"""
|
|
44
59
|
Add documents to a collection. If dimension mismatch, deletes and recreates the collection.
|
|
60
|
+
Returns {"collection_existed": bool} so the caller can log whether the collection was pre-existing.
|
|
45
61
|
"""
|
|
46
62
|
import chromadb
|
|
63
|
+
existing_names = [c.name for c in client.list_collections()]
|
|
64
|
+
collection_existed = collection_name in existing_names
|
|
47
65
|
coll = get_or_create_collection(client, collection_name)
|
|
48
66
|
try:
|
|
49
67
|
coll.add(ids=ids, embeddings=embeddings, documents=documents)
|
|
@@ -56,3 +74,5 @@ def add_documents(
|
|
|
56
74
|
metadata={"hnsw:space": "cosine"},
|
|
57
75
|
)
|
|
58
76
|
coll.add(ids=ids, embeddings=embeddings, documents=documents)
|
|
77
|
+
collection_existed = False # we recreated it
|
|
78
|
+
return {"collection_existed": collection_existed}
|
|
@@ -209,9 +209,17 @@ class Slicer:
|
|
|
209
209
|
# 5/5 Push to Chroma
|
|
210
210
|
stage_start = perf_counter()
|
|
211
211
|
logger.info("[5/5] Writing to Chroma | host=%s | port=%s | collection=%s", chroma_host, chroma_port, collection_name)
|
|
212
|
-
client = get_client(host=chroma_host, port=chroma_port)
|
|
212
|
+
client, chroma_mode = get_client(host=chroma_host, port=chroma_port)
|
|
213
|
+
if chroma_mode == "server":
|
|
214
|
+
logger.info("[5/5] Using Chroma server at %s:%s", chroma_host, chroma_port)
|
|
215
|
+
else:
|
|
216
|
+
logger.info("[5/5] Chroma server unreachable; using in-memory (ephemeral) client")
|
|
213
217
|
logger.info("[5/5] Adding %d document chunk(s) to collection...", len(documents))
|
|
214
|
-
add_documents(client, collection_name, ids, embeddings, documents)
|
|
218
|
+
add_result = add_documents(client, collection_name, ids, embeddings, documents)
|
|
219
|
+
if add_result["collection_existed"]:
|
|
220
|
+
logger.info("[5/5] Collection %r already existed; documents appended", collection_name)
|
|
221
|
+
else:
|
|
222
|
+
logger.info("[5/5] Collection %r created (new)", collection_name)
|
|
215
223
|
logger.info("[5/5] Chroma write completed in %.2fs", perf_counter() - stage_start)
|
|
216
224
|
|
|
217
225
|
logger.info("Slice finished in %.2fs | files=%d | chunks=%d | collection=%s", perf_counter() - total_start, len(path_and_text), len(documents), collection_name)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|