hecvec 0.4.2__tar.gz → 0.4.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {hecvec-0.4.2 → hecvec-0.4.3}/PKG-INFO +1 -1
  2. {hecvec-0.4.2 → hecvec-0.4.3}/pyproject.toml +1 -1
  3. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/__init__.py +1 -1
  4. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/chroma_client.py +25 -5
  5. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/pipeline.py +10 -2
  6. {hecvec-0.4.2 → hecvec-0.4.3}/uv.lock +1 -1
  7. {hecvec-0.4.2 → hecvec-0.4.3}/.gitignore +0 -0
  8. {hecvec-0.4.2 → hecvec-0.4.3}/README.md +0 -0
  9. {hecvec-0.4.2 → hecvec-0.4.3}/scripts/test_slice.py +0 -0
  10. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/_recursive_chunking.py +0 -0
  11. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/chroma_list.py +0 -0
  12. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/chunkers.py +0 -0
  13. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/chunking.py +0 -0
  14. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/cli.py +0 -0
  15. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/embeddings.py +0 -0
  16. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/env.py +0 -0
  17. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/hecvec.py +0 -0
  18. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/listdir.py +0 -0
  19. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/reading.py +0 -0
  20. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/run_llm_chunk.py +0 -0
  21. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/run_semantic_chunk.py +0 -0
  22. {hecvec-0.4.2 → hecvec-0.4.3}/src/hecvec/token_splitter.py +0 -0
  23. {hecvec-0.4.2 → hecvec-0.4.3}/tests/conftest.py +0 -0
  24. {hecvec-0.4.2 → hecvec-0.4.3}/tests/test_env.py +0 -0
  25. {hecvec-0.4.2 → hecvec-0.4.3}/tests/test_listdir.py +0 -0
  26. {hecvec-0.4.2 → hecvec-0.4.3}/tests/test_reading.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hecvec
3
- Version: 0.4.2
3
+ Version: 0.4.3
4
4
  Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
5
5
  License-Expression: MIT
6
6
  Keywords: chunking,document-pipeline,listdir,text-files
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "hecvec"
7
- version = "0.4.2"
7
+ version = "0.4.3"
8
8
  description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9,<3.14"
@@ -34,4 +34,4 @@ __all__ = [
34
34
  "__version__",
35
35
  ]
36
36
 
37
- __version__ = "0.4.2"
37
+ __version__ = "0.4.3"
@@ -13,13 +13,28 @@ DEFAULT_HOST = "localhost"
13
13
  DEFAULT_PORT = 8000
14
14
 
15
15
 
16
- def get_client(host: str = DEFAULT_HOST, port: int = DEFAULT_PORT):
17
- """Return a Chroma HTTP client, or an in-memory client if the server is not reachable."""
16
+ def get_client(
17
+ host: str = DEFAULT_HOST,
18
+ port: int = DEFAULT_PORT,
19
+ persist_path: str | None = None,
20
+ ):
21
+ """
22
+ Return a Chroma client and mode.
23
+
24
+ - If persist_path is set: use PersistentClient(path). Data is stored on disk.
25
+ Returns (client, "persistent"). host/port are ignored.
26
+ - Else: try HttpClient(host, port). If that fails, use EphemeralClient().
27
+ Returns (client, "server") or (client, "ephemeral").
28
+ """
18
29
  import chromadb
30
+ if persist_path is not None:
31
+ client = chromadb.PersistentClient(path=persist_path)
32
+ return client, "persistent"
19
33
  try:
20
- return chromadb.HttpClient(host=host, port=port)
34
+ client = chromadb.HttpClient(host=host, port=port)
35
+ return client, "server"
21
36
  except Exception:
22
- return chromadb.EphemeralClient()
37
+ return chromadb.EphemeralClient(), "ephemeral"
23
38
 
24
39
 
25
40
  def get_or_create_collection(
@@ -39,11 +54,14 @@ def add_documents(
39
54
  ids: list[str],
40
55
  embeddings: list[list[float]],
41
56
  documents: list[str],
42
- ) -> None:
57
+ ) -> dict:
43
58
  """
44
59
  Add documents to a collection. If dimension mismatch, deletes and recreates the collection.
60
+ Returns {"collection_existed": bool} so the caller can log whether the collection was pre-existing.
45
61
  """
46
62
  import chromadb
63
+ existing_names = [c.name for c in client.list_collections()]
64
+ collection_existed = collection_name in existing_names
47
65
  coll = get_or_create_collection(client, collection_name)
48
66
  try:
49
67
  coll.add(ids=ids, embeddings=embeddings, documents=documents)
@@ -56,3 +74,5 @@ def add_documents(
56
74
  metadata={"hnsw:space": "cosine"},
57
75
  )
58
76
  coll.add(ids=ids, embeddings=embeddings, documents=documents)
77
+ collection_existed = False # we recreated it
78
+ return {"collection_existed": collection_existed}
@@ -209,9 +209,17 @@ class Slicer:
209
209
  # 5/5 Push to Chroma
210
210
  stage_start = perf_counter()
211
211
  logger.info("[5/5] Writing to Chroma | host=%s | port=%s | collection=%s", chroma_host, chroma_port, collection_name)
212
- client = get_client(host=chroma_host, port=chroma_port)
212
+ client, chroma_mode = get_client(host=chroma_host, port=chroma_port)
213
+ if chroma_mode == "server":
214
+ logger.info("[5/5] Using Chroma server at %s:%s", chroma_host, chroma_port)
215
+ else:
216
+ logger.info("[5/5] Chroma server unreachable; using in-memory (ephemeral) client")
213
217
  logger.info("[5/5] Adding %d document chunk(s) to collection...", len(documents))
214
- add_documents(client, collection_name, ids, embeddings, documents)
218
+ add_result = add_documents(client, collection_name, ids, embeddings, documents)
219
+ if add_result["collection_existed"]:
220
+ logger.info("[5/5] Collection %r already existed; documents appended", collection_name)
221
+ else:
222
+ logger.info("[5/5] Collection %r created (new)", collection_name)
215
223
  logger.info("[5/5] Chroma write completed in %.2fs", perf_counter() - stage_start)
216
224
 
217
225
  logger.info("Slice finished in %.2fs | files=%d | chunks=%d | collection=%s", perf_counter() - total_start, len(path_and_text), len(documents), collection_name)
@@ -599,7 +599,7 @@ wheels = [
599
599
 
600
600
  [[package]]
601
601
  name = "hecvec"
602
- version = "0.4.1"
602
+ version = "0.4.2"
603
603
  source = { editable = "." }
604
604
  dependencies = [
605
605
  { name = "chromadb" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes