hecvec 0.4.3__tar.gz → 0.4.4__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {hecvec-0.4.3 → hecvec-0.4.4}/PKG-INFO +1 -1
  2. {hecvec-0.4.3 → hecvec-0.4.4}/pyproject.toml +1 -1
  3. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/__init__.py +3 -2
  4. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/chroma_client.py +15 -6
  5. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/pipeline.py +17 -3
  6. {hecvec-0.4.3 → hecvec-0.4.4}/.gitignore +0 -0
  7. {hecvec-0.4.3 → hecvec-0.4.4}/README.md +0 -0
  8. {hecvec-0.4.3 → hecvec-0.4.4}/scripts/test_slice.py +0 -0
  9. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/_recursive_chunking.py +0 -0
  10. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/chroma_list.py +0 -0
  11. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/chunkers.py +0 -0
  12. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/chunking.py +0 -0
  13. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/cli.py +0 -0
  14. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/embeddings.py +0 -0
  15. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/env.py +0 -0
  16. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/hecvec.py +0 -0
  17. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/listdir.py +0 -0
  18. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/reading.py +0 -0
  19. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/run_llm_chunk.py +0 -0
  20. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/run_semantic_chunk.py +0 -0
  21. {hecvec-0.4.3 → hecvec-0.4.4}/src/hecvec/token_splitter.py +0 -0
  22. {hecvec-0.4.3 → hecvec-0.4.4}/tests/conftest.py +0 -0
  23. {hecvec-0.4.3 → hecvec-0.4.4}/tests/test_env.py +0 -0
  24. {hecvec-0.4.3 → hecvec-0.4.4}/tests/test_listdir.py +0 -0
  25. {hecvec-0.4.3 → hecvec-0.4.4}/tests/test_reading.py +0 -0
  26. {hecvec-0.4.3 → hecvec-0.4.4}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hecvec
3
- Version: 0.4.3
3
+ Version: 0.4.4
4
4
  Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
5
5
  License-Expression: MIT
6
6
  Keywords: chunking,document-pipeline,listdir,text-files
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "hecvec"
7
- version = "0.4.3"
7
+ version = "0.4.4"
8
8
  description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9,<3.14"
@@ -2,7 +2,7 @@
2
2
  hecvec: modular library — listdir, read, chunk (token/recursive), embed, Chroma. No API.
3
3
  """
4
4
  from hecvec.chunking import chunk_documents, chunk_text
5
- from hecvec.chroma_client import add_documents, get_client, get_or_create_collection
5
+ from hecvec.chroma_client import add_documents, get_client, get_or_create_collection, ChromaMode
6
6
  from hecvec.chunkers import ChunkingMethod, chunk_documents as chunk_documents_by_method
7
7
  from hecvec.chroma_list import list_collections
8
8
  from hecvec.embeddings import embed_texts
@@ -21,6 +21,7 @@ __all__ = [
21
21
  "ReadText",
22
22
  "add_documents",
23
23
  "chunk_documents",
24
+ "ChromaMode",
24
25
  "chunk_documents_by_method",
25
26
  "chunk_text",
26
27
  "embed_texts",
@@ -34,4 +35,4 @@ __all__ = [
34
35
  "__version__",
35
36
  ]
36
37
 
37
- __version__ = "0.4.3"
38
+ __version__ = "0.4.4"
@@ -4,7 +4,7 @@ Requires: pip install hecvec[chroma] (chromadb).
4
4
  """
5
5
  from __future__ import annotations
6
6
 
7
- from typing import TYPE_CHECKING
7
+ from typing import TYPE_CHECKING, Literal
8
8
 
9
9
  if TYPE_CHECKING:
10
10
  import chromadb
@@ -12,24 +12,33 @@ if TYPE_CHECKING:
12
12
  DEFAULT_HOST = "localhost"
13
13
  DEFAULT_PORT = 8000
14
14
 
15
+ ChromaMode = Literal["auto", "server", "ephemeral"]
16
+
15
17
 
16
18
  def get_client(
17
19
  host: str = DEFAULT_HOST,
18
20
  port: int = DEFAULT_PORT,
19
21
  persist_path: str | None = None,
22
+ mode: ChromaMode = "auto",
20
23
  ):
21
24
  """
22
- Return a Chroma client and mode.
25
+ Return a Chroma client and mode. All backends use the same Chroma API (same add/query methods).
23
26
 
24
- - If persist_path is set: use PersistentClient(path). Data is stored on disk.
25
- Returns (client, "persistent"). host/port are ignored.
26
- - Else: try HttpClient(host, port). If that fails, use EphemeralClient().
27
- Returns (client, "server") or (client, "ephemeral").
27
+ - persist_path set: use PersistentClient(path). Data on disk. Returns (client, "persistent").
28
+ - mode "ephemeral": use EphemeralClient(). In-memory, no persistence. Returns (client, "ephemeral").
29
+ - mode "server": use HttpClient(host, port) only. Raises if server unreachable. Returns (client, "server").
30
+ - mode "auto": try server, then fall back to ephemeral. Returns (client, "server") or (client, "ephemeral").
28
31
  """
29
32
  import chromadb
30
33
  if persist_path is not None:
31
34
  client = chromadb.PersistentClient(path=persist_path)
32
35
  return client, "persistent"
36
+ if mode == "ephemeral":
37
+ return chromadb.EphemeralClient(), "ephemeral"
38
+ if mode == "server":
39
+ client = chromadb.HttpClient(host=host, port=port)
40
+ return client, "server"
41
+ # auto: try server, fall back to ephemeral
33
42
  try:
34
43
  client = chromadb.HttpClient(host=host, port=port)
35
44
  return client, "server"
@@ -11,6 +11,7 @@ from time import perf_counter
11
11
  from typing import Any
12
12
 
13
13
  from hecvec.chroma_client import add_documents, get_client
14
+ from hecvec.chroma_client import ChromaMode
14
15
  from hecvec.chunkers import ChunkingMethod, chunk_documents as chunk_documents_by_method
15
16
  from hecvec.embeddings import embed_texts
16
17
  from hecvec.env import load_openai_key
@@ -68,6 +69,8 @@ class Slicer:
68
69
  self.collection_name = collection_name
69
70
  self.chroma_host = chroma_host
70
71
  self.chroma_port = chroma_port
72
+ self.chroma_mode = chroma_mode
73
+ self.chroma_persist_path = chroma_persist_path
71
74
  self.embedding_model = embedding_model
72
75
  self.chunk_size = chunk_size
73
76
  self.chunk_overlap = chunk_overlap
@@ -88,6 +91,8 @@ class Slicer:
88
91
  collection_name=kwargs.pop("collection_name", self.collection_name),
89
92
  chroma_host=kwargs.pop("chroma_host", self.chroma_host),
90
93
  chroma_port=kwargs.pop("chroma_port", self.chroma_port),
94
+ chroma_mode=kwargs.pop("chroma_mode", self.chroma_mode),
95
+ chroma_persist_path=kwargs.pop("chroma_persist_path", self.chroma_persist_path),
91
96
  embedding_model=kwargs.pop("embedding_model", self.embedding_model),
92
97
  chunk_size=kwargs.pop("chunk_size", self.chunk_size),
93
98
  chunk_overlap=kwargs.pop("chunk_overlap", self.chunk_overlap),
@@ -108,6 +113,8 @@ class Slicer:
108
113
  collection_name: str = "hecvec",
109
114
  chroma_host: str = "localhost",
110
115
  chroma_port: int = 8000,
116
+ chroma_mode: ChromaMode = "auto",
117
+ chroma_persist_path: str | Path | None = None,
111
118
  embedding_model: str = "text-embedding-3-small",
112
119
  chunk_size: int = 200,
113
120
  chunk_overlap: int = 0,
@@ -209,11 +216,18 @@ class Slicer:
209
216
  # 5/5 Push to Chroma
210
217
  stage_start = perf_counter()
211
218
  logger.info("[5/5] Writing to Chroma | host=%s | port=%s | collection=%s", chroma_host, chroma_port, collection_name)
212
- client, chroma_mode = get_client(host=chroma_host, port=chroma_port)
213
- if chroma_mode == "server":
219
+ client, chroma_mode_used = get_client(
220
+ host=chroma_host,
221
+ port=chroma_port,
222
+ persist_path=str(chroma_persist_path) if chroma_persist_path is not None else None,
223
+ mode=chroma_mode,
224
+ )
225
+ if chroma_mode_used == "server":
214
226
  logger.info("[5/5] Using Chroma server at %s:%s", chroma_host, chroma_port)
227
+ elif chroma_mode_used == "persistent":
228
+ logger.info("[5/5] Using Chroma persistent storage at %s", chroma_persist_path)
215
229
  else:
216
- logger.info("[5/5] Chroma server unreachable; using in-memory (ephemeral) client")
230
+ logger.info("[5/5] Using Chroma in-memory (ephemeral) client")
217
231
  logger.info("[5/5] Adding %d document chunk(s) to collection...", len(documents))
218
232
  add_result = add_documents(client, collection_name, ids, embeddings, documents)
219
233
  if add_result["collection_existed"]:
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes