PyPI - hecvec - Versions diffs - 6.2.0__tar.gz → 6.4.0__tar.gz - Mend

hecvec 6.2.0.tar.gz → 6.4.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

{hecvec-6.2.0 → hecvec-6.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hecvec
-Version: 6.2.0
+Version: 6.4.0
 Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
 License-Expression: MIT
 Keywords: chunking,document-pipeline,listdir,text-files
@@ -62,7 +62,7 @@ Description-Content-Type: text/markdown
 ## Install
-**Full pipeline** (list → read → chunk → embed → Chroma):
+**Full pipeline** (list → verify Chroma is up → read → chunk → embed → Chroma):
 ```bash
 pip install hecvec
@@ -85,18 +85,19 @@ To use the full `Slicer.slice(...)` pipeline you need:
 ## Workflow
-The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs five steps:
+The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs six logged steps:
 | Step | Description |
 |------|-------------|
 | **0** | Resolve path, resolve collection name (`base_name` + `_` + `chunking_method`). |
 | **1** | Discover files: single `.txt`/`.md` file or recursive list under a directory. |
-| **2** | Read file contents as text (UTF-8 with fallbacks). |
-| **3** | Chunk using the chosen method (`token`, `text`, `semantic`, or `llm`). |
-| **4** | Generate embeddings with OpenAI. |
-| **5** | Connect to Chroma; if the collection **already exists**, skip adding (no duplicate docs). Otherwise create the collection and add documents. |
+| **2** | **Chroma server check:** connect to the server and fail fast if nothing is listening (before read/chunk/embed so you don’t pay for OpenAI when Chroma is down). The client is reused for the final write. |
+| **3** | Read file contents as text (UTF-8 with fallbacks). |
+| **4** | Chunk using the chosen method (`token`, `text`, `semantic`, or `llm`). |
+| **5** | Generate embeddings with OpenAI. |
+| **6** | Write to Chroma, reusing the client verified in step 2: list collections; if the collection **already exists**, append the new chunks to it. Otherwise create the collection and add documents. |
-Progress is logged as `[0/5]` … `[5/5]` with timings. If the collection already exists, the log states that clearly and no new documents are added.
+Progress is logged as `[0/6]` … `[6/6]` with timings. If the collection already exists, the log states that clearly after the write and the new chunks are appended to it.
 ---

{hecvec-6.2.0 → hecvec-6.4.0}/README.md RENAMED Viewed

@@ -25,7 +25,7 @@
 ## Install
-**Full pipeline** (list → read → chunk → embed → Chroma):
+**Full pipeline** (list → verify Chroma is up → read → chunk → embed → Chroma):
 ```bash
 pip install hecvec
@@ -48,18 +48,19 @@ To use the full `Slicer.slice(...)` pipeline you need:
 ## Workflow
-The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs five steps:
+The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs six logged steps:
 | Step | Description |
 |------|-------------|
 | **0** | Resolve path, resolve collection name (`base_name` + `_` + `chunking_method`). |
 | **1** | Discover files: single `.txt`/`.md` file or recursive list under a directory. |
-| **2** | Read file contents as text (UTF-8 with fallbacks). |
-| **3** | Chunk using the chosen method (`token`, `text`, `semantic`, or `llm`). |
-| **4** | Generate embeddings with OpenAI. |
-| **5** | Connect to Chroma; if the collection **already exists**, skip adding (no duplicate docs). Otherwise create the collection and add documents. |
+| **2** | **Chroma server check:** connect to the server and fail fast if nothing is listening (before read/chunk/embed so you don’t pay for OpenAI when Chroma is down). The client is reused for the final write. |
+| **3** | Read file contents as text (UTF-8 with fallbacks). |
+| **4** | Chunk using the chosen method (`token`, `text`, `semantic`, or `llm`). |
+| **5** | Generate embeddings with OpenAI. |
+| **6** | Write to Chroma, reusing the client verified in step 2: list collections; if the collection **already exists**, append the new chunks to it. Otherwise create the collection and add documents. |
-Progress is logged as `[0/5]` … `[5/5]` with timings. If the collection already exists, the log states that clearly and no new documents are added.
+Progress is logged as `[0/6]` … `[6/6]` with timings. If the collection already exists, the log states that clearly after the write and the new chunks are appended to it.
 ---

{hecvec-6.2.0 → hecvec-6.4.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "hecvec"
-version = "6.2.0"
+version = "6.4.0"
 description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
 readme = "README.md"
 requires-python = ">=3.9,<3.14"

{hecvec-6.2.0 → hecvec-6.4.0}/scripts/test_slice.py RENAMED Viewed

@@ -13,7 +13,7 @@ def main():
     path = Path(sys.argv[1]).expanduser().resolve() if len(sys.argv) >= 2 else (ROOT / "tests/CNSF-S0043-0032-2025_CONDUSEF-005190-08.txt")
     slicer = Slicer(db="chroma", host="localhost", port=8000)
     print("hecvec slicer config:", {"db": slicer.db, "host": slicer.host, "port": slicer.port, "auth": slicer.auth})
-    test = slicer.slice(path=path, collection_name="test", chunking_method="token",chunk_size=400)
+    test = slicer.slice(path=path, collection_name="concatenated", chunking_method="token",chunk_size=400, )
     print(test)
     collections = slicer.collections()

{hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/__init__.py RENAMED Viewed

@@ -35,4 +35,4 @@ __all__ = [
     "__version__",
 ]
-__version__ = "6.2.0"
+__version__ = "6.4.0"

{hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/chroma_client.py RENAMED Viewed

@@ -6,6 +6,7 @@ from __future__ import annotations
 import logging
 import socket
+import uuid
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
@@ -97,8 +98,11 @@ def add_documents(
     existing_names = [c.name for c in client.list_collections()]
     collection_existed = collection_name in existing_names
     coll = get_or_create_collection(client, collection_name)
+    # IDs produced by chunking restart at chunk_0 for each run; namespace by write to avoid collisions.
+    write_prefix = uuid.uuid4().hex
+    unique_ids = [f"{write_prefix}:{chunk_id}" for chunk_id in ids]
     try:
-        coll.add(ids=ids, embeddings=embeddings, documents=documents)
+        coll.add(ids=unique_ids, embeddings=embeddings, documents=documents)
     except chromadb.errors.InvalidArgumentError as e:
         if "dimension" not in str(e).lower():
             raise
@@ -107,6 +111,6 @@ def add_documents(
             name=collection_name,
             metadata={"hnsw:space": "cosine"},
         )
-        coll.add(ids=ids, embeddings=embeddings, documents=documents)
+        coll.add(ids=unique_ids, embeddings=embeddings, documents=documents)
         collection_existed = False  # we recreated it
     return {"collection_existed": collection_existed}

{hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/pipeline.py RENAMED Viewed

@@ -166,6 +166,7 @@ class Slicer:
         """
         Run the full pipeline: find path → listdir → filter .txt/.md → chunk → push to db.
         db='chroma' only; connection uses host and port. Server must be listening.
+        If the resolved collection name already exists, new chunks are appended to it (not skipped).
         """
         return cls._slice_impl(
             path,
@@ -213,7 +214,7 @@ class Slicer:
         _ensure_slice_logging()
         total_start = perf_counter()
-        logger.info("[0/5] Starting slice | path=%s | method=%s | model=%s", path, chunking_method, embedding_model)
+        logger.info("[0/6] Starting slice | path=%s | method=%s | model=%s", path, chunking_method, embedding_model)
         path = Path(path).resolve()
         if not path.exists():
@@ -257,21 +258,28 @@ class Slicer:
             if len(paths) > 10:
                 logger.info("  ... and %d more", len(paths) - 10)
-        logger.info("[1/5] File discovery completed in %.2fs (%d file(s))", perf_counter() - stage_start, len(paths))
+        logger.info("[1/6] File discovery completed in %.2fs (%d file(s))", perf_counter() - stage_start, len(paths))
-        # 2/5 Read as text
+        # 2/6 Chroma server check: fail fast if nothing is listening (before read/chunk/embed to save time & API cost).
+        # Step 6 always writes: if the collection name already exists, new chunks are appended (concatenated).
         stage_start = perf_counter()
-        logger.info("[2/5] Reading content from %d file(s)...", len(paths))
+        logger.info("[2/6] Chroma server check | db=%s | host=%s | port=%s", db, host, port)
+        client = _get_db_client(db, host=host, port=port, auth=auth)
+        logger.info("[2/6] Server reachable at %s:%s in %.2fs (client kept for final write)", host, port, perf_counter() - stage_start)
+        # 3/6 Read as text
+        stage_start = perf_counter()
+        logger.info("[3/6] Reading content from %d file(s)...", len(paths))
         reader = ReadText(paths)
         path_and_text = reader.read_all()
-        logger.info("[2/5] Read completed in %.2fs (%d/%d file(s) read)", perf_counter() - stage_start, len(path_and_text), len(paths))
+        logger.info("[3/6] Read completed in %.2fs (%d/%d file(s) read)", perf_counter() - stage_start, len(path_and_text), len(paths))
         if len(path_and_text) < len(paths):
             logger.warning("Some files could not be read and were skipped (%d skipped)", len(paths) - len(path_and_text))
-        # 3/5 Chunk (token, text, semantic, or llm)
+        # 4/6 Chunk (token, text, semantic, or llm)
         stage_start = perf_counter()
         api_key = openai_api_key or load_openai_key(dotenv_path)
-        logger.info("[3/5] Chunking | method=%s | chunk_size=%d | overlap=%d", chunking_method, chunk_size, chunk_overlap)
+        logger.info("[4/6] Chunking | method=%s | chunk_size=%d | overlap=%d", chunking_method, chunk_size, chunk_overlap)
         ids, documents = chunk_documents_by_method(
             path_and_text,
             method=chunking_method,
@@ -281,12 +289,12 @@ class Slicer:
             openai_api_key=api_key,
         )
         if not documents:
-            logger.warning("[3/5] No chunks generated (empty or non-text content)")
+            logger.warning("[4/6] No chunks generated (empty or non-text content)")
             return {"files": len(path_and_text), "chunks": 0, "collection": collection_name}
-        logger.info("[3/5] Chunking completed in %.2fs (%d chunk(s))", perf_counter() - stage_start, len(documents))
+        logger.info("[4/6] Chunking completed in %.2fs (%d chunk(s))", perf_counter() - stage_start, len(documents))
-        # 4/5 Embed
+        # 5/6 Embed
         api_key = openai_api_key or load_openai_key(dotenv_path)
         if not api_key:
             raise ValueError(
@@ -294,42 +302,57 @@ class Slicer:
                 "Set it in .env, pass openai_api_key=, or set the OPENAI_API_KEY env var."
             )
         stage_start = perf_counter()
-        logger.info("[4/5] Generating embeddings | model=%s | batch_size=%d | chunk_count=%d", embedding_model, batch_size, len(documents))
+        logger.info("[5/6] Generating embeddings | model=%s | batch_size=%d | chunk_count=%d", embedding_model, batch_size, len(documents))
         embeddings = embed_texts(
             documents,
             api_key=api_key,
             model=embedding_model,
             batch_size=batch_size,
         )
-        logger.info("[4/5] Embeddings completed in %.2fs (%d vector(s))", perf_counter() - stage_start, len(embeddings))
+        logger.info("[5/6] Embeddings completed in %.2fs (%d vector(s))", perf_counter() - stage_start, len(embeddings))
-        # 5/5 Push to db
+        # 6/6 Push to Chroma (reuse client from server check). Always add: append into an existing collection if the name is already in use.
         stage_start = perf_counter()
-        logger.info("[5/5] db=%s | host=%s | port=%s | collection=%s", db, host, port, collection_name)
-        client = _get_db_client(db, host=host, port=port, auth=auth)
-        logger.info("[5/5] Connected to %s at %s:%s", db, host, port)
-        existing_names = [c.name for c in client.list_collections()]
-        if collection_name in existing_names:
-            logger.info("[5/5] Collection %r already exists; skipping (no new documents added).", collection_name)
-            logger.info("Slice finished in %.2fs | files=%d | chunks=0 | collection=%s | (skipped: already exists)", perf_counter() - total_start, len(path_and_text), collection_name)
-            return {
-                "files": len(path_and_text),
-                "chunks": 0,
-                "collection": collection_name,
-                "message": "Collection already exists; no documents added.",
-            }
-        logger.info("[5/5] Adding %d document chunk(s) to collection...", len(documents))
-        add_documents(client, collection_name, ids, embeddings, documents)
-        logger.info("[5/5] Collection %r created (new)", collection_name)
-        logger.info("[5/5] Chroma write completed in %.2fs", perf_counter() - stage_start)
-        logger.info("Slice finished in %.2fs | files=%d | chunks=%d | collection=%s", perf_counter() - total_start, len(path_and_text), len(documents), collection_name)
-        return {
+        logger.info("[6/6] db=%s | host=%s | port=%s | collection=%s", db, host, port, collection_name)
+        logger.info("[6/6] Writing %d document chunk(s) to collection...", len(documents))
+        add_result = add_documents(client, collection_name, ids, embeddings, documents)
+        if add_result["collection_existed"]:
+            logger.info(
+                "[6/6] Collection %r already existed — concatenating: appending these chunks to that collection (same name).",
+                collection_name,
+            )
+        else:
+            logger.info("[6/6] Collection %r did not exist before this run; created and populated.", collection_name)
+        logger.info("[6/6] Chroma write completed in %.2fs", perf_counter() - stage_start)
+        if add_result["collection_existed"]:
+            logger.info(
+                "Slice finished in %.2fs | files=%d | chunks=%d | collection=%s | appended to existing collection",
+                perf_counter() - total_start,
+                len(path_and_text),
+                len(documents),
+                collection_name,
+            )
+        else:
+            logger.info(
+                "Slice finished in %.2fs | files=%d | chunks=%d | collection=%s",
+                perf_counter() - total_start,
+                len(path_and_text),
+                len(documents),
+                collection_name,
+            )
+        out: dict[str, Any] = {
             "files": len(path_and_text),
             "chunks": len(documents),
             "collection": collection_name,
+            "appended_to_existing": add_result["collection_existed"],
         }
+        if add_result["collection_existed"]:
+            out["message"] = (
+                f"Collection {collection_name!r} already existed; appended {len(documents)} chunk(s) (concatenated)."
+            )
+        return out
     @classmethod
     def collections_server(

{hecvec-6.2.0 → hecvec-6.4.0}/uv.lock RENAMED Viewed

@@ -599,7 +599,7 @@ wheels = [
 [[package]]
 name = "hecvec"
-version = "6.1.3"
+version = "6.4.0"
 source = { editable = "." }
 dependencies = [
     { name = "chromadb" },