PyPI - hecvec - Versions diffs - 6.2.0__tar.gz → 6.3.0__tar.gz - Mend

hecvec 6.2.0.tar.gz → 6.3.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

{hecvec-6.2.0 → hecvec-6.3.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hecvec
-Version: 6.2.0
+Version: 6.3.0
 Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
 License-Expression: MIT
 Keywords: chunking,document-pipeline,listdir,text-files
@@ -62,7 +62,7 @@ Description-Content-Type: text/markdown
 ## Install
-**Full pipeline** (list → read → chunk → embed → Chroma):
+**Full pipeline** (list → verify Chroma is up → read → chunk → embed → Chroma):
 ```bash
 pip install hecvec
@@ -85,18 +85,19 @@ To use the full `Slicer.slice(...)` pipeline you need:
 ## Workflow
-The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs five steps:
+The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs seven logged steps, numbered 0 through 6:
 | Step | Description |
 |------|-------------|
 | **0** | Resolve path, resolve collection name (`base_name` + `_` + `chunking_method`). |
 | **1** | Discover files: single `.txt`/`.md` file or recursive list under a directory. |
-| **2** | Read file contents as text (UTF-8 with fallbacks). |
-| **3** | Chunk using the chosen method (`token`, `text`, `semantic`, or `llm`). |
-| **4** | Generate embeddings with OpenAI. |
-| **5** | Connect to Chroma; if the collection **already exists**, skip adding (no duplicate docs). Otherwise create the collection and add documents. |
+| **2** | **Chroma server check:** connect to the server and fail fast if nothing is listening (before read/chunk/embed so you don’t pay for OpenAI when Chroma is down). The client is reused for the final write. |
+| **3** | Read file contents as text (UTF-8 with fallbacks). |
+| **4** | Chunk using the chosen method (`token`, `text`, `semantic`, or `llm`). |
+| **5** | Generate embeddings with OpenAI. |
+| **6** | Reuse the client verified in step 2 and list collections; if the collection **already exists**, skip adding. Otherwise create the collection and add documents. |
-Progress is logged as `[0/5]` … `[5/5]` with timings. If the collection already exists, the log states that clearly and no new documents are added.
+Progress is logged as `[0/6]` … `[6/6]` with timings. If the collection already exists, the log states that clearly after embeddings and no new documents are added.
 ---

{hecvec-6.2.0 → hecvec-6.3.0}/README.md RENAMED Viewed

@@ -25,7 +25,7 @@
 ## Install
-**Full pipeline** (list → read → chunk → embed → Chroma):
+**Full pipeline** (list → verify Chroma is up → read → chunk → embed → Chroma):
 ```bash
 pip install hecvec
@@ -48,18 +48,19 @@ To use the full `Slicer.slice(...)` pipeline you need:
 ## Workflow
-The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs five steps:
+The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs seven logged steps, numbered 0 through 6:
 | Step | Description |
 |------|-------------|
 | **0** | Resolve path, resolve collection name (`base_name` + `_` + `chunking_method`). |
 | **1** | Discover files: single `.txt`/`.md` file or recursive list under a directory. |
-| **2** | Read file contents as text (UTF-8 with fallbacks). |
-| **3** | Chunk using the chosen method (`token`, `text`, `semantic`, or `llm`). |
-| **4** | Generate embeddings with OpenAI. |
-| **5** | Connect to Chroma; if the collection **already exists**, skip adding (no duplicate docs). Otherwise create the collection and add documents. |
+| **2** | **Chroma server check:** connect to the server and fail fast if nothing is listening (before read/chunk/embed so you don’t pay for OpenAI when Chroma is down). The client is reused for the final write. |
+| **3** | Read file contents as text (UTF-8 with fallbacks). |
+| **4** | Chunk using the chosen method (`token`, `text`, `semantic`, or `llm`). |
+| **5** | Generate embeddings with OpenAI. |
+| **6** | Reuse the client verified in step 2 and list collections; if the collection **already exists**, skip adding. Otherwise create the collection and add documents. |
-Progress is logged as `[0/5]` … `[5/5]` with timings. If the collection already exists, the log states that clearly and no new documents are added.
+Progress is logged as `[0/6]` … `[6/6]` with timings. If the collection already exists, the log states that clearly after embeddings and no new documents are added.
 ---

{hecvec-6.2.0 → hecvec-6.3.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "hecvec"
-version = "6.2.0"
+version = "6.3.0"
 description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
 readme = "README.md"
 requires-python = ">=3.9,<3.14"

{hecvec-6.2.0 → hecvec-6.3.0}/scripts/test_slice.py RENAMED Viewed

@@ -13,7 +13,7 @@ def main():
     path = Path(sys.argv[1]).expanduser().resolve() if len(sys.argv) >= 2 else (ROOT / "tests/CNSF-S0043-0032-2025_CONDUSEF-005190-08.txt")
     slicer = Slicer(db="chroma", host="localhost", port=8000)
     print("hecvec slicer config:", {"db": slicer.db, "host": slicer.host, "port": slicer.port, "auth": slicer.auth})
-    test = slicer.slice(path=path, collection_name="test", chunking_method="token",chunk_size=400)
+    test = slicer.slice(path=path, collection_name="test", chunking_method="token",chunk_size=400, )
     print(test)
     collections = slicer.collections()

{hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/__init__.py RENAMED Viewed

@@ -35,4 +35,4 @@ __all__ = [
     "__version__",
 ]
-__version__ = "6.2.0"
+__version__ = "6.3.0"

{hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/pipeline.py RENAMED Viewed

@@ -213,7 +213,7 @@ class Slicer:
         _ensure_slice_logging()
         total_start = perf_counter()
-        logger.info("[0/5] Starting slice | path=%s | method=%s | model=%s", path, chunking_method, embedding_model)
+        logger.info("[0/6] Starting slice | path=%s | method=%s | model=%s", path, chunking_method, embedding_model)
         path = Path(path).resolve()
         if not path.exists():
@@ -257,21 +257,28 @@ class Slicer:
             if len(paths) > 10:
                 logger.info("  ... and %d more", len(paths) - 10)
-        logger.info("[1/5] File discovery completed in %.2fs (%d file(s))", perf_counter() - stage_start, len(paths))
+        logger.info("[1/6] File discovery completed in %.2fs (%d file(s))", perf_counter() - stage_start, len(paths))
-        # 2/5 Read as text
+        # 2/6 Chroma server check: fail fast if nothing is listening (before read/chunk/embed to save time & API cost).
+        # Collection "already exists" is still handled at the end, after embeddings, unchanged.
         stage_start = perf_counter()
-        logger.info("[2/5] Reading content from %d file(s)...", len(paths))
+        logger.info("[2/6] Chroma server check | db=%s | host=%s | port=%s", db, host, port)
+        client = _get_db_client(db, host=host, port=port, auth=auth)
+        logger.info("[2/6] Server reachable at %s:%s in %.2fs (client kept for final write)", host, port, perf_counter() - stage_start)
+        # 3/6 Read as text
+        stage_start = perf_counter()
+        logger.info("[3/6] Reading content from %d file(s)...", len(paths))
         reader = ReadText(paths)
         path_and_text = reader.read_all()
-        logger.info("[2/5] Read completed in %.2fs (%d/%d file(s) read)", perf_counter() - stage_start, len(path_and_text), len(paths))
+        logger.info("[3/6] Read completed in %.2fs (%d/%d file(s) read)", perf_counter() - stage_start, len(path_and_text), len(paths))
         if len(path_and_text) < len(paths):
             logger.warning("Some files could not be read and were skipped (%d skipped)", len(paths) - len(path_and_text))
-        # 3/5 Chunk (token, text, semantic, or llm)
+        # 4/6 Chunk (token, text, semantic, or llm)
         stage_start = perf_counter()
         api_key = openai_api_key or load_openai_key(dotenv_path)
-        logger.info("[3/5] Chunking | method=%s | chunk_size=%d | overlap=%d", chunking_method, chunk_size, chunk_overlap)
+        logger.info("[4/6] Chunking | method=%s | chunk_size=%d | overlap=%d", chunking_method, chunk_size, chunk_overlap)
         ids, documents = chunk_documents_by_method(
             path_and_text,
             method=chunking_method,
@@ -281,12 +288,12 @@ class Slicer:
             openai_api_key=api_key,
         )
         if not documents:
-            logger.warning("[3/5] No chunks generated (empty or non-text content)")
+            logger.warning("[4/6] No chunks generated (empty or non-text content)")
             return {"files": len(path_and_text), "chunks": 0, "collection": collection_name}
-        logger.info("[3/5] Chunking completed in %.2fs (%d chunk(s))", perf_counter() - stage_start, len(documents))
+        logger.info("[4/6] Chunking completed in %.2fs (%d chunk(s))", perf_counter() - stage_start, len(documents))
-        # 4/5 Embed
+        # 5/6 Embed
         api_key = openai_api_key or load_openai_key(dotenv_path)
         if not api_key:
             raise ValueError(
@@ -294,23 +301,21 @@ class Slicer:
                 "Set it in .env, pass openai_api_key=, or set the OPENAI_API_KEY env var."
             )
         stage_start = perf_counter()
-        logger.info("[4/5] Generating embeddings | model=%s | batch_size=%d | chunk_count=%d", embedding_model, batch_size, len(documents))
+        logger.info("[5/6] Generating embeddings | model=%s | batch_size=%d | chunk_count=%d", embedding_model, batch_size, len(documents))
         embeddings = embed_texts(
             documents,
             api_key=api_key,
             model=embedding_model,
             batch_size=batch_size,
         )
-        logger.info("[4/5] Embeddings completed in %.2fs (%d vector(s))", perf_counter() - stage_start, len(embeddings))
+        logger.info("[5/6] Embeddings completed in %.2fs (%d vector(s))", perf_counter() - stage_start, len(embeddings))
-        # 5/5 Push to db
+        # 6/6 Push to Chroma (reuse client from server check; if collection exists, skip add — same as before)
         stage_start = perf_counter()
-        logger.info("[5/5] db=%s | host=%s | port=%s | collection=%s", db, host, port, collection_name)
-        client = _get_db_client(db, host=host, port=port, auth=auth)
-        logger.info("[5/5] Connected to %s at %s:%s", db, host, port)
+        logger.info("[6/6] db=%s | host=%s | port=%s | collection=%s", db, host, port, collection_name)
         existing_names = [c.name for c in client.list_collections()]
         if collection_name in existing_names:
-            logger.info("[5/5] Collection %r already exists; skipping (no new documents added).", collection_name)
+            logger.info("[6/6] Collection %r already exists; skipping (no new documents added).", collection_name)
             logger.info("Slice finished in %.2fs | files=%d | chunks=0 | collection=%s | (skipped: already exists)", perf_counter() - total_start, len(path_and_text), collection_name)
             return {
                 "files": len(path_and_text),
@@ -318,10 +323,10 @@ class Slicer:
                 "collection": collection_name,
                 "message": "Collection already exists; no documents added.",
             }
-        logger.info("[5/5] Adding %d document chunk(s) to collection...", len(documents))
+        logger.info("[6/6] Adding %d document chunk(s) to collection...", len(documents))
         add_documents(client, collection_name, ids, embeddings, documents)
-        logger.info("[5/5] Collection %r created (new)", collection_name)
-        logger.info("[5/5] Chroma write completed in %.2fs", perf_counter() - stage_start)
+        logger.info("[6/6] Collection %r created (new)", collection_name)
+        logger.info("[6/6] Chroma write completed in %.2fs", perf_counter() - stage_start)
         logger.info("Slice finished in %.2fs | files=%d | chunks=%d | collection=%s", perf_counter() - total_start, len(path_and_text), len(documents), collection_name)

{hecvec-6.2.0 → hecvec-6.3.0}/uv.lock RENAMED Viewed

@@ -599,7 +599,7 @@ wheels = [
 [[package]]
 name = "hecvec"
-version = "6.1.3"
+version = "6.3.0"
 source = { editable = "." }
 dependencies = [
     { name = "chromadb" },