PyPI - hecvec - Versions diffs - 6.1.2__tar.gz → 6.2.0__tar.gz - Mend

hecvec 6.1.2.tar.gz → 6.2.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

{hecvec-6.1.2 → hecvec-6.2.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hecvec
-Version: 6.1.2
+Version: 6.2.0
 Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
 License-Expression: MIT
 Keywords: chunking,document-pipeline,listdir,text-files
@@ -107,7 +107,7 @@ import hecvec
 # Default: token chunking, Chroma at localhost:8000
 result = hecvec.Slicer.slice(path="/path/to/folder_or_file")
-# → {"files": N, "chunks": M, "collection": "folder_or_file_name_token"}
+# → {"files": N, "chunks": M, "collection": "folder_or_file_name_token_cs200_ov0_enccl100k_base"}
 # Custom host/port and semantic chunking
 result = hecvec.Slicer.slice(
@@ -151,7 +151,7 @@ All of these can be passed to `Slicer(...)` or to `Slicer.slice(..., key=value)`
 |-----------|---------|-------------|
 | `path` | *(required)* | File or directory to process (`.txt`/`.md` only). |
 | `root` | `path.parent` (file) or `path` (dir) | Safe root for resolving paths (used when listing under a directory). |
-| `collection_name` | `"hecvec"` | Base name for the Chroma collection. If `"hecvec"`, it is replaced by the file stem or directory name; the final name is always `{collection_name}_{chunking_method}` (e.g. `mydoc_token`). |
+| `collection_name` | `"hecvec"` | Base name for the Chroma collection. If `"hecvec"`, it is replaced by the file stem or directory name; the final name includes method + chunk config (so different `chunk_size`/`chunk_overlap` don’t collide). |
 | `db` | `"chroma"` | Database to use. Only `"chroma"` is supported. When `db="chroma"`, connection uses `host` and `port`. |
 | `host` | `"localhost"` | Server host (used when `db="chroma"`). Server must be listening. |
 | `port` | `8000` | Server port (used when `db="chroma"`). |
@@ -399,9 +399,12 @@ docker run -p 8000:8000 chromadb/chroma
   - **Directory:** `path.name` (e.g. `docs`)
 - The **final** collection name is always:
-  **`{base_name}_{chunking_method}`**
+  **`{base_name}_{chunking_method}_{chunk_config}`**
-  Examples: `mydoc_token`, `docs_semantic`, `CNSF-S0043-0032-2025_CONDUSEF-005190-08_token`.
+  Examples:
+  - token: `mydoc_token_cs200_ov0_enccl100k_base`
+  - text: `mydoc_text_cs400_ov0`
+  - llm/semantic: `mydoc_llm_cs200`
 - If a collection with that name **already exists**, the pipeline does **not** add documents again. It logs that the collection already exists and returns something like:

{hecvec-6.1.2 → hecvec-6.2.0}/README.md RENAMED Viewed

@@ -70,7 +70,7 @@ import hecvec
 # Default: token chunking, Chroma at localhost:8000
 result = hecvec.Slicer.slice(path="/path/to/folder_or_file")
-# → {"files": N, "chunks": M, "collection": "folder_or_file_name_token"}
+# → {"files": N, "chunks": M, "collection": "folder_or_file_name_token_cs200_ov0_enccl100k_base"}
 # Custom host/port and semantic chunking
 result = hecvec.Slicer.slice(
@@ -114,7 +114,7 @@ All of these can be passed to `Slicer(...)` or to `Slicer.slice(..., key=value)`
 |-----------|---------|-------------|
 | `path` | *(required)* | File or directory to process (`.txt`/`.md` only). |
 | `root` | `path.parent` (file) or `path` (dir) | Safe root for resolving paths (used when listing under a directory). |
-| `collection_name` | `"hecvec"` | Base name for the Chroma collection. If `"hecvec"`, it is replaced by the file stem or directory name; the final name is always `{collection_name}_{chunking_method}` (e.g. `mydoc_token`). |
+| `collection_name` | `"hecvec"` | Base name for the Chroma collection. If `"hecvec"`, it is replaced by the file stem or directory name; the final name includes method + chunk config (so different `chunk_size`/`chunk_overlap` don’t collide). |
 | `db` | `"chroma"` | Database to use. Only `"chroma"` is supported. When `db="chroma"`, connection uses `host` and `port`. |
 | `host` | `"localhost"` | Server host (used when `db="chroma"`). Server must be listening. |
 | `port` | `8000` | Server port (used when `db="chroma"`). |
@@ -362,9 +362,12 @@ docker run -p 8000:8000 chromadb/chroma
   - **Directory:** `path.name` (e.g. `docs`)
 - The **final** collection name is always:
-  **`{base_name}_{chunking_method}`**
+  **`{base_name}_{chunking_method}_{chunk_config}`**
-  Examples: `mydoc_token`, `docs_semantic`, `CNSF-S0043-0032-2025_CONDUSEF-005190-08_token`.
+  Examples:
+  - token: `mydoc_token_cs200_ov0_enccl100k_base`
+  - text: `mydoc_text_cs400_ov0`
+  - llm/semantic: `mydoc_llm_cs200`
 - If a collection with that name **already exists**, the pipeline does **not** add documents again. It logs that the collection already exists and returns something like:

{hecvec-6.1.2 → hecvec-6.2.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "hecvec"
-version = "6.1.2"
+version = "6.2.0"
 description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
 readme = "README.md"
 requires-python = ">=3.9,<3.14"

{hecvec-6.1.2 → hecvec-6.2.0}/scripts/test_slice.py RENAMED Viewed

@@ -11,9 +11,9 @@ def main():
     from hecvec import Slicer
     path = Path(sys.argv[1]).expanduser().resolve() if len(sys.argv) >= 2 else (ROOT / "tests/CNSF-S0043-0032-2025_CONDUSEF-005190-08.txt")
-    slicer = Slicer(db="chroma", host="localhosttt", port=8000)
+    slicer = Slicer(db="chroma", host="localhost", port=8000)
     print("hecvec slicer config:", {"db": slicer.db, "host": slicer.host, "port": slicer.port, "auth": slicer.auth})
-    test = slicer.slice(path=path, collection_name="test", chunking_method="text")
+    test = slicer.slice(path=path, collection_name="test", chunking_method="token",chunk_size=400)
     print(test)
     collections = slicer.collections()

{hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/__init__.py RENAMED Viewed

@@ -35,4 +35,4 @@ __all__ = [
     "__version__",
 ]
-__version__ = "6.1.2"
+__version__ = "6.2.0"

{hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/pipeline.py RENAMED Viewed

@@ -221,7 +221,16 @@ class Slicer:
         if collection_name == "hecvec":
             collection_name = path.stem if path.is_file() else path.name
-        collection_name = f"{collection_name}_{chunking_method}"
+        # Include chunking configuration in the collection name so reruns with a different
+        # chunk_size/chunk_overlap do not collide with an existing collection.
+        if chunking_method == "token":
+            cfg_suffix = f"cs{chunk_size}_ov{chunk_overlap}_enc{encoding_name}"
+        elif chunking_method == "text":
+            cfg_suffix = f"cs{chunk_size}_ov{chunk_overlap}"
+        else:
+            # For semantic/llm we keep the suffix minimal since advanced knobs aren't exposed on Slicer.
+            cfg_suffix = f"cs{chunk_size}"
+        collection_name = f"{collection_name}_{chunking_method}_{cfg_suffix}"
         logger.info("Resolved collection name: %s", collection_name)

{hecvec-6.1.2 → hecvec-6.2.0}/uv.lock RENAMED Viewed

@@ -599,7 +599,7 @@ wheels = [
 [[package]]
 name = "hecvec"
-version = "6.1.1"
+version = "6.1.3"
 source = { editable = "." }
 dependencies = [
     { name = "chromadb" },