hecvec 6.1.2__tar.gz → 6.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {hecvec-6.1.2 → hecvec-6.2.0}/PKG-INFO +8 -5
  2. {hecvec-6.1.2 → hecvec-6.2.0}/README.md +7 -4
  3. {hecvec-6.1.2 → hecvec-6.2.0}/pyproject.toml +1 -1
  4. {hecvec-6.1.2 → hecvec-6.2.0}/scripts/test_slice.py +2 -2
  5. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/__init__.py +1 -1
  6. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/pipeline.py +10 -1
  7. {hecvec-6.1.2 → hecvec-6.2.0}/uv.lock +1 -1
  8. {hecvec-6.1.2 → hecvec-6.2.0}/.gitignore +0 -0
  9. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/_recursive_chunking.py +0 -0
  10. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/chroma_client.py +0 -0
  11. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/chroma_list.py +0 -0
  12. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/chunkers.py +0 -0
  13. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/chunking.py +0 -0
  14. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/cli.py +0 -0
  15. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/embeddings.py +0 -0
  16. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/env.py +0 -0
  17. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/hecvec.py +0 -0
  18. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/listdir.py +0 -0
  19. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/reading.py +0 -0
  20. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/run_llm_chunk.py +0 -0
  21. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/run_semantic_chunk.py +0 -0
  22. {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/token_splitter.py +0 -0
  23. {hecvec-6.1.2 → hecvec-6.2.0}/tests/conftest.py +0 -0
  24. {hecvec-6.1.2 → hecvec-6.2.0}/tests/test_env.py +0 -0
  25. {hecvec-6.1.2 → hecvec-6.2.0}/tests/test_listdir.py +0 -0
  26. {hecvec-6.1.2 → hecvec-6.2.0}/tests/test_reading.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hecvec
3
- Version: 6.1.2
3
+ Version: 6.2.0
4
4
  Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
5
5
  License-Expression: MIT
6
6
  Keywords: chunking,document-pipeline,listdir,text-files
@@ -107,7 +107,7 @@ import hecvec
107
107
 
108
108
  # Default: token chunking, Chroma at localhost:8000
109
109
  result = hecvec.Slicer.slice(path="/path/to/folder_or_file")
110
- # → {"files": N, "chunks": M, "collection": "folder_or_file_name_token"}
110
+ # → {"files": N, "chunks": M, "collection": "folder_or_file_name_token_cs200_ov0_enccl100k_base"}
111
111
 
112
112
  # Custom host/port and semantic chunking
113
113
  result = hecvec.Slicer.slice(
@@ -151,7 +151,7 @@ All of these can be passed to `Slicer(...)` or to `Slicer.slice(..., key=value)`
151
151
  |-----------|---------|-------------|
152
152
  | `path` | *(required)* | File or directory to process (`.txt`/`.md` only). |
153
153
  | `root` | `path.parent` (file) or `path` (dir) | Safe root for resolving paths (used when listing under a directory). |
154
- | `collection_name` | `"hecvec"` | Base name for the Chroma collection. If `"hecvec"`, it is replaced by the file stem or directory name; the final name is always `{collection_name}_{chunking_method}` (e.g. `mydoc_token`). |
154
+ | `collection_name` | `"hecvec"` | Base name for the Chroma collection. If `"hecvec"`, it is replaced by the file stem or directory name; the final name includes method + chunk config (so different `chunk_size`/`chunk_overlap` don’t collide). |
155
155
  | `db` | `"chroma"` | Database to use. Only `"chroma"` is supported. When `db="chroma"`, connection uses `host` and `port`. |
156
156
  | `host` | `"localhost"` | Server host (used when `db="chroma"`). Server must be listening. |
157
157
  | `port` | `8000` | Server port (used when `db="chroma"`). |
@@ -399,9 +399,12 @@ docker run -p 8000:8000 chromadb/chroma
399
399
  - **Directory:** `path.name` (e.g. `docs`)
400
400
  - The **final** collection name is always:
401
401
 
402
- **`{base_name}_{chunking_method}`**
402
+ **`{base_name}_{chunking_method}_{chunk_config}`**
403
403
 
404
- Examples: `mydoc_token`, `docs_semantic`, `CNSF-S0043-0032-2025_CONDUSEF-005190-08_token`.
404
+ Examples:
405
+ - token: `mydoc_token_cs200_ov0_enccl100k_base`
406
+ - text: `mydoc_text_cs400_ov0`
407
+ - llm/semantic: `mydoc_llm_cs200`
405
408
 
406
409
  - If a collection with that name **already exists**, the pipeline does **not** add documents again. It logs that the collection already exists and returns something like:
407
410
 
@@ -70,7 +70,7 @@ import hecvec
70
70
 
71
71
  # Default: token chunking, Chroma at localhost:8000
72
72
  result = hecvec.Slicer.slice(path="/path/to/folder_or_file")
73
- # → {"files": N, "chunks": M, "collection": "folder_or_file_name_token"}
73
+ # → {"files": N, "chunks": M, "collection": "folder_or_file_name_token_cs200_ov0_enccl100k_base"}
74
74
 
75
75
  # Custom host/port and semantic chunking
76
76
  result = hecvec.Slicer.slice(
@@ -114,7 +114,7 @@ All of these can be passed to `Slicer(...)` or to `Slicer.slice(..., key=value)`
114
114
  |-----------|---------|-------------|
115
115
  | `path` | *(required)* | File or directory to process (`.txt`/`.md` only). |
116
116
  | `root` | `path.parent` (file) or `path` (dir) | Safe root for resolving paths (used when listing under a directory). |
117
- | `collection_name` | `"hecvec"` | Base name for the Chroma collection. If `"hecvec"`, it is replaced by the file stem or directory name; the final name is always `{collection_name}_{chunking_method}` (e.g. `mydoc_token`). |
117
+ | `collection_name` | `"hecvec"` | Base name for the Chroma collection. If `"hecvec"`, it is replaced by the file stem or directory name; the final name includes method + chunk config (so different `chunk_size`/`chunk_overlap` don’t collide). |
118
118
  | `db` | `"chroma"` | Database to use. Only `"chroma"` is supported. When `db="chroma"`, connection uses `host` and `port`. |
119
119
  | `host` | `"localhost"` | Server host (used when `db="chroma"`). Server must be listening. |
120
120
  | `port` | `8000` | Server port (used when `db="chroma"`). |
@@ -362,9 +362,12 @@ docker run -p 8000:8000 chromadb/chroma
362
362
  - **Directory:** `path.name` (e.g. `docs`)
363
363
  - The **final** collection name is always:
364
364
 
365
- **`{base_name}_{chunking_method}`**
365
+ **`{base_name}_{chunking_method}_{chunk_config}`**
366
366
 
367
- Examples: `mydoc_token`, `docs_semantic`, `CNSF-S0043-0032-2025_CONDUSEF-005190-08_token`.
367
+ Examples:
368
+ - token: `mydoc_token_cs200_ov0_enccl100k_base`
369
+ - text: `mydoc_text_cs400_ov0`
370
+ - llm/semantic: `mydoc_llm_cs200`
368
371
 
369
372
  - If a collection with that name **already exists**, the pipeline does **not** add documents again. It logs that the collection already exists and returns something like:
370
373
 
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "hecvec"
7
- version = "6.1.2"
7
+ version = "6.2.0"
8
8
  description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9,<3.14"
@@ -11,9 +11,9 @@ def main():
11
11
 
12
12
  from hecvec import Slicer
13
13
  path = Path(sys.argv[1]).expanduser().resolve() if len(sys.argv) >= 2 else (ROOT / "tests/CNSF-S0043-0032-2025_CONDUSEF-005190-08.txt")
14
- slicer = Slicer(db="chroma", host="localhosttt", port=8000)
14
+ slicer = Slicer(db="chroma", host="localhost", port=8000)
15
15
  print("hecvec slicer config:", {"db": slicer.db, "host": slicer.host, "port": slicer.port, "auth": slicer.auth})
16
- test = slicer.slice(path=path, collection_name="test", chunking_method="text")
16
+ test = slicer.slice(path=path, collection_name="test", chunking_method="token",chunk_size=400)
17
17
  print(test)
18
18
 
19
19
  collections = slicer.collections()
@@ -35,4 +35,4 @@ __all__ = [
35
35
  "__version__",
36
36
  ]
37
37
 
38
- __version__ = "6.1.2"
38
+ __version__ = "6.2.0"
@@ -221,7 +221,16 @@ class Slicer:
221
221
 
222
222
  if collection_name == "hecvec":
223
223
  collection_name = path.stem if path.is_file() else path.name
224
- collection_name = f"{collection_name}_{chunking_method}"
224
+ # Include chunking configuration in the collection name so reruns with a different
225
+ # chunk_size/chunk_overlap do not collide with an existing collection.
226
+ if chunking_method == "token":
227
+ cfg_suffix = f"cs{chunk_size}_ov{chunk_overlap}_enc{encoding_name}"
228
+ elif chunking_method == "text":
229
+ cfg_suffix = f"cs{chunk_size}_ov{chunk_overlap}"
230
+ else:
231
+ # For semantic/llm we keep the suffix minimal since advanced knobs aren't exposed on Slicer.
232
+ cfg_suffix = f"cs{chunk_size}"
233
+ collection_name = f"{collection_name}_{chunking_method}_{cfg_suffix}"
225
234
 
226
235
  logger.info("Resolved collection name: %s", collection_name)
227
236
 
@@ -599,7 +599,7 @@ wheels = [
599
599
 
600
600
  [[package]]
601
601
  name = "hecvec"
602
- version = "6.1.1"
602
+ version = "6.1.3"
603
603
  source = { editable = "." }
604
604
  dependencies = [
605
605
  { name = "chromadb" },
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes