hecvec 6.1.2__tar.gz → 6.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hecvec-6.1.2 → hecvec-6.2.0}/PKG-INFO +8 -5
- {hecvec-6.1.2 → hecvec-6.2.0}/README.md +7 -4
- {hecvec-6.1.2 → hecvec-6.2.0}/pyproject.toml +1 -1
- {hecvec-6.1.2 → hecvec-6.2.0}/scripts/test_slice.py +2 -2
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/__init__.py +1 -1
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/pipeline.py +10 -1
- {hecvec-6.1.2 → hecvec-6.2.0}/uv.lock +1 -1
- {hecvec-6.1.2 → hecvec-6.2.0}/.gitignore +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/_recursive_chunking.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/chroma_client.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/chroma_list.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/chunkers.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/chunking.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/cli.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/embeddings.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/env.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/hecvec.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/listdir.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/reading.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/run_llm_chunk.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/run_semantic_chunk.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/src/hecvec/token_splitter.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/tests/conftest.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/tests/test_env.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/tests/test_listdir.py +0 -0
- {hecvec-6.1.2 → hecvec-6.2.0}/tests/test_reading.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hecvec
|
|
3
|
-
Version: 6.1.2
|
|
3
|
+
Version: 6.2.0
|
|
4
4
|
Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
Keywords: chunking,document-pipeline,listdir,text-files
|
|
@@ -107,7 +107,7 @@ import hecvec
|
|
|
107
107
|
|
|
108
108
|
# Default: token chunking, Chroma at localhost:8000
|
|
109
109
|
result = hecvec.Slicer.slice(path="/path/to/folder_or_file")
|
|
110
|
-
# → {"files": N, "chunks": M, "collection": "
|
|
110
|
+
# → {"files": N, "chunks": M, "collection": "folder_or_file_name_token_cs200_ov0_enccl100k_base"}
|
|
111
111
|
|
|
112
112
|
# Custom host/port and semantic chunking
|
|
113
113
|
result = hecvec.Slicer.slice(
|
|
@@ -151,7 +151,7 @@ All of these can be passed to `Slicer(...)` or to `Slicer.slice(..., key=value)`
|
|
|
151
151
|
|-----------|---------|-------------|
|
|
152
152
|
| `path` | *(required)* | File or directory to process (`.txt`/`.md` only). |
|
|
153
153
|
| `root` | `path.parent` (file) or `path` (dir) | Safe root for resolving paths (used when listing under a directory). |
|
|
154
|
-
| `collection_name` | `"hecvec"` | Base name for the Chroma collection. If `"hecvec"`, it is replaced by the file stem or directory name; the final name
|
|
154
|
+
| `collection_name` | `"hecvec"` | Base name for the Chroma collection. If `"hecvec"`, it is replaced by the file stem or directory name; the final name includes method + chunk config (so different `chunk_size`/`chunk_overlap` don’t collide). |
|
|
155
155
|
| `db` | `"chroma"` | Database to use. Only `"chroma"` is supported. When `db="chroma"`, connection uses `host` and `port`. |
|
|
156
156
|
| `host` | `"localhost"` | Server host (used when `db="chroma"`). Server must be listening. |
|
|
157
157
|
| `port` | `8000` | Server port (used when `db="chroma"`). |
|
|
@@ -399,9 +399,12 @@ docker run -p 8000:8000 chromadb/chroma
|
|
|
399
399
|
- **Directory:** `path.name` (e.g. `docs`)
|
|
400
400
|
- The **final** collection name is always:
|
|
401
401
|
|
|
402
|
-
**`{base_name}_{chunking_method}`**
|
|
402
|
+
**`{base_name}_{chunking_method}_{chunk_config}`**
|
|
403
403
|
|
|
404
|
-
Examples:
|
|
404
|
+
Examples:
|
|
405
|
+
- token: `mydoc_token_cs200_ov0_enccl100k_base`
|
|
406
|
+
- text: `mydoc_text_cs400_ov0`
|
|
407
|
+
- llm/semantic: `mydoc_llm_cs200`
|
|
405
408
|
|
|
406
409
|
- If a collection with that name **already exists**, the pipeline does **not** add documents again. It logs that the collection already exists and returns something like:
|
|
407
410
|
|
|
@@ -70,7 +70,7 @@ import hecvec
|
|
|
70
70
|
|
|
71
71
|
# Default: token chunking, Chroma at localhost:8000
|
|
72
72
|
result = hecvec.Slicer.slice(path="/path/to/folder_or_file")
|
|
73
|
-
# → {"files": N, "chunks": M, "collection": "
|
|
73
|
+
# → {"files": N, "chunks": M, "collection": "folder_or_file_name_token_cs200_ov0_enccl100k_base"}
|
|
74
74
|
|
|
75
75
|
# Custom host/port and semantic chunking
|
|
76
76
|
result = hecvec.Slicer.slice(
|
|
@@ -114,7 +114,7 @@ All of these can be passed to `Slicer(...)` or to `Slicer.slice(..., key=value)`
|
|
|
114
114
|
|-----------|---------|-------------|
|
|
115
115
|
| `path` | *(required)* | File or directory to process (`.txt`/`.md` only). |
|
|
116
116
|
| `root` | `path.parent` (file) or `path` (dir) | Safe root for resolving paths (used when listing under a directory). |
|
|
117
|
-
| `collection_name` | `"hecvec"` | Base name for the Chroma collection. If `"hecvec"`, it is replaced by the file stem or directory name; the final name
|
|
117
|
+
| `collection_name` | `"hecvec"` | Base name for the Chroma collection. If `"hecvec"`, it is replaced by the file stem or directory name; the final name includes method + chunk config (so different `chunk_size`/`chunk_overlap` don’t collide). |
|
|
118
118
|
| `db` | `"chroma"` | Database to use. Only `"chroma"` is supported. When `db="chroma"`, connection uses `host` and `port`. |
|
|
119
119
|
| `host` | `"localhost"` | Server host (used when `db="chroma"`). Server must be listening. |
|
|
120
120
|
| `port` | `8000` | Server port (used when `db="chroma"`). |
|
|
@@ -362,9 +362,12 @@ docker run -p 8000:8000 chromadb/chroma
|
|
|
362
362
|
- **Directory:** `path.name` (e.g. `docs`)
|
|
363
363
|
- The **final** collection name is always:
|
|
364
364
|
|
|
365
|
-
**`{base_name}_{chunking_method}`**
|
|
365
|
+
**`{base_name}_{chunking_method}_{chunk_config}`**
|
|
366
366
|
|
|
367
|
-
Examples:
|
|
367
|
+
Examples:
|
|
368
|
+
- token: `mydoc_token_cs200_ov0_enccl100k_base`
|
|
369
|
+
- text: `mydoc_text_cs400_ov0`
|
|
370
|
+
- llm/semantic: `mydoc_llm_cs200`
|
|
368
371
|
|
|
369
372
|
- If a collection with that name **already exists**, the pipeline does **not** add documents again. It logs that the collection already exists and returns something like:
|
|
370
373
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "hecvec"
|
|
7
|
-
version = "6.1.2"
|
|
7
|
+
version = "6.2.0"
|
|
8
8
|
description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9,<3.14"
|
|
@@ -11,9 +11,9 @@ def main():
|
|
|
11
11
|
|
|
12
12
|
from hecvec import Slicer
|
|
13
13
|
path = Path(sys.argv[1]).expanduser().resolve() if len(sys.argv) >= 2 else (ROOT / "tests/CNSF-S0043-0032-2025_CONDUSEF-005190-08.txt")
|
|
14
|
-
slicer = Slicer(db="chroma", host="
|
|
14
|
+
slicer = Slicer(db="chroma", host="localhost", port=8000)
|
|
15
15
|
print("hecvec slicer config:", {"db": slicer.db, "host": slicer.host, "port": slicer.port, "auth": slicer.auth})
|
|
16
|
-
test = slicer.slice(path=path, collection_name="test", chunking_method="
|
|
16
|
+
test = slicer.slice(path=path, collection_name="test", chunking_method="token",chunk_size=400)
|
|
17
17
|
print(test)
|
|
18
18
|
|
|
19
19
|
collections = slicer.collections()
|
|
@@ -221,7 +221,16 @@ class Slicer:
|
|
|
221
221
|
|
|
222
222
|
if collection_name == "hecvec":
|
|
223
223
|
collection_name = path.stem if path.is_file() else path.name
|
|
224
|
-
|
|
224
|
+
# Include chunking configuration in the collection name so reruns with a different
|
|
225
|
+
# chunk_size/chunk_overlap do not collide with an existing collection.
|
|
226
|
+
if chunking_method == "token":
|
|
227
|
+
cfg_suffix = f"cs{chunk_size}_ov{chunk_overlap}_enc{encoding_name}"
|
|
228
|
+
elif chunking_method == "text":
|
|
229
|
+
cfg_suffix = f"cs{chunk_size}_ov{chunk_overlap}"
|
|
230
|
+
else:
|
|
231
|
+
# For semantic/llm we keep the suffix minimal since advanced knobs aren't exposed on Slicer.
|
|
232
|
+
cfg_suffix = f"cs{chunk_size}"
|
|
233
|
+
collection_name = f"{collection_name}_{chunking_method}_{cfg_suffix}"
|
|
225
234
|
|
|
226
235
|
logger.info("Resolved collection name: %s", collection_name)
|
|
227
236
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|