hecvec 6.2.0__tar.gz → 6.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hecvec-6.2.0 → hecvec-6.4.0}/PKG-INFO +9 -8
- {hecvec-6.2.0 → hecvec-6.4.0}/README.md +8 -7
- {hecvec-6.2.0 → hecvec-6.4.0}/pyproject.toml +1 -1
- {hecvec-6.2.0 → hecvec-6.4.0}/scripts/test_slice.py +1 -1
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/__init__.py +1 -1
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/chroma_client.py +6 -2
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/pipeline.py +57 -34
- {hecvec-6.2.0 → hecvec-6.4.0}/uv.lock +1 -1
- {hecvec-6.2.0 → hecvec-6.4.0}/.gitignore +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/_recursive_chunking.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/chroma_list.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/chunkers.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/chunking.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/cli.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/embeddings.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/env.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/hecvec.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/listdir.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/reading.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/run_llm_chunk.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/run_semantic_chunk.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/src/hecvec/token_splitter.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/tests/conftest.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/tests/test_env.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/tests/test_listdir.py +0 -0
- {hecvec-6.2.0 → hecvec-6.4.0}/tests/test_reading.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hecvec
|
|
3
|
-
Version: 6.2.0
|
|
3
|
+
Version: 6.4.0
|
|
4
4
|
Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
Keywords: chunking,document-pipeline,listdir,text-files
|
|
@@ -62,7 +62,7 @@ Description-Content-Type: text/markdown
|
|
|
62
62
|
|
|
63
63
|
## Install
|
|
64
64
|
|
|
65
|
-
**Full pipeline** (list → read → chunk → embed → Chroma):
|
|
65
|
+
**Full pipeline** (list → verify Chroma is up → read → chunk → embed → Chroma):
|
|
66
66
|
|
|
67
67
|
```bash
|
|
68
68
|
pip install hecvec
|
|
@@ -85,18 +85,19 @@ To use the full `Slicer.slice(...)` pipeline you need:
|
|
|
85
85
|
|
|
86
86
|
## Workflow
|
|
87
87
|
|
|
88
|
-
The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs
|
|
88
|
+
The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs six logged steps:
|
|
89
89
|
|
|
90
90
|
| Step | Description |
|
|
91
91
|
|------|-------------|
|
|
92
92
|
| **0** | Resolve path, resolve collection name (`base_name` + `_` + `chunking_method`). |
|
|
93
93
|
| **1** | Discover files: single `.txt`/`.md` file or recursive list under a directory. |
|
|
94
|
-
| **2** |
|
|
95
|
-
| **3** |
|
|
96
|
-
| **4** |
|
|
97
|
-
| **5** |
|
|
94
|
+
| **2** | **Chroma server check:** connect to the server and fail fast if nothing is listening (before read/chunk/embed so you don’t pay for OpenAI when Chroma is down). The client is reused for the final write. |
|
|
95
|
+
| **3** | Read file contents as text (UTF-8 with fallbacks). |
|
|
96
|
+
| **4** | Chunk using the chosen method (`token`, `text`, `semantic`, or `llm`). |
|
|
97
|
+
| **5** | Generate embeddings with OpenAI. |
|
|
98
|
+
| **6** | Write to Chroma using the client verified in step 2: if the collection **already exists**, the new chunks are appended to it; otherwise the collection is created and populated. |
|
|
98
99
|
|
|
99
|
-
Progress is logged as `[0/
|
|
100
|
+
Progress is logged as `[0/6]` … `[6/6]` with timings. If the collection already exists, the log states clearly after the write that the new chunks were appended to it.
|
|
100
101
|
|
|
101
102
|
---
|
|
102
103
|
|
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
|
|
26
26
|
## Install
|
|
27
27
|
|
|
28
|
-
**Full pipeline** (list → read → chunk → embed → Chroma):
|
|
28
|
+
**Full pipeline** (list → verify Chroma is up → read → chunk → embed → Chroma):
|
|
29
29
|
|
|
30
30
|
```bash
|
|
31
31
|
pip install hecvec
|
|
@@ -48,18 +48,19 @@ To use the full `Slicer.slice(...)` pipeline you need:
|
|
|
48
48
|
|
|
49
49
|
## Workflow
|
|
50
50
|
|
|
51
|
-
The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs
|
|
51
|
+
The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs six logged steps:
|
|
52
52
|
|
|
53
53
|
| Step | Description |
|
|
54
54
|
|------|-------------|
|
|
55
55
|
| **0** | Resolve path, resolve collection name (`base_name` + `_` + `chunking_method`). |
|
|
56
56
|
| **1** | Discover files: single `.txt`/`.md` file or recursive list under a directory. |
|
|
57
|
-
| **2** |
|
|
58
|
-
| **3** |
|
|
59
|
-
| **4** |
|
|
60
|
-
| **5** |
|
|
57
|
+
| **2** | **Chroma server check:** connect to the server and fail fast if nothing is listening (before read/chunk/embed so you don’t pay for OpenAI when Chroma is down). The client is reused for the final write. |
|
|
58
|
+
| **3** | Read file contents as text (UTF-8 with fallbacks). |
|
|
59
|
+
| **4** | Chunk using the chosen method (`token`, `text`, `semantic`, or `llm`). |
|
|
60
|
+
| **5** | Generate embeddings with OpenAI. |
|
|
61
|
+
| **6** | Write to Chroma using the client verified in step 2: if the collection **already exists**, the new chunks are appended to it; otherwise the collection is created and populated. |
|
|
61
62
|
|
|
62
|
-
Progress is logged as `[0/
|
|
63
|
+
Progress is logged as `[0/6]` … `[6/6]` with timings. If the collection already exists, the log states clearly after the write that the new chunks were appended to it.
|
|
63
64
|
|
|
64
65
|
---
|
|
65
66
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "hecvec"
|
|
7
|
-
version = "6.2.0"
|
|
7
|
+
version = "6.4.0"
|
|
8
8
|
description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9,<3.14"
|
|
@@ -13,7 +13,7 @@ def main():
|
|
|
13
13
|
path = Path(sys.argv[1]).expanduser().resolve() if len(sys.argv) >= 2 else (ROOT / "tests/CNSF-S0043-0032-2025_CONDUSEF-005190-08.txt")
|
|
14
14
|
slicer = Slicer(db="chroma", host="localhost", port=8000)
|
|
15
15
|
print("hecvec slicer config:", {"db": slicer.db, "host": slicer.host, "port": slicer.port, "auth": slicer.auth})
|
|
16
|
-
test = slicer.slice(path=path, collection_name="
|
|
16
|
+
test = slicer.slice(path=path, collection_name="concatenated", chunking_method="token",chunk_size=400, )
|
|
17
17
|
print(test)
|
|
18
18
|
|
|
19
19
|
collections = slicer.collections()
|
|
@@ -6,6 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
import logging
|
|
8
8
|
import socket
|
|
9
|
+
import uuid
|
|
9
10
|
from typing import TYPE_CHECKING
|
|
10
11
|
|
|
11
12
|
if TYPE_CHECKING:
|
|
@@ -97,8 +98,11 @@ def add_documents(
|
|
|
97
98
|
existing_names = [c.name for c in client.list_collections()]
|
|
98
99
|
collection_existed = collection_name in existing_names
|
|
99
100
|
coll = get_or_create_collection(client, collection_name)
|
|
101
|
+
# IDs produced by chunking restart at chunk_0 for each run; namespace by write to avoid collisions.
|
|
102
|
+
write_prefix = uuid.uuid4().hex
|
|
103
|
+
unique_ids = [f"{write_prefix}:{chunk_id}" for chunk_id in ids]
|
|
100
104
|
try:
|
|
101
|
-
coll.add(ids=
|
|
105
|
+
coll.add(ids=unique_ids, embeddings=embeddings, documents=documents)
|
|
102
106
|
except chromadb.errors.InvalidArgumentError as e:
|
|
103
107
|
if "dimension" not in str(e).lower():
|
|
104
108
|
raise
|
|
@@ -107,6 +111,6 @@ def add_documents(
|
|
|
107
111
|
name=collection_name,
|
|
108
112
|
metadata={"hnsw:space": "cosine"},
|
|
109
113
|
)
|
|
110
|
-
coll.add(ids=
|
|
114
|
+
coll.add(ids=unique_ids, embeddings=embeddings, documents=documents)
|
|
111
115
|
collection_existed = False # we recreated it
|
|
112
116
|
return {"collection_existed": collection_existed}
|
|
@@ -166,6 +166,7 @@ class Slicer:
|
|
|
166
166
|
"""
|
|
167
167
|
Run the full pipeline: find path → listdir → filter .txt/.md → chunk → push to db.
|
|
168
168
|
db='chroma' only; connection uses host and port. Server must be listening.
|
|
169
|
+
If the resolved collection name already exists, new chunks are appended to it (not skipped).
|
|
169
170
|
"""
|
|
170
171
|
return cls._slice_impl(
|
|
171
172
|
path,
|
|
@@ -213,7 +214,7 @@ class Slicer:
|
|
|
213
214
|
_ensure_slice_logging()
|
|
214
215
|
|
|
215
216
|
total_start = perf_counter()
|
|
216
|
-
logger.info("[0/
|
|
217
|
+
logger.info("[0/6] Starting slice | path=%s | method=%s | model=%s", path, chunking_method, embedding_model)
|
|
217
218
|
|
|
218
219
|
path = Path(path).resolve()
|
|
219
220
|
if not path.exists():
|
|
@@ -257,21 +258,28 @@ class Slicer:
|
|
|
257
258
|
if len(paths) > 10:
|
|
258
259
|
logger.info(" ... and %d more", len(paths) - 10)
|
|
259
260
|
|
|
260
|
-
logger.info("[1/
|
|
261
|
+
logger.info("[1/6] File discovery completed in %.2fs (%d file(s))", perf_counter() - stage_start, len(paths))
|
|
261
262
|
|
|
262
|
-
# 2/
|
|
263
|
+
# 2/6 Chroma server check: fail fast if nothing is listening (before read/chunk/embed to save time & API cost).
|
|
264
|
+
# Step 6 always writes: if the collection name already exists, new chunks are appended (concatenated).
|
|
263
265
|
stage_start = perf_counter()
|
|
264
|
-
logger.info("[2/
|
|
266
|
+
logger.info("[2/6] Chroma server check | db=%s | host=%s | port=%s", db, host, port)
|
|
267
|
+
client = _get_db_client(db, host=host, port=port, auth=auth)
|
|
268
|
+
logger.info("[2/6] Server reachable at %s:%s in %.2fs (client kept for final write)", host, port, perf_counter() - stage_start)
|
|
269
|
+
|
|
270
|
+
# 3/6 Read as text
|
|
271
|
+
stage_start = perf_counter()
|
|
272
|
+
logger.info("[3/6] Reading content from %d file(s)...", len(paths))
|
|
265
273
|
reader = ReadText(paths)
|
|
266
274
|
path_and_text = reader.read_all()
|
|
267
|
-
logger.info("[
|
|
275
|
+
logger.info("[3/6] Read completed in %.2fs (%d/%d file(s) read)", perf_counter() - stage_start, len(path_and_text), len(paths))
|
|
268
276
|
if len(path_and_text) < len(paths):
|
|
269
277
|
logger.warning("Some files could not be read and were skipped (%d skipped)", len(paths) - len(path_and_text))
|
|
270
278
|
|
|
271
|
-
#
|
|
279
|
+
# 4/6 Chunk (token, text, semantic, or llm)
|
|
272
280
|
stage_start = perf_counter()
|
|
273
281
|
api_key = openai_api_key or load_openai_key(dotenv_path)
|
|
274
|
-
logger.info("[
|
|
282
|
+
logger.info("[4/6] Chunking | method=%s | chunk_size=%d | overlap=%d", chunking_method, chunk_size, chunk_overlap)
|
|
275
283
|
ids, documents = chunk_documents_by_method(
|
|
276
284
|
path_and_text,
|
|
277
285
|
method=chunking_method,
|
|
@@ -281,12 +289,12 @@ class Slicer:
|
|
|
281
289
|
openai_api_key=api_key,
|
|
282
290
|
)
|
|
283
291
|
if not documents:
|
|
284
|
-
logger.warning("[
|
|
292
|
+
logger.warning("[4/6] No chunks generated (empty or non-text content)")
|
|
285
293
|
return {"files": len(path_and_text), "chunks": 0, "collection": collection_name}
|
|
286
294
|
|
|
287
|
-
logger.info("[
|
|
295
|
+
logger.info("[4/6] Chunking completed in %.2fs (%d chunk(s))", perf_counter() - stage_start, len(documents))
|
|
288
296
|
|
|
289
|
-
#
|
|
297
|
+
# 5/6 Embed
|
|
290
298
|
api_key = openai_api_key or load_openai_key(dotenv_path)
|
|
291
299
|
if not api_key:
|
|
292
300
|
raise ValueError(
|
|
@@ -294,42 +302,57 @@ class Slicer:
|
|
|
294
302
|
"Set it in .env, pass openai_api_key=, or set the OPENAI_API_KEY env var."
|
|
295
303
|
)
|
|
296
304
|
stage_start = perf_counter()
|
|
297
|
-
logger.info("[
|
|
305
|
+
logger.info("[5/6] Generating embeddings | model=%s | batch_size=%d | chunk_count=%d", embedding_model, batch_size, len(documents))
|
|
298
306
|
embeddings = embed_texts(
|
|
299
307
|
documents,
|
|
300
308
|
api_key=api_key,
|
|
301
309
|
model=embedding_model,
|
|
302
310
|
batch_size=batch_size,
|
|
303
311
|
)
|
|
304
|
-
logger.info("[
|
|
312
|
+
logger.info("[5/6] Embeddings completed in %.2fs (%d vector(s))", perf_counter() - stage_start, len(embeddings))
|
|
305
313
|
|
|
306
|
-
#
|
|
314
|
+
# 6/6 Push to Chroma (reuse client from server check). Always add: append into an existing collection if the name is already in use.
|
|
307
315
|
stage_start = perf_counter()
|
|
308
|
-
logger.info("[
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
316
|
+
logger.info("[6/6] db=%s | host=%s | port=%s | collection=%s", db, host, port, collection_name)
|
|
317
|
+
logger.info("[6/6] Writing %d document chunk(s) to collection...", len(documents))
|
|
318
|
+
add_result = add_documents(client, collection_name, ids, embeddings, documents)
|
|
319
|
+
if add_result["collection_existed"]:
|
|
320
|
+
logger.info(
|
|
321
|
+
"[6/6] Collection %r already existed — concatenating: appending these chunks to that collection (same name).",
|
|
322
|
+
collection_name,
|
|
323
|
+
)
|
|
324
|
+
else:
|
|
325
|
+
logger.info("[6/6] Collection %r did not exist before this run; created and populated.", collection_name)
|
|
326
|
+
logger.info("[6/6] Chroma write completed in %.2fs", perf_counter() - stage_start)
|
|
327
|
+
|
|
328
|
+
if add_result["collection_existed"]:
|
|
329
|
+
logger.info(
|
|
330
|
+
"Slice finished in %.2fs | files=%d | chunks=%d | collection=%s | appended to existing collection",
|
|
331
|
+
perf_counter() - total_start,
|
|
332
|
+
len(path_and_text),
|
|
333
|
+
len(documents),
|
|
334
|
+
collection_name,
|
|
335
|
+
)
|
|
336
|
+
else:
|
|
337
|
+
logger.info(
|
|
338
|
+
"Slice finished in %.2fs | files=%d | chunks=%d | collection=%s",
|
|
339
|
+
perf_counter() - total_start,
|
|
340
|
+
len(path_and_text),
|
|
341
|
+
len(documents),
|
|
342
|
+
collection_name,
|
|
343
|
+
)
|
|
344
|
+
|
|
345
|
+
out: dict[str, Any] = {
|
|
329
346
|
"files": len(path_and_text),
|
|
330
347
|
"chunks": len(documents),
|
|
331
348
|
"collection": collection_name,
|
|
349
|
+
"appended_to_existing": add_result["collection_existed"],
|
|
332
350
|
}
|
|
351
|
+
if add_result["collection_existed"]:
|
|
352
|
+
out["message"] = (
|
|
353
|
+
f"Collection {collection_name!r} already existed; appended {len(documents)} chunk(s) (concatenated)."
|
|
354
|
+
)
|
|
355
|
+
return out
|
|
333
356
|
|
|
334
357
|
@classmethod
|
|
335
358
|
def collections_server(
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|