hecvec 6.2.0__tar.gz → 6.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hecvec-6.2.0 → hecvec-6.3.0}/PKG-INFO +9 -8
- {hecvec-6.2.0 → hecvec-6.3.0}/README.md +8 -7
- {hecvec-6.2.0 → hecvec-6.3.0}/pyproject.toml +1 -1
- {hecvec-6.2.0 → hecvec-6.3.0}/scripts/test_slice.py +1 -1
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/__init__.py +1 -1
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/pipeline.py +25 -20
- {hecvec-6.2.0 → hecvec-6.3.0}/uv.lock +1 -1
- {hecvec-6.2.0 → hecvec-6.3.0}/.gitignore +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/_recursive_chunking.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/chroma_client.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/chroma_list.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/chunkers.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/chunking.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/cli.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/embeddings.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/env.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/hecvec.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/listdir.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/reading.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/run_llm_chunk.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/run_semantic_chunk.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/src/hecvec/token_splitter.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/tests/conftest.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/tests/test_env.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/tests/test_listdir.py +0 -0
- {hecvec-6.2.0 → hecvec-6.3.0}/tests/test_reading.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: hecvec
|
|
3
|
-
Version: 6.2.0
|
|
3
|
+
Version: 6.3.0
|
|
4
4
|
Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
|
|
5
5
|
License-Expression: MIT
|
|
6
6
|
Keywords: chunking,document-pipeline,listdir,text-files
|
|
@@ -62,7 +62,7 @@ Description-Content-Type: text/markdown
|
|
|
62
62
|
|
|
63
63
|
## Install
|
|
64
64
|
|
|
65
|
-
**Full pipeline** (list → read → chunk → embed → Chroma):
|
|
65
|
+
**Full pipeline** (list → verify Chroma is up → read → chunk → embed → Chroma):
|
|
66
66
|
|
|
67
67
|
```bash
|
|
68
68
|
pip install hecvec
|
|
@@ -85,18 +85,19 @@ To use the full `Slicer.slice(...)` pipeline you need:
|
|
|
85
85
|
|
|
86
86
|
## Workflow
|
|
87
87
|
|
|
88
|
-
The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs
|
|
88
|
+
The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs six logged steps:
|
|
89
89
|
|
|
90
90
|
| Step | Description |
|
|
91
91
|
|------|-------------|
|
|
92
92
|
| **0** | Resolve path, resolve collection name (`base_name` + `_` + `chunking_method`). |
|
|
93
93
|
| **1** | Discover files: single `.txt`/`.md` file or recursive list under a directory. |
|
|
94
|
-
| **2** |
|
|
95
|
-
| **3** |
|
|
96
|
-
| **4** |
|
|
97
|
-
| **5** |
|
|
94
|
+
| **2** | **Chroma server check:** connect to the server and fail fast if nothing is listening (before read/chunk/embed so you don’t pay for OpenAI when Chroma is down). The client is reused for the final write. |
|
|
95
|
+
| **3** | Read file contents as text (UTF-8 with fallbacks). |
|
|
96
|
+
| **4** | Chunk using the chosen method (`token`, `text`, `semantic`, or `llm`). |
|
|
97
|
+
| **5** | Generate embeddings with OpenAI. |
|
|
98
|
+
| **6** | Connect (already verified in step 2), list collections; if the collection **already exists**, skip adding. Otherwise create the collection and add documents. |
|
|
98
99
|
|
|
99
|
-
Progress is logged as `[0/5]` … `[5/5]`.
|
|
100
|
+
Progress is logged as `[0/6]` … `[6/6]` with timings. If the collection already exists, the log states that clearly after embeddings and no new documents are added.
|
|
100
101
|
|
|
101
102
|
---
|
|
102
103
|
|
|
@@ -25,7 +25,7 @@
|
|
|
25
25
|
|
|
26
26
|
## Install
|
|
27
27
|
|
|
28
|
-
**Full pipeline** (list → read → chunk → embed → Chroma):
|
|
28
|
+
**Full pipeline** (list → verify Chroma is up → read → chunk → embed → Chroma):
|
|
29
29
|
|
|
30
30
|
```bash
|
|
31
31
|
pip install hecvec
|
|
@@ -48,18 +48,19 @@ To use the full `Slicer.slice(...)` pipeline you need:
|
|
|
48
48
|
|
|
49
49
|
## Workflow
|
|
50
50
|
|
|
51
|
-
The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs
|
|
51
|
+
The main entry point is `Slicer.slice(path=..., **kwargs)`. It runs six logged steps:
|
|
52
52
|
|
|
53
53
|
| Step | Description |
|
|
54
54
|
|------|-------------|
|
|
55
55
|
| **0** | Resolve path, resolve collection name (`base_name` + `_` + `chunking_method`). |
|
|
56
56
|
| **1** | Discover files: single `.txt`/`.md` file or recursive list under a directory. |
|
|
57
|
-
| **2** |
|
|
58
|
-
| **3** |
|
|
59
|
-
| **4** |
|
|
60
|
-
| **5** |
|
|
57
|
+
| **2** | **Chroma server check:** connect to the server and fail fast if nothing is listening (before read/chunk/embed so you don’t pay for OpenAI when Chroma is down). The client is reused for the final write. |
|
|
58
|
+
| **3** | Read file contents as text (UTF-8 with fallbacks). |
|
|
59
|
+
| **4** | Chunk using the chosen method (`token`, `text`, `semantic`, or `llm`). |
|
|
60
|
+
| **5** | Generate embeddings with OpenAI. |
|
|
61
|
+
| **6** | Connect (already verified in step 2), list collections; if the collection **already exists**, skip adding. Otherwise create the collection and add documents. |
|
|
61
62
|
|
|
62
|
-
Progress is logged as `[0/5]` … `[5/5]`.
|
|
63
|
+
Progress is logged as `[0/6]` … `[6/6]` with timings. If the collection already exists, the log states that clearly after embeddings and no new documents are added.
|
|
63
64
|
|
|
64
65
|
---
|
|
65
66
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "hecvec"
|
|
7
|
-
version = "6.2.0"
|
|
7
|
+
version = "6.3.0"
|
|
8
8
|
description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9,<3.14"
|
|
@@ -13,7 +13,7 @@ def main():
|
|
|
13
13
|
path = Path(sys.argv[1]).expanduser().resolve() if len(sys.argv) >= 2 else (ROOT / "tests/CNSF-S0043-0032-2025_CONDUSEF-005190-08.txt")
|
|
14
14
|
slicer = Slicer(db="chroma", host="localhost", port=8000)
|
|
15
15
|
print("hecvec slicer config:", {"db": slicer.db, "host": slicer.host, "port": slicer.port, "auth": slicer.auth})
|
|
16
|
-
test = slicer.slice(path=path, collection_name="test", chunking_method="token",chunk_size=400)
|
|
16
|
+
test = slicer.slice(path=path, collection_name="test", chunking_method="token",chunk_size=400, )
|
|
17
17
|
print(test)
|
|
18
18
|
|
|
19
19
|
collections = slicer.collections()
|
|
@@ -213,7 +213,7 @@ class Slicer:
|
|
|
213
213
|
_ensure_slice_logging()
|
|
214
214
|
|
|
215
215
|
total_start = perf_counter()
|
|
216
|
-
logger.info("[0/
|
|
216
|
+
logger.info("[0/6] Starting slice | path=%s | method=%s | model=%s", path, chunking_method, embedding_model)
|
|
217
217
|
|
|
218
218
|
path = Path(path).resolve()
|
|
219
219
|
if not path.exists():
|
|
@@ -257,21 +257,28 @@ class Slicer:
|
|
|
257
257
|
if len(paths) > 10:
|
|
258
258
|
logger.info(" ... and %d more", len(paths) - 10)
|
|
259
259
|
|
|
260
|
-
logger.info("[1/
|
|
260
|
+
logger.info("[1/6] File discovery completed in %.2fs (%d file(s))", perf_counter() - stage_start, len(paths))
|
|
261
261
|
|
|
262
|
-
# 2/
|
|
262
|
+
# 2/6 Chroma server check: fail fast if nothing is listening (before read/chunk/embed to save time & API cost).
|
|
263
|
+
# Collection "already exists" is still handled at the end, after embeddings, unchanged.
|
|
263
264
|
stage_start = perf_counter()
|
|
264
|
-
logger.info("[2/
|
|
265
|
+
logger.info("[2/6] Chroma server check | db=%s | host=%s | port=%s", db, host, port)
|
|
266
|
+
client = _get_db_client(db, host=host, port=port, auth=auth)
|
|
267
|
+
logger.info("[2/6] Server reachable at %s:%s in %.2fs (client kept for final write)", host, port, perf_counter() - stage_start)
|
|
268
|
+
|
|
269
|
+
# 3/6 Read as text
|
|
270
|
+
stage_start = perf_counter()
|
|
271
|
+
logger.info("[3/6] Reading content from %d file(s)...", len(paths))
|
|
265
272
|
reader = ReadText(paths)
|
|
266
273
|
path_and_text = reader.read_all()
|
|
267
|
-
logger.info("[
|
|
274
|
+
logger.info("[3/6] Read completed in %.2fs (%d/%d file(s) read)", perf_counter() - stage_start, len(path_and_text), len(paths))
|
|
268
275
|
if len(path_and_text) < len(paths):
|
|
269
276
|
logger.warning("Some files could not be read and were skipped (%d skipped)", len(paths) - len(path_and_text))
|
|
270
277
|
|
|
271
|
-
#
|
|
278
|
+
# 4/6 Chunk (token, text, semantic, or llm)
|
|
272
279
|
stage_start = perf_counter()
|
|
273
280
|
api_key = openai_api_key or load_openai_key(dotenv_path)
|
|
274
|
-
logger.info("[
|
|
281
|
+
logger.info("[4/6] Chunking | method=%s | chunk_size=%d | overlap=%d", chunking_method, chunk_size, chunk_overlap)
|
|
275
282
|
ids, documents = chunk_documents_by_method(
|
|
276
283
|
path_and_text,
|
|
277
284
|
method=chunking_method,
|
|
@@ -281,12 +288,12 @@ class Slicer:
|
|
|
281
288
|
openai_api_key=api_key,
|
|
282
289
|
)
|
|
283
290
|
if not documents:
|
|
284
|
-
logger.warning("[
|
|
291
|
+
logger.warning("[4/6] No chunks generated (empty or non-text content)")
|
|
285
292
|
return {"files": len(path_and_text), "chunks": 0, "collection": collection_name}
|
|
286
293
|
|
|
287
|
-
logger.info("[
|
|
294
|
+
logger.info("[4/6] Chunking completed in %.2fs (%d chunk(s))", perf_counter() - stage_start, len(documents))
|
|
288
295
|
|
|
289
|
-
#
|
|
296
|
+
# 5/6 Embed
|
|
290
297
|
api_key = openai_api_key or load_openai_key(dotenv_path)
|
|
291
298
|
if not api_key:
|
|
292
299
|
raise ValueError(
|
|
@@ -294,23 +301,21 @@ class Slicer:
|
|
|
294
301
|
"Set it in .env, pass openai_api_key=, or set the OPENAI_API_KEY env var."
|
|
295
302
|
)
|
|
296
303
|
stage_start = perf_counter()
|
|
297
|
-
logger.info("[
|
|
304
|
+
logger.info("[5/6] Generating embeddings | model=%s | batch_size=%d | chunk_count=%d", embedding_model, batch_size, len(documents))
|
|
298
305
|
embeddings = embed_texts(
|
|
299
306
|
documents,
|
|
300
307
|
api_key=api_key,
|
|
301
308
|
model=embedding_model,
|
|
302
309
|
batch_size=batch_size,
|
|
303
310
|
)
|
|
304
|
-
logger.info("[
|
|
311
|
+
logger.info("[5/6] Embeddings completed in %.2fs (%d vector(s))", perf_counter() - stage_start, len(embeddings))
|
|
305
312
|
|
|
306
|
-
#
|
|
313
|
+
# 6/6 Push to Chroma (reuse client from server check; if collection exists, skip add — same as before)
|
|
307
314
|
stage_start = perf_counter()
|
|
308
|
-
logger.info("[
|
|
309
|
-
client = _get_db_client(db, host=host, port=port, auth=auth)
|
|
310
|
-
logger.info("[5/5] Connected to %s at %s:%s", db, host, port)
|
|
315
|
+
logger.info("[6/6] db=%s | host=%s | port=%s | collection=%s", db, host, port, collection_name)
|
|
311
316
|
existing_names = [c.name for c in client.list_collections()]
|
|
312
317
|
if collection_name in existing_names:
|
|
313
|
-
logger.info("[
|
|
318
|
+
logger.info("[6/6] Collection %r already exists; skipping (no new documents added).", collection_name)
|
|
314
319
|
logger.info("Slice finished in %.2fs | files=%d | chunks=0 | collection=%s | (skipped: already exists)", perf_counter() - total_start, len(path_and_text), collection_name)
|
|
315
320
|
return {
|
|
316
321
|
"files": len(path_and_text),
|
|
@@ -318,10 +323,10 @@ class Slicer:
|
|
|
318
323
|
"collection": collection_name,
|
|
319
324
|
"message": "Collection already exists; no documents added.",
|
|
320
325
|
}
|
|
321
|
-
logger.info("[
|
|
326
|
+
logger.info("[6/6] Adding %d document chunk(s) to collection...", len(documents))
|
|
322
327
|
add_documents(client, collection_name, ids, embeddings, documents)
|
|
323
|
-
logger.info("[
|
|
324
|
-
logger.info("[
|
|
328
|
+
logger.info("[6/6] Collection %r created (new)", collection_name)
|
|
329
|
+
logger.info("[6/6] Chroma write completed in %.2fs", perf_counter() - stage_start)
|
|
325
330
|
|
|
326
331
|
logger.info("Slice finished in %.2fs | files=%d | chunks=%d | collection=%s", perf_counter() - total_start, len(path_and_text), len(documents), collection_name)
|
|
327
332
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|