hecvec 0.3.0__tar.gz → 0.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {hecvec-0.3.0 → hecvec-0.4.0}/PKG-INFO +1 -1
- {hecvec-0.3.0 → hecvec-0.4.0}/pyproject.toml +1 -1
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/__init__.py +1 -1
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/pipeline.py +31 -14
- {hecvec-0.3.0 → hecvec-0.4.0}/.gitignore +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/README.md +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/scripts/test_slice.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/_recursive_chunking.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/chroma_client.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/chroma_list.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/chunkers.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/chunking.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/cli.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/embeddings.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/env.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/hecvec.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/listdir.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/reading.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/run_llm_chunk.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/run_semantic_chunk.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/token_splitter.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/tests/conftest.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/tests/test_env.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/tests/test_listdir.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/tests/test_reading.py +0 -0
- {hecvec-0.3.0 → hecvec-0.4.0}/uv.lock +0 -0
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "hecvec"
|
|
7
|
-
version = "0.3.0"
|
|
7
|
+
version = "0.4.0"
|
|
8
8
|
description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
requires-python = ">=3.9,<3.14"
|
|
@@ -6,6 +6,7 @@ from __future__ import annotations
|
|
|
6
6
|
|
|
7
7
|
import logging
|
|
8
8
|
from pathlib import Path
|
|
9
|
+
from time import perf_counter
|
|
9
10
|
from typing import Any
|
|
10
11
|
|
|
11
12
|
from hecvec.chroma_client import add_documents, get_client
|
|
@@ -110,6 +111,9 @@ class Slicer:
|
|
|
110
111
|
"""
|
|
111
112
|
_check_chroma_deps()
|
|
112
113
|
|
|
114
|
+
total_start = perf_counter()
|
|
115
|
+
logger.info("[0/5] Starting slice | path=%s | method=%s | model=%s", path, chunking_method, embedding_model)
|
|
116
|
+
|
|
113
117
|
path = Path(path).resolve()
|
|
114
118
|
if not path.exists():
|
|
115
119
|
raise ValueError(f"path does not exist: {path}")
|
|
@@ -117,6 +121,9 @@ class Slicer:
|
|
|
117
121
|
if collection_name == "hecvec":
|
|
118
122
|
collection_name = path.stem if path.is_file() else path.name
|
|
119
123
|
|
|
124
|
+
logger.info("Resolved collection name: %s", collection_name)
|
|
125
|
+
|
|
126
|
+
stage_start = perf_counter()
|
|
120
127
|
if path.is_file():
|
|
121
128
|
if path.suffix.lower() not in (".txt", ".md"):
|
|
122
129
|
raise ValueError(f"File must be .txt or .md: {path}")
|
|
@@ -139,15 +146,21 @@ class Slicer:
|
|
|
139
146
|
if len(paths) > 10:
|
|
140
147
|
logger.info(" ... y %d más", len(paths) - 10)
|
|
141
148
|
|
|
142
|
-
|
|
143
|
-
|
|
149
|
+
logger.info("[1/5] File discovery completed in %.2fs (%d file(s))", perf_counter() - stage_start, len(paths))
|
|
150
|
+
|
|
151
|
+
# 2/5 Read as text
|
|
152
|
+
stage_start = perf_counter()
|
|
153
|
+
logger.info("[2/5] Reading content from %d file(s)...", len(paths))
|
|
144
154
|
reader = ReadText(paths)
|
|
145
155
|
path_and_text = reader.read_all()
|
|
146
|
-
logger.info("
|
|
156
|
+
logger.info("[2/5] Read completed in %.2fs (%d/%d file(s) read)", perf_counter() - stage_start, len(path_and_text), len(paths))
|
|
157
|
+
if len(path_and_text) < len(paths):
|
|
158
|
+
logger.warning("Some files could not be read and were skipped (%d skipped)", len(paths) - len(path_and_text))
|
|
147
159
|
|
|
148
|
-
# 3
|
|
160
|
+
# 3/5 Chunk (token, text, semantic, or llm)
|
|
161
|
+
stage_start = perf_counter()
|
|
149
162
|
api_key = openai_api_key or load_openai_key(dotenv_path)
|
|
150
|
-
logger.info("
|
|
163
|
+
logger.info("[3/5] Chunking | method=%s | chunk_size=%d | overlap=%d", chunking_method, chunk_size, chunk_overlap)
|
|
151
164
|
ids, documents = chunk_documents_by_method(
|
|
152
165
|
path_and_text,
|
|
153
166
|
method=chunking_method,
|
|
@@ -157,33 +170,37 @@ class Slicer:
|
|
|
157
170
|
openai_api_key=api_key,
|
|
158
171
|
)
|
|
159
172
|
if not documents:
|
|
160
|
-
logger.warning("No
|
|
173
|
+
logger.warning("[3/5] No chunks generated (empty or non-text content)")
|
|
161
174
|
return {"files": len(path_and_text), "chunks": 0, "collection": collection_name}
|
|
162
175
|
|
|
163
|
-
logger.info("
|
|
176
|
+
logger.info("[3/5] Chunking completed in %.2fs (%d chunk(s))", perf_counter() - stage_start, len(documents))
|
|
164
177
|
|
|
165
|
-
# 4
|
|
178
|
+
# 4/5 Embed
|
|
166
179
|
api_key = openai_api_key or load_openai_key(dotenv_path)
|
|
167
180
|
if not api_key:
|
|
168
181
|
raise ValueError(
|
|
169
182
|
"OPENAI_API_KEY required for embeddings. "
|
|
170
183
|
"Set it in .env, pass openai_api_key=, or set the OPENAI_API_KEY env var."
|
|
171
184
|
)
|
|
172
|
-
|
|
185
|
+
stage_start = perf_counter()
|
|
186
|
+
logger.info("[4/5] Generating embeddings | model=%s | batch_size=%d | chunk_count=%d", embedding_model, batch_size, len(documents))
|
|
173
187
|
embeddings = embed_texts(
|
|
174
188
|
documents,
|
|
175
189
|
api_key=api_key,
|
|
176
190
|
model=embedding_model,
|
|
177
191
|
batch_size=batch_size,
|
|
178
192
|
)
|
|
179
|
-
logger.info("Embeddings
|
|
193
|
+
logger.info("[4/5] Embeddings completed in %.2fs (%d vector(s))", perf_counter() - stage_start, len(embeddings))
|
|
180
194
|
|
|
181
|
-
# 5
|
|
182
|
-
|
|
195
|
+
# 5/5 Push to Chroma
|
|
196
|
+
stage_start = perf_counter()
|
|
197
|
+
logger.info("[5/5] Writing to Chroma | host=%s | port=%s | collection=%s", chroma_host, chroma_port, collection_name)
|
|
183
198
|
client = get_client(host=chroma_host, port=chroma_port)
|
|
184
|
-
logger.info("
|
|
199
|
+
logger.info("[5/5] Adding %d document chunk(s) to collection...", len(documents))
|
|
185
200
|
add_documents(client, collection_name, ids, embeddings, documents)
|
|
186
|
-
logger.info("
|
|
201
|
+
logger.info("[5/5] Chroma write completed in %.2fs", perf_counter() - stage_start)
|
|
202
|
+
|
|
203
|
+
logger.info("Slice finished in %.2fs | files=%d | chunks=%d | collection=%s", perf_counter() - total_start, len(path_and_text), len(documents), collection_name)
|
|
187
204
|
|
|
188
205
|
return {
|
|
189
206
|
"files": len(path_and_text),
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|