hecvec 0.3.0__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (26)
  1. {hecvec-0.3.0 → hecvec-0.4.0}/PKG-INFO +1 -1
  2. {hecvec-0.3.0 → hecvec-0.4.0}/pyproject.toml +1 -1
  3. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/__init__.py +1 -1
  4. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/pipeline.py +31 -14
  5. {hecvec-0.3.0 → hecvec-0.4.0}/.gitignore +0 -0
  6. {hecvec-0.3.0 → hecvec-0.4.0}/README.md +0 -0
  7. {hecvec-0.3.0 → hecvec-0.4.0}/scripts/test_slice.py +0 -0
  8. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/_recursive_chunking.py +0 -0
  9. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/chroma_client.py +0 -0
  10. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/chroma_list.py +0 -0
  11. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/chunkers.py +0 -0
  12. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/chunking.py +0 -0
  13. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/cli.py +0 -0
  14. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/embeddings.py +0 -0
  15. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/env.py +0 -0
  16. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/hecvec.py +0 -0
  17. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/listdir.py +0 -0
  18. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/reading.py +0 -0
  19. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/run_llm_chunk.py +0 -0
  20. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/run_semantic_chunk.py +0 -0
  21. {hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/token_splitter.py +0 -0
  22. {hecvec-0.3.0 → hecvec-0.4.0}/tests/conftest.py +0 -0
  23. {hecvec-0.3.0 → hecvec-0.4.0}/tests/test_env.py +0 -0
  24. {hecvec-0.3.0 → hecvec-0.4.0}/tests/test_listdir.py +0 -0
  25. {hecvec-0.3.0 → hecvec-0.4.0}/tests/test_reading.py +0 -0
  26. {hecvec-0.3.0 → hecvec-0.4.0}/uv.lock +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: hecvec
3
- Version: 0.3.0
3
+ Version: 0.4.0
4
4
  Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
5
5
  License-Expression: MIT
6
6
  Keywords: chunking,document-pipeline,listdir,text-files
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "hecvec"
7
- version = "0.3.0"
7
+ version = "0.4.0"
8
8
  description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
9
9
  readme = "README.md"
10
10
  requires-python = ">=3.9,<3.14"
@@ -34,4 +34,4 @@ __all__ = [
34
34
  "__version__",
35
35
  ]
36
36
 
37
- __version__ = "0.3.0"
37
+ __version__ = "0.4.0"
@@ -6,6 +6,7 @@ from __future__ import annotations
6
6
 
7
7
  import logging
8
8
  from pathlib import Path
9
+ from time import perf_counter
9
10
  from typing import Any
10
11
 
11
12
  from hecvec.chroma_client import add_documents, get_client
@@ -110,6 +111,9 @@ class Slicer:
110
111
  """
111
112
  _check_chroma_deps()
112
113
 
114
+ total_start = perf_counter()
115
+ logger.info("[0/5] Starting slice | path=%s | method=%s | model=%s", path, chunking_method, embedding_model)
116
+
113
117
  path = Path(path).resolve()
114
118
  if not path.exists():
115
119
  raise ValueError(f"path does not exist: {path}")
@@ -117,6 +121,9 @@ class Slicer:
117
121
  if collection_name == "hecvec":
118
122
  collection_name = path.stem if path.is_file() else path.name
119
123
 
124
+ logger.info("Resolved collection name: %s", collection_name)
125
+
126
+ stage_start = perf_counter()
120
127
  if path.is_file():
121
128
  if path.suffix.lower() not in (".txt", ".md"):
122
129
  raise ValueError(f"File must be .txt or .md: {path}")
@@ -139,15 +146,21 @@ class Slicer:
139
146
  if len(paths) > 10:
140
147
  logger.info(" ... y %d más", len(paths) - 10)
141
148
 
142
- # 1. (paths already set above) 2. Read as text
143
- logger.info("Leyendo contenido de %d archivos...", len(paths))
149
+ logger.info("[1/5] File discovery completed in %.2fs (%d file(s))", perf_counter() - stage_start, len(paths))
150
+
151
+ # 2/5 Read as text
152
+ stage_start = perf_counter()
153
+ logger.info("[2/5] Reading content from %d file(s)...", len(paths))
144
154
  reader = ReadText(paths)
145
155
  path_and_text = reader.read_all()
146
- logger.info("Leídos %d archivos correctamente", len(path_and_text))
156
+ logger.info("[2/5] Read completed in %.2fs (%d/%d file(s) read)", perf_counter() - stage_start, len(path_and_text), len(paths))
157
+ if len(path_and_text) < len(paths):
158
+ logger.warning("Some files could not be read and were skipped (%d skipped)", len(paths) - len(path_and_text))
147
159
 
148
- # 3. Chunk (token, text, semantic, or llm)
160
+ # 3/5 Chunk (token, text, semantic, or llm)
161
+ stage_start = perf_counter()
149
162
  api_key = openai_api_key or load_openai_key(dotenv_path)
150
- logger.info("Fragmentando con method=%s, chunk_size=%d, chunk_overlap=%d...", chunking_method, chunk_size, chunk_overlap)
163
+ logger.info("[3/5] Chunking | method=%s | chunk_size=%d | overlap=%d", chunking_method, chunk_size, chunk_overlap)
151
164
  ids, documents = chunk_documents_by_method(
152
165
  path_and_text,
153
166
  method=chunking_method,
@@ -157,33 +170,37 @@ class Slicer:
157
170
  openai_api_key=api_key,
158
171
  )
159
172
  if not documents:
160
- logger.warning("No se generaron chunks (archivos vacíos o sin texto)")
173
+ logger.warning("[3/5] No chunks generated (empty or non-text content)")
161
174
  return {"files": len(path_and_text), "chunks": 0, "collection": collection_name}
162
175
 
163
- logger.info("Chunks generados: %d", len(documents))
176
+ logger.info("[3/5] Chunking completed in %.2fs (%d chunk(s))", perf_counter() - stage_start, len(documents))
164
177
 
165
- # 4. Embed
178
+ # 4/5 Embed
166
179
  api_key = openai_api_key or load_openai_key(dotenv_path)
167
180
  if not api_key:
168
181
  raise ValueError(
169
182
  "OPENAI_API_KEY required for embeddings. "
170
183
  "Set it in .env, pass openai_api_key=, or set the OPENAI_API_KEY env var."
171
184
  )
172
- logger.info("Generando embeddings (modelo=%s, batch_size=%d)...", embedding_model, batch_size)
185
+ stage_start = perf_counter()
186
+ logger.info("[4/5] Generating embeddings | model=%s | batch_size=%d | chunk_count=%d", embedding_model, batch_size, len(documents))
173
187
  embeddings = embed_texts(
174
188
  documents,
175
189
  api_key=api_key,
176
190
  model=embedding_model,
177
191
  batch_size=batch_size,
178
192
  )
179
- logger.info("Embeddings generados: %d vectores", len(embeddings))
193
+ logger.info("[4/5] Embeddings completed in %.2fs (%d vector(s))", perf_counter() - stage_start, len(embeddings))
180
194
 
181
- # 5. Push to Chroma
182
- logger.info("Conectando a Chroma (host=%s, port=%s)...", chroma_host, chroma_port)
195
+ # 5/5 Push to Chroma
196
+ stage_start = perf_counter()
197
+ logger.info("[5/5] Writing to Chroma | host=%s | port=%s | collection=%s", chroma_host, chroma_port, collection_name)
183
198
  client = get_client(host=chroma_host, port=chroma_port)
184
- logger.info("Añadiendo %d documentos a la colección '%s'...", len(documents), collection_name)
199
+ logger.info("[5/5] Adding %d document chunk(s) to collection...", len(documents))
185
200
  add_documents(client, collection_name, ids, embeddings, documents)
186
- logger.info("Pipeline completado: %d archivos %d chunks → Chroma", len(path_and_text), len(documents))
201
+ logger.info("[5/5] Chroma write completed in %.2fs", perf_counter() - stage_start)
202
+
203
+ logger.info("Slice finished in %.2fs | files=%d | chunks=%d | collection=%s", perf_counter() - total_start, len(path_and_text), len(documents), collection_name)
187
204
 
188
205
  return {
189
206
  "files": len(path_and_text),
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes