PyPI - hecvec - Versions diffs - 0.3.0__tar.gz → 0.4.0__tar.gz - Mend

hecvec 0.3.0.tar.gz → 0.4.0.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26)

{hecvec-0.3.0 → hecvec-0.4.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: hecvec
-Version: 0.3.0
+Version: 0.4.0
 Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
 License-Expression: MIT
 Keywords: chunking,document-pipeline,listdir,text-files

{hecvec-0.3.0 → hecvec-0.4.0}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 [project]
 name = "hecvec"
-version = "0.3.0"
+version = "0.4.0"
 description = "List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma."
 readme = "README.md"
 requires-python = ">=3.9,<3.14"

{hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/__init__.py RENAMED Viewed

@@ -34,4 +34,4 @@ __all__ = [
     "__version__",
 ]
-__version__ = "0.3.0"
+__version__ = "0.4.0"

{hecvec-0.3.0 → hecvec-0.4.0}/src/hecvec/pipeline.py RENAMED Viewed

@@ -6,6 +6,7 @@ from __future__ import annotations
 import logging
 from pathlib import Path
+from time import perf_counter
 from typing import Any
 from hecvec.chroma_client import add_documents, get_client
@@ -110,6 +111,9 @@ class Slicer:
         """
         _check_chroma_deps()
+        total_start = perf_counter()
+        logger.info("[0/5] Starting slice | path=%s | method=%s | model=%s", path, chunking_method, embedding_model)
         path = Path(path).resolve()
         if not path.exists():
             raise ValueError(f"path does not exist: {path}")
@@ -117,6 +121,9 @@ class Slicer:
         if collection_name == "hecvec":
             collection_name = path.stem if path.is_file() else path.name
+        logger.info("Resolved collection name: %s", collection_name)
+        stage_start = perf_counter()
         if path.is_file():
             if path.suffix.lower() not in (".txt", ".md"):
                 raise ValueError(f"File must be .txt or .md: {path}")
@@ -139,15 +146,21 @@ class Slicer:
             if len(paths) > 10:
                 logger.info("  ... y %d más", len(paths) - 10)
-        # 1. (paths already set above) — 2. Read as text
-        logger.info("Leyendo contenido de %d archivos...", len(paths))
+        logger.info("[1/5] File discovery completed in %.2fs (%d file(s))", perf_counter() - stage_start, len(paths))
+        # 2/5 Read as text
+        stage_start = perf_counter()
+        logger.info("[2/5] Reading content from %d file(s)...", len(paths))
         reader = ReadText(paths)
         path_and_text = reader.read_all()
-        logger.info("Leídos %d archivos correctamente", len(path_and_text))
+        logger.info("[2/5] Read completed in %.2fs (%d/%d file(s) read)", perf_counter() - stage_start, len(path_and_text), len(paths))
+        if len(path_and_text) < len(paths):
+            logger.warning("Some files could not be read and were skipped (%d skipped)", len(paths) - len(path_and_text))
-        # 3. Chunk (token, text, semantic, or llm)
+        # 3/5 Chunk (token, text, semantic, or llm)
+        stage_start = perf_counter()
         api_key = openai_api_key or load_openai_key(dotenv_path)
-        logger.info("Fragmentando con method=%s, chunk_size=%d, chunk_overlap=%d...", chunking_method, chunk_size, chunk_overlap)
+        logger.info("[3/5] Chunking | method=%s | chunk_size=%d | overlap=%d", chunking_method, chunk_size, chunk_overlap)
         ids, documents = chunk_documents_by_method(
             path_and_text,
             method=chunking_method,
@@ -157,33 +170,37 @@ class Slicer:
             openai_api_key=api_key,
         )
         if not documents:
-            logger.warning("No se generaron chunks (archivos vacíos o sin texto)")
+            logger.warning("[3/5] No chunks generated (empty or non-text content)")
             return {"files": len(path_and_text), "chunks": 0, "collection": collection_name}
-        logger.info("Chunks generados: %d", len(documents))
+        logger.info("[3/5] Chunking completed in %.2fs (%d chunk(s))", perf_counter() - stage_start, len(documents))
-        # 4. Embed
+        # 4/5 Embed
         api_key = openai_api_key or load_openai_key(dotenv_path)
         if not api_key:
             raise ValueError(
                 "OPENAI_API_KEY required for embeddings. "
                 "Set it in .env, pass openai_api_key=, or set the OPENAI_API_KEY env var."
             )
-        logger.info("Generando embeddings (modelo=%s, batch_size=%d)...", embedding_model, batch_size)
+        stage_start = perf_counter()
+        logger.info("[4/5] Generating embeddings | model=%s | batch_size=%d | chunk_count=%d", embedding_model, batch_size, len(documents))
         embeddings = embed_texts(
             documents,
             api_key=api_key,
             model=embedding_model,
             batch_size=batch_size,
         )
-        logger.info("Embeddings generados: %d vectores", len(embeddings))
+        logger.info("[4/5] Embeddings completed in %.2fs (%d vector(s))", perf_counter() - stage_start, len(embeddings))
-        # 5. Push to Chroma
-        logger.info("Conectando a Chroma (host=%s, port=%s)...", chroma_host, chroma_port)
+        # 5/5 Push to Chroma
+        stage_start = perf_counter()
+        logger.info("[5/5] Writing to Chroma | host=%s | port=%s | collection=%s", chroma_host, chroma_port, collection_name)
         client = get_client(host=chroma_host, port=chroma_port)
-        logger.info("Añadiendo %d documentos a la colección '%s'...", len(documents), collection_name)
+        logger.info("[5/5] Adding %d document chunk(s) to collection...", len(documents))
         add_documents(client, collection_name, ids, embeddings, documents)
-        logger.info("Pipeline completado: %d archivos → %d chunks → Chroma", len(path_and_text), len(documents))
+        logger.info("[5/5] Chroma write completed in %.2fs", perf_counter() - stage_start)
+        logger.info("Slice finished in %.2fs | files=%d | chunks=%d | collection=%s", perf_counter() - total_start, len(path_and_text), len(documents), collection_name)
         return {
             "files": len(path_and_text),