hecvec 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
hecvec/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ """
2
+ HecVec: modular library — listdir, read, chunk (token/recursive), embed, Chroma. No API.
3
+ """
4
+ from hecvec.chunking import chunk_documents, chunk_text
5
+ from hecvec.chroma_client import add_documents, get_client, get_or_create_collection
6
+ from hecvec.chroma_list import list_collections
7
+ from hecvec.embeddings import embed_texts
8
+ from hecvec.env import load_dotenv_if_available, load_openai_key
9
+ from hecvec.listdir import ALLOWED_EXTENSIONS, ListDir, ListDirTextFiles
10
+ from hecvec.pipeline import HecVec
11
+ from hecvec.reading import ReadText
12
+ from hecvec.token_splitter import token_chunk_documents, token_chunk_text
13
+
14
+ __all__ = [
15
+ "ALLOWED_EXTENSIONS",
16
+ "HecVec",
17
+ "ListDir",
18
+ "ListDirTextFiles",
19
+ "ReadText",
20
+ "add_documents",
21
+ "chunk_documents",
22
+ "chunk_text",
23
+ "embed_texts",
24
+ "get_client",
25
+ "get_or_create_collection",
26
+ "list_collections",
27
+ "load_dotenv_if_available",
28
+ "load_openai_key",
29
+ "token_chunk_documents",
30
+ "token_chunk_text",
31
+ "__version__",
32
+ ]
33
+
34
+ __version__ = "0.1.0"
@@ -0,0 +1,58 @@
1
+ """
2
+ Chroma client and collection operations. One module = one responsibility: connect and add documents.
3
+ Requires: pip install hecvec[chroma] (chromadb).
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ import chromadb
11
+
12
DEFAULT_HOST = "localhost"
DEFAULT_PORT = 8000


def get_client(host: str = DEFAULT_HOST, port: int = DEFAULT_PORT):
    """Connect to a Chroma server over HTTP, degrading to an in-memory client.

    Attempts ``chromadb.HttpClient(host, port)`` first; if constructing it
    raises for any reason, falls back to ``chromadb.EphemeralClient()``.
    """
    import chromadb

    try:
        client = chromadb.HttpClient(host=host, port=port)
    except Exception:
        # Server unreachable (or client construction failed): run in memory.
        client = chromadb.EphemeralClient()
    return client
23
+
24
+
25
+ def get_or_create_collection(
26
+ client: "chromadb.HttpClient",
27
+ name: str,
28
+ metadata: dict | None = None,
29
+ ):
30
+ """Get or create a collection (cosine similarity by default)."""
31
+ if metadata is None:
32
+ metadata = {"hnsw:space": "cosine"}
33
+ return client.get_or_create_collection(name=name, metadata=metadata)
34
+
35
+
36
def add_documents(
    client: "chromadb.HttpClient",
    collection_name: str,
    ids: list[str],
    embeddings: list[list[float]],
    documents: list[str],
) -> None:
    """Insert id/embedding/document triples into *collection_name*.

    If Chroma rejects the insert because the embedding dimension differs
    from the existing collection's, the collection is dropped, recreated
    with cosine similarity, and the insert retried once. Any other
    InvalidArgumentError is re-raised unchanged.
    """
    import chromadb

    coll = get_or_create_collection(client, collection_name)
    try:
        coll.add(ids=ids, embeddings=embeddings, documents=documents)
        return
    except chromadb.errors.InvalidArgumentError as e:
        if "dimension" not in str(e).lower():
            raise
    # Dimension mismatch: rebuild the collection from scratch and retry.
    client.delete_collection(name=collection_name)
    fresh = client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},
    )
    fresh.add(ids=ids, embeddings=embeddings, documents=documents)
hecvec/chroma_list.py ADDED
@@ -0,0 +1,27 @@
1
+ """
2
+ List Chroma collections. One module = one responsibility: inspect collections on a Chroma server.
3
+ Requires: pip install hecvec[chroma] (chromadb).
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ import chromadb
11
+
12
DEFAULT_HOST = "localhost"
DEFAULT_PORT = 8000


def list_collections(
    host: str = DEFAULT_HOST,
    port: int = DEFAULT_PORT,
) -> list[tuple[str, int]]:
    """Return ``[(name, document_count), ...]`` for every collection on a Chroma server."""
    import chromadb

    client = chromadb.HttpClient(host=host, port=port)
    summary: list[tuple[str, int]] = []
    for coll in client.list_collections():
        summary.append((coll.name, coll.count()))
    return summary
hecvec/chunking.py ADDED
@@ -0,0 +1,69 @@
1
+ """
2
+ Chunk text documents. Requires optional dependency: pip install hecvec[chunk]
3
+ """
4
+ from __future__ import annotations
5
+
6
+ from pathlib import Path
7
+ from typing import TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
11
+
12
+ _DEFAULT_CHUNK_SIZE = 400
13
+ _DEFAULT_CHUNK_OVERLAP = 0
14
+ _DEFAULT_SEPARATORS = ["\n\n\n", "\n\n", "\n", ". ", " ", ""]
15
+
16
+
17
+ def _get_splitter(
18
+ chunk_size: int = _DEFAULT_CHUNK_SIZE,
19
+ chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
20
+ separators: list[str] | None = None,
21
+ ) -> "RecursiveCharacterTextSplitter":
22
+ try:
23
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
24
+ except ImportError as e:
25
+ raise ImportError(
26
+ "Chunking requires langchain-text-splitters. Install with: pip install hecvec[chunk]"
27
+ ) from e
28
+ return RecursiveCharacterTextSplitter(
29
+ chunk_size=chunk_size,
30
+ chunk_overlap=chunk_overlap,
31
+ separators=separators or _DEFAULT_SEPARATORS,
32
+ )
33
+
34
+
35
def chunk_text(
    text: str,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
    separators: list[str] | None = None,
) -> list[str]:
    """
    Split a single document text into chunks.

    Requires optional dependency: pip install hecvec[chunk]
    """
    return _get_splitter(chunk_size, chunk_overlap, separators).split_text(text)
48
+
49
+
50
def chunk_documents(
    path_and_texts: list[tuple[str | Path, str]],
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
    separators: list[str] | None = None,
) -> list[dict]:
    """
    Chunk multiple documents. Each input is (path, text).
    Returns a list of dicts: {"path": str, "chunk_index": int, "content": str}.

    Requires optional dependency: pip install hecvec[chunk]
    """
    splitter = _get_splitter(chunk_size, chunk_overlap, separators)
    records: list[dict] = []
    for raw_path, text in path_and_texts:
        resolved = str(Path(raw_path).resolve())
        records.extend(
            {"path": resolved, "chunk_index": idx, "content": piece}
            for idx, piece in enumerate(splitter.split_text(text))
        )
    return records
hecvec/cli.py ADDED
@@ -0,0 +1,27 @@
1
+ """CLI for listing directory contents under a root."""
2
+ import sys
3
+ from pathlib import Path
4
+
5
+ from hecvec import ListDir
6
+
7
+
8
def main() -> None:
    """Entry point for ``hecvec-listdir [path] [root]``: print entries under a safe root."""
    args = sys.argv[1:]
    path = args[0] if args else "."
    root = Path(args[1]).resolve() if len(args) > 1 else Path.cwd()

    lister = ListDir(root=root)
    entries = lister.listdir(path)
    print(f"Contenido de: {path}\n")
    for rel in entries:
        # Flag directories; pad files so names stay aligned.
        marker = "[DIR] " if (lister.root / rel).is_dir() else "      "
        print(f"  {marker}{rel}")
    print(f"\nTotal: {len(entries)} entradas")


if __name__ == "__main__":
    main()
hecvec/embeddings.py ADDED
@@ -0,0 +1,39 @@
1
+ """
2
+ OpenAI embeddings. One module = one responsibility: turn text list into embedding vectors.
3
+ Requires: pip install hecvec[chroma] (openai).
4
+ """
5
+ from __future__ import annotations
6
+
7
+ from typing import TYPE_CHECKING
8
+
9
+ if TYPE_CHECKING:
10
+ from openai import OpenAI
11
+
12
+ DEFAULT_MODEL = "text-embedding-3-small"
13
+ DEFAULT_BATCH_SIZE = 100
14
+
15
+
16
+ def _get_openai_client(api_key: str | None = None) -> "OpenAI":
17
+ from openai import OpenAI
18
+ if not api_key:
19
+ raise ValueError("api_key is required for embeddings")
20
+ return OpenAI(api_key=api_key)
21
+
22
+
23
def embed_texts(
    texts: list[str],
    api_key: str,
    model: str = DEFAULT_MODEL,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> list[list[float]]:
    """
    Embed a list of text strings with OpenAI.
    Returns a list of embedding vectors (same order as texts).
    """
    client = _get_openai_client(api_key)
    vectors: list[list[float]] = []
    # Request embeddings in batches to stay within API limits.
    batches = (texts[i : i + batch_size] for i in range(0, len(texts), batch_size))
    for batch in batches:
        response = client.embeddings.create(model=model, input=batch)
        vectors.extend(item.embedding for item in response.data)
    return vectors
hecvec/env.py ADDED
@@ -0,0 +1,26 @@
1
+ """
2
+ Load environment and secrets (e.g. OPENAI_API_KEY from .env).
3
+ Use this module to centralize dotenv and env access.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import os
8
+ from pathlib import Path
9
+
10
+
11
+ def load_dotenv_if_available(dotenv_path: str | Path | None = None) -> None:
12
+ """Load .env into os.environ if python-dotenv is installed. No-op otherwise."""
13
+ try:
14
+ from dotenv import load_dotenv
15
+ load_dotenv(dotenv_path)
16
+ except ImportError:
17
+ pass
18
+
19
+
20
+ def load_openai_key(dotenv_path: str | Path | None = None) -> str | None:
21
+ """
22
+ Load OPENAI_API_KEY from the environment.
23
+ If dotenv_path is given (or python-dotenv is available), loads .env first.
24
+ """
25
+ load_dotenv_if_available(dotenv_path)
26
+ return os.environ.get("OPENAI_API_KEY")
hecvec/hecvec.py ADDED
@@ -0,0 +1,6 @@
1
+ """
2
+ HecVec pipeline entrypoint. Re-exports HecVec from pipeline for backward compatibility.
3
+ """
4
+ from hecvec.pipeline import HecVec
5
+
6
+ __all__ = ["HecVec"]
hecvec/listdir.py ADDED
@@ -0,0 +1,126 @@
1
+ """Directory listing with safe path resolution under a root."""
2
+ from __future__ import annotations
3
+
4
+ import os
5
+ from pathlib import Path
6
+
7
+
8
+ ALLOWED_EXTENSIONS = (".txt", ".md")
9
+
10
+
11
+ class ListDir:
12
+ """
13
+ Lists directory contents with safe path resolution. All paths are
14
+ restricted under `root`. Returns a single list of relative path strings.
15
+ """
16
+
17
+ def __init__(self, root: str | Path):
18
+ self.root = Path(root).resolve()
19
+ if not self.root.is_dir():
20
+ raise ValueError(f"root must be an existing directory: {self.root}")
21
+
22
+ def _resolve(self, path: str | Path) -> Path:
23
+ """Resolve path under root; raise ValueError if it escapes root."""
24
+ p = Path(path)
25
+ if not p.is_absolute():
26
+ p = self.root / p
27
+ resolved = p.resolve()
28
+ try:
29
+ resolved.relative_to(self.root)
30
+ except ValueError:
31
+ raise ValueError(f"Path escapes allowed root: {path!r} -> {resolved}")
32
+ return resolved
33
+
34
+ def listdir(self, path: str | Path = ".") -> list[str]:
35
+ """
36
+ List entries in the given path (under root). Returns a sorted list
37
+ of relative path strings (dirs first, then files, by name).
38
+ """
39
+ target = self._resolve(path)
40
+ if not target.is_dir():
41
+ raise ValueError(f"Not a directory: {target}")
42
+ out = []
43
+ for entry in target.iterdir():
44
+ try:
45
+ rel = entry.relative_to(self.root)
46
+ except ValueError:
47
+ continue
48
+ out.append(str(rel))
49
+ return sorted(out, key=lambda x: (not (self.root / x).is_dir(), x.lower()))
50
+
51
+ def listdir_recursive(
52
+ self,
53
+ path: str | Path = ".",
54
+ max_depth: int | None = None,
55
+ ) -> list[str]:
56
+ """
57
+ List all entries under path recursively. Returns a flat sorted list
58
+ of relative path strings.
59
+ """
60
+ target = self._resolve(path)
61
+ if not target.is_dir():
62
+ raise ValueError(f"Not a directory: {target}")
63
+ out = []
64
+ for root_dir, dirs, files in os.walk(target, topdown=True):
65
+ root_path = Path(root_dir)
66
+ try:
67
+ rel_root = root_path.relative_to(self.root)
68
+ except ValueError:
69
+ continue
70
+ depth = len(rel_root.parts) if rel_root != Path(".") else 0
71
+ if max_depth is not None and depth > max_depth:
72
+ dirs.clear()
73
+ continue
74
+ for d in sorted(dirs):
75
+ p = root_path / d
76
+ try:
77
+ rel = p.relative_to(self.root)
78
+ except ValueError:
79
+ continue
80
+ out.append(str(rel))
81
+ for f in sorted(files):
82
+ p = root_path / f
83
+ try:
84
+ rel = p.relative_to(self.root)
85
+ except ValueError:
86
+ continue
87
+ out.append(str(rel))
88
+ return sorted(out)
89
+
90
+
91
class ListDirTextFiles(ListDir):
    """
    ListDir specialization that keeps only text files (.txt/.md by default)
    and returns them as sorted absolute Path objects.
    """

    def __init__(self, root: str | Path, allowed_extensions: tuple[str, ...] = ALLOWED_EXTENSIONS):
        super().__init__(root)
        self.allowed_extensions = allowed_extensions

    def filter_txt_md(self, relative_paths: list[str]) -> list[Path]:
        """
        From relative path strings (e.g. from listdir/listdir_recursive),
        keep only existing files whose (lowercased) suffix is allowed and
        return their resolved full paths, sorted.
        """
        keep: list[Path] = []
        for rel in relative_paths:
            candidate = self.root / rel
            if not candidate.is_file():
                continue
            if candidate.suffix.lower() not in self.allowed_extensions:
                continue
            keep.append(candidate.resolve())
        return sorted(keep)

    def listdir_txt_md(self, path: str | Path = ".") -> list[Path]:
        """List only .txt and .md files under path (one level)."""
        return self.filter_txt_md(self.listdir(path))

    def listdir_recursive_txt_md(
        self,
        path: str | Path = ".",
        max_depth: int | None = None,
    ) -> list[Path]:
        """List only .txt and .md files under path recursively."""
        return self.filter_txt_md(self.listdir_recursive(path=path, max_depth=max_depth))
hecvec/pipeline.py ADDED
@@ -0,0 +1,187 @@
1
+ """
2
+ Pipeline: composes listdir → read → token-chunk → embed → Chroma.
3
+ One module = one responsibility: orchestrate the full slice() flow.
4
+ """
5
+ from __future__ import annotations
6
+
7
+ import logging
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ from hecvec.chroma_client import add_documents, get_client
12
+ from hecvec.embeddings import embed_texts
13
+ from hecvec.env import load_openai_key
14
+ from hecvec.listdir import ListDirTextFiles
15
+ from hecvec.reading import ReadText
16
+ from hecvec.token_splitter import token_chunk_documents
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ def _check_chroma_deps() -> None:
22
+ try:
23
+ import chromadb # noqa: F401
24
+ from langchain_text_splitters import TokenTextSplitter # noqa: F401
25
+ from openai import OpenAI # noqa: F401
26
+ except ImportError as e:
27
+ raise ImportError(
28
+ "Chroma pipeline requires: pip install hecvec[chroma]"
29
+ ) from e
30
+
31
+
32
class _HybridMethod:
    """
    Descriptor letting a single ``slice`` serve both call styles.

    Accessed through an instance it binds that instance; accessed through the
    class it binds the class itself. This fixes the original bug where the
    ``@classmethod slice`` silently shadowed the instance method of the same
    name, so ``HecVec(...).slice(...)`` ignored all instance configuration.
    """

    def __init__(self, func):
        self._func = func
        self.__doc__ = func.__doc__

    def __get__(self, obj, objtype=None):
        target = obj if obj is not None else objtype

        def bound(*args, **kwargs):
            return self._func(target, *args, **kwargs)

        bound.__doc__ = self._func.__doc__
        return bound


# Sentinel distinguishing "argument omitted" from an explicit value
# (including an explicit None).
_UNSET = object()


class HecVec:
    """
    Library-only pipeline. No API.
    Call slice(path=...) to: list path → filter .txt/.md → token-chunk → push to Chroma.

    ``HecVec.slice(path)`` uses module defaults; ``HecVec(...).slice(path)``
    uses the instance configuration, with per-call keyword overrides.
    """

    # Fallback configuration used when slice() is invoked directly on the class.
    _DEFAULTS: dict[str, Any] = {
        "root": None,
        "collection_name": "hecvec",
        "chroma_host": "localhost",
        "chroma_port": 8000,
        "embedding_model": "text-embedding-3-small",
        "chunk_size": 200,
        "chunk_overlap": 0,
        "encoding_name": "cl100k_base",
        "batch_size": 100,
        "openai_api_key": None,
        "dotenv_path": None,
    }

    def __init__(
        self,
        *,
        root: str | Path | None = None,
        collection_name: str = "hecvec",
        chroma_host: str = "localhost",
        chroma_port: int = 8000,
        embedding_model: str = "text-embedding-3-small",
        chunk_size: int = 200,
        chunk_overlap: int = 0,
        encoding_name: str = "cl100k_base",
        batch_size: int = 100,
        openai_api_key: str | None = None,
        dotenv_path: str | Path | None = None,
    ):
        self.root = Path(root).resolve() if root else Path.cwd()
        self.collection_name = collection_name
        self.chroma_host = chroma_host
        self.chroma_port = chroma_port
        self.embedding_model = embedding_model
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.encoding_name = encoding_name
        self.batch_size = batch_size
        self._dotenv_path = dotenv_path
        # Resolve the key once at construction (explicit arg wins over env/.env).
        self._openai_api_key = openai_api_key or load_openai_key(dotenv_path)

    def _config(self) -> dict[str, Any]:
        """This instance's configuration, keyed like ``_DEFAULTS``."""
        return {
            "root": self.root,
            "collection_name": self.collection_name,
            "chroma_host": self.chroma_host,
            "chroma_port": self.chroma_port,
            "embedding_model": self.embedding_model,
            "chunk_size": self.chunk_size,
            "chunk_overlap": self.chunk_overlap,
            "encoding_name": self.encoding_name,
            "batch_size": self.batch_size,
            "openai_api_key": self._openai_api_key,
            "dotenv_path": self._dotenv_path,
        }

    @_HybridMethod
    def slice(
        target,
        path: str | Path,
        *,
        root=_UNSET,
        collection_name=_UNSET,
        chroma_host=_UNSET,
        chroma_port=_UNSET,
        embedding_model=_UNSET,
        chunk_size=_UNSET,
        chunk_overlap=_UNSET,
        encoding_name=_UNSET,
        batch_size=_UNSET,
        openai_api_key=_UNSET,
        dotenv_path=_UNSET,
        **kwargs: Any,
    ) -> dict[str, Any]:
        """
        Run the full pipeline: find path → listdir → filter .txt/.md → token-chunk → push to Chroma.
        No API; everything runs in the library.

        ``target`` is the HecVec instance when called on one, otherwise the
        class. Omitted keywords fall back to the instance config (or the
        module defaults). Extra ``**kwargs`` are accepted and ignored for
        backward compatibility with the previous signature.
        """
        base = target._config() if isinstance(target, HecVec) else dict(HecVec._DEFAULTS)
        supplied = {
            "root": root,
            "collection_name": collection_name,
            "chroma_host": chroma_host,
            "chroma_port": chroma_port,
            "embedding_model": embedding_model,
            "chunk_size": chunk_size,
            "chunk_overlap": chunk_overlap,
            "encoding_name": encoding_name,
            "batch_size": batch_size,
            "openai_api_key": openai_api_key,
            "dotenv_path": dotenv_path,
        }
        cfg = {k: (base[k] if v is _UNSET else v) for k, v in supplied.items()}
        return HecVec._run(path, cfg)

    @staticmethod
    def _run(path: str | Path, cfg: dict[str, Any]) -> dict[str, Any]:
        """Execute list → read → token-chunk → embed → push with resolved *cfg*."""
        _check_chroma_deps()

        path = Path(path).resolve()
        if not path.exists():
            raise ValueError(f"path does not exist: {path}")

        collection_name = cfg["collection_name"]
        if collection_name == "hecvec":
            # Default collection name: derive it from the file stem / folder name.
            collection_name = path.stem if path.is_file() else path.name

        # 1. Resolve the set of .txt/.md files to process.
        if path.is_file():
            if path.suffix.lower() not in (".txt", ".md"):
                raise ValueError(f"File must be .txt or .md: {path}")
            paths = [path]
            logger.info("Archivo a procesar: %s", path)
        else:
            root = Path(cfg["root"]).resolve() if cfg["root"] else path
            if not root.is_dir():
                raise ValueError(f"path must be an existing directory: {root}")
            logger.info("Ruta a procesar: %s", root)
            lister = ListDirTextFiles(root=root)
            paths = lister.listdir_recursive_txt_md(path)
            if not paths:
                logger.warning("No se encontraron archivos .txt/.md en %s", path)
                return {"files": 0, "chunks": 0, "collection": collection_name, "message": "No .txt/.md files found"}
            logger.info("Archivos .txt/.md encontrados: %d", len(paths))
            for p in paths[:10]:
                logger.info("  %s", p)
            if len(paths) > 10:
                logger.info("  ... y %d más", len(paths) - 10)

        # 2. Read as text.
        logger.info("Leyendo contenido de %d archivos...", len(paths))
        path_and_text = ReadText(paths).read_all()
        logger.info("Leídos %d archivos correctamente", len(path_and_text))

        # 3. Token chunk.
        logger.info("Fragmentando con chunk_size=%d, chunk_overlap=%d...", cfg["chunk_size"], cfg["chunk_overlap"])
        ids, documents = token_chunk_documents(
            path_and_text,
            chunk_size=cfg["chunk_size"],
            chunk_overlap=cfg["chunk_overlap"],
            encoding_name=cfg["encoding_name"],
        )
        if not documents:
            logger.warning("No se generaron chunks (archivos vacíos o sin texto)")
            return {"files": len(path_and_text), "chunks": 0, "collection": collection_name}
        logger.info("Chunks generados: %d", len(documents))

        # 4. Embed.
        api_key = cfg["openai_api_key"] or load_openai_key(cfg["dotenv_path"])
        if not api_key:
            raise ValueError(
                "OPENAI_API_KEY required for embeddings. "
                "Set it in .env, pass openai_api_key=, or set the OPENAI_API_KEY env var."
            )
        logger.info("Generando embeddings (modelo=%s, batch_size=%d)...", cfg["embedding_model"], cfg["batch_size"])
        embeddings = embed_texts(
            documents,
            api_key=api_key,
            model=cfg["embedding_model"],
            batch_size=cfg["batch_size"],
        )
        logger.info("Embeddings generados: %d vectores", len(embeddings))

        # 5. Push to Chroma.
        logger.info("Conectando a Chroma (host=%s, port=%s)...", cfg["chroma_host"], cfg["chroma_port"])
        client = get_client(host=cfg["chroma_host"], port=cfg["chroma_port"])
        logger.info("Añadiendo %d documentos a la colección '%s'...", len(documents), collection_name)
        add_documents(client, collection_name, ids, embeddings, documents)
        logger.info("Pipeline completado: %d archivos → %d chunks → Chroma", len(path_and_text), len(documents))

        return {
            "files": len(path_and_text),
            "chunks": len(documents),
            "collection": collection_name,
        }
hecvec/reading.py ADDED
@@ -0,0 +1,59 @@
1
+ """Read text files and expose content as strings."""
2
+ from __future__ import annotations
3
+
4
+ import logging
5
+ from pathlib import Path
6
+
7
+ logger = logging.getLogger(__name__)
8
+
9
+ # Fallback encodings to try if UTF-8 fails (e.g. Windows Latin-1 / CP1252)
10
+ FALLBACK_ENCODINGS = ("utf-8", "latin-1", "cp1252")
11
+
12
+
13
+ class ReadText:
14
+ """
15
+ Given a list of file paths (.txt and .md), reads each as text.
16
+ Returns (path, text) pairs. Tries UTF-8 first, then latin-1, then cp1252.
17
+ """
18
+
19
+ def __init__(self, paths: list[str] | list[Path], encoding: str = "utf-8"):
20
+ self.paths = [Path(p) for p in paths]
21
+ self.encoding = encoding
22
+
23
+ def read_all(self) -> list[tuple[Path, str]]:
24
+ """
25
+ Read each file as text.
26
+ Returns a list of (path, text). Skips non-files and missing paths.
27
+ """
28
+ out = []
29
+ for p in self.paths:
30
+ if not p.is_file():
31
+ logger.debug("Saltando (no es archivo): %s", p)
32
+ continue
33
+ text = None
34
+ for enc in FALLBACK_ENCODINGS:
35
+ try:
36
+ text = p.read_text(encoding=enc)
37
+ if enc != "utf-8":
38
+ logger.info("Leído con encoding %s: %s", enc, p.name)
39
+ break
40
+ except (OSError, UnicodeDecodeError) as e:
41
+ if enc == FALLBACK_ENCODINGS[-1]:
42
+ logger.warning("No se pudo leer %s: %s", p, e)
43
+ continue
44
+ if text is not None:
45
+ out.append((p.resolve(), text))
46
+ return out
47
+
48
+ def __iter__(self):
49
+ """Iterate over (path, text)."""
50
+ for p in self.paths:
51
+ if not p.is_file():
52
+ continue
53
+ for enc in FALLBACK_ENCODINGS:
54
+ try:
55
+ text = p.read_text(encoding=enc)
56
+ yield p.resolve(), text
57
+ break
58
+ except (OSError, UnicodeDecodeError):
59
+ continue
@@ -0,0 +1,67 @@
1
+ """
2
+ Token-based text chunking (TokenTextSplitter from langchain).
3
+ One module = one responsibility: split text by token count.
4
+ Requires: pip install hecvec[chunk] or hecvec[chroma]
5
+ """
6
+ from __future__ import annotations
7
+
8
+ from pathlib import Path
9
+ from typing import TYPE_CHECKING
10
+
11
+ if TYPE_CHECKING:
12
+ from langchain_text_splitters import TokenTextSplitter
13
+
14
+ DEFAULT_CHUNK_SIZE = 200
15
+ DEFAULT_CHUNK_OVERLAP = 0
16
+ DEFAULT_ENCODING_NAME = "cl100k_base"
17
+
18
+
19
+ def _get_token_splitter(
20
+ chunk_size: int = DEFAULT_CHUNK_SIZE,
21
+ chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
22
+ encoding_name: str = DEFAULT_ENCODING_NAME,
23
+ ) -> "TokenTextSplitter":
24
+ try:
25
+ from langchain_text_splitters import TokenTextSplitter
26
+ except ImportError as e:
27
+ raise ImportError(
28
+ "Token splitting requires langchain-text-splitters. Install with: pip install hecvec[chunk]"
29
+ ) from e
30
+ return TokenTextSplitter(
31
+ chunk_size=chunk_size,
32
+ chunk_overlap=chunk_overlap,
33
+ encoding_name=encoding_name,
34
+ )
35
+
36
+
37
def token_chunk_text(
    text: str,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    encoding_name: str = DEFAULT_ENCODING_NAME,
) -> list[str]:
    """Split a single document text into chunks by token count."""
    return _get_token_splitter(chunk_size, chunk_overlap, encoding_name).split_text(text)
46
+
47
+
48
def token_chunk_documents(
    path_and_texts: list[tuple[str | Path, str]],
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    encoding_name: str = DEFAULT_ENCODING_NAME,
) -> tuple[list[str], list[str]]:
    """
    Chunk multiple documents by token count.
    Returns (ids, documents) where ids are chunk_0, chunk_1, ...
    Whitespace-only chunks are dropped; ids number the kept chunks globally.
    """
    splitter = _get_token_splitter(chunk_size, chunk_overlap, encoding_name)
    ids: list[str] = []
    documents: list[str] = []
    for _path, text in path_and_texts:
        kept = (piece for piece in splitter.split_text(text) if piece.strip())
        for piece in kept:
            ids.append(f"chunk_{len(ids)}")
            documents.append(piece)
    return ids, documents
@@ -0,0 +1,182 @@
1
+ Metadata-Version: 2.4
2
+ Name: hecvec
3
+ Version: 0.1.0
4
+ Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
5
+ License-Expression: MIT
6
+ Keywords: chunking,document-pipeline,listdir,text-files
7
+ Classifier: Development Status :: 3 - Alpha
8
+ Classifier: Intended Audience :: Developers
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Programming Language :: Python :: 3
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Programming Language :: Python :: 3.13
16
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
17
+ Requires-Python: <3.14,>=3.9
18
+ Requires-Dist: chromadb>=0.4.0
19
+ Requires-Dist: langchain-text-splitters>=0.2.0
20
+ Requires-Dist: openai>=1.0.0
21
+ Requires-Dist: python-dotenv>=1.0.0
22
+ Requires-Dist: tiktoken>=0.5.0
23
+ Provides-Extra: chroma
24
+ Requires-Dist: chromadb>=0.4.0; extra == 'chroma'
25
+ Requires-Dist: langchain-text-splitters>=0.2.0; extra == 'chroma'
26
+ Requires-Dist: openai>=1.0.0; extra == 'chroma'
27
+ Requires-Dist: python-dotenv>=1.0.0; extra == 'chroma'
28
+ Requires-Dist: tiktoken>=0.5.0; extra == 'chroma'
29
+ Provides-Extra: chunk
30
+ Requires-Dist: langchain-text-splitters>=0.2.0; extra == 'chunk'
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=7.0.0; extra == 'dev'
33
+ Description-Content-Type: text/markdown
34
+
35
+ # HecVec
36
+
37
+ List directories with a safe root, filter `.txt`/`.md` files, read them as text, and optionally chunk and push to Chroma — **library only, no API**.
38
+
39
+ ## Install
40
+
41
+ ```bash
42
+ pip install hecvec
43
+ ```
44
+
45
+ One-call pipeline (list → filter → token-chunk → Chroma):
46
+
47
+ ```bash
48
+ pip install hecvec[chroma]
49
+ ```
50
+
51
+ Optional chunking only (no Chroma):
52
+
53
+ ```bash
54
+ pip install hecvec[chunk]
55
+ ```
56
+
57
+ ## Usage
58
+
59
+ ### One-call pipeline (list → filter → chunk → Chroma)
60
+
61
+ Runs entirely in the library (no API). You need Chroma running (e.g. `docker run -p 8000:8000 chromadb/chroma`) and `OPENAI_API_KEY` set (in the environment or in a `.env` file; the library loads `.env` via python-dotenv when you use `hecvec[chroma]`).
62
+
63
+ ```python
64
+ import hecvec
65
+
66
+ # Class-style: use defaults, then slice
67
+ test = hecvec.HecVec()
68
+ result = test.slice(path="/path/to/folder")
69
+ # → {"files": N, "chunks": M, "collection": "hecvec"}
70
+
71
+ # Or call slice on the class (same flow)
72
+ result = hecvec.HecVec.slice(path="/path/to/folder")
73
+ ```
74
+
75
+ Flow: resolve path → listdir → filter `.txt`/`.md` → token-chunk (200 tokens, `cl100k_base`) → embed with OpenAI → push to Chroma.
76
+
77
+ Optional config (instance or `HecVec.slice(..., key=value)`):
78
+
79
+ - `root`, `collection_name`, `chroma_host`, `chroma_port`
80
+ - `embedding_model`, `chunk_size`, `chunk_overlap`, `encoding_name`, `batch_size`
81
+ - `openai_api_key` (or set `OPENAI_API_KEY` in the environment or in a `.env` file; optional `dotenv_path` to point to a specific `.env`)
82
+
83
+ ### Low-level building blocks
84
+
85
+ ```python
86
+ from pathlib import Path
87
+ from hecvec import ListDir, ListDirTextFiles, ReadText
88
+
89
+ root = Path("/path/to/repo")
90
+
91
+ # List all entries under a path (restricted to root)
92
+ lister = ListDir(root=root)
93
+ for rel in lister.listdir("."):
94
+ print(rel)
95
+
96
+ # Only .txt and .md files, recursively
97
+ text_lister = ListDirTextFiles(root=root)
98
+ paths = text_lister.listdir_recursive_txt_md("docs")
99
+
100
+ # Read each file as text
101
+ reader = ReadText(paths)
102
+ for path, text in reader:
103
+ print(path, len(text))
104
+ ```
105
+
106
+ ### Chunking (optional)
107
+
108
+ With `pip install hecvec[chunk]`:
109
+
110
+ ```python
111
+ from hecvec import ListDirTextFiles, ReadText
112
+ from hecvec.chunking import chunk_documents
113
+
114
+ lister = ListDirTextFiles(root=root)
115
+ paths = lister.listdir_recursive_txt_md(".")
116
+ reader = ReadText(paths)
117
+ path_and_text = reader.read_all()
118
+ chunks = chunk_documents(path_and_text)
119
+ # list of {"path": "...", "chunk_index": 0, "content": "..."}
120
+ ```
121
+
122
+ ### CLI
123
+
124
+ ```bash
125
+ hecvec-listdir [path] [root]
126
+ # or
127
+ python -m hecvec.cli [path] [root]
128
+ ```
129
+
130
+ ### Test the full pipeline (the method that does everything)
131
+
132
+ From the project root, with Chroma running and `OPENAI_API_KEY` set (e.g. in `.env`):
133
+
134
+ ```bash
135
+ # Start Chroma (one terminal)
136
+ docker run -p 8000:8000 chromadb/chroma
137
+
138
+ # Run the test script (another terminal)
139
+ uv run python scripts/test_slice.py
140
+ # or: python scripts/test_slice.py
141
+ ```
142
+
143
+ The script creates a temp folder with two `.txt` files, runs `HecVec.slice(path=...)`, and prints `PASS` or `FAIL` with the result (`files`, `chunks`, `collection`).
144
+
145
+ ### Modular layout (easy to study)
146
+
147
+ Each step of the pipeline lives in its own module:
148
+
149
+ | Module | Responsibility |
150
+ |--------|-----------------|
151
+ | `hecvec.env` | Load `.env` and `OPENAI_API_KEY` |
152
+ | `hecvec.listdir` | List dirs under a safe root; filter by extension (`.txt`/`.md`) |
153
+ | `hecvec.reading` | Read files as text (UTF-8 / latin-1 / cp1252 fallback) |
154
+ | `hecvec.token_splitter` | Token-based chunking (TokenTextSplitter) |
155
+ | `hecvec.chunking` | Recursive-character chunking (RecursiveCharacterTextSplitter) |
156
+ | `hecvec.embeddings` | OpenAI embeddings (`embed_texts`) |
157
+ | `hecvec.chroma_client` | Chroma client, get/create collection, add documents |
158
+ | `hecvec.chroma_list` | List Chroma collections and counts |
159
+ | `hecvec.pipeline` | Orchestrator: `HecVec` and `slice(path=...)` |
160
+
161
+ Example: use one step on its own:
162
+
163
+ ```python
164
+ from hecvec import embed_texts, token_chunk_text, list_collections
165
+
166
+ chunks = token_chunk_text("Some long document...", chunk_size=200)
167
+ vecs = embed_texts(chunks, api_key="sk-...")
168
+ names_and_counts = list_collections(host="localhost", port=8000)
169
+ ```
170
+
171
+ ## Development
172
+
173
+ From the repo root:
174
+
175
+ ```bash
176
+ uv sync
177
+ uv run python -c "from hecvec import ListDir; print(ListDir('.').listdir('.'))"
178
+ ```
179
+
180
+ ## License
181
+
182
+ MIT
@@ -0,0 +1,16 @@
1
+ hecvec/__init__.py,sha256=mwGyDxZgshlfn1wWUB5xO2LTwg5-LWYQmD3Io6KGIZk,1028
2
+ hecvec/chroma_client.py,sha256=iNQDw5r2_BReLY7MVhoJHhXvJduBv_0MR0UkeDEtKf0,1779
3
+ hecvec/chroma_list.py,sha256=botRTK0Ge_-PwBsj9-WE0VxjyMJa6g07m1MO1_1pq2Q,722
4
+ hecvec/chunking.py,sha256=BpHjL5e7I8u-T6goU0TD_Y1US63GUEz_tYIu2iTN9U0,2203
5
+ hecvec/cli.py,sha256=oMernmxfk-_TZr83b0-jl3STl5xBRVORqTokcDByZBA,642
6
+ hecvec/embeddings.py,sha256=kvjty_BDx77COehxTvHWXcKsNqMy2iKiT2H_7V9wp4k,1149
7
+ hecvec/env.py,sha256=691efRFxSzEtDgtM0xl5aAbgrCvjgUqSQe7dG8jzV9g,778
8
+ hecvec/hecvec.py,sha256=XGBXBCAZhdOzP5xvDIFnZZtsMBNOZXrt_dcrIg1vlFE,153
9
+ hecvec/listdir.py,sha256=1xRR7Rn9hAhIscsweZtLBTq54BIV7G8yI9w_qySEIEY,4428
10
+ hecvec/pipeline.py,sha256=TwYBG9Bc06Z04oIAi-vYtCH5u6hjXt853EwR3mJVH_s,7388
11
+ hecvec/reading.py,sha256=gPPVuQPukJIKUJS69-BBZOwMgnvx-0da8gQJ5D3P6sM,2003
12
+ hecvec/token_splitter.py,sha256=usKZjNw_BQasuNx9Hz2iGUHUiq8fNYIy_HMiW7QaZ1k,2167
13
+ hecvec-0.1.0.dist-info/METADATA,sha256=Q3FZQUC_eeBUKOHvQI0CpgaqifMjpYi_ZU2m5sXl5dw,5704
14
+ hecvec-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
15
+ hecvec-0.1.0.dist-info/entry_points.txt,sha256=M-5kL-e8yJH7OxTWqnQhLWJhuDPzBU0_rbpr7RVcLvQ,51
16
+ hecvec-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: hatchling 1.29.0
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ hecvec-listdir = hecvec.cli:main