hecvec 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hecvec/__init__.py +34 -0
- hecvec/chroma_client.py +58 -0
- hecvec/chroma_list.py +27 -0
- hecvec/chunking.py +69 -0
- hecvec/cli.py +27 -0
- hecvec/embeddings.py +39 -0
- hecvec/env.py +26 -0
- hecvec/hecvec.py +6 -0
- hecvec/listdir.py +126 -0
- hecvec/pipeline.py +187 -0
- hecvec/reading.py +59 -0
- hecvec/token_splitter.py +67 -0
- hecvec-0.1.0.dist-info/METADATA +182 -0
- hecvec-0.1.0.dist-info/RECORD +16 -0
- hecvec-0.1.0.dist-info/WHEEL +4 -0
- hecvec-0.1.0.dist-info/entry_points.txt +2 -0
hecvec/__init__.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""
|
|
2
|
+
HecVec: modular library — listdir, read, chunk (token/recursive), embed, Chroma. No API.
|
|
3
|
+
"""
|
|
4
|
+
from hecvec.chunking import chunk_documents, chunk_text
|
|
5
|
+
from hecvec.chroma_client import add_documents, get_client, get_or_create_collection
|
|
6
|
+
from hecvec.chroma_list import list_collections
|
|
7
|
+
from hecvec.embeddings import embed_texts
|
|
8
|
+
from hecvec.env import load_dotenv_if_available, load_openai_key
|
|
9
|
+
from hecvec.listdir import ALLOWED_EXTENSIONS, ListDir, ListDirTextFiles
|
|
10
|
+
from hecvec.pipeline import HecVec
|
|
11
|
+
from hecvec.reading import ReadText
|
|
12
|
+
from hecvec.token_splitter import token_chunk_documents, token_chunk_text
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"ALLOWED_EXTENSIONS",
|
|
16
|
+
"HecVec",
|
|
17
|
+
"ListDir",
|
|
18
|
+
"ListDirTextFiles",
|
|
19
|
+
"ReadText",
|
|
20
|
+
"add_documents",
|
|
21
|
+
"chunk_documents",
|
|
22
|
+
"chunk_text",
|
|
23
|
+
"embed_texts",
|
|
24
|
+
"get_client",
|
|
25
|
+
"get_or_create_collection",
|
|
26
|
+
"list_collections",
|
|
27
|
+
"load_dotenv_if_available",
|
|
28
|
+
"load_openai_key",
|
|
29
|
+
"token_chunk_documents",
|
|
30
|
+
"token_chunk_text",
|
|
31
|
+
"__version__",
|
|
32
|
+
]
|
|
33
|
+
|
|
34
|
+
__version__ = "0.1.0"
|
hecvec/chroma_client.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Chroma client and collection operations. One module = one responsibility: connect and add documents.
|
|
3
|
+
Requires: pip install hecvec[chroma] (chromadb).
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
import chromadb
|
|
11
|
+
|
|
12
|
+
DEFAULT_HOST = "localhost"
|
|
13
|
+
DEFAULT_PORT = 8000
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_client(host: str = DEFAULT_HOST, port: int = DEFAULT_PORT):
    """Connect to a Chroma server over HTTP, degrading to an in-memory client.

    ``chromadb.HttpClient`` is attempted first; when the server cannot be
    reached (any exception), an ``EphemeralClient`` is returned instead —
    note that data added to it is NOT persisted on the server.
    """
    import chromadb  # deferred: chromadb is an optional dependency

    try:
        client = chromadb.HttpClient(host=host, port=port)
    except Exception:
        client = chromadb.EphemeralClient()
    return client
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def get_or_create_collection(
|
|
26
|
+
client: "chromadb.HttpClient",
|
|
27
|
+
name: str,
|
|
28
|
+
metadata: dict | None = None,
|
|
29
|
+
):
|
|
30
|
+
"""Get or create a collection (cosine similarity by default)."""
|
|
31
|
+
if metadata is None:
|
|
32
|
+
metadata = {"hnsw:space": "cosine"}
|
|
33
|
+
return client.get_or_create_collection(name=name, metadata=metadata)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def add_documents(
    client: "chromadb.HttpClient",
    collection_name: str,
    ids: list[str],
    embeddings: list[list[float]],
    documents: list[str],
) -> None:
    """Add documents with precomputed embeddings to a collection.

    When Chroma rejects the batch because of an embedding-dimension
    mismatch with the existing collection, the collection is dropped,
    recreated (cosine space) and the batch is added once more.  Any other
    ``InvalidArgumentError`` is re-raised unchanged.
    """
    import chromadb  # deferred: optional dependency; needed for the error type

    collection = get_or_create_collection(client, collection_name)
    try:
        collection.add(ids=ids, embeddings=embeddings, documents=documents)
        return
    except chromadb.errors.InvalidArgumentError as e:
        if "dimension" not in str(e).lower():
            raise
    # Dimension mismatch: rebuild the collection and retry once.
    client.delete_collection(name=collection_name)
    collection = client.create_collection(
        name=collection_name,
        metadata={"hnsw:space": "cosine"},
    )
    collection.add(ids=ids, embeddings=embeddings, documents=documents)
|
hecvec/chroma_list.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""
|
|
2
|
+
List Chroma collections. One module = one responsibility: inspect collections on a Chroma server.
|
|
3
|
+
Requires: pip install hecvec[chroma] (chromadb).
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
import chromadb
|
|
11
|
+
|
|
12
|
+
DEFAULT_HOST = "localhost"
|
|
13
|
+
DEFAULT_PORT = 8000
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def list_collections(
    host: str = DEFAULT_HOST,
    port: int = DEFAULT_PORT,
) -> list[tuple[str, int]]:
    """Return ``(name, document_count)`` for every collection on a server.

    Connects over HTTP; unlike :func:`hecvec.chroma_client.get_client` there
    is no in-memory fallback — an unreachable server raises.
    """
    import chromadb  # deferred: optional dependency

    client = chromadb.HttpClient(host=host, port=port)
    return [(coll.name, coll.count()) for coll in client.list_collections()]
|
hecvec/chunking.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Chunk text documents. Requires optional dependency: pip install hecvec[chunk]
|
|
3
|
+
"""
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
|
11
|
+
|
|
12
|
+
_DEFAULT_CHUNK_SIZE = 400
|
|
13
|
+
_DEFAULT_CHUNK_OVERLAP = 0
|
|
14
|
+
_DEFAULT_SEPARATORS = ["\n\n\n", "\n\n", "\n", ". ", " ", ""]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _get_splitter(
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
    separators: list[str] | None = None,
) -> "RecursiveCharacterTextSplitter":
    """Build a RecursiveCharacterTextSplitter, with a helpful install hint.

    A falsy ``separators`` (None or empty) selects the module defaults.
    """
    try:
        from langchain_text_splitters import RecursiveCharacterTextSplitter
    except ImportError as e:
        msg = "Chunking requires langchain-text-splitters. Install with: pip install hecvec[chunk]"
        raise ImportError(msg) from e
    if not separators:
        separators = _DEFAULT_SEPARATORS
    return RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=separators,
    )
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def chunk_text(
    text: str,
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
    separators: list[str] | None = None,
) -> list[str]:
    """
    Split one document's text into character-bounded chunks.

    Requires optional dependency: pip install hecvec[chunk]
    """
    return _get_splitter(chunk_size, chunk_overlap, separators).split_text(text)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def chunk_documents(
    path_and_texts: list[tuple[str | Path, str]],
    chunk_size: int = _DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = _DEFAULT_CHUNK_OVERLAP,
    separators: list[str] | None = None,
) -> list[dict]:
    """
    Chunk multiple documents; each input item is (path, text).

    Returns dicts of the form {"path": str, "chunk_index": int, "content": str},
    where "path" is the resolved absolute path.

    Requires optional dependency: pip install hecvec[chunk]
    """
    splitter = _get_splitter(chunk_size, chunk_overlap, separators)
    records: list[dict] = []
    for doc_path, doc_text in path_and_texts:
        resolved = str(Path(doc_path).resolve())
        records.extend(
            {"path": resolved, "chunk_index": idx, "content": piece}
            for idx, piece in enumerate(splitter.split_text(doc_text))
        )
    return records
|
hecvec/cli.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""CLI for listing directory contents under a root."""
|
|
2
|
+
import sys
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from hecvec import ListDir
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def main() -> None:
    """CLI entry point: print directory entries restricted to a root.

    Usage: ``prog [PATH [ROOT]]`` — PATH defaults to "." and ROOT to the
    current working directory.
    """
    args = sys.argv[1:]
    path = args[0] if args else "."
    root = Path(args[1]).resolve() if len(args) >= 2 else Path.cwd()

    lister = ListDir(root=root)
    entries = lister.listdir(path)
    print(f"Contenido de: {path}\n")
    for rel in entries:
        # Directories are flagged; files get matching-width padding.
        marker = "[DIR] " if (lister.root / rel).is_dir() else "      "
        print(f"  {marker}{rel}")
    print(f"\nTotal: {len(entries)} entradas")


if __name__ == "__main__":
    main()
|
hecvec/embeddings.py
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"""
|
|
2
|
+
OpenAI embeddings. One module = one responsibility: turn text list into embedding vectors.
|
|
3
|
+
Requires: pip install hecvec[chroma] (openai).
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
from typing import TYPE_CHECKING
|
|
8
|
+
|
|
9
|
+
if TYPE_CHECKING:
|
|
10
|
+
from openai import OpenAI
|
|
11
|
+
|
|
12
|
+
DEFAULT_MODEL = "text-embedding-3-small"
|
|
13
|
+
DEFAULT_BATCH_SIZE = 100
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _get_openai_client(api_key: str | None = None) -> "OpenAI":
|
|
17
|
+
from openai import OpenAI
|
|
18
|
+
if not api_key:
|
|
19
|
+
raise ValueError("api_key is required for embeddings")
|
|
20
|
+
return OpenAI(api_key=api_key)
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def embed_texts(
    texts: list[str],
    api_key: str,
    model: str = DEFAULT_MODEL,
    batch_size: int = DEFAULT_BATCH_SIZE,
) -> list[list[float]]:
    """
    Embed a list of text strings with the OpenAI embeddings API.

    Requests are issued in batches of ``batch_size``; the returned vectors
    are in the same order as ``texts``.
    """
    client = _get_openai_client(api_key)
    vectors: list[list[float]] = []
    for offset in range(0, len(texts), batch_size):
        chunk = texts[offset : offset + batch_size]
        response = client.embeddings.create(model=model, input=chunk)
        vectors += [item.embedding for item in response.data]
    return vectors
|
hecvec/env.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Load environment and secrets (e.g. OPENAI_API_KEY from .env).
|
|
3
|
+
Use this module to centralize dotenv and env access.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import os
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def load_dotenv_if_available(dotenv_path: str | Path | None = None) -> None:
|
|
12
|
+
"""Load .env into os.environ if python-dotenv is installed. No-op otherwise."""
|
|
13
|
+
try:
|
|
14
|
+
from dotenv import load_dotenv
|
|
15
|
+
load_dotenv(dotenv_path)
|
|
16
|
+
except ImportError:
|
|
17
|
+
pass
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def load_openai_key(dotenv_path: str | Path | None = None) -> str | None:
    """Return OPENAI_API_KEY from the environment, or None when unset.

    A .env file is loaded first (best-effort, via python-dotenv when
    installed); ``dotenv_path`` optionally points at a specific file.
    """
    load_dotenv_if_available(dotenv_path)
    return os.getenv("OPENAI_API_KEY")
|
hecvec/hecvec.py
ADDED
hecvec/listdir.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""Directory listing with safe path resolution under a root."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import os
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
ALLOWED_EXTENSIONS = (".txt", ".md")
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ListDir:
|
|
12
|
+
"""
|
|
13
|
+
Lists directory contents with safe path resolution. All paths are
|
|
14
|
+
restricted under `root`. Returns a single list of relative path strings.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
def __init__(self, root: str | Path):
|
|
18
|
+
self.root = Path(root).resolve()
|
|
19
|
+
if not self.root.is_dir():
|
|
20
|
+
raise ValueError(f"root must be an existing directory: {self.root}")
|
|
21
|
+
|
|
22
|
+
def _resolve(self, path: str | Path) -> Path:
|
|
23
|
+
"""Resolve path under root; raise ValueError if it escapes root."""
|
|
24
|
+
p = Path(path)
|
|
25
|
+
if not p.is_absolute():
|
|
26
|
+
p = self.root / p
|
|
27
|
+
resolved = p.resolve()
|
|
28
|
+
try:
|
|
29
|
+
resolved.relative_to(self.root)
|
|
30
|
+
except ValueError:
|
|
31
|
+
raise ValueError(f"Path escapes allowed root: {path!r} -> {resolved}")
|
|
32
|
+
return resolved
|
|
33
|
+
|
|
34
|
+
def listdir(self, path: str | Path = ".") -> list[str]:
|
|
35
|
+
"""
|
|
36
|
+
List entries in the given path (under root). Returns a sorted list
|
|
37
|
+
of relative path strings (dirs first, then files, by name).
|
|
38
|
+
"""
|
|
39
|
+
target = self._resolve(path)
|
|
40
|
+
if not target.is_dir():
|
|
41
|
+
raise ValueError(f"Not a directory: {target}")
|
|
42
|
+
out = []
|
|
43
|
+
for entry in target.iterdir():
|
|
44
|
+
try:
|
|
45
|
+
rel = entry.relative_to(self.root)
|
|
46
|
+
except ValueError:
|
|
47
|
+
continue
|
|
48
|
+
out.append(str(rel))
|
|
49
|
+
return sorted(out, key=lambda x: (not (self.root / x).is_dir(), x.lower()))
|
|
50
|
+
|
|
51
|
+
def listdir_recursive(
|
|
52
|
+
self,
|
|
53
|
+
path: str | Path = ".",
|
|
54
|
+
max_depth: int | None = None,
|
|
55
|
+
) -> list[str]:
|
|
56
|
+
"""
|
|
57
|
+
List all entries under path recursively. Returns a flat sorted list
|
|
58
|
+
of relative path strings.
|
|
59
|
+
"""
|
|
60
|
+
target = self._resolve(path)
|
|
61
|
+
if not target.is_dir():
|
|
62
|
+
raise ValueError(f"Not a directory: {target}")
|
|
63
|
+
out = []
|
|
64
|
+
for root_dir, dirs, files in os.walk(target, topdown=True):
|
|
65
|
+
root_path = Path(root_dir)
|
|
66
|
+
try:
|
|
67
|
+
rel_root = root_path.relative_to(self.root)
|
|
68
|
+
except ValueError:
|
|
69
|
+
continue
|
|
70
|
+
depth = len(rel_root.parts) if rel_root != Path(".") else 0
|
|
71
|
+
if max_depth is not None and depth > max_depth:
|
|
72
|
+
dirs.clear()
|
|
73
|
+
continue
|
|
74
|
+
for d in sorted(dirs):
|
|
75
|
+
p = root_path / d
|
|
76
|
+
try:
|
|
77
|
+
rel = p.relative_to(self.root)
|
|
78
|
+
except ValueError:
|
|
79
|
+
continue
|
|
80
|
+
out.append(str(rel))
|
|
81
|
+
for f in sorted(files):
|
|
82
|
+
p = root_path / f
|
|
83
|
+
try:
|
|
84
|
+
rel = p.relative_to(self.root)
|
|
85
|
+
except ValueError:
|
|
86
|
+
continue
|
|
87
|
+
out.append(str(rel))
|
|
88
|
+
return sorted(out)
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
class ListDirTextFiles(ListDir):
    """
    ListDir specialization that keeps only text files.

    Takes the relative paths produced by the parent listing methods and
    returns full resolved Paths for files whose extension is in
    ``allowed_extensions`` (.txt and .md by default); everything else is
    filtered out.
    """

    def __init__(self, root: str | Path, allowed_extensions: tuple[str, ...] = ALLOWED_EXTENSIONS):
        super().__init__(root)
        # Extensions are compared lower-cased against Path.suffix.
        self.allowed_extensions = allowed_extensions

    def filter_txt_md(self, relative_paths: list[str]) -> list[Path]:
        """
        Turn relative path strings (from listdir/listdir_recursive) into
        full resolved Paths, keeping only existing files with an allowed
        extension.  Result is sorted.
        """
        matches = [
            (self.root / rel).resolve()
            for rel in relative_paths
            if (self.root / rel).is_file()
            and (self.root / rel).suffix.lower() in self.allowed_extensions
        ]
        return sorted(matches)

    def listdir_txt_md(self, path: str | Path = ".") -> list[Path]:
        """List only .txt and .md files directly under ``path``."""
        return self.filter_txt_md(self.listdir(path))

    def listdir_recursive_txt_md(
        self,
        path: str | Path = ".",
        max_depth: int | None = None,
    ) -> list[Path]:
        """List only .txt and .md files under ``path`` recursively."""
        return self.filter_txt_md(self.listdir_recursive(path=path, max_depth=max_depth))
|
hecvec/pipeline.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Pipeline: composes listdir → read → token-chunk → embed → Chroma.
|
|
3
|
+
One module = one responsibility: orchestrate the full slice() flow.
|
|
4
|
+
"""
|
|
5
|
+
from __future__ import annotations
|
|
6
|
+
|
|
7
|
+
import logging
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from hecvec.chroma_client import add_documents, get_client
|
|
12
|
+
from hecvec.embeddings import embed_texts
|
|
13
|
+
from hecvec.env import load_openai_key
|
|
14
|
+
from hecvec.listdir import ListDirTextFiles
|
|
15
|
+
from hecvec.reading import ReadText
|
|
16
|
+
from hecvec.token_splitter import token_chunk_documents
|
|
17
|
+
|
|
18
|
+
logger = logging.getLogger(__name__)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _check_chroma_deps() -> None:
|
|
22
|
+
try:
|
|
23
|
+
import chromadb # noqa: F401
|
|
24
|
+
from langchain_text_splitters import TokenTextSplitter # noqa: F401
|
|
25
|
+
from openai import OpenAI # noqa: F401
|
|
26
|
+
except ImportError as e:
|
|
27
|
+
raise ImportError(
|
|
28
|
+
"Chroma pipeline requires: pip install hecvec[chroma]"
|
|
29
|
+
) from e
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class HecVec:
|
|
33
|
+
"""
|
|
34
|
+
Library-only pipeline. No API.
|
|
35
|
+
Call slice(path=...) to: list path → filter .txt/.md → token-chunk → push to Chroma.
|
|
36
|
+
"""
|
|
37
|
+
|
|
38
|
+
def __init__(
|
|
39
|
+
self,
|
|
40
|
+
*,
|
|
41
|
+
root: str | Path | None = None,
|
|
42
|
+
collection_name: str = "hecvec",
|
|
43
|
+
chroma_host: str = "localhost",
|
|
44
|
+
chroma_port: int = 8000,
|
|
45
|
+
embedding_model: str = "text-embedding-3-small",
|
|
46
|
+
chunk_size: int = 200,
|
|
47
|
+
chunk_overlap: int = 0,
|
|
48
|
+
encoding_name: str = "cl100k_base",
|
|
49
|
+
batch_size: int = 100,
|
|
50
|
+
openai_api_key: str | None = None,
|
|
51
|
+
dotenv_path: str | Path | None = None,
|
|
52
|
+
):
|
|
53
|
+
self.root = Path(root).resolve() if root else Path.cwd()
|
|
54
|
+
self.collection_name = collection_name
|
|
55
|
+
self.chroma_host = chroma_host
|
|
56
|
+
self.chroma_port = chroma_port
|
|
57
|
+
self.embedding_model = embedding_model
|
|
58
|
+
self.chunk_size = chunk_size
|
|
59
|
+
self.chunk_overlap = chunk_overlap
|
|
60
|
+
self.encoding_name = encoding_name
|
|
61
|
+
self.batch_size = batch_size
|
|
62
|
+
self._dotenv_path = dotenv_path
|
|
63
|
+
self._openai_api_key = openai_api_key or load_openai_key(dotenv_path)
|
|
64
|
+
|
|
65
|
+
def slice(
|
|
66
|
+
self,
|
|
67
|
+
path: str | Path,
|
|
68
|
+
**kwargs: Any,
|
|
69
|
+
) -> dict[str, Any]:
|
|
70
|
+
"""Run the pipeline for this instance (uses instance config)."""
|
|
71
|
+
return HecVec.slice(
|
|
72
|
+
path,
|
|
73
|
+
root=kwargs.pop("root", self.root),
|
|
74
|
+
collection_name=kwargs.pop("collection_name", self.collection_name),
|
|
75
|
+
chroma_host=kwargs.pop("chroma_host", self.chroma_host),
|
|
76
|
+
chroma_port=kwargs.pop("chroma_port", self.chroma_port),
|
|
77
|
+
embedding_model=kwargs.pop("embedding_model", self.embedding_model),
|
|
78
|
+
chunk_size=kwargs.pop("chunk_size", self.chunk_size),
|
|
79
|
+
chunk_overlap=kwargs.pop("chunk_overlap", self.chunk_overlap),
|
|
80
|
+
encoding_name=kwargs.pop("encoding_name", self.encoding_name),
|
|
81
|
+
batch_size=kwargs.pop("batch_size", self.batch_size),
|
|
82
|
+
openai_api_key=kwargs.pop("openai_api_key", self._openai_api_key),
|
|
83
|
+
dotenv_path=kwargs.pop("dotenv_path", self._dotenv_path),
|
|
84
|
+
**kwargs,
|
|
85
|
+
)
|
|
86
|
+
|
|
87
|
+
@classmethod
|
|
88
|
+
def slice(
|
|
89
|
+
cls,
|
|
90
|
+
path: str | Path,
|
|
91
|
+
*,
|
|
92
|
+
root: str | Path | None = None,
|
|
93
|
+
collection_name: str = "hecvec",
|
|
94
|
+
chroma_host: str = "localhost",
|
|
95
|
+
chroma_port: int = 8000,
|
|
96
|
+
embedding_model: str = "text-embedding-3-small",
|
|
97
|
+
chunk_size: int = 200,
|
|
98
|
+
chunk_overlap: int = 0,
|
|
99
|
+
encoding_name: str = "cl100k_base",
|
|
100
|
+
batch_size: int = 100,
|
|
101
|
+
openai_api_key: str | None = None,
|
|
102
|
+
dotenv_path: str | Path | None = None,
|
|
103
|
+
**kwargs: Any,
|
|
104
|
+
) -> dict[str, Any]:
|
|
105
|
+
"""
|
|
106
|
+
Run the full pipeline: find path → listdir → filter .txt/.md → token-chunk → push to Chroma.
|
|
107
|
+
No API; everything runs in the library.
|
|
108
|
+
"""
|
|
109
|
+
_check_chroma_deps()
|
|
110
|
+
|
|
111
|
+
path = Path(path).resolve()
|
|
112
|
+
if not path.exists():
|
|
113
|
+
raise ValueError(f"path does not exist: {path}")
|
|
114
|
+
|
|
115
|
+
if collection_name == "hecvec":
|
|
116
|
+
collection_name = path.stem if path.is_file() else path.name
|
|
117
|
+
|
|
118
|
+
if path.is_file():
|
|
119
|
+
if path.suffix.lower() not in (".txt", ".md"):
|
|
120
|
+
raise ValueError(f"File must be .txt or .md: {path}")
|
|
121
|
+
root = path.parent
|
|
122
|
+
paths = [path]
|
|
123
|
+
logger.info("Archivo a procesar: %s", path)
|
|
124
|
+
else:
|
|
125
|
+
root = Path(root).resolve() if root else path
|
|
126
|
+
if not root.is_dir():
|
|
127
|
+
raise ValueError(f"path must be an existing directory: {root}")
|
|
128
|
+
logger.info("Ruta a procesar: %s", root)
|
|
129
|
+
lister = ListDirTextFiles(root=root)
|
|
130
|
+
paths = lister.listdir_recursive_txt_md(path)
|
|
131
|
+
if not paths:
|
|
132
|
+
logger.warning("No se encontraron archivos .txt/.md en %s", path)
|
|
133
|
+
return {"files": 0, "chunks": 0, "collection": collection_name, "message": "No .txt/.md files found"}
|
|
134
|
+
logger.info("Archivos .txt/.md encontrados: %d", len(paths))
|
|
135
|
+
for i, p in enumerate(paths[:10]):
|
|
136
|
+
logger.info(" %s", p)
|
|
137
|
+
if len(paths) > 10:
|
|
138
|
+
logger.info(" ... y %d más", len(paths) - 10)
|
|
139
|
+
|
|
140
|
+
# 1. (paths already set above) — 2. Read as text
|
|
141
|
+
logger.info("Leyendo contenido de %d archivos...", len(paths))
|
|
142
|
+
reader = ReadText(paths)
|
|
143
|
+
path_and_text = reader.read_all()
|
|
144
|
+
logger.info("Leídos %d archivos correctamente", len(path_and_text))
|
|
145
|
+
|
|
146
|
+
# 3. Token chunk
|
|
147
|
+
logger.info("Fragmentando con chunk_size=%d, chunk_overlap=%d...", chunk_size, chunk_overlap)
|
|
148
|
+
ids, documents = token_chunk_documents(
|
|
149
|
+
path_and_text,
|
|
150
|
+
chunk_size=chunk_size,
|
|
151
|
+
chunk_overlap=chunk_overlap,
|
|
152
|
+
encoding_name=encoding_name,
|
|
153
|
+
)
|
|
154
|
+
if not documents:
|
|
155
|
+
logger.warning("No se generaron chunks (archivos vacíos o sin texto)")
|
|
156
|
+
return {"files": len(path_and_text), "chunks": 0, "collection": collection_name}
|
|
157
|
+
|
|
158
|
+
logger.info("Chunks generados: %d", len(documents))
|
|
159
|
+
|
|
160
|
+
# 4. Embed
|
|
161
|
+
api_key = openai_api_key or load_openai_key(dotenv_path)
|
|
162
|
+
if not api_key:
|
|
163
|
+
raise ValueError(
|
|
164
|
+
"OPENAI_API_KEY required for embeddings. "
|
|
165
|
+
"Set it in .env, pass openai_api_key=, or set the OPENAI_API_KEY env var."
|
|
166
|
+
)
|
|
167
|
+
logger.info("Generando embeddings (modelo=%s, batch_size=%d)...", embedding_model, batch_size)
|
|
168
|
+
embeddings = embed_texts(
|
|
169
|
+
documents,
|
|
170
|
+
api_key=api_key,
|
|
171
|
+
model=embedding_model,
|
|
172
|
+
batch_size=batch_size,
|
|
173
|
+
)
|
|
174
|
+
logger.info("Embeddings generados: %d vectores", len(embeddings))
|
|
175
|
+
|
|
176
|
+
# 5. Push to Chroma
|
|
177
|
+
logger.info("Conectando a Chroma (host=%s, port=%s)...", chroma_host, chroma_port)
|
|
178
|
+
client = get_client(host=chroma_host, port=chroma_port)
|
|
179
|
+
logger.info("Añadiendo %d documentos a la colección '%s'...", len(documents), collection_name)
|
|
180
|
+
add_documents(client, collection_name, ids, embeddings, documents)
|
|
181
|
+
logger.info("Pipeline completado: %d archivos → %d chunks → Chroma", len(path_and_text), len(documents))
|
|
182
|
+
|
|
183
|
+
return {
|
|
184
|
+
"files": len(path_and_text),
|
|
185
|
+
"chunks": len(documents),
|
|
186
|
+
"collection": collection_name,
|
|
187
|
+
}
|
hecvec/reading.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Read text files and expose content as strings."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import logging
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
logger = logging.getLogger(__name__)
|
|
8
|
+
|
|
9
|
+
# Fallback encodings to try if UTF-8 fails (e.g. Windows Latin-1 / CP1252)
|
|
10
|
+
FALLBACK_ENCODINGS = ("utf-8", "latin-1", "cp1252")
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
class ReadText:
|
|
14
|
+
"""
|
|
15
|
+
Given a list of file paths (.txt and .md), reads each as text.
|
|
16
|
+
Returns (path, text) pairs. Tries UTF-8 first, then latin-1, then cp1252.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
def __init__(self, paths: list[str] | list[Path], encoding: str = "utf-8"):
|
|
20
|
+
self.paths = [Path(p) for p in paths]
|
|
21
|
+
self.encoding = encoding
|
|
22
|
+
|
|
23
|
+
def read_all(self) -> list[tuple[Path, str]]:
|
|
24
|
+
"""
|
|
25
|
+
Read each file as text.
|
|
26
|
+
Returns a list of (path, text). Skips non-files and missing paths.
|
|
27
|
+
"""
|
|
28
|
+
out = []
|
|
29
|
+
for p in self.paths:
|
|
30
|
+
if not p.is_file():
|
|
31
|
+
logger.debug("Saltando (no es archivo): %s", p)
|
|
32
|
+
continue
|
|
33
|
+
text = None
|
|
34
|
+
for enc in FALLBACK_ENCODINGS:
|
|
35
|
+
try:
|
|
36
|
+
text = p.read_text(encoding=enc)
|
|
37
|
+
if enc != "utf-8":
|
|
38
|
+
logger.info("Leído con encoding %s: %s", enc, p.name)
|
|
39
|
+
break
|
|
40
|
+
except (OSError, UnicodeDecodeError) as e:
|
|
41
|
+
if enc == FALLBACK_ENCODINGS[-1]:
|
|
42
|
+
logger.warning("No se pudo leer %s: %s", p, e)
|
|
43
|
+
continue
|
|
44
|
+
if text is not None:
|
|
45
|
+
out.append((p.resolve(), text))
|
|
46
|
+
return out
|
|
47
|
+
|
|
48
|
+
def __iter__(self):
|
|
49
|
+
"""Iterate over (path, text)."""
|
|
50
|
+
for p in self.paths:
|
|
51
|
+
if not p.is_file():
|
|
52
|
+
continue
|
|
53
|
+
for enc in FALLBACK_ENCODINGS:
|
|
54
|
+
try:
|
|
55
|
+
text = p.read_text(encoding=enc)
|
|
56
|
+
yield p.resolve(), text
|
|
57
|
+
break
|
|
58
|
+
except (OSError, UnicodeDecodeError):
|
|
59
|
+
continue
|
hecvec/token_splitter.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Token-based text chunking (TokenTextSplitter from langchain).
|
|
3
|
+
One module = one responsibility: split text by token count.
|
|
4
|
+
Requires: pip install hecvec[chunk] or hecvec[chroma]
|
|
5
|
+
"""
|
|
6
|
+
from __future__ import annotations
|
|
7
|
+
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import TYPE_CHECKING
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from langchain_text_splitters import TokenTextSplitter
|
|
13
|
+
|
|
14
|
+
DEFAULT_CHUNK_SIZE = 200
|
|
15
|
+
DEFAULT_CHUNK_OVERLAP = 0
|
|
16
|
+
DEFAULT_ENCODING_NAME = "cl100k_base"
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _get_token_splitter(
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    encoding_name: str = DEFAULT_ENCODING_NAME,
) -> "TokenTextSplitter":
    """Build a TokenTextSplitter, with an install hint if the dep is missing."""
    try:
        from langchain_text_splitters import TokenTextSplitter
    except ImportError as e:
        msg = "Token splitting requires langchain-text-splitters. Install with: pip install hecvec[chunk]"
        raise ImportError(msg) from e
    return TokenTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        encoding_name=encoding_name,
    )
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def token_chunk_text(
    text: str,
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    encoding_name: str = DEFAULT_ENCODING_NAME,
) -> list[str]:
    """Split one document's text into token-count-bounded chunks."""
    return _get_token_splitter(chunk_size, chunk_overlap, encoding_name).split_text(text)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def token_chunk_documents(
    path_and_texts: list[tuple[str | Path, str]],
    chunk_size: int = DEFAULT_CHUNK_SIZE,
    chunk_overlap: int = DEFAULT_CHUNK_OVERLAP,
    encoding_name: str = DEFAULT_ENCODING_NAME,
) -> tuple[list[str], list[str]]:
    """
    Chunk several documents by token count.

    Returns (ids, documents); ids are sequential ("chunk_0", "chunk_1", ...)
    across all documents.  Whitespace-only chunks are dropped.  The input
    paths are accepted for interface symmetry but are not encoded in the ids.
    """
    splitter = _get_token_splitter(chunk_size, chunk_overlap, encoding_name)
    ids: list[str] = []
    documents: list[str] = []
    for _path, text in path_and_texts:
        for piece in splitter.split_text(text):
            if not piece.strip():
                continue
            ids.append(f"chunk_{len(ids)}")
            documents.append(piece)
    return ids, documents
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: hecvec
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: List directories (safe root), filter .txt/.md files, read as text, chunk, embed, and push to Chroma.
|
|
5
|
+
License-Expression: MIT
|
|
6
|
+
Keywords: chunking,document-pipeline,listdir,text-files
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
16
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
17
|
+
Requires-Python: <3.14,>=3.9
|
|
18
|
+
Requires-Dist: chromadb>=0.4.0
|
|
19
|
+
Requires-Dist: langchain-text-splitters>=0.2.0
|
|
20
|
+
Requires-Dist: openai>=1.0.0
|
|
21
|
+
Requires-Dist: python-dotenv>=1.0.0
|
|
22
|
+
Requires-Dist: tiktoken>=0.5.0
|
|
23
|
+
Provides-Extra: chroma
|
|
24
|
+
Requires-Dist: chromadb>=0.4.0; extra == 'chroma'
|
|
25
|
+
Requires-Dist: langchain-text-splitters>=0.2.0; extra == 'chroma'
|
|
26
|
+
Requires-Dist: openai>=1.0.0; extra == 'chroma'
|
|
27
|
+
Requires-Dist: python-dotenv>=1.0.0; extra == 'chroma'
|
|
28
|
+
Requires-Dist: tiktoken>=0.5.0; extra == 'chroma'
|
|
29
|
+
Provides-Extra: chunk
|
|
30
|
+
Requires-Dist: langchain-text-splitters>=0.2.0; extra == 'chunk'
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=7.0.0; extra == 'dev'
|
|
33
|
+
Description-Content-Type: text/markdown
|
|
34
|
+
|
|
35
|
+
# HecVec
|
|
36
|
+
|
|
37
|
+
List directories with a safe root, filter `.txt`/`.md` files, read them as text, and optionally chunk and push to Chroma — **library only, no API**.
|
|
38
|
+
|
|
39
|
+
## Install
|
|
40
|
+
|
|
41
|
+
```bash
|
|
42
|
+
pip install hecvec
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
One-call pipeline (list → filter → token-chunk → Chroma):
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
pip install "hecvec[chroma]"
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Optional chunking only (no Chroma):
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
pip install "hecvec[chunk]"
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## Usage
|
|
58
|
+
|
|
59
|
+
### One-call pipeline (list → filter → chunk → Chroma)
|
|
60
|
+
|
|
61
|
+
Runs entirely in the library (no API). You need Chroma running (e.g. `docker run -p 8000:8000 chromadb/chroma`) and `OPENAI_API_KEY` set (in the environment or in a `.env` file; the library loads `.env` via python-dotenv when you use `hecvec[chroma]`).
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
import hecvec
|
|
65
|
+
|
|
66
|
+
# Class-style: use defaults, then slice
|
|
67
|
+
test = hecvec.HecVec()
|
|
68
|
+
result = test.slice(path="/path/to/folder")
|
|
69
|
+
# → {"files": N, "chunks": M, "collection": "hecvec"}
|
|
70
|
+
|
|
71
|
+
# Or call slice on the class (same flow)
|
|
72
|
+
result = hecvec.HecVec.slice(path="/path/to/folder")
|
|
73
|
+
```
|
|
74
|
+
|
|
75
|
+
Flow: resolve path → listdir → filter `.txt`/`.md` → token-chunk (200 tokens, `cl100k_base`) → embed with OpenAI → push to Chroma.
|
|
76
|
+
|
|
77
|
+
Optional config (instance or `HecVec.slice(..., key=value)`):
|
|
78
|
+
|
|
79
|
+
- `root`, `collection_name`, `chroma_host`, `chroma_port`
|
|
80
|
+
- `embedding_model`, `chunk_size`, `chunk_overlap`, `encoding_name`, `batch_size`
|
|
81
|
+
- `openai_api_key` (or set `OPENAI_API_KEY` in the environment or in a `.env` file; optional `dotenv_path` to point to a specific `.env`)
|
|
82
|
+
|
|
83
|
+
### Low-level building blocks
|
|
84
|
+
|
|
85
|
+
```python
|
|
86
|
+
from pathlib import Path
|
|
87
|
+
from hecvec import ListDir, ListDirTextFiles, ReadText
|
|
88
|
+
|
|
89
|
+
root = Path("/path/to/repo")
|
|
90
|
+
|
|
91
|
+
# List all entries under a path (restricted to root)
|
|
92
|
+
lister = ListDir(root=root)
|
|
93
|
+
for rel in lister.listdir("."):
|
|
94
|
+
print(rel)
|
|
95
|
+
|
|
96
|
+
# Only .txt and .md files, recursively
|
|
97
|
+
text_lister = ListDirTextFiles(root=root)
|
|
98
|
+
paths = text_lister.listdir_recursive_txt_md("docs")
|
|
99
|
+
|
|
100
|
+
# Read each file as text
|
|
101
|
+
reader = ReadText(paths)
|
|
102
|
+
for path, text in reader:
|
|
103
|
+
print(path, len(text))
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
### Chunking (optional)
|
|
107
|
+
|
|
108
|
+
With `pip install hecvec[chunk]`:
|
|
109
|
+
|
|
110
|
+
```python
|
|
111
|
+
from hecvec import ListDirTextFiles, ReadText
|
|
112
|
+
from hecvec.chunking import chunk_documents
|
|
113
|
+
|
|
114
|
+
lister = ListDirTextFiles(root=root)
|
|
115
|
+
paths = lister.listdir_recursive_txt_md(".")
|
|
116
|
+
reader = ReadText(paths)
|
|
117
|
+
path_and_text = reader.read_all()
|
|
118
|
+
chunks = chunk_documents(path_and_text)
|
|
119
|
+
# list of {"path": "...", "chunk_index": 0, "content": "..."}
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### CLI
|
|
123
|
+
|
|
124
|
+
```bash
|
|
125
|
+
hecvec-listdir [path] [root]
|
|
126
|
+
# or
|
|
127
|
+
python -m hecvec.cli [path] [root]
|
|
128
|
+
```
|
|
129
|
+
|
|
130
|
+
### Test the full pipeline (the method that does everything)
|
|
131
|
+
|
|
132
|
+
From the project root, with Chroma running and `OPENAI_API_KEY` set (e.g. in `.env`):
|
|
133
|
+
|
|
134
|
+
```bash
|
|
135
|
+
# Start Chroma (one terminal)
|
|
136
|
+
docker run -p 8000:8000 chromadb/chroma
|
|
137
|
+
|
|
138
|
+
# Run the test script (another terminal)
|
|
139
|
+
uv run python scripts/test_slice.py
|
|
140
|
+
# or: python scripts/test_slice.py
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
The script creates a temp folder with two `.txt` files, runs `HecVec.slice(path=...)`, and prints `PASS` or `FAIL` with the result (`files`, `chunks`, `collection`).
|
|
144
|
+
|
|
145
|
+
### Modular layout (easy to study)
|
|
146
|
+
|
|
147
|
+
Each step of the pipeline lives in its own module:
|
|
148
|
+
|
|
149
|
+
| Module | Responsibility |
|
|
150
|
+
|--------|-----------------|
|
|
151
|
+
| `hecvec.env` | Load `.env` and `OPENAI_API_KEY` |
|
|
152
|
+
| `hecvec.listdir` | List dirs under a safe root; filter by extension (`.txt`/`.md`) |
|
|
153
|
+
| `hecvec.reading` | Read files as text (UTF-8 / latin-1 / cp1252 fallback) |
|
|
154
|
+
| `hecvec.token_splitter` | Token-based chunking (TokenTextSplitter) |
|
|
155
|
+
| `hecvec.chunking` | Recursive-character chunking (RecursiveCharacterTextSplitter) |
|
|
156
|
+
| `hecvec.embeddings` | OpenAI embeddings (`embed_texts`) |
|
|
157
|
+
| `hecvec.chroma_client` | Chroma client, get/create collection, add documents |
|
|
158
|
+
| `hecvec.chroma_list` | List Chroma collections and counts |
|
|
159
|
+
| `hecvec.pipeline` | Orchestrator: `HecVec` and `slice(path=...)` |
|
|
160
|
+
|
|
161
|
+
Example: use one step on its own:
|
|
162
|
+
|
|
163
|
+
```python
|
|
164
|
+
from hecvec import embed_texts, token_chunk_text, list_collections
|
|
165
|
+
|
|
166
|
+
chunks = token_chunk_text("Some long document...", chunk_size=200)
|
|
167
|
+
vecs = embed_texts(chunks, api_key="sk-...")
|
|
168
|
+
names_and_counts = list_collections(host="localhost", port=8000)
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Development
|
|
172
|
+
|
|
173
|
+
From the repo root:
|
|
174
|
+
|
|
175
|
+
```bash
|
|
176
|
+
uv sync
|
|
177
|
+
uv run python -c "from hecvec import ListDir; print(ListDir('.').listdir('.'))"
|
|
178
|
+
```
|
|
179
|
+
|
|
180
|
+
## License
|
|
181
|
+
|
|
182
|
+
MIT
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
hecvec/__init__.py,sha256=mwGyDxZgshlfn1wWUB5xO2LTwg5-LWYQmD3Io6KGIZk,1028
|
|
2
|
+
hecvec/chroma_client.py,sha256=iNQDw5r2_BReLY7MVhoJHhXvJduBv_0MR0UkeDEtKf0,1779
|
|
3
|
+
hecvec/chroma_list.py,sha256=botRTK0Ge_-PwBsj9-WE0VxjyMJa6g07m1MO1_1pq2Q,722
|
|
4
|
+
hecvec/chunking.py,sha256=BpHjL5e7I8u-T6goU0TD_Y1US63GUEz_tYIu2iTN9U0,2203
|
|
5
|
+
hecvec/cli.py,sha256=oMernmxfk-_TZr83b0-jl3STl5xBRVORqTokcDByZBA,642
|
|
6
|
+
hecvec/embeddings.py,sha256=kvjty_BDx77COehxTvHWXcKsNqMy2iKiT2H_7V9wp4k,1149
|
|
7
|
+
hecvec/env.py,sha256=691efRFxSzEtDgtM0xl5aAbgrCvjgUqSQe7dG8jzV9g,778
|
|
8
|
+
hecvec/hecvec.py,sha256=XGBXBCAZhdOzP5xvDIFnZZtsMBNOZXrt_dcrIg1vlFE,153
|
|
9
|
+
hecvec/listdir.py,sha256=1xRR7Rn9hAhIscsweZtLBTq54BIV7G8yI9w_qySEIEY,4428
|
|
10
|
+
hecvec/pipeline.py,sha256=TwYBG9Bc06Z04oIAi-vYtCH5u6hjXt853EwR3mJVH_s,7388
|
|
11
|
+
hecvec/reading.py,sha256=gPPVuQPukJIKUJS69-BBZOwMgnvx-0da8gQJ5D3P6sM,2003
|
|
12
|
+
hecvec/token_splitter.py,sha256=usKZjNw_BQasuNx9Hz2iGUHUiq8fNYIy_HMiW7QaZ1k,2167
|
|
13
|
+
hecvec-0.1.0.dist-info/METADATA,sha256=Q3FZQUC_eeBUKOHvQI0CpgaqifMjpYi_ZU2m5sXl5dw,5704
|
|
14
|
+
hecvec-0.1.0.dist-info/WHEEL,sha256=QccIxa26bgl1E6uMy58deGWi-0aeIkkangHcxk2kWfw,87
|
|
15
|
+
hecvec-0.1.0.dist-info/entry_points.txt,sha256=M-5kL-e8yJH7OxTWqnQhLWJhuDPzBU0_rbpr7RVcLvQ,51
|
|
16
|
+
hecvec-0.1.0.dist-info/RECORD,,
|