ctxvault 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. ctxvault-0.1.0/LICENSE +21 -0
  2. ctxvault-0.1.0/PKG-INFO +186 -0
  3. ctxvault-0.1.0/README.md +137 -0
  4. ctxvault-0.1.0/pyproject.toml +87 -0
  5. ctxvault-0.1.0/setup.cfg +4 -0
  6. ctxvault-0.1.0/src/__init__.py +0 -0
  7. ctxvault-0.1.0/src/ctxvault/__init__.py +0 -0
  8. ctxvault-0.1.0/src/ctxvault/api/__init__.py +0 -0
  9. ctxvault-0.1.0/src/ctxvault/cli/__init__.py +0 -0
  10. ctxvault-0.1.0/src/ctxvault/cli/app.py +101 -0
  11. ctxvault-0.1.0/src/ctxvault/core/__init__.py +0 -0
  12. ctxvault-0.1.0/src/ctxvault/core/embedding.py +13 -0
  13. ctxvault-0.1.0/src/ctxvault/core/exceptions.py +21 -0
  14. ctxvault-0.1.0/src/ctxvault/core/indexer.py +26 -0
  15. ctxvault-0.1.0/src/ctxvault/core/querying.py +38 -0
  16. ctxvault-0.1.0/src/ctxvault/models/__init__.py +0 -0
  17. ctxvault-0.1.0/src/ctxvault/storage/__init__.py +0 -0
  18. ctxvault-0.1.0/src/ctxvault/storage/chroma_store.py +43 -0
  19. ctxvault-0.1.0/src/ctxvault/utils/__init__.py +0 -0
  20. ctxvault-0.1.0/src/ctxvault/utils/chuncking.py +8 -0
  21. ctxvault-0.1.0/src/ctxvault/utils/metadata_builder.py +19 -0
  22. ctxvault-0.1.0/src/ctxvault/utils/text_extraction.py +58 -0
  23. ctxvault-0.1.0/src/ctxvault.egg-info/PKG-INFO +186 -0
  24. ctxvault-0.1.0/src/ctxvault.egg-info/SOURCES.txt +30 -0
  25. ctxvault-0.1.0/src/ctxvault.egg-info/dependency_links.txt +1 -0
  26. ctxvault-0.1.0/src/ctxvault.egg-info/entry_points.txt +2 -0
  27. ctxvault-0.1.0/src/ctxvault.egg-info/requires.txt +20 -0
  28. ctxvault-0.1.0/src/ctxvault.egg-info/top_level.txt +1 -0
  29. ctxvault-0.1.0/src/main.py +19 -0
  30. ctxvault-0.1.0/tests/test_api.py +228 -0
  31. ctxvault-0.1.0/tests/test_cli.py +42 -0
  32. ctxvault-0.1.0/tests/test_core.py +37 -0
ctxvault-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Filippo Venturini
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,186 @@
1
+ Metadata-Version: 2.4
2
+ Name: ctxvault
3
+ Version: 0.1.0
4
+ Summary: ctxvault is a local-first knowledge vault that indexes your documents, generates embeddings, and enables fast semantic search via CLI or API. Designed for personal knowledge bases, RAG pipelines, and AI agents.
5
+ Author-email: Filippo Venturini <filippoventurini00@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/Filippo-Venturini/ctx-vault
8
+ Project-URL: Repository, https://github.com/Filippo-Venturini/ctx-vault
9
+ Project-URL: Issues, https://github.com/Filippo-Venturini/ctx-vault/issues
10
+ Keywords: rag,retrieval-augmented-generation,semantic-search,embeddings,vector-database,chroma,llm,ai,knowledge-base,document-search,local-ai,developer-tools,cli,fastapi
11
+ Classifier: Development Status :: 3 - Alpha
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Operating System :: OS Independent
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
21
+ Classifier: Topic :: Database
22
+ Classifier: Topic :: Text Processing :: Indexing
23
+ Classifier: Topic :: Utilities
24
+ Classifier: Environment :: Console
25
+ Classifier: Framework :: FastAPI
26
+ Requires-Python: >=3.9
27
+ Description-Content-Type: text/markdown
28
+ License-File: LICENSE
29
+ Requires-Dist: typer>=0.20.0
30
+ Requires-Dist: chromadb>=1.3.0
31
+ Requires-Dist: sentence-transformers>=5.0.0
32
+ Requires-Dist: fastapi>=0.124.0
33
+ Requires-Dist: uvicorn>=0.38.0
34
+ Requires-Dist: python-dotenv>=1.2.0
35
+ Requires-Dist: pydantic>=2.12.0
36
+ Requires-Dist: rich>=14.0.0
37
+ Requires-Dist: pypdf>=6.0.0
38
+ Requires-Dist: python-docx>=1.0.0
39
+ Requires-Dist: markdown>=3.0.0
40
+ Requires-Dist: strip-tags>=0.5.0
41
+ Provides-Extra: dev
42
+ Requires-Dist: pytest>=9.0.0; extra == "dev"
43
+ Requires-Dist: pytest-mock>=3.15.0; extra == "dev"
44
+ Requires-Dist: pytest-anyio>=0.0.0; extra == "dev"
45
+ Requires-Dist: black>=25.0.0; extra == "dev"
46
+ Requires-Dist: isort>=7.0.0; extra == "dev"
47
+ Requires-Dist: pre-commit>=3.5.0; extra == "dev"
48
+ Dynamic: license-file
49
+
50
+ # CtxVault
51
+
52
+ Local semantic search vault for LLMs.
53
+
54
+ CtxVault lets you index documents locally, generate embeddings, and query them with semantic search.
55
+ Designed as a lightweight RAG backend for agents, scripts, and LLM workflows.
56
+
57
+ ## Why CtxVault
58
+
59
+ - 100% local (no cloud, no data sharing)
60
+ - simple CLI
61
+ - works offline
62
+ - persistent vector store (Chroma)
63
+ - file-based workflow
64
+ - agent/API ready (future)
65
+
66
+ Ideal for:
67
+ - personal knowledge bases
68
+ - private documents
69
+ - local RAG pipelines
70
+ - AI agents needing contextual memory
71
+
72
+ ---
73
+
74
+ ## Install
75
+
76
+ Python 3.10+
77
+
78
+ ```bash
79
+ pip install -e .
80
+ ```
81
+
82
+ ---
83
+
84
+ ## Quickstart
85
+
86
+ Initialize a vault:
87
+
88
+ ```bash
89
+ ctxvault init ./my-vault
90
+ ```
91
+
92
+ Index files or folders:
93
+
94
+ ```bash
95
+ ctxvault index ./my-vault/docs
96
+ ```
97
+
98
+ Query:
99
+
100
+ ```bash
101
+ ctxvault query "what is project Orion?"
102
+ ```
103
+
104
+ ---
105
+
106
+ ## CLI Commands
107
+
108
+ ### init
109
+
110
+ Initialize a vault directory.
111
+
112
+ ```bash
113
+ ctxvault init <path>
114
+ ```
115
+
116
+ ---
117
+
118
+ ### index
119
+
120
+ Index a file or directory.
121
+
122
+ ```bash
123
+ ctxvault index <path>
124
+ ```
125
+
126
+ ---
127
+
128
+ ### query
129
+
130
+ Semantic search inside the vault.
131
+
132
+ ```bash
133
+ ctxvault query "<text>"
134
+ ```
135
+
136
+ ---
137
+
138
+ ### delete
139
+
140
+ Remove a document from the vault.
141
+
142
+ ```bash
143
+ ctxvault delete <path>
144
+ ```
145
+
146
+ ---
147
+
148
+ ### reindex
149
+
150
+ Reindex a document after changes.
151
+
152
+ ```bash
153
+ ctxvault reindex <path>
154
+ ```
155
+
156
+ ---
157
+
158
+ ### list
159
+
160
+ List indexed documents.
161
+
162
+ ```bash
163
+ ctxvault list
164
+ ```
165
+
166
+ ---
167
+
168
+ ## Privacy
169
+
170
+ All processing happens locally.
171
+ No data is sent to external services.
172
+
173
+ ---
174
+
175
+ ## Roadmap
176
+
177
+ * [x] CLI MVP
178
+ * [ ] FastAPI server
179
+ * [ ] sync and file watcher
180
+ * [ ] multi-vault support
181
+
182
+ ---
183
+
184
+ ## License
185
+
186
+ MIT
@@ -0,0 +1,137 @@
1
+ # CtxVault
2
+
3
+ Local semantic search vault for LLMs.
4
+
5
+ CtxVault lets you index documents locally, generate embeddings, and query them with semantic search.
6
+ Designed as a lightweight RAG backend for agents, scripts, and LLM workflows.
7
+
8
+ ## Why CtxVault
9
+
10
+ - 100% local (no cloud, no data sharing)
11
+ - simple CLI
12
+ - works offline
13
+ - persistent vector store (Chroma)
14
+ - file-based workflow
15
+ - agent/API ready (future)
16
+
17
+ Ideal for:
18
+ - personal knowledge bases
19
+ - private documents
20
+ - local RAG pipelines
21
+ - AI agents needing contextual memory
22
+
23
+ ---
24
+
25
+ ## Install
26
+
27
+ Python 3.10+
28
+
29
+ ```bash
30
+ pip install -e .
31
+ ```
32
+
33
+ ---
34
+
35
+ ## Quickstart
36
+
37
+ Initialize a vault:
38
+
39
+ ```bash
40
+ ctxvault init ./my-vault
41
+ ```
42
+
43
+ Index files or folders:
44
+
45
+ ```bash
46
+ ctxvault index ./my-vault/docs
47
+ ```
48
+
49
+ Query:
50
+
51
+ ```bash
52
+ ctxvault query "what is project Orion?"
53
+ ```
54
+
55
+ ---
56
+
57
+ ## CLI Commands
58
+
59
+ ### init
60
+
61
+ Initialize a vault directory.
62
+
63
+ ```bash
64
+ ctxvault init <path>
65
+ ```
66
+
67
+ ---
68
+
69
+ ### index
70
+
71
+ Index a file or directory.
72
+
73
+ ```bash
74
+ ctxvault index <path>
75
+ ```
76
+
77
+ ---
78
+
79
+ ### query
80
+
81
+ Semantic search inside the vault.
82
+
83
+ ```bash
84
+ ctxvault query "<text>"
85
+ ```
86
+
87
+ ---
88
+
89
+ ### delete
90
+
91
+ Remove a document from the vault.
92
+
93
+ ```bash
94
+ ctxvault delete <path>
95
+ ```
96
+
97
+ ---
98
+
99
+ ### reindex
100
+
101
+ Reindex a document after changes.
102
+
103
+ ```bash
104
+ ctxvault reindex <path>
105
+ ```
106
+
107
+ ---
108
+
109
+ ### list
110
+
111
+ List indexed documents.
112
+
113
+ ```bash
114
+ ctxvault list
115
+ ```
116
+
117
+ ---
118
+
119
+ ## Privacy
120
+
121
+ All processing happens locally.
122
+ No data is sent to external services.
123
+
124
+ ---
125
+
126
+ ## Roadmap
127
+
128
+ * [x] CLI MVP
129
+ * [ ] FastAPI server
130
+ * [ ] sync and file watcher
131
+ * [ ] multi-vault support
132
+
133
+ ---
134
+
135
+ ## License
136
+
137
+ MIT
@@ -0,0 +1,87 @@
1
+ [build-system]
2
+ requires = ["setuptools>=61.0"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "ctxvault"
7
+ version = "0.1.0"
8
+ description = "ctxvault is a local-first knowledge vault that indexes your documents, generates embeddings, and enables fast semantic search via CLI or API. Designed for personal knowledge bases, RAG pipelines, and AI agents."
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = {text = "MIT"}
12
+ authors = [
13
+ {name = "Filippo Venturini", email = "filippoventurini00@gmail.com"}
14
+ ]
15
+ keywords = [
16
+ "rag",
17
+ "retrieval-augmented-generation",
18
+ "semantic-search",
19
+ "embeddings",
20
+ "vector-database",
21
+ "chroma",
22
+ "llm",
23
+ "ai",
24
+ "knowledge-base",
25
+ "document-search",
26
+ "local-ai",
27
+ "developer-tools",
28
+ "cli",
29
+ "fastapi"
30
+ ]
31
+ classifiers = [
32
+ "Development Status :: 3 - Alpha",
33
+ "Intended Audience :: Developers",
34
+ "Intended Audience :: Science/Research",
35
+ "License :: OSI Approved :: MIT License",
36
+ "Operating System :: OS Independent",
37
+ "Programming Language :: Python :: 3",
38
+ "Programming Language :: Python :: 3.10",
39
+ "Programming Language :: Python :: 3.11",
40
+ "Programming Language :: Python :: 3.12",
41
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
42
+ "Topic :: Database",
43
+ "Topic :: Text Processing :: Indexing",
44
+ "Topic :: Utilities",
45
+ "Environment :: Console",
46
+ "Framework :: FastAPI"
47
+ ]
48
+ dependencies = [
49
+ "typer>=0.20.0",
50
+ "chromadb>=1.3.0",
51
+ "sentence-transformers>=5.0.0",
52
+ "fastapi>=0.124.0",
53
+ "uvicorn>=0.38.0",
54
+ "python-dotenv>=1.2.0",
55
+ "pydantic>=2.12.0",
56
+ "rich>=14.0.0",
57
+ "pypdf>=6.0.0",
58
+ "python-docx>=1.0.0",
59
+ "markdown>=3.0.0",
60
+ "strip-tags>=0.5.0",
61
+ ]
62
+
63
+ [project.optional-dependencies]
64
+ dev = [
65
+ "pytest>=9.0.0",
66
+ "pytest-mock>=3.15.0",
67
+ "pytest-anyio>=0.0.0",
68
+ "black>=25.0.0",
69
+ "isort>=7.0.0",
70
+ "pre-commit>=3.5.0",
71
+ ]
72
+
73
+ [project.scripts]
74
+ ctxvault = "ctxvault.cli.app:main"
75
+
76
+ [project.urls]
77
+ Homepage = "https://github.com/Filippo-Venturini/ctx-vault"
78
+ Repository = "https://github.com/Filippo-Venturini/ctx-vault"
79
+ Issues = "https://github.com/Filippo-Venturini/ctx-vault/issues"
80
+
81
+ [tool.setuptools]
82
+ packages = ["ctxvault"]
83
+ package-dir = {"" = "src"}
84
+
85
+ [tool.pytest.ini_options]
86
+ testpaths = ["tests"]
87
+ pythonpath = ["src"]
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
File without changes
File without changes
File without changes
File without changes
@@ -0,0 +1,101 @@
1
+ from pathlib import Path
2
+ import typer
3
+ from ctxvault.core import vault
4
+ from ctxvault.core.exceptions import VaultAlreadyExistsError
5
+
6
+ app = typer.Typer()
7
+
8
@app.command()
def init(path: str = "."):
    """Initialize a Context Vault at the given path."""
    # NOTE(review): the README documents `ctxvault init <path>` (positional),
    # while a parameter with a default is normally exposed by Typer as an
    # option (`--path`). Confirm which invocation is intended.
    typer.echo(f"Initializing Context Vault at: {path} ...")

    # Only the vault call can raise VaultAlreadyExistsError, so it is the only
    # statement kept inside the try block.
    try:
        vault_path, config_path = vault.init_vault(path=path)
    except VaultAlreadyExistsError as e:
        # An already-initialized vault is reported as a warning and the
        # command terminates with exit status 1.
        typer.secho("Warning: Context Vault already initialized in this path!", fg=typer.colors.YELLOW, bold=True)
        typer.echo(f"Existing vault path: {e.existing_path}")
        raise typer.Exit(1)

    typer.secho("Context Vault initialized successfully!", fg=typer.colors.GREEN, bold=True)
    typer.echo(f"Context Vault path: {vault_path}")
    typer.echo(f"Config file path: {config_path}")
20
+
21
@app.command()
def index(path: str = "."):
    # Delegates to vault.index_files, then prints one line per file followed
    # by the two totals (indexed first, skipped second).
    indexed_files, skipped_files = vault.index_files(base_path=Path(path))

    report = (
        ("Indexed", indexed_files, typer.colors.GREEN),
        ("Skipped", skipped_files, typer.colors.YELLOW),
    )

    # Per-file lines.
    for label, files, color in report:
        for entry in files:
            typer.secho(f"{label}: {entry}", fg=color)

    # Summary counts, printed in bold.
    for label, files, color in report:
        typer.secho(f"{label}: {len(files)}", fg=color, bold=True)
33
+
34
+ @app.command()
35
+ def query(text: str = ""):
+ # Runs vault.query on `text` and prints the returned chunks.
+ # Prints "No results found." and returns early when result.results is empty.
+ # Otherwise prints a header with the chunk count, then for each chunk its
+ # 1-based position, score (3 decimals), source and chunk index, followed by a
+ # single-line preview of the text truncated to 200 characters plus "...".
+ # NOTE(review): indentation is not visible in this rendering, so it cannot be
+ # told from here whether the final rule line is printed once after the loop
+ # or after every chunk; confirm against the original file.
36
+ result = vault.query(text=text)
37
+ if not result.results:
38
+ typer.secho("No results found.", fg=typer.colors.YELLOW)
39
+ return
40
+
41
+ typer.secho(f"\n Found {len(result.results)} chunks", fg=typer.colors.GREEN, bold=True)
42
+ typer.echo("─" * 80)
43
+
44
+ for idx, chunk in enumerate(result.results, 1):
45
+ typer.secho(f"\n[{idx}] ", fg=typer.colors.CYAN, bold=True, nl=False)
46
+ typer.secho(f"score: {chunk.score:.3f}", fg=typer.colors.MAGENTA)
47
+ typer.secho(f" ▸ {chunk.source} ", fg=typer.colors.BLUE, nl=False)
48
+ typer.echo(f"(chunk {chunk.chunk_index})")
49
+
50
+ preview = chunk.text.strip().replace("\n", " ")
51
+ if len(preview) > 200:
52
+ preview = preview[:200] + "..."
53
+ typer.echo(f" {preview}")
54
+
55
+ typer.echo("\n" + "─" * 80)
56
+
57
@app.command()
def delete(path: str = "."):
    # Delegates to vault.delete_files, then prints one line per file followed
    # by the two totals (deleted first, skipped second).
    deleted_files, skipped_files = vault.delete_files(base_path=Path(path))

    report = (
        ("Deleted", deleted_files, typer.colors.RED),
        ("Skipped", skipped_files, typer.colors.YELLOW),
    )

    # Per-file lines.
    for label, files, color in report:
        for entry in files:
            typer.secho(f"{label}: {entry}", fg=color)

    # Summary counts, printed in bold.
    for label, files, color in report:
        typer.secho(f"{label}: {len(files)}", fg=color, bold=True)
69
+
70
@app.command()
def reindex(path: str = "."):
    # Delegates to vault.reindex_files, then prints one line per file followed
    # by the two totals (reindexed first, skipped second).
    reindexed_files, skipped_files = vault.reindex_files(base_path=Path(path))

    report = (
        ("Reindexed", reindexed_files, typer.colors.GREEN),
        ("Skipped", skipped_files, typer.colors.YELLOW),
    )

    # Per-file lines.
    for label, files, color in report:
        for entry in files:
            typer.secho(f"{label}: {entry}", fg=color)

    # Summary counts, printed in bold.
    for label, files, color in report:
        typer.secho(f"{label}: {len(files)}", fg=color, bold=True)
82
+
83
@app.command()
def sync():
    # Only prints a status line; no synchronization logic is present in this
    # command body.
    typer.echo("Synchronizing vault")
86
+
87
@app.command()
def list():
    """List the documents currently indexed in the vault."""
    # NOTE: this function shadows the builtin `list` for the rest of the
    # module. The name is kept because Typer derives the command name
    # (`ctxvault list`) from it.
    documents = vault.list_documents()

    typer.secho(f"\nFound {len(documents)} documents\n", fg=typer.colors.GREEN, bold=True)

    # Number the entries from 1 for display.
    for position, document in enumerate(documents, start=1):
        typer.echo(f"{position}. {document.source} ({document.chunks_count} chunks)")
95
+
96
+
97
def main():
    """Run the Typer application (target of the `ctxvault` console script)."""
    app()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
File without changes
@@ -0,0 +1,13 @@
1
+ from sentence_transformers import SentenceTransformer
2
+
3
+ MODEL: SentenceTransformer = None
4
+
5
def get_model():
    """Return the shared SentenceTransformer, creating it on first use.

    The instance is cached in the module-level ``MODEL`` so the
    "all-MiniLM-L6-v2" model is constructed at most once per process.
    """
    global MODEL
    if MODEL is not None:
        return MODEL
    MODEL = SentenceTransformer("all-MiniLM-L6-v2")
    return MODEL
10
+
11
def embed_list(chunks: list[str])-> list[list[float]]:
    """Encode every text in ``chunks`` with the shared model.

    The value is returned exactly as the model's ``encode`` call produces it.
    NOTE(review): confirm that this matches the annotated
    ``list[list[float]]`` return type.
    """
    return get_model().encode(sentences=chunks)
@@ -0,0 +1,21 @@
1
class UnsupportedFileTypeError(Exception):
    """Signals that the extractor does not support the given file type."""
4
+
5
class ExtractionError(Exception):
    """Signals a text-extraction failure that is not caused by the file type."""
8
+
9
class VaultAlreadyExistsError(Exception):
    """Signals that a Context Vault is already initialized at the target path.

    Attributes:
        existing_path: the path passed to the constructor, i.e. where the
            existing vault was found.
    """

    def __init__(self, existing_path: str):
        message = f"Vault already initialized at {existing_path}"
        super().__init__(message)
        self.existing_path = existing_path
14
+
15
class VaultNotInitializedError(Exception):
    """Signals that no Context Vault has been initialized at the given path."""
18
+
19
class FileOutsideVault(Exception):
    """Signals an attempt to index a file located outside the Context Vault."""
@@ -0,0 +1,26 @@
1
+ from ctxvault.utils.text_extraction import extract_text
2
+ from ctxvault.core.identifiers import get_doc_id
3
+ from ctxvault.utils.chuncking import chunking
4
+ from ctxvault.core.embedding import embed_list
5
+ from ctxvault.storage.chroma_store import add_document, delete_document
6
+ from ctxvault.utils.metadata_builder import build_chunks_metadatas
7
+
8
def index_file(file_path: str, chunk_size: int = 50) -> None:
    """Extract, chunk, embed and store the document at ``file_path``.

    Args:
        file_path: path of the file to index; also stored as the chunks'
            ``source`` metadata.
        chunk_size: forwarded to ``chunking``; its unit is whatever that
            helper defines. Defaults to 50, the value previously hard-coded.

    Returns:
        None. The function has no return statement, so the annotation now
        says ``None`` instead of ``dict``.
    """
    text, file_type = extract_text(path=file_path)
    doc_id = get_doc_id(path=file_path)

    chunks = chunking(text, chunk_size=chunk_size)
    embeddings = embed_list(chunks=chunks)

    # One id and one metadata entry are requested per chunk.
    chunk_ids, metadatas = build_chunks_metadatas(
        doc_id=doc_id,
        chunks_size=len(chunks),
        source=file_path,
        filetype=file_type,
    )

    add_document(ids=chunk_ids, embeddings=embeddings, metadatas=metadatas, chunks=chunks)
19
+
20
def delete_file(file_path: str)-> None:
    """Delete from the store the document whose id is derived from ``file_path``."""
    delete_document(doc_id=get_doc_id(path=file_path))
23
+
24
def reindex_file(file_path: str)->None:
    """Re-index ``file_path`` by deleting its stored entry and indexing it again."""
    # Order matters: the old entry is removed before the file is indexed again.
    for step in (delete_file, index_file):
        step(file_path=file_path)
@@ -0,0 +1,38 @@
1
+ from ctxvault.core.embedding import embed_list
2
+ from ctxvault.models.documents import DocumentInfo
3
+ from ctxvault.storage import chroma_store
4
+
5
def build_documents_from_metadatas(metadatas)-> list[DocumentInfo]:
    """Collapse per-chunk metadata rows into one ``DocumentInfo`` per document.

    Rows are grouped by their "doc_id" key. The "source" and "filetype" of a
    document are taken from the first row seen for that id, and
    ``chunks_count`` is the number of rows sharing the id. Documents are
    returned in order of first appearance.
    """
    first_seen = {}
    chunk_totals = {}

    for meta in metadatas:
        key = meta["doc_id"]
        if key in chunk_totals:
            # Later rows of a known document only bump its chunk counter.
            chunk_totals[key] += 1
            continue
        first_seen[key] = (meta["source"], meta["filetype"])
        chunk_totals[key] = 1

    documents = []
    for key, (origin, kind) in first_seen.items():
        documents.append(
            DocumentInfo(
                doc_id=key,
                source=origin,
                filetype=kind,
                chunks_count=chunk_totals[key],
            )
        )
    return documents
30
+
31
def query(query_txt: str)-> dict:
    """Embed ``query_txt`` and return the store's query result for it."""
    # The text is embedded as a one-element batch before being handed over.
    return chroma_store.query(query_embedding=embed_list(chunks=[query_txt]))
34
+
35
def list_documents()-> list[DocumentInfo]:
    """Return one ``DocumentInfo`` per document found in the store's metadata."""
    return build_documents_from_metadatas(metadatas=chroma_store.get_all_metadatas())
38
+
File without changes
File without changes
@@ -0,0 +1,43 @@
1
+ from chromadb import PersistentClient
2
+ from pathlib import Path
3
+ from ctxvault.models.documents import DocumentInfo
4
+ from ctxvault.utils.config import get_db_path
5
+
6
+ _chroma_client = None
7
+ _collection = None
8
+
9
def get_collection():
    """Return the "ctxvault" collection, opening the client on first use.

    The client and the collection are cached in the module-level
    ``_chroma_client`` and ``_collection`` globals, so the persistent client
    is created only while no collection has been cached yet.
    """
    global _chroma_client, _collection
    if _collection is not None:
        return _collection
    db_location = get_db_path()
    _chroma_client = PersistentClient(path=db_location)
    _collection = _chroma_client.get_or_create_collection("ctxvault")
    return _collection
16
+
17
def add_document(ids: list[str], embeddings: list[list[float]], metadatas: list[dict], chunks: list[str]):
    """Add a document's chunks to the collection.

    ``chunks`` is passed to the collection as its ``documents`` argument; the
    other arguments are forwarded under their own names.
    """
    payload = {
        "ids": ids,
        "embeddings": embeddings,
        "metadatas": metadatas,
        "documents": chunks,
    }
    get_collection().add(**payload)
25
+
26
def query(query_embedding: list[float], n_results: int = 5)-> dict:
    """Query the collection and return its raw result.

    ``query_embedding`` is forwarded as the collection's ``query_embeddings``
    argument, and ``n_results`` (default 5) is forwarded unchanged.
    NOTE(review): the annotation says ``list[float]``, but the argument name
    it is forwarded to is plural; confirm the expected shape.
    """
    return get_collection().query(query_embeddings=query_embedding, n_results=n_results)
33
+
34
def delete_document(doc_id: str):
    """Delete the collection entries whose "doc_id" metadata equals ``doc_id``."""
    doc_filter = {"doc_id": doc_id}
    get_collection().delete(where=doc_filter)
39
+
40
def get_all_metadatas():
    """Return the "metadatas" field of an unfiltered ``get`` on the collection."""
    return get_collection().get(include=["metadatas"])["metadatas"]
File without changes