ctxvault 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ctxvault-0.1.0/LICENSE +21 -0
- ctxvault-0.1.0/PKG-INFO +186 -0
- ctxvault-0.1.0/README.md +137 -0
- ctxvault-0.1.0/pyproject.toml +87 -0
- ctxvault-0.1.0/setup.cfg +4 -0
- ctxvault-0.1.0/src/__init__.py +0 -0
- ctxvault-0.1.0/src/ctxvault/__init__.py +0 -0
- ctxvault-0.1.0/src/ctxvault/api/__init__.py +0 -0
- ctxvault-0.1.0/src/ctxvault/cli/__init__.py +0 -0
- ctxvault-0.1.0/src/ctxvault/cli/app.py +101 -0
- ctxvault-0.1.0/src/ctxvault/core/__init__.py +0 -0
- ctxvault-0.1.0/src/ctxvault/core/embedding.py +13 -0
- ctxvault-0.1.0/src/ctxvault/core/exceptions.py +21 -0
- ctxvault-0.1.0/src/ctxvault/core/indexer.py +26 -0
- ctxvault-0.1.0/src/ctxvault/core/querying.py +38 -0
- ctxvault-0.1.0/src/ctxvault/models/__init__.py +0 -0
- ctxvault-0.1.0/src/ctxvault/storage/__init__.py +0 -0
- ctxvault-0.1.0/src/ctxvault/storage/chroma_store.py +43 -0
- ctxvault-0.1.0/src/ctxvault/utils/__init__.py +0 -0
- ctxvault-0.1.0/src/ctxvault/utils/chuncking.py +8 -0
- ctxvault-0.1.0/src/ctxvault/utils/metadata_builder.py +19 -0
- ctxvault-0.1.0/src/ctxvault/utils/text_extraction.py +58 -0
- ctxvault-0.1.0/src/ctxvault.egg-info/PKG-INFO +186 -0
- ctxvault-0.1.0/src/ctxvault.egg-info/SOURCES.txt +30 -0
- ctxvault-0.1.0/src/ctxvault.egg-info/dependency_links.txt +1 -0
- ctxvault-0.1.0/src/ctxvault.egg-info/entry_points.txt +2 -0
- ctxvault-0.1.0/src/ctxvault.egg-info/requires.txt +20 -0
- ctxvault-0.1.0/src/ctxvault.egg-info/top_level.txt +1 -0
- ctxvault-0.1.0/src/main.py +19 -0
- ctxvault-0.1.0/tests/test_api.py +228 -0
- ctxvault-0.1.0/tests/test_cli.py +42 -0
- ctxvault-0.1.0/tests/test_core.py +37 -0
ctxvault-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Filippo Venturini
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
ctxvault-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ctxvault
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: ctxvault is a local-first knowledge vault that indexes your documents, generates embeddings, and enables fast semantic search via CLI or API. Designed for personal knowledge bases, RAG pipelines, and AI agents.
|
|
5
|
+
Author-email: Filippo Venturini <filippoventurini00@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Filippo-Venturini/ctx-vault
|
|
8
|
+
Project-URL: Repository, https://github.com/Filippo-Venturini/ctx-vault
|
|
9
|
+
Project-URL: Issues, https://github.com/Filippo-Venturini/ctx-vault/issues
|
|
10
|
+
Keywords: rag,retrieval-augmented-generation,semantic-search,embeddings,vector-database,chroma,llm,ai,knowledge-base,document-search,local-ai,developer-tools,cli,fastapi
|
|
11
|
+
Classifier: Development Status :: 3 - Alpha
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: Intended Audience :: Science/Research
|
|
14
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
21
|
+
Classifier: Topic :: Database
|
|
22
|
+
Classifier: Topic :: Text Processing :: Indexing
|
|
23
|
+
Classifier: Topic :: Utilities
|
|
24
|
+
Classifier: Environment :: Console
|
|
25
|
+
Classifier: Framework :: FastAPI
|
|
26
|
+
Requires-Python: >=3.9
|
|
27
|
+
Description-Content-Type: text/markdown
|
|
28
|
+
License-File: LICENSE
|
|
29
|
+
Requires-Dist: typer>=0.20.0
|
|
30
|
+
Requires-Dist: chromadb>=1.3.0
|
|
31
|
+
Requires-Dist: sentence-transformers>=5.0.0
|
|
32
|
+
Requires-Dist: fastapi>=0.124.0
|
|
33
|
+
Requires-Dist: uvicorn>=0.38.0
|
|
34
|
+
Requires-Dist: python-dotenv>=1.2.0
|
|
35
|
+
Requires-Dist: pydantic>=2.12.0
|
|
36
|
+
Requires-Dist: rich>=14.0.0
|
|
37
|
+
Requires-Dist: pypdf>=6.0.0
|
|
38
|
+
Requires-Dist: python-docx>=1.0.0
|
|
39
|
+
Requires-Dist: markdown>=3.0.0
|
|
40
|
+
Requires-Dist: strip-tags>=0.5.0
|
|
41
|
+
Provides-Extra: dev
|
|
42
|
+
Requires-Dist: pytest>=9.0.0; extra == "dev"
|
|
43
|
+
Requires-Dist: pytest-mock>=3.15.0; extra == "dev"
|
|
44
|
+
Requires-Dist: pytest-anyio>=0.0.0; extra == "dev"
|
|
45
|
+
Requires-Dist: black>=25.0.0; extra == "dev"
|
|
46
|
+
Requires-Dist: isort>=7.0.0; extra == "dev"
|
|
47
|
+
Requires-Dist: pre-commit>=3.5.0; extra == "dev"
|
|
48
|
+
Dynamic: license-file
|
|
49
|
+
|
|
50
|
+
# CtxVault
|
|
51
|
+
|
|
52
|
+
Local semantic search vault for LLMs.
|
|
53
|
+
|
|
54
|
+
CtxVault lets you index documents locally, generate embeddings, and query them with semantic search.
|
|
55
|
+
Designed as a lightweight RAG backend for agents, scripts, and LLM workflows.
|
|
56
|
+
|
|
57
|
+
## Why CtxVault
|
|
58
|
+
|
|
59
|
+
- 100% local (no cloud, no data sharing)
|
|
60
|
+
- simple CLI
|
|
61
|
+
- works offline
|
|
62
|
+
- persistent vector store (Chroma)
|
|
63
|
+
- file-based workflow
|
|
64
|
+
- agent/API ready (future)
|
|
65
|
+
|
|
66
|
+
Ideal for:
|
|
67
|
+
- personal knowledge bases
|
|
68
|
+
- private documents
|
|
69
|
+
- local RAG pipelines
|
|
70
|
+
- AI agents needing contextual memory
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Install
|
|
75
|
+
|
|
76
|
+
Python 3.10+
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
pip install -e .
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
---
|
|
83
|
+
|
|
84
|
+
## Quickstart
|
|
85
|
+
|
|
86
|
+
Initialize a vault:
|
|
87
|
+
|
|
88
|
+
```bash
|
|
89
|
+
ctxvault init ./my-vault
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
Index files or folders:
|
|
93
|
+
|
|
94
|
+
```bash
|
|
95
|
+
ctxvault index ./my-vault/docs
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
Query:
|
|
99
|
+
|
|
100
|
+
```bash
|
|
101
|
+
ctxvault query "what is project Orion?"
|
|
102
|
+
```
|
|
103
|
+
|
|
104
|
+
---
|
|
105
|
+
|
|
106
|
+
## CLI Commands
|
|
107
|
+
|
|
108
|
+
### init
|
|
109
|
+
|
|
110
|
+
Initialize a vault directory.
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
ctxvault init <path>
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
---
|
|
117
|
+
|
|
118
|
+
### index
|
|
119
|
+
|
|
120
|
+
Index a file or directory.
|
|
121
|
+
|
|
122
|
+
```bash
|
|
123
|
+
ctxvault index <path>
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
### query
|
|
129
|
+
|
|
130
|
+
Semantic search inside the vault.
|
|
131
|
+
|
|
132
|
+
```bash
|
|
133
|
+
ctxvault query "<text>"
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
### delete
|
|
139
|
+
|
|
140
|
+
Remove a document from the vault.
|
|
141
|
+
|
|
142
|
+
```bash
|
|
143
|
+
ctxvault delete <path>
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
---
|
|
147
|
+
|
|
148
|
+
### reindex
|
|
149
|
+
|
|
150
|
+
Reindex a document after changes.
|
|
151
|
+
|
|
152
|
+
```bash
|
|
153
|
+
ctxvault reindex <path>
|
|
154
|
+
```
|
|
155
|
+
|
|
156
|
+
---
|
|
157
|
+
|
|
158
|
+
### list
|
|
159
|
+
|
|
160
|
+
List indexed documents.
|
|
161
|
+
|
|
162
|
+
```bash
|
|
163
|
+
ctxvault list
|
|
164
|
+
```
|
|
165
|
+
|
|
166
|
+
---
|
|
167
|
+
|
|
168
|
+
## Privacy
|
|
169
|
+
|
|
170
|
+
All processing happens locally.
|
|
171
|
+
No data is sent to external services.
|
|
172
|
+
|
|
173
|
+
---
|
|
174
|
+
|
|
175
|
+
## Roadmap
|
|
176
|
+
|
|
177
|
+
* [x] CLI MVP
|
|
178
|
+
* [ ] FastAPI server
|
|
179
|
+
* [ ] sync and file watcher
|
|
180
|
+
* [ ] multi-vault support
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## License
|
|
185
|
+
|
|
186
|
+
MIT
|
ctxvault-0.1.0/README.md
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
1
|
+
# CtxVault
|
|
2
|
+
|
|
3
|
+
Local semantic search vault for LLMs.
|
|
4
|
+
|
|
5
|
+
CtxVault lets you index documents locally, generate embeddings, and query them with semantic search.
|
|
6
|
+
Designed as a lightweight RAG backend for agents, scripts, and LLM workflows.
|
|
7
|
+
|
|
8
|
+
## Why CtxVault
|
|
9
|
+
|
|
10
|
+
- 100% local (no cloud, no data sharing)
|
|
11
|
+
- simple CLI
|
|
12
|
+
- works offline
|
|
13
|
+
- persistent vector store (Chroma)
|
|
14
|
+
- file-based workflow
|
|
15
|
+
- agent/API ready (future)
|
|
16
|
+
|
|
17
|
+
Ideal for:
|
|
18
|
+
- personal knowledge bases
|
|
19
|
+
- private documents
|
|
20
|
+
- local RAG pipelines
|
|
21
|
+
- AI agents needing contextual memory
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Install
|
|
26
|
+
|
|
27
|
+
Python 3.10+
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
pip install -e .
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
---
|
|
34
|
+
|
|
35
|
+
## Quickstart
|
|
36
|
+
|
|
37
|
+
Initialize a vault:
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
ctxvault init ./my-vault
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Index files or folders:
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
ctxvault index ./my-vault/docs
|
|
47
|
+
```
|
|
48
|
+
|
|
49
|
+
Query:
|
|
50
|
+
|
|
51
|
+
```bash
|
|
52
|
+
ctxvault query "what is project Orion?"
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
---
|
|
56
|
+
|
|
57
|
+
## CLI Commands
|
|
58
|
+
|
|
59
|
+
### init
|
|
60
|
+
|
|
61
|
+
Initialize a vault directory.
|
|
62
|
+
|
|
63
|
+
```bash
|
|
64
|
+
ctxvault init <path>
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
### index
|
|
70
|
+
|
|
71
|
+
Index a file or directory.
|
|
72
|
+
|
|
73
|
+
```bash
|
|
74
|
+
ctxvault index <path>
|
|
75
|
+
```
|
|
76
|
+
|
|
77
|
+
---
|
|
78
|
+
|
|
79
|
+
### query
|
|
80
|
+
|
|
81
|
+
Semantic search inside the vault.
|
|
82
|
+
|
|
83
|
+
```bash
|
|
84
|
+
ctxvault query "<text>"
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
---
|
|
88
|
+
|
|
89
|
+
### delete
|
|
90
|
+
|
|
91
|
+
Remove a document from the vault.
|
|
92
|
+
|
|
93
|
+
```bash
|
|
94
|
+
ctxvault delete <path>
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
---
|
|
98
|
+
|
|
99
|
+
### reindex
|
|
100
|
+
|
|
101
|
+
Reindex a document after changes.
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
ctxvault reindex <path>
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
---
|
|
108
|
+
|
|
109
|
+
### list
|
|
110
|
+
|
|
111
|
+
List indexed documents.
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
ctxvault list
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Privacy
|
|
120
|
+
|
|
121
|
+
All processing happens locally.
|
|
122
|
+
No data is sent to external services.
|
|
123
|
+
|
|
124
|
+
---
|
|
125
|
+
|
|
126
|
+
## Roadmap
|
|
127
|
+
|
|
128
|
+
* [x] CLI MVP
|
|
129
|
+
* [ ] FastAPI server
|
|
130
|
+
* [ ] sync and file watcher
|
|
131
|
+
* [ ] multi-vault support
|
|
132
|
+
|
|
133
|
+
---
|
|
134
|
+
|
|
135
|
+
## License
|
|
136
|
+
|
|
137
|
+
MIT
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=61.0"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ctxvault"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "ctxvault is a local-first knowledge vault that indexes your documents, generates embeddings, and enables fast semantic search via CLI or API. Designed for personal knowledge bases, RAG pipelines, and AI agents."
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = {text = "MIT"}
|
|
12
|
+
authors = [
|
|
13
|
+
{name = "Filippo Venturini", email = "filippoventurini00@gmail.com"}
|
|
14
|
+
]
|
|
15
|
+
keywords = [
|
|
16
|
+
"rag",
|
|
17
|
+
"retrieval-augmented-generation",
|
|
18
|
+
"semantic-search",
|
|
19
|
+
"embeddings",
|
|
20
|
+
"vector-database",
|
|
21
|
+
"chroma",
|
|
22
|
+
"llm",
|
|
23
|
+
"ai",
|
|
24
|
+
"knowledge-base",
|
|
25
|
+
"document-search",
|
|
26
|
+
"local-ai",
|
|
27
|
+
"developer-tools",
|
|
28
|
+
"cli",
|
|
29
|
+
"fastapi"
|
|
30
|
+
]
|
|
31
|
+
classifiers = [
|
|
32
|
+
"Development Status :: 3 - Alpha",
|
|
33
|
+
"Intended Audience :: Developers",
|
|
34
|
+
"Intended Audience :: Science/Research",
|
|
35
|
+
"License :: OSI Approved :: MIT License",
|
|
36
|
+
"Operating System :: OS Independent",
|
|
37
|
+
"Programming Language :: Python :: 3",
|
|
38
|
+
"Programming Language :: Python :: 3.10",
|
|
39
|
+
"Programming Language :: Python :: 3.11",
|
|
40
|
+
"Programming Language :: Python :: 3.12",
|
|
41
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
42
|
+
"Topic :: Database",
|
|
43
|
+
"Topic :: Text Processing :: Indexing",
|
|
44
|
+
"Topic :: Utilities",
|
|
45
|
+
"Environment :: Console",
|
|
46
|
+
"Framework :: FastAPI"
|
|
47
|
+
]
|
|
48
|
+
dependencies = [
|
|
49
|
+
"typer>=0.20.0",
|
|
50
|
+
"chromadb>=1.3.0",
|
|
51
|
+
"sentence-transformers>=5.0.0",
|
|
52
|
+
"fastapi>=0.124.0",
|
|
53
|
+
"uvicorn>=0.38.0",
|
|
54
|
+
"python-dotenv>=1.2.0",
|
|
55
|
+
"pydantic>=2.12.0",
|
|
56
|
+
"rich>=14.0.0",
|
|
57
|
+
"pypdf>=6.0.0",
|
|
58
|
+
"python-docx>=1.0.0",
|
|
59
|
+
"markdown>=3.0.0",
|
|
60
|
+
"strip-tags>=0.5.0",
|
|
61
|
+
]
|
|
62
|
+
|
|
63
|
+
[project.optional-dependencies]
|
|
64
|
+
dev = [
|
|
65
|
+
"pytest>=9.0.0",
|
|
66
|
+
"pytest-mock>=3.15.0",
|
|
67
|
+
"pytest-anyio>=0.0.0",
|
|
68
|
+
"black>=25.0.0",
|
|
69
|
+
"isort>=7.0.0",
|
|
70
|
+
"pre-commit>=3.5.0",
|
|
71
|
+
]
|
|
72
|
+
|
|
73
|
+
[project.scripts]
|
|
74
|
+
ctxvault = "ctxvault.cli.app:main"
|
|
75
|
+
|
|
76
|
+
[project.urls]
|
|
77
|
+
Homepage = "https://github.com/Filippo-Venturini/ctx-vault"
|
|
78
|
+
Repository = "https://github.com/Filippo-Venturini/ctx-vault"
|
|
79
|
+
Issues = "https://github.com/Filippo-Venturini/ctx-vault/issues"
|
|
80
|
+
|
|
81
|
+
# src-layout: discover `ctxvault` and every subpackage (cli, core, storage, ...)
# under src/. An explicit `packages = ["ctxvault"]` list would ship only the
# top-level package, leaving the `ctxvault.cli.app:main` entry point unimportable.
[tool.setuptools]
package-dir = {"" = "src"}

[tool.setuptools.packages.find]
where = ["src"]
|
|
84
|
+
|
|
85
|
+
[tool.pytest.ini_options]
|
|
86
|
+
testpaths = ["tests"]
|
|
87
|
+
pythonpath = ["src"]
|
ctxvault-0.1.0/setup.cfg
ADDED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,101 @@
|
|
|
1
|
+
from pathlib import Path
|
|
2
|
+
import typer
|
|
3
|
+
from ctxvault.core import vault
|
|
4
|
+
from ctxvault.core.exceptions import VaultAlreadyExistsError
|
|
5
|
+
|
|
6
|
+
# Typer application object; the @app.command() functions in this module register on it.
app = typer.Typer()
|
|
7
|
+
|
|
8
|
+
@app.command()
def init(path: str = "."):
    # Create a new Context Vault at `path` and print where the vault and its
    # config file ended up.
    # NOTE(review): the README invokes this as `ctxvault init <path>`; a parameter
    # with a plain default is normally exposed by Typer as a `--path` option - confirm.
    typer.echo(f"Initializing Context Vault at: {path} ...")

    try:
        vault_path, config_path = vault.init_vault(path=path)
    except VaultAlreadyExistsError as already_there:
        # An existing vault is reported as a warning and ends the command with exit code 1.
        typer.secho("Warning: Context Vault already initialized in this path!", fg=typer.colors.YELLOW, bold=True)
        typer.echo(f"Existing vault path: {already_there.existing_path}")
        raise typer.Exit(1)

    typer.secho("Context Vault initialized succesfully!", fg=typer.colors.GREEN, bold=True)
    typer.echo(f"Context Vault path: {vault_path}")
    typer.echo(f"Config file path: {config_path}")
|
|
20
|
+
|
|
21
|
+
@app.command()
def index(path: str = "."):
    # Index everything under `path` via vault.index_files, then print one line per
    # file followed by the two totals.
    done, skipped = vault.index_files(base_path=Path(path))
    green = typer.colors.GREEN
    yellow = typer.colors.YELLOW

    # Per-file report: indexed files first, skipped files second.
    for entry in done:
        typer.secho(f"Indexed: {entry}", fg=green)
    for entry in skipped:
        typer.secho(f"Skipped: {entry}", fg=yellow)

    # Summary counts, in bold.
    typer.secho(f"Indexed: {len(done)}", fg=green, bold=True)
    typer.secho(f"Skipped: {len(skipped)}", fg=yellow, bold=True)
|
|
33
|
+
|
|
34
|
+
@app.command()
def query(text: str = ""):
    # Run a search for `text` through vault.query and pretty-print each returned chunk.
    # NOTE(review): the README invokes this as `ctxvault query "<text>"`; a parameter
    # with a plain default is normally exposed by Typer as a `--text` option - confirm.
    outcome = vault.query(text=text)
    if not outcome.results:
        typer.secho("No results found.", fg=typer.colors.YELLOW)
        return

    rule = "─" * 80
    typer.secho(f"\n Found {len(outcome.results)} chunks", fg=typer.colors.GREEN, bold=True)
    typer.echo(rule)

    for position, hit in enumerate(outcome.results, start=1):
        # Header: rank and score share one line, then source and chunk index share the next.
        typer.secho(f"\n[{position}] ", fg=typer.colors.CYAN, bold=True, nl=False)
        typer.secho(f"score: {hit.score:.3f}", fg=typer.colors.MAGENTA)
        typer.secho(f" ▸ {hit.source} ", fg=typer.colors.BLUE, nl=False)
        typer.echo(f"(chunk {hit.chunk_index})")

        # Flatten the chunk onto one line and cap the preview at 200 characters.
        flat = hit.text.strip().replace("\n", " ")
        preview = flat if len(flat) <= 200 else flat[:200] + "..."
        typer.echo(f" {preview}")

    typer.echo("\n" + rule)
|
|
56
|
+
|
|
57
|
+
@app.command()
def delete(path: str = "."):
    # Remove everything under `path` from the vault via vault.delete_files, then
    # print one line per file followed by the two totals.
    removed, skipped = vault.delete_files(base_path=Path(path))
    red = typer.colors.RED
    yellow = typer.colors.YELLOW

    # Per-file report: deleted files first, skipped files second.
    for entry in removed:
        typer.secho(f"Deleted: {entry}", fg=red)
    for entry in skipped:
        typer.secho(f"Skipped: {entry}", fg=yellow)

    # Summary counts, in bold.
    typer.secho(f"Deleted: {len(removed)}", fg=red, bold=True)
    typer.secho(f"Skipped: {len(skipped)}", fg=yellow, bold=True)
|
|
69
|
+
|
|
70
|
+
@app.command()
def reindex(path: str = "."):
    # Re-index everything under `path` via vault.reindex_files, then print one line
    # per file followed by the two totals.
    refreshed, skipped = vault.reindex_files(base_path=Path(path))
    green = typer.colors.GREEN
    yellow = typer.colors.YELLOW

    # Per-file report: reindexed files first, skipped files second.
    for entry in refreshed:
        typer.secho(f"Reindexed: {entry}", fg=green)
    for entry in skipped:
        typer.secho(f"Skipped: {entry}", fg=yellow)

    # Summary counts, in bold.
    typer.secho(f"Reindexed: {len(refreshed)}", fg=green, bold=True)
    typer.secho(f"Skipped: {len(skipped)}", fg=yellow, bold=True)
|
|
82
|
+
|
|
83
|
+
@app.command()
def sync():
    # Placeholder command: it only prints a message and performs no work.
    typer.echo("Synchronizing vault")
|
|
86
|
+
|
|
87
|
+
@app.command()
def list():
    # Print a numbered listing of the documents returned by vault.list_documents().
    # NOTE(review): this function name shadows the builtin `list` for the rest of
    # this module.
    docs = vault.list_documents()

    typer.secho(f"\nFound {len(docs)} documents\n", fg=typer.colors.GREEN, bold=True)

    # Numbering is 1-based.
    for number, doc in enumerate(docs, start=1):
        typer.echo(f"{number}. {doc.source} ({doc.chunks_count} chunks)")
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def main():
    """Run the Typer application (referenced as `ctxvault.cli.app:main` in pyproject.toml)."""
    app()


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
|
File without changes
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
from sentence_transformers import SentenceTransformer
|
|
2
|
+
|
|
3
|
+
# Module-level cache for the embedding model; stays None until get_model() is first called.
MODEL: SentenceTransformer = None


def get_model():
    """Return the shared SentenceTransformer, creating it on the first call.

    The "all-MiniLM-L6-v2" model is instantiated once and kept in the
    module-level ``MODEL`` variable so later calls reuse the same object.
    """
    global MODEL
    if MODEL is not None:
        return MODEL
    MODEL = SentenceTransformer("all-MiniLM-L6-v2")
    return MODEL
|
|
10
|
+
|
|
11
|
+
def embed_list(chunks: list[str]) -> list[list[float]]:
    """Encode each string in *chunks* with the shared model and return the embeddings."""
    encoder = get_model()
    # NOTE(review): the encode() result is returned as-is; confirm its type matches
    # the annotated list[list[float]].
    return encoder.encode(sentences=chunks)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
class UnsupportedFileTypeError(Exception):
    """Signal that the extractor does not support a file's type."""
|
|
4
|
+
|
|
5
|
+
class ExtractionError(Exception):
    """Signal that text extraction failed for a reason other than the file type."""
|
|
8
|
+
|
|
9
|
+
class VaultAlreadyExistsError(Exception):
    """Signal that a Context Vault has already been initialized at the given path.

    The offending location is kept on ``existing_path`` so callers can report it.
    """

    def __init__(self, existing_path: str):
        message = f"Vault already initialized at {existing_path}"
        super().__init__(message)
        self.existing_path = existing_path
|
|
14
|
+
|
|
15
|
+
class VaultNotInitializedError(Exception):
    """Signal that no Context Vault has been initialized at the given path."""
|
|
18
|
+
|
|
19
|
+
class FileOutsideVault(Exception):
    """Signal an attempt to index a file that lies outside the Context Vault."""
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
from ctxvault.utils.text_extraction import extract_text
|
|
2
|
+
from ctxvault.core.identifiers import get_doc_id
|
|
3
|
+
from ctxvault.utils.chuncking import chunking
|
|
4
|
+
from ctxvault.core.embedding import embed_list
|
|
5
|
+
from ctxvault.storage.chroma_store import add_document, delete_document
|
|
6
|
+
from ctxvault.utils.metadata_builder import build_chunks_metadatas
|
|
7
|
+
|
|
8
|
+
def index_file(file_path: str, chunk_size: int = 50) -> None:
    """Extract, chunk, embed and store a single file.

    Args:
        file_path: Path of the file to index; also recorded as each chunk's source.
        chunk_size: Value forwarded to ``chunking`` as ``chunk_size``. Defaults to
            50, the value that was previously hard-coded.

    Returns:
        None. The result is written to the store through ``add_document``.
    """
    text, file_type = extract_text(path=file_path)
    doc_id = get_doc_id(path=file_path)

    chunks = chunking(text, chunk_size=chunk_size)
    embeddings = embed_list(chunks=chunks)

    # One id and one metadata entry are requested per chunk.
    chunk_ids, metadatas = build_chunks_metadatas(
        doc_id=doc_id,
        chunks_size=len(chunks),
        source=file_path,
        filetype=file_type,
    )

    add_document(ids=chunk_ids, embeddings=embeddings, metadatas=metadatas, chunks=chunks)
|
|
19
|
+
|
|
20
|
+
def delete_file(file_path: str) -> None:
    """Delete the stored entries whose document id is derived from *file_path*."""
    delete_document(doc_id=get_doc_id(path=file_path))
|
|
23
|
+
|
|
24
|
+
def reindex_file(file_path: str) -> None:
    """Replace a file's stored entries by deleting them and indexing the file again."""
    # The delete runs first, so if indexing then raises the old entries are already gone.
    delete_file(file_path=file_path)
    index_file(file_path=file_path)
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
from ctxvault.core.embedding import embed_list
|
|
2
|
+
from ctxvault.models.documents import DocumentInfo
|
|
3
|
+
from ctxvault.storage import chroma_store
|
|
4
|
+
|
|
5
|
+
def build_documents_from_metadatas(metadatas) -> list[DocumentInfo]:
    """Collapse per-chunk metadata rows into one DocumentInfo per ``doc_id``.

    Each row is read through its "doc_id", "source" and "filetype" keys. The
    source and filetype come from the first row seen for a document, and
    ``chunks_count`` is the number of rows sharing that ``doc_id``. Documents
    are returned in the order their first row appears.
    """
    first_seen = {}
    chunk_totals = {}

    for meta in metadatas:
        key = meta["doc_id"]
        if key in chunk_totals:
            # Later rows of a known document only bump its chunk count.
            chunk_totals[key] += 1
            continue
        first_seen[key] = (meta["source"], meta["filetype"])
        chunk_totals[key] = 1

    documents = []
    for key, (src, ftype) in first_seen.items():
        documents.append(
            DocumentInfo(
                doc_id=key,
                source=src,
                filetype=ftype,
                chunks_count=chunk_totals[key],
            )
        )
    return documents
|
|
30
|
+
|
|
31
|
+
def query(query_txt: str) -> dict:
    """Embed *query_txt* and return whatever ``chroma_store.query`` yields for it."""
    # The text is wrapped in a one-element list because embed_list takes a list of strings.
    return chroma_store.query(query_embedding=embed_list(chunks=[query_txt]))
|
|
34
|
+
|
|
35
|
+
def list_documents() -> list[DocumentInfo]:
    """Return one DocumentInfo per document, built from every metadata row in the store."""
    return build_documents_from_metadatas(metadatas=chroma_store.get_all_metadatas())
|
|
38
|
+
|
|
File without changes
|
|
File without changes
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
from chromadb import PersistentClient
|
|
2
|
+
from pathlib import Path
|
|
3
|
+
from ctxvault.models.documents import DocumentInfo
|
|
4
|
+
from ctxvault.utils.config import get_db_path
|
|
5
|
+
|
|
6
|
+
# Module-level cache: both stay None until get_collection() is first called.
_chroma_client = None
_collection = None


def get_collection():
    """Return the shared "ctxvault" collection, opening the store on the first call.

    On first use a PersistentClient is created at the path given by
    ``get_db_path()`` and the collection is fetched or created; both objects
    are kept in module globals and reused afterwards.
    """
    global _chroma_client, _collection
    if _collection is not None:
        return _collection

    _chroma_client = PersistentClient(path=get_db_path())
    _collection = _chroma_client.get_or_create_collection("ctxvault")
    return _collection
|
|
16
|
+
|
|
17
|
+
def add_document(ids: list[str], embeddings: list[list[float]], metadatas: list[dict], chunks: list[str]):
    """Add a document's chunks to the shared collection.

    Args:
        ids: Identifiers passed to the collection as ``ids``.
        embeddings: Vectors passed as ``embeddings``.
        metadatas: Metadata dicts passed as ``metadatas``.
        chunks: Chunk texts, stored under the collection's ``documents`` field.
    """
    get_collection().add(ids=ids, embeddings=embeddings, metadatas=metadatas, documents=chunks)
|
|
25
|
+
|
|
26
|
+
def query(query_embedding: list[float], n_results: int = 5) -> dict:
    """Query the shared collection and return the collection's raw result.

    Args:
        query_embedding: Forwarded unchanged as the collection's ``query_embeddings``.
        n_results: Number of results requested from the collection (default 5).
    """
    return get_collection().query(query_embeddings=query_embedding, n_results=n_results)
|
|
33
|
+
|
|
34
|
+
def delete_document(doc_id: str):
    """Delete every entry in the shared collection whose ``doc_id`` metadata equals *doc_id*."""
    doc_filter = {"doc_id": doc_id}
    get_collection().delete(where=doc_filter)
|
|
39
|
+
|
|
40
|
+
def get_all_metadatas():
    """Return the "metadatas" field of a full fetch from the shared collection."""
    # Only metadata is requested from the collection via `include`.
    fetched = get_collection().get(include=["metadatas"])
    return fetched["metadatas"]
|
|
File without changes
|