versionhq 1.1.11.1__py3-none-any.whl → 1.1.11.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- versionhq/__init__.py +1 -1
- versionhq/agent/model.py +1 -1
- versionhq/knowledge/model.py +5 -1
- versionhq/knowledge/source.py +101 -117
- versionhq/knowledge/source_docling.py +45 -43
- versionhq/knowledge/storage.py +72 -55
- {versionhq-1.1.11.1.dist-info → versionhq-1.1.11.3.dist-info}/METADATA +3 -4
- {versionhq-1.1.11.1.dist-info → versionhq-1.1.11.3.dist-info}/RECORD +11 -11
- {versionhq-1.1.11.1.dist-info → versionhq-1.1.11.3.dist-info}/LICENSE +0 -0
- {versionhq-1.1.11.1.dist-info → versionhq-1.1.11.3.dist-info}/WHEEL +0 -0
- {versionhq-1.1.11.1.dist-info → versionhq-1.1.11.3.dist-info}/top_level.txt +0 -0
versionhq/__init__.py
CHANGED
versionhq/agent/model.py
CHANGED
@@ -469,7 +469,7 @@ class Agent(BaseModel):
             task_prompt += context

         if self._knowledge:
-            agent_knowledge = self._knowledge.query(query=[task_prompt,])
+            agent_knowledge = self._knowledge.query(query=[task_prompt,], limit=5)
             if agent_knowledge:
                 agent_knowledge_context = extract_knowledge_context(knowledge_snippets=agent_knowledge)
                 if agent_knowledge_context:
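Note: the only behavioral change in this hunk is that knowledge retrieval is now capped at five snippets per task prompt instead of relying on the storage default. A minimal usage sketch, assuming versionhq is installed and `knowledge` is an already-configured Knowledge instance (the query string is illustrative):

    # Hedged sketch: cap the number of retrieved knowledge snippets at 5.
    # `knowledge` is an assumed, pre-built versionhq Knowledge instance.
    snippets = knowledge.query(query=["What changed in the latest release?"], limit=5)
    for s in snippets:
        print(s)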
versionhq/knowledge/model.py
CHANGED
@@ -26,13 +26,17 @@ class Knowledge(BaseModel):
         **data,
     ):
         super().__init__(**data)
+
+
         if storage:
             self.storage = storage
         else:
             self.storage = KnowledgeStorage(embedder_config=embedder_config, collection_name=collection_name)

-        self.
+        self.storage._set_embedding_function(embedder_config=embedder_config)
         self.storage.initialize_knowledge_storage()
+
+        self.sources = sources
         for source in sources:
             source.storage = self.storage
             source.add()
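Note: the constructor now wires the embedding function onto the storage before `initialize_knowledge_storage()` and keeps a reference to `sources`. A hedged construction sketch based on this diff (import paths and keyword names are read off the diff, not verified API docs):

    from versionhq.knowledge.model import Knowledge
    from versionhq.knowledge.source import StringKnowledgeSource

    # Each source receives self.storage and has .add() called during construction.
    src = StringKnowledgeSource(content="versionhq orchestrates model-agnostic agents.")
    kn = Knowledge(collection_name="demo", sources=[src], embedder_config=None)  # None -> default OpenAI embedder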
versionhq/knowledge/source.py
CHANGED
@@ -8,6 +8,7 @@ import numpy as np
 from pydantic import BaseModel, ConfigDict, Field, field_validator

 from versionhq.knowledge.storage import KnowledgeStorage
+from versionhq.storage.utils import fetch_db_storage_path
 from versionhq._utils.vars import KNOWLEDGE_DIRECTORY
 from versionhq._utils.logger import Logger

@@ -16,50 +17,66 @@ class BaseKnowledgeSource(BaseModel, ABC):
     """
     Abstract base class for knowledge sources: csv, json, excel, pdf, string, and docling.
     """
+    _logger: Logger = Logger(verbose=True)

-    chunk_size: int =
+    chunk_size: int = 3000
     chunk_overlap: int = 200
     chunks: List[str] = Field(default_factory=list)
     chunk_embeddings: List[np.ndarray] = Field(default_factory=list)

     model_config = ConfigDict(arbitrary_types_allowed=True)
     storage: Optional[KnowledgeStorage] = Field(default=None)
-    metadata: Dict[str, Any] = Field(default_factory=dict)
+    metadata: Dict[str, Any] = Field(default_factory=dict)
     collection_name: Optional[str] = Field(default=None)

+
     @abstractmethod
-    def validate_content(self) -> Any:
+    def validate_content(self, **kwargs) -> Any:
         """Load and preprocess content from the source."""
         pass

+
     @abstractmethod
     def add(self) -> None:
         """Process content, chunk it, compute embeddings, and save them."""
         pass

+
     def get_embeddings(self) -> List[np.ndarray]:
         """Return the list of embeddings for the chunks."""
         return self.chunk_embeddings

+
     def _chunk_text(self, text: str) -> List[str]:
         """
         Utility method to split text into chunks.
         """
+        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]

-        return [
-            text[i : i + self.chunk_size]
-            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
-        ]

-    def _save_documents(self):
+    def _save_documents(self) -> None:
         """
-        Save the documents to the storage.
+        Save the documents to the given (or newly created) storage on ChromaDB.
         This method should be called after the chunks and embeddings are generated.
         """
-        if self.
-
-
-
+        # if not self.chunks or self.chunk_embeddings:
+        #     self._logger.log(level="warning", message="Chunks or chunk embeddings are missing. Save docs after creating them.", color="yellow")
+        #     return
+
+        try:
+            if self.storage:
+                self.storage.save(documents=self.chunks, metadata=self.metadata)
+
+            else:
+                storage = KnowledgeStorage(collection_name=self.collection_name) if self.collection_name else KnowledgeStorage()
+                storage.initialize_knowledge_storage()
+                self.storage = storage
+                self.storage.save(documents=self.chunks, metadata=self.metadata)
+
+        except:
+            self._logger.log(level="error", message="No storage found or created to save the documents.", color="red")
+            return
+            # raise ValueError("No storage found to save documents.")
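Note: `chunk_size` is now pinned at 3000 characters with a 200-character overlap, so chunk windows advance by chunk_size - chunk_overlap = 2800 characters. A standalone sketch of the same slicing rule (plain Python, no versionhq dependency):

    # Mirrors BaseKnowledgeSource._chunk_text: windows of `size` chars, stepping by size - overlap.
    def chunk_text(text: str, size: int = 3000, overlap: int = 200) -> list[str]:
        return [text[i : i + size] for i in range(0, len(text), size - overlap)]

    print(chunk_text("abcdefghij", size=4, overlap=2))  # ['abcd', 'cdef', 'efgh', 'ghij', 'ij']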
@@ -74,37 +91,32 @@ class StringKnowledgeSource(BaseKnowledgeSource):
     def model_post_init(self, _):
         """Post-initialization method to validate content."""
         self.validate_content()
+        self._save_documents()
+

     def validate_content(self):
         """Validate string content."""
         if not isinstance(self.content, str):
             raise ValueError("StringKnowledgeSource only accepts string content")

+
     def add(self) -> None:
         """
         Add string content to the knowledge source, chunk it, compute embeddings, and save them.
         """
-        new_chunks = self._chunk_text(self.content)
+        new_chunks = self._chunk_text(text=self.content)
         self.chunks.extend(new_chunks)
         self._save_documents()


-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
-

 class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
     """Base class for knowledge sources that load content from files."""

-    _logger: Logger = Logger(verbose=True)
     file_paths: Optional[Path | List[Path] | str | List[str]] = Field(default_factory=list)
     content: Dict[Path, str] = Field(init=False, default_factory=dict)
     storage: Optional[KnowledgeStorage] = Field(default=None)
-
+    valid_file_paths: List[Path] = Field(default_factory=list, description="store a list of `Path` objects from self.file_paths")


     @field_validator("file_paths", mode="before")
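Note: `StringKnowledgeSource.model_post_init` now also calls `_save_documents()`, so the storage collection is touched at construction time, not only on `add()`. A hedged usage sketch (assumes a writable local ChromaDB path and, for the default embedder, an OPENAI_API_KEY in the environment):

    from versionhq.knowledge.source import StringKnowledgeSource

    src = StringKnowledgeSource(content="Any raw string to index.")  # post-init already saves
    src.add()            # chunk -> embed -> upsert into the knowledge collection
    print(len(src.chunks))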
@@ -117,70 +129,73 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
         return v


-    def
+    def validate_content(self, path: str | Path) -> List[Path]:
         """
-
+        Convert the given path to a Path object, and validate if the path exists and refers to a file.)
         """
-        self.safe_file_paths = self._process_file_paths()
-        self.validate_content()
-        self.content = self.load_content()
-

-
-    def load_content(self) -> Dict[Path, str]:
-        """
-        Load and preprocess file content. Should be overridden by subclasses.
-        Assume that the file path is relative to the project root in the knowledge directory.
-        """
-        pass
+        path_instance = Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path

+        if not path_instance.exists():
+            abs_path = fetch_db_storage_path()
+            path_instance = Path(abs_path + "/" + KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path

-
-
-
-        """
-        for path in self.safe_file_paths:
-            if not path.exists():
-                self._logger.log(
-                    "error",
-                    f"File not found: {path}. Try adding sources to the knowledge directory. If it's inside the knowledge directory, use the relative path.",
-                    color="red",
-                )
-                raise FileNotFoundError(f"File not found: {path}")
-            if not path.is_file():
-                self._logger.log("error", f"Path is not a file: {path}", color="red")
+        if not path_instance.exists():
+            self._logger.log(level="error", message="File path not found.", color="red")
+            raise ValueError()

+        elif not path_instance.is_file():
+            self._logger.log(level="error", message="Non-file object was given.", color="red")
+            raise ValueError()

-
-
-
-        else:
-            raise ValueError("No storage found to save documents.")
+        elif not path_instance.is_file():
+            self._logger.log(level="error", message="Non-file object was given.", color="red")
+            raise ValueError()

+        return path_instance

-    def convert_to_path(self, path: Path | str) -> Path:
-        """
-        Convert a path to a Path object.
-        """
-        return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path


     def _process_file_paths(self) -> List[Path]:
         """
         Convert file_path to a list of Path objects.
         """
+        if not self.file_paths:
+            self._logger.log(level="error", message="Missing file paths.", color="red")
+            raise ValueError("Missing file paths.")

-        if self.file_paths is None:
-            raise ValueError("Your source must be provided with a file_paths: []")

         path_list: List[Path | str] = [self.file_paths] if isinstance(self.file_paths, (str, Path)) else list(self.file_paths) if isinstance(self.file_paths, list) else []
+        valid_path_list = list()

         if not path_list:
-
-
-
+            self._logger.log(level="error", message="Missing valid file paths.", color="red")
+            raise ValueError("Your source must be provided with file_paths: []")
+
+        for item in path_list:
+            valid_path = self.validate_content(item)
+            if valid_path:
+                valid_path_list.append(valid_path)

-        return
+        return valid_path_list
+
+
+    def model_post_init(self, _) -> None:
+        """
+        Post-initialization method to load content.
+        """
+        self.valid_file_paths = self._process_file_paths()
+        self.content = self.load_content()
+        self._save_documents()
+
+
+    @abstractmethod
+    def load_content(self) -> Dict[Path, str]:
+        """
+        Load and preprocess file content. Should be overridden by subclasses.
+        Assume that the file path is relative to the project root in the knowledge directory.
+        """
+        pass


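Note: the new `validate_content()` replaces the old `convert_to_path()` plus bulk validation: a string path is first resolved relative to KNOWLEDGE_DIRECTORY, then retried under the absolute storage root from `fetch_db_storage_path()` before erroring. A standalone sketch of that fallback order (the function and parameter names below are hypothetical, for illustration only):

    from pathlib import Path

    def resolve(path: str, knowledge_dir: str, storage_root: str) -> Path:
        candidate = Path(knowledge_dir + "/" + path)                           # 1) relative to the knowledge dir
        if not candidate.exists():
            candidate = Path(storage_root + "/" + knowledge_dir + "/" + path)  # 2) absolute fallback
        if not candidate.exists() or not candidate.is_file():
            raise ValueError(f"Invalid knowledge file: {candidate}")
        return candidate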
@@ -193,10 +208,9 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
         """
         Load and preprocess text file content.
         """
-
         content = {}
-        for path in self.
-            path = self.
+        for path in self.valid_file_paths:
+            path = self.validate_content(path=path)
             with open(path, "r", encoding="utf-8") as f:
                 content[path] = f.read()
         return content

@@ -207,16 +221,10 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
         Add text file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         for _, text in self.content.items():
-            new_chunks = self._chunk_text(text)
+            new_chunks = self._chunk_text(text=text)
             self.chunks.extend(new_chunks)
-            self._save_documents()
-

-
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+        self._save_documents()


@@ -231,9 +239,9 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
         """
         pdfplumber = self._import_pdfplumber()
         content = {}
-        for path in self.
+        for path in self.valid_file_paths:
             text = ""
-            path = self.
+            path = self.validate_content(path)
             with pdfplumber.open(path) as pdf:
                 for page in pdf.pages:
                     page_text = page.extract_text()

@@ -259,17 +267,12 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
         Add PDF file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         for _, text in self.content.items():
-            new_chunks = self._chunk_text(text)
+            new_chunks = self._chunk_text(text=text)
             self.chunks.extend(new_chunks)
+
         self._save_documents()


-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-

 class CSVKnowledgeSource(BaseFileKnowledgeSource):

@@ -282,7 +285,7 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
         Load and preprocess CSV file content.
         """
         content_dict = {}
-        for file_path in self.
+        for file_path in self.valid_file_paths:
             with open(file_path, "r", encoding="utf-8") as csvfile:
                 reader = csv.reader(csvfile)
                 content = ""

@@ -295,22 +298,14 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):

     def add(self) -> None:
         """
-        Add CSV file content to the knowledge source, chunk it, compute embeddings,
-        and save the embeddings.
+        Add CSV file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         content_str = str(self.content) if isinstance(self.content, dict) else self.content
-        new_chunks = self._chunk_text(content_str)
+        new_chunks = self._chunk_text(text=content_str)
         self.chunks.extend(new_chunks)
         self._save_documents()


-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
-

 class JSONKnowledgeSource(BaseFileKnowledgeSource):
     """

@@ -322,13 +317,14 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
         Load and preprocess JSON file content.
         """
         content: Dict[Path, str] = {}
-        for path in self.
-            path = self.
+        for path in self.valid_file_paths:
+            path = self.validate_content(path)
             with open(path, "r", encoding="utf-8") as json_file:
                 data = json.load(json_file)
             content[path] = self._json_to_text(data)
         return content

+
     def _json_to_text(self, data: Any, level: int = 0) -> str:
         """
         Recursively convert JSON data to a text representation.

@@ -351,18 +347,11 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
         Add JSON file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         content_str = str(self.content) if isinstance(self.content, dict) else self.content
-        new_chunks = self._chunk_text(content_str)
+        new_chunks = self._chunk_text(text=content_str)
         self.chunks.extend(new_chunks)
         self._save_documents()


-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
-

 class ExcelKnowledgeSource(BaseFileKnowledgeSource):
     """

@@ -376,13 +365,14 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):

         pd = self._import_dependencies()
         content_dict = {}
-        for file_path in self.
-            file_path = self.
+        for file_path in self.valid_file_paths:
+            file_path = self.validate_content(file_path)
             df = pd.read_excel(file_path)
             content = df.to_csv(index=False)
             content_dict[file_path] = content
         return content_dict

+
     def _import_dependencies(self):
         """
         Dynamically import dependencies.

@@ -396,18 +386,12 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
             f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
         )

+
     def add(self) -> None:
         """
         Add Excel file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         content_str = "\n".join(str(value) for value in self.content.values()) if isinstance(self.content, dict) else str(self.content)
-        new_chunks = self._chunk_text(content_str)
+        new_chunks = self._chunk_text(text=content_str)
         self.chunks.extend(new_chunks)
         self._save_documents()
-
-
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
versionhq/knowledge/source_docling.py
CHANGED
@@ -12,11 +12,11 @@ try:
 except ImportError:
     DOCLING_AVAILABLE = False

-from pydantic import Field
+from pydantic import Field, InstanceOf

 from versionhq.knowledge.source import BaseKnowledgeSource
+from versionhq.storage.utils import fetch_db_storage_path
 from versionhq._utils.vars import KNOWLEDGE_DIRECTORY
-from versionhq._utils.logger import Logger


 class DoclingSource(BaseKnowledgeSource):

@@ -31,11 +31,10 @@ class DoclingSource(BaseKnowledgeSource):

         super().__init__(*args, **kwargs)

-
+
     file_paths: List[Path | str] = Field(default_factory=list)
-
-
-    content: List["DoclingDocument"] = Field(default_factory=list)
+    valid_file_paths: List[Path | str] = Field(default_factory=list)
+    content: List[InstanceOf[DoclingDocument]] = Field(default_factory=list)
     document_converter: "DocumentConverter" = Field(
         default_factory=lambda: DocumentConverter(
             allowed_formats=[

@@ -51,44 +50,46 @@ class DoclingSource(BaseKnowledgeSource):
             )
         )

-
-
-
+
+    def _convert_source_to_docling_documents(self) -> List[InstanceOf[DoclingDocument]]:
+        conv_results_iter = self.document_converter.convert_all(self.valid_file_paths)
+        return [result.document for result in conv_results_iter]

-    def _load_content(self) -> List[
+    def _load_content(self) -> List[InstanceOf[DoclingDocument]]:
         try:
             return self._convert_source_to_docling_documents()
         except ConversionError as e:
-            self._logger.log(
-                level="error",
-                message=f"Error loading content: {str(e)}. Supported formats: {self.document_converter.allowed_formats}",
-                color="red",
-            )
+            self._logger.log(level="error", message=f"Error loading content: {str(e)}. Supported formats: {self.document_converter.allowed_formats}", color="red")
             raise e
         except Exception as e:
-            self._logger.log(level="error", message=f"Error loading content: {e}", color="red")
+            self._logger.log(level="error", message=f"Error loading content: {str(e)}", color="red")
             raise e

-    def
-
-
-
-        new_chunks_iterable = self._chunk_doc(doc)
-        self.chunks.extend(list(new_chunks_iterable))
-        self._save_documents()
+    def _chunk_doc(self, doc: InstanceOf[DoclingDocument]) -> Iterator[str]:
+        chunker = HierarchicalChunker()
+        for chunk in chunker.chunk(doc):
+            yield chunk.text


-    def
-
-
+    def _validate_url(self, url: str) -> bool:
+        try:
+            result = urlparse(url)
+            return all(
+                [
+                    result.scheme in ("http", "https"),
+                    result.netloc,
+                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
+                ]
+            )
+        except Exception:
+            return False


-    def
-
-
-        yield chunk.text
+    def model_post_init(self, _) -> None:
+        self.valid_file_paths = self.validate_content()
+        self.content.extend(self._load_content())


     def validate_content(self) -> List[Path | str]:

@@ -108,22 +109,23 @@ class DoclingSource(BaseKnowledgeSource):
                 if local_path.exists():
                     processed_paths.append(local_path)
                 else:
-
+                    local_path = Path(fetch_db_storage_path() + "/" + KNOWLEDGE_DIRECTORY + "/" + path)  # try with abs. path
+                    if local_path.exists():
+                        processed_paths.append(local_path)
+                    else:
+                        raise FileNotFoundError(f"File not found: {local_path}")
             else:
                 if isinstance(path, Path):
                     processed_paths.append(path)
         return processed_paths


-    def
-
-
-
-
-
-
-
-
-        )
-    except Exception:
-        return False
+    def add(self) -> None:
+        if self.content is None:
+            self.model_post_init()
+
+        if self.content:
+            for doc in self.content:
+                new_chunks_iterable = self._chunk_doc(doc)
+                self.chunks.extend(list(new_chunks_iterable))
+            self._save_documents()
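Note: `DoclingSource._validate_url` accepts only http/https URLs whose host contains at least one dot. A standalone check mirroring the logic shown in the diff:

    from urllib.parse import urlparse

    def is_valid_url(url: str) -> bool:
        try:
            result = urlparse(url)
            return all([result.scheme in ("http", "https"), result.netloc, len(result.netloc.split(".")) >= 2])
        except Exception:
            return False

    print(is_valid_url("https://docs.example.com/guide.pdf"))  # True
    print(is_valid_url("ftp://example.com/file"))              # False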
versionhq/knowledge/storage.py
CHANGED
@@ -62,16 +62,56 @@ class BaseKnowledgeStorage(ABC):

 class KnowledgeStorage(BaseKnowledgeStorage):
     """
-
+    A class to store ChromaDB Storage vals that handles embeddings, ChromaClient, and Collection.
     """

     collection: Optional[chromadb.Collection] = None
     collection_name: Optional[str] = "knowledge"
     app: Optional[ClientAPI] = None
+    embedding_function: Optional[Any] = None  # store ChromaDB's EmbeddingFunction instance
+    embedder_config: Optional[Dict[str, Any]] = None  # store config dict for embedding_function

     def __init__(self, embedder_config: Optional[Dict[str, Any]] = None, collection_name: Optional[str] = None):
         self.collection_name = collection_name
-        self.
+        self.embedder_config = embedder_config
+        self.initialize_knowledge_storage()
+
+
+    def _create_default_embedding_function(self) -> Any:
+        from chromadb.utils.embedding_functions.openai_embedding_function import OpenAIEmbeddingFunction
+
+        return OpenAIEmbeddingFunction(
+            api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
+        )
+
+
+    def _set_embedding_function(self, embedder_config: Optional[Dict[str, Any]] = None) -> None:
+        """
+        Set the embedding configuration for the knowledge storage.
+        """
+        self.embedding_function = EmbeddingConfigurator().configure_embedder(embedder_config) if embedder_config else self._create_default_embedding_function()
+
+
+    def initialize_knowledge_storage(self):
+        """
+        Create ChromaClinent, set up the embedding function using `embedder_config`, and get or create Collection.
+        """
+        base_path = os.path.join(fetch_db_storage_path(), "knowledge")
+        chroma_client = chromadb.PersistentClient(path=base_path, settings=Settings(allow_reset=True))
+        self.app = chroma_client
+
+        self._set_embedding_function(self.embedder_config)
+
+        try:
+            collection_name = f"knowledge_{self.collection_name}" if self.collection_name else "knowledge"
+            if self.app:
+                self.collection = self.app.get_or_create_collection(name=collection_name, embedding_function=self.embedding_function)
+            else:
+                raise Exception("Vector Database Client not initialized")
+        except Exception:
+            raise Exception("Failed to create or get collection")
+

     def search(self, query: List[str], limit: int = 3, filter: Optional[dict] = None, score_threshold: float = 0.35) -> List[Dict[str, Any]]:
         with suppress_logging():

@@ -92,60 +132,44 @@ class KnowledgeStorage(BaseKnowledgeStorage):
             raise Exception("Collection not initialized")


-    def initialize_knowledge_storage(self):
-        base_path = os.path.join(fetch_db_storage_path(), "knowledge")
-        chroma_client = chromadb.PersistentClient(path=base_path, settings=Settings(allow_reset=True))
-        self.app = chroma_client
-
-        try:
-            collection_name = f"knowledge_{self.collection_name}" if self.collection_name else "knowledge"
-            if self.app:
-                self.collection = self.app.get_or_create_collection(name=collection_name, embedding_function=self.embedder_config)
-            else:
-                raise Exception("Vector Database Client not initialized")
-        except Exception:
-            raise Exception("Failed to create or get collection")
-
-
-    def reset(self):
-        base_path = os.path.join(fetch_db_storage_path(), KNOWLEDGE_DIRECTORY)
-        if not self.app:
-            self.app = chromadb.PersistentClient(path=base_path, settings=Settings(allow_reset=True))
-        self.app.reset()
-        shutil.rmtree(base_path)
-        self.app = None
-        self.collection = None
-
-
     def save(self, documents: List[str], metadata: Optional[Dict[str, Any] | List[Dict[str, Any]]] = None) -> None:
         if not self.collection:
-
+            self.initialize_knowledge_storage()
+            # raise Exception("Collection not initialized")

         try:
             unique_docs = {}
             for i, doc in enumerate(documents):
-
-
-
-
-
-
-
-
+                if doc:
+                    doc = doc
+                    if isinstance(doc, list):
+                        doc = doc[0]
+
+                    doc_id = hashlib.sha256(str(doc).encode("utf-8")).hexdigest()
+                    doc_metadata = None
+                    if metadata:
+                        if isinstance(metadata, list):
+                            doc_metadata = metadata[i]
+                        else:
+                            doc_metadata = metadata
+                    unique_docs[doc_id] = (doc, doc_metadata)

             filtered_docs = []
             filtered_metadata = []
             filtered_ids = []

             for doc_id, (doc, meta) in unique_docs.items():
-
-
-
+                if doc_id and doc:
+                    filtered_docs.append(doc)
+                    filtered_metadata.append(meta)
+                    filtered_ids.append(doc_id)

             final_metadata: Optional[OneOrMany[chromadb.Metadata]] = (
                 None if all(m is None for m in filtered_metadata) else filtered_metadata
             )
-
+
+            if filtered_docs:
+                self.collection.upsert(documents=filtered_docs, metadatas=final_metadata, ids=filtered_ids)

         except chromadb.errors.InvalidDimensionException as e:
             Logger(verbose=True).log(

@@ -160,18 +184,11 @@ class KnowledgeStorage(BaseKnowledgeStorage):
             raise


-    def
-
-
-
-
-
-
-
-
-    def _set_embedder_config(self, embedder_config: Optional[Dict[str, Any]] = None) -> None:
-        """
-        Set the embedding configuration for the knowledge storage.
-        """
-        self.embedder_config = EmbeddingConfigurator().configure_embedder(embedder_config) if embedder_config else self._create_default_embedding_function()
+    def reset(self):
+        base_path = os.path.join(fetch_db_storage_path(), KNOWLEDGE_DIRECTORY)
+        if not self.app:
+            self.app = chromadb.PersistentClient(path=base_path, settings=Settings(allow_reset=True))
+        self.app.reset()
+        shutil.rmtree(base_path)
+        self.app = None
+        self.collection = None
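Note: `save()` now deduplicates documents by the sha256 digest of their text before upserting, so re-saving identical chunks is idempotent. A standalone sketch of the dedup rule:

    import hashlib

    docs = ["alpha", "beta", "alpha"]
    unique = {hashlib.sha256(d.encode("utf-8")).hexdigest(): d for d in docs if d}
    print(len(unique))  # 2 - the repeated "alpha" collapses to one id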
{versionhq-1.1.11.1.dist-info → versionhq-1.1.11.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: versionhq
-Version: 1.1.11.1
+Version: 1.1.11.3
 Summary: LLM orchestration frameworks for model-agnostic AI agents that handle complex outbound workflows
 Author-email: Kuriko Iwai <kuriko@versi0n.io>
 License: MIT License

@@ -43,8 +43,8 @@ Requires-Dist: regex==2024.11.6
 Requires-Dist: requests>=2.32.3
 Requires-Dist: pydantic>=2.10.6
 Requires-Dist: werkzeug>=3.1.3
-Requires-Dist: typing
-Requires-Dist: json-repair
+Requires-Dist: typing
+Requires-Dist: json-repair
 Requires-Dist: litellm>=1.55.8
 Requires-Dist: openai>=1.57.0
 Requires-Dist: composio-openai>=0.6.9

@@ -57,7 +57,6 @@ Requires-Dist: langchain>=0.3.14
 Requires-Dist: langchain-openai>=0.2.14
 Requires-Dist: composio-langchain>=0.6.12
 Requires-Dist: chromadb>=0.6.3
-Requires-Dist: json-repair>=0.35.0
 Requires-Dist: wheel>=0.45.1
 Provides-Extra: docling
 Requires-Dist: docling>=2.17.0; extra == "docling"
{versionhq-1.1.11.1.dist-info → versionhq-1.1.11.3.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-versionhq/__init__.py,sha256=
+versionhq/__init__.py,sha256=6jDe1kpjOaxhIoTpJEfgVZcZGcwKg37J896LUSJUYGs,951
 versionhq/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 versionhq/_utils/i18n.py,sha256=TwA_PnYfDLA6VqlUDPuybdV9lgi3Frh_ASsb_X8jJo8,1483
 versionhq/_utils/logger.py,sha256=U-MpeGueA6YS8Ptfy0VnU_ePsZP-8Pvkvi0tZ4s_UMg,1438

@@ -7,7 +7,7 @@ versionhq/_utils/usage_metrics.py,sha256=hhq1OCW8Z4V93vwW2O2j528EyjOlF8wlTsX5IL-
 versionhq/_utils/vars.py,sha256=bZ5Dx_bFKlt3hi4-NNGXqdk7B23If_WaTIju2fiTyPQ,57
 versionhq/agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 versionhq/agent/default_agents.py,sha256=Sea3xDswxxMccer1vVDhp1E5etXW3ddf2n20JTMHgqs,503
-versionhq/agent/model.py,sha256=
+versionhq/agent/model.py,sha256=F_VkSQ6G6mJvDWrRBILZ6KjtlCpm0r_8bMN73sDKKGc,22921
 versionhq/agent/parser.py,sha256=riG0dkdQCxH7uJ0AbdVdg7WvL0BXhUgJht0VtQvxJBc,4082
 versionhq/agent/rpm_controller.py,sha256=7AKIEPbWBq_ESOZCaiKVOGjfSPHd2qwg6-wbBlhqC0g,2367
 versionhq/agent/TEMPLATES/Backstory.py,sha256=IAhGnnt6VUMe3wO6IzeyZPDNu7XE7Uiu3VEXUreOcKs,532

@@ -23,10 +23,10 @@ versionhq/clients/workflow/model.py,sha256=FNftenLLoha0bkivrjId32awLHAkBwIT8iNlj
 versionhq/knowledge/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 versionhq/knowledge/_utils.py,sha256=YWRF8U533cfZes_gZqUvdj-K24MD2ri1R0gjc_aPYyc,402
 versionhq/knowledge/embedding.py,sha256=KfHc__1THxb5jrg1EMrF-v944RDuIr2hE0l-MtM3Bp0,6826
-versionhq/knowledge/model.py,sha256=
-versionhq/knowledge/source.py,sha256=
-versionhq/knowledge/source_docling.py,sha256=
-versionhq/knowledge/storage.py,sha256=
+versionhq/knowledge/model.py,sha256=_liwQoS_VJlJgVSwAb7Y68SwbPuU0QBY_q0cA8x7dCo,1862
+versionhq/knowledge/source.py,sha256=yUwOds0zc8oPLvtV_hIE4P7k9BjQ9vc4MbbGorv_H6I,13292
+versionhq/knowledge/source_docling.py,sha256=A2XfVo5EzvmWMUGcBFTtABLPznDFtEIy7AOOQiAtB0E,4975
+versionhq/knowledge/storage.py,sha256=vg7NEi19b47QaxXQxx2BLag3hjUZAQnwPqUifzhWCvQ,7373
 versionhq/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 versionhq/llm/llm_vars.py,sha256=PO__b-h5e-6oQ-uoIgXx3lPSAUPUwXYfdVRW73fvX14,8761
 versionhq/llm/model.py,sha256=1uaBxT10GIlUl-BtE8Mfux-ZRcScp4HUIas_fD_cdWQ,14471

@@ -57,8 +57,8 @@ versionhq/tool/composio_tool_vars.py,sha256=FvBuEXsOQUYnN7RTFxT20kAkiEYkxWKkiVtg
 versionhq/tool/decorator.py,sha256=C4ZM7Xi2gwtEMaSeRo-geo_g_MAkY77WkSLkAuY0AyI,1205
 versionhq/tool/model.py,sha256=7ccEnje_8LuxLVeog6pL38nToArXQXk4KY7A9hfprDo,12239
 versionhq/tool/tool_handler.py,sha256=2m41K8qo5bGCCbwMFferEjT-XZ-mE9F0mDUOBkgivOI,1416
-versionhq-1.1.11.1.dist-info/LICENSE,sha256=
-versionhq-1.1.11.1.dist-info/METADATA,sha256=
-versionhq-1.1.11.1.dist-info/WHEEL,sha256=
-versionhq-1.1.11.1.dist-info/top_level.txt,sha256=
-versionhq-1.1.11.1.dist-info/RECORD,,
+versionhq-1.1.11.3.dist-info/LICENSE,sha256=7CCXuMrAjPVsUvZrsBq9DsxI2rLDUSYXR_qj4yO_ZII,1077
+versionhq-1.1.11.3.dist-info/METADATA,sha256=mjSq2raluRpiKGA7wIhK4RnByEGTf2AfoT1tx4YY5Hw,18251
+versionhq-1.1.11.3.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+versionhq-1.1.11.3.dist-info/top_level.txt,sha256=DClQwxDWqIUGeRJkA8vBlgeNsYZs4_nJWMonzFt5Wj0,10
+versionhq-1.1.11.3.dist-info/RECORD,,
{versionhq-1.1.11.1.dist-info → versionhq-1.1.11.3.dist-info}/LICENSE
File without changes
{versionhq-1.1.11.1.dist-info → versionhq-1.1.11.3.dist-info}/WHEEL
File without changes
{versionhq-1.1.11.1.dist-info → versionhq-1.1.11.3.dist-info}/top_level.txt
File without changes