versionhq 1.1.11.2__py3-none-any.whl → 1.1.11.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
versionhq/__init__.py CHANGED
@@ -18,7 +18,7 @@ from versionhq.tool.model import Tool
 from versionhq.tool.composio_tool import ComposioHandler
 
 
-__version__ = "1.1.11.2"
+__version__ = "1.1.11.4"
 __all__ = [
     "Agent",
     "Customer",
versionhq/agent/model.py CHANGED
@@ -469,7 +469,7 @@ class Agent(BaseModel):
             task_prompt += context
 
         if self._knowledge:
-            agent_knowledge = self._knowledge.query(query=[task_prompt,])
+            agent_knowledge = self._knowledge.query(query=[task_prompt,], limit=5)
             if agent_knowledge:
                 agent_knowledge_context = extract_knowledge_context(knowledge_snippets=agent_knowledge)
                 if agent_knowledge_context:
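The only behavioral change in this hunk is the new `limit=5` argument, which caps how many knowledge snippets are retrieved and folded into the task prompt. A minimal sketch of the capping behavior, using a stand-in `query` helper rather than the versionhq implementation:

    from typing import List

    def query(snippets: List[str], terms: List[str], limit: int = 3) -> List[str]:
        """Stand-in retriever: return at most `limit` snippets matching any term."""
        hits = [s for s in snippets if any(t.lower() in s.lower() for t in terms)]
        return hits[:limit]

    snippets = [f"note {i}: refund policy details" for i in range(10)]
    assert len(query(snippets, ["refund"], limit=5)) == 5  # capped at 5, not 10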
versionhq/knowledge/model.py CHANGED
@@ -26,13 +26,17 @@ class Knowledge(BaseModel):
         **data,
     ):
         super().__init__(**data)
+
+
         if storage:
             self.storage = storage
         else:
             self.storage = KnowledgeStorage(embedder_config=embedder_config, collection_name=collection_name)
 
-        self.sources = sources
+        self.storage._set_embedding_function(embedder_config=embedder_config)
         self.storage.initialize_knowledge_storage()
+
+        self.sources = sources
         for source in sources:
             source.storage = self.storage
             source.add()
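The constructor now binds the embedding function and initializes the Chroma-backed storage before any source is ingested, and `self.sources` is assigned only once storage is ready. Paraphrased as a standalone sketch (not the class itself):

    from versionhq.knowledge.storage import KnowledgeStorage

    def build_knowledge(sources, storage=None, embedder_config=None, collection_name=None):
        """Sketch of Knowledge.__init__'s new wiring order."""
        storage = storage or KnowledgeStorage(embedder_config=embedder_config, collection_name=collection_name)
        storage._set_embedding_function(embedder_config=embedder_config)  # bind the embedder first
        storage.initialize_knowledge_storage()                            # then open the collection
        for source in sources:                                            # ingest only after storage is ready
            source.storage = storage
            source.add()
        return storage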
versionhq/knowledge/source.py CHANGED
@@ -8,6 +8,7 @@ import numpy as np
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 
 from versionhq.knowledge.storage import KnowledgeStorage
+from versionhq.storage.utils import fetch_db_storage_path
 from versionhq._utils.vars import KNOWLEDGE_DIRECTORY
 from versionhq._utils.logger import Logger
 
@@ -16,50 +17,66 @@ class BaseKnowledgeSource(BaseModel, ABC):
     """
     Abstract base class for knowledge sources: csv, json, excel, pdf, string, and docling.
     """
+    _logger: Logger = Logger(verbose=True)
 
-    chunk_size: int = 4000
+    chunk_size: int = 3000
     chunk_overlap: int = 200
     chunks: List[str] = Field(default_factory=list)
     chunk_embeddings: List[np.ndarray] = Field(default_factory=list)
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
     storage: Optional[KnowledgeStorage] = Field(default=None)
-    metadata: Dict[str, Any] = Field(default_factory=dict) # Currently unused
+    metadata: Dict[str, Any] = Field(default_factory=dict)
     collection_name: Optional[str] = Field(default=None)
 
+
     @abstractmethod
-    def validate_content(self) -> Any:
+    def validate_content(self, **kwargs) -> Any:
         """Load and preprocess content from the source."""
         pass
 
+
     @abstractmethod
     def add(self) -> None:
         """Process content, chunk it, compute embeddings, and save them."""
         pass
 
+
     def get_embeddings(self) -> List[np.ndarray]:
         """Return the list of embeddings for the chunks."""
         return self.chunk_embeddings
 
+
     def _chunk_text(self, text: str) -> List[str]:
         """
         Utility method to split text into chunks.
         """
+        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
 
-        return [
-            text[i : i + self.chunk_size]
-            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
-        ]
 
-    def _save_documents(self):
+    def _save_documents(self) -> None:
         """
-        Save the documents to the storage.
+        Save the documents to the given (or newly created) storage on ChromaDB.
         This method should be called after the chunks and embeddings are generated.
         """
-        if self.storage:
-            self.storage.save(self.chunks)
-        else:
-            raise ValueError("No storage found to save documents.")
+        # if not self.chunks or self.chunk_embeddings:
+        #     self._logger.log(level="warning", message="Chunks or chunk embeddings are missing. Save docs after creating them.", color="yellow")
+        #     return
+
+        try:
+            if self.storage:
+                self.storage.save(documents=self.chunks, metadata=self.metadata)
+
+            else:
+                storage = KnowledgeStorage(collection_name=self.collection_name) if self.collection_name else KnowledgeStorage()
+                storage.initialize_knowledge_storage()
+                self.storage = storage
+                self.storage.save(documents=self.chunks, metadata=self.metadata)
+
+        except:
+            self._logger.log(level="error", message="No storage found or created to save the documents.", color="red")
+            return
+            # raise ValueError("No storage found to save documents.")
 
 
 
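Note that the default `chunk_size` drops from 4000 to 3000 while `chunk_overlap` stays at 200, so successive chunks now start every 2800 characters. A standalone illustration of the stride:

    def chunk_text(text: str, chunk_size: int = 3000, chunk_overlap: int = 200) -> list[str]:
        # Same sliding window as BaseKnowledgeSource._chunk_text: stride = size - overlap.
        return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size - chunk_overlap)]

    chunks = chunk_text("x" * 6000)
    print([len(c) for c in chunks])  # [3000, 3000, 400] -- windows start at 0, 2800, 5600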
@@ -74,37 +91,32 @@ class StringKnowledgeSource(BaseKnowledgeSource):
     def model_post_init(self, _):
         """Post-initialization method to validate content."""
         self.validate_content()
+        self._save_documents()
+
 
     def validate_content(self):
         """Validate string content."""
         if not isinstance(self.content, str):
             raise ValueError("StringKnowledgeSource only accepts string content")
 
+
     def add(self) -> None:
         """
         Add string content to the knowledge source, chunk it, compute embeddings, and save them.
         """
-        new_chunks = self._chunk_text(self.content)
+        new_chunks = self._chunk_text(text=self.content)
        self.chunks.extend(new_chunks)
         self._save_documents()
 
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
-
 
 class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
     """Base class for knowledge sources that load content from files."""
 
-    _logger: Logger = Logger(verbose=True)
     file_paths: Optional[Path | List[Path] | str | List[str]] = Field(default_factory=list)
     content: Dict[Path, str] = Field(init=False, default_factory=dict)
     storage: Optional[KnowledgeStorage] = Field(default=None)
-    safe_file_paths: List[Path] = Field(default_factory=list, description="store a list of `Path` objects from self.file_paths")
+    valid_file_paths: List[Path] = Field(default_factory=list, description="store a list of `Path` objects from self.file_paths")
 
 
     @field_validator("file_paths", mode="before")
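With this change a string source persists twice: once from `model_post_init` (before any chunks exist) and again from `add()` after chunking. A hypothetical usage sketch, assuming default storage settings and an `OPENAI_API_KEY` in the environment:

    from versionhq.knowledge.source import StringKnowledgeSource

    src = StringKnowledgeSource(content="The premium plan costs $100/month and includes priority support.")
    # model_post_init has already run validate_content() and _save_documents().
    src.add()                # chunks the string, then saves the chunks to ChromaDB
    print(len(src.chunks))   # 1 -- the content is far shorter than chunk_size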
@@ -117,70 +129,73 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
         return v
 
 
-    def model_post_init(self, _) -> None:
+    def validate_content(self, path: str | Path) -> List[Path]:
         """
-        Post-initialization method to load content.
+        Convert the given path to a Path object, and validate if the path exists and refers to a file.)
         """
-        self.safe_file_paths = self._process_file_paths()
-        self.validate_content()
-        self.content = self.load_content()
-
 
-    @abstractmethod
-    def load_content(self) -> Dict[Path, str]:
-        """
-        Load and preprocess file content. Should be overridden by subclasses.
-        Assume that the file path is relative to the project root in the knowledge directory.
-        """
-        pass
+        path_instance = Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
 
+        if not path_instance.exists():
+            abs_path = fetch_db_storage_path()
+            path_instance = Path(abs_path + "/" + KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
 
-    def validate_content(self):
-        """
-        Validate the given file paths.
-        """
-        for path in self.safe_file_paths:
-            if not path.exists():
-                self._logger.log(
-                    "error",
-                    f"File not found: {path}. Try adding sources to the knowledge directory. If it's inside the knowledge directory, use the relative path.",
-                    color="red",
-                )
-                raise FileNotFoundError(f"File not found: {path}")
-            if not path.is_file():
-                self._logger.log("error", f"Path is not a file: {path}", color="red")
+        if not path_instance.exists():
+            self._logger.log(level="error", message="File path not found.", color="red")
+            raise ValueError()
 
+        elif not path_instance.is_file():
+            self._logger.log(level="error", message="Non-file object was given.", color="red")
+            raise ValueError()
 
-    def _save_documents(self):
-        if self.storage:
-            self.storage.save(self.chunks)
-        else:
-            raise ValueError("No storage found to save documents.")
+        elif not path_instance.is_file():
+            self._logger.log(level="error", message="Non-file object was given.", color="red")
+            raise ValueError()
 
+        return path_instance
 
-    def convert_to_path(self, path: Path | str) -> Path:
-        """
-        Convert a path to a Path object.
-        """
-        return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
 
 
     def _process_file_paths(self) -> List[Path]:
         """
         Convert file_path to a list of Path objects.
         """
+        if not self.file_paths:
+            self._logger.log(level="error", message="Missing file paths.", color="red")
+            raise ValueError("Missing file paths.")
 
-        if self.file_paths is None:
-            raise ValueError("Your source must be provided with a file_paths: []")
 
         path_list: List[Path | str] = [self.file_paths] if isinstance(self.file_paths, (str, Path)) else list(self.file_paths) if isinstance(self.file_paths, list) else []
+        valid_path_list = list()
 
         if not path_list:
-            raise ValueError(
-                "file_path/file_paths must be a Path, str, or a list of these types"
-            )
+            self._logger.log(level="error", message="Missing valid file paths.", color="red")
+            raise ValueError("Your source must be provided with file_paths: []")
+
+        for item in path_list:
+            valid_path = self.validate_content(item)
+            if valid_path:
+                valid_path_list.append(valid_path)
 
-        return [self.convert_to_path(path) for path in path_list]
+        return valid_path_list
+
+
+    def model_post_init(self, _) -> None:
+        """
+        Post-initialization method to load content.
+        """
+        self.valid_file_paths = self._process_file_paths()
+        self.content = self.load_content()
+        self._save_documents()
+
+
+    @abstractmethod
+    def load_content(self) -> Dict[Path, str]:
+        """
+        Load and preprocess file content. Should be overridden by subclasses.
+        Assume that the file path is relative to the project root in the knowledge directory.
+        """
+        pass
 
 
 
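The reworked `validate_content` resolves a string path in two steps: relative to the knowledge directory first, then against the absolute DB storage root. A condensed sketch of that fallback, with a hard-coded stand-in for `fetch_db_storage_path()`:

    from pathlib import Path

    KNOWLEDGE_DIRECTORY = "knowledge"   # mirrors versionhq._utils.vars
    STORAGE_ROOT = "/tmp/versionhq"     # stand-in for fetch_db_storage_path()

    def resolve(path: str | Path) -> Path:
        candidate = Path(KNOWLEDGE_DIRECTORY) / path if isinstance(path, str) else path
        if not candidate.exists() and isinstance(path, str):
            candidate = Path(STORAGE_ROOT) / KNOWLEDGE_DIRECTORY / path  # absolute fallback
        if not candidate.exists() or not candidate.is_file():
            raise ValueError(f"File path not found: {candidate}")
        return candidate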
@@ -193,10 +208,9 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
         """
         Load and preprocess text file content.
         """
-
         content = {}
-        for path in self.safe_file_paths:
-            path = self.convert_to_path(path)
+        for path in self.valid_file_paths:
+            path = self.validate_content(path=path)
             with open(path, "r", encoding="utf-8") as f:
                 content[path] = f.read()
         return content
@@ -207,16 +221,10 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
         Add text file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         for _, text in self.content.items():
-            new_chunks = self._chunk_text(text)
+            new_chunks = self._chunk_text(text=text)
             self.chunks.extend(new_chunks)
-            self._save_documents()
-
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+        self._save_documents()
 
 
 
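`_save_documents()` also moves out of the per-file loop, so chunks from every file accumulate first and storage is written once per `add()` call rather than once per file. A toy model of the difference:

    class BatchingSketch:
        """Toy model of the new add(): accumulate chunks, write storage once."""
        def __init__(self):
            self.chunks: list[str] = []
            self.writes = 0

        def _save_documents(self) -> None:
            self.writes += 1  # stands in for one ChromaDB upsert

        def add(self, files: dict[str, str]) -> None:
            for _, text in files.items():
                self.chunks.append(text)
            self._save_documents()  # outside the loop: one write in total

    s = BatchingSketch()
    s.add({"a.txt": "alpha", "b.txt": "beta"})
    assert s.writes == 1  # previously this would have been 2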
@@ -231,9 +239,9 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
         """
         pdfplumber = self._import_pdfplumber()
         content = {}
-        for path in self.safe_file_paths:
+        for path in self.valid_file_paths:
             text = ""
-            path = self.convert_to_path(path)
+            path = self.validate_content(path)
             with pdfplumber.open(path) as pdf:
                 for page in pdf.pages:
                     page_text = page.extract_text()
@@ -259,17 +267,12 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
         Add PDF file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         for _, text in self.content.items():
-            new_chunks = self._chunk_text(text)
+            new_chunks = self._chunk_text(text=text)
             self.chunks.extend(new_chunks)
+
         self._save_documents()
 
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
 
 
 class CSVKnowledgeSource(BaseFileKnowledgeSource):
@@ -282,7 +285,7 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
         Load and preprocess CSV file content.
         """
         content_dict = {}
-        for file_path in self.safe_file_paths:
+        for file_path in self.valid_file_paths:
             with open(file_path, "r", encoding="utf-8") as csvfile:
                 reader = csv.reader(csvfile)
                 content = ""
@@ -295,22 +298,14 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
 
     def add(self) -> None:
         """
-        Add CSV file content to the knowledge source, chunk it, compute embeddings,
-        and save the embeddings.
+        Add CSV file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         content_str = str(self.content) if isinstance(self.content, dict) else self.content
-        new_chunks = self._chunk_text(content_str)
+        new_chunks = self._chunk_text(text=content_str)
         self.chunks.extend(new_chunks)
         self._save_documents()
 
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
-
 
 class JSONKnowledgeSource(BaseFileKnowledgeSource):
     """
@@ -322,13 +317,14 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
         Load and preprocess JSON file content.
         """
         content: Dict[Path, str] = {}
-        for path in self.safe_file_paths:
-            path = self.convert_to_path(path)
+        for path in self.valid_file_paths:
+            path = self.validate_content(path)
             with open(path, "r", encoding="utf-8") as json_file:
                 data = json.load(json_file)
                 content[path] = self._json_to_text(data)
         return content
 
+
     def _json_to_text(self, data: Any, level: int = 0) -> str:
         """
         Recursively convert JSON data to a text representation.
@@ -351,18 +347,11 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
         Add JSON file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         content_str = str(self.content) if isinstance(self.content, dict) else self.content
-        new_chunks = self._chunk_text(content_str)
+        new_chunks = self._chunk_text(text=content_str)
         self.chunks.extend(new_chunks)
         self._save_documents()
 
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
-
 
 class ExcelKnowledgeSource(BaseFileKnowledgeSource):
     """
@@ -376,13 +365,14 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
 
         pd = self._import_dependencies()
         content_dict = {}
-        for file_path in self.safe_file_paths:
-            file_path = self.convert_to_path(file_path)
+        for file_path in self.valid_file_paths:
+            file_path = self.validate_content(file_path)
             df = pd.read_excel(file_path)
             content = df.to_csv(index=False)
             content_dict[file_path] = content
         return content_dict
 
+
     def _import_dependencies(self):
         """
         Dynamically import dependencies.
@@ -396,18 +386,12 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
                 f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
             )
 
+
     def add(self) -> None:
         """
         Add Excel file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         content_str = "\n".join(str(value) for value in self.content.values()) if isinstance(self.content, dict) else str(self.content)
-        new_chunks = self._chunk_text(content_str)
+        new_chunks = self._chunk_text(text=content_str)
         self.chunks.extend(new_chunks)
         self._save_documents()
-
-
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
versionhq/knowledge/source_docling.py CHANGED
@@ -12,11 +12,11 @@ try:
 except ImportError:
     DOCLING_AVAILABLE = False
 
-from pydantic import Field
+from pydantic import Field, InstanceOf
 
 from versionhq.knowledge.source import BaseKnowledgeSource
+from versionhq.storage.utils import fetch_db_storage_path
 from versionhq._utils.vars import KNOWLEDGE_DIRECTORY
-from versionhq._utils.logger import Logger
 
 
 class DoclingSource(BaseKnowledgeSource):
@@ -31,10 +31,9 @@ class DoclingSource(BaseKnowledgeSource):
 
         super().__init__(*args, **kwargs)
 
-    _logger: Logger = Logger(verbose=True)
+
     file_paths: List[Path | str] = Field(default_factory=list)
-    chunks: List[str] = Field(default_factory=list)
-    safe_file_paths: List[Path | str] = Field(default_factory=list)
+    valid_file_paths: List[Path | str] = Field(default_factory=list)
     content: List["DoclingDocument"] = Field(default_factory=list)
     document_converter: "DocumentConverter" = Field(
         default_factory=lambda: DocumentConverter(
@@ -51,46 +50,48 @@ class DoclingSource(BaseKnowledgeSource):
             )
         )
 
-    def model_post_init(self, _) -> None:
-        self.safe_file_paths = self.validate_content()
-        self.content = self._load_content()
+
+    def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
+        conv_results_iter = self.document_converter.convert_all(self.valid_file_paths)
+        return [result.document for result in conv_results_iter]
 
 
     def _load_content(self) -> List["DoclingDocument"]:
         try:
             return self._convert_source_to_docling_documents()
         except ConversionError as e:
-            self._logger.log(
-                level="error",
-                message=f"Error loading content: {str(e)}. Supported formats: {self.document_converter.allowed_formats}",
-                color="red",
-            )
+            self._logger.log(level="error", message=f"Error loading content: {str(e)}. Supported formats: {self.document_converter.allowed_formats}", color="red")
             raise e
         except Exception as e:
-            self._logger.log(level="error", message=f"Error loading content: {e}", color="red")
+            self._logger.log(level="error", message=f"Error loading content: {str(e)}", color="red")
             raise e
 
 
-    def add(self) -> None:
-        if self.content is None:
-            return
-        for doc in self.content:
-            new_chunks_iterable = self._chunk_doc(doc)
-            self.chunks.extend(list(new_chunks_iterable))
-        self._save_documents()
-
-
-    def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
-        conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
-        return [result.document for result in conv_results_iter]
-
-
     def _chunk_doc(self, doc: "DoclingDocument") -> Iterator[str]:
         chunker = HierarchicalChunker()
         for chunk in chunker.chunk(doc):
             yield chunk.text
 
 
+    def _validate_url(self, url: str) -> bool:
+        try:
+            result = urlparse(url)
+            return all(
+                [
+                    result.scheme in ("http", "https"),
+                    result.netloc,
+                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
+                ]
+            )
+        except Exception:
+            return False
+
+
+    def model_post_init(self, _) -> None:
+        self.valid_file_paths = self.validate_content()
+        self.content.extend(self._load_content())
+
+
     def validate_content(self) -> List[Path | str]:
         processed_paths: List[Path | str] = []
         for path in self.file_paths:
@@ -108,22 +109,23 @@ class DoclingSource(BaseKnowledgeSource):
                 if local_path.exists():
                     processed_paths.append(local_path)
                 else:
-                    raise FileNotFoundError(f"File not found: {local_path}")
+                    local_path = Path(fetch_db_storage_path() + "/" + KNOWLEDGE_DIRECTORY + "/" + path)  # try with abs. path
+                    if local_path.exists():
+                        processed_paths.append(local_path)
+                    else:
+                        raise FileNotFoundError(f"File not found: {local_path}")
             else:
                 if isinstance(path, Path):
                     processed_paths.append(path)
         return processed_paths
 
 
-    def _validate_url(self, url: str) -> bool:
-        try:
-            result = urlparse(url)
-            return all(
-                [
-                    result.scheme in ("http", "https"),
-                    result.netloc,
-                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
-                ]
-            )
-        except Exception:
-            return False
+    def add(self) -> None:
+        if self.content is None:
+            self.model_post_init()
+
+        if self.content:
+            for doc in self.content:
+                new_chunks_iterable = self._chunk_doc(doc)
+                self.chunks.extend(list(new_chunks_iterable))
+            self._save_documents()
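`_validate_url`, now defined ahead of `model_post_init`, accepts only http(s) URLs whose host contains at least one dot. The check behaves like this self-contained equivalent:

    from urllib.parse import urlparse

    def validate_url(url: str) -> bool:
        """Same three checks as DoclingSource._validate_url."""
        try:
            result = urlparse(url)
            return all([
                result.scheme in ("http", "https"),
                result.netloc,
                len(result.netloc.split(".")) >= 2,  # hostname must include a TLD
            ])
        except Exception:
            return False

    assert validate_url("https://docs.example.com/guide.pdf")
    assert not validate_url("ftp://example.com/file")  # wrong scheme
    assert not validate_url("https://localhost/x")     # no TLD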
versionhq/knowledge/storage.py CHANGED
@@ -62,16 +62,56 @@ class BaseKnowledgeStorage(ABC):
 
 class KnowledgeStorage(BaseKnowledgeStorage):
     """
-    Extends Storage to handle embeddings for memory entries, improving search efficiency.
+    A class to store ChromaDB Storage vals that handles embeddings, ChromaClient, and Collection.
     """
 
     collection: Optional[chromadb.Collection] = None
     collection_name: Optional[str] = "knowledge"
     app: Optional[ClientAPI] = None
+    embedding_function: Optional[Any] = None  # store ChromaDB's EmbeddingFunction instance
+    embedder_config: Optional[Dict[str, Any]] = None  # store config dict for embedding_function
+
 
     def __init__(self, embedder_config: Optional[Dict[str, Any]] = None, collection_name: Optional[str] = None):
         self.collection_name = collection_name
-        self._set_embedder_config(embedder_config)
+        self.embedder_config = embedder_config
+        self.initialize_knowledge_storage()
+
+
+    def _create_default_embedding_function(self) -> Any:
+        from chromadb.utils.embedding_functions.openai_embedding_function import OpenAIEmbeddingFunction
+
+        return OpenAIEmbeddingFunction(
+            api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
+        )
+
+
+    def _set_embedding_function(self, embedder_config: Optional[Dict[str, Any]] = None) -> None:
+        """
+        Set the embedding configuration for the knowledge storage.
+        """
+        self.embedding_function = EmbeddingConfigurator().configure_embedder(embedder_config) if embedder_config else self._create_default_embedding_function()
+
+
+    def initialize_knowledge_storage(self):
+        """
+        Create ChromaClinent, set up the embedding function using `embedder_config`, and get or create Collection.
+        """
+        base_path = os.path.join(fetch_db_storage_path(), "knowledge")
+        chroma_client = chromadb.PersistentClient(path=base_path, settings=Settings(allow_reset=True))
+        self.app = chroma_client
+
+        self._set_embedding_function(self.embedder_config)
+
+        try:
+            collection_name = f"knowledge_{self.collection_name}" if self.collection_name else "knowledge"
+            if self.app:
+                self.collection = self.app.get_or_create_collection(name=collection_name, embedding_function=self.embedding_function)
+            else:
+                raise Exception("Vector Database Client not initialized")
+        except Exception:
+            raise Exception("Failed to create or get collection")
+
 
     def search(self, query: List[str], limit: int = 3, filter: Optional[dict] = None, score_threshold: float = 0.35) -> List[Dict[str, Any]]:
         with suppress_logging():
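`__init__` now stores the raw `embedder_config` and eagerly calls `initialize_knowledge_storage()`, which opens a persistent Chroma client, binds the embedding function (defaulting to OpenAI's `text-embedding-3-small`), and gets or creates the collection. A minimal standalone equivalent, assuming `chromadb` is installed and `OPENAI_API_KEY` is set:

    import os

    import chromadb
    from chromadb.config import Settings
    from chromadb.utils.embedding_functions.openai_embedding_function import OpenAIEmbeddingFunction

    client = chromadb.PersistentClient(path="/tmp/versionhq/knowledge", settings=Settings(allow_reset=True))
    embedder = OpenAIEmbeddingFunction(api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small")
    # Named collections get a "knowledge_" prefix, mirroring initialize_knowledge_storage():
    collection = client.get_or_create_collection(name="knowledge_demo", embedding_function=embedder)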
@@ -92,60 +132,44 @@ class KnowledgeStorage(BaseKnowledgeStorage):
             raise Exception("Collection not initialized")
 
 
-    def initialize_knowledge_storage(self):
-        base_path = os.path.join(fetch_db_storage_path(), "knowledge")
-        chroma_client = chromadb.PersistentClient(path=base_path, settings=Settings(allow_reset=True))
-        self.app = chroma_client
-
-        try:
-            collection_name = f"knowledge_{self.collection_name}" if self.collection_name else "knowledge"
-            if self.app:
-                self.collection = self.app.get_or_create_collection(name=collection_name, embedding_function=self.embedder_config)
-            else:
-                raise Exception("Vector Database Client not initialized")
-        except Exception:
-            raise Exception("Failed to create or get collection")
-
-
-    def reset(self):
-        base_path = os.path.join(fetch_db_storage_path(), KNOWLEDGE_DIRECTORY)
-        if not self.app:
-            self.app = chromadb.PersistentClient(path=base_path, settings=Settings(allow_reset=True))
-        self.app.reset()
-        shutil.rmtree(base_path)
-        self.app = None
-        self.collection = None
-
-
     def save(self, documents: List[str], metadata: Optional[Dict[str, Any] | List[Dict[str, Any]]] = None) -> None:
         if not self.collection:
-            raise Exception("Collection not initialized")
+            self.initialize_knowledge_storage()
+            # raise Exception("Collection not initialized")
 
         try:
             unique_docs = {}
             for i, doc in enumerate(documents):
-                doc_id = hashlib.sha256(doc.encode("utf-8")).hexdigest()
-                doc_metadata = None
-                if metadata is not None:
-                    if isinstance(metadata, list):
-                        doc_metadata = metadata[i]
-                    else:
-                        doc_metadata = metadata
-                unique_docs[doc_id] = (doc, doc_metadata)
+                if doc:
+                    doc = doc
+                    if isinstance(doc, list):
+                        doc = doc[0]
+
+                    doc_id = hashlib.sha256(str(doc).encode("utf-8")).hexdigest()
+                    doc_metadata = None
+                    if metadata:
+                        if isinstance(metadata, list):
+                            doc_metadata = metadata[i]
+                        else:
+                            doc_metadata = metadata
+                    unique_docs[doc_id] = (doc, doc_metadata)
 
             filtered_docs = []
             filtered_metadata = []
             filtered_ids = []
 
             for doc_id, (doc, meta) in unique_docs.items():
-                filtered_docs.append(doc)
-                filtered_metadata.append(meta)
-                filtered_ids.append(doc_id)
+                if doc_id and doc:
+                    filtered_docs.append(doc)
+                    filtered_metadata.append(meta)
+                    filtered_ids.append(doc_id)
 
             final_metadata: Optional[OneOrMany[chromadb.Metadata]] = (
                 None if all(m is None for m in filtered_metadata) else filtered_metadata
             )
-            self.collection.upsert(documents=filtered_docs, metadatas=final_metadata, ids=filtered_ids)
+
+            if filtered_docs:
+                self.collection.upsert(documents=filtered_docs, metadatas=final_metadata, ids=filtered_ids)
 
         except chromadb.errors.InvalidDimensionException as e:
             Logger(verbose=True).log(
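The save path still dedupes by content hash: each document's SHA-256 digest becomes its Chroma id, so re-saving identical text upserts in place instead of duplicating. The hashing step in isolation:

    import hashlib

    docs = ["alpha", "beta", "alpha"]  # "alpha" appears twice
    unique = {hashlib.sha256(str(d).encode("utf-8")).hexdigest(): d for d in docs if d}
    print(len(unique))  # 2 -- duplicate content collapses onto one id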
@@ -160,18 +184,11 @@ class KnowledgeStorage(BaseKnowledgeStorage):
             raise
 
 
-    def _create_default_embedding_function(self) -> Any:
-        from chromadb.utils.embedding_functions.openai_embedding_function import (
-            OpenAIEmbeddingFunction,
-        )
-
-        return OpenAIEmbeddingFunction(
-            api_key=os.getenv("OPENAI_API_KEY"), model_name="text-embedding-3-small"
-        )
-
-
-    def _set_embedder_config(self, embedder_config: Optional[Dict[str, Any]] = None) -> None:
-        """
-        Set the embedding configuration for the knowledge storage.
-        """
-        self.embedder_config = EmbeddingConfigurator().configure_embedder(embedder_config) if embedder_config else self._create_default_embedding_function()
+    def reset(self):
+        base_path = os.path.join(fetch_db_storage_path(), KNOWLEDGE_DIRECTORY)
+        if not self.app:
+            self.app = chromadb.PersistentClient(path=base_path, settings=Settings(allow_reset=True))
+        self.app.reset()
+        shutil.rmtree(base_path)
+        self.app = None
+        self.collection = None
versionhq-1.1.11.2.dist-info/METADATA → versionhq-1.1.11.4.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: versionhq
-Version: 1.1.11.2
+Version: 1.1.11.4
 Summary: LLM orchestration frameworks for model-agnostic AI agents that handle complex outbound workflows
 Author-email: Kuriko Iwai <kuriko@versi0n.io>
 License: MIT License
versionhq-1.1.11.2.dist-info/RECORD → versionhq-1.1.11.4.dist-info/RECORD RENAMED
@@ -1,4 +1,4 @@
-versionhq/__init__.py,sha256=hxbJxa8mGdUlu5VfCbdypygtU31S4CTYVPAtN4EFd78,951
+versionhq/__init__.py,sha256=b8BjYKXpj6Dw8_k9zl1oysFiKt44MCiBc0bb8XeynD0,951
 versionhq/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 versionhq/_utils/i18n.py,sha256=TwA_PnYfDLA6VqlUDPuybdV9lgi3Frh_ASsb_X8jJo8,1483
 versionhq/_utils/logger.py,sha256=U-MpeGueA6YS8Ptfy0VnU_ePsZP-8Pvkvi0tZ4s_UMg,1438
@@ -7,7 +7,7 @@ versionhq/_utils/usage_metrics.py,sha256=hhq1OCW8Z4V93vwW2O2j528EyjOlF8wlTsX5IL-
 versionhq/_utils/vars.py,sha256=bZ5Dx_bFKlt3hi4-NNGXqdk7B23If_WaTIju2fiTyPQ,57
 versionhq/agent/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 versionhq/agent/default_agents.py,sha256=Sea3xDswxxMccer1vVDhp1E5etXW3ddf2n20JTMHgqs,503
-versionhq/agent/model.py,sha256=a8mkzKR71901gnjR71dHdMit-33k9F8cF0FrIr85Fu4,22912
+versionhq/agent/model.py,sha256=F_VkSQ6G6mJvDWrRBILZ6KjtlCpm0r_8bMN73sDKKGc,22921
 versionhq/agent/parser.py,sha256=riG0dkdQCxH7uJ0AbdVdg7WvL0BXhUgJht0VtQvxJBc,4082
 versionhq/agent/rpm_controller.py,sha256=7AKIEPbWBq_ESOZCaiKVOGjfSPHd2qwg6-wbBlhqC0g,2367
 versionhq/agent/TEMPLATES/Backstory.py,sha256=IAhGnnt6VUMe3wO6IzeyZPDNu7XE7Uiu3VEXUreOcKs,532
@@ -23,10 +23,10 @@ versionhq/clients/workflow/model.py,sha256=FNftenLLoha0bkivrjId32awLHAkBwIT8iNlj
 versionhq/knowledge/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 versionhq/knowledge/_utils.py,sha256=YWRF8U533cfZes_gZqUvdj-K24MD2ri1R0gjc_aPYyc,402
 versionhq/knowledge/embedding.py,sha256=KfHc__1THxb5jrg1EMrF-v944RDuIr2hE0l-MtM3Bp0,6826
-versionhq/knowledge/model.py,sha256=xJJcFuDZcuFEFathgQDwbO1I39n0hq22UU_h7dYaJIQ,1781
-versionhq/knowledge/source.py,sha256=Wk-4OMd5mWA5E-fff-w0SA_BUstugspxvV7zQ_vhSOk,13565
-versionhq/knowledge/source_docling.py,sha256=Iii-cu9rnVabhVOEajbrqWsjdiXUkc4Iw6PWixbwLzY,4718
-versionhq/knowledge/storage.py,sha256=Q8kBwsyj-eMnst8zWC7oSwnRaTirLkTlRj0F9lsaLHc,6658
+versionhq/knowledge/model.py,sha256=_liwQoS_VJlJgVSwAb7Y68SwbPuU0QBY_q0cA8x7dCo,1862
+versionhq/knowledge/source.py,sha256=yUwOds0zc8oPLvtV_hIE4P7k9BjQ9vc4MbbGorv_H6I,13292
+versionhq/knowledge/source_docling.py,sha256=fGfa3NntjH5tzpmWSoLsSgKJxBvTEKwl1egAlo67qnA,4935
+versionhq/knowledge/storage.py,sha256=vg7NEi19b47QaxXQxx2BLag3hjUZAQnwPqUifzhWCvQ,7373
 versionhq/llm/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 versionhq/llm/llm_vars.py,sha256=PO__b-h5e-6oQ-uoIgXx3lPSAUPUwXYfdVRW73fvX14,8761
 versionhq/llm/model.py,sha256=1uaBxT10GIlUl-BtE8Mfux-ZRcScp4HUIas_fD_cdWQ,14471
@@ -57,8 +57,8 @@ versionhq/tool/composio_tool_vars.py,sha256=FvBuEXsOQUYnN7RTFxT20kAkiEYkxWKkiVtg
 versionhq/tool/decorator.py,sha256=C4ZM7Xi2gwtEMaSeRo-geo_g_MAkY77WkSLkAuY0AyI,1205
 versionhq/tool/model.py,sha256=7ccEnje_8LuxLVeog6pL38nToArXQXk4KY7A9hfprDo,12239
 versionhq/tool/tool_handler.py,sha256=2m41K8qo5bGCCbwMFferEjT-XZ-mE9F0mDUOBkgivOI,1416
-versionhq-1.1.11.2.dist-info/LICENSE,sha256=7CCXuMrAjPVsUvZrsBq9DsxI2rLDUSYXR_qj4yO_ZII,1077
-versionhq-1.1.11.2.dist-info/METADATA,sha256=0iCpugY32Szc4lVH-ZsvC9SxC2cKJ5eGi7dCCbylIuo,18251
-versionhq-1.1.11.2.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-versionhq-1.1.11.2.dist-info/top_level.txt,sha256=DClQwxDWqIUGeRJkA8vBlgeNsYZs4_nJWMonzFt5Wj0,10
-versionhq-1.1.11.2.dist-info/RECORD,,
+versionhq-1.1.11.4.dist-info/LICENSE,sha256=7CCXuMrAjPVsUvZrsBq9DsxI2rLDUSYXR_qj4yO_ZII,1077
+versionhq-1.1.11.4.dist-info/METADATA,sha256=H3fBLb0rTLGR5EL7yvNyzekPsa6Iu1SNpqOJbUD3uMw,18251
+versionhq-1.1.11.4.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+versionhq-1.1.11.4.dist-info/top_level.txt,sha256=DClQwxDWqIUGeRJkA8vBlgeNsYZs4_nJWMonzFt5Wj0,10
+versionhq-1.1.11.4.dist-info/RECORD,,