versionhq 1.1.10.8__py3-none-any.whl → 1.1.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. versionhq/__init__.py +1 -1
  2. versionhq/_utils/vars.py +2 -0
  3. versionhq/agent/TEMPLATES/Backstory.py +2 -2
  4. versionhq/agent/default_agents.py +15 -0
  5. versionhq/agent/model.py +127 -39
  6. versionhq/agent/parser.py +3 -20
  7. versionhq/{_utils → agent}/rpm_controller.py +22 -15
  8. versionhq/knowledge/__init__.py +0 -0
  9. versionhq/knowledge/_utils.py +11 -0
  10. versionhq/knowledge/embedding.py +192 -0
  11. versionhq/knowledge/model.py +54 -0
  12. versionhq/knowledge/source.py +413 -0
  13. versionhq/knowledge/source_docling.py +129 -0
  14. versionhq/knowledge/storage.py +177 -0
  15. versionhq/llm/model.py +76 -62
  16. versionhq/memory/__init__.py +0 -0
  17. versionhq/memory/contextual_memory.py +96 -0
  18. versionhq/memory/model.py +174 -0
  19. versionhq/storage/base.py +14 -0
  20. versionhq/storage/ltm_sqlite_storage.py +131 -0
  21. versionhq/storage/mem0_storage.py +109 -0
  22. versionhq/storage/rag_storage.py +231 -0
  23. versionhq/storage/task_output_storage.py +18 -29
  24. versionhq/storage/utils.py +26 -0
  25. versionhq/task/TEMPLATES/Description.py +5 -0
  26. versionhq/task/evaluate.py +122 -0
  27. versionhq/task/model.py +134 -43
  28. versionhq/team/team_planner.py +1 -1
  29. versionhq/tool/model.py +44 -46
  30. {versionhq-1.1.10.8.dist-info → versionhq-1.1.11.0.dist-info}/METADATA +54 -40
  31. versionhq-1.1.11.0.dist-info/RECORD +64 -0
  32. versionhq-1.1.10.8.dist-info/RECORD +0 -45
  33. {versionhq-1.1.10.8.dist-info → versionhq-1.1.11.0.dist-info}/LICENSE +0 -0
  34. {versionhq-1.1.10.8.dist-info → versionhq-1.1.11.0.dist-info}/WHEEL +0 -0
  35. {versionhq-1.1.10.8.dist-info → versionhq-1.1.11.0.dist-info}/top_level.txt +0 -0
versionhq/knowledge/model.py
@@ -0,0 +1,54 @@
+ import os
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, List, Optional
+ from pydantic import BaseModel, ConfigDict, Field
+
+ from versionhq.knowledge.storage import KnowledgeStorage
+ from versionhq.knowledge.source import BaseKnowledgeSource
+
+
+ class Knowledge(BaseModel):
+     """
+     Knowledge class: a collection of sources and the setup for the vector store used to query relevant context.
+     """
+     sources: List[BaseKnowledgeSource] = Field(default_factory=list)
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+     storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
+     embedder_config: Optional[Dict[str, Any]] = None
+     collection_name: Optional[str] = None
+
+     def __init__(
+         self,
+         collection_name: str,
+         sources: List[BaseKnowledgeSource],
+         embedder_config: Optional[Dict[str, Any]] = None,
+         storage: Optional[KnowledgeStorage] = None,
+         **data,
+     ):
+         super().__init__(**data)
+         if storage:
+             self.storage = storage
+         else:
+             self.storage = KnowledgeStorage(embedder_config=embedder_config, collection_name=collection_name)
+
+         self.sources = sources
+         self.storage.initialize_knowledge_storage()
+         for source in sources:
+             source.storage = self.storage
+             source.add()
+
+
+     def query(self, query: List[str], limit: int = 3) -> List[Dict[str, Any]]:
+         """
+         Query across all knowledge sources to find the most relevant information.
+         Returns up to `limit` of the most relevant chunks.
+         """
+
+         results = self.storage.search(query, limit)
+         return results
+
+
+     def _add_sources(self):
+         for source in self.sources:
+             source.storage = self.storage
+             source.add()
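
For orientation, a minimal usage sketch of the new Knowledge model, assuming the StringKnowledgeSource defined in versionhq/knowledge/source.py below and a working vector store behind KnowledgeStorage; the collection name and content are illustrative:

    from versionhq.knowledge.model import Knowledge
    from versionhq.knowledge.source import StringKnowledgeSource

    # Constructing Knowledge initializes the storage and ingests every source
    # up front (each source is chunked, embedded, and saved on construction).
    source = StringKnowledgeSource(content="versionhq is an agent orchestration framework.")
    knowledge = Knowledge(collection_name="demo", sources=[source])

    # query() delegates to storage.search() and returns up to `limit` chunks.
    results = knowledge.query(["What is versionhq?"], limit=3)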
versionhq/knowledge/source.py
@@ -0,0 +1,413 @@
+ import csv
+ import json
+ from abc import ABC, abstractmethod
+ from typing import Any, Dict, List, Optional
+ from pathlib import Path
+
+ import numpy as np
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+ from versionhq.knowledge.storage import KnowledgeStorage
+ from versionhq._utils.vars import KNOWLEDGE_DIRECTORY
+ from versionhq._utils.logger import Logger
+
+
+ class BaseKnowledgeSource(BaseModel, ABC):
+     """
+     Abstract base class for knowledge sources: csv, json, excel, pdf, string, and docling.
+     """
+
+     chunk_size: int = 4000
+     chunk_overlap: int = 200
+     chunks: List[str] = Field(default_factory=list)
+     chunk_embeddings: List[np.ndarray] = Field(default_factory=list)
+
+     model_config = ConfigDict(arbitrary_types_allowed=True)
+     storage: Optional[KnowledgeStorage] = Field(default=None)
+     metadata: Dict[str, Any] = Field(default_factory=dict)  # Currently unused
+     collection_name: Optional[str] = Field(default=None)
+
+     @abstractmethod
+     def validate_content(self) -> Any:
+         """Load and preprocess content from the source."""
+         pass
+
+     @abstractmethod
+     def add(self) -> None:
+         """Process content, chunk it, compute embeddings, and save them."""
+         pass
+
+     def get_embeddings(self) -> List[np.ndarray]:
+         """Return the list of embeddings for the chunks."""
+         return self.chunk_embeddings
+
+     def _chunk_text(self, text: str) -> List[str]:
+         """
+         Utility method to split text into chunks.
+         """
+
+         return [
+             text[i : i + self.chunk_size]
+             for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+         ]
+
+     def _save_documents(self):
+         """
+         Save the documents to the storage.
+         This method should be called after the chunks and embeddings are generated.
+         """
+         if self.storage:
+             self.storage.save(self.chunks)
+         else:
+             raise ValueError("No storage found to save documents.")
+
+
+
+ class StringKnowledgeSource(BaseKnowledgeSource):
+     """
+     A knowledge source that stores and queries plain text content using embeddings.
+     """
+
+     content: str = Field(...)
+     collection_name: Optional[str] = Field(default=None)
+
+     def model_post_init(self, _):
+         """Post-initialization method to validate content."""
+         self.validate_content()
+
+     def validate_content(self):
+         """Validate string content."""
+         if not isinstance(self.content, str):
+             raise ValueError("StringKnowledgeSource only accepts string content")
+
+     def add(self) -> None:
+         """
+         Add string content to the knowledge source, chunk it, compute embeddings, and save them.
+         """
+         new_chunks = self._chunk_text(self.content)
+         self.chunks.extend(new_chunks)
+         self._save_documents()
+
+
+     def _chunk_text(self, text: str) -> List[str]:
+         """
+         Utility method to split text into chunks.
+         """
+         return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+
+
+
+ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
+     """Base class for knowledge sources that load content from files."""
+
+     _logger: Logger = Logger(verbose=True)
+     file_paths: Optional[Path | List[Path] | str | List[str]] = Field(default_factory=list)
+     content: Dict[Path, str] = Field(init=False, default_factory=dict)
+     storage: Optional[KnowledgeStorage] = Field(default=None)
+     safe_file_paths: List[Path] = Field(default_factory=list, description="store a list of `Path` objects from self.file_paths")
+
+
+     @field_validator("file_paths", mode="before")
+     def validate_file_path(cls, v, info):
+         """
+         Validate if at least one valid file path is provided.
+         """
+         if v is None and info.data.get("file_paths") is None:
+             raise ValueError("Either file_path or file_paths must be provided")
+         return v
+
+
+     def model_post_init(self, _) -> None:
+         """
+         Post-initialization method to load content.
+         """
+         self.safe_file_paths = self._process_file_paths()
+         self.validate_content()
+         self.content = self.load_content()
+
+
+     @abstractmethod
+     def load_content(self) -> Dict[Path, str]:
+         """
+         Load and preprocess file content. Should be overridden by subclasses.
+         Assume that the file path is relative to the project root in the knowledge directory.
+         """
+         pass
+
+
+     def validate_content(self):
+         """
+         Validate the given file paths.
+         """
+         for path in self.safe_file_paths:
+             if not path.exists():
+                 self._logger.log(
+                     "error",
+                     f"File not found: {path}. Try adding sources to the knowledge directory. If it's inside the knowledge directory, use the relative path.",
+                     color="red",
+                 )
+                 raise FileNotFoundError(f"File not found: {path}")
+             if not path.is_file():
+                 self._logger.log("error", f"Path is not a file: {path}", color="red")
+
+
+     def _save_documents(self):
+         if self.storage:
+             self.storage.save(self.chunks)
+         else:
+             raise ValueError("No storage found to save documents.")
+
+
+     def convert_to_path(self, path: Path | str) -> Path:
+         """
+         Convert a path to a Path object.
+         """
+         return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
+
+
+     def _process_file_paths(self) -> List[Path]:
+         """
+         Convert file_path to a list of Path objects.
+         """
+
+         if self.file_paths is None:
+             raise ValueError("Your source must be provided with a file_paths: []")
+
+         path_list: List[Path | str] = [self.file_paths] if isinstance(self.file_paths, (str, Path)) else list(self.file_paths) if isinstance(self.file_paths, list) else []
+
+         if not path_list:
+             raise ValueError(
+                 "file_path/file_paths must be a Path, str, or a list of these types"
+             )
+
+         return [self.convert_to_path(path) for path in path_list]
+
+
+
+ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
+     """
+     A knowledge source class that stores and queries text file content using embeddings.
+     """
+
+     def load_content(self) -> Dict[Path, str]:
+         """
+         Load and preprocess text file content.
+         """
+
+         content = {}
+         for path in self.safe_file_paths:
+             path = self.convert_to_path(path)
+             with open(path, "r", encoding="utf-8") as f:
+                 content[path] = f.read()
+         return content
+
+
+     def add(self) -> None:
+         """
+         Add text file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
+         """
+         for _, text in self.content.items():
+             new_chunks = self._chunk_text(text)
+             self.chunks.extend(new_chunks)
+         self._save_documents()
+
+
+     def _chunk_text(self, text: str) -> List[str]:
+         """
+         Utility method to split text into chunks.
+         """
+         return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+
+
+
+ class PDFKnowledgeSource(BaseFileKnowledgeSource):
+     """
+     A knowledge source class that stores and queries PDF file content using embeddings.
+     """
+
+     def load_content(self) -> Dict[Path, str]:
+         """
+         Load and preprocess PDF file content.
+         """
+         pdfplumber = self._import_pdfplumber()
+         content = {}
+         for path in self.safe_file_paths:
+             text = ""
+             path = self.convert_to_path(path)
+             with pdfplumber.open(path) as pdf:
+                 for page in pdf.pages:
+                     page_text = page.extract_text()
+                     if page_text:
+                         text += page_text + "\n"
+             content[path] = text
+         return content
+
+
+     def _import_pdfplumber(self):
+         """
+         Dynamically import pdfplumber.
+         """
+         try:
+             import pdfplumber
+             return pdfplumber
+         except ImportError:
+             raise ImportError("pdfplumber is not installed. Please install it with: pip install pdfplumber")
+
+
+     def add(self) -> None:
+         """
+         Add PDF file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
+         """
+         for _, text in self.content.items():
+             new_chunks = self._chunk_text(text)
+             self.chunks.extend(new_chunks)
+         self._save_documents()
+
+
+     def _chunk_text(self, text: str) -> List[str]:
+         """
+         Utility method to split text into chunks.
+         """
+         return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+
+
+
+ class CSVKnowledgeSource(BaseFileKnowledgeSource):
+     """
+     A knowledge source class that stores and queries CSV file content using embeddings.
+     """
+
+     def load_content(self) -> Dict[Path, str]:
+         """
+         Load and preprocess CSV file content.
+         """
+         content_dict = {}
+         for file_path in self.safe_file_paths:
+             with open(file_path, "r", encoding="utf-8") as csvfile:
+                 reader = csv.reader(csvfile)
+                 content = ""
+                 for row in reader:
+                     content += " ".join(row) + "\n"
+                 content_dict[file_path] = content
+
+         return content_dict
+
+
+     def add(self) -> None:
+         """
+         Add CSV file content to the knowledge source, chunk it, compute embeddings,
+         and save the embeddings.
+         """
+         content_str = str(self.content) if isinstance(self.content, dict) else self.content
+         new_chunks = self._chunk_text(content_str)
+         self.chunks.extend(new_chunks)
+         self._save_documents()
+
+
+     def _chunk_text(self, text: str) -> List[str]:
+         """
+         Utility method to split text into chunks.
+         """
+         return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+
+
+
+ class JSONKnowledgeSource(BaseFileKnowledgeSource):
+     """
+     A knowledge source class that stores and queries JSON file content using embeddings.
+     """
+
+     def load_content(self) -> Dict[Path, str]:
+         """
+         Load and preprocess JSON file content.
+         """
+         content: Dict[Path, str] = {}
+         for path in self.safe_file_paths:
+             path = self.convert_to_path(path)
+             with open(path, "r", encoding="utf-8") as json_file:
+                 data = json.load(json_file)
+                 content[path] = self._json_to_text(data)
+         return content
+
+     def _json_to_text(self, data: Any, level: int = 0) -> str:
+         """
+         Recursively convert JSON data to a text representation.
+         """
+         text = ""
+         indent = " " * level
+         if isinstance(data, dict):
+             for key, value in data.items():
+                 text += f"{indent}{key}: {self._json_to_text(value, level + 1)}\n"
+         elif isinstance(data, list):
+             for item in data:
+                 text += f"{indent}- {self._json_to_text(item, level + 1)}\n"
+         else:
+             text += f"{str(data)}"
+         return text
+
+
+     def add(self) -> None:
+         """
+         Add JSON file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
+         """
+         content_str = str(self.content) if isinstance(self.content, dict) else self.content
+         new_chunks = self._chunk_text(content_str)
+         self.chunks.extend(new_chunks)
+         self._save_documents()
+
+
+     def _chunk_text(self, text: str) -> List[str]:
+         """
+         Utility method to split text into chunks.
+         """
+         return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+
+
+
+ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
+     """
+     A knowledge source that stores and queries Excel file content using embeddings.
+     """
+
+     def load_content(self) -> Dict[Path, str]:
+         """
+         Load and preprocess Excel file content.
+         """
+
+         pd = self._import_dependencies()
+         content_dict = {}
+         for file_path in self.safe_file_paths:
+             file_path = self.convert_to_path(file_path)
+             df = pd.read_excel(file_path)
+             content = df.to_csv(index=False)
+             content_dict[file_path] = content
+         return content_dict
+
+     def _import_dependencies(self):
+         """
+         Dynamically import dependencies.
+         """
+         try:
+             import pandas as pd
+             return pd
+         except ImportError as e:
+             missing_package = str(e).split()[-1]
+             raise ImportError(
+                 f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
+             )
+
+     def add(self) -> None:
+         """
+         Add Excel file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
+         """
+         content_str = "\n".join(str(value) for value in self.content.values()) if isinstance(self.content, dict) else str(self.content)
+         new_chunks = self._chunk_text(content_str)
+         self.chunks.extend(new_chunks)
+         self._save_documents()
+
+
+     def _chunk_text(self, text: str) -> List[str]:
+         """
+         Utility method to split text into chunks.
+         """
+         return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
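
A note on the _chunk_text helpers repeated throughout this file: each one slices the text into windows of chunk_size characters whose start positions advance by chunk_size - chunk_overlap, so consecutive chunks share their last chunk_overlap characters. A standalone sketch of the same logic with the class defaults:

    def chunk_text(text: str, chunk_size: int = 4000, chunk_overlap: int = 200) -> list[str]:
        # Start positions advance by 3800 with the defaults, so each chunk
        # overlaps the previous one by 200 characters.
        step = chunk_size - chunk_overlap
        return [text[i:i + chunk_size] for i in range(0, len(text), step)]

    # e.g. a 10,000-character document yields chunks starting at 0, 3800, and 7600.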
versionhq/knowledge/source_docling.py
@@ -0,0 +1,129 @@
+ from pathlib import Path
+ from typing import Iterator, List, Optional
+ from urllib.parse import urlparse
+
+ try:
+     from docling.datamodel.base_models import InputFormat
+     from docling.document_converter import DocumentConverter
+     from docling.exceptions import ConversionError
+     from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
+     from docling_core.types.doc.document import DoclingDocument
+     DOCLING_AVAILABLE = True
+ except ImportError:
+     DOCLING_AVAILABLE = False
+
+ from pydantic import Field
+
+ from versionhq.knowledge.source import BaseKnowledgeSource
+ from versionhq._utils.vars import KNOWLEDGE_DIRECTORY
+ from versionhq._utils.logger import Logger
+
+
+ class DoclingSource(BaseKnowledgeSource):
+     """
+     Default docling class for converting documents to markdown or json.
+     Supports PDF, DOCX, TXT, XLSX, PPTX, MD, image, and HTML files without any additional dependencies.
+     """
+
+     def __init__(self, *args, **kwargs):
+         if not DOCLING_AVAILABLE:
+             raise ImportError("The docling package is required. Please install the package using: $ uv add docling.")
+
+         super().__init__(*args, **kwargs)
+
+     _logger: Logger = Logger(verbose=True)
+     file_paths: List[Path | str] = Field(default_factory=list)
+     chunks: List[str] = Field(default_factory=list)
+     safe_file_paths: List[Path | str] = Field(default_factory=list)
+     content: List["DoclingDocument"] = Field(default_factory=list)
+     document_converter: "DocumentConverter" = Field(
+         default_factory=lambda: DocumentConverter(
+             allowed_formats=[
+                 InputFormat.MD,
+                 InputFormat.ASCIIDOC,
+                 InputFormat.PDF,
+                 InputFormat.DOCX,
+                 InputFormat.HTML,
+                 InputFormat.IMAGE,
+                 InputFormat.XLSX,
+                 InputFormat.PPTX,
+             ]
+         )
+     )
+
+     def model_post_init(self, _) -> None:
+         self.safe_file_paths = self.validate_content()
+         self.content = self._load_content()
+
+
+     def _load_content(self) -> List["DoclingDocument"]:
+         try:
+             return self._convert_source_to_docling_documents()
+         except ConversionError as e:
+             self._logger.log(
+                 level="error",
+                 message=f"Error loading content: {str(e)}. Supported formats: {self.document_converter.allowed_formats}",
+                 color="red",
+             )
+             raise e
+         except Exception as e:
+             self._logger.log(level="error", message=f"Error loading content: {e}", color="red")
+             raise e
+
+
+     def add(self) -> None:
+         if self.content is None:
+             return
+         for doc in self.content:
+             new_chunks_iterable = self._chunk_doc(doc)
+             self.chunks.extend(list(new_chunks_iterable))
+         self._save_documents()
+
+
+     def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
+         conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
+         return [result.document for result in conv_results_iter]
+
+
+     def _chunk_doc(self, doc: "DoclingDocument") -> Iterator[str]:
+         chunker = HierarchicalChunker()
+         for chunk in chunker.chunk(doc):
+             yield chunk.text
+
+
+     def validate_content(self) -> List[Path | str]:
+         processed_paths: List[Path | str] = []
+         for path in self.file_paths:
+             if isinstance(path, str):
+                 if path.startswith(("http://", "https://")):
+                     try:
+                         if self._validate_url(path):
+                             processed_paths.append(path)
+                         else:
+                             raise ValueError(f"Invalid URL format: {path}")
+                     except Exception as e:
+                         raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
+                 else:
+                     local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
+                     if local_path.exists():
+                         processed_paths.append(local_path)
+                     else:
+                         raise FileNotFoundError(f"File not found: {local_path}")
+             else:
+                 if isinstance(path, Path):
+                     processed_paths.append(path)
+         return processed_paths
+
+
+     def _validate_url(self, url: str) -> bool:
+         try:
+             result = urlparse(url)
+             return all(
+                 [
+                     result.scheme in ("http", "https"),
+                     result.netloc,
+                     len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
+                 ]
+             )
+         except Exception:
+             return False
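
A minimal usage sketch for DoclingSource, assuming docling is installed and that a file named handbook.pdf exists under KNOWLEDGE_DIRECTORY; the file name and the storage setup are illustrative:

    from versionhq.knowledge.source_docling import DoclingSource
    from versionhq.knowledge.storage import KnowledgeStorage

    # Local string paths are resolved against KNOWLEDGE_DIRECTORY; http(s) URLs
    # are validated and handed to docling's DocumentConverter as-is.
    source = DoclingSource(file_paths=["handbook.pdf"])

    # A storage backend must be attached before add() can persist the chunks.
    storage = KnowledgeStorage(collection_name="docs")  # illustrative setup
    storage.initialize_knowledge_storage()
    source.storage = storage
    source.add()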