versionhq 1.1.11.2__tar.gz → 1.1.11.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/PKG-INFO +1 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/pyproject.toml +1 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/__init__.py +1 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/model.py +1 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/model.py +5 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/source.py +101 -117
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/source_docling.py +43 -41
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/storage.py +72 -55
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq.egg-info/PKG-INFO +1 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/agent/agent_test.py +0 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/knowledge/knowledge_test.py +3 -12
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/uv.lock +10 -10
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.github/workflows/publish.yml +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.github/workflows/publish_testpypi.yml +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.github/workflows/run_tests.yml +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.github/workflows/security_check.yml +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.gitignore +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.pre-commit-config.yaml +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.python-version +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/LICENSE +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/README.md +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/SECURITY.md +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/db/preprocess.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/requirements-dev.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/requirements.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/runtime.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/setup.cfg +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/_utils/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/_utils/i18n.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/_utils/logger.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/_utils/process_config.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/_utils/usage_metrics.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/_utils/vars.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/TEMPLATES/Backstory.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/TEMPLATES/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/default_agents.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/parser.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/rpm_controller.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/cli/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/customer/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/customer/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/product/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/product/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/workflow/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/workflow/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/_utils.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/embedding.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/llm/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/llm/llm_vars.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/llm/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/memory/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/memory/contextual_memory.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/memory/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/base.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/ltm_sqlite_storage.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/mem0_storage.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/rag_storage.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/task_output_storage.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/utils.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/TEMPLATES/Description.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/evaluate.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/formatter.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/log_handler.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/structured_response.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/team/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/team/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/team/team_planner.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/cache_handler.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/composio_tool.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/composio_tool_vars.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/decorator.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/tool_handler.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq.egg-info/SOURCES.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq.egg-info/dependency_links.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq.egg-info/requires.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq.egg-info/top_level.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/agent/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/cli/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/clients/customer_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/clients/product_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/clients/workflow_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/conftest.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/knowledge/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/knowledge/mock_report_compressed.pdf +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/llm/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/llm/llm_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/memory/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/memory/memory_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/task/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/task/task_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/team/Prompts/Demo_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/team/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/team/team_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/tool/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/tool/composio_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/tool/tool_test.py +0 -0
{versionhq-1.1.11.2 → versionhq-1.1.11.4}/pyproject.toml

```diff
@@ -15,7 +15,7 @@ exclude = ["test*", "__pycache__", "*.egg-info"]
 
 [project]
 name = "versionhq"
-version = "1.1.11.2"
+version = "1.1.11.4"
 authors = [{ name = "Kuriko Iwai", email = "kuriko@versi0n.io" }]
 description = "LLM orchestration frameworks for model-agnostic AI agents that handle complex outbound workflows"
 readme = "README.md"
```
{versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/model.py

```diff
@@ -469,7 +469,7 @@ class Agent(BaseModel):
             task_prompt += context
 
         if self._knowledge:
-            agent_knowledge = self._knowledge.query(query=[task_prompt,])
+            agent_knowledge = self._knowledge.query(query=[task_prompt,], limit=5)
             if agent_knowledge:
                 agent_knowledge_context = extract_knowledge_context(knowledge_snippets=agent_knowledge)
                 if agent_knowledge_context:
```
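The only functional change in this hunk is that the agent now caps knowledge retrieval at five snippets. A minimal sketch of the observable effect, using stand-in classes rather than versionhq's real `Knowledge` and `extract_knowledge_context` (the names come from this diff; the bodies below are illustrative assumptions):

```python
# Minimal sketch of the capped knowledge lookup introduced above.
# The stub bodies are stand-ins, not versionhq's real implementations.
from typing import Dict, List


class FakeKnowledge:
    """Stand-in for versionhq's Knowledge model (illustrative only)."""

    def __init__(self, snippets: List[Dict[str, str]]):
        self.snippets = snippets

    def query(self, query: List[str], limit: int = 3) -> List[Dict[str, str]]:
        # The real class delegates to vector storage; here we just cap the
        # results, which is the observable effect of the new `limit=5`.
        return self.snippets[:limit]


def extract_knowledge_context(knowledge_snippets: List[Dict[str, str]]) -> str:
    # Stand-in: join snippet texts into a context block for the prompt.
    return "\n".join(s["context"] for s in knowledge_snippets if s.get("context"))


task_prompt = "Summarize the Q3 outbound campaign results."
knowledge = FakeKnowledge([{"context": f"snippet {i}"} for i in range(10)])

agent_knowledge = knowledge.query(query=[task_prompt], limit=5)
if agent_knowledge:
    task_prompt += "\n" + extract_knowledge_context(knowledge_snippets=agent_knowledge)

print(task_prompt)  # the prompt now carries at most five knowledge snippets
```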
{versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/model.py

```diff
@@ -26,13 +26,17 @@ class Knowledge(BaseModel):
         **data,
     ):
         super().__init__(**data)
+
+
         if storage:
             self.storage = storage
         else:
             self.storage = KnowledgeStorage(embedder_config=embedder_config, collection_name=collection_name)
 
-        self.
+        self.storage._set_embedding_function(embedder_config=embedder_config)
         self.storage.initialize_knowledge_storage()
+
+        self.sources = sources
         for source in sources:
             source.storage = self.storage
             source.add()
```
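Taken together with the storage hunks below, the constructor now follows a fixed order: choose or build a `KnowledgeStorage`, attach the embedding function, initialize the collection, keep `sources` on the model, then let each source chunk, embed, and save itself. A hedged usage sketch (import paths and keyword arguments are inferred from this diff, not verified against an installed 1.1.11.4):

```python
# Hedged usage sketch; import paths and keyword arguments are inferred
# from this diff and may differ in the released package.
from versionhq.knowledge.model import Knowledge
from versionhq.knowledge.source import StringKnowledgeSource

source = StringKnowledgeSource(content="Customer favorite color: red.")

# __init__ now runs, in order:
#   1. self.storage = given storage or KnowledgeStorage(...)
#   2. self.storage._set_embedding_function(embedder_config=...)
#   3. self.storage.initialize_knowledge_storage()
#   4. self.sources = sources; each source gets the storage and calls add()
knowledge = Knowledge(collection_name="demo", sources=[source])

# query() is the same entry point the Agent hunk above calls with limit=5.
results = knowledge.query(query=["What is the favorite color?"], limit=5)
```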
{versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/source.py

```diff
@@ -8,6 +8,7 @@ import numpy as np
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 
 from versionhq.knowledge.storage import KnowledgeStorage
+from versionhq.storage.utils import fetch_db_storage_path
 from versionhq._utils.vars import KNOWLEDGE_DIRECTORY
 from versionhq._utils.logger import Logger
 
```
```diff
@@ -16,50 +17,66 @@ class BaseKnowledgeSource(BaseModel, ABC):
     """
     Abstract base class for knowledge sources: csv, json, excel, pdf, string, and docling.
     """
+    _logger: Logger = Logger(verbose=True)
 
-    chunk_size: int =
+    chunk_size: int = 3000
     chunk_overlap: int = 200
     chunks: List[str] = Field(default_factory=list)
     chunk_embeddings: List[np.ndarray] = Field(default_factory=list)
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
     storage: Optional[KnowledgeStorage] = Field(default=None)
-    metadata: Dict[str, Any] = Field(default_factory=dict)
+    metadata: Dict[str, Any] = Field(default_factory=dict)
     collection_name: Optional[str] = Field(default=None)
 
+
     @abstractmethod
-    def validate_content(self) -> Any:
+    def validate_content(self, **kwargs) -> Any:
         """Load and preprocess content from the source."""
         pass
 
+
     @abstractmethod
     def add(self) -> None:
         """Process content, chunk it, compute embeddings, and save them."""
         pass
 
+
     def get_embeddings(self) -> List[np.ndarray]:
         """Return the list of embeddings for the chunks."""
         return self.chunk_embeddings
 
+
     def _chunk_text(self, text: str) -> List[str]:
         """
         Utility method to split text into chunks.
         """
+        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
 
-        return [
-            text[i : i + self.chunk_size]
-            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
-        ]
 
-    def _save_documents(self):
+    def _save_documents(self) -> None:
         """
-        Save the documents to the storage.
+        Save the documents to the given (or newly created) storage on ChromaDB.
         This method should be called after the chunks and embeddings are generated.
         """
-        if self.
-
-
-
+        # if not self.chunks or self.chunk_embeddings:
+        #     self._logger.log(level="warning", message="Chunks or chunk embeddings are missing. Save docs after creating them.", color="yellow")
+        #     return
+
+        try:
+            if self.storage:
+                self.storage.save(documents=self.chunks, metadata=self.metadata)
+
+            else:
+                storage = KnowledgeStorage(collection_name=self.collection_name) if self.collection_name else KnowledgeStorage()
+                storage.initialize_knowledge_storage()
+                self.storage = storage
+                self.storage.save(documents=self.chunks, metadata=self.metadata)
+
+        except:
+            self._logger.log(level="error", message="No storage found or created to save the documents.", color="red")
+            return
+            # raise ValueError("No storage found to save documents.")
 
 
 
```
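The consolidated `_chunk_text` (previously re-declared by every file source, as the later hunks remove) steps through the text with stride `chunk_size - chunk_overlap`, so with the new defaults of 3000 and 200 each chunk starts 2800 characters after the previous one and neighbors share 200 characters. A self-contained sketch of that arithmetic:

```python
# Standalone copy of the chunking comprehension from this hunk,
# using the new defaults (chunk_size=3000, chunk_overlap=200 -> stride 2800).
from typing import List


def chunk_text(text: str, chunk_size: int = 3000, chunk_overlap: int = 200) -> List[str]:
    # Identical expression to BaseKnowledgeSource._chunk_text above.
    return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size - chunk_overlap)]


chunks = chunk_text("x" * 6000)
print([len(c) for c in chunks])  # [3000, 3000, 400] -- starts at 0, 2800, 5600
```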
```diff
@@ -74,37 +91,32 @@ class StringKnowledgeSource(BaseKnowledgeSource):
     def model_post_init(self, _):
         """Post-initialization method to validate content."""
         self.validate_content()
+        self._save_documents()
+
 
     def validate_content(self):
         """Validate string content."""
         if not isinstance(self.content, str):
             raise ValueError("StringKnowledgeSource only accepts string content")
 
+
     def add(self) -> None:
         """
         Add string content to the knowledge source, chunk it, compute embeddings, and save them.
         """
-        new_chunks = self._chunk_text(self.content)
+        new_chunks = self._chunk_text(text=self.content)
         self.chunks.extend(new_chunks)
         self._save_documents()
 
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
-
 
 class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
     """Base class for knowledge sources that load content from files."""
 
-    _logger: Logger = Logger(verbose=True)
     file_paths: Optional[Path | List[Path] | str | List[str]] = Field(default_factory=list)
     content: Dict[Path, str] = Field(init=False, default_factory=dict)
     storage: Optional[KnowledgeStorage] = Field(default=None)
-
+    valid_file_paths: List[Path] = Field(default_factory=list, description="store a list of `Path` objects from self.file_paths")
 
 
     @field_validator("file_paths", mode="before")
```
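One behavioral consequence worth noting: `model_post_init` now calls `_save_documents()` at construction time, before `add()` has produced any chunks, so instantiating a `StringKnowledgeSource` already touches storage. A hedged sketch of the lifecycle (import path taken from this diff; unverified):

```python
# Hedged lifecycle sketch (import path taken from this diff; unverified).
from versionhq.knowledge.source import StringKnowledgeSource

# model_post_init -> validate_content() then _save_documents(), with
# self.chunks still empty at this point.
src = StringKnowledgeSource(content="Plain string knowledge.")

# add() does the actual work: _chunk_text(text=...) -> chunks -> save.
src.add()
assert len(src.chunks) == 1  # a short string fits in one 3000-char chunk
```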
```diff
@@ -117,70 +129,73 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
         return v
 
 
-    def
+    def validate_content(self, path: str | Path) -> List[Path]:
         """
-
+        Convert the given path to a Path object, and validate if the path exists and refers to a file.)
         """
-        self.safe_file_paths = self._process_file_paths()
-        self.validate_content()
-        self.content = self.load_content()
-
 
-
-    def load_content(self) -> Dict[Path, str]:
-        """
-        Load and preprocess file content. Should be overridden by subclasses.
-        Assume that the file path is relative to the project root in the knowledge directory.
-        """
-        pass
+        path_instance = Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
 
+        if not path_instance.exists():
+            abs_path = fetch_db_storage_path()
+            path_instance = Path(abs_path + "/" + KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
 
-
-
-
-        """
-        for path in self.safe_file_paths:
-            if not path.exists():
-                self._logger.log(
-                    "error",
-                    f"File not found: {path}. Try adding sources to the knowledge directory. If it's inside the knowledge directory, use the relative path.",
-                    color="red",
-                )
-                raise FileNotFoundError(f"File not found: {path}")
-            if not path.is_file():
-                self._logger.log("error", f"Path is not a file: {path}", color="red")
+            if not path_instance.exists():
+                self._logger.log(level="error", message="File path not found.", color="red")
+                raise ValueError()
 
+            elif not path_instance.is_file():
+                self._logger.log(level="error", message="Non-file object was given.", color="red")
+                raise ValueError()
 
-
-
-
-        else:
-            raise ValueError("No storage found to save documents.")
+        elif not path_instance.is_file():
+            self._logger.log(level="error", message="Non-file object was given.", color="red")
+            raise ValueError()
 
+        return path_instance
 
-    def convert_to_path(self, path: Path | str) -> Path:
-        """
-        Convert a path to a Path object.
-        """
-        return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
 
 
     def _process_file_paths(self) -> List[Path]:
         """
         Convert file_path to a list of Path objects.
         """
+        if not self.file_paths:
+            self._logger.log(level="error", message="Missing file paths.", color="red")
+            raise ValueError("Missing file paths.")
 
-        if self.file_paths is None:
-            raise ValueError("Your source must be provided with a file_paths: []")
 
         path_list: List[Path | str] = [self.file_paths] if isinstance(self.file_paths, (str, Path)) else list(self.file_paths) if isinstance(self.file_paths, list) else []
+        valid_path_list = list()
 
         if not path_list:
-
-
-
+            self._logger.log(level="error", message="Missing valid file paths.", color="red")
+            raise ValueError("Your source must be provided with file_paths: []")
+
+        for item in path_list:
+            valid_path = self.validate_content(item)
+            if valid_path:
+                valid_path_list.append(valid_path)
 
-        return
+        return valid_path_list
+
+
+    def model_post_init(self, _) -> None:
+        """
+        Post-initialization method to load content.
+        """
+        self.valid_file_paths = self._process_file_paths()
+        self.content = self.load_content()
+        self._save_documents()
+
+
+    @abstractmethod
+    def load_content(self) -> Dict[Path, str]:
+        """
+        Load and preprocess file content. Should be overridden by subclasses.
+        Assume that the file path is relative to the project root in the knowledge directory.
+        """
+        pass
 
 
 
```
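The reshaped `validate_content` resolves a string path in two steps: first relative to `KNOWLEDGE_DIRECTORY`, then, if that misses, under the absolute storage root from `fetch_db_storage_path()`. A standalone sketch of that lookup order; the directory values here are stand-ins, not versionhq's actual defaults:

```python
# Standalone sketch of the two-step lookup in validate_content.
# KNOWLEDGE_DIRECTORY and fetch_db_storage_path() are stand-in values.
from pathlib import Path

KNOWLEDGE_DIRECTORY = ".knowledge"  # assumption, for illustration only


def fetch_db_storage_path() -> str:
    return str(Path.home() / ".versionhq")  # assumption, for illustration only


def resolve_knowledge_path(path: str | Path) -> Path:
    # Step 1: interpret the path relative to the knowledge directory.
    candidate = Path(f"{KNOWLEDGE_DIRECTORY}/{path}") if isinstance(path, str) else path

    # Step 2: on a miss, retry under the absolute storage root.
    if not candidate.exists() and isinstance(path, str):
        candidate = Path(f"{fetch_db_storage_path()}/{KNOWLEDGE_DIRECTORY}/{path}")

    if not candidate.exists():
        raise ValueError(f"File path not found: {candidate}")
    if not candidate.is_file():
        raise ValueError(f"Non-file object was given: {candidate}")
    return candidate
```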
```diff
@@ -193,10 +208,9 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
         """
         Load and preprocess text file content.
         """
-
         content = {}
-        for path in self.safe_file_paths:
-            path = self.convert_to_path(path)
+        for path in self.valid_file_paths:
+            path = self.validate_content(path=path)
             with open(path, "r", encoding="utf-8") as f:
                 content[path] = f.read()
         return content
```
```diff
@@ -207,16 +221,10 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
         Add text file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         for _, text in self.content.items():
-            new_chunks = self._chunk_text(text)
+            new_chunks = self._chunk_text(text=text)
             self.chunks.extend(new_chunks)
-        self._save_documents()
-
 
-
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+        self._save_documents()
 
 
 
```
```diff
@@ -231,9 +239,9 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
         """
         pdfplumber = self._import_pdfplumber()
         content = {}
-        for path in self.safe_file_paths:
+        for path in self.valid_file_paths:
             text = ""
-            path = self.convert_to_path(path)
+            path = self.validate_content(path)
             with pdfplumber.open(path) as pdf:
                 for page in pdf.pages:
                     page_text = page.extract_text()
```
```diff
@@ -259,17 +267,12 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
         Add PDF file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         for _, text in self.content.items():
-            new_chunks = self._chunk_text(text)
+            new_chunks = self._chunk_text(text=text)
             self.chunks.extend(new_chunks)
+
         self._save_documents()
 
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
 
 
 class CSVKnowledgeSource(BaseFileKnowledgeSource):
```
```diff
@@ -282,7 +285,7 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
         Load and preprocess CSV file content.
         """
         content_dict = {}
-        for file_path in self.safe_file_paths:
+        for file_path in self.valid_file_paths:
             with open(file_path, "r", encoding="utf-8") as csvfile:
                 reader = csv.reader(csvfile)
                 content = ""
```
```diff
@@ -295,22 +298,14 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
 
     def add(self) -> None:
         """
-        Add CSV file content to the knowledge source, chunk it, compute embeddings,
-        and save the embeddings.
+        Add CSV file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         content_str = str(self.content) if isinstance(self.content, dict) else self.content
-        new_chunks = self._chunk_text(content_str)
+        new_chunks = self._chunk_text(text=content_str)
         self.chunks.extend(new_chunks)
         self._save_documents()
 
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
-
 
 class JSONKnowledgeSource(BaseFileKnowledgeSource):
     """
```
```diff
@@ -322,13 +317,14 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
         Load and preprocess JSON file content.
         """
         content: Dict[Path, str] = {}
-        for path in self.safe_file_paths:
-            path = self.convert_to_path(path)
+        for path in self.valid_file_paths:
+            path = self.validate_content(path)
             with open(path, "r", encoding="utf-8") as json_file:
                 data = json.load(json_file)
                 content[path] = self._json_to_text(data)
         return content
 
+
     def _json_to_text(self, data: Any, level: int = 0) -> str:
         """
         Recursively convert JSON data to a text representation.
```
```diff
@@ -351,18 +347,11 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
         Add JSON file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         content_str = str(self.content) if isinstance(self.content, dict) else self.content
-        new_chunks = self._chunk_text(content_str)
+        new_chunks = self._chunk_text(text=content_str)
         self.chunks.extend(new_chunks)
         self._save_documents()
 
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
-
 
 class ExcelKnowledgeSource(BaseFileKnowledgeSource):
     """
```
```diff
@@ -376,13 +365,14 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
 
         pd = self._import_dependencies()
         content_dict = {}
-        for file_path in self.safe_file_paths:
-            file_path = self.convert_to_path(file_path)
+        for file_path in self.valid_file_paths:
+            file_path = self.validate_content(file_path)
             df = pd.read_excel(file_path)
             content = df.to_csv(index=False)
             content_dict[file_path] = content
         return content_dict
 
+
     def _import_dependencies(self):
         """
         Dynamically import dependencies.
```
```diff
@@ -396,18 +386,12 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
                 f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
             )
 
+
     def add(self) -> None:
         """
         Add Excel file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         content_str = "\n".join(str(value) for value in self.content.values()) if isinstance(self.content, dict) else str(self.content)
-        new_chunks = self._chunk_text(content_str)
+        new_chunks = self._chunk_text(text=content_str)
         self.chunks.extend(new_chunks)
         self._save_documents()
-
-
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
```
{versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/source_docling.py

```diff
@@ -12,11 +12,11 @@ try:
 except ImportError:
     DOCLING_AVAILABLE = False
 
-from pydantic import Field
+from pydantic import Field, InstanceOf
 
 from versionhq.knowledge.source import BaseKnowledgeSource
+from versionhq.storage.utils import fetch_db_storage_path
 from versionhq._utils.vars import KNOWLEDGE_DIRECTORY
-from versionhq._utils.logger import Logger
 
 
 class DoclingSource(BaseKnowledgeSource):
```
```diff
@@ -31,10 +31,9 @@ class DoclingSource(BaseKnowledgeSource):
 
         super().__init__(*args, **kwargs)
 
-
+
     file_paths: List[Path | str] = Field(default_factory=list)
-
-    safe_file_paths: List[Path | str] = Field(default_factory=list)
+    valid_file_paths: List[Path | str] = Field(default_factory=list)
     content: List["DoclingDocument"] = Field(default_factory=list)
     document_converter: "DocumentConverter" = Field(
         default_factory=lambda: DocumentConverter(
```
```diff
@@ -51,46 +50,48 @@ class DoclingSource(BaseKnowledgeSource):
         )
     )
 
-
-
-
+
+    def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
+        conv_results_iter = self.document_converter.convert_all(self.valid_file_paths)
+        return [result.document for result in conv_results_iter]
 
 
     def _load_content(self) -> List["DoclingDocument"]:
         try:
             return self._convert_source_to_docling_documents()
         except ConversionError as e:
-            self._logger.log(
-                level="error",
-                message=f"Error loading content: {str(e)}. Supported formats: {self.document_converter.allowed_formats}",
-                color="red",
-            )
+            self._logger.log(level="error", message=f"Error loading content: {str(e)}. Supported formats: {self.document_converter.allowed_formats}", color="red")
             raise e
         except Exception as e:
-            self._logger.log(level="error", message=f"Error loading content: {e}", color="red")
+            self._logger.log(level="error", message=f"Error loading content: {str(e)}", color="red")
             raise e
 
 
-    def add(self) -> None:
-        if self.content is None:
-            return
-        for doc in self.content:
-            new_chunks_iterable = self._chunk_doc(doc)
-            self.chunks.extend(list(new_chunks_iterable))
-        self._save_documents()
-
-
-    def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
-        conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
-        return [result.document for result in conv_results_iter]
-
-
     def _chunk_doc(self, doc: "DoclingDocument") -> Iterator[str]:
         chunker = HierarchicalChunker()
         for chunk in chunker.chunk(doc):
             yield chunk.text
 
 
+    def _validate_url(self, url: str) -> bool:
+        try:
+            result = urlparse(url)
+            return all(
+                [
+                    result.scheme in ("http", "https"),
+                    result.netloc,
+                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
+                ]
+            )
+        except Exception:
+            return False
+
+
+    def model_post_init(self, _) -> None:
+        self.valid_file_paths = self.validate_content()
+        self.content.extend(self._load_content())
+
+
     def validate_content(self) -> List[Path | str]:
         processed_paths: List[Path | str] = []
         for path in self.file_paths:
```
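The `_validate_url` helper that moved up in this hunk accepts only http(s) URLs whose host contains at least one dot. Its logic, lifted verbatim into a standalone function with sample inputs:

```python
# The _validate_url check from this hunk as a standalone function.
from urllib.parse import urlparse


def validate_url(url: str) -> bool:
    try:
        result = urlparse(url)
        return all(
            [
                result.scheme in ("http", "https"),
                result.netloc,
                len(result.netloc.split(".")) >= 2,  # ensure the domain has a TLD
            ]
        )
    except Exception:
        return False


print(validate_url("https://docs.example.com/report.pdf"))  # True
print(validate_url("ftp://example.com/report.pdf"))          # False: wrong scheme
print(validate_url("https://localhost/report.pdf"))          # False: no TLD
```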
```diff
@@ -108,22 +109,23 @@ class DoclingSource(BaseKnowledgeSource):
                 if local_path.exists():
                     processed_paths.append(local_path)
                 else:
-                    raise FileNotFoundError(f"File not found: {local_path}")
+                    local_path = Path(fetch_db_storage_path() + "/" + KNOWLEDGE_DIRECTORY + "/" + path)  # try with abs. path
+                    if local_path.exists():
+                        processed_paths.append(local_path)
+                    else:
+                        raise FileNotFoundError(f"File not found: {local_path}")
             else:
                 if isinstance(path, Path):
                     processed_paths.append(path)
         return processed_paths
 
 
-    def _validate_url(self, url: str) -> bool:
-        try:
-            result = urlparse(url)
-            return all(
-                [
-                    result.scheme in ("http", "https"),
-                    result.netloc,
-                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
-                ]
-            )
-        except Exception:
-            return False
+    def add(self) -> None:
+        if self.content is None:
+            self.model_post_init()
+
+        if self.content:
+            for doc in self.content:
+                new_chunks_iterable = self._chunk_doc(doc)
+                self.chunks.extend(list(new_chunks_iterable))
+            self._save_documents()
```