versionhq 1.1.11.2__tar.gz → 1.1.11.4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/PKG-INFO +1 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/pyproject.toml +1 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/__init__.py +1 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/model.py +1 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/model.py +5 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/source.py +101 -117
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/source_docling.py +43 -41
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/storage.py +72 -55
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq.egg-info/PKG-INFO +1 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/agent/agent_test.py +0 -1
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/knowledge/knowledge_test.py +3 -12
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/uv.lock +10 -10
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.github/workflows/publish.yml +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.github/workflows/publish_testpypi.yml +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.github/workflows/run_tests.yml +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.github/workflows/security_check.yml +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.gitignore +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.pre-commit-config.yaml +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/.python-version +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/LICENSE +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/README.md +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/SECURITY.md +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/db/preprocess.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/requirements-dev.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/requirements.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/runtime.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/setup.cfg +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/_utils/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/_utils/i18n.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/_utils/logger.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/_utils/process_config.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/_utils/usage_metrics.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/_utils/vars.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/TEMPLATES/Backstory.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/TEMPLATES/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/default_agents.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/parser.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/rpm_controller.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/cli/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/customer/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/customer/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/product/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/product/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/workflow/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/clients/workflow/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/_utils.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/embedding.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/llm/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/llm/llm_vars.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/llm/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/memory/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/memory/contextual_memory.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/memory/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/base.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/ltm_sqlite_storage.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/mem0_storage.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/rag_storage.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/task_output_storage.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/storage/utils.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/TEMPLATES/Description.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/evaluate.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/formatter.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/log_handler.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/task/structured_response.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/team/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/team/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/team/team_planner.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/cache_handler.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/composio_tool.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/composio_tool_vars.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/decorator.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/model.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/tool/tool_handler.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq.egg-info/SOURCES.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq.egg-info/dependency_links.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq.egg-info/requires.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq.egg-info/top_level.txt +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/agent/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/cli/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/clients/customer_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/clients/product_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/clients/workflow_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/conftest.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/knowledge/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/knowledge/mock_report_compressed.pdf +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/llm/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/llm/llm_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/memory/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/memory/memory_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/task/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/task/task_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/team/Prompts/Demo_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/team/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/team/team_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/tool/__init__.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/tool/composio_test.py +0 -0
- {versionhq-1.1.11.2 → versionhq-1.1.11.4}/tests/tool/tool_test.py +0 -0
{versionhq-1.1.11.2 → versionhq-1.1.11.4}/pyproject.toml

```diff
@@ -15,7 +15,7 @@ exclude = ["test*", "__pycache__", "*.egg-info"]
 
 [project]
 name = "versionhq"
-version = "1.1.11.2"
+version = "1.1.11.4"
 authors = [{ name = "Kuriko Iwai", email = "kuriko@versi0n.io" }]
 description = "LLM orchestration frameworks for model-agnostic AI agents that handle complex outbound workflows"
 readme = "README.md"
```
{versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/agent/model.py

```diff
@@ -469,7 +469,7 @@ class Agent(BaseModel):
             task_prompt += context
 
         if self._knowledge:
-            agent_knowledge = self._knowledge.query(query=[task_prompt,])
+            agent_knowledge = self._knowledge.query(query=[task_prompt,], limit=5)
             if agent_knowledge:
                 agent_knowledge_context = extract_knowledge_context(knowledge_snippets=agent_knowledge)
                 if agent_knowledge_context:
```
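The only functional change in this hunk is that the agent now caps knowledge retrieval at five snippets. A minimal sketch of the observable effect, using stand-in classes rather than versionhq's real `Knowledge` and `extract_knowledge_context` (the names come from this diff; the bodies below are illustrative assumptions):

```python
# Minimal sketch of the capped knowledge lookup introduced above.
# The stub bodies are stand-ins, not versionhq's real implementations.
from typing import Dict, List


class FakeKnowledge:
    """Stand-in for versionhq's Knowledge model (illustrative only)."""

    def __init__(self, snippets: List[Dict[str, str]]):
        self.snippets = snippets

    def query(self, query: List[str], limit: int = 3) -> List[Dict[str, str]]:
        # The real class delegates to vector storage; here we just cap the
        # results, which is the observable effect of the new `limit=5`.
        return self.snippets[:limit]


def extract_knowledge_context(knowledge_snippets: List[Dict[str, str]]) -> str:
    # Stand-in: join snippet texts into a context block for the prompt.
    return "\n".join(s["context"] for s in knowledge_snippets if s.get("context"))


task_prompt = "Summarize the Q3 outbound campaign results."
knowledge = FakeKnowledge([{"context": f"snippet {i}"} for i in range(10)])

agent_knowledge = knowledge.query(query=[task_prompt], limit=5)
if agent_knowledge:
    task_prompt += "\n" + extract_knowledge_context(knowledge_snippets=agent_knowledge)

print(task_prompt)  # the prompt now carries at most five knowledge snippets
```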
{versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/model.py

```diff
@@ -26,13 +26,17 @@ class Knowledge(BaseModel):
         **data,
     ):
         super().__init__(**data)
+
+
         if storage:
             self.storage = storage
         else:
             self.storage = KnowledgeStorage(embedder_config=embedder_config, collection_name=collection_name)
 
-        self.
+        self.storage._set_embedding_function(embedder_config=embedder_config)
         self.storage.initialize_knowledge_storage()
+
+        self.sources = sources
         for source in sources:
             source.storage = self.storage
             source.add()
```
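Taken together with the storage hunks below, the constructor now follows a fixed order: choose or build a `KnowledgeStorage`, attach the embedding function, initialize the collection, keep `sources` on the model, then let each source chunk, embed, and save itself. A hedged usage sketch (import paths and keyword arguments are inferred from this diff, not verified against an installed 1.1.11.4):

```python
# Hedged usage sketch; import paths and keyword arguments are inferred
# from this diff and may differ in the released package.
from versionhq.knowledge.model import Knowledge
from versionhq.knowledge.source import StringKnowledgeSource

source = StringKnowledgeSource(content="Customer favorite color: red.")

# __init__ now runs, in order:
#   1. self.storage = given storage or KnowledgeStorage(...)
#   2. self.storage._set_embedding_function(embedder_config=...)
#   3. self.storage.initialize_knowledge_storage()
#   4. self.sources = sources; each source gets the storage and calls add()
knowledge = Knowledge(collection_name="demo", sources=[source])

# query() is the same entry point the Agent hunk above calls with limit=5.
results = knowledge.query(query=["What is the favorite color?"], limit=5)
```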
{versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/source.py

```diff
@@ -8,6 +8,7 @@ import numpy as np
 from pydantic import BaseModel, ConfigDict, Field, field_validator
 
 from versionhq.knowledge.storage import KnowledgeStorage
+from versionhq.storage.utils import fetch_db_storage_path
 from versionhq._utils.vars import KNOWLEDGE_DIRECTORY
 from versionhq._utils.logger import Logger
 
```
```diff
@@ -16,50 +17,66 @@ class BaseKnowledgeSource(BaseModel, ABC):
     """
     Abstract base class for knowledge sources: csv, json, excel, pdf, string, and docling.
     """
+    _logger: Logger = Logger(verbose=True)
 
-    chunk_size: int =
+    chunk_size: int = 3000
     chunk_overlap: int = 200
     chunks: List[str] = Field(default_factory=list)
     chunk_embeddings: List[np.ndarray] = Field(default_factory=list)
 
     model_config = ConfigDict(arbitrary_types_allowed=True)
     storage: Optional[KnowledgeStorage] = Field(default=None)
-    metadata: Dict[str, Any] = Field(default_factory=dict)
+    metadata: Dict[str, Any] = Field(default_factory=dict)
     collection_name: Optional[str] = Field(default=None)
 
+
     @abstractmethod
-    def validate_content(self) -> Any:
+    def validate_content(self, **kwargs) -> Any:
         """Load and preprocess content from the source."""
         pass
 
+
     @abstractmethod
     def add(self) -> None:
         """Process content, chunk it, compute embeddings, and save them."""
         pass
 
+
     def get_embeddings(self) -> List[np.ndarray]:
         """Return the list of embeddings for the chunks."""
         return self.chunk_embeddings
 
+
     def _chunk_text(self, text: str) -> List[str]:
         """
         Utility method to split text into chunks.
         """
+        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
 
-        return [
-            text[i : i + self.chunk_size]
-            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
-        ]
 
-    def _save_documents(self):
+    def _save_documents(self) -> None:
         """
-        Save the documents to the storage.
+        Save the documents to the given (or newly created) storage on ChromaDB.
         This method should be called after the chunks and embeddings are generated.
         """
-        if self.
-
-
-
+        # if not self.chunks or self.chunk_embeddings:
+        #     self._logger.log(level="warning", message="Chunks or chunk embeddings are missing. Save docs after creating them.", color="yellow")
+        #     return
+
+        try:
+            if self.storage:
+                self.storage.save(documents=self.chunks, metadata=self.metadata)
+
+            else:
+                storage = KnowledgeStorage(collection_name=self.collection_name) if self.collection_name else KnowledgeStorage()
+                storage.initialize_knowledge_storage()
+                self.storage = storage
+                self.storage.save(documents=self.chunks, metadata=self.metadata)
+
+        except:
+            self._logger.log(level="error", message="No storage found or created to save the documents.", color="red")
+            return
+            # raise ValueError("No storage found to save documents.")
 
 
 
```
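The consolidated `_chunk_text` (previously re-declared by every file source, as the later hunks remove) steps through the text with stride `chunk_size - chunk_overlap`, so with the new defaults of 3000 and 200 each chunk starts 2800 characters after the previous one and neighbors share 200 characters. A self-contained sketch of that arithmetic:

```python
# Standalone copy of the chunking comprehension from this hunk,
# using the new defaults (chunk_size=3000, chunk_overlap=200 -> stride 2800).
from typing import List


def chunk_text(text: str, chunk_size: int = 3000, chunk_overlap: int = 200) -> List[str]:
    # Identical expression to BaseKnowledgeSource._chunk_text above.
    return [text[i : i + chunk_size] for i in range(0, len(text), chunk_size - chunk_overlap)]


chunks = chunk_text("x" * 6000)
print([len(c) for c in chunks])  # [3000, 3000, 400] -- starts at 0, 2800, 5600
```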
```diff
@@ -74,37 +91,32 @@ class StringKnowledgeSource(BaseKnowledgeSource):
     def model_post_init(self, _):
         """Post-initialization method to validate content."""
         self.validate_content()
+        self._save_documents()
+
 
     def validate_content(self):
         """Validate string content."""
         if not isinstance(self.content, str):
             raise ValueError("StringKnowledgeSource only accepts string content")
 
+
     def add(self) -> None:
         """
         Add string content to the knowledge source, chunk it, compute embeddings, and save them.
         """
-        new_chunks = self._chunk_text(self.content)
+        new_chunks = self._chunk_text(text=self.content)
         self.chunks.extend(new_chunks)
         self._save_documents()
 
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
-
 
 class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
     """Base class for knowledge sources that load content from files."""
 
-    _logger: Logger = Logger(verbose=True)
     file_paths: Optional[Path | List[Path] | str | List[str]] = Field(default_factory=list)
     content: Dict[Path, str] = Field(init=False, default_factory=dict)
     storage: Optional[KnowledgeStorage] = Field(default=None)
-
+    valid_file_paths: List[Path] = Field(default_factory=list, description="store a list of `Path` objects from self.file_paths")
 
 
     @field_validator("file_paths", mode="before")
```
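One behavioral consequence worth noting: `model_post_init` now calls `_save_documents()` at construction time, before `add()` has produced any chunks, so instantiating a `StringKnowledgeSource` already touches storage. A hedged sketch of the lifecycle (import path taken from this diff; unverified):

```python
# Hedged lifecycle sketch (import path taken from this diff; unverified).
from versionhq.knowledge.source import StringKnowledgeSource

# model_post_init -> validate_content() then _save_documents(), with
# self.chunks still empty at this point.
src = StringKnowledgeSource(content="Plain string knowledge.")

# add() does the actual work: _chunk_text(text=...) -> chunks -> save.
src.add()
assert len(src.chunks) == 1  # a short string fits in one 3000-char chunk
```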
```diff
@@ -117,70 +129,73 @@ class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
         return v
 
 
-    def
+    def validate_content(self, path: str | Path) -> List[Path]:
         """
-
+        Convert the given path to a Path object, and validate if the path exists and refers to a file.)
         """
-        self.safe_file_paths = self._process_file_paths()
-        self.validate_content()
-        self.content = self.load_content()
-
 
-
-    def load_content(self) -> Dict[Path, str]:
-        """
-        Load and preprocess file content. Should be overridden by subclasses.
-        Assume that the file path is relative to the project root in the knowledge directory.
-        """
-        pass
+        path_instance = Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
 
+        if not path_instance.exists():
+            abs_path = fetch_db_storage_path()
+            path_instance = Path(abs_path + "/" + KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
 
-
-
-
-        """
-        for path in self.safe_file_paths:
-            if not path.exists():
-                self._logger.log(
-                    "error",
-                    f"File not found: {path}. Try adding sources to the knowledge directory. If it's inside the knowledge directory, use the relative path.",
-                    color="red",
-                )
-                raise FileNotFoundError(f"File not found: {path}")
-            if not path.is_file():
-                self._logger.log("error", f"Path is not a file: {path}", color="red")
+            if not path_instance.exists():
+                self._logger.log(level="error", message="File path not found.", color="red")
+                raise ValueError()
 
+            elif not path_instance.is_file():
+                self._logger.log(level="error", message="Non-file object was given.", color="red")
+                raise ValueError()
 
-
-
-
-        else:
-            raise ValueError("No storage found to save documents.")
+        elif not path_instance.is_file():
+            self._logger.log(level="error", message="Non-file object was given.", color="red")
+            raise ValueError()
 
+        return path_instance
 
-    def convert_to_path(self, path: Path | str) -> Path:
-        """
-        Convert a path to a Path object.
-        """
-        return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
 
 
     def _process_file_paths(self) -> List[Path]:
         """
         Convert file_path to a list of Path objects.
         """
+        if not self.file_paths:
+            self._logger.log(level="error", message="Missing file paths.", color="red")
+            raise ValueError("Missing file paths.")
 
-        if self.file_paths is None:
-            raise ValueError("Your source must be provided with a file_paths: []")
 
         path_list: List[Path | str] = [self.file_paths] if isinstance(self.file_paths, (str, Path)) else list(self.file_paths) if isinstance(self.file_paths, list) else []
+        valid_path_list = list()
 
         if not path_list:
-
-
-
+            self._logger.log(level="error", message="Missing valid file paths.", color="red")
+            raise ValueError("Your source must be provided with file_paths: []")
+
+        for item in path_list:
+            valid_path = self.validate_content(item)
+            if valid_path:
+                valid_path_list.append(valid_path)
 
-        return
+        return valid_path_list
+
+
+    def model_post_init(self, _) -> None:
+        """
+        Post-initialization method to load content.
+        """
+        self.valid_file_paths = self._process_file_paths()
+        self.content = self.load_content()
+        self._save_documents()
+
+
+    @abstractmethod
+    def load_content(self) -> Dict[Path, str]:
+        """
+        Load and preprocess file content. Should be overridden by subclasses.
+        Assume that the file path is relative to the project root in the knowledge directory.
+        """
+        pass
 
 
 
```
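The reshaped `validate_content` resolves a string path in two steps: first relative to `KNOWLEDGE_DIRECTORY`, then, if that misses, under the absolute storage root from `fetch_db_storage_path()`. A standalone sketch of that lookup order; the directory values here are stand-ins, not versionhq's actual defaults:

```python
# Standalone sketch of the two-step lookup in validate_content.
# KNOWLEDGE_DIRECTORY and fetch_db_storage_path() are stand-in values.
from pathlib import Path

KNOWLEDGE_DIRECTORY = ".knowledge"  # assumption, for illustration only


def fetch_db_storage_path() -> str:
    return str(Path.home() / ".versionhq")  # assumption, for illustration only


def resolve_knowledge_path(path: str | Path) -> Path:
    # Step 1: interpret the path relative to the knowledge directory.
    candidate = Path(f"{KNOWLEDGE_DIRECTORY}/{path}") if isinstance(path, str) else path

    # Step 2: on a miss, retry under the absolute storage root.
    if not candidate.exists() and isinstance(path, str):
        candidate = Path(f"{fetch_db_storage_path()}/{KNOWLEDGE_DIRECTORY}/{path}")

    if not candidate.exists():
        raise ValueError(f"File path not found: {candidate}")
    if not candidate.is_file():
        raise ValueError(f"Non-file object was given: {candidate}")
    return candidate
```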
```diff
@@ -193,10 +208,9 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
         """
         Load and preprocess text file content.
         """
-
         content = {}
-        for path in self.safe_file_paths:
-            path = self.convert_to_path(path)
+        for path in self.valid_file_paths:
+            path = self.validate_content(path=path)
             with open(path, "r", encoding="utf-8") as f:
                 content[path] = f.read()
         return content
```
```diff
@@ -207,16 +221,10 @@ class TextFileKnowledgeSource(BaseFileKnowledgeSource):
         Add text file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         for _, text in self.content.items():
-            new_chunks = self._chunk_text(text)
+            new_chunks = self._chunk_text(text=text)
             self.chunks.extend(new_chunks)
-        self._save_documents()
-
 
-
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+        self._save_documents()
 
 
 
```
```diff
@@ -231,9 +239,9 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
         """
         pdfplumber = self._import_pdfplumber()
         content = {}
-        for path in self.safe_file_paths:
+        for path in self.valid_file_paths:
             text = ""
-            path = self.convert_to_path(path)
+            path = self.validate_content(path)
             with pdfplumber.open(path) as pdf:
                 for page in pdf.pages:
                     page_text = page.extract_text()
```
```diff
@@ -259,17 +267,12 @@ class PDFKnowledgeSource(BaseFileKnowledgeSource):
         Add PDF file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         for _, text in self.content.items():
-            new_chunks = self._chunk_text(text)
+            new_chunks = self._chunk_text(text=text)
             self.chunks.extend(new_chunks)
+
         self._save_documents()
 
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
 
 
 class CSVKnowledgeSource(BaseFileKnowledgeSource):
```
```diff
@@ -282,7 +285,7 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
         Load and preprocess CSV file content.
         """
         content_dict = {}
-        for file_path in self.safe_file_paths:
+        for file_path in self.valid_file_paths:
             with open(file_path, "r", encoding="utf-8") as csvfile:
                 reader = csv.reader(csvfile)
                 content = ""
```
```diff
@@ -295,22 +298,14 @@ class CSVKnowledgeSource(BaseFileKnowledgeSource):
 
     def add(self) -> None:
         """
-        Add CSV file content to the knowledge source, chunk it, compute embeddings,
-        and save the embeddings.
+        Add CSV file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         content_str = str(self.content) if isinstance(self.content, dict) else self.content
-        new_chunks = self._chunk_text(content_str)
+        new_chunks = self._chunk_text(text=content_str)
         self.chunks.extend(new_chunks)
         self._save_documents()
 
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
-
 
 class JSONKnowledgeSource(BaseFileKnowledgeSource):
     """
```
```diff
@@ -322,13 +317,14 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
         Load and preprocess JSON file content.
         """
         content: Dict[Path, str] = {}
-        for path in self.safe_file_paths:
-            path = self.convert_to_path(path)
+        for path in self.valid_file_paths:
+            path = self.validate_content(path)
             with open(path, "r", encoding="utf-8") as json_file:
                 data = json.load(json_file)
                 content[path] = self._json_to_text(data)
         return content
 
+
     def _json_to_text(self, data: Any, level: int = 0) -> str:
         """
         Recursively convert JSON data to a text representation.
```
```diff
@@ -351,18 +347,11 @@ class JSONKnowledgeSource(BaseFileKnowledgeSource):
         Add JSON file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         content_str = str(self.content) if isinstance(self.content, dict) else self.content
-        new_chunks = self._chunk_text(content_str)
+        new_chunks = self._chunk_text(text=content_str)
         self.chunks.extend(new_chunks)
         self._save_documents()
 
 
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
-
-
 
 class ExcelKnowledgeSource(BaseFileKnowledgeSource):
     """
```
```diff
@@ -376,13 +365,14 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
 
         pd = self._import_dependencies()
         content_dict = {}
-        for file_path in self.safe_file_paths:
-            file_path = self.convert_to_path(file_path)
+        for file_path in self.valid_file_paths:
+            file_path = self.validate_content(file_path)
             df = pd.read_excel(file_path)
             content = df.to_csv(index=False)
             content_dict[file_path] = content
         return content_dict
 
+
     def _import_dependencies(self):
         """
         Dynamically import dependencies.
```
```diff
@@ -396,18 +386,12 @@ class ExcelKnowledgeSource(BaseFileKnowledgeSource):
                 f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
             )
 
+
     def add(self) -> None:
         """
         Add Excel file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
         """
         content_str = "\n".join(str(value) for value in self.content.values()) if isinstance(self.content, dict) else str(self.content)
-        new_chunks = self._chunk_text(content_str)
+        new_chunks = self._chunk_text(text=content_str)
         self.chunks.extend(new_chunks)
         self._save_documents()
-
-
-    def _chunk_text(self, text: str) -> List[str]:
-        """
-        Utility method to split text into chunks.
-        """
-        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
```
{versionhq-1.1.11.2 → versionhq-1.1.11.4}/src/versionhq/knowledge/source_docling.py

```diff
@@ -12,11 +12,11 @@ try:
 except ImportError:
     DOCLING_AVAILABLE = False
 
-from pydantic import Field
+from pydantic import Field, InstanceOf
 
 from versionhq.knowledge.source import BaseKnowledgeSource
+from versionhq.storage.utils import fetch_db_storage_path
 from versionhq._utils.vars import KNOWLEDGE_DIRECTORY
-from versionhq._utils.logger import Logger
 
 
 class DoclingSource(BaseKnowledgeSource):
```
```diff
@@ -31,10 +31,9 @@ class DoclingSource(BaseKnowledgeSource):
 
         super().__init__(*args, **kwargs)
 
-
+
     file_paths: List[Path | str] = Field(default_factory=list)
-
-    safe_file_paths: List[Path | str] = Field(default_factory=list)
+    valid_file_paths: List[Path | str] = Field(default_factory=list)
     content: List["DoclingDocument"] = Field(default_factory=list)
     document_converter: "DocumentConverter" = Field(
         default_factory=lambda: DocumentConverter(
```
```diff
@@ -51,46 +50,48 @@ class DoclingSource(BaseKnowledgeSource):
         )
     )
 
-
-
-
+
+    def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
+        conv_results_iter = self.document_converter.convert_all(self.valid_file_paths)
+        return [result.document for result in conv_results_iter]
 
 
     def _load_content(self) -> List["DoclingDocument"]:
         try:
             return self._convert_source_to_docling_documents()
         except ConversionError as e:
-            self._logger.log(
-                level="error",
-                message=f"Error loading content: {str(e)}. Supported formats: {self.document_converter.allowed_formats}",
-                color="red",
-            )
+            self._logger.log(level="error", message=f"Error loading content: {str(e)}. Supported formats: {self.document_converter.allowed_formats}", color="red")
             raise e
         except Exception as e:
-            self._logger.log(level="error", message=f"Error loading content: {e}", color="red")
+            self._logger.log(level="error", message=f"Error loading content: {str(e)}", color="red")
             raise e
 
 
-    def add(self) -> None:
-        if self.content is None:
-            return
-        for doc in self.content:
-            new_chunks_iterable = self._chunk_doc(doc)
-            self.chunks.extend(list(new_chunks_iterable))
-        self._save_documents()
-
-
-    def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
-        conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
-        return [result.document for result in conv_results_iter]
-
-
     def _chunk_doc(self, doc: "DoclingDocument") -> Iterator[str]:
         chunker = HierarchicalChunker()
         for chunk in chunker.chunk(doc):
             yield chunk.text
 
 
+    def _validate_url(self, url: str) -> bool:
+        try:
+            result = urlparse(url)
+            return all(
+                [
+                    result.scheme in ("http", "https"),
+                    result.netloc,
+                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
+                ]
+            )
+        except Exception:
+            return False
+
+
+    def model_post_init(self, _) -> None:
+        self.valid_file_paths = self.validate_content()
+        self.content.extend(self._load_content())
+
+
     def validate_content(self) -> List[Path | str]:
         processed_paths: List[Path | str] = []
         for path in self.file_paths:
```
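The `_validate_url` helper that moved up in this hunk accepts only http(s) URLs whose host contains at least one dot. Its logic, lifted verbatim into a standalone function with sample inputs:

```python
# The _validate_url check from this hunk as a standalone function.
from urllib.parse import urlparse


def validate_url(url: str) -> bool:
    try:
        result = urlparse(url)
        return all(
            [
                result.scheme in ("http", "https"),
                result.netloc,
                len(result.netloc.split(".")) >= 2,  # ensure the domain has a TLD
            ]
        )
    except Exception:
        return False


print(validate_url("https://docs.example.com/report.pdf"))  # True
print(validate_url("ftp://example.com/report.pdf"))          # False: wrong scheme
print(validate_url("https://localhost/report.pdf"))          # False: no TLD
```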
```diff
@@ -108,22 +109,23 @@ class DoclingSource(BaseKnowledgeSource):
                 if local_path.exists():
                     processed_paths.append(local_path)
                 else:
-                    raise FileNotFoundError(f"File not found: {local_path}")
+                    local_path = Path(fetch_db_storage_path() + "/" + KNOWLEDGE_DIRECTORY + "/" + path)  # try with abs. path
+                    if local_path.exists():
+                        processed_paths.append(local_path)
+                    else:
+                        raise FileNotFoundError(f"File not found: {local_path}")
             else:
                 if isinstance(path, Path):
                     processed_paths.append(path)
         return processed_paths
 
 
-    def _validate_url(self, url: str) -> bool:
-        try:
-            result = urlparse(url)
-            return all(
-                [
-                    result.scheme in ("http", "https"),
-                    result.netloc,
-                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
-                ]
-            )
-        except Exception:
-            return False
+    def add(self) -> None:
+        if self.content is None:
+            self.model_post_init()
+
+        if self.content:
+            for doc in self.content:
+                new_chunks_iterable = self._chunk_doc(doc)
+                self.chunks.extend(list(new_chunks_iterable))
+            self._save_documents()
```