versionhq 1.1.10.7__py3-none-any.whl → 1.1.10.9__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only.
- versionhq/__init__.py +1 -1
- versionhq/_utils/vars.py +2 -0
- versionhq/agent/TEMPLATES/Backstory.py +2 -2
- versionhq/agent/default_agents.py +10 -0
- versionhq/agent/model.py +127 -39
- versionhq/agent/parser.py +3 -20
- versionhq/{_utils → agent}/rpm_controller.py +22 -15
- versionhq/knowledge/__init__.py +0 -0
- versionhq/knowledge/_utils.py +11 -0
- versionhq/knowledge/embedding.py +192 -0
- versionhq/knowledge/model.py +54 -0
- versionhq/knowledge/source.py +413 -0
- versionhq/knowledge/source_docling.py +129 -0
- versionhq/knowledge/storage.py +177 -0
- versionhq/llm/model.py +76 -62
- versionhq/memory/__init__.py +0 -0
- versionhq/memory/contextual_memory.py +96 -0
- versionhq/memory/model.py +174 -0
- versionhq/storage/base.py +14 -0
- versionhq/storage/ltm_sqlite_storage.py +131 -0
- versionhq/storage/mem0_storage.py +109 -0
- versionhq/storage/rag_storage.py +231 -0
- versionhq/storage/task_output_storage.py +18 -29
- versionhq/storage/utils.py +26 -0
- versionhq/task/TEMPLATES/Description.py +5 -0
- versionhq/task/evaluate.py +122 -0
- versionhq/task/model.py +134 -43
- versionhq/team/team_planner.py +1 -1
- versionhq/tool/model.py +44 -46
- {versionhq-1.1.10.7.dist-info → versionhq-1.1.10.9.dist-info}/METADATA +48 -39
- versionhq-1.1.10.9.dist-info/RECORD +64 -0
- versionhq-1.1.10.7.dist-info/RECORD +0 -45
- {versionhq-1.1.10.7.dist-info → versionhq-1.1.10.9.dist-info}/LICENSE +0 -0
- {versionhq-1.1.10.7.dist-info → versionhq-1.1.10.9.dist-info}/WHEEL +0 -0
- {versionhq-1.1.10.7.dist-info → versionhq-1.1.10.9.dist-info}/top_level.txt +0 -0
versionhq/knowledge/model.py
@@ -0,0 +1,54 @@
+import os
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional
+from pydantic import BaseModel, ConfigDict, Field
+
+from versionhq.knowledge.storage import KnowledgeStorage
+from versionhq.knowledge.source import BaseKnowledgeSource
+
+
+class Knowledge(BaseModel):
+    """
+    Knowledge class for a collection of sources and setup for the vector store to query relevant context.
+    """
+    sources: List[BaseKnowledgeSource] = Field(default_factory=list)
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    storage: KnowledgeStorage = Field(default_factory=KnowledgeStorage)
+    embedder_config: Optional[Dict[str, Any]] = None
+    collection_name: Optional[str] = None
+
+    def __init__(
+        self,
+        collection_name: str,
+        sources: List[BaseKnowledgeSource],
+        embedder_config: Optional[Dict[str, Any]] = None,
+        storage: Optional[KnowledgeStorage] = None,
+        **data,
+    ):
+        super().__init__(**data)
+        if storage:
+            self.storage = storage
+        else:
+            self.storage = KnowledgeStorage(embedder_config=embedder_config, collection_name=collection_name)
+
+        self.sources = sources
+        self.storage.initialize_knowledge_storage()
+        for source in sources:
+            source.storage = self.storage
+            source.add()
+
+
+    def query(self, query: List[str], limit: int = 3) -> List[Dict[str, Any]]:
+        """
+        Query across all knowledge sources to find the most relevant information.
+        Returns the top_k most relevant chunks.
+        """
+
+        results = self.storage.search(query, limit)
+        return results
+
+
+    def _add_sources(self):
+        for source in self.sources:
+            source.storage = self.storage
+            source.add()
versionhq/knowledge/source.py
@@ -0,0 +1,413 @@
+import csv
+import json
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional
+from pathlib import Path
+
+import numpy as np
+from pydantic import BaseModel, ConfigDict, Field, field_validator
+
+from versionhq.knowledge.storage import KnowledgeStorage
+from versionhq._utils.vars import KNOWLEDGE_DIRECTORY
+from versionhq._utils.logger import Logger
+
+
+class BaseKnowledgeSource(BaseModel, ABC):
+    """
+    Abstract base class for knowledge sources: csv, json, excel, pdf, string, and docling.
+    """
+
+    chunk_size: int = 4000
+    chunk_overlap: int = 200
+    chunks: List[str] = Field(default_factory=list)
+    chunk_embeddings: List[np.ndarray] = Field(default_factory=list)
+
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+    storage: Optional[KnowledgeStorage] = Field(default=None)
+    metadata: Dict[str, Any] = Field(default_factory=dict)  # Currently unused
+    collection_name: Optional[str] = Field(default=None)
+
+    @abstractmethod
+    def validate_content(self) -> Any:
+        """Load and preprocess content from the source."""
+        pass
+
+    @abstractmethod
+    def add(self) -> None:
+        """Process content, chunk it, compute embeddings, and save them."""
+        pass
+
+    def get_embeddings(self) -> List[np.ndarray]:
+        """Return the list of embeddings for the chunks."""
+        return self.chunk_embeddings
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """
+        Utility method to split text into chunks.
+        """
+
+        return [
+            text[i : i + self.chunk_size]
+            for i in range(0, len(text), self.chunk_size - self.chunk_overlap)
+        ]
+
+    def _save_documents(self):
+        """
+        Save the documents to the storage.
+        This method should be called after the chunks and embeddings are generated.
+        """
+        if self.storage:
+            self.storage.save(self.chunks)
+        else:
+            raise ValueError("No storage found to save documents.")
+
+
+
+class StringKnowledgeSource(BaseKnowledgeSource):
+    """
+    A knowledge source that stores and queries plain text content using embeddings.
+    """
+
+    content: str = Field(...)
+    collection_name: Optional[str] = Field(default=None)
+
+    def model_post_init(self, _):
+        """Post-initialization method to validate content."""
+        self.validate_content()
+
+    def validate_content(self):
+        """Validate string content."""
+        if not isinstance(self.content, str):
+            raise ValueError("StringKnowledgeSource only accepts string content")
+
+    def add(self) -> None:
+        """
+        Add string content to the knowledge source, chunk it, compute embeddings, and save them.
+        """
+        new_chunks = self._chunk_text(self.content)
+        self.chunks.extend(new_chunks)
+        self._save_documents()
+
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """
+        Utility method to split text into chunks.
+        """
+        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+
+
+
+class BaseFileKnowledgeSource(BaseKnowledgeSource, ABC):
+    """Base class for knowledge sources that load content from files."""
+
+    _logger: Logger = Logger(verbose=True)
+    file_paths: Optional[Path | List[Path] | str | List[str]] = Field(default_factory=list)
+    content: Dict[Path, str] = Field(init=False, default_factory=dict)
+    storage: Optional[KnowledgeStorage] = Field(default=None)
+    safe_file_paths: List[Path] = Field(default_factory=list, description="store a list of `Path` objects from self.file_paths")
+
+
+    @field_validator("file_paths", mode="before")
+    def validate_file_path(cls, v, info):
+        """
+        Validate if at least one valid file path is provided.
+        """
+        if v is None and info.data.get("file_paths") is None:
+            raise ValueError("Either file_path or file_paths must be provided")
+        return v
+
+
+    def model_post_init(self, _) -> None:
+        """
+        Post-initialization method to load content.
+        """
+        self.safe_file_paths = self._process_file_paths()
+        self.validate_content()
+        self.content = self.load_content()
+
+
+    @abstractmethod
+    def load_content(self) -> Dict[Path, str]:
+        """
+        Load and preprocess file content. Should be overridden by subclasses.
+        Assume that the file path is relative to the project root in the knowledge directory.
+        """
+        pass
+
+
+    def validate_content(self):
+        """
+        Validate the given file paths.
+        """
+        for path in self.safe_file_paths:
+            if not path.exists():
+                self._logger.log(
+                    "error",
+                    f"File not found: {path}. Try adding sources to the knowledge directory. If it's inside the knowledge directory, use the relative path.",
+                    color="red",
+                )
+                raise FileNotFoundError(f"File not found: {path}")
+            if not path.is_file():
+                self._logger.log("error", f"Path is not a file: {path}", color="red")
+
+
+    def _save_documents(self):
+        if self.storage:
+            self.storage.save(self.chunks)
+        else:
+            raise ValueError("No storage found to save documents.")
+
+
+    def convert_to_path(self, path: Path | str) -> Path:
+        """
+        Convert a path to a Path object.
+        """
+        return Path(KNOWLEDGE_DIRECTORY + "/" + path) if isinstance(path, str) else path
+
+
+    def _process_file_paths(self) -> List[Path]:
+        """
+        Convert file_path to a list of Path objects.
+        """
+
+        if self.file_paths is None:
+            raise ValueError("Your source must be provided with a file_paths: []")
+
+        path_list: List[Path | str] = [self.file_paths] if isinstance(self.file_paths, (str, Path)) else list(self.file_paths) if isinstance(self.file_paths, list) else []
+
+        if not path_list:
+            raise ValueError(
+                "file_path/file_paths must be a Path, str, or a list of these types"
+            )
+
+        return [self.convert_to_path(path) for path in path_list]
+
+
+
+class TextFileKnowledgeSource(BaseFileKnowledgeSource):
+    """
+    A knowledge source class that stores and queries text file content using embeddings.
+    """
+
+    def load_content(self) -> Dict[Path, str]:
+        """
+        Load and preprocess text file content.
+        """
+
+        content = {}
+        for path in self.safe_file_paths:
+            path = self.convert_to_path(path)
+            with open(path, "r", encoding="utf-8") as f:
+                content[path] = f.read()
+        return content
+
+
+    def add(self) -> None:
+        """
+        Add text file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
+        """
+        for _, text in self.content.items():
+            new_chunks = self._chunk_text(text)
+            self.chunks.extend(new_chunks)
+        self._save_documents()
+
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """
+        Utility method to split text into chunks.
+        """
+        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+
+
+
+class PDFKnowledgeSource(BaseFileKnowledgeSource):
+    """
+    A knowledge source class that stores and queries PDF file content using embeddings.
+    """
+
+    def load_content(self) -> Dict[Path, str]:
+        """
+        Load and preprocess PDF file content.
+        """
+        pdfplumber = self._import_pdfplumber()
+        content = {}
+        for path in self.safe_file_paths:
+            text = ""
+            path = self.convert_to_path(path)
+            with pdfplumber.open(path) as pdf:
+                for page in pdf.pages:
+                    page_text = page.extract_text()
+                    if page_text:
+                        text += page_text + "\n"
+            content[path] = text
+        return content
+
+
+    def _import_pdfplumber(self):
+        """
+        Dynamically import pdfplumber.
+        """
+        try:
+            import pdfplumber
+            return pdfplumber
+        except ImportError:
+            raise ImportError("pdfplumber is not installed. Please install it with: pip install pdfplumber")
+
+
+    def add(self) -> None:
+        """
+        Add PDF file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
+        """
+        for _, text in self.content.items():
+            new_chunks = self._chunk_text(text)
+            self.chunks.extend(new_chunks)
+        self._save_documents()
+
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """
+        Utility method to split text into chunks.
+        """
+        return [text[i : i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+
+
+
+class CSVKnowledgeSource(BaseFileKnowledgeSource):
+    """
+    A knowledge source class that stores and queries CSV file content using embeddings.
+    """
+
+    def load_content(self) -> Dict[Path, str]:
+        """
+        Load and preprocess CSV file content.
+        """
+        content_dict = {}
+        for file_path in self.safe_file_paths:
+            with open(file_path, "r", encoding="utf-8") as csvfile:
+                reader = csv.reader(csvfile)
+                content = ""
+                for row in reader:
+                    content += " ".join(row) + "\n"
+            content_dict[file_path] = content
+
+        return content_dict
+
+
+    def add(self) -> None:
+        """
+        Add CSV file content to the knowledge source, chunk it, compute embeddings,
+        and save the embeddings.
+        """
+        content_str = str(self.content) if isinstance(self.content, dict) else self.content
+        new_chunks = self._chunk_text(content_str)
+        self.chunks.extend(new_chunks)
+        self._save_documents()
+
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """
+        Utility method to split text into chunks.
+        """
+        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+
+
+
+class JSONKnowledgeSource(BaseFileKnowledgeSource):
+    """
+    A knowledge source class that stores and queries JSON file content using embeddings.
+    """
+
+    def load_content(self) -> Dict[Path, str]:
+        """
+        Load and preprocess JSON file content.
+        """
+        content: Dict[Path, str] = {}
+        for path in self.safe_file_paths:
+            path = self.convert_to_path(path)
+            with open(path, "r", encoding="utf-8") as json_file:
+                data = json.load(json_file)
+                content[path] = self._json_to_text(data)
+        return content
+
+    def _json_to_text(self, data: Any, level: int = 0) -> str:
+        """
+        Recursively convert JSON data to a text representation.
+        """
+        text = ""
+        indent = " " * level
+        if isinstance(data, dict):
+            for key, value in data.items():
+                text += f"{indent}{key}: {self._json_to_text(value, level + 1)}\n"
+        elif isinstance(data, list):
+            for item in data:
+                text += f"{indent}- {self._json_to_text(item, level + 1)}\n"
+        else:
+            text += f"{str(data)}"
+        return text
+
+
+    def add(self) -> None:
+        """
+        Add JSON file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
+        """
+        content_str = str(self.content) if isinstance(self.content, dict) else self.content
+        new_chunks = self._chunk_text(content_str)
+        self.chunks.extend(new_chunks)
+        self._save_documents()
+
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """
+        Utility method to split text into chunks.
+        """
+        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
+
+
+
+class ExcelKnowledgeSource(BaseFileKnowledgeSource):
+    """
+    A knowledge source that stores and queries Excel file content using embeddings.
+    """
+
+    def load_content(self) -> Dict[Path, str]:
+        """
+        Load and preprocess Excel file content.
+        """
+
+        pd = self._import_dependencies()
+        content_dict = {}
+        for file_path in self.safe_file_paths:
+            file_path = self.convert_to_path(file_path)
+            df = pd.read_excel(file_path)
+            content = df.to_csv(index=False)
+            content_dict[file_path] = content
+        return content_dict
+
+    def _import_dependencies(self):
+        """
+        Dynamically import dependencies.
+        """
+        try:
+            import pandas as pd
+            return pd
+        except ImportError as e:
+            missing_package = str(e).split()[-1]
+            raise ImportError(
+                f"{missing_package} is not installed. Please install it with: pip install {missing_package}"
+            )
+
+    def add(self) -> None:
+        """
+        Add Excel file content to the knowledge source, chunk it, compute embeddings, and save the embeddings.
+        """
+        content_str = "\n".join(str(value) for value in self.content.values()) if isinstance(self.content, dict) else str(self.content)
+        new_chunks = self._chunk_text(content_str)
+        self.chunks.extend(new_chunks)
+        self._save_documents()
+
+
+    def _chunk_text(self, text: str) -> List[str]:
+        """
+        Utility method to split text into chunks.
+        """
+        return [text[i:i + self.chunk_size] for i in range(0, len(text), self.chunk_size - self.chunk_overlap)]
versionhq/knowledge/source_docling.py
@@ -0,0 +1,129 @@
+from pathlib import Path
+from typing import Iterator, List, Optional
+from urllib.parse import urlparse
+
+try:
+    from docling.datamodel.base_models import InputFormat
+    from docling.document_converter import DocumentConverter
+    from docling.exceptions import ConversionError
+    from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
+    from docling_core.types.doc.document import DoclingDocument
+    DOCLING_AVAILABLE = True
+except ImportError:
+    DOCLING_AVAILABLE = False
+
+from pydantic import Field
+
+from versionhq.knowledge.source import BaseKnowledgeSource
+from versionhq._utils.vars import KNOWLEDGE_DIRECTORY
+from versionhq._utils.logger import Logger
+
+
+class DoclingSource(BaseKnowledgeSource):
+    """
+    Default docling class for converting documents to markdown or json.
+    Supports PDF, DOCX, TXT, XLSX, PPTX, MD, image, and HTML files without any additional dependencies.
+    """
+
+    def __init__(self, *args, **kwargs):
+        if not DOCLING_AVAILABLE:
+            raise ImportError("The docling package is required. Please install the package using: $ uv add docling.")
+
+        super().__init__(*args, **kwargs)
+
+    _logger: Logger = Logger(verbose=True)
+    file_paths: List[Path | str] = Field(default_factory=list)
+    chunks: List[str] = Field(default_factory=list)
+    safe_file_paths: List[Path | str] = Field(default_factory=list)
+    content: List["DoclingDocument"] = Field(default_factory=list)
+    document_converter: "DocumentConverter" = Field(
+        default_factory=lambda: DocumentConverter(
+            allowed_formats=[
+                InputFormat.MD,
+                InputFormat.ASCIIDOC,
+                InputFormat.PDF,
+                InputFormat.DOCX,
+                InputFormat.HTML,
+                InputFormat.IMAGE,
+                InputFormat.XLSX,
+                InputFormat.PPTX,
+            ]
+        )
+    )
+
+    def model_post_init(self, _) -> None:
+        self.safe_file_paths = self.validate_content()
+        self.content = self._load_content()
+
+
+    def _load_content(self) -> List["DoclingDocument"]:
+        try:
+            return self._convert_source_to_docling_documents()
+        except ConversionError as e:
+            self._logger.log(
+                level="error",
+                message=f"Error loading content: {str(e)}. Supported formats: {self.document_converter.allowed_formats}",
+                color="red",
+            )
+            raise e
+        except Exception as e:
+            self._logger.log(level="error", message=f"Error loading content: {e}", color="red")
+            raise e
+
+
+    def add(self) -> None:
+        if self.content is None:
+            return
+        for doc in self.content:
+            new_chunks_iterable = self._chunk_doc(doc)
+            self.chunks.extend(list(new_chunks_iterable))
+        self._save_documents()
+
+
+    def _convert_source_to_docling_documents(self) -> List["DoclingDocument"]:
+        conv_results_iter = self.document_converter.convert_all(self.safe_file_paths)
+        return [result.document for result in conv_results_iter]
+
+
+    def _chunk_doc(self, doc: "DoclingDocument") -> Iterator[str]:
+        chunker = HierarchicalChunker()
+        for chunk in chunker.chunk(doc):
+            yield chunk.text
+
+
+    def validate_content(self) -> List[Path | str]:
+        processed_paths: List[Path | str] = []
+        for path in self.file_paths:
+            if isinstance(path, str):
+                if path.startswith(("http://", "https://")):
+                    try:
+                        if self._validate_url(path):
+                            processed_paths.append(path)
+                        else:
+                            raise ValueError(f"Invalid URL format: {path}")
+                    except Exception as e:
+                        raise ValueError(f"Invalid URL: {path}. Error: {str(e)}")
+                else:
+                    local_path = Path(KNOWLEDGE_DIRECTORY + "/" + path)
+                    if local_path.exists():
+                        processed_paths.append(local_path)
+                    else:
+                        raise FileNotFoundError(f"File not found: {local_path}")
+            else:
+                if isinstance(path, Path):
+                    processed_paths.append(path)
+        return processed_paths
+
+
+    def _validate_url(self, url: str) -> bool:
+        try:
+            result = urlparse(url)
+            return all(
+                [
+                    result.scheme in ("http", "https"),
+                    result.netloc,
+                    len(result.netloc.split(".")) >= 2,  # Ensure domain has TLD
+                ]
+            )
+        except Exception:
+            return False