agno 2.3.8__py3-none-any.whl → 2.3.10__py3-none-any.whl
This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between versions as they appear in their public registries.
- agno/agent/agent.py +134 -94
- agno/db/mysql/__init__.py +2 -1
- agno/db/mysql/async_mysql.py +2888 -0
- agno/db/mysql/mysql.py +17 -8
- agno/db/mysql/utils.py +139 -6
- agno/db/postgres/async_postgres.py +10 -5
- agno/db/postgres/postgres.py +7 -2
- agno/db/schemas/evals.py +1 -0
- agno/db/singlestore/singlestore.py +5 -1
- agno/db/sqlite/async_sqlite.py +3 -3
- agno/eval/__init__.py +10 -0
- agno/eval/accuracy.py +11 -8
- agno/eval/agent_as_judge.py +861 -0
- agno/eval/base.py +29 -0
- agno/eval/utils.py +2 -1
- agno/exceptions.py +7 -0
- agno/knowledge/embedder/openai.py +8 -8
- agno/knowledge/knowledge.py +1142 -176
- agno/media.py +22 -6
- agno/models/aws/claude.py +8 -7
- agno/models/base.py +61 -2
- agno/models/deepseek/deepseek.py +67 -0
- agno/models/google/gemini.py +134 -51
- agno/models/google/utils.py +22 -0
- agno/models/message.py +5 -0
- agno/models/openai/chat.py +4 -0
- agno/os/app.py +64 -74
- agno/os/interfaces/a2a/router.py +3 -4
- agno/os/interfaces/agui/router.py +2 -0
- agno/os/router.py +3 -1607
- agno/os/routers/agents/__init__.py +3 -0
- agno/os/routers/agents/router.py +581 -0
- agno/os/routers/agents/schema.py +261 -0
- agno/os/routers/evals/evals.py +26 -6
- agno/os/routers/evals/schemas.py +34 -2
- agno/os/routers/evals/utils.py +77 -18
- agno/os/routers/knowledge/knowledge.py +1 -1
- agno/os/routers/teams/__init__.py +3 -0
- agno/os/routers/teams/router.py +496 -0
- agno/os/routers/teams/schema.py +257 -0
- agno/os/routers/workflows/__init__.py +3 -0
- agno/os/routers/workflows/router.py +545 -0
- agno/os/routers/workflows/schema.py +75 -0
- agno/os/schema.py +1 -559
- agno/os/utils.py +139 -2
- agno/team/team.py +87 -24
- agno/tools/file_generation.py +12 -6
- agno/tools/firecrawl.py +15 -7
- agno/tools/function.py +37 -23
- agno/tools/shopify.py +1519 -0
- agno/tools/spotify.py +2 -5
- agno/utils/hooks.py +64 -5
- agno/utils/http.py +2 -2
- agno/utils/media.py +11 -1
- agno/utils/print_response/agent.py +8 -0
- agno/utils/print_response/team.py +8 -0
- agno/vectordb/pgvector/pgvector.py +88 -51
- agno/workflow/parallel.py +5 -3
- agno/workflow/step.py +14 -2
- agno/workflow/types.py +38 -2
- agno/workflow/workflow.py +12 -4
- {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/METADATA +7 -2
- {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/RECORD +66 -52
- {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/WHEEL +0 -0
- {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/licenses/LICENSE +0 -0
- {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/top_level.txt +0 -0
agno/knowledge/knowledge.py
CHANGED
@@ -198,8 +198,6 @@ class Knowledge:
         """
         Synchronously add multiple content items to the knowledge base.
 
-        This method wraps the asynchronous add_contents method
-
         Supports two usage patterns:
         1. Pass a list of content dictionaries as first argument
         2. Pass keyword arguments with paths, urls, metadata, etc.
@@ -214,11 +212,109 @@ class Knowledge:
             reader: Optional reader to use for processing content
             include: Optional list of file patterns to include
             exclude: Optional list of file patterns to exclude
-            upsert: Whether to update existing content if it already exists
-            skip_if_exists: Whether to skip adding content if it already exists
+            upsert: Whether to update existing content if it already exists (only used when skip_if_exists=False)
+            skip_if_exists: Whether to skip adding content if it already exists (default: True)
             remote_content: Optional remote content (S3, GCS, etc.) to add
         """
-
+        if args and isinstance(args[0], list):
+            arguments = args[0]
+            upsert = kwargs.get("upsert", True)
+            skip_if_exists = kwargs.get("skip_if_exists", False)
+            for argument in arguments:
+                self.add_content(
+                    name=argument.get("name"),
+                    description=argument.get("description"),
+                    path=argument.get("path"),
+                    url=argument.get("url"),
+                    metadata=argument.get("metadata"),
+                    topics=argument.get("topics"),
+                    text_content=argument.get("text_content"),
+                    reader=argument.get("reader"),
+                    include=argument.get("include"),
+                    exclude=argument.get("exclude"),
+                    upsert=argument.get("upsert", upsert),
+                    skip_if_exists=argument.get("skip_if_exists", skip_if_exists),
+                    remote_content=argument.get("remote_content", None),
+                )
+
+        elif kwargs:
+            name = kwargs.get("name", [])
+            metadata = kwargs.get("metadata", {})
+            description = kwargs.get("description", [])
+            topics = kwargs.get("topics", [])
+            reader = kwargs.get("reader", None)
+            paths = kwargs.get("paths", [])
+            urls = kwargs.get("urls", [])
+            text_contents = kwargs.get("text_contents", [])
+            include = kwargs.get("include")
+            exclude = kwargs.get("exclude")
+            upsert = kwargs.get("upsert", True)
+            skip_if_exists = kwargs.get("skip_if_exists", False)
+            remote_content = kwargs.get("remote_content", None)
+            for path in paths:
+                self.add_content(
+                    name=name,
+                    description=description,
+                    path=path,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+            for url in urls:
+                self.add_content(
+                    name=name,
+                    description=description,
+                    url=url,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+            for i, text_content in enumerate(text_contents):
+                content_name = f"{name}_{i}" if name else f"text_content_{i}"
+                log_debug(f"Adding text content: {content_name}")
+                self.add_content(
+                    name=content_name,
+                    description=description,
+                    text_content=text_content,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+            if topics:
+                self.add_content(
+                    name=name,
+                    description=description,
+                    topics=topics,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+
+            if remote_content:
+                self.add_content(
+                    name=name,
+                    metadata=metadata,
+                    description=description,
+                    remote_content=remote_content,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+
+        else:
+            raise ValueError("Invalid usage of add_contents.")
 
     # --- Add Content ---
 
@@ -255,7 +351,7 @@ class Knowledge:
         include: Optional[List[str]] = None,
         exclude: Optional[List[str]] = None,
         upsert: bool = True,
-        skip_if_exists: bool = True,
+        skip_if_exists: bool = False,
         auth: Optional[ContentAuth] = None,
     ) -> None:
         # Validation: At least one of the parameters must be provided
@@ -265,10 +361,6 @@ class Knowledge:
             )
             return
 
-        if not skip_if_exists:
-            log_debug("skip_if_exists is disabled, disabling upsert")
-            upsert = False
-
         content = None
         file_data = None
         if text_content:
@@ -289,7 +381,7 @@ class Knowledge:
         content.content_hash = self._build_content_hash(content)
         content.id = generate_id(content.content_hash)
 
-        await self._load_content(content, upsert, skip_if_exists, include, exclude)
+        await self._load_content_async(content, upsert, skip_if_exists, include, exclude)
 
     @overload
     def add_content(
@@ -342,27 +434,37 @@ class Knowledge:
             reader: Optional custom reader for processing the content
             include: Optional list of file patterns to include
             exclude: Optional list of file patterns to exclude
-            upsert: Whether to update existing content if it already exists
-            skip_if_exists: Whether to skip adding content if it already exists
+            upsert: Whether to update existing content if it already exists (only used when skip_if_exists=False)
+            skip_if_exists: Whether to skip adding content if it already exists (default: False)
         """
-
-
-
-
-                path=path,
-                url=url,
-                text_content=text_content,
-                metadata=metadata,
-                topics=topics,
-                remote_content=remote_content,
-                reader=reader,
-                include=include,
-                exclude=exclude,
-                upsert=upsert,
-                skip_if_exists=skip_if_exists,
-                auth=auth,
+        # Validation: At least one of the parameters must be provided
+        if all(argument is None for argument in [path, url, text_content, topics, remote_content]):
+            log_warning(
+                "At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided."
             )
+            return
+
+        content = None
+        file_data = None
+        if text_content:
+            file_data = FileData(content=text_content, type="Text")
+
+        content = Content(
+            name=name,
+            description=description,
+            path=path,
+            url=url,
+            file_data=file_data if file_data else None,
+            metadata=metadata,
+            topics=topics,
+            remote_content=remote_content,
+            reader=reader,
+            auth=auth,
         )
+        content.content_hash = self._build_content_hash(content)
+        content.id = generate_id(content.content_hash)
+
+        self._load_content(content, upsert, skip_if_exists, include, exclude)
 
     def _should_skip(self, content_hash: str, skip_if_exists: bool) -> bool:
         """
@@ -384,7 +486,148 @@ class Knowledge:
 
         return False
 
-
+    def _select_reader_by_extension(
+        self, file_extension: str, provided_reader: Optional[Reader] = None
+    ) -> Tuple[Optional[Reader], str]:
+        """
+        Select a reader based on file extension.
+
+        Args:
+            file_extension: File extension (e.g., '.pdf', '.csv')
+            provided_reader: Optional reader already provided
+
+        Returns:
+            Tuple of (reader, name) where name may be adjusted based on extension
+        """
+        if provided_reader:
+            return provided_reader, ""
+
+        file_extension = file_extension.lower()
+        if file_extension == ".csv":
+            return self.csv_reader, "data.csv"
+        elif file_extension == ".pdf":
+            return self.pdf_reader, ""
+        elif file_extension == ".docx":
+            return self.docx_reader, ""
+        elif file_extension == ".pptx":
+            return self.pptx_reader, ""
+        elif file_extension == ".json":
+            return self.json_reader, ""
+        elif file_extension == ".markdown":
+            return self.markdown_reader, ""
+        else:
+            return self.text_reader, ""
+
+    def _select_reader_by_uri(self, uri: str, provided_reader: Optional[Reader] = None) -> Optional[Reader]:
+        """
+        Select a reader based on URI/file path extension.
+
+        Args:
+            uri: URI or file path
+            provided_reader: Optional reader already provided
+
+        Returns:
+            Selected reader or None
+        """
+        if provided_reader:
+            return provided_reader
+
+        uri_lower = uri.lower()
+        if uri_lower.endswith(".pdf"):
+            return self.pdf_reader
+        elif uri_lower.endswith(".csv"):
+            return self.csv_reader
+        elif uri_lower.endswith(".docx"):
+            return self.docx_reader
+        elif uri_lower.endswith(".pptx"):
+            return self.pptx_reader
+        elif uri_lower.endswith(".json"):
+            return self.json_reader
+        elif uri_lower.endswith(".markdown"):
+            return self.markdown_reader
+        else:
+            return self.text_reader
+
+    def _read_with_reader(
+        self,
+        reader: Reader,
+        source: Union[Path, str, BytesIO],
+        name: Optional[str] = None,
+        password: Optional[str] = None,
+    ) -> List[Document]:
+        """
+        Read content using a reader with optional password handling.
+
+        Args:
+            reader: Reader to use
+            source: Source to read from (Path, URL string, or BytesIO)
+            name: Optional name for the document
+            password: Optional password for protected files
+
+        Returns:
+            List of documents read
+        """
+        import inspect
+
+        read_signature = inspect.signature(reader.read)
+        if password and "password" in read_signature.parameters:
+            if isinstance(source, BytesIO):
+                return reader.read(source, name=name, password=password)
+            else:
+                return reader.read(source, name=name, password=password)
+        else:
+            if isinstance(source, BytesIO):
+                return reader.read(source, name=name)
+            else:
+                return reader.read(source, name=name)
+
+    def _prepare_documents_for_insert(
+        self,
+        documents: List[Document],
+        content_id: str,
+        calculate_sizes: bool = False,
+        metadata: Optional[Dict[str, Any]] = None,
+    ) -> List[Document]:
+        """
+        Prepare documents for insertion by assigning content_id and optionally calculating sizes and updating metadata.
+
+        Args:
+            documents: List of documents to prepare
+            content_id: Content ID to assign to documents
+            calculate_sizes: Whether to calculate document sizes
+            metadata: Optional metadata to merge into document metadata
+
+        Returns:
+            List of prepared documents
+        """
+        for document in documents:
+            document.content_id = content_id
+            if calculate_sizes and document.content and not document.size:
+                document.size = len(document.content.encode("utf-8"))
+            if metadata:
+                document.meta_data.update(metadata)
+        return documents
+
+    def _chunk_documents_sync(self, reader: Reader, documents: List[Document]) -> List[Document]:
+        """
+        Chunk documents synchronously.
+
+        Args:
+            reader: Reader with chunking strategy
+            documents: Documents to chunk
+
+        Returns:
+            List of chunked documents
+        """
+        if not reader or reader.chunk:
+            return documents
+
+        chunked_documents = []
+        for doc in documents:
+            chunked_documents.extend(reader.chunk_document(doc))
+        return chunked_documents
+
+    async def _load_from_path_async(
         self,
         content: Content,
         upsert: bool,
@@ -403,7 +646,7 @@ class Knowledge:
             if self._should_include_file(str(path), include, exclude):
                 log_debug(f"Adding file {path} due to include/exclude filters")
 
-                await self._add_to_contents_db(content)
+                await self._add_to_contents_db_async(content)
                 if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
                     content.status = ContentStatus.COMPLETED
                     await self._aupdate_content(content)
@@ -411,7 +654,90 @@ class Knowledge:
 
                 # Handle LightRAG special case - read file and upload directly
                 if self.vector_db.__class__.__name__ == "LightRag":
-                    await self._process_lightrag_content(content, KnowledgeContentOrigin.PATH)
+                    await self._process_lightrag_content_async(content, KnowledgeContentOrigin.PATH)
+                    return
+
+                if content.reader:
+                    reader = content.reader
+                else:
+                    reader = ReaderFactory.get_reader_for_extension(path.suffix)
+                    log_debug(f"Using Reader: {reader.__class__.__name__}")
+
+                if reader:
+                    password = content.auth.password if content.auth and content.auth.password else None
+                    read_documents = self._read_with_reader(
+                        reader, path, name=content.name or path.name, password=password
+                    )
+                else:
+                    read_documents = []
+
+                if not content.file_type:
+                    content.file_type = path.suffix
+
+                if not content.size and content.file_data:
+                    content.size = len(content.file_data.content)  # type: ignore
+                if not content.size:
+                    try:
+                        content.size = path.stat().st_size
+                    except (OSError, IOError) as e:
+                        log_warning(f"Could not get file size for {path}: {e}")
+                        content.size = 0
+
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id)
+
+                await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
+        elif path.is_dir():
+            for file_path in path.iterdir():
+                # Apply include/exclude filtering
+                if not self._should_include_file(str(file_path), include, exclude):
+                    log_debug(f"Skipping file {file_path} due to include/exclude filters")
+                    continue
+
+                file_content = Content(
+                    name=content.name,
+                    path=str(file_path),
+                    metadata=content.metadata,
+                    description=content.description,
+                    reader=content.reader,
+                )
+                file_content.content_hash = self._build_content_hash(file_content)
+                file_content.id = generate_id(file_content.content_hash)
+
+                await self._load_from_path_async(file_content, upsert, skip_if_exists, include, exclude)
+        else:
+            log_warning(f"Invalid path: {path}")
+
+    def _load_from_path(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+    ):
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        log_info(f"Adding content from path, {content.id}, {content.name}, {content.path}, {content.description}")
+        path = Path(content.path)  # type: ignore
+
+        if path.is_file():
+            if self._should_include_file(str(path), include, exclude):
+                log_debug(f"Adding file {path} due to include/exclude filters")
+
+                self._add_to_contents_db(content)
+                if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+                    content.status = ContentStatus.COMPLETED
+                    self._update_content(content)
+                    return
+
+                # Handle LightRAG special case - read file and upload directly
+                if self.vector_db.__class__.__name__ == "LightRag":
+                    self._process_lightrag_content(content, KnowledgeContentOrigin.PATH)
                     return
 
                 if content.reader:
@@ -453,10 +779,11 @@ class Knowledge:
                         log_warning(f"Could not get file size for {path}: {e}")
                         content.size = 0
 
-
-
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id)
 
-                await self._handle_vector_db_insert(content, read_documents, upsert)
+                self._handle_vector_db_insert(content, read_documents, upsert)
 
         elif path.is_dir():
             for file_path in path.iterdir():
@@ -475,11 +802,11 @@ class Knowledge:
                 file_content.content_hash = self._build_content_hash(file_content)
                 file_content.id = generate_id(file_content.content_hash)
 
-                await self._load_from_path(file_content, upsert, skip_if_exists, include, exclude)
+                self._load_from_path(file_content, upsert, skip_if_exists, include, exclude)
         else:
             log_warning(f"Invalid path: {path}")
 
-    async def _load_from_url(
+    async def _load_from_url_async(
         self,
         content: Content,
         upsert: bool,
@@ -503,14 +830,14 @@ class Knowledge:
             raise ValueError("No url provided")
 
         # 1. Add content to contents database
-        await self._add_to_contents_db(content)
+        await self._add_to_contents_db_async(content)
         if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
             content.status = ContentStatus.COMPLETED
             await self._aupdate_content(content)
             return
 
         if self.vector_db.__class__.__name__ == "LightRag":
-            await self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
+            await self._process_lightrag_content_async(content, KnowledgeContentOrigin.URL)
             return
 
         # 2. Validate URL
@@ -540,47 +867,25 @@ class Knowledge:
             bytes_content = BytesIO(response.content)
 
         # 4. Select reader
-        # If a reader was provided by the user, use it
-        reader = content.reader
         name = content.name if content.name else content.url
-
-        if reader is None:
-            if file_extension == ".csv":
-                name = basename(parsed_url.path) or "data.csv"
-                reader = self.csv_reader
-            elif file_extension == ".pdf":
-                reader = self.pdf_reader
-            elif file_extension == ".docx":
-                reader = self.docx_reader
-            elif file_extension == ".pptx":
-                reader = self.pptx_reader
-            elif file_extension == ".json":
-                reader = self.json_reader
-            elif file_extension == ".markdown":
-                reader = self.markdown_reader
-            else:
-                reader = self.text_reader
+        if file_extension:
+            reader, default_name = self._select_reader_by_extension(file_extension, content.reader)
+            if default_name and file_extension == ".csv":
+                name = basename(parsed_url.path) or default_name
+        else:
+            reader = content.reader or self.website_reader
 
         # 5. Read content
         try:
            read_documents = []
             if reader is not None:
-                #
-                import inspect
-
-                read_signature = inspect.signature(reader.read)
+                # Special handling for YouTubeReader
                 if reader.__class__.__name__ == "YouTubeReader":
                     read_documents = reader.read(content.url, name=name)
-                elif "password" in read_signature.parameters and content.auth and content.auth.password:
-                    if bytes_content:
-                        read_documents = reader.read(bytes_content, name=name, password=content.auth.password)
-                    else:
-                        read_documents = reader.read(content.url, name=name, password=content.auth.password)
                 else:
-                    if bytes_content:
-
-
-                        read_documents = reader.read(content.url, name=name)
+                    password = content.auth.password if content.auth and content.auth.password else None
+                    source = bytes_content if bytes_content else content.url
+                    read_documents = self._read_with_reader(reader, source, name=name, password=password)
 
         except Exception as e:
             log_error(f"Error reading URL: {content.url} - {str(e)}")
@@ -593,19 +898,115 @@ class Knowledge:
         if reader and not reader.chunk:
             read_documents = await reader.chunk_documents_async(read_documents)
         # 7. Prepare and insert the content in the vector database
-
-
-
-
-
-
-        await self._handle_vector_db_insert(content, read_documents, upsert)
-
-    async def _load_from_content(
+        if not content.id:
+            content.id = generate_id(content.content_hash or "")
+        self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+        await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
+    def _load_from_url(
         self,
         content: Content,
-        upsert: bool
-        skip_if_exists: bool
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        """Synchronous version of _load_from_url.
+
+        Load the content from a URL:
+        1. Set content hash
+        2. Validate the URL
+        3. Read the content
+        4. Prepare and insert the content in the vector database
+        """
+        from agno.utils.http import fetch_with_retry
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        log_info(f"Adding content from URL {content.url}")
+        content.file_type = "url"
+
+        if not content.url:
+            raise ValueError("No url provided")
+
+        # 1. Add content to contents database
+        self._add_to_contents_db(content)
+        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+            content.status = ContentStatus.COMPLETED
+            self._update_content(content)
+            return
+
+        if self.vector_db.__class__.__name__ == "LightRag":
+            self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
+            return
+
+        # 2. Validate URL
+        try:
+            from urllib.parse import urlparse
+
+            parsed_url = urlparse(content.url)
+            if not all([parsed_url.scheme, parsed_url.netloc]):
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Invalid URL format: {content.url}"
+                self._update_content(content)
+                log_warning(f"Invalid URL format: {content.url}")
+        except Exception as e:
+            content.status = ContentStatus.FAILED
+            content.status_message = f"Invalid URL: {content.url} - {str(e)}"
+            self._update_content(content)
+            log_warning(f"Invalid URL: {content.url} - {str(e)}")
+
+        # 3. Fetch and load content if file has an extension
+        url_path = Path(parsed_url.path)
+        file_extension = url_path.suffix.lower()
+
+        bytes_content = None
+        if file_extension:
+            response = fetch_with_retry(content.url)
+            bytes_content = BytesIO(response.content)
+
+        # 4. Select reader
+        name = content.name if content.name else content.url
+        if file_extension:
+            reader, default_name = self._select_reader_by_extension(file_extension, content.reader)
+            if default_name and file_extension == ".csv":
+                name = basename(parsed_url.path) or default_name
+        else:
+            reader = content.reader or self.website_reader
+
+        # 5. Read content
+        try:
+            read_documents = []
+            if reader is not None:
+                # Special handling for YouTubeReader
+                if reader.__class__.__name__ == "YouTubeReader":
+                    read_documents = reader.read(content.url, name=name)
+                else:
+                    password = content.auth.password if content.auth and content.auth.password else None
+                    source = bytes_content if bytes_content else content.url
+                    read_documents = self._read_with_reader(reader, source, name=name, password=password)
+
+        except Exception as e:
+            log_error(f"Error reading URL: {content.url} - {str(e)}")
+            content.status = ContentStatus.FAILED
+            content.status_message = f"Error reading URL: {content.url} - {str(e)}"
+            self._update_content(content)
+            return
+
+        # 6. Chunk documents if needed (sync version)
+        if reader:
+            read_documents = self._chunk_documents_sync(reader, read_documents)
+
+        # 7. Prepare and insert the content in the vector database
+        if not content.id:
+            content.id = generate_id(content.content_hash or "")
+        self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+        self._handle_vector_db_insert(content, read_documents, upsert)
+
+    async def _load_from_content_async(
+        self,
+        content: Content,
+        upsert: bool = True,
+        skip_if_exists: bool = False,
     ):
         from agno.vectordb import VectorDb
 
@@ -632,14 +1033,14 @@ class Knowledge:
 
         log_info(f"Adding content from {content.name}")
 
-        await self._add_to_contents_db(content)
+        await self._add_to_contents_db_async(content)
         if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
             content.status = ContentStatus.COMPLETED
             await self._aupdate_content(content)
             return
 
         if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
-            await self._process_lightrag_content(content, KnowledgeContentOrigin.CONTENT)
+            await self._process_lightrag_content_async(content, KnowledgeContentOrigin.CONTENT)
             return
 
         read_documents = []
@@ -679,10 +1080,9 @@ class Knowledge:
                     reader = self._select_reader(content.file_data.type)
                 name = content.name if content.name else f"content_{content.file_data.type}"
                 read_documents = reader.read(content_io, name=name)
-
-
-
-                    read_document.content_id = content.id
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)
 
                 if len(read_documents) == 0:
                     content.status = ContentStatus.FAILED
@@ -696,9 +1096,106 @@ class Knowledge:
         await self._aupdate_content(content)
         return
 
-        await self._handle_vector_db_insert(content, read_documents, upsert)
+        await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
+    def _load_from_content(
+        self,
+        content: Content,
+        upsert: bool = True,
+        skip_if_exists: bool = False,
+    ):
+        """Synchronous version of _load_from_content."""
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        if content.name:
+            name = content.name
+        elif content.file_data and content.file_data.content:
+            if isinstance(content.file_data.content, bytes):
+                name = content.file_data.content[:10].decode("utf-8", errors="ignore")
+            elif isinstance(content.file_data.content, str):
+                name = (
+                    content.file_data.content[:10]
+                    if len(content.file_data.content) >= 10
+                    else content.file_data.content
+                )
+            else:
+                name = str(content.file_data.content)[:10]
+        else:
+            name = None
+
+        if name is not None:
+            content.name = name
+
+        log_info(f"Adding content from {content.name}")
+
+        self._add_to_contents_db(content)
+        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+            content.status = ContentStatus.COMPLETED
+            self._update_content(content)
+            return
+
+        if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
+            self._process_lightrag_content(content, KnowledgeContentOrigin.CONTENT)
+            return
+
+        read_documents = []
+
+        if isinstance(content.file_data, str):
+            content_bytes = content.file_data.encode("utf-8", errors="replace")
+            content_io = io.BytesIO(content_bytes)
+
+            if content.reader:
+                log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
+                read_documents = content.reader.read(content_io, name=name)
+            else:
+                text_reader = self.text_reader
+                if text_reader:
+                    read_documents = text_reader.read(content_io, name=name)
+                else:
+                    content.status = ContentStatus.FAILED
+                    content.status_message = "Text reader not available"
+                    self._update_content(content)
+                    return
 
-
+        elif isinstance(content.file_data, FileData):
+            if content.file_data.type:
+                if isinstance(content.file_data.content, bytes):
+                    content_io = io.BytesIO(content.file_data.content)
+                elif isinstance(content.file_data.content, str):
+                    content_bytes = content.file_data.content.encode("utf-8", errors="replace")
+                    content_io = io.BytesIO(content_bytes)
+                else:
+                    content_io = content.file_data.content  # type: ignore
+
+                # Respect an explicitly provided reader; otherwise select based on file type
+                if content.reader:
+                    log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
+                    reader = content.reader
+                else:
+                    reader = self._select_reader(content.file_data.type)
+                name = content.name if content.name else f"content_{content.file_data.type}"
+                read_documents = reader.read(content_io, name=name)
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)
+
+                if len(read_documents) == 0:
+                    content.status = ContentStatus.FAILED
+                    content.status_message = "Content could not be read"
+                    self._update_content(content)
+                    return
+
+            else:
+                content.status = ContentStatus.FAILED
+                content.status_message = "No content provided"
+                self._update_content(content)
+                return
+
+        self._handle_vector_db_insert(content, read_documents, upsert)
+
-    async def _load_from_topics(
+    async def _load_from_topics_async(
         self,
         content: Content,
         upsert: bool,
@@ -727,47 +1224,264 @@ class Knowledge:
             content.content_hash = self._build_content_hash(content)
             content.id = generate_id(content.content_hash)
 
-            await self._add_to_contents_db(content)
+            await self._add_to_contents_db_async(content)
             if self._should_skip(content.content_hash, skip_if_exists):
                 content.status = ContentStatus.COMPLETED
                 await self._aupdate_content(content)
                 return
 
-            if self.vector_db.__class__.__name__ == "LightRag":
-                await self._process_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
+            if self.vector_db.__class__.__name__ == "LightRag":
+                await self._process_lightrag_content_async(content, KnowledgeContentOrigin.TOPIC)
+                return
+
+            if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
+                log_info(f"Content {content.content_hash} already exists, skipping")
+                continue
+
+            await self._add_to_contents_db_async(content)
+            if content.reader is None:
+                log_error(f"No reader available for topic: {topic}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "No reader available for topic"
+                await self._aupdate_content(content)
+                continue
+
+            read_documents = content.reader.read(topic)
+            if len(read_documents) > 0:
+                self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+            else:
+                content.status = ContentStatus.FAILED
+                content.status_message = "No content found for topic"
+                await self._aupdate_content(content)
+
+            await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
+    def _load_from_topics(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        """Synchronous version of _load_from_topics."""
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+        log_info(f"Adding content from topics: {content.topics}")
+
+        if content.topics is None:
+            log_warning("No topics provided for content")
+            return
+
+        for topic in content.topics:
+            content = Content(
+                name=topic,
+                metadata=content.metadata,
+                reader=content.reader,
+                status=ContentStatus.PROCESSING if content.reader else ContentStatus.FAILED,
+                file_data=FileData(
+                    type="Topic",
+                ),
+                topics=[topic],
+            )
+            content.content_hash = self._build_content_hash(content)
+            content.id = generate_id(content.content_hash)
+
+            self._add_to_contents_db(content)
+            if self._should_skip(content.content_hash, skip_if_exists):
+                content.status = ContentStatus.COMPLETED
+                self._update_content(content)
+                return
+
+            if self.vector_db.__class__.__name__ == "LightRag":
+                self._process_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
+                return
+
+            if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
+                log_info(f"Content {content.content_hash} already exists, skipping")
+                continue
+
+            self._add_to_contents_db(content)
+            if content.reader is None:
+                log_error(f"No reader available for topic: {topic}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "No reader available for topic"
+                self._update_content(content)
+                continue
+
+            read_documents = content.reader.read(topic)
+            if len(read_documents) > 0:
+                self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+            else:
+                content.status = ContentStatus.FAILED
+                content.status_message = "No content found for topic"
+                self._update_content(content)
+
+            self._handle_vector_db_insert(content, read_documents, upsert)
+
+    async def _load_from_remote_content_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        if content.remote_content is None:
+            log_warning("No remote content provided for content")
+            return
+
+        remote_content = content.remote_content
+
+        if isinstance(remote_content, S3Content):
+            await self._load_from_s3_async(content, upsert, skip_if_exists)
+
+        elif isinstance(remote_content, GCSContent):
+            await self._load_from_gcs_async(content, upsert, skip_if_exists)
+
+        else:
+            log_warning(f"Unsupported remote content type: {type(remote_content)}")
+
+    async def _load_from_s3_async(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Load the contextual S3 content.
+
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        8. Remove temporary file if needed
+        """
+        from agno.cloud.aws.s3.object import S3Object
+
+        remote_content: S3Content = cast(S3Content, content.remote_content)
+
+        # 1. Identify objects to read
+        objects_to_read: List[S3Object] = []
+        if remote_content.bucket is not None:
+            if remote_content.key is not None:
+                _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
+                objects_to_read.append(_object)
+            elif remote_content.object is not None:
+                objects_to_read.append(remote_content.object)
+            elif remote_content.prefix is not None:
+                objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
+            else:
+                objects_to_read.extend(remote_content.bucket.get_objects())
+
+        for s3_object in objects_to_read:
+            # 2. Setup Content object
+            content_name = content.name or ""
+            content_name += "_" + (s3_object.name or "")
+            content_entry = Content(
+                name=content_name,
+                description=content.description,
+                status=ContentStatus.PROCESSING,
+                metadata=content.metadata,
+                file_type="s3",
+            )
+
+            # 3. Hash content and add it to the contents database
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+            await self._add_to_contents_db_async(content_entry)
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content_entry)
+                return
+
+            # 4. Select reader
+            reader = self._select_reader_by_uri(s3_object.uri, content.reader)
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            temporary_file = None
+            obj_name = content_name or s3_object.name.split("/")[-1]
+            readable_content: Optional[Union[BytesIO, Path]] = None
+            if s3_object.uri.endswith(".pdf"):
+                readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
+            else:
+                temporary_file = Path("storage").joinpath(obj_name)
+                readable_content = temporary_file
+                s3_object.download(readable_content)  # type: ignore
+
+            # 6. Read the content
+            read_documents = reader.read(readable_content, name=obj_name)
+
+            # 7. Prepare and insert the content in the vector database
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            await self._handle_vector_db_insert_async(content_entry, read_documents, upsert)
+
+            # 8. Remove temporary file if needed
+            if temporary_file:
+                temporary_file.unlink()
+
+    async def _load_from_gcs_async(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Load the contextual GCS content.
+
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        """
+        remote_content: GCSContent = cast(GCSContent, content.remote_content)
+
+        # 1. Identify objects to read
+        objects_to_read = []
+        if remote_content.blob_name is not None:
+            objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))  # type: ignore
+        elif remote_content.prefix is not None:
+            objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
+        else:
+            objects_to_read.extend(remote_content.bucket.list_blobs())  # type: ignore
+
+        for gcs_object in objects_to_read:
+            # 2. Setup Content object
+            name = (content.name or "content") + "_" + gcs_object.name
+            content_entry = Content(
+                name=name,
+                description=content.description,
+                status=ContentStatus.PROCESSING,
+                metadata=content.metadata,
+                file_type="gcs",
+            )
+
+            # 3. Hash content and add it to the contents database
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+            await self._add_to_contents_db_async(content_entry)
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content_entry)
                 return
 
-
-
-
+            # 4. Select reader
+            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
+            reader = cast(Reader, reader)
 
-
-
-                log_error(f"No reader available for topic: {topic}")
-                content.status = ContentStatus.FAILED
-                content.status_message = "No reader available for topic"
-                await self._aupdate_content(content)
-                continue
+            # 5. Fetch and load the content
+            readable_content = BytesIO(gcs_object.download_as_bytes())
 
-
-
-            for read_document in read_documents:
-                read_document.content_id = content.id
-                if read_document.content:
-                    read_document.size = len(read_document.content.encode("utf-8"))
-            else:
-                content.status = ContentStatus.FAILED
-                content.status_message = "No content found for topic"
-                await self._aupdate_content(content)
+            # 6. Read the content
+            read_documents = reader.read(readable_content, name=name)
 
-
+            # 7. Prepare and insert the content in the vector database
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            await self._handle_vector_db_insert_async(content_entry, read_documents, upsert)
 
-    async def _load_from_remote_content(
+    def _load_from_remote_content(
         self,
         content: Content,
         upsert: bool,
         skip_if_exists: bool,
     ):
+        """Synchronous version of _load_from_remote_content."""
         if content.remote_content is None:
             log_warning("No remote content provided for content")
             return
@@ -775,17 +1489,18 @@ class Knowledge:
         remote_content = content.remote_content
 
         if isinstance(remote_content, S3Content):
-            await self._load_from_s3(content, upsert, skip_if_exists)
+            self._load_from_s3(content, upsert, skip_if_exists)
 
         elif isinstance(remote_content, GCSContent):
-            await self._load_from_gcs(content, upsert, skip_if_exists)
+            self._load_from_gcs(content, upsert, skip_if_exists)
 
         else:
             log_warning(f"Unsupported remote content type: {type(remote_content)}")
 
-    async def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
-        """
+    def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Synchronous version of _load_from_s3.
 
+        Load the contextual S3 content:
         1. Identify objects to read
         2. Setup Content object
         3. Hash content and add it to the contents database
@@ -827,29 +1542,14 @@ class Knowledge:
             # 3. Hash content and add it to the contents database
             content_entry.content_hash = self._build_content_hash(content_entry)
             content_entry.id = generate_id(content_entry.content_hash)
-            await self._add_to_contents_db(content_entry)
+            self._add_to_contents_db(content_entry)
             if self._should_skip(content_entry.content_hash, skip_if_exists):
                 content_entry.status = ContentStatus.COMPLETED
-                await self._aupdate_content(content_entry)
+                self._update_content(content_entry)
                 return
 
             # 4. Select reader
-            reader = content.reader
-            if reader is None:
-                if s3_object.uri.endswith(".pdf"):
-                    reader = self.pdf_reader
-                elif s3_object.uri.endswith(".csv"):
-                    reader = self.csv_reader
-                elif s3_object.uri.endswith(".docx"):
-                    reader = self.docx_reader
-                elif s3_object.uri.endswith(".pptx"):
-                    reader = self.pptx_reader
-                elif s3_object.uri.endswith(".json"):
-                    reader = self.json_reader
-                elif s3_object.uri.endswith(".markdown"):
-                    reader = self.markdown_reader
-                else:
-                    reader = self.text_reader
+            reader = self._select_reader_by_uri(s3_object.uri, content.reader)
             reader = cast(Reader, reader)
 
             # 5. Fetch and load the content
@@ -867,17 +1567,19 @@ class Knowledge:
             read_documents = reader.read(readable_content, name=obj_name)
 
             # 7. Prepare and insert the content in the vector database
-
-
-
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            self._handle_vector_db_insert(content_entry, read_documents, upsert)
 
             # 8. Remove temporary file if needed
             if temporary_file:
                 temporary_file.unlink()
 
-    async def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
-        """
+    def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Synchronous version of _load_from_gcs.
 
+        Load the contextual GCS content:
         1. Identify objects to read
         2. Setup Content object
         3. Hash content and add it to the contents database
@@ -911,29 +1613,14 @@ class Knowledge:
             # 3. Hash content and add it to the contents database
             content_entry.content_hash = self._build_content_hash(content_entry)
             content_entry.id = generate_id(content_entry.content_hash)
-            await self._add_to_contents_db(content_entry)
+            self._add_to_contents_db(content_entry)
             if self._should_skip(content_entry.content_hash, skip_if_exists):
                 content_entry.status = ContentStatus.COMPLETED
-                await self._aupdate_content(content_entry)
+                self._update_content(content_entry)
                 return
 
             # 4. Select reader
-            reader = content.reader
-            if reader is None:
-                if gcs_object.name.endswith(".pdf"):
-                    reader = self.pdf_reader
-                elif gcs_object.name.endswith(".csv"):
-                    reader = self.csv_reader
-                elif gcs_object.name.endswith(".docx"):
-                    reader = self.docx_reader
-                elif gcs_object.name.endswith(".pptx"):
-                    reader = self.pptx_reader
-                elif gcs_object.name.endswith(".json"):
-                    reader = self.json_reader
-                elif gcs_object.name.endswith(".markdown"):
-                    reader = self.markdown_reader
-                else:
-                    reader = self.text_reader
+            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
             reader = cast(Reader, reader)
 
             # 5. Fetch and load the content
@@ -943,11 +1630,12 @@ class Knowledge:
             read_documents = reader.read(readable_content, name=name)
 
             # 7. Prepare and insert the content in the vector database
-
-
-
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            self._handle_vector_db_insert(content_entry, read_documents, upsert)
 
-    async def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
+    async def _handle_vector_db_insert_async(self, content: Content, read_documents, upsert):
         from agno.vectordb import VectorDb
 
         self.vector_db = cast(VectorDb, self.vector_db)
@@ -985,7 +1673,70 @@ class Knowledge:
         content.status = ContentStatus.COMPLETED
         await self._aupdate_content(content)
 
-    async def _load_content(
+    def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
+        """Synchronously handle vector database insertion."""
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        if not self.vector_db:
+            log_error("No vector database configured")
+            content.status = ContentStatus.FAILED
+            content.status_message = "No vector database configured"
+            self._update_content(content)
+            return
+
+        if self.vector_db.upsert_available() and upsert:
+            try:
+                self.vector_db.upsert(content.content_hash, read_documents, content.metadata)  # type: ignore[arg-type]
+            except Exception as e:
+                log_error(f"Error upserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not upsert embedding"
+                self._update_content(content)
+                return
+        else:
+            try:
+                self.vector_db.insert(
+                    content.content_hash,  # type: ignore[arg-type]
+                    documents=read_documents,
+                    filters=content.metadata,  # type: ignore[arg-type]
+                )
+            except Exception as e:
+                log_error(f"Error inserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not insert embedding"
+                self._update_content(content)
+                return
+
+        content.status = ContentStatus.COMPLETED
+        self._update_content(content)
+
+    def _load_content(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+    ) -> None:
+        """Synchronously load content."""
+        if content.path:
+            self._load_from_path(content, upsert, skip_if_exists, include, exclude)
+
+        if content.url:
+            self._load_from_url(content, upsert, skip_if_exists)
+
+        if content.file_data:
+            self._load_from_content(content, upsert, skip_if_exists)
+
+        if content.topics:
+            self._load_from_topics(content, upsert, skip_if_exists)
+
+        if content.remote_content:
+            self._load_from_remote_content(content, upsert, skip_if_exists)
+
+    async def _load_content_async(
         self,
         content: Content,
         upsert: bool,
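The new synchronous `_handle_vector_db_insert` mirrors its async sibling: prefer `upsert` when the backend advertises it and the caller requested it, otherwise fall back to `insert`, and record a terminal status either way. A condensed, self-contained sketch of that control flow (the stub class below is illustrative, not the agno `VectorDb` interface):

```python
# Condensed control-flow sketch of the insert path added above.
from typing import Any, List


class FakeVectorDb:  # illustrative stand-in, not agno's VectorDb
    def upsert_available(self) -> bool:
        return True

    def upsert(self, content_hash: str, documents: List[Any], filters: Any = None) -> None:
        print(f"upserted {len(documents)} documents for {content_hash}")

    def insert(self, content_hash: str, documents: List[Any], filters: Any = None) -> None:
        print(f"inserted {len(documents)} documents for {content_hash}")


def handle_vector_db_insert(db: FakeVectorDb, content_hash: str, documents: List[Any], upsert: bool) -> str:
    try:
        if db.upsert_available() and upsert:
            db.upsert(content_hash, documents)
        else:
            db.insert(content_hash, documents=documents)
    except Exception:
        return "FAILED"  # the real method also records a status message
    return "COMPLETED"


print(handle_vector_db_insert(FakeVectorDb(), "hash123", ["doc"], upsert=True))
```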
@@ -994,19 +1745,19 @@ class Knowledge:
         exclude: Optional[List[str]] = None,
     ) -> None:
         if content.path:
-            await self._load_from_path(content, upsert, skip_if_exists, include, exclude)
+            await self._load_from_path_async(content, upsert, skip_if_exists, include, exclude)
 
         if content.url:
-            await self._load_from_url(content, upsert, skip_if_exists)
+            await self._load_from_url_async(content, upsert, skip_if_exists)
 
         if content.file_data:
-            await self._load_from_content(content, upsert, skip_if_exists)
+            await self._load_from_content_async(content, upsert, skip_if_exists)
 
         if content.topics:
-            await self._load_from_topics(content, upsert, skip_if_exists)
+            await self._load_from_topics_async(content, upsert, skip_if_exists)
 
         if content.remote_content:
-            await self._load_from_remote_content(content, upsert, skip_if_exists)
+            await self._load_from_remote_content_async(content, upsert, skip_if_exists)
 
     def _build_content_hash(self, content: Content) -> str:
         """
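With `_load_content` added and the awaited helpers renamed to `*_async`, each ingestion source on a `Content` object (`path`, `url`, `file_data`, `topics`, `remote_content`) dispatches to a dedicated sync or async loader, with no event-loop juggling between the two paths. A hedged usage sketch of the public entry points that sit above these private methods (constructor and keyword arguments here are illustrative and may differ from the actual API):

```python
# Illustrative usage; exact Knowledge constructor arguments may differ.
import asyncio

from agno.knowledge.knowledge import Knowledge

knowledge = Knowledge(name="docs")

# Sync path: add_content -> _load_content -> _load_from_url(...)
knowledge.add_content(url="https://example.com/guide.pdf")

# Async path: add_content_async -> _load_content_async -> _load_from_url_async(...)
asyncio.run(knowledge.add_content_async(url="https://example.com/guide.pdf"))
```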
@@ -1078,7 +1829,7 @@ class Knowledge:
             # Already a string, return as-is
             return value
 
-    async def _add_to_contents_db(self, content: Content):
+    async def _add_to_contents_db_async(self, content: Content):
         if self.contents_db:
             created_at = content.created_at if content.created_at else int(time.time())
             updated_at = content.updated_at if content.updated_at else int(time.time())
@@ -1121,6 +1872,52 @@ class Knowledge:
             else:
                 self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
 
+    def _add_to_contents_db(self, content: Content):
+        """Synchronously add content to contents database."""
+        if self.contents_db:
+            if isinstance(self.contents_db, AsyncBaseDb):
+                raise ValueError(
+                    "_add_to_contents_db() is not supported with an async DB. Please use add_content_async with AsyncDb."
+                )
+
+            created_at = content.created_at if content.created_at else int(time.time())
+            updated_at = content.updated_at if content.updated_at else int(time.time())
+
+            file_type = (
+                content.file_type
+                if content.file_type
+                else content.file_data.type
+                if content.file_data and content.file_data.type
+                else None
+            )
+            # Safely handle string fields with proper type checking
+            safe_name = self._ensure_string_field(content.name, "content.name", default="")
+            safe_description = self._ensure_string_field(content.description, "content.description", default="")
+            safe_linked_to = self._ensure_string_field(self.name, "knowledge.name", default="")
+            safe_status_message = self._ensure_string_field(
+                content.status_message, "content.status_message", default=""
+            )
+
+            content_row = KnowledgeRow(
+                id=content.id,
+                name=safe_name,
+                description=safe_description,
+                metadata=content.metadata,
+                type=file_type,
+                size=content.size
+                if content.size
+                else len(content.file_data.content)
+                if content.file_data and content.file_data.content
+                else None,
+                linked_to=safe_linked_to,
+                access_count=0,
+                status=content.status if content.status else ContentStatus.PROCESSING,
+                status_message=safe_status_message,
+                created_at=created_at,
+                updated_at=updated_at,
+            )
+            self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+
     def _update_content(self, content: Content) -> Optional[Dict[str, Any]]:
         from agno.vectordb import VectorDb
 
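The synchronous `_add_to_contents_db` added here refuses to operate on an `AsyncBaseDb`, raising immediately rather than blocking on (or silently skipping) an async driver; the error message steers callers to `add_content_async`. In practice the pairing rule looks like this (class names follow this package; the connection string and keyword arguments are placeholders):

```python
# Pairing rule: sync DBs go with the sync path, async DBs with the async
# path. The db_url below is a placeholder, not a working connection.
from agno.db.postgres import PostgresDb
from agno.knowledge.knowledge import Knowledge

knowledge = Knowledge(
    name="docs",
    contents_db=PostgresDb(db_url="postgresql+psycopg://user:pass@localhost:5432/agno"),
)
knowledge.add_content(path="README.md")  # OK: sync DB, sync entry point

# With an async DB (e.g. the new agno.db.mysql.async_mysql or
# agno.db.postgres.async_postgres drivers), the sync path raises
# ValueError; use `await knowledge.add_content_async(...)` instead.
```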
@@ -1221,12 +2018,12 @@ class Knowledge:
             log_warning("Contents DB not found for knowledge base")
             return None
 
-    async def _process_lightrag_content(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
+    async def _process_lightrag_content_async(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
         from agno.vectordb import VectorDb
 
         self.vector_db = cast(VectorDb, self.vector_db)
 
-        await self._add_to_contents_db(content)
+        await self._add_to_contents_db_async(content)
         if content_type == KnowledgeContentOrigin.PATH:
             if content.file_data is None:
                 log_warning("No file data provided")
@@ -1283,9 +2080,9 @@ class Knowledge:
 
             reader.chunk = False
             read_documents = reader.read(content.url, name=content.name)
-
-
-
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
 
             if not read_documents:
                 log_error("No documents read from URL")
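This async LightRAG URL branch now mirrors the sync hunk above: ensure a deterministic `content.id`, then run `_prepare_documents_for_insert` before the empty-result check. That helper is not shown in this diff; a hypothetical sketch of what "preparing" plausibly means (stamping each read document with its parent content ID so later updates and deletes can target it):

```python
# Hypothetical: the real _prepare_documents_for_insert is defined
# elsewhere in agno/knowledge/knowledge.py and may do more than this.
from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class Doc:  # minimal stand-in for agno's Document
    content: str
    meta_data: Dict[str, str] = field(default_factory=dict)


def prepare_documents_for_insert(documents: List[Doc], content_id: str) -> None:
    for doc in documents:
        # Tag every chunk with the owning content ID.
        doc.meta_data["content_id"] = content_id


docs = [Doc("chunk one"), Doc("chunk two")]
prepare_documents_for_insert(docs, "abc-123")
assert all(d.meta_data["content_id"] == "abc-123" for d in docs)
```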
@@ -1378,6 +2175,175 @@ class Knowledge:
                 log_warning(f"No documents found for LightRAG upload: {content.name}")
                 return
 
+    def _process_lightrag_content(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
+        """Synchronously process LightRAG content. Uses asyncio.run() only for LightRAG-specific async methods."""
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        self._add_to_contents_db(content)
+        if content_type == KnowledgeContentOrigin.PATH:
+            if content.file_data is None:
+                log_warning("No file data provided")
+
+            if content.path is None:
+                log_error("No path provided for content")
+                return
+
+            path = Path(content.path)
+
+            log_info(f"Uploading file to LightRAG from path: {path}")
+            try:
+                # Read the file content from path
+                with open(path, "rb") as f:
+                    file_content = f.read()
+
+                # Get file type from extension or content.file_type
+                file_type = content.file_type or path.suffix
+
+                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
+                    # LightRAG only has async methods, use asyncio.run() here
+                    result = asyncio.run(
+                        self.vector_db.insert_file_bytes(
+                            file_content=file_content,
+                            filename=path.name,
+                            content_type=file_type,
+                            send_metadata=True,
+                        )
+                    )
+                else:
+                    log_error("Vector database does not support file insertion")
+                    content.status = ContentStatus.FAILED
+                    self._update_content(content)
+                    return
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                self._update_content(content)
+                return
+
+            except Exception as e:
+                log_error(f"Error uploading file to LightRAG: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Could not upload to LightRAG: {str(e)}"
+                self._update_content(content)
+                return
+
+        elif content_type == KnowledgeContentOrigin.URL:
+            log_info(f"Uploading file to LightRAG from URL: {content.url}")
+            try:
+                reader = content.reader or self.website_reader
+                if reader is None:
+                    log_error("No URL reader available")
+                    content.status = ContentStatus.FAILED
+                    self._update_content(content)
+                    return
+
+                reader.chunk = False
+                read_documents = reader.read(content.url, name=content.name)
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id)
+
+                if not read_documents:
+                    log_error("No documents read from URL")
+                    content.status = ContentStatus.FAILED
+                    self._update_content(content)
+                    return
+
+                if self.vector_db and hasattr(self.vector_db, "insert_text"):
+                    # LightRAG only has async methods, use asyncio.run() here
+                    result = asyncio.run(
+                        self.vector_db.insert_text(
+                            file_source=content.url,
+                            text=read_documents[0].content,
+                        )
+                    )
+                else:
+                    log_error("Vector database does not support text insertion")
+                    content.status = ContentStatus.FAILED
+                    self._update_content(content)
+                    return
+
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                self._update_content(content)
+                return
+
+            except Exception as e:
+                log_error(f"Error uploading file to LightRAG: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Could not upload to LightRAG: {str(e)}"
+                self._update_content(content)
+                return
+
+        elif content_type == KnowledgeContentOrigin.CONTENT:
+            filename = (
+                content.file_data.filename if content.file_data and content.file_data.filename else "uploaded_file"
+            )
+            log_info(f"Uploading file to LightRAG: {filename}")
+
+            # Use the content from file_data
+            if content.file_data and content.file_data.content:
+                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
+                    # LightRAG only has async methods, use asyncio.run() here
+                    result = asyncio.run(
+                        self.vector_db.insert_file_bytes(
+                            file_content=content.file_data.content,
+                            filename=filename,
+                            content_type=content.file_data.type,
+                            send_metadata=True,
+                        )
+                    )
+                else:
+                    log_error("Vector database does not support file insertion")
+                    content.status = ContentStatus.FAILED
+                    self._update_content(content)
+                    return
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                self._update_content(content)
+            else:
+                log_warning(f"No file data available for LightRAG upload: {content.name}")
+                return
+
+        elif content_type == KnowledgeContentOrigin.TOPIC:
+            log_info(f"Uploading file to LightRAG: {content.name}")
+
+            if content.reader is None:
+                log_error("No reader available for topic content")
+                content.status = ContentStatus.FAILED
+                self._update_content(content)
+                return
+
+            if not content.topics:
+                log_error("No topics available for content")
+                content.status = ContentStatus.FAILED
+                self._update_content(content)
+                return
+
+            read_documents = content.reader.read(content.topics)
+            if len(read_documents) > 0:
+                if self.vector_db and hasattr(self.vector_db, "insert_text"):
+                    # LightRAG only has async methods, use asyncio.run() here
+                    result = asyncio.run(
+                        self.vector_db.insert_text(
+                            file_source=content.topics[0],
+                            text=read_documents[0].content,
+                        )
+                    )
+                else:
+                    log_error("Vector database does not support text insertion")
+                    content.status = ContentStatus.FAILED
+                    self._update_content(content)
+                    return
+                content.external_id = result
+                content.status = ContentStatus.COMPLETED
+                self._update_content(content)
+                return
+            else:
+                log_warning(f"No documents found for LightRAG upload: {content.name}")
+                return
+
     def search(
         self,
         query: str,
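Note on the new `_process_lightrag_content`: as its docstring and inline comments state, the LightRAG client exposes only coroutines, so the sync path bridges with `asyncio.run()`. That call spins up a fresh event loop and therefore raises `RuntimeError` if invoked from inside an already-running loop, which is exactly why this wrapper exists alongside, rather than instead of, `_process_lightrag_content_async`. A minimal sketch of the bridge pattern (the client class is illustrative, not the LightRAG API):

```python
# Minimal sketch of the sync-over-async bridge used above; the client
# below is illustrative, not the LightRAG API.
import asyncio


class AsyncOnlyClient:
    async def insert_text(self, file_source: str, text: str) -> str:
        await asyncio.sleep(0)  # stand-in for network I/O
        return "external-id-123"


def insert_text_sync(client: AsyncOnlyClient, source: str, text: str) -> str:
    # asyncio.run() creates and closes its own event loop, so this must
    # only be called from plain synchronous code (never inside a loop).
    return asyncio.run(client.insert_text(source, text))


print(insert_text_sync(AsyncOnlyClient(), "notes.txt", "hello"))
```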