agno 2.3.8__py3-none-any.whl → 2.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. agno/agent/agent.py +134 -94
  2. agno/db/mysql/__init__.py +2 -1
  3. agno/db/mysql/async_mysql.py +2888 -0
  4. agno/db/mysql/mysql.py +17 -8
  5. agno/db/mysql/utils.py +139 -6
  6. agno/db/postgres/async_postgres.py +10 -5
  7. agno/db/postgres/postgres.py +7 -2
  8. agno/db/schemas/evals.py +1 -0
  9. agno/db/singlestore/singlestore.py +5 -1
  10. agno/db/sqlite/async_sqlite.py +3 -3
  11. agno/eval/__init__.py +10 -0
  12. agno/eval/accuracy.py +11 -8
  13. agno/eval/agent_as_judge.py +861 -0
  14. agno/eval/base.py +29 -0
  15. agno/eval/utils.py +2 -1
  16. agno/exceptions.py +7 -0
  17. agno/knowledge/embedder/openai.py +8 -8
  18. agno/knowledge/knowledge.py +1142 -176
  19. agno/media.py +22 -6
  20. agno/models/aws/claude.py +8 -7
  21. agno/models/base.py +61 -2
  22. agno/models/deepseek/deepseek.py +67 -0
  23. agno/models/google/gemini.py +134 -51
  24. agno/models/google/utils.py +22 -0
  25. agno/models/message.py +5 -0
  26. agno/models/openai/chat.py +4 -0
  27. agno/os/app.py +64 -74
  28. agno/os/interfaces/a2a/router.py +3 -4
  29. agno/os/interfaces/agui/router.py +2 -0
  30. agno/os/router.py +3 -1607
  31. agno/os/routers/agents/__init__.py +3 -0
  32. agno/os/routers/agents/router.py +581 -0
  33. agno/os/routers/agents/schema.py +261 -0
  34. agno/os/routers/evals/evals.py +26 -6
  35. agno/os/routers/evals/schemas.py +34 -2
  36. agno/os/routers/evals/utils.py +77 -18
  37. agno/os/routers/knowledge/knowledge.py +1 -1
  38. agno/os/routers/teams/__init__.py +3 -0
  39. agno/os/routers/teams/router.py +496 -0
  40. agno/os/routers/teams/schema.py +257 -0
  41. agno/os/routers/workflows/__init__.py +3 -0
  42. agno/os/routers/workflows/router.py +545 -0
  43. agno/os/routers/workflows/schema.py +75 -0
  44. agno/os/schema.py +1 -559
  45. agno/os/utils.py +139 -2
  46. agno/team/team.py +87 -24
  47. agno/tools/file_generation.py +12 -6
  48. agno/tools/firecrawl.py +15 -7
  49. agno/tools/function.py +37 -23
  50. agno/tools/shopify.py +1519 -0
  51. agno/tools/spotify.py +2 -5
  52. agno/utils/hooks.py +64 -5
  53. agno/utils/http.py +2 -2
  54. agno/utils/media.py +11 -1
  55. agno/utils/print_response/agent.py +8 -0
  56. agno/utils/print_response/team.py +8 -0
  57. agno/vectordb/pgvector/pgvector.py +88 -51
  58. agno/workflow/parallel.py +5 -3
  59. agno/workflow/step.py +14 -2
  60. agno/workflow/types.py +38 -2
  61. agno/workflow/workflow.py +12 -4
  62. {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/METADATA +7 -2
  63. {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/RECORD +66 -52
  64. {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/WHEEL +0 -0
  65. {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/licenses/LICENSE +0 -0
  66. {agno-2.3.8.dist-info → agno-2.3.10.dist-info}/top_level.txt +0 -0
@@ -198,8 +198,6 @@ class Knowledge:
198
198
  """
199
199
  Synchronously add multiple content items to the knowledge base.
200
200
 
201
- This method wraps the asynchronous add_contents method
202
-
203
201
  Supports two usage patterns:
204
202
  1. Pass a list of content dictionaries as first argument
205
203
  2. Pass keyword arguments with paths, urls, metadata, etc.
@@ -214,11 +212,109 @@ class Knowledge:
214
212
  reader: Optional reader to use for processing content
215
213
  include: Optional list of file patterns to include
216
214
  exclude: Optional list of file patterns to exclude
217
- upsert: Whether to update existing content if it already exists
218
- skip_if_exists: Whether to skip adding content if it already exists
215
+ upsert: Whether to update existing content if it already exists (only used when skip_if_exists=False)
216
+ skip_if_exists: Whether to skip adding content if it already exists (default: True)
219
217
  remote_content: Optional remote content (S3, GCS, etc.) to add
220
218
  """
221
- asyncio.run(self.add_contents_async(*args, **kwargs))
219
+ if args and isinstance(args[0], list):
220
+ arguments = args[0]
221
+ upsert = kwargs.get("upsert", True)
222
+ skip_if_exists = kwargs.get("skip_if_exists", False)
223
+ for argument in arguments:
224
+ self.add_content(
225
+ name=argument.get("name"),
226
+ description=argument.get("description"),
227
+ path=argument.get("path"),
228
+ url=argument.get("url"),
229
+ metadata=argument.get("metadata"),
230
+ topics=argument.get("topics"),
231
+ text_content=argument.get("text_content"),
232
+ reader=argument.get("reader"),
233
+ include=argument.get("include"),
234
+ exclude=argument.get("exclude"),
235
+ upsert=argument.get("upsert", upsert),
236
+ skip_if_exists=argument.get("skip_if_exists", skip_if_exists),
237
+ remote_content=argument.get("remote_content", None),
238
+ )
239
+
240
+ elif kwargs:
241
+ name = kwargs.get("name", [])
242
+ metadata = kwargs.get("metadata", {})
243
+ description = kwargs.get("description", [])
244
+ topics = kwargs.get("topics", [])
245
+ reader = kwargs.get("reader", None)
246
+ paths = kwargs.get("paths", [])
247
+ urls = kwargs.get("urls", [])
248
+ text_contents = kwargs.get("text_contents", [])
249
+ include = kwargs.get("include")
250
+ exclude = kwargs.get("exclude")
251
+ upsert = kwargs.get("upsert", True)
252
+ skip_if_exists = kwargs.get("skip_if_exists", False)
253
+ remote_content = kwargs.get("remote_content", None)
254
+ for path in paths:
255
+ self.add_content(
256
+ name=name,
257
+ description=description,
258
+ path=path,
259
+ metadata=metadata,
260
+ include=include,
261
+ exclude=exclude,
262
+ upsert=upsert,
263
+ skip_if_exists=skip_if_exists,
264
+ reader=reader,
265
+ )
266
+ for url in urls:
267
+ self.add_content(
268
+ name=name,
269
+ description=description,
270
+ url=url,
271
+ metadata=metadata,
272
+ include=include,
273
+ exclude=exclude,
274
+ upsert=upsert,
275
+ skip_if_exists=skip_if_exists,
276
+ reader=reader,
277
+ )
278
+ for i, text_content in enumerate(text_contents):
279
+ content_name = f"{name}_{i}" if name else f"text_content_{i}"
280
+ log_debug(f"Adding text content: {content_name}")
281
+ self.add_content(
282
+ name=content_name,
283
+ description=description,
284
+ text_content=text_content,
285
+ metadata=metadata,
286
+ include=include,
287
+ exclude=exclude,
288
+ upsert=upsert,
289
+ skip_if_exists=skip_if_exists,
290
+ reader=reader,
291
+ )
292
+ if topics:
293
+ self.add_content(
294
+ name=name,
295
+ description=description,
296
+ topics=topics,
297
+ metadata=metadata,
298
+ include=include,
299
+ exclude=exclude,
300
+ upsert=upsert,
301
+ skip_if_exists=skip_if_exists,
302
+ reader=reader,
303
+ )
304
+
305
+ if remote_content:
306
+ self.add_content(
307
+ name=name,
308
+ metadata=metadata,
309
+ description=description,
310
+ remote_content=remote_content,
311
+ upsert=upsert,
312
+ skip_if_exists=skip_if_exists,
313
+ reader=reader,
314
+ )
315
+
316
+ else:
317
+ raise ValueError("Invalid usage of add_contents.")
222
318
 
223
319
  # --- Add Content ---
224
320
 
@@ -255,7 +351,7 @@ class Knowledge:
255
351
  include: Optional[List[str]] = None,
256
352
  exclude: Optional[List[str]] = None,
257
353
  upsert: bool = True,
258
- skip_if_exists: bool = True,
354
+ skip_if_exists: bool = False,
259
355
  auth: Optional[ContentAuth] = None,
260
356
  ) -> None:
261
357
  # Validation: At least one of the parameters must be provided
@@ -265,10 +361,6 @@ class Knowledge:
265
361
  )
266
362
  return
267
363
 
268
- if not skip_if_exists:
269
- log_debug("skip_if_exists is disabled, disabling upsert")
270
- upsert = False
271
-
272
364
  content = None
273
365
  file_data = None
274
366
  if text_content:
@@ -289,7 +381,7 @@ class Knowledge:
289
381
  content.content_hash = self._build_content_hash(content)
290
382
  content.id = generate_id(content.content_hash)
291
383
 
292
- await self._load_content(content, upsert, skip_if_exists, include, exclude)
384
+ await self._load_content_async(content, upsert, skip_if_exists, include, exclude)
293
385
 
294
386
  @overload
295
387
  def add_content(
@@ -342,27 +434,37 @@ class Knowledge:
342
434
  reader: Optional custom reader for processing the content
343
435
  include: Optional list of file patterns to include
344
436
  exclude: Optional list of file patterns to exclude
345
- upsert: Whether to update existing content if it already exists
346
- skip_if_exists: Whether to skip adding content if it already exists
437
+ upsert: Whether to update existing content if it already exists (only used when skip_if_exists=False)
438
+ skip_if_exists: Whether to skip adding content if it already exists (default: False)
347
439
  """
348
- asyncio.run(
349
- self.add_content_async(
350
- name=name,
351
- description=description,
352
- path=path,
353
- url=url,
354
- text_content=text_content,
355
- metadata=metadata,
356
- topics=topics,
357
- remote_content=remote_content,
358
- reader=reader,
359
- include=include,
360
- exclude=exclude,
361
- upsert=upsert,
362
- skip_if_exists=skip_if_exists,
363
- auth=auth,
440
+ # Validation: At least one of the parameters must be provided
441
+ if all(argument is None for argument in [path, url, text_content, topics, remote_content]):
442
+ log_warning(
443
+ "At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided."
364
444
  )
445
+ return
446
+
447
+ content = None
448
+ file_data = None
449
+ if text_content:
450
+ file_data = FileData(content=text_content, type="Text")
451
+
452
+ content = Content(
453
+ name=name,
454
+ description=description,
455
+ path=path,
456
+ url=url,
457
+ file_data=file_data if file_data else None,
458
+ metadata=metadata,
459
+ topics=topics,
460
+ remote_content=remote_content,
461
+ reader=reader,
462
+ auth=auth,
365
463
  )
464
+ content.content_hash = self._build_content_hash(content)
465
+ content.id = generate_id(content.content_hash)
466
+
467
+ self._load_content(content, upsert, skip_if_exists, include, exclude)
366
468
 
367
469
  def _should_skip(self, content_hash: str, skip_if_exists: bool) -> bool:
368
470
  """
@@ -384,7 +486,148 @@ class Knowledge:
384
486
 
385
487
  return False
386
488
 
387
- async def _load_from_path(
489
+ def _select_reader_by_extension(
490
+ self, file_extension: str, provided_reader: Optional[Reader] = None
491
+ ) -> Tuple[Optional[Reader], str]:
492
+ """
493
+ Select a reader based on file extension.
494
+
495
+ Args:
496
+ file_extension: File extension (e.g., '.pdf', '.csv')
497
+ provided_reader: Optional reader already provided
498
+
499
+ Returns:
500
+ Tuple of (reader, name) where name may be adjusted based on extension
501
+ """
502
+ if provided_reader:
503
+ return provided_reader, ""
504
+
505
+ file_extension = file_extension.lower()
506
+ if file_extension == ".csv":
507
+ return self.csv_reader, "data.csv"
508
+ elif file_extension == ".pdf":
509
+ return self.pdf_reader, ""
510
+ elif file_extension == ".docx":
511
+ return self.docx_reader, ""
512
+ elif file_extension == ".pptx":
513
+ return self.pptx_reader, ""
514
+ elif file_extension == ".json":
515
+ return self.json_reader, ""
516
+ elif file_extension == ".markdown":
517
+ return self.markdown_reader, ""
518
+ else:
519
+ return self.text_reader, ""
520
+
521
+ def _select_reader_by_uri(self, uri: str, provided_reader: Optional[Reader] = None) -> Optional[Reader]:
522
+ """
523
+ Select a reader based on URI/file path extension.
524
+
525
+ Args:
526
+ uri: URI or file path
527
+ provided_reader: Optional reader already provided
528
+
529
+ Returns:
530
+ Selected reader or None
531
+ """
532
+ if provided_reader:
533
+ return provided_reader
534
+
535
+ uri_lower = uri.lower()
536
+ if uri_lower.endswith(".pdf"):
537
+ return self.pdf_reader
538
+ elif uri_lower.endswith(".csv"):
539
+ return self.csv_reader
540
+ elif uri_lower.endswith(".docx"):
541
+ return self.docx_reader
542
+ elif uri_lower.endswith(".pptx"):
543
+ return self.pptx_reader
544
+ elif uri_lower.endswith(".json"):
545
+ return self.json_reader
546
+ elif uri_lower.endswith(".markdown"):
547
+ return self.markdown_reader
548
+ else:
549
+ return self.text_reader
550
+
551
+ def _read_with_reader(
552
+ self,
553
+ reader: Reader,
554
+ source: Union[Path, str, BytesIO],
555
+ name: Optional[str] = None,
556
+ password: Optional[str] = None,
557
+ ) -> List[Document]:
558
+ """
559
+ Read content using a reader with optional password handling.
560
+
561
+ Args:
562
+ reader: Reader to use
563
+ source: Source to read from (Path, URL string, or BytesIO)
564
+ name: Optional name for the document
565
+ password: Optional password for protected files
566
+
567
+ Returns:
568
+ List of documents read
569
+ """
570
+ import inspect
571
+
572
+ read_signature = inspect.signature(reader.read)
573
+ if password and "password" in read_signature.parameters:
574
+ if isinstance(source, BytesIO):
575
+ return reader.read(source, name=name, password=password)
576
+ else:
577
+ return reader.read(source, name=name, password=password)
578
+ else:
579
+ if isinstance(source, BytesIO):
580
+ return reader.read(source, name=name)
581
+ else:
582
+ return reader.read(source, name=name)
583
+
584
+ def _prepare_documents_for_insert(
585
+ self,
586
+ documents: List[Document],
587
+ content_id: str,
588
+ calculate_sizes: bool = False,
589
+ metadata: Optional[Dict[str, Any]] = None,
590
+ ) -> List[Document]:
591
+ """
592
+ Prepare documents for insertion by assigning content_id and optionally calculating sizes and updating metadata.
593
+
594
+ Args:
595
+ documents: List of documents to prepare
596
+ content_id: Content ID to assign to documents
597
+ calculate_sizes: Whether to calculate document sizes
598
+ metadata: Optional metadata to merge into document metadata
599
+
600
+ Returns:
601
+ List of prepared documents
602
+ """
603
+ for document in documents:
604
+ document.content_id = content_id
605
+ if calculate_sizes and document.content and not document.size:
606
+ document.size = len(document.content.encode("utf-8"))
607
+ if metadata:
608
+ document.meta_data.update(metadata)
609
+ return documents
610
+
611
+ def _chunk_documents_sync(self, reader: Reader, documents: List[Document]) -> List[Document]:
612
+ """
613
+ Chunk documents synchronously.
614
+
615
+ Args:
616
+ reader: Reader with chunking strategy
617
+ documents: Documents to chunk
618
+
619
+ Returns:
620
+ List of chunked documents
621
+ """
622
+ if not reader or reader.chunk:
623
+ return documents
624
+
625
+ chunked_documents = []
626
+ for doc in documents:
627
+ chunked_documents.extend(reader.chunk_document(doc))
628
+ return chunked_documents
629
+
630
+ async def _load_from_path_async(
388
631
  self,
389
632
  content: Content,
390
633
  upsert: bool,
@@ -403,7 +646,7 @@ class Knowledge:
403
646
  if self._should_include_file(str(path), include, exclude):
404
647
  log_debug(f"Adding file {path} due to include/exclude filters")
405
648
 
406
- await self._add_to_contents_db(content)
649
+ await self._add_to_contents_db_async(content)
407
650
  if self._should_skip(content.content_hash, skip_if_exists): # type: ignore[arg-type]
408
651
  content.status = ContentStatus.COMPLETED
409
652
  await self._aupdate_content(content)
@@ -411,7 +654,90 @@ class Knowledge:
411
654
 
412
655
  # Handle LightRAG special case - read file and upload directly
413
656
  if self.vector_db.__class__.__name__ == "LightRag":
414
- await self._process_lightrag_content(content, KnowledgeContentOrigin.PATH)
657
+ await self._process_lightrag_content_async(content, KnowledgeContentOrigin.PATH)
658
+ return
659
+
660
+ if content.reader:
661
+ reader = content.reader
662
+ else:
663
+ reader = ReaderFactory.get_reader_for_extension(path.suffix)
664
+ log_debug(f"Using Reader: {reader.__class__.__name__}")
665
+
666
+ if reader:
667
+ password = content.auth.password if content.auth and content.auth.password else None
668
+ read_documents = self._read_with_reader(
669
+ reader, path, name=content.name or path.name, password=password
670
+ )
671
+ else:
672
+ read_documents = []
673
+
674
+ if not content.file_type:
675
+ content.file_type = path.suffix
676
+
677
+ if not content.size and content.file_data:
678
+ content.size = len(content.file_data.content) # type: ignore
679
+ if not content.size:
680
+ try:
681
+ content.size = path.stat().st_size
682
+ except (OSError, IOError) as e:
683
+ log_warning(f"Could not get file size for {path}: {e}")
684
+ content.size = 0
685
+
686
+ if not content.id:
687
+ content.id = generate_id(content.content_hash or "")
688
+ self._prepare_documents_for_insert(read_documents, content.id)
689
+
690
+ await self._handle_vector_db_insert_async(content, read_documents, upsert)
691
+
692
+ elif path.is_dir():
693
+ for file_path in path.iterdir():
694
+ # Apply include/exclude filtering
695
+ if not self._should_include_file(str(file_path), include, exclude):
696
+ log_debug(f"Skipping file {file_path} due to include/exclude filters")
697
+ continue
698
+
699
+ file_content = Content(
700
+ name=content.name,
701
+ path=str(file_path),
702
+ metadata=content.metadata,
703
+ description=content.description,
704
+ reader=content.reader,
705
+ )
706
+ file_content.content_hash = self._build_content_hash(file_content)
707
+ file_content.id = generate_id(file_content.content_hash)
708
+
709
+ await self._load_from_path_async(file_content, upsert, skip_if_exists, include, exclude)
710
+ else:
711
+ log_warning(f"Invalid path: {path}")
712
+
713
+ def _load_from_path(
714
+ self,
715
+ content: Content,
716
+ upsert: bool,
717
+ skip_if_exists: bool,
718
+ include: Optional[List[str]] = None,
719
+ exclude: Optional[List[str]] = None,
720
+ ):
721
+ from agno.vectordb import VectorDb
722
+
723
+ self.vector_db = cast(VectorDb, self.vector_db)
724
+
725
+ log_info(f"Adding content from path, {content.id}, {content.name}, {content.path}, {content.description}")
726
+ path = Path(content.path) # type: ignore
727
+
728
+ if path.is_file():
729
+ if self._should_include_file(str(path), include, exclude):
730
+ log_debug(f"Adding file {path} due to include/exclude filters")
731
+
732
+ self._add_to_contents_db(content)
733
+ if self._should_skip(content.content_hash, skip_if_exists): # type: ignore[arg-type]
734
+ content.status = ContentStatus.COMPLETED
735
+ self._update_content(content)
736
+ return
737
+
738
+ # Handle LightRAG special case - read file and upload directly
739
+ if self.vector_db.__class__.__name__ == "LightRag":
740
+ self._process_lightrag_content(content, KnowledgeContentOrigin.PATH)
415
741
  return
416
742
 
417
743
  if content.reader:
@@ -453,10 +779,11 @@ class Knowledge:
453
779
  log_warning(f"Could not get file size for {path}: {e}")
454
780
  content.size = 0
455
781
 
456
- for read_document in read_documents:
457
- read_document.content_id = content.id
782
+ if not content.id:
783
+ content.id = generate_id(content.content_hash or "")
784
+ self._prepare_documents_for_insert(read_documents, content.id)
458
785
 
459
- await self._handle_vector_db_insert(content, read_documents, upsert)
786
+ self._handle_vector_db_insert(content, read_documents, upsert)
460
787
 
461
788
  elif path.is_dir():
462
789
  for file_path in path.iterdir():
@@ -475,11 +802,11 @@ class Knowledge:
475
802
  file_content.content_hash = self._build_content_hash(file_content)
476
803
  file_content.id = generate_id(file_content.content_hash)
477
804
 
478
- await self._load_from_path(file_content, upsert, skip_if_exists, include, exclude)
805
+ self._load_from_path(file_content, upsert, skip_if_exists, include, exclude)
479
806
  else:
480
807
  log_warning(f"Invalid path: {path}")
481
808
 
482
- async def _load_from_url(
809
+ async def _load_from_url_async(
483
810
  self,
484
811
  content: Content,
485
812
  upsert: bool,
@@ -503,14 +830,14 @@ class Knowledge:
503
830
  raise ValueError("No url provided")
504
831
 
505
832
  # 1. Add content to contents database
506
- await self._add_to_contents_db(content)
833
+ await self._add_to_contents_db_async(content)
507
834
  if self._should_skip(content.content_hash, skip_if_exists): # type: ignore[arg-type]
508
835
  content.status = ContentStatus.COMPLETED
509
836
  await self._aupdate_content(content)
510
837
  return
511
838
 
512
839
  if self.vector_db.__class__.__name__ == "LightRag":
513
- await self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
840
+ await self._process_lightrag_content_async(content, KnowledgeContentOrigin.URL)
514
841
  return
515
842
 
516
843
  # 2. Validate URL
@@ -540,47 +867,25 @@ class Knowledge:
540
867
  bytes_content = BytesIO(response.content)
541
868
 
542
869
  # 4. Select reader
543
- # If a reader was provided by the user, use it
544
- reader = content.reader
545
870
  name = content.name if content.name else content.url
546
- # Else select based on file extension
547
- if reader is None:
548
- if file_extension == ".csv":
549
- name = basename(parsed_url.path) or "data.csv"
550
- reader = self.csv_reader
551
- elif file_extension == ".pdf":
552
- reader = self.pdf_reader
553
- elif file_extension == ".docx":
554
- reader = self.docx_reader
555
- elif file_extension == ".pptx":
556
- reader = self.pptx_reader
557
- elif file_extension == ".json":
558
- reader = self.json_reader
559
- elif file_extension == ".markdown":
560
- reader = self.markdown_reader
561
- else:
562
- reader = self.text_reader
871
+ if file_extension:
872
+ reader, default_name = self._select_reader_by_extension(file_extension, content.reader)
873
+ if default_name and file_extension == ".csv":
874
+ name = basename(parsed_url.path) or default_name
875
+ else:
876
+ reader = content.reader or self.website_reader
563
877
 
564
878
  # 5. Read content
565
879
  try:
566
880
  read_documents = []
567
881
  if reader is not None:
568
- # TODO: We will refactor this to eventually pass authorization to all readers
569
- import inspect
570
-
571
- read_signature = inspect.signature(reader.read)
882
+ # Special handling for YouTubeReader
572
883
  if reader.__class__.__name__ == "YouTubeReader":
573
884
  read_documents = reader.read(content.url, name=name)
574
- elif "password" in read_signature.parameters and content.auth and content.auth.password:
575
- if bytes_content:
576
- read_documents = reader.read(bytes_content, name=name, password=content.auth.password)
577
- else:
578
- read_documents = reader.read(content.url, name=name, password=content.auth.password)
579
885
  else:
580
- if bytes_content:
581
- read_documents = reader.read(bytes_content, name=name)
582
- else:
583
- read_documents = reader.read(content.url, name=name)
886
+ password = content.auth.password if content.auth and content.auth.password else None
887
+ source = bytes_content if bytes_content else content.url
888
+ read_documents = self._read_with_reader(reader, source, name=name, password=password)
584
889
 
585
890
  except Exception as e:
586
891
  log_error(f"Error reading URL: {content.url} - {str(e)}")
@@ -593,19 +898,115 @@ class Knowledge:
593
898
  if reader and not reader.chunk:
594
899
  read_documents = await reader.chunk_documents_async(read_documents)
595
900
  # 7. Prepare and insert the content in the vector database
596
- file_size = 0
597
- if read_documents:
598
- for read_document in read_documents:
599
- if read_document.size:
600
- file_size += read_document.size
601
- read_document.content_id = content.id
602
- await self._handle_vector_db_insert(content, read_documents, upsert)
603
-
604
- async def _load_from_content(
901
+ if not content.id:
902
+ content.id = generate_id(content.content_hash or "")
903
+ self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
904
+ await self._handle_vector_db_insert_async(content, read_documents, upsert)
905
+
906
+ def _load_from_url(
605
907
  self,
606
908
  content: Content,
607
- upsert: bool = True,
608
- skip_if_exists: bool = False,
909
+ upsert: bool,
910
+ skip_if_exists: bool,
911
+ ):
912
+ """Synchronous version of _load_from_url.
913
+
914
+ Load the content from a URL:
915
+ 1. Set content hash
916
+ 2. Validate the URL
917
+ 3. Read the content
918
+ 4. Prepare and insert the content in the vector database
919
+ """
920
+ from agno.utils.http import fetch_with_retry
921
+ from agno.vectordb import VectorDb
922
+
923
+ self.vector_db = cast(VectorDb, self.vector_db)
924
+
925
+ log_info(f"Adding content from URL {content.url}")
926
+ content.file_type = "url"
927
+
928
+ if not content.url:
929
+ raise ValueError("No url provided")
930
+
931
+ # 1. Add content to contents database
932
+ self._add_to_contents_db(content)
933
+ if self._should_skip(content.content_hash, skip_if_exists): # type: ignore[arg-type]
934
+ content.status = ContentStatus.COMPLETED
935
+ self._update_content(content)
936
+ return
937
+
938
+ if self.vector_db.__class__.__name__ == "LightRag":
939
+ self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
940
+ return
941
+
942
+ # 2. Validate URL
943
+ try:
944
+ from urllib.parse import urlparse
945
+
946
+ parsed_url = urlparse(content.url)
947
+ if not all([parsed_url.scheme, parsed_url.netloc]):
948
+ content.status = ContentStatus.FAILED
949
+ content.status_message = f"Invalid URL format: {content.url}"
950
+ self._update_content(content)
951
+ log_warning(f"Invalid URL format: {content.url}")
952
+ except Exception as e:
953
+ content.status = ContentStatus.FAILED
954
+ content.status_message = f"Invalid URL: {content.url} - {str(e)}"
955
+ self._update_content(content)
956
+ log_warning(f"Invalid URL: {content.url} - {str(e)}")
957
+
958
+ # 3. Fetch and load content if file has an extension
959
+ url_path = Path(parsed_url.path)
960
+ file_extension = url_path.suffix.lower()
961
+
962
+ bytes_content = None
963
+ if file_extension:
964
+ response = fetch_with_retry(content.url)
965
+ bytes_content = BytesIO(response.content)
966
+
967
+ # 4. Select reader
968
+ name = content.name if content.name else content.url
969
+ if file_extension:
970
+ reader, default_name = self._select_reader_by_extension(file_extension, content.reader)
971
+ if default_name and file_extension == ".csv":
972
+ name = basename(parsed_url.path) or default_name
973
+ else:
974
+ reader = content.reader or self.website_reader
975
+
976
+ # 5. Read content
977
+ try:
978
+ read_documents = []
979
+ if reader is not None:
980
+ # Special handling for YouTubeReader
981
+ if reader.__class__.__name__ == "YouTubeReader":
982
+ read_documents = reader.read(content.url, name=name)
983
+ else:
984
+ password = content.auth.password if content.auth and content.auth.password else None
985
+ source = bytes_content if bytes_content else content.url
986
+ read_documents = self._read_with_reader(reader, source, name=name, password=password)
987
+
988
+ except Exception as e:
989
+ log_error(f"Error reading URL: {content.url} - {str(e)}")
990
+ content.status = ContentStatus.FAILED
991
+ content.status_message = f"Error reading URL: {content.url} - {str(e)}"
992
+ self._update_content(content)
993
+ return
994
+
995
+ # 6. Chunk documents if needed (sync version)
996
+ if reader:
997
+ read_documents = self._chunk_documents_sync(reader, read_documents)
998
+
999
+ # 7. Prepare and insert the content in the vector database
1000
+ if not content.id:
1001
+ content.id = generate_id(content.content_hash or "")
1002
+ self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
1003
+ self._handle_vector_db_insert(content, read_documents, upsert)
1004
+
1005
+ async def _load_from_content_async(
1006
+ self,
1007
+ content: Content,
1008
+ upsert: bool = True,
1009
+ skip_if_exists: bool = False,
609
1010
  ):
610
1011
  from agno.vectordb import VectorDb
611
1012
 
@@ -632,14 +1033,14 @@ class Knowledge:
632
1033
 
633
1034
  log_info(f"Adding content from {content.name}")
634
1035
 
635
- await self._add_to_contents_db(content)
1036
+ await self._add_to_contents_db_async(content)
636
1037
  if self._should_skip(content.content_hash, skip_if_exists): # type: ignore[arg-type]
637
1038
  content.status = ContentStatus.COMPLETED
638
1039
  await self._aupdate_content(content)
639
1040
  return
640
1041
 
641
1042
  if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
642
- await self._process_lightrag_content(content, KnowledgeContentOrigin.CONTENT)
1043
+ await self._process_lightrag_content_async(content, KnowledgeContentOrigin.CONTENT)
643
1044
  return
644
1045
 
645
1046
  read_documents = []
@@ -679,10 +1080,9 @@ class Knowledge:
679
1080
  reader = self._select_reader(content.file_data.type)
680
1081
  name = content.name if content.name else f"content_{content.file_data.type}"
681
1082
  read_documents = reader.read(content_io, name=name)
682
- for read_document in read_documents:
683
- if content.metadata:
684
- read_document.meta_data.update(content.metadata)
685
- read_document.content_id = content.id
1083
+ if not content.id:
1084
+ content.id = generate_id(content.content_hash or "")
1085
+ self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)
686
1086
 
687
1087
  if len(read_documents) == 0:
688
1088
  content.status = ContentStatus.FAILED
@@ -696,9 +1096,106 @@ class Knowledge:
696
1096
  await self._aupdate_content(content)
697
1097
  return
698
1098
 
699
- await self._handle_vector_db_insert(content, read_documents, upsert)
1099
+ await self._handle_vector_db_insert_async(content, read_documents, upsert)
1100
+
1101
+ def _load_from_content(
1102
+ self,
1103
+ content: Content,
1104
+ upsert: bool = True,
1105
+ skip_if_exists: bool = False,
1106
+ ):
1107
+ """Synchronous version of _load_from_content."""
1108
+ from agno.vectordb import VectorDb
1109
+
1110
+ self.vector_db = cast(VectorDb, self.vector_db)
1111
+
1112
+ if content.name:
1113
+ name = content.name
1114
+ elif content.file_data and content.file_data.content:
1115
+ if isinstance(content.file_data.content, bytes):
1116
+ name = content.file_data.content[:10].decode("utf-8", errors="ignore")
1117
+ elif isinstance(content.file_data.content, str):
1118
+ name = (
1119
+ content.file_data.content[:10]
1120
+ if len(content.file_data.content) >= 10
1121
+ else content.file_data.content
1122
+ )
1123
+ else:
1124
+ name = str(content.file_data.content)[:10]
1125
+ else:
1126
+ name = None
1127
+
1128
+ if name is not None:
1129
+ content.name = name
1130
+
1131
+ log_info(f"Adding content from {content.name}")
1132
+
1133
+ self._add_to_contents_db(content)
1134
+ if self._should_skip(content.content_hash, skip_if_exists): # type: ignore[arg-type]
1135
+ content.status = ContentStatus.COMPLETED
1136
+ self._update_content(content)
1137
+ return
1138
+
1139
+ if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
1140
+ self._process_lightrag_content(content, KnowledgeContentOrigin.CONTENT)
1141
+ return
1142
+
1143
+ read_documents = []
1144
+
1145
+ if isinstance(content.file_data, str):
1146
+ content_bytes = content.file_data.encode("utf-8", errors="replace")
1147
+ content_io = io.BytesIO(content_bytes)
1148
+
1149
+ if content.reader:
1150
+ log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
1151
+ read_documents = content.reader.read(content_io, name=name)
1152
+ else:
1153
+ text_reader = self.text_reader
1154
+ if text_reader:
1155
+ read_documents = text_reader.read(content_io, name=name)
1156
+ else:
1157
+ content.status = ContentStatus.FAILED
1158
+ content.status_message = "Text reader not available"
1159
+ self._update_content(content)
1160
+ return
700
1161
 
701
- async def _load_from_topics(
1162
+ elif isinstance(content.file_data, FileData):
1163
+ if content.file_data.type:
1164
+ if isinstance(content.file_data.content, bytes):
1165
+ content_io = io.BytesIO(content.file_data.content)
1166
+ elif isinstance(content.file_data.content, str):
1167
+ content_bytes = content.file_data.content.encode("utf-8", errors="replace")
1168
+ content_io = io.BytesIO(content_bytes)
1169
+ else:
1170
+ content_io = content.file_data.content # type: ignore
1171
+
1172
+ # Respect an explicitly provided reader; otherwise select based on file type
1173
+ if content.reader:
1174
+ log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
1175
+ reader = content.reader
1176
+ else:
1177
+ reader = self._select_reader(content.file_data.type)
1178
+ name = content.name if content.name else f"content_{content.file_data.type}"
1179
+ read_documents = reader.read(content_io, name=name)
1180
+ if not content.id:
1181
+ content.id = generate_id(content.content_hash or "")
1182
+ self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)
1183
+
1184
+ if len(read_documents) == 0:
1185
+ content.status = ContentStatus.FAILED
1186
+ content.status_message = "Content could not be read"
1187
+ self._update_content(content)
1188
+ return
1189
+
1190
+ else:
1191
+ content.status = ContentStatus.FAILED
1192
+ content.status_message = "No content provided"
1193
+ self._update_content(content)
1194
+ return
1195
+
1196
+ self._handle_vector_db_insert(content, read_documents, upsert)
1197
+
1198
+ async def _load_from_topics_async(
702
1199
  self,
703
1200
  content: Content,
704
1201
  upsert: bool,
@@ -727,47 +1224,264 @@ class Knowledge:
727
1224
  content.content_hash = self._build_content_hash(content)
728
1225
  content.id = generate_id(content.content_hash)
729
1226
 
730
- await self._add_to_contents_db(content)
1227
+ await self._add_to_contents_db_async(content)
731
1228
  if self._should_skip(content.content_hash, skip_if_exists):
732
1229
  content.status = ContentStatus.COMPLETED
733
1230
  await self._aupdate_content(content)
734
1231
  return
735
1232
 
736
- if self.vector_db.__class__.__name__ == "LightRag":
737
- await self._process_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
1233
+ if self.vector_db.__class__.__name__ == "LightRag":
1234
+ await self._process_lightrag_content_async(content, KnowledgeContentOrigin.TOPIC)
1235
+ return
1236
+
1237
+ if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
1238
+ log_info(f"Content {content.content_hash} already exists, skipping")
1239
+ continue
1240
+
1241
+ await self._add_to_contents_db_async(content)
1242
+ if content.reader is None:
1243
+ log_error(f"No reader available for topic: {topic}")
1244
+ content.status = ContentStatus.FAILED
1245
+ content.status_message = "No reader available for topic"
1246
+ await self._aupdate_content(content)
1247
+ continue
1248
+
1249
+ read_documents = content.reader.read(topic)
1250
+ if len(read_documents) > 0:
1251
+ self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
1252
+ else:
1253
+ content.status = ContentStatus.FAILED
1254
+ content.status_message = "No content found for topic"
1255
+ await self._aupdate_content(content)
1256
+
1257
+ await self._handle_vector_db_insert_async(content, read_documents, upsert)
1258
+
1259
def _load_from_topics(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
):
    """Synchronously load one knowledge entry per topic of ``content``.

    Synchronous counterpart of ``_load_from_topics_async``. For every topic a
    dedicated ``Content`` entry is built (inheriting the metadata and reader
    of ``content``), registered in the contents database, read with the
    reader and handed to the vector database.

    Every topic is processed independently: skipping or failing one topic
    never prevents the remaining topics from being loaded.

    Args:
        content: Parent content whose ``topics``, ``metadata`` and ``reader``
            are used to build the per-topic entries.
        upsert: Forwarded to ``_handle_vector_db_insert``.
        skip_if_exists: Skip topics whose content hash is already known.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)
    log_info(f"Adding content from topics: {content.topics}")

    if content.topics is None:
        log_warning("No topics provided for content")
        return

    # The backend does not change while iterating, so resolve it once.
    is_lightrag = self.vector_db.__class__.__name__ == "LightRag"

    for topic in content.topics:
        # Use a separate name so the ``content`` parameter is not shadowed.
        topic_content = Content(
            name=topic,
            metadata=content.metadata,
            reader=content.reader,
            status=ContentStatus.PROCESSING if content.reader else ContentStatus.FAILED,
            file_data=FileData(
                type="Topic",
            ),
            topics=[topic],
        )
        topic_content.content_hash = self._build_content_hash(topic_content)
        topic_content.id = generate_id(topic_content.content_hash)

        # Register the entry once; later steps only update its status.
        self._add_to_contents_db(topic_content)
        if self._should_skip(topic_content.content_hash, skip_if_exists):
            topic_content.status = ContentStatus.COMPLETED
            self._update_content(topic_content)
            # Move on to the next topic instead of abandoning the rest.
            continue

        if is_lightrag:
            self._process_lightrag_content(topic_content, KnowledgeContentOrigin.TOPIC)
            continue

        if self.vector_db and self.vector_db.content_hash_exists(topic_content.content_hash) and skip_if_exists:
            log_info(f"Content {topic_content.content_hash} already exists, skipping")
            continue

        if topic_content.reader is None:
            log_error(f"No reader available for topic: {topic}")
            topic_content.status = ContentStatus.FAILED
            topic_content.status_message = "No reader available for topic"
            self._update_content(topic_content)
            continue

        read_documents = topic_content.reader.read(topic)
        if not read_documents:
            topic_content.status = ContentStatus.FAILED
            topic_content.status_message = "No content found for topic"
            self._update_content(topic_content)
            # Do not fall through to the insert: it would overwrite the
            # FAILED status recorded above.
            continue

        self._prepare_documents_for_insert(read_documents, topic_content.id, calculate_sizes=True)
        self._handle_vector_db_insert(topic_content, read_documents, upsert)
1320
+
1321
async def _load_from_remote_content_async(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
):
    """Route remote content to the loader that matches its storage type.

    ``S3Content`` is handled by ``_load_from_s3_async`` and ``GCSContent`` by
    ``_load_from_gcs_async``. A missing or unsupported remote content only
    produces a warning.
    """
    source = content.remote_content
    if source is None:
        log_warning("No remote content provided for content")
        return

    if isinstance(source, S3Content):
        await self._load_from_s3_async(content, upsert, skip_if_exists)
        return

    if isinstance(source, GCSContent):
        await self._load_from_gcs_async(content, upsert, skip_if_exists)
        return

    log_warning(f"Unsupported remote content type: {type(source)}")
1341
+
1342
async def _load_from_s3_async(self, content: Content, upsert: bool, skip_if_exists: bool):
    """Load the contextual S3 content.

    1. Identify objects to read
    2. Setup Content object
    3. Hash content and add it to the contents database
    4. Select reader
    5. Fetch and load the content
    6. Read the content
    7. Prepare and insert the content in the vector database
    8. Remove temporary file if needed

    Each S3 object gets its own ``Content`` entry. Objects are processed
    independently, so an object that is skipped does not stop the remaining
    objects from being loaded.

    Args:
        content: Parent content; its ``remote_content`` must be an ``S3Content``.
        upsert: Forwarded to ``_handle_vector_db_insert_async``.
        skip_if_exists: Skip objects whose content hash is already known.
    """
    from agno.cloud.aws.s3.object import S3Object

    remote_content: S3Content = cast(S3Content, content.remote_content)

    # 1. Identify objects to read
    objects_to_read: List[S3Object] = []
    if remote_content.bucket is not None:
        if remote_content.key is not None:
            _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
            objects_to_read.append(_object)
        elif remote_content.object is not None:
            objects_to_read.append(remote_content.object)
        elif remote_content.prefix is not None:
            objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
        else:
            objects_to_read.extend(remote_content.bucket.get_objects())

    for s3_object in objects_to_read:
        # 2. Setup Content object
        content_name = content.name or ""
        content_name += "_" + (s3_object.name or "")
        content_entry = Content(
            name=content_name,
            description=content.description,
            status=ContentStatus.PROCESSING,
            metadata=content.metadata,
            file_type="s3",
        )

        # 3. Hash content and add it to the contents database
        content_entry.content_hash = self._build_content_hash(content_entry)
        content_entry.id = generate_id(content_entry.content_hash)
        await self._add_to_contents_db_async(content_entry)
        if self._should_skip(content_entry.content_hash, skip_if_exists):
            content_entry.status = ContentStatus.COMPLETED
            await self._aupdate_content(content_entry)
            # Only this object is skipped; keep processing the others.
            continue

        # 4. Select reader
        reader = self._select_reader_by_uri(s3_object.uri, content.reader)
        reader = cast(Reader, reader)

        # 5. Fetch and load the content
        temporary_file: Optional[Path] = None
        obj_name = content_name or s3_object.name.split("/")[-1]
        readable_content: Optional[Union[BytesIO, Path]] = None
        try:
            if s3_object.uri.endswith(".pdf"):
                readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
            else:
                temporary_file = Path("storage").joinpath(obj_name)
                readable_content = temporary_file
                s3_object.download(readable_content)  # type: ignore

            # 6. Read the content
            read_documents = reader.read(readable_content, name=obj_name)

            # 7. Prepare and insert the content in the vector database.
            # Documents are linked to the per-object entry, whose id is the
            # one stored in the contents database in step 3.
            self._prepare_documents_for_insert(read_documents, content_entry.id)
            await self._handle_vector_db_insert_async(content_entry, read_documents, upsert)
        finally:
            # 8. Remove temporary file if needed, even when reading or
            # inserting raised. The existence check keeps a failed download
            # from masking the original error with FileNotFoundError.
            if temporary_file is not None and temporary_file.exists():
                temporary_file.unlink()
1419
+
1420
+ async def _load_from_gcs_async(self, content: Content, upsert: bool, skip_if_exists: bool):
1421
+ """Load the contextual GCS content.
1422
+
1423
+ 1. Identify objects to read
1424
+ 2. Setup Content object
1425
+ 3. Hash content and add it to the contents database
1426
+ 4. Select reader
1427
+ 5. Fetch and load the content
1428
+ 6. Read the content
1429
+ 7. Prepare and insert the content in the vector database
1430
+ """
1431
+ remote_content: GCSContent = cast(GCSContent, content.remote_content)
1432
+
1433
+ # 1. Identify objects to read
1434
+ objects_to_read = []
1435
+ if remote_content.blob_name is not None:
1436
+ objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name)) # type: ignore
1437
+ elif remote_content.prefix is not None:
1438
+ objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix)) # type: ignore
1439
+ else:
1440
+ objects_to_read.extend(remote_content.bucket.list_blobs()) # type: ignore
1441
+
1442
+ for gcs_object in objects_to_read:
1443
+ # 2. Setup Content object
1444
+ name = (content.name or "content") + "_" + gcs_object.name
1445
+ content_entry = Content(
1446
+ name=name,
1447
+ description=content.description,
1448
+ status=ContentStatus.PROCESSING,
1449
+ metadata=content.metadata,
1450
+ file_type="gcs",
1451
+ )
1452
+
1453
+ # 3. Hash content and add it to the contents database
1454
+ content_entry.content_hash = self._build_content_hash(content_entry)
1455
+ content_entry.id = generate_id(content_entry.content_hash)
1456
+ await self._add_to_contents_db_async(content_entry)
1457
+ if self._should_skip(content_entry.content_hash, skip_if_exists):
1458
+ content_entry.status = ContentStatus.COMPLETED
1459
+ await self._aupdate_content(content_entry)
738
1460
  return
739
1461
 
740
- if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
741
- log_info(f"Content {content.content_hash} already exists, skipping")
742
- continue
1462
+ # 4. Select reader
1463
+ reader = self._select_reader_by_uri(gcs_object.name, content.reader)
1464
+ reader = cast(Reader, reader)
743
1465
 
744
- await self._add_to_contents_db(content)
745
- if content.reader is None:
746
- log_error(f"No reader available for topic: {topic}")
747
- content.status = ContentStatus.FAILED
748
- content.status_message = "No reader available for topic"
749
- await self._aupdate_content(content)
750
- continue
1466
+ # 5. Fetch and load the content
1467
+ readable_content = BytesIO(gcs_object.download_as_bytes())
751
1468
 
752
- read_documents = content.reader.read(topic)
753
- if len(read_documents) > 0:
754
- for read_document in read_documents:
755
- read_document.content_id = content.id
756
- if read_document.content:
757
- read_document.size = len(read_document.content.encode("utf-8"))
758
- else:
759
- content.status = ContentStatus.FAILED
760
- content.status_message = "No content found for topic"
761
- await self._aupdate_content(content)
1469
+ # 6. Read the content
1470
+ read_documents = reader.read(readable_content, name=name)
762
1471
 
763
- await self._handle_vector_db_insert(content, read_documents, upsert)
1472
+ # 7. Prepare and insert the content in the vector database
1473
+ if not content.id:
1474
+ content.id = generate_id(content.content_hash or "")
1475
+ self._prepare_documents_for_insert(read_documents, content.id)
1476
+ await self._handle_vector_db_insert_async(content_entry, read_documents, upsert)
764
1477
 
765
- async def _load_from_remote_content(
1478
+ def _load_from_remote_content(
766
1479
  self,
767
1480
  content: Content,
768
1481
  upsert: bool,
769
1482
  skip_if_exists: bool,
770
1483
  ):
1484
+ """Synchronous version of _load_from_remote_content."""
771
1485
  if content.remote_content is None:
772
1486
  log_warning("No remote content provided for content")
773
1487
  return
@@ -775,17 +1489,18 @@ class Knowledge:
775
1489
  remote_content = content.remote_content
776
1490
 
777
1491
  if isinstance(remote_content, S3Content):
778
- await self._load_from_s3(content, upsert, skip_if_exists)
1492
+ self._load_from_s3(content, upsert, skip_if_exists)
779
1493
 
780
1494
  elif isinstance(remote_content, GCSContent):
781
- await self._load_from_gcs(content, upsert, skip_if_exists)
1495
+ self._load_from_gcs(content, upsert, skip_if_exists)
782
1496
 
783
1497
  else:
784
1498
  log_warning(f"Unsupported remote content type: {type(remote_content)}")
785
1499
 
786
- async def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
787
- """Load the contextual S3 content.
1500
+ def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
1501
+ """Synchronous version of _load_from_s3.
788
1502
 
1503
+ Load the contextual S3 content:
789
1504
  1. Identify objects to read
790
1505
  2. Setup Content object
791
1506
  3. Hash content and add it to the contents database
@@ -827,29 +1542,14 @@ class Knowledge:
827
1542
  # 3. Hash content and add it to the contents database
828
1543
  content_entry.content_hash = self._build_content_hash(content_entry)
829
1544
  content_entry.id = generate_id(content_entry.content_hash)
830
- await self._add_to_contents_db(content_entry)
1545
+ self._add_to_contents_db(content_entry)
831
1546
  if self._should_skip(content_entry.content_hash, skip_if_exists):
832
1547
  content_entry.status = ContentStatus.COMPLETED
833
- await self._aupdate_content(content_entry)
1548
+ self._update_content(content_entry)
834
1549
  return
835
1550
 
836
1551
  # 4. Select reader
837
- reader = content.reader
838
- if reader is None:
839
- if s3_object.uri.endswith(".pdf"):
840
- reader = self.pdf_reader
841
- elif s3_object.uri.endswith(".csv"):
842
- reader = self.csv_reader
843
- elif s3_object.uri.endswith(".docx"):
844
- reader = self.docx_reader
845
- elif s3_object.uri.endswith(".pptx"):
846
- reader = self.pptx_reader
847
- elif s3_object.uri.endswith(".json"):
848
- reader = self.json_reader
849
- elif s3_object.uri.endswith(".markdown"):
850
- reader = self.markdown_reader
851
- else:
852
- reader = self.text_reader
1552
+ reader = self._select_reader_by_uri(s3_object.uri, content.reader)
853
1553
  reader = cast(Reader, reader)
854
1554
 
855
1555
  # 5. Fetch and load the content
@@ -867,17 +1567,19 @@ class Knowledge:
867
1567
  read_documents = reader.read(readable_content, name=obj_name)
868
1568
 
869
1569
  # 7. Prepare and insert the content in the vector database
870
- for read_document in read_documents:
871
- read_document.content_id = content.id
872
- await self._handle_vector_db_insert(content_entry, read_documents, upsert)
1570
+ if not content.id:
1571
+ content.id = generate_id(content.content_hash or "")
1572
+ self._prepare_documents_for_insert(read_documents, content.id)
1573
+ self._handle_vector_db_insert(content_entry, read_documents, upsert)
873
1574
 
874
1575
  # 8. Remove temporary file if needed
875
1576
  if temporary_file:
876
1577
  temporary_file.unlink()
877
1578
 
878
- async def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
879
- """Load the contextual GCS content.
1579
+ def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
1580
+ """Synchronous version of _load_from_gcs.
880
1581
 
1582
+ Load the contextual GCS content:
881
1583
  1. Identify objects to read
882
1584
  2. Setup Content object
883
1585
  3. Hash content and add it to the contents database
@@ -911,29 +1613,14 @@ class Knowledge:
911
1613
  # 3. Hash content and add it to the contents database
912
1614
  content_entry.content_hash = self._build_content_hash(content_entry)
913
1615
  content_entry.id = generate_id(content_entry.content_hash)
914
- await self._add_to_contents_db(content_entry)
1616
+ self._add_to_contents_db(content_entry)
915
1617
  if self._should_skip(content_entry.content_hash, skip_if_exists):
916
1618
  content_entry.status = ContentStatus.COMPLETED
917
- await self._aupdate_content(content_entry)
1619
+ self._update_content(content_entry)
918
1620
  return
919
1621
 
920
1622
  # 4. Select reader
921
- reader = content.reader
922
- if reader is None:
923
- if gcs_object.name.endswith(".pdf"):
924
- reader = self.pdf_reader
925
- elif gcs_object.name.endswith(".csv"):
926
- reader = self.csv_reader
927
- elif gcs_object.name.endswith(".docx"):
928
- reader = self.docx_reader
929
- elif gcs_object.name.endswith(".pptx"):
930
- reader = self.pptx_reader
931
- elif gcs_object.name.endswith(".json"):
932
- reader = self.json_reader
933
- elif gcs_object.name.endswith(".markdown"):
934
- reader = self.markdown_reader
935
- else:
936
- reader = self.text_reader
1623
+ reader = self._select_reader_by_uri(gcs_object.name, content.reader)
937
1624
  reader = cast(Reader, reader)
938
1625
 
939
1626
  # 5. Fetch and load the content
@@ -943,11 +1630,12 @@ class Knowledge:
943
1630
  read_documents = reader.read(readable_content, name=name)
944
1631
 
945
1632
  # 7. Prepare and insert the content in the vector database
946
- for read_document in read_documents:
947
- read_document.content_id = content.id
948
- await self._handle_vector_db_insert(content_entry, read_documents, upsert)
1633
+ if not content.id:
1634
+ content.id = generate_id(content.content_hash or "")
1635
+ self._prepare_documents_for_insert(read_documents, content.id)
1636
+ self._handle_vector_db_insert(content_entry, read_documents, upsert)
949
1637
 
950
- async def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
1638
+ async def _handle_vector_db_insert_async(self, content: Content, read_documents, upsert):
951
1639
  from agno.vectordb import VectorDb
952
1640
 
953
1641
  self.vector_db = cast(VectorDb, self.vector_db)
@@ -985,7 +1673,70 @@ class Knowledge:
985
1673
  content.status = ContentStatus.COMPLETED
986
1674
  await self._aupdate_content(content)
987
1675
 
988
- async def _load_content(
1676
def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
    """Write ``read_documents`` to the vector database and record the outcome.

    Uses ``upsert`` when it was requested and the vector database reports it
    as available, otherwise a plain ``insert``. The content is marked
    COMPLETED on success and FAILED (with a status message) when no vector
    database is configured or the write raises; in every case the new status
    is persisted through ``_update_content``.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    def _record_failure(reason: str) -> None:
        # Persist a FAILED status together with its explanation.
        content.status = ContentStatus.FAILED
        content.status_message = reason
        self._update_content(content)

    if not self.vector_db:
        log_error("No vector database configured")
        _record_failure("No vector database configured")
        return

    use_upsert = self.vector_db.upsert_available() and upsert
    try:
        if use_upsert:
            self.vector_db.upsert(content.content_hash, read_documents, content.metadata)  # type: ignore[arg-type]
        else:
            self.vector_db.insert(
                content.content_hash,  # type: ignore[arg-type]
                documents=read_documents,
                filters=content.metadata,  # type: ignore[arg-type]
            )
    except Exception as exc:
        if use_upsert:
            log_error(f"Error upserting document: {exc}")
            _record_failure("Could not upsert embedding")
        else:
            log_error(f"Error inserting document: {exc}")
            _record_failure("Could not insert embedding")
        return

    content.status = ContentStatus.COMPLETED
    self._update_content(content)
1714
+
1715
def _load_content(
    self,
    content: Content,
    upsert: bool,
    skip_if_exists: bool,
    include: Optional[List[str]] = None,
    exclude: Optional[List[str]] = None,
) -> None:
    """Synchronously load content from every source that is set on it.

    The sources are not mutually exclusive: path, URL, file data, topics and
    remote content are each checked, in that order, and loaded when present.
    ``include`` and ``exclude`` are only forwarded to the path loader.
    """
    # Each attribute is checked right before its loader runs, so the order
    # and the timing of the checks match a plain sequence of ``if`` blocks.
    dispatch = (
        ("path", lambda: self._load_from_path(content, upsert, skip_if_exists, include, exclude)),
        ("url", lambda: self._load_from_url(content, upsert, skip_if_exists)),
        ("file_data", lambda: self._load_from_content(content, upsert, skip_if_exists)),
        ("topics", lambda: self._load_from_topics(content, upsert, skip_if_exists)),
        ("remote_content", lambda: self._load_from_remote_content(content, upsert, skip_if_exists)),
    )
    for source_attr, run_loader in dispatch:
        if getattr(content, source_attr):
            run_loader()
1738
+
1739
+ async def _load_content_async(
989
1740
  self,
990
1741
  content: Content,
991
1742
  upsert: bool,
@@ -994,19 +1745,19 @@ class Knowledge:
994
1745
  exclude: Optional[List[str]] = None,
995
1746
  ) -> None:
996
1747
  if content.path:
997
- await self._load_from_path(content, upsert, skip_if_exists, include, exclude)
1748
+ await self._load_from_path_async(content, upsert, skip_if_exists, include, exclude)
998
1749
 
999
1750
  if content.url:
1000
- await self._load_from_url(content, upsert, skip_if_exists)
1751
+ await self._load_from_url_async(content, upsert, skip_if_exists)
1001
1752
 
1002
1753
  if content.file_data:
1003
- await self._load_from_content(content, upsert, skip_if_exists)
1754
+ await self._load_from_content_async(content, upsert, skip_if_exists)
1004
1755
 
1005
1756
  if content.topics:
1006
- await self._load_from_topics(content, upsert, skip_if_exists)
1757
+ await self._load_from_topics_async(content, upsert, skip_if_exists)
1007
1758
 
1008
1759
  if content.remote_content:
1009
- await self._load_from_remote_content(content, upsert, skip_if_exists)
1760
+ await self._load_from_remote_content_async(content, upsert, skip_if_exists)
1010
1761
 
1011
1762
  def _build_content_hash(self, content: Content) -> str:
1012
1763
  """
@@ -1078,7 +1829,7 @@ class Knowledge:
1078
1829
  # Already a string, return as-is
1079
1830
  return value
1080
1831
 
1081
- async def _add_to_contents_db(self, content: Content):
1832
+ async def _add_to_contents_db_async(self, content: Content):
1082
1833
  if self.contents_db:
1083
1834
  created_at = content.created_at if content.created_at else int(time.time())
1084
1835
  updated_at = content.updated_at if content.updated_at else int(time.time())
@@ -1121,6 +1872,52 @@ class Knowledge:
1121
1872
  else:
1122
1873
  self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
1123
1874
 
1875
def _add_to_contents_db(self, content: Content):
    """Synchronously upsert a row describing ``content`` into the contents database.

    Does nothing when no contents database is configured.

    Raises:
        ValueError: If the configured contents database is an ``AsyncBaseDb``.
    """
    if not self.contents_db:
        return

    if isinstance(self.contents_db, AsyncBaseDb):
        raise ValueError(
            "_add_to_contents_db() is not supported with an async DB. Please use add_content_async with AsyncDb."
        )

    # Missing timestamps default to the current time.
    created_at = content.created_at or int(time.time())
    updated_at = content.updated_at or int(time.time())

    # An explicit file type wins; otherwise fall back to the file data's type.
    if content.file_type:
        file_type = content.file_type
    elif content.file_data and content.file_data.type:
        file_type = content.file_data.type
    else:
        file_type = None

    # Safely handle string fields with proper type checking
    safe_name = self._ensure_string_field(content.name, "content.name", default="")
    safe_description = self._ensure_string_field(content.description, "content.description", default="")
    safe_linked_to = self._ensure_string_field(self.name, "knowledge.name", default="")
    safe_status_message = self._ensure_string_field(
        content.status_message, "content.status_message", default=""
    )

    # An explicit size wins; otherwise use the length of the raw file content.
    if content.size:
        size = content.size
    elif content.file_data and content.file_data.content:
        size = len(content.file_data.content)
    else:
        size = None

    content_row = KnowledgeRow(
        id=content.id,
        name=safe_name,
        description=safe_description,
        metadata=content.metadata,
        type=file_type,
        size=size,
        linked_to=safe_linked_to,
        access_count=0,
        status=content.status or ContentStatus.PROCESSING,
        status_message=safe_status_message,
        created_at=created_at,
        updated_at=updated_at,
    )
    self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
1920
+
1124
1921
  def _update_content(self, content: Content) -> Optional[Dict[str, Any]]:
1125
1922
  from agno.vectordb import VectorDb
1126
1923
 
@@ -1221,12 +2018,12 @@ class Knowledge:
1221
2018
  log_warning("Contents DB not found for knowledge base")
1222
2019
  return None
1223
2020
 
1224
- async def _process_lightrag_content(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
2021
+ async def _process_lightrag_content_async(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
1225
2022
  from agno.vectordb import VectorDb
1226
2023
 
1227
2024
  self.vector_db = cast(VectorDb, self.vector_db)
1228
2025
 
1229
- await self._add_to_contents_db(content)
2026
+ await self._add_to_contents_db_async(content)
1230
2027
  if content_type == KnowledgeContentOrigin.PATH:
1231
2028
  if content.file_data is None:
1232
2029
  log_warning("No file data provided")
@@ -1283,9 +2080,9 @@ class Knowledge:
1283
2080
 
1284
2081
  reader.chunk = False
1285
2082
  read_documents = reader.read(content.url, name=content.name)
1286
-
1287
- for read_document in read_documents:
1288
- read_document.content_id = content.id
2083
+ if not content.id:
2084
+ content.id = generate_id(content.content_hash or "")
2085
+ self._prepare_documents_for_insert(read_documents, content.id)
1289
2086
 
1290
2087
  if not read_documents:
1291
2088
  log_error("No documents read from URL")
@@ -1378,6 +2175,175 @@ class Knowledge:
1378
2175
  log_warning(f"No documents found for LightRAG upload: {content.name}")
1379
2176
  return
1380
2177
 
2178
def _process_lightrag_content(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
    """Synchronously upload ``content`` to a LightRAG backend.

    The content is first registered in the contents database and then
    uploaded according to ``content_type``:

    - PATH: the bytes of the file at ``content.path`` are sent with
      ``insert_file_bytes``.
    - URL: the URL is read with ``content.reader`` (or ``self.website_reader``)
      with chunking disabled, and the first document's text is sent with
      ``insert_text``.
    - CONTENT: ``content.file_data.content`` is sent with ``insert_file_bytes``.
    - TOPIC: ``content.topics`` is read with ``content.reader`` and the first
      document's text is sent with ``insert_text``.

    On success the value returned by the backend is stored in
    ``content.external_id`` and the content is marked COMPLETED. Every
    handled failure marks the content FAILED with a status message, so the
    row registered at the start is never left in PROCESSING.

    LightRAG only has async methods, so they are driven with
    ``asyncio.run()``; this method must therefore not be called from a thread
    that already has a running event loop.
    """
    from agno.vectordb import VectorDb

    self.vector_db = cast(VectorDb, self.vector_db)

    def _fail(message: str) -> None:
        # Persist a FAILED status together with the reason.
        content.status = ContentStatus.FAILED
        content.status_message = message
        self._update_content(content)

    def _complete(external_id) -> None:
        # Persist the backend identifier and a COMPLETED status.
        content.external_id = external_id
        content.status = ContentStatus.COMPLETED
        self._update_content(content)

    self._add_to_contents_db(content)

    if content_type == KnowledgeContentOrigin.PATH:
        if content.file_data is None:
            log_warning("No file data provided")

        if content.path is None:
            log_error("No path provided for content")
            _fail("No path provided for content")
            return

        path = Path(content.path)

        log_info(f"Uploading file to LightRAG from path: {path}")
        try:
            # Read the file content from path
            with open(path, "rb") as f:
                file_content = f.read()

            # Get file type from extension or content.file_type
            file_type = content.file_type or path.suffix

            if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
                # LightRAG only has async methods, use asyncio.run() here
                result = asyncio.run(
                    self.vector_db.insert_file_bytes(
                        file_content=file_content,
                        filename=path.name,
                        content_type=file_type,
                        send_metadata=True,
                    )
                )
            else:
                log_error("Vector database does not support file insertion")
                _fail("Vector database does not support file insertion")
                return
            _complete(result)
            return

        except Exception as e:
            log_error(f"Error uploading file to LightRAG: {e}")
            _fail(f"Could not upload to LightRAG: {str(e)}")
            return

    elif content_type == KnowledgeContentOrigin.URL:
        log_info(f"Uploading file to LightRAG from URL: {content.url}")
        try:
            reader = content.reader or self.website_reader
            if reader is None:
                log_error("No URL reader available")
                _fail("No URL reader available")
                return

            # NOTE(review): this disables chunking on the reader object itself
            # and never restores it; confirm the reader is not shared with
            # code paths that expect chunking.
            reader.chunk = False
            read_documents = reader.read(content.url, name=content.name)
            if not content.id:
                content.id = generate_id(content.content_hash or "")
            self._prepare_documents_for_insert(read_documents, content.id)

            if not read_documents:
                log_error("No documents read from URL")
                _fail("No documents read from URL")
                return

            if self.vector_db and hasattr(self.vector_db, "insert_text"):
                # LightRAG only has async methods, use asyncio.run() here
                result = asyncio.run(
                    self.vector_db.insert_text(
                        file_source=content.url,
                        text=read_documents[0].content,
                    )
                )
            else:
                log_error("Vector database does not support text insertion")
                _fail("Vector database does not support text insertion")
                return

            _complete(result)
            return

        except Exception as e:
            log_error(f"Error uploading file to LightRAG: {e}")
            _fail(f"Could not upload to LightRAG: {str(e)}")
            return

    elif content_type == KnowledgeContentOrigin.CONTENT:
        filename = (
            content.file_data.filename if content.file_data and content.file_data.filename else "uploaded_file"
        )
        log_info(f"Uploading file to LightRAG: {filename}")

        # Use the content from file_data
        if content.file_data and content.file_data.content:
            if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
                # LightRAG only has async methods, use asyncio.run() here.
                # Errors raised by the upload propagate to the caller.
                result = asyncio.run(
                    self.vector_db.insert_file_bytes(
                        file_content=content.file_data.content,
                        filename=filename,
                        content_type=content.file_data.type,
                        send_metadata=True,
                    )
                )
            else:
                log_error("Vector database does not support file insertion")
                _fail("Vector database does not support file insertion")
                return
            _complete(result)
        else:
            log_warning(f"No file data available for LightRAG upload: {content.name}")
            _fail("No file data available for LightRAG upload")
            return

    elif content_type == KnowledgeContentOrigin.TOPIC:
        log_info(f"Uploading file to LightRAG: {content.name}")

        if content.reader is None:
            log_error("No reader available for topic content")
            _fail("No reader available for topic content")
            return

        if not content.topics:
            log_error("No topics available for content")
            _fail("No topics available for content")
            return

        read_documents = content.reader.read(content.topics)
        if len(read_documents) > 0:
            if self.vector_db and hasattr(self.vector_db, "insert_text"):
                # LightRAG only has async methods, use asyncio.run() here.
                # Errors raised by the upload propagate to the caller.
                result = asyncio.run(
                    self.vector_db.insert_text(
                        file_source=content.topics[0],
                        text=read_documents[0].content,
                    )
                )
            else:
                log_error("Vector database does not support text insertion")
                _fail("Vector database does not support text insertion")
                return
            _complete(result)
            return
        else:
            log_warning(f"No documents found for LightRAG upload: {content.name}")
            _fail("No documents found for LightRAG upload")
            return
2346
+
1381
2347
  def search(
1382
2348
  self,
1383
2349
  query: str,