agno 2.0.0rc1__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +101 -140
- agno/db/mongo/mongo.py +8 -3
- agno/eval/accuracy.py +12 -5
- agno/knowledge/chunking/strategy.py +14 -14
- agno/knowledge/knowledge.py +156 -120
- agno/knowledge/reader/arxiv_reader.py +5 -5
- agno/knowledge/reader/csv_reader.py +6 -77
- agno/knowledge/reader/docx_reader.py +5 -5
- agno/knowledge/reader/firecrawl_reader.py +5 -5
- agno/knowledge/reader/json_reader.py +5 -5
- agno/knowledge/reader/markdown_reader.py +31 -9
- agno/knowledge/reader/pdf_reader.py +10 -123
- agno/knowledge/reader/reader_factory.py +65 -72
- agno/knowledge/reader/s3_reader.py +44 -114
- agno/knowledge/reader/text_reader.py +5 -5
- agno/knowledge/reader/url_reader.py +75 -31
- agno/knowledge/reader/web_search_reader.py +6 -29
- agno/knowledge/reader/website_reader.py +5 -5
- agno/knowledge/reader/wikipedia_reader.py +5 -5
- agno/knowledge/reader/youtube_reader.py +6 -6
- agno/knowledge/reranker/__init__.py +9 -0
- agno/knowledge/utils.py +10 -10
- agno/media.py +269 -268
- agno/models/aws/bedrock.py +3 -7
- agno/models/base.py +50 -54
- agno/models/google/gemini.py +11 -10
- agno/models/message.py +4 -4
- agno/models/ollama/chat.py +1 -1
- agno/models/openai/chat.py +33 -14
- agno/models/response.py +5 -5
- agno/os/app.py +40 -29
- agno/os/mcp.py +39 -59
- agno/os/router.py +547 -16
- agno/os/routers/evals/evals.py +197 -12
- agno/os/routers/knowledge/knowledge.py +428 -14
- agno/os/routers/memory/memory.py +250 -28
- agno/os/routers/metrics/metrics.py +125 -7
- agno/os/routers/session/session.py +393 -25
- agno/os/schema.py +55 -2
- agno/run/agent.py +37 -28
- agno/run/base.py +9 -19
- agno/run/team.py +110 -19
- agno/run/workflow.py +41 -28
- agno/team/team.py +808 -1080
- agno/tools/brightdata.py +3 -3
- agno/tools/cartesia.py +3 -5
- agno/tools/dalle.py +7 -4
- agno/tools/desi_vocal.py +2 -2
- agno/tools/e2b.py +6 -6
- agno/tools/eleven_labs.py +3 -3
- agno/tools/fal.py +4 -4
- agno/tools/function.py +7 -7
- agno/tools/giphy.py +2 -2
- agno/tools/lumalab.py +3 -3
- agno/tools/mcp.py +1 -2
- agno/tools/models/azure_openai.py +2 -2
- agno/tools/models/gemini.py +3 -3
- agno/tools/models/groq.py +3 -5
- agno/tools/models/nebius.py +2 -2
- agno/tools/models_labs.py +5 -5
- agno/tools/openai.py +4 -9
- agno/tools/opencv.py +3 -3
- agno/tools/replicate.py +7 -7
- agno/utils/events.py +5 -5
- agno/utils/gemini.py +1 -1
- agno/utils/log.py +52 -2
- agno/utils/mcp.py +57 -5
- agno/utils/models/aws_claude.py +1 -1
- agno/utils/models/claude.py +0 -8
- agno/utils/models/cohere.py +1 -1
- agno/utils/models/watsonx.py +1 -1
- agno/utils/openai.py +1 -1
- agno/utils/print_response/team.py +177 -73
- agno/utils/streamlit.py +27 -0
- agno/vectordb/lancedb/lance_db.py +82 -25
- agno/workflow/step.py +7 -7
- agno/workflow/types.py +13 -13
- agno/workflow/workflow.py +37 -28
- {agno-2.0.0rc1.dist-info → agno-2.0.1.dist-info}/METADATA +140 -1
- {agno-2.0.0rc1.dist-info → agno-2.0.1.dist-info}/RECORD +83 -84
- agno-2.0.1.dist-info/licenses/LICENSE +201 -0
- agno/knowledge/reader/gcs_reader.py +0 -67
- agno-2.0.0rc1.dist-info/licenses/LICENSE +0 -375
- {agno-2.0.0rc1.dist-info → agno-2.0.1.dist-info}/WHEEL +0 -0
- {agno-2.0.0rc1.dist-info → agno-2.0.1.dist-info}/top_level.txt +0 -0
agno/knowledge/knowledge.py
CHANGED
@@ -5,16 +5,21 @@ import time
 from dataclasses import dataclass
 from enum import Enum
 from functools import cached_property
+from io import BytesIO
+from os.path import basename
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload
 from uuid import uuid4
 
+from httpx import AsyncClient
+
 from agno.db.base import BaseDb
 from agno.db.schemas.knowledge import KnowledgeRow
 from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
 from agno.knowledge.document import Document
 from agno.knowledge.reader import Reader, ReaderFactory
 from agno.knowledge.remote_content.remote_content import GCSContent, RemoteContent, S3Content
+from agno.utils.http import async_fetch_with_retry
 from agno.utils.log import log_debug, log_error, log_info, log_warning
 from agno.vectordb import VectorDb
 
@@ -421,20 +426,31 @@ class Knowledge:
         upsert: bool,
         skip_if_exists: bool,
     ):
+        """Load the content in the contextual URL
+
+        1. Set content hash
+        2. Validate the URL
+        3. Read the content
+        4. Prepare and insert the content in the vector database
+        """
         log_info(f"Adding content from URL {content.url}")
         content.file_type = "url"
 
+        if not content.url:
+            raise ValueError("No url provided")
+
         if self.vector_db.__class__.__name__ == "LightRag":
             await self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
             return
 
+        # 1. Set content hash
         content.content_hash = self._build_content_hash(content)
         if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
             log_info(f"Content {content.content_hash} already exists, skipping")
             return
         self._add_to_contents_db(content)
 
-        # Validate URL
+        # 2. Validate URL
         try:
             from urllib.parse import urlparse
 
@@ -450,61 +466,47 @@ class Knowledge:
             self._update_content(content)
             log_warning(f"Invalid URL: {content.url} - {str(e)}")
 
-        #
-
-
-
-        try:
-            if content.url.endswith("llms-full.txt") or content.url.endswith("llms.txt"):  # type: ignore
-                log_info("Detected llms, using url reader")
-                reader = content.reader or self.url_reader
-                if reader is not None:
-                    # TODO: We will refactor this to eventually pass authorization to all readers
-                    import inspect
+        # 3. Fetch and load content
+        async with AsyncClient() as client:
+            response = await async_fetch_with_retry(content.url, client=client)
+            bytes_content = BytesIO(response.content)
 
-        … (19 removed lines not shown)
-                    read_documents = reader.read(content.url, name=content.name, password=content.auth.password)
-                else:
-                    read_documents = reader.read(content.url, name=content.name)
-            else:
-                log_info(f"No reader found for file extension: {file_extension}")
+        # 4. Select reader
+        # If a reader was provided by the user, use it
+        reader = content.reader
+        name = content.name if content.name else content.url
+        # Else select based on file extension
+        if reader is None:
+            url_path = Path(parsed_url.path)
+            file_extension = url_path.suffix.lower()
+            if file_extension == ".csv":
+                name = basename(parsed_url.path) or "data.csv"
+                reader = self.csv_reader
+            elif file_extension == ".pdf":
+                reader = self.pdf_reader
+            elif file_extension == ".docx":
+                reader = self.docx_reader
+            elif file_extension == ".json":
+                reader = self.json_reader
+            elif file_extension == ".markdown":
+                reader = self.markdown_reader
             else:
-
-                if content.reader:
-                    reader = content.reader
-                else:
-                    reader = self._select_url_reader(content.url)  # type: ignore
-                if reader is not None:
-                    log_info(f"Selected reader: {reader.__class__.__name__}")
-                    # TODO: We will refactor this to eventually pass authorization to all readers
-                    import inspect
+                reader = self.text_reader
 
-
-
-
-
-
+        # 5. Read content
+        try:
+            read_documents = []
+            if reader is not None:
+                # TODO: We will refactor this to eventually pass authorization to all readers
+                import inspect
+
+                read_signature = inspect.signature(reader.read)
+                if reader.__class__.__name__ == "YouTubeReader":
+                    read_documents = reader.read(content.url, name=name)
+                elif "password" in read_signature.parameters and content.auth and content.auth.password:
+                    read_documents = reader.read(bytes_content, name=name, password=content.auth.password)
                 else:
-
-
+                    read_documents = reader.read(bytes_content, name=name)
         except Exception as e:
             log_error(f"Error reading URL: {content.url} - {str(e)}")
             content.status = ContentStatus.FAILED
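The block above collapses the old per-URL readers into one flow: fetch the bytes once, then dispatch on the URL's file extension, falling back to the text reader. A standalone sketch of that dispatch, using httpx directly and a hypothetical `readers` mapping (an illustration, not the agno Knowledge API):

from io import BytesIO
from os.path import basename
from pathlib import Path
from urllib.parse import urlparse

import httpx


async def fetch_and_pick_reader(url: str, readers: dict):
    # Fetch the raw bytes once, as the new URL-loading flow does via async_fetch_with_retry.
    async with httpx.AsyncClient() as client:
        response = await client.get(url, follow_redirects=True)
        response.raise_for_status()
    bytes_content = BytesIO(response.content)

    # Dispatch on the URL path's extension; unknown extensions fall back to a plain-text
    # reader, mirroring the `else: reader = self.text_reader` branch above.
    parsed_url = urlparse(url)
    file_extension = Path(parsed_url.path).suffix.lower()
    name = basename(parsed_url.path) or url
    reader = readers.get(file_extension, readers.get(".txt"))
    return reader, bytes_content, name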
@@ -512,13 +514,17 @@ class Knowledge:
             self._update_content(content)
             return
 
+        # 6. Chunk documents if needed
+        if reader and not reader.chunk:
+            read_documents = await reader.chunk_documents_async(read_documents)
+
+        # 7. Prepare and insert the content in the vector database
         file_size = 0
         if read_documents:
             for read_document in read_documents:
                 if read_document.size:
                     file_size += read_document.size
                 read_document.content_id = content.id
-
         await self._handle_vector_db_insert(content, read_documents, upsert)
 
     async def _load_from_content(
@@ -699,21 +705,23 @@ class Knowledge:
             log_warning(f"Unsupported remote content type: {type(remote_content)}")
 
     async def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
-        … (10 removed lines not shown)
+        """Load the contextual S3 content.
+
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        8. Remove temporary file if needed
+        """
+        from agno.cloud.aws.s3.object import S3Object
 
         remote_content: S3Content = cast(S3Content, content.remote_content)
 
+        # 1. Identify objects to read
         objects_to_read: List[S3Object] = []
-
         if remote_content.bucket is not None:
             if remote_content.key is not None:
                 _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
@@ -725,10 +733,11 @@ class Knowledge:
             else:
                 objects_to_read.extend(remote_content.bucket.get_objects())
 
-        for
+        for s3_object in objects_to_read:
+            # 2. Setup Content object
             id = str(uuid4())
             content_name = content.name or ""
-            content_name += "_" + (
+            content_name += "_" + (s3_object.name or "")
             content_entry = Content(
                 id=id,
                 name=content_name,
@@ -738,63 +747,123 @@ class Knowledge:
                 file_type="s3",
             )
 
+            # 3. Hash content and add it to the contents database
             content_hash = self._build_content_hash(content_entry)
             if self.vector_db and self.vector_db.content_hash_exists(content_hash) and skip_if_exists:
                 log_info(f"Content {content_hash} already exists, skipping")
                 continue
-
             self._add_to_contents_db(content_entry)
 
-
+            # 4. Select reader
+            reader = content.reader
+            if reader is None:
+                if s3_object.uri.endswith(".pdf"):
+                    reader = self.pdf_reader
+                elif s3_object.uri.endswith(".csv"):
+                    reader = self.csv_reader
+                elif s3_object.uri.endswith(".docx"):
+                    reader = self.docx_reader
+                elif s3_object.uri.endswith(".json"):
+                    reader = self.json_reader
+                elif s3_object.uri.endswith(".markdown"):
+                    reader = self.markdown_reader
+                else:
+                    reader = self.text_reader
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            temporary_file = None
+            obj_name = content_name or s3_object.name.split("/")[-1]
+            readable_content: Optional[Union[BytesIO, Path]] = None
+            if s3_object.uri.endswith(".pdf"):
+                readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
+            else:
+                temporary_file = Path("storage").joinpath(obj_name)
+                readable_content = temporary_file
+                s3_object.download(readable_content)  # type: ignore
+
+            # 6. Read the content
+            read_documents = reader.read(readable_content, name=obj_name)
 
+            # 7. Prepare and insert the content in the vector database
             for read_document in read_documents:
                 read_document.content_id = content.id
-
             await self._handle_vector_db_insert(content_entry, read_documents, upsert)
 
-
-
-
-        else:
-            reader = content.reader
-
-            if reader is None:
-                log_warning("No reader provided for content")
-                return
+            # 8. Remove temporary file if needed
+            if temporary_file:
+                temporary_file.unlink()
 
+    async def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Load the contextual GCS content.
+
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        """
         remote_content: GCSContent = cast(GCSContent, content.remote_content)
-        objects_to_read = []
 
+        # 1. Identify objects to read
+        objects_to_read = []
         if remote_content.blob_name is not None:
-            objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))
+            objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))  # type: ignore
         elif remote_content.prefix is not None:
-            objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))
+            objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
         else:
-            objects_to_read.extend(remote_content.bucket.list_blobs())
+            objects_to_read.extend(remote_content.bucket.list_blobs())  # type: ignore
 
-        for
+        for gcs_object in objects_to_read:
+            # 2. Setup Content object
             id = str(uuid4())
+            name = (content.name or "content") + "_" + gcs_object.name
             content_entry = Content(
                 id=id,
-                name=
+                name=name,
                 description=content.description,
                 status=ContentStatus.PROCESSING,
                 metadata=content.metadata,
                 file_type="gcs",
             )
 
+            # 3. Hash content and add it to the contents database
             content_hash = self._build_content_hash(content_entry)
             if self.vector_db and self.vector_db.content_hash_exists(content_hash) and skip_if_exists:
                 log_info(f"Content {content_hash} already exists, skipping")
                 continue
 
+            # 4. Add it to the contents database
             self._add_to_contents_db(content_entry)
 
-
+            # 5. Select reader
+            reader = content.reader
+            if reader is None:
+                if gcs_object.name.endswith(".pdf"):
+                    reader = self.pdf_reader
+                elif gcs_object.name.endswith(".csv"):
+                    reader = self.csv_reader
+                elif gcs_object.name.endswith(".docx"):
+                    reader = self.docx_reader
+                elif gcs_object.name.endswith(".json"):
+                    reader = self.json_reader
+                elif gcs_object.name.endswith(".markdown"):
+                    reader = self.markdown_reader
+                else:
+                    reader = self.text_reader
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            readable_content = BytesIO(gcs_object.download_as_bytes())
+
+            # 6. Read the content
+            read_documents = reader.read(readable_content, name=name)
 
+            # 7. Prepare and insert the content in the vector database
             for read_document in read_documents:
                 read_document.content_id = content.id
-
             await self._handle_vector_db_insert(content_entry, read_documents, upsert)
 
     async def _handle_vector_db_insert(self, content, read_documents, upsert):
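In the S3 branch above, PDFs are read straight into memory while every other object goes through a temporary file under storage/ that is unlinked afterwards. A rough sketch of that fetch step using boto3 directly (agno wraps this in its own S3Object helper, so treat this as an illustration; it also assumes a local storage/ directory exists, as the diff does):

from io import BytesIO
from pathlib import Path
from typing import Optional, Tuple, Union

import boto3  # assumption: boto3 is installed and credentials are configured


def fetch_s3_object(bucket_name: str, key: str) -> Tuple[Union[BytesIO, Path], Optional[Path]]:
    obj = boto3.resource("s3").Object(bucket_name, key)
    if key.endswith(".pdf"):
        # PDFs: stream the body straight into memory.
        return BytesIO(obj.get()["Body"].read()), None
    # Everything else: download to a temporary file the caller removes after reading.
    temporary_file = Path("storage") / key.split("/")[-1]
    obj.download_file(str(temporary_file))
    return temporary_file, temporary_file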
@@ -1006,7 +1075,7 @@ class Knowledge:
         elif content_type == KnowledgeContentOrigin.URL:
             log_info(f"Uploading file to LightRAG from URL: {content.url}")
             try:
-                reader = self.
+                reader = content.reader or self.website_reader
                 if reader is None:
                     log_error("No URL reader available")
                     content.status = ContentStatus.FAILED
@@ -1354,14 +1423,6 @@ class Knowledge:
         log_info(f"Selecting reader for extension: {extension}")
         return ReaderFactory.get_reader_for_extension(extension)
 
-    def _select_url_reader(self, url: str) -> Reader:
-        """Select the appropriate reader for a URL."""
-        return ReaderFactory.get_reader_for_url(url)
-
-    def _select_url_file_reader(self, extension: str) -> Reader:
-        """Select the appropriate reader for a URL file extension."""
-        return ReaderFactory.get_reader_for_url_file(extension)
-
     def get_filters(self) -> List[str]:
         return [
             "filter_tag_1",
@@ -1484,32 +1545,7 @@ class Knowledge:
         """Firecrawl reader - lazy loaded via factory."""
         return self._get_reader("firecrawl")
 
-    @property
-    def url_reader(self) -> Optional[Reader]:
-        """URL reader - lazy loaded via factory."""
-        return self._get_reader("url")
-
-    @property
-    def pdf_url_reader(self) -> Optional[Reader]:
-        """PDF URL reader - lazy loaded via factory."""
-        return self._get_reader("pdf_url")
-
     @property
     def youtube_reader(self) -> Optional[Reader]:
         """YouTube reader - lazy loaded via factory."""
         return self._get_reader("youtube")
-
-    @property
-    def csv_url_reader(self) -> Optional[Reader]:
-        """CSV URL reader - lazy loaded via factory."""
-        return self._get_reader("csv_url")
-
-    @property
-    def s3_reader(self) -> Optional[Reader]:
-        """S3 reader - lazy loaded via factory."""
-        return self._get_reader("s3")
-
-    @property
-    def gcs_reader(self) -> Optional[Reader]:
-        """GCS reader - lazy loaded via factory."""
-        return self._get_reader("gcs")
agno/knowledge/reader/arxiv_reader.py
CHANGED

@@ -20,11 +20,11 @@ class ArxivReader(Reader):
     def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
         """Get the list of supported chunking strategies for Arxiv readers."""
         return [
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
+            ChunkingStrategyType.FIXED_SIZE_CHUNKER,
+            ChunkingStrategyType.AGENTIC_CHUNKER,
+            ChunkingStrategyType.DOCUMENT_CHUNKER,
+            ChunkingStrategyType.RECURSIVE_CHUNKER,
+            ChunkingStrategyType.SEMANTIC_CHUNKER,
         ]
 
     @classmethod
agno/knowledge/reader/csv_reader.py
CHANGED

@@ -1,14 +1,10 @@
 import asyncio
 import csv
 import io
-import os
 from pathlib import Path
 from typing import IO, Any, List, Optional, Union
-from urllib.parse import urlparse
 from uuid import uuid4
 
-from agno.utils.http import async_fetch_with_retry, fetch_with_retry
-
 try:
     import aiofiles
 except ImportError:
@@ -32,16 +28,16 @@ class CSVReader(Reader):
     def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
         """Get the list of supported chunking strategies for CSV readers."""
         return [
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
+            ChunkingStrategyType.ROW_CHUNKER,
+            ChunkingStrategyType.FIXED_SIZE_CHUNKER,
+            ChunkingStrategyType.AGENTIC_CHUNKER,
+            ChunkingStrategyType.DOCUMENT_CHUNKER,
+            ChunkingStrategyType.RECURSIVE_CHUNKER,
         ]
 
     @classmethod
     def get_supported_content_types(self) -> List[ContentType]:
-        return [ContentType.
+        return [ContentType.CSV, ContentType.XLSX, ContentType.XLS]
 
     def read(
         self, file: Union[Path, IO[Any]], delimiter: str = ",", quotechar: str = '"', name: Optional[str] = None
@@ -168,70 +164,3 @@ class CSVReader(Reader):
                 f"Error reading async: {getattr(file, 'name', str(file)) if isinstance(file, IO) else file}: {e}"
             )
             return []
-
-
-class CSVUrlReader(Reader):
-    """Reader for CSV files"""
-
-    def __init__(
-        self, chunking_strategy: Optional[ChunkingStrategy] = RowChunking(), proxy: Optional[str] = None, **kwargs
-    ):
-        super().__init__(chunking_strategy=chunking_strategy, **kwargs)
-        self.proxy = proxy
-
-    def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
-        """Get the list of supported chunking strategies for CSV URL readers."""
-        return [
-            ChunkingStrategyType.ROW_CHUNKING,
-            ChunkingStrategyType.SEMANTIC_CHUNKING,
-            ChunkingStrategyType.FIXED_SIZE_CHUNKING,
-            ChunkingStrategyType.AGENTIC_CHUNKING,
-            ChunkingStrategyType.DOCUMENT_CHUNKING,
-        ]
-
-    def get_supported_content_types(self) -> List[ContentType]:
-        return [ContentType.URL]
-
-    def read(self, url: str, name: Optional[str] = None) -> List[Document]:
-        if not url:
-            raise ValueError("No URL provided")
-
-        logger.info(f"Reading: {url}")
-        # Retry the request up to 3 times with exponential backoff
-        response = fetch_with_retry(url, proxy=self.proxy)
-
-        parsed_url = urlparse(url)
-        filename = os.path.basename(parsed_url.path) or "data.csv"
-
-        file_obj = io.BytesIO(response.content)
-        file_obj.name = filename
-        documents = CSVReader().read(file=file_obj, name=name)
-
-        file_obj.close()
-
-        return documents
-
-    async def async_read(self, url: str, name: Optional[str] = None) -> List[Document]:
-        if not url:
-            raise ValueError("No URL provided")
-
-        import httpx
-
-        logger.info(f"Reading async: {url}")
-
-        client_args = {"proxy": self.proxy} if self.proxy else {}
-        async with httpx.AsyncClient(**client_args) as client:  # type: ignore
-            response = await async_fetch_with_retry(url, client=client)
-
-        parsed_url = urlparse(url)
-        filename = os.path.basename(parsed_url.path) or "data.csv"
-
-        file_obj = io.BytesIO(response.content)
-        file_obj.name = filename
-
-        # Use the async version of CSVReader
-        documents = await CSVReader().async_read(file=file_obj, name=name)
-
-        file_obj.close()
-
-        return documents
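With CSVUrlReader removed, remote CSVs are covered by the generic URL flow in knowledge.py above: fetch the bytes, wrap them in a BytesIO with a filename, and hand them to the plain CSVReader. A minimal sketch of that path, using synchronous httpx for brevity (the real flow goes through async_fetch_with_retry):

import io
from os.path import basename
from urllib.parse import urlparse

import httpx

from agno.knowledge.reader.csv_reader import CSVReader


def read_csv_url(url: str):
    # Fetch the CSV and present it to CSVReader as an in-memory file object,
    # the same way the removed CSVUrlReader did internally.
    response = httpx.get(url, follow_redirects=True)
    response.raise_for_status()
    file_obj = io.BytesIO(response.content)
    file_obj.name = basename(urlparse(url).path) or "data.csv"
    return CSVReader().read(file=file_obj, name=file_obj.name)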
agno/knowledge/reader/docx_reader.py
CHANGED

@@ -26,11 +26,11 @@ class DocxReader(Reader):
     def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
         """Get the list of supported chunking strategies for DOCX readers."""
         return [
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
+            ChunkingStrategyType.DOCUMENT_CHUNKER,
+            ChunkingStrategyType.FIXED_SIZE_CHUNKER,
+            ChunkingStrategyType.SEMANTIC_CHUNKER,
+            ChunkingStrategyType.AGENTIC_CHUNKER,
+            ChunkingStrategyType.RECURSIVE_CHUNKER,
         ]
 
     @classmethod
agno/knowledge/reader/firecrawl_reader.py
CHANGED

@@ -46,11 +46,11 @@ class FirecrawlReader(Reader):
     def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
         """Get the list of supported chunking strategies for Firecrawl readers."""
        return [
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
+            ChunkingStrategyType.SEMANTIC_CHUNKER,
+            ChunkingStrategyType.FIXED_SIZE_CHUNKER,
+            ChunkingStrategyType.AGENTIC_CHUNKER,
+            ChunkingStrategyType.DOCUMENT_CHUNKER,
+            ChunkingStrategyType.RECURSIVE_CHUNKER,
         ]
 
     @classmethod
agno/knowledge/reader/json_reader.py
CHANGED

@@ -25,11 +25,11 @@ class JSONReader(Reader):
     def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
         """Get the list of supported chunking strategies for JSON readers."""
         return [
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
+            ChunkingStrategyType.FIXED_SIZE_CHUNKER,
+            ChunkingStrategyType.AGENTIC_CHUNKER,
+            ChunkingStrategyType.DOCUMENT_CHUNKER,
+            ChunkingStrategyType.RECURSIVE_CHUNKER,
+            ChunkingStrategyType.SEMANTIC_CHUNKER,
         ]
 
     @classmethod
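Across the reader hunks above, the supported-strategy lists now use *_CHUNKER member names, whereas the removed CSVUrlReader still shows the old *_CHUNKING spellings. A quick way to see the 2.0.1 values from user code, assuming only what this diff shows (a no-argument CSVReader constructor and the get_supported_chunking_strategies method):

from agno.knowledge.reader.csv_reader import CSVReader

# Prints the 2.0.1 strategy members, e.g. ROW_CHUNKER and FIXED_SIZE_CHUNKER,
# where 2.0.0rc1 used the *_CHUNKING spellings seen in the removed code above.
print(CSVReader().get_supported_chunking_strategies())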