agno-2.0.0rc1-py3-none-any.whl → agno-2.0.0rc2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- agno/agent/agent.py +32 -14
- agno/db/mongo/mongo.py +8 -3
- agno/eval/accuracy.py +12 -5
- agno/knowledge/chunking/strategy.py +14 -14
- agno/knowledge/knowledge.py +156 -120
- agno/knowledge/reader/arxiv_reader.py +5 -5
- agno/knowledge/reader/csv_reader.py +6 -77
- agno/knowledge/reader/docx_reader.py +5 -5
- agno/knowledge/reader/firecrawl_reader.py +5 -5
- agno/knowledge/reader/json_reader.py +5 -5
- agno/knowledge/reader/markdown_reader.py +31 -9
- agno/knowledge/reader/pdf_reader.py +10 -123
- agno/knowledge/reader/reader_factory.py +65 -72
- agno/knowledge/reader/s3_reader.py +44 -114
- agno/knowledge/reader/text_reader.py +5 -5
- agno/knowledge/reader/url_reader.py +75 -31
- agno/knowledge/reader/web_search_reader.py +6 -29
- agno/knowledge/reader/website_reader.py +5 -5
- agno/knowledge/reader/wikipedia_reader.py +5 -5
- agno/knowledge/reader/youtube_reader.py +6 -6
- agno/knowledge/utils.py +10 -10
- agno/models/aws/bedrock.py +3 -7
- agno/models/base.py +37 -6
- agno/os/app.py +32 -24
- agno/os/mcp.py +39 -59
- agno/os/router.py +547 -16
- agno/os/routers/evals/evals.py +197 -12
- agno/os/routers/knowledge/knowledge.py +428 -14
- agno/os/routers/memory/memory.py +250 -28
- agno/os/routers/metrics/metrics.py +125 -7
- agno/os/routers/session/session.py +393 -25
- agno/os/schema.py +55 -2
- agno/run/agent.py +9 -0
- agno/run/team.py +93 -2
- agno/run/workflow.py +25 -12
- agno/team/team.py +861 -1051
- agno/tools/mcp.py +1 -2
- agno/utils/log.py +52 -2
- agno/utils/mcp.py +55 -3
- agno/utils/models/claude.py +0 -8
- agno/utils/print_response/team.py +177 -73
- agno/utils/streamlit.py +27 -0
- agno/workflow/workflow.py +9 -0
- {agno-2.0.0rc1.dist-info → agno-2.0.0rc2.dist-info}/METADATA +1 -1
- {agno-2.0.0rc1.dist-info → agno-2.0.0rc2.dist-info}/RECORD +48 -49
- agno/knowledge/reader/gcs_reader.py +0 -67
- {agno-2.0.0rc1.dist-info → agno-2.0.0rc2.dist-info}/WHEEL +0 -0
- {agno-2.0.0rc1.dist-info → agno-2.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {agno-2.0.0rc1.dist-info → agno-2.0.0rc2.dist-info}/top_level.txt +0 -0
agno/agent/agent.py
CHANGED
@@ -129,10 +129,10 @@ class Agent:
     session_id: Optional[str] = None
     # Default session state (stored in the database to persist across runs)
     session_state: Optional[Dict[str, Any]] = None
-    #
-    enable_agentic_state: bool = False
-    # If True, add the session state to the user prompt
+    # Set to True to add the session_state to the context
     add_session_state_to_context: bool = False
+    # Set to True to give the agent tools to update the session_state dynamically
+    enable_agentic_state: bool = False
     # If True, cache the current Agent session in memory for faster access
     cache_session: bool = False

@@ -321,8 +321,6 @@ class Agent:
     # --- If this Agent is part of a workflow ---
     # Optional workflow ID. Indicates this agent is part of a workflow.
     workflow_id: Optional[str] = None
-    # Set when this agent is part of a workflow.
-    workflow_session_id: Optional[str] = None

     # Metadata stored with this agent
     metadata: Optional[Dict[str, Any]] = None
@@ -345,7 +343,6 @@ class Agent:
         id: Optional[str] = None,
         introduction: Optional[str] = None,
         user_id: Optional[str] = None,
-        app_id: Optional[str] = None,
         session_id: Optional[str] = None,
         session_state: Optional[Dict[str, Any]] = None,
         add_session_state_to_context: bool = False,
@@ -429,7 +426,6 @@ class Agent:
         self.id = id
         self.introduction = introduction
         self.user_id = user_id
-        self.app_id = app_id

         self.session_id = session_id
         self.session_state = session_state
@@ -593,6 +589,15 @@ class Agent:
         if isinstance(input, Message):
            input = input.content  # type: ignore

+        # If input is a string, convert it to a dict
+        if isinstance(input, str):
+            import json
+
+            try:
+                input = json.loads(input)
+            except Exception as e:
+                raise ValueError(f"Failed to parse input. Is it a valid JSON string?: {e}")
+
         # Case 1: Message is already a BaseModel instance
         if isinstance(input, BaseModel):
             if isinstance(input, self.input_schema):
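Taken together, this means a raw string passed to an Agent that has an input_schema must now be valid JSON. A minimal sketch of the new parse-then-validate flow, using a hypothetical ResearchTopic schema (the class name is illustrative, not from the package):

import json

from pydantic import BaseModel


class ResearchTopic(BaseModel):  # hypothetical input schema for illustration
    topic: str
    depth: int


def parse_input(raw, schema):
    # rc2 behavior: a plain string must parse as JSON before schema validation
    if isinstance(raw, str):
        try:
            raw = json.loads(raw)
        except Exception as e:
            raise ValueError(f"Failed to parse input. Is it a valid JSON string?: {e}")
    if isinstance(raw, schema):
        return raw
    return schema(**raw)


print(parse_input('{"topic": "ai agents", "depth": 2}', ResearchTopic))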
@@ -3231,6 +3236,12 @@ class Agent:
         if isinstance(model_response_event, tuple(get_args(RunOutputEvent))) or isinstance(
             model_response_event, tuple(get_args(TeamRunOutputEvent))
         ):
+            if model_response_event.event == RunEvent.custom_event:  # type: ignore
+                model_response_event.agent_id = self.id  # type: ignore
+                model_response_event.agent_name = self.name  # type: ignore
+                model_response_event.session_id = session.session_id  # type: ignore
+                model_response_event.run_id = run_response.run_id  # type: ignore
+
             # We just bubble the event up
             yield self._handle_event(model_response_event, run_response)  # type: ignore
         else:
@@ -4365,7 +4376,7 @@ class Agent:

         return agent_session

-
+        log_debug(f"AgentSession {session_id_to_load} not found in db")
         return None

     def save_session(self, session: AgentSession) -> None:
@@ -6841,6 +6852,7 @@ class Agent:

         if self.output_schema is not None:
             markdown = False
+            markdown = False

         if stream is None:
             stream = self.stream or False
@@ -7172,10 +7184,12 @@ class Agent:
         image_artifacts = []
         for img in images:
             try:
+                artifact_id = img.id if hasattr(img, "id") and img.id else str(uuid4())
+
                 if img.url:
-                    image_artifacts.append(ImageArtifact(id=
+                    image_artifacts.append(ImageArtifact(id=artifact_id, url=img.url))
                 elif img.content:
-                    image_artifacts.append(ImageArtifact(id=
+                    image_artifacts.append(ImageArtifact(id=artifact_id, content=img.content))
             except Exception as e:
                 log_warning(f"Error creating ImageArtifact: {e}")
                 continue
@@ -7185,10 +7199,12 @@ class Agent:
         video_artifacts = []
         for vid in videos:
             try:
+                artifact_id = vid.id if hasattr(vid, "id") and vid.id else str(uuid4())
+
                 if vid.url:
-                    video_artifacts.append(VideoArtifact(id=
+                    video_artifacts.append(VideoArtifact(id=artifact_id, url=vid.url))
                 elif vid.content:
-                    video_artifacts.append(VideoArtifact(id=
+                    video_artifacts.append(VideoArtifact(id=artifact_id, content=vid.content))
             except Exception as e:
                 log_warning(f"Error creating VideoArtifact: {e}")
                 continue
@@ -7198,10 +7214,12 @@ class Agent:
         audio_artifacts = []
         for aud in audios:
             try:
+                artifact_id = aud.id if hasattr(aud, "id") and aud.id else str(uuid4())
+
                 if aud.url:
-                    audio_artifacts.append(AudioArtifact(id=
+                    audio_artifacts.append(AudioArtifact(id=artifact_id, url=aud.url))
                 elif aud.content:
-                    audio_artifacts.append(AudioArtifact(id=
+                    audio_artifacts.append(AudioArtifact(id=artifact_id, content=aud.content))
             except Exception as e:
                 log_warning(f"Error creating AudioArtifact: {e}")
                 continue
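All three artifact hunks apply the same fix: reuse the media object's own id when it has one, and only mint a fresh uuid4 as a fallback, so artifacts keep stable ids across runs. A small self-contained sketch of the pattern, with a stand-in Image type (hypothetical, not the package's class):

from dataclasses import dataclass
from typing import Optional
from uuid import uuid4


@dataclass
class Image:  # stand-in for the package's media input type
    id: Optional[str] = None
    url: Optional[str] = None


def artifact_id_for(media) -> str:
    # Keep an existing id so artifacts stay stable; otherwise mint a uuid4.
    return media.id if hasattr(media, "id") and media.id else str(uuid4())


assert artifact_id_for(Image(id="img-1")) == "img-1"
assert len(artifact_id_for(Image(url="https://example.com/a.png"))) == 36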
agno/db/mongo/mongo.py
CHANGED
@@ -672,7 +672,9 @@ class MongoDb(BaseDb):
             if result is None or not deserialize:
                 return result

-
+            # Remove MongoDB's _id field before creating UserMemory object
+            result_filtered = {k: v for k, v in result.items() if k != "_id"}
+            return UserMemory.from_dict(result_filtered)

         except Exception as e:
             log_error(f"Exception reading from collection: {e}")
@@ -750,7 +752,8 @@ class MongoDb(BaseDb):
             if not deserialize:
                 return records, total_count

-
+            # Remove MongoDB's _id field before creating UserMemory objects
+            return [UserMemory.from_dict({k: v for k, v in record.items() if k != "_id"}) for record in records]

         except Exception as e:
             log_error(f"Exception reading from collection: {e}")
@@ -861,7 +864,9 @@ class MongoDb(BaseDb):
             if not deserialize:
                 return update_doc

-
+            # Remove MongoDB's _id field before creating UserMemory object
+            update_doc_filtered = {k: v for k, v in update_doc.items() if k != "_id"}
+            return UserMemory.from_dict(update_doc_filtered)

         except Exception as e:
             log_error(f"Exception upserting user memory: {e}")
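The recurring pattern in these three hunks: MongoDB stores an ObjectId under the reserved "_id" key on every document, and UserMemory.from_dict presumably does not expect that extra field, so it is stripped before deserialization. A self-contained sketch of the filter:

from typing import Any, Dict


def strip_mongo_id(doc: Dict[str, Any]) -> Dict[str, Any]:
    # MongoDB adds an ObjectId under "_id" to every stored document;
    # drop it so deserializers that only accept known fields don't break.
    return {k: v for k, v in doc.items() if k != "_id"}


record = {"_id": "68a1f0c2...", "memory": "prefers dark mode", "user_id": "u1"}
print(strip_mongo_id(record))  # {'memory': 'prefers dark mode', 'user_id': 'u1'}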
agno/eval/accuracy.py
CHANGED
@@ -97,11 +97,18 @@ class AccuracyResult:
             title_justify="center",
         )
         summary_table.add_row("Number of Runs", f"{len(self.results)}")
-
-
-
-
-
+
+        if self.avg_score is not None:
+            summary_table.add_row("Average Score", f"{self.avg_score:.2f}")
+        if self.mean_score is not None:
+            summary_table.add_row("Mean Score", f"{self.mean_score:.2f}")
+        if self.min_score is not None:
+            summary_table.add_row("Minimum Score", f"{self.min_score:.2f}")
+        if self.max_score is not None:
+            summary_table.add_row("Maximum Score", f"{self.max_score:.2f}")
+        if self.std_dev_score is not None:
+            summary_table.add_row("Standard Deviation", f"{self.std_dev_score:.2f}")
+
        console.print(summary_table)

     def print_results(self, console: Optional["Console"] = None):
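The summary table now only renders score rows whose statistic was actually computed, instead of printing unconditional rows. A minimal standalone sketch of the guarded-row pattern using rich (the metric names and values below are illustrative):

from rich.console import Console
from rich.table import Table

# Illustrative values; None marks a statistic that was never computed.
scores = {"Average Score": 0.87, "Mean Score": None, "Minimum Score": 0.60}

table = Table(title="Accuracy Summary")
table.add_column("Metric")
table.add_column("Value")
for label, value in scores.items():
    if value is not None:  # guard each row, as the rc2 code now does
        table.add_row(label, f"{value:.2f}")

Console().print(table)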
agno/knowledge/chunking/strategy.py
CHANGED
@@ -35,13 +35,13 @@ class ChunkingStrategy(ABC):
 class ChunkingStrategyType(str, Enum):
     """Enumeration of available chunking strategies."""

-
-
-
-
-
-
-
+    AGENTIC_CHUNKER = "AgenticChunker"
+    DOCUMENT_CHUNKER = "DocumentChunker"
+    RECURSIVE_CHUNKER = "RecursiveChunker"
+    SEMANTIC_CHUNKER = "SemanticChunker"
+    FIXED_SIZE_CHUNKER = "FixedSizeChunker"
+    ROW_CHUNKER = "RowChunker"
+    MARKDOWN_CHUNKER = "MarkdownChunker"

     @classmethod
     def from_string(cls, strategy_name: str) -> "ChunkingStrategyType":
@@ -63,13 +63,13 @@ class ChunkingStrategyFactory:
     def create_strategy(cls, strategy_type: ChunkingStrategyType, **kwargs) -> ChunkingStrategy:
         """Create an instance of the chunking strategy with the given parameters."""
         strategy_map = {
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
+            ChunkingStrategyType.AGENTIC_CHUNKER: cls._create_agentic_chunking,
+            ChunkingStrategyType.DOCUMENT_CHUNKER: cls._create_document_chunking,
+            ChunkingStrategyType.RECURSIVE_CHUNKER: cls._create_recursive_chunking,
+            ChunkingStrategyType.SEMANTIC_CHUNKER: cls._create_semantic_chunking,
+            ChunkingStrategyType.FIXED_SIZE_CHUNKER: cls._create_fixed_chunking,
+            ChunkingStrategyType.ROW_CHUNKER: cls._create_row_chunking,
+            ChunkingStrategyType.MARKDOWN_CHUNKER: cls._create_markdown_chunking,
         }
         return strategy_map[strategy_type](**kwargs)
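Since ChunkingStrategyType subclasses str, each member can be constructed from its string value, and the factory dispatches through a dict keyed by the enum. A simplified sketch of that dispatch, with stand-in constructors in place of agno's private _create_* helpers:

from enum import Enum
from typing import Callable, Dict


class StrategyType(str, Enum):  # simplified stand-in for ChunkingStrategyType
    FIXED_SIZE_CHUNKER = "FixedSizeChunker"
    ROW_CHUNKER = "RowChunker"


def make_fixed(**kwargs):
    return ("FixedSizeChunking", kwargs)


def make_row(**kwargs):
    return ("RowChunking", kwargs)


# Dispatch table keyed by enum member, mirroring ChunkingStrategyFactory.
strategy_map: Dict[StrategyType, Callable] = {
    StrategyType.FIXED_SIZE_CHUNKER: make_fixed,
    StrategyType.ROW_CHUNKER: make_row,
}

# str-backed enums can be looked up by their string value.
print(strategy_map[StrategyType("FixedSizeChunker")](chunk_size=512))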
agno/knowledge/knowledge.py
CHANGED
@@ -5,16 +5,21 @@ import time
 from dataclasses import dataclass
 from enum import Enum
 from functools import cached_property
+from io import BytesIO
+from os.path import basename
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload
 from uuid import uuid4

+from httpx import AsyncClient
+
 from agno.db.base import BaseDb
 from agno.db.schemas.knowledge import KnowledgeRow
 from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
 from agno.knowledge.document import Document
 from agno.knowledge.reader import Reader, ReaderFactory
 from agno.knowledge.remote_content.remote_content import GCSContent, RemoteContent, S3Content
+from agno.utils.http import async_fetch_with_retry
 from agno.utils.log import log_debug, log_error, log_info, log_warning
 from agno.vectordb import VectorDb
@@ -421,20 +426,31 @@ class Knowledge:
         upsert: bool,
         skip_if_exists: bool,
     ):
+        """Load the content in the contextual URL
+
+        1. Set content hash
+        2. Validate the URL
+        3. Read the content
+        4. Prepare and insert the content in the vector database
+        """
         log_info(f"Adding content from URL {content.url}")
         content.file_type = "url"

+        if not content.url:
+            raise ValueError("No url provided")
+
         if self.vector_db.__class__.__name__ == "LightRag":
             await self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
             return

+        # 1. Set content hash
         content.content_hash = self._build_content_hash(content)
         if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
             log_info(f"Content {content.content_hash} already exists, skipping")
             return
         self._add_to_contents_db(content)

-        # Validate URL
+        # 2. Validate URL
         try:
             from urllib.parse import urlparse

@@ -450,61 +466,47 @@ class Knowledge:
             self._update_content(content)
             log_warning(f"Invalid URL: {content.url} - {str(e)}")

-        #
-
-
-
-        try:
-            if content.url.endswith("llms-full.txt") or content.url.endswith("llms.txt"):  # type: ignore
-                log_info("Detected llms, using url reader")
-                reader = content.reader or self.url_reader
-                if reader is not None:
-                    # TODO: We will refactor this to eventually pass authorization to all readers
-                    import inspect
+        # 3. Fetch and load content
+        async with AsyncClient() as client:
+            response = await async_fetch_with_retry(content.url, client=client)
+            bytes_content = BytesIO(response.content)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                    read_documents = reader.read(content.url, name=content.name, password=content.auth.password)
-                else:
-                    read_documents = reader.read(content.url, name=content.name)
-            else:
-                log_info(f"No reader found for file extension: {file_extension}")
+        # 4. Select reader
+        # If a reader was provided by the user, use it
+        reader = content.reader
+        name = content.name
+        # Else select based on file extension
+        if reader is None:
+            url_path = Path(parsed_url.path)
+            file_extension = url_path.suffix.lower()
+            if file_extension == ".csv":
+                name = basename(parsed_url.path) or "data.csv"
+                reader = self.csv_reader
+            elif file_extension == ".pdf":
+                reader = self.pdf_reader
+            elif file_extension == ".docx":
+                reader = self.docx_reader
+            elif file_extension == ".json":
+                reader = self.json_reader
+            elif file_extension == ".markdown":
+                reader = self.markdown_reader
             else:
-
-            if content.reader:
-                reader = content.reader
-            else:
-                reader = self._select_url_reader(content.url)  # type: ignore
-            if reader is not None:
-                log_info(f"Selected reader: {reader.__class__.__name__}")
-                # TODO: We will refactor this to eventually pass authorization to all readers
-                import inspect
+                reader = self.text_reader

-
-
-
-
-
+        # 5. Read content
+        try:
+            read_documents = []
+            if reader is not None:
+                # TODO: We will refactor this to eventually pass authorization to all readers
+                import inspect
+
+                read_signature = inspect.signature(reader.read)
+                if reader.__class__.__name__ == "YouTubeReader":
+                    read_documents = reader.read(content.url, name=name)
+                elif "password" in read_signature.parameters and content.auth and content.auth.password:
+                    read_documents = reader.read(bytes_content, name=name, password=content.auth.password)
             else:
-
-
+                read_documents = reader.read(bytes_content, name=name)
         except Exception as e:
             log_error(f"Error reading URL: {content.url} - {str(e)}")
             content.status = ContentStatus.FAILED
@@ -512,13 +514,17 @@ class Knowledge:
         self._update_content(content)
             return

+        # 6. Chunk documents if needed
+        if reader and not reader.chunk:
+            read_documents = await reader.chunk_documents_async(read_documents)
+
+        # 7. Prepare and insert the content in the vector database
         file_size = 0
         if read_documents:
             for read_document in read_documents:
                 if read_document.size:
                     file_size += read_document.size
                 read_document.content_id = content.id
-
         await self._handle_vector_db_insert(content, read_documents, upsert)

     async def _load_from_content(
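The rewritten URL loader selects a reader from the suffix of the URL's path and falls back to the text reader for anything unrecognized; note that as written only the literal ".markdown" suffix is matched, so a ".md" URL falls through to the text reader. A minimal sketch of the selection rule, using string keys in place of the actual reader properties:

from pathlib import Path
from urllib.parse import urlparse


def pick_reader_key(url: str) -> str:
    # Route by the extension of the URL's path; unknown suffixes get "text".
    suffix = Path(urlparse(url).path).suffix.lower()
    readers = {".csv": "csv", ".pdf": "pdf", ".docx": "docx", ".json": "json", ".markdown": "markdown"}
    return readers.get(suffix, "text")


print(pick_reader_key("https://example.com/reports/q3.pdf"))  # pdf
print(pick_reader_key("https://example.com/notes.md"))        # text (.md is not matched)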
@@ -699,21 +705,23 @@ class Knowledge:
             log_warning(f"Unsupported remote content type: {type(remote_content)}")

     async def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
-
-
-
-
-
-
-
-
-
-
+        """Load the contextual S3 content.
+
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        8. Remove temporary file if needed
+        """
+        from agno.cloud.aws.s3.object import S3Object

         remote_content: S3Content = cast(S3Content, content.remote_content)

+        # 1. Identify objects to read
         objects_to_read: List[S3Object] = []
-
         if remote_content.bucket is not None:
             if remote_content.key is not None:
                 _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
@@ -725,10 +733,11 @@ class Knowledge:
         else:
             objects_to_read.extend(remote_content.bucket.get_objects())

-        for
+        for s3_object in objects_to_read:
+            # 2. Setup Content object
             id = str(uuid4())
             content_name = content.name or ""
-            content_name += "_" + (
+            content_name += "_" + (s3_object.name or "")
             content_entry = Content(
                 id=id,
                 name=content_name,
@@ -738,63 +747,123 @@ class Knowledge:
                 file_type="s3",
             )

+            # 3. Hash content and add it to the contents database
             content_hash = self._build_content_hash(content_entry)
             if self.vector_db and self.vector_db.content_hash_exists(content_hash) and skip_if_exists:
                 log_info(f"Content {content_hash} already exists, skipping")
                 continue
-
             self._add_to_contents_db(content_entry)

-
+            # 4. Select reader
+            reader = content.reader
+            if reader is None:
+                if s3_object.uri.endswith(".pdf"):
+                    reader = self.pdf_reader
+                elif s3_object.uri.endswith(".csv"):
+                    reader = self.csv_reader
+                elif s3_object.uri.endswith(".docx"):
+                    reader = self.docx_reader
+                elif s3_object.uri.endswith(".json"):
+                    reader = self.json_reader
+                elif s3_object.uri.endswith(".markdown"):
+                    reader = self.markdown_reader
+                else:
+                    reader = self.text_reader
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            temporary_file = None
+            obj_name = content_name or s3_object.name.split("/")[-1]
+            readable_content: Optional[Union[BytesIO, Path]] = None
+            if s3_object.uri.endswith(".pdf"):
+                readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
+            else:
+                temporary_file = Path("storage").joinpath(obj_name)
+                readable_content = temporary_file
+                s3_object.download(readable_content)  # type: ignore
+
+            # 6. Read the content
+            read_documents = reader.read(readable_content, name=obj_name)

+            # 7. Prepare and insert the content in the vector database
             for read_document in read_documents:
                 read_document.content_id = content.id
-
             await self._handle_vector_db_insert(content_entry, read_documents, upsert)

-
-
-
-            else:
-                reader = content.reader
-
-                if reader is None:
-                    log_warning("No reader provided for content")
-                    return
+            # 8. Remove temporary file if needed
+            if temporary_file:
+                temporary_file.unlink()

+    async def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Load the contextual GCS content.
+
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        """
         remote_content: GCSContent = cast(GCSContent, content.remote_content)
-        objects_to_read = []

+        # 1. Identify objects to read
+        objects_to_read = []
         if remote_content.blob_name is not None:
-            objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))
+            objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))  # type: ignore
         elif remote_content.prefix is not None:
-            objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))
+            objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
         else:
-            objects_to_read.extend(remote_content.bucket.list_blobs())
+            objects_to_read.extend(remote_content.bucket.list_blobs())  # type: ignore

-        for
+        for gcs_object in objects_to_read:
+            # 2. Setup Content object
             id = str(uuid4())
+            name = (content.name or "content") + "_" + gcs_object.name
             content_entry = Content(
                 id=id,
-                name=
+                name=name,
                 description=content.description,
                 status=ContentStatus.PROCESSING,
                 metadata=content.metadata,
                 file_type="gcs",
             )

+            # 3. Hash content and add it to the contents database
             content_hash = self._build_content_hash(content_entry)
             if self.vector_db and self.vector_db.content_hash_exists(content_hash) and skip_if_exists:
                 log_info(f"Content {content_hash} already exists, skipping")
                 continue

+            # 4. Add it to the contents database
             self._add_to_contents_db(content_entry)

-
+            # 5. Select reader
+            reader = content.reader
+            if reader is None:
+                if gcs_object.name.endswith(".pdf"):
+                    reader = self.pdf_reader
+                elif gcs_object.name.endswith(".csv"):
+                    reader = self.csv_reader
+                elif gcs_object.name.endswith(".docx"):
+                    reader = self.docx_reader
+                elif gcs_object.name.endswith(".json"):
+                    reader = self.json_reader
+                elif gcs_object.name.endswith(".markdown"):
+                    reader = self.markdown_reader
+                else:
+                    reader = self.text_reader
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            readable_content = BytesIO(gcs_object.download_as_bytes())
+
+            # 6. Read the content
+            read_documents = reader.read(readable_content, name=name)

+            # 7. Prepare and insert the content in the vector database
             for read_document in read_documents:
                 read_document.content_id = content.id
-
             await self._handle_vector_db_insert(content_entry, read_documents, upsert)

     async def _handle_vector_db_insert(self, content, read_documents, upsert):
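In the reworked S3 path, PDFs are read straight into memory while other objects are first downloaded to a temporary file under storage/ and deleted afterwards. A rough sketch of that branch using boto3 directly (agno goes through its own agno.cloud.aws.s3 wrapper, so treat the calls below as an approximation):

from io import BytesIO
from pathlib import Path

import boto3  # direct use here is an approximation of agno's S3Object wrapper


def fetch_s3_payload(bucket: str, key: str):
    obj = boto3.resource("s3").Object(bucket, key)
    if key.endswith(".pdf"):
        # PDFs are streamed fully into memory
        return BytesIO(obj.get()["Body"].read())
    # Everything else goes through a temporary file, removed by the caller
    tmp = Path("storage") / Path(key).name
    obj.download_file(str(tmp))
    return tmp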
@@ -1006,7 +1075,7 @@ class Knowledge:
         elif content_type == KnowledgeContentOrigin.URL:
             log_info(f"Uploading file to LightRAG from URL: {content.url}")
             try:
-                reader = self.
+                reader = content.reader or self.website_reader
                 if reader is None:
                     log_error("No URL reader available")
                     content.status = ContentStatus.FAILED
@@ -1354,14 +1423,6 @@ class Knowledge:
         log_info(f"Selecting reader for extension: {extension}")
         return ReaderFactory.get_reader_for_extension(extension)

-    def _select_url_reader(self, url: str) -> Reader:
-        """Select the appropriate reader for a URL."""
-        return ReaderFactory.get_reader_for_url(url)
-
-    def _select_url_file_reader(self, extension: str) -> Reader:
-        """Select the appropriate reader for a URL file extension."""
-        return ReaderFactory.get_reader_for_url_file(extension)
-
     def get_filters(self) -> List[str]:
         return [
             "filter_tag_1",
@@ -1484,32 +1545,7 @@ class Knowledge:
         """Firecrawl reader - lazy loaded via factory."""
         return self._get_reader("firecrawl")

-    @property
-    def url_reader(self) -> Optional[Reader]:
-        """URL reader - lazy loaded via factory."""
-        return self._get_reader("url")
-
-    @property
-    def pdf_url_reader(self) -> Optional[Reader]:
-        """PDF URL reader - lazy loaded via factory."""
-        return self._get_reader("pdf_url")
-
     @property
     def youtube_reader(self) -> Optional[Reader]:
         """YouTube reader - lazy loaded via factory."""
         return self._get_reader("youtube")
-
-    @property
-    def csv_url_reader(self) -> Optional[Reader]:
-        """CSV URL reader - lazy loaded via factory."""
-        return self._get_reader("csv_url")
-
-    @property
-    def s3_reader(self) -> Optional[Reader]:
-        """S3 reader - lazy loaded via factory."""
-        return self._get_reader("s3")
-
-    @property
-    def gcs_reader(self) -> Optional[Reader]:
-        """GCS reader - lazy loaded via factory."""
-        return self._get_reader("gcs")
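URL content is now fetched once with httpx and handed to readers as bytes, rather than each URL-specific reader fetching for itself (hence the removed url_reader, pdf_url_reader, and csv_url_reader properties). The exact behavior of agno.utils.http.async_fetch_with_retry is not visible in this diff, so the helper below is a rough stand-in with simple exponential backoff:

import asyncio
from io import BytesIO

import httpx


async def fetch_with_retry(url: str, client: httpx.AsyncClient, attempts: int = 3) -> httpx.Response:
    # Rough stand-in for agno.utils.http.async_fetch_with_retry.
    for attempt in range(attempts):
        try:
            response = await client.get(url)
            response.raise_for_status()
            return response
        except httpx.HTTPError:
            if attempt == attempts - 1:
                raise
            await asyncio.sleep(2**attempt)  # simple backoff between retries
    raise RuntimeError("unreachable")


async def main():
    async with httpx.AsyncClient() as client:
        response = await fetch_with_retry("https://example.com/llms.txt", client)
        payload = BytesIO(response.content)  # readers now receive bytes, not the URL
        print(f"fetched {payload.getbuffer().nbytes} bytes")


asyncio.run(main())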
agno/knowledge/reader/arxiv_reader.py
CHANGED
@@ -20,11 +20,11 @@ class ArxivReader(Reader):
     def get_supported_chunking_strategies(self) -> List[ChunkingStrategyType]:
         """Get the list of supported chunking strategies for Arxiv readers."""
         return [
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
-            ChunkingStrategyType.
+            ChunkingStrategyType.FIXED_SIZE_CHUNKER,
+            ChunkingStrategyType.AGENTIC_CHUNKER,
+            ChunkingStrategyType.DOCUMENT_CHUNKER,
+            ChunkingStrategyType.RECURSIVE_CHUNKER,
+            ChunkingStrategyType.SEMANTIC_CHUNKER,
         ]

     @classmethod