agno 0.1.2__py3-none-any.whl → 2.3.13__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
- agno/__init__.py +8 -0
- agno/agent/__init__.py +44 -5
- agno/agent/agent.py +10531 -2975
- agno/api/agent.py +14 -53
- agno/api/api.py +7 -46
- agno/api/evals.py +22 -0
- agno/api/os.py +17 -0
- agno/api/routes.py +6 -25
- agno/api/schemas/__init__.py +9 -0
- agno/api/schemas/agent.py +6 -9
- agno/api/schemas/evals.py +16 -0
- agno/api/schemas/os.py +14 -0
- agno/api/schemas/team.py +10 -10
- agno/api/schemas/utils.py +21 -0
- agno/api/schemas/workflows.py +16 -0
- agno/api/settings.py +53 -0
- agno/api/team.py +22 -26
- agno/api/workflow.py +28 -0
- agno/cloud/aws/base.py +214 -0
- agno/cloud/aws/s3/__init__.py +2 -0
- agno/cloud/aws/s3/api_client.py +43 -0
- agno/cloud/aws/s3/bucket.py +195 -0
- agno/cloud/aws/s3/object.py +57 -0
- agno/compression/__init__.py +3 -0
- agno/compression/manager.py +247 -0
- agno/culture/__init__.py +3 -0
- agno/culture/manager.py +956 -0
- agno/db/__init__.py +24 -0
- agno/db/async_postgres/__init__.py +3 -0
- agno/db/base.py +946 -0
- agno/db/dynamo/__init__.py +3 -0
- agno/db/dynamo/dynamo.py +2781 -0
- agno/db/dynamo/schemas.py +442 -0
- agno/db/dynamo/utils.py +743 -0
- agno/db/firestore/__init__.py +3 -0
- agno/db/firestore/firestore.py +2379 -0
- agno/db/firestore/schemas.py +181 -0
- agno/db/firestore/utils.py +376 -0
- agno/db/gcs_json/__init__.py +3 -0
- agno/db/gcs_json/gcs_json_db.py +1791 -0
- agno/db/gcs_json/utils.py +228 -0
- agno/db/in_memory/__init__.py +3 -0
- agno/db/in_memory/in_memory_db.py +1312 -0
- agno/db/in_memory/utils.py +230 -0
- agno/db/json/__init__.py +3 -0
- agno/db/json/json_db.py +1777 -0
- agno/db/json/utils.py +230 -0
- agno/db/migrations/manager.py +199 -0
- agno/db/migrations/v1_to_v2.py +635 -0
- agno/db/migrations/versions/v2_3_0.py +938 -0
- agno/db/mongo/__init__.py +17 -0
- agno/db/mongo/async_mongo.py +2760 -0
- agno/db/mongo/mongo.py +2597 -0
- agno/db/mongo/schemas.py +119 -0
- agno/db/mongo/utils.py +276 -0
- agno/db/mysql/__init__.py +4 -0
- agno/db/mysql/async_mysql.py +2912 -0
- agno/db/mysql/mysql.py +2923 -0
- agno/db/mysql/schemas.py +186 -0
- agno/db/mysql/utils.py +488 -0
- agno/db/postgres/__init__.py +4 -0
- agno/db/postgres/async_postgres.py +2579 -0
- agno/db/postgres/postgres.py +2870 -0
- agno/db/postgres/schemas.py +187 -0
- agno/db/postgres/utils.py +442 -0
- agno/db/redis/__init__.py +3 -0
- agno/db/redis/redis.py +2141 -0
- agno/db/redis/schemas.py +159 -0
- agno/db/redis/utils.py +346 -0
- agno/db/schemas/__init__.py +4 -0
- agno/db/schemas/culture.py +120 -0
- agno/db/schemas/evals.py +34 -0
- agno/db/schemas/knowledge.py +40 -0
- agno/db/schemas/memory.py +61 -0
- agno/db/singlestore/__init__.py +3 -0
- agno/db/singlestore/schemas.py +179 -0
- agno/db/singlestore/singlestore.py +2877 -0
- agno/db/singlestore/utils.py +384 -0
- agno/db/sqlite/__init__.py +4 -0
- agno/db/sqlite/async_sqlite.py +2911 -0
- agno/db/sqlite/schemas.py +181 -0
- agno/db/sqlite/sqlite.py +2908 -0
- agno/db/sqlite/utils.py +429 -0
- agno/db/surrealdb/__init__.py +3 -0
- agno/db/surrealdb/metrics.py +292 -0
- agno/db/surrealdb/models.py +334 -0
- agno/db/surrealdb/queries.py +71 -0
- agno/db/surrealdb/surrealdb.py +1908 -0
- agno/db/surrealdb/utils.py +147 -0
- agno/db/utils.py +118 -0
- agno/eval/__init__.py +24 -0
- agno/eval/accuracy.py +666 -276
- agno/eval/agent_as_judge.py +861 -0
- agno/eval/base.py +29 -0
- agno/eval/performance.py +779 -0
- agno/eval/reliability.py +241 -62
- agno/eval/utils.py +120 -0
- agno/exceptions.py +143 -1
- agno/filters.py +354 -0
- agno/guardrails/__init__.py +6 -0
- agno/guardrails/base.py +19 -0
- agno/guardrails/openai.py +144 -0
- agno/guardrails/pii.py +94 -0
- agno/guardrails/prompt_injection.py +52 -0
- agno/hooks/__init__.py +3 -0
- agno/hooks/decorator.py +164 -0
- agno/integrations/discord/__init__.py +3 -0
- agno/integrations/discord/client.py +203 -0
- agno/knowledge/__init__.py +5 -1
- agno/{document → knowledge}/chunking/agentic.py +22 -14
- agno/{document → knowledge}/chunking/document.py +2 -2
- agno/{document → knowledge}/chunking/fixed.py +7 -6
- agno/knowledge/chunking/markdown.py +151 -0
- agno/{document → knowledge}/chunking/recursive.py +15 -3
- agno/knowledge/chunking/row.py +39 -0
- agno/knowledge/chunking/semantic.py +91 -0
- agno/knowledge/chunking/strategy.py +165 -0
- agno/knowledge/content.py +74 -0
- agno/knowledge/document/__init__.py +5 -0
- agno/{document → knowledge/document}/base.py +12 -2
- agno/knowledge/embedder/__init__.py +5 -0
- agno/knowledge/embedder/aws_bedrock.py +343 -0
- agno/knowledge/embedder/azure_openai.py +210 -0
- agno/{embedder → knowledge/embedder}/base.py +8 -0
- agno/knowledge/embedder/cohere.py +323 -0
- agno/knowledge/embedder/fastembed.py +62 -0
- agno/{embedder → knowledge/embedder}/fireworks.py +1 -1
- agno/knowledge/embedder/google.py +258 -0
- agno/knowledge/embedder/huggingface.py +94 -0
- agno/knowledge/embedder/jina.py +182 -0
- agno/knowledge/embedder/langdb.py +22 -0
- agno/knowledge/embedder/mistral.py +206 -0
- agno/knowledge/embedder/nebius.py +13 -0
- agno/knowledge/embedder/ollama.py +154 -0
- agno/knowledge/embedder/openai.py +195 -0
- agno/knowledge/embedder/sentence_transformer.py +63 -0
- agno/{embedder → knowledge/embedder}/together.py +1 -1
- agno/knowledge/embedder/vllm.py +262 -0
- agno/knowledge/embedder/voyageai.py +165 -0
- agno/knowledge/knowledge.py +3006 -0
- agno/knowledge/reader/__init__.py +7 -0
- agno/knowledge/reader/arxiv_reader.py +81 -0
- agno/knowledge/reader/base.py +95 -0
- agno/knowledge/reader/csv_reader.py +164 -0
- agno/knowledge/reader/docx_reader.py +82 -0
- agno/knowledge/reader/field_labeled_csv_reader.py +290 -0
- agno/knowledge/reader/firecrawl_reader.py +201 -0
- agno/knowledge/reader/json_reader.py +88 -0
- agno/knowledge/reader/markdown_reader.py +137 -0
- agno/knowledge/reader/pdf_reader.py +431 -0
- agno/knowledge/reader/pptx_reader.py +101 -0
- agno/knowledge/reader/reader_factory.py +313 -0
- agno/knowledge/reader/s3_reader.py +89 -0
- agno/knowledge/reader/tavily_reader.py +193 -0
- agno/knowledge/reader/text_reader.py +127 -0
- agno/knowledge/reader/web_search_reader.py +325 -0
- agno/knowledge/reader/website_reader.py +455 -0
- agno/knowledge/reader/wikipedia_reader.py +91 -0
- agno/knowledge/reader/youtube_reader.py +78 -0
- agno/knowledge/remote_content/remote_content.py +88 -0
- agno/knowledge/reranker/__init__.py +3 -0
- agno/{reranker → knowledge/reranker}/base.py +1 -1
- agno/{reranker → knowledge/reranker}/cohere.py +2 -2
- agno/knowledge/reranker/infinity.py +195 -0
- agno/knowledge/reranker/sentence_transformer.py +54 -0
- agno/knowledge/types.py +39 -0
- agno/knowledge/utils.py +234 -0
- agno/media.py +439 -95
- agno/memory/__init__.py +16 -3
- agno/memory/manager.py +1474 -123
- agno/memory/strategies/__init__.py +15 -0
- agno/memory/strategies/base.py +66 -0
- agno/memory/strategies/summarize.py +196 -0
- agno/memory/strategies/types.py +37 -0
- agno/models/aimlapi/__init__.py +5 -0
- agno/models/aimlapi/aimlapi.py +62 -0
- agno/models/anthropic/__init__.py +4 -0
- agno/models/anthropic/claude.py +960 -496
- agno/models/aws/__init__.py +15 -0
- agno/models/aws/bedrock.py +686 -451
- agno/models/aws/claude.py +190 -183
- agno/models/azure/__init__.py +18 -1
- agno/models/azure/ai_foundry.py +489 -0
- agno/models/azure/openai_chat.py +89 -40
- agno/models/base.py +2477 -550
- agno/models/cerebras/__init__.py +12 -0
- agno/models/cerebras/cerebras.py +565 -0
- agno/models/cerebras/cerebras_openai.py +131 -0
- agno/models/cohere/__init__.py +4 -0
- agno/models/cohere/chat.py +306 -492
- agno/models/cometapi/__init__.py +5 -0
- agno/models/cometapi/cometapi.py +74 -0
- agno/models/dashscope/__init__.py +5 -0
- agno/models/dashscope/dashscope.py +90 -0
- agno/models/deepinfra/__init__.py +5 -0
- agno/models/deepinfra/deepinfra.py +45 -0
- agno/models/deepseek/__init__.py +4 -0
- agno/models/deepseek/deepseek.py +110 -9
- agno/models/fireworks/__init__.py +4 -0
- agno/models/fireworks/fireworks.py +19 -22
- agno/models/google/__init__.py +3 -7
- agno/models/google/gemini.py +1717 -662
- agno/models/google/utils.py +22 -0
- agno/models/groq/__init__.py +4 -0
- agno/models/groq/groq.py +391 -666
- agno/models/huggingface/__init__.py +4 -0
- agno/models/huggingface/huggingface.py +266 -538
- agno/models/ibm/__init__.py +5 -0
- agno/models/ibm/watsonx.py +432 -0
- agno/models/internlm/__init__.py +3 -0
- agno/models/internlm/internlm.py +20 -3
- agno/models/langdb/__init__.py +1 -0
- agno/models/langdb/langdb.py +60 -0
- agno/models/litellm/__init__.py +14 -0
- agno/models/litellm/chat.py +503 -0
- agno/models/litellm/litellm_openai.py +42 -0
- agno/models/llama_cpp/__init__.py +5 -0
- agno/models/llama_cpp/llama_cpp.py +22 -0
- agno/models/lmstudio/__init__.py +5 -0
- agno/models/lmstudio/lmstudio.py +25 -0
- agno/models/message.py +361 -39
- agno/models/meta/__init__.py +12 -0
- agno/models/meta/llama.py +502 -0
- agno/models/meta/llama_openai.py +79 -0
- agno/models/metrics.py +120 -0
- agno/models/mistral/__init__.py +4 -0
- agno/models/mistral/mistral.py +293 -393
- agno/models/nebius/__init__.py +3 -0
- agno/models/nebius/nebius.py +53 -0
- agno/models/nexus/__init__.py +3 -0
- agno/models/nexus/nexus.py +22 -0
- agno/models/nvidia/__init__.py +4 -0
- agno/models/nvidia/nvidia.py +22 -3
- agno/models/ollama/__init__.py +4 -2
- agno/models/ollama/chat.py +257 -492
- agno/models/openai/__init__.py +7 -0
- agno/models/openai/chat.py +725 -770
- agno/models/openai/like.py +16 -2
- agno/models/openai/responses.py +1121 -0
- agno/models/openrouter/__init__.py +4 -0
- agno/models/openrouter/openrouter.py +62 -5
- agno/models/perplexity/__init__.py +5 -0
- agno/models/perplexity/perplexity.py +203 -0
- agno/models/portkey/__init__.py +3 -0
- agno/models/portkey/portkey.py +82 -0
- agno/models/requesty/__init__.py +5 -0
- agno/models/requesty/requesty.py +69 -0
- agno/models/response.py +177 -7
- agno/models/sambanova/__init__.py +4 -0
- agno/models/sambanova/sambanova.py +23 -4
- agno/models/siliconflow/__init__.py +5 -0
- agno/models/siliconflow/siliconflow.py +42 -0
- agno/models/together/__init__.py +4 -0
- agno/models/together/together.py +21 -164
- agno/models/utils.py +266 -0
- agno/models/vercel/__init__.py +3 -0
- agno/models/vercel/v0.py +43 -0
- agno/models/vertexai/__init__.py +0 -1
- agno/models/vertexai/claude.py +190 -0
- agno/models/vllm/__init__.py +3 -0
- agno/models/vllm/vllm.py +83 -0
- agno/models/xai/__init__.py +2 -0
- agno/models/xai/xai.py +111 -7
- agno/os/__init__.py +3 -0
- agno/os/app.py +1027 -0
- agno/os/auth.py +244 -0
- agno/os/config.py +126 -0
- agno/os/interfaces/__init__.py +1 -0
- agno/os/interfaces/a2a/__init__.py +3 -0
- agno/os/interfaces/a2a/a2a.py +42 -0
- agno/os/interfaces/a2a/router.py +249 -0
- agno/os/interfaces/a2a/utils.py +924 -0
- agno/os/interfaces/agui/__init__.py +3 -0
- agno/os/interfaces/agui/agui.py +47 -0
- agno/os/interfaces/agui/router.py +147 -0
- agno/os/interfaces/agui/utils.py +574 -0
- agno/os/interfaces/base.py +25 -0
- agno/os/interfaces/slack/__init__.py +3 -0
- agno/os/interfaces/slack/router.py +148 -0
- agno/os/interfaces/slack/security.py +30 -0
- agno/os/interfaces/slack/slack.py +47 -0
- agno/os/interfaces/whatsapp/__init__.py +3 -0
- agno/os/interfaces/whatsapp/router.py +210 -0
- agno/os/interfaces/whatsapp/security.py +55 -0
- agno/os/interfaces/whatsapp/whatsapp.py +36 -0
- agno/os/mcp.py +293 -0
- agno/os/middleware/__init__.py +9 -0
- agno/os/middleware/jwt.py +797 -0
- agno/os/router.py +258 -0
- agno/os/routers/__init__.py +3 -0
- agno/os/routers/agents/__init__.py +3 -0
- agno/os/routers/agents/router.py +599 -0
- agno/os/routers/agents/schema.py +261 -0
- agno/os/routers/evals/__init__.py +3 -0
- agno/os/routers/evals/evals.py +450 -0
- agno/os/routers/evals/schemas.py +174 -0
- agno/os/routers/evals/utils.py +231 -0
- agno/os/routers/health.py +31 -0
- agno/os/routers/home.py +52 -0
- agno/os/routers/knowledge/__init__.py +3 -0
- agno/os/routers/knowledge/knowledge.py +1008 -0
- agno/os/routers/knowledge/schemas.py +178 -0
- agno/os/routers/memory/__init__.py +3 -0
- agno/os/routers/memory/memory.py +661 -0
- agno/os/routers/memory/schemas.py +88 -0
- agno/os/routers/metrics/__init__.py +3 -0
- agno/os/routers/metrics/metrics.py +190 -0
- agno/os/routers/metrics/schemas.py +47 -0
- agno/os/routers/session/__init__.py +3 -0
- agno/os/routers/session/session.py +997 -0
- agno/os/routers/teams/__init__.py +3 -0
- agno/os/routers/teams/router.py +512 -0
- agno/os/routers/teams/schema.py +257 -0
- agno/os/routers/traces/__init__.py +3 -0
- agno/os/routers/traces/schemas.py +414 -0
- agno/os/routers/traces/traces.py +499 -0
- agno/os/routers/workflows/__init__.py +3 -0
- agno/os/routers/workflows/router.py +624 -0
- agno/os/routers/workflows/schema.py +75 -0
- agno/os/schema.py +534 -0
- agno/os/scopes.py +469 -0
- agno/{playground → os}/settings.py +7 -15
- agno/os/utils.py +973 -0
- agno/reasoning/anthropic.py +80 -0
- agno/reasoning/azure_ai_foundry.py +67 -0
- agno/reasoning/deepseek.py +63 -0
- agno/reasoning/default.py +97 -0
- agno/reasoning/gemini.py +73 -0
- agno/reasoning/groq.py +71 -0
- agno/reasoning/helpers.py +24 -1
- agno/reasoning/ollama.py +67 -0
- agno/reasoning/openai.py +86 -0
- agno/reasoning/step.py +2 -1
- agno/reasoning/vertexai.py +76 -0
- agno/run/__init__.py +6 -0
- agno/run/agent.py +822 -0
- agno/run/base.py +247 -0
- agno/run/cancel.py +81 -0
- agno/run/requirement.py +181 -0
- agno/run/team.py +767 -0
- agno/run/workflow.py +708 -0
- agno/session/__init__.py +10 -0
- agno/session/agent.py +260 -0
- agno/session/summary.py +265 -0
- agno/session/team.py +342 -0
- agno/session/workflow.py +501 -0
- agno/table.py +10 -0
- agno/team/__init__.py +37 -0
- agno/team/team.py +9536 -0
- agno/tools/__init__.py +7 -0
- agno/tools/agentql.py +120 -0
- agno/tools/airflow.py +22 -12
- agno/tools/api.py +122 -0
- agno/tools/apify.py +276 -83
- agno/tools/{arxiv_toolkit.py → arxiv.py} +20 -12
- agno/tools/aws_lambda.py +28 -7
- agno/tools/aws_ses.py +66 -0
- agno/tools/baidusearch.py +11 -4
- agno/tools/bitbucket.py +292 -0
- agno/tools/brandfetch.py +213 -0
- agno/tools/bravesearch.py +106 -0
- agno/tools/brightdata.py +367 -0
- agno/tools/browserbase.py +209 -0
- agno/tools/calcom.py +32 -23
- agno/tools/calculator.py +24 -37
- agno/tools/cartesia.py +187 -0
- agno/tools/{clickup_tool.py → clickup.py} +17 -28
- agno/tools/confluence.py +91 -26
- agno/tools/crawl4ai.py +139 -43
- agno/tools/csv_toolkit.py +28 -22
- agno/tools/dalle.py +36 -22
- agno/tools/daytona.py +475 -0
- agno/tools/decorator.py +169 -14
- agno/tools/desi_vocal.py +23 -11
- agno/tools/discord.py +32 -29
- agno/tools/docker.py +716 -0
- agno/tools/duckdb.py +76 -81
- agno/tools/duckduckgo.py +43 -40
- agno/tools/e2b.py +703 -0
- agno/tools/eleven_labs.py +65 -54
- agno/tools/email.py +13 -5
- agno/tools/evm.py +129 -0
- agno/tools/exa.py +324 -42
- agno/tools/fal.py +39 -35
- agno/tools/file.py +196 -30
- agno/tools/file_generation.py +356 -0
- agno/tools/financial_datasets.py +288 -0
- agno/tools/firecrawl.py +108 -33
- agno/tools/function.py +960 -122
- agno/tools/giphy.py +34 -12
- agno/tools/github.py +1294 -97
- agno/tools/gmail.py +922 -0
- agno/tools/google_bigquery.py +117 -0
- agno/tools/google_drive.py +271 -0
- agno/tools/google_maps.py +253 -0
- agno/tools/googlecalendar.py +607 -107
- agno/tools/googlesheets.py +377 -0
- agno/tools/hackernews.py +20 -12
- agno/tools/jina.py +24 -14
- agno/tools/jira.py +48 -19
- agno/tools/knowledge.py +218 -0
- agno/tools/linear.py +82 -43
- agno/tools/linkup.py +58 -0
- agno/tools/local_file_system.py +15 -7
- agno/tools/lumalab.py +41 -26
- agno/tools/mcp/__init__.py +10 -0
- agno/tools/mcp/mcp.py +331 -0
- agno/tools/mcp/multi_mcp.py +347 -0
- agno/tools/mcp/params.py +24 -0
- agno/tools/mcp_toolbox.py +284 -0
- agno/tools/mem0.py +193 -0
- agno/tools/memory.py +419 -0
- agno/tools/mlx_transcribe.py +11 -9
- agno/tools/models/azure_openai.py +190 -0
- agno/tools/models/gemini.py +203 -0
- agno/tools/models/groq.py +158 -0
- agno/tools/models/morph.py +186 -0
- agno/tools/models/nebius.py +124 -0
- agno/tools/models_labs.py +163 -82
- agno/tools/moviepy_video.py +18 -13
- agno/tools/nano_banana.py +151 -0
- agno/tools/neo4j.py +134 -0
- agno/tools/newspaper.py +15 -4
- agno/tools/newspaper4k.py +19 -6
- agno/tools/notion.py +204 -0
- agno/tools/openai.py +181 -17
- agno/tools/openbb.py +27 -20
- agno/tools/opencv.py +321 -0
- agno/tools/openweather.py +233 -0
- agno/tools/oxylabs.py +385 -0
- agno/tools/pandas.py +25 -15
- agno/tools/parallel.py +314 -0
- agno/tools/postgres.py +238 -185
- agno/tools/pubmed.py +125 -13
- agno/tools/python.py +48 -35
- agno/tools/reasoning.py +283 -0
- agno/tools/reddit.py +207 -29
- agno/tools/redshift.py +406 -0
- agno/tools/replicate.py +69 -26
- agno/tools/resend.py +11 -6
- agno/tools/scrapegraph.py +179 -19
- agno/tools/searxng.py +23 -31
- agno/tools/serpapi.py +15 -10
- agno/tools/serper.py +255 -0
- agno/tools/shell.py +23 -12
- agno/tools/shopify.py +1519 -0
- agno/tools/slack.py +56 -14
- agno/tools/sleep.py +8 -6
- agno/tools/spider.py +35 -11
- agno/tools/spotify.py +919 -0
- agno/tools/sql.py +34 -19
- agno/tools/tavily.py +158 -8
- agno/tools/telegram.py +18 -8
- agno/tools/todoist.py +218 -0
- agno/tools/toolkit.py +134 -9
- agno/tools/trafilatura.py +388 -0
- agno/tools/trello.py +25 -28
- agno/tools/twilio.py +18 -9
- agno/tools/user_control_flow.py +78 -0
- agno/tools/valyu.py +228 -0
- agno/tools/visualization.py +467 -0
- agno/tools/webbrowser.py +28 -0
- agno/tools/webex.py +76 -0
- agno/tools/website.py +23 -19
- agno/tools/webtools.py +45 -0
- agno/tools/whatsapp.py +286 -0
- agno/tools/wikipedia.py +28 -19
- agno/tools/workflow.py +285 -0
- agno/tools/{twitter.py → x.py} +142 -46
- agno/tools/yfinance.py +41 -39
- agno/tools/youtube.py +34 -17
- agno/tools/zendesk.py +15 -5
- agno/tools/zep.py +454 -0
- agno/tools/zoom.py +86 -37
- agno/tracing/__init__.py +12 -0
- agno/tracing/exporter.py +157 -0
- agno/tracing/schemas.py +276 -0
- agno/tracing/setup.py +111 -0
- agno/utils/agent.py +938 -0
- agno/utils/audio.py +37 -1
- agno/utils/certs.py +27 -0
- agno/utils/code_execution.py +11 -0
- agno/utils/common.py +103 -20
- agno/utils/cryptography.py +22 -0
- agno/utils/dttm.py +33 -0
- agno/utils/events.py +700 -0
- agno/utils/functions.py +107 -37
- agno/utils/gemini.py +426 -0
- agno/utils/hooks.py +171 -0
- agno/utils/http.py +185 -0
- agno/utils/json_schema.py +159 -37
- agno/utils/knowledge.py +36 -0
- agno/utils/location.py +19 -0
- agno/utils/log.py +221 -8
- agno/utils/mcp.py +214 -0
- agno/utils/media.py +335 -14
- agno/utils/merge_dict.py +22 -1
- agno/utils/message.py +77 -2
- agno/utils/models/ai_foundry.py +50 -0
- agno/utils/models/claude.py +373 -0
- agno/utils/models/cohere.py +94 -0
- agno/utils/models/llama.py +85 -0
- agno/utils/models/mistral.py +100 -0
- agno/utils/models/openai_responses.py +140 -0
- agno/utils/models/schema_utils.py +153 -0
- agno/utils/models/watsonx.py +41 -0
- agno/utils/openai.py +257 -0
- agno/utils/pickle.py +1 -1
- agno/utils/pprint.py +124 -8
- agno/utils/print_response/agent.py +930 -0
- agno/utils/print_response/team.py +1914 -0
- agno/utils/print_response/workflow.py +1668 -0
- agno/utils/prompts.py +111 -0
- agno/utils/reasoning.py +108 -0
- agno/utils/response.py +163 -0
- agno/utils/serialize.py +32 -0
- agno/utils/shell.py +4 -4
- agno/utils/streamlit.py +487 -0
- agno/utils/string.py +204 -51
- agno/utils/team.py +139 -0
- agno/utils/timer.py +9 -2
- agno/utils/tokens.py +657 -0
- agno/utils/tools.py +19 -1
- agno/utils/whatsapp.py +305 -0
- agno/utils/yaml_io.py +3 -3
- agno/vectordb/__init__.py +2 -0
- agno/vectordb/base.py +87 -9
- agno/vectordb/cassandra/__init__.py +5 -1
- agno/vectordb/cassandra/cassandra.py +383 -27
- agno/vectordb/chroma/__init__.py +4 -0
- agno/vectordb/chroma/chromadb.py +748 -83
- agno/vectordb/clickhouse/__init__.py +7 -1
- agno/vectordb/clickhouse/clickhousedb.py +554 -53
- agno/vectordb/couchbase/__init__.py +3 -0
- agno/vectordb/couchbase/couchbase.py +1446 -0
- agno/vectordb/lancedb/__init__.py +5 -0
- agno/vectordb/lancedb/lance_db.py +730 -98
- agno/vectordb/langchaindb/__init__.py +5 -0
- agno/vectordb/langchaindb/langchaindb.py +163 -0
- agno/vectordb/lightrag/__init__.py +5 -0
- agno/vectordb/lightrag/lightrag.py +388 -0
- agno/vectordb/llamaindex/__init__.py +3 -0
- agno/vectordb/llamaindex/llamaindexdb.py +166 -0
- agno/vectordb/milvus/__init__.py +3 -0
- agno/vectordb/milvus/milvus.py +966 -78
- agno/vectordb/mongodb/__init__.py +9 -1
- agno/vectordb/mongodb/mongodb.py +1175 -172
- agno/vectordb/pgvector/__init__.py +8 -0
- agno/vectordb/pgvector/pgvector.py +599 -115
- agno/vectordb/pineconedb/__init__.py +5 -1
- agno/vectordb/pineconedb/pineconedb.py +406 -43
- agno/vectordb/qdrant/__init__.py +4 -0
- agno/vectordb/qdrant/qdrant.py +914 -61
- agno/vectordb/redis/__init__.py +9 -0
- agno/vectordb/redis/redisdb.py +682 -0
- agno/vectordb/singlestore/__init__.py +8 -1
- agno/vectordb/singlestore/singlestore.py +771 -0
- agno/vectordb/surrealdb/__init__.py +3 -0
- agno/vectordb/surrealdb/surrealdb.py +663 -0
- agno/vectordb/upstashdb/__init__.py +5 -0
- agno/vectordb/upstashdb/upstashdb.py +718 -0
- agno/vectordb/weaviate/__init__.py +8 -0
- agno/vectordb/weaviate/index.py +15 -0
- agno/vectordb/weaviate/weaviate.py +1009 -0
- agno/workflow/__init__.py +23 -1
- agno/workflow/agent.py +299 -0
- agno/workflow/condition.py +759 -0
- agno/workflow/loop.py +756 -0
- agno/workflow/parallel.py +853 -0
- agno/workflow/router.py +723 -0
- agno/workflow/step.py +1564 -0
- agno/workflow/steps.py +613 -0
- agno/workflow/types.py +556 -0
- agno/workflow/workflow.py +4327 -514
- agno-2.3.13.dist-info/METADATA +639 -0
- agno-2.3.13.dist-info/RECORD +613 -0
- {agno-0.1.2.dist-info → agno-2.3.13.dist-info}/WHEEL +1 -1
- agno-2.3.13.dist-info/licenses/LICENSE +201 -0
- agno/api/playground.py +0 -91
- agno/api/schemas/playground.py +0 -22
- agno/api/schemas/user.py +0 -22
- agno/api/schemas/workspace.py +0 -46
- agno/api/user.py +0 -160
- agno/api/workspace.py +0 -151
- agno/cli/auth_server.py +0 -118
- agno/cli/config.py +0 -275
- agno/cli/console.py +0 -88
- agno/cli/credentials.py +0 -23
- agno/cli/entrypoint.py +0 -571
- agno/cli/operator.py +0 -355
- agno/cli/settings.py +0 -85
- agno/cli/ws/ws_cli.py +0 -817
- agno/constants.py +0 -13
- agno/document/__init__.py +0 -1
- agno/document/chunking/semantic.py +0 -47
- agno/document/chunking/strategy.py +0 -31
- agno/document/reader/__init__.py +0 -1
- agno/document/reader/arxiv_reader.py +0 -41
- agno/document/reader/base.py +0 -22
- agno/document/reader/csv_reader.py +0 -84
- agno/document/reader/docx_reader.py +0 -46
- agno/document/reader/firecrawl_reader.py +0 -99
- agno/document/reader/json_reader.py +0 -43
- agno/document/reader/pdf_reader.py +0 -219
- agno/document/reader/s3/pdf_reader.py +0 -46
- agno/document/reader/s3/text_reader.py +0 -51
- agno/document/reader/text_reader.py +0 -41
- agno/document/reader/website_reader.py +0 -175
- agno/document/reader/youtube_reader.py +0 -50
- agno/embedder/__init__.py +0 -1
- agno/embedder/azure_openai.py +0 -86
- agno/embedder/cohere.py +0 -72
- agno/embedder/fastembed.py +0 -37
- agno/embedder/google.py +0 -73
- agno/embedder/huggingface.py +0 -54
- agno/embedder/mistral.py +0 -80
- agno/embedder/ollama.py +0 -57
- agno/embedder/openai.py +0 -74
- agno/embedder/sentence_transformer.py +0 -38
- agno/embedder/voyageai.py +0 -64
- agno/eval/perf.py +0 -201
- agno/file/__init__.py +0 -1
- agno/file/file.py +0 -16
- agno/file/local/csv.py +0 -32
- agno/file/local/txt.py +0 -19
- agno/infra/app.py +0 -240
- agno/infra/base.py +0 -144
- agno/infra/context.py +0 -20
- agno/infra/db_app.py +0 -52
- agno/infra/resource.py +0 -205
- agno/infra/resources.py +0 -55
- agno/knowledge/agent.py +0 -230
- agno/knowledge/arxiv.py +0 -22
- agno/knowledge/combined.py +0 -22
- agno/knowledge/csv.py +0 -28
- agno/knowledge/csv_url.py +0 -19
- agno/knowledge/document.py +0 -20
- agno/knowledge/docx.py +0 -30
- agno/knowledge/json.py +0 -28
- agno/knowledge/langchain.py +0 -71
- agno/knowledge/llamaindex.py +0 -66
- agno/knowledge/pdf.py +0 -28
- agno/knowledge/pdf_url.py +0 -26
- agno/knowledge/s3/base.py +0 -60
- agno/knowledge/s3/pdf.py +0 -21
- agno/knowledge/s3/text.py +0 -23
- agno/knowledge/text.py +0 -30
- agno/knowledge/website.py +0 -88
- agno/knowledge/wikipedia.py +0 -31
- agno/knowledge/youtube.py +0 -22
- agno/memory/agent.py +0 -392
- agno/memory/classifier.py +0 -104
- agno/memory/db/__init__.py +0 -1
- agno/memory/db/base.py +0 -42
- agno/memory/db/mongodb.py +0 -189
- agno/memory/db/postgres.py +0 -203
- agno/memory/db/sqlite.py +0 -193
- agno/memory/memory.py +0 -15
- agno/memory/row.py +0 -36
- agno/memory/summarizer.py +0 -192
- agno/memory/summary.py +0 -19
- agno/memory/workflow.py +0 -38
- agno/models/google/gemini_openai.py +0 -26
- agno/models/ollama/hermes.py +0 -221
- agno/models/ollama/tools.py +0 -362
- agno/models/vertexai/gemini.py +0 -595
- agno/playground/__init__.py +0 -3
- agno/playground/async_router.py +0 -421
- agno/playground/deploy.py +0 -249
- agno/playground/operator.py +0 -92
- agno/playground/playground.py +0 -91
- agno/playground/schemas.py +0 -76
- agno/playground/serve.py +0 -55
- agno/playground/sync_router.py +0 -405
- agno/reasoning/agent.py +0 -68
- agno/run/response.py +0 -112
- agno/storage/agent/__init__.py +0 -0
- agno/storage/agent/base.py +0 -38
- agno/storage/agent/dynamodb.py +0 -350
- agno/storage/agent/json.py +0 -92
- agno/storage/agent/mongodb.py +0 -228
- agno/storage/agent/postgres.py +0 -367
- agno/storage/agent/session.py +0 -79
- agno/storage/agent/singlestore.py +0 -303
- agno/storage/agent/sqlite.py +0 -357
- agno/storage/agent/yaml.py +0 -93
- agno/storage/workflow/__init__.py +0 -0
- agno/storage/workflow/base.py +0 -40
- agno/storage/workflow/mongodb.py +0 -233
- agno/storage/workflow/postgres.py +0 -366
- agno/storage/workflow/session.py +0 -60
- agno/storage/workflow/sqlite.py +0 -359
- agno/tools/googlesearch.py +0 -88
- agno/utils/defaults.py +0 -57
- agno/utils/filesystem.py +0 -39
- agno/utils/git.py +0 -52
- agno/utils/json_io.py +0 -30
- agno/utils/load_env.py +0 -19
- agno/utils/py_io.py +0 -19
- agno/utils/pyproject.py +0 -18
- agno/utils/resource_filter.py +0 -31
- agno/vectordb/singlestore/s2vectordb.py +0 -390
- agno/vectordb/singlestore/s2vectordb2.py +0 -355
- agno/workspace/__init__.py +0 -0
- agno/workspace/config.py +0 -325
- agno/workspace/enums.py +0 -6
- agno/workspace/helpers.py +0 -48
- agno/workspace/operator.py +0 -758
- agno/workspace/settings.py +0 -63
- agno-0.1.2.dist-info/LICENSE +0 -375
- agno-0.1.2.dist-info/METADATA +0 -502
- agno-0.1.2.dist-info/RECORD +0 -352
- agno-0.1.2.dist-info/entry_points.txt +0 -3
- /agno/{cli → db/migrations}/__init__.py +0 -0
- /agno/{cli/ws → db/migrations/versions}/__init__.py +0 -0
- /agno/{document/chunking/__init__.py → db/schemas/metrics.py} +0 -0
- /agno/{document/reader/s3 → integrations}/__init__.py +0 -0
- /agno/{file/local → knowledge/chunking}/__init__.py +0 -0
- /agno/{infra → knowledge/remote_content}/__init__.py +0 -0
- /agno/{knowledge/s3 → tools/models}/__init__.py +0 -0
- /agno/{reranker → utils/models}/__init__.py +0 -0
- /agno/{storage → utils/print_response}/__init__.py +0 -0
- {agno-0.1.2.dist-info → agno-2.3.13.dist-info}/top_level.txt +0 -0
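
The jump from 0.1.2 to 2.3.13 is effectively a rewrite. The CLI, workspace, playground, and infra modules are removed; storage moves to the new `agno.db` backends; and the former `agno.document`, `agno.embedder`, and `agno.reranker` packages are folded into `agno.knowledge`. The largest single addition is `agno/knowledge/knowledge.py` (+3006 lines), whose diff follows. As orientation, here is a minimal, hypothetical sketch of the new `Knowledge` API using only names visible in that diff; the `PgVector` backend, connection string, and file paths are illustrative assumptions, so verify against the agno 2.x documentation before relying on them.

```python
# Hypothetical sketch inferred from the knowledge.py diff below.
# PgVector and its connection string are assumptions, not confirmed by this diff.
from agno.knowledge.knowledge import Knowledge
from agno.vectordb.pgvector import PgVector  # any agno.vectordb backend should work here

knowledge = Knowledge(
    name="docs",
    vector_db=PgVector(table_name="docs", db_url="postgresql+psycopg://user:pass@localhost:5432/db"),
)

# Single item: at least one of path/url/text_content/topics/remote_content is required.
knowledge.add_content(url="https://example.com/guide.pdf", skip_if_exists=True)

# Batch, keyword style: every path/URL is loaded with the shared metadata.
knowledge.add_contents(paths=["./docs"], metadata={"source": "local"})

# Batch, list-of-dicts style (the other add_contents overload):
knowledge.add_contents(
    [
        {"name": "notes", "text_content": "Plain text to index."},
        {"url": "https://example.com/faq.html"},
    ]
)
```

Note that `upsert` defaults to `True` and `skip_if_exists` to `False`, so re-adding existing content updates it unless you opt out.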
agno/knowledge/knowledge.py (new file)
@@ -0,0 +1,3006 @@
+import asyncio
+import hashlib
+import io
+import time
+from dataclasses import dataclass
+from enum import Enum
+from io import BytesIO
+from os.path import basename
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set, Tuple, Union, cast, overload
+
+from httpx import AsyncClient
+
+from agno.db.base import AsyncBaseDb, BaseDb
+from agno.db.schemas.knowledge import KnowledgeRow
+from agno.filters import FilterExpr
+from agno.knowledge.content import Content, ContentAuth, ContentStatus, FileData
+from agno.knowledge.document import Document
+from agno.knowledge.reader import Reader, ReaderFactory
+from agno.knowledge.remote_content.remote_content import GCSContent, RemoteContent, S3Content
+from agno.utils.http import async_fetch_with_retry
+from agno.utils.log import log_debug, log_error, log_info, log_warning
+from agno.utils.string import generate_id
+
+ContentDict = Dict[str, Union[str, Dict[str, str]]]
+
+
+class KnowledgeContentOrigin(Enum):
+    PATH = "path"
+    URL = "url"
+    TOPIC = "topic"
+    CONTENT = "content"
+
+
+@dataclass
+class Knowledge:
+    """Knowledge class"""
+
+    name: Optional[str] = None
+    description: Optional[str] = None
+    vector_db: Optional[Any] = None
+    contents_db: Optional[Union[BaseDb, AsyncBaseDb]] = None
+    max_results: int = 10
+    readers: Optional[Dict[str, Reader]] = None
+
+    def __post_init__(self):
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+        if self.vector_db and not self.vector_db.exists():
+            self.vector_db.create()
+
+        self.construct_readers()
+
+    # --- Add Contents ---
+    @overload
+    async def add_contents_async(self, contents: List[ContentDict]) -> None: ...
+
+    @overload
+    async def add_contents_async(
+        self,
+        *,
+        paths: Optional[List[str]] = None,
+        urls: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, str]] = None,
+        topics: Optional[List[str]] = None,
+        text_contents: Optional[List[str]] = None,
+        reader: Optional[Reader] = None,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+        upsert: bool = True,
+        skip_if_exists: bool = False,
+        remote_content: Optional[RemoteContent] = None,
+    ) -> None: ...
+
+    async def add_contents_async(self, *args, **kwargs) -> None:
+        if args and isinstance(args[0], list):
+            arguments = args[0]
+            upsert = kwargs.get("upsert", True)
+            skip_if_exists = kwargs.get("skip_if_exists", False)
+            for argument in arguments:
+                await self.add_content_async(
+                    name=argument.get("name"),
+                    description=argument.get("description"),
+                    path=argument.get("path"),
+                    url=argument.get("url"),
+                    metadata=argument.get("metadata"),
+                    topics=argument.get("topics"),
+                    text_content=argument.get("text_content"),
+                    reader=argument.get("reader"),
+                    include=argument.get("include"),
+                    exclude=argument.get("exclude"),
+                    upsert=argument.get("upsert", upsert),
+                    skip_if_exists=argument.get("skip_if_exists", skip_if_exists),
+                    remote_content=argument.get("remote_content", None),
+                )
+
+        elif kwargs:
+            name = kwargs.get("name", [])
+            metadata = kwargs.get("metadata", {})
+            description = kwargs.get("description", [])
+            topics = kwargs.get("topics", [])
+            reader = kwargs.get("reader", None)
+            paths = kwargs.get("paths", [])
+            urls = kwargs.get("urls", [])
+            text_contents = kwargs.get("text_contents", [])
+            include = kwargs.get("include")
+            exclude = kwargs.get("exclude")
+            upsert = kwargs.get("upsert", True)
+            skip_if_exists = kwargs.get("skip_if_exists", False)
+            remote_content = kwargs.get("remote_content", None)
+            for path in paths:
+                await self.add_content_async(
+                    name=name,
+                    description=description,
+                    path=path,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+            for url in urls:
+                await self.add_content_async(
+                    name=name,
+                    description=description,
+                    url=url,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+            for i, text_content in enumerate(text_contents):
+                content_name = f"{name}_{i}" if name else f"text_content_{i}"
+                log_debug(f"Adding text content: {content_name}")
+                await self.add_content_async(
+                    name=content_name,
+                    description=description,
+                    text_content=text_content,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+            if topics:
+                await self.add_content_async(
+                    name=name,
+                    description=description,
+                    topics=topics,
+                    metadata=metadata,
+                    include=include,
+                    exclude=exclude,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+
+            if remote_content:
+                await self.add_content_async(
+                    name=name,
+                    metadata=metadata,
+                    description=description,
+                    remote_content=remote_content,
+                    upsert=upsert,
+                    skip_if_exists=skip_if_exists,
+                    reader=reader,
+                )
+
+        else:
+            raise ValueError("Invalid usage of add_contents.")
+
+    @overload
+    def add_contents(self, contents: List[ContentDict]) -> None: ...
+
+    @overload
+    def add_contents(
+        self,
+        *,
+        paths: Optional[List[str]] = None,
+        urls: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, str]] = None,
+        topics: Optional[List[str]] = None,
+        text_contents: Optional[List[str]] = None,
+        reader: Optional[Reader] = None,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+        upsert: bool = True,
+        skip_if_exists: bool = False,
+        remote_content: Optional[RemoteContent] = None,
+    ) -> None: ...
+
+    def add_contents(self, *args, **kwargs) -> None:
+        """
+        Synchronously add multiple content items to the knowledge base.
+
+        Supports two usage patterns:
+        1. Pass a list of content dictionaries as first argument
+        2. Pass keyword arguments with paths, urls, metadata, etc.
+
+        Args:
+            contents: List of content dictionaries (when used as first overload)
+            paths: Optional list of file paths to load content from
+            urls: Optional list of URLs to load content from
+            metadata: Optional metadata dictionary to apply to all content
+            topics: Optional list of topics to add
+            text_contents: Optional list of text content strings to add
+            reader: Optional reader to use for processing content
+            include: Optional list of file patterns to include
+            exclude: Optional list of file patterns to exclude
+            upsert: Whether to update existing content if it already exists (only used when skip_if_exists=False)
+            skip_if_exists: Whether to skip adding content if it already exists (default: False)
|
|
217
|
+
remote_content: Optional remote content (S3, GCS, etc.) to add
|
|
218
|
+
"""
|
|
219
|
+
if args and isinstance(args[0], list):
|
|
220
|
+
arguments = args[0]
|
|
221
|
+
upsert = kwargs.get("upsert", True)
|
|
222
|
+
skip_if_exists = kwargs.get("skip_if_exists", False)
|
|
223
|
+
for argument in arguments:
|
|
224
|
+
self.add_content(
|
|
225
|
+
name=argument.get("name"),
|
|
226
|
+
description=argument.get("description"),
|
|
227
|
+
path=argument.get("path"),
|
|
228
|
+
url=argument.get("url"),
|
|
229
|
+
metadata=argument.get("metadata"),
|
|
230
|
+
topics=argument.get("topics"),
|
|
231
|
+
text_content=argument.get("text_content"),
|
|
232
|
+
reader=argument.get("reader"),
|
|
233
|
+
include=argument.get("include"),
|
|
234
|
+
exclude=argument.get("exclude"),
|
|
235
|
+
upsert=argument.get("upsert", upsert),
|
|
236
|
+
skip_if_exists=argument.get("skip_if_exists", skip_if_exists),
|
|
237
|
+
remote_content=argument.get("remote_content", None),
|
|
238
|
+
)
|
|
239
|
+
|
|
240
|
+
elif kwargs:
|
|
241
|
+
name = kwargs.get("name", [])
|
|
242
|
+
metadata = kwargs.get("metadata", {})
|
|
243
|
+
description = kwargs.get("description", [])
|
|
244
|
+
topics = kwargs.get("topics", [])
|
|
245
|
+
reader = kwargs.get("reader", None)
|
|
246
|
+
paths = kwargs.get("paths", [])
|
|
247
|
+
urls = kwargs.get("urls", [])
|
|
248
|
+
text_contents = kwargs.get("text_contents", [])
|
|
249
|
+
include = kwargs.get("include")
|
|
250
|
+
exclude = kwargs.get("exclude")
|
|
251
|
+
upsert = kwargs.get("upsert", True)
|
|
252
|
+
skip_if_exists = kwargs.get("skip_if_exists", False)
|
|
253
|
+
remote_content = kwargs.get("remote_content", None)
|
|
254
|
+
for path in paths:
|
|
255
|
+
self.add_content(
|
|
256
|
+
name=name,
|
|
257
|
+
description=description,
|
|
258
|
+
path=path,
|
|
259
|
+
metadata=metadata,
|
|
260
|
+
include=include,
|
|
261
|
+
exclude=exclude,
|
|
262
|
+
upsert=upsert,
|
|
263
|
+
skip_if_exists=skip_if_exists,
|
|
264
|
+
reader=reader,
|
|
265
|
+
)
|
|
266
|
+
for url in urls:
|
|
267
|
+
self.add_content(
|
|
268
|
+
name=name,
|
|
269
|
+
description=description,
|
|
270
|
+
url=url,
|
|
271
|
+
metadata=metadata,
|
|
272
|
+
include=include,
|
|
273
|
+
exclude=exclude,
|
|
274
|
+
upsert=upsert,
|
|
275
|
+
skip_if_exists=skip_if_exists,
|
|
276
|
+
reader=reader,
|
|
277
|
+
)
|
|
278
|
+
for i, text_content in enumerate(text_contents):
|
|
279
|
+
content_name = f"{name}_{i}" if name else f"text_content_{i}"
|
|
280
|
+
log_debug(f"Adding text content: {content_name}")
|
|
281
|
+
self.add_content(
|
|
282
|
+
name=content_name,
|
|
283
|
+
description=description,
|
|
284
|
+
text_content=text_content,
|
|
285
|
+
metadata=metadata,
|
|
286
|
+
include=include,
|
|
287
|
+
exclude=exclude,
|
|
288
|
+
upsert=upsert,
|
|
289
|
+
skip_if_exists=skip_if_exists,
|
|
290
|
+
reader=reader,
|
|
291
|
+
)
|
|
292
|
+
if topics:
|
|
293
|
+
self.add_content(
|
|
294
|
+
name=name,
|
|
295
|
+
description=description,
|
|
296
|
+
topics=topics,
|
|
297
|
+
metadata=metadata,
|
|
298
|
+
include=include,
|
|
299
|
+
exclude=exclude,
|
|
300
|
+
upsert=upsert,
|
|
301
|
+
skip_if_exists=skip_if_exists,
|
|
302
|
+
reader=reader,
|
|
303
|
+
)
|
|
304
|
+
|
|
305
|
+
if remote_content:
|
|
306
|
+
self.add_content(
|
|
307
|
+
name=name,
|
|
308
|
+
metadata=metadata,
|
|
309
|
+
description=description,
|
|
310
|
+
remote_content=remote_content,
|
|
311
|
+
upsert=upsert,
|
|
312
|
+
skip_if_exists=skip_if_exists,
|
|
313
|
+
reader=reader,
|
|
314
|
+
)
|
|
315
|
+
|
|
316
|
+
else:
|
|
317
|
+
raise ValueError("Invalid usage of add_contents.")
|
|
318
|
+
|
|
319
|
+
# --- Add Content ---
|
|
320
|
+
|
|
321
|
+
@overload
|
|
322
|
+
async def add_content_async(
|
|
323
|
+
self,
|
|
324
|
+
*,
|
|
325
|
+
path: Optional[str] = None,
|
|
326
|
+
url: Optional[str] = None,
|
|
327
|
+
text_content: Optional[str] = None,
|
|
328
|
+
metadata: Optional[Dict[str, str]] = None,
|
|
329
|
+
include: Optional[List[str]] = None,
|
|
330
|
+
exclude: Optional[List[str]] = None,
|
|
331
|
+
upsert: bool = True,
|
|
332
|
+
skip_if_exists: bool = False,
|
|
333
|
+
reader: Optional[Reader] = None,
|
|
334
|
+
auth: Optional[ContentAuth] = None,
|
|
335
|
+
) -> None: ...
|
|
336
|
+
|
|
337
|
+
@overload
|
|
338
|
+
async def add_content_async(self, *args, **kwargs) -> None: ...
|
|
339
|
+
|
|
340
|
+
async def add_content_async(
|
|
341
|
+
self,
|
|
342
|
+
name: Optional[str] = None,
|
|
343
|
+
description: Optional[str] = None,
|
|
344
|
+
path: Optional[str] = None,
|
|
345
|
+
url: Optional[str] = None,
|
|
346
|
+
text_content: Optional[str] = None,
|
|
347
|
+
metadata: Optional[Dict[str, Any]] = None,
|
|
348
|
+
topics: Optional[List[str]] = None,
|
|
349
|
+
remote_content: Optional[RemoteContent] = None,
|
|
350
|
+
reader: Optional[Reader] = None,
|
|
351
|
+
include: Optional[List[str]] = None,
|
|
352
|
+
exclude: Optional[List[str]] = None,
|
|
353
|
+
upsert: bool = True,
|
|
354
|
+
skip_if_exists: bool = False,
|
|
355
|
+
auth: Optional[ContentAuth] = None,
|
|
356
|
+
) -> None:
|
|
357
|
+
# Validation: At least one of the parameters must be provided
|
|
358
|
+
if all(argument is None for argument in [path, url, text_content, topics, remote_content]):
|
|
359
|
+
log_warning(
|
|
360
|
+
"At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided."
|
|
361
|
+
)
|
|
362
|
+
return
|
|
363
|
+
|
|
364
|
+
content = None
|
|
365
|
+
file_data = None
|
|
366
|
+
if text_content:
|
|
367
|
+
file_data = FileData(content=text_content, type="Text")
|
|
368
|
+
|
|
369
|
+
content = Content(
|
|
370
|
+
name=name,
|
|
371
|
+
description=description,
|
|
372
|
+
path=path,
|
|
373
|
+
url=url,
|
|
374
|
+
file_data=file_data if file_data else None,
|
|
375
|
+
metadata=metadata,
|
|
376
|
+
topics=topics,
|
|
377
|
+
remote_content=remote_content,
|
|
378
|
+
reader=reader,
|
|
379
|
+
auth=auth,
|
|
380
|
+
)
|
|
381
|
+
content.content_hash = self._build_content_hash(content)
|
|
382
|
+
content.id = generate_id(content.content_hash)
|
|
383
|
+
|
|
384
|
+
await self._load_content_async(content, upsert, skip_if_exists, include, exclude)
|
|
385
|
+
|
|
386
|
+
@overload
|
|
387
|
+
def add_content(
|
|
388
|
+
self,
|
|
389
|
+
*,
|
|
390
|
+
path: Optional[str] = None,
|
|
391
|
+
url: Optional[str] = None,
|
|
392
|
+
text_content: Optional[str] = None,
|
|
393
|
+
metadata: Optional[Dict[str, str]] = None,
|
|
394
|
+
include: Optional[List[str]] = None,
|
|
395
|
+
exclude: Optional[List[str]] = None,
|
|
396
|
+
upsert: bool = True,
|
|
397
|
+
skip_if_exists: bool = False,
|
|
398
|
+
reader: Optional[Reader] = None,
|
|
399
|
+
auth: Optional[ContentAuth] = None,
|
|
400
|
+
) -> None: ...
|
|
401
|
+
|
|
402
|
+
@overload
|
|
403
|
+
def add_content(self, *args, **kwargs) -> None: ...
|
|
404
|
+
|
|
405
|
+
def add_content(
|
|
406
|
+
self,
|
|
407
|
+
name: Optional[str] = None,
|
|
408
|
+
description: Optional[str] = None,
|
|
409
|
+
path: Optional[str] = None,
|
|
410
|
+
url: Optional[str] = None,
|
|
411
|
+
text_content: Optional[str] = None,
|
|
412
|
+
metadata: Optional[Dict[str, Any]] = None,
|
|
413
|
+
topics: Optional[List[str]] = None,
|
|
414
|
+
remote_content: Optional[RemoteContent] = None,
|
|
415
|
+
reader: Optional[Reader] = None,
|
|
416
|
+
include: Optional[List[str]] = None,
|
|
417
|
+
exclude: Optional[List[str]] = None,
|
|
418
|
+
upsert: bool = True,
|
|
419
|
+
skip_if_exists: bool = False,
|
|
420
|
+
auth: Optional[ContentAuth] = None,
|
|
421
|
+
) -> None:
|
|
422
|
+
"""
|
|
423
|
+
Synchronously add content to the knowledge base.
|
|
424
|
+
|
|
425
|
+
Args:
|
|
426
|
+
name: Optional name for the content
|
|
427
|
+
description: Optional description for the content
|
|
428
|
+
path: Optional file path to load content from
|
|
429
|
+
url: Optional URL to load content from
|
|
430
|
+
text_content: Optional text content to add directly
|
|
431
|
+
metadata: Optional metadata dictionary
|
|
432
|
+
topics: Optional list of topics
|
|
433
|
+
remote_content: Optional cloud storage configuration
|
|
434
|
+
reader: Optional custom reader for processing the content
|
|
435
|
+
include: Optional list of file patterns to include
|
|
436
|
+
exclude: Optional list of file patterns to exclude
|
|
437
|
+
upsert: Whether to update existing content if it already exists (only used when skip_if_exists=False)
|
|
438
|
+
skip_if_exists: Whether to skip adding content if it already exists (default: False)
|
|
439
|
+
"""
|
|
440
|
+
# Validation: At least one of the parameters must be provided
|
|
441
|
+
if all(argument is None for argument in [path, url, text_content, topics, remote_content]):
|
|
442
|
+
log_warning(
|
|
443
|
+
"At least one of 'path', 'url', 'text_content', 'topics', or 'remote_content' must be provided."
|
|
444
|
+
)
|
|
445
|
+
return
|
|
446
|
+
|
|
447
|
+
content = None
|
|
448
|
+
file_data = None
|
|
449
|
+
if text_content:
|
|
450
|
+
file_data = FileData(content=text_content, type="Text")
|
|
451
|
+
|
|
452
|
+
content = Content(
|
|
453
|
+
name=name,
|
|
454
|
+
description=description,
|
|
455
|
+
path=path,
|
|
456
|
+
url=url,
|
|
457
|
+
file_data=file_data if file_data else None,
|
|
458
|
+
metadata=metadata,
|
|
459
|
+
topics=topics,
|
|
460
|
+
remote_content=remote_content,
|
|
461
|
+
reader=reader,
|
|
462
|
+
auth=auth,
|
|
463
|
+
)
|
|
464
|
+
content.content_hash = self._build_content_hash(content)
|
|
465
|
+
content.id = generate_id(content.content_hash)
|
|
466
|
+
|
|
467
|
+
self._load_content(content, upsert, skip_if_exists, include, exclude)
|
|
468
|
+
|
|
469
|
+
def _should_skip(self, content_hash: str, skip_if_exists: bool) -> bool:
|
|
470
|
+
"""
|
|
471
|
+
Handle the skip_if_exists logic for content that already exists in the vector database.
|
|
472
|
+
|
|
473
|
+
Args:
|
|
474
|
+
content_hash: The content hash string to check for existence
|
|
475
|
+
skip_if_exists: Whether to skip if content already exists
|
|
476
|
+
|
|
477
|
+
Returns:
|
|
478
|
+
bool: True if should skip processing, False if should continue
|
|
479
|
+
"""
|
|
480
|
+
from agno.vectordb import VectorDb
|
|
481
|
+
|
|
482
|
+
self.vector_db = cast(VectorDb, self.vector_db)
|
|
483
|
+
if self.vector_db and self.vector_db.content_hash_exists(content_hash) and skip_if_exists:
|
|
484
|
+
log_debug(f"Content already exists: {content_hash}, skipping...")
|
|
485
|
+
return True
|
|
486
|
+
|
|
487
|
+
return False
|
|
488
|
+
|
|
489
|
+
def _select_reader_by_extension(
|
|
490
|
+
self, file_extension: str, provided_reader: Optional[Reader] = None
|
|
491
|
+
) -> Tuple[Optional[Reader], str]:
|
|
492
|
+
"""
|
|
493
|
+
Select a reader based on file extension.
|
|
494
|
+
|
|
495
|
+
Args:
|
|
496
|
+
file_extension: File extension (e.g., '.pdf', '.csv')
|
|
497
|
+
provided_reader: Optional reader already provided
|
|
498
|
+
|
|
499
|
+
Returns:
|
|
500
|
+
Tuple of (reader, name) where name may be adjusted based on extension
|
|
501
|
+
"""
|
|
502
|
+
if provided_reader:
|
|
503
|
+
return provided_reader, ""
|
|
504
|
+
|
|
505
|
+
file_extension = file_extension.lower()
|
|
506
|
+
if file_extension == ".csv":
|
|
507
|
+
return self.csv_reader, "data.csv"
|
|
508
|
+
elif file_extension == ".pdf":
|
|
509
|
+
return self.pdf_reader, ""
|
|
510
|
+
elif file_extension == ".docx":
|
|
511
|
+
return self.docx_reader, ""
|
|
512
|
+
elif file_extension == ".pptx":
|
|
513
|
+
return self.pptx_reader, ""
|
|
514
|
+
elif file_extension == ".json":
|
|
515
|
+
return self.json_reader, ""
|
|
516
|
+
elif file_extension == ".markdown":
|
|
517
|
+
return self.markdown_reader, ""
|
|
518
|
+
else:
|
|
519
|
+
return self.text_reader, ""
|
|
520
|
+
|
|
521
|
+
def _select_reader_by_uri(self, uri: str, provided_reader: Optional[Reader] = None) -> Optional[Reader]:
|
|
522
|
+
"""
|
|
523
|
+
Select a reader based on URI/file path extension.
|
|
524
|
+
|
|
525
|
+
Args:
|
|
526
|
+
uri: URI or file path
|
|
527
|
+
provided_reader: Optional reader already provided
|
|
528
|
+
|
|
529
|
+
Returns:
|
|
530
|
+
Selected reader or None
|
|
531
|
+
"""
|
|
532
|
+
if provided_reader:
|
|
533
|
+
return provided_reader
|
|
534
|
+
|
|
535
|
+
uri_lower = uri.lower()
|
|
536
|
+
if uri_lower.endswith(".pdf"):
|
|
537
|
+
return self.pdf_reader
|
|
538
|
+
elif uri_lower.endswith(".csv"):
|
|
539
|
+
return self.csv_reader
|
|
540
|
+
elif uri_lower.endswith(".docx"):
|
|
541
|
+
return self.docx_reader
|
|
542
|
+
elif uri_lower.endswith(".pptx"):
|
|
543
|
+
return self.pptx_reader
|
|
544
|
+
elif uri_lower.endswith(".json"):
|
|
545
|
+
return self.json_reader
|
|
546
|
+
elif uri_lower.endswith(".markdown"):
|
|
547
|
+
return self.markdown_reader
|
|
548
|
+
else:
|
|
549
|
+
return self.text_reader
|
|
550
|
+
|
|
551
|
+
def _read(
|
|
552
|
+
self,
|
|
553
|
+
reader: Reader,
|
|
554
|
+
source: Union[Path, str, BytesIO],
|
|
555
|
+
name: Optional[str] = None,
|
|
556
|
+
password: Optional[str] = None,
|
|
557
|
+
) -> List[Document]:
|
|
558
|
+
"""
|
|
559
|
+
Read content using a reader with optional password handling.
|
|
560
|
+
|
|
561
|
+
Args:
|
|
562
|
+
reader: Reader to use
|
|
563
|
+
source: Source to read from (Path, URL string, or BytesIO)
|
|
564
|
+
name: Optional name for the document
|
|
565
|
+
password: Optional password for protected files
|
|
566
|
+
|
|
567
|
+
Returns:
|
|
568
|
+
List of documents read
|
|
569
|
+
"""
|
|
570
|
+
import inspect
|
|
571
|
+
|
|
572
|
+
read_signature = inspect.signature(reader.read)
|
|
573
|
+
if password and "password" in read_signature.parameters:
|
|
574
|
+
if isinstance(source, BytesIO):
|
|
575
|
+
return reader.read(source, name=name, password=password)
|
|
576
|
+
else:
|
|
577
|
+
return reader.read(source, name=name, password=password)
|
|
578
|
+
else:
|
|
579
|
+
if isinstance(source, BytesIO):
|
|
580
|
+
return reader.read(source, name=name)
|
|
581
|
+
else:
|
|
582
|
+
return reader.read(source, name=name)
|
|
583
|
+
|
|
584
|
+
async def _read_async(
|
|
585
|
+
self,
|
|
586
|
+
reader: Reader,
|
|
587
|
+
source: Union[Path, str, BytesIO],
|
|
588
|
+
name: Optional[str] = None,
|
|
589
|
+
password: Optional[str] = None,
|
|
590
|
+
) -> List[Document]:
|
|
591
|
+
"""
|
|
592
|
+
Read content using a reader's async_read method with optional password handling.
|
|
593
|
+
|
|
594
|
+
Args:
|
|
595
|
+
reader: Reader to use
|
|
596
|
+
source: Source to read from (Path, URL string, or BytesIO)
|
|
597
|
+
name: Optional name for the document
|
|
598
|
+
password: Optional password for protected files
|
|
599
|
+
|
|
600
|
+
Returns:
|
|
601
|
+
List of documents read
|
|
602
|
+
"""
|
|
603
|
+
import inspect
|
|
604
|
+
|
|
605
|
+
read_signature = inspect.signature(reader.async_read)
|
|
606
|
+
if password and "password" in read_signature.parameters:
|
|
607
|
+
return await reader.async_read(source, name=name, password=password)
|
|
608
|
+
else:
|
|
609
|
+
if isinstance(source, BytesIO):
|
|
610
|
+
return await reader.async_read(source, name=name)
|
|
611
|
+
else:
|
|
612
|
+
return await reader.async_read(source, name=name)
|
|
613
|
+
|
|
614
|
+
def _prepare_documents_for_insert(
|
|
615
|
+
self,
|
|
616
|
+
documents: List[Document],
|
|
617
|
+
content_id: str,
|
|
618
|
+
calculate_sizes: bool = False,
|
|
619
|
+
metadata: Optional[Dict[str, Any]] = None,
|
|
620
|
+
) -> List[Document]:
|
|
621
|
+
"""
|
|
622
|
+
Prepare documents for insertion by assigning content_id and optionally calculating sizes and updating metadata.
|
|
623
|
+
|
|
624
|
+
Args:
|
|
625
|
+
documents: List of documents to prepare
|
|
626
|
+
content_id: Content ID to assign to documents
|
|
627
|
+
calculate_sizes: Whether to calculate document sizes
|
|
628
|
+
metadata: Optional metadata to merge into document metadata
|
|
629
|
+
|
|
630
|
+
Returns:
|
|
631
|
+
List of prepared documents
|
|
632
|
+
"""
|
|
633
|
+
for document in documents:
|
|
634
|
+
document.content_id = content_id
|
|
635
|
+
if calculate_sizes and document.content and not document.size:
|
|
636
|
+
document.size = len(document.content.encode("utf-8"))
|
|
637
|
+
if metadata:
|
|
638
|
+
document.meta_data.update(metadata)
|
|
639
|
+
return documents
|
|
640
|
+
|
|
641
|
+
def _chunk_documents_sync(self, reader: Reader, documents: List[Document]) -> List[Document]:
|
|
642
|
+
"""
|
|
643
|
+
Chunk documents synchronously.
|
|
644
|
+
|
|
645
|
+
Args:
|
|
646
|
+
reader: Reader with chunking strategy
|
|
647
|
+
documents: Documents to chunk
|
|
648
|
+
|
|
649
|
+
Returns:
|
|
650
|
+
List of chunked documents
|
|
651
|
+
"""
|
|
652
|
+
if not reader or reader.chunk:
|
|
653
|
+
return documents
|
|
654
|
+
|
|
655
|
+
chunked_documents = []
|
|
656
|
+
for doc in documents:
|
|
657
|
+
chunked_documents.extend(reader.chunk_document(doc))
|
|
658
|
+
return chunked_documents
|
|
659
|
+
|
|
660
|
+
+    async def _load_from_path_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+    ):
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        log_info(f"Adding content from path, {content.id}, {content.name}, {content.path}, {content.description}")
+        path = Path(content.path)  # type: ignore
+
+        if path.is_file():
+            if self._should_include_file(str(path), include, exclude):
+                log_debug(f"Adding file {path} due to include/exclude filters")
+
+                await self._add_to_contents_db_async(content)
+                if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+                    content.status = ContentStatus.COMPLETED
+                    await self._aupdate_content(content)
+                    return
+
+                # Handle LightRAG special case - read file and upload directly
+                if self.vector_db.__class__.__name__ == "LightRag":
+                    await self._process_lightrag_content_async(content, KnowledgeContentOrigin.PATH)
+                    return
+
+                if content.reader:
+                    reader = content.reader
+                else:
+                    reader = ReaderFactory.get_reader_for_extension(path.suffix)
+                    log_debug(f"Using Reader: {reader.__class__.__name__}")
+
+                if reader:
+                    password = content.auth.password if content.auth and content.auth.password else None
+                    read_documents = await self._read_async(
+                        reader, path, name=content.name or path.name, password=password
+                    )
+                else:
+                    read_documents = []
+
+                if not content.file_type:
+                    content.file_type = path.suffix
+
+                if not content.size and content.file_data:
+                    content.size = len(content.file_data.content)  # type: ignore
+                if not content.size:
+                    try:
+                        content.size = path.stat().st_size
+                    except (OSError, IOError) as e:
+                        log_warning(f"Could not get file size for {path}: {e}")
+                        content.size = 0
+
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id)
+
+                await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
+        elif path.is_dir():
+            for file_path in path.iterdir():
+                # Apply include/exclude filtering
+                if not self._should_include_file(str(file_path), include, exclude):
+                    log_debug(f"Skipping file {file_path} due to include/exclude filters")
+                    continue
+
+                file_content = Content(
+                    name=content.name,
+                    path=str(file_path),
+                    metadata=content.metadata,
+                    description=content.description,
+                    reader=content.reader,
+                )
+                file_content.content_hash = self._build_content_hash(file_content)
+                file_content.id = generate_id(file_content.content_hash)
+
+                await self._load_from_path_async(file_content, upsert, skip_if_exists, include, exclude)
+        else:
+            log_warning(f"Invalid path: {path}")
+
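The include/exclude gate above decides which files under a path get ingested. `_should_include_file` itself is outside this hunk; the sketch below is an assumption of how such a glob-based filter is commonly written, not the packaged implementation. Note that exclude patterns are given precedence, and an absent include list admits everything.

# Hypothetical stand-in for _should_include_file (not from this diff).
from fnmatch import fnmatch
from typing import List, Optional

def should_include_file(path: str, include: Optional[List[str]], exclude: Optional[List[str]]) -> bool:
    # Exclude patterns win; with no include list, everything passes.
    if exclude and any(fnmatch(path, pattern) for pattern in exclude):
        return False
    if include:
        return any(fnmatch(path, pattern) for pattern in include)
    return True

assert should_include_file("docs/a.pdf", ["*.pdf"], None) is True
assert should_include_file("docs/a.pdf", ["*.pdf"], ["docs/*"]) is False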
+    def _load_from_path(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+    ):
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        log_info(f"Adding content from path, {content.id}, {content.name}, {content.path}, {content.description}")
+        path = Path(content.path)  # type: ignore
+
+        if path.is_file():
+            if self._should_include_file(str(path), include, exclude):
+                log_debug(f"Adding file {path} due to include/exclude filters")
+
+                self._add_to_contents_db(content)
+                if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+                    content.status = ContentStatus.COMPLETED
+                    self._update_content(content)
+                    return
+
+                # Handle LightRAG special case - read file and upload directly
+                if self.vector_db.__class__.__name__ == "LightRag":
+                    self._process_lightrag_content(content, KnowledgeContentOrigin.PATH)
+                    return
+
+                if content.reader:
+                    # TODO: We will refactor this to eventually pass authorization to all readers
+                    import inspect
+
+                    read_signature = inspect.signature(content.reader.read)
+                    if "password" in read_signature.parameters and content.auth and content.auth.password:
+                        read_documents = content.reader.read(
+                            path, name=content.name or path.name, password=content.auth.password
+                        )
+                    else:
+                        read_documents = content.reader.read(path, name=content.name or path.name)
+
+                else:
+                    reader = ReaderFactory.get_reader_for_extension(path.suffix)
+                    log_debug(f"Using Reader: {reader.__class__.__name__}")
+                    if reader:
+                        # TODO: We will refactor this to eventually pass authorization to all readers
+                        import inspect
+
+                        read_signature = inspect.signature(reader.read)
+                        if "password" in read_signature.parameters and content.auth and content.auth.password:
+                            read_documents = reader.read(
+                                path, name=content.name or path.name, password=content.auth.password
+                            )
+                        else:
+                            read_documents = reader.read(path, name=content.name or path.name)
+
+                if not content.file_type:
+                    content.file_type = path.suffix
+
+                if not content.size and content.file_data:
+                    content.size = len(content.file_data.content)  # type: ignore
+                if not content.size:
+                    try:
+                        content.size = path.stat().st_size
+                    except (OSError, IOError) as e:
+                        log_warning(f"Could not get file size for {path}: {e}")
+                        content.size = 0
+
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id)
+
+                self._handle_vector_db_insert(content, read_documents, upsert)
+
+        elif path.is_dir():
+            for file_path in path.iterdir():
+                # Apply include/exclude filtering
+                if not self._should_include_file(str(file_path), include, exclude):
+                    log_debug(f"Skipping file {file_path} due to include/exclude filters")
+                    continue
+
+                file_content = Content(
+                    name=content.name,
+                    path=str(file_path),
+                    metadata=content.metadata,
+                    description=content.description,
+                    reader=content.reader,
+                )
+                file_content.content_hash = self._build_content_hash(file_content)
+                file_content.id = generate_id(file_content.content_hash)
+
+                self._load_from_path(file_content, upsert, skip_if_exists, include, exclude)
+        else:
+            log_warning(f"Invalid path: {path}")
+
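The sync path loader above forwards a password only to readers whose read() signature actually declares one, probed via inspect.signature. A minimal, self-contained sketch of that pattern; the reader classes here are illustrative, not agno's:

import inspect

class PlainReader:
    def read(self, source, name=None):
        return [f"{name}: {source}"]

class PasswordReader:
    def read(self, source, name=None, password=None):
        return [f"{name}: {source} (unlocked with {password})"]

def read_with_optional_password(reader, source, name, password=None):
    # Only forward password when the reader's read() declares it.
    if "password" in inspect.signature(reader.read).parameters and password:
        return reader.read(source, name=name, password=password)
    return reader.read(source, name=name)

print(read_with_optional_password(PasswordReader(), "a.pdf", "a", "s3cret"))
print(read_with_optional_password(PlainReader(), "a.txt", "a", "s3cret"))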
+    async def _load_from_url_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        """Load the content in the contextual URL
+
+        1. Set content hash
+        2. Validate the URL
+        3. Read the content
+        4. Prepare and insert the content in the vector database
+        """
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        log_info(f"Adding content from URL {content.url}")
+        content.file_type = "url"
+
+        if not content.url:
+            raise ValueError("No url provided")
+
+        # 1. Add content to contents database
+        await self._add_to_contents_db_async(content)
+        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+            content.status = ContentStatus.COMPLETED
+            await self._aupdate_content(content)
+            return
+
+        if self.vector_db.__class__.__name__ == "LightRag":
+            await self._process_lightrag_content_async(content, KnowledgeContentOrigin.URL)
+            return
+
+        # 2. Validate URL
+        try:
+            from urllib.parse import urlparse
+
+            parsed_url = urlparse(content.url)
+            if not all([parsed_url.scheme, parsed_url.netloc]):
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Invalid URL format: {content.url}"
+                await self._aupdate_content(content)
+                log_warning(f"Invalid URL format: {content.url}")
+        except Exception as e:
+            content.status = ContentStatus.FAILED
+            content.status_message = f"Invalid URL: {content.url} - {str(e)}"
+            await self._aupdate_content(content)
+            log_warning(f"Invalid URL: {content.url} - {str(e)}")
+
+        # 3. Fetch and load content if file has an extension
+        url_path = Path(parsed_url.path)
+        file_extension = url_path.suffix.lower()
+
+        bytes_content = None
+        if file_extension:
+            async with AsyncClient() as client:
+                response = await async_fetch_with_retry(content.url, client=client)
+                bytes_content = BytesIO(response.content)
+
+        # 4. Select reader
+        name = content.name if content.name else content.url
+        if file_extension:
+            reader, default_name = self._select_reader_by_extension(file_extension, content.reader)
+            if default_name and file_extension == ".csv":
+                name = basename(parsed_url.path) or default_name
+        else:
+            reader = content.reader or self.website_reader
+
+        # 5. Read content
+        try:
+            read_documents = []
+            if reader is not None:
+                # Special handling for YouTubeReader
+                if reader.__class__.__name__ == "YouTubeReader":
+                    read_documents = await reader.async_read(content.url, name=name)
+                else:
+                    password = content.auth.password if content.auth and content.auth.password else None
+                    source = bytes_content if bytes_content else content.url
+                    read_documents = await self._read_async(reader, source, name=name, password=password)
+
+        except Exception as e:
+            log_error(f"Error reading URL: {content.url} - {str(e)}")
+            content.status = ContentStatus.FAILED
+            content.status_message = f"Error reading URL: {content.url} - {str(e)}"
+            await self._aupdate_content(content)
+            return
+
+        # 6. Chunk documents if needed
+        if reader and not reader.chunk:
+            read_documents = await reader.chunk_documents_async(read_documents)
+
+        # 7. Prepare and insert the content in the vector database
+        if not content.id:
+            content.id = generate_id(content.content_hash or "")
+        self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+        await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
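Step 2 above accepts a URL only when urlparse yields both a scheme and a netloc. The same check in isolation, using only the standard library:

from urllib.parse import urlparse

def is_valid_url(url: str) -> bool:
    # A URL is accepted only when both a scheme and a host parse out.
    parsed = urlparse(url)
    return all([parsed.scheme, parsed.netloc])

assert is_valid_url("https://example.com/data.csv")
assert not is_valid_url("example.com/data.csv")  # no scheme -> rejected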
+    def _load_from_url(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        """Synchronous version of _load_from_url.
+
+        Load the content from a URL:
+        1. Set content hash
+        2. Validate the URL
+        3. Read the content
+        4. Prepare and insert the content in the vector database
+        """
+        from agno.utils.http import fetch_with_retry
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        log_info(f"Adding content from URL {content.url}")
+        content.file_type = "url"
+
+        if not content.url:
+            raise ValueError("No url provided")
+
+        # 1. Add content to contents database
+        self._add_to_contents_db(content)
+        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+            content.status = ContentStatus.COMPLETED
+            self._update_content(content)
+            return
+
+        if self.vector_db.__class__.__name__ == "LightRag":
+            self._process_lightrag_content(content, KnowledgeContentOrigin.URL)
+            return
+
+        # 2. Validate URL
+        try:
+            from urllib.parse import urlparse
+
+            parsed_url = urlparse(content.url)
+            if not all([parsed_url.scheme, parsed_url.netloc]):
+                content.status = ContentStatus.FAILED
+                content.status_message = f"Invalid URL format: {content.url}"
+                self._update_content(content)
+                log_warning(f"Invalid URL format: {content.url}")
+        except Exception as e:
+            content.status = ContentStatus.FAILED
+            content.status_message = f"Invalid URL: {content.url} - {str(e)}"
+            self._update_content(content)
+            log_warning(f"Invalid URL: {content.url} - {str(e)}")
+
+        # 3. Fetch and load content if file has an extension
+        url_path = Path(parsed_url.path)
+        file_extension = url_path.suffix.lower()
+
+        bytes_content = None
+        if file_extension:
+            response = fetch_with_retry(content.url)
+            bytes_content = BytesIO(response.content)
+
+        # 4. Select reader
+        name = content.name if content.name else content.url
+        if file_extension:
+            reader, default_name = self._select_reader_by_extension(file_extension, content.reader)
+            if default_name and file_extension == ".csv":
+                name = basename(parsed_url.path) or default_name
+        else:
+            reader = content.reader or self.website_reader
+
+        # 5. Read content
+        try:
+            read_documents = []
+            if reader is not None:
+                # Special handling for YouTubeReader
+                if reader.__class__.__name__ == "YouTubeReader":
+                    read_documents = reader.read(content.url, name=name)
+                else:
+                    password = content.auth.password if content.auth and content.auth.password else None
+                    source = bytes_content if bytes_content else content.url
+                    read_documents = self._read(reader, source, name=name, password=password)
+
+        except Exception as e:
+            log_error(f"Error reading URL: {content.url} - {str(e)}")
+            content.status = ContentStatus.FAILED
+            content.status_message = f"Error reading URL: {content.url} - {str(e)}"
+            self._update_content(content)
+            return
+
+        # 6. Chunk documents if needed (sync version)
+        if reader:
+            read_documents = self._chunk_documents_sync(reader, read_documents)
+
+        # 7. Prepare and insert the content in the vector database
+        if not content.id:
+            content.id = generate_id(content.content_hash or "")
+        self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+        self._handle_vector_db_insert(content, read_documents, upsert)
+
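Steps 3-4 of both URL loaders key the fetch and reader choice off the lowercased suffix of the URL path; query strings never influence the decision because only parsed_url.path is inspected. The same derivation, isolated:

from pathlib import Path
from urllib.parse import urlparse

def url_file_extension(url: str) -> str:
    # Suffix of the path component only; the query string is ignored.
    return Path(urlparse(url).path).suffix.lower()

assert url_file_extension("https://host/report.PDF?dl=1") == ".pdf"
assert url_file_extension("https://host/docs/") == ""  # no extension -> website reader path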
+    async def _load_from_content_async(
+        self,
+        content: Content,
+        upsert: bool = True,
+        skip_if_exists: bool = False,
+    ):
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        if content.name:
+            name = content.name
+        elif content.file_data and content.file_data.content:
+            if isinstance(content.file_data.content, bytes):
+                name = content.file_data.content[:10].decode("utf-8", errors="ignore")
+            elif isinstance(content.file_data.content, str):
+                name = (
+                    content.file_data.content[:10]
+                    if len(content.file_data.content) >= 10
+                    else content.file_data.content
+                )
+            else:
+                name = str(content.file_data.content)[:10]
+        else:
+            name = None
+
+        if name is not None:
+            content.name = name
+
+        log_info(f"Adding content from {content.name}")
+
+        await self._add_to_contents_db_async(content)
+        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+            content.status = ContentStatus.COMPLETED
+            await self._aupdate_content(content)
+            return
+
+        if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
+            await self._process_lightrag_content_async(content, KnowledgeContentOrigin.CONTENT)
+            return
+
+        read_documents = []
+
+        if isinstance(content.file_data, str):
+            content_bytes = content.file_data.encode("utf-8", errors="replace")
+            content_io = io.BytesIO(content_bytes)
+
+            if content.reader:
+                log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
+                read_documents = await content.reader.async_read(content_io, name=name)
+            else:
+                text_reader = self.text_reader
+                if text_reader:
+                    read_documents = await text_reader.async_read(content_io, name=name)
+                else:
+                    content.status = ContentStatus.FAILED
+                    content.status_message = "Text reader not available"
+                    await self._aupdate_content(content)
+                    return
+
+        elif isinstance(content.file_data, FileData):
+            if content.file_data.type:
+                if isinstance(content.file_data.content, bytes):
+                    content_io = io.BytesIO(content.file_data.content)
+                elif isinstance(content.file_data.content, str):
+                    content_bytes = content.file_data.content.encode("utf-8", errors="replace")
+                    content_io = io.BytesIO(content_bytes)
+                else:
+                    content_io = content.file_data.content  # type: ignore
+
+                # Respect an explicitly provided reader; otherwise select based on file type
+                if content.reader:
+                    log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
+                    reader = content.reader
+                else:
+                    reader = self._select_reader(content.file_data.type)
+                name = content.name if content.name else f"content_{content.file_data.type}"
+                read_documents = await reader.async_read(content_io, name=name)
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)
+
+            if len(read_documents) == 0:
+                content.status = ContentStatus.FAILED
+                content.status_message = "Content could not be read"
+                await self._aupdate_content(content)
+                return
+
+        else:
+            content.status = ContentStatus.FAILED
+            content.status_message = "No content provided"
+            await self._aupdate_content(content)
+            return
+
+        await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
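Both content loaders normalize raw payloads to a file-like object before handing them to a reader: strings are UTF-8 encoded with invalid characters replaced, bytes are wrapped directly. The same normalization as a free function:

import io
from typing import Union

def to_bytes_io(payload: Union[str, bytes]) -> io.BytesIO:
    # str -> UTF-8 bytes (lossy on invalid chars), bytes -> wrapped as-is.
    if isinstance(payload, str):
        payload = payload.encode("utf-8", errors="replace")
    return io.BytesIO(payload)

assert to_bytes_io("héllo").read().decode("utf-8") == "héllo"
assert to_bytes_io(b"raw").read() == b"raw"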
+    def _load_from_content(
+        self,
+        content: Content,
+        upsert: bool = True,
+        skip_if_exists: bool = False,
+    ):
+        """Synchronous version of _load_from_content."""
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        if content.name:
+            name = content.name
+        elif content.file_data and content.file_data.content:
+            if isinstance(content.file_data.content, bytes):
+                name = content.file_data.content[:10].decode("utf-8", errors="ignore")
+            elif isinstance(content.file_data.content, str):
+                name = (
+                    content.file_data.content[:10]
+                    if len(content.file_data.content) >= 10
+                    else content.file_data.content
+                )
+            else:
+                name = str(content.file_data.content)[:10]
+        else:
+            name = None
+
+        if name is not None:
+            content.name = name
+
+        log_info(f"Adding content from {content.name}")
+
+        self._add_to_contents_db(content)
+        if self._should_skip(content.content_hash, skip_if_exists):  # type: ignore[arg-type]
+            content.status = ContentStatus.COMPLETED
+            self._update_content(content)
+            return
+
+        if content.file_data and self.vector_db.__class__.__name__ == "LightRag":
+            self._process_lightrag_content(content, KnowledgeContentOrigin.CONTENT)
+            return
+
+        read_documents = []
+
+        if isinstance(content.file_data, str):
+            content_bytes = content.file_data.encode("utf-8", errors="replace")
+            content_io = io.BytesIO(content_bytes)
+
+            if content.reader:
+                log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
+                read_documents = content.reader.read(content_io, name=name)
+            else:
+                text_reader = self.text_reader
+                if text_reader:
+                    read_documents = text_reader.read(content_io, name=name)
+                else:
+                    content.status = ContentStatus.FAILED
+                    content.status_message = "Text reader not available"
+                    self._update_content(content)
+                    return
+
+        elif isinstance(content.file_data, FileData):
+            if content.file_data.type:
+                if isinstance(content.file_data.content, bytes):
+                    content_io = io.BytesIO(content.file_data.content)
+                elif isinstance(content.file_data.content, str):
+                    content_bytes = content.file_data.content.encode("utf-8", errors="replace")
+                    content_io = io.BytesIO(content_bytes)
+                else:
+                    content_io = content.file_data.content  # type: ignore
+
+                # Respect an explicitly provided reader; otherwise select based on file type
+                if content.reader:
+                    log_debug(f"Using reader: {content.reader.__class__.__name__} to read content")
+                    reader = content.reader
+                else:
+                    reader = self._select_reader(content.file_data.type)
+                name = content.name if content.name else f"content_{content.file_data.type}"
+                read_documents = reader.read(content_io, name=name)
+                if not content.id:
+                    content.id = generate_id(content.content_hash or "")
+                self._prepare_documents_for_insert(read_documents, content.id, metadata=content.metadata)
+
+            if len(read_documents) == 0:
+                content.status = ContentStatus.FAILED
+                content.status_message = "Content could not be read"
+                self._update_content(content)
+                return
+
+        else:
+            content.status = ContentStatus.FAILED
+            content.status_message = "No content provided"
+            self._update_content(content)
+            return
+
+        self._handle_vector_db_insert(content, read_documents, upsert)
+
+    async def _load_from_topics_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+        log_info(f"Adding content from topics: {content.topics}")
+
+        if content.topics is None:
+            log_warning("No topics provided for content")
+            return
+
+        for topic in content.topics:
+            content = Content(
+                name=topic,
+                metadata=content.metadata,
+                reader=content.reader,
+                status=ContentStatus.PROCESSING if content.reader else ContentStatus.FAILED,
+                file_data=FileData(
+                    type="Topic",
+                ),
+                topics=[topic],
+            )
+            content.content_hash = self._build_content_hash(content)
+            content.id = generate_id(content.content_hash)
+
+            await self._add_to_contents_db_async(content)
+            if self._should_skip(content.content_hash, skip_if_exists):
+                content.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content)
+                return
+
+            if self.vector_db.__class__.__name__ == "LightRag":
+                await self._process_lightrag_content_async(content, KnowledgeContentOrigin.TOPIC)
+                return
+
+            if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
+                log_info(f"Content {content.content_hash} already exists, skipping")
+                continue
+
+            await self._add_to_contents_db_async(content)
+            if content.reader is None:
+                log_error(f"No reader available for topic: {topic}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "No reader available for topic"
+                await self._aupdate_content(content)
+                continue
+
+            read_documents = await content.reader.async_read(topic)
+            if len(read_documents) > 0:
+                self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+            else:
+                content.status = ContentStatus.FAILED
+                content.status_message = "No content found for topic"
+                await self._aupdate_content(content)
+
+            await self._handle_vector_db_insert_async(content, read_documents, upsert)
+
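Each topic above is fanned out into its own Content entry whose hash mixes the topic with the reader class name (see _build_content_hash later in this hunk), so the same topic read by different readers is stored separately. A simplified sketch of that identity; the real hash may also fold in name/description parts:

import hashlib

def topic_identity(topic: str, reader_class: str) -> str:
    # Same "{topic}-{reader}" mixing as the hash step, reduced to one part.
    return hashlib.sha256(f"{topic}-{reader_class}".encode()).hexdigest()

assert topic_identity("llms", "ArxivReader") != topic_identity("llms", "WikipediaReader")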
+    def _load_from_topics(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        """Synchronous version of _load_from_topics."""
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+        log_info(f"Adding content from topics: {content.topics}")
+
+        if content.topics is None:
+            log_warning("No topics provided for content")
+            return
+
+        for topic in content.topics:
+            content = Content(
+                name=topic,
+                metadata=content.metadata,
+                reader=content.reader,
+                status=ContentStatus.PROCESSING if content.reader else ContentStatus.FAILED,
+                file_data=FileData(
+                    type="Topic",
+                ),
+                topics=[topic],
+            )
+            content.content_hash = self._build_content_hash(content)
+            content.id = generate_id(content.content_hash)
+
+            self._add_to_contents_db(content)
+            if self._should_skip(content.content_hash, skip_if_exists):
+                content.status = ContentStatus.COMPLETED
+                self._update_content(content)
+                return
+
+            if self.vector_db.__class__.__name__ == "LightRag":
+                self._process_lightrag_content(content, KnowledgeContentOrigin.TOPIC)
+                return
+
+            if self.vector_db and self.vector_db.content_hash_exists(content.content_hash) and skip_if_exists:
+                log_info(f"Content {content.content_hash} already exists, skipping")
+                continue
+
+            self._add_to_contents_db(content)
+            if content.reader is None:
+                log_error(f"No reader available for topic: {topic}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "No reader available for topic"
+                self._update_content(content)
+                continue
+
+            read_documents = content.reader.read(topic)
+            if len(read_documents) > 0:
+                self._prepare_documents_for_insert(read_documents, content.id, calculate_sizes=True)
+            else:
+                content.status = ContentStatus.FAILED
+                content.status_message = "No content found for topic"
+                self._update_content(content)
+
+            self._handle_vector_db_insert(content, read_documents, upsert)
+
+    async def _load_from_remote_content_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        if content.remote_content is None:
+            log_warning("No remote content provided for content")
+            return
+
+        remote_content = content.remote_content
+
+        if isinstance(remote_content, S3Content):
+            await self._load_from_s3_async(content, upsert, skip_if_exists)
+
+        elif isinstance(remote_content, GCSContent):
+            await self._load_from_gcs_async(content, upsert, skip_if_exists)
+
+        else:
+            log_warning(f"Unsupported remote content type: {type(remote_content)}")
+
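Remote sources are routed purely on the runtime type of remote_content. The routing, with stand-in dataclasses in place of agno's S3Content/GCSContent (the stubs are illustrative, not the real types):

from dataclasses import dataclass

@dataclass
class S3ContentStub:
    bucket: str
    key: str

@dataclass
class GCSContentStub:
    bucket: str
    blob_name: str

def route(remote_content) -> str:
    # Unknown types fall through to the "unsupported" warning branch.
    if isinstance(remote_content, S3ContentStub):
        return "s3"
    if isinstance(remote_content, GCSContentStub):
        return "gcs"
    return "unsupported"

assert route(S3ContentStub("bucket", "key.pdf")) == "s3"
assert route(GCSContentStub("bucket", "blob.txt")) == "gcs"
assert route(object()) == "unsupported"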
+    async def _load_from_s3_async(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Load the contextual S3 content.
+
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        8. Remove temporary file if needed
+        """
+        from agno.cloud.aws.s3.object import S3Object
+
+        remote_content: S3Content = cast(S3Content, content.remote_content)
+
+        # 1. Identify objects to read
+        objects_to_read: List[S3Object] = []
+        if remote_content.bucket is not None:
+            if remote_content.key is not None:
+                _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
+                objects_to_read.append(_object)
+            elif remote_content.object is not None:
+                objects_to_read.append(remote_content.object)
+            elif remote_content.prefix is not None:
+                objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
+            else:
+                objects_to_read.extend(remote_content.bucket.get_objects())
+
+        for s3_object in objects_to_read:
+            # 2. Setup Content object
+            content_name = content.name or ""
+            content_name += "_" + (s3_object.name or "")
+            content_entry = Content(
+                name=content_name,
+                description=content.description,
+                status=ContentStatus.PROCESSING,
+                metadata=content.metadata,
+                file_type="s3",
+            )
+
+            # 3. Hash content and add it to the contents database
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+            await self._add_to_contents_db_async(content_entry)
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content_entry)
+                return
+
+            # 4. Select reader
+            reader = self._select_reader_by_uri(s3_object.uri, content.reader)
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            temporary_file = None
+            obj_name = content_name or s3_object.name.split("/")[-1]
+            readable_content: Optional[Union[BytesIO, Path]] = None
+            if s3_object.uri.endswith(".pdf"):
+                readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
+            else:
+                temporary_file = Path("storage").joinpath(obj_name)
+                readable_content = temporary_file
+                s3_object.download(readable_content)  # type: ignore
+
+            # 6. Read the content
+            read_documents = await reader.async_read(readable_content, name=obj_name)
+
+            # 7. Prepare and insert the content in the vector database
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            await self._handle_vector_db_insert_async(content_entry, read_documents, upsert)
+
+            # 8. Remove temporary file if needed
+            if temporary_file:
+                temporary_file.unlink()
+
+    async def _load_from_gcs_async(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Load the contextual GCS content.
+
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        """
+        remote_content: GCSContent = cast(GCSContent, content.remote_content)
+
+        # 1. Identify objects to read
+        objects_to_read = []
+        if remote_content.blob_name is not None:
+            objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))  # type: ignore
+        elif remote_content.prefix is not None:
+            objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
+        else:
+            objects_to_read.extend(remote_content.bucket.list_blobs())  # type: ignore
+
+        for gcs_object in objects_to_read:
+            # 2. Setup Content object
+            name = (content.name or "content") + "_" + gcs_object.name
+            content_entry = Content(
+                name=name,
+                description=content.description,
+                status=ContentStatus.PROCESSING,
+                metadata=content.metadata,
+                file_type="gcs",
+            )
+
+            # 3. Hash content and add it to the contents database
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+            await self._add_to_contents_db_async(content_entry)
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                await self._aupdate_content(content_entry)
+                return
+
+            # 4. Select reader
+            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            readable_content = BytesIO(gcs_object.download_as_bytes())
+
+            # 6. Read the content
+            read_documents = await reader.async_read(readable_content, name=name)
+
+            # 7. Prepare and insert the content in the vector database
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            await self._handle_vector_db_insert_async(content_entry, read_documents, upsert)
+
+    def _load_from_remote_content(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+    ):
+        """Synchronous version of _load_from_remote_content."""
+        if content.remote_content is None:
+            log_warning("No remote content provided for content")
+            return
+
+        remote_content = content.remote_content
+
+        if isinstance(remote_content, S3Content):
+            self._load_from_s3(content, upsert, skip_if_exists)
+
+        elif isinstance(remote_content, GCSContent):
+            self._load_from_gcs(content, upsert, skip_if_exists)
+
+        else:
+            log_warning(f"Unsupported remote content type: {type(remote_content)}")
+
+    def _load_from_s3(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Synchronous version of _load_from_s3.
+
+        Load the contextual S3 content:
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        8. Remove temporary file if needed
+        """
+        from agno.cloud.aws.s3.object import S3Object
+
+        remote_content: S3Content = cast(S3Content, content.remote_content)
+
+        # 1. Identify objects to read
+        objects_to_read: List[S3Object] = []
+        if remote_content.bucket is not None:
+            if remote_content.key is not None:
+                _object = S3Object(bucket_name=remote_content.bucket.name, name=remote_content.key)
+                objects_to_read.append(_object)
+            elif remote_content.object is not None:
+                objects_to_read.append(remote_content.object)
+            elif remote_content.prefix is not None:
+                objects_to_read.extend(remote_content.bucket.get_objects(prefix=remote_content.prefix))
+            else:
+                objects_to_read.extend(remote_content.bucket.get_objects())
+
+        for s3_object in objects_to_read:
+            # 2. Setup Content object
+            content_name = content.name or ""
+            content_name += "_" + (s3_object.name or "")
+            content_entry = Content(
+                name=content_name,
+                description=content.description,
+                status=ContentStatus.PROCESSING,
+                metadata=content.metadata,
+                file_type="s3",
+            )
+
+            # 3. Hash content and add it to the contents database
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+            self._add_to_contents_db(content_entry)
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                self._update_content(content_entry)
+                return
+
+            # 4. Select reader
+            reader = self._select_reader_by_uri(s3_object.uri, content.reader)
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            temporary_file = None
+            obj_name = content_name or s3_object.name.split("/")[-1]
+            readable_content: Optional[Union[BytesIO, Path]] = None
+            if s3_object.uri.endswith(".pdf"):
+                readable_content = BytesIO(s3_object.get_resource().get()["Body"].read())
+            else:
+                temporary_file = Path("storage").joinpath(obj_name)
+                readable_content = temporary_file
+                s3_object.download(readable_content)  # type: ignore
+
+            # 6. Read the content
+            read_documents = reader.read(readable_content, name=obj_name)
+
+            # 7. Prepare and insert the content in the vector database
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            self._handle_vector_db_insert(content_entry, read_documents, upsert)
+
+            # 8. Remove temporary file if needed
+            if temporary_file:
+                temporary_file.unlink()
+
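In both S3 loaders, PDFs are streamed straight into memory while every other object is downloaded to a local file under storage/ and unlinked after reading. A sketch of the same split using tempfile instead of a fixed directory (an assumption for illustration, not the packaged behavior):

import tempfile
from io import BytesIO
from pathlib import Path

def stage_object(uri: str, body: bytes):
    """Return (readable_source, path_to_clean_up_or_None)."""
    if uri.endswith(".pdf"):
        return BytesIO(body), None  # kept in memory, nothing to remove
    tmp = tempfile.NamedTemporaryFile(delete=False)
    tmp.write(body)
    tmp.close()
    return Path(tmp.name), Path(tmp.name)

source, cleanup = stage_object("bucket/notes.txt", b"hello")
try:
    data = source.read_bytes() if isinstance(source, Path) else source.read()
finally:
    if cleanup:
        cleanup.unlink()  # mirror the post-read unlink in the hunk above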
+    def _load_from_gcs(self, content: Content, upsert: bool, skip_if_exists: bool):
+        """Synchronous version of _load_from_gcs.
+
+        Load the contextual GCS content:
+        1. Identify objects to read
+        2. Setup Content object
+        3. Hash content and add it to the contents database
+        4. Select reader
+        5. Fetch and load the content
+        6. Read the content
+        7. Prepare and insert the content in the vector database
+        """
+        remote_content: GCSContent = cast(GCSContent, content.remote_content)
+
+        # 1. Identify objects to read
+        objects_to_read = []
+        if remote_content.blob_name is not None:
+            objects_to_read.append(remote_content.bucket.blob(remote_content.blob_name))  # type: ignore
+        elif remote_content.prefix is not None:
+            objects_to_read.extend(remote_content.bucket.list_blobs(prefix=remote_content.prefix))  # type: ignore
+        else:
+            objects_to_read.extend(remote_content.bucket.list_blobs())  # type: ignore
+
+        for gcs_object in objects_to_read:
+            # 2. Setup Content object
+            name = (content.name or "content") + "_" + gcs_object.name
+            content_entry = Content(
+                name=name,
+                description=content.description,
+                status=ContentStatus.PROCESSING,
+                metadata=content.metadata,
+                file_type="gcs",
+            )
+
+            # 3. Hash content and add it to the contents database
+            content_entry.content_hash = self._build_content_hash(content_entry)
+            content_entry.id = generate_id(content_entry.content_hash)
+            self._add_to_contents_db(content_entry)
+            if self._should_skip(content_entry.content_hash, skip_if_exists):
+                content_entry.status = ContentStatus.COMPLETED
+                self._update_content(content_entry)
+                return
+
+            # 4. Select reader
+            reader = self._select_reader_by_uri(gcs_object.name, content.reader)
+            reader = cast(Reader, reader)
+
+            # 5. Fetch and load the content
+            readable_content = BytesIO(gcs_object.download_as_bytes())
+
+            # 6. Read the content
+            read_documents = reader.read(readable_content, name=name)
+
+            # 7. Prepare and insert the content in the vector database
+            if not content.id:
+                content.id = generate_id(content.content_hash or "")
+            self._prepare_documents_for_insert(read_documents, content.id)
+            self._handle_vector_db_insert(content_entry, read_documents, upsert)
+
+    async def _handle_vector_db_insert_async(self, content: Content, read_documents, upsert):
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        if not self.vector_db:
+            log_error("No vector database configured")
+            content.status = ContentStatus.FAILED
+            content.status_message = "No vector database configured"
+            await self._aupdate_content(content)
+            return
+
+        if self.vector_db.upsert_available() and upsert:
+            try:
+                await self.vector_db.async_upsert(content.content_hash, read_documents, content.metadata)  # type: ignore[arg-type]
+            except Exception as e:
+                log_error(f"Error upserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not upsert embedding"
+                await self._aupdate_content(content)
+                return
+        else:
+            try:
+                await self.vector_db.async_insert(
+                    content.content_hash,  # type: ignore[arg-type]
+                    documents=read_documents,
+                    filters=content.metadata,  # type: ignore[arg-type]
+                )
+            except Exception as e:
+                log_error(f"Error inserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not insert embedding"
+                await self._aupdate_content(content)
+                return
+
+        content.status = ContentStatus.COMPLETED
+        await self._aupdate_content(content)
+
+    def _handle_vector_db_insert(self, content: Content, read_documents, upsert):
+        """Synchronously handle vector database insertion."""
+        from agno.vectordb import VectorDb
+
+        self.vector_db = cast(VectorDb, self.vector_db)
+
+        if not self.vector_db:
+            log_error("No vector database configured")
+            content.status = ContentStatus.FAILED
+            content.status_message = "No vector database configured"
+            self._update_content(content)
+            return
+
+        if self.vector_db.upsert_available() and upsert:
+            try:
+                self.vector_db.upsert(content.content_hash, read_documents, content.metadata)  # type: ignore[arg-type]
+            except Exception as e:
+                log_error(f"Error upserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not upsert embedding"
+                self._update_content(content)
+                return
+        else:
+            try:
+                self.vector_db.insert(
+                    content.content_hash,  # type: ignore[arg-type]
+                    documents=read_documents,
+                    filters=content.metadata,  # type: ignore[arg-type]
+                )
+            except Exception as e:
+                log_error(f"Error inserting document: {e}")
+                content.status = ContentStatus.FAILED
+                content.status_message = "Could not insert embedding"
+                self._update_content(content)
+                return
+
+        content.status = ContentStatus.COMPLETED
+        self._update_content(content)
+
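Both insert handlers pick upsert when the backend supports it and the caller asked for it, otherwise fall back to plain insert; either failure marks the content FAILED. The branch logic alone, with a stub in place of the real vector db (the stub is illustrative, not agno's VectorDb interface):

class StubVectorDb:
    """Illustrative stand-in; not agno's VectorDb."""
    def upsert_available(self) -> bool:
        return True
    def upsert(self, content_hash, documents):
        print(f"upserted {len(documents)} documents for {content_hash}")
    def insert(self, content_hash, documents):
        print(f"inserted {len(documents)} documents for {content_hash}")

def write_documents(db, content_hash, documents, upsert):
    # Prefer upsert only when both the backend and the caller allow it.
    if db.upsert_available() and upsert:
        db.upsert(content_hash, documents)
    else:
        db.insert(content_hash, documents)

write_documents(StubVectorDb(), "abc123", ["chunk-1", "chunk-2"], upsert=True)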
+    def _load_content(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+    ) -> None:
+        """Synchronously load content."""
+        if content.path:
+            self._load_from_path(content, upsert, skip_if_exists, include, exclude)
+
+        if content.url:
+            self._load_from_url(content, upsert, skip_if_exists)
+
+        if content.file_data:
+            self._load_from_content(content, upsert, skip_if_exists)
+
+        if content.topics:
+            self._load_from_topics(content, upsert, skip_if_exists)
+
+        if content.remote_content:
+            self._load_from_remote_content(content, upsert, skip_if_exists)
+
+    async def _load_content_async(
+        self,
+        content: Content,
+        upsert: bool,
+        skip_if_exists: bool,
+        include: Optional[List[str]] = None,
+        exclude: Optional[List[str]] = None,
+    ) -> None:
+        if content.path:
+            await self._load_from_path_async(content, upsert, skip_if_exists, include, exclude)
+
+        if content.url:
+            await self._load_from_url_async(content, upsert, skip_if_exists)
+
+        if content.file_data:
+            await self._load_from_content_async(content, upsert, skip_if_exists)
+
+        if content.topics:
+            await self._load_from_topics_async(content, upsert, skip_if_exists)
+
+        if content.remote_content:
+            await self._load_from_remote_content_async(content, upsert, skip_if_exists)
+
+    def _build_content_hash(self, content: Content) -> str:
+        """
+        Build the content hash from the content.
+
+        For URLs and paths, includes the name and description in the hash if provided
+        to ensure unique content with the same URL/path but different names/descriptions
+        get different hashes.
+
+        Hash format:
+        - URL with name and description: hash("{name}:{description}:{url}")
+        - URL with name only: hash("{name}:{url}")
+        - URL with description only: hash("{description}:{url}")
+        - URL without name/description: hash("{url}") (backward compatible)
+        - Same logic applies to paths
+        """
+        hash_parts = []
+        if content.name:
+            hash_parts.append(content.name)
+        if content.description:
+            hash_parts.append(content.description)
+
+        if content.path:
+            hash_parts.append(str(content.path))
+        elif content.url:
+            hash_parts.append(content.url)
+        elif content.file_data and content.file_data.content:
+            # For file_data, always add filename, type, size, or content for uniqueness
+            if content.file_data.filename:
+                hash_parts.append(content.file_data.filename)
+            elif content.file_data.type:
+                hash_parts.append(content.file_data.type)
+            elif content.file_data.size is not None:
+                hash_parts.append(str(content.file_data.size))
+            else:
+                # Fallback: use the content for uniqueness
+                # Include type information to distinguish str vs bytes
+                content_type = "str" if isinstance(content.file_data.content, str) else "bytes"
+                content_bytes = (
+                    content.file_data.content.encode()
+                    if isinstance(content.file_data.content, str)
+                    else content.file_data.content
+                )
+                content_hash = hashlib.sha256(content_bytes).hexdigest()[:16]  # Use first 16 chars
+                hash_parts.append(f"{content_type}:{content_hash}")
+        elif content.topics and len(content.topics) > 0:
+            topic = content.topics[0]
+            reader = type(content.reader).__name__ if content.reader else "unknown"
+            hash_parts.append(f"{topic}-{reader}")
+        else:
+            # Fallback for edge cases
+            import random
+            import string
+
+            fallback = (
+                content.name
+                or content.id
+                or ("unknown_content" + "".join(random.choices(string.ascii_lowercase + string.digits, k=6)))
+            )
+            hash_parts.append(fallback)
+
+        hash_input = ":".join(hash_parts)
+        return hashlib.sha256(hash_input.encode()).hexdigest()
+
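A worked instance of the hash recipe documented above: non-empty name, description, and URL parts are joined with ":" and SHA-256 hashed, so two entries sharing a URL but differing in description get distinct ids, while a bare URL hashes the same as before:

import hashlib

def content_hash(name, description, url):
    parts = [p for p in (name, description, url) if p]
    return hashlib.sha256(":".join(parts).encode()).hexdigest()

h1 = content_hash("manual", "v1", "https://example.com/a.pdf")
h2 = content_hash("manual", "v2", "https://example.com/a.pdf")
assert h1 != h2  # same URL, different description -> different hash
assert content_hash(None, None, "https://example.com/a.pdf") == hashlib.sha256(
    b"https://example.com/a.pdf"
).hexdigest()  # bare URL stays backward compatible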
+    def _ensure_string_field(self, value: Any, field_name: str, default: str = "") -> str:
+        """
+        Safely ensure a field is a string, handling various edge cases.
+
+        Args:
+            value: The value to convert to string
+            field_name: Name of the field for logging purposes
+            default: Default string value if conversion fails
+
+        Returns:
+            str: A safe string value
+        """
+        # Handle None/falsy values
+        if value is None or value == "":
+            return default
+
+        # Handle unexpected list types (the root cause of our Pydantic warning)
+        if isinstance(value, list):
+            if len(value) == 0:
+                log_debug(f"Empty list found for {field_name}, using default: '{default}'")
+                return default
+            elif len(value) == 1:
+                # Single item list, extract the item
+                log_debug(f"Single-item list found for {field_name}, extracting: '{value[0]}'")
+                return str(value[0]) if value[0] is not None else default
+            else:
+                # Multiple items, join them
+                log_debug(f"Multi-item list found for {field_name}, joining: {value}")
+                return " | ".join(str(item) for item in value if item is not None)
+
+        # Handle other unexpected types
+        if not isinstance(value, str):
+            log_debug(f"Non-string type {type(value)} found for {field_name}, converting: '{value}'")
+            try:
+                return str(value)
+            except Exception as e:
+                log_warning(f"Failed to convert {field_name} to string: {e}, using default")
+                return default
+
+        # Already a string, return as-is
+        return value
+
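The coercion rules above, ported to a standalone function for illustration (logging dropped; this is not the class method itself, just its observable behavior):

from typing import Any

def ensure_string(value: Any, default: str = "") -> str:
    # None/"" -> default; lists unwrap or join with " | "; other types -> str().
    if value is None or value == "":
        return default
    if isinstance(value, list):
        items = [str(item) for item in value if item is not None]
        if not items:
            return default
        return items[0] if len(items) == 1 else " | ".join(items)
    return value if isinstance(value, str) else str(value)

assert ensure_string(None) == ""
assert ensure_string(["a"]) == "a"
assert ensure_string(["a", "b"]) == "a | b"
assert ensure_string(42) == "42"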
+    async def _add_to_contents_db_async(self, content: Content):
+        if self.contents_db:
+            created_at = content.created_at if content.created_at else int(time.time())
+            updated_at = content.updated_at if content.updated_at else int(time.time())
+
+            file_type = (
+                content.file_type
+                if content.file_type
+                else content.file_data.type
+                if content.file_data and content.file_data.type
+                else None
+            )
+            # Safely handle string fields with proper type checking
+            safe_name = self._ensure_string_field(content.name, "content.name", default="")
+            safe_description = self._ensure_string_field(content.description, "content.description", default="")
+            safe_linked_to = self._ensure_string_field(self.name, "knowledge.name", default="")
+            safe_status_message = self._ensure_string_field(
+                content.status_message, "content.status_message", default=""
+            )
+
+            content_row = KnowledgeRow(
+                id=content.id,
+                name=safe_name,
+                description=safe_description,
+                metadata=content.metadata,
+                type=file_type,
+                size=content.size
+                if content.size
+                else len(content.file_data.content)
+                if content.file_data and content.file_data.content
+                else None,
+                linked_to=safe_linked_to,
+                access_count=0,
+                status=content.status if content.status else ContentStatus.PROCESSING,
+                status_message=safe_status_message,
+                created_at=created_at,
+                updated_at=updated_at,
+            )
+            if isinstance(self.contents_db, AsyncBaseDb):
+                await self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+            else:
+                self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+
+    def _add_to_contents_db(self, content: Content):
+        """Synchronously add content to contents database."""
+        if self.contents_db:
+            if isinstance(self.contents_db, AsyncBaseDb):
+                raise ValueError(
+                    "_add_to_contents_db() is not supported with an async DB. Please use add_content_async with AsyncDb."
+                )
+
+            created_at = content.created_at if content.created_at else int(time.time())
+            updated_at = content.updated_at if content.updated_at else int(time.time())
+
+            file_type = (
+                content.file_type
+                if content.file_type
+                else content.file_data.type
+                if content.file_data and content.file_data.type
+                else None
+            )
+            # Safely handle string fields with proper type checking
+            safe_name = self._ensure_string_field(content.name, "content.name", default="")
+            safe_description = self._ensure_string_field(content.description, "content.description", default="")
+            safe_linked_to = self._ensure_string_field(self.name, "knowledge.name", default="")
+            safe_status_message = self._ensure_string_field(
+                content.status_message, "content.status_message", default=""
+            )
+
+            content_row = KnowledgeRow(
+                id=content.id,
+                name=safe_name,
+                description=safe_description,
+                metadata=content.metadata,
+                type=file_type,
+                size=content.size
+                if content.size
+                else len(content.file_data.content)
+                if content.file_data and content.file_data.content
+                else None,
+                linked_to=safe_linked_to,
+                access_count=0,
+                status=content.status if content.status else ContentStatus.PROCESSING,
+                status_message=safe_status_message,
+                created_at=created_at,
+                updated_at=updated_at,
+            )
+            self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
+
    def _update_content(self, content: Content) -> Optional[Dict[str, Any]]:
        from agno.vectordb import VectorDb

        self.vector_db = cast(VectorDb, self.vector_db)
        if self.contents_db:
            if isinstance(self.contents_db, AsyncBaseDb):
                raise ValueError(
                    "update_content() is not supported with an async DB. Please use aupdate_content() instead."
                )

            if not content.id:
                log_warning("Content id is required to update Knowledge content")
                return None

            # TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
            content_row = self.contents_db.get_knowledge_content(content.id)
            if content_row is None:
                log_warning(f"Content row not found for id: {content.id}, cannot update status")
                return None

            # Apply safe string handling for updates as well
            if content.name is not None:
                content_row.name = self._ensure_string_field(content.name, "content.name", default="")
            if content.description is not None:
                content_row.description = self._ensure_string_field(
                    content.description, "content.description", default=""
                )
            if content.metadata is not None:
                content_row.metadata = content.metadata
            if content.status is not None:
                content_row.status = content.status
            if content.status_message is not None:
                content_row.status_message = self._ensure_string_field(
                    content.status_message, "content.status_message", default=""
                )
            if content.external_id is not None:
                content_row.external_id = self._ensure_string_field(
                    content.external_id, "content.external_id", default=""
                )
            content_row.updated_at = int(time.time())
            self.contents_db.upsert_knowledge_content(knowledge_row=content_row)

            if self.vector_db:
                self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata or {})

            return content_row.to_dict()

        else:
            if self.name:
                log_warning(f"Contents DB not found for knowledge base: {self.name}")
            else:
                log_warning("Contents DB not found for knowledge base")
            return None

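    # Partial-update semantics, sketched (the Content values are assumptions):
    # only fields explicitly set on `content` overwrite the stored row, so
    #
    #     knowledge._update_content(Content(id="abc123", description="new text"))
    #
    # leaves name, metadata, and status untouched while bumping updated_at and,
    # when a vector DB is configured, re-syncing metadata into it.
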
    async def _aupdate_content(self, content: Content) -> Optional[Dict[str, Any]]:
        if self.contents_db:
            if not content.id:
                log_warning("Content id is required to update Knowledge content")
                return None

            # TODO: we shouldn't check for content here, we should trust the upsert method to handle conflicts
            if isinstance(self.contents_db, AsyncBaseDb):
                content_row = await self.contents_db.get_knowledge_content(content.id)
            else:
                content_row = self.contents_db.get_knowledge_content(content.id)
            if content_row is None:
                log_warning(f"Content row not found for id: {content.id}, cannot update status")
                return None

            if content.name is not None:
                content_row.name = content.name
            if content.description is not None:
                content_row.description = content.description
            if content.metadata is not None:
                content_row.metadata = content.metadata
            if content.status is not None:
                content_row.status = content.status
            if content.status_message is not None:
                content_row.status_message = content.status_message if content.status_message else ""
            if content.external_id is not None:
                content_row.external_id = content.external_id

            content_row.updated_at = int(time.time())
            if isinstance(self.contents_db, AsyncBaseDb):
                await self.contents_db.upsert_knowledge_content(knowledge_row=content_row)
            else:
                self.contents_db.upsert_knowledge_content(knowledge_row=content_row)

            if self.vector_db:
                self.vector_db.update_metadata(content_id=content.id, metadata=content.metadata or {})

            return content_row.to_dict()

        else:
            if self.name:
                log_warning(f"Contents DB not found for knowledge base: {self.name}")
            else:
                log_warning("Contents DB not found for knowledge base")
            return None

    async def _process_lightrag_content_async(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
        from agno.vectordb import VectorDb

        self.vector_db = cast(VectorDb, self.vector_db)

        await self._add_to_contents_db_async(content)
        if content_type == KnowledgeContentOrigin.PATH:
            if content.file_data is None:
                log_warning("No file data provided")

            if content.path is None:
                log_error("No path provided for content")
                return

            path = Path(content.path)

            log_info(f"Uploading file to LightRAG from path: {path}")
            try:
                # Read the file content from path
                with open(path, "rb") as f:
                    file_content = f.read()

                # Get file type from extension or content.file_type
                file_type = content.file_type or path.suffix

                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
                    result = await self.vector_db.insert_file_bytes(
                        file_content=file_content,
                        filename=path.name,  # Use the original filename with extension
                        content_type=file_type,
                        send_metadata=True,  # Enable metadata so server knows the file type
                    )

                else:
                    log_error("Vector database does not support file insertion")
                    content.status = ContentStatus.FAILED
                    await self._aupdate_content(content)
                    return
                content.external_id = result
                content.status = ContentStatus.COMPLETED
                await self._aupdate_content(content)
                return

            except Exception as e:
                log_error(f"Error uploading file to LightRAG: {e}")
                content.status = ContentStatus.FAILED
                content.status_message = f"Could not upload to LightRAG: {str(e)}"
                await self._aupdate_content(content)
                return

        elif content_type == KnowledgeContentOrigin.URL:
            log_info(f"Uploading file to LightRAG from URL: {content.url}")
            try:
                reader = content.reader or self.website_reader
                if reader is None:
                    log_error("No URL reader available")
                    content.status = ContentStatus.FAILED
                    await self._aupdate_content(content)
                    return

                reader.chunk = False
                read_documents = reader.read(content.url, name=content.name)
                if not content.id:
                    content.id = generate_id(content.content_hash or "")
                self._prepare_documents_for_insert(read_documents, content.id)

                if not read_documents:
                    log_error("No documents read from URL")
                    content.status = ContentStatus.FAILED
                    await self._aupdate_content(content)
                    return

                if self.vector_db and hasattr(self.vector_db, "insert_text"):
                    result = await self.vector_db.insert_text(
                        file_source=content.url,
                        text=read_documents[0].content,
                    )
                else:
                    log_error("Vector database does not support text insertion")
                    content.status = ContentStatus.FAILED
                    await self._aupdate_content(content)
                    return

                content.external_id = result
                content.status = ContentStatus.COMPLETED
                await self._aupdate_content(content)
                return

            except Exception as e:
                log_error(f"Error uploading file to LightRAG: {e}")
                content.status = ContentStatus.FAILED
                content.status_message = f"Could not upload to LightRAG: {str(e)}"
                await self._aupdate_content(content)
                return

        elif content_type == KnowledgeContentOrigin.CONTENT:
            filename = (
                content.file_data.filename if content.file_data and content.file_data.filename else "uploaded_file"
            )
            log_info(f"Uploading file to LightRAG: {filename}")

            # Use the content from file_data
            if content.file_data and content.file_data.content:
                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
                    result = await self.vector_db.insert_file_bytes(
                        file_content=content.file_data.content,
                        filename=filename,
                        content_type=content.file_data.type,
                        send_metadata=True,  # Enable metadata so server knows the file type
                    )
                else:
                    log_error("Vector database does not support file insertion")
                    content.status = ContentStatus.FAILED
                    await self._aupdate_content(content)
                    return
                content.external_id = result
                content.status = ContentStatus.COMPLETED
                await self._aupdate_content(content)
            else:
                log_warning(f"No file data available for LightRAG upload: {content.name}")
                return

        elif content_type == KnowledgeContentOrigin.TOPIC:
            log_info(f"Uploading file to LightRAG: {content.name}")

            if content.reader is None:
                log_error("No reader available for topic content")
                content.status = ContentStatus.FAILED
                await self._aupdate_content(content)
                return

            if not content.topics:
                log_error("No topics available for content")
                content.status = ContentStatus.FAILED
                await self._aupdate_content(content)
                return

            read_documents = content.reader.read(content.topics)
            if len(read_documents) > 0:
                if self.vector_db and hasattr(self.vector_db, "insert_text"):
                    result = await self.vector_db.insert_text(
                        file_source=content.topics[0],
                        text=read_documents[0].content,
                    )
                else:
                    log_error("Vector database does not support text insertion")
                    content.status = ContentStatus.FAILED
                    await self._aupdate_content(content)
                    return
                content.external_id = result
                content.status = ContentStatus.COMPLETED
                await self._aupdate_content(content)
                return
            else:
                log_warning(f"No documents found for LightRAG upload: {content.name}")
                return

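    # Dispatch note, descriptive only: LightRAG-style stores are detected by
    # duck typing (hasattr checks for insert_file_bytes / insert_text) rather
    # than an explicit interface, so any vector DB exposing those coroutines is
    # routed here, with the content row tracking PROCESSING -> COMPLETED/FAILED.
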
    def _process_lightrag_content(self, content: Content, content_type: KnowledgeContentOrigin) -> None:
        """Synchronously process LightRAG content. Uses asyncio.run() only for LightRAG-specific async methods."""
        from agno.vectordb import VectorDb

        self.vector_db = cast(VectorDb, self.vector_db)

        self._add_to_contents_db(content)
        if content_type == KnowledgeContentOrigin.PATH:
            if content.file_data is None:
                log_warning("No file data provided")

            if content.path is None:
                log_error("No path provided for content")
                return

            path = Path(content.path)

            log_info(f"Uploading file to LightRAG from path: {path}")
            try:
                # Read the file content from path
                with open(path, "rb") as f:
                    file_content = f.read()

                # Get file type from extension or content.file_type
                file_type = content.file_type or path.suffix

                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
                    # LightRAG only has async methods, use asyncio.run() here
                    result = asyncio.run(
                        self.vector_db.insert_file_bytes(
                            file_content=file_content,
                            filename=path.name,
                            content_type=file_type,
                            send_metadata=True,
                        )
                    )
                else:
                    log_error("Vector database does not support file insertion")
                    content.status = ContentStatus.FAILED
                    self._update_content(content)
                    return
                content.external_id = result
                content.status = ContentStatus.COMPLETED
                self._update_content(content)
                return

            except Exception as e:
                log_error(f"Error uploading file to LightRAG: {e}")
                content.status = ContentStatus.FAILED
                content.status_message = f"Could not upload to LightRAG: {str(e)}"
                self._update_content(content)
                return

        elif content_type == KnowledgeContentOrigin.URL:
            log_info(f"Uploading file to LightRAG from URL: {content.url}")
            try:
                reader = content.reader or self.website_reader
                if reader is None:
                    log_error("No URL reader available")
                    content.status = ContentStatus.FAILED
                    self._update_content(content)
                    return

                reader.chunk = False
                read_documents = reader.read(content.url, name=content.name)
                if not content.id:
                    content.id = generate_id(content.content_hash or "")
                self._prepare_documents_for_insert(read_documents, content.id)

                if not read_documents:
                    log_error("No documents read from URL")
                    content.status = ContentStatus.FAILED
                    self._update_content(content)
                    return

                if self.vector_db and hasattr(self.vector_db, "insert_text"):
                    # LightRAG only has async methods, use asyncio.run() here
                    result = asyncio.run(
                        self.vector_db.insert_text(
                            file_source=content.url,
                            text=read_documents[0].content,
                        )
                    )
                else:
                    log_error("Vector database does not support text insertion")
                    content.status = ContentStatus.FAILED
                    self._update_content(content)
                    return

                content.external_id = result
                content.status = ContentStatus.COMPLETED
                self._update_content(content)
                return

            except Exception as e:
                log_error(f"Error uploading file to LightRAG: {e}")
                content.status = ContentStatus.FAILED
                content.status_message = f"Could not upload to LightRAG: {str(e)}"
                self._update_content(content)
                return

        elif content_type == KnowledgeContentOrigin.CONTENT:
            filename = (
                content.file_data.filename if content.file_data and content.file_data.filename else "uploaded_file"
            )
            log_info(f"Uploading file to LightRAG: {filename}")

            # Use the content from file_data
            if content.file_data and content.file_data.content:
                if self.vector_db and hasattr(self.vector_db, "insert_file_bytes"):
                    # LightRAG only has async methods, use asyncio.run() here
                    result = asyncio.run(
                        self.vector_db.insert_file_bytes(
                            file_content=content.file_data.content,
                            filename=filename,
                            content_type=content.file_data.type,
                            send_metadata=True,
                        )
                    )
                else:
                    log_error("Vector database does not support file insertion")
                    content.status = ContentStatus.FAILED
                    self._update_content(content)
                    return
                content.external_id = result
                content.status = ContentStatus.COMPLETED
                self._update_content(content)
            else:
                log_warning(f"No file data available for LightRAG upload: {content.name}")
                return

        elif content_type == KnowledgeContentOrigin.TOPIC:
            log_info(f"Uploading file to LightRAG: {content.name}")

            if content.reader is None:
                log_error("No reader available for topic content")
                content.status = ContentStatus.FAILED
                self._update_content(content)
                return

            if not content.topics:
                log_error("No topics available for content")
                content.status = ContentStatus.FAILED
                self._update_content(content)
                return

            read_documents = content.reader.read(content.topics)
            if len(read_documents) > 0:
                if self.vector_db and hasattr(self.vector_db, "insert_text"):
                    # LightRAG only has async methods, use asyncio.run() here
                    result = asyncio.run(
                        self.vector_db.insert_text(
                            file_source=content.topics[0],
                            text=read_documents[0].content,
                        )
                    )
                else:
                    log_error("Vector database does not support text insertion")
                    content.status = ContentStatus.FAILED
                    self._update_content(content)
                    return
                content.external_id = result
                content.status = ContentStatus.COMPLETED
                self._update_content(content)
                return
            else:
                log_warning(f"No documents found for LightRAG upload: {content.name}")
                return

    def search(
        self,
        query: str,
        max_results: Optional[int] = None,
        filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None,
        search_type: Optional[str] = None,
    ) -> List[Document]:
        """Returns relevant documents matching a query"""
        from agno.vectordb import VectorDb
        from agno.vectordb.search import SearchType

        self.vector_db = cast(VectorDb, self.vector_db)

        if (
            hasattr(self.vector_db, "search_type")
            and isinstance(self.vector_db.search_type, SearchType)
            and search_type
        ):
            self.vector_db.search_type = SearchType(search_type)
        try:
            if self.vector_db is None:
                log_warning("No vector db provided")
                return []

            _max_results = max_results or self.max_results
            log_debug(f"Getting {_max_results} relevant documents for query: {query}")
            return self.vector_db.search(query=query, limit=_max_results, filters=filters)
        except Exception as e:
            log_error(f"Error searching for documents: {e}")
            return []

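    # Usage sketch (illustrative; `knowledge` is an assumed configured instance):
    #
    #     docs = knowledge.search("vector databases", max_results=5, search_type="hybrid")
    #     for doc in docs:
    #         print(doc.name)
    #
    # search_type only takes effect when the underlying vector DB exposes a
    # SearchType-valued search_type attribute; otherwise it is silently ignored.
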
    async def async_search(
        self,
        query: str,
        max_results: Optional[int] = None,
        filters: Optional[Union[Dict[str, Any], List[FilterExpr]]] = None,
        search_type: Optional[str] = None,
    ) -> List[Document]:
        """Returns relevant documents matching a query"""
        from agno.vectordb import VectorDb
        from agno.vectordb.search import SearchType

        self.vector_db = cast(VectorDb, self.vector_db)
        if (
            hasattr(self.vector_db, "search_type")
            and isinstance(self.vector_db.search_type, SearchType)
            and search_type
        ):
            self.vector_db.search_type = SearchType(search_type)
        try:
            if self.vector_db is None:
                log_warning("No vector db provided")
                return []

            _max_results = max_results or self.max_results
            log_debug(f"Getting {_max_results} relevant documents for query: {query}")
            try:
                return await self.vector_db.async_search(query=query, limit=_max_results, filters=filters)
            except NotImplementedError:
                log_info("Vector db does not support async search")
                return self.search(query=query, max_results=_max_results, filters=filters)
        except Exception as e:
            log_error(f"Error searching for documents: {e}")
            return []

    def get_valid_filters(self) -> Set[str]:
        if self.contents_db is None:
            log_warning("No contents db provided. This is required for filtering.")
            return set()
        contents, _ = self.get_content()
        valid_filters: Set[str] = set()
        for content in contents:
            if content.metadata:
                valid_filters.update(content.metadata.keys())

        return valid_filters

    async def async_get_valid_filters(self) -> Set[str]:
        if self.contents_db is None:
            log_warning("No contents db provided. This is required for filtering.")
            return set()
        contents, _ = await self.aget_content()
        valid_filters: Set[str] = set()
        for content in contents:
            if content.metadata:
                valid_filters.update(content.metadata.keys())

        return valid_filters

    def _validate_filters(
        self, filters: Union[Dict[str, Any], List[FilterExpr]], valid_metadata_filters: Set[str]
    ) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
        if not filters:
            return {}, []

        valid_filters: Union[Dict[str, Any], List[FilterExpr]] = {}
        invalid_keys = []

        if isinstance(filters, dict):
            # If no metadata filters tracked yet, all keys are considered invalid
            if valid_metadata_filters is None or not valid_metadata_filters:
                invalid_keys = list(filters.keys())
                log_warning(
                    f"No valid metadata filters tracked yet. All filter keys considered invalid: {invalid_keys}"
                )
                return {}, invalid_keys

            for key, value in filters.items():
                # Handle both normal keys and prefixed keys like meta_data.key
                base_key = key.split(".")[-1] if "." in key else key
                if base_key in valid_metadata_filters or key in valid_metadata_filters:
                    valid_filters[key] = value  # type: ignore
                else:
                    invalid_keys.append(key)
                    log_warning(f"Invalid filter key: {key} - not present in knowledge base")

        elif isinstance(filters, List):
            # Validate that list contains FilterExpr instances
            for i, filter_item in enumerate(filters):
                if not isinstance(filter_item, FilterExpr):
                    log_warning(
                        f"Invalid filter at index {i}: expected FilterExpr instance, "
                        f"got {type(filter_item).__name__}. "
                        f"Use filter expressions like EQ('key', 'value'), IN('key', [values]), "
                        f"AND(...), OR(...), NOT(...) from agno.filters"
                    )
            # Filter expressions are already validated, return empty dict/list
            # The actual filtering happens in the vector_db layer
            return filters, []

        return valid_filters, invalid_keys

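    # Filter shapes, sketched (the keys and values are assumptions): plain dicts
    # are checked against metadata keys already seen in the contents DB, while
    # FilterExpr lists pass through for the vector DB layer to evaluate.
    #
    #     self._validate_filters({"author": "jane"}, {"author", "year"})
    #     # -> ({"author": "jane"}, [])
    #     self._validate_filters({"missing": 1}, {"author"})
    #     # -> ({}, ["missing"])
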
    def validate_filters(
        self, filters: Union[Dict[str, Any], List[FilterExpr]]
    ) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
        """Return a tuple containing a dict with all valid filters and a list of invalid filter keys"""
        valid_filters_from_db = self.get_valid_filters()

        valid_filters, invalid_keys = self._validate_filters(filters, valid_filters_from_db)

        return valid_filters, invalid_keys

    async def async_validate_filters(
        self, filters: Union[Dict[str, Any], List[FilterExpr]]
    ) -> Tuple[Union[Dict[str, Any], List[FilterExpr]], List[str]]:
        """Return a tuple containing a dict with all valid filters and a list of invalid filter keys"""
        valid_filters_from_db = await self.async_get_valid_filters()

        valid_filters, invalid_keys = self._validate_filters(filters, valid_filters_from_db)

        return valid_filters, invalid_keys

    def remove_vector_by_id(self, id: str) -> bool:
        from agno.vectordb import VectorDb

        self.vector_db = cast(VectorDb, self.vector_db)
        if self.vector_db is None:
            log_warning("No vector DB provided")
            return False
        return self.vector_db.delete_by_id(id)

    def remove_vectors_by_name(self, name: str) -> bool:
        from agno.vectordb import VectorDb

        self.vector_db = cast(VectorDb, self.vector_db)
        if self.vector_db is None:
            log_warning("No vector DB provided")
            return False
        return self.vector_db.delete_by_name(name)

    def remove_vectors_by_metadata(self, metadata: Dict[str, Any]) -> bool:
        from agno.vectordb import VectorDb

        self.vector_db = cast(VectorDb, self.vector_db)
        if self.vector_db is None:
            log_warning("No vector DB provided")
            return False
        return self.vector_db.delete_by_metadata(metadata)

    # --- API Only Methods ---

    def patch_content(self, content: Content) -> Optional[Dict[str, Any]]:
        return self._update_content(content)

    async def apatch_content(self, content: Content) -> Optional[Dict[str, Any]]:
        return await self._aupdate_content(content)

    def get_content_by_id(self, content_id: str) -> Optional[Content]:
        if self.contents_db is None:
            raise ValueError("No contents db provided")

        if isinstance(self.contents_db, AsyncBaseDb):
            raise ValueError(
                "get_content_by_id() is not supported for async databases. Please use aget_content_by_id() instead."
            )

        content_row = self.contents_db.get_knowledge_content(content_id)

        if content_row is None:
            return None
        content = Content(
            id=content_row.id,
            name=content_row.name,
            description=content_row.description,
            metadata=content_row.metadata,
            file_type=content_row.type,
            size=content_row.size,
            status=ContentStatus(content_row.status) if content_row.status else None,
            status_message=content_row.status_message,
            created_at=content_row.created_at,
            updated_at=content_row.updated_at if content_row.updated_at else content_row.created_at,
            external_id=content_row.external_id,
        )
        return content

    async def aget_content_by_id(self, content_id: str) -> Optional[Content]:
        if self.contents_db is None:
            raise ValueError("No contents db provided")

        if isinstance(self.contents_db, AsyncBaseDb):
            content_row = await self.contents_db.get_knowledge_content(content_id)
        else:
            content_row = self.contents_db.get_knowledge_content(content_id)

        if content_row is None:
            return None
        content = Content(
            id=content_row.id,
            name=content_row.name,
            description=content_row.description,
            metadata=content_row.metadata,
            file_type=content_row.type,
            size=content_row.size,
            status=ContentStatus(content_row.status) if content_row.status else None,
            status_message=content_row.status_message,
            created_at=content_row.created_at,
            updated_at=content_row.updated_at if content_row.updated_at else content_row.created_at,
            external_id=content_row.external_id,
        )
        return content

    def get_content(
        self,
        limit: Optional[int] = None,
        page: Optional[int] = None,
        sort_by: Optional[str] = None,
        sort_order: Optional[str] = None,
    ) -> Tuple[List[Content], int]:
        if self.contents_db is None:
            raise ValueError("No contents db provided")

        if isinstance(self.contents_db, AsyncBaseDb):
            raise ValueError("get_content() is not supported for async databases. Please use aget_content() instead.")

        contents, count = self.contents_db.get_knowledge_contents(
            limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
        )

        result = []
        for content_row in contents:
            # Create Content from database row
            content = Content(
                id=content_row.id,
                name=content_row.name,
                description=content_row.description,
                metadata=content_row.metadata,
                size=content_row.size,
                file_type=content_row.type,
                status=ContentStatus(content_row.status) if content_row.status else None,
                status_message=content_row.status_message,
                created_at=content_row.created_at,
                updated_at=content_row.updated_at if content_row.updated_at else content_row.created_at,
                external_id=content_row.external_id,
            )
            result.append(content)
        return result, count

    async def aget_content(
        self,
        limit: Optional[int] = None,
        page: Optional[int] = None,
        sort_by: Optional[str] = None,
        sort_order: Optional[str] = None,
    ) -> Tuple[List[Content], int]:
        if self.contents_db is None:
            raise ValueError("No contents db provided")

        if isinstance(self.contents_db, AsyncBaseDb):
            contents, count = await self.contents_db.get_knowledge_contents(
                limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
            )
        else:
            contents, count = self.contents_db.get_knowledge_contents(
                limit=limit, page=page, sort_by=sort_by, sort_order=sort_order
            )

        result = []
        for content_row in contents:
            # Create Content from database row
            content = Content(
                id=content_row.id,
                name=content_row.name,
                description=content_row.description,
                metadata=content_row.metadata,
                size=content_row.size,
                file_type=content_row.type,
                status=ContentStatus(content_row.status) if content_row.status else None,
                status_message=content_row.status_message,
                created_at=content_row.created_at,
                updated_at=content_row.updated_at if content_row.updated_at else content_row.created_at,
                external_id=content_row.external_id,
            )
            result.append(content)
        return result, count

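    # Pagination sketch (illustrative; whether `page` is 1-based is an
    # assumption of the configured contents_db implementation):
    #
    #     contents, total = knowledge.get_content(
    #         limit=20, page=1, sort_by="created_at", sort_order="desc"
    #     )
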
    def get_content_status(self, content_id: str) -> Tuple[Optional[ContentStatus], Optional[str]]:
        if self.contents_db is None:
            raise ValueError("No contents db provided")

        if isinstance(self.contents_db, AsyncBaseDb):
            raise ValueError(
                "get_content_status() is not supported for async databases. Please use aget_content_status() instead."
            )

        content_row = self.contents_db.get_knowledge_content(content_id)
        if content_row is None:
            return None, "Content not found"

        # Convert string status to enum, defaulting to PROCESSING if unknown
        status_str = content_row.status
        try:
            status = ContentStatus(status_str.lower()) if status_str else ContentStatus.PROCESSING
        except ValueError:
            # Handle legacy or unknown statuses
            if status_str and "failed" in status_str.lower():
                status = ContentStatus.FAILED
            elif status_str and "completed" in status_str.lower():
                status = ContentStatus.COMPLETED
            else:
                status = ContentStatus.PROCESSING

        return status, content_row.status_message

    async def aget_content_status(self, content_id: str) -> Tuple[Optional[ContentStatus], Optional[str]]:
        if self.contents_db is None:
            raise ValueError("No contents db provided")

        if isinstance(self.contents_db, AsyncBaseDb):
            content_row = await self.contents_db.get_knowledge_content(content_id)
        else:
            content_row = self.contents_db.get_knowledge_content(content_id)

        if content_row is None:
            return None, "Content not found"

        # Convert string status to enum, defaulting to PROCESSING if unknown
        status_str = content_row.status
        try:
            status = ContentStatus(status_str.lower()) if status_str else ContentStatus.PROCESSING
        except ValueError:
            # Handle legacy or unknown statuses
            if status_str and "failed" in status_str.lower():
                status = ContentStatus.FAILED
            elif status_str and "completed" in status_str.lower():
                status = ContentStatus.COMPLETED
            else:
                status = ContentStatus.PROCESSING

        return status, content_row.status_message

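    # Status coercion, descriptive only: assuming ContentStatus values are
    # lowercase strings, "COMPLETED" maps cleanly via .lower(), a legacy string
    # like "failed_upload" falls into the "failed" substring branch, and
    # anything unrecognized is reported as PROCESSING rather than raising.
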
    def remove_content_by_id(self, content_id: str):
        from agno.vectordb import VectorDb

        self.vector_db = cast(VectorDb, self.vector_db)
        if self.vector_db is not None:
            if self.vector_db.__class__.__name__ == "LightRag":
                # For LightRAG, get the content first to find the external_id
                content = self.get_content_by_id(content_id)
                if content and content.external_id:
                    self.vector_db.delete_by_external_id(content.external_id)  # type: ignore
                else:
                    log_warning(f"No external_id found for content {content_id}, cannot delete from LightRAG")
            else:
                self.vector_db.delete_by_content_id(content_id)

        if self.contents_db is not None:
            self.contents_db.delete_knowledge_content(content_id)

    async def aremove_content_by_id(self, content_id: str):
        if self.vector_db is not None:
            if self.vector_db.__class__.__name__ == "LightRag":
                # For LightRAG, get the content first to find the external_id
                content = await self.aget_content_by_id(content_id)
                if content and content.external_id:
                    self.vector_db.delete_by_external_id(content.external_id)  # type: ignore
                else:
                    log_warning(f"No external_id found for content {content_id}, cannot delete from LightRAG")
            else:
                self.vector_db.delete_by_content_id(content_id)

        if self.contents_db is not None:
            if isinstance(self.contents_db, AsyncBaseDb):
                await self.contents_db.delete_knowledge_content(content_id)
            else:
                self.contents_db.delete_knowledge_content(content_id)

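    # Deletion-order note, descriptive only: vectors are removed before the
    # contents-DB row, so an interruption can leave a row without vectors but
    # not orphaned vectors without a row.
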
    def remove_all_content(self):
        contents, _ = self.get_content()
        for content in contents:
            if content.id is not None:
                self.remove_content_by_id(content.id)

    async def aremove_all_content(self):
        contents, _ = await self.aget_content()
        for content in contents:
            if content.id is not None:
                await self.aremove_content_by_id(content.id)

    # --- Reader Factory Integration ---

    def construct_readers(self):
        """Initialize readers dictionary for lazy loading."""
        # Initialize empty readers dict - readers will be created on-demand
        if self.readers is None:
            self.readers = {}

    def add_reader(self, reader: Reader):
        """Add a custom reader to the knowledge base."""
        if self.readers is None:
            self.readers = {}

        # Generate a key for the reader
        reader_key = self._generate_reader_key(reader)
        self.readers[reader_key] = reader
        return reader

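    # Registration sketch (MyReader is a hypothetical Reader subclass, not part
    # of this module):
    #
    #     reader = knowledge.add_reader(MyReader(name="My Reader"))
    #
    # The key is derived from the reader name ("my_reader" here), so a later
    # add_reader call with the same name replaces the earlier instance.
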
    def get_readers(self) -> Dict[str, Reader]:
        """Get all currently loaded readers (only returns readers that have been used)."""
        if self.readers is None:
            self.readers = {}
        elif not isinstance(self.readers, dict):
            # Defensive check: if readers is not a dict (e.g., was set to a list), convert it
            if isinstance(self.readers, list):
                readers_dict: Dict[str, Reader] = {}
                for reader in self.readers:
                    if isinstance(reader, Reader):
                        reader_key = self._generate_reader_key(reader)
                        # Handle potential duplicate keys by appending index if needed
                        original_key = reader_key
                        counter = 1
                        while reader_key in readers_dict:
                            reader_key = f"{original_key}_{counter}"
                            counter += 1
                        readers_dict[reader_key] = reader
                self.readers = readers_dict
            else:
                # For any other unexpected type, reset to empty dict
                self.readers = {}

        return self.readers

    def _generate_reader_key(self, reader: Reader) -> str:
        """Generate a key for a reader instance."""
        if reader.name:
            return f"{reader.name.lower().replace(' ', '_')}"
        else:
            return f"{reader.__class__.__name__.lower().replace(' ', '_')}"

    def _select_reader(self, extension: str) -> Reader:
        """Select the appropriate reader for a file extension."""
        log_info(f"Selecting reader for extension: {extension}")
        return ReaderFactory.get_reader_for_extension(extension)

    # --- Convenience Properties for Backward Compatibility ---

    def _is_text_mime_type(self, mime_type: str) -> bool:
        """
        Check if a MIME type represents text content that can be safely encoded as UTF-8.

        Args:
            mime_type: The MIME type to check

        Returns:
            bool: True if it's a text type, False if binary
        """
        if not mime_type:
            return False

        text_types = [
            "text/",
            "application/json",
            "application/xml",
            "application/javascript",
            "application/csv",
            "application/sql",
        ]

        return any(mime_type.startswith(t) for t in text_types)

    def _should_include_file(self, file_path: str, include: Optional[List[str]], exclude: Optional[List[str]]) -> bool:
        """
        Determine if a file should be included based on include/exclude patterns.

        Logic:
        1. If include is specified, file must match at least one include pattern
        2. If exclude is specified, file must not match any exclude pattern
        3. If neither specified, include all files

        Args:
            file_path: Path to the file to check
            include: Optional list of include patterns (glob-style)
            exclude: Optional list of exclude patterns (glob-style)

        Returns:
            bool: True if file should be included, False otherwise
        """
        import fnmatch

        # If include patterns specified, file must match at least one
        if include:
            if not any(fnmatch.fnmatch(file_path, pattern) for pattern in include):
                return False

        # If exclude patterns specified, file must not match any
        if exclude:
            if any(fnmatch.fnmatch(file_path, pattern) for pattern in exclude):
                return False

        return True

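    # Pattern behavior, sketched with fnmatch semantics (paths are assumptions):
    #
    #     self._should_include_file("docs/guide.md", include=["*.md"], exclude=None)         # True
    #     self._should_include_file("docs/guide.md", include=["*.md"], exclude=["docs/*"])   # False
    #
    # fnmatch matches the whole string, so "*.md" also matches paths that
    # contain separators; when include and exclude both match, exclude wins.
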
    def _get_reader(self, reader_type: str) -> Optional[Reader]:
        """Get a cached reader or create it if not cached, handling missing dependencies gracefully."""
        if self.readers is None:
            self.readers = {}

        if reader_type not in self.readers:
            try:
                reader = ReaderFactory.create_reader(reader_type)
                if reader:
                    self.readers[reader_type] = reader
                else:
                    return None

            except Exception as e:
                log_warning(f"Cannot create {reader_type} reader: {e}")
                return None

        return self.readers.get(reader_type)

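    # Lazy-loading sketch: the first access to a reader property below builds
    # and caches the reader; later accesses reuse it. A missing optional
    # dependency surfaces as a warning plus None instead of an ImportError, so
    # callers should None-check (the variable name is an assumption):
    #
    #     pdf = knowledge.pdf_reader
    #     if pdf is None:
    #         ...  # e.g. the PDF extra is not installed
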
    @property
    def pdf_reader(self) -> Optional[Reader]:
        """PDF reader - lazy loaded via factory."""
        return self._get_reader("pdf")

    @property
    def csv_reader(self) -> Optional[Reader]:
        """CSV reader - lazy loaded via factory."""
        return self._get_reader("csv")

    @property
    def docx_reader(self) -> Optional[Reader]:
        """Docx reader - lazy loaded via factory."""
        return self._get_reader("docx")

    @property
    def pptx_reader(self) -> Optional[Reader]:
        """PPTX reader - lazy loaded via factory."""
        return self._get_reader("pptx")

    @property
    def json_reader(self) -> Optional[Reader]:
        """JSON reader - lazy loaded via factory."""
        return self._get_reader("json")

    @property
    def markdown_reader(self) -> Optional[Reader]:
        """Markdown reader - lazy loaded via factory."""
        return self._get_reader("markdown")

    @property
    def text_reader(self) -> Optional[Reader]:
        """Text reader - lazy loaded via factory."""
        return self._get_reader("text")

    @property
    def website_reader(self) -> Optional[Reader]:
        """Website reader - lazy loaded via factory."""
        return self._get_reader("website")

    @property
    def firecrawl_reader(self) -> Optional[Reader]:
        """Firecrawl reader - lazy loaded via factory."""
        return self._get_reader("firecrawl")

    @property
    def youtube_reader(self) -> Optional[Reader]:
        """YouTube reader - lazy loaded via factory."""
        return self._get_reader("youtube")