agno 2.3.15__py3-none-any.whl → 2.3.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/__init__.py +2 -0
- agno/agent/agent.py +4 -53
- agno/agent/remote.py +351 -0
- agno/client/__init__.py +3 -0
- agno/client/os.py +2669 -0
- agno/db/base.py +20 -0
- agno/db/mongo/async_mongo.py +11 -0
- agno/db/mongo/mongo.py +10 -0
- agno/db/mysql/async_mysql.py +9 -0
- agno/db/mysql/mysql.py +9 -0
- agno/db/postgres/async_postgres.py +9 -0
- agno/db/postgres/postgres.py +9 -0
- agno/db/postgres/utils.py +3 -2
- agno/db/sqlite/async_sqlite.py +9 -0
- agno/db/sqlite/sqlite.py +11 -1
- agno/exceptions.py +23 -0
- agno/knowledge/chunking/semantic.py +123 -46
- agno/knowledge/reader/csv_reader.py +9 -14
- agno/knowledge/reader/docx_reader.py +2 -4
- agno/knowledge/reader/field_labeled_csv_reader.py +11 -17
- agno/knowledge/reader/json_reader.py +9 -17
- agno/knowledge/reader/markdown_reader.py +6 -6
- agno/knowledge/reader/pdf_reader.py +14 -11
- agno/knowledge/reader/pptx_reader.py +2 -4
- agno/knowledge/reader/s3_reader.py +2 -11
- agno/knowledge/reader/text_reader.py +6 -18
- agno/knowledge/reader/web_search_reader.py +4 -15
- agno/os/app.py +104 -23
- agno/os/auth.py +25 -1
- agno/os/interfaces/a2a/a2a.py +7 -6
- agno/os/interfaces/a2a/router.py +13 -13
- agno/os/interfaces/agui/agui.py +5 -3
- agno/os/interfaces/agui/router.py +23 -16
- agno/os/interfaces/base.py +7 -7
- agno/os/interfaces/slack/router.py +6 -6
- agno/os/interfaces/slack/slack.py +7 -7
- agno/os/interfaces/whatsapp/router.py +29 -6
- agno/os/interfaces/whatsapp/whatsapp.py +11 -8
- agno/os/managers.py +326 -0
- agno/os/mcp.py +651 -79
- agno/os/router.py +125 -18
- agno/os/routers/agents/router.py +65 -22
- agno/os/routers/agents/schema.py +16 -4
- agno/os/routers/database.py +5 -0
- agno/os/routers/evals/evals.py +93 -11
- agno/os/routers/evals/utils.py +6 -6
- agno/os/routers/knowledge/knowledge.py +104 -16
- agno/os/routers/memory/memory.py +124 -7
- agno/os/routers/metrics/metrics.py +21 -4
- agno/os/routers/session/session.py +141 -12
- agno/os/routers/teams/router.py +40 -14
- agno/os/routers/teams/schema.py +12 -4
- agno/os/routers/traces/traces.py +54 -4
- agno/os/routers/workflows/router.py +223 -117
- agno/os/routers/workflows/schema.py +65 -1
- agno/os/schema.py +38 -12
- agno/os/utils.py +87 -166
- agno/remote/__init__.py +3 -0
- agno/remote/base.py +484 -0
- agno/run/workflow.py +1 -0
- agno/team/__init__.py +2 -0
- agno/team/remote.py +287 -0
- agno/team/team.py +25 -54
- agno/tracing/exporter.py +10 -6
- agno/tracing/setup.py +2 -1
- agno/utils/agent.py +58 -1
- agno/utils/http.py +68 -20
- agno/utils/os.py +0 -0
- agno/utils/remote.py +23 -0
- agno/vectordb/chroma/chromadb.py +452 -16
- agno/vectordb/pgvector/pgvector.py +7 -0
- agno/vectordb/redis/redisdb.py +1 -1
- agno/workflow/__init__.py +2 -0
- agno/workflow/agent.py +2 -2
- agno/workflow/remote.py +222 -0
- agno/workflow/types.py +0 -73
- agno/workflow/workflow.py +119 -68
- {agno-2.3.15.dist-info → agno-2.3.17.dist-info}/METADATA +1 -1
- {agno-2.3.15.dist-info → agno-2.3.17.dist-info}/RECORD +82 -72
- {agno-2.3.15.dist-info → agno-2.3.17.dist-info}/WHEEL +0 -0
- {agno-2.3.15.dist-info → agno-2.3.17.dist-info}/licenses/LICENSE +0 -0
- {agno-2.3.15.dist-info → agno-2.3.17.dist-info}/top_level.txt +0 -0
agno/db/base.py
CHANGED
|
@@ -58,6 +58,14 @@ class BaseDb(ABC):
|
|
|
58
58
|
"""Create all tables for this database."""
|
|
59
59
|
pass
|
|
60
60
|
|
|
61
|
+
def close(self) -> None:
|
|
62
|
+
"""Close database connections and release resources.
|
|
63
|
+
|
|
64
|
+
Override in subclasses to properly dispose of connection pools.
|
|
65
|
+
Should be called during application shutdown.
|
|
66
|
+
"""
|
|
67
|
+
pass
|
|
68
|
+
|
|
61
69
|
# --- Schema Version ---
|
|
62
70
|
@abstractmethod
|
|
63
71
|
def get_latest_schema_version(self, table_name: str):
|
|
@@ -517,6 +525,18 @@ class AsyncBaseDb(ABC):
|
|
|
517
525
|
self.culture_table_name = culture_table or "agno_culture"
|
|
518
526
|
self.versions_table_name = versions_table or "agno_schema_versions"
|
|
519
527
|
|
|
528
|
+
async def _create_all_tables(self) -> None:
|
|
529
|
+
"""Create all tables for this database. Override in subclasses."""
|
|
530
|
+
pass
|
|
531
|
+
|
|
532
|
+
async def close(self) -> None:
|
|
533
|
+
"""Close database connections and release resources.
|
|
534
|
+
|
|
535
|
+
Override in subclasses to properly dispose of connection pools.
|
|
536
|
+
Should be called during application shutdown.
|
|
537
|
+
"""
|
|
538
|
+
pass
|
|
539
|
+
|
|
520
540
|
@abstractmethod
|
|
521
541
|
async def table_exists(self, table_name: str) -> bool:
|
|
522
542
|
"""Check if a table with the given name exists in this database.
|
agno/db/mongo/async_mongo.py
CHANGED
|
@@ -249,6 +249,17 @@ class AsyncMongoDb(AsyncBaseDb):
|
|
|
249
249
|
if collection_name and not await self.table_exists(collection_name):
|
|
250
250
|
await self._get_collection(collection_type, create_collection_if_not_found=True)
|
|
251
251
|
|
|
252
|
+
async def close(self) -> None:
|
|
253
|
+
"""Close the MongoDB client connection.
|
|
254
|
+
|
|
255
|
+
Should be called during application shutdown to properly release
|
|
256
|
+
all database connections.
|
|
257
|
+
"""
|
|
258
|
+
if self._client is not None:
|
|
259
|
+
self._client.close()
|
|
260
|
+
self._client = None
|
|
261
|
+
self._database = None
|
|
262
|
+
|
|
252
263
|
def _ensure_client(self) -> AsyncMongoClientType:
|
|
253
264
|
"""
|
|
254
265
|
Ensure the MongoDB async client is valid for the current event loop.
|
agno/db/mongo/mongo.py
CHANGED
|
@@ -98,10 +98,20 @@ class MongoDb(BaseDb):
|
|
|
98
98
|
|
|
99
99
|
self.db_url: Optional[str] = db_url
|
|
100
100
|
self.db_client: MongoClient = _client
|
|
101
|
+
|
|
101
102
|
self.db_name: str = db_name if db_name is not None else "agno"
|
|
102
103
|
|
|
103
104
|
self._database: Optional[Database] = None
|
|
104
105
|
|
|
106
|
+
def close(self) -> None:
|
|
107
|
+
"""Close the MongoDB client connection.
|
|
108
|
+
|
|
109
|
+
Should be called during application shutdown to properly release
|
|
110
|
+
all database connections.
|
|
111
|
+
"""
|
|
112
|
+
if self.db_client is not None:
|
|
113
|
+
self.db_client.close()
|
|
114
|
+
|
|
105
115
|
@property
|
|
106
116
|
def database(self) -> Database:
|
|
107
117
|
if self._database is None:
|
agno/db/mysql/async_mysql.py
CHANGED
|
@@ -123,6 +123,15 @@ class AsyncMySQLDb(AsyncBaseDb):
|
|
|
123
123
|
expire_on_commit=False,
|
|
124
124
|
)
|
|
125
125
|
|
|
126
|
+
async def close(self) -> None:
|
|
127
|
+
"""Close database connections and dispose of the connection pool.
|
|
128
|
+
|
|
129
|
+
Should be called during application shutdown to properly release
|
|
130
|
+
all database connections.
|
|
131
|
+
"""
|
|
132
|
+
if self.db_engine is not None:
|
|
133
|
+
await self.db_engine.dispose()
|
|
134
|
+
|
|
126
135
|
# -- DB methods --
|
|
127
136
|
async def table_exists(self, table_name: str) -> bool:
|
|
128
137
|
"""Check if a table with the given name exists in the MySQL database.
|
agno/db/mysql/mysql.py
CHANGED
|
@@ -121,6 +121,15 @@ class MySQLDb(BaseDb):
|
|
|
121
121
|
# Initialize database session
|
|
122
122
|
self.Session: scoped_session = scoped_session(sessionmaker(bind=self.db_engine))
|
|
123
123
|
|
|
124
|
+
def close(self) -> None:
|
|
125
|
+
"""Close database connections and dispose of the connection pool.
|
|
126
|
+
|
|
127
|
+
Should be called during application shutdown to properly release
|
|
128
|
+
all database connections.
|
|
129
|
+
"""
|
|
130
|
+
if self.db_engine is not None:
|
|
131
|
+
self.db_engine.dispose()
|
|
132
|
+
|
|
124
133
|
# -- DB methods --
|
|
125
134
|
def table_exists(self, table_name: str) -> bool:
|
|
126
135
|
"""Check if a table with the given name exists in the MySQL database.
|
agno/db/postgres/async_postgres.py
CHANGED
|
@@ -128,6 +128,15 @@ class AsyncPostgresDb(AsyncBaseDb):
|
|
|
128
128
|
expire_on_commit=False,
|
|
129
129
|
)
|
|
130
130
|
|
|
131
|
+
async def close(self) -> None:
|
|
132
|
+
"""Close database connections and dispose of the connection pool.
|
|
133
|
+
|
|
134
|
+
Should be called during application shutdown to properly release
|
|
135
|
+
all database connections.
|
|
136
|
+
"""
|
|
137
|
+
if self.db_engine is not None:
|
|
138
|
+
await self.db_engine.dispose()
|
|
139
|
+
|
|
131
140
|
# -- DB methods --
|
|
132
141
|
async def table_exists(self, table_name: str) -> bool:
|
|
133
142
|
"""Check if a table with the given name exists in the Postgres database.
|
agno/db/postgres/postgres.py
CHANGED
|
@@ -124,6 +124,15 @@ class PostgresDb(BaseDb):
|
|
|
124
124
|
# Initialize database session
|
|
125
125
|
self.Session: scoped_session = scoped_session(sessionmaker(bind=self.db_engine, expire_on_commit=False))
|
|
126
126
|
|
|
127
|
+
def close(self) -> None:
|
|
128
|
+
"""Close database connections and dispose of the connection pool.
|
|
129
|
+
|
|
130
|
+
Should be called during application shutdown to properly release
|
|
131
|
+
all database connections.
|
|
132
|
+
"""
|
|
133
|
+
if self.db_engine is not None:
|
|
134
|
+
self.db_engine.dispose()
|
|
135
|
+
|
|
127
136
|
# -- DB methods --
|
|
128
137
|
def table_exists(self, table_name: str) -> bool:
|
|
129
138
|
"""Check if a table with the given name exists in the Postgres database.
|
agno/db/postgres/utils.py
CHANGED
|
@@ -298,8 +298,9 @@ def calculate_date_metrics(date_to_process: date, sessions_data: dict) -> dict:
|
|
|
298
298
|
for session in sessions:
|
|
299
299
|
if session.get("user_id"):
|
|
300
300
|
all_user_ids.add(session["user_id"])
|
|
301
|
-
|
|
302
|
-
|
|
301
|
+
runs = session.get("runs", []) or []
|
|
302
|
+
metrics[runs_count_key] += len(runs)
|
|
303
|
+
if runs:
|
|
303
304
|
for run in runs:
|
|
304
305
|
if model_id := run.get("model"):
|
|
305
306
|
model_provider = run.get("model_provider", "")
|
agno/db/sqlite/async_sqlite.py
CHANGED
|
@@ -124,6 +124,15 @@ class AsyncSqliteDb(AsyncBaseDb):
|
|
|
124
124
|
# Initialize database session factory
|
|
125
125
|
self.async_session_factory = async_sessionmaker(bind=self.db_engine, expire_on_commit=False)
|
|
126
126
|
|
|
127
|
+
async def close(self) -> None:
|
|
128
|
+
"""Close database connections and dispose of the connection pool.
|
|
129
|
+
|
|
130
|
+
Should be called during application shutdown to properly release
|
|
131
|
+
all database connections.
|
|
132
|
+
"""
|
|
133
|
+
if self.db_engine is not None:
|
|
134
|
+
await self.db_engine.dispose()
|
|
135
|
+
|
|
127
136
|
# -- DB methods --
|
|
128
137
|
async def table_exists(self, table_name: str) -> bool:
|
|
129
138
|
"""Check if a table with the given name exists in the SQLite database.
|
agno/db/sqlite/sqlite.py
CHANGED
|
@@ -125,6 +125,15 @@ class SqliteDb(BaseDb):
|
|
|
125
125
|
# Initialize database session
|
|
126
126
|
self.Session: scoped_session = scoped_session(sessionmaker(bind=self.db_engine))
|
|
127
127
|
|
|
128
|
+
def close(self) -> None:
|
|
129
|
+
"""Close database connections and dispose of the connection pool.
|
|
130
|
+
|
|
131
|
+
Should be called during application shutdown to properly release
|
|
132
|
+
all database connections.
|
|
133
|
+
"""
|
|
134
|
+
if self.db_engine is not None:
|
|
135
|
+
self.db_engine.dispose()
|
|
136
|
+
|
|
128
137
|
# -- DB methods --
|
|
129
138
|
def table_exists(self, table_name: str) -> bool:
|
|
130
139
|
"""Check if a table with the given name exists in the SQLite database.
|
|
@@ -1110,7 +1119,8 @@ class SqliteDb(BaseDb):
|
|
|
1110
1119
|
# Select topics from all results
|
|
1111
1120
|
stmt = select(table.c.topics)
|
|
1112
1121
|
result = sess.execute(stmt).fetchall()
|
|
1113
|
-
|
|
1122
|
+
result = result[0][0]
|
|
1123
|
+
return list(set(result))
|
|
1114
1124
|
|
|
1115
1125
|
except Exception as e:
|
|
1116
1126
|
log_debug(f"Exception reading from memory table: {e}")
|
agno/exceptions.py
CHANGED
|
@@ -178,3 +178,26 @@ class RetryableModelProviderError(Exception):
|
|
|
178
178
|
original_error: Optional[str] = None
|
|
179
179
|
# Guidance message to retry a model invocation after an error
|
|
180
180
|
retry_guidance_message: Optional[str] = None
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
class RemoteServerUnavailableError(AgnoError):
|
|
184
|
+
"""Exception raised when a remote server is unavailable.
|
|
185
|
+
|
|
186
|
+
This can happen due to:
|
|
187
|
+
- Connection refused (server not running)
|
|
188
|
+
- Connection timeout
|
|
189
|
+
- Network errors
|
|
190
|
+
- DNS resolution failures
|
|
191
|
+
"""
|
|
192
|
+
|
|
193
|
+
def __init__(
|
|
194
|
+
self,
|
|
195
|
+
message: str,
|
|
196
|
+
base_url: Optional[str] = None,
|
|
197
|
+
original_error: Optional[Exception] = None,
|
|
198
|
+
):
|
|
199
|
+
super().__init__(message, status_code=503)
|
|
200
|
+
self.base_url = base_url
|
|
201
|
+
self.original_error = original_error
|
|
202
|
+
self.type = "remote_server_unavailable_error"
|
|
203
|
+
self.error_id = "remote_server_unavailable_error"
|
agno/knowledge/chunking/semantic.py
CHANGED
|
@@ -1,5 +1,18 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
1
|
+
from typing import Any, Dict, List, Literal, Optional, Union
|
|
2
|
+
|
|
3
|
+
try:
|
|
4
|
+
import numpy as np
|
|
5
|
+
except ImportError:
|
|
6
|
+
raise ImportError("`numpy` not installed. Please install using `pip install numpy`")
|
|
7
|
+
|
|
8
|
+
try:
|
|
9
|
+
from chonkie import SemanticChunker
|
|
10
|
+
from chonkie.embeddings.base import BaseEmbeddings
|
|
11
|
+
except ImportError:
|
|
12
|
+
raise ImportError(
|
|
13
|
+
"`chonkie` is required for semantic chunking. "
|
|
14
|
+
"Please install it using `pip install chonkie` to use SemanticChunking."
|
|
15
|
+
)
|
|
3
16
|
|
|
4
17
|
from agno.knowledge.chunking.strategy import ChunkingStrategy
|
|
5
18
|
from agno.knowledge.document.base import Document
|
|
@@ -7,10 +20,69 @@ from agno.knowledge.embedder.base import Embedder
|
|
|
7
20
|
from agno.utils.log import log_info
|
|
8
21
|
|
|
9
22
|
|
|
10
|
-
|
|
11
|
-
"""
|
|
23
|
+
def _get_chonkie_embedder_wrapper(embedder: Embedder):
|
|
24
|
+
"""Create a wrapper that adapts Agno Embedder to chonkie's BaseEmbeddings interface."""
|
|
25
|
+
|
|
26
|
+
class _ChonkieEmbedderWrapper(BaseEmbeddings):
|
|
27
|
+
"""Wrapper to make Agno Embedders compatible with chonkie."""
|
|
28
|
+
|
|
29
|
+
def __init__(self, agno_embedder: Embedder):
|
|
30
|
+
super().__init__()
|
|
31
|
+
self._embedder = agno_embedder
|
|
32
|
+
|
|
33
|
+
def embed(self, text: str):
|
|
34
|
+
embedding = self._embedder.get_embedding(text) # type: ignore[attr-defined]
|
|
35
|
+
return np.array(embedding, dtype=np.float32)
|
|
36
|
+
|
|
37
|
+
def get_tokenizer(self):
|
|
38
|
+
"""Return a simple token counter function."""
|
|
39
|
+
return lambda text: len(text.split())
|
|
12
40
|
|
|
13
|
-
|
|
41
|
+
@property
|
|
42
|
+
def dimension(self) -> int:
|
|
43
|
+
return getattr(self._embedder, "dimensions")
|
|
44
|
+
|
|
45
|
+
return _ChonkieEmbedderWrapper(embedder)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class SemanticChunking(ChunkingStrategy):
|
|
49
|
+
"""Chunking strategy that splits text into semantic chunks using chonkie.
|
|
50
|
+
|
|
51
|
+
Args:
|
|
52
|
+
embedder: The embedder to use for generating embeddings. Can be:
|
|
53
|
+
- A string model identifier (e.g., "minishlab/potion-base-32M") for chonkie's built-in models
|
|
54
|
+
- A chonkie BaseEmbeddings instance (used directly)
|
|
55
|
+
- An Agno Embedder (wrapped for chonkie compatibility)
|
|
56
|
+
chunk_size: Maximum tokens allowed per chunk.
|
|
57
|
+
similarity_threshold: Threshold for semantic similarity (0-1).
|
|
58
|
+
similarity_window: Number of sentences to consider for similarity calculation.
|
|
59
|
+
min_sentences_per_chunk: Minimum number of sentences per chunk.
|
|
60
|
+
min_characters_per_sentence: Minimum number of characters per sentence.
|
|
61
|
+
delimiters: Delimiters to use for sentence splitting.
|
|
62
|
+
include_delimiters: Whether to include delimiter in prev/next sentence or None.
|
|
63
|
+
skip_window: Number of groups to skip when merging (0=disabled).
|
|
64
|
+
filter_window: Window length for the Savitzky-Golay filter.
|
|
65
|
+
filter_polyorder: Polynomial order for the Savitzky-Golay filter.
|
|
66
|
+
filter_tolerance: Tolerance for the Savitzky-Golay filter.
|
|
67
|
+
chunker_params: Additional parameters to pass to chonkie's SemanticChunker.
|
|
68
|
+
"""
|
|
69
|
+
|
|
70
|
+
def __init__(
|
|
71
|
+
self,
|
|
72
|
+
embedder: Optional[Union[str, Embedder, BaseEmbeddings]] = None,
|
|
73
|
+
chunk_size: int = 5000,
|
|
74
|
+
similarity_threshold: float = 0.5,
|
|
75
|
+
similarity_window: int = 3,
|
|
76
|
+
min_sentences_per_chunk: int = 1,
|
|
77
|
+
min_characters_per_sentence: int = 24,
|
|
78
|
+
delimiters: Optional[List[str]] = None,
|
|
79
|
+
include_delimiters: Literal["prev", "next", None] = "prev",
|
|
80
|
+
skip_window: int = 0,
|
|
81
|
+
filter_window: int = 5,
|
|
82
|
+
filter_polyorder: int = 3,
|
|
83
|
+
filter_tolerance: float = 0.2,
|
|
84
|
+
chunker_params: Optional[Dict[str, Any]] = None,
|
|
85
|
+
):
|
|
14
86
|
if embedder is None:
|
|
15
87
|
from agno.knowledge.embedder.openai import OpenAIEmbedder
|
|
16
88
|
|
|
@@ -19,50 +91,55 @@ class SemanticChunking(ChunkingStrategy):
|
|
|
19
91
|
self.embedder = embedder
|
|
20
92
|
self.chunk_size = chunk_size
|
|
21
93
|
self.similarity_threshold = similarity_threshold
|
|
22
|
-
self.
|
|
94
|
+
self.similarity_window = similarity_window
|
|
95
|
+
self.min_sentences_per_chunk = min_sentences_per_chunk
|
|
96
|
+
self.min_characters_per_sentence = min_characters_per_sentence
|
|
97
|
+
self.delimiters = delimiters if delimiters is not None else [". ", "! ", "? ", "\n"]
|
|
98
|
+
self.include_delimiters = include_delimiters
|
|
99
|
+
self.skip_window = skip_window
|
|
100
|
+
self.filter_window = filter_window
|
|
101
|
+
self.filter_polyorder = filter_polyorder
|
|
102
|
+
self.filter_tolerance = filter_tolerance
|
|
103
|
+
self.chunker_params = chunker_params
|
|
104
|
+
self.chunker: Optional[SemanticChunker] = None
|
|
23
105
|
|
|
24
106
|
def _initialize_chunker(self):
|
|
25
107
|
"""Lazily initialize the chunker with chonkie dependency."""
|
|
26
|
-
if self.chunker is None:
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
self.chunker = SemanticChunker( # type: ignore
|
|
62
|
-
embedding_model=getattr(self.embedder, "id", None) or "text-embedding-3-small",
|
|
63
|
-
chunk_size=self.chunk_size,
|
|
64
|
-
threshold=self.similarity_threshold,
|
|
65
|
-
)
|
|
108
|
+
if self.chunker is not None:
|
|
109
|
+
return
|
|
110
|
+
|
|
111
|
+
# Determine embedding model based on type:
|
|
112
|
+
# - str: pass directly to chonkie (uses chonkie's built-in models)
|
|
113
|
+
# - BaseEmbeddings: pass directly to chonkie
|
|
114
|
+
# - Agno Embedder: wrap for chonkie compatibility
|
|
115
|
+
embedding_model: Union[str, BaseEmbeddings]
|
|
116
|
+
if isinstance(self.embedder, str):
|
|
117
|
+
embedding_model = self.embedder
|
|
118
|
+
elif isinstance(self.embedder, BaseEmbeddings):
|
|
119
|
+
embedding_model = self.embedder
|
|
120
|
+
elif isinstance(self.embedder, Embedder):
|
|
121
|
+
embedding_model = _get_chonkie_embedder_wrapper(self.embedder)
|
|
122
|
+
else:
|
|
123
|
+
raise ValueError("Invalid embedder type. Must be a string, BaseEmbeddings, or Embedder instance.")
|
|
124
|
+
|
|
125
|
+
_chunker_params: Dict[str, Any] = {
|
|
126
|
+
"embedding_model": embedding_model,
|
|
127
|
+
"chunk_size": self.chunk_size,
|
|
128
|
+
"threshold": self.similarity_threshold,
|
|
129
|
+
"similarity_window": self.similarity_window,
|
|
130
|
+
"min_sentences_per_chunk": self.min_sentences_per_chunk,
|
|
131
|
+
"min_characters_per_sentence": self.min_characters_per_sentence,
|
|
132
|
+
"delim": self.delimiters,
|
|
133
|
+
"include_delim": self.include_delimiters,
|
|
134
|
+
"skip_window": self.skip_window,
|
|
135
|
+
"filter_window": self.filter_window,
|
|
136
|
+
"filter_polyorder": self.filter_polyorder,
|
|
137
|
+
"filter_tolerance": self.filter_tolerance,
|
|
138
|
+
}
|
|
139
|
+
if self.chunker_params:
|
|
140
|
+
_chunker_params.update(self.chunker_params)
|
|
141
|
+
|
|
142
|
+
self.chunker = SemanticChunker(**_chunker_params)
|
|
66
143
|
|
|
67
144
|
def chunk(self, document: Document) -> List[Document]:
|
|
68
145
|
"""Split document into semantic chunks using chonkie"""
|
agno/knowledge/reader/csv_reader.py
CHANGED
|
@@ -47,17 +47,16 @@ class CSVReader(Reader):
|
|
|
47
47
|
if not file.exists():
|
|
48
48
|
raise FileNotFoundError(f"Could not find file: {file}")
|
|
49
49
|
log_debug(f"Reading: {file}")
|
|
50
|
-
|
|
50
|
+
csv_name = name or file.stem
|
|
51
|
+
file_content: Union[io.TextIOWrapper, io.StringIO] = file.open(
|
|
52
|
+
newline="", mode="r", encoding=self.encoding or "utf-8"
|
|
53
|
+
)
|
|
51
54
|
else:
|
|
52
|
-
log_debug(f"Reading retrieved file: {name
|
|
55
|
+
log_debug(f"Reading retrieved file: {getattr(file, 'name', 'BytesIO')}")
|
|
56
|
+
csv_name = name or getattr(file, "name", "csv_file").split(".")[0]
|
|
53
57
|
file.seek(0)
|
|
54
|
-
file_content = io.StringIO(file.read().decode("utf-8"))
|
|
58
|
+
file_content = io.StringIO(file.read().decode("utf-8"))
|
|
55
59
|
|
|
56
|
-
csv_name = name or (
|
|
57
|
-
Path(file.name).stem
|
|
58
|
-
if isinstance(file, Path)
|
|
59
|
-
else (getattr(file, "name", "csv_file").split(".")[0] if hasattr(file, "name") else "csv_file")
|
|
60
|
-
)
|
|
61
60
|
csv_content = ""
|
|
62
61
|
with file_content as csvfile:
|
|
63
62
|
csv_reader = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar)
|
|
@@ -109,16 +108,12 @@ class CSVReader(Reader):
|
|
|
109
108
|
async with aiofiles.open(file, mode="r", encoding="utf-8", newline="") as file_content:
|
|
110
109
|
content = await file_content.read()
|
|
111
110
|
file_content_io = io.StringIO(content)
|
|
111
|
+
csv_name = name or file.stem
|
|
112
112
|
else:
|
|
113
113
|
log_debug(f"Reading retrieved file async: {getattr(file, 'name', 'BytesIO')}")
|
|
114
114
|
file.seek(0)
|
|
115
115
|
file_content_io = io.StringIO(file.read().decode("utf-8"))
|
|
116
|
-
|
|
117
|
-
csv_name = name or (
|
|
118
|
-
Path(file.name).stem
|
|
119
|
-
if isinstance(file, Path)
|
|
120
|
-
else (getattr(file, "name", "csv_file").split(".")[0] if hasattr(file, "name") else "csv_file")
|
|
121
|
-
)
|
|
116
|
+
csv_name = name or getattr(file, "name", "csv_file").split(".")[0]
|
|
122
117
|
|
|
123
118
|
file_content_io.seek(0)
|
|
124
119
|
csv_reader = csv.reader(file_content_io, delimiter=delimiter, quotechar=quotechar)
|
agno/knowledge/reader/docx_reader.py
CHANGED
|
@@ -47,11 +47,9 @@ class DocxReader(Reader):
|
|
|
47
47
|
docx_document = DocxDocument(str(file))
|
|
48
48
|
doc_name = name or file.stem
|
|
49
49
|
else:
|
|
50
|
-
log_debug(f"Reading uploaded file: {getattr(file, 'name', '
|
|
50
|
+
log_debug(f"Reading uploaded file: {getattr(file, 'name', 'BytesIO')}")
|
|
51
51
|
docx_document = DocxDocument(file)
|
|
52
|
-
doc_name = name or (
|
|
53
|
-
getattr(file, "name", "docx_file").split(".")[0] if hasattr(file, "name") else "docx_file"
|
|
54
|
-
)
|
|
52
|
+
doc_name = name or getattr(file, "name", "docx_file").split(".")[0]
|
|
55
53
|
|
|
56
54
|
doc_content = "\n\n".join([para.text for para in docx_document.paragraphs])
|
|
57
55
|
|
agno/knowledge/reader/field_labeled_csv_reader.py
CHANGED
|
@@ -106,17 +106,15 @@ class FieldLabeledCSVReader(Reader):
|
|
|
106
106
|
if not file.exists():
|
|
107
107
|
raise FileNotFoundError(f"Could not find file: {file}")
|
|
108
108
|
log_debug(f"Reading: {file}")
|
|
109
|
-
|
|
109
|
+
csv_name = name or file.stem
|
|
110
|
+
file_content: Union[io.TextIOWrapper, io.StringIO] = file.open(
|
|
111
|
+
newline="", mode="r", encoding=self.encoding or "utf-8"
|
|
112
|
+
)
|
|
110
113
|
else:
|
|
111
|
-
log_debug(f"Reading retrieved file: {name
|
|
114
|
+
log_debug(f"Reading retrieved file: {getattr(file, 'name', 'BytesIO')}")
|
|
115
|
+
csv_name = name or getattr(file, "name", "csv_file").split(".")[0]
|
|
112
116
|
file.seek(0)
|
|
113
|
-
file_content = io.StringIO(file.read().decode("utf-8"))
|
|
114
|
-
|
|
115
|
-
csv_name = name or (
|
|
116
|
-
Path(file.name).stem
|
|
117
|
-
if isinstance(file, Path)
|
|
118
|
-
else (getattr(file, "name", "csv_file").split(".")[0] if hasattr(file, "name") else "csv_file")
|
|
119
|
-
)
|
|
117
|
+
file_content = io.StringIO(file.read().decode("utf-8"))
|
|
120
118
|
|
|
121
119
|
documents = []
|
|
122
120
|
|
|
@@ -189,16 +187,12 @@ class FieldLabeledCSVReader(Reader):
|
|
|
189
187
|
async with aiofiles.open(file, mode="r", encoding=self.encoding or "utf-8", newline="") as file_content:
|
|
190
188
|
content = await file_content.read()
|
|
191
189
|
file_content_io = io.StringIO(content)
|
|
190
|
+
csv_name = name or file.stem
|
|
192
191
|
else:
|
|
193
|
-
log_debug(f"Reading retrieved file async: {name
|
|
192
|
+
log_debug(f"Reading retrieved file async: {getattr(file, 'name', 'BytesIO')}")
|
|
193
|
+
csv_name = name or getattr(file, "name", "csv_file").split(".")[0]
|
|
194
194
|
file.seek(0)
|
|
195
|
-
file_content_io = io.StringIO(file.read().decode("utf-8"))
|
|
196
|
-
|
|
197
|
-
csv_name = name or (
|
|
198
|
-
Path(file.name).stem
|
|
199
|
-
if isinstance(file, Path)
|
|
200
|
-
else (getattr(file, "name", "csv_file").split(".")[0] if hasattr(file, "name") else "csv_file")
|
|
201
|
-
)
|
|
195
|
+
file_content_io = io.StringIO(file.read().decode("utf-8"))
|
|
202
196
|
|
|
203
197
|
file_content_io.seek(0)
|
|
204
198
|
csv_reader = csv.reader(file_content_io, delimiter=delimiter, quotechar=quotechar)
|
agno/knowledge/reader/json_reader.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import json
|
|
3
|
-
from io import BytesIO
|
|
4
3
|
from pathlib import Path
|
|
5
4
|
from typing import IO, Any, List, Optional, Union
|
|
6
5
|
from uuid import uuid4
|
|
@@ -42,17 +41,15 @@ class JSONReader(Reader):
|
|
|
42
41
|
if not path.exists():
|
|
43
42
|
raise FileNotFoundError(f"Could not find file: {path}")
|
|
44
43
|
log_debug(f"Reading: {path}")
|
|
45
|
-
json_name = name or path.
|
|
46
|
-
json_contents = json.loads(path.read_text(self.encoding or "utf-8"))
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
json_name = name or path
|
|
50
|
-
log_debug(f"Reading uploaded file: {json_name}")
|
|
44
|
+
json_name = name or path.stem
|
|
45
|
+
json_contents = json.loads(path.read_text(encoding=self.encoding or "utf-8"))
|
|
46
|
+
elif hasattr(path, "seek") and hasattr(path, "read"):
|
|
47
|
+
log_debug(f"Reading uploaded file: {getattr(path, 'name', 'BytesIO')}")
|
|
48
|
+
json_name = name or getattr(path, "name", "json_file").split(".")[0]
|
|
51
49
|
path.seek(0)
|
|
52
50
|
json_contents = json.load(path)
|
|
53
|
-
|
|
54
51
|
else:
|
|
55
|
-
raise ValueError("Unsupported file type. Must be Path or
|
|
52
|
+
raise ValueError("Unsupported file type. Must be Path or file-like object.")
|
|
56
53
|
|
|
57
54
|
if isinstance(json_contents, dict):
|
|
58
55
|
json_contents = [json_contents]
|
|
@@ -72,17 +69,12 @@ class JSONReader(Reader):
|
|
|
72
69
|
chunked_documents.extend(self.chunk_document(document))
|
|
73
70
|
return chunked_documents
|
|
74
71
|
return documents
|
|
72
|
+
except (FileNotFoundError, ValueError, json.JSONDecodeError):
|
|
73
|
+
raise
|
|
75
74
|
except Exception as e:
|
|
76
75
|
log_error(f"Error reading: {path}: {e}")
|
|
77
76
|
raise
|
|
78
77
|
|
|
79
78
|
async def async_read(self, path: Union[Path, IO[Any]], name: Optional[str] = None) -> List[Document]:
|
|
80
|
-
"""Asynchronously read JSON files.
|
|
81
|
-
|
|
82
|
-
Args:
|
|
83
|
-
path (Union[Path, IO[Any]]): Path to a JSON file or a file-like object
|
|
84
|
-
|
|
85
|
-
Returns:
|
|
86
|
-
List[Document]: List of documents from the JSON file
|
|
87
|
-
"""
|
|
79
|
+
"""Asynchronously read JSON files."""
|
|
88
80
|
return await asyncio.to_thread(self.read, path, name)
|
agno/knowledge/reader/markdown_reader.py
CHANGED
|
@@ -69,8 +69,8 @@ class MarkdownReader(Reader):
|
|
|
69
69
|
file_name = name or file.stem
|
|
70
70
|
file_contents = file.read_text(encoding=self.encoding or "utf-8")
|
|
71
71
|
else:
|
|
72
|
-
log_debug(f"Reading uploaded file: {file
|
|
73
|
-
file_name = name or file
|
|
72
|
+
log_debug(f"Reading uploaded file: {getattr(file, 'name', 'BytesIO')}")
|
|
73
|
+
file_name = name or getattr(file, "name", "file").split(".")[0]
|
|
74
74
|
file.seek(0)
|
|
75
75
|
file_contents = file.read().decode(self.encoding or "utf-8")
|
|
76
76
|
|
|
@@ -101,16 +101,16 @@ class MarkdownReader(Reader):
|
|
|
101
101
|
file_contents = await f.read()
|
|
102
102
|
except ImportError:
|
|
103
103
|
log_warning("aiofiles not installed, using synchronous file I/O")
|
|
104
|
-
file_contents = file.read_text(self.encoding or "utf-8")
|
|
104
|
+
file_contents = file.read_text(encoding=self.encoding or "utf-8")
|
|
105
105
|
else:
|
|
106
|
-
log_debug(f"Reading uploaded file asynchronously: {file
|
|
107
|
-
file_name = name or file
|
|
106
|
+
log_debug(f"Reading uploaded file asynchronously: {getattr(file, 'name', 'BytesIO')}")
|
|
107
|
+
file_name = name or getattr(file, "name", "file").split(".")[0]
|
|
108
108
|
file.seek(0)
|
|
109
109
|
file_contents = file.read().decode(self.encoding or "utf-8")
|
|
110
110
|
|
|
111
111
|
document = Document(
|
|
112
112
|
name=file_name,
|
|
113
|
-
id=str(uuid.uuid4()),
|
|
113
|
+
id=str(uuid.uuid4()),
|
|
114
114
|
content=file_contents,
|
|
115
115
|
)
|
|
116
116
|
|
agno/knowledge/reader/pdf_reader.py
CHANGED
|
@@ -218,16 +218,13 @@ class BasePDFReader(Reader):
|
|
|
218
218
|
|
|
219
219
|
def _get_doc_name(self, pdf_source: Union[str, Path, IO[Any]], name: Optional[str] = None) -> str:
|
|
220
220
|
"""Determines the document name from the source or a provided name."""
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
except Exception:
|
|
229
|
-
# The original code had a bug here, it should check `name` first.
|
|
230
|
-
return name or "pdf"
|
|
221
|
+
if name:
|
|
222
|
+
return name
|
|
223
|
+
if isinstance(pdf_source, str):
|
|
224
|
+
return Path(pdf_source).stem.replace(" ", "_")
|
|
225
|
+
if isinstance(pdf_source, Path):
|
|
226
|
+
return pdf_source.stem.replace(" ", "_")
|
|
227
|
+
return getattr(pdf_source, "name", "pdf_file").split(".")[0].replace(" ", "_")
|
|
231
228
|
|
|
232
229
|
def _decrypt_pdf(self, doc_reader: DocumentReader, doc_name: str, password: Optional[str] = None) -> bool:
|
|
233
230
|
if not doc_reader.is_encrypted:
|
|
@@ -341,8 +338,14 @@ class PDFReader(BasePDFReader):
|
|
|
341
338
|
return [ContentType.PDF]
|
|
342
339
|
|
|
343
340
|
def read(
|
|
344
|
-
self,
|
|
341
|
+
self,
|
|
342
|
+
pdf: Optional[Union[str, Path, IO[Any]]] = None,
|
|
343
|
+
name: Optional[str] = None,
|
|
344
|
+
password: Optional[str] = None,
|
|
345
345
|
) -> List[Document]:
|
|
346
|
+
if pdf is None:
|
|
347
|
+
log_error("No pdf provided")
|
|
348
|
+
return []
|
|
346
349
|
doc_name = self._get_doc_name(pdf, name)
|
|
347
350
|
log_debug(f"Reading: {doc_name}")
|
|
348
351
|
|