agno 2.4.0__py3-none-any.whl → 2.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/db/postgres/postgres.py +25 -12
- agno/db/sqlite/sqlite.py +24 -11
- agno/integrations/discord/client.py +12 -1
- agno/knowledge/knowledge.py +1070 -43
- agno/knowledge/reader/csv_reader.py +231 -8
- agno/knowledge/reader/field_labeled_csv_reader.py +167 -3
- agno/knowledge/reader/reader_factory.py +8 -1
- agno/knowledge/remote_content/__init__.py +29 -0
- agno/knowledge/remote_content/config.py +204 -0
- agno/knowledge/remote_content/remote_content.py +74 -17
- agno/models/base.py +12 -2
- agno/models/cerebras/cerebras.py +34 -2
- agno/models/n1n/__init__.py +3 -0
- agno/models/n1n/n1n.py +57 -0
- agno/models/openai/chat.py +18 -1
- agno/models/perplexity/perplexity.py +2 -0
- agno/os/interfaces/slack/router.py +10 -1
- agno/os/interfaces/whatsapp/router.py +6 -0
- agno/os/routers/components/components.py +10 -1
- agno/os/routers/knowledge/knowledge.py +125 -0
- agno/os/routers/knowledge/schemas.py +12 -0
- agno/run/agent.py +2 -0
- agno/team/team.py +20 -4
- agno/vectordb/pgvector/pgvector.py +3 -3
- {agno-2.4.0.dist-info → agno-2.4.1.dist-info}/METADATA +4 -1
- {agno-2.4.0.dist-info → agno-2.4.1.dist-info}/RECORD +29 -26
- {agno-2.4.0.dist-info → agno-2.4.1.dist-info}/WHEEL +1 -1
- {agno-2.4.0.dist-info → agno-2.4.1.dist-info}/licenses/LICENSE +0 -0
- {agno-2.4.0.dist-info → agno-2.4.1.dist-info}/top_level.txt +0 -0
|
@@ -1,8 +1,9 @@
|
|
|
1
1
|
import asyncio
|
|
2
2
|
import csv
|
|
3
3
|
import io
|
|
4
|
+
from datetime import date, datetime
|
|
4
5
|
from pathlib import Path
|
|
5
|
-
from typing import IO, Any, List, Optional, Union
|
|
6
|
+
from typing import IO, Any, Iterable, List, Optional, Sequence, Tuple, Union
|
|
6
7
|
from uuid import uuid4
|
|
7
8
|
|
|
8
9
|
try:
|
|
@@ -18,6 +19,113 @@ from agno.knowledge.types import ContentType
|
|
|
18
19
|
from agno.utils.log import log_debug, log_error
|
|
19
20
|
|
|
20
21
|
|
|
22
|
+
def _get_workbook_name(file: Union[Path, IO[Any]], name: Optional[str]) -> str:
|
|
23
|
+
"""Extract workbook name from file path or name parameter.
|
|
24
|
+
|
|
25
|
+
Priority: explicit name > file path stem > file object name attribute > "workbook"
|
|
26
|
+
"""
|
|
27
|
+
if name:
|
|
28
|
+
return Path(name).stem
|
|
29
|
+
if isinstance(file, Path):
|
|
30
|
+
return file.stem
|
|
31
|
+
return Path(getattr(file, "name", "workbook")).stem
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _infer_file_extension(file: Union[Path, IO[Any]], name: Optional[str]) -> str:
|
|
35
|
+
if isinstance(file, Path):
|
|
36
|
+
return file.suffix.lower()
|
|
37
|
+
|
|
38
|
+
file_name = getattr(file, "name", None)
|
|
39
|
+
if isinstance(file_name, str) and file_name:
|
|
40
|
+
return Path(file_name).suffix.lower()
|
|
41
|
+
|
|
42
|
+
if name:
|
|
43
|
+
return Path(name).suffix.lower()
|
|
44
|
+
|
|
45
|
+
return ""
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _convert_xls_cell_value(cell_value: Any, cell_type: int, datemode: int) -> Any:
|
|
49
|
+
"""Convert xlrd cell value to Python type.
|
|
50
|
+
|
|
51
|
+
xlrd returns dates as Excel serial numbers and booleans as 0/1 integers.
|
|
52
|
+
This converts them to proper Python types for consistent handling with openpyxl.
|
|
53
|
+
"""
|
|
54
|
+
try:
|
|
55
|
+
import xlrd
|
|
56
|
+
except ImportError:
|
|
57
|
+
return cell_value
|
|
58
|
+
|
|
59
|
+
if cell_type == xlrd.XL_CELL_DATE:
|
|
60
|
+
try:
|
|
61
|
+
date_tuple = xlrd.xldate_as_tuple(cell_value, datemode)
|
|
62
|
+
return datetime(*date_tuple)
|
|
63
|
+
except Exception:
|
|
64
|
+
return cell_value
|
|
65
|
+
if cell_type == xlrd.XL_CELL_BOOLEAN:
|
|
66
|
+
return bool(cell_value)
|
|
67
|
+
return cell_value
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
def _stringify_spreadsheet_cell_value(value: Any) -> str:
|
|
71
|
+
if value is None:
|
|
72
|
+
return ""
|
|
73
|
+
|
|
74
|
+
# Handle datetime/date before float check (datetime is not a float)
|
|
75
|
+
if isinstance(value, datetime):
|
|
76
|
+
return value.isoformat()
|
|
77
|
+
if isinstance(value, date):
|
|
78
|
+
return value.isoformat()
|
|
79
|
+
|
|
80
|
+
if isinstance(value, float) and value.is_integer():
|
|
81
|
+
return str(int(value))
|
|
82
|
+
|
|
83
|
+
result = str(value)
|
|
84
|
+
# Normalize all line endings to space to preserve row integrity in CSV-like output
|
|
85
|
+
# Must handle CRLF first before individual CR/LF to avoid double-spacing
|
|
86
|
+
result = result.replace("\r\n", " ") # Windows (CRLF)
|
|
87
|
+
result = result.replace("\r", " ") # Old Mac (CR)
|
|
88
|
+
result = result.replace("\n", " ") # Unix (LF)
|
|
89
|
+
return result
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def _row_values_to_csv_line(row_values: Sequence[Any]) -> str:
    """Join one row of cells into a ``", "``-separated line.

    Each cell is stringified first; empty cells at the end of the row are
    dropped, while empty cells in the middle are kept.
    """
    cells = [_stringify_spreadsheet_cell_value(cell) for cell in row_values]

    # Walk back from the end to find how many leading cells to keep.
    keep = len(cells)
    while keep > 0 and cells[keep - 1] == "":
        keep -= 1

    return ", ".join(cells[:keep])
|
|
98
|
+
|
|
99
|
+
|
|
100
|
+
def _excel_rows_to_documents(
    *,
    workbook_name: str,
    sheets: Iterable[Tuple[str, Iterable[Sequence[Any]]]],
) -> List[Document]:
    """Build one Document per non-empty sheet.

    Args:
        workbook_name: Name assigned to every produced Document.
        sheets: ``(sheet name, rows)`` pairs in workbook order.

    Returns:
        One Document per sheet that has at least one non-blank row. The
        content is the sheet's rows, one CSV-like line per row, and the
        metadata records the sheet name and its 1-based position.
    """
    result = []
    for position, (title, sheet_rows) in enumerate(sheets, start=1):
        rendered = (_row_values_to_csv_line(cells) for cells in sheet_rows)
        non_blank = [text for text in rendered if text]

        if non_blank:
            result.append(
                Document(
                    name=workbook_name,
                    id=str(uuid4()),
                    meta_data={"sheet_name": title, "sheet_index": position},
                    content="\n".join(non_blank),
                )
            )
        else:
            log_debug(f"Sheet '{title}' is empty, skipping")

    return result
|
|
127
|
+
|
|
128
|
+
|
|
21
129
|
class CSVReader(Reader):
|
|
22
130
|
"""Reader for CSV files"""
|
|
23
131
|
|
|
@@ -44,6 +152,22 @@ class CSVReader(Reader):
|
|
|
44
152
|
self, file: Union[Path, IO[Any]], delimiter: str = ",", quotechar: str = '"', name: Optional[str] = None
|
|
45
153
|
) -> List[Document]:
|
|
46
154
|
try:
|
|
155
|
+
file_extension = _infer_file_extension(file, name)
|
|
156
|
+
if file_extension in {ContentType.XLSX, ContentType.XLS}:
|
|
157
|
+
workbook_name = _get_workbook_name(file, name)
|
|
158
|
+
|
|
159
|
+
if file_extension == ContentType.XLSX:
|
|
160
|
+
documents = self._read_xlsx(file, workbook_name=workbook_name)
|
|
161
|
+
else:
|
|
162
|
+
documents = self._read_xls(file, workbook_name=workbook_name)
|
|
163
|
+
|
|
164
|
+
if self.chunk:
|
|
165
|
+
chunked_documents = []
|
|
166
|
+
for document in documents:
|
|
167
|
+
chunked_documents.extend(self.chunk_document(document))
|
|
168
|
+
return chunked_documents
|
|
169
|
+
return documents
|
|
170
|
+
|
|
47
171
|
if isinstance(file, Path):
|
|
48
172
|
if not file.exists():
|
|
49
173
|
raise FileNotFoundError(f"Could not find file: {file}")
|
|
@@ -58,17 +182,18 @@ class CSVReader(Reader):
|
|
|
58
182
|
file.seek(0)
|
|
59
183
|
file_content = io.StringIO(file.read().decode(self.encoding or "utf-8"))
|
|
60
184
|
|
|
61
|
-
|
|
185
|
+
csv_lines: List[str] = []
|
|
62
186
|
with file_content as csvfile:
|
|
63
187
|
csv_reader = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar)
|
|
64
188
|
for row in csv_reader:
|
|
65
|
-
|
|
189
|
+
# Use stringify to normalize line endings in CSV cells
|
|
190
|
+
csv_lines.append(", ".join(_stringify_spreadsheet_cell_value(cell) for cell in row))
|
|
66
191
|
|
|
67
192
|
documents = [
|
|
68
193
|
Document(
|
|
69
194
|
name=csv_name,
|
|
70
195
|
id=str(uuid4()),
|
|
71
|
-
content=
|
|
196
|
+
content="\n".join(csv_lines),
|
|
72
197
|
)
|
|
73
198
|
]
|
|
74
199
|
if self.chunk:
|
|
@@ -77,8 +202,17 @@ class CSVReader(Reader):
|
|
|
77
202
|
chunked_documents.extend(self.chunk_document(document))
|
|
78
203
|
return chunked_documents
|
|
79
204
|
return documents
|
|
205
|
+
except FileNotFoundError:
|
|
206
|
+
raise
|
|
207
|
+
except ImportError:
|
|
208
|
+
raise
|
|
209
|
+
except UnicodeDecodeError as e:
|
|
210
|
+
file_desc = getattr(file, "name", str(file)) if isinstance(file, IO) else file
|
|
211
|
+
log_error(f"Encoding error reading {file_desc}: {e}. Try specifying a different encoding.")
|
|
212
|
+
return []
|
|
80
213
|
except Exception as e:
|
|
81
|
-
|
|
214
|
+
file_desc = getattr(file, "name", str(file)) if isinstance(file, IO) else file
|
|
215
|
+
log_error(f"Error reading {file_desc}: {e}")
|
|
82
216
|
return []
|
|
83
217
|
|
|
84
218
|
async def async_read(
|
|
@@ -102,6 +236,19 @@ class CSVReader(Reader):
|
|
|
102
236
|
List of Document objects
|
|
103
237
|
"""
|
|
104
238
|
try:
|
|
239
|
+
file_extension = _infer_file_extension(file, name)
|
|
240
|
+
if file_extension in {ContentType.XLSX, ContentType.XLS}:
|
|
241
|
+
workbook_name = _get_workbook_name(file, name)
|
|
242
|
+
|
|
243
|
+
if file_extension == ContentType.XLSX:
|
|
244
|
+
documents = await asyncio.to_thread(self._read_xlsx, file, workbook_name=workbook_name)
|
|
245
|
+
else:
|
|
246
|
+
documents = await asyncio.to_thread(self._read_xls, file, workbook_name=workbook_name)
|
|
247
|
+
|
|
248
|
+
if self.chunk:
|
|
249
|
+
documents = await self.chunk_documents_async(documents)
|
|
250
|
+
return documents
|
|
251
|
+
|
|
105
252
|
if isinstance(file, Path):
|
|
106
253
|
if not file.exists():
|
|
107
254
|
raise FileNotFoundError(f"Could not find file: {file}")
|
|
@@ -122,7 +269,10 @@ class CSVReader(Reader):
|
|
|
122
269
|
total_rows = len(rows)
|
|
123
270
|
|
|
124
271
|
if total_rows <= 10:
|
|
125
|
-
|
|
272
|
+
# Use stringify to normalize line endings in CSV cells
|
|
273
|
+
csv_content = " ".join(
|
|
274
|
+
", ".join(_stringify_spreadsheet_cell_value(cell) for cell in row) for row in rows
|
|
275
|
+
)
|
|
126
276
|
documents = [
|
|
127
277
|
Document(
|
|
128
278
|
name=csv_name,
|
|
@@ -138,7 +288,10 @@ class CSVReader(Reader):
|
|
|
138
288
|
async def _process_page(page_number: int, page_rows: List[List[str]]) -> Document:
|
|
139
289
|
"""Process a page of rows into a document"""
|
|
140
290
|
start_row = (page_number - 1) * page_size + 1
|
|
141
|
-
|
|
291
|
+
# Use stringify to normalize line endings in CSV cells
|
|
292
|
+
page_content = " ".join(
|
|
293
|
+
", ".join(_stringify_spreadsheet_cell_value(cell) for cell in row) for row in page_rows
|
|
294
|
+
)
|
|
142
295
|
|
|
143
296
|
return Document(
|
|
144
297
|
name=csv_name,
|
|
@@ -155,6 +308,76 @@ class CSVReader(Reader):
|
|
|
155
308
|
documents = await self.chunk_documents_async(documents)
|
|
156
309
|
|
|
157
310
|
return documents
|
|
311
|
+
except FileNotFoundError:
|
|
312
|
+
raise
|
|
313
|
+
except ImportError:
|
|
314
|
+
raise
|
|
315
|
+
except UnicodeDecodeError as e:
|
|
316
|
+
file_desc = getattr(file, "name", str(file)) if isinstance(file, IO) else file
|
|
317
|
+
log_error(f"Encoding error reading {file_desc}: {e}. Try specifying a different encoding.")
|
|
318
|
+
return []
|
|
158
319
|
except Exception as e:
|
|
159
|
-
|
|
320
|
+
file_desc = getattr(file, "name", str(file)) if isinstance(file, IO) else file
|
|
321
|
+
log_error(f"Error reading {file_desc}: {e}")
|
|
160
322
|
return []
|
|
323
|
+
|
|
324
|
+
def _read_xlsx(self, file: Union[Path, IO[Any]], *, workbook_name: str) -> List[Document]:
    """Read an .xlsx workbook and return one Document per non-empty sheet.

    Raises:
        ImportError: If ``openpyxl`` is not installed.
    """
    try:
        import openpyxl  # type: ignore
    except ImportError as e:
        raise ImportError(
            "`openpyxl` not installed. Please install it via `pip install agno[csv]` or `pip install openpyxl`."
        ) from e

    if isinstance(file, Path):
        source: Any = str(file)
    else:
        # File objects are read fully into memory and handed over as bytes.
        file.seek(0)
        payload = file.read()
        if isinstance(payload, str):
            payload = payload.encode("utf-8", errors="replace")
        source = io.BytesIO(payload)

    workbook = openpyxl.load_workbook(filename=source, read_only=True, data_only=True)
    try:
        sheet_pairs = [(ws.title, ws.iter_rows(values_only=True)) for ws in workbook.worksheets]
        return _excel_rows_to_documents(workbook_name=workbook_name, sheets=sheet_pairs)
    finally:
        workbook.close()
|
|
348
|
+
|
|
349
|
+
def _read_xls(self, file: Union[Path, IO[Any]], *, workbook_name: str) -> List[Document]:
    """Read a legacy .xls workbook and return one Document per non-empty sheet.

    Raises:
        ImportError: If ``xlrd`` is not installed.
    """
    try:
        import xlrd  # type: ignore
    except ImportError as e:
        raise ImportError(
            "`xlrd` not installed. Please install it via `pip install agno[csv]` or `pip install xlrd`."
        ) from e

    if not isinstance(file, Path):
        file.seek(0)
        payload = file.read()
        if isinstance(payload, str):
            payload = payload.encode("utf-8", errors="replace")
        workbook = xlrd.open_workbook(file_contents=payload)
    else:
        workbook = xlrd.open_workbook(filename=str(file))

    def _converted_rows(xls_sheet: Any, mode: int) -> Iterable[Sequence[Any]]:
        # Lazily yield each row with dates and booleans converted to Python types.
        for r in range(xls_sheet.nrows):
            yield [
                _convert_xls_cell_value(xls_sheet.cell_value(r, c), xls_sheet.cell_type(r, c), mode)
                for c in range(xls_sheet.ncols)
            ]

    sheets: List[Tuple[str, Iterable[Sequence[Any]]]] = []
    for position in range(workbook.nsheets):
        current = workbook.sheet_by_index(position)
        # The sheet and datemode are passed as arguments so each generator is
        # bound to its own sheet rather than the loop variable.
        sheets.append((current.name, _converted_rows(current, workbook.datemode)))

    return _excel_rows_to_documents(workbook_name=workbook_name, sheets=sheets)
|
|
@@ -2,7 +2,7 @@ import asyncio
|
|
|
2
2
|
import csv
|
|
3
3
|
import io
|
|
4
4
|
from pathlib import Path
|
|
5
|
-
from typing import IO, Any, List, Optional, Union
|
|
5
|
+
from typing import IO, Any, Iterable, List, Optional, Sequence, Tuple, Union
|
|
6
6
|
|
|
7
7
|
try:
|
|
8
8
|
import aiofiles
|
|
@@ -12,6 +12,12 @@ except ImportError:
|
|
|
12
12
|
from agno.knowledge.chunking.strategy import ChunkingStrategyType
|
|
13
13
|
from agno.knowledge.document.base import Document
|
|
14
14
|
from agno.knowledge.reader.base import Reader
|
|
15
|
+
from agno.knowledge.reader.csv_reader import (
|
|
16
|
+
_convert_xls_cell_value,
|
|
17
|
+
_get_workbook_name,
|
|
18
|
+
_infer_file_extension,
|
|
19
|
+
_stringify_spreadsheet_cell_value,
|
|
20
|
+
)
|
|
15
21
|
from agno.knowledge.types import ContentType
|
|
16
22
|
from agno.utils.log import log_debug, log_error, log_warning
|
|
17
23
|
|
|
@@ -84,7 +90,8 @@ class FieldLabeledCSVReader(Reader):
|
|
|
84
90
|
lines.append(title)
|
|
85
91
|
|
|
86
92
|
for i, (header, value) in enumerate(zip(headers, row)):
|
|
87
|
-
|
|
93
|
+
# Normalize line endings before stripping to handle embedded newlines
|
|
94
|
+
clean_value = _stringify_spreadsheet_cell_value(value).strip() if value else ""
|
|
88
95
|
|
|
89
96
|
if self.skip_empty_fields and not clean_value:
|
|
90
97
|
continue
|
|
@@ -98,10 +105,155 @@ class FieldLabeledCSVReader(Reader):
|
|
|
98
105
|
|
|
99
106
|
return "\n".join(lines)
|
|
100
107
|
|
|
108
|
+
def _excel_rows_to_field_labeled_documents(
    self,
    *,
    workbook_name: str,
    sheets: Iterable[Tuple[str, Iterable[Sequence[Any]]]],
) -> List[Document]:
    """Turn Excel sheets into field-labeled documents, one per data row.

    The first row of each sheet supplies the headers and every later row is
    a record. Sheets that are empty, have no non-blank header, or contain
    only the header row are skipped, as are rows whose cells are all blank.
    """
    collected = []
    emitted = 0  # running count of documents produced across all sheets

    for sheet_number, (sheet_title, sheet_rows) in enumerate(sheets, start=1):
        materialized = list(sheet_rows)
        if not materialized:
            log_debug(f"Sheet '{sheet_title}' is empty, skipping")
            continue

        header_row, *body_rows = materialized
        headers = [_stringify_spreadsheet_cell_value(cell).strip() for cell in header_row]
        if not any(headers):
            log_debug(f"Sheet '{sheet_title}' has no valid headers, skipping")
            continue
        if not body_rows:
            log_debug(f"Sheet '{sheet_title}' has only headers, skipping")
            continue

        width = len(headers)
        log_debug(f"Processing sheet '{sheet_title}' with {width} headers and {len(body_rows)} rows")

        for offset, raw_row in enumerate(body_rows):
            # Stringify every cell, then cut or pad the row to the header width.
            cells = [_stringify_spreadsheet_cell_value(cell) for cell in raw_row][:width]
            cells.extend([""] * (width - len(cells)))

            if all(not cell.strip() for cell in cells):
                continue

            labeled = self._convert_row_to_labeled_text(headers, cells, emitted)
            if not labeled.strip():
                continue

            collected.append(
                Document(
                    id=f"{workbook_name}_{sheet_title}_row_{offset + 1}",
                    name=workbook_name,
                    meta_data={
                        "sheet_name": sheet_title,
                        "sheet_index": sheet_number,
                        "row_index": offset,
                        "headers": headers,
                        "source": "field_labeled_csv_reader",
                    },
                    content=labeled,
                )
            )
            emitted += 1

    return collected
|
|
176
|
+
|
|
177
|
+
def _read_xlsx(self, file: Union[Path, IO[Any]], *, workbook_name: str) -> List[Document]:
    """Load an .xlsx workbook and emit a field-labeled document per data row.

    Raises:
        ImportError: If ``openpyxl`` is not installed.
    """
    try:
        import openpyxl  # type: ignore
    except ImportError as e:
        raise ImportError(
            "`openpyxl` not installed. Please install it via `pip install agno[csv]` or `pip install openpyxl`."
        ) from e

    if isinstance(file, Path):
        source: Any = str(file)
    else:
        # File objects are read fully into memory and handed over as bytes.
        file.seek(0)
        payload = file.read()
        if isinstance(payload, str):
            payload = payload.encode("utf-8", errors="replace")
        source = io.BytesIO(payload)

    workbook = openpyxl.load_workbook(filename=source, read_only=True, data_only=True)
    try:
        sheet_pairs = [(ws.title, ws.iter_rows(values_only=True)) for ws in workbook.worksheets]
        return self._excel_rows_to_field_labeled_documents(workbook_name=workbook_name, sheets=sheet_pairs)
    finally:
        workbook.close()
|
|
202
|
+
|
|
203
|
+
def _read_xls(self, file: Union[Path, IO[Any]], *, workbook_name: str) -> List[Document]:
    """Load a legacy .xls workbook and emit a field-labeled document per data row.

    Raises:
        ImportError: If ``xlrd`` is not installed.
    """
    try:
        import xlrd  # type: ignore
    except ImportError as e:
        raise ImportError(
            "`xlrd` not installed. Please install it via `pip install agno[csv]` or `pip install xlrd`."
        ) from e

    if not isinstance(file, Path):
        file.seek(0)
        payload = file.read()
        if isinstance(payload, str):
            payload = payload.encode("utf-8", errors="replace")
        workbook = xlrd.open_workbook(file_contents=payload)
    else:
        workbook = xlrd.open_workbook(filename=str(file))

    def _converted_rows(xls_sheet: Any, mode: int) -> Iterable[Sequence[Any]]:
        # Lazily yield each row with dates and booleans converted to Python types.
        for r in range(xls_sheet.nrows):
            yield [
                _convert_xls_cell_value(xls_sheet.cell_value(r, c), xls_sheet.cell_type(r, c), mode)
                for c in range(xls_sheet.ncols)
            ]

    sheets: List[Tuple[str, Iterable[Sequence[Any]]]] = []
    for position in range(workbook.nsheets):
        current = workbook.sheet_by_index(position)
        # The sheet and datemode are passed as arguments so each generator is
        # bound to its own sheet rather than the loop variable.
        sheets.append((current.name, _converted_rows(current, workbook.datemode)))

    return self._excel_rows_to_field_labeled_documents(workbook_name=workbook_name, sheets=sheets)
|
|
239
|
+
|
|
101
240
|
def read(
|
|
102
241
|
self, file: Union[Path, IO[Any]], delimiter: str = ",", quotechar: str = '"', name: Optional[str] = None
|
|
103
242
|
) -> List[Document]:
|
|
104
243
|
try:
|
|
244
|
+
file_extension = _infer_file_extension(file, name)
|
|
245
|
+
|
|
246
|
+
# Handle Excel files
|
|
247
|
+
if file_extension in {ContentType.XLSX, ContentType.XLS}:
|
|
248
|
+
workbook_name = _get_workbook_name(file, name)
|
|
249
|
+
log_debug(f"Reading Excel file: {workbook_name}{file_extension}")
|
|
250
|
+
|
|
251
|
+
if file_extension == ContentType.XLSX:
|
|
252
|
+
return self._read_xlsx(file, workbook_name=workbook_name)
|
|
253
|
+
else:
|
|
254
|
+
return self._read_xls(file, workbook_name=workbook_name)
|
|
255
|
+
|
|
256
|
+
# Handle CSV files
|
|
105
257
|
if isinstance(file, Path):
|
|
106
258
|
if not file.exists():
|
|
107
259
|
raise FileNotFoundError(f"Could not find file: {file}")
|
|
@@ -179,7 +331,19 @@ class FieldLabeledCSVReader(Reader):
|
|
|
179
331
|
name: Optional[str] = None,
|
|
180
332
|
) -> List[Document]:
|
|
181
333
|
try:
|
|
182
|
-
|
|
334
|
+
file_extension = _infer_file_extension(file, name)
|
|
335
|
+
|
|
336
|
+
# Handle Excel files (use asyncio.to_thread for sync openpyxl/xlrd)
|
|
337
|
+
if file_extension in {ContentType.XLSX, ContentType.XLS}:
|
|
338
|
+
workbook_name = _get_workbook_name(file, name)
|
|
339
|
+
log_debug(f"Reading Excel file async: {workbook_name}{file_extension}")
|
|
340
|
+
|
|
341
|
+
if file_extension == ContentType.XLSX:
|
|
342
|
+
return await asyncio.to_thread(self._read_xlsx, file, workbook_name=workbook_name)
|
|
343
|
+
else:
|
|
344
|
+
return await asyncio.to_thread(self._read_xls, file, workbook_name=workbook_name)
|
|
345
|
+
|
|
346
|
+
# Handle CSV files
|
|
183
347
|
if isinstance(file, Path):
|
|
184
348
|
if not file.exists():
|
|
185
349
|
raise FileNotFoundError(f"Could not find file: {file}")
|
|
@@ -335,7 +335,14 @@ class ReaderFactory:
|
|
|
335
335
|
|
|
336
336
|
if extension in [".pdf", "application/pdf"]:
|
|
337
337
|
return cls.create_reader("pdf")
|
|
338
|
-
elif extension in [
|
|
338
|
+
elif extension in [
|
|
339
|
+
".csv",
|
|
340
|
+
".xlsx",
|
|
341
|
+
".xls",
|
|
342
|
+
"text/csv",
|
|
343
|
+
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
|
|
344
|
+
"application/vnd.ms-excel",
|
|
345
|
+
]:
|
|
339
346
|
return cls.create_reader("csv")
|
|
340
347
|
elif extension in [".docx", ".doc", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]:
|
|
341
348
|
return cls.create_reader("docx")
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Public names of the ``agno.knowledge.remote_content`` package.

Re-exports the config classes defined in ``config`` and the content classes
defined in ``remote_content`` so callers can import them from the package.
"""
from agno.knowledge.remote_content.config import (
    GcsConfig,
    GitHubConfig,
    RemoteContentConfig,
    S3Config,
    SharePointConfig,
)
from agno.knowledge.remote_content.remote_content import (
    GCSContent,
    GitHubContent,
    RemoteContent,
    S3Content,
    SharePointContent,
)

# Names exported by ``from agno.knowledge.remote_content import *``.
__all__ = [
    # Config classes
    "RemoteContentConfig",
    "S3Config",
    "GcsConfig",
    "SharePointConfig",
    "GitHubConfig",
    # Content classes
    "RemoteContent",
    "S3Content",
    "GCSContent",
    "SharePointContent",
    "GitHubContent",
]
|