agno-2.4.2-py3-none-any.whl → agno-2.4.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agno/agent/agent.py +13 -0
- agno/db/firestore/firestore.py +53 -30
- agno/db/surrealdb/models.py +5 -5
- agno/db/surrealdb/surrealdb.py +13 -1
- agno/knowledge/chunking/markdown.py +112 -11
- agno/knowledge/embedder/openai.py +8 -4
- agno/knowledge/knowledge.py +59 -6
- agno/knowledge/reader/csv_reader.py +48 -216
- agno/knowledge/reader/excel_reader.py +225 -0
- agno/knowledge/reader/field_labeled_csv_reader.py +13 -179
- agno/knowledge/reader/reader_factory.py +22 -5
- agno/knowledge/reader/utils/__init__.py +17 -0
- agno/knowledge/reader/utils/spreadsheet.py +114 -0
- agno/models/base.py +6 -0
- agno/models/moonshot/__init__.py +3 -0
- agno/models/moonshot/moonshot.py +57 -0
- agno/models/openrouter/responses.py +2 -2
- agno/models/response.py +4 -0
- agno/models/utils.py +5 -0
- agno/os/routers/knowledge/knowledge.py +5 -3
- agno/run/base.py +4 -0
- agno/tools/decorator.py +3 -0
- agno/tools/function.py +3 -0
- agno/tools/unsplash.py +341 -0
- agno/utils/print_response/agent.py +8 -5
- agno/utils/response.py +38 -28
- agno/utils/string.py +2 -1
- agno/vectordb/lancedb/lance_db.py +29 -7
- agno/workflow/workflow.py +16 -6
- {agno-2.4.2.dist-info → agno-2.4.4.dist-info}/METADATA +7 -5
- {agno-2.4.2.dist-info → agno-2.4.4.dist-info}/RECORD +34 -28
- {agno-2.4.2.dist-info → agno-2.4.4.dist-info}/WHEEL +1 -1
- {agno-2.4.2.dist-info → agno-2.4.4.dist-info}/licenses/LICENSE +0 -0
- {agno-2.4.2.dist-info → agno-2.4.4.dist-info}/top_level.txt +0 -0
agno/knowledge/reader/csv_reader.py

@@ -1,9 +1,8 @@
 import asyncio
 import csv
 import io
-from datetime import date, datetime
 from pathlib import Path
-from typing import IO, Any, Iterable, List, Optional, Sequence, Tuple, Union
+from typing import IO, Any, List, Optional, Union
 from uuid import uuid4
 
 try:
@@ -15,119 +14,32 @@ from agno.knowledge.chunking.row import RowChunking
 from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
 from agno.knowledge.document.base import Document
 from agno.knowledge.reader.base import Reader
+from agno.knowledge.reader.utils import stringify_cell_value
 from agno.knowledge.types import ContentType
 from agno.utils.log import log_debug, log_error
 
 
-def _get_workbook_name(file: Union[Path, IO[Any]], name: Optional[str]) -> str:
-    """
-
-    Priority: explicit name > file path stem > file object name attribute > "workbook"
-    """
-    if name:
-        return Path(name).stem
-    if isinstance(file, Path):
-        return file.stem
-    return Path(getattr(file, "name", "workbook")).stem
-
-
-def _infer_file_extension(file: Union[Path, IO[Any]], name: Optional[str]) -> str:
-    if isinstance(file, Path):
-        return file.suffix.lower()
-
-    file_name = getattr(file, "name", None)
-    if isinstance(file_name, str) and file_name:
-        return Path(file_name).suffix.lower()
+class CSVReader(Reader):
+    """Reader for CSV files.
 
-
-
+    Converts CSV files to documents with optional chunking support.
+    For Excel files (.xlsx, .xls), use ExcelReader instead.
 
-
+    Args:
+        chunking_strategy: Strategy for chunking documents. Default is RowChunking.
+        **kwargs: Additional arguments passed to base Reader.
 
+    Example:
+        ```python
+        from agno.knowledge.reader.csv_reader import CSVReader
 
-
-
+        reader = CSVReader()
+        docs = reader.read("data.csv")
 
-
-
+        # Custom delimiter
+        docs = reader.read("data.tsv", delimiter="\\t")
+        ```
     """
-    try:
-        import xlrd
-    except ImportError:
-        return cell_value
-
-    if cell_type == xlrd.XL_CELL_DATE:
-        try:
-            date_tuple = xlrd.xldate_as_tuple(cell_value, datemode)
-            return datetime(*date_tuple)
-        except Exception:
-            return cell_value
-    if cell_type == xlrd.XL_CELL_BOOLEAN:
-        return bool(cell_value)
-    return cell_value
-
-
-def _stringify_spreadsheet_cell_value(value: Any) -> str:
-    if value is None:
-        return ""
-
-    # Handle datetime/date before float check (datetime is not a float)
-    if isinstance(value, datetime):
-        return value.isoformat()
-    if isinstance(value, date):
-        return value.isoformat()
-
-    if isinstance(value, float) and value.is_integer():
-        return str(int(value))
-
-    result = str(value)
-    # Normalize all line endings to space to preserve row integrity in CSV-like output
-    # Must handle CRLF first before individual CR/LF to avoid double-spacing
-    result = result.replace("\r\n", " ")  # Windows (CRLF)
-    result = result.replace("\r", " ")  # Old Mac (CR)
-    result = result.replace("\n", " ")  # Unix (LF)
-    return result
-
-
-def _row_values_to_csv_line(row_values: Sequence[Any]) -> str:
-    values = [_stringify_spreadsheet_cell_value(v) for v in row_values]
-    while values and values[-1] == "":
-        values.pop()
-
-    return ", ".join(values)
-
-
-def _excel_rows_to_documents(
-    *,
-    workbook_name: str,
-    sheets: Iterable[Tuple[str, Iterable[Sequence[Any]]]],
-) -> List[Document]:
-    documents = []
-    for sheet_index, (sheet_name, rows) in enumerate(sheets, start=1):
-        lines = []
-        for row in rows:
-            line = _row_values_to_csv_line(row)
-            if line:
-                lines.append(line)
-
-        if not lines:
-            log_debug(f"Sheet '{sheet_name}' is empty, skipping")
-            continue
-
-        documents.append(
-            Document(
-                name=workbook_name,
-                id=str(uuid4()),
-                meta_data={"sheet_name": sheet_name, "sheet_index": sheet_index},
-                content="\n".join(lines),
-            )
-        )
-
-    return documents
-
-
-class CSVReader(Reader):
-    """Reader for CSV files"""
 
     def __init__(self, chunking_strategy: Optional[ChunkingStrategy] = RowChunking(), **kwargs):
         super().__init__(chunking_strategy=chunking_strategy, **kwargs)
@@ -146,28 +58,27 @@ class CSVReader(Reader):
 
     @classmethod
     def get_supported_content_types(cls) -> List[ContentType]:
-
+        """Get the list of supported content types."""
+        return [ContentType.CSV]
 
     def read(
         self, file: Union[Path, IO[Any]], delimiter: str = ",", quotechar: str = '"', name: Optional[str] = None
     ) -> List[Document]:
-
-        file_extension = _infer_file_extension(file, name)
-        if file_extension in {ContentType.XLSX, ContentType.XLS}:
-            workbook_name = _get_workbook_name(file, name)
+        """Read a CSV file and return a list of documents.
 
-
-
-
-
+        Args:
+            file: Path to CSV file or file-like object.
+            delimiter: CSV field delimiter. Default is comma.
+            quotechar: CSV quote character. Default is double quote.
+            name: Optional name override for the document.
 
-
-
-                for document in documents:
-                    chunked_documents.extend(self.chunk_document(document))
-                return chunked_documents
-            return documents
+        Returns:
+            List of Document objects.
 
+        Raises:
+            FileNotFoundError: If the file path doesn't exist.
+        """
+        try:
             if isinstance(file, Path):
                 if not file.exists():
                     raise FileNotFoundError(f"Could not find file: {file}")
@@ -186,8 +97,8 @@ class CSVReader(Reader):
             with file_content as csvfile:
                 csv_reader = csv.reader(csvfile, delimiter=delimiter, quotechar=quotechar)
                 for row in csv_reader:
-                    #
-                    csv_lines.append(", ".join(_stringify_spreadsheet_cell_value(cell) for cell in row))
+                    # Normalize line endings in CSV cells to preserve row integrity
+                    csv_lines.append(", ".join(stringify_cell_value(cell) for cell in row))
 
             documents = [
                 Document(
@@ -204,8 +115,6 @@ class CSVReader(Reader):
             return documents
         except FileNotFoundError:
             raise
-        except ImportError:
-            raise
         except UnicodeDecodeError as e:
            file_desc = getattr(file, "name", str(file)) if isinstance(file, IO) else file
            log_error(f"Encoding error reading {file_desc}: {e}. Try specifying a different encoding.")
@@ -223,32 +132,22 @@ class CSVReader(Reader):
         page_size: int = 1000,
         name: Optional[str] = None,
     ) -> List[Document]:
-        """
-        Read a CSV file asynchronously, processing batches of rows concurrently.
+        """Read a CSV file asynchronously, processing batches of rows concurrently.
 
         Args:
-            file: Path or file-like object
-            delimiter: CSV delimiter
-            quotechar: CSV quote character
-            page_size: Number of rows per page
+            file: Path to CSV file or file-like object.
+            delimiter: CSV field delimiter. Default is comma.
+            quotechar: CSV quote character. Default is double quote.
+            page_size: Number of rows per page for large files.
+            name: Optional name override for the document.
 
         Returns:
-            List of Document objects
+            List of Document objects.
+
+        Raises:
+            FileNotFoundError: If the file path doesn't exist.
         """
         try:
-            file_extension = _infer_file_extension(file, name)
-            if file_extension in {ContentType.XLSX, ContentType.XLS}:
-                workbook_name = _get_workbook_name(file, name)
-
-                if file_extension == ContentType.XLSX:
-                    documents = await asyncio.to_thread(self._read_xlsx, file, workbook_name=workbook_name)
-                else:
-                    documents = await asyncio.to_thread(self._read_xls, file, workbook_name=workbook_name)
-
-                if self.chunk:
-                    documents = await self.chunk_documents_async(documents)
-                return documents
-
             if isinstance(file, Path):
                 if not file.exists():
                     raise FileNotFoundError(f"Could not find file: {file}")
@@ -269,10 +168,8 @@ class CSVReader(Reader):
             total_rows = len(rows)
 
             if total_rows <= 10:
-                #
-                csv_content = " ".join(
-                    ", ".join(_stringify_spreadsheet_cell_value(cell) for cell in row) for row in rows
-                )
+                # Small files: single document
+                csv_content = " ".join(", ".join(stringify_cell_value(cell) for cell in row) for row in rows)
                 documents = [
                     Document(
                         name=csv_name,
@@ -281,17 +178,15 @@ class CSVReader(Reader):
                     )
                 ]
             else:
+                # Large files: paginate and process in parallel
                 pages = []
                 for i in range(0, total_rows, page_size):
                     pages.append(rows[i : i + page_size])
 
                 async def _process_page(page_number: int, page_rows: List[List[str]]) -> Document:
-                    """Process a page of rows into a document"""
+                    """Process a page of rows into a document."""
                     start_row = (page_number - 1) * page_size + 1
-
-                    page_content = " ".join(
-                        ", ".join(_stringify_spreadsheet_cell_value(cell) for cell in row) for row in page_rows
-                    )
+                    page_content = " ".join(", ".join(stringify_cell_value(cell) for cell in row) for row in page_rows)
 
                     return Document(
                         name=csv_name,
@@ -310,8 +205,6 @@ class CSVReader(Reader):
             return documents
         except FileNotFoundError:
             raise
-        except ImportError:
-            raise
         except UnicodeDecodeError as e:
             file_desc = getattr(file, "name", str(file)) if isinstance(file, IO) else file
             log_error(f"Encoding error reading {file_desc}: {e}. Try specifying a different encoding.")
@@ -320,64 +213,3 @@ class CSVReader(Reader):
             file_desc = getattr(file, "name", str(file)) if isinstance(file, IO) else file
             log_error(f"Error reading {file_desc}: {e}")
             return []
-
-    def _read_xlsx(self, file: Union[Path, IO[Any]], *, workbook_name: str) -> List[Document]:
-        try:
-            import openpyxl  # type: ignore
-        except ImportError as e:
-            raise ImportError(
-                "`openpyxl` not installed. Please install it via `pip install agno[csv]` or `pip install openpyxl`."
-            ) from e
-
-        if isinstance(file, Path):
-            workbook = openpyxl.load_workbook(filename=str(file), read_only=True, data_only=True)
-        else:
-            file.seek(0)
-            raw = file.read()
-            if isinstance(raw, str):
-                raw = raw.encode("utf-8", errors="replace")
-            workbook = openpyxl.load_workbook(filename=io.BytesIO(raw), read_only=True, data_only=True)
-
-        try:
-            return _excel_rows_to_documents(
-                workbook_name=workbook_name,
-                sheets=[(worksheet.title, worksheet.iter_rows(values_only=True)) for worksheet in workbook.worksheets],
-            )
-        finally:
-            workbook.close()
-
-    def _read_xls(self, file: Union[Path, IO[Any]], *, workbook_name: str) -> List[Document]:
-        try:
-            import xlrd  # type: ignore
-        except ImportError as e:
-            raise ImportError(
-                "`xlrd` not installed. Please install it via `pip install agno[csv]` or `pip install xlrd`."
-            ) from e
-
-        if isinstance(file, Path):
-            workbook = xlrd.open_workbook(filename=str(file))
-        else:
-            file.seek(0)
-            raw = file.read()
-            if isinstance(raw, str):
-                raw = raw.encode("utf-8", errors="replace")
-            workbook = xlrd.open_workbook(file_contents=raw)
-
-        sheets: List[Tuple[str, Iterable[Sequence[Any]]]] = []
-        for sheet_index in range(workbook.nsheets):
-            sheet = workbook.sheet_by_index(sheet_index)
-
-            def _iter_sheet_rows(_sheet: Any = sheet, _datemode: int = workbook.datemode) -> Iterable[Sequence[Any]]:
-                for row_index in range(_sheet.nrows):
-                    yield [
-                        _convert_xls_cell_value(
-                            _sheet.cell_value(row_index, col_index),
-                            _sheet.cell_type(row_index, col_index),
-                            _datemode,
-                        )
-                        for col_index in range(_sheet.ncols)
-                    ]
-
-            sheets.append((sheet.name, _iter_sheet_rows()))
-
-        return _excel_rows_to_documents(workbook_name=workbook_name, sheets=sheets)
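A minimal usage sketch for the updated async CSV path, based on the `async_read` signature and docstring shown above; the file path and `page_size` value here are illustrative, not part of the diff.

```python
import asyncio
from pathlib import Path

from agno.knowledge.reader.csv_reader import CSVReader

# Large CSVs are split into pages of rows and processed concurrently;
# page_size controls how many rows land in each page Document.
reader = CSVReader()
documents = asyncio.run(reader.async_read(Path("data.csv"), page_size=500))
print(f"Loaded {len(documents)} documents")
```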
agno/knowledge/reader/excel_reader.py (new file)

@@ -0,0 +1,225 @@
+import asyncio
+import io
+from pathlib import Path
+from typing import IO, Any, Iterable, List, Optional, Sequence, Tuple, Union
+
+from agno.knowledge.chunking.row import RowChunking
+from agno.knowledge.chunking.strategy import ChunkingStrategy, ChunkingStrategyType
+from agno.knowledge.document.base import Document
+from agno.knowledge.reader.base import Reader
+from agno.knowledge.reader.utils import (
+    convert_xls_cell_value,
+    excel_rows_to_documents,
+    get_workbook_name,
+    infer_file_extension,
+)
+from agno.knowledge.types import ContentType
+from agno.utils.log import log_debug, log_error
+
+
+class ExcelReader(Reader):
+    """Reader for Excel files (.xlsx and .xls)."""
+
+    def __init__(
+        self,
+        sheets: Optional[List[Union[str, int]]] = None,
+        chunking_strategy: Optional[ChunkingStrategy] = RowChunking(),
+        **kwargs,
+    ):
+        super().__init__(chunking_strategy=chunking_strategy, **kwargs)
+        self.sheets = sheets
+
+    @classmethod
+    def get_supported_chunking_strategies(cls) -> List[ChunkingStrategyType]:
+        """Get the list of supported chunking strategies for Excel readers."""
+        return [
+            ChunkingStrategyType.ROW_CHUNKER,
+            ChunkingStrategyType.CODE_CHUNKER,
+            ChunkingStrategyType.FIXED_SIZE_CHUNKER,
+            ChunkingStrategyType.AGENTIC_CHUNKER,
+            ChunkingStrategyType.DOCUMENT_CHUNKER,
+            ChunkingStrategyType.RECURSIVE_CHUNKER,
+        ]
+
+    @classmethod
+    def get_supported_content_types(cls) -> List[ContentType]:
+        """Get the list of supported content types."""
+        return [ContentType.XLSX, ContentType.XLS]
+
+    def _should_include_sheet(
+        self,
+        sheet_name: str,
+        sheet_index: int,
+    ) -> bool:
+        """Check if sheet passes the configured filters.
+
+        Args:
+            sheet_name: Name of the sheet
+            sheet_index: 1-based index of the sheet (matches document metadata)
+
+        Returns:
+            True if sheet should be included, False otherwise.
+
+        Note:
+            - Index filtering is 1-based to match sheet_index in document metadata
+            - Name filtering is case-insensitive
+            - Empty list or None means include all sheets
+        """
+        # None or empty list = include all sheets
+        if not self.sheets:
+            return True
+
+        for sheet_filter in self.sheets:
+            if isinstance(sheet_filter, int):
+                # 1-based indexing to match metadata
+                if sheet_index == sheet_filter:
+                    return True
+            elif isinstance(sheet_filter, str):
+                # Case-insensitive name matching
+                if sheet_name.lower() == sheet_filter.lower():
+                    return True
+
+        return False
+
+    def _read_xlsx(self, file: Union[Path, IO[Any]], *, workbook_name: str) -> List[Document]:
+        """Read .xlsx file using openpyxl."""
+        try:
+            import openpyxl
+        except ImportError as e:
+            raise ImportError("`openpyxl` not installed. Please install it via `pip install openpyxl`.") from e
+
+        if isinstance(file, Path):
+            workbook = openpyxl.load_workbook(filename=str(file), read_only=True, data_only=True)
+        else:
+            file.seek(0)
+            raw = file.read()
+            if isinstance(raw, str):
+                raw = raw.encode("utf-8", errors="replace")
+            workbook = openpyxl.load_workbook(filename=io.BytesIO(raw), read_only=True, data_only=True)
+
+        try:
+            sheets: List[Tuple[str, int, Iterable[Sequence[Any]]]] = []
+            for sheet_index, worksheet in enumerate(workbook.worksheets):
+                # Pass 1-based index to match metadata (sheet_index + 1)
+                if not self._should_include_sheet(worksheet.title, sheet_index + 1):
+                    log_debug(f"Skipping sheet '{worksheet.title}' (filtered out)")
+                    continue
+
+                sheets.append((worksheet.title, sheet_index + 1, worksheet.iter_rows(values_only=True)))
+
+            return excel_rows_to_documents(workbook_name=workbook_name, sheets=sheets)
+        finally:
+            workbook.close()
+
+    def _read_xls(self, file: Union[Path, IO[Any]], *, workbook_name: str) -> List[Document]:
+        """Read .xls file using xlrd."""
+        try:
+            import xlrd
+        except ImportError as e:
+            raise ImportError("`xlrd` not installed. Please install it via `pip install xlrd`.") from e
+
+        if isinstance(file, Path):
+            workbook = xlrd.open_workbook(filename=str(file), encoding_override=self.encoding)
+        else:
+            file.seek(0)
+            raw = file.read()
+            if isinstance(raw, str):
+                raw = raw.encode("utf-8", errors="replace")
+            workbook = xlrd.open_workbook(file_contents=raw, encoding_override=self.encoding)
+
+        sheets: List[Tuple[str, int, Iterable[Sequence[Any]]]] = []
+        for sheet_index in range(workbook.nsheets):
+            sheet = workbook.sheet_by_index(sheet_index)
+
+            # Pass 1-based index to match metadata (sheet_index + 1)
+            if not self._should_include_sheet(sheet.name, sheet_index + 1):
+                log_debug(f"Skipping sheet '{sheet.name}' (filtered out)")
+                continue
+
+            def _iter_sheet_rows(_sheet: Any = sheet, _datemode: int = workbook.datemode) -> Iterable[Sequence[Any]]:
+                for row_index in range(_sheet.nrows):
+                    yield [
+                        convert_xls_cell_value(
+                            _sheet.cell_value(row_index, col_index),
+                            _sheet.cell_type(row_index, col_index),
+                            _datemode,
+                        )
+                        for col_index in range(_sheet.ncols)
+                    ]
+
+            sheets.append((sheet.name, sheet_index + 1, _iter_sheet_rows()))
+
+        return excel_rows_to_documents(workbook_name=workbook_name, sheets=sheets)
+
+    def read(
+        self,
+        file: Union[Path, IO[Any]],
+        name: Optional[str] = None,
+    ) -> List[Document]:
+        """Read an Excel file and return documents (one per sheet)."""
+        try:
+            file_extension = infer_file_extension(file, name)
+            workbook_name = get_workbook_name(file, name)
+
+            if isinstance(file, Path) and not file.exists():
+                raise FileNotFoundError(f"Could not find file: {file}")
+
+            file_desc = str(file) if isinstance(file, Path) else getattr(file, "name", "BytesIO")
+            log_debug(f"Reading Excel file: {file_desc}")
+
+            if file_extension == ContentType.XLSX or file_extension == ".xlsx":
+                documents = self._read_xlsx(file, workbook_name=workbook_name)
+            elif file_extension == ContentType.XLS or file_extension == ".xls":
+                documents = self._read_xls(file, workbook_name=workbook_name)
+            else:
+                raise ValueError(f"Unsupported file extension: '{file_extension}'. Expected .xlsx or .xls")
+
+            if self.chunk:
+                chunked_documents = []
+                for document in documents:
+                    chunked_documents.extend(self.chunk_document(document))
+                return chunked_documents
+
+            return documents
+
+        except (FileNotFoundError, ImportError, ValueError):
+            raise
+        except Exception as e:
+            file_desc = getattr(file, "name", str(file)) if isinstance(file, IO) else file
+            log_error(f"Error reading {file_desc}: {e}")
+            return []
+
+    async def async_read(
+        self,
+        file: Union[Path, IO[Any]],
+        name: Optional[str] = None,
+    ) -> List[Document]:
+        """Async version of read()."""
+        try:
+            file_extension = infer_file_extension(file, name)
+            workbook_name = get_workbook_name(file, name)
+
+            if isinstance(file, Path) and not file.exists():
+                raise FileNotFoundError(f"Could not find file: {file}")
+
+            file_desc = str(file) if isinstance(file, Path) else getattr(file, "name", "BytesIO")
+            log_debug(f"Reading Excel file async: {file_desc}")
+
+            if file_extension == ContentType.XLSX or file_extension == ".xlsx":
+                documents = await asyncio.to_thread(self._read_xlsx, file, workbook_name=workbook_name)
+            elif file_extension == ContentType.XLS or file_extension == ".xls":
+                documents = await asyncio.to_thread(self._read_xls, file, workbook_name=workbook_name)
+            else:
+                raise ValueError(f"Unsupported file extension: '{file_extension}'. Expected .xlsx or .xls")
+
+            if self.chunk:
+                documents = await self.chunk_documents_async(documents)
+
+            return documents
+
+        except (FileNotFoundError, ImportError, ValueError):
+            raise
+        except Exception as e:
+            file_desc = getattr(file, "name", str(file)) if isinstance(file, IO) else file
+            log_error(f"Error reading {file_desc}: {e}")
+            return []
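A minimal usage sketch for the new ExcelReader, based on the code above; the workbook path and sheet names are hypothetical. Sheet filters accept case-insensitive names or 1-based indexes, matching the sheet_index the reader records in document metadata.

```python
from pathlib import Path

from agno.knowledge.reader.excel_reader import ExcelReader

# Keep only the "Summary" sheet and the second sheet of the workbook;
# each included, non-empty sheet becomes one Document (before chunking).
reader = ExcelReader(sheets=["summary", 2])
documents = reader.read(Path("report.xlsx"))
print(f"Loaded {len(documents)} documents")
```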