auto-coder 0.1.375__py3-none-any.whl → 0.1.376__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of auto-coder might be problematic. Click here for more details.
- {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/METADATA +1 -1
- {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/RECORD +17 -51
- autocoder/agent/base_agentic/base_agent.py +9 -8
- autocoder/auto_coder_rag.py +12 -0
- autocoder/models.py +2 -2
- autocoder/rag/cache/local_duckdb_storage_cache.py +63 -33
- autocoder/rag/conversation_to_queries.py +37 -5
- autocoder/rag/long_context_rag.py +161 -41
- autocoder/rag/tools/recall_tool.py +2 -1
- autocoder/rag/tools/search_tool.py +2 -1
- autocoder/rag/types.py +36 -0
- autocoder/utils/_markitdown.py +59 -13
- autocoder/version.py +1 -1
- autocoder/agent/agentic_edit.py +0 -833
- autocoder/agent/agentic_edit_tools/__init__.py +0 -28
- autocoder/agent/agentic_edit_tools/ask_followup_question_tool_resolver.py +0 -32
- autocoder/agent/agentic_edit_tools/attempt_completion_tool_resolver.py +0 -29
- autocoder/agent/agentic_edit_tools/base_tool_resolver.py +0 -29
- autocoder/agent/agentic_edit_tools/execute_command_tool_resolver.py +0 -84
- autocoder/agent/agentic_edit_tools/list_code_definition_names_tool_resolver.py +0 -75
- autocoder/agent/agentic_edit_tools/list_files_tool_resolver.py +0 -62
- autocoder/agent/agentic_edit_tools/plan_mode_respond_tool_resolver.py +0 -30
- autocoder/agent/agentic_edit_tools/read_file_tool_resolver.py +0 -36
- autocoder/agent/agentic_edit_tools/replace_in_file_tool_resolver.py +0 -95
- autocoder/agent/agentic_edit_tools/search_files_tool_resolver.py +0 -70
- autocoder/agent/agentic_edit_tools/use_mcp_tool_resolver.py +0 -55
- autocoder/agent/agentic_edit_tools/write_to_file_tool_resolver.py +0 -98
- autocoder/agent/agentic_edit_types.py +0 -124
- autocoder/auto_coder_lang.py +0 -60
- autocoder/auto_coder_rag_client_mcp.py +0 -170
- autocoder/auto_coder_rag_mcp.py +0 -193
- autocoder/common/llm_rerank.py +0 -84
- autocoder/common/model_speed_test.py +0 -392
- autocoder/common/v2/agent/agentic_edit_conversation.py +0 -188
- autocoder/common/v2/agent/ignore_utils.py +0 -50
- autocoder/dispacher/actions/plugins/action_translate.py +0 -214
- autocoder/ignorefiles/__init__.py +0 -4
- autocoder/ignorefiles/ignore_file_utils.py +0 -63
- autocoder/ignorefiles/test_ignore_file_utils.py +0 -91
- autocoder/linters/code_linter.py +0 -588
- autocoder/rag/loaders/test_image_loader.py +0 -209
- autocoder/rag/raw_rag.py +0 -96
- autocoder/rag/simple_directory_reader.py +0 -646
- autocoder/rag/simple_rag.py +0 -404
- autocoder/regex_project/__init__.py +0 -162
- autocoder/utils/coder.py +0 -125
- autocoder/utils/tests.py +0 -37
- {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/LICENSE +0 -0
- {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/WHEEL +0 -0
- {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/entry_points.txt +0 -0
- {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/top_level.txt +0 -0
|
@@ -1,646 +0,0 @@
|
|
|
1
|
-
"""Simple reader that reads files of different formats from a directory."""
|
|
2
|
-
|
|
3
|
-
import os
|
|
4
|
-
import logging
|
|
5
|
-
import mimetypes
|
|
6
|
-
import multiprocessing
|
|
7
|
-
import warnings
|
|
8
|
-
from datetime import datetime
|
|
9
|
-
from functools import reduce
|
|
10
|
-
import asyncio
|
|
11
|
-
from itertools import repeat
|
|
12
|
-
from pathlib import Path, PurePosixPath
|
|
13
|
-
import fsspec
|
|
14
|
-
from fsspec.implementations.local import LocalFileSystem
|
|
15
|
-
from typing import Any, Callable, Dict, Generator, List, Optional, Type
|
|
16
|
-
|
|
17
|
-
from llama_index.core.readers.base import BaseReader
|
|
18
|
-
from llama_index.core.async_utils import run_jobs, get_asyncio_module
|
|
19
|
-
from llama_index.core.schema import Document
|
|
20
|
-
from tqdm import tqdm
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]:
    """Return the built-in mapping from file suffix to reader class.

    The reader classes are imported lazily from ``llama_index.readers.file``
    so that this module can be imported without that package installed.

    Returns:
        A dict keyed by lowercase file suffix (including the leading dot)
        whose values are reader classes (not instances).

    Raises:
        ImportError: if ``llama_index.readers.file`` cannot be imported.
    """
    try:
        from llama_index.readers.file import (
            DocxReader,
            EpubReader,
            HWPReader,
            ImageReader,
            IPYNBReader,
            MarkdownReader,
            MboxReader,
            PandasCSVReader,
            PDFReader,
            PptxReader,
            VideoAudioReader,
        )  # pants: no-infer-dep
    except ImportError:
        raise ImportError("`llama-index-readers-file` package not found")

    # Listed as pairs so several suffixes can visibly share one reader.
    suffix_reader_pairs = (
        (".hwp", HWPReader),
        (".pdf", PDFReader),
        (".docx", DocxReader),
        (".pptx", PptxReader),
        (".ppt", PptxReader),
        (".pptm", PptxReader),
        (".jpg", ImageReader),
        (".png", ImageReader),
        (".jpeg", ImageReader),
        (".mp3", VideoAudioReader),
        (".mp4", VideoAudioReader),
        (".csv", PandasCSVReader),
        (".epub", EpubReader),
        (".md", MarkdownReader),
        (".mbox", MboxReader),
        (".ipynb", IPYNBReader),
    )
    return dict(suffix_reader_pairs)
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
def _format_file_timestamp(timestamp: float) -> Optional[str]:
|
|
63
|
-
"""Format file timestamp to a %Y-%m-%d string.
|
|
64
|
-
|
|
65
|
-
Args:
|
|
66
|
-
timestamp (float): timestamp in float
|
|
67
|
-
|
|
68
|
-
Returns:
|
|
69
|
-
str: formatted timestamp
|
|
70
|
-
"""
|
|
71
|
-
try:
|
|
72
|
-
return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
|
|
73
|
-
except Exception:
|
|
74
|
-
return None
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
def default_file_metadata_func(
    file_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
) -> Dict:
    """Collect basic metadata about a file from the filesystem.

    Args:
        file_path: path of the file to describe.
        fs: filesystem used for the ``stat`` call; when falsy, the result of
            ``get_default_fs()`` is used.

    Returns:
        A dict that may contain ``file_path``, ``file_name``, ``file_type``,
        ``file_size``, ``creation_date``, ``last_modified_date`` and
        ``last_accessed_date``. Entries whose value is ``None`` are omitted.
    """
    if not fs:
        fs = get_default_fs()
    info = fs.stat(file_path)

    # Prefer the name reported by the filesystem; fall back to the argument.
    try:
        base_name = os.path.basename(str(info["name"]))
    except Exception:
        base_name = os.path.basename(file_path)

    created = _format_file_timestamp(info.get("created"))
    modified = _format_file_timestamp(info.get("mtime"))
    accessed = _format_file_timestamp(info.get("atime"))

    candidates = (
        ("file_path", file_path),
        ("file_name", base_name),
        ("file_type", mimetypes.guess_type(file_path)[0]),
        ("file_size", info.get("size")),
        ("creation_date", created),
        ("last_modified_date", modified),
        ("last_accessed_date", accessed),
    )

    # Drop anything the filesystem could not tell us.
    return {key: value for key, value in candidates if value is not None}
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
class _DefaultFileMetadataFunc:
    """
    Callable wrapper around ``default_file_metadata_func`` that remembers
    the filesystem to use.

    Being a plain object rather than a closure allows it to be pickled.
    """

    def __init__(self, fs: Optional[fsspec.AbstractFileSystem] = None):
        # Fall back to the default filesystem when none (or a falsy one) is given.
        if not fs:
            fs = get_default_fs()
        self.fs = fs

    def __call__(self, file_path: str) -> Dict:
        metadata = default_file_metadata_func(file_path, fs=self.fs)
        return metadata
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
def get_default_fs() -> fsspec.AbstractFileSystem:
    """Create and return a new ``LocalFileSystem`` instance."""
    local_fs = LocalFileSystem()
    return local_fs
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool:
    """Return True when ``fs`` is a ``LocalFileSystem`` without ``auto_mkdir``."""
    if not isinstance(fs, LocalFileSystem):
        return False
    return not fs.auto_mkdir
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
class AutoCoderSimpleDirectoryReader(BaseReader):
    """Simple directory reader.

    Load files from file directory.
    Automatically select the best file reader given file extensions.

    Args:
        input_dir (str): Path to the directory.
        input_files (List): List of file paths to read
            (Optional; overrides input_dir, exclude)
        exclude (List): glob of python file paths to exclude (Optional)
        exclude_hidden (bool): Whether to exclude hidden files (dotfiles).
        encoding (str): Encoding of the files.
            Default is utf-8.
        errors (str): how encoding and decoding errors are to be handled,
              see https://docs.python.org/3/library/functions.html#open
        recursive (bool): Whether to recursively search in subdirectories.
            False by default.
        filename_as_id (bool): Whether to use the filename as the document id.
            False by default.
        required_exts (Optional[List[str]]): List of required extensions.
            Default is None.
        file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file
            extension to a BaseReader instance that specifies how to convert
            that file to text. Suffixes missing from the mapping fall back to
            the classes returned by ``supported_suffix_fn``.
        num_files_limit (Optional[int]): Maximum number of files to read.
            Default is None.
        file_metadata (Optional[Callable[str, Dict]]): A function that takes
            in a filename and returns a Dict of metadata for the Document.
            When not given, a ``_DefaultFileMetadataFunc`` bound to ``fs`` is used.
        raise_on_error (bool): Whether to raise an error if a file cannot be read.
        fs (Optional[fsspec.AbstractFileSystem]): File system to use. Defaults
            to using the local file system. Can be changed to use any remote file system
            exposed via the fsspec interface.
    """

    # Called through the class (never bound) to get the default suffix -> reader map.
    supported_suffix_fn: Callable = _try_loading_included_file_formats

    def __init__(
        self,
        input_dir: Optional[str] = None,
        input_files: Optional[List] = None,
        exclude: Optional[List] = None,
        exclude_hidden: bool = True,
        errors: str = "ignore",
        recursive: bool = False,
        encoding: str = "utf-8",
        filename_as_id: bool = False,
        required_exts: Optional[List[str]] = None,
        file_extractor: Optional[Dict[str, BaseReader]] = None,
        num_files_limit: Optional[int] = None,
        file_metadata: Optional[Callable[[str], Dict]] = None,
        raise_on_error: bool = False,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> None:
        """Initialize with parameters.

        Raises:
            ValueError: if neither ``input_dir`` nor ``input_files`` is given,
                if a listed file or the directory does not exist, or if the
                directory scan finds no files.
        """
        super().__init__()

        if not input_dir and not input_files:
            raise ValueError("Must provide either `input_dir` or `input_files`.")

        self.fs = fs or get_default_fs()
        self.errors = errors
        self.encoding = encoding

        # These must be set before _add_files() runs below.
        self.exclude = exclude
        self.recursive = recursive
        self.exclude_hidden = exclude_hidden
        self.required_exts = required_exts
        self.num_files_limit = num_files_limit
        self.raise_on_error = raise_on_error
        # POSIX-style pure paths for non-default file systems (e.g. S3).
        _Path = Path if is_default_fs(self.fs) else PurePosixPath

        if input_files:
            # Explicit file list wins over input_dir / exclude.
            self.input_files = []
            for path in input_files:
                if not self.fs.isfile(path):
                    raise ValueError(f"File {path} does not exist.")
                self.input_files.append(_Path(path))
        elif input_dir:
            if not self.fs.isdir(input_dir):
                raise ValueError(f"Directory {input_dir} does not exist.")
            self.input_dir = _Path(input_dir)
            self.input_files = self._add_files(self.input_dir)

        self.file_extractor = file_extractor if file_extractor is not None else {}
        self.file_metadata = file_metadata or _DefaultFileMetadataFunc(self.fs)
        self.filename_as_id = filename_as_id

    def is_hidden(self, path: Path) -> bool:
        """Return True if any component of ``path`` starts with a dot.

        The special components ``.`` and ``..`` are not treated as hidden.
        """
        return any(
            part.startswith(".") and part not in [".", ".."] for part in path.parts
        )

    def _add_files(self, input_dir: Path) -> List[Path]:
        """Collect the files under ``input_dir`` that pass all filters.

        Directories, hidden paths (when ``exclude_hidden``), suffixes outside
        ``required_exts`` and anything matched by ``exclude`` are skipped.
        The result is sorted and truncated to ``num_files_limit`` when that
        is a positive number.

        Raises:
            ValueError: if no file remains after filtering.
        """
        all_files = set()
        rejected_files = set()
        rejected_dirs = set()
        # Default to POSIX paths for non-default file systems (e.g. S3)
        _Path = Path if is_default_fs(self.fs) else PurePosixPath

        if self.exclude is not None:
            for excluded_pattern in self.exclude:
                if self.recursive:
                    # Recursive glob
                    excluded_glob = _Path(input_dir) / _Path("**") / excluded_pattern
                else:
                    # Non-recursive glob
                    excluded_glob = _Path(input_dir) / excluded_pattern
                for file in self.fs.glob(str(excluded_glob)):
                    if self.fs.isdir(file):
                        rejected_dirs.add(_Path(file))
                    else:
                        rejected_files.add(_Path(file))

        file_refs: List[str] = []
        if self.recursive:
            for root, _dirs, files in self.fs.walk(input_dir):
                for file in files:
                    file_refs.append(os.path.join(root, file))
        else:
            file_refs = self.fs.glob(str(input_dir) + "/*")

        for ref in file_refs:
            # Manually check if file is hidden or directory instead of
            # in glob for backwards compatibility.
            ref = _Path(ref)
            is_dir = self.fs.isdir(ref)
            skip_because_hidden = self.exclude_hidden and self.is_hidden(ref)
            skip_because_bad_ext = (
                self.required_exts is not None and ref.suffix not in self.required_exts
            )
            skip_because_excluded = ref in rejected_files
            if not skip_because_excluded:
                ref_parent_dir = ref if is_dir else _Path(self.fs._parent(ref))
                for rejected_dir in rejected_dirs:
                    # Compare whole path components, not string prefixes:
                    # excluding "a/foo" must not also exclude "a/foobar".
                    if (
                        ref_parent_dir == rejected_dir
                        or rejected_dir in ref_parent_dir.parents
                    ):
                        skip_because_excluded = True
                        logger.debug(
                            "Skipping %s because it in parent dir %s which is in %s",
                            ref,
                            ref_parent_dir,
                            rejected_dir,
                        )
                        break

            if (
                is_dir
                or skip_because_hidden
                or skip_because_bad_ext
                or skip_because_excluded
            ):
                continue
            all_files.add(ref)

        new_input_files = sorted(all_files)

        if not new_input_files:
            raise ValueError(f"No files found in {input_dir}.")

        if self.num_files_limit is not None and self.num_files_limit > 0:
            new_input_files = new_input_files[0 : self.num_files_limit]

        # Report the total number of files added.
        logger.debug(
            "> [AutoCoderSimpleDirectoryReader] Total files added: %d",
            len(new_input_files),
        )

        return new_input_files

    def _exclude_metadata(self, documents: List[Document]) -> List[Document]:
        """Mark file-level metadata as excluded from embedding and LLM text.

        The documents are modified in place and the same list is returned.

        Args:
            documents (List[Document]): List of documents.
        """
        # Only metadata['file_path'] stays visible in both the embedding and
        # the LLM content string. Dates are kept in metadata for
        # postprocessors such as TimeWeightedPostprocessor, but are excluded
        # from embedding and LLM prompts.
        hidden_keys = [
            "file_name",
            "file_type",
            "file_size",
            "creation_date",
            "last_modified_date",
            "last_accessed_date",
        ]
        for doc in documents:
            doc.excluded_embed_metadata_keys.extend(hidden_keys)
            doc.excluded_llm_metadata_keys.extend(hidden_keys)

        return documents

    @staticmethod
    def load_file(
        input_file: Path,
        file_metadata: Callable[[str], Dict],
        file_extractor: Dict[str, BaseReader],
        filename_as_id: bool = False,
        encoding: str = "utf-8",
        errors: str = "ignore",
        raise_on_error: bool = False,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> List[Document]:
        """Static method for loading file.

        NOTE: necessarily as a static method for parallel processing.

        Args:
            input_file (Path): File path to read
            file_metadata ([Callable[str, Dict]]): A function that takes
                in a filename and returns a Dict of metadata for the Document.
            file_extractor (Dict[str, BaseReader]): A mapping of file
                extension to a BaseReader instance that specifies how to
                convert that file to text. A reader instantiated here for a
                default suffix is stored back into this dict.
            filename_as_id (bool): Whether to use the filename as the document id.
            encoding (str): Encoding of the files.
                Default is utf-8.
            errors (str): how encoding and decoding errors are to be handled,
                  see https://docs.python.org/3/library/functions.html#open
            raise_on_error (bool): Whether to raise an error if a file cannot be read.
            fs (Optional[fsspec.AbstractFileSystem]): File system to use. Defaults
                to using the local file system. Can be changed to use any remote file system

        Returns:
            List[Document]: loaded documents (empty if the reader failed and
            ``raise_on_error`` is False)

        Raises:
            ImportError: if the reader reports a missing dependency.
            Exception: if the reader fails and ``raise_on_error`` is True.
        """
        # TODO: make this less redundant
        default_file_reader_cls = AutoCoderSimpleDirectoryReader.supported_suffix_fn()
        default_file_reader_suffix = list(default_file_reader_cls.keys())
        metadata: Optional[dict] = None
        documents: List[Document] = []

        if file_metadata is not None:
            metadata = file_metadata(str(input_file))

        file_suffix = input_file.suffix.lower()
        if file_suffix in default_file_reader_suffix or file_suffix in file_extractor:
            # use file readers
            if file_suffix not in file_extractor:
                # instantiate file reader if not already
                reader_cls = default_file_reader_cls[file_suffix]
                file_extractor[file_suffix] = reader_cls()
            reader = file_extractor[file_suffix]

            # load data -- catch all errors except for ImportError
            try:
                kwargs = {"extra_info": metadata}
                if fs and not is_default_fs(fs):
                    kwargs["fs"] = fs
                docs = reader.load_data(input_file, **kwargs)
            except ImportError as e:
                # ensure that ImportError is raised so user knows
                # about missing dependencies
                raise ImportError(str(e)) from e
            except Exception as e:
                if raise_on_error:
                    raise Exception("Error loading file") from e
                # otherwise, just skip the file and report the error
                print(
                    f"Failed to load file {input_file} with error: {e}. Skipping...",
                    flush=True,
                )
                return []

            # iterate over docs if needed
            if filename_as_id:
                for i, doc in enumerate(docs):
                    doc.id_ = f"{input_file!s}_part_{i}"

            documents.extend(docs)
        else:
            # do standard read
            fs = fs or get_default_fs()
            with fs.open(input_file, errors=errors, encoding=encoding) as f:
                data = f.read().decode(encoding, errors=errors)

            doc = Document(text=data, metadata=metadata or {})
            if filename_as_id:
                doc.id_ = str(input_file)

            documents.append(doc)

        return documents

    async def aload_file(self, input_file: Path) -> List[Document]:
        """Load file asynchronously.

        Mirrors ``load_file`` but takes its settings from the instance and
        awaits the reader's ``aload_data``. When a reader fails and
        ``raise_on_error`` is set, the original exception is re-raised.
        """
        # TODO: make this less redundant
        default_file_reader_cls = AutoCoderSimpleDirectoryReader.supported_suffix_fn()
        default_file_reader_suffix = list(default_file_reader_cls.keys())
        metadata: Optional[dict] = None
        documents: List[Document] = []

        if self.file_metadata is not None:
            metadata = self.file_metadata(str(input_file))

        file_suffix = input_file.suffix.lower()
        if (
            file_suffix in default_file_reader_suffix
            or file_suffix in self.file_extractor
        ):
            # use file readers
            if file_suffix not in self.file_extractor:
                # instantiate file reader if not already
                reader_cls = default_file_reader_cls[file_suffix]
                self.file_extractor[file_suffix] = reader_cls()
            reader = self.file_extractor[file_suffix]

            # load data -- catch all errors except for ImportError
            try:
                kwargs = {"extra_info": metadata}
                if self.fs and not is_default_fs(self.fs):
                    kwargs["fs"] = self.fs
                docs = await reader.aload_data(input_file, **kwargs)
            except ImportError as e:
                # ensure that ImportError is raised so user knows
                # about missing dependencies
                raise ImportError(str(e)) from e
            except Exception as e:
                if self.raise_on_error:
                    raise
                # otherwise, just skip the file and report the error
                print(
                    f"Failed to load file {input_file} with error: {e}. Skipping...",
                    flush=True,
                )
                return []

            # iterate over docs if needed
            if self.filename_as_id:
                for i, doc in enumerate(docs):
                    doc.id_ = f"{input_file!s}_part_{i}"

            documents.extend(docs)
        else:
            # do standard read
            fs = self.fs or get_default_fs()
            with fs.open(input_file, errors=self.errors, encoding=self.encoding) as f:
                data = f.read().decode(self.encoding, errors=self.errors)

            doc = Document(text=data, metadata=metadata or {})
            if self.filename_as_id:
                doc.id_ = str(input_file)

            documents.append(doc)

        return documents

    def load_data(
        self,
        show_progress: bool = False,
        num_workers: Optional[int] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> List[Document]:
        """Load data from the input directory.

        Args:
            show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
                Only used when loading sequentially.
            num_workers (Optional[int]): Number of workers to parallelize data-loading over.
                Values above the CPU count are lowered to the CPU count.
            fs (Optional[fsspec.AbstractFileSystem]): File system to use. When given,
                it takes precedence over the one passed to the constructor.

        Returns:
            List[Document]: A list of documents.
        """
        documents = []

        files_to_process = self.input_files
        fs = fs or self.fs

        if num_workers and num_workers > 1:
            cpu_count = multiprocessing.cpu_count()
            if num_workers > cpu_count:
                warnings.warn(
                    "Specified num_workers exceed number of CPUs in the system. "
                    "Setting `num_workers` down to the maximum CPU count."
                )
                # Actually apply the clamp the warning announces.
                num_workers = cpu_count
            with multiprocessing.get_context("spawn").Pool(num_workers) as p:
                results = p.starmap(
                    AutoCoderSimpleDirectoryReader.load_file,
                    zip(
                        files_to_process,
                        repeat(self.file_metadata),
                        repeat(self.file_extractor),
                        repeat(self.filename_as_id),
                        repeat(self.encoding),
                        repeat(self.errors),
                        repeat(self.raise_on_error),
                        repeat(fs),
                    ),
                )
            # Flatten the per-file lists in one linear pass.
            documents = [doc for file_docs in results for doc in file_docs]

        else:
            if show_progress:
                files_to_process = tqdm(
                    self.input_files, desc="Loading files", unit="file"
                )
            for input_file in files_to_process:
                documents.extend(
                    AutoCoderSimpleDirectoryReader.load_file(
                        input_file=input_file,
                        file_metadata=self.file_metadata,
                        file_extractor=self.file_extractor,
                        filename_as_id=self.filename_as_id,
                        encoding=self.encoding,
                        errors=self.errors,
                        raise_on_error=self.raise_on_error,
                        fs=fs,
                    )
                )

        return self._exclude_metadata(documents)

    async def aload_data(
        self,
        show_progress: bool = False,
        num_workers: Optional[int] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> List[Document]:
        """Load data from the input directory.

        Args:
            show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
            num_workers (Optional[int]): Number of workers to parallelize data-loading over.
            fs (Optional[fsspec.AbstractFileSystem]): Accepted for signature
                parity with ``load_data``. It is currently not used here:
                ``aload_file`` always reads through the constructor's file system.

        Returns:
            List[Document]: A list of documents.
        """
        coroutines = [self.aload_file(input_file) for input_file in self.input_files]
        if num_workers:
            document_lists = await run_jobs(
                coroutines, show_progress=show_progress, workers=num_workers
            )
        elif show_progress:
            _asyncio = get_asyncio_module(show_progress=show_progress)
            document_lists = await _asyncio.gather(*coroutines)
        else:
            document_lists = await asyncio.gather(*coroutines)
        documents = [doc for doc_list in document_lists for doc in doc_list]

        return self._exclude_metadata(documents)

    def iter_data(
        self, show_progress: bool = False
    ) -> Generator[List[Document], Any, Any]:
        """Load data iteratively from the input directory.

        Args:
            show_progress (bool): Whether to show tqdm progress bars. Defaults to False.

        Returns:
            Generator[List[Document]]: Yields the documents of one input file
            at a time; files that produce no documents are skipped.
        """
        files_to_process = self.input_files

        if show_progress:
            files_to_process = tqdm(self.input_files, desc="Loading files", unit="file")

        for input_file in files_to_process:
            documents = AutoCoderSimpleDirectoryReader.load_file(
                input_file=input_file,
                file_metadata=self.file_metadata,
                file_extractor=self.file_extractor,
                filename_as_id=self.filename_as_id,
                encoding=self.encoding,
                errors=self.errors,
                raise_on_error=self.raise_on_error,
                fs=self.fs,
            )

            documents = self._exclude_metadata(documents)

            if documents:
                yield documents
|