auto-coder 0.1.375__py3-none-any.whl → 0.1.376__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of auto-coder might be problematic. Click here for more details.

Files changed (51)
  1. {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/METADATA +1 -1
  2. {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/RECORD +17 -51
  3. autocoder/agent/base_agentic/base_agent.py +9 -8
  4. autocoder/auto_coder_rag.py +12 -0
  5. autocoder/models.py +2 -2
  6. autocoder/rag/cache/local_duckdb_storage_cache.py +63 -33
  7. autocoder/rag/conversation_to_queries.py +37 -5
  8. autocoder/rag/long_context_rag.py +161 -41
  9. autocoder/rag/tools/recall_tool.py +2 -1
  10. autocoder/rag/tools/search_tool.py +2 -1
  11. autocoder/rag/types.py +36 -0
  12. autocoder/utils/_markitdown.py +59 -13
  13. autocoder/version.py +1 -1
  14. autocoder/agent/agentic_edit.py +0 -833
  15. autocoder/agent/agentic_edit_tools/__init__.py +0 -28
  16. autocoder/agent/agentic_edit_tools/ask_followup_question_tool_resolver.py +0 -32
  17. autocoder/agent/agentic_edit_tools/attempt_completion_tool_resolver.py +0 -29
  18. autocoder/agent/agentic_edit_tools/base_tool_resolver.py +0 -29
  19. autocoder/agent/agentic_edit_tools/execute_command_tool_resolver.py +0 -84
  20. autocoder/agent/agentic_edit_tools/list_code_definition_names_tool_resolver.py +0 -75
  21. autocoder/agent/agentic_edit_tools/list_files_tool_resolver.py +0 -62
  22. autocoder/agent/agentic_edit_tools/plan_mode_respond_tool_resolver.py +0 -30
  23. autocoder/agent/agentic_edit_tools/read_file_tool_resolver.py +0 -36
  24. autocoder/agent/agentic_edit_tools/replace_in_file_tool_resolver.py +0 -95
  25. autocoder/agent/agentic_edit_tools/search_files_tool_resolver.py +0 -70
  26. autocoder/agent/agentic_edit_tools/use_mcp_tool_resolver.py +0 -55
  27. autocoder/agent/agentic_edit_tools/write_to_file_tool_resolver.py +0 -98
  28. autocoder/agent/agentic_edit_types.py +0 -124
  29. autocoder/auto_coder_lang.py +0 -60
  30. autocoder/auto_coder_rag_client_mcp.py +0 -170
  31. autocoder/auto_coder_rag_mcp.py +0 -193
  32. autocoder/common/llm_rerank.py +0 -84
  33. autocoder/common/model_speed_test.py +0 -392
  34. autocoder/common/v2/agent/agentic_edit_conversation.py +0 -188
  35. autocoder/common/v2/agent/ignore_utils.py +0 -50
  36. autocoder/dispacher/actions/plugins/action_translate.py +0 -214
  37. autocoder/ignorefiles/__init__.py +0 -4
  38. autocoder/ignorefiles/ignore_file_utils.py +0 -63
  39. autocoder/ignorefiles/test_ignore_file_utils.py +0 -91
  40. autocoder/linters/code_linter.py +0 -588
  41. autocoder/rag/loaders/test_image_loader.py +0 -209
  42. autocoder/rag/raw_rag.py +0 -96
  43. autocoder/rag/simple_directory_reader.py +0 -646
  44. autocoder/rag/simple_rag.py +0 -404
  45. autocoder/regex_project/__init__.py +0 -162
  46. autocoder/utils/coder.py +0 -125
  47. autocoder/utils/tests.py +0 -37
  48. {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/LICENSE +0 -0
  49. {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/WHEEL +0 -0
  50. {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/entry_points.txt +0 -0
  51. {auto_coder-0.1.375.dist-info → auto_coder-0.1.376.dist-info}/top_level.txt +0 -0
@@ -1,646 +0,0 @@
1
- """Simple reader that reads files of different formats from a directory."""
2
-
3
- import os
4
- import logging
5
- import mimetypes
6
- import multiprocessing
7
- import warnings
8
- from datetime import datetime
9
- from functools import reduce
10
- import asyncio
11
- from itertools import repeat
12
- from pathlib import Path, PurePosixPath
13
- import fsspec
14
- from fsspec.implementations.local import LocalFileSystem
15
- from typing import Any, Callable, Dict, Generator, List, Optional, Type
16
-
17
- from llama_index.core.readers.base import BaseReader
18
- from llama_index.core.async_utils import run_jobs, get_asyncio_module
19
- from llama_index.core.schema import Document
20
- from tqdm import tqdm
21
-
22
-
23
def _try_loading_included_file_formats() -> Dict[str, Type[BaseReader]]:
    """Build the default mapping from file suffix to reader class.

    Returns:
        Dict[str, Type[BaseReader]]: mapping of lowercase file extension
        (including the leading dot) to the ``BaseReader`` subclass that
        handles it.

    Raises:
        ImportError: if the optional ``llama-index-readers-file`` package
            is not installed.
    """
    try:
        from llama_index.readers.file import (
            DocxReader,
            EpubReader,
            HWPReader,
            ImageReader,
            IPYNBReader,
            MarkdownReader,
            MboxReader,
            PandasCSVReader,
            PDFReader,
            PptxReader,
            VideoAudioReader,
        )  # pants: no-infer-dep
    except ImportError as e:
        # Chain the original error so the missing-module details survive
        # in the traceback instead of being silently discarded.
        raise ImportError("`llama-index-readers-file` package not found") from e

    default_file_reader_cls: Dict[str, Type[BaseReader]] = {
        ".hwp": HWPReader,
        ".pdf": PDFReader,
        ".docx": DocxReader,
        ".pptx": PptxReader,
        ".ppt": PptxReader,
        ".pptm": PptxReader,
        ".jpg": ImageReader,
        ".png": ImageReader,
        ".jpeg": ImageReader,
        ".mp3": VideoAudioReader,
        ".mp4": VideoAudioReader,
        ".csv": PandasCSVReader,
        ".epub": EpubReader,
        ".md": MarkdownReader,
        ".mbox": MboxReader,
        ".ipynb": IPYNBReader,
    }
    return default_file_reader_cls
60
-
61
-
62
- def _format_file_timestamp(timestamp: float) -> Optional[str]:
63
- """Format file timestamp to a %Y-%m-%d string.
64
-
65
- Args:
66
- timestamp (float): timestamp in float
67
-
68
- Returns:
69
- str: formatted timestamp
70
- """
71
- try:
72
- return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d")
73
- except Exception:
74
- return None
75
-
76
-
77
def default_file_metadata_func(
    file_path: str, fs: Optional[fsspec.AbstractFileSystem] = None
) -> Dict:
    """Collect handy filesystem metadata for *file_path*.

    Args:
        file_path: path of the file to stat.
        fs: filesystem to stat through; defaults to the local filesystem.

    Returns:
        Dict containing only the non-None entries among file path / name /
        MIME type / size and creation / modification / access dates.
    """
    fs = fs or get_default_fs()
    stat_result = fs.stat(file_path)

    try:
        file_name = os.path.basename(str(stat_result["name"]))
    except Exception:
        # Some backends omit "name" in their stat dict; fall back to the
        # path the caller gave us.
        file_name = os.path.basename(file_path)

    raw_meta = {
        "file_path": file_path,
        "file_name": file_name,
        "file_type": mimetypes.guess_type(file_path)[0],
        "file_size": stat_result.get("size"),
        "creation_date": _format_file_timestamp(stat_result.get("created")),
        "last_modified_date": _format_file_timestamp(stat_result.get("mtime")),
        "last_accessed_date": _format_file_timestamp(stat_result.get("atime")),
    }

    # Drop any entry the backend could not provide.
    return {key: value for key, value in raw_meta.items() if value is not None}
112
-
113
-
114
class _DefaultFileMetadataFunc:
    """
    Default file metadata function wrapper which stores the fs.
    Allows for pickling of the function.
    """

    def __init__(self, fs: Optional[fsspec.AbstractFileSystem] = None):
        # Resolve the filesystem once so the callable is self-contained
        # (important when it is shipped to worker processes).
        self.fs = fs or get_default_fs()

    def __call__(self, file_path: str) -> Dict:
        """Return metadata for *file_path* via the stored filesystem."""
        return default_file_metadata_func(file_path, self.fs)
125
-
126
-
127
def get_default_fs() -> fsspec.AbstractFileSystem:
    """Return the default filesystem: a plain local fsspec filesystem."""
    return LocalFileSystem()
129
-
130
-
131
def is_default_fs(fs: fsspec.AbstractFileSystem) -> bool:
    """True when *fs* is a plain local filesystem without ``auto_mkdir``."""
    if not isinstance(fs, LocalFileSystem):
        return False
    return not fs.auto_mkdir
133
-
134
-
135
- logger = logging.getLogger(__name__)
136
-
137
-
138
class AutoCoderSimpleDirectoryReader(BaseReader):
    """Simple directory reader.

    Load files from file directory.
    Automatically select the best file reader given file extensions.

    Args:
        input_dir (str): Path to the directory.
        input_files (List): List of file paths to read
            (Optional; overrides input_dir, exclude)
        exclude (List): glob of python file paths to exclude (Optional)
        exclude_hidden (bool): Whether to exclude hidden files (dotfiles).
        encoding (str): Encoding of the files.
            Default is utf-8.
        errors (str): how encoding and decoding errors are to be handled,
            see https://docs.python.org/3/library/functions.html#open
        recursive (bool): Whether to recursively search in subdirectories.
            False by default.
        filename_as_id (bool): Whether to use the filename as the document id.
            False by default.
        required_exts (Optional[List[str]]): List of required extensions.
            Default is None.
        file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file
            extension to a BaseReader class that specifies how to convert that file
            to text. If not specified, use default from DEFAULT_FILE_READER_CLS.
        num_files_limit (Optional[int]): Maximum number of files to read.
            Default is None.
        file_metadata (Optional[Callable[str, Dict]]): A function that takes
            in a filename and returns a Dict of metadata for the Document.
            Default is None.
        raise_on_error (bool): Whether to raise an error if a file cannot be read.
        fs (Optional[fsspec.AbstractFileSystem]): File system to use. Defaults
            to using the local file system. Can be changed to use any remote file system
            exposed via the fsspec interface.
    """

    # Callable returning the default suffix -> reader-class mapping; stored on
    # the class so the static ``load_file`` can reach it from worker processes.
    supported_suffix_fn: Callable = _try_loading_included_file_formats

    def __init__(
        self,
        input_dir: Optional[str] = None,
        input_files: Optional[List] = None,
        exclude: Optional[List] = None,
        exclude_hidden: bool = True,
        errors: str = "ignore",
        recursive: bool = False,
        encoding: str = "utf-8",
        filename_as_id: bool = False,
        required_exts: Optional[List[str]] = None,
        file_extractor: Optional[Dict[str, BaseReader]] = None,
        num_files_limit: Optional[int] = None,
        file_metadata: Optional[Callable[[str], Dict]] = None,
        raise_on_error: bool = False,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> None:
        """Initialize with parameters."""
        super().__init__()

        if not input_dir and not input_files:
            raise ValueError("Must provide either `input_dir` or `input_files`.")

        self.fs = fs or get_default_fs()
        self.errors = errors
        self.encoding = encoding

        self.exclude = exclude
        self.recursive = recursive
        self.exclude_hidden = exclude_hidden
        self.required_exts = required_exts
        self.num_files_limit = num_files_limit
        self.raise_on_error = raise_on_error
        # Non-local filesystems (e.g. S3) always use POSIX-style paths.
        _Path = Path if is_default_fs(self.fs) else PurePosixPath

        if input_files:
            # Explicit file list: validate each path exists, skip directory scan.
            self.input_files = []
            for path in input_files:
                if not self.fs.isfile(path):
                    raise ValueError(f"File {path} does not exist.")
                input_file = _Path(path)
                self.input_files.append(input_file)
        elif input_dir:
            if not self.fs.isdir(input_dir):
                raise ValueError(f"Directory {input_dir} does not exist.")
            self.input_dir = _Path(input_dir)
            self.exclude = exclude
            self.input_files = self._add_files(self.input_dir)

        if file_extractor is not None:
            self.file_extractor = file_extractor
        else:
            self.file_extractor = {}

        self.file_metadata = file_metadata or _DefaultFileMetadataFunc(self.fs)
        self.filename_as_id = filename_as_id

    def is_hidden(self, path: Path) -> bool:
        """Return True if any component of *path* is a dotfile/dot-directory
        (the special entries ``.`` and ``..`` do not count as hidden)."""
        return any(
            part.startswith(".") and part not in [".", ".."] for part in path.parts
        )

    def _add_files(self, input_dir: Path) -> List[Path]:
        """Add files.

        Scans *input_dir* (recursively when ``self.recursive``), applies the
        exclude globs, hidden-file filter, required-extension filter and
        ``num_files_limit``, and returns the sorted surviving file paths.

        Raises:
            ValueError: when no file survives the filters.
        """
        all_files = set()
        rejected_files = set()
        rejected_dirs = set()
        # Default to POSIX paths for non-default file systems (e.g. S3)
        _Path = Path if is_default_fs(self.fs) else PurePosixPath

        if self.exclude is not None:
            for excluded_pattern in self.exclude:
                if self.recursive:
                    # Recursive glob
                    excluded_glob = _Path(input_dir) / _Path("**") / excluded_pattern
                else:
                    # Non-recursive glob
                    excluded_glob = _Path(input_dir) / excluded_pattern
                for file in self.fs.glob(str(excluded_glob)):
                    if self.fs.isdir(file):
                        rejected_dirs.add(_Path(file))
                    else:
                        rejected_files.add(_Path(file))

        file_refs: List[str] = []
        if self.recursive:
            for root, dirs, files in self.fs.walk(input_dir):
                for file in files:
                    file_refs.append(os.path.join(root, file))
        else:
            file_refs = self.fs.glob(str(input_dir) + "/*")

        for ref in file_refs:
            # Manually check if file is hidden or directory instead of
            # in glob for backwards compatibility.
            ref = _Path(ref)
            is_dir = self.fs.isdir(ref)
            skip_because_hidden = self.exclude_hidden and self.is_hidden(ref)
            skip_because_bad_ext = (
                self.required_exts is not None and ref.suffix not in self.required_exts
            )
            skip_because_excluded = ref in rejected_files
            if not skip_because_excluded:
                if is_dir:
                    ref_parent_dir = ref
                else:
                    ref_parent_dir = self.fs._parent(ref)
                # A file is also excluded when it lives under a rejected dir.
                # NOTE(review): this is a string-prefix test, so a rejected
                # dir "foo" would also shadow a sibling "foobar" — TODO confirm
                # whether that is intended.
                for rejected_dir in rejected_dirs:
                    if str(ref_parent_dir).startswith(str(rejected_dir)):
                        skip_because_excluded = True
                        logger.debug(
                            "Skipping %s because it in parent dir %s which is in %s",
                            ref,
                            ref_parent_dir,
                            rejected_dir,
                        )
                        break

            if (
                is_dir
                or skip_because_hidden
                or skip_because_bad_ext
                or skip_because_excluded
            ):
                continue
            else:
                all_files.add(ref)

        new_input_files = sorted(all_files)

        if len(new_input_files) == 0:
            raise ValueError(f"No files found in {input_dir}.")

        if self.num_files_limit is not None and self.num_files_limit > 0:
            new_input_files = new_input_files[0 : self.num_files_limit]

        # print total number of files added
        logger.debug(
            f"> [AutoCoderSimpleDirectoryReader] Total files added: {len(new_input_files)}"
        )

        return new_input_files

    def _exclude_metadata(self, documents: List[Document]) -> List[Document]:
        """Exclude metadata from documents.

        Args:
            documents (List[Document]): List of documents.
        """
        for doc in documents:
            # Keep only metadata['file_path'] in both embedding and llm content
            # str, which contain extreme important context that about the chunks.
            # Dates is provided for convenience of postprocessor such as
            # TimeWeightedPostprocessor, but excluded for embedding and LLMprompts
            doc.excluded_embed_metadata_keys.extend(
                [
                    "file_name",
                    "file_type",
                    "file_size",
                    "creation_date",
                    "last_modified_date",
                    "last_accessed_date",
                ]
            )
            doc.excluded_llm_metadata_keys.extend(
                [
                    "file_name",
                    "file_type",
                    "file_size",
                    "creation_date",
                    "last_modified_date",
                    "last_accessed_date",
                ]
            )

        return documents

    @staticmethod
    def load_file(
        input_file: Path,
        file_metadata: Callable[[str], Dict],
        file_extractor: Dict[str, BaseReader],
        filename_as_id: bool = False,
        encoding: str = "utf-8",
        errors: str = "ignore",
        raise_on_error: bool = False,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> List[Document]:
        """Static method for loading file.

        NOTE: necessarily as a static method for parallel processing.

        Args:
            input_file (Path): File path to read
            file_metadata ([Callable[str, Dict]]): A function that takes
                in a filename and returns a Dict of metadata for the Document.
            file_extractor (Dict[str, BaseReader]): A mapping of file
                extension to a BaseReader class that specifies how to convert that file
                to text.
            filename_as_id (bool): Whether to use the filename as the document id.
            encoding (str): Encoding of the files.
                Default is utf-8.
            errors (str): how encoding and decoding errors are to be handled,
                see https://docs.python.org/3/library/functions.html#open
            raise_on_error (bool): Whether to raise an error if a file cannot be read.
            fs (Optional[fsspec.AbstractFileSystem]): File system to use. Defaults
                to using the local file system. Can be changed to use any remote file system

        Returns:
            List[Document]: loaded documents
        """
        # TODO: make this less redundant
        default_file_reader_cls = AutoCoderSimpleDirectoryReader.supported_suffix_fn()
        default_file_reader_suffix = list(default_file_reader_cls.keys())
        metadata: Optional[dict] = None
        documents: List[Document] = []

        if file_metadata is not None:
            metadata = file_metadata(str(input_file))

        file_suffix = input_file.suffix.lower()
        if file_suffix in default_file_reader_suffix or file_suffix in file_extractor:
            # use file readers
            if file_suffix not in file_extractor:
                # instantiate file reader if not already
                # NOTE(review): this mutates the caller-supplied
                # ``file_extractor`` dict in place (reader instances are
                # cached there across calls) — TODO confirm intended.
                reader_cls = default_file_reader_cls[file_suffix]
                file_extractor[file_suffix] = reader_cls()
            reader = file_extractor[file_suffix]

            # load data -- catch all errors except for ImportError
            try:
                kwargs = {"extra_info": metadata}
                if fs and not is_default_fs(fs):
                    kwargs["fs"] = fs
                docs = reader.load_data(input_file, **kwargs)
            except ImportError as e:
                # ensure that ImportError is raised so user knows
                # about missing dependencies
                raise ImportError(str(e))
            except Exception as e:
                if raise_on_error:
                    raise Exception("Error loading file") from e
                # otherwise, just skip the file and report the error
                print(
                    f"Failed to load file {input_file} with error: {e}. Skipping...",
                    flush=True,
                )
                return []

            # iterate over docs if needed
            if filename_as_id:
                for i, doc in enumerate(docs):
                    doc.id_ = f"{input_file!s}_part_{i}"

            documents.extend(docs)
        else:
            # do standard read
            fs = fs or get_default_fs()
            with fs.open(input_file, errors=errors, encoding=encoding) as f:
                data = f.read().decode(encoding, errors=errors)

            doc = Document(text=data, metadata=metadata or {})
            if filename_as_id:
                doc.id_ = str(input_file)

            documents.append(doc)

        return documents

    async def aload_file(self, input_file: Path) -> List[Document]:
        """Load file asynchronously.

        Async counterpart of :meth:`load_file`, using instance state
        (``self.file_extractor``, ``self.fs``, ...) and the reader's
        ``aload_data`` coroutine.
        """
        # TODO: make this less redundant
        default_file_reader_cls = AutoCoderSimpleDirectoryReader.supported_suffix_fn()
        default_file_reader_suffix = list(default_file_reader_cls.keys())
        metadata: Optional[dict] = None
        documents: List[Document] = []

        if self.file_metadata is not None:
            metadata = self.file_metadata(str(input_file))

        file_suffix = input_file.suffix.lower()
        if (
            file_suffix in default_file_reader_suffix
            or file_suffix in self.file_extractor
        ):
            # use file readers
            if file_suffix not in self.file_extractor:
                # instantiate file reader if not already
                reader_cls = default_file_reader_cls[file_suffix]
                self.file_extractor[file_suffix] = reader_cls()
            reader = self.file_extractor[file_suffix]

            # load data -- catch all errors except for ImportError
            try:
                kwargs = {"extra_info": metadata}
                if self.fs and not is_default_fs(self.fs):
                    kwargs["fs"] = self.fs
                docs = await reader.aload_data(input_file, **kwargs)
            except ImportError as e:
                # ensure that ImportError is raised so user knows
                # about missing dependencies
                raise ImportError(str(e))
            except Exception as e:
                if self.raise_on_error:
                    raise
                # otherwise, just skip the file and report the error
                print(
                    f"Failed to load file {input_file} with error: {e}. Skipping...",
                    flush=True,
                )
                return []

            # iterate over docs if needed
            if self.filename_as_id:
                for i, doc in enumerate(docs):
                    doc.id_ = f"{input_file!s}_part_{i}"

            documents.extend(docs)
        else:
            # do standard read
            # NOTE(review): this file read is synchronous (blocks the event
            # loop for large files) — TODO confirm acceptable.
            fs = self.fs or get_default_fs()
            with fs.open(input_file, errors=self.errors, encoding=self.encoding) as f:
                data = f.read().decode(self.encoding, errors=self.errors)

            doc = Document(text=data, metadata=metadata or {})
            if self.filename_as_id:
                doc.id_ = str(input_file)

            documents.append(doc)

        return documents

    def load_data(
        self,
        show_progress: bool = False,
        num_workers: Optional[int] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> List[Document]:
        """Load data from the input directory.

        Args:
            show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
            num_workers (Optional[int]): Number of workers to parallelize data-loading over.
            fs (Optional[fsspec.AbstractFileSystem]): File system to use. If fs was specified
                in the constructor, it will override the fs parameter here.

        Returns:
            List[Document]: A list of documents.
        """
        documents = []

        files_to_process = self.input_files
        fs = fs or self.fs

        if num_workers and num_workers > 1:
            if num_workers > multiprocessing.cpu_count():
                # NOTE(review): the message says num_workers is clamped to the
                # CPU count, but no clamp is actually applied below — TODO
                # confirm whether the pool should be created with cpu_count().
                warnings.warn(
                    "Specified num_workers exceed number of CPUs in the system. "
                    "Setting `num_workers` down to the maximum CPU count."
                )
            # "spawn" (not "fork") so workers re-import cleanly; load_file is
            # static so it pickles without dragging `self` along.
            with multiprocessing.get_context("spawn").Pool(num_workers) as p:
                results = p.starmap(
                    AutoCoderSimpleDirectoryReader.load_file,
                    zip(
                        files_to_process,
                        repeat(self.file_metadata),
                        repeat(self.file_extractor),
                        repeat(self.filename_as_id),
                        repeat(self.encoding),
                        repeat(self.errors),
                        repeat(self.raise_on_error),
                        repeat(fs),
                    ),
                )
                documents = reduce(lambda x, y: x + y, results)

        else:
            if show_progress:
                files_to_process = tqdm(
                    self.input_files, desc="Loading files", unit="file"
                )
            for input_file in files_to_process:
                documents.extend(
                    AutoCoderSimpleDirectoryReader.load_file(
                        input_file=input_file,
                        file_metadata=self.file_metadata,
                        file_extractor=self.file_extractor,
                        filename_as_id=self.filename_as_id,
                        encoding=self.encoding,
                        errors=self.errors,
                        raise_on_error=self.raise_on_error,
                        fs=fs,
                    )
                )

        return self._exclude_metadata(documents)

    async def aload_data(
        self,
        show_progress: bool = False,
        num_workers: Optional[int] = None,
        fs: Optional[fsspec.AbstractFileSystem] = None,
    ) -> List[Document]:
        """Load data from the input directory.

        Args:
            show_progress (bool): Whether to show tqdm progress bars. Defaults to False.
            num_workers (Optional[int]): Number of workers to parallelize data-loading over.
            fs (Optional[fsspec.AbstractFileSystem]): File system to use. If fs was specified
                in the constructor, it will override the fs parameter here.

        Returns:
            List[Document]: A list of documents.
        """
        files_to_process = self.input_files
        fs = fs or self.fs

        # One coroutine per file; run_jobs bounds concurrency when
        # num_workers is given, otherwise gather everything at once.
        coroutines = [self.aload_file(input_file) for input_file in files_to_process]
        if num_workers:
            document_lists = await run_jobs(
                coroutines, show_progress=show_progress, workers=num_workers
            )
        elif show_progress:
            _asyncio = get_asyncio_module(show_progress=show_progress)
            document_lists = await _asyncio.gather(*coroutines)
        else:
            document_lists = await asyncio.gather(*coroutines)
        documents = [doc for doc_list in document_lists for doc in doc_list]

        return self._exclude_metadata(documents)

    def iter_data(
        self, show_progress: bool = False
    ) -> Generator[List[Document], Any, Any]:
        """Load data iteratively from the input directory.

        Yields the documents of one input file at a time (empty batches are
        skipped), which keeps memory bounded for large directories.

        Args:
            show_progress (bool): Whether to show tqdm progress bars. Defaults to False.

        Returns:
            Generator[List[Document]]: A list of documents.
        """
        files_to_process = self.input_files

        if show_progress:
            files_to_process = tqdm(self.input_files, desc="Loading files", unit="file")

        for input_file in files_to_process:
            documents = AutoCoderSimpleDirectoryReader.load_file(
                input_file=input_file,
                file_metadata=self.file_metadata,
                file_extractor=self.file_extractor,
                filename_as_id=self.filename_as_id,
                encoding=self.encoding,
                errors=self.errors,
                raise_on_error=self.raise_on_error,
                fs=self.fs,
            )

            documents = self._exclude_metadata(documents)

            if len(documents) > 0:
                yield documents