abstract-utilities 0.2.2.513__py3-none-any.whl → 0.2.2.627__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. abstract_utilities/__init__.py +11 -3
  2. abstract_utilities/class_utils/caller_utils.py +19 -0
  3. abstract_utilities/class_utils/global_utils.py +35 -20
  4. abstract_utilities/class_utils/imports/imports.py +1 -1
  5. abstract_utilities/directory_utils/__init__.py +2 -4
  6. abstract_utilities/directory_utils/imports/__init__.py +2 -0
  7. abstract_utilities/directory_utils/imports/imports.py +1 -0
  8. abstract_utilities/directory_utils/imports/module_imports.py +2 -0
  9. abstract_utilities/directory_utils/src/__init__.py +4 -0
  10. abstract_utilities/directory_utils/src/directory_utils.py +110 -0
  11. abstract_utilities/directory_utils/src/name_utils.py +43 -0
  12. abstract_utilities/directory_utils/src/size_utils.py +57 -0
  13. abstract_utilities/directory_utils/src/utils.py +116 -0
  14. abstract_utilities/file_utils/imports/constants.py +81 -7
  15. abstract_utilities/file_utils/imports/imports.py +0 -4
  16. abstract_utilities/file_utils/imports/module_imports.py +1 -1
  17. abstract_utilities/file_utils/src/__init__.py +2 -4
  18. abstract_utilities/file_utils/src/file_filters/__init__.py +4 -0
  19. abstract_utilities/file_utils/src/file_filters/ensure_utils.py +118 -0
  20. abstract_utilities/file_utils/src/file_filters/filter_params.py +86 -0
  21. abstract_utilities/file_utils/src/file_filters/filter_utils.py +78 -0
  22. abstract_utilities/file_utils/src/file_filters/predicate_utils.py +116 -0
  23. abstract_utilities/file_utils/src/file_filters.py +114 -47
  24. abstract_utilities/file_utils/src/file_reader.py +0 -64
  25. abstract_utilities/file_utils/src/file_utils.py +7 -130
  26. abstract_utilities/file_utils/src/filter_params.py +128 -86
  27. abstract_utilities/file_utils/src/find_collect.py +85 -165
  28. abstract_utilities/file_utils/src/find_content.py +210 -0
  29. abstract_utilities/file_utils/src/initFunctionsGen.py +35 -28
  30. abstract_utilities/file_utils/src/initFunctionsGens.py +280 -0
  31. abstract_utilities/file_utils/src/reader_utils/__init__.py +4 -0
  32. abstract_utilities/file_utils/src/reader_utils/directory_reader.py +53 -0
  33. abstract_utilities/file_utils/src/reader_utils/file_reader.py +543 -0
  34. abstract_utilities/file_utils/src/reader_utils/file_readers.py +376 -0
  35. abstract_utilities/file_utils/src/reader_utils/imports.py +18 -0
  36. abstract_utilities/file_utils/src/reader_utils/pdf_utils.py +300 -0
  37. abstract_utilities/file_utils (2)/__init__.py +2 -0
  38. abstract_utilities/file_utils (2)/imports/__init__.py +2 -0
  39. abstract_utilities/file_utils (2)/imports/constants.py +118 -0
  40. abstract_utilities/file_utils (2)/imports/imports/__init__.py +3 -0
  41. abstract_utilities/file_utils (2)/imports/imports/constants.py +119 -0
  42. abstract_utilities/file_utils (2)/imports/imports/imports.py +46 -0
  43. abstract_utilities/file_utils (2)/imports/imports/module_imports.py +8 -0
  44. abstract_utilities/file_utils (2)/imports/utils/__init__.py +3 -0
  45. abstract_utilities/file_utils (2)/imports/utils/classes.py +379 -0
  46. abstract_utilities/file_utils (2)/imports/utils/clean_imps.py +155 -0
  47. abstract_utilities/file_utils (2)/imports/utils/filter_utils.py +341 -0
  48. abstract_utilities/file_utils (2)/src/__init__.py +8 -0
  49. abstract_utilities/file_utils (2)/src/file_filters.py +155 -0
  50. abstract_utilities/file_utils (2)/src/file_reader.py +604 -0
  51. abstract_utilities/file_utils (2)/src/find_collect.py +258 -0
  52. abstract_utilities/file_utils (2)/src/initFunctionsGen.py +286 -0
  53. abstract_utilities/file_utils (2)/src/map_utils.py +28 -0
  54. abstract_utilities/file_utils (2)/src/pdf_utils.py +300 -0
  55. abstract_utilities/import_utils/circular_import_finder.py +222 -0
  56. abstract_utilities/import_utils/circular_import_finder2.py +118 -0
  57. abstract_utilities/import_utils/imports/module_imports.py +3 -1
  58. abstract_utilities/import_utils/src/clean_imports.py +156 -25
  59. abstract_utilities/import_utils/src/dot_utils.py +11 -0
  60. abstract_utilities/import_utils/src/extract_utils.py +4 -0
  61. abstract_utilities/import_utils/src/import_functions.py +66 -2
  62. abstract_utilities/import_utils/src/pkg_utils.py +58 -4
  63. abstract_utilities/import_utils/src/sysroot_utils.py +56 -1
  64. abstract_utilities/log_utils/log_file.py +73 -24
  65. abstract_utilities/parse_utils/parse_utils.py +23 -0
  66. abstract_utilities/path_utils/path_utils.py +25 -23
  67. abstract_utilities/read_write_utils/imports/imports.py +1 -1
  68. abstract_utilities/read_write_utils/read_write_utils.py +99 -31
  69. abstract_utilities/safe_utils/safe_utils.py +30 -0
  70. {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.627.dist-info}/METADATA +1 -1
  71. {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.627.dist-info}/RECORD +73 -32
  72. {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.627.dist-info}/WHEEL +0 -0
  73. {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.627.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,604 @@
+ # file_reader.py
+ from ..imports import *
+ # -------- Public API drop-ins that mirror your originals --------
+ from .pdf_utils import *
+ # ---------------------------------------------------------------------------
+ # NOTE: The following helper functions must be provided elsewhere:
+ #   - convert_date_string(s: str) -> datetime
+ #   - read_from_file(path: str) -> pd.DataFrame
+ # ---------------------------------------------------------------------------
+ def _should_skip_dir(dir_name: str, exclude_dirs: set[str]) -> bool:
+     """
+     Return True if dir_name matches one of the excluded directory names exactly.
+     """
+     return dir_name in exclude_dirs
+
+
+ def _should_skip_file(filename: str, exclude_patterns: set[str]) -> bool:
+     """
+     Return True if filename matches any pattern in exclude_patterns.
+     Uses fnmatch (Unix-style wildcard matching).
+     """
+     for pat in exclude_patterns:
+         if fnmatch.fnmatch(filename, pat):
+             return True
+     return False
+
+ def _should_skip_type(filename: str, exclude_types: set[str]) -> bool:
+     """
+     Return True if filename's media type matches one of exclude_types.
+     Delegates to is_media_type().
+     """
+     return is_media_type(filename, media_types=exclude_types)
+ class shoudSkipManager(metaclass=SingletonMeta):
+     def __init__(self, exclude_types=None, exclude_file_patterns=None, exclude_dirs=None):
+         if not hasattr(self, 'initialized') or self.initialized == False:
+             self.initialized = True
+             exclude_types = exclude_types or set()
+             exclude_file_patterns = exclude_file_patterns or set()
+             exclude_dirs = exclude_dirs or set()
+             self.exclude_dirs = exclude_dirs.copy()
+             self.exclude_file_patterns = exclude_file_patterns.copy()
+             self.exclude_types = exclude_types.copy()
+     def should_skip(self, exclude_item=None, exclude_types=None, exclude_file_patterns=None, exclude_dirs=None):
+         if (exclude_dirs == None and exclude_file_patterns == None and exclude_types == None) and exclude_item:
+             if isinstance(exclude_item, str):
+                 if _should_skip_dir(dir_name=exclude_item, exclude_dirs=self.exclude_dirs):
+                     return True
+                 if _should_skip_type(filename=exclude_item, exclude_types=self.exclude_types):
+                     return True
+                 if _should_skip_file(filename=exclude_item, exclude_patterns=self.exclude_file_patterns):
+                     return True
+             return False
+         elif exclude_types or exclude_file_patterns:
+             if exclude_file_patterns != False:
+                 if _should_skip_file(filename=exclude_item,
+                                      exclude_patterns=self.exclude_file_patterns):
+                     return True
+             if exclude_types != False:
+                 if _should_skip_type(filename=exclude_item,
+                                      exclude_types=self.exclude_types):
+                     return True
+
+         if exclude_dirs:
+             if _should_skip_dir(dir_name=exclude_item, exclude_dirs=self.exclude_dirs):
+                 return True
+         return False
+
+ SKIP_MGR = shoudSkipManager()
+ def should_skip(
+     exclude_item=None,
+     exclude_types=None,
+     exclude_file_patterns=None,
+     exclude_dirs=None
+     ):
+     return shoudSkipManager().should_skip(
+         exclude_item=exclude_item,
+         exclude_types=exclude_types,
+         exclude_file_patterns=exclude_file_patterns,
+         exclude_dirs=exclude_dirs
+     )
+ def re_initialize_skip_mgr(exclude_types=None,
+                            exclude_file_patterns=None,
+                            exclude_dirs=None):
+     shoudSkipManager().initialized = False
+     shoudSkipManager(
+         exclude_types=exclude_types,
+         exclude_file_patterns=exclude_file_patterns,
+         exclude_dirs=exclude_dirs
+     )
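# --- Editor's example (not part of the diffed file) --------------------------
# A minimal sketch of how the singleton skip manager above might be
# (re)configured and queried. The exclusion values are illustrative
# assumptions, not package defaults, and it assumes the package's
# SingletonMeta re-runs __init__ once `initialized` has been reset.
re_initialize_skip_mgr(
    exclude_file_patterns={'*.pyc', '*.log'},
    exclude_dirs={'__pycache__', 'node_modules'},
)
if not should_skip(exclude_item='notes.txt'):
    print('notes.txt will be scanned')
# ------------------------------------------------------------------------------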
+
+ def convert_date_string(s):
+     # … your existing stub or real implementation …
+     try:
+         return datetime.fromisoformat(s)
+     except ValueError:
+         return None
+ # file_utils.py (below your existing imports)
+
+
+
+
+ def source_engine_for_ext(ext: str) -> str:
+     ext = ext.lower()
+     mapping = {
+         '.parquet': 'pyarrow',
+         '.txt': 'python',
+         '.csv': 'python',
+         '.tsv': 'python',
+         '.xlsx': 'openpyxl',
+         '.xls': 'xlrd',
+         '.xlsb': 'pyxlsb',
+         '.ods': 'odf',
+         '.geojson': 'GeoJSON',
+     }
+     return mapping.get(ext)
+
+ def is_valid_file_path(path: str) -> Union[str, None]:
+     if not (isinstance(path, str) and path.strip()):
+         return None
+     if os.path.isfile(path):
+         return os.path.splitext(path)[1].lower()
+     return None
+
+ def is_dataframe(obj) -> bool:
+     return isinstance(obj, (pd.DataFrame, gpd.GeoDataFrame))
+
+ def create_dataframe(data=None, columns=None) -> pd.DataFrame:
+     # … unchanged …
+     if is_dataframe(data):
+         return data.copy()
+     data = data or {}
+     if isinstance(data, dict):
+         data = [data]
+     if columns is None:
+         all_keys = set()
+         for row in data:
+             if isinstance(row, dict):
+                 all_keys.update(row.keys())
+         columns = list(all_keys)
+     if columns is False:
+         columns = None
+     try:
+         return pd.DataFrame(data, columns=columns)
+     except Exception as e:
+         _logger.error(f"Failed to create DataFrame: {e}")
+         return pd.DataFrame([], columns=columns)
+
+ def read_ods_file(path: str) -> dict[str, pd.DataFrame]:
+     # … unchanged …
+     if not is_valid_file_path(path):
+         _logger.error(f"File not found or invalid: {path}")
+         return {}
+     try:
+         doc = ezodf.opendoc(path)
+     except Exception as e:
+         _logger.error(f"Failed to open ODS document: {e}")
+         return {}
+     sheets: dict[str, pd.DataFrame] = {}
+     for sheet in doc.sheets:
+         table_rows = []
+         for row in sheet.rows():
+             row_data = []
+             for cell in row:
+                 if cell.value_type == 'date':
+                     row_data.append(convert_date_string(str(cell.value)))
+                 else:
+                     row_data.append(cell.value)
+             table_rows.append(row_data)
+         df = pd.DataFrame(table_rows)
+         sheets[sheet.name] = df
+         _logger.info(f"Processed sheet: {sheet.name}")
+     return sheets
+
+ def read_ods_as_excel(path: str, xlsx_path: str | None = None) -> pd.DataFrame:
+     # … unchanged …
+     if not is_valid_file_path(path):
+         _logger.error(f"File not found or invalid: {path}")
+         return pd.DataFrame()
+     if xlsx_path is None:
+         tmp_dir = tempfile.mkdtemp()
+         xlsx_path = os.path.join(tmp_dir, os.path.basename(path) + '.xlsx')
+         cleanup_temp = True
+     else:
+         cleanup_temp = False
+     try:
+         # You must implement ods_to_xlsx(...) externally
+         ods_to_xlsx(path, xlsx_path)
+     except Exception as e:
+         _logger.error(f"ODS→XLSX conversion failed: {e}")
+         if cleanup_temp:
+             shutil.rmtree(tmp_dir)
+         return pd.DataFrame()
+     try:
+         df = pd.read_excel(xlsx_path, engine='openpyxl')
+     except Exception as e:
+         _logger.error(f"Failed to read converted XLSX: {e}")
+         df = pd.DataFrame()
+     finally:
+         if cleanup_temp:
+             shutil.rmtree(tmp_dir)
+     return df
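# --- Editor's example (not part of the diffed file) --------------------------
# A rough sketch of the two ODS paths above: read_ods_file() yields one raw
# DataFrame per sheet, while read_ods_as_excel() goes through an ODS -> XLSX
# conversion (an external ods_to_xlsx() must exist) and returns a single
# DataFrame. 'book.ods' is a hypothetical path.
sheets = read_ods_file('book.ods')
for name, frame in sheets.items():
    print(name, frame.shape)
flat = read_ods_as_excel('book.ods')
print(flat.head())
# ------------------------------------------------------------------------------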
+
+ def filter_df(
+     df: pd.DataFrame,
+     nrows: int | None = None,
+     condition: pd.Series | None = None,
+     indices: list[int] | None = None
+ ) -> pd.DataFrame:
+     if nrows is not None:
+         df = df.head(nrows)
+     if condition is not None:
+         df = df[condition]
+     if indices is not None:
+         df = df.iloc[indices]
+     return df
+
+ def read_shape_file(path: str) -> Union[gpd.GeoDataFrame, None]:
+     # … unchanged …
+     ext = is_valid_file_path(path)
+     if not ext:
+         _logger.error(f"Shape file not found: {path}")
+         return None
+     ext = ext.lower()
+     try:
+         if ext in ('.shp', '.cpg', '.dbf', '.shx'):
+             return gpd.read_file(path)
+         if ext == '.geojson':
+             return gpd.read_file(path, driver='GeoJSON')
+         if ext == '.prj':
+             return read_from_file(path)  # Must return GeoDataFrame
+     except Exception as e:
+         _logger.error(f"Failed to read spatial data ({path}): {e}")
+         return None
+     _logger.error(f"Unsupported spatial extension: {ext}")
+     return None
+
+
+ ##def collect_filepaths(
+ ##    inputs: Union[str, List[str]],
+ ##    exclude_dirs: set[str] = None,
+ ##    exclude_file_patterns: set[str] = None,
+ ##    exclude_types: set[str] = None
+ ##) -> List[str]:
+ ##    """
+ ##    Given a path or list of paths, return a list of all file paths under them.
+ ##    - If an input is a file, it's included (unless it matches an exclude pattern).
+ ##    - If an input is a directory, walk it recursively:
+ ##        • Skip any subdirectory named in `exclude_dirs`
+ ##        • Skip any file whose name matches one of `exclude_file_patterns`
+ ##    """
+ ##    re_initialize_skip_mgr(exclude_types=exclude_types,
+ ##                           exclude_file_patterns=exclude_file_patterns,
+ ##                           exclude_dirs=exclude_dirs)
+ ##
+ ##
+ ##    # Normalize to list
+ ##    if isinstance(inputs, str):
+ ##        paths_to_scan = [inputs]
+ ##    else:
+ ##        paths_to_scan = list(inputs)
+ ##
+ ##    all_files: List[str] = []
+ ##
+ ##
+ ##
+ ##    def _collect_from_dir(dirpath: str):
+ ##        for dirpath_root, dirnames, filenames in os.walk(dirpath):
+ ##            # Remove any excluded subdirectories from os.walk
+ ##            dirnames[:] = [d for d in dirnames if d not in exclude_dirs]
+ ##
+ ##            for fname in filenames:
+ ##                if should_skip(exclude_item=fname,
+ ##                               exclude_types=True,
+ ##                               exclude_file_patterns=True):
+ ##                    continue
+ ##                full = os.path.join(dirpath_root, fname)
+ ##                all_files.append(full)
+ ##
+ ##    for p in paths_to_scan:
+ ##        if not os.path.exists(p):
+ ##            # skip nonexistent paths
+ ##            continue
+ ##
+ ##        if os.path.isfile(p):
+ ##            basename = os.path.basename(p)
+ ##            fname = os.path.splitext(basename)
+ ##            if not should_skip(exclude_item=fname,
+ ##                               exclude_types=True,
+ ##                               exclude_file_patterns=True):
+ ##                all_files.append(p)
+ ##        else:
+ ##            # p is a directory
+ ##            _collect_from_dir(p)
+ ##
+ ##    return all_files
+ ##
+ ### requirements:
+ ###   pip install pdfplumber pdf2image pytesseract pillow
+ ###   # plus Tesseract binary (apt install tesseract-ocr or brew install tesseract)
+
+
+
+ def pdf_to_text(path, keep_page_breaks=True, ocr_if_empty=True):
+     """
+     Return the full text of *path* (str or Path) as a single string.
+
+     keep_page_breaks → insert "\f" between pages so you can split later.
+     ocr_if_empty     → any page with no text layer is rasterised & OCR'd.
+     """
+     path = Path(path)
+     if not path.exists():
+         raise FileNotFoundError(path)
+
+     all_pages = []
+
+     with pdfplumber.open(path) as pdf:
+         for i, page in enumerate(pdf.pages, start=1):
+             text = page.extract_text() or ""  # might be None
+             if (not text.strip()) and ocr_if_empty:
+                 # rasterise at 300 dpi then Tesseract
+                 img = convert_from_path(str(path), dpi=300, first_page=i, last_page=i)[0]
+                 text = pytesseract.image_to_string(img, lang="eng")
+             all_pages.append(text)
+
+     sep = "\f" if keep_page_breaks else "\n"
+     return sep.join(all_pages)
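# --- Editor's example (not part of the diffed file) --------------------------
# A minimal sketch of using pdf_to_text() above and splitting the result back
# into pages on the form-feed separator. 'report.pdf' is a hypothetical path;
# pdfplumber, pdf2image, pytesseract and the Tesseract binary must be present.
text = pdf_to_text('report.pdf', keep_page_breaks=True)
pages = text.split('\f')
print(f'{len(pages)} pages, {len(text)} characters extracted')
# ------------------------------------------------------------------------------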
+ def get_df(
+     source: Union[
+         str,
+         pd.DataFrame,
+         gpd.GeoDataFrame,
+         dict,
+         list,
+         FileStorage
+     ],
+     nrows: int | None = None,
+     skiprows: list[int] | int | None = None,
+     condition: pd.Series | None = None,
+     indices: list[int] | None = None
+ ) -> Union[pd.DataFrame, gpd.GeoDataFrame, dict[str, Union[pd.DataFrame, str]], None]:
+     """
+     Load a DataFrame or GeoDataFrame from various sources, then apply optional filters.
+     If `source` is a directory, returns read_directory(source) instead (a dict).
+     """
+
+     # ─── Check for directory first ─────────────────────────────────────────────
+     if isinstance(source, str) and os.path.isdir(source):
+         return read_directory(root_path=source)
+
+     # ─── If already a DataFrame/GeoDataFrame, just filter and return ───────────
+     if is_dataframe(source):
+         _logger.info("Source is already a DataFrame/GeoDataFrame; applying filters.")
+         return filter_df(source, nrows=nrows, condition=condition, indices=indices)
+
+     if source is None:
+         _logger.error("No source provided to get_df().")
+         return None
+
+     # ─── Next: If source is a file path, read according to extension ───────────
+     if isinstance(source, str) and os.path.isfile(source):
+         ext = os.path.splitext(source)[1].lower()
+         try:
+             _logger.info(f"Loading file {source} with extension '{ext}'.")
+             if ext in ('.csv', '.tsv', '.txt'):
+                 sep = {'.csv': ',', '.tsv': '\t', '.txt': None}.get(ext)
+                 df = pd.read_csv(source, skiprows=skiprows, sep=sep, nrows=nrows)
+             elif ext in ('.ods', '.xlsx', '.xls', '.xlsb'):
+                 engine = source_engine_for_ext(ext)
+                 if ext == '.ods':
+                     df = read_ods_as_excel(source)
+                 else:
+                     df = pd.read_excel(source, skiprows=skiprows, engine=engine, nrows=nrows)
+             elif ext == '.json':
+                 df = safe_read_from_json(source)
+                 return df
+             elif ext == '.parquet':
+                 df = pd.read_parquet(source)
+             elif ext in ('.shp', '.cpg', '.dbf', '.shx', '.geojson', '.prj'):
+                 return read_shape_file(source)
+             elif ext in ['.pdf']:
+                 df = pdf_to_text(source)
+                 return df
+             else:
+                 df = read_from_file(source)
+                 return df
+
+             if not isinstance(df, (dict, list, FileStorage)):
+                 return filter_df(df, nrows=nrows, condition=condition, indices=indices)
+             source = df  # pass on to next block if needed
+
+         except Exception as e:
+             _logger.error(f"Failed to read '{source}': {e}")
+             return None
+
+     # ─── If source is FileStorage (uploaded) ───────────────────────────────────
+     if isinstance(source, FileStorage):
+         try:
+             filename = secure_filename(source.filename or "uploaded.xlsx")
+             _logger.info(f"Reading uploaded file: {filename}")
+             df = pd.read_excel(source.stream, nrows=nrows)
+             return filter_df(df, nrows=nrows, condition=condition, indices=indices)
+         except Exception as e:
+             _logger.error(f"Failed to read FileStorage: {e}")
+             return None
+
+     # ─── If source is dict or list, turn into DataFrame ────────────────────────
+     if isinstance(source, (dict, list)):
+         _logger.info("Creating DataFrame from in-memory data structure.")
+         df = pd.DataFrame(source)
+         return filter_df(df, nrows=nrows, condition=condition, indices=indices)
+
+     _logger.error(f"Unsupported source type: {type(source)}")
+     return None
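# --- Editor's example (not part of the diffed file) --------------------------
# A minimal sketch of get_df() above on a tabular file. 'data.csv' is a
# hypothetical path; nrows trims during the read, and filter_df() can slice
# the returned frame further.
df = get_df('data.csv', nrows=100)
if df is not None and is_dataframe(df):
    print(filter_df(df, indices=[0, 1, 2]).shape)
# ------------------------------------------------------------------------------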
+
+
+ def read_file_as_text(paths: Union[str, List[str]]) -> List[str]:
+     """
+     Given one path or a list of paths, return a list of textual representations
+     for each "file" found. If a given path is:
+
+       1) A directory → we call read_directory(...) on it (which skips node_modules,
+          __pycache__, *.ini, etc.) and iterate over each (relative_path → content).
+       2) A plain-text file (extension ∈ SUPPORTED_TEXT_EXTENSIONS) → we open it and return its raw text.
+       3) Anything else (e.g. .xlsx, .ods, .parquet, .shp, etc.) → we delegate to get_df(...) and then
+          convert whatever get_df(...) gives us into CSV or "to_string()" as appropriate.
+
+     Returns:
+       A list of strings; each string is the "file's contents" for one actual file.
+       (Ordering is "filesystem walk order" for directories, and "in order of the input list" for files.)
+
+     Raises:
+       FileNotFoundError if any path in `paths` does not exist.
+       ValueError if a file cannot be parsed/read.
+     """
+     # Ensure we have a list to iterate
+     if isinstance(paths, str):
+         files_to_process = [paths]
+     else:
+         files_to_process = list(paths)
+
+     all_data: List[str] = []
+
+     for full_path in files_to_process:
+         if not os.path.exists(full_path):
+             raise FileNotFoundError(f"Not a valid path: {full_path!r}")
+
+         # ── If this is a directory, walk it via read_directory(...) ─────────────
+         if os.path.isdir(full_path):
+             # read_directory returns a dict: { relative_path: (DataFrame or text) }
+             nested_dict: Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame, str]] = read_directory(full_path)
+
+             for rel, content in nested_dict.items():
+                 # `content` is either a DataFrame, GeoDataFrame, or a plain-text string
+                 if isinstance(content, (pd.DataFrame, gpd.GeoDataFrame)):
+                     # If GeoDataFrame, convert geometry column to WKT before CSV
+                     if isinstance(content, gpd.GeoDataFrame):
+                         gdf = content.copy()
+                         gdf["geometry"] = gdf["geometry"].apply(lambda g: g.wkt if g is not None else "")
+                         all_data.append(gdf.to_csv(index=False))
+                     else:
+                         all_data.append(content.to_csv(index=False))
+                 else:
+                     # Already a text blob
+                     all_data.append(content)
+
+             continue  # move on to the next item in files_to_process
+
+         # ── At this point, full_path is guaranteed to be a file ─────────────────
+         ext = os.path.splitext(full_path)[1].lower()
+
+         # 1) PURE TEXT EXTENSION?
+         if ext in SUPPORTED_TEXT_EXTENSIONS:
+             try:
+                 with open(full_path, "r", encoding="utf-8", errors="replace") as f:
+                     raw = f.read()
+                 all_data.append(raw)
+             except Exception as e:
+                 raise ValueError(f"Error reading text file {full_path!r}: {e}")
+
+             continue
+
+         # 2) ANY OTHER FILETYPE → delegate to get_df(...) and convert result to text
+         try:
+             df_or = get_df(full_path)
+         except Exception as e:
+             raise ValueError(f"get_df() failed for {full_path!r}: {e}")
+
+         # 2a) If get_df returned a dict (e.g. an ODS with multiple sheets, or a directory)
+         if isinstance(df_or, dict):
+             # Join each sheet or sub-file's DataFrame into one big text block
+             for key, value in df_or.items():
+                 if isinstance(value, (pd.DataFrame, gpd.GeoDataFrame)):
+                     if isinstance(value, gpd.GeoDataFrame):
+                         gdf = value.copy()
+                         gdf["geometry"] = gdf["geometry"].apply(lambda g: g.wkt if g is not None else "")
+                         block = f"=== {key} ===\n" + gdf.to_csv(index=False)
+                     else:
+                         block = f"=== {key} ===\n" + value.to_csv(index=False)
+                 else:
+                     # It was already plain-text under that key
+                     block = f"=== {key} ===\n" + str(value)
+                 all_data.append(block)
+
+             continue
+
+         # 2b) If get_df returned a DataFrame or GeoDataFrame directly
+         if isinstance(df_or, (pd.DataFrame, gpd.GeoDataFrame)):
+             if isinstance(df_or, gpd.GeoDataFrame):
+                 gdf = df_or.copy()
+                 gdf["geometry"] = gdf["geometry"].apply(lambda g: g.wkt if g is not None else "")
+                 all_data.append(gdf.to_csv(index=False))
+             else:
+                 all_data.append(df_or.to_csv(index=False))
+
+             continue
+
+         # 2c) If get_df returned a list of dicts (rare, but possible)
+         if isinstance(df_or, list):
+             try:
+                 temp_df = pd.DataFrame(df_or)
+                 all_data.append(temp_df.to_csv(index=False))
+             except Exception:
+                 all_data.append(repr(df_or))
+             continue
+
+         # 2d) Otherwise, fall back to repr()
+         all_data.append(repr(df_or))
+
+     return all_data
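# --- Editor's example (not part of the diffed file) --------------------------
# A minimal sketch of read_file_as_text() above: every input path comes back
# as one string (raw text, a CSV dump of a DataFrame, or a repr() fallback).
# The paths are hypothetical.
blobs = read_file_as_text(['notes.md', 'data.xlsx'])
for blob in blobs:
    print(blob[:200])
# ------------------------------------------------------------------------------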
+ def read_directory(
+     root_path: str,
+     exclude_dirs: set[str] = None,
+     exclude_file_patterns: set[str] = None,
+     exclude_types: set[str] = None,
+ ) -> Dict[str, Union[pd.DataFrame, str]]:
+     re_initialize_skip_mgr(exclude_types=exclude_types,
+                            exclude_file_patterns=exclude_file_patterns,
+                            exclude_dirs=exclude_dirs)
+     if not os.path.isdir(root_path):
+         raise FileNotFoundError(f"Not a valid directory: {root_path!r}")
+     collected: Dict[str, Union[pd.DataFrame, str]] = {}
+     root_path = os.path.abspath(root_path)
+     root_len = len(root_path.rstrip(os.sep)) + 1
+
+     for dirpath, dirnames, filenames in os.walk(root_path):
+         # 1) Skip excluded subfolders
+         dirnames[:] = [
+             d for d in dirnames if not should_skip(exclude_item=d,
+                                                    exclude_dirs=True)
+         ]
+
+         for fname in filenames:
+             # 2) Skip excluded filename patterns
+             if should_skip(exclude_item=fname, exclude_types=True, exclude_file_patterns=True):
+                 _logger.debug(f"Skipping file by pattern: {os.path.join(dirpath, fname)}")
+                 continue
+
+             full_path = os.path.join(dirpath, fname)
+             rel_path = full_path[root_len:]  # e.g. "subdir/logs/vid_to_aud.log"
+             ext = os.path.splitext(fname)[1].lower()
+
+             # ── 2a) If it's one of our "plain-text" extensions, read it as text right now:
+             if ext in {'.txt', '.md', '.csv', '.tsv', '.log'}:
+                 try:
+                     with open(full_path, 'r', encoding='utf-8', errors='replace') as f:
+                         text = f.read()
+                     collected[rel_path] = text
+                     _logger.info(f"Read text file: {rel_path}")
+                 except Exception as e:
+                     _logger.warning(f"Failed to read {rel_path} as text: {e}")
+                 continue
+
+             # ── 2b) Otherwise, try to load via get_df(...) (DataFrame/GeoDataFrame/etc.)
+             try:
+                 df_or_gdf = get_df(full_path)
+                 if isinstance(df_or_gdf, (pd.DataFrame, gpd.GeoDataFrame)):
+                     collected[rel_path] = df_or_gdf
+                     _logger.info(f"Loaded DataFrame: {rel_path}")
+                     continue
+                 # If get_df returned a dict (e.g. multi-sheet ODS), merge as multiple entries
+                 if isinstance(df_or_gdf, dict):
+                     for sheet_name, df in df_or_gdf.items():
+                         key = f"{rel_path}::[{sheet_name}]"
+                         collected[key] = df
+                         _logger.info(f"Loaded sheet DataFrame: {key}")
+                     continue
+                 # If get_df returned something else (list, non-DataFrame), fall through to text
+             except Exception as e:
+                 _logger.debug(f"get_df failed for {rel_path}: {e}")
+
+             # ── 2c) Lastly, if it wasn't a "pure text" file and get_df didn't return a DataFrame,
+             #        treat it as text via read_file_as_text(...) so you get at least something:
+             try:
+                 text = read_file_as_text(full_path)
+                 # read_file_as_text returns a List[str], but here we're in a single-file context,
+                 # so just join on "\n\n" or take the first element. For simplicity:
+                 combined = "\n\n".join(text)
+                 collected[rel_path] = combined
+                 _logger.info(f"Read fallback text for: {rel_path}")
+             except Exception as e:
+                 _logger.warning(f"Could not read {rel_path} as text or DataFrame: {e}")
+
+     return collected
+
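As a rough usage sketch for the new module as a whole (the path and exclusion sets below are illustrative assumptions, not package defaults), read_directory() walks a tree and returns a mapping of relative paths to either loaded DataFrames or raw text:

    results = read_directory(
        '/path/to/project',
        exclude_dirs={'__pycache__', 'node_modules'},
        exclude_file_patterns={'*.pyc'},
    )
    for rel_path, content in results.items():
        kind = 'DataFrame' if is_dataframe(content) else 'text'
        print(rel_path, kind)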