abstract-utilities 0.2.2.492__py3-none-any.whl → 0.2.2.495__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. abstract_utilities/__init__.py +0 -1
  2. abstract_utilities/file_utils/__init__.py +1 -2
  3. abstract_utilities/file_utils/imports/constants.py +6 -0
  4. abstract_utilities/file_utils/imports/imports.py +1 -1
  5. abstract_utilities/file_utils/imports/module_imports.py +1 -2
  6. abstract_utilities/file_utils/module_imports.py +12 -0
  7. abstract_utilities/file_utils/src/__init__.py +10 -0
  8. abstract_utilities/file_utils/src/file_filters.py +110 -0
  9. abstract_utilities/file_utils/src/file_reader.py +607 -0
  10. abstract_utilities/file_utils/src/file_utils.py +279 -0
  11. abstract_utilities/file_utils/src/filter_params.py +155 -0
  12. abstract_utilities/file_utils/src/find_collect.py +154 -0
  13. abstract_utilities/file_utils/src/initFunctionsGen.py +286 -0
  14. abstract_utilities/file_utils/src/map_utils.py +29 -0
  15. abstract_utilities/file_utils/src/pdf_utils.py +300 -0
  16. abstract_utilities/file_utils/src/type_checks.py +92 -0
  17. abstract_utilities/import_utils/__init__.py +2 -0
  18. abstract_utilities/import_utils/imports/__init__.py +4 -0
  19. abstract_utilities/import_utils/imports/constants.py +2 -0
  20. abstract_utilities/import_utils/imports/imports.py +4 -0
  21. abstract_utilities/import_utils/imports/module_imports.py +6 -0
  22. abstract_utilities/import_utils/imports/utils.py +30 -0
  23. abstract_utilities/import_utils/src/__init__.py +7 -0
  24. abstract_utilities/import_utils/src/clean_imports.py +122 -0
  25. abstract_utilities/import_utils/src/dot_utils.py +60 -0
  26. abstract_utilities/import_utils/src/extract_utils.py +42 -0
  27. abstract_utilities/import_utils/src/import_functions.py +46 -0
  28. abstract_utilities/import_utils/src/import_utils.py +299 -0
  29. abstract_utilities/import_utils/src/package_utils/__init__.py +139 -0
  30. abstract_utilities/import_utils/src/package_utils/context_utils.py +27 -0
  31. abstract_utilities/import_utils/src/package_utils/import_collectors.py +53 -0
  32. abstract_utilities/import_utils/src/package_utils/path_utils.py +28 -0
  33. abstract_utilities/import_utils/src/package_utils/safe_import.py +27 -0
  34. abstract_utilities/import_utils/src/package_utils.py +140 -0
  35. abstract_utilities/import_utils/src/sysroot_utils.py +57 -0
  36. abstract_utilities/path_utils.py +1 -12
  37. abstract_utilities/read_write_utils.py +66 -32
  38. {abstract_utilities-0.2.2.492.dist-info → abstract_utilities-0.2.2.495.dist-info}/METADATA +1 -1
  39. {abstract_utilities-0.2.2.492.dist-info → abstract_utilities-0.2.2.495.dist-info}/RECORD +42 -11
  40. {abstract_utilities-0.2.2.492.dist-info → abstract_utilities-0.2.2.495.dist-info}/top_level.txt +1 -0
  41. imports/__init__.py +36 -0
  42. {abstract_utilities-0.2.2.492.dist-info → abstract_utilities-0.2.2.495.dist-info}/WHEEL +0 -0
abstract_utilities/file_utils/src/file_reader.py
@@ -0,0 +1,607 @@
+ # file_reader.py
+ from ..imports import *
+ # -------- Public API drop-ins that mirror your originals --------
+ from .filter_params import *
+ from .file_filters import *
+ from .file_utils import *
+ from .pdf_utils import *
+ # ---------------------------------------------------------------------------
+ # NOTE: The following helper functions must be provided elsewhere:
+ #   - convert_date_string(s: str) -> datetime
+ #   - read_from_file(path: str) -> pd.DataFrame
+ # ---------------------------------------------------------------------------
+ def _should_skip_dir(dir_name: str, exclude_dirs: set[str]) -> bool:
+     """
+     Return True if dir_name matches one of the excluded directory names exactly.
+     """
+     return dir_name in exclude_dirs
+
+
+ def _should_skip_file(filename: str, exclude_patterns: set[str]) -> bool:
+     """
+     Return True if filename matches any pattern in exclude_patterns.
+     Uses fnmatch (Unix-style wildcard matching).
+     """
+     for pat in exclude_patterns:
+         if fnmatch.fnmatch(filename, pat):
+             return True
+     return False
+
+ def _should_skip_type(filename: str, exclude_types: set[str]) -> bool:
+     """
+     Return True if filename's type matches one of exclude_types.
+     Delegates to is_media_type() from file_filters.
+     """
+     return is_media_type(filename, media_types=exclude_types)
+ class shoudSkipManager(metaclass=SingletonMeta):
+     def __init__(self, exclude_types=None, exclude_file_patterns=None, exclude_dirs=None):
+         if not hasattr(self, 'initialized') or self.initialized == False:
+             self.initialized = True
+             exclude_types = exclude_types or set()
+             exclude_file_patterns = exclude_file_patterns or set()
+             exclude_dirs = exclude_dirs or set()
+             self.exclude_dirs = exclude_dirs.copy()
+             self.exclude_file_patterns = exclude_file_patterns.copy()
+             self.exclude_types = exclude_types.copy()
+     def should_skip(self, exclude_item=None, exclude_types=None, exclude_file_patterns=None, exclude_dirs=None):
+         if (exclude_dirs is None and exclude_file_patterns is None and exclude_types is None) and exclude_item:
+             if isinstance(exclude_item, str):
+                 if _should_skip_dir(dir_name=exclude_item, exclude_dirs=self.exclude_dirs):
+                     return True
+                 if _should_skip_type(filename=exclude_item, exclude_types=self.exclude_types):
+                     return True
+                 if _should_skip_file(filename=exclude_item, exclude_patterns=self.exclude_file_patterns):
+                     return True
+             return False
+         elif exclude_types or exclude_file_patterns:
+             if exclude_file_patterns != False:
+                 if _should_skip_file(filename=exclude_item,
+                                      exclude_patterns=self.exclude_file_patterns):
+                     return True
+             if exclude_types != False:
+                 if _should_skip_type(filename=exclude_item,
+                                      exclude_types=self.exclude_types):
+                     return True
+
+         if exclude_dirs:
+             if _should_skip_dir(dir_name=exclude_item, exclude_dirs=self.exclude_dirs):
+                 return True
+         return False
+
+ SKIP_MGR = shoudSkipManager()
+ def should_skip(
+     exclude_item=None,
+     exclude_types=None,
+     exclude_file_patterns=None,
+     exclude_dirs=None
+ ):
+     return shoudSkipManager().should_skip(
+         exclude_item=exclude_item,
+         exclude_types=exclude_types,
+         exclude_file_patterns=exclude_file_patterns,
+         exclude_dirs=exclude_dirs
+     )
+ def re_initialize_skip_mgr(exclude_types=None,
+                            exclude_file_patterns=None,
+                            exclude_dirs=None):
+     shoudSkipManager().initialized = False
+     shoudSkipManager(
+         exclude_types=exclude_types,
+         exclude_file_patterns=exclude_file_patterns,
+         exclude_dirs=exclude_dirs
+     )
+
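A minimal usage sketch of the skip-manager API above. The import path and the exclusion sets are illustrative assumptions, what `exclude_types` accepts is determined by `is_media_type()` in `file_filters`, and whether re-initialization takes effect depends on the SingletonMeta implementation re-running `__init__` once `initialized` is reset:

from abstract_utilities.file_utils import re_initialize_skip_mgr, should_skip  # assumed re-export path

# Illustrative exclusion sets -- not package defaults.
re_initialize_skip_mgr(
    exclude_file_patterns={'*.pyc', '*.log'},
    exclude_dirs={'__pycache__', 'node_modules'},
)
should_skip(exclude_item='__pycache__')                            # True: excluded directory name
should_skip(exclude_item='debug.log', exclude_file_patterns=True)  # True: matches '*.log'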
+ def convert_date_string(s):
+     # … your existing stub or real implementation …
+     try:
+         return datetime.fromisoformat(s)
+     except ValueError:
+         return None
+ # file_utils.py (below your existing imports)
+
+
+ def source_engine_for_ext(ext: str) -> str:
+     ext = ext.lower()
+     mapping = {
+         '.parquet': 'pyarrow',
+         '.txt': 'python',
+         '.csv': 'python',
+         '.tsv': 'python',
+         '.xlsx': 'openpyxl',
+         '.xls': 'xlrd',
+         '.xlsb': 'pyxlsb',
+         '.ods': 'odf',
+         '.geojson': 'GeoJSON',
+     }
+     return mapping.get(ext)
+
+ def is_valid_file_path(path: str) -> Union[str, None]:
+     if not (isinstance(path, str) and path.strip()):
+         return None
+     if os.path.isfile(path):
+         return os.path.splitext(path)[1].lower()
+     return None
+
+ def is_dataframe(obj) -> bool:
+     return isinstance(obj, (pd.DataFrame, gpd.GeoDataFrame))
+
+ def create_dataframe(data=None, columns=None) -> pd.DataFrame:
+     # … unchanged …
+     if is_dataframe(data):
+         return data.copy()
+     data = data or {}
+     if isinstance(data, dict):
+         data = [data]
+     if columns is None:
+         all_keys = set()
+         for row in data:
+             if isinstance(row, dict):
+                 all_keys.update(row.keys())
+         columns = list(all_keys)
+     if columns is False:
+         columns = None
+     try:
+         return pd.DataFrame(data, columns=columns)
+     except Exception as e:
+         _logger.error(f"Failed to create DataFrame: {e}")
+         return pd.DataFrame([], columns=columns)
+
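A small sketch of create_dataframe() with in-memory records; the values and the import path are illustrative assumptions:

from abstract_utilities.file_utils import create_dataframe  # assumed re-export path

df = create_dataframe([{'name': 'a', 'x': 1}, {'name': 'b'}])  # union of row keys becomes the columns
print(df.columns.tolist())  # e.g. ['name', 'x'] -- set order is not guaranteed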
+ def read_ods_file(path: str) -> dict[str, pd.DataFrame]:
+     # … unchanged …
+     if not is_valid_file_path(path):
+         _logger.error(f"File not found or invalid: {path}")
+         return {}
+     try:
+         doc = ezodf.opendoc(path)
+     except Exception as e:
+         _logger.error(f"Failed to open ODS document: {e}")
+         return {}
+     sheets: dict[str, pd.DataFrame] = {}
+     for sheet in doc.sheets:
+         table_rows = []
+         for row in sheet.rows():
+             row_data = []
+             for cell in row:
+                 if cell.value_type == 'date':
+                     row_data.append(convert_date_string(str(cell.value)))
+                 else:
+                     row_data.append(cell.value)
+             table_rows.append(row_data)
+         df = pd.DataFrame(table_rows)
+         sheets[sheet.name] = df
+         _logger.info(f"Processed sheet: {sheet.name}")
+     return sheets
+
+ def read_ods_as_excel(path: str, xlsx_path: str | None = None) -> pd.DataFrame:
+     # … unchanged …
+     if not is_valid_file_path(path):
+         _logger.error(f"File not found or invalid: {path}")
+         return pd.DataFrame()
+     if xlsx_path is None:
+         tmp_dir = tempfile.mkdtemp()
+         xlsx_path = os.path.join(tmp_dir, os.path.basename(path) + '.xlsx')
+         cleanup_temp = True
+     else:
+         cleanup_temp = False
+     try:
+         # You must implement ods_to_xlsx(...) externally
+         ods_to_xlsx(path, xlsx_path)
+     except Exception as e:
+         _logger.error(f"ODS→XLSX conversion failed: {e}")
+         if cleanup_temp:
+             shutil.rmtree(tmp_dir)
+         return pd.DataFrame()
+     try:
+         df = pd.read_excel(xlsx_path, engine='openpyxl')
+     except Exception as e:
+         _logger.error(f"Failed to read converted XLSX: {e}")
+         df = pd.DataFrame()
+     finally:
+         if cleanup_temp:
+             shutil.rmtree(tmp_dir)
+     return df
+
+ def filter_df(
+     df: pd.DataFrame,
+     nrows: int | None = None,
+     condition: pd.Series | None = None,
+     indices: list[int] | None = None
+ ) -> pd.DataFrame:
+     if nrows is not None:
+         df = df.head(nrows)
+     if condition is not None:
+         df = df[condition]
+     if indices is not None:
+         df = df.iloc[indices]
+     return df
+
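A quick sketch of the three filter_df() options; the sample frame and import path are assumptions for illustration:

import pandas as pd
from abstract_utilities.file_utils import filter_df  # assumed re-export path

df = pd.DataFrame({'x': [3, -1, 4, -1, 5]})
filter_df(df, nrows=3)                     # first 3 rows
filter_df(df, condition=df['x'] > 0)       # boolean-mask filter
filter_df(df, indices=[0, 2, 4])           # positional selection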
+ def read_shape_file(path: str) -> Union[gpd.GeoDataFrame, None]:
+     # … unchanged …
+     ext = is_valid_file_path(path)
+     if not ext:
+         _logger.error(f"Shape file not found: {path}")
+         return None
+     ext = ext.lower()
+     try:
+         if ext in ('.shp', '.cpg', '.dbf', '.shx'):
+             return gpd.read_file(path)
+         if ext == '.geojson':
+             return gpd.read_file(path, driver='GeoJSON')
+         if ext == '.prj':
+             return read_from_file(path)  # Must return GeoDataFrame
+     except Exception as e:
+         _logger.error(f"Failed to read spatial data ({path}): {e}")
+         return None
+     _logger.error(f"Unsupported spatial extension: {ext}")
+     return None
+
+
+ def collect_filepaths(
+     inputs: Union[str, List[str]],
+     exclude_dirs: set[str] = None,
+     exclude_file_patterns: set[str] = None,
+     exclude_types: set[str] = None
+ ) -> List[str]:
+     """
+     Given a path or list of paths, return a list of all file paths under them.
+     - If an input is a file, it's included (unless it matches an exclude pattern).
+     - If an input is a directory, walk it recursively:
+         • Skip any subdirectory named in `exclude_dirs`
+         • Skip any file whose name matches one of `exclude_file_patterns`
+     """
+     re_initialize_skip_mgr(exclude_types=exclude_types,
+                            exclude_file_patterns=exclude_file_patterns,
+                            exclude_dirs=exclude_dirs)
+
+     # Normalize to list
+     if isinstance(inputs, str):
+         paths_to_scan = [inputs]
+     else:
+         paths_to_scan = list(inputs)
+
+     all_files: List[str] = []
+
+     def _collect_from_dir(dirpath: str):
+         for dirpath_root, dirnames, filenames in os.walk(dirpath):
+             # Remove any excluded subdirectories from os.walk
+             dirnames[:] = [d for d in dirnames
+                            if not should_skip(exclude_item=d, exclude_dirs=True)]
+
+             for fname in filenames:
+                 if should_skip(exclude_item=fname,
+                                exclude_types=True,
+                                exclude_file_patterns=True):
+                     continue
+                 full = os.path.join(dirpath_root, fname)
+                 all_files.append(full)
+
+     for p in paths_to_scan:
+         if not os.path.exists(p):
+             # skip nonexistent paths
+             continue
+
+         if os.path.isfile(p):
+             basename = os.path.basename(p)
+             if not should_skip(exclude_item=basename,
+                                exclude_types=True,
+                                exclude_file_patterns=True):
+                 all_files.append(p)
+         else:
+             # p is a directory
+             _collect_from_dir(p)
+
+     return all_files
+
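A sketch of how collect_filepaths() might be called; the root path, exclusion sets, and import path are placeholders, and how media files are treated when `exclude_types` is left unset depends on `is_media_type()` in `file_filters`:

from abstract_utilities.file_utils import collect_filepaths  # assumed re-export path

files = collect_filepaths(
    '/path/to/project',                       # placeholder root
    exclude_dirs={'__pycache__', '.git'},
    exclude_file_patterns={'*.pyc', '*.tmp'},
)
print(f"found {len(files)} files")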
+ # requirements:
+ #   pip install pdfplumber pdf2image pytesseract pillow
+ #   plus the Tesseract binary (apt install tesseract-ocr or brew install tesseract)
+
+
+ def pdf_to_text(path, keep_page_breaks=True, ocr_if_empty=True):
+     """
+     Return the full text of *path* (str or Path) as a single string.
+
+     keep_page_breaks → insert "\f" between pages so you can split later.
+     ocr_if_empty     → any page with no text layer is rasterised & OCR'd.
+     """
+     path = Path(path)
+     if not path.exists():
+         raise FileNotFoundError(path)
+
+     all_pages = []
+
+     with pdfplumber.open(path) as pdf:
+         for i, page in enumerate(pdf.pages, start=1):
+             text = page.extract_text() or ""  # might be None
+             if (not text.strip()) and ocr_if_empty:
+                 # rasterise at 300 dpi then Tesseract
+                 img = convert_from_path(str(path), dpi=300, first_page=i, last_page=i)[0]
+                 text = pytesseract.image_to_string(img, lang="eng")
+             all_pages.append(text)
+
+     sep = "\f" if keep_page_breaks else "\n"
+     return sep.join(all_pages)
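A sketch of pdf_to_text() in use; the PDF path and import path are placeholders, and pdfplumber, pdf2image, pytesseract plus a Tesseract install are required as noted above:

from abstract_utilities.file_utils import pdf_to_text  # assumed re-export path

text = pdf_to_text('/path/to/scan.pdf', keep_page_breaks=True)  # placeholder path
pages = text.split('\f')                                        # one entry per page
print(f"{len(pages)} pages, {len(text)} characters")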
+ def get_df(
+     source: Union[
+         str,
+         pd.DataFrame,
+         gpd.GeoDataFrame,
+         dict,
+         list,
+         FileStorage
+     ],
+     nrows: int | None = None,
+     skiprows: list[int] | int | None = None,
+     condition: pd.Series | None = None,
+     indices: list[int] | None = None
+ ) -> Union[pd.DataFrame, gpd.GeoDataFrame, dict[str, Union[pd.DataFrame, str]], None]:
+     """
+     Load a DataFrame or GeoDataFrame from various sources, then apply optional filters.
+     If `source` is a directory, returns read_directory(source) instead (a dict).
+     """
+
+     # ─── Check for directory first ─────────────────────────────────────────────
+     if isinstance(source, str) and os.path.isdir(source):
+         return read_directory(root_path=source)
+
+     # ─── If already a DataFrame/GeoDataFrame, just filter and return ───────────
+     if is_dataframe(source):
+         _logger.info("Source is already a DataFrame/GeoDataFrame; applying filters.")
+         return filter_df(source, nrows=nrows, condition=condition, indices=indices)
+
+     if source is None:
+         _logger.error("No source provided to get_df().")
+         return None
+
+     # ─── Next: If source is a file path, read according to extension ───────────
+     if isinstance(source, str) and os.path.isfile(source):
+         ext = os.path.splitext(source)[1].lower()
+         try:
+             _logger.info(f"Loading file {source} with extension '{ext}'.")
+             if ext in ('.csv', '.tsv', '.txt'):
+                 sep = {'.csv': ',', '.tsv': '\t', '.txt': None}.get(ext)
+                 df = pd.read_csv(source, skiprows=skiprows, sep=sep, nrows=nrows)
+             elif ext in ('.ods', '.xlsx', '.xls', '.xlsb'):
+                 engine = source_engine_for_ext(ext)
+                 if ext == '.ods':
+                     df = read_ods_as_excel(source)
+                 else:
+                     df = pd.read_excel(source, skiprows=skiprows, engine=engine, nrows=nrows)
+             elif ext == '.json':
+                 df = safe_read_from_json(source)
+                 return df
+             elif ext == '.parquet':
+                 df = pd.read_parquet(source)
+             elif ext in ('.shp', '.cpg', '.dbf', '.shx', '.geojson', '.prj'):
+                 return read_shape_file(source)
+             elif ext in ['.pdf']:
+                 df = pdf_to_text(source)
+                 return df
+             else:
+                 df = read_from_file(source)
+                 return df
+
+             if not isinstance(df, (dict, list, FileStorage)):
+                 return filter_df(df, nrows=nrows, condition=condition, indices=indices)
+             source = df  # pass on to next block if needed
+
+         except Exception as e:
+             _logger.error(f"Failed to read '{source}': {e}")
+             return None
+
+     # ─── If source is FileStorage (uploaded) ───────────────────────────────────
+     if isinstance(source, FileStorage):
+         try:
+             filename = secure_filename(source.filename or "uploaded.xlsx")
+             _logger.info(f"Reading uploaded file: {filename}")
+             df = pd.read_excel(source.stream, nrows=nrows)
+             return filter_df(df, nrows=nrows, condition=condition, indices=indices)
+         except Exception as e:
+             _logger.error(f"Failed to read FileStorage: {e}")
+             return None
+
+     # ─── If source is dict or list, turn into DataFrame ────────────────────────
+     if isinstance(source, (dict, list)):
+         _logger.info("Creating DataFrame from in-memory data structure.")
+         df = pd.DataFrame(source)
+         return filter_df(df, nrows=nrows, condition=condition, indices=indices)
+
+     _logger.error(f"Unsupported source type: {type(source)}")
+     return None
+
+
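A short sketch of the call patterns get_df() supports; the file paths are placeholders and the import path is assumed:

from abstract_utilities.file_utils import get_df  # assumed re-export path

df_csv = get_df('/path/to/data.csv', nrows=100)  # placeholder path: first 100 rows of a CSV
df_mem = get_df([{'a': 1}, {'a': 2}])            # in-memory records -> DataFrame
tree   = get_df('/path/to/folder')               # directory -> dict from read_directory()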
+ def read_file_as_text(paths: Union[str, List[str]]) -> List[str]:
+     """
+     Given one path or a list of paths, return a list of textual representations
+     for each "file" found. If a given path is:
+
+     1) A directory → we call read_directory(...) on it (which skips node_modules,
+        __pycache__, *.ini, etc.) and iterate over each (relative_path → content).
+     2) A plain-text file (.txt, .md, .csv, .tsv, .log) → we open it and return its raw text.
+     3) Anything else (e.g. .xlsx, .ods, .parquet, .shp, etc.) → we delegate to get_df(...) and then
+        convert whatever get_df(...) gives us into CSV or "to_string()" as appropriate.
+
+     Returns:
+         A list of strings—each string is the "file's contents" for one actual file.
+         (Ordering is "filesystem walk order" for directories, and "in order of the input list" for files.)
+
+     Raises:
+         FileNotFoundError if any path in `paths` does not exist.
+         ValueError if a file cannot be parsed/read.
+     """
+     # Ensure we have a list to iterate
+     if isinstance(paths, str):
+         files_to_process = [paths]
+     else:
+         files_to_process = list(paths)
+
+     all_data: List[str] = []
+
+     for full_path in files_to_process:
+         if not os.path.exists(full_path):
+             raise FileNotFoundError(f"Not a valid path: {full_path!r}")
+
+         # ── If this is a directory, walk it via read_directory(...) ─────────────────
+         if os.path.isdir(full_path):
+             # read_directory returns a dict: { relative_path: (DataFrame or text) }
+             nested_dict: Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame, str]] = read_directory(full_path)
+
+             for rel, content in nested_dict.items():
+                 # `content` is either a DataFrame, GeoDataFrame, or a plain-text string
+                 if isinstance(content, (pd.DataFrame, gpd.GeoDataFrame)):
+                     # If GeoDataFrame, convert geometry column to WKT before CSV
+                     if isinstance(content, gpd.GeoDataFrame):
+                         gdf = content.copy()
+                         gdf["geometry"] = gdf["geometry"].apply(lambda g: g.wkt if g is not None else "")
+                         all_data.append(gdf.to_csv(index=False))
+                     else:
+                         all_data.append(content.to_csv(index=False))
+                 else:
+                     # Already a text blob
+                     all_data.append(content)
+
+             continue  # move on to the next item in files_to_process
+
+         # ── At this point, full_path is guaranteed to be a file ───────────────────────
+         ext = os.path.splitext(full_path)[1].lower()
+
+         # 1) PURE TEXT EXTENSION? (same plain-text set used by read_directory below)
+         if ext in {'.txt', '.md', '.csv', '.tsv', '.log'}:
+             try:
+                 with open(full_path, "r", encoding="utf-8", errors="replace") as f:
+                     raw = f.read()
+                 all_data.append(raw)
+             except Exception as e:
+                 raise ValueError(f"Error reading text file {full_path!r}: {e}")
+
+             continue
+
+         # 2) ANY OTHER FILETYPE → delegate to get_df(...) and convert result to text
+         try:
+             df_or = get_df(full_path)
+         except Exception as e:
+             raise ValueError(f"get_df() failed for {full_path!r}: {e}")
+
+         # 2a) If get_df returned a dict (e.g. an ODS with multiple sheets, or a directory)
+         if isinstance(df_or, dict):
+             # Join each sheet or sub-file's DataFrame into one big text block
+             for key, value in df_or.items():
+                 if isinstance(value, (pd.DataFrame, gpd.GeoDataFrame)):
+                     if isinstance(value, gpd.GeoDataFrame):
+                         gdf = value.copy()
+                         gdf["geometry"] = gdf["geometry"].apply(lambda g: g.wkt if g is not None else "")
+                         block = f"=== {key} ===\n" + gdf.to_csv(index=False)
+                     else:
+                         block = f"=== {key} ===\n" + value.to_csv(index=False)
+                 else:
+                     # It was already plain-text under that key
+                     block = f"=== {key} ===\n" + str(value)
+                 all_data.append(block)
+
+             continue
+
+         # 2b) If get_df returned a DataFrame or GeoDataFrame directly
+         if isinstance(df_or, (pd.DataFrame, gpd.GeoDataFrame)):
+             if isinstance(df_or, gpd.GeoDataFrame):
+                 gdf = df_or.copy()
+                 gdf["geometry"] = gdf["geometry"].apply(lambda g: g.wkt if g is not None else "")
+                 all_data.append(gdf.to_csv(index=False))
+             else:
+                 all_data.append(df_or.to_csv(index=False))
+
+             continue
+
+         # 2c) If get_df returned a list of dicts (rare, but possible)
+         if isinstance(df_or, list):
+             try:
+                 temp_df = pd.DataFrame(df_or)
+                 all_data.append(temp_df.to_csv(index=False))
+             except Exception:
+                 all_data.append(repr(df_or))
+             continue
+
+         # 2d) Otherwise, fall back to repr()
+         all_data.append(repr(df_or))
+
+     return all_data
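A sketch of read_file_as_text() in use; the input paths and the import path are placeholders:

from abstract_utilities.file_utils import read_file_as_text  # assumed re-export path

blobs = read_file_as_text(['/path/to/notes.md', '/path/to/report.xlsx'])  # placeholder paths
for blob in blobs:
    print(blob[:200])  # preview the textual form of each file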
+ def read_directory(
+     root_path: str,
+     exclude_dirs: set[str] = None,
+     exclude_file_patterns: set[str] = None,
+     exclude_types: set[str] = None,
+ ) -> Dict[str, Union[pd.DataFrame, str]]:
+     re_initialize_skip_mgr(exclude_types=exclude_types,
+                            exclude_file_patterns=exclude_file_patterns,
+                            exclude_dirs=exclude_dirs)
+     if not os.path.isdir(root_path):
+         raise FileNotFoundError(f"Not a valid directory: {root_path!r}")
+     collected: Dict[str, Union[pd.DataFrame, str]] = {}
+     root_path = os.path.abspath(root_path)
+     root_len = len(root_path.rstrip(os.sep)) + 1
+
+     for dirpath, dirnames, filenames in os.walk(root_path):
+         # 1) Skip excluded subfolders
+         dirnames[:] = [
+             d for d in dirnames if not should_skip(exclude_item=d,
+                                                    exclude_dirs=True)
+         ]
+
+         for fname in filenames:
+             # 2) Skip excluded filename patterns and types
+             if should_skip(exclude_item=fname,
+                            exclude_types=True,
+                            exclude_file_patterns=True):
+                 _logger.debug(f"Skipping file by pattern: {os.path.join(dirpath, fname)}")
+                 continue
+
+             full_path = os.path.join(dirpath, fname)
+             rel_path = full_path[root_len:]  # e.g. "subdir/logs/vid_to_aud.log"
+             ext = os.path.splitext(fname)[1].lower()
+
+             # ── 2a) If it's one of our "plain-text" extensions, read it as text right now:
+             if ext in {'.txt', '.md', '.csv', '.tsv', '.log'}:
+                 try:
+                     with open(full_path, 'r', encoding='utf-8', errors='replace') as f:
+                         text = f.read()
+                     collected[rel_path] = text
+                     _logger.info(f"Read text file: {rel_path}")
+                 except Exception as e:
+                     _logger.warning(f"Failed to read {rel_path} as text: {e}")
+                 continue
+
+             # ── 2b) Otherwise, try to load via get_df(...) (DataFrame/GeoDataFrame/etc.)
+             try:
+                 df_or_gdf = get_df(full_path)
+                 if isinstance(df_or_gdf, (pd.DataFrame, gpd.GeoDataFrame)):
+                     collected[rel_path] = df_or_gdf
+                     _logger.info(f"Loaded DataFrame: {rel_path}")
+                     continue
+                 # If get_df returned a dict (e.g. multi-sheet ODS), merge as multiple entries
+                 if isinstance(df_or_gdf, dict):
+                     for sheet_name, df in df_or_gdf.items():
+                         key = f"{rel_path}::[{sheet_name}]"
+                         collected[key] = df
+                         _logger.info(f"Loaded sheet DataFrame: {key}")
+                     continue
+                 # If get_df returned something else (list, non-DataFrame), fall through to text
+             except Exception as e:
+                 _logger.debug(f"get_df failed for {rel_path}: {e}")
+
+             # ── 2c) Lastly, if it wasn't a "pure text" file and get_df didn't return a DataFrame,
+             #        treat it as text via read_file_as_text(...) so you get at least something:
+             try:
+                 text = read_file_as_text(full_path)
+                 # read_file_as_text returns a List[str], but here we're in a single-file context,
+                 # so just join on "\n\n" or take the first element. For simplicity:
+                 combined = "\n\n".join(text)
+                 collected[rel_path] = combined
+                 _logger.info(f"Read fallback text for: {rel_path}")
+             except Exception as e:
+                 _logger.warning(f"Could not read {rel_path} as text or DataFrame: {e}")
+
+     return collected
+
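A sketch of read_directory() in use; the root path, exclusion set, and import path are placeholders:

from abstract_utilities.file_utils import read_directory  # assumed re-export path

contents = read_directory('/path/to/data', exclude_dirs={'.git'})  # placeholder root
for rel_path, value in contents.items():
    kind = 'DataFrame' if hasattr(value, 'to_csv') else 'text'
    print(f"{rel_path}: {kind}")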