abstract-utilities 0.2.2.540__py3-none-any.whl → 0.2.2.667__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


This version of abstract-utilities might be problematic.

Files changed (66)
  1. abstract_utilities/__init__.py +13 -4
  2. abstract_utilities/class_utils/abstract_classes.py +104 -34
  3. abstract_utilities/class_utils/caller_utils.py +57 -0
  4. abstract_utilities/class_utils/global_utils.py +35 -20
  5. abstract_utilities/class_utils/imports/imports.py +1 -1
  6. abstract_utilities/directory_utils/src/directory_utils.py +19 -1
  7. abstract_utilities/file_utils/imports/classes.py +59 -55
  8. abstract_utilities/file_utils/imports/imports.py +0 -4
  9. abstract_utilities/file_utils/imports/module_imports.py +1 -1
  10. abstract_utilities/file_utils/src/__init__.py +2 -3
  11. abstract_utilities/file_utils/src/file_filters/__init__.py +1 -0
  12. abstract_utilities/file_utils/src/file_filters/ensure_utils.py +490 -0
  13. abstract_utilities/file_utils/src/file_filters/filter_params.py +150 -0
  14. abstract_utilities/file_utils/src/file_filters/filter_utils.py +78 -0
  15. abstract_utilities/file_utils/src/file_filters/predicate_utils.py +44 -0
  16. abstract_utilities/file_utils/src/file_reader.py +0 -1
  17. abstract_utilities/file_utils/src/find_collect.py +10 -86
  18. abstract_utilities/file_utils/src/find_content.py +210 -0
  19. abstract_utilities/file_utils/src/initFunctionsGen.py +36 -23
  20. abstract_utilities/file_utils/src/initFunctionsGens.py +280 -0
  21. abstract_utilities/file_utils/src/reader_utils/__init__.py +4 -0
  22. abstract_utilities/file_utils/src/reader_utils/directory_reader.py +53 -0
  23. abstract_utilities/file_utils/src/reader_utils/file_reader.py +543 -0
  24. abstract_utilities/file_utils/src/reader_utils/file_readers.py +376 -0
  25. abstract_utilities/file_utils/src/reader_utils/imports.py +18 -0
  26. abstract_utilities/file_utils/src/reader_utils/pdf_utils.py +300 -0
  27. abstract_utilities/import_utils/circular_import_finder.py +222 -0
  28. abstract_utilities/import_utils/circular_import_finder2.py +118 -0
  29. abstract_utilities/import_utils/imports/__init__.py +1 -1
  30. abstract_utilities/import_utils/imports/init_imports.py +3 -0
  31. abstract_utilities/import_utils/imports/module_imports.py +4 -1
  32. abstract_utilities/import_utils/imports/utils.py +1 -1
  33. abstract_utilities/import_utils/src/__init__.py +1 -0
  34. abstract_utilities/import_utils/src/clean_imports.py +156 -25
  35. abstract_utilities/import_utils/src/dot_utils.py +11 -0
  36. abstract_utilities/import_utils/src/extract_utils.py +4 -0
  37. abstract_utilities/import_utils/src/import_functions.py +66 -2
  38. abstract_utilities/import_utils/src/import_utils.py +39 -0
  39. abstract_utilities/import_utils/src/layze_import_utils/__init__.py +2 -0
  40. abstract_utilities/import_utils/src/layze_import_utils/lazy_utils.py +41 -0
  41. abstract_utilities/import_utils/src/layze_import_utils/nullProxy.py +32 -0
  42. abstract_utilities/import_utils/src/nullProxy.py +30 -0
  43. abstract_utilities/import_utils/src/pkg_utils.py +58 -4
  44. abstract_utilities/import_utils/src/sysroot_utils.py +56 -1
  45. abstract_utilities/imports.py +3 -2
  46. abstract_utilities/json_utils/json_utils.py +11 -3
  47. abstract_utilities/log_utils/log_file.py +73 -24
  48. abstract_utilities/parse_utils/parse_utils.py +23 -0
  49. abstract_utilities/path_utils/imports/module_imports.py +1 -1
  50. abstract_utilities/path_utils/path_utils.py +32 -35
  51. abstract_utilities/read_write_utils/imports/imports.py +1 -1
  52. abstract_utilities/read_write_utils/read_write_utils.py +102 -32
  53. abstract_utilities/safe_utils/safe_utils.py +30 -0
  54. abstract_utilities/type_utils/__init__.py +5 -1
  55. abstract_utilities/type_utils/get_type.py +116 -0
  56. abstract_utilities/type_utils/imports/__init__.py +1 -0
  57. abstract_utilities/type_utils/imports/constants.py +134 -0
  58. abstract_utilities/type_utils/imports/module_imports.py +25 -1
  59. abstract_utilities/type_utils/is_type.py +455 -0
  60. abstract_utilities/type_utils/make_type.py +126 -0
  61. abstract_utilities/type_utils/mime_types.py +68 -0
  62. abstract_utilities/type_utils/type_utils.py +0 -877
  63. {abstract_utilities-0.2.2.540.dist-info → abstract_utilities-0.2.2.667.dist-info}/METADATA +1 -1
  64. {abstract_utilities-0.2.2.540.dist-info → abstract_utilities-0.2.2.667.dist-info}/RECORD +66 -41
  65. {abstract_utilities-0.2.2.540.dist-info → abstract_utilities-0.2.2.667.dist-info}/WHEEL +0 -0
  66. {abstract_utilities-0.2.2.540.dist-info → abstract_utilities-0.2.2.667.dist-info}/top_level.txt +0 -0
abstract_utilities/file_utils/src/reader_utils/file_reader.py
@@ -0,0 +1,543 @@
+ # file_reader.py
+ from .imports import *
+ # -------- Public API drop-ins that mirror your originals --------
+
+ from .pdf_utils import *
+ # ---------------------------------------------------------------------------
+ # NOTE: The following helper functions must be provided elsewhere:
+ #   - convert_date_string(s: str) -> datetime
+ #   - read_from_file(path: str) -> pd.DataFrame
+ # ---------------------------------------------------------------------------
+ def _should_skip_dir(dir_name: str, exclude_dirs: set[str]) -> bool:
+     """
+     Return True if dir_name matches one of the excluded directory names exactly.
+     """
+     return dir_name in exclude_dirs
+
+
+ def _should_skip_file(filename: str, exclude_patterns: set[str]) -> bool:
+     """
+     Return True if filename matches any pattern in exclude_patterns.
+     Uses fnmatch (Unix-style wildcard matching).
+     """
+     for pat in exclude_patterns:
+         if fnmatch.fnmatch(filename, pat):
+             return True
+     return False
+
+ def _should_skip_type(filename: str, exclude_types: set[str]) -> bool:
+     """
+     Return True if filename's media type matches one of exclude_types.
+     Delegates to is_media_type().
+     """
+     return is_media_type(filename, media_types=exclude_types)
+ class shoudSkipManager(metaclass=SingletonMeta):
+     def __init__(self, exclude_types=None, exclude_file_patterns=None, exclude_dirs=None):
+         if not getattr(self, 'initialized', False):
+             self.initialized = True
+             exclude_types = exclude_types or set()
+             exclude_file_patterns = exclude_file_patterns or set()
+             exclude_dirs = exclude_dirs or set()
+             self.exclude_dirs = exclude_dirs.copy()
+             self.exclude_file_patterns = exclude_file_patterns.copy()
+             self.exclude_types = exclude_types.copy()
+     def should_skip(self, exclude_item=None, exclude_types=None, exclude_file_patterns=None, exclude_dirs=None):
+         # No flags given: run every check against exclude_item.
+         if (exclude_dirs is None and exclude_file_patterns is None and exclude_types is None) and exclude_item:
+             if isinstance(exclude_item, str):
+                 if _should_skip_dir(dir_name=exclude_item, exclude_dirs=self.exclude_dirs):
+                     return True
+                 if _should_skip_type(filename=exclude_item, exclude_types=self.exclude_types):
+                     return True
+                 if _should_skip_file(filename=exclude_item, exclude_patterns=self.exclude_file_patterns):
+                     return True
+             return False
+         elif exclude_types or exclude_file_patterns:
+             if exclude_file_patterns is not False:
+                 if _should_skip_file(filename=exclude_item,
+                                      exclude_patterns=self.exclude_file_patterns):
+                     return True
+             if exclude_types is not False:
+                 if _should_skip_type(filename=exclude_item,
+                                      exclude_types=self.exclude_types):
+                     return True
+
+         if exclude_dirs:
+             if _should_skip_dir(dir_name=exclude_item, exclude_dirs=self.exclude_dirs):
+                 return True
+         return False
+
+ SKIP_MGR = shoudSkipManager()
+ def should_skip(
+     exclude_item=None,
+     exclude_types=None,
+     exclude_file_patterns=None,
+     exclude_dirs=None
+ ):
+     return shoudSkipManager().should_skip(
+         exclude_item=exclude_item,
+         exclude_types=exclude_types,
+         exclude_file_patterns=exclude_file_patterns,
+         exclude_dirs=exclude_dirs
+     )
+ def re_initialize_skip_mgr(exclude_types=None,
+                            exclude_file_patterns=None,
+                            exclude_dirs=None):
+     shoudSkipManager().initialized = False
+     shoudSkipManager(
+         exclude_types=exclude_types,
+         exclude_file_patterns=exclude_file_patterns,
+         exclude_dirs=exclude_dirs
+     )
+
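shoudSkipManager relies on a SingletonMeta metaclass pulled in via `.imports`, which this diff does not show. A minimal sketch of a metaclass consistent with how the class is used here (one cached instance, with `__init__` re-run on later calls, which is what makes the `initialized` guard and `re_initialize_skip_mgr()` meaningful) could look like this; it is an assumption, not the package's actual implementation:

    class SingletonMeta(type):
        """One shared instance per class; later constructor calls reuse it."""
        _instances = {}

        def __call__(cls, *args, **kwargs):
            if cls not in cls._instances:
                cls._instances[cls] = super().__call__(*args, **kwargs)
            else:
                # Re-run __init__ on the cached instance; combined with the
                # `initialized` flag, this lets re_initialize_skip_mgr()
                # rebuild the exclude sets in place.
                cls._instances[cls].__init__(*args, **kwargs)
            return cls._instances[cls]

With a metaclass like that, should_skip("node_modules", exclude_dirs=True) would consult the singleton's exclude_dirs set without constructing a new manager.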
+ def convert_date_string(s):
+     # … your existing stub or real implementation …
+     try:
+         return datetime.fromisoformat(s)
+     except ValueError:
+         return None
+ # file_utils.py (below your existing imports)
+
+
+
+
+ def source_engine_for_ext(ext: str) -> str:
+     ext = ext.lower()
+     mapping = {
+         '.parquet': 'pyarrow',
+         '.txt': 'python',
+         '.csv': 'python',
+         '.tsv': 'python',
+         '.xlsx': 'openpyxl',
+         '.xls': 'xlrd',
+         '.xlsb': 'pyxlsb',
+         '.ods': 'odf',
+         '.geojson': 'GeoJSON',
+     }
+     return mapping.get(ext)
+
+ def is_valid_file_path(path: str) -> Union[str, None]:
+     if not (isinstance(path, str) and path.strip()):
+         return None
+     if os.path.isfile(path):
+         return os.path.splitext(path)[1].lower()
+     return None
+
+ def is_dataframe(obj) -> bool:
+     return isinstance(obj, (pd.DataFrame, gpd.GeoDataFrame))
+
+ def create_dataframe(data=None, columns=None) -> pd.DataFrame:
+     # … unchanged …
+     if is_dataframe(data):
+         return data.copy()
+     data = data or {}
+     if isinstance(data, dict):
+         data = [data]
+     if columns is None:
+         all_keys = set()
+         for row in data:
+             if isinstance(row, dict):
+                 all_keys.update(row.keys())
+         columns = list(all_keys)
+     if columns is False:
+         columns = None
+     try:
+         return pd.DataFrame(data, columns=columns)
+     except Exception as e:
+         _logger.error(f"Failed to create DataFrame: {e}")
+         return pd.DataFrame([], columns=columns)
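A short illustration of the column inference above, with made-up values, once create_dataframe is importable from this module:

    rows = [{"a": 1, "b": 2}, {"b": 3, "c": 4}]
    df = create_dataframe(rows)                   # columns become the union of keys: a, b, c
    one_row = create_dataframe({"a": 1})          # a single dict is wrapped into a one-row frame
    raw = create_dataframe(rows, columns=False)   # columns=False defers column choice to pandas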
+
+ def read_ods_file(path: str) -> dict[str, pd.DataFrame]:
+     # … unchanged …
+     if not is_valid_file_path(path):
+         _logger.error(f"File not found or invalid: {path}")
+         return {}
+     try:
+         doc = ezodf.opendoc(path)
+     except Exception as e:
+         _logger.error(f"Failed to open ODS document: {e}")
+         return {}
+     sheets: dict[str, pd.DataFrame] = {}
+     for sheet in doc.sheets:
+         table_rows = []
+         for row in sheet.rows():
+             row_data = []
+             for cell in row:
+                 if cell.value_type == 'date':
+                     row_data.append(convert_date_string(str(cell.value)))
+                 else:
+                     row_data.append(cell.value)
+             table_rows.append(row_data)
+         df = pd.DataFrame(table_rows)
+         sheets[sheet.name] = df
+         _logger.info(f"Processed sheet: {sheet.name}")
+     return sheets
+
+ def read_ods_as_excel(path: str, xlsx_path: str | None = None) -> pd.DataFrame:
+     # … unchanged …
+     if not is_valid_file_path(path):
+         _logger.error(f"File not found or invalid: {path}")
+         return pd.DataFrame()
+     if xlsx_path is None:
+         tmp_dir = tempfile.mkdtemp()
+         xlsx_path = os.path.join(tmp_dir, os.path.basename(path) + '.xlsx')
+         cleanup_temp = True
+     else:
+         cleanup_temp = False
+     try:
+         # You must implement ods_to_xlsx(...) externally
+         ods_to_xlsx(path, xlsx_path)
+     except Exception as e:
+         _logger.error(f"ODS→XLSX conversion failed: {e}")
+         if cleanup_temp:
+             shutil.rmtree(tmp_dir)
+         return pd.DataFrame()
+     try:
+         df = pd.read_excel(xlsx_path, engine='openpyxl')
+     except Exception as e:
+         _logger.error(f"Failed to read converted XLSX: {e}")
+         df = pd.DataFrame()
+     finally:
+         if cleanup_temp:
+             shutil.rmtree(tmp_dir)
+     return df
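read_ods_as_excel delegates to an ods_to_xlsx(...) helper that must be supplied externally. One possible implementation, offered here as a sketch rather than the package's own code, shells out to a headless LibreOffice (`soffice` must be on PATH):

    import os
    import shutil
    import subprocess

    def ods_to_xlsx(ods_path: str, xlsx_path: str) -> None:
        # Convert the ODS with LibreOffice in headless mode.
        out_dir = os.path.dirname(xlsx_path) or "."
        subprocess.run(
            ["soffice", "--headless", "--convert-to", "xlsx", "--outdir", out_dir, ods_path],
            check=True,
        )
        # LibreOffice names the output after the input file; move it if the
        # caller asked for a different target name.
        produced = os.path.join(
            out_dir, os.path.splitext(os.path.basename(ods_path))[0] + ".xlsx"
        )
        if produced != xlsx_path:
            shutil.move(produced, xlsx_path)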
+
+ def filter_df(
+     df: pd.DataFrame,
+     nrows: int | None = None,
+     condition: pd.Series | None = None,
+     indices: list[int] | None = None
+ ) -> pd.DataFrame:
+     if nrows is not None:
+         df = df.head(nrows)
+     if condition is not None:
+         df = df[condition]
+     if indices is not None:
+         df = df.iloc[indices]
+     return df
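filter_df applies its filters in the order written: head(nrows), then the boolean mask, then iloc. A small example with illustrative data:

    import pandas as pd

    df = pd.DataFrame({"x": range(10), "y": range(10, 20)})
    first_five = filter_df(df, nrows=5)              # head(5)
    large_x = filter_df(df, condition=df["x"] > 6)   # boolean-mask filter
    picked = filter_df(df, indices=[0, 2, 4])        # positional selection via iloc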
+
+ def read_shape_file(path: str) -> Union[gpd.GeoDataFrame, None]:
+     # … unchanged …
+     ext = is_valid_file_path(path)
+     if not ext:
+         _logger.error(f"Shape file not found: {path}")
+         return None
+     ext = ext.lower()
+     try:
+         if ext in ('.shp', '.cpg', '.dbf', '.shx'):
+             return gpd.read_file(path)
+         if ext == '.geojson':
+             return gpd.read_file(path, driver='GeoJSON')
+         if ext == '.prj':
+             return read_from_file(path)  # Must return GeoDataFrame
+     except Exception as e:
+         _logger.error(f"Failed to read spatial data ({path}): {e}")
+         return None
+     _logger.error(f"Unsupported spatial extension: {ext}")
+     return None
+
+
+
+
+
+ def pdf_to_text(path, keep_page_breaks=True, ocr_if_empty=True):
+     """
+     Return the full text of *path* (str or Path) as a single string.
+
+     keep_page_breaks → insert "\f" between pages so you can split later.
+     ocr_if_empty     → any page with no text layer is rasterised & OCR'd.
+     """
+     path = Path(path)
+     if not path.exists():
+         raise FileNotFoundError(path)
+
+     all_pages = []
+
+     with pdfplumber.open(path) as pdf:
+         for i, page in enumerate(pdf.pages, start=1):
+             text = page.extract_text() or ""  # might be None
+             if (not text.strip()) and ocr_if_empty:
+                 # rasterise at 300 dpi then Tesseract
+                 img = convert_from_path(str(path), dpi=300, first_page=i, last_page=i)[0]
+                 text = pytesseract.image_to_string(img, lang="eng")
+             all_pages.append(text)
+
+     sep = "\f" if keep_page_breaks else "\n"
+     return sep.join(all_pages)
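Because keep_page_breaks joins pages with a form feed, callers can recover per-page text by splitting on "\f". A usage sketch (the path is hypothetical):

    text = pdf_to_text("report.pdf", keep_page_breaks=True)
    pages = text.split("\f")
    print(f"{len(pages)} pages; first page starts with {pages[0][:80]!r}")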
+ def get_df(
+     source: Union[
+         str,
+         pd.DataFrame,
+         gpd.GeoDataFrame,
+         dict,
+         list,
+         FileStorage
+     ],
+     nrows: int | None = None,
+     skiprows: list[int] | int | None = None,
+     condition: pd.Series | None = None,
+     indices: list[int] | None = None
+ ) -> Union[pd.DataFrame, gpd.GeoDataFrame, dict[str, Union[pd.DataFrame, str]], None]:
+     """
+     Load a DataFrame or GeoDataFrame from various sources, then apply optional filters.
+     If `source` is a directory, returns read_directory(source) instead (a dict).
+     """
+
+     # ─── Check for directory first ─────────────────────────────────────────────
+     if isinstance(source, str) and os.path.isdir(source):
+         return read_directory(root_path=source)
+
+     # ─── If already a DataFrame/GeoDataFrame, just filter and return ───────────
+     if is_dataframe(source):
+         _logger.info("Source is already a DataFrame/GeoDataFrame; applying filters.")
+         return filter_df(source, nrows=nrows, condition=condition, indices=indices)
+
+     if source is None:
+         _logger.error("No source provided to get_df().")
+         return None
+
+     # ─── Next: If source is a file path, read according to extension ───────────
+     if isinstance(source, str) and os.path.isfile(source):
+         ext = os.path.splitext(source)[1].lower()
+         try:
+             _logger.info(f"Loading file {source} with extension '{ext}'.")
+             if ext in ('.csv', '.tsv', '.txt'):
+                 sep = {'.csv': ',', '.tsv': '\t', '.txt': None}.get(ext)
+                 df = pd.read_csv(source, skiprows=skiprows, sep=sep, nrows=nrows)
+             elif ext in ('.ods', '.xlsx', '.xls', '.xlsb'):
+                 engine = source_engine_for_ext(ext)
+                 if ext == '.ods':
+                     df = read_ods_as_excel(source)
+                 else:
+                     df = pd.read_excel(source, skiprows=skiprows, engine=engine, nrows=nrows)
+             elif ext == '.json':
+                 df = safe_read_from_json(source)
+                 return df
+             elif ext == '.parquet':
+                 df = pd.read_parquet(source)
+             elif ext in ('.shp', '.cpg', '.dbf', '.shx', '.geojson', '.prj'):
+                 return read_shape_file(source)
+             elif ext == '.pdf':
+                 df = pdf_to_text(source)
+                 return df
+             else:
+                 df = read_from_file(source)
+                 return df
+
+             if not isinstance(df, (dict, list, FileStorage)):
+                 return filter_df(df, nrows=nrows, condition=condition, indices=indices)
+             source = df  # pass on to next block if needed
+
+         except Exception as e:
+             _logger.error(f"Failed to read '{source}': {e}")
+             return None
+
+     # ─── If source is FileStorage (uploaded) ───────────────────────────────────
+     if isinstance(source, FileStorage):
+         try:
+             filename = secure_filename(source.filename or "uploaded.xlsx")
+             _logger.info(f"Reading uploaded file: {filename}")
+             df = pd.read_excel(source.stream, nrows=nrows)
+             return filter_df(df, nrows=nrows, condition=condition, indices=indices)
+         except Exception as e:
+             _logger.error(f"Failed to read FileStorage: {e}")
+             return None
+
+     # ─── If source is dict or list, turn into DataFrame ────────────────────────
+     if isinstance(source, (dict, list)):
+         _logger.info("Creating DataFrame from in-memory data structure.")
+         df = pd.DataFrame(source)
+         return filter_df(df, nrows=nrows, condition=condition, indices=indices)
+
+     _logger.error(f"Unsupported source type: {type(source)}")
+     return None
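A dispatch summary for get_df, with hypothetical paths: tabular files come back as (Geo)DataFrames, directories come back as a dict keyed by relative path, and in-memory structures are wrapped and filtered.

    df = get_df("data/products.csv", nrows=100)         # CSV -> DataFrame (first 100 rows)
    gdf = get_df("maps/parcels.geojson")                # spatial file -> GeoDataFrame
    tree = get_df("exports/")                           # directory -> dict of {rel_path: frame or text}
    rows = get_df([{"id": 1}, {"id": 2}], indices=[0])  # in-memory list -> filtered DataFrame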
+
+
+ def read_file_as_text(paths: Union[str, List[str]]) -> List[str]:
+     """
+     Given one path or a list of paths, return a list of textual representations
+     for each “file” found. If a given path is:
+
+     1) A directory → we call read_directory(...) on it (which skips node_modules,
+        __pycache__, *.ini, etc.) and iterate over each (relative_path → content).
+     2) A plain-text file (extension ∈ SUPPORTED_TEXT_EXTENSIONS) → we open it and return its raw text.
+     3) Anything else (e.g. .xlsx, .ods, .parquet, .shp, etc.) → we delegate to get_df(...) and then
+        convert whatever get_df(...) gives us into CSV or “to_string()” as appropriate.
+
+     Returns:
+         A list of strings—each string is the “file’s contents” for one actual file.
+         (Ordering is “filesystem walk order” for directories, and “in order of the input list” for files.)
+
+     Raises:
+         FileNotFoundError if any path in `paths` does not exist.
+         ValueError if a file cannot be parsed/read.
+     """
+     # Ensure we have a list to iterate
+     if isinstance(paths, str):
+         files_to_process = [paths]
+     else:
+         files_to_process = list(paths)
+
+     all_data: List[str] = []
+
+     for full_path in files_to_process:
+         if not os.path.exists(full_path):
+             raise FileNotFoundError(f"Not a valid path: {full_path!r}")
+
+         # ── If this is a directory, walk it via read_directory(...) ─────────────────
+         if os.path.isdir(full_path):
+             # read_directory returns a dict: { relative_path: (DataFrame or text) }
+             nested_dict: Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame, str]] = read_directory(full_path)
+
+             for rel, content in nested_dict.items():
+                 # `content` is either a DataFrame, GeoDataFrame, or a plain-text string
+                 if isinstance(content, (pd.DataFrame, gpd.GeoDataFrame)):
+                     # If GeoDataFrame, convert geometry column to WKT before CSV
+                     if isinstance(content, gpd.GeoDataFrame):
+                         gdf = content.copy()
+                         gdf["geometry"] = gdf["geometry"].apply(lambda g: g.wkt if g is not None else "")
+                         all_data.append(gdf.to_csv(index=False))
+                     else:
+                         all_data.append(content.to_csv(index=False))
+                 else:
+                     # Already a text blob
+                     all_data.append(content)
+
+             continue  # move on to the next item in files_to_process
+
+         # ── At this point, full_path is guaranteed to be a file ───────────────────────
+         ext = os.path.splitext(full_path)[1].lower()
+
+         # 1) PURE TEXT EXTENSION? (mirrors the plain-text set used in read_directory)
+         if ext in {'.txt', '.md', '.csv', '.tsv', '.log'}:
+             try:
+                 with open(full_path, "r", encoding="utf-8", errors="replace") as f:
+                     raw = f.read()
+                 all_data.append(raw)
+             except Exception as e:
+                 raise ValueError(f"Error reading text file {full_path!r}: {e}")
+
+             continue
+
+         # 2) ANY OTHER FILETYPE → delegate to get_df(...) and convert result to text
+         try:
+             df_or = get_df(full_path)
+         except Exception as e:
+             raise ValueError(f"get_df() failed for {full_path!r}: {e}")
+
+         # 2a) If get_df returned a dict (e.g. an ODS with multiple sheets, or a directory)
+         if isinstance(df_or, dict):
+             # Join each sheet or sub-file’s DataFrame into one big text block
+             for key, value in df_or.items():
+                 if isinstance(value, (pd.DataFrame, gpd.GeoDataFrame)):
+                     if isinstance(value, gpd.GeoDataFrame):
+                         gdf = value.copy()
+                         gdf["geometry"] = gdf["geometry"].apply(lambda g: g.wkt if g is not None else "")
+                         block = f"=== {key} ===\n" + gdf.to_csv(index=False)
+                     else:
+                         block = f"=== {key} ===\n" + value.to_csv(index=False)
+                 else:
+                     # It was already plain-text under that key
+                     block = f"=== {key} ===\n" + str(value)
+                 all_data.append(block)
+
+             continue
+
+         # 2b) If get_df returned a DataFrame or GeoDataFrame directly
+         if isinstance(df_or, (pd.DataFrame, gpd.GeoDataFrame)):
+             if isinstance(df_or, gpd.GeoDataFrame):
+                 gdf = df_or.copy()
+                 gdf["geometry"] = gdf["geometry"].apply(lambda g: g.wkt if g is not None else "")
+                 all_data.append(gdf.to_csv(index=False))
+             else:
+                 all_data.append(df_or.to_csv(index=False))
+
+             continue
+
+         # 2c) If get_df returned a list of dicts (rare, but possible)
+         if isinstance(df_or, list):
+             try:
+                 temp_df = pd.DataFrame(df_or)
+                 all_data.append(temp_df.to_csv(index=False))
+             except Exception:
+                 all_data.append(repr(df_or))
+             continue
+
+         # 2d) Otherwise, fall back to repr()
+         all_data.append(repr(df_or))
+
+     return all_data
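A usage sketch for read_file_as_text (hypothetical paths); every entry in the returned list is one file rendered as plain text or CSV:

    blocks = read_file_as_text(["notes.md", "ledger.xlsx", "shapes/"])
    for blob in blocks:
        print(blob[:200])   # preview each file's textual form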
+ def read_directory(
+     root_path: str,
+     exclude_dirs: set[str] = None,
+     exclude_file_patterns: set[str] = None,
+     exclude_types: set[str] = None,
+ ) -> Dict[str, Union[pd.DataFrame, str]]:
+     re_initialize_skip_mgr(exclude_types=exclude_types,
+                            exclude_file_patterns=exclude_file_patterns,
+                            exclude_dirs=exclude_dirs)
+     if not os.path.isdir(root_path):
+         raise FileNotFoundError(f"Not a valid directory: {root_path!r}")
+     collected: Dict[str, Union[pd.DataFrame, str]] = {}
+     root_path = os.path.abspath(root_path)
+     root_len = len(root_path.rstrip(os.sep)) + 1
+
+     for dirpath, dirnames, filenames in os.walk(root_path):
+         # 1) Skip excluded subfolders
+         dirnames[:] = [
+             d for d in dirnames if not should_skip(exclude_item=d,
+                                                    exclude_dirs=True)
+         ]
+
+         for fname in filenames:
+             # 2) Skip excluded filename patterns and media types
+             if should_skip(exclude_item=fname, exclude_types=True, exclude_file_patterns=True):
+                 _logger.debug(f"Skipping file by pattern: {os.path.join(dirpath, fname)}")
+                 continue
+
+             full_path = os.path.join(dirpath, fname)
+             rel_path = full_path[root_len:]  # e.g. "subdir/logs/vid_to_aud.log"
+             ext = os.path.splitext(fname)[1].lower()
+
+             # ── 2a) If it's one of our “plain-text” extensions, read it as text right now:
+             if ext in {'.txt', '.md', '.csv', '.tsv', '.log'}:
+                 try:
+                     with open(full_path, 'r', encoding='utf-8', errors='replace') as f:
+                         text = f.read()
+                     collected[rel_path] = text
+                     _logger.info(f"Read text file: {rel_path}")
+                 except Exception as e:
+                     _logger.warning(f"Failed to read {rel_path} as text: {e}")
+                 continue
+
+             # ── 2b) Otherwise, try to load via get_df(...) (DataFrame/GeoDataFrame/etc.)
+             try:
+                 df_or_gdf = get_df(full_path)
+                 if isinstance(df_or_gdf, (pd.DataFrame, gpd.GeoDataFrame)):
+                     collected[rel_path] = df_or_gdf
+                     _logger.info(f"Loaded DataFrame: {rel_path}")
+                     continue
+                 # If get_df returned a dict (e.g. multi-sheet ODS), merge as multiple entries
+                 if isinstance(df_or_gdf, dict):
+                     for sheet_name, df in df_or_gdf.items():
+                         key = f"{rel_path}::[{sheet_name}]"
+                         collected[key] = df
+                         _logger.info(f"Loaded sheet DataFrame: {key}")
+                     continue
+                 # If get_df returned something else (list, non-DataFrame), fall through to text
+             except Exception as e:
+                 _logger.debug(f"get_df failed for {rel_path}: {e}")
+
+             # ── 2c) Lastly, if it wasn’t a “pure text” file and get_df didn’t return a DataFrame,
+             #        treat it as text via read_file_as_text(...) so you get at least something:
+             try:
+                 text = read_file_as_text(full_path)
+                 # read_file_as_text returns a List[str], but here we're in a single-file context,
+                 # so just join on "\n\n" or take the first element. For simplicity:
+                 combined = "\n\n".join(text)
+                 collected[rel_path] = combined
+                 _logger.info(f"Read fallback text for: {rel_path}")
+             except Exception as e:
+                 _logger.warning(f"Could not read {rel_path} as text or DataFrame: {e}")
+
+     return collected
+
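A closing usage sketch for read_directory (hypothetical root and exclusions; the values accepted by exclude_types depend on is_media_type(), which this diff does not show):

    collected = read_directory(
        "project/",
        exclude_dirs={"node_modules", "__pycache__"},
        exclude_file_patterns={"*.ini", "*.lock"},
        exclude_types={"image", "video"},   # assumption: forwarded to is_media_type()
    )
    for rel_path, content in collected.items():
        kind = "DataFrame" if hasattr(content, "to_csv") else "text"
        print(rel_path, "->", kind)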