abstract-utilities 0.2.2.513__py3-none-any.whl → 0.2.2.583__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. abstract_utilities/class_utils/caller_utils.py +18 -0
  2. abstract_utilities/class_utils/global_utils.py +3 -2
  3. abstract_utilities/class_utils/imports/imports.py +1 -1
  4. abstract_utilities/directory_utils/__init__.py +2 -4
  5. abstract_utilities/directory_utils/imports/__init__.py +2 -0
  6. abstract_utilities/directory_utils/imports/imports.py +1 -0
  7. abstract_utilities/directory_utils/imports/module_imports.py +2 -0
  8. abstract_utilities/directory_utils/src/__init__.py +4 -0
  9. abstract_utilities/directory_utils/src/directory_utils.py +108 -0
  10. abstract_utilities/directory_utils/src/name_utils.py +43 -0
  11. abstract_utilities/directory_utils/src/size_utils.py +57 -0
  12. abstract_utilities/directory_utils/src/utils.py +116 -0
  13. abstract_utilities/file_utils/imports/constants.py +81 -7
  14. abstract_utilities/file_utils/imports/imports.py +0 -4
  15. abstract_utilities/file_utils/imports/module_imports.py +1 -1
  16. abstract_utilities/file_utils/src/__init__.py +2 -4
  17. abstract_utilities/file_utils/src/file_filters/__init__.py +4 -0
  18. abstract_utilities/file_utils/src/file_filters/ensure_utils.py +116 -0
  19. abstract_utilities/file_utils/src/file_filters/filter_params.py +86 -0
  20. abstract_utilities/file_utils/src/file_filters/filter_utils.py +78 -0
  21. abstract_utilities/file_utils/src/file_filters/predicate_utils.py +114 -0
  22. abstract_utilities/file_utils/src/file_filters.py +114 -47
  23. abstract_utilities/file_utils/src/file_reader.py +0 -64
  24. abstract_utilities/file_utils/src/file_utils.py +7 -130
  25. abstract_utilities/file_utils/src/filter_params.py +128 -86
  26. abstract_utilities/file_utils/src/find_collect.py +85 -165
  27. abstract_utilities/file_utils/src/find_content.py +210 -0
  28. abstract_utilities/file_utils/src/initFunctionsGen.py +3 -9
  29. abstract_utilities/file_utils/src/reader_utils/__init__.py +4 -0
  30. abstract_utilities/file_utils/src/reader_utils/directory_reader.py +53 -0
  31. abstract_utilities/file_utils/src/reader_utils/file_reader.py +543 -0
  32. abstract_utilities/file_utils/src/reader_utils/file_readers.py +376 -0
  33. abstract_utilities/file_utils/src/reader_utils/imports.py +18 -0
  34. abstract_utilities/file_utils/src/reader_utils/pdf_utils.py +300 -0
  35. abstract_utilities/file_utils (2)/__init__.py +2 -0
  36. abstract_utilities/file_utils (2)/imports/__init__.py +2 -0
  37. abstract_utilities/file_utils (2)/imports/constants.py +118 -0
  38. abstract_utilities/file_utils (2)/imports/imports/__init__.py +3 -0
  39. abstract_utilities/file_utils (2)/imports/imports/constants.py +119 -0
  40. abstract_utilities/file_utils (2)/imports/imports/imports.py +46 -0
  41. abstract_utilities/file_utils (2)/imports/imports/module_imports.py +8 -0
  42. abstract_utilities/file_utils (2)/imports/utils/__init__.py +3 -0
  43. abstract_utilities/file_utils (2)/imports/utils/classes.py +379 -0
  44. abstract_utilities/file_utils (2)/imports/utils/clean_imps.py +155 -0
  45. abstract_utilities/file_utils (2)/imports/utils/filter_utils.py +341 -0
  46. abstract_utilities/file_utils (2)/src/__init__.py +8 -0
  47. abstract_utilities/file_utils (2)/src/file_filters.py +155 -0
  48. abstract_utilities/file_utils (2)/src/file_reader.py +604 -0
  49. abstract_utilities/file_utils (2)/src/find_collect.py +258 -0
  50. abstract_utilities/file_utils (2)/src/initFunctionsGen.py +286 -0
  51. abstract_utilities/file_utils (2)/src/map_utils.py +28 -0
  52. abstract_utilities/file_utils (2)/src/pdf_utils.py +300 -0
  53. abstract_utilities/import_utils/circular_import_finder.py +222 -0
  54. abstract_utilities/import_utils/circular_import_finder2.py +118 -0
  55. abstract_utilities/import_utils/imports/module_imports.py +3 -1
  56. abstract_utilities/import_utils/src/clean_imports.py +156 -25
  57. abstract_utilities/import_utils/src/dot_utils.py +11 -0
  58. abstract_utilities/import_utils/src/extract_utils.py +4 -0
  59. abstract_utilities/import_utils/src/import_functions.py +47 -2
  60. abstract_utilities/import_utils/src/pkg_utils.py +58 -4
  61. abstract_utilities/import_utils/src/sysroot_utils.py +56 -1
  62. abstract_utilities/log_utils/log_file.py +3 -2
  63. abstract_utilities/path_utils/path_utils.py +25 -23
  64. abstract_utilities/safe_utils/safe_utils.py +30 -0
  65. {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.583.dist-info}/METADATA +1 -1
  66. {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.583.dist-info}/RECORD +68 -28
  67. {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.583.dist-info}/WHEEL +0 -0
  68. {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.583.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,86 @@
1
+ from .predicate_utils import *
2
+ def _get_default_modular(value, default, add=False, typ=set):
3
+ """Merge user and default values intelligently."""
4
+ if value == None:
5
+ value = add
6
+ if value in [True]:
7
+ return default
8
+ if value is False:
9
+ return value
10
+ if add:
11
+ return combine_params(value,default,typ=None)
12
+
13
+ return typ(value)
14
+
15
+ # -------------------------
16
+ # Default derivation logic
17
+ # -------------------------
18
+ def _get_default_modular(value, default, add=None, typ=set):
19
+ """Merge user and default values intelligently."""
20
+ add = add or False
21
+ if value == None:
22
+ value = add
23
+ if value in [True]:
24
+ return default
25
+ if value is False:
26
+ return value
27
+ if add:
28
+ return combine_params(value,default,typ=None)
29
+ return typ(value)
30
def derive_all_defaults(**kwargs):
    """Resolve every key of ``DEFAULT_CANONICAL_MAP`` to a concrete value.

    Keyword arguments are first canonicalised with
    ``get_safe_canonical_kwargs``. Each map entry is read as a dict with a
    ``"default"`` and a ``"type"`` item.

    * Keys that also appear in ``DEFAULT_ALLOWED_EXCLUDE_MAP`` are normalised
      (``ensure_exts`` for ``*exts``, ``ensure_patterns`` for ``*patterns``,
      ``normalize_listlike`` otherwise) and then resolved against the default
      with ``_get_default_modular``.
    * Any other key takes the user value, or the default when the user value
      is ``None``, coerced with ``make_list`` / ``bool`` when the declared
      type is ``list`` / ``bool``.

    Returns:
        dict mapping each canonical key to its resolved value.
    """
    kwargs = get_safe_canonical_kwargs(**kwargs)
    add = kwargs.get("add", False)
    nu_defaults = {}
    for key, values in DEFAULT_CANONICAL_MAP.items():
        default = values.get("default")
        typ = values.get("type")
        key_value = kwargs.get(key)
        if key in DEFAULT_ALLOWED_EXCLUDE_MAP:
            # These branches must be mutually exclusive: with a plain `if`
            # on the patterns test, its `else` overwrote the ensure_exts()
            # result for every '*exts' key.
            if key.endswith('exts'):
                input_value = ensure_exts(key_value)
            elif key.endswith('patterns'):
                input_value = ensure_patterns(key_value)
            else:
                input_value = normalize_listlike(key_value, typ)
            nu_defaults[key] = _get_default_modular(input_value, default, add, typ)
        else:
            value = default if key_value is None else key_value
            if typ == list:
                value = make_list(value)
            elif typ == bool:
                value = bool(value)
            nu_defaults[key] = value

    return nu_defaults
56
+ # -------------------------
57
+ # Default derivation logic
58
+ # -------------------------
59
def derive_file_defaults(**kwargs):
    """Resolve the allow/exclude filter keys to concrete values.

    The keyword arguments are first passed through ``derive_all_defaults``;
    the result is then reduced to the keys of ``DEFAULT_ALLOWED_EXCLUDE_MAP``.
    ``add`` is read from that result and falls back to ``True`` when absent.

    Returns:
        dict mapping each key of ``DEFAULT_ALLOWED_EXCLUDE_MAP`` to the value
        produced by ``_get_default_modular``.
    """
    kwargs = derive_all_defaults(**kwargs)
    add = kwargs.get("add", True)
    nu_defaults = {}
    for key, values in DEFAULT_ALLOWED_EXCLUDE_MAP.items():
        default = values.get("default")
        typ = values.get("type")
        key_value = kwargs.get(key)
        # Mutually exclusive normalisers: previously the `else` of the
        # patterns test overwrote the ensure_exts() result for '*exts' keys.
        if key.endswith('exts'):
            input_value = ensure_exts(key_value)
        elif key.endswith('patterns'):
            input_value = ensure_patterns(key_value)
        else:
            input_value = normalize_listlike(key_value, typ)
        nu_defaults[key] = _get_default_modular(input_value, default, add, typ)
    return nu_defaults
75
+
76
def define_defaults(**kwargs):
    """Build a ``ScanConfig`` from the values of ``derive_file_defaults(**kwargs)``."""
    return ScanConfig(**derive_file_defaults(**kwargs))
79
+
80
def get_file_filters(*args, **kwargs):
    """Gather everything a directory scan needs from loose arguments.

    Returns:
        A 5-tuple ``(directories, cfg, allowed, include_files, recursive)``:
        the result of ``ensure_directories``, the ``ScanConfig`` from
        ``define_defaults``, the ``allowed`` keyword if truthy (otherwise a
        predicate built from the config), and the ``include_files`` /
        ``recursive`` keywords, both defaulting to ``True``.
    """
    roots = ensure_directories(*args, **kwargs)
    scan_cfg = define_defaults(**kwargs)
    predicate = kwargs.get("allowed")
    if not predicate:
        predicate = make_allowed_predicate(scan_cfg)
    return (
        roots,
        scan_cfg,
        predicate,
        kwargs.get('include_files', True),
        kwargs.get('recursive', True),
    )
@@ -0,0 +1,78 @@
1
+ from ...imports import *
2
+ import re
3
def combine_params(*values, typ=None):
    """Merge several collections into a single container.

    ``None`` entries are skipped. When ``typ`` is not given, the container
    type is taken from the first non-``None`` value.

    Set-like types (``set``, ``frozenset``) are merged by union and
    sequence types (``list``, ``tuple``) by concatenation. Previously only
    ``set`` and ``list`` were merged, so tuples and frozensets silently
    collapsed to an empty container. For any other type an empty ``typ()``
    is still returned.

    Returns:
        The merged container, or ``None`` when every value is ``None``.
    """
    merged = None
    for value in values:
        if value is None:
            continue
        typ = typ or type(value)
        if merged is None:
            merged = typ()
        if typ in (set, frozenset):
            merged = merged | typ(value)
        elif typ in (list, tuple):
            merged = merged + typ(value)
    return merged
16
def get_safe_kwargs(canonical_map, **kwargs):
    """Map arbitrary keyword names onto the canonical keys of a map.

    Args:
        canonical_map: Mapping of canonical key -> iterable of aliases
            (or a falsy value for "no aliases"). Falls back to
            ``CANONICAL_MAP`` when falsy.
        **kwargs: User keywords. Names are lower-cased and ``None`` values
            are dropped before matching.

    Returns:
        dict containing every canonical key. Keys the caller did not supply
        (directly or through an alias) map to ``None``. When both a
        canonical key and one of its aliases are given, list/set values are
        unioned into a list; any other value from the alias wins.
    """
    canonical_map = canonical_map or CANONICAL_MAP
    norm_kwargs = {k.lower(): v for k, v in kwargs.items() if v is not None}

    # Inverse lookup: alias -> canonical key. Aliases are lower-cased because
    # the incoming keyword names are; otherwise a mixed-case alias could
    # never match.
    alias_lookup = {
        (alias.lower() if isinstance(alias, str) else alias): canon
        for canon, aliases in canonical_map.items()
        if aliases
        for alias in aliases
    }

    # Preserve correctly named keys
    safe_kwargs = {k: v for k, v in norm_kwargs.items() if k in canonical_map}

    for k, v in norm_kwargs.items():
        if k not in alias_lookup:
            continue
        canonical_key = alias_lookup[k]
        prev = safe_kwargs.get(canonical_key)
        if prev is not None and isinstance(prev, (set, list)) and isinstance(v, (set, list)):
            # both spellings supplied with collections: keep the union
            safe_kwargs[canonical_key] = list(set(prev) | set(v))
        else:
            # first value for this key, or a non-collection: alias value wins
            safe_kwargs[canonical_key] = v

    # fill defaults if missing
    for canon in canonical_map:
        safe_kwargs.setdefault(canon, None)

    return safe_kwargs
50
+
51
def create_canonical_map(*args, canonical_map=None):
    """Return a canonical map restricted to the requested keys.

    Args:
        *args: Canonical key names; falsy entries are ignored.
        canonical_map: Source map; falls back to ``CANONICAL_MAP`` when
            falsy.

    Returns:
        The whole source map when no keys are requested, otherwise a new
        dict ``{key: source.get(key)}`` for each requested key (unknown
        keys map to ``None``).
    """
    # Resolve the source first so that an explicitly supplied map is honoured
    # even when no keys are requested (it used to be ignored in that case).
    source = canonical_map or CANONICAL_MAP
    keys = [arg for arg in args if arg]
    if not keys:
        return source
    return {key: source.get(key) for key in keys}
58
def get_safe_canonical_kwargs(*args, canonical_map=None, **kwargs):
    """Canonicalise ``kwargs`` against a map limited to the keys in ``args``.

    A truthy ``canonical_map`` is used as-is; otherwise one is built with
    ``create_canonical_map(*args)``. The actual normalisation is delegated
    to ``get_safe_kwargs``.
    """
    if not canonical_map:
        canonical_map = create_canonical_map(*args)
    return get_safe_kwargs(canonical_map=canonical_map, **kwargs)
62
def get_dir_filter_kwargs(**kwargs):
    """Canonicalise ``kwargs`` against the single ``"directories"`` key."""
    return get_safe_kwargs(
        canonical_map=create_canonical_map("directories"),
        **kwargs,
    )
65
def get_file_filter_kwargs(**kwargs):
    """
    Normalize arbitrary keyword arguments for file scanning configuration.

    Only the eight allow/exclude keys (exts, types, dirs, patterns) are
    considered; alias resolution is delegated to
    ``get_safe_canonical_kwargs``, so which spellings are accepted depends
    on the canonical map it consults.
    """
    return get_safe_canonical_kwargs(
        "allowed_exts",
        "exclude_exts",
        "allowed_types",
        "exclude_types",
        "allowed_dirs",
        "exclude_dirs",
        "allowed_patterns",
        "exclude_patterns",
        **kwargs,
    )
@@ -0,0 +1,114 @@
1
+ from .ensure_utils import *
2
def get_allowed_predicate(allowed=None):
    """Turn the ``allowed`` argument into a callable path predicate.

    Args:
        allowed: ``False`` -> a predicate that accepts everything;
            a truthy callable -> returned unchanged;
            ``None`` / ``True`` -> a predicate built from the default
            scan configuration.

    Returns:
        A callable taking a path and returning a bool.
    """
    # Equality (not identity) is kept so that 0 still means "accept all".
    if allowed == False:  # noqa: E712
        def accept_all(*args):
            return True
        return accept_all
    if allowed == True:  # noqa: E712
        allowed = None
    if allowed:
        return allowed
    # make_allowed_predicate() requires a config; calling it with no
    # argument raised TypeError. Imported lazily because filter_params itself
    # star-imports this module.
    from .filter_params import define_defaults
    return make_allowed_predicate(define_defaults())
12
def get_globs(items, recursive: bool = True, allowed=None):
    """List the paths found beneath each root in ``items``.

    Args:
        items: One root or several (passed through ``make_list``); falsy
            entries are skipped.
        recursive: Descend into every sub-directory when true; list only
            the direct children when false.
        allowed: Optional predicate; when truthy only paths it accepts are
            kept.

    Returns:
        A flat list of matching paths, in root order.
    """
    # Without recursive=True glob treats "**" like "*", so "**/*" would only
    # match entries exactly two levels down; use "*" for the flat case.
    wildcard = "**/*" if recursive else "*"
    glob_paths = []
    for item in make_list(items):
        if not item:
            continue
        matches = glob.glob(os.path.join(item, wildcard), recursive=recursive)
        if allowed:
            matches = [match for match in matches if match and allowed(match)]
        glob_paths += matches
    return glob_paths
22
def get_allowed_files(items, allowed=True):
    """Keep the entries of ``items`` that are existing files accepted by the predicate.

    ``allowed`` is resolved through ``get_allowed_predicate`` first.
    """
    predicate = get_allowed_predicate(allowed=allowed)
    kept = []
    for candidate in items:
        if not candidate:
            continue
        if os.path.isfile(candidate) and predicate(candidate):
            kept.append(candidate)
    return kept
25
def get_allowed_dirs(items, allowed=False):
    """Keep the entries of ``items`` that are existing directories accepted by the predicate.

    ``allowed`` is resolved through ``get_allowed_predicate`` first.
    """
    predicate = get_allowed_predicate(allowed=allowed)
    kept = []
    for candidate in items:
        if not candidate:
            continue
        if os.path.isdir(candidate) and predicate(candidate):
            kept.append(candidate)
    return kept
28
+
29
def get_filtered_files(items, allowed=None, files=None):
    """Glob ``items`` and return the accepted files not already in ``files``.

    Args:
        items: Root path(s) handed to ``get_globs``.
        allowed: Resolved through ``get_allowed_predicate``.
        files: Paths to leave out of the result. Defaults to nothing
            (``None`` replaces the former shared mutable ``[]`` default).

    Returns:
        List of file paths in glob order.
    """
    allowed = get_allowed_predicate(allowed=allowed)
    # One set build instead of a list scan per candidate.
    known = set(files) if files else set()
    return [
        glob_path
        for glob_path in get_globs(items)
        if glob_path
        and os.path.isfile(glob_path)
        and glob_path not in known
        and allowed(glob_path)
    ]
33
def get_filtered_dirs(items, allowed=None, dirs=None):
    """Glob ``items`` and return the accepted directories not already in ``dirs``.

    Args:
        items: Root path(s) handed to ``get_globs``.
        allowed: Resolved through ``get_allowed_predicate``.
        dirs: Paths to leave out of the result. Defaults to nothing
            (``None`` replaces the former shared mutable ``[]`` default).

    Returns:
        List of directory paths in glob order.
    """
    allowed = get_allowed_predicate(allowed=allowed)
    # One set build instead of a list scan per candidate.
    known = set(dirs) if dirs else set()
    return [
        glob_path
        for glob_path in get_globs(items)
        if glob_path
        and os.path.isdir(glob_path)
        and glob_path not in known
        and allowed(glob_path)
    ]
37
+
38
def get_all_allowed_files(items, allowed=None):
    """Collect files listed in ``items`` plus files under their sub-directories.

    The directory discovery (``get_all_allowed_dirs``) and the initial file
    pick (``get_allowed_files``) run with their own default predicates;
    ``allowed`` is only forwarded to ``get_filtered_files``.
    """
    sub_dirs = get_all_allowed_dirs(items)
    collected = get_allowed_files(items)
    for sub_dir in sub_dirs:
        # pass the running list so already-collected paths are not repeated
        collected.extend(
            get_filtered_files(sub_dir, allowed=allowed, files=collected)
        )
    return collected
45
def get_all_allowed_dirs(items, allowed=None):
    """Collect the accepted directories found beneath the directories in ``items``.

    The roots are selected with ``get_allowed_dirs`` (default predicate);
    the resolved ``allowed`` predicate is applied to what is found below
    them. The roots themselves are not part of the result.
    """
    predicate = get_allowed_predicate(allowed=allowed)
    found = []
    for root in get_allowed_dirs(items):
        # pass the running list so already-found paths are not repeated
        found.extend(get_filtered_dirs(root, allowed=predicate, dirs=found))
    return found
52
+
53
def make_allowed_predicate(cfg: ScanConfig) -> Callable[[str], bool]:
    """
    Create a path filter bound to ``cfg``.

    The returned callable takes a path string (or a ready ``Path`` via
    ``p=``) and returns ``False`` as soon as one of the directory, pattern,
    extension or type rules of ``cfg`` rejects it, ``True`` otherwise.
    Matching is done on lower-cased names and paths.
    """
    def allowed(path: str=None,p=None) -> bool:
        p = p or Path(path)
        name = p.name.lower()
        path_str = str(p).lower()

        # A) directory filters: reject when an excluded entry occurs anywhere
        # in the path, or when the path is a directory whose name matches it.
        if cfg.exclude_dirs:
            lowered_excludes = (entry.lower() for entry in cfg.exclude_dirs)
            if any(
                entry in path_str or (fnmatch.fnmatch(name, entry) and p.is_dir())
                for entry in lowered_excludes
            ):
                return False

        # the path must contain at least one allowed directory entry
        if cfg.allowed_dirs and cfg.allowed_dirs != ["*"]:
            inside_allowed = False
            for entry in cfg.allowed_dirs:
                if fnmatch.fnmatch(path_str, f"*{entry.lower()}*"):
                    inside_allowed = True
                    break
            if not inside_allowed:
                return False

        # B) pattern filters, applied to the final path component
        if cfg.allowed_patterns and cfg.allowed_patterns != ["*"]:
            name_matches = False
            for pattern in cfg.allowed_patterns:
                if fnmatch.fnmatch(name, pattern.lower()):
                    name_matches = True
                    break
            if not name_matches:
                return False

        if cfg.exclude_patterns:
            if any(fnmatch.fnmatch(name, pattern.lower()) for pattern in cfg.exclude_patterns):
                return False

        # C) extension filters: only evaluated for existing files
        if p.is_file():
            ext = p.suffix.lower()
            not_whitelisted = cfg.allowed_exts and ext not in cfg.allowed_exts
            if not_whitelisted or (cfg.exclude_exts and ext in cfg.exclude_exts):
                return False

        # D) type filters (optional): substring tests against the full path
        if cfg.allowed_types and cfg.allowed_types != {"*"}:
            if all(kind not in path_str for kind in cfg.allowed_types):
                return False
        if cfg.exclude_types and cfg.exclude_types != {"*"}:
            for kind in cfg.exclude_types:
                if kind in path_str:
                    return False

        return True

    return allowed
@@ -1,53 +1,120 @@
1
- from ..imports import *
2
1
  from .filter_params import *
3
- from .file_utils import *
2
+ from ..imports import *
4
3
  ##from abstract_utilities import make_list,get_media_exts, is_media_type
4
def get_allowed_predicate(allowed=None):
    """Turn the ``allowed`` argument into a callable path predicate.

    Args:
        allowed: ``False`` -> a predicate that accepts everything;
            a truthy callable -> returned unchanged;
            ``None`` / ``True`` -> a predicate built from the default
            scan configuration.

    Returns:
        A callable taking a path and returning a bool.
    """
    # Equality (not identity) is kept so that 0 still means "accept all".
    if allowed == False:  # noqa: E712
        def accept_all(*args):
            return True
        return accept_all
    if allowed == True:  # noqa: E712
        allowed = None
    if allowed:
        return allowed
    # make_allowed_predicate() requires a config; calling it with no
    # argument raised TypeError.
    # NOTE(review): assumes define_defaults is provided by this module's
    # star imports (.filter_params / ..imports) - confirm.
    return make_allowed_predicate(define_defaults())
14
def get_globs(items, recursive: bool = True, allowed=None):
    """List the paths found beneath each root in ``items``.

    Args:
        items: One root or several (passed through ``make_list``); falsy
            entries are skipped.
        recursive: Descend into every sub-directory when true; list only
            the direct children when false.
        allowed: Optional predicate; when truthy only paths it accepts are
            kept.

    Returns:
        A flat list of matching paths, in root order.
    """
    # Without recursive=True glob treats "**" like "*", so "**/*" would only
    # match entries exactly two levels down; use "*" for the flat case.
    wildcard = "**/*" if recursive else "*"
    glob_paths = []
    for item in make_list(items):
        if not item:
            continue
        matches = glob.glob(os.path.join(item, wildcard), recursive=recursive)
        if allowed:
            matches = [match for match in matches if match and allowed(match)]
        glob_paths += matches
    return glob_paths
24
def get_allowed_files(items, allowed=True):
    """Keep the entries of ``items`` that are existing files accepted by the predicate.

    ``allowed`` is resolved through ``get_allowed_predicate`` first.
    """
    predicate = get_allowed_predicate(allowed=allowed)
    kept = []
    for candidate in items:
        if not candidate:
            continue
        if os.path.isfile(candidate) and predicate(candidate):
            kept.append(candidate)
    return kept
27
def get_allowed_dirs(items, allowed=False):
    """Keep the entries of ``items`` that are existing directories accepted by the predicate.

    ``allowed`` is resolved through ``get_allowed_predicate`` first.
    """
    predicate = get_allowed_predicate(allowed=allowed)
    kept = []
    for candidate in items:
        if not candidate:
            continue
        if os.path.isdir(candidate) and predicate(candidate):
            kept.append(candidate)
    return kept
30
+
31
def get_filtered_files(items, allowed=None, files=None):
    """Glob ``items`` and return the accepted files not already in ``files``.

    Args:
        items: Root path(s) handed to ``get_globs``.
        allowed: Resolved through ``get_allowed_predicate``.
        files: Paths to leave out of the result. Defaults to nothing
            (``None`` replaces the former shared mutable ``[]`` default).

    Returns:
        List of file paths in glob order.
    """
    allowed = get_allowed_predicate(allowed=allowed)
    # One set build instead of a list scan per candidate.
    known = set(files) if files else set()
    return [
        glob_path
        for glob_path in get_globs(items)
        if glob_path
        and os.path.isfile(glob_path)
        and glob_path not in known
        and allowed(glob_path)
    ]
35
def get_filtered_dirs(items, allowed=None, dirs=None):
    """Glob ``items`` and return the accepted directories not already in ``dirs``.

    Args:
        items: Root path(s) handed to ``get_globs``.
        allowed: Resolved through ``get_allowed_predicate``.
        dirs: Paths to leave out of the result. Defaults to nothing
            (``None`` replaces the former shared mutable ``[]`` default).

    Returns:
        List of directory paths in glob order.
    """
    allowed = get_allowed_predicate(allowed=allowed)
    # One set build instead of a list scan per candidate.
    known = set(dirs) if dirs else set()
    return [
        glob_path
        for glob_path in get_globs(items)
        if glob_path
        and os.path.isdir(glob_path)
        and glob_path not in known
        and allowed(glob_path)
    ]
39
+
40
def get_all_allowed_files(items, allowed=None):
    """Collect files listed in ``items`` plus files under their sub-directories.

    The directory discovery (``get_all_allowed_dirs``) and the initial file
    pick (``get_allowed_files``) run with their own default predicates;
    ``allowed`` is only forwarded to ``get_filtered_files``.
    """
    sub_dirs = get_all_allowed_dirs(items)
    collected = get_allowed_files(items)
    for sub_dir in sub_dirs:
        # pass the running list so already-collected paths are not repeated
        collected.extend(
            get_filtered_files(sub_dir, allowed=allowed, files=collected)
        )
    return collected
47
def get_all_allowed_dirs(items, allowed=None):
    """Collect the accepted directories found beneath the directories in ``items``.

    The roots are selected with ``get_allowed_dirs`` (default predicate);
    the resolved ``allowed`` predicate is applied to what is found below
    them. The roots themselves are not part of the result.
    """
    predicate = get_allowed_predicate(allowed=allowed)
    found = []
    for root in get_allowed_dirs(items):
        # pass the running list so already-found paths are not repeated
        found.extend(get_filtered_dirs(root, allowed=predicate, dirs=found))
    return found
54
+
55
def make_allowed_predicate(cfg: ScanConfig) -> Callable[[str], bool]:
    """
    Create a path filter bound to ``cfg``.

    The returned callable takes a path string (or a ready ``Path`` via
    ``p=``) and returns ``False`` as soon as one of the directory, pattern,
    extension or type rules of ``cfg`` rejects it, ``True`` otherwise.
    Matching is done on lower-cased names and paths.
    """
    def allowed(path: str=None,p=None) -> bool:
        p = p or Path(path)
        name = p.name.lower()
        path_str = str(p).lower()

        # A) directory filters: reject when an excluded entry occurs anywhere
        # in the path, or when the path is a directory whose name matches it.
        if cfg.exclude_dirs:
            lowered_excludes = (entry.lower() for entry in cfg.exclude_dirs)
            if any(
                entry in path_str or (fnmatch.fnmatch(name, entry) and p.is_dir())
                for entry in lowered_excludes
            ):
                return False

        # the path must contain at least one allowed directory entry
        if cfg.allowed_dirs and cfg.allowed_dirs != ["*"]:
            inside_allowed = False
            for entry in cfg.allowed_dirs:
                if fnmatch.fnmatch(path_str, f"*{entry.lower()}*"):
                    inside_allowed = True
                    break
            if not inside_allowed:
                return False

        # B) pattern filters, applied to the final path component
        if cfg.allowed_patterns and cfg.allowed_patterns != ["*"]:
            name_matches = False
            for pattern in cfg.allowed_patterns:
                if fnmatch.fnmatch(name, pattern.lower()):
                    name_matches = True
                    break
            if not name_matches:
                return False

        if cfg.exclude_patterns:
            if any(fnmatch.fnmatch(name, pattern.lower()) for pattern in cfg.exclude_patterns):
                return False

        # C) extension filters: only evaluated for existing files
        if p.is_file():
            ext = p.suffix.lower()
            not_whitelisted = cfg.allowed_exts and ext not in cfg.allowed_exts
            if not_whitelisted or (cfg.exclude_exts and ext in cfg.exclude_exts):
                return False

        # D) type filters (optional): substring tests against the full path
        if cfg.allowed_types and cfg.allowed_types != {"*"}:
            if all(kind not in path_str for kind in cfg.allowed_types):
                return False
        if cfg.exclude_types and cfg.exclude_types != {"*"}:
            for kind in cfg.exclude_types:
                if kind in path_str:
                    return False

        return True

    return allowed
5
117
 
6
- def collect_filepaths(
7
- directory: List[str],
8
- cfg: ScanConfig=None,
9
- allowed_exts: Optional[Set[str]] = False,
10
- unallowed_exts: Optional[Set[str]] = False,
11
- allowed_types: Optional[Set[str]] = False,
12
- exclude_types: Optional[Set[str]] = False,
13
- allowed_dirs: Optional[List[str]] = False,
14
- exclude_dirs: Optional[List[str]] = False,
15
- allowed_patterns: Optional[List[str]] = False,
16
- exclude_patterns: Optional[List[str]] = False,
17
- add=False,
18
- allowed: Optional[Callable[[str], bool]] = None,
19
- **kwargs
20
- ) -> List[str]:
21
- cfg = cfg or define_defaults(
22
- allowed_exts = allowed_exts,
23
- unallowed_exts = unallowed_exts,
24
- allowed_types = allowed_types,
25
- exclude_types = exclude_types,
26
- allowed_dirs = allowed_dirs,
27
- exclude_dirs = exclude_dirs,
28
- allowed_patterns = allowed_patterns,
29
- exclude_patterns = exclude_patterns,
30
- add = add
31
- )
32
- allowed = allowed or make_allowed_predicate(cfg)
33
- directories = make_list(directory)
34
- roots = [r for r in directories if r]
35
-
36
- # your existing helpers (get_dirs, get_globs, etc.) stay the same
37
- original_dirs = get_allowed_dirs(roots, allowed=allowed)
38
- original_globs = get_globs(original_dirs)
39
- files = get_allowed_files(original_globs, allowed=allowed)
40
-
41
- for d in get_filtered_dirs(original_dirs, allowed=allowed):
42
- files += get_filtered_files(d, allowed=allowed, files=files)
43
-
44
- # de-dupe while preserving order
45
- seen, out = set(), []
46
- for f in files:
47
- if f not in seen:
48
- seen.add(f)
49
- out.append(f)
50
- return out
51
118
 
52
119
 
53
120
  def _fast_walk(
@@ -1,9 +1,7 @@
1
1
  # file_reader.py
2
2
  from ..imports import *
3
3
  # -------- Public API drop-ins that mirror your originals --------
4
- from .filter_params import *
5
4
  from .file_filters import *
6
- from .file_utils import *
7
5
  from .pdf_utils import *
8
6
  # ---------------------------------------------------------------------------
9
7
  # NOTE: The following helper functions must be provided elsewhere:
@@ -238,68 +236,6 @@ def read_shape_file(path: str) -> Union[gpd.GeoDataFrame, None]:
238
236
  return None
239
237
 
240
238
 
241
- def collect_filepaths(
242
- inputs: Union[str, List[str]],
243
- exclude_dirs: set[str] = None,
244
- exclude_file_patterns: set[str] = None,
245
- exclude_types: set[str] = None
246
- ) -> List[str]:
247
- """
248
- Given a path or list of paths, return a list of all file paths under them.
249
- - If an input is a file, it's included (unless it matches an exclude pattern).
250
- - If an input is a directory, walk it recursively:
251
- • Skip any subdirectory named in `exclude_dirs`
252
- • Skip any file whose name matches one of `exclude_file_patterns`
253
- """
254
- re_initialize_skip_mgr(exclude_types=exclude_types,
255
- exclude_file_patterns=exclude_file_patterns,
256
- exclude_dirs=exclude_dirs)
257
-
258
-
259
- # Normalize to list
260
- if isinstance(inputs, str):
261
- paths_to_scan = [inputs]
262
- else:
263
- paths_to_scan = list(inputs)
264
-
265
- all_files: List[str] = []
266
-
267
-
268
-
269
- def _collect_from_dir(dirpath: str):
270
- for dirpath_root, dirnames, filenames in os.walk(dirpath):
271
- # Remove any excluded subdirectories from os.walk
272
- dirnames[:] = [d for d in dirnames if d not in exclude_dirs]
273
-
274
- for fname in filenames:
275
- if should_skip(exclude_item=fname,
276
- exclude_types=True,
277
- exclude_file_patterns=True):
278
- continue
279
- full = os.path.join(dirpath_root, fname)
280
- all_files.append(full)
281
-
282
- for p in paths_to_scan:
283
- if not os.path.exists(p):
284
- # skip nonexistent paths
285
- continue
286
-
287
- if os.path.isfile(p):
288
- basename = os.path.basename(p)
289
- fname = os.path.splitext(basename)
290
- if not should_skip(exclude_item=fname,
291
- exclude_types=True,
292
- exclude_file_patterns=True):
293
- all_files.append(p)
294
- else:
295
- # p is a directory
296
- _collect_from_dir(p)
297
-
298
- return all_files
299
-
300
- # requirements:
301
- # pip install pdfplumber pdf2image pytesseract pillow
302
- # # plus Tesseract binary (apt install tesseract-ocr or brew install tesseract)
303
239
 
304
240
 
305
241
 
@@ -52,38 +52,13 @@ def get_all_allowed_dirs(items,allowed=None):
52
52
  for directory in dirs:
53
53
  nu_dirs += get_filtered_dirs(directory,allowed=allowed,dirs=nu_dirs)
54
54
  return nu_dirs
55
- def get_files_and_dirs(
56
- directory: str,
57
- cfg: Optional["ScanConfig"] = None,
58
- allowed_exts: Optional[Set[str]] = False,
59
- unallowed_exts: Optional[Set[str]] = False,
60
- allowed_types: Optional[Set[str]] = False,
61
- exclude_types: Optional[Set[str]] = False,
62
- allowed_dirs: Optional[List[str]] = False,
63
- exclude_dirs: Optional[List[str]] = False,
64
- allowed_patterns: Optional[List[str]] = False,
65
- exclude_patterns: Optional[List[str]] = False,
66
- add = False,
67
- recursive: bool = True,
68
- include_files: bool = True,
69
- **kwargs
70
- ):
71
- cfg = cfg or define_defaults(
72
- allowed_exts = allowed_exts,
73
- unallowed_exts = unallowed_exts,
74
- allowed_types = allowed_types,
75
- exclude_types = exclude_types,
76
- allowed_dirs = allowed_dirs,
77
- exclude_dirs = exclude_dirs,
78
- allowed_patterns = allowed_patterns,
79
- exclude_patterns = exclude_patterns,
80
- add=add
81
- )
82
- allowed = make_allowed_predicate(cfg)
55
+ def get_files_and_dirs(*args,**kwargs)-> List[str]:
56
+ directories,cfg,allowed,include_files,recursive = get_file_filters(*args,**kwargs)
83
57
  items=[]
84
58
  files =[]
85
59
  if recursive:
86
- items = get_globs(directory,recursive=recursive,allowed=allowed)
60
+ for directory in directories:
61
+ items += get_globs(directories,recursive=recursive,allowed=allowed)
87
62
  else:
88
63
  directories = make_list(directory)
89
64
  for directory in directories:
@@ -92,68 +67,7 @@ def get_files_and_dirs(
92
67
  if include_files:
93
68
  files = get_allowed_files(items,allowed=allowed)
94
69
  return dirs,files
95
- def make_allowed_predicate(cfg: ScanConfig) -> Callable[[str], bool]:
96
- """
97
- Build a predicate that returns True if a given path is considered allowed
98
- under the given ScanConfig. Applies allowed_* and exclude_* logic symmetrically.
99
- """
100
- def allowed(path: str=None,p=None) -> bool:
101
- p = p or Path(path)
102
- name = p.name.lower()
103
- path_str = str(p).lower()
104
-
105
- # --------------------
106
- # A) directory filters
107
- # --------------------
108
- if cfg.exclude_dirs:
109
- for dpat in cfg.exclude_dirs:
110
- dpat_l = dpat.lower()
111
- if dpat_l in path_str or fnmatch.fnmatch(name, dpat_l):
112
- if p.is_dir() or dpat_l in path_str:
113
- return False
114
-
115
- if cfg.allowed_dirs and cfg.allowed_dirs != ["*"]:
116
- # must be in at least one allowed dir
117
- if not any(
118
- fnmatch.fnmatch(path_str, f"*{dpat.lower()}*") for dpat in cfg.allowed_dirs
119
- ):
120
- return False
121
70
 
122
- # --------------------
123
- # B) pattern filters
124
- # --------------------
125
- if cfg.allowed_patterns and cfg.allowed_patterns != ["*"]:
126
- if not any(fnmatch.fnmatch(name, pat.lower()) for pat in cfg.allowed_patterns):
127
- return False
128
-
129
- if cfg.exclude_patterns:
130
- for pat in cfg.exclude_patterns:
131
- if fnmatch.fnmatch(name, pat.lower()):
132
- return False
133
-
134
- # --------------------
135
- # C) extension filters
136
- # --------------------
137
- if p.is_file():
138
- ext = p.suffix.lower()
139
- if cfg.allowed_exts and ext not in cfg.allowed_exts:
140
- return False
141
- if cfg.unallowed_exts and ext in cfg.unallowed_exts:
142
- return False
143
-
144
- # --------------------
145
- # D) type filters (optional)
146
- # --------------------
147
- if cfg.allowed_types and cfg.allowed_types != {"*"}:
148
- if not any(t in path_str for t in cfg.allowed_types):
149
- return False
150
- if cfg.exclude_types and cfg.exclude_types != {"*"}:
151
- if any(t in path_str for t in cfg.exclude_types):
152
- return False
153
-
154
- return True
155
-
156
- return allowed
157
71
  def correct_kwargs(**kwargs):
158
72
  for key,values in kwargs.items():
159
73
  if key.startswith('excluded'):
@@ -163,47 +77,10 @@ def correct_kwargs(**kwargs):
163
77
  kwargs[correct_key]=combine_params(correct_vals,values)
164
78
  del kwargs[key]
165
79
 
166
- def collect_filepaths(
167
- directory: List[str],
168
- cfg: ScanConfig=None,
169
- allowed_exts: Optional[Set[str]] = False,
170
- unallowed_exts: Optional[Set[str]] = False,
171
- allowed_types: Optional[Set[str]] = False,
172
- exclude_types: Optional[Set[str]] = False,
173
- allowed_dirs: Optional[List[str]] = False,
174
- exclude_dirs: Optional[List[str]] = False,
175
- allowed_patterns: Optional[List[str]] = False,
176
- exclude_patterns: Optional[List[str]] = False,
177
- add=False,
178
- allowed: Optional[Callable[[str], bool]] = None,
179
- **kwargs
180
- ) -> List[str]:
181
- kwargs = correct_kwargs(allowed_exts = allowed_exts,
182
- unallowed_exts = unallowed_exts,
183
- allowed_types = allowed_types,
184
- exclude_types = exclude_types,
185
- allowed_dirs = allowed_dirs,
186
- exclude_dirs = exclude_dirs,
187
- allowed_patterns = allowed_patterns,
188
- exclude_patterns = exclude_patterns,
189
- **kwargs)
190
- cfg = cfg or define_defaults(
191
- allowed_exts = allowed_exts,
192
- unallowed_exts = unallowed_exts,
193
- allowed_types = allowed_types,
194
- exclude_types = exclude_types,
195
- allowed_dirs = allowed_dirs,
196
- exclude_dirs = exclude_dirs,
197
- allowed_patterns = allowed_patterns,
198
- exclude_patterns = exclude_patterns,
199
- add = add
200
- )
201
- allowed = allowed or make_allowed_predicate(cfg)
202
- directories = make_list(directory)
203
- roots = [r for r in directories if r]
204
-
80
+ def collect_filepaths(*args,**kwargs)-> List[str]:
81
+ directories,cfg,allowed,include_files,recursive = get_file_filters(*args,**kwargs)
205
82
  # your existing helpers (get_dirs, get_globs, etc.) stay the same
206
- original_dirs = get_allowed_dirs(roots, allowed=allowed)
83
+ original_dirs = get_allowed_dirs(directories, allowed=allowed)
207
84
  original_globs = get_globs(original_dirs)
208
85
  files = get_allowed_files(original_globs, allowed=allowed)
209
86