abstract-utilities 0.2.2.513__py3-none-any.whl → 0.2.2.583__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_utilities/class_utils/caller_utils.py +18 -0
- abstract_utilities/class_utils/global_utils.py +3 -2
- abstract_utilities/class_utils/imports/imports.py +1 -1
- abstract_utilities/directory_utils/__init__.py +2 -4
- abstract_utilities/directory_utils/imports/__init__.py +2 -0
- abstract_utilities/directory_utils/imports/imports.py +1 -0
- abstract_utilities/directory_utils/imports/module_imports.py +2 -0
- abstract_utilities/directory_utils/src/__init__.py +4 -0
- abstract_utilities/directory_utils/src/directory_utils.py +108 -0
- abstract_utilities/directory_utils/src/name_utils.py +43 -0
- abstract_utilities/directory_utils/src/size_utils.py +57 -0
- abstract_utilities/directory_utils/src/utils.py +116 -0
- abstract_utilities/file_utils/imports/constants.py +81 -7
- abstract_utilities/file_utils/imports/imports.py +0 -4
- abstract_utilities/file_utils/imports/module_imports.py +1 -1
- abstract_utilities/file_utils/src/__init__.py +2 -4
- abstract_utilities/file_utils/src/file_filters/__init__.py +4 -0
- abstract_utilities/file_utils/src/file_filters/ensure_utils.py +116 -0
- abstract_utilities/file_utils/src/file_filters/filter_params.py +86 -0
- abstract_utilities/file_utils/src/file_filters/filter_utils.py +78 -0
- abstract_utilities/file_utils/src/file_filters/predicate_utils.py +114 -0
- abstract_utilities/file_utils/src/file_filters.py +114 -47
- abstract_utilities/file_utils/src/file_reader.py +0 -64
- abstract_utilities/file_utils/src/file_utils.py +7 -130
- abstract_utilities/file_utils/src/filter_params.py +128 -86
- abstract_utilities/file_utils/src/find_collect.py +85 -165
- abstract_utilities/file_utils/src/find_content.py +210 -0
- abstract_utilities/file_utils/src/initFunctionsGen.py +3 -9
- abstract_utilities/file_utils/src/reader_utils/__init__.py +4 -0
- abstract_utilities/file_utils/src/reader_utils/directory_reader.py +53 -0
- abstract_utilities/file_utils/src/reader_utils/file_reader.py +543 -0
- abstract_utilities/file_utils/src/reader_utils/file_readers.py +376 -0
- abstract_utilities/file_utils/src/reader_utils/imports.py +18 -0
- abstract_utilities/file_utils/src/reader_utils/pdf_utils.py +300 -0
- abstract_utilities/file_utils (2)/__init__.py +2 -0
- abstract_utilities/file_utils (2)/imports/__init__.py +2 -0
- abstract_utilities/file_utils (2)/imports/constants.py +118 -0
- abstract_utilities/file_utils (2)/imports/imports/__init__.py +3 -0
- abstract_utilities/file_utils (2)/imports/imports/constants.py +119 -0
- abstract_utilities/file_utils (2)/imports/imports/imports.py +46 -0
- abstract_utilities/file_utils (2)/imports/imports/module_imports.py +8 -0
- abstract_utilities/file_utils (2)/imports/utils/__init__.py +3 -0
- abstract_utilities/file_utils (2)/imports/utils/classes.py +379 -0
- abstract_utilities/file_utils (2)/imports/utils/clean_imps.py +155 -0
- abstract_utilities/file_utils (2)/imports/utils/filter_utils.py +341 -0
- abstract_utilities/file_utils (2)/src/__init__.py +8 -0
- abstract_utilities/file_utils (2)/src/file_filters.py +155 -0
- abstract_utilities/file_utils (2)/src/file_reader.py +604 -0
- abstract_utilities/file_utils (2)/src/find_collect.py +258 -0
- abstract_utilities/file_utils (2)/src/initFunctionsGen.py +286 -0
- abstract_utilities/file_utils (2)/src/map_utils.py +28 -0
- abstract_utilities/file_utils (2)/src/pdf_utils.py +300 -0
- abstract_utilities/import_utils/circular_import_finder.py +222 -0
- abstract_utilities/import_utils/circular_import_finder2.py +118 -0
- abstract_utilities/import_utils/imports/module_imports.py +3 -1
- abstract_utilities/import_utils/src/clean_imports.py +156 -25
- abstract_utilities/import_utils/src/dot_utils.py +11 -0
- abstract_utilities/import_utils/src/extract_utils.py +4 -0
- abstract_utilities/import_utils/src/import_functions.py +47 -2
- abstract_utilities/import_utils/src/pkg_utils.py +58 -4
- abstract_utilities/import_utils/src/sysroot_utils.py +56 -1
- abstract_utilities/log_utils/log_file.py +3 -2
- abstract_utilities/path_utils/path_utils.py +25 -23
- abstract_utilities/safe_utils/safe_utils.py +30 -0
- {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.583.dist-info}/METADATA +1 -1
- {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.583.dist-info}/RECORD +68 -28
- {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.583.dist-info}/WHEEL +0 -0
- {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.583.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
from .predicate_utils import *
|
|
2
|
+
def _get_default_modular(value, default, add=False, typ=set):
|
|
3
|
+
"""Merge user and default values intelligently."""
|
|
4
|
+
if value == None:
|
|
5
|
+
value = add
|
|
6
|
+
if value in [True]:
|
|
7
|
+
return default
|
|
8
|
+
if value is False:
|
|
9
|
+
return value
|
|
10
|
+
if add:
|
|
11
|
+
return combine_params(value,default,typ=None)
|
|
12
|
+
|
|
13
|
+
return typ(value)
|
|
14
|
+
|
|
15
|
+
# -------------------------
|
|
16
|
+
# Default derivation logic
|
|
17
|
+
# -------------------------
|
|
18
|
+
def _get_default_modular(value, default, add=None, typ=set):
|
|
19
|
+
"""Merge user and default values intelligently."""
|
|
20
|
+
add = add or False
|
|
21
|
+
if value == None:
|
|
22
|
+
value = add
|
|
23
|
+
if value in [True]:
|
|
24
|
+
return default
|
|
25
|
+
if value is False:
|
|
26
|
+
return value
|
|
27
|
+
if add:
|
|
28
|
+
return combine_params(value,default,typ=None)
|
|
29
|
+
return typ(value)
|
|
30
|
+
def derive_all_defaults(**kwargs):
    """Resolve every key of ``DEFAULT_CANONICAL_MAP`` to a concrete value.

    The keyword arguments are first canonicalised with
    ``get_safe_canonical_kwargs``. Keys that also appear in
    ``DEFAULT_ALLOWED_EXCLUDE_MAP`` are normalised and then combined with
    their default by ``_get_default_modular``; every other key falls back to
    its default when the caller supplied nothing.

    Returns:
        dict mapping each key of ``DEFAULT_CANONICAL_MAP`` to its value.
    """
    kwargs = get_safe_canonical_kwargs(**kwargs)
    add = kwargs.get("add", False)
    nu_defaults = {}
    for key, values in DEFAULT_CANONICAL_MAP.items():
        default = values.get("default")
        typ = values.get("type")
        key_value = kwargs.get(key)
        if key in DEFAULT_ALLOWED_EXCLUDE_MAP:
            # One normaliser per key family. This must be a single
            # if/elif/else chain: with two separate ``if`` statements the
            # final ``else`` overwrote the ensure_exts() result.
            if key.endswith('exts'):
                input_value = ensure_exts(key_value)
            elif key.endswith('patterns'):
                input_value = ensure_patterns(key_value)
            else:
                input_value = normalize_listlike(key_value, typ)
            nu_defaults[key] = _get_default_modular(input_value, default, add, typ)
        else:
            value = default if key_value is None else key_value
            if typ == list:
                value = make_list(value)
            elif typ == bool:
                value = bool(value)
            nu_defaults[key] = value

    return nu_defaults
|
|
56
|
+
# -------------------------
|
|
57
|
+
# Default derivation logic
|
|
58
|
+
# -------------------------
|
|
59
|
+
def derive_file_defaults(**kwargs):
    """Resolve only the allow/exclude filter keys to concrete values.

    The keyword arguments are first passed through ``derive_all_defaults``;
    each key of ``DEFAULT_ALLOWED_EXCLUDE_MAP`` is then normalised and
    combined with its default by ``_get_default_modular``.

    Returns:
        dict mapping each key of ``DEFAULT_ALLOWED_EXCLUDE_MAP`` to its value.
    """
    kwargs = derive_all_defaults(**kwargs)
    add = kwargs.get("add", True)
    nu_defaults = {}
    for key, values in DEFAULT_ALLOWED_EXCLUDE_MAP.items():
        default = values.get("default")
        typ = values.get("type")
        key_value = kwargs.get(key)
        # Single if/elif/else chain: previously the trailing ``else`` was
        # attached to the 'patterns' test only, so the ensure_exts() result
        # was discarded for every '*exts' key.
        if key.endswith('exts'):
            input_value = ensure_exts(key_value)
        elif key.endswith('patterns'):
            input_value = ensure_patterns(key_value)
        else:
            input_value = normalize_listlike(key_value, typ)
        nu_defaults[key] = _get_default_modular(input_value, default, add, typ)
    return nu_defaults
|
|
75
|
+
|
|
76
|
+
def define_defaults(**kwargs):
    """Return a ``ScanConfig`` built from ``derive_file_defaults(**kwargs)``."""
    return ScanConfig(**derive_file_defaults(**kwargs))
|
|
79
|
+
|
|
80
|
+
def get_file_filters(*args,**kwargs):
    """Gather everything a directory scan needs from loose arguments.

    Returns:
        Tuple ``(directories, cfg, allowed, include_files, recursive)``.
        ``allowed`` is the caller's ``allowed`` keyword when truthy,
        otherwise a predicate built from ``cfg``. ``include_files`` and
        ``recursive`` default to ``True``.
    """
    directories = ensure_directories(*args, **kwargs)
    cfg = define_defaults(**kwargs)
    allowed = kwargs.get("allowed")
    if not allowed:
        allowed = make_allowed_predicate(cfg)
    include_files = kwargs.get('include_files', True)
    recursive = kwargs.get('recursive', True)
    return directories, cfg, allowed, include_files, recursive
|
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
from ...imports import *
|
|
2
|
+
import re
|
|
3
|
+
def combine_params(*values,typ=None):
    """Merge several collections into one container of a single type.

    ``None`` entries are skipped. When ``typ`` is not given it is taken from
    the first non-``None`` value. Sets and frozensets are merged by union;
    lists and tuples are concatenated in argument order. For any other
    ``typ`` an empty ``typ()`` is returned.

    Returns:
        The merged container, or ``None`` when every value is ``None``.
    """
    merged = None
    for value in values:
        if value is None:
            continue
        typ = typ or type(value)
        if merged is None:
            merged = typ()
        if typ in (set, frozenset):
            merged = merged | typ(value)
        elif typ in (list, tuple):
            # Tuples were previously ignored, which silently dropped data.
            merged = merged + typ(value)
    return merged
|
|
16
|
+
def get_safe_kwargs(canonical_map, **kwargs):
    """Rename aliased keyword arguments to their canonical keys.

    Args:
        canonical_map: Mapping ``canonical_key -> iterable of aliases``
            (a falsy alias entry is skipped). A falsy map falls back to
            ``CANONICAL_MAP``.
        **kwargs: Arbitrary keyword arguments. Keys are lowercased and
            entries whose value is ``None`` are ignored.

    Returns:
        dict with one entry per canonical key. Keys the caller did not
        supply, directly or through an alias, map to ``None``. When a
        canonical key and one of its aliases both carry a set or list, the
        result is a list holding the union of the two.
    """
    canonical_map = canonical_map or CANONICAL_MAP
    norm_kwargs = {k.lower(): v for k, v in kwargs.items() if v is not None}

    # Inverse lookup: alias -> canonical key. Aliases are lowercased because
    # the incoming keys were lowercased above; otherwise a mixed-case alias
    # could never match.
    alias_lookup = {}
    for canon, aliases in canonical_map.items():
        for alias in aliases or ():
            if isinstance(alias, str):
                alias = alias.lower()
            alias_lookup[alias] = canon

    # Preserve correctly named keys
    safe_kwargs = {k: v for k, v in norm_kwargs.items() if k in canonical_map}

    for key, value in norm_kwargs.items():
        if key not in alias_lookup:
            continue
        canonical_key = alias_lookup[key]
        if canonical_key == key:
            # The canonical name is listed among its own aliases; the value
            # is already stored, and merging it with itself would turn a
            # set into an unordered list.
            continue
        prev = safe_kwargs.get(canonical_key)
        if prev is None:
            safe_kwargs[canonical_key] = value
        elif isinstance(prev, (set, list)) and isinstance(value, (set, list)):
            # merge when both the canonical key and an alias were supplied
            safe_kwargs[canonical_key] = list(set(prev) | set(value))
        else:
            safe_kwargs[canonical_key] = value  # overwrite for non-iterables

    # fill defaults if missing
    for canon in canonical_map:
        safe_kwargs.setdefault(canon, None)

    return safe_kwargs
|
|
50
|
+
|
|
51
|
+
def create_canonical_map(*args,canonical_map=None):
    """Return the sub-map of ``canonical_map`` for the requested keys.

    Args:
        *args: Canonical key names; falsy entries are ignored.
        canonical_map: Source mapping; a falsy value falls back to
            ``CANONICAL_MAP``.

    Returns:
        The source mapping itself when no keys are requested, otherwise a
        new dict ``{key: source.get(key)}`` (unknown keys map to ``None``).
    """
    # Resolve the source first so an explicit canonical_map is honoured even
    # when no keys are requested (it used to be ignored in that case).
    canonical_map = canonical_map or CANONICAL_MAP
    keys = [arg for arg in args if arg]
    if not keys:
        return canonical_map
    return {key: canonical_map.get(key) for key in keys}
|
|
58
|
+
def get_safe_canonical_kwargs(*args,canonical_map=None,**kwargs):
    """Canonicalise ``kwargs`` against ``canonical_map``.

    When no (truthy) ``canonical_map`` is given, one is built from the
    positional key names with ``create_canonical_map``.
    """
    if not canonical_map:
        canonical_map = create_canonical_map(*args)
    return get_safe_kwargs(canonical_map=canonical_map, **kwargs)
|
|
62
|
+
def get_dir_filter_kwargs(**kwargs):
    """Canonicalise ``kwargs`` using only the ``"directories"`` key."""
    return get_safe_kwargs(
        canonical_map=create_canonical_map("directories"),
        **kwargs,
    )
|
|
65
|
+
def get_file_filter_kwargs(**kwargs):
    """
    Normalize arbitrary keyword arguments for file scanning configuration.

    Only the eight allow/exclude keys (exts, types, dirs, patterns) are
    considered; the alias resolution itself is delegated to
    ``get_safe_canonical_kwargs``.

    Examples (as given by the original author; the alias table lives
    outside this function):
    - 'excluded_ext' or 'unallowed_exts' → 'exclude_exts'
    - 'include_dirs' or 'allow_dir' → 'allowed_dirs'
    - 'excludePattern' or 'excluded_patterns' → 'exclude_patterns'
    - 'allowed_type' or 'include_types' → 'allowed_types'
    """
    return get_safe_canonical_kwargs(
        "allowed_exts",
        "exclude_exts",
        "allowed_types",
        "exclude_types",
        "allowed_dirs",
        "exclude_dirs",
        "allowed_patterns",
        "exclude_patterns",
        **kwargs,
    )
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
from .ensure_utils import *
|
|
2
|
+
def get_allowed_predicate(allowed=None):
    """Resolve the ``allowed`` argument into a path predicate.

    Args:
        allowed: ``False`` (or anything equal to it) yields a predicate that
            accepts everything; ``True`` or any falsy value such as ``None``
            yields the default predicate; any other truthy value is returned
            unchanged and is expected to be a callable.

    Returns:
        A callable taking a path and returning a bool.
    """
    # Equality (not identity) is deliberate so that 0/1 keep behaving like
    # False/True, as they did before.
    if allowed == False:  # noqa: E712
        def accept_all(*args):
            return True
        return accept_all
    if allowed == True or not allowed:  # noqa: E712
        # make_allowed_predicate() requires a config; calling it with no
        # argument raised TypeError. Build the default config instead.
        # Imported here because filter_params itself imports this module.
        from .filter_params import define_defaults
        return make_allowed_predicate(define_defaults())
    return allowed
|
|
12
|
+
def get_globs(items,recursive: bool = True,allowed=None):
    """List the entries found under each directory in ``items``.

    Args:
        items: One directory or several; normalised with ``make_list`` and
            stripped of falsy entries.
        recursive: When true, descend into every subdirectory; otherwise
            list only the direct children of each directory.
        allowed: Optional predicate; when truthy only paths it accepts are
            kept.

    Returns:
        Flat list of matching paths, in the order ``glob`` produced them.
    """
    glob_paths = []
    for item in make_list(items):
        if not item:
            continue
        # Without recursive=True, "**" behaves like "*", so "**/*" would
        # match only second-level entries and skip the direct children.
        if recursive:
            pattern = os.path.join(item, "**/*")
        else:
            pattern = os.path.join(item, "*")
        matches = glob.glob(pattern, recursive=recursive)
        if allowed:
            matches = [match for match in matches if match and allowed(match)]
        glob_paths += matches
    return glob_paths
|
|
22
|
+
def get_allowed_files(items,allowed=True):
    """Return the entries of ``items`` that are existing files and pass the
    predicate resolved from ``allowed`` by ``get_allowed_predicate``."""
    predicate = get_allowed_predicate(allowed=allowed)
    kept = []
    for candidate in items:
        if candidate and os.path.isfile(candidate) and predicate(candidate):
            kept.append(candidate)
    return kept
|
|
25
|
+
def get_allowed_dirs(items,allowed=False):
    """Return the entries of ``items`` that are existing directories and pass
    the predicate resolved from ``allowed`` by ``get_allowed_predicate``."""
    predicate = get_allowed_predicate(allowed=allowed)
    kept = []
    for candidate in items:
        if candidate and os.path.isdir(candidate) and predicate(candidate):
            kept.append(candidate)
    return kept
|
|
28
|
+
|
|
29
|
+
def get_filtered_files(items,allowed=None,files=None):
    """Return files found under ``items`` that are not already in ``files``.

    Args:
        items: Directory or directories expanded with ``get_globs``.
        allowed: Resolved to a predicate by ``get_allowed_predicate``.
        files: Paths to leave out of the result; ``None`` means none.

    Returns:
        List of new file paths accepted by the predicate.
    """
    predicate = get_allowed_predicate(allowed=allowed)
    # A set gives constant-time membership tests; None replaces the former
    # mutable default argument.
    already_seen = set(files) if files else set()
    return [
        glob_path
        for glob_path in get_globs(items)
        if glob_path
        and os.path.isfile(glob_path)
        and glob_path not in already_seen
        and predicate(glob_path)
    ]
|
|
33
|
+
def get_filtered_dirs(items,allowed=None,dirs=None):
    """Return directories found under ``items`` that are not already in ``dirs``.

    Args:
        items: Directory or directories expanded with ``get_globs``.
        allowed: Resolved to a predicate by ``get_allowed_predicate``.
        dirs: Paths to leave out of the result; ``None`` means none.

    Returns:
        List of new directory paths accepted by the predicate.
    """
    predicate = get_allowed_predicate(allowed=allowed)
    # A set gives constant-time membership tests; None replaces the former
    # mutable default argument.
    already_seen = set(dirs) if dirs else set()
    return [
        glob_path
        for glob_path in get_globs(items)
        if glob_path
        and os.path.isdir(glob_path)
        and glob_path not in already_seen
        and predicate(glob_path)
    ]
|
|
37
|
+
|
|
38
|
+
def get_all_allowed_files(items,allowed=None):
    """Collect the allowed files among ``items`` and under their directories.

    Args:
        items: Paths to inspect; files are kept directly, directories are
            expanded through ``get_all_allowed_dirs``.
        allowed: Predicate (or flag) forwarded to every helper so that the
            same filter applies to the top-level files, the directory walk
            and the per-directory file listing.

    Returns:
        List of file paths without duplicates introduced by the walk.
    """
    # The caller's filter used to be dropped for these first two calls.
    dirs = get_all_allowed_dirs(items, allowed=allowed)
    files = get_allowed_files(items, allowed=allowed)
    for directory in dirs:
        files += get_filtered_files(directory, allowed=allowed, files=files)
    return files
|
|
45
|
+
def get_all_allowed_dirs(items,allowed=None):
    """Collect the allowed directories found beneath the directories in
    ``items``, skipping any path already collected."""
    predicate = get_allowed_predicate(allowed=allowed)
    collected = []
    for root in get_allowed_dirs(items):
        collected.extend(get_filtered_dirs(root, allowed=predicate, dirs=collected))
    return collected
|
|
52
|
+
|
|
53
|
+
def make_allowed_predicate(cfg: ScanConfig) -> Callable[[str], bool]:
    """
    Create a function deciding whether a path passes the filters in ``cfg``.

    The returned callable checks, in order: excluded/allowed directories,
    allowed/excluded name patterns, extensions (only for existing files) and
    type substrings. It returns False at the first failed check.
    """
    def allowed(path: str=None,p=None) -> bool:
        p = p or Path(path)
        base_name = p.name.lower()
        lowered_path = str(p).lower()

        # A) directory filters
        for raw_dir in cfg.exclude_dirs or ():
            needle = raw_dir.lower()
            # A substring hit anywhere in the path always excludes ...
            if needle in lowered_path:
                return False
            # ... a name-only glob hit excludes directories only.
            if fnmatch.fnmatch(base_name, needle) and p.is_dir():
                return False

        dir_whitelist = cfg.allowed_dirs
        if dir_whitelist and dir_whitelist != ["*"]:
            # must be in at least one allowed dir
            for raw_dir in dir_whitelist:
                if fnmatch.fnmatch(lowered_path, f"*{raw_dir.lower()}*"):
                    break
            else:
                return False

        # B) pattern filters
        name_whitelist = cfg.allowed_patterns
        if name_whitelist and name_whitelist != ["*"]:
            for raw_pat in name_whitelist:
                if fnmatch.fnmatch(base_name, raw_pat.lower()):
                    break
            else:
                return False

        for raw_pat in cfg.exclude_patterns or ():
            if fnmatch.fnmatch(base_name, raw_pat.lower()):
                return False

        # C) extension filters
        if p.is_file():
            suffix = p.suffix.lower()
            if cfg.allowed_exts and suffix not in cfg.allowed_exts:
                return False
            if cfg.exclude_exts and suffix in cfg.exclude_exts:
                return False

        # D) type filters (optional)
        wanted_types = cfg.allowed_types
        if wanted_types and wanted_types != {"*"}:
            if all(token not in lowered_path for token in wanted_types):
                return False
        banned_types = cfg.exclude_types
        if banned_types and banned_types != {"*"}:
            for token in banned_types:
                if token in lowered_path:
                    return False

        return True

    return allowed
|
|
@@ -1,53 +1,120 @@
|
|
|
1
|
-
from ..imports import *
|
|
2
1
|
from .filter_params import *
|
|
3
|
-
from
|
|
2
|
+
from ..imports import *
|
|
4
3
|
##from abstract_utilities import make_list,get_media_exts, is_media_type
|
|
4
|
+
def get_allowed_predicate(allowed=None):
    """Resolve the ``allowed`` argument into a path predicate.

    Args:
        allowed: ``False`` (or anything equal to it) yields a predicate that
            accepts everything; ``True`` or any falsy value such as ``None``
            yields the default predicate; any other truthy value is returned
            unchanged and is expected to be a callable.

    Returns:
        A callable taking a path and returning a bool.
    """
    # Equality (not identity) is deliberate so that 0/1 keep behaving like
    # False/True, as they did before.
    if allowed == False:  # noqa: E712
        def accept_all(*args):
            return True
        return accept_all
    if allowed == True or not allowed:  # noqa: E712
        # make_allowed_predicate() requires a config; calling it with no
        # argument raised TypeError. Build the default config instead.
        # NOTE(review): define_defaults is expected to arrive through the
        # module's star import of .filter_params - confirm.
        return make_allowed_predicate(define_defaults())
    return allowed
|
|
14
|
+
def get_globs(items,recursive: bool = True,allowed=None):
    """List the entries found under each directory in ``items``.

    Args:
        items: One directory or several; normalised with ``make_list`` and
            stripped of falsy entries.
        recursive: When true, descend into every subdirectory; otherwise
            list only the direct children of each directory.
        allowed: Optional predicate; when truthy only paths it accepts are
            kept.

    Returns:
        Flat list of matching paths, in the order ``glob`` produced them.
    """
    glob_paths = []
    for item in make_list(items):
        if not item:
            continue
        # Without recursive=True, "**" behaves like "*", so "**/*" would
        # match only second-level entries and skip the direct children.
        if recursive:
            pattern = os.path.join(item, "**/*")
        else:
            pattern = os.path.join(item, "*")
        matches = glob.glob(pattern, recursive=recursive)
        if allowed:
            matches = [match for match in matches if match and allowed(match)]
        glob_paths += matches
    return glob_paths
|
|
24
|
+
def get_allowed_files(items,allowed=True):
    """Return the entries of ``items`` that are existing files and pass the
    predicate resolved from ``allowed`` by ``get_allowed_predicate``."""
    predicate = get_allowed_predicate(allowed=allowed)
    kept = []
    for candidate in items:
        if candidate and os.path.isfile(candidate) and predicate(candidate):
            kept.append(candidate)
    return kept
|
|
27
|
+
def get_allowed_dirs(items,allowed=False):
    """Return the entries of ``items`` that are existing directories and pass
    the predicate resolved from ``allowed`` by ``get_allowed_predicate``."""
    predicate = get_allowed_predicate(allowed=allowed)
    kept = []
    for candidate in items:
        if candidate and os.path.isdir(candidate) and predicate(candidate):
            kept.append(candidate)
    return kept
|
|
30
|
+
|
|
31
|
+
def get_filtered_files(items,allowed=None,files=None):
    """Return files found under ``items`` that are not already in ``files``.

    Args:
        items: Directory or directories expanded with ``get_globs``.
        allowed: Resolved to a predicate by ``get_allowed_predicate``.
        files: Paths to leave out of the result; ``None`` means none.

    Returns:
        List of new file paths accepted by the predicate.
    """
    predicate = get_allowed_predicate(allowed=allowed)
    # A set gives constant-time membership tests; None replaces the former
    # mutable default argument.
    already_seen = set(files) if files else set()
    return [
        glob_path
        for glob_path in get_globs(items)
        if glob_path
        and os.path.isfile(glob_path)
        and glob_path not in already_seen
        and predicate(glob_path)
    ]
|
|
35
|
+
def get_filtered_dirs(items,allowed=None,dirs=None):
    """Return directories found under ``items`` that are not already in ``dirs``.

    Args:
        items: Directory or directories expanded with ``get_globs``.
        allowed: Resolved to a predicate by ``get_allowed_predicate``.
        dirs: Paths to leave out of the result; ``None`` means none.

    Returns:
        List of new directory paths accepted by the predicate.
    """
    predicate = get_allowed_predicate(allowed=allowed)
    # A set gives constant-time membership tests; None replaces the former
    # mutable default argument.
    already_seen = set(dirs) if dirs else set()
    return [
        glob_path
        for glob_path in get_globs(items)
        if glob_path
        and os.path.isdir(glob_path)
        and glob_path not in already_seen
        and predicate(glob_path)
    ]
|
|
39
|
+
|
|
40
|
+
def get_all_allowed_files(items,allowed=None):
    """Collect the allowed files among ``items`` and under their directories.

    Args:
        items: Paths to inspect; files are kept directly, directories are
            expanded through ``get_all_allowed_dirs``.
        allowed: Predicate (or flag) forwarded to every helper so that the
            same filter applies to the top-level files, the directory walk
            and the per-directory file listing.

    Returns:
        List of file paths without duplicates introduced by the walk.
    """
    # The caller's filter used to be dropped for these first two calls.
    dirs = get_all_allowed_dirs(items, allowed=allowed)
    files = get_allowed_files(items, allowed=allowed)
    for directory in dirs:
        files += get_filtered_files(directory, allowed=allowed, files=files)
    return files
|
|
47
|
+
def get_all_allowed_dirs(items,allowed=None):
    """Collect the allowed directories found beneath the directories in
    ``items``, skipping any path already collected."""
    predicate = get_allowed_predicate(allowed=allowed)
    collected = []
    for root in get_allowed_dirs(items):
        collected.extend(get_filtered_dirs(root, allowed=predicate, dirs=collected))
    return collected
|
|
54
|
+
|
|
55
|
+
def make_allowed_predicate(cfg: ScanConfig) -> Callable[[str], bool]:
    """
    Create a function deciding whether a path passes the filters in ``cfg``.

    The returned callable checks, in order: excluded/allowed directories,
    allowed/excluded name patterns, extensions (only for existing files) and
    type substrings. It returns False at the first failed check.
    """
    def allowed(path: str=None,p=None) -> bool:
        p = p or Path(path)
        base_name = p.name.lower()
        lowered_path = str(p).lower()

        # A) directory filters
        for raw_dir in cfg.exclude_dirs or ():
            needle = raw_dir.lower()
            # A substring hit anywhere in the path always excludes ...
            if needle in lowered_path:
                return False
            # ... a name-only glob hit excludes directories only.
            if fnmatch.fnmatch(base_name, needle) and p.is_dir():
                return False

        dir_whitelist = cfg.allowed_dirs
        if dir_whitelist and dir_whitelist != ["*"]:
            # must be in at least one allowed dir
            for raw_dir in dir_whitelist:
                if fnmatch.fnmatch(lowered_path, f"*{raw_dir.lower()}*"):
                    break
            else:
                return False

        # B) pattern filters
        name_whitelist = cfg.allowed_patterns
        if name_whitelist and name_whitelist != ["*"]:
            for raw_pat in name_whitelist:
                if fnmatch.fnmatch(base_name, raw_pat.lower()):
                    break
            else:
                return False

        for raw_pat in cfg.exclude_patterns or ():
            if fnmatch.fnmatch(base_name, raw_pat.lower()):
                return False

        # C) extension filters
        if p.is_file():
            suffix = p.suffix.lower()
            if cfg.allowed_exts and suffix not in cfg.allowed_exts:
                return False
            if cfg.exclude_exts and suffix in cfg.exclude_exts:
                return False

        # D) type filters (optional)
        wanted_types = cfg.allowed_types
        if wanted_types and wanted_types != {"*"}:
            if all(token not in lowered_path for token in wanted_types):
                return False
        banned_types = cfg.exclude_types
        if banned_types and banned_types != {"*"}:
            for token in banned_types:
                if token in lowered_path:
                    return False

        return True

    return allowed
|
|
5
117
|
|
|
6
|
-
def collect_filepaths(
|
|
7
|
-
directory: List[str],
|
|
8
|
-
cfg: ScanConfig=None,
|
|
9
|
-
allowed_exts: Optional[Set[str]] = False,
|
|
10
|
-
unallowed_exts: Optional[Set[str]] = False,
|
|
11
|
-
allowed_types: Optional[Set[str]] = False,
|
|
12
|
-
exclude_types: Optional[Set[str]] = False,
|
|
13
|
-
allowed_dirs: Optional[List[str]] = False,
|
|
14
|
-
exclude_dirs: Optional[List[str]] = False,
|
|
15
|
-
allowed_patterns: Optional[List[str]] = False,
|
|
16
|
-
exclude_patterns: Optional[List[str]] = False,
|
|
17
|
-
add=False,
|
|
18
|
-
allowed: Optional[Callable[[str], bool]] = None,
|
|
19
|
-
**kwargs
|
|
20
|
-
) -> List[str]:
|
|
21
|
-
cfg = cfg or define_defaults(
|
|
22
|
-
allowed_exts = allowed_exts,
|
|
23
|
-
unallowed_exts = unallowed_exts,
|
|
24
|
-
allowed_types = allowed_types,
|
|
25
|
-
exclude_types = exclude_types,
|
|
26
|
-
allowed_dirs = allowed_dirs,
|
|
27
|
-
exclude_dirs = exclude_dirs,
|
|
28
|
-
allowed_patterns = allowed_patterns,
|
|
29
|
-
exclude_patterns = exclude_patterns,
|
|
30
|
-
add = add
|
|
31
|
-
)
|
|
32
|
-
allowed = allowed or make_allowed_predicate(cfg)
|
|
33
|
-
directories = make_list(directory)
|
|
34
|
-
roots = [r for r in directories if r]
|
|
35
|
-
|
|
36
|
-
# your existing helpers (get_dirs, get_globs, etc.) stay the same
|
|
37
|
-
original_dirs = get_allowed_dirs(roots, allowed=allowed)
|
|
38
|
-
original_globs = get_globs(original_dirs)
|
|
39
|
-
files = get_allowed_files(original_globs, allowed=allowed)
|
|
40
|
-
|
|
41
|
-
for d in get_filtered_dirs(original_dirs, allowed=allowed):
|
|
42
|
-
files += get_filtered_files(d, allowed=allowed, files=files)
|
|
43
|
-
|
|
44
|
-
# de-dupe while preserving order
|
|
45
|
-
seen, out = set(), []
|
|
46
|
-
for f in files:
|
|
47
|
-
if f not in seen:
|
|
48
|
-
seen.add(f)
|
|
49
|
-
out.append(f)
|
|
50
|
-
return out
|
|
51
118
|
|
|
52
119
|
|
|
53
120
|
def _fast_walk(
|
|
@@ -1,9 +1,7 @@
|
|
|
1
1
|
# file_reader.py
|
|
2
2
|
from ..imports import *
|
|
3
3
|
# -------- Public API drop-ins that mirror your originals --------
|
|
4
|
-
from .filter_params import *
|
|
5
4
|
from .file_filters import *
|
|
6
|
-
from .file_utils import *
|
|
7
5
|
from .pdf_utils import *
|
|
8
6
|
# ---------------------------------------------------------------------------
|
|
9
7
|
# NOTE: The following helper functions must be provided elsewhere:
|
|
@@ -238,68 +236,6 @@ def read_shape_file(path: str) -> Union[gpd.GeoDataFrame, None]:
|
|
|
238
236
|
return None
|
|
239
237
|
|
|
240
238
|
|
|
241
|
-
def collect_filepaths(
|
|
242
|
-
inputs: Union[str, List[str]],
|
|
243
|
-
exclude_dirs: set[str] = None,
|
|
244
|
-
exclude_file_patterns: set[str] = None,
|
|
245
|
-
exclude_types: set[str] = None
|
|
246
|
-
) -> List[str]:
|
|
247
|
-
"""
|
|
248
|
-
Given a path or list of paths, return a list of all file paths under them.
|
|
249
|
-
- If an input is a file, it's included (unless it matches an exclude pattern).
|
|
250
|
-
- If an input is a directory, walk it recursively:
|
|
251
|
-
• Skip any subdirectory named in `exclude_dirs`
|
|
252
|
-
• Skip any file whose name matches one of `exclude_file_patterns`
|
|
253
|
-
"""
|
|
254
|
-
re_initialize_skip_mgr(exclude_types=exclude_types,
|
|
255
|
-
exclude_file_patterns=exclude_file_patterns,
|
|
256
|
-
exclude_dirs=exclude_dirs)
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
# Normalize to list
|
|
260
|
-
if isinstance(inputs, str):
|
|
261
|
-
paths_to_scan = [inputs]
|
|
262
|
-
else:
|
|
263
|
-
paths_to_scan = list(inputs)
|
|
264
|
-
|
|
265
|
-
all_files: List[str] = []
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
def _collect_from_dir(dirpath: str):
|
|
270
|
-
for dirpath_root, dirnames, filenames in os.walk(dirpath):
|
|
271
|
-
# Remove any excluded subdirectories from os.walk
|
|
272
|
-
dirnames[:] = [d for d in dirnames if d not in exclude_dirs]
|
|
273
|
-
|
|
274
|
-
for fname in filenames:
|
|
275
|
-
if should_skip(exclude_item=fname,
|
|
276
|
-
exclude_types=True,
|
|
277
|
-
exclude_file_patterns=True):
|
|
278
|
-
continue
|
|
279
|
-
full = os.path.join(dirpath_root, fname)
|
|
280
|
-
all_files.append(full)
|
|
281
|
-
|
|
282
|
-
for p in paths_to_scan:
|
|
283
|
-
if not os.path.exists(p):
|
|
284
|
-
# skip nonexistent paths
|
|
285
|
-
continue
|
|
286
|
-
|
|
287
|
-
if os.path.isfile(p):
|
|
288
|
-
basename = os.path.basename(p)
|
|
289
|
-
fname = os.path.splitext(basename)
|
|
290
|
-
if not should_skip(exclude_item=fname,
|
|
291
|
-
exclude_types=True,
|
|
292
|
-
exclude_file_patterns=True):
|
|
293
|
-
all_files.append(p)
|
|
294
|
-
else:
|
|
295
|
-
# p is a directory
|
|
296
|
-
_collect_from_dir(p)
|
|
297
|
-
|
|
298
|
-
return all_files
|
|
299
|
-
|
|
300
|
-
# requirements:
|
|
301
|
-
# pip install pdfplumber pdf2image pytesseract pillow
|
|
302
|
-
# # plus Tesseract binary (apt install tesseract-ocr or brew install tesseract)
|
|
303
239
|
|
|
304
240
|
|
|
305
241
|
|
|
@@ -52,38 +52,13 @@ def get_all_allowed_dirs(items,allowed=None):
|
|
|
52
52
|
for directory in dirs:
|
|
53
53
|
nu_dirs += get_filtered_dirs(directory,allowed=allowed,dirs=nu_dirs)
|
|
54
54
|
return nu_dirs
|
|
55
|
-
def get_files_and_dirs(
|
|
56
|
-
|
|
57
|
-
cfg: Optional["ScanConfig"] = None,
|
|
58
|
-
allowed_exts: Optional[Set[str]] = False,
|
|
59
|
-
unallowed_exts: Optional[Set[str]] = False,
|
|
60
|
-
allowed_types: Optional[Set[str]] = False,
|
|
61
|
-
exclude_types: Optional[Set[str]] = False,
|
|
62
|
-
allowed_dirs: Optional[List[str]] = False,
|
|
63
|
-
exclude_dirs: Optional[List[str]] = False,
|
|
64
|
-
allowed_patterns: Optional[List[str]] = False,
|
|
65
|
-
exclude_patterns: Optional[List[str]] = False,
|
|
66
|
-
add = False,
|
|
67
|
-
recursive: bool = True,
|
|
68
|
-
include_files: bool = True,
|
|
69
|
-
**kwargs
|
|
70
|
-
):
|
|
71
|
-
cfg = cfg or define_defaults(
|
|
72
|
-
allowed_exts = allowed_exts,
|
|
73
|
-
unallowed_exts = unallowed_exts,
|
|
74
|
-
allowed_types = allowed_types,
|
|
75
|
-
exclude_types = exclude_types,
|
|
76
|
-
allowed_dirs = allowed_dirs,
|
|
77
|
-
exclude_dirs = exclude_dirs,
|
|
78
|
-
allowed_patterns = allowed_patterns,
|
|
79
|
-
exclude_patterns = exclude_patterns,
|
|
80
|
-
add=add
|
|
81
|
-
)
|
|
82
|
-
allowed = make_allowed_predicate(cfg)
|
|
55
|
+
def get_files_and_dirs(*args,**kwargs)-> List[str]:
|
|
56
|
+
directories,cfg,allowed,include_files,recursive = get_file_filters(*args,**kwargs)
|
|
83
57
|
items=[]
|
|
84
58
|
files =[]
|
|
85
59
|
if recursive:
|
|
86
|
-
|
|
60
|
+
for directory in directories:
|
|
61
|
+
items += get_globs(directories,recursive=recursive,allowed=allowed)
|
|
87
62
|
else:
|
|
88
63
|
directories = make_list(directory)
|
|
89
64
|
for directory in directories:
|
|
@@ -92,68 +67,7 @@ def get_files_and_dirs(
|
|
|
92
67
|
if include_files:
|
|
93
68
|
files = get_allowed_files(items,allowed=allowed)
|
|
94
69
|
return dirs,files
|
|
95
|
-
def make_allowed_predicate(cfg: ScanConfig) -> Callable[[str], bool]:
|
|
96
|
-
"""
|
|
97
|
-
Build a predicate that returns True if a given path is considered allowed
|
|
98
|
-
under the given ScanConfig. Applies allowed_* and exclude_* logic symmetrically.
|
|
99
|
-
"""
|
|
100
|
-
def allowed(path: str=None,p=None) -> bool:
|
|
101
|
-
p = p or Path(path)
|
|
102
|
-
name = p.name.lower()
|
|
103
|
-
path_str = str(p).lower()
|
|
104
|
-
|
|
105
|
-
# --------------------
|
|
106
|
-
# A) directory filters
|
|
107
|
-
# --------------------
|
|
108
|
-
if cfg.exclude_dirs:
|
|
109
|
-
for dpat in cfg.exclude_dirs:
|
|
110
|
-
dpat_l = dpat.lower()
|
|
111
|
-
if dpat_l in path_str or fnmatch.fnmatch(name, dpat_l):
|
|
112
|
-
if p.is_dir() or dpat_l in path_str:
|
|
113
|
-
return False
|
|
114
|
-
|
|
115
|
-
if cfg.allowed_dirs and cfg.allowed_dirs != ["*"]:
|
|
116
|
-
# must be in at least one allowed dir
|
|
117
|
-
if not any(
|
|
118
|
-
fnmatch.fnmatch(path_str, f"*{dpat.lower()}*") for dpat in cfg.allowed_dirs
|
|
119
|
-
):
|
|
120
|
-
return False
|
|
121
70
|
|
|
122
|
-
# --------------------
|
|
123
|
-
# B) pattern filters
|
|
124
|
-
# --------------------
|
|
125
|
-
if cfg.allowed_patterns and cfg.allowed_patterns != ["*"]:
|
|
126
|
-
if not any(fnmatch.fnmatch(name, pat.lower()) for pat in cfg.allowed_patterns):
|
|
127
|
-
return False
|
|
128
|
-
|
|
129
|
-
if cfg.exclude_patterns:
|
|
130
|
-
for pat in cfg.exclude_patterns:
|
|
131
|
-
if fnmatch.fnmatch(name, pat.lower()):
|
|
132
|
-
return False
|
|
133
|
-
|
|
134
|
-
# --------------------
|
|
135
|
-
# C) extension filters
|
|
136
|
-
# --------------------
|
|
137
|
-
if p.is_file():
|
|
138
|
-
ext = p.suffix.lower()
|
|
139
|
-
if cfg.allowed_exts and ext not in cfg.allowed_exts:
|
|
140
|
-
return False
|
|
141
|
-
if cfg.unallowed_exts and ext in cfg.unallowed_exts:
|
|
142
|
-
return False
|
|
143
|
-
|
|
144
|
-
# --------------------
|
|
145
|
-
# D) type filters (optional)
|
|
146
|
-
# --------------------
|
|
147
|
-
if cfg.allowed_types and cfg.allowed_types != {"*"}:
|
|
148
|
-
if not any(t in path_str for t in cfg.allowed_types):
|
|
149
|
-
return False
|
|
150
|
-
if cfg.exclude_types and cfg.exclude_types != {"*"}:
|
|
151
|
-
if any(t in path_str for t in cfg.exclude_types):
|
|
152
|
-
return False
|
|
153
|
-
|
|
154
|
-
return True
|
|
155
|
-
|
|
156
|
-
return allowed
|
|
157
71
|
def correct_kwargs(**kwargs):
|
|
158
72
|
for key,values in kwargs.items():
|
|
159
73
|
if key.startswith('excluded'):
|
|
@@ -163,47 +77,10 @@ def correct_kwargs(**kwargs):
|
|
|
163
77
|
kwargs[correct_key]=combine_params(correct_vals,values)
|
|
164
78
|
del kwargs[key]
|
|
165
79
|
|
|
166
|
-
def collect_filepaths(
|
|
167
|
-
|
|
168
|
-
cfg: ScanConfig=None,
|
|
169
|
-
allowed_exts: Optional[Set[str]] = False,
|
|
170
|
-
unallowed_exts: Optional[Set[str]] = False,
|
|
171
|
-
allowed_types: Optional[Set[str]] = False,
|
|
172
|
-
exclude_types: Optional[Set[str]] = False,
|
|
173
|
-
allowed_dirs: Optional[List[str]] = False,
|
|
174
|
-
exclude_dirs: Optional[List[str]] = False,
|
|
175
|
-
allowed_patterns: Optional[List[str]] = False,
|
|
176
|
-
exclude_patterns: Optional[List[str]] = False,
|
|
177
|
-
add=False,
|
|
178
|
-
allowed: Optional[Callable[[str], bool]] = None,
|
|
179
|
-
**kwargs
|
|
180
|
-
) -> List[str]:
|
|
181
|
-
kwargs = correct_kwargs(allowed_exts = allowed_exts,
|
|
182
|
-
unallowed_exts = unallowed_exts,
|
|
183
|
-
allowed_types = allowed_types,
|
|
184
|
-
exclude_types = exclude_types,
|
|
185
|
-
allowed_dirs = allowed_dirs,
|
|
186
|
-
exclude_dirs = exclude_dirs,
|
|
187
|
-
allowed_patterns = allowed_patterns,
|
|
188
|
-
exclude_patterns = exclude_patterns,
|
|
189
|
-
**kwargs)
|
|
190
|
-
cfg = cfg or define_defaults(
|
|
191
|
-
allowed_exts = allowed_exts,
|
|
192
|
-
unallowed_exts = unallowed_exts,
|
|
193
|
-
allowed_types = allowed_types,
|
|
194
|
-
exclude_types = exclude_types,
|
|
195
|
-
allowed_dirs = allowed_dirs,
|
|
196
|
-
exclude_dirs = exclude_dirs,
|
|
197
|
-
allowed_patterns = allowed_patterns,
|
|
198
|
-
exclude_patterns = exclude_patterns,
|
|
199
|
-
add = add
|
|
200
|
-
)
|
|
201
|
-
allowed = allowed or make_allowed_predicate(cfg)
|
|
202
|
-
directories = make_list(directory)
|
|
203
|
-
roots = [r for r in directories if r]
|
|
204
|
-
|
|
80
|
+
def collect_filepaths(*args,**kwargs)-> List[str]:
|
|
81
|
+
directories,cfg,allowed,include_files,recursive = get_file_filters(*args,**kwargs)
|
|
205
82
|
# your existing helpers (get_dirs, get_globs, etc.) stay the same
|
|
206
|
-
original_dirs = get_allowed_dirs(
|
|
83
|
+
original_dirs = get_allowed_dirs(directories, allowed=allowed)
|
|
207
84
|
original_globs = get_globs(original_dirs)
|
|
208
85
|
files = get_allowed_files(original_globs, allowed=allowed)
|
|
209
86
|
|