abstract-utilities 0.2.2.513__py3-none-any.whl → 0.2.2.627__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- abstract_utilities/__init__.py +11 -3
- abstract_utilities/class_utils/caller_utils.py +19 -0
- abstract_utilities/class_utils/global_utils.py +35 -20
- abstract_utilities/class_utils/imports/imports.py +1 -1
- abstract_utilities/directory_utils/__init__.py +2 -4
- abstract_utilities/directory_utils/imports/__init__.py +2 -0
- abstract_utilities/directory_utils/imports/imports.py +1 -0
- abstract_utilities/directory_utils/imports/module_imports.py +2 -0
- abstract_utilities/directory_utils/src/__init__.py +4 -0
- abstract_utilities/directory_utils/src/directory_utils.py +110 -0
- abstract_utilities/directory_utils/src/name_utils.py +43 -0
- abstract_utilities/directory_utils/src/size_utils.py +57 -0
- abstract_utilities/directory_utils/src/utils.py +116 -0
- abstract_utilities/file_utils/imports/constants.py +81 -7
- abstract_utilities/file_utils/imports/imports.py +0 -4
- abstract_utilities/file_utils/imports/module_imports.py +1 -1
- abstract_utilities/file_utils/src/__init__.py +2 -4
- abstract_utilities/file_utils/src/file_filters/__init__.py +4 -0
- abstract_utilities/file_utils/src/file_filters/ensure_utils.py +118 -0
- abstract_utilities/file_utils/src/file_filters/filter_params.py +86 -0
- abstract_utilities/file_utils/src/file_filters/filter_utils.py +78 -0
- abstract_utilities/file_utils/src/file_filters/predicate_utils.py +116 -0
- abstract_utilities/file_utils/src/file_filters.py +114 -47
- abstract_utilities/file_utils/src/file_reader.py +0 -64
- abstract_utilities/file_utils/src/file_utils.py +7 -130
- abstract_utilities/file_utils/src/filter_params.py +128 -86
- abstract_utilities/file_utils/src/find_collect.py +85 -165
- abstract_utilities/file_utils/src/find_content.py +210 -0
- abstract_utilities/file_utils/src/initFunctionsGen.py +35 -28
- abstract_utilities/file_utils/src/initFunctionsGens.py +280 -0
- abstract_utilities/file_utils/src/reader_utils/__init__.py +4 -0
- abstract_utilities/file_utils/src/reader_utils/directory_reader.py +53 -0
- abstract_utilities/file_utils/src/reader_utils/file_reader.py +543 -0
- abstract_utilities/file_utils/src/reader_utils/file_readers.py +376 -0
- abstract_utilities/file_utils/src/reader_utils/imports.py +18 -0
- abstract_utilities/file_utils/src/reader_utils/pdf_utils.py +300 -0
- abstract_utilities/file_utils (2)/__init__.py +2 -0
- abstract_utilities/file_utils (2)/imports/__init__.py +2 -0
- abstract_utilities/file_utils (2)/imports/constants.py +118 -0
- abstract_utilities/file_utils (2)/imports/imports/__init__.py +3 -0
- abstract_utilities/file_utils (2)/imports/imports/constants.py +119 -0
- abstract_utilities/file_utils (2)/imports/imports/imports.py +46 -0
- abstract_utilities/file_utils (2)/imports/imports/module_imports.py +8 -0
- abstract_utilities/file_utils (2)/imports/utils/__init__.py +3 -0
- abstract_utilities/file_utils (2)/imports/utils/classes.py +379 -0
- abstract_utilities/file_utils (2)/imports/utils/clean_imps.py +155 -0
- abstract_utilities/file_utils (2)/imports/utils/filter_utils.py +341 -0
- abstract_utilities/file_utils (2)/src/__init__.py +8 -0
- abstract_utilities/file_utils (2)/src/file_filters.py +155 -0
- abstract_utilities/file_utils (2)/src/file_reader.py +604 -0
- abstract_utilities/file_utils (2)/src/find_collect.py +258 -0
- abstract_utilities/file_utils (2)/src/initFunctionsGen.py +286 -0
- abstract_utilities/file_utils (2)/src/map_utils.py +28 -0
- abstract_utilities/file_utils (2)/src/pdf_utils.py +300 -0
- abstract_utilities/import_utils/circular_import_finder.py +222 -0
- abstract_utilities/import_utils/circular_import_finder2.py +118 -0
- abstract_utilities/import_utils/imports/module_imports.py +3 -1
- abstract_utilities/import_utils/src/clean_imports.py +156 -25
- abstract_utilities/import_utils/src/dot_utils.py +11 -0
- abstract_utilities/import_utils/src/extract_utils.py +4 -0
- abstract_utilities/import_utils/src/import_functions.py +66 -2
- abstract_utilities/import_utils/src/pkg_utils.py +58 -4
- abstract_utilities/import_utils/src/sysroot_utils.py +56 -1
- abstract_utilities/log_utils/log_file.py +73 -24
- abstract_utilities/parse_utils/parse_utils.py +23 -0
- abstract_utilities/path_utils/path_utils.py +25 -23
- abstract_utilities/read_write_utils/imports/imports.py +1 -1
- abstract_utilities/read_write_utils/read_write_utils.py +99 -31
- abstract_utilities/safe_utils/safe_utils.py +30 -0
- {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.627.dist-info}/METADATA +1 -1
- {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.627.dist-info}/RECORD +73 -32
- {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.627.dist-info}/WHEEL +0 -0
- {abstract_utilities-0.2.2.513.dist-info → abstract_utilities-0.2.2.627.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,376 @@
|
|
|
1
|
+
# file_reader.py
|
|
2
|
+
from .imports import *
|
|
3
|
+
from .pdf_utils import *
|
|
4
|
+
_logger = get_logFile(__name__)
|
|
5
|
+
def convert_date_string(s):
    """Parse *s* as an ISO-8601 datetime string.

    Returns:
        datetime | None: the parsed datetime, or None if *s* is not valid
        ISO-8601.
    """
    try:
        parsed = datetime.fromisoformat(s)
    except ValueError:
        return None
    return parsed
|
|
11
|
+
# file_utils.py (below your existing imports)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def source_engine_for_ext(ext: str) -> Union[str, None]:
    """Map a file extension to the pandas/GeoPandas reader engine name.

    Args:
        ext: File extension including the leading dot; case-insensitive.

    Returns:
        The engine/driver name, or None for unrecognized extensions
        (the original annotation claimed `-> str` but `dict.get` returns
        None on a miss).
    """
    mapping = {
        '.parquet': 'pyarrow',
        '.txt': 'python',
        '.csv': 'python',
        '.tsv': 'python',
        '.xlsx': 'openpyxl',
        '.xls': 'xlrd',
        '.xlsb': 'pyxlsb',
        '.ods': 'odf',
        '.geojson': 'GeoJSON',
    }
    return mapping.get(ext.lower())
|
|
30
|
+
|
|
31
|
+
def is_valid_file_path(path: str) -> Union[str, None]:
    """Return the lower-cased extension of *path* if it names an existing file.

    Returns None for non-strings, blank strings, and paths that are not
    regular files.
    """
    if not isinstance(path, str) or not path.strip():
        return None
    if not os.path.isfile(path):
        return None
    _, ext = os.path.splitext(path)
    return ext.lower()
|
|
37
|
+
|
|
38
|
+
def is_dataframe(obj) -> bool:
    """True when *obj* is a pandas DataFrame or a GeoPandas GeoDataFrame."""
    return isinstance(obj, pd.DataFrame) or isinstance(obj, gpd.GeoDataFrame)
|
|
40
|
+
|
|
41
|
+
def create_dataframe(data=None, columns=None) -> pd.DataFrame:
    """Build a DataFrame from a dict, list of dicts, or existing frame.

    Args:
        data: Existing (Geo)DataFrame (returned as a copy), a dict (treated
            as a single row), a list of row dicts, or None/falsy (empty frame).
        columns: Explicit column list; None infers columns from the row dicts;
            False forces pandas' default column inference.

    Returns:
        pd.DataFrame: the constructed frame; empty on construction failure.
    """
    if is_dataframe(data):
        return data.copy()
    data = data or {}
    if isinstance(data, dict):
        data = [data]
    if columns is None:
        # Infer columns in first-seen key order. (The previous implementation
        # collected keys into a set, making column order nondeterministic.)
        seen = {}
        for row in data:
            if isinstance(row, dict):
                for key in row:
                    seen[key] = None
        columns = list(seen)
    if columns is False:
        columns = None
    try:
        return pd.DataFrame(data, columns=columns)
    except Exception:
        # Best-effort fallback: an empty frame with the requested columns.
        return pd.DataFrame([], columns=columns)
|
|
61
|
+
|
|
62
|
+
def read_ods_file(path: str) -> dict[str, pd.DataFrame]:
    """Read every sheet of an ODS spreadsheet into a DataFrame.

    Args:
        path: Path to the .ods document.

    Returns:
        dict[str, pd.DataFrame]: {sheet_name: frame}; empty dict when the
        path is invalid or the document cannot be opened.
    """
    if not is_valid_file_path(path):
        return {}
    try:
        doc = ezodf.opendoc(path)
    except Exception:
        return {}
    result: dict[str, pd.DataFrame] = {}
    for sheet in doc.sheets:
        rows = []
        for sheet_row in sheet.rows():
            values = []
            for cell in sheet_row:
                # Date cells come back as strings; normalize to datetime.
                if cell.value_type == 'date':
                    values.append(convert_date_string(str(cell.value)))
                else:
                    values.append(cell.value)
            rows.append(values)
        result[sheet.name] = pd.DataFrame(rows)
    return result
|
|
87
|
+
|
|
88
|
+
def read_ods_as_excel(path: str, xlsx_path: str | None = None) -> pd.DataFrame:
    """Convert an ODS file to XLSX and read it back with openpyxl.

    Args:
        path: Source .ods file.
        xlsx_path: Target .xlsx path; when None a temporary directory is used
            and removed afterwards.

    Returns:
        pd.DataFrame: the converted sheet; empty frame on any failure.
    """
    if not is_valid_file_path(path):
        return pd.DataFrame()
    cleanup_temp = xlsx_path is None
    if cleanup_temp:
        tmp_dir = tempfile.mkdtemp()
        xlsx_path = os.path.join(tmp_dir, os.path.basename(path) + '.xlsx')
    try:
        # ods_to_xlsx(...) must be implemented externally.
        ods_to_xlsx(path, xlsx_path)
    except Exception:
        if cleanup_temp:
            shutil.rmtree(tmp_dir)
        return pd.DataFrame()
    try:
        df = pd.read_excel(xlsx_path, engine='openpyxl')
    except Exception:
        df = pd.DataFrame()
    finally:
        # Remove the temp dir whether or not the read succeeded.
        if cleanup_temp:
            shutil.rmtree(tmp_dir)
    return df
|
|
116
|
+
|
|
117
|
+
def filter_df(
|
|
118
|
+
df: pd.DataFrame,
|
|
119
|
+
nrows: int | None = None,
|
|
120
|
+
condition: pd.Series | None = None,
|
|
121
|
+
indices: list[int] | None = None
|
|
122
|
+
) -> pd.DataFrame:
|
|
123
|
+
if nrows is not None:
|
|
124
|
+
df = df.head(nrows)
|
|
125
|
+
if condition is not None:
|
|
126
|
+
df = df[condition]
|
|
127
|
+
if indices is not None:
|
|
128
|
+
df = df.iloc[indices]
|
|
129
|
+
return df
|
|
130
|
+
|
|
131
|
+
def read_shape_file(path: str) -> Union[gpd.GeoDataFrame, None]:
    """Load spatial data (shapefile components, GeoJSON, or .prj) from *path*.

    Returns:
        GeoDataFrame, or None when the path is invalid, reading fails, or
        the extension is unsupported.
    """
    ext = is_valid_file_path(path)
    if not ext:
        return None
    ext = ext.lower()
    try:
        if ext in ('.shp', '.cpg', '.dbf', '.shx'):
            return gpd.read_file(path)
        if ext == '.geojson':
            return gpd.read_file(path, driver='GeoJSON')
        if ext == '.prj':
            # NOTE(review): read_from_file is expected to return a
            # GeoDataFrame here — confirm against its implementation.
            return read_from_file(path)
    except Exception:
        return None
    return None
|
|
150
|
+
def pdf_to_text(path, keep_page_breaks=True, ocr_if_empty=True):
    """
    Return the full text of *path* (str or Path) as a single string.

    keep_page_breaks → insert "\f" between pages so you can split later.
    ocr_if_empty     → any page with no text layer is rasterised & OCR'd.

    Raises:
        FileNotFoundError: when *path* does not exist.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(path)

    pages = []
    with pdfplumber.open(path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            text = page.extract_text() or ""  # extract_text() may return None
            if ocr_if_empty and not text.strip():
                # Rasterise just this page at 300 dpi, then run Tesseract.
                image = convert_from_path(str(path), dpi=300,
                                          first_page=page_no, last_page=page_no)[0]
                text = pytesseract.image_to_string(image, lang="eng")
            pages.append(text)

    return ("\f" if keep_page_breaks else "\n").join(pages)
|
|
174
|
+
def get_df(
    source: Union[
        str,
        pd.DataFrame,
        gpd.GeoDataFrame,
        dict,
        list,
        FileStorage
    ],
    nrows: int | None = None,
    skiprows: list[int] | int | None = None,
    condition: pd.Series | None = None,
    indices: list[int] | None = None
) -> Union[pd.DataFrame, gpd.GeoDataFrame, dict[str, Union[pd.DataFrame, str]], None]:
    """
    Load a DataFrame or GeoDataFrame from various sources, then apply optional filters.
    If `source` is a directory, returns read_directory(source) instead (a dict).

    Args:
        source: Directory path, file path, (Geo)DataFrame, dict/list of rows,
            or an uploaded werkzeug FileStorage.
        nrows: Keep only the first *nrows* rows.
        skiprows: Passed through to the pandas readers for tabular files.
        condition: Boolean mask applied after loading.
        indices: Positional row selection applied last.

    Returns:
        A (Geo)DataFrame, a dict for directories, text for PDFs, or None on
        failure / unsupported input.
    """
    # ─── Check for directory first ─────────────────────────────────────────────
    if isinstance(source, str) and os.path.isdir(source):
        return read_directory(root_path=source)

    # ─── If already a DataFrame/GeoDataFrame, just filter and return ───────────
    if is_dataframe(source):
        return filter_df(source, nrows=nrows, condition=condition, indices=indices)

    if source is None:
        return None

    # ─── If source is a file path, read according to extension ─────────────────
    if isinstance(source, str) and os.path.isfile(source):
        ext = os.path.splitext(source)[1].lower()
        try:
            if ext in ('.csv', '.tsv', '.txt'):
                # sep=None lets pandas sniff the delimiter for .txt files.
                sep = {'.csv': ',', '.tsv': '\t', '.txt': None}.get(ext)
                df = pd.read_csv(source, skiprows=skiprows, sep=sep, nrows=nrows)
            elif ext in ('.ods', '.xlsx', '.xls', '.xlsb'):
                if ext == '.ods':
                    df = read_ods_as_excel(source)
                else:
                    df = pd.read_excel(source, skiprows=skiprows,
                                       engine=source_engine_for_ext(ext), nrows=nrows)
            elif ext == '.json':
                # JSON payloads are returned untouched (no row filtering).
                return safe_read_from_json(source)
            elif ext == '.parquet':
                df = pd.read_parquet(source)
            elif ext in ('.shp', '.cpg', '.dbf', '.shx', '.geojson', '.prj'):
                return read_shape_file(source)
            elif ext == '.pdf':
                # PDFs yield extracted text, not a DataFrame.
                return pdf_to_text(source)
            else:
                return read_from_file(source)

            if not isinstance(df, (dict, list, FileStorage)):
                return filter_df(df, nrows=nrows, condition=condition, indices=indices)
            source = df  # fall through to the in-memory handling below
        except Exception:
            return None

    # ─── If source is FileStorage (uploaded) ───────────────────────────────────
    if isinstance(source, FileStorage):
        try:
            # (Removed the dead `filename = secure_filename(...)` local — its
            # result was never used.)
            df = pd.read_excel(source.stream, nrows=nrows)
            return filter_df(df, nrows=nrows, condition=condition, indices=indices)
        except Exception:
            return None

    # ─── If source is dict or list, turn into DataFrame ────────────────────────
    if isinstance(source, (dict, list)):
        df = pd.DataFrame(source)
        return filter_df(df, nrows=nrows, condition=condition, indices=indices)

    _logger.error(f"Unsupported source type: {type(source)}")
    return None
|
|
261
|
+
|
|
262
|
+
def _frame_to_csv(frame) -> str:
    """CSV-encode a (Geo)DataFrame; geometry columns are serialized as WKT."""
    if isinstance(frame, gpd.GeoDataFrame):
        gdf = frame.copy()
        gdf["geometry"] = gdf["geometry"].apply(lambda g: g.wkt if g is not None else "")
        return gdf.to_csv(index=False)
    return frame.to_csv(index=False)

def read_any_file(full_path):
    """Return the textual contents of *full_path* (file or directory).

    Directories are walked via read_directory(); every entry is rendered to
    text (CSV for frames) and joined. Files are read as raw text, and richer
    formats are additionally decoded through get_df() and CSV-encoded.

    Raises:
        FileNotFoundError: when *full_path* does not exist.
        ValueError: when the file cannot be read or parsed.
    """
    if not os.path.exists(full_path):
        raise FileNotFoundError(f"Not a valid path: {full_path!r}")

    # ── Directory: render every entry and keep them all. (The previous
    # implementation overwrote `data` on each iteration, so only the LAST
    # entry of the directory was ever returned.) ────────────────────────────
    if os.path.isdir(full_path):
        nested_dict: Dict[str, Union[pd.DataFrame, gpd.GeoDataFrame, str]] = read_directory(full_path)
        parts = []
        for rel, content in nested_dict.items():
            if isinstance(content, (pd.DataFrame, gpd.GeoDataFrame)):
                parts.append(_frame_to_csv(content))
            else:
                parts.append(content)
        return "\n".join(parts)

    # ── Plain file: read the raw text as a fallback result. ────────────────
    try:
        with open(full_path, "r", encoding="utf-8", errors="replace") as f:
            data = f.read()
    except Exception as e:
        raise ValueError(f"Error reading text file {full_path!r}: {e}")

    # ── Delegate to get_df() and convert richer results to text. ───────────
    try:
        df_or = get_df(full_path)
    except Exception as e:
        raise ValueError(f"get_df() failed for {full_path!r}: {e}")

    # Multi-sheet / multi-file result: join EVERY section. (Previously only
    # the final "=== key ===" block survived, despite the comment promising
    # "one big text block".)
    if isinstance(df_or, dict):
        blocks = []
        for key, value in df_or.items():
            if isinstance(value, (pd.DataFrame, gpd.GeoDataFrame)):
                blocks.append(f"=== {key} ===\n" + _frame_to_csv(value))
            else:
                blocks.append(f"=== {key} ===\n" + str(value))
        return "\n".join(blocks)

    # Single (Geo)DataFrame result.
    if isinstance(df_or, (pd.DataFrame, gpd.GeoDataFrame)):
        return _frame_to_csv(df_or)

    # List of rows (rare): try tabular encoding, fall back to repr.
    if isinstance(df_or, list):
        try:
            return pd.DataFrame(df_or).to_csv(index=False)
        except Exception:
            return repr(df_or)

    # Fall back to the raw text (or repr of whatever get_df returned).
    return data or repr(df_or)
|
|
347
|
+
def read_file_as_text(paths: Union[str, List[str]]) -> List[str]:
    """
    Render one path or a list of paths to text via read_any_file().

    Each entry may be a directory (walked recursively via read_directory),
    a plain-text file (returned raw), or a richer format (decoded through
    get_df and CSV-encoded). Results keep the order of the input list
    (filesystem walk order within directories).

    Returns:
        A list of strings — one textual representation per input path.

    Raises:
        FileNotFoundError: for any path that does not exist.
        ValueError: for files that cannot be parsed/read.
    """
    targets = [paths] if isinstance(paths, str) else list(paths)
    return [read_any_file(target) for target in targets]
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
from ...imports import *
|
|
2
|
+
# file_reader.py
|
|
3
|
+
from ..file_filters import *
|
|
4
|
+
from ....read_write_utils import read_from_file
|
|
5
|
+
from ....log_utils import get_logFile
|
|
6
|
+
import os,tempfile,shutil,logging,ezodf,fnmatch
|
|
7
|
+
from typing import Union
|
|
8
|
+
import pandas as pd
|
|
9
|
+
import geopandas as gpd
|
|
10
|
+
from datetime import datetime
|
|
11
|
+
from werkzeug.utils import secure_filename
|
|
12
|
+
from werkzeug.datastructures import FileStorage
|
|
13
|
+
from datetime import datetime
|
|
14
|
+
from typing import Dict, Union, List
|
|
15
|
+
import pdfplumber
|
|
16
|
+
from pdf2image import convert_from_path # only used for OCR fallback
|
|
17
|
+
import pytesseract
|
|
18
|
+
from pathlib import Path
|
|
@@ -0,0 +1,300 @@
|
|
|
1
|
+
from .imports import *
|
|
2
|
+
def if_none_return(obj: object, obj_2: object) -> object:
    """
    Return obj if obj_2 is None, otherwise return obj_2.

    Args:
        obj (Any): Fallback value.
        obj_2 (Any): Preferred value, used whenever it is not None.

    Returns:
        Any: obj when obj_2 is None; otherwise obj_2.
    """
    if obj_2 is None:
        return obj
    return obj_2
|
|
14
|
+
|
|
15
|
+
def write_pdf() -> PyPDF2.PdfWriter:
    """
    Create and return a fresh, empty PDF writer.

    Returns:
        PyPDF2.PdfWriter: New PDF writer object.
    """
    writer = PyPDF2.PdfWriter()
    return writer
|
|
23
|
+
def read_pdf(file: str):
    """
    Open *file* with PyPDF2 and return its reader.

    Args:
        file (str): Path to the PDF file.

    Returns:
        PyPDF2.PdfReader: reader over the file's pages.
    """
    reader = PyPDF2.PdfReader(file)
    return reader
|
|
34
|
+
def is_pdf_path(file: str):
    """
    Checks if a given file path corresponds to an existing PDF file.

    Args:
        file (str): A string representing the file path.

    Returns:
        bool: True if the file exists and has a '.pdf' extension
        (case-insensitive), False otherwise.
    """
    # Lower-case the extension so '.PDF' matches, consistent with the
    # duplicate is_pdf_path definition later in this module.
    if is_file(file) and get_ext(file).lower() == '.pdf':
        return True
    return False
|
|
48
|
+
|
|
49
|
+
def read_pdf(file: str):
    """Read and return a PDF reader object from the provided file path."""
    # NOTE(review): duplicate of the read_pdf defined earlier in this module;
    # this later definition is the one that takes effect at import time.
    pdf_reader = PyPDF2.PdfReader(file)
    return pdf_reader
|
|
52
|
+
def get_pdf_obj(pdf_obj: Union[str, object]) -> object:
    """
    Normalize *pdf_obj* to a PDF reader object.

    A string that points at an existing PDF file is opened via read_pdf();
    anything else (including an already-open reader) is returned unchanged.

    Args:
        pdf_obj: Either a PDF file path or an existing PDF object.

    Returns:
        object: The PDF content as an object.
    """
    if is_str(pdf_obj) and is_pdf_path(pdf_obj):
        return read_pdf(pdf_obj)
    return pdf_obj
|
|
67
|
+
def get_separate_pages(pdf_reader, start_page: int = 1, end_page: int = None):
    """
    Copy a 1-based, inclusive page range from a PDF into a new writer.

    Args:
        pdf_reader (object): The PDF reader object.
        start_page (int, optional): First page to copy, 1-based. Defaults to 1.
            (The previous implementation compared these 1-based bounds against
            0-based indices, silently dropping page 1 and shifting the range.)
        end_page (int, optional): Last page, 1-based inclusive. None or
            out-of-range values are clamped to the document length.

    Returns:
        object: A new PDF writer with the selected pages, or False when
        start_page is past the end of the document.
    """
    num_pages = get_pdf_pages(pdf_reader)

    # Clamp the end bound; check the start bound independently (the original
    # `elif` made the start check unreachable whenever end_page was None).
    if end_page is None or num_pages < end_page:
        end_page = num_pages
    if num_pages < start_page:
        return False

    pdf_writer = write_pdf()
    for page_num in range(num_pages):
        # page_num is 0-based; convert to a 1-based page number for the bounds.
        if start_page <= page_num + 1 <= end_page:
            pdf_writer.add_page(pdf_reader.pages[page_num])
    return pdf_writer
|
|
93
|
+
def is_pdf_path(file):
    """
    Check if the provided file path corresponds to a valid PDF file.

    Args:
        file (str): File path.

    Returns:
        bool: True if it's an existing file with a (case-insensitive) '.pdf'
        extension, False otherwise.
    """
    return bool(is_file(file) and get_ext(file).lower() == '.pdf')
|
|
106
|
+
|
|
107
|
+
def get_pdf_pages(pdf_file):
    """
    Get the total number of pages in the PDF.

    Args:
        pdf_file (object/str): PDF reader object or path to a PDF file.

    Returns:
        int: Number of pages, or False when the object has no readable
        pages (sentinel preserved from the original implementation).
    """
    pdf_file = get_pdf_obj(pdf_file)
    try:
        return len(pdf_file.pages)
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit; narrowed to Exception.
        return False
|
|
122
|
+
def save_pdf(output_file_path, pdf_writer):
    """
    Persist a PDF writer's pages to disk in binary mode.

    Args:
        output_file_path (str): Path to save the PDF.
        pdf_writer (object): PDF writer object to save.
    """
    with open(output_file_path, 'wb') as handle:
        pdf_writer.write(handle)
|
|
132
|
+
def split_pdf(input_path: str, output_folder: Optional[str] = None, file_name: Optional[str] = None) -> List[str]:
    """
    Split a PDF file into separate files for each page.

    Args:
        input_path (str): Path to the input PDF file.
        output_folder (str, optional): Directory to save the split PDF files.
            Defaults to the directory of input_path.
        file_name (str, optional): Base name for the output files. Defaults to
            the base name of input_path.

    Returns:
        list: Paths of the created per-page PDF files, named
        '<file_name>_page_<n>.pdf' with 1-based page numbers.
    """
    pdf_pages = []
    file_name = get_file_name(input_path) if file_name is None else file_name
    output_folder = if_none_return(get_directory(input_path), output_folder)

    print(f"Splitting PDF: {input_path}")
    print(f"Output Folder: {output_folder}")
    print(f"Using Filename: {file_name}")

    with open(input_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        num_pages = len(pdf_reader.pages)
        print(f"Number of pages in PDF: {num_pages}")

        for page_num in range(num_pages):
            pdf_writer = PyPDF2.PdfWriter()
            pdf_writer.add_page(pdf_reader.pages[page_num])

            # (Removed the dead `output_img_path` local — it was computed but
            # never used; image conversion lives in pdf_to_img_list.)
            output_file_path = os.path.join(output_folder, f'{file_name}_page_{page_num + 1}.pdf')
            print(f"Writing to: {output_file_path}")
            pdf_pages.append(output_file_path)
            save_pdf(output_file_path, pdf_writer)

    return pdf_pages
|
|
169
|
+
def pdf_to_img_list(pdf_list: List[str], output_folder: Optional[str] = None, file_name: Optional[str] = None,
                    paginate: bool = False, extension: str = "png") -> List[str]:
    """
    Convert a list of PDF files to images.

    Args:
        pdf_list (List[str]): List of paths to (typically single-page) PDF files.
        output_folder (str, optional): Directory to save the images. Defaults to PDF's directory.
        file_name (str, optional): Base name for the images. Defaults to PDF's name.
        paginate (bool): Whether to paginate the image names. Defaults to False.
        extension (str): Extension for the image files. Defaults to "png".

    Returns:
        List[str]: List of paths to the created image files.
    """
    image_list = []
    file_name_start = file_name
    for i, each in enumerate(pdf_list):
        try:
            images = convert_from_path(each)
        except Exception as e:
            print("An error occurred while converting the PDF:", e)
            # Skip this PDF. Previously execution fell through with `images`
            # unbound (NameError) or stale from the prior iteration, which
            # re-saved the previous PDF's pages under this PDF's name.
            continue

        if output_folder is None:
            output_folder = get_directory(each)
        if file_name_start is None:
            file_name = get_file_name(each)
        if paginate:
            file_name = f"{file_name}_Page_{i}"

        # NOTE(review): every page of a multi-page PDF is written to the SAME
        # path; callers are expected to pass single-page PDFs (see split_pdf).
        # (The inner loop no longer shadows the outer index `i`.)
        for image in images:
            image_output_path = os.path.join(output_folder, f"{file_name}.{extension}")
            image_list.append(image_output_path)
            save_image(image=image, image_path=image_output_path, format=extension.upper())
    return image_list
|
|
204
|
+
def img_to_txt_list(img_list: List[str], output_folder: Optional[str] = None, file_name: Optional[str] = None,
                    paginate: bool = False, extension: str = "txt") -> List[str]:
    """
    Convert a list of image files to text via OCR.

    Args:
        img_list (List[str]): List of paths to image files.
        output_folder (str, optional): Directory to save the text files. Defaults to image's directory.
        file_name (str, optional): Base name for the text files. Defaults to image's name.
        paginate (bool): Whether to paginate the text filenames. Defaults to False.
        extension (str): Extension for the text files. Defaults to "txt".

    Returns:
        List[str]: List of paths to the created text files.
    """
    text_list = []
    base_name = file_name
    for i, each in enumerate(img_list):
        if output_folder is None:
            output_folder = get_directory(each)
        # Build the name from the clean base every iteration (the original
        # kept re-appending "_Page_<i>" to the previous iteration's name when
        # a file_name was supplied).
        name = base_name if base_name is not None else get_file_name(each)
        if paginate:
            name = f"{name}_Page_{i}"

        text_output = image_to_text(each)
        # Use the computed name; the original ignored file_name/paginate and
        # always named outputs after the source image (dead parameters).
        text_output_path = os.path.join(output_folder, f"{name}.{extension}")
        text_list.append(text_output_path)
        write_to_file(filepath=text_output_path, contents=text_output)
    return text_list
|
|
234
|
+
def open_pdf_file(pdf_file_path: str) -> None:
    """
    Open a PDF file using the default associated program.

    Errors are reported to stdout rather than raised, so callers never fail
    because the viewer could not be launched.

    Args:
        pdf_file_path (str): Path to the PDF file to open.
    """
    try:
        # Delegate to the shell helper; "open" launches the default viewer.
        cmd_input("open " + pdf_file_path)
    except FileNotFoundError:
        print("Error: The specified file does not exist.")
    except Exception as e:
        print("Error:", e)
|
|
248
|
+
# use it before writing to a file
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def get_pdfs_in_directory(directory: str) -> List[str]:
    """
    Get a list of PDF filenames in a given directory.

    Args:
        directory (str): Path to the directory.

    Returns:
        list: Bare PDF filenames (not full paths) found in the directory.
    """
    pdfs = []
    for filename in os.listdir(directory):
        # Check the joined path: is_pdf_path() requires an existing file, and
        # the bare filename only resolves when the CWD happens to be
        # *directory* — the original therefore missed PDFs almost everywhere.
        if is_pdf_path(os.path.join(directory, filename)):
            pdfs.append(filename)
    return pdfs
|
|
266
|
+
|
|
267
|
+
def get_all_pdf_in_directory(file_directory: Optional[str] = None) -> List[str]:
    """
    Get a list of complete paths to PDF files in a given directory,
    sorted by filename.

    Args:
        file_directory (str, optional): Path to the directory.

    Returns:
        list: List of joined paths to the PDF files in the directory.
    """
    pdfs = []
    for filename in sorted(os.listdir(file_directory)):
        pdf_path = os.path.join(file_directory, filename)
        # Test the joined path: the original called is_pdf_path(filename) on
        # the bare name, whose existence check only passes when the CWD is
        # *file_directory*; the joined-path check also subsumes the redundant
        # follow-up is_file() call.
        if is_pdf_path(pdf_path):
            pdfs.append(pdf_path)
    return pdfs
|
|
284
|
+
|
|
285
|
+
def collate_pdfs(pdf_list: List[str], output_pdf_path: str) -> None:
    """
    Merge multiple PDF files into a single PDF.

    Args:
        pdf_list (list): Paths to PDF files, merged in list order.
        output_pdf_path (str): Path to save the merged PDF.
    """
    pdf_writer = PyPDF2.PdfWriter()
    for file_path in pdf_list:
        with open(file_path, 'rb') as pdf_file:
            pdf_reader = PyPDF2.PdfReader(pdf_file)
            for page in pdf_reader.pages:
                pdf_writer.add_page(page)
    # Bug fix: the original saved to the undefined name `output_file_path`,
    # raising NameError on every call; the parameter is `output_pdf_path`.
    save_pdf(output_pdf_path, pdf_writer)
|