opsci-toolbox 0.0.5__py3-none-any.whl → 0.0.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opsci_toolbox/apis/rapidapi_helpers.py +1 -0
- opsci_toolbox/helpers/common.py +557 -207
- opsci_toolbox/helpers/cv.py +298 -123
- opsci_toolbox/helpers/dataviz.py +875 -191
- opsci_toolbox/helpers/dates.py +55 -8
- opsci_toolbox/helpers/nlp.py +746 -97
- opsci_toolbox/helpers/nlp_cuml.py +166 -57
- opsci_toolbox/helpers/sna.py +101 -10
- opsci_toolbox/helpers/surreaction.py +58 -16
- {opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.6.dist-info}/METADATA +2 -1
- opsci_toolbox-0.0.6.dist-info/RECORD +21 -0
- opsci_toolbox-0.0.5.dist-info/RECORD +0 -21
- {opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.6.dist-info}/WHEEL +0 -0
- {opsci_toolbox-0.0.5.dist-info → opsci_toolbox-0.0.6.dist-info}/top_level.txt +0 -0
opsci_toolbox/helpers/common.py
CHANGED
@@ -15,12 +15,27 @@ import pyarrow.parquet as pq
|
|
15
15
|
from datetime import datetime
|
16
16
|
import hashlib
|
17
17
|
import ast
|
18
|
+
import subprocess
|
18
19
|
|
19
20
|
####################################################################################################
|
20
21
|
# FILE LOADERS
|
21
22
|
####################################################################################################
|
22
23
|
|
23
|
-
def load_file(path, delimiter = ";", decimal ="."):
|
24
|
+
def load_file(path: str, delimiter: str = ";", decimal: str = ".") -> pd.DataFrame:
|
25
|
+
"""
|
26
|
+
Load a file into a Pandas DataFrame based on the file extension.
|
27
|
+
|
28
|
+
Parameters:
|
29
|
+
path (str): The file path to load.
|
30
|
+
delimiter (str, optional): The delimiter used in CSV/TSV files. Default is ";".
|
31
|
+
decimal (str, optional): The character used for decimal points in CSV/TSV files. Default is ".".
|
32
|
+
|
33
|
+
Returns:
|
34
|
+
pd.DataFrame: The loaded data as a Pandas DataFrame.
|
35
|
+
|
36
|
+
Raises:
|
37
|
+
ValueError: If the file extension is not supported.
|
38
|
+
"""
|
24
39
|
extension = os.path.splitext(os.path.basename(path))[1]
|
25
40
|
if extension == ".parquet":
|
26
41
|
df = load_parquet(path)
|
@@ -38,9 +53,18 @@ def load_file(path, delimiter = ";", decimal ="."):
|
|
38
53
|
print("Check your input file. Extension isn't supported : .parquet, .pickle, .json, .jsonl, .csv, .tsv")
|
39
54
|
return df
|
40
55
|
|
41
|
-
def load_parquet(path):
|
56
|
+
def load_parquet(path: str) -> pd.DataFrame:
|
42
57
|
"""
|
43
|
-
Load a parquet file into a DataFrame
|
58
|
+
Load a parquet file into a DataFrame.
|
59
|
+
|
60
|
+
Parameters:
|
61
|
+
path (str): The file path to the parquet file.
|
62
|
+
|
63
|
+
Returns:
|
64
|
+
pd.DataFrame: The loaded data as a Pandas DataFrame.
|
65
|
+
|
66
|
+
Raises:
|
67
|
+
Exception: If there is an error reading the parquet file.
|
44
68
|
"""
|
45
69
|
try:
|
46
70
|
table = pq.read_table(path)
|
@@ -50,78 +74,108 @@ def load_parquet(path):
|
|
50
74
|
print(e)
|
51
75
|
return df
|
52
76
|
|
53
|
-
|
54
|
-
# """
|
55
|
-
# Load a pickle file into a dataframe
|
56
|
-
# """
|
57
|
-
|
58
|
-
# with open(path, 'rb') as f:
|
59
|
-
# df=pickle.load(f)
|
60
|
-
# return df
|
61
|
-
def load_pickle(path):
|
62
|
-
return pd.read_pickle(path)
|
63
|
-
|
64
|
-
def write_pickle(data, path, filename):
|
77
|
+
def load_pickle(path: str) -> pd.DataFrame:
|
65
78
|
"""
|
66
|
-
|
79
|
+
Load a pickle file into a DataFrame.
|
80
|
+
|
81
|
+
Parameters:
|
82
|
+
path (str): The file path to the pickle file.
|
83
|
+
|
84
|
+
Returns:
|
85
|
+
pd.DataFrame: The loaded data as a Pandas DataFrame.
|
67
86
|
"""
|
68
|
-
|
69
|
-
with open(file_path, 'wb') as f:
|
70
|
-
pickle.dump(data, f)
|
71
|
-
return file_path
|
87
|
+
return pd.read_pickle(path)
|
72
88
|
|
73
89
|
|
74
|
-
def load_json(path: str):
|
90
|
+
def load_json(path: str) -> pd.DataFrame:
|
75
91
|
"""
|
76
|
-
Load a
|
92
|
+
Load a JSON file into a DataFrame.
|
93
|
+
|
94
|
+
Parameters:
|
95
|
+
path (str): The file path to the JSON file.
|
96
|
+
|
97
|
+
Returns:
|
98
|
+
pd.DataFrame: The loaded data as a Pandas DataFrame.
|
99
|
+
|
100
|
+
Raises:
|
101
|
+
Exception: If there is an error reading the JSON file.
|
77
102
|
"""
|
78
|
-
df=pd.DataFrame()
|
103
|
+
df = pd.DataFrame()
|
79
104
|
try:
|
80
105
|
with open(path, 'r') as json_file:
|
81
106
|
data = json.load(json_file)
|
82
|
-
df=pd.json_normalize(data)
|
83
|
-
|
107
|
+
df = pd.json_normalize(data)
|
84
108
|
except Exception as e:
|
85
|
-
|
86
|
-
|
109
|
+
print(f"Error reading the JSON file: {e}")
|
110
|
+
raise
|
87
111
|
return df
|
88
112
|
|
89
|
-
def load_jsonl(path: str):
|
113
|
+
def load_jsonl(path: str) -> pd.DataFrame:
|
90
114
|
"""
|
91
|
-
Load a jsonl file into a
|
115
|
+
Load a JSON Lines (jsonl) file into a DataFrame.
|
116
|
+
|
117
|
+
Parameters:
|
118
|
+
path (str): The file path to the jsonl file.
|
119
|
+
|
120
|
+
Returns:
|
121
|
+
pd.DataFrame: The loaded data as a Pandas DataFrame.
|
122
|
+
|
123
|
+
Raises:
|
124
|
+
Exception: If there is an error reading the jsonl file.
|
92
125
|
"""
|
93
126
|
df = pd.DataFrame()
|
94
127
|
try:
|
95
128
|
data = []
|
96
129
|
with open(path, 'r') as json_file:
|
97
|
-
for line in tqdm(json_file):
|
130
|
+
for line in tqdm(json_file, desc="Loading JSON Lines"):
|
98
131
|
try:
|
99
132
|
data.append(json.loads(line))
|
100
|
-
except:
|
101
|
-
|
133
|
+
except json.JSONDecodeError as line_error:
|
134
|
+
print(f"Error decoding line: {line_error}")
|
102
135
|
|
103
136
|
df = pd.json_normalize(data)
|
104
137
|
except Exception as e:
|
105
|
-
|
106
|
-
|
138
|
+
print(f"Error reading the jsonl file: {e}")
|
139
|
+
raise
|
107
140
|
return df
|
108
141
|
|
109
142
|
|
110
|
-
def load_csv(path: str, delimiter: str =";", decimal:str ="."):
|
143
|
+
def load_csv(path: str, delimiter: str = ";", decimal: str = ".") -> pd.DataFrame:
|
111
144
|
"""
|
112
|
-
Load a
|
145
|
+
Load a CSV file into a DataFrame.
|
146
|
+
|
147
|
+
Parameters:
|
148
|
+
path (str): The file path to the CSV file.
|
149
|
+
delimiter (str, optional): The delimiter used in the CSV file. Default is ";".
|
150
|
+
decimal (str, optional): The character used for decimal points in the CSV file. Default is ".".
|
151
|
+
|
152
|
+
Returns:
|
153
|
+
pd.DataFrame: The loaded data as a Pandas DataFrame.
|
154
|
+
|
155
|
+
Raises:
|
156
|
+
Exception: If there is an error reading the CSV file.
|
113
157
|
"""
|
114
|
-
df= pd.DataFrame()
|
158
|
+
df = pd.DataFrame()
|
115
159
|
try:
|
116
160
|
df = pd.read_csv(path, delimiter=delimiter, encoding="utf-8", decimal=decimal)
|
117
161
|
except Exception as e:
|
118
|
-
|
119
|
-
|
162
|
+
print(f"Error reading the CSV file: {e}")
|
163
|
+
raise
|
120
164
|
return df
|
121
165
|
|
122
|
-
def read_txt_to_list(file_path: str):
|
166
|
+
def read_txt_to_list(file_path: str) -> list[str]:
|
123
167
|
"""
|
124
|
-
Read a text file line by line and append to a Python list
|
168
|
+
Read a text file line by line and append to a Python list.
|
169
|
+
|
170
|
+
Parameters:
|
171
|
+
file_path (str): The file path to the text file.
|
172
|
+
|
173
|
+
Returns:
|
174
|
+
list[str]: A list of lines read from the text file.
|
175
|
+
|
176
|
+
Raises:
|
177
|
+
FileNotFoundError: If the file does not exist.
|
178
|
+
Exception: If any other error occurs during file reading.
|
125
179
|
"""
|
126
180
|
|
127
181
|
# Initialize an empty list to store the lines
|
@@ -136,12 +190,22 @@ def read_txt_to_list(file_path: str):
|
|
136
190
|
print(f"File not found: {file_path}")
|
137
191
|
except Exception as e:
|
138
192
|
print(f"An error occurred: {e}")
|
193
|
+
raise
|
139
194
|
return lines
|
140
195
|
|
141
|
-
|
142
|
-
def read_json(path: str):
|
196
|
+
def read_json(path: str) -> dict:
|
143
197
|
"""
|
144
|
-
Read a
|
198
|
+
Read a JSON file and return a dictionary.
|
199
|
+
|
200
|
+
Parameters:
|
201
|
+
path (str): The file path to the JSON file.
|
202
|
+
|
203
|
+
Returns:
|
204
|
+
dict: The data read from the JSON file as a dictionary.
|
205
|
+
|
206
|
+
Raises:
|
207
|
+
FileNotFoundError: If the file does not exist.
|
208
|
+
Exception: If there is an error reading the JSON file.
|
145
209
|
"""
|
146
210
|
with open(path, 'r') as json_file:
|
147
211
|
data = json.load(json_file)
|
@@ -149,25 +213,55 @@ def read_json(path: str):
|
|
149
213
|
|
150
214
|
def read_txt_file(file_path: str) -> str:
|
151
215
|
"""
|
152
|
-
Read a text file
|
216
|
+
Read the content of a text file and return it as a string.
|
217
|
+
|
218
|
+
Parameters:
|
219
|
+
file_path (str): The file path to the text file.
|
220
|
+
|
221
|
+
Returns:
|
222
|
+
str: The content of the text file as a string.
|
223
|
+
|
224
|
+
Raises:
|
225
|
+
FileNotFoundError: If the file does not exist.
|
226
|
+
Exception: If there is an error reading the text file.
|
153
227
|
"""
|
154
|
-
|
155
|
-
|
228
|
+
try:
|
229
|
+
with open(file_path, 'r') as file:
|
230
|
+
content = file.read()
|
231
|
+
except FileNotFoundError:
|
232
|
+
print(f"File not found: {file_path}")
|
233
|
+
raise
|
234
|
+
except Exception as e:
|
235
|
+
print(f"An error occurred while reading the file: {e}")
|
236
|
+
raise
|
156
237
|
return content
|
157
238
|
|
158
|
-
def read_jsonl(path: str):
|
239
|
+
def read_jsonl(path: str) -> list[dict]:
|
159
240
|
"""
|
160
|
-
Load a jsonl file into a
|
241
|
+
Load a JSON Lines (jsonl) file into a list of dictionaries.
|
242
|
+
|
243
|
+
Parameters:
|
244
|
+
path (str): The file path to the jsonl file.
|
245
|
+
|
246
|
+
Returns:
|
247
|
+
list[dict]: A list of dictionaries containing the data read from the JSON Lines file.
|
248
|
+
|
249
|
+
Raises:
|
250
|
+
FileNotFoundError: If the file does not exist.
|
251
|
+
Exception: If there is an error reading the jsonl file.
|
161
252
|
"""
|
162
253
|
json_data = []
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
254
|
+
try:
|
255
|
+
with open(path, 'r') as json_file:
|
256
|
+
for line in tqdm(json_file, desc="Reading JSON Lines"):
|
257
|
+
try:
|
258
|
+
json_data.append(json.loads(line))
|
259
|
+
except Exception as e:
|
260
|
+
print(f"Error decoding line: {e}")
|
261
|
+
raise
|
262
|
+
except FileNotFoundError:
|
263
|
+
print(f"File not found: {path}")
|
264
|
+
raise
|
171
265
|
return json_data
|
172
266
|
|
173
267
|
|
@@ -176,37 +270,55 @@ def read_jsonl(path: str):
|
|
176
270
|
#########################################################################################
|
177
271
|
|
178
272
|
|
179
|
-
|
180
|
-
|
181
|
-
|
182
|
-
|
183
|
-
|
273
|
+
def write_pickle(data: pd.DataFrame, path: str, filename: str) -> str:
|
274
|
+
"""
|
275
|
+
Write a DataFrame into a pickle file.
|
276
|
+
|
277
|
+
Parameters:
|
278
|
+
data (pd.DataFrame): The DataFrame to be written to the pickle file.
|
279
|
+
path (str): The directory where the pickle file will be saved.
|
280
|
+
filename (str): The name of the pickle file (without the extension).
|
184
281
|
|
185
|
-
|
186
|
-
|
187
|
-
|
282
|
+
Returns:
|
283
|
+
str: The full path to the saved pickle file.
|
284
|
+
"""
|
285
|
+
file_path = os.path.join(path, filename + '.pickle')
|
286
|
+
with open(file_path, 'wb') as f:
|
287
|
+
pickle.dump(data, f)
|
288
|
+
return file_path
|
188
289
|
|
189
290
|
|
190
|
-
def write_list_to_txt(input_list: list, path: str, name: str):
|
291
|
+
def write_list_to_txt(input_list: list, path: str, name: str) -> str:
|
191
292
|
"""
|
192
293
|
Write a list to a text file, with each item on a new line.
|
193
294
|
|
194
295
|
Parameters:
|
195
|
-
- file_path (str): The path to the text file.
|
196
296
|
- input_list (list): The list to be written to the text file.
|
297
|
+
- path (str): The directory path where the text file will be saved.
|
298
|
+
- name (str): The name of the text file (without the extension).
|
299
|
+
|
300
|
+
Returns:
|
301
|
+
str: The full path to the saved text file.
|
197
302
|
"""
|
198
|
-
file_path=os.path.join(path, name+'.txt')
|
303
|
+
file_path = os.path.join(path, name + '.txt')
|
199
304
|
with open(file_path, 'w') as file:
|
200
305
|
for item in input_list:
|
201
306
|
file.write(str(item) + '\n')
|
202
|
-
|
203
307
|
return file_path
|
204
308
|
|
205
|
-
def write_jsonl(data: list,
|
309
|
+
def write_jsonl(data: list[dict], path: str, name: str) -> str:
|
206
310
|
"""
|
207
|
-
Write a jsonl file.
|
311
|
+
Write data to a JSON Lines (jsonl) file. Each dictionary in the list represents a single JSON object.
|
312
|
+
|
313
|
+
Parameters:
|
314
|
+
- data (list[dict]): The list of dictionaries to be written to the JSON Lines file.
|
315
|
+
- path (str): The directory path where the JSON Lines file will be saved.
|
316
|
+
- name (str): The name of the JSON Lines file (without the extension).
|
317
|
+
|
318
|
+
Returns:
|
319
|
+
str: The full path to the saved JSON Lines file.
|
208
320
|
"""
|
209
|
-
file_path=os.path.join(path, name+'.jsonl')
|
321
|
+
file_path = os.path.join(path, name + '.jsonl')
|
210
322
|
with open(file_path, 'w') as file:
|
211
323
|
for entry in data:
|
212
324
|
json.dump(entry, file)
|
@@ -214,41 +326,67 @@ def write_jsonl(data: list, path: str, name: str):
|
|
214
326
|
return file_path
|
215
327
|
|
216
328
|
|
217
|
-
def write_json(json_dict: dict, path: str, name: str):
|
329
|
+
def write_json(json_dict: dict, path: str, name: str) -> str:
|
218
330
|
"""
|
219
|
-
Write a
|
331
|
+
Write a dictionary to a JSON file.
|
332
|
+
|
333
|
+
Parameters:
|
334
|
+
- json_dict (dict): The dictionary to be written to the JSON file.
|
335
|
+
- path (str): The directory path where the JSON file will be saved.
|
336
|
+
- name (str): The name of the JSON file (without the extension).
|
337
|
+
|
338
|
+
Returns:
|
339
|
+
str: The full path to the saved JSON file.
|
220
340
|
"""
|
221
|
-
file_path=os.path.join(path, name+'.json')
|
341
|
+
file_path = os.path.join(path, name + '.json')
|
222
342
|
with open(file_path, 'w') as outfile:
|
223
343
|
json.dump(json_dict, outfile)
|
224
344
|
return file_path
|
225
345
|
|
226
346
|
|
227
|
-
def write_dataframe_to_json(df: pd.DataFrame, path: str, name: str, orient='records'):
|
347
|
+
def write_dataframe_to_json(df: pd.DataFrame, path: str, name: str, orient: str = 'records') -> str:
|
228
348
|
"""
|
229
|
-
Write a
|
349
|
+
Write a DataFrame to a JSON file.
|
350
|
+
|
351
|
+
Parameters:
|
352
|
+
- df (pd.DataFrame): The DataFrame to be written to the JSON file.
|
353
|
+
- path (str): The directory path where the JSON file will be saved.
|
354
|
+
- name (str): The name of the JSON file (without the extension).
|
355
|
+
- orient (str, optional): The format of the JSON file. Default is 'records'.
|
356
|
+
|
357
|
+
Returns:
|
358
|
+
str: The full path to the saved JSON file.
|
230
359
|
"""
|
231
|
-
file_path=os.path.join(path, name+".json")
|
360
|
+
file_path = os.path.join(path, name + ".json")
|
232
361
|
df.to_json(file_path, orient=orient, lines=True)
|
362
|
+
return file_path
|
233
363
|
|
234
364
|
|
235
|
-
def save_dataframe_excel(df: pd.DataFrame, path: str, name
|
365
|
+
def save_dataframe_excel(df: pd.DataFrame, path: str, name: str, sheet_name: str) -> str:
|
236
366
|
"""
|
237
|
-
Write a
|
367
|
+
Write a DataFrame to an Excel file.
|
368
|
+
|
369
|
+
Parameters:
|
370
|
+
- df (pd.DataFrame): The DataFrame to be written to the Excel file.
|
371
|
+
- path (str): The directory path where the Excel file will be saved.
|
372
|
+
- name (str): The name of the Excel file (without the extension).
|
373
|
+
- sheet_name (str): The name of the Excel sheet.
|
374
|
+
|
375
|
+
Returns:
|
376
|
+
str: The full path to the saved Excel file.
|
238
377
|
"""
|
239
|
-
|
240
|
-
file_path=os.path.join(path, f"{name}.xlsx")
|
378
|
+
file_path = os.path.join(path, f"{name}.xlsx")
|
241
379
|
df.to_excel(file_path, sheet_name=sheet_name, index=False)
|
242
380
|
print(file_path, "- File created")
|
243
381
|
return file_path
|
244
382
|
|
245
|
-
def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_name: str):
|
383
|
+
def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_name: str) -> None:
|
246
384
|
"""
|
247
385
|
Adds a DataFrame to an existing Excel file as a new sheet.
|
248
386
|
|
249
387
|
Parameters:
|
388
|
+
- df (pd.DataFrame): The DataFrame to be added.
|
250
389
|
- existing_file_path (str): Path to the existing Excel file.
|
251
|
-
- dataframe (pd.DataFrame): The DataFrame to be added.
|
252
390
|
- new_sheet_name (str): Name of the new sheet in the Excel file.
|
253
391
|
|
254
392
|
Returns:
|
@@ -257,7 +395,7 @@ def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_
|
|
257
395
|
# Read existing Excel file into a dictionary of DataFrames
|
258
396
|
excel_file = pd.read_excel(existing_file_path, sheet_name=None)
|
259
397
|
|
260
|
-
# Add the new DataFrame to the dictionary with the specified sheet
|
398
|
+
# Add the new DataFrame to the dictionary with the specified sheet name
|
261
399
|
excel_file[new_sheet_name] = df
|
262
400
|
|
263
401
|
# Write the updated dictionary of DataFrames back to the Excel file
|
@@ -265,46 +403,62 @@ def add_dataframe_to_excel(df: pd.DataFrame, existing_file_path: str, new_sheet_
|
|
265
403
|
for sheet_name, df in excel_file.items():
|
266
404
|
df.to_excel(writer, sheet_name=sheet_name, index=False)
|
267
405
|
|
268
|
-
def save_dataframe_csv(df: pd.DataFrame, path: str, name: str):
|
406
|
+
def save_dataframe_csv(df: pd.DataFrame, path: str, name: str) -> str:
|
269
407
|
"""
|
270
|
-
|
271
|
-
|
272
|
-
:
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
277
|
-
|
278
|
-
:
|
279
|
-
:type name: str
|
408
|
+
Save a DataFrame to a CSV file within a specified directory.
|
409
|
+
|
410
|
+
Parameters:
|
411
|
+
- df (pd.DataFrame): The DataFrame to be saved.
|
412
|
+
- path (str): The directory where the CSV file will be saved.
|
413
|
+
- name (str): The desired name for the CSV file (without extension).
|
414
|
+
|
415
|
+
Returns:
|
416
|
+
str: The full path to the saved CSV file.
|
280
417
|
"""
|
281
|
-
|
418
|
+
file_path = os.path.join(path, f"{name}.csv")
|
282
419
|
df.to_csv(
|
283
|
-
|
284
|
-
header=names,
|
420
|
+
file_path,
|
285
421
|
sep=";",
|
286
422
|
encoding="utf-8",
|
287
423
|
index=False,
|
288
424
|
decimal=",",
|
289
425
|
)
|
290
|
-
print("
|
426
|
+
print("File saved:", file_path)
|
427
|
+
return file_path
|
291
428
|
|
292
|
-
def write_txt_file(data: str,
|
429
|
+
def write_txt_file(data: str, path: str, name: str) -> str:
|
293
430
|
"""
|
294
|
-
Write a text file
|
431
|
+
Write a string to a text file.
|
432
|
+
|
433
|
+
Parameters:
|
434
|
+
- data (str): The string to be written to the text file.
|
435
|
+
- path (str): The directory path where the text file will be saved.
|
436
|
+
- name (str): The name of the text file (without the extension).
|
437
|
+
|
438
|
+
Returns:
|
439
|
+
str: The full path to the saved text file.
|
295
440
|
"""
|
296
|
-
file_path=os.path.join(path, name+'.txt')
|
441
|
+
file_path = os.path.join(path, name + '.txt')
|
297
442
|
with open(file_path, "w") as file:
|
298
443
|
file.write(data)
|
299
444
|
return file_path
|
300
445
|
|
301
|
-
def split_df_into_chunks(df, path, name, chunk_size = 10000):
|
446
|
+
def split_df_into_chunks(df: pd.DataFrame, path: str, name: str, chunk_size: int = 10000) -> list[str]:
|
302
447
|
"""
|
303
|
-
Split a
|
448
|
+
Split a DataFrame into multiple pickle files with a specified chunk size.
|
449
|
+
|
450
|
+
Parameters:
|
451
|
+
- df (pd.DataFrame): The DataFrame to be split.
|
452
|
+
- path (str): The directory path where the pickle files will be saved.
|
453
|
+
- name (str): The base name for the pickle files.
|
454
|
+
- chunk_size (int, optional): The size of each chunk. Default is 10000.
|
455
|
+
|
456
|
+
Returns:
|
457
|
+
list[str]: A list of file paths to the saved pickle files.
|
304
458
|
"""
|
305
459
|
num_chunks = -(-len(df) // chunk_size) # Calculate the number of chunks using ceil division
|
306
460
|
|
307
|
-
file_paths=[]
|
461
|
+
file_paths = []
|
308
462
|
|
309
463
|
# create smaller datasets of chunk_size each
|
310
464
|
for i in range(num_chunks):
|
@@ -317,16 +471,19 @@ def split_df_into_chunks(df, path, name, chunk_size = 10000):
|
|
317
471
|
|
318
472
|
return file_paths
|
319
473
|
|
320
|
-
|
321
|
-
|
322
474
|
###################################################################################################
|
323
475
|
# FOLDERS / FILES HELPERS
|
324
476
|
###################################################################################################
|
325
477
|
|
326
|
-
def create_dir(path:str):
|
478
|
+
def create_dir(path: str) -> str:
|
327
479
|
"""
|
328
|
-
Create a local directory
|
480
|
+
Create a local directory if it doesn't exist.
|
481
|
+
|
482
|
+
Parameters:
|
483
|
+
- path (str): The directory path to be created.
|
329
484
|
|
485
|
+
Returns:
|
486
|
+
str: The path of the created directory.
|
330
487
|
"""
|
331
488
|
if not os.path.exists(path):
|
332
489
|
os.makedirs(path)
|
@@ -334,18 +491,31 @@ def create_dir(path:str):
|
|
334
491
|
return path
|
335
492
|
|
336
493
|
|
337
|
-
def list_files_in_dir(path: str, filetype:str ='*.json'):
|
494
|
+
def list_files_in_dir(path: str, filetype: str = '*.json') -> list[str]:
|
338
495
|
"""
|
339
|
-
List files of a specific format in a directory
|
496
|
+
List files of a specific format in a directory.
|
497
|
+
|
498
|
+
Parameters:
|
499
|
+
- path (str): The directory path to search for files.
|
500
|
+
- filetype (str, optional): The file type pattern to search for. Default is '*.json'.
|
501
|
+
|
502
|
+
Returns:
|
503
|
+
list[str]: A list of file paths matching the specified file type pattern.
|
340
504
|
"""
|
341
505
|
pattern = os.path.join(path, filetype)
|
342
506
|
files = glob.glob(pattern)
|
343
507
|
return files
|
344
508
|
|
345
509
|
|
346
|
-
def list_subdirectories(root_directory: str):
|
510
|
+
def list_subdirectories(root_directory: str) -> list[str]:
|
347
511
|
"""
|
348
|
-
List subdirectories in a root directory
|
512
|
+
List subdirectories in a root directory.
|
513
|
+
|
514
|
+
Parameters:
|
515
|
+
- root_directory (str): The root directory path.
|
516
|
+
|
517
|
+
Returns:
|
518
|
+
list[str]: A list of subdirectory names.
|
349
519
|
"""
|
350
520
|
subdirectories = []
|
351
521
|
for entry in os.scandir(root_directory):
|
@@ -354,9 +524,15 @@ def list_subdirectories(root_directory: str):
|
|
354
524
|
return subdirectories
|
355
525
|
|
356
526
|
|
357
|
-
def list_recursive_subdirectories(root_directory: str):
|
527
|
+
def list_recursive_subdirectories(root_directory: str) -> list[str]:
|
358
528
|
"""
|
359
|
-
List recursively all subdirectories from a root directory
|
529
|
+
List recursively all subdirectories from a root directory.
|
530
|
+
|
531
|
+
Parameters:
|
532
|
+
- root_directory (str): The root directory path.
|
533
|
+
|
534
|
+
Returns:
|
535
|
+
list[str]: A list of subdirectory paths.
|
360
536
|
"""
|
361
537
|
subdirectories = []
|
362
538
|
for root, dirs, files in os.walk(root_directory):
|
@@ -364,9 +540,16 @@ def list_recursive_subdirectories(root_directory: str):
|
|
364
540
|
return subdirectories
|
365
541
|
|
366
542
|
|
367
|
-
def list_files_in_subdirectories(path:str, filetype:str='*.json'):
|
543
|
+
def list_files_in_subdirectories(path: str, filetype: str = '*.json') -> list[str]:
|
368
544
|
"""
|
369
|
-
Walk through subdirectories of a root directory to list files of a specific format
|
545
|
+
Walk through subdirectories of a root directory to list files of a specific format.
|
546
|
+
|
547
|
+
Parameters:
|
548
|
+
- path (str): The root directory path.
|
549
|
+
- filetype (str, optional): The file type pattern to search for. Default is '*.json'.
|
550
|
+
|
551
|
+
Returns:
|
552
|
+
list[str]: A list of file paths matching the specified file type pattern in subdirectories.
|
370
553
|
"""
|
371
554
|
files = []
|
372
555
|
|
@@ -381,21 +564,36 @@ def list_files_in_subdirectories(path:str, filetype:str='*.json'):
|
|
381
564
|
|
382
565
|
return files
|
383
566
|
|
384
|
-
def copy_file(source_path: str, destination_path: str, new_filename:str):
|
567
|
+
def copy_file(source_path: str, destination_path: str, new_filename: str = '') -> str:
|
385
568
|
"""
|
386
|
-
|
569
|
+
Copy a file from a source path to a destination path.
|
570
|
+
|
571
|
+
Parameters:
|
572
|
+
- source_path (str): The path of the source file.
|
573
|
+
- destination_path (str): The path of the destination directory.
|
574
|
+
- new_filename (str, optional): The new filename. If not provided, the original filename is used.
|
575
|
+
|
576
|
+
Returns:
|
577
|
+
str: The path of the copied file.
|
387
578
|
"""
|
388
579
|
if new_filename:
|
389
|
-
file_path=os.path.join(destination_path, new_filename)
|
580
|
+
file_path = os.path.join(destination_path, new_filename)
|
390
581
|
else:
|
391
|
-
filename=os.path.basename(source_path)
|
392
|
-
file_path=os.path.join(destination_path,filename)
|
582
|
+
filename = os.path.basename(source_path)
|
583
|
+
file_path = os.path.join(destination_path, filename)
|
584
|
+
|
393
585
|
shutil.copy(source_path, file_path)
|
394
586
|
return file_path
|
395
587
|
|
396
|
-
def remove_file(file_path):
|
588
|
+
def remove_file(file_path: str) -> None:
|
397
589
|
"""
|
398
|
-
Remove a single file
|
590
|
+
Remove a single file.
|
591
|
+
|
592
|
+
Parameters:
|
593
|
+
- file_path (str): The path of the file to be removed.
|
594
|
+
|
595
|
+
Returns:
|
596
|
+
None
|
399
597
|
"""
|
400
598
|
try:
|
401
599
|
os.remove(file_path)
|
@@ -403,20 +601,33 @@ def remove_file(file_path):
|
|
403
601
|
except OSError as e:
|
404
602
|
print(f"Error removing file {file_path}: {e}")
|
405
603
|
|
406
|
-
def remove_folder(folder_path):
|
604
|
+
def remove_folder(folder_path: str) -> None:
|
407
605
|
"""
|
408
|
-
Remove a folder and all
|
606
|
+
Remove a folder and all its contents.
|
607
|
+
|
608
|
+
Parameters:
|
609
|
+
- folder_path (str): The path of the folder to be removed.
|
610
|
+
|
611
|
+
Returns:
|
612
|
+
None
|
409
613
|
"""
|
410
614
|
try:
|
411
615
|
shutil.rmtree(folder_path)
|
412
616
|
print(f"Folder {folder_path} and its contents removed successfully.")
|
413
617
|
except OSError as e:
|
414
|
-
print(f"Error removing folder {folder_path}: {e}")
|
618
|
+
print(f"Error removing folder {folder_path}: {e}")
|
415
619
|
|
416
620
|
|
417
|
-
def get_file_size(file_path):
|
621
|
+
def get_file_size(file_path: str) -> tuple[int, str]:
|
418
622
|
"""
|
419
|
-
Get a single file
|
623
|
+
Get the size of a single file in a readable format (KB, MB, GB).
|
624
|
+
|
625
|
+
Parameters:
|
626
|
+
- file_path (str): The path of the file.
|
627
|
+
|
628
|
+
Returns:
|
629
|
+
tuple[int, str]: A tuple containing the size of the file in bytes and its formatted size.
|
630
|
+
If the file is not found, returns None.
|
420
631
|
"""
|
421
632
|
try:
|
422
633
|
size = os.path.getsize(file_path)
|
@@ -439,9 +650,16 @@ def get_file_size(file_path):
|
|
439
650
|
print(f"File not found: {file_path}")
|
440
651
|
return None
|
441
652
|
|
442
|
-
def get_folder_size(folder_path):
|
653
|
+
def get_folder_size(folder_path: str) -> tuple[int, str]:
|
443
654
|
"""
|
444
|
-
Get size of all files contained in a folder in a readable format (KB, MB, GB)
|
655
|
+
Get the size of all files contained in a folder in a readable format (KB, MB, GB).
|
656
|
+
|
657
|
+
Parameters:
|
658
|
+
- folder_path (str): The path of the folder.
|
659
|
+
|
660
|
+
Returns:
|
661
|
+
tuple[int, str]: A tuple containing the total size of all files in bytes and its formatted size.
|
662
|
+
If the folder is not found, returns None.
|
445
663
|
"""
|
446
664
|
total_size = 0
|
447
665
|
|
@@ -469,9 +687,16 @@ def get_folder_size(folder_path):
|
|
469
687
|
print(f"Folder not found: {folder_path}")
|
470
688
|
return None
|
471
689
|
|
472
|
-
def file_creation_date(file_path):
|
690
|
+
def file_creation_date(file_path: str) -> datetime:
|
473
691
|
"""
|
474
|
-
Return the last update timestamp
|
692
|
+
Return the last update timestamp of a file.
|
693
|
+
|
694
|
+
Parameters:
|
695
|
+
- file_path (str): The path of the file.
|
696
|
+
|
697
|
+
Returns:
|
698
|
+
datetime: The last update timestamp as a datetime object.
|
699
|
+
If the file does not exist, returns None.
|
475
700
|
"""
|
476
701
|
# Check if the file exists
|
477
702
|
if os.path.exists(file_path):
|
@@ -488,27 +713,34 @@ def file_creation_date(file_path):
|
|
488
713
|
############################################################################
|
489
714
|
|
490
715
|
|
491
|
-
def transform_to_n_items_list(
|
716
|
+
def transform_to_n_items_list(lst: list, n: int) -> list[list]:
|
492
717
|
"""
|
493
718
|
Transform a list into a list of n-items sublists.
|
494
719
|
|
495
720
|
Parameters:
|
496
|
-
-
|
497
|
-
- n: The number of items in each sublist.
|
721
|
+
- lst (list): The input list to be transformed.
|
722
|
+
- n (int): The number of items in each sublist.
|
498
723
|
|
499
724
|
Returns:
|
500
|
-
A list of n-items sublists.
|
725
|
+
list[list]: A list of n-items sublists.
|
501
726
|
"""
|
502
|
-
return [
|
727
|
+
return [lst[i:i + n] for i in range(0, len(lst), n)]
|
503
728
|
|
504
|
-
|
729
|
+
|
730
|
+
def unduplicate_list(lst: list) -> list:
|
505
731
|
"""
|
506
|
-
|
732
|
+
Remove duplicate elements from a list.
|
733
|
+
|
734
|
+
Parameters:
|
735
|
+
- lst (list): The input list with possible duplicate elements.
|
736
|
+
|
737
|
+
Returns:
|
738
|
+
list: A list with duplicate elements removed.
|
507
739
|
"""
|
508
740
|
return list(set(lst))
|
509
741
|
|
510
742
|
|
511
|
-
def sort_list(lst, reverse=False):
|
743
|
+
def sort_list(lst: list, reverse: bool = False) -> list:
|
512
744
|
"""
|
513
745
|
Sort the list in ascending or descending order.
|
514
746
|
|
@@ -518,12 +750,12 @@ def sort_list(lst, reverse=False):
|
|
518
750
|
If False (default), sort the list in ascending order.
|
519
751
|
|
520
752
|
Returns:
|
521
|
-
|
753
|
+
list: A new list sorted based on the specified order.
|
522
754
|
"""
|
523
755
|
return sorted(lst, reverse=reverse)
|
524
756
|
|
525
757
|
|
526
|
-
def map_list(lst, function):
|
758
|
+
def map_list(lst: list, function: callable) -> list:
|
527
759
|
"""
|
528
760
|
Apply a function to each element of the list.
|
529
761
|
|
@@ -532,12 +764,12 @@ def map_list(lst, function):
|
|
532
764
|
- function (callable): The function to apply to each element.
|
533
765
|
|
534
766
|
Returns:
|
535
|
-
|
767
|
+
list: A new list with the function applied to each element.
|
536
768
|
"""
|
537
769
|
return [function(element) for element in lst]
|
538
770
|
|
539
771
|
|
540
|
-
def flatten_list(lst):
|
772
|
+
def flatten_list(lst: list) -> list:
|
541
773
|
"""
|
542
774
|
Flatten a nested list into a single list.
|
543
775
|
|
@@ -545,7 +777,7 @@ def flatten_list(lst):
|
|
545
777
|
- lst (list): The input nested list.
|
546
778
|
|
547
779
|
Returns:
|
548
|
-
|
780
|
+
list: A new list with all nested elements flattened.
|
549
781
|
"""
|
550
782
|
flattened_list = []
|
551
783
|
|
@@ -560,7 +792,7 @@ def flatten_list(lst):
|
|
560
792
|
return flattened_list
|
561
793
|
|
562
794
|
|
563
|
-
def find_occurrences(lst: list, element) -> int:
    """
    Count how many times `element` appears in `lst`.

    Parameters:
        lst (list): The list to search through.
        element: The value whose occurrences are counted.

    Returns:
        int: The number of occurrences of `element` in `lst`.
    """
    # list.count performs the identity-or-equality scan for us.
    occurrences = lst.count(element)
    return occurrences
|
575
807
|
|
576
808
|
|
577
|
-
def is_subset(subset: list, superset: list) -> bool:
    """
    Check whether every element of one list is present in another.

    Parameters:
        subset (list): The candidate subset.
        superset (list): The list that may contain all of `subset`'s elements.

    Returns:
        bool: True if each element of `subset` occurs in `superset`,
            False otherwise. An empty `subset` is always a subset.
    """
    for element in subset:
        if element not in superset:
            return False
    return True
|
589
821
|
|
590
|
-
def common_elements(list1: list, list2: list) -> list:
    """
    Find the elements shared by two lists.

    Parameters:
        list1 (list): The first list.
        list2 (list): The second list.

    Returns:
        list: The distinct elements present in both lists.
            Order is not guaranteed (set-based intersection).
    """
    shared = set(list1).intersection(list2)
    return list(shared)
|
602
834
|
|
603
835
|
|
604
|
-
def shuffle_list(lst: list) -> list:
    """
    Return a randomly shuffled copy of the list.

    Parameters:
        lst (list): The input list.

    Returns:
        list: A new list with the same elements in random order;
            the input list itself is not modified.
    """
    result = list(lst)
    random.shuffle(result)
    return result
|
617
849
|
|
618
850
|
|
619
|
-
def sample_list(lst, sample_size):
|
851
|
+
def sample_list(lst: list, sample_size) -> list:
|
620
852
|
"""
|
621
853
|
Sample a list based on an integer or a float representing the sample size.
|
622
854
|
|
@@ -626,7 +858,11 @@ def sample_list(lst, sample_size):
|
|
626
858
|
If a float, the percentage of elements to keep.
|
627
859
|
|
628
860
|
Returns:
|
629
|
-
|
861
|
+
list: A new list containing the sampled elements.
|
862
|
+
|
863
|
+
Raises:
|
864
|
+
- ValueError: If the sample size is invalid (negative integer or float outside [0, 1]).
|
865
|
+
- TypeError: If the sample size is neither an integer nor a float.
|
630
866
|
"""
|
631
867
|
if isinstance(sample_size, int):
|
632
868
|
if sample_size < 0:
|
@@ -640,7 +876,7 @@ def sample_list(lst, sample_size):
|
|
640
876
|
else:
|
641
877
|
raise TypeError("Sample size must be an integer or a float.")
|
642
878
|
|
643
|
-
def count_elements(lst: list) -> dict:
    """
    Tally the occurrences of each distinct element in the list.

    Parameters:
        lst (list): The input list.

    Returns:
        dict: Maps each unique element of `lst` to its number of occurrences.
    """
    tally = Counter(lst)
    return dict(tally)
|
654
890
|
|
655
|
-
def scale_list(lst: list, min_val: float = 1, max_val: float = 5) -> list:
    """
    Linearly rescale the values of a list to the range [min_val, max_val].

    Parameters:
        lst (list): The input list of numeric values to be scaled.
        min_val (float): The minimum value of the output range (default is 1).
        max_val (float): The maximum value of the output range (default is 5).

    Returns:
        list: A new list with values mapped onto [min_val, max_val].
            An empty input yields an empty list; if all input values are
            equal the linear map is degenerate and every output is `min_val`.
    """
    if not lst:
        # min()/max() would raise ValueError on an empty sequence.
        return []

    min_w = min(lst)
    max_w = max(lst)
    span = max_w - min_w
    if span == 0:
        # All values identical: hoisted out of the loop instead of catching
        # ZeroDivisionError on every iteration.
        return [min_val] * len(lst)

    return [(x - min_w) / span * (max_val - min_val) + min_val for x in lst]
|
668
913
|
|
669
|
-
|
914
|
+
|
915
|
+
def df_scale_column(df: pd.DataFrame, col_to_scale: str, col_out: str, min_val: float, max_val: float) -> pd.DataFrame:
    """
    Scale values in a DataFrame column linearly to [min_val, max_val].

    Parameters:
        df (pd.DataFrame): The input DataFrame (modified in place).
        col_to_scale (str): The name of the column holding the values to scale.
        col_out (str): The name of the column that receives the scaled values.
        min_val (float): The minimum value of the output range.
        max_val (float): The maximum value of the output range.

    Returns:
        pd.DataFrame: The same DataFrame with `col_out` added/overwritten.
    """
    min_freq = df[col_to_scale].min()
    max_freq = df[col_to_scale].max()
    span = max_freq - min_freq
    if span == 0:
        # Constant column: the linear map is degenerate, pin to min_val
        # instead of dividing by zero.
        df[col_out] = min_val
    else:
        # Vectorised Series arithmetic instead of a per-row Python lambda.
        df[col_out] = (df[col_to_scale] - min_freq) / span * (max_val - min_val) + min_val
    return df
|
674
933
|
|
675
934
|
############################################################################
|
676
935
|
# ZIP HELPERS
|
677
936
|
############################################################################
|
678
937
|
|
679
|
-
def zip_file(source_file_path, zip_file_path, name):
|
938
|
+
def zip_file(source_file_path: str, zip_file_path: str, name: str) -> str:
|
680
939
|
"""
|
681
940
|
Zip a single file.
|
682
941
|
|
683
|
-
|
684
|
-
|
685
|
-
|
942
|
+
Parameters:
|
943
|
+
- source_file_path (str): Path to the file to be zipped.
|
944
|
+
- zip_file_path (str): Path for the resulting zip file.
|
945
|
+
- name (str): Name for the resulting zip file (without extension).
|
686
946
|
|
687
947
|
Returns:
|
688
|
-
|
948
|
+
str: Path to the resulting zip file.
|
689
949
|
"""
|
690
|
-
file_path=os.path.join(zip_file_path, name
|
950
|
+
file_path = os.path.join(zip_file_path, f"{name}.zip")
|
691
951
|
|
692
952
|
with zipfile.ZipFile(file_path, 'w') as zip_file:
|
693
953
|
# The second argument to `arcname` is used to set the name of the file inside the zip
|
@@ -695,18 +955,19 @@ def zip_file(source_file_path, zip_file_path, name):
|
|
695
955
|
|
696
956
|
return file_path
|
697
957
|
|
698
|
-
def zip_folder(source_folder_path, zip_file_path, name):
|
958
|
+
def zip_folder(source_folder_path: str, zip_file_path: str, name: str) -> str:
|
699
959
|
"""
|
700
960
|
Zip an entire folder.
|
701
961
|
|
702
|
-
|
703
|
-
|
704
|
-
|
962
|
+
Parameters:
|
963
|
+
- source_folder_path (str): Path to the folder to be zipped.
|
964
|
+
- zip_file_path (str): Path for the resulting zip file.
|
965
|
+
- name (str): Name for the resulting zip file (without extension).
|
705
966
|
|
706
967
|
Returns:
|
707
|
-
|
968
|
+
str: Path to the resulting zip file.
|
708
969
|
"""
|
709
|
-
file_path=os.path.join(zip_file_path, name
|
970
|
+
file_path = os.path.join(zip_file_path, f"{name}.zip")
|
710
971
|
|
711
972
|
with zipfile.ZipFile(file_path, 'w', zipfile.ZIP_DEFLATED) as zip_file:
|
712
973
|
for foldername, subfolders, filenames in os.walk(source_folder_path):
|
@@ -717,13 +978,19 @@ def zip_folder(source_folder_path, zip_file_path, name):
|
|
717
978
|
|
718
979
|
return file_path
|
719
980
|
|
720
|
-
def unzip_file(zip_file_path: str, destination_path: str) -> None:
    """
    Extract every member of a zip archive into a destination directory.

    Parameters:
        zip_file_path (str): Path to the zip archive to extract.
        destination_path (str): Directory where the archive contents are placed.

    Returns:
        None
    """
    archive = zipfile.ZipFile(zip_file_path, "r")
    try:
        archive.extractall(destination_path)
    finally:
        archive.close()
|
726
|
-
|
727
994
|
|
728
995
|
|
729
996
|
############################################################################
|
@@ -731,19 +998,32 @@ def unzip_file(zip_file_path, destination_path):
|
|
731
998
|
############################################################################
|
732
999
|
|
733
1000
|
|
734
|
-
def create_google_spreadsheet_client(credentials: str):
    """
    Build a Gspread client for interacting with Google Sheets.

    Parameters:
        credentials (str): Path to the JSON file holding Google Service
            Account credentials.

    Returns:
        gspread.Client: An authenticated client object.
    """
    client = gspread.service_account(filename=credentials)
    return client
|
739
1012
|
|
740
|
-
def read_google_spreadsheet(client, sheet_id: str, worksheet_name: str):
|
1013
|
+
def read_google_spreadsheet(client: gspread.Client, sheet_id: str, worksheet_name: str) -> pd.DataFrame:
|
741
1014
|
"""
|
742
|
-
|
1015
|
+
Read data from a Google spreadsheet and return it as a DataFrame.
|
1016
|
+
|
1017
|
+
Parameters:
|
1018
|
+
- client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
|
1019
|
+
- sheet_id (str): The ID of the Google spreadsheet.
|
1020
|
+
- worksheet_name (str): The name of the worksheet within the spreadsheet.
|
1021
|
+
|
1022
|
+
Returns:
|
1023
|
+
pd.DataFrame: A DataFrame containing the data from the specified worksheet.
|
743
1024
|
"""
|
744
1025
|
try:
|
745
|
-
|
746
|
-
# Open the Google Spreadsheet by name
|
1026
|
+
# Open the Google Spreadsheet by ID
|
747
1027
|
sheet = client.open_by_key(sheet_id)
|
748
1028
|
|
749
1029
|
# Select a specific worksheet by name
|
@@ -763,29 +1043,52 @@ def read_google_spreadsheet(client, sheet_id: str, worksheet_name: str):
|
|
763
1043
|
print(f"An error occurred: {e}")
|
764
1044
|
|
765
1045
|
|
766
|
-
def list_google_worksheets(client: gspread.Client, sheet_id: str) -> list:
    """
    List the worksheet names contained in a Google spreadsheet.

    Parameters:
        client (gspread.Client): A Gspread client authenticated with the
            Google Sheets API.
        sheet_id (str): The ID of the Google spreadsheet.

    Returns:
        list: The titles of every worksheet in the spreadsheet.
    """
    spreadsheet = client.open_by_key(sheet_id)
    return [worksheet.title for worksheet in spreadsheet.worksheets()]
|
774
1061
|
|
775
|
-
def get_spreadsheet_permissions(client: gspread.Client, sheet_id: str) -> pd.DataFrame:
    """
    Build a DataFrame of the users (email and type) that can access a document.

    Parameters:
        client (gspread.Client): A Gspread client authenticated with the
            Google Sheets API.
        sheet_id (str): The ID of the Google spreadsheet.

    Returns:
        pd.DataFrame: Columns 'email' and 'type', one row per permission
            entry that carries an email address.
    """
    sheet = client.open_by_key(sheet_id)
    rows = []
    for user in sheet.list_permissions():
        email = user.get("emailAddress")
        # Entries without an email (e.g. link-based access) are skipped.
        if email is not None:
            rows.append((email, user.get("type")))
    return pd.DataFrame(rows, columns=['email', 'type'])
|
784
1078
|
|
785
1079
|
|
786
|
-
def create_google_spreadsheet(client, df, filename:str, worksheet_name:str = "Sheet1"):
|
1080
|
+
def create_google_spreadsheet(client: gspread.Client, df: pd.DataFrame, filename: str, worksheet_name: str = "Sheet1") -> gspread.Spreadsheet:
|
787
1081
|
"""
|
788
|
-
|
1082
|
+
Create a new Google spreadsheet and load a DataFrame into it.
|
1083
|
+
|
1084
|
+
Parameters:
|
1085
|
+
- client (gspread.Client): A Gspread client object authenticated with Google Sheets API.
|
1086
|
+
- df (pd.DataFrame): The DataFrame to be loaded into the spreadsheet.
|
1087
|
+
- filename (str): The desired filename for the new spreadsheet.
|
1088
|
+
- worksheet_name (str, optional): The name of the worksheet within the spreadsheet. Defaults to "Sheet1".
|
1089
|
+
|
1090
|
+
Returns:
|
1091
|
+
gspread.Spreadsheet: The created spreadsheet object.
|
789
1092
|
"""
|
790
1093
|
spreadsheet = client.create(filename)
|
791
1094
|
worksheet = spreadsheet.sheet1
|
@@ -795,17 +1098,34 @@ def create_google_spreadsheet(client, df, filename:str, worksheet_name:str = "Sh
|
|
795
1098
|
|
796
1099
|
return spreadsheet
|
797
1100
|
|
798
|
-
def share_google_spreadsheet(spreadsheet: gspread.Spreadsheet, email: str, user_type: str = "user", user_role: str = "writer", notify: bool = False, email_message: str = None, with_link: bool = False) -> gspread.Spreadsheet:
    """
    Grant a user access to a Google spreadsheet.

    Parameters:
        spreadsheet (gspread.Spreadsheet): The spreadsheet object to share.
        email (str): Email address of the user to share with.
        user_type (str, optional): Permission type for the user. Defaults to "user".
        user_role (str, optional): Role granted to the user. Defaults to "writer".
        notify (bool, optional): Whether to send a notification email. Defaults to False.
        email_message (str, optional): Custom text for the notification email.
        with_link (bool, optional): Whether the notification includes a link
            to the document. Defaults to False.

    Returns:
        gspread.Spreadsheet: The spreadsheet object that was shared.
    """
    spreadsheet.share(
        email,
        perm_type=user_type,
        role=user_role,
        notify=notify,
        email_message=email_message,
        with_link=with_link,
    )
    return spreadsheet
|
804
1119
|
|
805
|
-
|
806
|
-
def generate_short_id(variables : dict):
|
1120
|
+
def generate_short_id(variables: dict) -> tuple[str, str]:
|
807
1121
|
"""
|
808
|
-
Generate
|
1122
|
+
Generate an 8-character ID using a dictionary as input.
|
1123
|
+
|
1124
|
+
Parameters:
|
1125
|
+
- variables (dict): A dictionary containing the variables to be serialized.
|
1126
|
+
|
1127
|
+
Returns:
|
1128
|
+
tuple: A tuple containing the generated short ID and the serialized variables.
|
809
1129
|
"""
|
810
1130
|
# Serialize variables into JSON string
|
811
1131
|
serialized_variables = json.dumps(variables, sort_keys=True)
|
@@ -815,7 +1135,7 @@ def generate_short_id(variables : dict):
|
|
815
1135
|
short_id = hash_value[:8]
|
816
1136
|
return short_id, serialized_variables
|
817
1137
|
|
818
|
-
def df_transform_column_as_list(column):
|
1138
|
+
def df_transform_column_as_list(column: pd.Series) -> pd.Series:
|
819
1139
|
def transform(cell):
|
820
1140
|
if isinstance(cell, str):
|
821
1141
|
# Check if it's a list formatted as string, and convert to list
|
@@ -824,9 +1144,7 @@ def df_transform_column_as_list(column):
|
|
824
1144
|
else:
|
825
1145
|
try:
|
826
1146
|
values = ast.literal_eval(cell)
|
827
|
-
|
828
1147
|
except Exception as e:
|
829
|
-
pass
|
830
1148
|
# If it's a single URL as string, make it a list
|
831
1149
|
values = [cell]
|
832
1150
|
elif isinstance(cell, (int, float, bool)):
|
@@ -844,7 +1162,11 @@ def df_transform_column_as_list(column):
|
|
844
1162
|
return column.apply(transform)
|
845
1163
|
|
846
1164
|
|
847
|
-
def top_rows_per_category(df
|
1165
|
+
def top_rows_per_category(df: pd.DataFrame,
|
1166
|
+
col_to_sort: str,
|
1167
|
+
col_to_gb: str,
|
1168
|
+
cols_to_keep: list[str],
|
1169
|
+
top_rows: int) -> pd.DataFrame:
|
848
1170
|
"""
|
849
1171
|
Select top rows for each category in a dataframe
|
850
1172
|
"""
|
@@ -855,7 +1177,7 @@ def top_rows_per_category(df, col_to_sort, col_to_gb, cols_to_keep, top_rows) :
|
|
855
1177
|
)[cols_to_keep]
|
856
1178
|
return df_gb
|
857
1179
|
|
858
|
-
def format_number(number):
|
1180
|
+
def format_number(number: int) -> str:
|
859
1181
|
"""
|
860
1182
|
Function to format a number in K, M or B
|
861
1183
|
"""
|
@@ -866,4 +1188,32 @@ def format_number(number):
|
|
866
1188
|
elif number < 1000000000:
|
867
1189
|
return f"{number / 1000000:.1f}M"
|
868
1190
|
else:
|
869
|
-
return f"{number / 1000000000:.1f}B"
|
1191
|
+
return f"{number / 1000000000:.1f}B"
|
1192
|
+
|
1193
|
+
|
1194
|
+
|
1195
|
+
def unrar_file(rar_file_path : str, output_dir : str) -> None:
    """
    Extracts a .rar file to the specified output directory using the unrar command.

    Parameters:
        rar_file_path (str): The path to the .rar file.
        output_dir (str): The directory where the contents should be extracted
            (created if it does not already exist).

    Returns:
        None

    Notes:
        Failures (e.g. the `unrar` binary being absent) are reported on
        stdout rather than raised, keeping the helper best-effort.
    """
    try:
        # Portable directory creation instead of shelling out to `mkdir -p`.
        os.makedirs(output_dir, exist_ok=True)

        # `unrar x` extracts with full paths; `-y` answers yes to all prompts.
        result = subprocess.run(['unrar', 'x', '-y', rar_file_path, output_dir],
                                stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        # Check if the extraction was successful
        if result.returncode != 0:
            print(f"Extraction failed. Error: {result.stderr}")

    except Exception as e:
        print(f"An error occurred: {e}")
|