TSVZ 3.29__py3-none-any.whl → 3.35__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TSVZ.py +2125 -1393
- {tsvz-3.29.dist-info → tsvz-3.35.dist-info}/METADATA +1 -1
- tsvz-3.35.dist-info/RECORD +6 -0
- tsvz-3.29.dist-info/RECORD +0 -6
- {tsvz-3.29.dist-info → tsvz-3.35.dist-info}/WHEEL +0 -0
- {tsvz-3.29.dist-info → tsvz-3.35.dist-info}/entry_points.txt +0 -0
- {tsvz-3.29.dist-info → tsvz-3.35.dist-info}/top_level.txt +0 -0
TSVZ.py
CHANGED
|
@@ -4,28 +4,32 @@
|
|
|
4
4
|
# dependencies = [
|
|
5
5
|
# ]
|
|
6
6
|
# ///
|
|
7
|
-
import os , sys
|
|
8
|
-
from collections import OrderedDict , deque
|
|
9
|
-
import time
|
|
10
7
|
import atexit
|
|
11
|
-
import
|
|
8
|
+
import functools
|
|
9
|
+
import io
|
|
10
|
+
import os
|
|
12
11
|
import re
|
|
13
|
-
|
|
12
|
+
from tabnanny import verbose
|
|
13
|
+
import threading
|
|
14
|
+
import time
|
|
15
|
+
import sys
|
|
16
|
+
from collections import OrderedDict, deque
|
|
17
|
+
from collections.abc import MutableMapping
|
|
14
18
|
RESOURCE_LIB_AVAILABLE = True
|
|
15
19
|
try:
|
|
16
|
-
|
|
17
|
-
except:
|
|
18
|
-
|
|
20
|
+
import resource
|
|
21
|
+
except ImportError:
|
|
22
|
+
RESOURCE_LIB_AVAILABLE = False
|
|
19
23
|
|
|
20
24
|
if os.name == 'nt':
|
|
21
|
-
|
|
25
|
+
import msvcrt
|
|
22
26
|
elif os.name == 'posix':
|
|
23
|
-
|
|
27
|
+
import fcntl
|
|
24
28
|
|
|
25
|
-
version = '3.
|
|
29
|
+
version = '3.35'
|
|
26
30
|
__version__ = version
|
|
27
31
|
author = 'pan@zopyr.us'
|
|
28
|
-
COMMIT_DATE = '2025-
|
|
32
|
+
COMMIT_DATE = '2025-11-13'
|
|
29
33
|
|
|
30
34
|
DEFAULT_DELIMITER = '\t'
|
|
31
35
|
DEFAULTS_INDICATOR_KEY = '#_defaults_#'
|
|
@@ -33,137 +37,216 @@ DEFAULTS_INDICATOR_KEY = '#_defaults_#'
|
|
|
33
37
|
COMPRESSED_FILE_EXTENSIONS = ['gz','gzip','bz2','bzip2','xz','lzma']
|
|
34
38
|
|
|
35
39
|
def get_delimiter(delimiter,file_name = ''):
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
40
|
+
global DEFAULT_DELIMITER
|
|
41
|
+
if not delimiter:
|
|
42
|
+
return DEFAULT_DELIMITER
|
|
43
|
+
elif delimiter == ...:
|
|
44
|
+
if not file_name:
|
|
45
|
+
rtn = '\t'
|
|
46
|
+
elif file_name.endswith('.csv'):
|
|
47
|
+
rtn = ','
|
|
48
|
+
elif file_name.endswith('.nsv'):
|
|
49
|
+
rtn = '\0'
|
|
50
|
+
elif file_name.endswith('.psv'):
|
|
51
|
+
rtn = '|'
|
|
52
|
+
else:
|
|
53
|
+
rtn = '\t'
|
|
54
|
+
elif delimiter == 'comma':
|
|
55
|
+
rtn = ','
|
|
56
|
+
elif delimiter == 'tab':
|
|
57
|
+
rtn = '\t'
|
|
58
|
+
elif delimiter == 'pipe':
|
|
59
|
+
rtn = '|'
|
|
60
|
+
elif delimiter == 'null':
|
|
61
|
+
rtn = '\0'
|
|
62
|
+
else:
|
|
63
|
+
rtn = delimiter.encode().decode('unicode_escape')
|
|
64
|
+
DEFAULT_DELIMITER = rtn
|
|
65
|
+
return rtn
|
|
66
|
+
|
|
67
|
+
def eprint(*args, **kwargs):
|
|
68
|
+
try:
|
|
69
|
+
if 'file' in kwargs:
|
|
70
|
+
print(*args, **kwargs)
|
|
71
|
+
else:
|
|
72
|
+
print(*args, file=sys.stderr, **kwargs)
|
|
73
|
+
except Exception as e:
|
|
74
|
+
print(f"Error: Cannot print to stderr: {e}")
|
|
75
|
+
print(*args, **kwargs)
|
|
62
76
|
|
|
63
77
|
def openFileAsCompressed(fileName,mode = 'rb',encoding = 'utf8',teeLogger = None,compressLevel = 1):
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
78
|
+
if 'b' not in mode:
|
|
79
|
+
mode += 't'
|
|
80
|
+
kwargs = {}
|
|
81
|
+
if 'r' not in mode:
|
|
82
|
+
if fileName.endswith('.xz'):
|
|
83
|
+
kwargs['preset'] = compressLevel
|
|
84
|
+
else:
|
|
85
|
+
kwargs['compresslevel'] = compressLevel
|
|
86
|
+
if 'b' not in mode:
|
|
87
|
+
kwargs['encoding'] = encoding
|
|
88
|
+
if fileName.endswith('.xz') or fileName.endswith('.lzma'):
|
|
89
|
+
try:
|
|
90
|
+
import lzma
|
|
91
|
+
return lzma.open(fileName, mode, **kwargs)
|
|
92
|
+
except Exception:
|
|
93
|
+
__teePrintOrNot(f"Failed to open {fileName} with lzma, trying bin",teeLogger=teeLogger)
|
|
94
|
+
elif fileName.endswith('.gz') or fileName.endswith('.gzip'):
|
|
95
|
+
try:
|
|
96
|
+
import gzip
|
|
97
|
+
return gzip.open(fileName, mode, **kwargs)
|
|
98
|
+
except Exception:
|
|
99
|
+
__teePrintOrNot(f"Failed to open {fileName} with gzip, trying bin",teeLogger=teeLogger)
|
|
100
|
+
elif fileName.endswith('.bz2') or fileName.endswith('.bzip2'):
|
|
101
|
+
try:
|
|
102
|
+
import bz2
|
|
103
|
+
return bz2.open(fileName, mode, **kwargs)
|
|
104
|
+
except Exception:
|
|
105
|
+
__teePrintOrNot(f"Failed to open {fileName} with bz2, trying bin",teeLogger=teeLogger)
|
|
106
|
+
if 't' in mode:
|
|
107
|
+
mode = mode.replace('t','')
|
|
108
|
+
return open(fileName, mode, encoding=encoding)
|
|
109
|
+
if 'b' not in mode:
|
|
110
|
+
mode += 'b'
|
|
111
|
+
return open(fileName, mode)
|
|
112
|
+
|
|
113
|
+
def get_terminal_size():
|
|
114
|
+
'''
|
|
115
|
+
Get the terminal size
|
|
116
|
+
|
|
117
|
+
@params:
|
|
118
|
+
None
|
|
119
|
+
|
|
120
|
+
@returns:
|
|
121
|
+
(int,int): the number of columns and rows of the terminal
|
|
122
|
+
'''
|
|
123
|
+
try:
|
|
124
|
+
import os
|
|
125
|
+
_tsize = os.get_terminal_size()
|
|
126
|
+
except Exception:
|
|
127
|
+
try:
|
|
128
|
+
import fcntl
|
|
129
|
+
import struct
|
|
130
|
+
import termios
|
|
131
|
+
packed = fcntl.ioctl(0, termios.TIOCGWINSZ, struct.pack('HHHH', 0, 0, 0, 0))
|
|
132
|
+
_tsize = struct.unpack('HHHH', packed)[:2]
|
|
133
|
+
except Exception:
|
|
134
|
+
import shutil
|
|
135
|
+
_tsize = shutil.get_terminal_size(fallback=(240, 50))
|
|
136
|
+
return _tsize
|
|
137
|
+
|
|
138
|
+
def pretty_format_table(data, delimiter="\t", header=None, full=False):
|
|
139
|
+
version = 1.12
|
|
140
|
+
_ = version
|
|
141
|
+
def visible_len(s):
|
|
142
|
+
return len(re.sub(r"\x1b\[[0-?]*[ -/]*[@-~]", "", s))
|
|
143
|
+
def table_width(col_widths, sep_len):
|
|
144
|
+
# total width = sum of column widths + separators between columns
|
|
145
|
+
return sum(col_widths) + sep_len * (len(col_widths) - 1)
|
|
146
|
+
def truncate_to_width(s, width):
|
|
147
|
+
# If fits, leave as is. If too long and width >= 1, keep width-1 chars + "."
|
|
148
|
+
# If width == 0, nothing fits; return empty string.
|
|
149
|
+
if visible_len(s) <= width:
|
|
150
|
+
return s
|
|
151
|
+
if width <= 0:
|
|
152
|
+
return ""
|
|
153
|
+
# Build a truncated plain string based on visible chars (no ANSI awareness for slicing)
|
|
154
|
+
# For simplicity, slice the raw string. This may cut ANSI; best to avoid ANSI in data if truncation occurs.
|
|
155
|
+
return s[: max(width - 2, 0)] + ".."
|
|
156
|
+
if not data:
|
|
157
|
+
return ""
|
|
158
|
+
# Normalize input data structure
|
|
159
|
+
if isinstance(data, str):
|
|
160
|
+
data = data.strip("\n").split("\n")
|
|
161
|
+
data = [line.split(delimiter) for line in data]
|
|
162
|
+
elif isinstance(data, dict):
|
|
163
|
+
if isinstance(next(iter(data.values())), dict):
|
|
164
|
+
tempData = [["key"] + list(next(iter(data.values())).keys())]
|
|
165
|
+
tempData.extend([[key] + list(value.values()) for key, value in data.items()])
|
|
166
|
+
data = tempData
|
|
167
|
+
else:
|
|
168
|
+
data = [[key] + list(value) for key, value in data.items()]
|
|
169
|
+
elif not isinstance(data, list):
|
|
170
|
+
data = list(data)
|
|
171
|
+
if isinstance(data[0], dict):
|
|
172
|
+
tempData = [list(data[0].keys())]
|
|
173
|
+
tempData.extend([list(item.values()) for item in data])
|
|
174
|
+
data = tempData
|
|
175
|
+
data = [[str(item) for item in row] for row in data]
|
|
176
|
+
num_cols = len(data[0])
|
|
177
|
+
# Resolve header and rows
|
|
178
|
+
using_provided_header = header is not None
|
|
179
|
+
if not using_provided_header:
|
|
180
|
+
header = data[0]
|
|
181
|
+
rows = data[1:]
|
|
182
|
+
else:
|
|
183
|
+
if isinstance(header, str):
|
|
184
|
+
header = header.split(delimiter)
|
|
185
|
+
# Pad/trim header to match num_cols
|
|
186
|
+
if len(header) < num_cols:
|
|
187
|
+
header = header + [""] * (num_cols - len(header))
|
|
188
|
+
elif len(header) > num_cols:
|
|
189
|
+
header = header[:num_cols]
|
|
190
|
+
rows = data
|
|
191
|
+
# Compute initial column widths based on data and header
|
|
192
|
+
def compute_col_widths(hdr, rows_):
|
|
193
|
+
col_w = [0] * len(hdr)
|
|
194
|
+
for i in range(len(hdr)):
|
|
195
|
+
col_w[i] = max(0, visible_len(hdr[i]), *(visible_len(r[i]) for r in rows_ if i < len(r)))
|
|
196
|
+
return col_w
|
|
197
|
+
# Ensure all rows have the same number of columns
|
|
198
|
+
normalized_rows = []
|
|
199
|
+
for r in rows:
|
|
200
|
+
if len(r) < num_cols:
|
|
201
|
+
r = r + [""] * (num_cols - len(r))
|
|
202
|
+
elif len(r) > num_cols:
|
|
203
|
+
r = r[:num_cols]
|
|
204
|
+
normalized_rows.append(r)
|
|
205
|
+
rows = normalized_rows
|
|
206
|
+
col_widths = compute_col_widths(header, rows)
|
|
207
|
+
# If full=True, keep existing formatting
|
|
208
|
+
# Else try to fit within the terminal width by:
|
|
209
|
+
# 1) Switching to compressed separators if needed
|
|
210
|
+
# 2) Recursively compressing columns (truncating with ".")
|
|
211
|
+
sep = " | "
|
|
212
|
+
hsep = "-+-"
|
|
213
|
+
cols = get_terminal_size()[0]
|
|
214
|
+
def render(hdr, rows, col_w, sep_str, hsep_str):
|
|
215
|
+
row_fmt = sep_str.join("{{:<{}}}".format(w) for w in col_w)
|
|
216
|
+
out = []
|
|
217
|
+
out.append(row_fmt.format(*hdr))
|
|
218
|
+
out.append(hsep_str.join("-" * w for w in col_w))
|
|
219
|
+
for row in rows:
|
|
220
|
+
if not any(row):
|
|
221
|
+
out.append(hsep_str.join("-" * w for w in col_w))
|
|
222
|
+
else:
|
|
223
|
+
row = [truncate_to_width(row[i], col_w[i]) for i in range(len(row))]
|
|
224
|
+
out.append(row_fmt.format(*row))
|
|
225
|
+
return "\n".join(out) + "\n"
|
|
226
|
+
if full:
|
|
227
|
+
return render(header, rows, col_widths, sep, hsep)
|
|
228
|
+
# Try default separators first
|
|
229
|
+
if table_width(col_widths, len(sep)) <= cols:
|
|
230
|
+
return render(header, rows, col_widths, sep, hsep)
|
|
231
|
+
# Use compressed separators (no spaces)
|
|
232
|
+
sep = "|"
|
|
233
|
+
hsep = "+"
|
|
234
|
+
if table_width(col_widths, len(sep)) <= cols:
|
|
235
|
+
return render(header, rows, col_widths, sep, hsep)
|
|
236
|
+
# Begin column compression
|
|
237
|
+
# Track which columns have been compressed already to header width
|
|
238
|
+
header_widths = [visible_len(h) for h in header]
|
|
239
|
+
width_diff = [max(col_widths[i] - header_widths[i],0) for i in range(num_cols)]
|
|
240
|
+
total_overflow_width = table_width(col_widths, len(sep)) - cols
|
|
241
|
+
for i, diff in sorted(enumerate(width_diff), key=lambda x: -x[1]):
|
|
242
|
+
if total_overflow_width <= 0:
|
|
243
|
+
break
|
|
244
|
+
if diff <= 0:
|
|
245
|
+
continue
|
|
246
|
+
reduce_by = min(diff, total_overflow_width)
|
|
247
|
+
col_widths[i] -= reduce_by
|
|
248
|
+
total_overflow_width -= reduce_by
|
|
249
|
+
return render(header, rows, col_widths, sep, hsep)
|
|
167
250
|
|
|
168
251
|
def format_bytes(size, use_1024_bytes=None, to_int=False, to_str=False,str_format='.2f'):
|
|
169
252
|
"""
|
|
@@ -231,14 +314,14 @@ def format_bytes(size, use_1024_bytes=None, to_int=False, to_str=False,str_forma
|
|
|
231
314
|
else:
|
|
232
315
|
try:
|
|
233
316
|
return int(size)
|
|
234
|
-
except Exception
|
|
317
|
+
except Exception:
|
|
235
318
|
return 0
|
|
236
319
|
elif to_str or isinstance(size, int) or isinstance(size, float):
|
|
237
320
|
if isinstance(size, str):
|
|
238
321
|
try:
|
|
239
322
|
size = size.rstrip('B').rstrip('b')
|
|
240
323
|
size = float(size.lower().strip())
|
|
241
|
-
except Exception
|
|
324
|
+
except Exception:
|
|
242
325
|
return size
|
|
243
326
|
# size is in bytes
|
|
244
327
|
if use_1024_bytes or use_1024_bytes is None:
|
|
@@ -268,918 +351,1079 @@ def format_bytes(size, use_1024_bytes=None, to_int=False, to_str=False,str_forma
|
|
|
268
351
|
return 0
|
|
269
352
|
|
|
270
353
|
def get_resource_usage(return_dict = False):
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
354
|
+
try:
|
|
355
|
+
if RESOURCE_LIB_AVAILABLE:
|
|
356
|
+
rawResource = resource.getrusage(resource.RUSAGE_SELF)
|
|
357
|
+
resourceDict = {}
|
|
358
|
+
resourceDict['user mode time'] = f'{rawResource.ru_utime} seconds'
|
|
359
|
+
resourceDict['system mode time'] = f'{rawResource.ru_stime} seconds'
|
|
360
|
+
resourceDict['max resident set size'] = f'{format_bytes(rawResource.ru_maxrss * 1024)}B'
|
|
361
|
+
resourceDict['shared memory size'] = f'{format_bytes(rawResource.ru_ixrss * 1024)}B'
|
|
362
|
+
resourceDict['unshared memory size'] = f'{format_bytes(rawResource.ru_idrss * 1024)}B'
|
|
363
|
+
resourceDict['unshared stack size'] = f'{format_bytes(rawResource.ru_isrss * 1024)}B'
|
|
364
|
+
resourceDict['cached page hits'] = f'{rawResource.ru_minflt}'
|
|
365
|
+
resourceDict['missed page hits'] = f'{rawResource.ru_majflt}'
|
|
366
|
+
resourceDict['swapped out page count'] = f'{rawResource.ru_nswap}'
|
|
367
|
+
resourceDict['block input operations'] = f'{rawResource.ru_inblock}'
|
|
368
|
+
resourceDict['block output operations'] = f'{rawResource.ru_oublock}'
|
|
369
|
+
resourceDict['IPC messages sent'] = f'{rawResource.ru_msgsnd}'
|
|
370
|
+
resourceDict['IPC messages received'] = f'{rawResource.ru_msgrcv}'
|
|
371
|
+
resourceDict['signals received'] = f'{rawResource.ru_nsignals}'
|
|
372
|
+
resourceDict['voluntary context sw'] = f'{rawResource.ru_nvcsw}'
|
|
373
|
+
resourceDict['involuntary context sw'] = f'{rawResource.ru_nivcsw}'
|
|
374
|
+
if return_dict:
|
|
375
|
+
return resourceDict
|
|
376
|
+
return '\n'.join(['\t'.join(line) for line in resourceDict.items()])
|
|
377
|
+
except Exception as e:
|
|
378
|
+
print(f"Error: {e}")
|
|
379
|
+
if return_dict:
|
|
380
|
+
return {}
|
|
381
|
+
return ''
|
|
299
382
|
|
|
300
383
|
def __teePrintOrNot(message,level = 'info',teeLogger = None):
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
def _processLine(line,taskDic,correctColumnNum,
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
384
|
+
"""
|
|
385
|
+
Prints the given message or logs it using the provided teeLogger.
|
|
386
|
+
|
|
387
|
+
Parameters:
|
|
388
|
+
message (str): The message to be printed or logged.
|
|
389
|
+
level (str, optional): The log level. Defaults to 'info'.
|
|
390
|
+
teeLogger (object, optional): The logger object used for logging. Defaults to None.
|
|
391
|
+
|
|
392
|
+
Returns:
|
|
393
|
+
None
|
|
394
|
+
"""
|
|
395
|
+
try:
|
|
396
|
+
if teeLogger:
|
|
397
|
+
try:
|
|
398
|
+
teeLogger.teelog(message,level,callerStackDepth=3)
|
|
399
|
+
except Exception:
|
|
400
|
+
teeLogger.teelog(message,level)
|
|
401
|
+
else:
|
|
402
|
+
print(message,flush=True)
|
|
403
|
+
except Exception:
|
|
404
|
+
print(message,flush=True)
|
|
405
|
+
|
|
406
|
+
def _processLine(line,taskDic,correctColumnNum,strict = True,delimiter = DEFAULT_DELIMITER,defaults = ...,
|
|
407
|
+
storeOffset = False, offset = -1):
|
|
408
|
+
"""
|
|
409
|
+
Process a line of text and update the task dictionary.
|
|
410
|
+
|
|
411
|
+
Parameters:
|
|
412
|
+
line (str): The line of text to process.
|
|
413
|
+
taskDic (dict): The dictionary to update with the processed line.
|
|
414
|
+
correctColumnNum (int): The expected number of columns in the line.
|
|
415
|
+
strict (bool, optional): Whether to strictly enforce the correct number of columns. Defaults to True.
|
|
416
|
+
defaults (list, optional): The default values to use for missing columns. Defaults to [].
|
|
417
|
+
storeOffset (bool, optional): Whether to store the offset of the line in the taskDic. Defaults to False.
|
|
418
|
+
offset (int, optional): The offset of the line in the file. Defaults to -1.
|
|
419
|
+
|
|
420
|
+
Returns:
|
|
421
|
+
tuple: A tuple containing the updated correctColumnNum and the processed lineCache or offset.
|
|
422
|
+
|
|
423
|
+
"""
|
|
424
|
+
if defaults is ...:
|
|
425
|
+
defaults = []
|
|
426
|
+
line = line.strip('\x00').rstrip('\r\n')
|
|
427
|
+
if not line or (line.startswith('#') and not line.startswith(DEFAULTS_INDICATOR_KEY)):
|
|
428
|
+
# if verbose:
|
|
429
|
+
# __teePrintOrNot(f"Ignoring comment line: {line}",teeLogger=teeLogger)
|
|
430
|
+
return correctColumnNum , []
|
|
431
|
+
# we only interested in the lines that have the correct number of columns
|
|
432
|
+
lineCache = _unsanitize(line.split(delimiter),delimiter)
|
|
433
|
+
if not lineCache or not lineCache[0]:
|
|
434
|
+
return correctColumnNum , []
|
|
435
|
+
if correctColumnNum == -1:
|
|
436
|
+
if defaults and len(defaults) > 1:
|
|
437
|
+
correctColumnNum = len(defaults)
|
|
438
|
+
else:
|
|
439
|
+
correctColumnNum = len(lineCache)
|
|
440
|
+
# if verbose:
|
|
441
|
+
# __teePrintOrNot(f"detected correctColumnNum: {len(lineCache)}",teeLogger=teeLogger)
|
|
442
|
+
if len(lineCache) == 1 or not any(lineCache[1:]):
|
|
443
|
+
if correctColumnNum == 1:
|
|
444
|
+
taskDic[lineCache[0]] = lineCache if not storeOffset else offset
|
|
445
|
+
elif lineCache[0] == DEFAULTS_INDICATOR_KEY:
|
|
446
|
+
# if verbose:
|
|
447
|
+
# __teePrintOrNot(f"Empty defaults line found: {line}",teeLogger=teeLogger)
|
|
448
|
+
defaults.clear()
|
|
449
|
+
defaults[0] = DEFAULTS_INDICATOR_KEY
|
|
450
|
+
else:
|
|
451
|
+
# if verbose:
|
|
452
|
+
# __teePrintOrNot(f"Key {lineCache[0]} found with empty value, deleting such key's representaion",teeLogger=teeLogger)
|
|
453
|
+
if lineCache[0] in taskDic:
|
|
454
|
+
del taskDic[lineCache[0]]
|
|
455
|
+
return correctColumnNum , []
|
|
456
|
+
elif len(lineCache) != correctColumnNum:
|
|
457
|
+
if strict and not any(defaults[1:]):
|
|
458
|
+
# if verbose:
|
|
459
|
+
# __teePrintOrNot(f"Ignoring line with {len(lineCache)} columns: {line}",teeLogger=teeLogger)
|
|
460
|
+
return correctColumnNum , []
|
|
461
|
+
else:
|
|
462
|
+
# fill / cut the line with empty entries til the correct number of columns
|
|
463
|
+
if len(lineCache) < correctColumnNum:
|
|
464
|
+
lineCache += ['']*(correctColumnNum-len(lineCache))
|
|
465
|
+
elif len(lineCache) > correctColumnNum:
|
|
466
|
+
lineCache = lineCache[:correctColumnNum]
|
|
467
|
+
# if verbose:
|
|
468
|
+
# __teePrintOrNot(f"Correcting {lineCache[0]}",teeLogger=teeLogger)
|
|
469
|
+
# now replace empty values with defaults
|
|
470
|
+
if defaults and len(defaults) > 1:
|
|
471
|
+
for i in range(1,len(lineCache)):
|
|
472
|
+
if not lineCache[i] and i < len(defaults) and defaults[i]:
|
|
473
|
+
lineCache[i] = defaults[i]
|
|
474
|
+
if lineCache[0] == DEFAULTS_INDICATOR_KEY:
|
|
475
|
+
# if verbose:
|
|
476
|
+
# __teePrintOrNot(f"Defaults line found: {line}",teeLogger=teeLogger)
|
|
477
|
+
defaults[:] = lineCache
|
|
478
|
+
return correctColumnNum , []
|
|
479
|
+
taskDic[lineCache[0]] = lineCache if not storeOffset else offset
|
|
480
|
+
# if verbose:
|
|
481
|
+
# __teePrintOrNot(f"Key {lineCache[0]} added",teeLogger=teeLogger)
|
|
482
|
+
return correctColumnNum, lineCache
|
|
483
|
+
|
|
484
|
+
def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,
|
|
485
|
+
encoding = 'utf8',delimiter = ...,defaults = ...,storeOffset = False ):
|
|
486
|
+
"""
|
|
487
|
+
Reads the last valid line from a file.
|
|
488
|
+
|
|
489
|
+
Args:
|
|
490
|
+
fileName (str): The name of the file to read.
|
|
491
|
+
taskDic (dict): A dictionary to pass to processLine function.
|
|
492
|
+
correctColumnNum (int): A column number to pass to processLine function.
|
|
493
|
+
verbose (bool, optional): Whether to print verbose output. Defaults to False.
|
|
494
|
+
teeLogger (optional): Logger to use for tee print. Defaults to None.
|
|
495
|
+
encoding (str, optional): The encoding of the file. Defaults to None.
|
|
496
|
+
strict (bool, optional): Whether to enforce strict processing. Defaults to False.
|
|
497
|
+
delimiter (str, optional): The delimiter used in the file. Defaults to None.
|
|
498
|
+
defaults (list, optional): The default values to use for missing columns. Defaults to [].
|
|
499
|
+
storeOffset (bool, optional): Instead of storing the data in taskDic, store the offset of each line. Defaults to False.
|
|
500
|
+
|
|
501
|
+
Returns:
|
|
502
|
+
list: The last valid line as a list of strings, or an empty list if no valid line is found.
|
|
503
|
+
"""
|
|
504
|
+
chunk_size = 1024 # Read in chunks of 1024 bytes
|
|
505
|
+
last_valid_line = []
|
|
506
|
+
if defaults is ...:
|
|
507
|
+
defaults = []
|
|
508
|
+
delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
509
|
+
if verbose:
|
|
510
|
+
__teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
|
|
511
|
+
with openFileAsCompressed(fileName, 'rb',encoding=encoding, teeLogger=teeLogger) as file:
|
|
512
|
+
file.seek(0, os.SEEK_END)
|
|
513
|
+
file_size = file.tell()
|
|
514
|
+
buffer = b''
|
|
515
|
+
position = file_size
|
|
516
|
+
processedSize = 0
|
|
517
|
+
|
|
518
|
+
while position > 0:
|
|
519
|
+
# Read chunks from the end of the file
|
|
520
|
+
read_size = min(chunk_size, position)
|
|
521
|
+
position -= read_size
|
|
522
|
+
file.seek(position)
|
|
523
|
+
chunk = file.read(read_size)
|
|
524
|
+
|
|
525
|
+
# Prepend new chunk to buffer
|
|
526
|
+
buffer = chunk + buffer
|
|
527
|
+
|
|
528
|
+
# Split the buffer into lines
|
|
529
|
+
lines = buffer.split(b'\n')
|
|
530
|
+
|
|
531
|
+
# Process lines from the last to the first
|
|
532
|
+
for i in range(len(lines) - 1, -1, -1):
|
|
533
|
+
processedSize += len(lines[i]) + 1 # +1 for the newline character
|
|
534
|
+
if lines[i].strip(): # Skip empty lines
|
|
535
|
+
# Process the line
|
|
536
|
+
correctColumnNum, lineCache = _processLine(
|
|
537
|
+
line=lines[i].decode(encoding=encoding,errors='replace'),
|
|
538
|
+
taskDic=taskDic,
|
|
539
|
+
correctColumnNum=correctColumnNum,
|
|
540
|
+
strict=strict,
|
|
541
|
+
delimiter=delimiter,
|
|
542
|
+
defaults=defaults,
|
|
543
|
+
storeOffset=storeOffset,
|
|
544
|
+
offset=file_size - processedSize + 1
|
|
545
|
+
)
|
|
546
|
+
# If the line is valid, return it
|
|
547
|
+
if lineCache:
|
|
548
|
+
if storeOffset and any(lineCache):
|
|
549
|
+
return lineCache
|
|
550
|
+
|
|
551
|
+
# Keep the last (possibly incomplete) line in buffer for the next read
|
|
552
|
+
buffer = lines[0]
|
|
553
|
+
|
|
554
|
+
# Return empty list if no valid line found
|
|
555
|
+
if storeOffset:
|
|
556
|
+
return -1
|
|
557
|
+
return last_valid_line
|
|
558
|
+
|
|
559
|
+
@functools.lru_cache(maxsize=None)
|
|
560
|
+
def _get_sanitization_re(delimiter = DEFAULT_DELIMITER):
|
|
561
|
+
return re.compile(r"(</sep/>|</LF/>|<sep>|<LF>|\n|" + re.escape(delimiter) + r")")
|
|
562
|
+
|
|
563
|
+
_sanitize_replacements = {
|
|
564
|
+
"<sep>":"</sep/>",
|
|
565
|
+
"<LF>":"</LF/>",
|
|
566
|
+
"\n":"<LF>",
|
|
567
|
+
}
|
|
568
|
+
_inverse_sanitize_replacements = {v: k for k, v in _sanitize_replacements.items()}
|
|
569
|
+
|
|
570
|
+
def _sanitize(data,delimiter = DEFAULT_DELIMITER):
|
|
571
|
+
if not data:
|
|
572
|
+
return data
|
|
573
|
+
def repl(m):
|
|
574
|
+
tok = m.group(0)
|
|
575
|
+
if tok == delimiter:
|
|
576
|
+
return "<sep>"
|
|
577
|
+
if tok in ("</sep/>", "</LF/>"):
|
|
578
|
+
eprint(f"Warning: Found illegal token '{tok}' during sanitization. It will be replaced.")
|
|
579
|
+
return _sanitize_replacements.get(tok, tok)
|
|
580
|
+
pattern = _get_sanitization_re(delimiter)
|
|
581
|
+
if isinstance(data,str):
|
|
582
|
+
return pattern.sub(repl, data)
|
|
583
|
+
else:
|
|
584
|
+
return [pattern.sub(repl,str(segment)) if segment else '' for segment in data]
|
|
585
|
+
|
|
586
|
+
def _unsanitize(data,delimiter = DEFAULT_DELIMITER):
|
|
587
|
+
if not data:
|
|
588
|
+
return data
|
|
589
|
+
def repl(m):
|
|
590
|
+
tok = m.group(0)
|
|
591
|
+
if tok == "<sep>":
|
|
592
|
+
return delimiter
|
|
593
|
+
return _inverse_sanitize_replacements.get(tok, tok)
|
|
594
|
+
pattern = _get_sanitization_re(delimiter)
|
|
595
|
+
if isinstance(data,str):
|
|
596
|
+
return pattern.sub(repl, data.rstrip())
|
|
597
|
+
else:
|
|
598
|
+
return [pattern.sub(repl,str(segment).rstrip()) if segment else '' for segment in data]
|
|
477
599
|
|
|
478
600
|
def _formatHeader(header,verbose = False,teeLogger = None,delimiter = DEFAULT_DELIMITER):
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
# header += '\n'
|
|
501
|
-
# else:
|
|
502
|
-
# header = ''
|
|
503
|
-
return header
|
|
601
|
+
"""
|
|
602
|
+
Format the header string.
|
|
603
|
+
|
|
604
|
+
Parameters:
|
|
605
|
+
- header (str or list): The header string or list to format.
|
|
606
|
+
- verbose (bool, optional): Whether to print verbose output. Defaults to False.
|
|
607
|
+
- teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
|
|
608
|
+
|
|
609
|
+
Returns:
|
|
610
|
+
list: The formatted header list of string.
|
|
611
|
+
"""
|
|
612
|
+
if isinstance(header,str):
|
|
613
|
+
header = header.split(delimiter)
|
|
614
|
+
else:
|
|
615
|
+
try:
|
|
616
|
+
header = [str(s) for s in header]
|
|
617
|
+
except Exception:
|
|
618
|
+
if verbose:
|
|
619
|
+
__teePrintOrNot('Invalid header, setting header to empty.','error',teeLogger=teeLogger)
|
|
620
|
+
header = []
|
|
621
|
+
return [s.rstrip() for s in header]
|
|
504
622
|
|
|
505
623
|
def _lineContainHeader(header,line,verbose = False,teeLogger = None,strict = False,delimiter = DEFAULT_DELIMITER):
|
|
506
|
-
|
|
507
|
-
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
511
|
-
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
624
|
+
"""
|
|
625
|
+
Verify if a line contains the header.
|
|
626
|
+
|
|
627
|
+
Parameters:
|
|
628
|
+
- header (str): The header string to verify.
|
|
629
|
+
- line (str): The line to verify against the header.
|
|
630
|
+
- verbose (bool, optional): Whether to print verbose output. Defaults to False.
|
|
631
|
+
- teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
|
|
632
|
+
- strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
|
|
633
|
+
|
|
634
|
+
Returns:
|
|
635
|
+
bool: True if the header matches the line, False otherwise.
|
|
636
|
+
"""
|
|
637
|
+
line = _formatHeader(line,verbose=verbose,teeLogger=teeLogger,delimiter=delimiter)
|
|
638
|
+
if verbose:
|
|
639
|
+
__teePrintOrNot(f"Header: \n{header}",teeLogger=teeLogger)
|
|
640
|
+
__teePrintOrNot(f"First line: \n{line}",teeLogger=teeLogger)
|
|
641
|
+
if len(header) != len(line) or any([header[i] not in line[i] for i in range(len(header))]):
|
|
642
|
+
__teePrintOrNot(f"Header mismatch: \n{line} \n!= \n{header}",teeLogger=teeLogger)
|
|
643
|
+
if strict:
|
|
644
|
+
raise ValueError("Data format error! Header mismatch")
|
|
645
|
+
return False
|
|
646
|
+
return True
|
|
647
|
+
|
|
648
|
+
def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,header = [],encoding = 'utf8',strict = True,delimiter = DEFAULT_DELIMITER):
|
|
649
|
+
"""
|
|
650
|
+
Verify the existence of the tabular file.
|
|
651
|
+
|
|
652
|
+
Parameters:
|
|
653
|
+
- fileName (str): The path of the tabular file.
|
|
654
|
+
- createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to True.
|
|
655
|
+
- teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
|
|
656
|
+
- header (list, optional): The header line to verify against. Defaults to [].
|
|
657
|
+
- encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
|
|
658
|
+
- strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
|
|
659
|
+
|
|
660
|
+
Returns:
|
|
661
|
+
bool: True if the file exists, False otherwise.
|
|
662
|
+
"""
|
|
663
|
+
remainingFileName, _ ,extenstionName = fileName.rpartition('.')
|
|
664
|
+
if extenstionName in COMPRESSED_FILE_EXTENSIONS:
|
|
665
|
+
remainingFileName, _ ,extenstionName = remainingFileName.rpartition('.')
|
|
666
|
+
if delimiter and delimiter == '\t' and not extenstionName == 'tsv':
|
|
667
|
+
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
|
|
668
|
+
elif delimiter and delimiter == ',' and not extenstionName == 'csv':
|
|
669
|
+
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
|
|
670
|
+
elif delimiter and delimiter == '\0' and not extenstionName == 'nsv':
|
|
671
|
+
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
|
|
672
|
+
elif delimiter and delimiter == '|' and not extenstionName == 'psv':
|
|
673
|
+
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
|
|
674
|
+
if not os.path.isfile(fileName):
|
|
675
|
+
if createIfNotExist:
|
|
676
|
+
try:
|
|
677
|
+
with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
678
|
+
header = delimiter.join(_sanitize(_formatHeader(header,
|
|
679
|
+
verbose=verbose,
|
|
680
|
+
teeLogger=teeLogger,
|
|
681
|
+
delimiter=delimiter,
|
|
682
|
+
),delimiter=delimiter))
|
|
683
|
+
file.write(header.encode(encoding=encoding,errors='replace')+b'\n')
|
|
684
|
+
__teePrintOrNot('Created '+fileName,teeLogger=teeLogger)
|
|
685
|
+
return True
|
|
686
|
+
except Exception:
|
|
687
|
+
__teePrintOrNot('Failed to create '+fileName,'error',teeLogger=teeLogger)
|
|
688
|
+
if strict:
|
|
689
|
+
raise FileNotFoundError("Failed to create file")
|
|
690
|
+
return False
|
|
691
|
+
elif strict:
|
|
692
|
+
__teePrintOrNot('File not found','error',teeLogger=teeLogger)
|
|
693
|
+
raise FileNotFoundError("File not found")
|
|
694
|
+
else:
|
|
695
|
+
return False
|
|
696
|
+
return True
|
|
697
|
+
|
|
698
|
+
def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,
|
|
699
|
+
verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t',defaults = ...,
|
|
700
|
+
correctColumnNum = -1):
|
|
701
|
+
"""
|
|
702
|
+
Compatibility method, calls readTabularFile.
|
|
703
|
+
Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
|
|
704
|
+
|
|
705
|
+
Parameters:
|
|
706
|
+
- fileName (str): The path to the Tabular file.
|
|
707
|
+
- teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
|
|
708
|
+
- header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
|
|
709
|
+
- createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
|
|
710
|
+
- lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
|
|
711
|
+
- verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
|
|
712
|
+
- verbose (bool, optional): Whether to print verbose output. Defaults to False.
|
|
713
|
+
- taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
|
|
714
|
+
- encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
|
|
715
|
+
- strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
|
|
716
|
+
- delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t'.
|
|
717
|
+
- defaults (list, optional): The default values to use for missing columns. Defaults to [].
|
|
718
|
+
- correctColumnNum (int, optional): The expected number of columns in the file. If -1, it will be determined from the first valid line. Defaults to -1.
|
|
719
|
+
|
|
720
|
+
Returns:
|
|
721
|
+
- OrderedDict: The dictionary containing the data from the Tabular file.
|
|
722
|
+
|
|
723
|
+
Raises:
|
|
724
|
+
- Exception: If the file is not found or there is a data format error.
|
|
725
|
+
|
|
726
|
+
"""
|
|
727
|
+
return readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,
|
|
728
|
+
lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,
|
|
729
|
+
encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults,
|
|
730
|
+
correctColumnNum = correctColumnNum)
|
|
731
|
+
|
|
732
|
+
def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,
|
|
733
|
+
verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...,defaults = ...,
|
|
734
|
+
correctColumnNum = -1,storeOffset = False):
|
|
735
|
+
"""
|
|
736
|
+
Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
|
|
737
|
+
|
|
738
|
+
Parameters:
|
|
739
|
+
- fileName (str): The path to the Tabular file.
|
|
740
|
+
- teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
|
|
741
|
+
- header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
|
|
742
|
+
- createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
|
|
743
|
+
- lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
|
|
744
|
+
- verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
|
|
745
|
+
- verbose (bool, optional): Whether to print verbose output. Defaults to False.
|
|
746
|
+
- taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
|
|
747
|
+
- encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
|
|
748
|
+
- strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
|
|
749
|
+
- delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
|
|
750
|
+
- defaults (list, optional): The default values to use for missing columns. Defaults to [].
|
|
751
|
+
- correctColumnNum (int, optional): The expected number of columns in the file. If -1, it will be determined from the first valid line. Defaults to -1.
|
|
752
|
+
- storeOffset (bool, optional): Instead of storing the data in taskDic, store the offset of each line. Defaults to False.
|
|
753
|
+
|
|
754
|
+
Returns:
|
|
755
|
+
- OrderedDict: The dictionary containing the data from the Tabular file.
|
|
756
|
+
|
|
757
|
+
Raises:
|
|
758
|
+
- Exception: If the file is not found or there is a data format error.
|
|
759
|
+
|
|
760
|
+
"""
|
|
761
|
+
if taskDic is None:
|
|
762
|
+
taskDic = {}
|
|
763
|
+
if defaults is ...:
|
|
764
|
+
defaults = []
|
|
765
|
+
delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
766
|
+
header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
|
|
767
|
+
if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
|
|
768
|
+
return taskDic
|
|
769
|
+
with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
770
|
+
if any(header) and verifyHeader:
|
|
771
|
+
line = file.readline().decode(encoding=encoding,errors='replace')
|
|
772
|
+
if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict) and correctColumnNum == -1:
|
|
773
|
+
correctColumnNum = len(header)
|
|
774
|
+
if verbose:
|
|
775
|
+
__teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
|
|
776
|
+
if lastLineOnly:
|
|
777
|
+
lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict, delimiter=delimiter, defaults=defaults,storeOffset=storeOffset)
|
|
778
|
+
# if lineCache:
|
|
779
|
+
# taskDic[lineCache[0]] = lineCache
|
|
780
|
+
return lineCache
|
|
781
|
+
for line in file:
|
|
782
|
+
correctColumnNum, _ = _processLine(line.decode(encoding=encoding,errors='replace'),taskDic,correctColumnNum,strict = strict,delimiter=delimiter,defaults = defaults,storeOffset=storeOffset,offset=file.tell()-len(line))
|
|
783
|
+
return taskDic
|
|
653
784
|
|
|
654
785
|
def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = '\t'):
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
673
|
-
|
|
786
|
+
"""
|
|
787
|
+
Compatibility method, calls appendTabularFile.
|
|
788
|
+
Append a line of data to a Tabular file.
|
|
789
|
+
Parameters:
|
|
790
|
+
- fileName (str): The path of the Tabular file.
|
|
791
|
+
- lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
|
|
792
|
+
- teeLogger (optional): A logger object for logging messages.
|
|
793
|
+
- header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
|
|
794
|
+
- createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
|
|
795
|
+
- verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
|
|
796
|
+
- verbose (bool, optional): If True, additional information will be printed during the execution.
|
|
797
|
+
- encoding (str, optional): The encoding of the file.
|
|
798
|
+
- strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
|
|
799
|
+
- delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
|
|
800
|
+
Raises:
|
|
801
|
+
- Exception: If the file does not exist and createIfNotExist is False.
|
|
802
|
+
- Exception: If the existing header does not match the provided header.
|
|
803
|
+
"""
|
|
804
|
+
return appendTabularFile(fileName,lineToAppend,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding, strict = strict, delimiter = delimiter)
|
|
674
805
|
|
|
675
806
|
def appendTabularFile(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = ...):
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
|
|
690
|
-
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
807
|
+
"""
|
|
808
|
+
Append a line of data to a Tabular file.
|
|
809
|
+
Parameters:
|
|
810
|
+
- fileName (str): The path of the Tabular file.
|
|
811
|
+
- lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
|
|
812
|
+
- teeLogger (optional): A logger object for logging messages.
|
|
813
|
+
- header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
|
|
814
|
+
- createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
|
|
815
|
+
- verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
|
|
816
|
+
- verbose (bool, optional): If True, additional information will be printed during the execution.
|
|
817
|
+
- encoding (str, optional): The encoding of the file.
|
|
818
|
+
- strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
|
|
819
|
+
- delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
|
|
820
|
+
Raises:
|
|
821
|
+
- Exception: If the file does not exist and createIfNotExist is False.
|
|
822
|
+
- Exception: If the existing header does not match the provided header.
|
|
823
|
+
"""
|
|
824
|
+
return appendLinesTabularFile(fileName,[lineToAppend],teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding, strict = strict, delimiter = delimiter)
|
|
694
825
|
|
|
695
826
|
def appendLinesTabularFile(fileName,linesToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = ...):
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
715
|
-
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
733
|
-
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
739
|
-
|
|
740
|
-
|
|
741
|
-
|
|
742
|
-
|
|
743
|
-
|
|
744
|
-
|
|
745
|
-
|
|
746
|
-
|
|
747
|
-
|
|
748
|
-
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
__teePrintOrNot(f"Appended {len(formatedLines)} lines to {fileName}",teeLogger=teeLogger)
|
|
827
|
+
"""
|
|
828
|
+
Append lines of data to a Tabular file.
|
|
829
|
+
Parameters:
|
|
830
|
+
- fileName (str): The path of the Tabular file.
|
|
831
|
+
- linesToAppend (list): The lines of data to append. If it is a list of string, then each string will be split by delimiter to form a list.
|
|
832
|
+
- teeLogger (optional): A logger object for logging messages.
|
|
833
|
+
- header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
|
|
834
|
+
- createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
|
|
835
|
+
- verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
|
|
836
|
+
- verbose (bool, optional): If True, additional information will be printed during the execution.
|
|
837
|
+
- encoding (str, optional): The encoding of the file.
|
|
838
|
+
- strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
|
|
839
|
+
- delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
|
|
840
|
+
Raises:
|
|
841
|
+
- Exception: If the file does not exist and createIfNotExist is False.
|
|
842
|
+
- Exception: If the existing header does not match the provided header.
|
|
843
|
+
"""
|
|
844
|
+
delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
845
|
+
header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
|
|
846
|
+
if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
|
|
847
|
+
return
|
|
848
|
+
formatedLines = []
|
|
849
|
+
for line in linesToAppend:
|
|
850
|
+
if isinstance(linesToAppend,dict):
|
|
851
|
+
key = line
|
|
852
|
+
line = linesToAppend[key]
|
|
853
|
+
if isinstance(line,str):
|
|
854
|
+
line = line.split(delimiter)
|
|
855
|
+
elif line:
|
|
856
|
+
for i in range(len(line)):
|
|
857
|
+
if not isinstance(line[i],str):
|
|
858
|
+
try:
|
|
859
|
+
line[i] = str(line[i]).rstrip()
|
|
860
|
+
except Exception as e:
|
|
861
|
+
line[i] = str(e)
|
|
862
|
+
if isinstance(linesToAppend,dict):
|
|
863
|
+
if (not line or line[0] != key):
|
|
864
|
+
line = [key]+line
|
|
865
|
+
formatedLines.append(_sanitize(line,delimiter=delimiter))
|
|
866
|
+
if not formatedLines:
|
|
867
|
+
if verbose:
|
|
868
|
+
__teePrintOrNot(f"No lines to append to {fileName}",teeLogger=teeLogger)
|
|
869
|
+
return
|
|
870
|
+
correctColumnNum = max([len(line) for line in formatedLines])
|
|
871
|
+
if any(header) and verifyHeader:
|
|
872
|
+
with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
873
|
+
line = file.readline().decode(encoding=encoding,errors='replace')
|
|
874
|
+
if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
875
|
+
correctColumnNum = len(header)
|
|
876
|
+
if verbose:
|
|
877
|
+
__teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
|
|
878
|
+
# truncate / fill the lines to the correct number of columns
|
|
879
|
+
for i in range(len(formatedLines)):
|
|
880
|
+
if len(formatedLines[i]) < correctColumnNum:
|
|
881
|
+
formatedLines[i] += ['']*(correctColumnNum-len(formatedLines[i]))
|
|
882
|
+
elif len(formatedLines[i]) > correctColumnNum:
|
|
883
|
+
formatedLines[i] = formatedLines[i][:correctColumnNum]
|
|
884
|
+
with openFileAsCompressed(fileName, mode ='ab',encoding=encoding,teeLogger=teeLogger)as file:
|
|
885
|
+
# check if the file ends in a newline
|
|
886
|
+
# file.seek(-1, os.SEEK_END)
|
|
887
|
+
# if file.read(1) != b'\n':
|
|
888
|
+
# file.write(b'\n')
|
|
889
|
+
file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding,errors='replace') for line in formatedLines]) + b'\n')
|
|
890
|
+
if verbose:
|
|
891
|
+
__teePrintOrNot(f"Appended {len(formatedLines)} lines to {fileName}",teeLogger=teeLogger)
|
|
762
892
|
|
|
763
893
|
def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = '\t'):
|
|
764
|
-
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
|
|
768
|
-
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
894
|
+
"""
|
|
895
|
+
Compatibility method, calls clearTabularFile.
|
|
896
|
+
Clear the contents of a Tabular file. Will create if not exist.
|
|
897
|
+
Parameters:
|
|
898
|
+
- fileName (str): The path of the Tabular file.
|
|
899
|
+
- teeLogger (optional): A logger object for logging messages.
|
|
900
|
+
- header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
|
|
901
|
+
- verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
|
|
902
|
+
- verbose (bool, optional): If True, additional information will be printed during the execution.
|
|
903
|
+
- encoding (str, optional): The encoding of the file.
|
|
904
|
+
- strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
|
|
905
|
+
"""
|
|
906
|
+
return clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
|
|
777
907
|
|
|
778
908
|
def clearTabularFile(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = ...):
|
|
779
|
-
|
|
780
|
-
|
|
781
|
-
|
|
782
|
-
|
|
783
|
-
|
|
784
|
-
|
|
785
|
-
|
|
786
|
-
|
|
787
|
-
|
|
788
|
-
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
|
|
793
|
-
|
|
794
|
-
|
|
795
|
-
|
|
796
|
-
|
|
797
|
-
|
|
798
|
-
|
|
799
|
-
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
__teePrintOrNot(f"Cleared {fileName}",teeLogger=teeLogger)
|
|
909
|
+
"""
|
|
910
|
+
Clear the contents of a Tabular file. Will create if not exist.
|
|
911
|
+
Parameters:
|
|
912
|
+
- fileName (str): The path of the Tabular file.
|
|
913
|
+
- teeLogger (optional): A logger object for logging messages.
|
|
914
|
+
- header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
|
|
915
|
+
- verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
|
|
916
|
+
- verbose (bool, optional): If True, additional information will be printed during the execution.
|
|
917
|
+
- encoding (str, optional): The encoding of the file.
|
|
918
|
+
- strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
|
|
919
|
+
"""
|
|
920
|
+
delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
921
|
+
header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
|
|
922
|
+
if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
|
|
923
|
+
raise FileNotFoundError("Something catastrophic happened! File still not found after creation")
|
|
924
|
+
else:
|
|
925
|
+
with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
926
|
+
if any(header) and verifyHeader:
|
|
927
|
+
line = file.readline().decode(encoding=encoding,errors='replace')
|
|
928
|
+
if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
929
|
+
__teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
|
|
930
|
+
header = _formatHeader(line,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
|
|
931
|
+
with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
932
|
+
if header:
|
|
933
|
+
header = delimiter.join(_sanitize(header,delimiter=delimiter))
|
|
934
|
+
file.write(header.encode(encoding=encoding,errors='replace')+b'\n')
|
|
935
|
+
if verbose:
|
|
936
|
+
__teePrintOrNot(f"Cleared {fileName}",teeLogger=teeLogger)
|
|
808
937
|
|
|
809
938
|
def getFileUpdateTimeNs(fileName):
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
939
|
+
# return 0 if the file does not exist
|
|
940
|
+
if not os.path.isfile(fileName):
|
|
941
|
+
return 0
|
|
942
|
+
try:
|
|
943
|
+
return os.stat(fileName).st_mtime_ns
|
|
944
|
+
except Exception:
|
|
945
|
+
__teePrintOrNot(f"Failed to get file update time for {fileName}",'error')
|
|
946
|
+
return get_time_ns()
|
|
818
947
|
|
|
819
948
|
def get_time_ns():
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
949
|
+
try:
|
|
950
|
+
return time.time_ns()
|
|
951
|
+
except Exception:
|
|
952
|
+
# try to get the time in nanoseconds
|
|
953
|
+
return int(time.time()*1e9)
|
|
825
954
|
|
|
826
955
|
def scrubTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = '\t',defaults = ...):
|
|
827
|
-
|
|
828
|
-
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
|
|
847
|
-
|
|
848
|
-
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
|
|
853
|
-
|
|
854
|
-
|
|
855
|
-
def scrubTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,
|
|
856
|
-
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
|
|
873
|
-
|
|
874
|
-
|
|
875
|
-
|
|
876
|
-
|
|
877
|
-
|
|
878
|
-
|
|
879
|
-
|
|
880
|
-
|
|
881
|
-
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
956
|
+
"""
|
|
957
|
+
Compatibility method, calls scrubTabularFile.
|
|
958
|
+
Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.
|
|
959
|
+
Return the data as a dictionary.
|
|
960
|
+
|
|
961
|
+
Parameters:
|
|
962
|
+
- fileName (str): The path to the Tabular file.
|
|
963
|
+
- teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
|
|
964
|
+
- header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
|
|
965
|
+
- createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
|
|
966
|
+
- lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
|
|
967
|
+
- verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
|
|
968
|
+
- verbose (bool, optional): Whether to print verbose output. Defaults to False.
|
|
969
|
+
- taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
|
|
970
|
+
- encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
|
|
971
|
+
- strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
|
|
972
|
+
- delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
|
|
973
|
+
- defaults (list, optional): The default values to use for missing columns. Defaults to [].
|
|
974
|
+
|
|
975
|
+
Returns:
|
|
976
|
+
- OrderedDict: The dictionary containing the data from the Tabular file.
|
|
977
|
+
|
|
978
|
+
Raises:
|
|
979
|
+
- Exception: If the file is not found or there is a data format error.
|
|
980
|
+
|
|
981
|
+
"""
|
|
982
|
+
return scrubTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
|
|
983
|
+
|
|
984
|
+
def scrubTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,
|
|
985
|
+
verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = ...,defaults = ...,correctColumnNum = -1):
|
|
986
|
+
"""
|
|
987
|
+
Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.
|
|
988
|
+
If using compressed files. This will recompress the file in whole and possibily increase the compression ratio reducing the file size.
|
|
989
|
+
Return the data as a dictionary.
|
|
990
|
+
|
|
991
|
+
Parameters:
|
|
992
|
+
- fileName (str): The path to the Tabular file.
|
|
993
|
+
- teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
|
|
994
|
+
- header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
|
|
995
|
+
- createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
|
|
996
|
+
- lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
|
|
997
|
+
- verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
|
|
998
|
+
- verbose (bool, optional): Whether to print verbose output. Defaults to False.
|
|
999
|
+
- taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
|
|
1000
|
+
- encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
|
|
1001
|
+
- strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
|
|
1002
|
+
- delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
|
|
1003
|
+
- defaults (list, optional): The default values to use for missing columns. Defaults to [].
|
|
1004
|
+
- correctColumnNum (int, optional): The expected number of columns in the file. If -1, it will be determined from the first valid line. Defaults to -1.
|
|
1005
|
+
|
|
1006
|
+
Returns:
|
|
1007
|
+
- OrderedDict: The dictionary containing the data from the Tabular file.
|
|
1008
|
+
|
|
1009
|
+
Raises:
|
|
1010
|
+
- Exception: If the file is not found or there is a data format error.
|
|
1011
|
+
|
|
1012
|
+
"""
|
|
1013
|
+
file = readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,
|
|
1014
|
+
lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,
|
|
1015
|
+
encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults,correctColumnNum = correctColumnNum)
|
|
1016
|
+
if file:
|
|
1017
|
+
clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
|
|
1018
|
+
appendLinesTabularFile(fileName,file,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
|
|
1019
|
+
return file
|
|
887
1020
|
|
|
888
1021
|
def getListView(tsvzDic,header = [],delimiter = DEFAULT_DELIMITER):
|
|
889
|
-
|
|
890
|
-
|
|
891
|
-
|
|
892
|
-
|
|
893
|
-
|
|
894
|
-
|
|
895
|
-
|
|
896
|
-
|
|
897
|
-
|
|
898
|
-
|
|
899
|
-
|
|
900
|
-
|
|
901
|
-
|
|
902
|
-
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
|
|
909
|
-
|
|
1022
|
+
if header:
|
|
1023
|
+
if isinstance(header,str):
|
|
1024
|
+
header = header.split(delimiter)
|
|
1025
|
+
elif not isinstance(header,list):
|
|
1026
|
+
try:
|
|
1027
|
+
header = list(header)
|
|
1028
|
+
except Exception:
|
|
1029
|
+
header = []
|
|
1030
|
+
if not tsvzDic:
|
|
1031
|
+
if not header:
|
|
1032
|
+
return []
|
|
1033
|
+
else:
|
|
1034
|
+
return [header]
|
|
1035
|
+
if not header:
|
|
1036
|
+
return list(tsvzDic.values())
|
|
1037
|
+
else:
|
|
1038
|
+
values = list(tsvzDic.values())
|
|
1039
|
+
if values[0] and values[0] == header:
|
|
1040
|
+
return values
|
|
1041
|
+
else:
|
|
1042
|
+
return [header] + values
|
|
910
1043
|
|
|
911
1044
|
# create a tsv class that functions like a ordered dictionary but will update the file when modified
|
|
912
1045
|
class TSVZed(OrderedDict):
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
|
|
929
|
-
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
933
|
-
|
|
934
|
-
|
|
935
|
-
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
|
|
986
|
-
|
|
987
|
-
|
|
988
|
-
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
|
|
1004
|
-
|
|
1005
|
-
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
|
|
1009
|
-
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
|
|
1013
|
-
|
|
1014
|
-
|
|
1015
|
-
|
|
1016
|
-
|
|
1017
|
-
|
|
1018
|
-
|
|
1019
|
-
|
|
1020
|
-
|
|
1021
|
-
|
|
1022
|
-
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
1029
|
-
|
|
1030
|
-
|
|
1031
|
-
|
|
1032
|
-
|
|
1033
|
-
|
|
1034
|
-
|
|
1035
|
-
|
|
1036
|
-
|
|
1037
|
-
|
|
1038
|
-
|
|
1039
|
-
|
|
1040
|
-
|
|
1041
|
-
|
|
1042
|
-
|
|
1043
|
-
|
|
1044
|
-
|
|
1045
|
-
|
|
1046
|
-
|
|
1047
|
-
|
|
1048
|
-
|
|
1049
|
-
|
|
1050
|
-
|
|
1051
|
-
|
|
1052
|
-
|
|
1053
|
-
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1057
|
-
|
|
1058
|
-
|
|
1059
|
-
|
|
1060
|
-
|
|
1061
|
-
|
|
1062
|
-
|
|
1063
|
-
|
|
1064
|
-
|
|
1065
|
-
|
|
1066
|
-
|
|
1067
|
-
|
|
1068
|
-
|
|
1069
|
-
|
|
1070
|
-
|
|
1071
|
-
|
|
1072
|
-
|
|
1073
|
-
|
|
1074
|
-
|
|
1075
|
-
|
|
1076
|
-
|
|
1077
|
-
|
|
1078
|
-
|
|
1079
|
-
|
|
1080
|
-
|
|
1081
|
-
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
|
|
1093
|
-
|
|
1094
|
-
|
|
1095
|
-
|
|
1096
|
-
|
|
1097
|
-
|
|
1098
|
-
|
|
1099
|
-
|
|
1100
|
-
|
|
1101
|
-
|
|
1102
|
-
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1106
|
-
|
|
1107
|
-
|
|
1108
|
-
|
|
1109
|
-
|
|
1110
|
-
|
|
1111
|
-
|
|
1112
|
-
|
|
1113
|
-
|
|
1114
|
-
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1138
|
-
|
|
1139
|
-
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
|
|
1145
|
-
|
|
1146
|
-
|
|
1147
|
-
|
|
1148
|
-
|
|
1149
|
-
|
|
1150
|
-
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
|
|
1156
|
-
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
|
|
1162
|
-
|
|
1163
|
-
|
|
1164
|
-
|
|
1165
|
-
|
|
1166
|
-
|
|
1167
|
-
|
|
1168
|
-
|
|
1169
|
-
|
|
1170
|
-
|
|
1171
|
-
|
|
1172
|
-
|
|
1173
|
-
|
|
1174
|
-
|
|
1175
|
-
|
|
1176
|
-
|
|
1177
|
-
|
|
1178
|
-
|
|
1179
|
-
|
|
1180
|
-
|
|
1181
|
-
|
|
1182
|
-
|
|
1046
|
+
"""
|
|
1047
|
+
A thread-safe, file-backed ordered dictionary for managing TSV (Tab-Separated Values) files.
|
|
1048
|
+
TSVZed extends OrderedDict to provide automatic synchronization between an in-memory
|
|
1049
|
+
dictionary and a TSV file on disk. It supports concurrent file access, automatic
|
|
1050
|
+
persistence, and configurable sync strategies.
|
|
1051
|
+
Parameters
|
|
1052
|
+
----------
|
|
1053
|
+
fileName : str
|
|
1054
|
+
Path to the TSV file to be managed.
|
|
1055
|
+
teeLogger : object, optional
|
|
1056
|
+
Logger object with a teelog method for logging messages. If None, uses print.
|
|
1057
|
+
header : str, optional
|
|
1058
|
+
Column header line for the TSV file. Used for validation and file creation.
|
|
1059
|
+
createIfNotExist : bool, default=True
|
|
1060
|
+
If True, creates the file if it doesn't exist.
|
|
1061
|
+
verifyHeader : bool, default=True
|
|
1062
|
+
If True, verifies that the file header matches the provided header.
|
|
1063
|
+
rewrite_on_load : bool, default=True
|
|
1064
|
+
If True, rewrites the entire file when loading to ensure consistency.
|
|
1065
|
+
rewrite_on_exit : bool, default=False
|
|
1066
|
+
If True, rewrites the entire file when closing/exiting.
|
|
1067
|
+
rewrite_interval : float, default=0
|
|
1068
|
+
Minimum time interval (in seconds) between full file rewrites. 0 means no limit.
|
|
1069
|
+
append_check_delay : float, default=0.01
|
|
1070
|
+
Time delay (in seconds) between checks of the append queue by the worker thread.
|
|
1071
|
+
monitor_external_changes : bool, default=True
|
|
1072
|
+
If True, monitors and detects external file modifications.
|
|
1073
|
+
verbose : bool, default=False
|
|
1074
|
+
If True, prints detailed operation logs.
|
|
1075
|
+
encoding : str, default='utf8'
|
|
1076
|
+
Character encoding for reading/writing the file.
|
|
1077
|
+
delimiter : str, optional
|
|
1078
|
+
Field delimiter character. Auto-detected from filename if not specified.
|
|
1079
|
+
defaults : list or str, optional
|
|
1080
|
+
Default values for columns when values are missing.
|
|
1081
|
+
strict : bool, default=False
|
|
1082
|
+
If True, enforces strict validation of column counts and raises errors on mismatch.
|
|
1083
|
+
correctColumnNum : int, default=-1
|
|
1084
|
+
Expected number of columns. -1 means auto-detect from header or first record.
|
|
1085
|
+
Attributes
|
|
1086
|
+
----------
|
|
1087
|
+
version : str
|
|
1088
|
+
Version of the TSVZed implementation.
|
|
1089
|
+
dirty : bool
|
|
1090
|
+
True if the in-memory data differs from the file on disk.
|
|
1091
|
+
deSynced : bool
|
|
1092
|
+
True if synchronization with the file has failed or external changes detected.
|
|
1093
|
+
memoryOnly : bool
|
|
1094
|
+
If True, changes are kept in memory only and not written to disk.
|
|
1095
|
+
appendQueue : deque
|
|
1096
|
+
Queue of lines waiting to be appended to the file.
|
|
1097
|
+
writeLock : threading.Lock
|
|
1098
|
+
Lock for ensuring thread-safe file operations.
|
|
1099
|
+
shutdownEvent : threading.Event
|
|
1100
|
+
Event signal for stopping the append worker thread.
|
|
1101
|
+
appendThread : threading.Thread
|
|
1102
|
+
Background thread that handles asynchronous file appending.
|
|
1103
|
+
Methods
|
|
1104
|
+
-------
|
|
1105
|
+
load()
|
|
1106
|
+
Load or reload data from the TSV file.
|
|
1107
|
+
reload()
|
|
1108
|
+
Refresh data from the TSV file, discarding in-memory changes.
|
|
1109
|
+
rewrite(force=False, reloadInternalFromFile=None)
|
|
1110
|
+
Rewrite the entire file with current in-memory data.
|
|
1111
|
+
mapToFile()
|
|
1112
|
+
Synchronize in-memory data to the file using in-place updates.
|
|
1113
|
+
hardMapToFile()
|
|
1114
|
+
Completely rewrite the file from scratch with current data.
|
|
1115
|
+
clear()
|
|
1116
|
+
Clear all data from memory and optionally the file.
|
|
1117
|
+
clear_file()
|
|
1118
|
+
Clear the file, keeping only the header.
|
|
1119
|
+
commitAppendToFile()
|
|
1120
|
+
Write all queued append operations to the file.
|
|
1121
|
+
stopAppendThread()
|
|
1122
|
+
Stop the background append worker thread and perform final sync.
|
|
1123
|
+
setDefaults(defaults)
|
|
1124
|
+
Set default values for columns.
|
|
1125
|
+
getListView()
|
|
1126
|
+
Get a list representation of the data with headers.
|
|
1127
|
+
getResourceUsage(return_dict=False)
|
|
1128
|
+
Get current resource usage statistics.
|
|
1129
|
+
checkExternalChanges()
|
|
1130
|
+
Check if the file has been modified externally.
|
|
1131
|
+
close()
|
|
1132
|
+
Close the TSVZed object, stopping background threads and syncing data.
|
|
1133
|
+
Notes
|
|
1134
|
+
-----
|
|
1135
|
+
- The class uses a background thread to handle asynchronous file operations.
|
|
1136
|
+
- File locking is implemented for both POSIX and Windows systems.
|
|
1137
|
+
- Keys starting with '#' are treated as comments and not persisted to file.
|
|
1138
|
+
- The special key '#_defaults_#' (DEFAULTS_INDICATOR_KEY) is used to store column default values.
|
|
1139
|
+
- Supports compressed file formats through automatic detection.
|
|
1140
|
+
- Thread-safe for concurrent access from multiple threads.
|
|
1141
|
+
Examples
|
|
1142
|
+
--------
|
|
1143
|
+
>>> with TSVZed('data.tsv', header='id\tname\tvalue') as tsv:
|
|
1144
|
+
... tsv['key1'] = ['key1', 'John', '100']
|
|
1145
|
+
... tsv['key2'] = ['key2', 'Jane', '200']
|
|
1146
|
+
... print(tsv['key1'])
|
|
1147
|
+
['key1', 'John', '100']
|
|
1148
|
+
>>> tsv = TSVZed('data.tsv', verbose=True, rewrite_on_exit=True)
|
|
1149
|
+
>>> tsv['key3'] = 'key3\tBob\t300'
|
|
1150
|
+
>>> tsv.close()
|
|
1151
|
+
"""
|
|
1152
|
+
def __teePrintOrNot(self,message,level = 'info'):
|
|
1153
|
+
try:
|
|
1154
|
+
if self.teeLogger:
|
|
1155
|
+
self.teeLogger.teelog(message,level)
|
|
1156
|
+
else:
|
|
1157
|
+
print(message,flush=True)
|
|
1158
|
+
except Exception:
|
|
1159
|
+
print(message,flush=True)
|
|
1160
|
+
|
|
1161
|
+
	def getResourceUsage(self,return_dict = False):
		"""Return process resource-usage info via the module-level get_resource_usage helper.

		Args:
			return_dict: passed through; if True the helper presumably returns a
				dict instead of a formatted string — semantics are defined by
				get_resource_usage elsewhere in this module.
		"""
		return get_resource_usage(return_dict = return_dict)
|
|
1163
|
+
|
|
1164
|
+
	def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,
			   rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,
			   verbose = False,encoding = 'utf8',delimiter = ...,defaults = None,strict = False,correctColumnNum = -1):
		"""Initialize the file-backed ordered dict; see the class docstring for parameter details.

		Order matters below: delimiter and defaults must be set before the
		header is formatted, and the append worker thread is started before
		load() so queued writes can be flushed.
		"""
		super().__init__()
		self.version = version
		self.strict = strict
		# remember the file's mtime (ns) to detect external modifications later
		self.externalFileUpdateTime = getFileUpdateTimeNs(fileName)
		self.lastUpdateTime = self.externalFileUpdateTime
		self._fileName = fileName
		self.teeLogger = teeLogger
		# Ellipsis default means "auto-detect from the file name" inside get_delimiter
		self.delimiter = get_delimiter(delimiter,file_name=fileName)
		self.setDefaults(defaults)
		self.header = _formatHeader(header,verbose = verbose,teeLogger = self.teeLogger,delimiter=self.delimiter)
		self.correctColumnNum = correctColumnNum
		self.createIfNotExist = createIfNotExist
		self.verifyHeader = verifyHeader
		self.rewrite_on_load = rewrite_on_load
		self.rewrite_on_exit = rewrite_on_exit
		self.rewrite_interval = rewrite_interval
		self.monitor_external_changes = monitor_external_changes
		if not monitor_external_changes:
			self.__teePrintOrNot(f"Warning: External changes monitoring disabled for {self._fileName}. Will overwrite external changes.",'warning')
		self.verbose = verbose
		# a non-positive polling delay would spin the worker thread; clamp it
		if append_check_delay < 0:
			append_check_delay = 0.00001
			self.__teePrintOrNot('append_check_delay cannot be less than 0, setting it to 0.00001','error')
		self.append_check_delay = append_check_delay
		self.appendQueue = deque()          # lines waiting to be appended to disk
		self.dirty = False                  # in-memory state differs from file
		self.deSynced = False               # sync failed / external change detected
		self.memoryOnly = False             # when True, mutations never touch disk
		self.encoding = encoding
		self.writeLock = threading.Lock()
		self.shutdownEvent = threading.Event()
		#self.appendEvent = threading.Event()
		# daemon worker drains appendQueue; _appendWorker is defined elsewhere in this class
		self.appendThread = threading.Thread(target=self._appendWorker,daemon=True)
		self.appendThread.start()
		self.load()
		# ensure pending writes are flushed at interpreter exit
		atexit.register(self.stopAppendThread)
|
|
1203
|
+
|
|
1204
|
+
	def setDefaults(self,defaults):
		"""Normalize *defaults* and store it as ``self.defaults``.

		Accepts None/empty, a delimiter-joined string, a list, or any iterable.
		The stored list always starts with DEFAULTS_INDICATOR_KEY; an
		all-empty defaults list collapses to just the indicator key.
		"""
		if not defaults:
			defaults = []
		if isinstance(defaults,str):
			defaults = defaults.split(self.delimiter)
		elif not isinstance(defaults,list):
			try:
				defaults = list(defaults)
			except Exception:
				if self.verbose:
					self.__teePrintOrNot('Invalid defaults, setting defaults to empty.','error')
				defaults = []
		# sanitize: stringify and strip trailing whitespace; falsy entries become ''
		# (note: a falsy non-string such as 0 is also mapped to '')
		defaults = [str(s).rstrip() if s else '' for s in defaults]
		if not any(defaults):
			defaults = []
		# guarantee the indicator key is the first element exactly once
		if not defaults or defaults[0] != DEFAULTS_INDICATOR_KEY:
			defaults = [DEFAULTS_INDICATOR_KEY]+defaults
		self.defaults = defaults
|
|
1222
|
+
|
|
1223
|
+
def load(self):
|
|
1224
|
+
self.reload()
|
|
1225
|
+
if self.rewrite_on_load:
|
|
1226
|
+
self.rewrite(force = True,reloadInternalFromFile = False)
|
|
1227
|
+
return self
|
|
1228
|
+
|
|
1229
|
+
	def reload(self):
		# Load or refresh data from the TSV file
		"""Discard in-memory records and re-read them from the backing file.

		Temporarily switches to memory-only mode so that repopulating the
		dict (readTabularFile fills ``taskDic = self``) does not re-queue
		file writes. Also recomputes correctColumnNum. Returns self.
		"""
		mo = self.memoryOnly
		self.memoryOnly = True
		if self.verbose:
			self.__teePrintOrNot(f"Loading {self._fileName}")
		super().clear()
		# readTabularFile populates this dict in place via taskDic
		readTabularFile(self._fileName, teeLogger = self.teeLogger, header = self.header,
				  createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader,
				  verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None,
				  strict = self.strict, delimiter = self.delimiter, defaults=self.defaults)
		if self.verbose:
			self.__teePrintOrNot(f"Loaded {len(self)} records from {self._fileName}")
		# column count: prefer a verified header, then the first record, else unknown (-1)
		if self.header and any(self.header) and self.verifyHeader:
			self.correctColumnNum = len(self.header)
		elif self:
			self.correctColumnNum = len(self[next(iter(self))])
		else:
			self.correctColumnNum = -1
		if self.verbose:
			self.__teePrintOrNot(f"correctColumnNum: {self.correctColumnNum}")
		#super().update(loadedData)
		if self.verbose:
			self.__teePrintOrNot(f"TSVZed({self._fileName}) loaded")
		# we are now in sync with the on-disk state
		self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
		self.lastUpdateTime = self.externalFileUpdateTime
		self.memoryOnly = mo
		return self
|
|
1257
|
+
|
|
1258
|
+
	def __setitem__(self,key,value):
		"""Set a record, sanitizing key/value, enforcing column count, applying defaults, and queuing the line for disk.

		Keys are stripped strings; a string value is split on the delimiter;
		the key is prepended as field 0 if missing. Comment keys ('#...') and
		memory-only mode update memory without touching the file.
		"""
		key = str(key).rstrip()
		if not key:
			self.__teePrintOrNot('Key cannot be empty','error')
			return
		if isinstance(value,str):
			value = value.split(self.delimiter)
		# sanitize the value
		value = [str(s).rstrip() if s else '' for s in value]
		# the first field in value should be the key
		# add it if it is not there
		if not value or value[0] != key:
			value = [key]+value
		# verify the value has the correct number of columns
		if self.correctColumnNum != 1 and len(value) == 1:
			# this means we want to clear / delete the key
			# NOTE(review): execution falls through after the del — the key is
			# then re-inserted below with the single-column value; confirm
			# whether that (delete marker + re-add) is the intended behavior.
			del self[key]
		elif self.correctColumnNum > 0:
			if len(value) != self.correctColumnNum:
				if self.strict:
					self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}. Refuse adding key...",'error')
					return
				elif self.verbose:
					self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}, correcting...",'warning')
				# pad with empty fields or truncate to fit the expected width
				if len(value) < self.correctColumnNum:
					value += ['']*(self.correctColumnNum-len(value))
				elif len(value) > self.correctColumnNum:
					value = value[:self.correctColumnNum]
		else:
			# first record observed defines the column count
			self.correctColumnNum = len(value)
		# fill empty fields from the stored defaults (index-aligned, skipping field 0)
		if self.defaults and len(self.defaults) > 1:
			for i in range(1,len(value)):
				if not value[i] and i < len(self.defaults) and self.defaults[i]:
					value[i] = self.defaults[i]
					if self.verbose:
						self.__teePrintOrNot(f" Replacing empty value at {i} with default: {self.defaults[i]}")
		# assigning to the defaults indicator key updates self.defaults instead of a record
		if key == DEFAULTS_INDICATOR_KEY:
			self.defaults = value
			if self.verbose:
				self.__teePrintOrNot(f"Defaults set to {value}")
			if not self.memoryOnly:
				self.appendQueue.append(value)
				self.lastUpdateTime = get_time_ns()
				if self.verbose:
					self.__teePrintOrNot(f"Appending Defaults {key} to the appendQueue")
			return
		if self.verbose:
			self.__teePrintOrNot(f"Setting {key} to {value}")
		# no-op if the value is unchanged — avoids dirtying the file
		if key in self:
			if self[key] == value:
				if self.verbose:
					self.__teePrintOrNot(f"Key {key} already exists with the same value")
				return
		self.dirty = True
		# update the dictionary,
		super().__setitem__(key,value)
		if self.memoryOnly:
			if self.verbose:
				self.__teePrintOrNot(f"Key {key} updated in memory only")
			return
		elif key.startswith('#'):
			# comment keys are never persisted to the file
			if self.verbose:
				self.__teePrintOrNot(f"Key {key} updated in memory only as it starts with #")
			return
		if self.verbose:
			self.__teePrintOrNot(f"Appending {key} to the appendQueue")
		self.appendQueue.append(value)
		self.lastUpdateTime = get_time_ns()
		# if not self.appendThread.is_alive():
		# 	self.commitAppendToFile()
		# else:
		# 	self.appendEvent.set()
|
|
1330
|
+
|
|
1331
|
+
def __getitem__(self, key):
|
|
1332
|
+
return super().__getitem__(str(key).rstrip())
|
|
1333
|
+
|
|
1334
|
+
|
|
1335
|
+
	def __delitem__(self,key):
		"""Delete a record and queue a deletion-marker line for the file.

		Deleting DEFAULTS_INDICATOR_KEY resets self.defaults instead of
		removing a record. Comment keys and memory-only mode skip the file.
		Missing keys are a verbose no-op (no KeyError).
		"""
		key = str(key).rstrip()
		if key == DEFAULTS_INDICATOR_KEY:
			self.defaults = [DEFAULTS_INDICATOR_KEY]
			if self.verbose:
				self.__teePrintOrNot("Defaults cleared")
			if not self.memoryOnly:
				self.__appendEmptyLine(key)
				if self.verbose:
					self.__teePrintOrNot(f"Appending empty default line {key}")
			return
		# delete the key from the dictionary and update the file
		if key not in self:
			if self.verbose:
				self.__teePrintOrNot(f"Key {key} not found")
			return
		super().__delitem__(key)
		if self.memoryOnly or key.startswith('#'):
			if self.verbose:
				self.__teePrintOrNot(f"Key {key} deleted in memory")
			return
		# NOTE(review): the key is already removed at this point, yet
		# __appendEmptyLine may consult self[key] when correctColumnNum <= 0 —
		# confirm that path cannot raise here.
		self.__appendEmptyLine(key)
		if self.verbose:
			self.__teePrintOrNot(f"Appending empty line {key}")
		self.lastUpdateTime = get_time_ns()
|
|
1360
|
+
|
|
1361
|
+
def __appendEmptyLine(self,key):
|
|
1362
|
+
self.dirty = True
|
|
1363
|
+
if self.correctColumnNum > 0:
|
|
1364
|
+
emptyLine = [key]+[self.delimiter]*(self.correctColumnNum-1)
|
|
1365
|
+
elif len(self[key]) > 1:
|
|
1366
|
+
self.correctColumnNum = len(self[key])
|
|
1367
|
+
emptyLine = [key]+[self.delimiter]*(self.correctColumnNum-1)
|
|
1368
|
+
else:
|
|
1369
|
+
emptyLine = [key]
|
|
1370
|
+
if self.verbose:
|
|
1371
|
+
self.__teePrintOrNot(f"Appending {emptyLine} to the appendQueue")
|
|
1372
|
+
self.appendQueue.append(emptyLine)
|
|
1373
|
+
return self
|
|
1374
|
+
|
|
1375
|
+
	def getListView(self):
		"""Return this table as a list of rows via the module-level getListView, prepending the header when one is set."""
		return getListView(self,header=self.header,delimiter=self.delimiter)
|
|
1377
|
+
|
|
1378
|
+
def clear(self):
|
|
1379
|
+
# clear the dictionary and update the file
|
|
1380
|
+
super().clear()
|
|
1381
|
+
if self.verbose:
|
|
1382
|
+
self.__teePrintOrNot(f"Clearing {self._fileName}")
|
|
1383
|
+
if self.memoryOnly:
|
|
1384
|
+
return self
|
|
1385
|
+
self.clear_file()
|
|
1386
|
+
self.lastUpdateTime = self.externalFileUpdateTime
|
|
1387
|
+
return self
|
|
1388
|
+
|
|
1389
|
+
def clear_file(self):
|
|
1390
|
+
try:
|
|
1391
|
+
if self.header:
|
|
1392
|
+
file = self.get_file_obj('wb')
|
|
1393
|
+
header = self.delimiter.join(_sanitize(self.header,delimiter=self.delimiter))
|
|
1394
|
+
file.write(header.encode(self.encoding,errors='replace') + b'\n')
|
|
1395
|
+
self.release_file_obj(file)
|
|
1396
|
+
if self.verbose:
|
|
1397
|
+
self.__teePrintOrNot(f"Header {header} written to {self._fileName}")
|
|
1398
|
+
self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
|
|
1399
|
+
else:
|
|
1400
|
+
file = self.get_file_obj('wb')
|
|
1401
|
+
self.release_file_obj(file)
|
|
1402
|
+
if self.verbose:
|
|
1403
|
+
self.__teePrintOrNot(f"File {self._fileName} cleared empty")
|
|
1404
|
+
self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
|
|
1405
|
+
self.dirty = False
|
|
1406
|
+
self.deSynced = False
|
|
1407
|
+
except Exception as e:
|
|
1408
|
+
self.release_file_obj(file)
|
|
1409
|
+
self.__teePrintOrNot(f"Failed to write at clear_file() to {self._fileName}: {e}",'error')
|
|
1410
|
+
import traceback
|
|
1411
|
+
self.__teePrintOrNot(traceback.format_exc(),'error')
|
|
1412
|
+
self.deSynced = True
|
|
1413
|
+
return self
|
|
1414
|
+
|
|
1415
|
+
def __enter__(self):
|
|
1416
|
+
return self
|
|
1417
|
+
|
|
1418
|
+
	def close(self):
		"""Stop the background append thread (flushing pending writes) and return self for chaining."""
		self.stopAppendThread()
		return self
|
|
1421
|
+
|
|
1422
|
+
def __exit__(self,exc_type,exc_value,traceback):
|
|
1423
|
+
return self.close()
|
|
1424
|
+
|
|
1425
|
+
def __repr__(self):
|
|
1426
|
+
return f"""TSVZed(
|
|
1183
1427
|
file_name:{self._fileName}
|
|
1184
1428
|
teeLogger:{self.teeLogger}
|
|
1185
1429
|
header:{self.header}
|
|
@@ -1196,372 +1440,860 @@ dirty:{self.dirty}
|
|
|
1196
1440
|
deSynced:{self.deSynced}
|
|
1197
1441
|
memoryOnly:{self.memoryOnly}
|
|
1198
1442
|
{dict(self)})"""
|
|
1199
|
-
|
|
1200
|
-
|
|
1201
|
-
|
|
1202
|
-
|
|
1203
|
-
|
|
1204
|
-
|
|
1205
|
-
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
|
|
1209
|
-
|
|
1210
|
-
|
|
1211
|
-
|
|
1212
|
-
|
|
1213
|
-
|
|
1214
|
-
|
|
1215
|
-
|
|
1216
|
-
|
|
1217
|
-
|
|
1218
|
-
|
|
1219
|
-
|
|
1220
|
-
|
|
1221
|
-
|
|
1222
|
-
|
|
1223
|
-
|
|
1224
|
-
|
|
1225
|
-
|
|
1226
|
-
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
|
|
1230
|
-
|
|
1231
|
-
|
|
1232
|
-
|
|
1233
|
-
|
|
1234
|
-
|
|
1235
|
-
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1240
|
-
|
|
1241
|
-
|
|
1242
|
-
|
|
1243
|
-
|
|
1244
|
-
|
|
1245
|
-
|
|
1246
|
-
|
|
1247
|
-
|
|
1248
|
-
|
|
1249
|
-
|
|
1250
|
-
|
|
1251
|
-
|
|
1252
|
-
|
|
1253
|
-
|
|
1254
|
-
|
|
1255
|
-
|
|
1256
|
-
|
|
1257
|
-
|
|
1258
|
-
|
|
1259
|
-
|
|
1260
|
-
|
|
1261
|
-
|
|
1262
|
-
|
|
1263
|
-
|
|
1264
|
-
|
|
1265
|
-
|
|
1266
|
-
|
|
1267
|
-
|
|
1268
|
-
|
|
1269
|
-
|
|
1270
|
-
|
|
1271
|
-
|
|
1272
|
-
|
|
1273
|
-
|
|
1274
|
-
|
|
1275
|
-
|
|
1276
|
-
|
|
1277
|
-
|
|
1278
|
-
|
|
1279
|
-
|
|
1280
|
-
|
|
1281
|
-
|
|
1282
|
-
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1295
|
-
|
|
1296
|
-
|
|
1297
|
-
|
|
1298
|
-
|
|
1299
|
-
|
|
1300
|
-
|
|
1301
|
-
|
|
1302
|
-
|
|
1303
|
-
|
|
1304
|
-
|
|
1305
|
-
|
|
1306
|
-
|
|
1307
|
-
|
|
1308
|
-
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
|
|
1312
|
-
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
|
|
1316
|
-
|
|
1317
|
-
|
|
1318
|
-
|
|
1319
|
-
|
|
1320
|
-
|
|
1321
|
-
|
|
1322
|
-
|
|
1323
|
-
|
|
1324
|
-
|
|
1325
|
-
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
|
|
1332
|
-
|
|
1333
|
-
|
|
1334
|
-
|
|
1335
|
-
|
|
1336
|
-
|
|
1337
|
-
|
|
1338
|
-
|
|
1339
|
-
|
|
1340
|
-
|
|
1341
|
-
|
|
1342
|
-
|
|
1343
|
-
|
|
1344
|
-
|
|
1345
|
-
|
|
1346
|
-
|
|
1347
|
-
|
|
1348
|
-
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
|
|
1359
|
-
|
|
1360
|
-
|
|
1361
|
-
|
|
1362
|
-
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1399
|
-
|
|
1400
|
-
|
|
1401
|
-
|
|
1402
|
-
|
|
1403
|
-
|
|
1404
|
-
|
|
1405
|
-
|
|
1406
|
-
|
|
1407
|
-
|
|
1408
|
-
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
|
|
1412
|
-
|
|
1413
|
-
|
|
1414
|
-
|
|
1415
|
-
|
|
1416
|
-
|
|
1417
|
-
|
|
1418
|
-
|
|
1419
|
-
|
|
1420
|
-
|
|
1421
|
-
|
|
1422
|
-
|
|
1423
|
-
|
|
1424
|
-
|
|
1425
|
-
|
|
1426
|
-
|
|
1427
|
-
|
|
1428
|
-
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1436
|
-
|
|
1437
|
-
|
|
1438
|
-
|
|
1439
|
-
|
|
1440
|
-
|
|
1441
|
-
|
|
1442
|
-
|
|
1443
|
-
|
|
1444
|
-
|
|
1445
|
-
|
|
1446
|
-
|
|
1447
|
-
|
|
1448
|
-
|
|
1449
|
-
|
|
1450
|
-
|
|
1451
|
-
|
|
1452
|
-
|
|
1453
|
-
|
|
1454
|
-
|
|
1455
|
-
|
|
1456
|
-
|
|
1457
|
-
|
|
1458
|
-
|
|
1459
|
-
|
|
1460
|
-
|
|
1461
|
-
|
|
1462
|
-
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1473
|
-
|
|
1474
|
-
|
|
1475
|
-
|
|
1476
|
-
|
|
1477
|
-
|
|
1478
|
-
|
|
1479
|
-
|
|
1480
|
-
|
|
1481
|
-
|
|
1482
|
-
|
|
1483
|
-
|
|
1484
|
-
|
|
1485
|
-
|
|
1486
|
-
|
|
1487
|
-
|
|
1488
|
-
|
|
1489
|
-
|
|
1490
|
-
|
|
1491
|
-
|
|
1492
|
-
|
|
1493
|
-
|
|
1494
|
-
|
|
1495
|
-
|
|
1496
|
-
|
|
1497
|
-
|
|
1498
|
-
|
|
1499
|
-
|
|
1500
|
-
|
|
1501
|
-
|
|
1502
|
-
|
|
1503
|
-
|
|
1504
|
-
|
|
1505
|
-
|
|
1506
|
-
|
|
1507
|
-
|
|
1508
|
-
|
|
1509
|
-
|
|
1510
|
-
|
|
1511
|
-
|
|
1512
|
-
|
|
1513
|
-
|
|
1514
|
-
|
|
1443
|
+
|
|
1444
|
+
def __str__(self):
|
|
1445
|
+
return f"TSVZed({self._fileName},{dict(self)})"
|
|
1446
|
+
|
|
1447
|
+
	def __del__(self):
		# Best-effort cleanup on garbage collection: close() stops the append
		# thread and flushes pending writes.
		# NOTE(review): during interpreter shutdown, module globals used by
		# close() may already be torn down — confirm this is tolerated.
		return self.close()
|
|
1449
|
+
|
|
1450
|
+
	def popitem(self, last=True):
		"""Remove and return a (key, value) pair (LIFO when last=True, FIFO otherwise), queuing a deletion marker for the file."""
		key, value = super().popitem(last)
		if not self.memoryOnly:
			# NOTE(review): the key is already removed here; __appendEmptyLine
			# reads self[key] when correctColumnNum <= 0 — confirm that path.
			self.__appendEmptyLine(key)
			self.lastUpdateTime = get_time_ns()
		return key, value
|
|
1456
|
+
|
|
1457
|
+
	__marker = object()  # sentinel: distinguishes "no default supplied" from default=None

	def pop(self, key, default=__marker):
		'''od.pop(k[,d]) -> v, remove specified key and return the corresponding
		value. If key is not found, d is returned if given, otherwise KeyError
		is raised.

		'''
		# keys are normalized to stripped strings, matching __setitem__/__getitem__
		key = str(key).rstrip()
		if key not in self:
			if default is self.__marker:
				raise KeyError(key)
			return default
		value = super().pop(key)
		if not self.memoryOnly:
			# queue a deletion marker so the on-disk file reflects the removal
			self.__appendEmptyLine(key)
			self.lastUpdateTime = get_time_ns()
		return value
|
|
1475
|
+
|
|
1476
|
+
	def move_to_end(self, key, last=True):
		'''Move an existing element to the end (or beginning if last is false).
		Raise KeyError if the element does not exist.

		Reordering cannot be expressed as an append, so this forces a full
		rewrite on exit (rewrite_on_exit = True) for the change to reach disk.
		'''
		key = str(key).rstrip()
		super().move_to_end(key, last)
		self.dirty = True
		if not self.rewrite_on_exit:
			self.rewrite_on_exit = True
			self.__teePrintOrNot("Warning: move_to_end had been called. Need to resync for changes to apply to disk.")
			self.__teePrintOrNot("rewrite_on_exit set to True")
		if self.verbose:
			self.__teePrintOrNot(f"Warning: Trying to move Key {key} moved to {'end' if last else 'beginning'} Need to resync for changes to apply to disk")
		self.lastUpdateTime = get_time_ns()
		return self
|
|
1491
|
+
|
|
1492
|
+
	def __sizeof__(self):
		"""Rough (shallow) memory estimate: attribute sizes plus the dict storage.

		NOTE(review): sys.getsizeof is shallow — contained rows are not
		followed — and ``sizeof(super())`` measures the temporary super proxy
		object, not the base-class storage; confirm that is intended.
		"""
		sizeof = sys.getsizeof
		size = sizeof(super()) + sizeof(True) * 12 # for the booleans / integers
		size += sizeof(self.externalFileUpdateTime)
		size += sizeof(self.lastUpdateTime)
		size += sizeof(self._fileName)
		size += sizeof(self.teeLogger)
		size += sizeof(self.delimiter)
		size += sizeof(self.defaults)
		size += sizeof(self.header)
		size += sizeof(self.appendQueue)
		size += sizeof(self.encoding)
		size += sizeof(self.writeLock)
		size += sizeof(self.shutdownEvent)
		size += sizeof(self.appendThread)
		size += super().__sizeof__()
		return size
|
|
1509
|
+
|
|
1510
|
+
@classmethod
|
|
1511
|
+
def fromkeys(cls, iterable, value=None,fileName = None,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,verbose = False):
|
|
1512
|
+
'''Create a new ordered dictionary with keys from iterable and values set to value.
|
|
1513
|
+
'''
|
|
1514
|
+
self = cls(fileName,teeLogger,header,createIfNotExist,verifyHeader,rewrite_on_load,rewrite_on_exit,rewrite_interval,append_check_delay,verbose)
|
|
1515
|
+
for key in iterable:
|
|
1516
|
+
self[key] = value
|
|
1517
|
+
return self
|
|
1518
|
+
|
|
1519
|
+
|
|
1520
|
+
	def rewrite(self,force = False,reloadInternalFromFile = None):
		"""Rewrite the whole file from the in-memory state when needed.

		Skipped (returns False) when not forced/deSynced and either nothing is
		dirty or the rewrite_interval has not elapsed since the file's mtime.
		When monitoring is enabled and the file changed externally, pending
		appends are committed and the table reloaded first. Returns True on a
		completed pass, False when skipped or on failure (which sets deSynced).
		"""
		if not self.deSynced and not force:
			if not self.dirty:
				return False
			# rate-limit full rewrites: rewrite_interval == 0 means "never rewrite here"
			if self.rewrite_interval == 0 or time.time() - os.path.getmtime(self._fileName) < self.rewrite_interval:
				return False
		try:

			if reloadInternalFromFile is None:
				reloadInternalFromFile = self.monitor_external_changes
			if reloadInternalFromFile and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
				# this will be needed if more than 1 process is accessing the file
				self.commitAppendToFile()
				self.reload()
			if self.memoryOnly:
				if self.verbose:
					self.__teePrintOrNot("Memory only mode. Map to file skipped.")
				return False
			if self.dirty:
				if self.verbose:
					self.__teePrintOrNot(f"Rewriting {self._fileName}")
				self.mapToFile()
				if self.verbose:
					self.__teePrintOrNot(f"{len(self)} records rewrote to {self._fileName}")
			# with no worker thread alive, flush queued appends synchronously
			if not self.appendThread.is_alive():
				self.commitAppendToFile()
			# else:
			# 	self.appendEvent.set()
			return True
		except Exception as e:
			self.__teePrintOrNot(f"Failed to write at sync() to {self._fileName}: {e}",'error')
			import traceback
			self.__teePrintOrNot(traceback.format_exc(),'error')
			self.deSynced = True
			return False
|
|
1555
|
+
|
|
1556
|
+
def hardMapToFile(self):
    """Rewrite the whole backing file from scratch ('wb') from the in-memory map.

    Writes the header (if any) and every record through a large buffered
    writer, then clears the dirty/deSynced flags.  On failure the instance
    is marked deSynced instead of raising.

    Returns
    -------
    self, for chaining.
    """
    file = None  # lets the except path tell whether the file was ever opened
    try:
        if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
            self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
        file = self.get_file_obj('wb')
        buf = io.BufferedWriter(file, buffer_size=64*1024*1024) # 64MB buffer
        if self.header:
            header = self.delimiter.join(_sanitize(self.header,delimiter=self.delimiter))
            buf.write(header.encode(self.encoding,errors='replace') + b'\n')
        for key in self:
            segments = _sanitize(self[key],delimiter=self.delimiter)
            buf.write(self.delimiter.join(segments).encode(encoding=self.encoding,errors='replace')+b'\n')
        buf.flush()
        self.release_file_obj(file)
        if self.verbose:
            self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
            self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
        self.dirty = False
        self.deSynced = False
    except Exception as e:
        # Fix: only release when the file was actually opened; previously a
        # failure before get_file_obj() raised NameError on the unbound name,
        # masking the original exception.
        if file is not None:
            self.release_file_obj(file)
        self.__teePrintOrNot(f"Failed to write at hardMapToFile() to {self._fileName}: {e}",'error')
        import traceback
        self.__teePrintOrNot(traceback.format_exc(),'error')
        self.deSynced = True
    return self
|
|
1582
|
+
|
|
1583
|
+
def mapToFile(self):
    """Patch the backing file in place from the in-memory map.

    Walks the file record by record, space-padding each rewritten row to the
    length of the row it replaces so later file offsets are preserved; once a
    length changes (or end of file is reached) it falls into plain sequential
    overwrite mode and truncates the remainder.  Compressed files cannot be
    patched in place and are delegated to hardMapToFile().

    Returns
    -------
    self, for chaining.
    """
    # Suspend external-change monitoring while we are the writer.
    mec = self.monitor_external_changes
    self.monitor_external_changes = False
    file = None  # lets the except path tell whether the file was ever opened
    try:
        # monitor_external_changes is False here, so always warn when the
        # on-disk file is newer than the state we are about to write.
        if self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
            self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
        if self._fileName.rpartition('.')[2] in COMPRESSED_FILE_EXTENSIONS:
            # if the file is compressed, we need to use the hardMapToFile method
            return self.hardMapToFile()
        file = self.get_file_obj('r+b')
        overWrite = False  # True once offsets have shifted: sequential rewrite mode
        if self.header:
            line = file.readline().decode(self.encoding,errors='replace')
            aftPos = file.tell()
            if not _lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = self.strict):
                header = self.delimiter.join(_sanitize(self.header,delimiter=self.delimiter))
                file.seek(0)
                file.write(f'{header}\n'.encode(encoding=self.encoding,errors='replace'))
                # if the header is not the same length as the line, we need to overwrite the file
                if aftPos != file.tell():
                    overWrite = True
                if self.verbose:
                    self.__teePrintOrNot(f"Header {header} written to {self._fileName}")
        for value in self.values():
            # records whose key starts with '#' are memory-only, never persisted
            if value[0].startswith('#'):
                continue
            segments = _sanitize(value,delimiter=self.delimiter)
            strToWrite = self.delimiter.join(segments)
            if overWrite:
                if self.verbose:
                    self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
                file.write(strToWrite.encode(encoding=self.encoding,errors='replace')+b'\n')
                continue
            pos = file.tell()
            line = file.readline()
            aftPos = file.tell()
            if not line or pos == aftPos:
                if self.verbose:
                    self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
                file.write(strToWrite.encode(encoding=self.encoding,errors='replace'))
                overWrite = True
                continue
            # fill the string with space to write to the correct length
            strToWrite = strToWrite.encode(encoding=self.encoding,errors='replace').ljust(len(line)-1)+b'\n'
            if line != strToWrite:
                if self.verbose:
                    self.__teePrintOrNot(f"Modifing {value} to {self._fileName}")
                file.seek(pos)
                file.write(strToWrite)
                if aftPos != file.tell():
                    overWrite = True
        file.truncate()
        self.release_file_obj(file)
        if self.verbose:
            self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
            self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
        self.dirty = False
        self.deSynced = False
    except Exception as e:
        # Fix: only release when the file was actually opened; previously a
        # failure before get_file_obj() raised NameError on the unbound name.
        if file is not None:
            self.release_file_obj(file)
        self.__teePrintOrNot(f"Failed to write at mapToFile() to {self._fileName}: {e}",'error')
        import traceback
        self.__teePrintOrNot(traceback.format_exc(),'error')
        self.deSynced = True
        self.__teePrintOrNot("Trying failback hardMapToFile()")
        self.hardMapToFile()
    finally:
        # Fix: restore monitoring even on the early compressed-file return,
        # which previously left monitor_external_changes stuck at False.
        self.monitor_external_changes = mec
    self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
    return self
|
|
1652
|
+
|
|
1653
|
+
def checkExternalChanges(self):
    """Compare the file's update time against the cached one and flag drift.

    Marks the instance deSynced when the file on disk is newer than the
    timestamp we last recorded; a cached timestamp newer than the file
    (clock anomaly) resets the cache instead.  No-op when already deSynced
    or when monitoring is disabled.  Returns self for chaining.
    """
    if self.deSynced or not self.monitor_external_changes:
        return self
    onDiskTime = getFileUpdateTimeNs(self._fileName)
    cachedTime = self.externalFileUpdateTime
    if cachedTime < onDiskTime:
        self.deSynced = True
        self.__teePrintOrNot(f"External changes detected in {self._fileName}")
    elif cachedTime > onDiskTime:
        self.__teePrintOrNot(f"Time anomalies detected in {self._fileName}, resetting externalFileUpdateTime")
        self.externalFileUpdateTime = onDiskTime
    return self
|
|
1666
|
+
|
|
1667
|
+
def _appendWorker(self):
    """Background thread body: periodically sync state and flush appends.

    Until shutdownEvent is set, each cycle (for file-backed instances)
    checks for external changes, runs a rewrite pass when needed, and
    commits queued appends, then sleeps for append_check_delay seconds.
    One final commit runs after shutdown so no queued rows are lost.
    """
    while True:
        if self.shutdownEvent.is_set():
            break
        if self.memoryOnly:
            time.sleep(self.append_check_delay)
            continue
        self.checkExternalChanges()
        self.rewrite()
        self.commitAppendToFile()
        time.sleep(self.append_check_delay)
    if self.verbose:
        self.__teePrintOrNot(f"Append worker for {self._fileName} shut down")
    self.commitAppendToFile()
|
|
1680
|
+
def commitAppendToFile(self):
    """Flush all queued append rows to the end of the backing file.

    In memory-only mode the queue is simply discarded.  Rows are drained
    from appendQueue, sanitized, and written through a large buffered
    writer.  On failure the instance is marked deSynced instead of raising.

    Returns
    -------
    self, for chaining.
    """
    if self.appendQueue:
        if self.memoryOnly:
            self.appendQueue.clear()
            if self.verbose:
                self.__teePrintOrNot("Memory only mode. Append queue cleared.")
            return self
        file = None  # lets the except path tell whether the file was ever opened
        try:
            if self.verbose:
                self.__teePrintOrNot(f"Commiting {len(self.appendQueue)} records to {self._fileName}")
                self.__teePrintOrNot(f"Before size of {self._fileName}: {os.path.getsize(self._fileName)}")
            file = self.get_file_obj('ab')
            buf = io.BufferedWriter(file, buffer_size=64*1024*1024) # 64MB buffer
            while self.appendQueue:
                line = _sanitize(self.appendQueue.popleft(),delimiter=self.delimiter)
                buf.write(self.delimiter.join(line).encode(encoding=self.encoding,errors='replace')+b'\n')
            buf.flush()
            self.release_file_obj(file)
            if self.verbose:
                self.__teePrintOrNot(f"Records commited to {self._fileName}")
                self.__teePrintOrNot(f"After size of {self._fileName}: {os.path.getsize(self._fileName)}")
        except Exception as e:
            # Fix: only release when the file was actually opened; previously a
            # failure before get_file_obj() raised NameError on the unbound name.
            if file is not None:
                self.release_file_obj(file)
            self.__teePrintOrNot(f"Failed to write at commitAppendToFile to {self._fileName}: {e}",'error')
            import traceback
            self.__teePrintOrNot(traceback.format_exc(),'error')
            self.deSynced = True
    return self
|
|
1708
|
+
|
|
1709
|
+
def stopAppendThread(self):
    """Shut down the background append worker exactly once.

    Runs a final rewrite pass (forced when rewrite_on_exit is set), signals
    the worker through shutdownEvent, and waits for it to terminate.
    Errors are logged rather than raised.
    """
    if self.shutdownEvent.is_set():
        # Worker already stopped; nothing left to do.
        return
    try:
        # Ensure any final sync operations are performed before signalling.
        self.rewrite(force=self.rewrite_on_exit)
        self.shutdownEvent.set()
        self.appendThread.join()
        if self.verbose:
            self.__teePrintOrNot(f"Append thread for {self._fileName} stopped")
    except Exception as e:
        self.__teePrintOrNot(f"Failed to stop append thread for {self._fileName}: {e}",'error')
        import traceback
        self.__teePrintOrNot(traceback.format_exc(),'error')
|
|
1726
|
+
def get_file_obj(self,modes = 'ab'):
    """Open the backing file, OS-lock it, and return the file object.

    Acquires the instance writeLock first, then opens the file (via
    openFileAsCompressed) and takes an exclusive OS-level lock (fcntl on
    POSIX, msvcrt on Windows).  release_file_obj() is the mandatory
    counterpart.

    Parameters
    ----------
    modes : str
        Binary open mode for the file (e.g. 'ab', 'r+b', 'wb').

    Raises
    ------
    Exception
        Re-raises whatever open/lock failure occurred, after releasing the
        writeLock and logging.  (Previously the failure path fell through to
        ``return file`` with ``file`` unbound, raising a misleading
        NameError instead of the real error.)
    """
    self.writeLock.acquire()
    try:
        if not self.encoding:
            self.encoding = 'utf8'
        file = openFileAsCompressed(self._fileName, mode=modes, encoding=self.encoding,teeLogger=self.teeLogger)
        # Lock the file after opening
        if os.name == 'posix':
            fcntl.lockf(file, fcntl.LOCK_EX)
        elif os.name == 'nt':
            # For Windows, locking the entire file, avoiding locking an empty file
            #lock_length = max(1, os.path.getsize(self._fileName))
            lock_length = 2147483647
            msvcrt.locking(file.fileno(), msvcrt.LK_LOCK, lock_length)
        if self.verbose:
            self.__teePrintOrNot(f"File {self._fileName} locked with mode {modes}")
    except Exception as e:
        try:
            self.writeLock.release() # Release the thread lock in case of an error
        except Exception as e2:
            # Fix: inner exception no longer shadows 'e' (the shadowed name was
            # deleted on leaving this clause, breaking the log line below).
            self.__teePrintOrNot(f"Failed to release writeLock for {self._fileName}: {e2}",'error')
        self.__teePrintOrNot(f"Failed to open file {self._fileName}: {e}",'error')
        raise
    return file
|
|
1749
|
+
|
|
1750
|
+
def release_file_obj(self,file):
    """Flush, OS-unlock, and close *file*, then release the writeLock.

    Counterpart of get_file_obj().  Also refreshes the cached
    externalFileUpdateTime at the end so our own write is not later
    mistaken for an external change.

    NOTE(review): threading.Lock.locked() reports whether *any* thread
    holds the lock, not whether the current thread does — this guard is
    presumably safe because get_file_obj()/release_file_obj() are always
    paired on the same thread; verify against callers.
    NOTE(review): the inner ``except Exception as e`` shadows the outer
    ``e`` and Python deletes it on leaving the clause, so the following
    log line would raise if the inner release ever failed.
    """
    # if write lock is already released, return
    if not self.writeLock.locked():
        return
    try:
        file.flush() # Ensure the file is flushed before unlocking
        os.fsync(file.fileno()) # Ensure the file is synced to disk before unlocking
        if not file.closed:
            if os.name == 'posix':
                fcntl.lockf(file, fcntl.LOCK_UN)
            elif os.name == 'nt':
                # Unlocking the entire file; for Windows, ensure not unlocking an empty file
                #unlock_length = max(1, os.path.getsize(os.path.realpath(file.name)))
                unlock_length = 2147483647
                try:
                    msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
                except Exception:
                    pass
            file.close() # Ensure file is closed after unlocking
        if self.verbose:
            self.__teePrintOrNot(f"File {file.name} unlocked / released")
    except Exception as e:
        try:
            self.writeLock.release() # Ensure the thread lock is always released
        except Exception as e:
            self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
        self.__teePrintOrNot(f"Failed to release file {file.name}: {e}",'error')
        import traceback
        self.__teePrintOrNot(traceback.format_exc(),'error')
    # release the write lock if not already released
    if self.writeLock.locked():
        try:
            self.writeLock.release() # Ensure the thread lock is always released
        except Exception as e:
            self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
    self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
|
|
1786
|
+
|
|
1787
|
+
class TSVZedLite(MutableMapping):
|
|
1788
|
+
"""
|
|
1789
|
+
A mutable mapping class that provides a dictionary-like interface to a Tabular (TSV by default) file.
|
|
1790
|
+
TSVZedLite stores key-value pairs where each row in the file represents an entry, with the first
|
|
1791
|
+
column serving as the key. The class maintains an in-memory index of file positions for efficient
|
|
1792
|
+
random access while keeping the actual data on disk.
|
|
1793
|
+
TSVZedLite is designed for light memory footprint and forgoes some features from TSVZed, Notably,
|
|
1794
|
+
- Does not support simultaneous multi-process access.
|
|
1795
|
+
- Does not support compressed file formats.
|
|
1796
|
+
- Does not support automatic file rewriting on load / exit / periodically.
|
|
1797
|
+
- Does not support append worker thread for background writes.
|
|
1798
|
+
- Does not support external file change monitoring.
|
|
1799
|
+
- Does not support in-place updates; updates are append-only.
|
|
1800
|
+
- Does not support logging via teeLogger.
|
|
1801
|
+
- Does not support move_to_end method.
|
|
1802
|
+
- Does not support in-memory only mode. ( please just use a dict )
|
|
1803
|
+
- Does not lock the file during operations.
|
|
1804
|
+
- Does not track last update times.
|
|
1805
|
+
|
|
1806
|
+
However, it may be preferred in scenarios when:
|
|
1807
|
+
- Memory usage needs to be minimized.
|
|
1808
|
+
- Working with extremely large datasets where loading everything into memory is impractical.
|
|
1809
|
+
- Simplicity and ease of use are prioritized over advanced features.
|
|
1810
|
+
- The dataset is primarily write-only with infrequent reads.
|
|
1811
|
+
- The application can tolerate the lack of concurrency control. (single process access only)
|
|
1812
|
+
- Underlying file system is fast and can do constant time random seek (e.g., SSD).
|
|
1813
|
+
|
|
1814
|
+
Note: It is possible to load a custom dict like object for indexes (like TSVZed or pre-built dict)
|
|
1815
|
+
to avoid reading the entire data file to load the indexes at startup.
|
|
1816
|
+
Index consistency is not enforced in this case.
|
|
1817
|
+
Will raise error if mismatch happen (only checkes key exist in file) and strict mode is enabled.
|
|
1818
|
+
If using an external file-backed Index. This can function similar to a key-value store (like nosql).
|
|
1819
|
+
|
|
1820
|
+
Parameters
|
|
1821
|
+
----------
|
|
1822
|
+
fileName : str
|
|
1823
|
+
Path to the Tabular file to read from or create.
|
|
1824
|
+
header : str, optional
|
|
1825
|
+
Header row for the file. Can be a delimited string or empty string (default: '').
|
|
1826
|
+
createIfNotExist : bool, optional
|
|
1827
|
+
If True, creates the file if it doesn't exist (default: True).
|
|
1828
|
+
verifyHeader : bool, optional
|
|
1829
|
+
If True, verifies that the file header matches the provided header (default: True).
|
|
1830
|
+
verbose : bool, optional
|
|
1831
|
+
If True, prints detailed operation information to stderr (default: False).
|
|
1832
|
+
encoding : str, optional
|
|
1833
|
+
Character encoding for the file (default: 'utf8').
|
|
1834
|
+
delimiter : str, optional
|
|
1835
|
+
Field delimiter character. If Ellipsis (...), automatically detects from filename (default: ...).
|
|
1836
|
+
defaults : str, list, or None, optional
|
|
1837
|
+
Default values for columns. Can be a delimited string, list, or None (default: None).
|
|
1838
|
+
strict : bool, optional
|
|
1839
|
+
If True, enforces strict column count validation and raises errors on mismatches (default: True).
|
|
1840
|
+
correctColumnNum : int, optional
|
|
1841
|
+
Expected number of columns. -1 means auto-detect (default: -1).
|
|
1842
|
+
indexes : dict, optional
|
|
1843
|
+
Pre-existing index dictionary mapping keys to file positions (default: ...).
|
|
1844
|
+
fileObj : file object, optional
|
|
1845
|
+
Pre-existing file object to use (default: ...).
|
|
1846
|
+
Attributes
|
|
1847
|
+
----------
|
|
1848
|
+
version : str
|
|
1849
|
+
Version identifier for the TSVZedLite format.
|
|
1850
|
+
indexes : dict
|
|
1851
|
+
Dictionary mapping keys to their file positions (or in-memory data for keys starting with '#').
|
|
1852
|
+
fileObj : file object
|
|
1853
|
+
Binary file object for reading/writing the underlying file.
|
|
1854
|
+
defaults : list
|
|
1855
|
+
List of default values for columns, with DEFAULTS_INDICATOR_KEY as the first element.
|
|
1856
|
+
correctColumnNum : int
|
|
1857
|
+
The validated number of columns per row.
|
|
1858
|
+
Notes
|
|
1859
|
+
-----
|
|
1860
|
+
- Keys starting with '#' are stored in memory only and not written to file.
|
|
1861
|
+
- The special key DEFAULTS_INDICATOR_KEY is used to store and retrieve default column values.
|
|
1862
|
+
- Empty values in rows are automatically filled with defaults if available.
|
|
1863
|
+
- The class implements the MutableMapping interface, providing dict-like operations.
|
|
1864
|
+
- File operations are buffered and written immediately (append-only for updates).
|
|
1865
|
+
- Deleted entries are marked by writing a row with only the key (empty values).
|
|
1866
|
+
Examples
|
|
1867
|
+
--------
|
|
1868
|
+
>>> db = TSVZedLite('data.tsv', header='id\tname\tage')
|
|
1869
|
+
>>> db['user1'] = ['user1', 'Alice', '30']
|
|
1870
|
+
>>> print(db['user1'])
|
|
1871
|
+
['user1', 'Alice', '30']
|
|
1872
|
+
>>> del db['user1']
|
|
1873
|
+
>>> 'user1' in db
|
|
1874
|
+
False
|
|
1875
|
+
See Also
|
|
1876
|
+
--------
|
|
1877
|
+
collections.abc.MutableMapping : The abstract base class that this class implements.
|
|
1878
|
+
"""
|
|
1879
|
+
|
|
1880
|
+
#['__new__', '__repr__', '__hash__', '__lt__', '__le__', '__eq__', '__ne__', '__gt__', '__ge__', '__iter__', '__init__',
|
|
1881
|
+
# '__or__', '__ror__', '__ior__', '__len__', '__getitem__', '__setitem__', '__delitem__', '__contains__', '__sizeof__',
|
|
1882
|
+
# 'get', 'setdefault', 'pop', 'popitem', 'keys', 'items', 'values', 'update', 'fromkeys', 'clear', 'copy', '__reversed__',
|
|
1883
|
+
# '__class_getitem__', '__doc__']
|
|
1884
|
+
def __init__ (self,fileName,header = '',createIfNotExist = True,verifyHeader = True,
|
|
1885
|
+
verbose = False,encoding = 'utf8',
|
|
1886
|
+
delimiter = ...,defaults = None,strict = True,correctColumnNum = -1,
|
|
1887
|
+
indexes = ..., fileObj = ...
|
|
1888
|
+
):
|
|
1889
|
+
self.version = version
|
|
1890
|
+
self.strict = strict
|
|
1891
|
+
self._fileName = fileName
|
|
1892
|
+
self.delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
1893
|
+
self.setDefaults(defaults)
|
|
1894
|
+
self.header = _formatHeader(header,verbose = verbose,delimiter=self.delimiter)
|
|
1895
|
+
self.correctColumnNum = correctColumnNum
|
|
1896
|
+
self.createIfNotExist = createIfNotExist
|
|
1897
|
+
self.verifyHeader = verifyHeader
|
|
1898
|
+
self.verbose = verbose
|
|
1899
|
+
self.encoding = encoding
|
|
1900
|
+
if indexes is ...:
|
|
1901
|
+
self.indexes = dict()
|
|
1902
|
+
self.load()
|
|
1903
|
+
else:
|
|
1904
|
+
self.indexes = indexes
|
|
1905
|
+
if fileObj is ...:
|
|
1906
|
+
self.fileObj = open(self._fileName,'r+b')
|
|
1907
|
+
else:
|
|
1908
|
+
self.fileObj = fileObj
|
|
1909
|
+
atexit.register(self.close)
|
|
1910
|
+
|
|
1911
|
+
# Implement custom methods just for TSVZedLite
|
|
1912
|
+
def getResourceUsage(self,return_dict = False):
|
|
1913
|
+
return get_resource_usage(return_dict = return_dict)
|
|
1914
|
+
|
|
1915
|
+
def setDefaults(self,defaults):
|
|
1916
|
+
if not defaults:
|
|
1917
|
+
defaults = []
|
|
1918
|
+
if isinstance(defaults,str):
|
|
1919
|
+
defaults = defaults.split(self.delimiter)
|
|
1920
|
+
elif not isinstance(defaults,list):
|
|
1921
|
+
try:
|
|
1922
|
+
defaults = list(defaults)
|
|
1923
|
+
except Exception:
|
|
1924
|
+
if self.verbose:
|
|
1925
|
+
eprint('Error: Invalid defaults, setting defaults to empty.')
|
|
1926
|
+
defaults = []
|
|
1927
|
+
defaults = [str(s).rstrip() if s else '' for s in defaults]
|
|
1928
|
+
if not any(defaults):
|
|
1929
|
+
defaults = []
|
|
1930
|
+
if not defaults or defaults[0] != DEFAULTS_INDICATOR_KEY:
|
|
1931
|
+
defaults = [DEFAULTS_INDICATOR_KEY]+defaults
|
|
1932
|
+
self.defaults = defaults
|
|
1933
|
+
|
|
1934
|
+
def load(self):
|
|
1935
|
+
if self.verbose:
|
|
1936
|
+
eprint(f"Loading {self._fileName}")
|
|
1937
|
+
readTabularFile(self._fileName, header = self.header, createIfNotExist = self.createIfNotExist,
|
|
1938
|
+
verifyHeader = self.verifyHeader, verbose = self.verbose, taskDic = self.indexes,
|
|
1939
|
+
encoding = self.encoding if self.encoding else None, strict = self.strict,
|
|
1940
|
+
delimiter = self.delimiter, defaults=self.defaults,storeOffset=True)
|
|
1941
|
+
return self
|
|
1942
|
+
|
|
1943
|
+
def positions(self):
|
|
1944
|
+
return self.indexes.values()
|
|
1945
|
+
|
|
1946
|
+
def reload(self):
|
|
1947
|
+
self.indexes.clear()
|
|
1948
|
+
return self.load()
|
|
1949
|
+
|
|
1950
|
+
def getListView(self):
|
|
1951
|
+
return getListView(self,header=self.header,delimiter=self.delimiter)
|
|
1952
|
+
|
|
1953
|
+
def clear_file(self):
|
|
1954
|
+
if self.verbose:
|
|
1955
|
+
eprint(f"Clearing {self._fileName}")
|
|
1956
|
+
self.fileObj.seek(0)
|
|
1957
|
+
self.fileObj.truncate()
|
|
1958
|
+
if self.verbose:
|
|
1959
|
+
eprint(f"File {self._fileName} cleared empty")
|
|
1960
|
+
if self.header:
|
|
1961
|
+
location = self.__writeValues(self.header)
|
|
1962
|
+
if self.verbose:
|
|
1963
|
+
eprint(f"Header {self.header} written to {self._fileName}")
|
|
1964
|
+
eprint(f"At {location} size: {self.fileObj.tell()}")
|
|
1965
|
+
return self
|
|
1966
|
+
|
|
1967
|
+
def switchFile(self,newFileName,createIfNotExist = ...,verifyHeader = ...):
|
|
1968
|
+
if createIfNotExist is ...:
|
|
1969
|
+
createIfNotExist = self.createIfNotExist
|
|
1970
|
+
if verifyHeader is ...:
|
|
1971
|
+
verifyHeader = self.verifyHeader
|
|
1972
|
+
self.fileObj.close()
|
|
1973
|
+
self._fileName = newFileName
|
|
1974
|
+
self.reload()
|
|
1975
|
+
self.fileObj = open(self._fileName,'r+b')
|
|
1976
|
+
self.createIfNotExist = createIfNotExist
|
|
1977
|
+
self.verifyHeader = verifyHeader
|
|
1978
|
+
return self
|
|
1979
|
+
|
|
1980
|
+
# Private methods for reading and writing values for TSVZedLite
|
|
1981
|
+
|
|
1982
|
+
def __writeValues(self,data):
|
|
1983
|
+
self.fileObj.seek(0, os.SEEK_END)
|
|
1984
|
+
write_at = self.fileObj.tell()
|
|
1985
|
+
if self.verbose:
|
|
1986
|
+
eprint(f"Writing at position {write_at}")
|
|
1987
|
+
data = _sanitize(data,delimiter=self.delimiter)
|
|
1988
|
+
data = self.delimiter.join(data)
|
|
1989
|
+
bytes = self.fileObj.write((data.encode(encoding=self.encoding,errors='replace') + b'\n'))
|
|
1990
|
+
if self.verbose:
|
|
1991
|
+
eprint(f"Wrote {bytes} bytes")
|
|
1992
|
+
return write_at
|
|
1993
|
+
|
|
1994
|
+
def __mapDeleteToFile(self,key):
|
|
1995
|
+
if key == DEFAULTS_INDICATOR_KEY:
|
|
1996
|
+
self.defaults = [DEFAULTS_INDICATOR_KEY]
|
|
1997
|
+
if self.verbose:
|
|
1998
|
+
eprint("Defaults cleared")
|
|
1999
|
+
# delete the key from the dictionary and update the file
|
|
2000
|
+
elif key not in self.indexes:
|
|
2001
|
+
if self.verbose:
|
|
2002
|
+
eprint(f"Key {key} not found")
|
|
2003
|
+
return
|
|
2004
|
+
elif key.startswith('#'):
|
|
2005
|
+
if self.verbose:
|
|
2006
|
+
eprint(f"Key {key} deleted in memory")
|
|
2007
|
+
return
|
|
2008
|
+
if self.verbose:
|
|
2009
|
+
eprint(f"Appending empty line {key}")
|
|
2010
|
+
self.indexes[key] = self.__writeValues([key])
|
|
2011
|
+
|
|
2012
|
+
def __readValuesAtPos(self,pos,key = ...):
|
|
2013
|
+
self.fileObj.seek(pos)
|
|
2014
|
+
line = self.fileObj.readline().decode(self.encoding,errors='replace')
|
|
2015
|
+
self.correctColumnNum, segments = _processLine(
|
|
2016
|
+
line=line,
|
|
2017
|
+
taskDic={},
|
|
2018
|
+
correctColumnNum=self.correctColumnNum,
|
|
2019
|
+
strict=self.strict,
|
|
2020
|
+
delimiter=self.delimiter,
|
|
2021
|
+
defaults=self.defaults,
|
|
2022
|
+
storeOffset=True,
|
|
2023
|
+
)
|
|
2024
|
+
if self.verbose:
|
|
2025
|
+
eprint(f"Read at position {pos}: {segments}")
|
|
2026
|
+
if key is not ... and segments[0] != key:
|
|
2027
|
+
eprint(f"Warning: Key mismatch at position {pos}: expected {key}, got {segments[0]}")
|
|
2028
|
+
if self.strict:
|
|
2029
|
+
eprint("Error: Key mismatch and strict mode enabled. Raising KeyError.")
|
|
2030
|
+
raise KeyError(key)
|
|
2031
|
+
else :
|
|
2032
|
+
eprint("Continuing despite key mismatch due to non-strict mode. Expect errors!")
|
|
2033
|
+
return segments
|
|
2034
|
+
|
|
2035
|
+
# Implement basic __getitem__, __setitem__, __delitem__, __iter__, and __len__. needed for MutableMapping
|
|
2036
|
+
def __getitem__(self,key):
|
|
2037
|
+
key = str(key).rstrip()
|
|
2038
|
+
if key not in self.indexes:
|
|
2039
|
+
if key == DEFAULTS_INDICATOR_KEY:
|
|
2040
|
+
return self.defaults
|
|
2041
|
+
raise KeyError(key)
|
|
2042
|
+
pos = self.indexes[key]
|
|
2043
|
+
return self.__readValuesAtPos(pos,key)
|
|
2044
|
+
|
|
2045
|
+
def __setitem__(self,key,value):
|
|
2046
|
+
key = str(key).rstrip()
|
|
2047
|
+
if not key:
|
|
2048
|
+
eprint('Error: Key cannot be empty')
|
|
2049
|
+
return
|
|
2050
|
+
if isinstance(value,str):
|
|
2051
|
+
value = value.split(self.delimiter)
|
|
2052
|
+
# sanitize the value
|
|
2053
|
+
value = [str(s).rstrip() if s else '' for s in value]
|
|
2054
|
+
# the first field in value should be the key
|
|
2055
|
+
# add it if it is not there
|
|
2056
|
+
if not value or value[0] != key:
|
|
2057
|
+
value = [key]+value
|
|
2058
|
+
# verify the value has the correct number of columns
|
|
2059
|
+
if self.correctColumnNum != 1 and len(value) == 1:
|
|
2060
|
+
# this means we want to clear / delete the key
|
|
2061
|
+
del self[key]
|
|
2062
|
+
elif self.correctColumnNum > 0:
|
|
2063
|
+
if len(value) != self.correctColumnNum:
|
|
2064
|
+
if self.strict:
|
|
2065
|
+
eprint(f"Error: Value {value} does not have the correct number of columns: {self.correctColumnNum}. Refuse adding key...")
|
|
2066
|
+
return
|
|
2067
|
+
elif self.verbose:
|
|
2068
|
+
eprint(f"Warning: Value {value} does not have the correct number of columns: {self.correctColumnNum}, correcting...")
|
|
2069
|
+
if len(value) < self.correctColumnNum:
|
|
2070
|
+
value += ['']*(self.correctColumnNum-len(value))
|
|
2071
|
+
elif len(value) > self.correctColumnNum:
|
|
2072
|
+
value = value[:self.correctColumnNum]
|
|
2073
|
+
else:
|
|
2074
|
+
self.correctColumnNum = len(value)
|
|
2075
|
+
if self.defaults and len(self.defaults) > 1:
|
|
2076
|
+
for i in range(1,len(value)):
|
|
2077
|
+
if not value[i] and i < len(self.defaults) and self.defaults[i]:
|
|
2078
|
+
value[i] = self.defaults[i]
|
|
2079
|
+
if self.verbose:
|
|
2080
|
+
eprint(f" Replacing empty value at {i} with default: {self.defaults[i]}")
|
|
2081
|
+
if key == DEFAULTS_INDICATOR_KEY:
|
|
2082
|
+
self.defaults = value
|
|
2083
|
+
if self.verbose:
|
|
2084
|
+
eprint(f"Defaults set to {value}")
|
|
2085
|
+
elif key.startswith('#'):
|
|
2086
|
+
if self.verbose:
|
|
2087
|
+
eprint(f"Key {key} updated in memory (data in index) as it starts with #")
|
|
2088
|
+
self.indexes[key] = value
|
|
2089
|
+
return
|
|
2090
|
+
if self.verbose:
|
|
2091
|
+
eprint(f"Writing {key}: {value}")
|
|
2092
|
+
self.indexes[key] = self.__writeValues(value)
|
|
2093
|
+
|
|
2094
|
+
def __delitem__(self,key):
|
|
2095
|
+
key = str(key).rstrip()
|
|
2096
|
+
self.indexes.pop(key,None)
|
|
2097
|
+
self.__mapDeleteToFile(key)
|
|
2098
|
+
|
|
2099
|
+
def __iter__(self):
|
|
2100
|
+
return iter(self.indexes)
|
|
2101
|
+
|
|
2102
|
+
def __len__(self):
|
|
2103
|
+
return len(self.indexes)
|
|
2104
|
+
|
|
2105
|
+
# Implement additional methods for dict like interface (order of function are somewhat from OrderedDict)
|
|
2106
|
+
def __reversed__(self):
|
|
2107
|
+
return reversed(self.indexes)
|
|
2108
|
+
|
|
2109
|
+
def clear(self):
|
|
2110
|
+
# clear the dictionary and update the file
|
|
2111
|
+
self.indexes.clear()
|
|
2112
|
+
self.clear_file()
|
|
2113
|
+
return self
|
|
2114
|
+
|
|
2115
|
+
def popitem(self, last=True,return_pos = False):
|
|
2116
|
+
if last:
|
|
2117
|
+
key, pos = self.indexes.popitem()
|
|
2118
|
+
else:
|
|
2119
|
+
try:
|
|
2120
|
+
key = next(iter(self.indexes))
|
|
2121
|
+
pos = self.indexes.pop(key)
|
|
2122
|
+
except StopIteration:
|
|
2123
|
+
raise KeyError("popitem(): dictionary is empty")
|
|
2124
|
+
if return_pos:
|
|
2125
|
+
value = pos
|
|
2126
|
+
else:
|
|
2127
|
+
value = self.__readValuesAtPos(pos,key)
|
|
2128
|
+
self.__mapDeleteToFile(key)
|
|
2129
|
+
return key, value
|
|
2130
|
+
|
|
2131
|
+
__marker = object()
|
|
2132
|
+
def pop(self, key, default=__marker, return_pos = False):
|
|
2133
|
+
key = str(key).rstrip()
|
|
2134
|
+
try:
|
|
2135
|
+
pos = self.indexes.pop(key)
|
|
2136
|
+
except KeyError:
|
|
2137
|
+
if default is self.__marker:
|
|
2138
|
+
raise KeyError(key)
|
|
2139
|
+
elif default is ...:
|
|
2140
|
+
return self.defaults
|
|
2141
|
+
return default
|
|
2142
|
+
if return_pos:
|
|
2143
|
+
value = pos
|
|
2144
|
+
else:
|
|
2145
|
+
value = self.__readValuesAtPos(pos,key)
|
|
2146
|
+
self.__mapDeleteToFile(key)
|
|
2147
|
+
return value
|
|
2148
|
+
|
|
2149
|
+
def __sizeof__(self):
|
|
2150
|
+
sizeof = sys.getsizeof
|
|
2151
|
+
size = sizeof(super()) + sizeof(True) * 6 # for the booleans / integers
|
|
2152
|
+
size += sizeof(self._fileName)
|
|
2153
|
+
size += sizeof(self.header)
|
|
2154
|
+
size += sizeof(self.encoding)
|
|
2155
|
+
size += sizeof(self.delimiter)
|
|
2156
|
+
size += sizeof(self.defaults)
|
|
2157
|
+
size += sizeof(self.indexes)
|
|
2158
|
+
size += sizeof(self.fileObj)
|
|
2159
|
+
return size
|
|
2160
|
+
|
|
2161
|
+
def __repr__(self):
|
|
2162
|
+
return f"""TSVZed at {hex(id(self))}(
|
|
2163
|
+
file_name:{self._fileName}
|
|
2164
|
+
index_count:{len(self.indexes)}
|
|
2165
|
+
header:{self.header}
|
|
2166
|
+
correctColumnNum:{self.correctColumnNum}
|
|
2167
|
+
createIfNotExist:{self.createIfNotExist}
|
|
2168
|
+
verifyHeader:{self.verifyHeader}
|
|
2169
|
+
strict:{self.strict}
|
|
2170
|
+
delimiter:{self.delimiter}
|
|
2171
|
+
defaults:{self.defaults}
|
|
2172
|
+
verbose:{self.verbose}
|
|
2173
|
+
encoding:{self.encoding}
|
|
2174
|
+
file_descriptor:{self.fileObj.fileno()}
|
|
2175
|
+
)"""
|
|
2176
|
+
|
|
2177
|
+
def __str__(self):
|
|
2178
|
+
return f"TSVZedLite({self._fileName})"
|
|
2179
|
+
|
|
2180
|
+
def __reduce__(self):
|
|
2181
|
+
'Return state information for pickling'
|
|
2182
|
+
# Return minimal state needed to reconstruct
|
|
2183
|
+
return (
|
|
2184
|
+
self.__class__,
|
|
2185
|
+
(self._fileName, self.header, self.createIfNotExist, self.verifyHeader,
|
|
2186
|
+
self.verbose, self.encoding, self.delimiter, self.defaults, self.strict,
|
|
2187
|
+
self.correctColumnNum),
|
|
2188
|
+
None,
|
|
2189
|
+
None,
|
|
2190
|
+
None
|
|
2191
|
+
)
|
|
2192
|
+
def copy(self):
|
|
2193
|
+
'Return a shallow copy of the ordered dictionary.'
|
|
2194
|
+
new = self.__class__(
|
|
2195
|
+
self._fileName,
|
|
2196
|
+
self.header,
|
|
2197
|
+
self.createIfNotExist,
|
|
2198
|
+
self.verifyHeader,
|
|
2199
|
+
self.verbose,
|
|
2200
|
+
self.encoding,
|
|
2201
|
+
self.delimiter,
|
|
2202
|
+
self.defaults,
|
|
2203
|
+
self.strict,
|
|
2204
|
+
self.correctColumnNum,
|
|
2205
|
+
self.indexes,
|
|
2206
|
+
self.fileObj,
|
|
2207
|
+
)
|
|
2208
|
+
eprint("""
|
|
2209
|
+
Warning: Copying TSVZedLite will share the same file object and indexes.
|
|
2210
|
+
Changes in one will affect the other.
|
|
2211
|
+
There is likely very little reason to copy a TSVZedLite instance unless you are immadiately then calling switchFile() on it.
|
|
2212
|
+
""")
|
|
2213
|
+
return new
|
|
2214
|
+
|
|
2215
|
+
@classmethod
def fromkeys(cls, iterable, value=None,fileName = None,header = '',createIfNotExist = True,verifyHeader = True,verbose = False,encoding = 'utf8',
             delimiter = ...,defaults = None,strict = True,correctColumnNum = -1):
    """Create a new ordered dictionary with keys from *iterable*, each
    mapped to *value*.

    The remaining keyword arguments are forwarded verbatim to the
    class constructor for the newly created instance.
    """
    instance = cls(fileName, header, createIfNotExist, verifyHeader, verbose,
                   encoding, delimiter, defaults, strict, correctColumnNum)
    # Assign one key at a time so the class's own __setitem__ hook runs.
    for key in iterable:
        instance[key] = value
    return instance
|
|
2224
|
+
|
|
2225
|
+
def __eq__(self, other):
    """Equality check.

    Two TSVZedLite instances compare by their ``indexes`` only (a
    warning is emitted); anything else falls back to the parent
    class's comparison.
    """
    if not isinstance(other, TSVZedLite):
        return super().__eq__(other)
    eprint("Warning: Comparing two TSVZedLite instances will only compare their indexes. Data content is not compared.")
    return self.indexes == other.indexes
|
|
2230
|
+
|
|
2231
|
+
def __ior__(self, other):
    """In-place merge (``self |= other``): delegates to update() and
    returns the instance itself, as the |= protocol requires."""
    self.update(other)
    return self
|
|
2234
|
+
|
|
2235
|
+
# Context-manager protocol: ``with`` support so the backing file is
# closed deterministically on block exit.
def __enter__(self):
    """Enter the runtime context; the mapping itself is the target."""
    return self
|
|
2238
|
+
|
|
2239
|
+
def close(self):
    """Close the underlying file object.

    Returns the instance so calls can be chained (fluent style).
    """
    self.fileObj.close()
    return self
|
|
2242
|
+
|
|
2243
|
+
def __exit__(self, exc_type, exc_value, traceback):
    """Close the backing file when leaving a ``with`` block.

    Bug fix: the previous implementation did ``return self.close()``,
    and ``close()`` returns ``self`` -- a truthy value. A truthy
    return from ``__exit__`` tells the interpreter to SUPPRESS any
    exception raised inside the ``with`` body, silently swallowing
    errors. Returning ``False`` closes the file while letting
    exceptions propagate normally.
    """
    self.close()
    return False
|
|
2245
|
+
|
|
2246
|
+
|
|
1515
2247
|
|
|
1516
2248
|
|
|
1517
2249
|
def __main__():
    """Command-line entry point: read / append / delete / clear / scrub a tabular file."""
    import argparse
    parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
    parser.add_argument('filename', type=str, help='The file to read')
    parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear','scrub'], help='The operation to perform. Note: scrub will also remove all comments. Default: read', default='read')
    parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
    # default=... (Ellipsis) is the sentinel get_delimiter uses to mean "infer from filename"
    parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
    parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
    parser.add_argument('--defaults', type=str, help='Default values to fill in the missing columns. seperate using --delimiter. Ex. if -d = comma, --defaults="key,value1,value2..." Note: Please specify the key. But it will not be used as a key need to be unique in data.')
    # -s/-f both write args.strict; mutually exclusive so only one can win.
    strictMode = parser.add_mutually_exclusive_group()
    strictMode.add_argument('-s', '--strict', dest = 'strict',action='store_true', help='Strict mode. Do not parse values that seems malformed, check for column numbers / headers')
    strictMode.add_argument('-f', '--force', dest = 'strict',action='store_false', help='Force the operation. Ignore checks for column numbers / headers')
    parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
    parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} @ {COMMIT_DATE} by {author}')
    args = parser.parse_args()
    # Resolve the delimiter (may be inferred from the file extension).
    args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
    # A lone trailing backslash would make unicode_escape decoding fail;
    # doubling it turns it into a literal backslash instead.
    if args.header and args.header.endswith('\\'):
        args.header += '\\'
    try:
        # Interpret escape sequences (\t, \n, ...) typed on the command line.
        header = args.header.encode().decode('unicode_escape') if args.header else ''
    except Exception:
        print(f"Failed to decode header: {args.header}")
        header = ''
    defaults = []
    if args.defaults:
        try:
            # Same escape handling for defaults, then split into columns.
            defaults = args.defaults.encode().decode('unicode_escape').split(args.delimiter)
        except Exception:
            print(f"Failed to decode defaults: {args.defaults}")
            defaults = []

    if args.operation == 'read':
        # check if the file exist
        if not os.path.isfile(args.filename):
            print(f"File not found: {args.filename}")
            return
        # read the file
        data = readTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
        print(pretty_format_table(data.values(),delimiter=args.delimiter))
    elif args.operation == 'append':
        appendTabularFile(args.filename, args.line,createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
    elif args.operation == 'delete':
        # Deletion is expressed as appending the bare key (no values);
        # per the 'line' help text, a key without values removes the entry.
        appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
    elif args.operation == 'clear':
        clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=args.strict, delimiter=args.delimiter)
    elif args.operation == 'scrub':
        # scrub rewrites the file compactly; comments are dropped in the process.
        scrubTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
    else:
        print("Invalid operation")
|
|
1566
2298
|
# CLI entry point when the module is executed directly.
if __name__ == '__main__':
    __main__()
|