TSVZ 2.70__py3-none-any.whl → 3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {TSVZ-2.70.dist-info → TSVZ-3.10.dist-info}/METADATA +57 -56
- TSVZ-3.10.dist-info/RECORD +6 -0
- {TSVZ-2.70.dist-info → TSVZ-3.10.dist-info}/entry_points.txt +0 -0
- {TSVZ-2.70.dist-info → TSVZ-3.10.dist-info}/top_level.txt +0 -0
- TSVZ.py +416 -179
- TSVZ-2.70.dist-info/LICENSE +0 -674
- TSVZ-2.70.dist-info/RECORD +0 -7
- {TSVZ-2.70.dist-info → TSVZ-3.10.dist-info}/WHEEL +0 -0
TSVZ.py
CHANGED
|
@@ -4,59 +4,91 @@ from collections import OrderedDict , deque
|
|
|
4
4
|
import time
|
|
5
5
|
import atexit
|
|
6
6
|
import threading
|
|
7
|
+
import re
|
|
7
8
|
|
|
8
9
|
if os.name == 'nt':
|
|
9
10
|
import msvcrt
|
|
10
11
|
elif os.name == 'posix':
|
|
11
12
|
import fcntl
|
|
12
13
|
|
|
13
|
-
version = '
|
|
14
|
+
version = '3.10'
|
|
14
15
|
author = 'pan@zopyr.us'
|
|
15
16
|
|
|
17
|
+
DEFAULT_DELIMITER = '\t'
|
|
18
|
+
DEFAULTS_INDICATOR_KEY = '#_defaults_#'
|
|
16
19
|
|
|
17
|
-
def
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
20
|
+
def get_delimiter(delimiter,file_name = ''):
    """
    Resolve a delimiter specification into the actual delimiter string.

    Parameters:
    - delimiter (str, Ellipsis or None): The delimiter specification.
        * A falsy value (None, '') returns the current module default (DEFAULT_DELIMITER).
        * Ellipsis (...) infers the delimiter from the extension of file_name:
          '.csv' -> ',', '.nsv' -> '\\0', '.psv' -> '|', anything else -> '\\t'.
        * 'comma', 'tab', 'pipe' and 'null' are aliases for ',', '\\t', '|' and '\\0'.
        * Any other string is used as the delimiter after its backslash escape
          sequences (for example the two characters '\\' 't') are interpreted.
    - file_name (str, optional): The file name used for inference when delimiter is Ellipsis. Defaults to ''.

    Returns:
    - str: The resolved delimiter.

    Side effects:
    - Every successfully resolved (non-falsy) specification is stored in the module level
      DEFAULT_DELIMITER, so a later call with a falsy delimiter returns the last resolved one.
    """
    # The assignment at the end of this function would otherwise make the name
    # local to the function, turning the falsy-delimiter return below into an
    # UnboundLocalError and the "remember the last delimiter" assignment into a no-op.
    global DEFAULT_DELIMITER
    if not delimiter:
        return DEFAULT_DELIMITER
    named_delimiters = {'comma': ',', 'tab': '\t', 'pipe': '|', 'null': '\0'}
    if delimiter is ...:
        # Infer from the file extension; tab is the fallback, also for an empty / None file name.
        rtn = '\t'
        for extension, inferred in (('.csv', ','), ('.nsv', '\0'), ('.psv', '|')):
            if file_name and file_name.endswith(extension):
                rtn = inferred
                break
    elif delimiter in named_delimiters:
        rtn = named_delimiters[delimiter]
    else:
        # Interpret backslash escapes. Encoding through latin-1 with backslashreplace
        # keeps non-ASCII delimiters intact; a plain utf-8 encode followed by a
        # unicode_escape decode would turn them into mojibake.
        rtn = delimiter.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    DEFAULT_DELIMITER = rtn
    return rtn
|
|
46
|
+
|
|
47
|
+
def pretty_format_table(data, delimiter = DEFAULT_DELIMITER):
    """
    Render tabular data as an aligned plain text table.

    The first row is treated as the header and is followed by a divider line.
    A row whose cells are all empty is rendered as a divider line.

    Parameters:
    - data (str, dict or iterable): The data to format.
        * str: lines separated by '\\n', cells separated by delimiter.
        * dict of dicts: one row per item; the header is 'key' followed by the keys of the first value.
        * dict of iterables: one row per item, made of the key followed by the values.
        * iterable of dicts: the header is the keys of the first item, one row of values per item.
        * any other iterable: an iterable of rows.
    - delimiter (str, optional): The cell delimiter used when data is a string. Defaults to DEFAULT_DELIMITER.

    Returns:
    - str: The formatted table ending with a newline, or '' if there is nothing to format.
    """
    # CSI escape sequences (colors etc.) occupy no columns on screen, so they
    # must be ignored when measuring and when padding a cell.
    ansi_escape = re.compile(r'\x1b\[[0-?]*[ -/]*[@-~]')

    def visible_len(cell):
        return len(ansi_escape.sub('', cell))

    if not data:
        return ''
    if isinstance(data, str):
        rows = [line.split(delimiter) for line in data.strip('\n').split('\n')]
    elif isinstance(data, dict):
        first_value = next(iter(data.values()))
        if isinstance(first_value, dict):
            # flatten the 2D dict to a list of lists
            rows = [['key'] + list(first_value.keys())]
            rows.extend([key] + list(value.values()) for key, value in data.items())
        else:
            # it is a dict of lists
            rows = [[key] + list(value) for key, value in data.items()]
    else:
        rows = list(data)
    # An iterator is truthy even when it yields nothing.
    if not rows:
        return ''
    if isinstance(rows[0], dict):
        rows = [list(rows[0].keys())] + [list(item.values()) for item in rows]
    rows = [[str(item) for item in row] for row in rows]
    # Pad every row to the widest one so that short (or empty divider) rows do
    # not raise IndexError and cells beyond the header width are not dropped.
    num_cols = max(len(row) for row in rows)
    rows = [row + [''] * (num_cols - len(row)) for row in rows]
    col_widths = [max(visible_len(row[c]) for row in rows) for c in range(num_cols)]
    divider = '-+-'.join('-' * width for width in col_widths)

    def render(row):
        # Pad by visible width; str.format would count the escape characters
        # and leave colored cells too short.
        return ' | '.join(cell + ' ' * (width - visible_len(cell)) for cell, width in zip(row, col_widths))

    outTable = [render(rows[0]), divider]
    for row in rows[1:]:
        # if the row is empty, print a divider
        outTable.append(render(row) if any(row) else divider)
    return '\n'.join(outTable) + '\n'
|
|
60
92
|
|
|
61
93
|
def __teePrintOrNot(message,level = 'info',teeLogger = None):
|
|
62
94
|
"""
|
|
@@ -78,7 +110,7 @@ def __teePrintOrNot(message,level = 'info',teeLogger = None):
|
|
|
78
110
|
except Exception as e:
|
|
79
111
|
print(message,flush=True)
|
|
80
112
|
|
|
81
|
-
def
|
|
113
|
+
def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True,delimiter = DEFAULT_DELIMITER,defaults = []):
|
|
82
114
|
"""
|
|
83
115
|
Process a line of text and update the task dictionary.
|
|
84
116
|
|
|
@@ -89,47 +121,52 @@ def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,s
|
|
|
89
121
|
verbose (bool, optional): Whether to print verbose output. Defaults to False.
|
|
90
122
|
teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
|
|
91
123
|
strict (bool, optional): Whether to strictly enforce the correct number of columns. Defaults to True.
|
|
124
|
+
defaults (list, optional): The default values to use for missing columns. Defaults to [].
|
|
92
125
|
|
|
93
126
|
Returns:
|
|
94
127
|
tuple: A tuple containing the updated correctColumnNum and the processed lineCache.
|
|
95
128
|
|
|
96
129
|
"""
|
|
97
|
-
line = line.
|
|
130
|
+
line = line.strip(' ').strip('\x00').rstrip('\r\n')
|
|
98
131
|
# we throw away the lines that start with '#'
|
|
99
132
|
if not line :
|
|
100
133
|
if verbose:
|
|
101
134
|
__teePrintOrNot(f"Ignoring empty line: {line}",teeLogger=teeLogger)
|
|
102
135
|
return correctColumnNum , []
|
|
103
|
-
if line.startswith('#'):
|
|
136
|
+
if line.startswith('#') and not line.startswith(DEFAULTS_INDICATOR_KEY):
|
|
104
137
|
if verbose:
|
|
105
138
|
__teePrintOrNot(f"Ignoring comment line: {line}",teeLogger=teeLogger)
|
|
106
139
|
return correctColumnNum , []
|
|
107
140
|
# we only interested in the lines that have the correct number of columns
|
|
108
|
-
lineCache = [segment.
|
|
141
|
+
lineCache = [segment.rstrip() for segment in line.split(delimiter)]
|
|
109
142
|
if not lineCache:
|
|
110
143
|
return correctColumnNum , []
|
|
111
144
|
if correctColumnNum == -1:
|
|
145
|
+
if defaults and len(defaults) > 1:
|
|
146
|
+
correctColumnNum = len(defaults)
|
|
147
|
+
else:
|
|
148
|
+
correctColumnNum = len(lineCache)
|
|
112
149
|
if verbose:
|
|
113
150
|
__teePrintOrNot(f"detected correctColumnNum: {len(lineCache)}",teeLogger=teeLogger)
|
|
114
|
-
correctColumnNum = len(lineCache)
|
|
115
151
|
if not lineCache[0]:
|
|
116
152
|
if verbose:
|
|
117
153
|
__teePrintOrNot(f"Ignoring line with empty key: {line}",teeLogger=teeLogger)
|
|
118
154
|
return correctColumnNum , []
|
|
119
155
|
if len(lineCache) == 1 or not any(lineCache[1:]):
|
|
120
|
-
if correctColumnNum == 1:
|
|
156
|
+
if correctColumnNum == 1:
|
|
157
|
+
taskDic[lineCache[0]] = lineCache
|
|
158
|
+
elif lineCache[0] == DEFAULTS_INDICATOR_KEY:
|
|
159
|
+
if verbose:
|
|
160
|
+
__teePrintOrNot(f"Empty defaults line found: {line}",teeLogger=teeLogger)
|
|
161
|
+
defaults = []
|
|
121
162
|
else:
|
|
122
163
|
if verbose:
|
|
123
164
|
__teePrintOrNot(f"Key {lineCache[0]} found with empty value, deleting such key's representaion",teeLogger=teeLogger)
|
|
124
165
|
if lineCache[0] in taskDic:
|
|
125
166
|
del taskDic[lineCache[0]]
|
|
126
167
|
return correctColumnNum , []
|
|
127
|
-
elif len(lineCache)
|
|
128
|
-
|
|
129
|
-
if verbose:
|
|
130
|
-
__teePrintOrNot(f"Key {lineCache[0]} added",teeLogger=teeLogger)
|
|
131
|
-
else:
|
|
132
|
-
if strict:
|
|
168
|
+
elif len(lineCache) != correctColumnNum:
|
|
169
|
+
if strict and not any(defaults):
|
|
133
170
|
if verbose:
|
|
134
171
|
__teePrintOrNot(f"Ignoring line with {len(lineCache)} columns: {line}",teeLogger=teeLogger)
|
|
135
172
|
return correctColumnNum , []
|
|
@@ -139,12 +176,26 @@ def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,s
|
|
|
139
176
|
lineCache += ['']*(correctColumnNum-len(lineCache))
|
|
140
177
|
elif len(lineCache) > correctColumnNum:
|
|
141
178
|
lineCache = lineCache[:correctColumnNum]
|
|
142
|
-
taskDic[lineCache[0]] = lineCache
|
|
143
179
|
if verbose:
|
|
144
|
-
__teePrintOrNot(f"
|
|
180
|
+
__teePrintOrNot(f"Correcting {lineCache[0]}",teeLogger=teeLogger)
|
|
181
|
+
# now replace empty values with defaults
|
|
182
|
+
if defaults and len(defaults) > 1:
|
|
183
|
+
for i in range(1,len(lineCache)):
|
|
184
|
+
if not lineCache[i] and i < len(defaults) and defaults[i]:
|
|
185
|
+
lineCache[i] = defaults[i]
|
|
186
|
+
if verbose:
|
|
187
|
+
__teePrintOrNot(f"Replacing empty value at {i} with default: {defaults[i]}",teeLogger=teeLogger)
|
|
188
|
+
if lineCache[0] == DEFAULTS_INDICATOR_KEY:
|
|
189
|
+
if verbose:
|
|
190
|
+
__teePrintOrNot(f"Defaults line found: {line}",teeLogger=teeLogger)
|
|
191
|
+
defaults = lineCache
|
|
192
|
+
return correctColumnNum , []
|
|
193
|
+
taskDic[lineCache[0]] = lineCache
|
|
194
|
+
if verbose:
|
|
195
|
+
__teePrintOrNot(f"Key {lineCache[0]} added",teeLogger=teeLogger)
|
|
145
196
|
return correctColumnNum, lineCache
|
|
146
197
|
|
|
147
|
-
def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False):
|
|
198
|
+
def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,encoding = 'utf8',delimiter = ...,defaults = []):
|
|
148
199
|
"""
|
|
149
200
|
Reads the last valid line from a file.
|
|
150
201
|
|
|
@@ -154,13 +205,17 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
|
|
|
154
205
|
correctColumnNum (int): A column number to pass to processLine function.
|
|
155
206
|
verbose (bool, optional): Whether to print verbose output. Defaults to False.
|
|
156
207
|
teeLogger (optional): Logger to use for tee print. Defaults to None.
|
|
208
|
+
encoding (str, optional): The encoding of the file. Defaults to None.
|
|
157
209
|
strict (bool, optional): Whether to enforce strict processing. Defaults to False.
|
|
210
|
+
delimiter (str, optional): The delimiter used in the file. Defaults to None.
|
|
211
|
+
defaults (list, optional): The default values to use for missing columns. Defaults to [].
|
|
158
212
|
|
|
159
213
|
Returns:
|
|
160
214
|
list: The last valid line data processed by processLine, or an empty list if none found.
|
|
161
215
|
"""
|
|
162
216
|
chunk_size = 1024 # Read in chunks of 1024 bytes
|
|
163
217
|
last_valid_line = []
|
|
218
|
+
delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
164
219
|
if verbose:
|
|
165
220
|
__teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
|
|
166
221
|
with open(fileName, 'rb') as file:
|
|
@@ -186,13 +241,15 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
|
|
|
186
241
|
for i in range(len(lines) - 1, -1, -1):
|
|
187
242
|
if lines[i].strip(): # Skip empty lines
|
|
188
243
|
# Process the line
|
|
189
|
-
correctColumnNum, lineCache =
|
|
190
|
-
lines[i],
|
|
191
|
-
taskDic,
|
|
192
|
-
correctColumnNum,
|
|
244
|
+
correctColumnNum, lineCache = _processLine(
|
|
245
|
+
line=lines[i].decode(encoding=encoding),
|
|
246
|
+
taskDic=taskDic,
|
|
247
|
+
correctColumnNum=correctColumnNum,
|
|
193
248
|
verbose=verbose,
|
|
194
249
|
teeLogger=teeLogger,
|
|
195
|
-
strict=strict
|
|
250
|
+
strict=strict,
|
|
251
|
+
delimiter=delimiter,
|
|
252
|
+
defaults=defaults,
|
|
196
253
|
)
|
|
197
254
|
# If the line is valid, return it
|
|
198
255
|
if lineCache and any(lineCache):
|
|
@@ -204,7 +261,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
|
|
|
204
261
|
# Return empty list if no valid line found
|
|
205
262
|
return last_valid_line
|
|
206
263
|
|
|
207
|
-
def
|
|
264
|
+
def _formatHeader(header,verbose = False,teeLogger = None,delimiter = DEFAULT_DELIMITER):
|
|
208
265
|
"""
|
|
209
266
|
Format the header string.
|
|
210
267
|
|
|
@@ -218,12 +275,12 @@ def formatHeader(header,verbose = False,teeLogger = None):
|
|
|
218
275
|
"""
|
|
219
276
|
if type(header) != str:
|
|
220
277
|
try:
|
|
221
|
-
header =
|
|
278
|
+
header = delimiter.join(header)
|
|
222
279
|
except:
|
|
223
280
|
if verbose:
|
|
224
281
|
__teePrintOrNot('Invalid header, setting header to empty.','error',teeLogger=teeLogger)
|
|
225
282
|
header = ''
|
|
226
|
-
header = header.
|
|
283
|
+
header = delimiter.join([segment.rstrip() for segment in header.split(delimiter)])
|
|
227
284
|
# if header:
|
|
228
285
|
# if not header.endswith('\n'):
|
|
229
286
|
# header += '\n'
|
|
@@ -231,7 +288,7 @@ def formatHeader(header,verbose = False,teeLogger = None):
|
|
|
231
288
|
# header = ''
|
|
232
289
|
return header
|
|
233
290
|
|
|
234
|
-
def
|
|
291
|
+
def _lineContainHeader(header,line,verbose = False,teeLogger = None,strict = False,delimiter = DEFAULT_DELIMITER):
|
|
235
292
|
"""
|
|
236
293
|
Verify if a line contains the header.
|
|
237
294
|
|
|
@@ -245,26 +302,24 @@ def lineContainHeader(header,line,verbose = False,teeLogger = None,strict = Fals
|
|
|
245
302
|
Returns:
|
|
246
303
|
bool: True if the header matches the line, False otherwise.
|
|
247
304
|
"""
|
|
248
|
-
|
|
249
|
-
|
|
305
|
+
header = [segment.rstrip() for segment in header.split(delimiter)]
|
|
306
|
+
line = [segment.rstrip() for segment in line.split(delimiter)]
|
|
250
307
|
if verbose:
|
|
251
|
-
__teePrintOrNot(f"Header: \n{
|
|
252
|
-
__teePrintOrNot(f"First line: \n{
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
if len(headerList) != len(lineList) or any([headerList[i] not in lineList[i] for i in range(len(headerList))]):
|
|
256
|
-
__teePrintOrNot(f"Header mismatch: \n{escapedLine} \n!= \n{escapedHeader}",teeLogger=teeLogger)
|
|
308
|
+
__teePrintOrNot(f"Header: \n{header}",teeLogger=teeLogger)
|
|
309
|
+
__teePrintOrNot(f"First line: \n{line}",teeLogger=teeLogger)
|
|
310
|
+
if len(header) != len(line) or any([header[i] not in line[i] for i in range(len(header))]):
|
|
311
|
+
__teePrintOrNot(f"Header mismatch: \n{line} \n!= \n{header}",teeLogger=teeLogger)
|
|
257
312
|
if strict:
|
|
258
313
|
raise Exception("Data format error! Header mismatch")
|
|
259
314
|
return False
|
|
260
315
|
return True
|
|
261
316
|
|
|
262
|
-
def
|
|
317
|
+
def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,header = '',encoding = 'utf8',strict = True,delimiter = DEFAULT_DELIMITER):
|
|
263
318
|
"""
|
|
264
|
-
Verify the existence of
|
|
319
|
+
Verify the existence of the tabular file.
|
|
265
320
|
|
|
266
321
|
Parameters:
|
|
267
|
-
- fileName (str): The path of the
|
|
322
|
+
- fileName (str): The path of the tabular file.
|
|
268
323
|
- createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to True.
|
|
269
324
|
- teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
|
|
270
325
|
- header (str, optional): The header line to verify against. Defaults to ''.
|
|
@@ -274,8 +329,14 @@ def verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = None,header
|
|
|
274
329
|
Returns:
|
|
275
330
|
bool: True if the file exists, False otherwise.
|
|
276
331
|
"""
|
|
277
|
-
if not fileName.endswith('.tsv'):
|
|
332
|
+
if delimiter and delimiter == '\t' and not fileName.endswith('.tsv'):
|
|
278
333
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
|
|
334
|
+
elif delimiter and delimiter == ',' and not fileName.endswith('.csv'):
|
|
335
|
+
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
|
|
336
|
+
elif delimiter and delimiter == '\0' and not fileName.endswith('.nsv'):
|
|
337
|
+
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
|
|
338
|
+
elif delimiter and delimiter == '|' and not fileName.endswith('.psv'):
|
|
339
|
+
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
|
|
279
340
|
if not os.path.isfile(fileName):
|
|
280
341
|
if createIfNotExist:
|
|
281
342
|
with open(fileName, mode ='w',encoding=encoding)as file:
|
|
@@ -289,14 +350,15 @@ def verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = None,header
|
|
|
289
350
|
return False
|
|
290
351
|
return True
|
|
291
352
|
|
|
292
|
-
def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True):
|
|
353
|
+
def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t',defaults = []):
|
|
293
354
|
"""
|
|
294
|
-
|
|
355
|
+
Compatibility method, calls readTabularFile.
|
|
356
|
+
Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
|
|
295
357
|
|
|
296
358
|
Parameters:
|
|
297
|
-
- fileName (str): The path to the
|
|
359
|
+
- fileName (str): The path to the Tabular file.
|
|
298
360
|
- teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
|
|
299
|
-
- header (str or list, optional): The header of the
|
|
361
|
+
- header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
|
|
300
362
|
- createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
|
|
301
363
|
- lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
|
|
302
364
|
- verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
|
|
@@ -304,9 +366,38 @@ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, last
|
|
|
304
366
|
- taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
|
|
305
367
|
- encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
|
|
306
368
|
- strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
|
|
369
|
+
- delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t'.
|
|
370
|
+
- defaults (list, optional): The default values to use for missing columns. Defaults to [].
|
|
307
371
|
|
|
308
372
|
Returns:
|
|
309
|
-
- OrderedDict: The dictionary containing the data from the
|
|
373
|
+
- OrderedDict: The dictionary containing the data from the Tabular file.
|
|
374
|
+
|
|
375
|
+
Raises:
|
|
376
|
+
- Exception: If the file is not found or there is a data format error.
|
|
377
|
+
|
|
378
|
+
"""
|
|
379
|
+
return readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
|
|
380
|
+
|
|
381
|
+
def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...,defaults = []):
|
|
382
|
+
"""
|
|
383
|
+
Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
|
|
384
|
+
|
|
385
|
+
Parameters:
|
|
386
|
+
- fileName (str): The path to the Tabular file.
|
|
387
|
+
- teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
|
|
388
|
+
- header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
|
|
389
|
+
- createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
|
|
390
|
+
- lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
|
|
391
|
+
- verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
|
|
392
|
+
- verbose (bool, optional): Whether to print verbose output. Defaults to False.
|
|
393
|
+
- taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
|
|
394
|
+
- encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
|
|
395
|
+
- strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
|
|
396
|
+
- delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
|
|
397
|
+
- defaults (list, optional): The default values to use for missing columns. Defaults to [].
|
|
398
|
+
|
|
399
|
+
Returns:
|
|
400
|
+
- OrderedDict: The dictionary containing the data from the Tabular file.
|
|
310
401
|
|
|
311
402
|
Raises:
|
|
312
403
|
- Exception: If the file is not found or there is a data format error.
|
|
@@ -314,33 +405,55 @@ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, last
|
|
|
314
405
|
"""
|
|
315
406
|
if taskDic is None:
|
|
316
407
|
taskDic = {}
|
|
317
|
-
|
|
318
|
-
|
|
408
|
+
delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
409
|
+
header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
|
|
410
|
+
if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
|
|
319
411
|
return taskDic
|
|
320
412
|
with open(fileName, mode ='rb')as file:
|
|
321
413
|
correctColumnNum = -1
|
|
322
|
-
if header.
|
|
414
|
+
if header.rstrip():
|
|
323
415
|
if verifyHeader:
|
|
324
|
-
line = file.readline().decode()
|
|
325
|
-
if
|
|
326
|
-
correctColumnNum = len(header.
|
|
416
|
+
line = file.readline().decode(encoding=encoding)
|
|
417
|
+
if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
418
|
+
correctColumnNum = len(header.split(delimiter))
|
|
327
419
|
if verbose:
|
|
328
420
|
__teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
|
|
329
421
|
if lastLineOnly:
|
|
330
|
-
lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict)
|
|
422
|
+
lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict, delimiter=delimiter, defaults=defaults)
|
|
331
423
|
if lineCache:
|
|
332
424
|
taskDic[lineCache[0]] = lineCache
|
|
333
425
|
return lineCache
|
|
334
426
|
for line in file:
|
|
335
|
-
correctColumnNum, lineCache =
|
|
427
|
+
correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter,defaults = defaults)
|
|
336
428
|
return taskDic
|
|
337
429
|
|
|
338
|
-
def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True):
|
|
430
|
+
def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = '\t'):
    """
    Compatibility method, forwards every argument unchanged to appendTabularFile.
    Append a line of data to a Tabular file.

    Parameters:
    - fileName (str): The path of the Tabular file.
    - lineToAppend (str or list): The line of data to append.
    - teeLogger (optional): A logger object for logging messages.
    - header (str, optional): The header line to verify against.
    - createIfNotExist (bool, optional): Whether to create the file if it does not exist.
    - verifyHeader (bool, optional): Whether to verify the existing header against the provided header.
    - verbose (bool, optional): Whether to print additional information.
    - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
    - strict (bool, optional): Whether a data format error is treated as an error.
    - delimiter (str, optional): The delimiter used in the file. Defaults to '\\t' in this compatibility method.

    Returns:
    - Whatever appendTabularFile returns.
    """
    forwarded_options = {
        'teeLogger': teeLogger,
        'header': header,
        'createIfNotExist': createIfNotExist,
        'verifyHeader': verifyHeader,
        'verbose': verbose,
        'encoding': encoding,
        'strict': strict,
        'delimiter': delimiter,
    }
    return appendTabularFile(fileName, lineToAppend, **forwarded_options)
|
|
450
|
+
|
|
451
|
+
def appendTabularFile(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = ...):
|
|
339
452
|
"""
|
|
340
|
-
Append a line of data to a
|
|
453
|
+
Append a line of data to a Tabular file.
|
|
341
454
|
Parameters:
|
|
342
|
-
- fileName (str): The path of the
|
|
343
|
-
- lineToAppend (str or list): The line of data to append. If it is a string, it will be split by
|
|
455
|
+
- fileName (str): The path of the Tabular file.
|
|
456
|
+
- lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
|
|
344
457
|
- teeLogger (optional): A logger object for logging messages.
|
|
345
458
|
- header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
|
|
346
459
|
- createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
|
|
@@ -348,15 +461,17 @@ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExis
|
|
|
348
461
|
- verbose (bool, optional): If True, additional information will be printed during the execution.
|
|
349
462
|
- encoding (str, optional): The encoding of the file.
|
|
350
463
|
- strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
|
|
464
|
+
- delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
|
|
351
465
|
Raises:
|
|
352
466
|
- Exception: If the file does not exist and createIfNotExist is False.
|
|
353
467
|
- Exception: If the existing header does not match the provided header.
|
|
354
468
|
"""
|
|
355
|
-
|
|
356
|
-
|
|
469
|
+
delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
470
|
+
header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
|
|
471
|
+
if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
|
|
357
472
|
return
|
|
358
473
|
if type(lineToAppend) == str:
|
|
359
|
-
lineToAppend = lineToAppend.
|
|
474
|
+
lineToAppend = lineToAppend.split(delimiter)
|
|
360
475
|
else:
|
|
361
476
|
for i in range(len(lineToAppend)):
|
|
362
477
|
if type(lineToAppend[i]) != str:
|
|
@@ -367,11 +482,11 @@ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExis
|
|
|
367
482
|
|
|
368
483
|
with open(fileName, mode ='r+b')as file:
|
|
369
484
|
correctColumnNum = len(lineToAppend)
|
|
370
|
-
if header.
|
|
485
|
+
if header.rstrip():
|
|
371
486
|
if verifyHeader:
|
|
372
|
-
line = file.readline().decode()
|
|
373
|
-
if
|
|
374
|
-
correctColumnNum = len(header.
|
|
487
|
+
line = file.readline().decode(encoding=encoding)
|
|
488
|
+
if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
489
|
+
correctColumnNum = len(header.split(delimiter))
|
|
375
490
|
if verbose:
|
|
376
491
|
__teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
|
|
377
492
|
# truncate / fill the lineToAppend to the correct number of columns
|
|
@@ -383,15 +498,16 @@ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExis
|
|
|
383
498
|
file.seek(-1, os.SEEK_END)
|
|
384
499
|
if file.read(1) != b'\n':
|
|
385
500
|
file.write(b'\n')
|
|
386
|
-
file.write(
|
|
501
|
+
file.write(get_delimiter(delimiter).join(lineToAppend).encode(encoding=encoding) + b'\n')
|
|
387
502
|
if verbose:
|
|
388
503
|
__teePrintOrNot(f"Appended {lineToAppend} to {fileName}",teeLogger=teeLogger)
|
|
389
504
|
|
|
390
|
-
def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False):
|
|
505
|
+
def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = '\t'):
|
|
391
506
|
"""
|
|
392
|
-
|
|
507
|
+
Compatibility method, calls clearTabularFile.
|
|
508
|
+
Clear the contents of a Tabular file. Will create if not exist.
|
|
393
509
|
Parameters:
|
|
394
|
-
- fileName (str): The path of the
|
|
510
|
+
- fileName (str): The path of the Tabular file.
|
|
395
511
|
- teeLogger (optional): A logger object for logging messages.
|
|
396
512
|
- header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
|
|
397
513
|
- verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
|
|
@@ -399,14 +515,29 @@ def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose
|
|
|
399
515
|
- encoding (str, optional): The encoding of the file.
|
|
400
516
|
- strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
|
|
401
517
|
"""
|
|
402
|
-
header =
|
|
403
|
-
|
|
518
|
+
return clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
|
|
519
|
+
|
|
520
|
+
def clearTabularFile(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = ...):
|
|
521
|
+
"""
|
|
522
|
+
Clear the contents of a Tabular file. Will create if not exist.
|
|
523
|
+
Parameters:
|
|
524
|
+
- fileName (str): The path of the Tabular file.
|
|
525
|
+
- teeLogger (optional): A logger object for logging messages.
|
|
526
|
+
- header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
|
|
527
|
+
- verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
|
|
528
|
+
- verbose (bool, optional): If True, additional information will be printed during the execution.
|
|
529
|
+
- encoding (str, optional): The encoding of the file.
|
|
530
|
+
- strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
|
|
531
|
+
"""
|
|
532
|
+
delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
533
|
+
header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
|
|
534
|
+
if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
|
|
404
535
|
raise Exception("Something catastrophic happened! File still not found after creation")
|
|
405
536
|
else:
|
|
406
537
|
with open(fileName, mode ='r+',encoding=encoding)as file:
|
|
407
|
-
if header.
|
|
408
|
-
line = file.readline()
|
|
409
|
-
if not
|
|
538
|
+
if header.rstrip() and verifyHeader:
|
|
539
|
+
line = file.readline()
|
|
540
|
+
if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
410
541
|
__teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
|
|
411
542
|
file.truncate()
|
|
412
543
|
else:
|
|
@@ -442,14 +573,17 @@ class TSVZed(OrderedDict):
|
|
|
442
573
|
except Exception as e:
|
|
443
574
|
print(message,flush=True)
|
|
444
575
|
|
|
445
|
-
def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,verbose = False,encoding =
|
|
576
|
+
def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,verbose = False,encoding = 'utf8',delimiter = ...,defualts = [],strict = False):
|
|
446
577
|
super().__init__()
|
|
447
578
|
self.version = version
|
|
579
|
+
self.strict = strict
|
|
448
580
|
self.externalFileUpdateTime = getFileUpdateTimeNs(fileName)
|
|
449
581
|
self.lastUpdateTime = self.externalFileUpdateTime
|
|
450
582
|
self._fileName = fileName
|
|
451
583
|
self.teeLogger = teeLogger
|
|
452
|
-
self.
|
|
584
|
+
self.delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
585
|
+
self.defaults = defualts
|
|
586
|
+
self.header = _formatHeader(header,verbose = verbose,teeLogger = self.teeLogger,delimiter=self.delimiter)
|
|
453
587
|
self.correctColumnNum = -1
|
|
454
588
|
self.createIfNotExist = createIfNotExist
|
|
455
589
|
self.verifyHeader = verifyHeader
|
|
@@ -477,6 +611,27 @@ class TSVZed(OrderedDict):
|
|
|
477
611
|
self.load()
|
|
478
612
|
atexit.register(self.stopAppendThread)
|
|
479
613
|
|
|
614
|
+
def setDefaults(self,defaults):
    """
    Set the default values used to fill in empty columns.

    Parameters:
    - defaults (str | iterable | None): The default values. A string is split on self.delimiter; any other
      non-list value is converted with list(). DEFAULTS_INDICATOR_KEY is prepended as the first field when
      it is not already there.

    Empty input, input that cannot be converted to a list, and input whose fields are all empty
    clear the defaults (self.defaults becomes []).
    """
    if not defaults:
        # Assign to the attribute, not the local name, so the defaults are really cleared.
        self.defaults = []
        return
    if isinstance(defaults,str):
        defaults = defaults.split(self.delimiter)
    elif not isinstance(defaults,list):
        try:
            defaults = list(defaults)
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt / SystemExit still propagate.
            if self.verbose:
                self.__teePrintOrNot('Invalid defaults, setting defaults to empty.','error')
            self.defaults = []
            return
    if not any(defaults):
        # Every field is empty: nothing usable as a default.
        self.defaults = []
        return
    if defaults[0] != DEFAULTS_INDICATOR_KEY:
        # The first field of the defaults row is always the indicator key.
        defaults = [DEFAULTS_INDICATOR_KEY]+defaults
    self.defaults = defaults
|
|
634
|
+
|
|
480
635
|
def load(self):
|
|
481
636
|
self.reload()
|
|
482
637
|
if self.rewrite_on_load:
|
|
@@ -490,10 +645,10 @@ class TSVZed(OrderedDict):
|
|
|
490
645
|
if self.verbose:
|
|
491
646
|
self.__teePrintOrNot(f"Loading {self._fileName}")
|
|
492
647
|
super().clear()
|
|
493
|
-
|
|
648
|
+
readTabularFile(self._fileName, teeLogger = self.teeLogger, header = self.header, createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader, verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None, strict = self.strict, delimiter = self.delimiter, defaults=self.defaults)
|
|
494
649
|
if self.verbose:
|
|
495
650
|
self.__teePrintOrNot(f"Loaded {len(self)} records from {self._fileName}")
|
|
496
|
-
self.correctColumnNum = len(self.header.split(
|
|
651
|
+
self.correctColumnNum = len(self.header.split(self.delimiter)) if (self.header and self.verifyHeader) else (len(self[next(iter(self))]) if self else -1)
|
|
497
652
|
if self.verbose:
|
|
498
653
|
self.__teePrintOrNot(f"correctColumnNum: {self.correctColumnNum}")
|
|
499
654
|
#super().update(loadedData)
|
|
@@ -505,30 +660,55 @@ class TSVZed(OrderedDict):
|
|
|
505
660
|
return self
|
|
506
661
|
|
|
507
662
|
def __setitem__(self,key,value):
|
|
508
|
-
key = str(key).
|
|
663
|
+
key = str(key).rstrip()
|
|
509
664
|
if not key:
|
|
510
665
|
self.__teePrintOrNot('Key cannot be empty','error')
|
|
511
666
|
return
|
|
512
667
|
if type(value) == str:
|
|
513
|
-
value = value.
|
|
668
|
+
value = value.split(self.delimiter)
|
|
514
669
|
# sanitize the value
|
|
515
|
-
value = [(str(segment).
|
|
516
|
-
#
|
|
670
|
+
value = [(str(segment).rstrip() if type(segment) != str else segment.rstrip()) if segment else '' for segment in value]
|
|
671
|
+
# escape the delimiter and newline characters
|
|
672
|
+
value = [segment.replace(self.delimiter,'<sep>').replace('\n','\\n') for segment in value]
|
|
517
673
|
# the first field in value should be the key
|
|
518
674
|
# add it if it is not there
|
|
519
675
|
if not value or value[0] != key:
|
|
520
676
|
value = [key]+value
|
|
521
677
|
# verify the value has the correct number of columns
|
|
522
678
|
if self.correctColumnNum != 1 and len(value) == 1:
|
|
523
|
-
# this means we want to clear /
|
|
679
|
+
# this means we want to clear / delete the key
|
|
524
680
|
self.__delitem__(key)
|
|
525
681
|
elif self.correctColumnNum > 0:
|
|
526
|
-
|
|
682
|
+
if len(value) != self.correctColumnNum:
|
|
683
|
+
if self.strict:
|
|
684
|
+
self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}. Refuse adding key...",'error')
|
|
685
|
+
return
|
|
686
|
+
elif self.verbose:
|
|
687
|
+
self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}, correcting...",'warning')
|
|
688
|
+
if len(value) < self.correctColumnNum:
|
|
689
|
+
value += ['']*(self.correctColumnNum-len(value))
|
|
690
|
+
elif len(value) > self.correctColumnNum:
|
|
691
|
+
value = value[:self.correctColumnNum]
|
|
527
692
|
else:
|
|
528
693
|
self.correctColumnNum = len(value)
|
|
694
|
+
if self.defaults and len(self.defaults) > 1:
|
|
695
|
+
for i in range(1,len(value)):
|
|
696
|
+
if not value[i] and i < len(self.defaults) and self.defaults[i]:
|
|
697
|
+
value[i] = self.defaults[i]
|
|
698
|
+
if self.verbose:
|
|
699
|
+
self.__teePrintOrNot(f" Replacing empty value at {i} with default: {self.defaults[i]}")
|
|
700
|
+
if key == DEFAULTS_INDICATOR_KEY:
|
|
701
|
+
self.defaults = value
|
|
702
|
+
if self.verbose:
|
|
703
|
+
self.__teePrintOrNot(f"Defaults set to {value}")
|
|
704
|
+
if not self.memoryOnly:
|
|
705
|
+
self.appendQueue.append(self.delimiter.join(value))
|
|
706
|
+
self.lastUpdateTime = get_time_ns()
|
|
707
|
+
if self.verbose:
|
|
708
|
+
self.__teePrintOrNot(f"Appending Defaults {key} to the appendQueue")
|
|
709
|
+
return
|
|
529
710
|
if self.verbose:
|
|
530
711
|
self.__teePrintOrNot(f"Setting {key} to {value}")
|
|
531
|
-
|
|
532
712
|
if key in self:
|
|
533
713
|
if self[key] == value:
|
|
534
714
|
if self.verbose:
|
|
@@ -537,13 +717,17 @@ class TSVZed(OrderedDict):
|
|
|
537
717
|
self.dirty = True
|
|
538
718
|
# update the dictionary,
|
|
539
719
|
super().__setitem__(key,value)
|
|
540
|
-
if self.verbose:
|
|
541
|
-
self.__teePrintOrNot(f"Key {key} updated")
|
|
542
720
|
if self.memoryOnly:
|
|
721
|
+
if self.verbose:
|
|
722
|
+
self.__teePrintOrNot(f"Key {key} updated in memory only")
|
|
723
|
+
return
|
|
724
|
+
elif key.startswith('#'):
|
|
725
|
+
if self.verbose:
|
|
726
|
+
self.__teePrintOrNot(f"Key {key} updated in memory only as it starts with #")
|
|
543
727
|
return
|
|
544
728
|
if self.verbose:
|
|
545
729
|
self.__teePrintOrNot(f"Appending {key} to the appendQueue")
|
|
546
|
-
self.appendQueue.append(
|
|
730
|
+
self.appendQueue.append(self.delimiter.join(value))
|
|
547
731
|
self.lastUpdateTime = get_time_ns()
|
|
548
732
|
# if not self.appendThread.is_alive():
|
|
549
733
|
# self.commitAppendToFile()
|
|
@@ -552,25 +736,38 @@ class TSVZed(OrderedDict):
|
|
|
552
736
|
|
|
553
737
|
|
|
554
738
|
def __delitem__(self,key):
    """
    Delete a key from the table, or clear the defaults when the key is DEFAULTS_INDICATOR_KEY.

    Parameters:
    - key: The key to delete. It is converted with str() and right-stripped before use.

    A key that is not present is ignored (reported only when verbose). When the table is
    memoryOnly, or an ordinary key starts with '#', only the in-memory state changes;
    otherwise __appendEmptyLine(key) is called as well.
    """
    key = str(key).rstrip()
    if key != DEFAULTS_INDICATOR_KEY:
        # Ordinary key: remove it from the dictionary, then propagate the deletion if applicable.
        if key in self:
            super().__delitem__(key)
            if not self.memoryOnly and not key.startswith('#'):
                self.__appendEmptyLine(key)
                if self.verbose:
                    self.__teePrintOrNot(f"Appending empty line {key}")
                self.lastUpdateTime = get_time_ns()
            elif self.verbose:
                self.__teePrintOrNot(f"Key {key} deleted in memory")
        elif self.verbose:
            self.__teePrintOrNot(f"Key {key} not found")
        return
    # Defaults indicator key: reset the defaults instead of touching the dictionary.
    self.defaults = []
    if self.verbose:
        self.__teePrintOrNot(f"Defaults cleared")
    if self.memoryOnly:
        return
    self.__appendEmptyLine(key)
    if self.verbose:
        self.__teePrintOrNot(f"Appending empty default line {key}")
|
|
566
763
|
|
|
567
764
|
def __appendEmptyLine(self,key):
|
|
568
765
|
self.dirty = True
|
|
569
766
|
if self.correctColumnNum > 0:
|
|
570
|
-
emptyLine = key+
|
|
767
|
+
emptyLine = key+self.delimiter*(self.correctColumnNum-1)
|
|
571
768
|
elif len(self[key]) > 1:
|
|
572
769
|
self.correctColumnNum = len(self[key])
|
|
573
|
-
emptyLine = key+
|
|
770
|
+
emptyLine = key+self.delimiter*(self.correctColumnNum-1)
|
|
574
771
|
else:
|
|
575
772
|
emptyLine = key
|
|
576
773
|
if self.verbose:
|
|
@@ -745,7 +942,7 @@ memoryOnly:{self.memoryOnly}
|
|
|
745
942
|
if self.header:
|
|
746
943
|
file.write(self.header+'\n')
|
|
747
944
|
for key in self:
|
|
748
|
-
file.write(
|
|
945
|
+
file.write(self.delimiter.join(self[key])+'\n')
|
|
749
946
|
self.release_file_obj(file)
|
|
750
947
|
if self.verbose:
|
|
751
948
|
self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
|
|
@@ -761,27 +958,32 @@ memoryOnly:{self.memoryOnly}
|
|
|
761
958
|
return self
|
|
762
959
|
|
|
763
960
|
def mapToFile(self):
|
|
961
|
+
mec = self.monitor_external_changes
|
|
962
|
+
self.monitor_external_changes = False
|
|
764
963
|
try:
|
|
765
964
|
if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
|
|
766
965
|
self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
|
|
767
|
-
file = self.get_file_obj('r+')
|
|
966
|
+
file = self.get_file_obj('r+b')
|
|
768
967
|
overWrite = False
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
|
|
773
|
-
|
|
774
|
-
|
|
775
|
-
|
|
776
|
-
|
|
777
|
-
|
|
778
|
-
|
|
968
|
+
if self.header:
|
|
969
|
+
line = file.readline().decode(self.encoding)
|
|
970
|
+
aftPos = file.tell()
|
|
971
|
+
if not _lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = self.strict):
|
|
972
|
+
file.seek(0)
|
|
973
|
+
file.write(f'{self.header}\n'.encode(encoding=self.encoding))
|
|
974
|
+
# if the header is not the same length as the line, we need to overwrite the file
|
|
975
|
+
if aftPos != file.tell():
|
|
976
|
+
overWrite = True
|
|
977
|
+
if self.verbose:
|
|
978
|
+
self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
|
|
779
979
|
for value in self.values():
|
|
780
|
-
|
|
980
|
+
if value[0].startswith('#'):
|
|
981
|
+
continue
|
|
982
|
+
strToWrite = self.delimiter.join(value)
|
|
781
983
|
if overWrite:
|
|
782
984
|
if self.verbose:
|
|
783
985
|
self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
|
|
784
|
-
file.write(strToWrite)
|
|
986
|
+
file.write(strToWrite.encode(encoding=self.encoding)+b'\n')
|
|
785
987
|
continue
|
|
786
988
|
pos = file.tell()
|
|
787
989
|
line = file.readline()
|
|
@@ -789,15 +991,17 @@ memoryOnly:{self.memoryOnly}
|
|
|
789
991
|
if not line or pos == aftPos:
|
|
790
992
|
if self.verbose:
|
|
791
993
|
self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
|
|
792
|
-
file.write(strToWrite)
|
|
994
|
+
file.write(strToWrite.encode(encoding=self.encoding))
|
|
793
995
|
overWrite = True
|
|
794
996
|
continue
|
|
997
|
+
strToWrite = strToWrite.encode(encoding=self.encoding).ljust(len(line)-1)+b'\n'
|
|
795
998
|
if line != strToWrite:
|
|
796
999
|
if self.verbose:
|
|
797
|
-
self.__teePrintOrNot(f"
|
|
1000
|
+
self.__teePrintOrNot(f"Modifing {value} to {self._fileName}")
|
|
798
1001
|
file.seek(pos)
|
|
799
1002
|
# fill the string with space to write to the correct length
|
|
800
|
-
file.write(strToWrite.rstrip('\n').ljust(len(line)-1)+'\n')
|
|
1003
|
+
#file.write(strToWrite.rstrip('\n').ljust(len(line)-1)+'\n')
|
|
1004
|
+
file.write(strToWrite)
|
|
801
1005
|
if aftPos != file.tell():
|
|
802
1006
|
overWrite = True
|
|
803
1007
|
file.truncate()
|
|
@@ -813,6 +1017,8 @@ memoryOnly:{self.memoryOnly}
|
|
|
813
1017
|
import traceback
|
|
814
1018
|
self.__teePrintOrNot(traceback.format_exc(),'error')
|
|
815
1019
|
self.deSynced = True
|
|
1020
|
+
self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
|
|
1021
|
+
self.monitor_external_changes = mec
|
|
816
1022
|
return self
|
|
817
1023
|
|
|
818
1024
|
def checkExternalChanges(self):
|
|
@@ -831,9 +1037,10 @@ memoryOnly:{self.memoryOnly}
|
|
|
831
1037
|
|
|
832
1038
|
def _appendWorker(self):
|
|
833
1039
|
while not self.shutdownEvent.is_set():
|
|
834
|
-
self.
|
|
835
|
-
|
|
836
|
-
|
|
1040
|
+
if not self.memoryOnly:
|
|
1041
|
+
self.checkExternalChanges()
|
|
1042
|
+
self.rewrite()
|
|
1043
|
+
self.commitAppendToFile()
|
|
837
1044
|
time.sleep(self.append_check_delay)
|
|
838
1045
|
# self.appendEvent.wait()
|
|
839
1046
|
# self.appendEvent.clear()
|
|
@@ -883,15 +1090,19 @@ memoryOnly:{self.memoryOnly}
|
|
|
883
1090
|
def get_file_obj(self,modes = 'a'):
|
|
884
1091
|
self.writeLock.acquire()
|
|
885
1092
|
try:
|
|
886
|
-
if not
|
|
887
|
-
self.encoding
|
|
888
|
-
|
|
1093
|
+
if 'b' not in modes:
|
|
1094
|
+
if not self.encoding:
|
|
1095
|
+
self.encoding = 'utf8'
|
|
1096
|
+
file = open(self._fileName, mode=modes, encoding=self.encoding)
|
|
1097
|
+
else:
|
|
1098
|
+
file = open(self._fileName, mode=modes)
|
|
889
1099
|
# Lock the file after opening
|
|
890
1100
|
if os.name == 'posix':
|
|
891
1101
|
fcntl.lockf(file, fcntl.LOCK_EX)
|
|
892
1102
|
elif os.name == 'nt':
|
|
893
1103
|
# For Windows, locking the entire file, avoiding locking an empty file
|
|
894
|
-
lock_length = max(1, os.path.getsize(self._fileName))
|
|
1104
|
+
#lock_length = max(1, os.path.getsize(self._fileName))
|
|
1105
|
+
lock_length = 2147483647
|
|
895
1106
|
msvcrt.locking(file.fileno(), msvcrt.LK_LOCK, lock_length)
|
|
896
1107
|
if self.verbose:
|
|
897
1108
|
self.__teePrintOrNot(f"File {self._fileName} locked with mode {modes}")
|
|
@@ -910,13 +1121,18 @@ memoryOnly:{self.memoryOnly}
|
|
|
910
1121
|
try:
|
|
911
1122
|
file.flush() # Ensure the file is flushed before unlocking
|
|
912
1123
|
os.fsync(file.fileno()) # Ensure the file is synced to disk before unlocking
|
|
913
|
-
if
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
1124
|
+
if not file.closed:
|
|
1125
|
+
if os.name == 'posix':
|
|
1126
|
+
fcntl.lockf(file, fcntl.LOCK_UN)
|
|
1127
|
+
elif os.name == 'nt':
|
|
1128
|
+
# Unlocking the entire file; for Windows, ensure not unlocking an empty file
|
|
1129
|
+
#unlock_length = max(1, os.path.getsize(os.path.realpath(file.name)))
|
|
1130
|
+
unlock_length = 2147483647
|
|
1131
|
+
try:
|
|
1132
|
+
msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
|
|
1133
|
+
except:
|
|
1134
|
+
pass
|
|
1135
|
+
file.close() # Ensure file is closed after unlocking
|
|
920
1136
|
if self.verbose:
|
|
921
1137
|
self.__teePrintOrNot(f"File {file.name} unlocked / released")
|
|
922
1138
|
except Exception as e:
|
|
@@ -925,26 +1141,47 @@ memoryOnly:{self.memoryOnly}
|
|
|
925
1141
|
except Exception as e:
|
|
926
1142
|
self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
|
|
927
1143
|
self.__teePrintOrNot(f"Failed to release file {file.name}: {e}",'error')
|
|
928
|
-
|
|
929
|
-
self.
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
1144
|
+
import traceback
|
|
1145
|
+
self.__teePrintOrNot(traceback.format_exc(),'error')
|
|
1146
|
+
# release the write lock if not already released
|
|
1147
|
+
if self.writeLock.locked():
|
|
1148
|
+
try:
|
|
1149
|
+
self.writeLock.release() # Ensure the thread lock is always released
|
|
1150
|
+
except Exception as e:
|
|
1151
|
+
self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
|
|
1152
|
+
self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
|
|
933
1153
|
|
|
934
1154
|
|
|
935
1155
|
def __main__():
|
|
936
1156
|
import argparse
|
|
937
|
-
parser = argparse.ArgumentParser(description='TSVZed: A TSV file manager')
|
|
938
|
-
parser.add_argument('filename', type=str, help='The
|
|
1157
|
+
parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
|
|
1158
|
+
parser.add_argument('filename', type=str, help='The file to read')
|
|
939
1159
|
parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear'], help='The operation to perform. Default: read', default='read')
|
|
940
|
-
parser.add_argument('line', type=str, nargs='*', help='The line to append to the
|
|
941
|
-
parser.add_argument('-
|
|
942
|
-
parser.add_argument('-
|
|
1160
|
+
parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
|
|
1161
|
+
parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
|
|
1162
|
+
parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
|
|
1163
|
+
parser.add_argument('--defaults', type=str, help='Default values to fill in the missing columns. seperate using --delimiter. Ex. if -d = comma, --defaults="key,value1,value2..." Note: Please specify the key. But it will not be used as a key need to be unique in data.')
|
|
1164
|
+
strictMode = parser.add_mutually_exclusive_group()
|
|
1165
|
+
strictMode.add_argument('-s', '--strict', dest = 'strict',action='store_true', help='Strict mode. Do not parse values that seems malformed, check for column numbers / headers')
|
|
1166
|
+
strictMode.add_argument('-f', '--force', dest = 'strict',action='store_false', help='Force the operation. Ignore checks for column numbers / headers')
|
|
943
1167
|
parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
|
|
944
1168
|
parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} by {author}')
|
|
945
1169
|
args = parser.parse_args()
|
|
946
|
-
|
|
947
|
-
header
|
|
1170
|
+
args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
|
|
1171
|
+
if args.header and args.header.endswith('\\'):
|
|
1172
|
+
args.header += '\\'
|
|
1173
|
+
try:
|
|
1174
|
+
header = args.header.encode().decode('unicode_escape') if args.header else ''
|
|
1175
|
+
except Exception as e:
|
|
1176
|
+
print(f"Failed to decode header: {args.header}")
|
|
1177
|
+
header = ''
|
|
1178
|
+
defaults = []
|
|
1179
|
+
if args.defaults:
|
|
1180
|
+
try:
|
|
1181
|
+
defaults = args.defaults.encode().decode('unicode_escape').split(args.delimiter)
|
|
1182
|
+
except Exception as e:
|
|
1183
|
+
print(f"Failed to decode defaults: {args.defaults}")
|
|
1184
|
+
defaults = []
|
|
948
1185
|
|
|
949
1186
|
if args.operation == 'read':
|
|
950
1187
|
# check if the file exist
|
|
@@ -952,14 +1189,14 @@ def __main__():
|
|
|
952
1189
|
print(f"File not found: {args.filename}")
|
|
953
1190
|
return
|
|
954
1191
|
# read the file
|
|
955
|
-
data =
|
|
956
|
-
print(pretty_format_table(data.values()))
|
|
1192
|
+
data = readTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
|
|
1193
|
+
print(pretty_format_table(data.values(),delimiter=args.delimiter))
|
|
957
1194
|
elif args.operation == 'append':
|
|
958
|
-
|
|
1195
|
+
appendTabularFile(args.filename, args.line,createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
|
|
959
1196
|
elif args.operation == 'delete':
|
|
960
|
-
|
|
1197
|
+
appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
|
|
961
1198
|
elif args.operation == 'clear':
|
|
962
|
-
|
|
1199
|
+
clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=args.strict, delimiter=args.delimiter)
|
|
963
1200
|
else:
|
|
964
1201
|
print("Invalid operation")
|
|
965
1202
|
return
|