TSVZ 2.57__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TSVZ-2.57.dist-info/LICENSE +674 -0
- TSVZ-2.57.dist-info/METADATA +17 -0
- TSVZ-2.57.dist-info/RECORD +6 -0
- TSVZ-2.57.dist-info/WHEEL +5 -0
- TSVZ-2.57.dist-info/top_level.txt +1 -0
- TSVZ.py +669 -0
TSVZ.py
ADDED
|
@@ -0,0 +1,669 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
import os
|
|
3
|
+
from collections import OrderedDict , deque
|
|
4
|
+
import time
|
|
5
|
+
import atexit
|
|
6
|
+
import threading
|
|
7
|
+
|
|
8
|
+
if os.name == 'nt':
|
|
9
|
+
import msvcrt
|
|
10
|
+
elif os.name == 'posix':
|
|
11
|
+
import fcntl
|
|
12
|
+
|
|
13
|
+
# Module version string; TSVZed.__init__ copies it onto each instance as `.version`.
version = '2.57'
|
|
14
|
+
|
|
15
|
+
def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True):
    """
    Parse one raw TSV line and apply it to the task dictionary.

    A line whose key (first column) is followed only by empty columns is a
    deletion marker: the key is removed from taskDic. The exception is a
    single-column table (correctColumnNum == 1), where the key itself is the
    record and is stored.

    Parameters:
    line (bytes or str): The raw line. Bytes are decoded with the default codec (UTF-8).
    taskDic (dict): Mapping of key -> list of columns; updated in place.
    correctColumnNum (int): The expected number of columns, or -1 to adopt the
        column count of the first non-empty, non-comment line seen.
    verbose (bool, optional): Whether to print verbose output. Defaults to False.
    teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
    strict (bool, optional): If True, lines with the wrong number of columns are
        ignored. If False, they are padded with '' or truncated to fit. Defaults to True.

    Returns:
    tuple: (correctColumnNum, lineCache). lineCache is the list of columns that
        was stored in taskDic, or [] if the line was ignored or was a deletion marker.
    """
    if isinstance(line, (bytes, bytearray)):
        line = line.decode()
    # Strip padding (spaces / NUL bytes) and the line terminator, but never tabs:
    # trailing tabs are empty columns and must survive to keep the column count.
    # Stripping the terminator here makes blank lines really empty, so they can
    # no longer be mistaken for a one-column row during column-count detection.
    line = line.strip(' \x00\r\n')
    if not line:
        if verbose:
            __teePrintOrNot(f"Ignoring empty line: {line}",teeLogger=teeLogger)
        return correctColumnNum , []
    # we throw away the lines that start with '#'
    if line.startswith('#'):
        if verbose:
            __teePrintOrNot(f"Ignoring comment line: {line}",teeLogger=teeLogger)
        return correctColumnNum , []
    lineCache = [segment.strip() for segment in line.split('\t')]
    if correctColumnNum == -1:
        # No expected width yet: the first real line defines it.
        if verbose:
            __teePrintOrNot(f"detected correctColumnNum: {len(lineCache)}",teeLogger=teeLogger)
        correctColumnNum = len(lineCache)
    if not lineCache[0]:
        if verbose:
            __teePrintOrNot(f"Ignoring line with empty key: {line}",teeLogger=teeLogger)
        return correctColumnNum , []
    if len(lineCache) == 1 or not any(lineCache[1:]):
        if correctColumnNum == 1:
            # Single-column table: the bare key is the whole record.
            taskDic[lineCache[0]] = lineCache
        else:
            # Key with no values: deletion marker for that key.
            if verbose:
                __teePrintOrNot(f"Key {lineCache[0]} found with empty value, deleting such key's representaion",teeLogger=teeLogger)
            if lineCache[0] in taskDic:
                del taskDic[lineCache[0]]
            return correctColumnNum , []
    elif len(lineCache) == correctColumnNum:
        taskDic[lineCache[0]] = lineCache
        if verbose:
            __teePrintOrNot(f"Key {lineCache[0]} added",teeLogger=teeLogger)
    else:
        if strict:
            # we are only interested in the lines that have the correct number of columns
            if verbose:
                __teePrintOrNot(f"Ignoring line with {len(lineCache)} columns: {line}",teeLogger=teeLogger)
            return correctColumnNum , []
        # fill / cut the line with empty entries until it has the correct number of columns
        if len(lineCache) < correctColumnNum:
            lineCache += ['']*(correctColumnNum-len(lineCache))
        elif len(lineCache) > correctColumnNum:
            lineCache = lineCache[:correctColumnNum]
        taskDic[lineCache[0]] = lineCache
        if verbose:
            __teePrintOrNot(f"Key {lineCache[0]} added after correction",teeLogger=teeLogger)
    return correctColumnNum, lineCache
|
|
80
|
+
|
|
81
|
+
def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False):
    """
    Reads the last valid line from a file by scanning it backwards in chunks.

    Every candidate line is passed through processLine, so taskDic is updated
    as a side effect for each line examined (including deletion markers met
    before a valid line is found).

    Args:
        fileName (str): The name of the file to read.
        taskDic (dict): A dictionary to pass to processLine function.
        correctColumnNum (int): A column number to pass to processLine function.
        verbose (bool, optional): Whether to print verbose output. Defaults to False.
        teeLogger (optional): Logger to use for tee print. Defaults to None.
        strict (bool, optional): Whether to enforce strict processing. Defaults to False.

    Returns:
        list: The last valid line data processed by processLine, or an empty list if none found.
    """
    chunk_size = 1024  # Read in chunks of 1024 bytes
    if verbose:
        __teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
    with open(fileName, 'rb') as file:
        file.seek(0, os.SEEK_END)
        position = file.tell()
        buffer = b''

        while position > 0:
            # Read the next chunk, moving from the end of the file towards the start
            read_size = min(chunk_size, position)
            position -= read_size
            file.seek(position)
            buffer = file.read(read_size) + buffer

            lines = buffer.split(b'\n')

            # Unless we have reached the start of the file, lines[0] may be the
            # tail of a line that begins before `position`. Parsing it now could
            # return a truncated row (or fail to decode a split multi-byte
            # character), so it is held back until the rest has been read.
            first_complete = 1 if position > 0 else 0

            # Process complete lines from the last to the first
            for raw in reversed(lines[first_complete:]):
                if not raw.strip():  # Skip empty lines
                    continue
                correctColumnNum, lineCache = processLine(
                    raw,
                    taskDic,
                    correctColumnNum,
                    verbose=verbose,
                    teeLogger=teeLogger,
                    strict=strict
                )
                # If the line is valid, return it
                if lineCache and any(lineCache):
                    return lineCache

            # Keep the first (possibly incomplete) line in buffer for the next read;
            # the complete lines after it were all rejected and can be dropped.
            buffer = lines[0]

    # Return empty list if no valid line found
    return []
|
|
140
|
+
|
|
141
|
+
def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True):
    """
    Read a TSV (Tab-Separated Values) file and return the data as a dictionary.

    Parameters:
    - fileName (str): The path to the TSV file.
    - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
    - header (str or list, optional): The header of the TSV file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
    - createIfNotExist (bool, optional): Whether to create the file (containing only the header line) if it doesn't exist. Defaults to False.
    - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
    - verifyHeader (bool, optional): Whether to check that the first line of the file starts with the header (case and spaces ignored). Forced to True when the file is created here. Defaults to True.
    - verbose (bool, optional): Whether to print verbose output. Defaults to False.
    - taskDic (OrderedDict, optional): The dictionary to store the data. A new OrderedDict is used when None. Defaults to None.
    - encoding (str, optional): The encoding used to create the file and to decode its header line. Data lines are decoded by processLine. Defaults to 'utf8'.
    - strict (bool, optional): Passed to processLine: if True, lines with the wrong number of columns are ignored; if False they are padded / truncated. Defaults to True.

    Returns:
    - OrderedDict: The dictionary containing the data from the TSV file
      (key -> list of columns, the key being the first column).
    - list: When lastLineOnly is True, the columns of the last valid line
      instead (an empty list if there is none); that row is also stored in taskDic.

    Raises:
    - FileNotFoundError: If the file does not exist and createIfNotExist is False.
    - Exception: If the first line of the file does not match the header.

    """
    if taskDic is None:
        taskDic = OrderedDict()

    header = header.strip() if isinstance(header, str) else '\t'.join(header)
    if not header.endswith('\n'):
        header += '\n'
    if not os.path.isfile(fileName):
        if createIfNotExist:
            with open(fileName, mode ='w',encoding=encoding)as file:
                file.write(header)
            __teePrintOrNot('Created '+fileName,teeLogger=teeLogger)
            # We just wrote the header ourselves, so it is always safe to consume it.
            verifyHeader = True
        else:
            __teePrintOrNot('File not found','error',teeLogger=teeLogger)
            # FileNotFoundError is an Exception subclass, so existing handlers still match.
            raise FileNotFoundError("File not found")
    # Column count is unknown (-1) unless a header defines it.
    correctColumnNum = -1
    with open(fileName, mode ='rb')as file:
        if header.strip():
            if verifyHeader:
                # Decode with the same encoding the file is created with above.
                line = file.readline().decode(encoding or 'utf8').strip()
                if verbose:
                    __teePrintOrNot(f"Header: {header.strip()}",teeLogger=teeLogger)
                    __teePrintOrNot(f"First line: {line}",teeLogger=teeLogger)
                if not line.lower().replace(' ','').startswith(header.strip().lower().replace(' ','')):
                    __teePrintOrNot(f"Header mismatch: \n{line} \n!= \n{header.strip()}",teeLogger=teeLogger)
                    raise Exception("Data format error! Header mismatch")
            correctColumnNum = len(header.strip().split('\t'))
            if verbose:
                __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
        if lastLineOnly:
            # NOTE(review): read_last_valid_line re-opens the file and scans it from
            # the end, so for a file that contains only the header line the header
            # itself is presumably returned as the "last valid line" - confirm.
            lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict)
            if lineCache:
                taskDic[lineCache[0]] = lineCache
            return lineCache
        for line in file:
            correctColumnNum, lineCache = processLine(line,taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict)
    return taskDic
|
|
203
|
+
|
|
204
|
+
def __teePrintOrNot(message,level = 'info',teeLogger = None):
    """
    Emit a message through the tee logger when one is given, else print it.

    If logging (or printing) raises, the message is printed as a fallback.

    Parameters:
    message (str): The message to be printed or logged.
    level (str, optional): The log level handed to teeLogger.teelog. Defaults to 'info'.
    teeLogger (object, optional): Logger exposing teelog(message, level). Defaults to None.

    Returns:
    None
    """
    try:
        if not teeLogger:
            print(message)
        else:
            teeLogger.teelog(message,level)
    except Exception:
        # Logging must never break the caller: fall back to plain printing.
        print(message)
|
|
223
|
+
|
|
224
|
+
def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8'):
    """
    Append a line of data to a TSV file.

    Parameters:
    - fileName (str): The path of the TSV file.
    - lineToAppend (str or list): The line of data to append. If it is a string, it will be stripped and split by tabs ('\\t') to form a list. A list is copied (the caller's list is not modified); its items are converted with str() and None becomes ''.
    - teeLogger (optional): A logger object for logging messages.
    - header (str or list, optional): The header line to verify against; a list is joined with tabs. If provided, it also defines the number of columns the line is padded / truncated to.
    - createIfNotExist (bool, optional): If True, the file will be created (with the header line) if it does not exist. If False and the file does not exist, an exception will be raised.
    - verifyHeader (bool, optional): If True, the function will verify that the first line of the file starts with the provided header. Forced to True when the file is created here.
    - verbose (bool, optional): If True, additional information will be printed during the execution.
    - encoding (str, optional): The encoding used to create the file, decode its header line and encode the appended line.
    Raises:
    - Exception: If the file does not exist and createIfNotExist is False.
    - Exception: If the existing header does not match the provided header.
    """
    if not isinstance(header, str):
        header = '\t'.join(header)
    if not header.endswith('\n'):
        header += '\n'
    codec = encoding or 'utf8'
    if not os.path.isfile(fileName):
        if createIfNotExist:
            with open(fileName, mode ='w',encoding=encoding)as file:
                file.write(header)
            __teePrintOrNot('Created '+fileName,teeLogger=teeLogger)
            verifyHeader = True
        else:
            __teePrintOrNot('File not found','error',teeLogger=teeLogger)
            raise Exception("File not found")

    # Always work on our own list so padding never leaks into the caller's object.
    if isinstance(lineToAppend, str):
        fields = lineToAppend.strip().split('\t')
    else:
        fields = ['' if field is None else str(field) for field in lineToAppend]

    with open(fileName, mode ='r+b')as file:
        if header.strip():
            if verifyHeader:
                line = file.readline().decode(codec).strip()
                if verbose:
                    __teePrintOrNot(f"Header: {header.strip()}",teeLogger=teeLogger)
                    __teePrintOrNot(f"First line: {line}",teeLogger=teeLogger)
                if not line.lower().replace(' ','').startswith(header.strip().lower().replace(' ','')):
                    __teePrintOrNot(f"Header mismatch: \n{line} \n!= \n{header.strip()}",teeLogger=teeLogger)
                    raise Exception("Data format error! Header mismatch")
            correctColumnNum = len(header.strip().split('\t'))
            if verbose:
                __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
        else:
            correctColumnNum = len(fields)
        # truncate / fill the line to the correct number of columns
        if len(fields) < correctColumnNum:
            fields = fields + ['']*(correctColumnNum-len(fields))
        elif len(fields) > correctColumnNum:
            fields = fields[:correctColumnNum]
        # Make sure the file ends in a newline before appending. An empty file
        # has no last byte to inspect (seeking to -1 would raise OSError).
        file.seek(0, os.SEEK_END)
        if file.tell() > 0:
            file.seek(-1, os.SEEK_END)
            endsWithNewline = file.read(1) == b'\n'
            file.seek(0, os.SEEK_END)
            if not endsWithNewline:
                file.write(b'\n')
        file.write('\t'.join(fields).encode(codec) + b'\n')
        if verbose:
            __teePrintOrNot(f"Appended {fields} to {fileName}",teeLogger=teeLogger)
|
|
283
|
+
|
|
284
|
+
# Create a TSV class that behaves like an ordered dictionary but updates the file when modified
|
|
285
|
+
class TSVZed(OrderedDict):
    """
    OrderedDict of key -> list-of-columns rows that mirrors itself to a TSV file.

    Assignments and deletions are queued as text lines in `appendQueue` and
    appended to the file by a background daemon thread (`_appendWorker`).
    A deletion is recorded as the key followed by empty columns. `rewrite` /
    `mapToFile` rewrite the whole file from the in-memory rows. While
    `memoryOnly` is True, changes stay in memory and nothing is queued.
    """

    def __teePrintOrNot(self,message,level = 'info'):
        """Log message via self.teeLogger.teelog if a logger is set, else print it."""
        try:
            if self.teeLogger:
                self.teeLogger.teelog(message,level)
            else:
                print(message)
        except Exception:
            # Logging must never break the caller: fall back to plain printing.
            print(message)

    def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,verbose = False,encoding = None):
        """
        Load fileName and start the background append worker.

        Parameters:
        - fileName (str): Path of the backing TSV file.
        - teeLogger (optional): Logger exposing teelog(message, level); print is used when None.
        - header (str or list, optional): Header line (tab separated) or list of column names.
        - createIfNotExist (bool, optional): Passed to readTSV when loading.
        - verifyHeader (bool, optional): Passed to readTSV when loading.
        - rewrite_on_load (bool, optional): Rewrite the file from memory right after loading.
        - rewrite_on_exit (bool, optional): Force a rewrite when the append thread is stopped.
        - rewrite_interval (float, optional): Non-forced rewrites of a dirty table only happen when this is non-zero and the file's mtime is at least this many seconds old.
        - append_check_delay (float, optional): Seconds the worker sleeps between queue checks; negative values are replaced by 0.00001.
        - monitor_external_changes (bool, optional): Default for rewrite()'s reloadInternalFromFile.
        - verbose (bool, optional): Print progress messages.
        - encoding (str, optional): File encoding; 'utf8' is used for writing when not set.
        """
        super().__init__()
        self.version = version
        self._fileName = fileName
        self.teeLogger = teeLogger
        self.header = header.strip() if isinstance(header, str) else '\t'.join(header)
        self.correctColumnNum = -1
        self.createIfNotExist = createIfNotExist
        self.verifyHeader = verifyHeader
        self.rewrite_on_load = rewrite_on_load
        self.rewrite_on_exit = rewrite_on_exit
        self.rewrite_interval = rewrite_interval
        self.monitor_external_changes = monitor_external_changes
        self.verbose = verbose
        if append_check_delay < 0:
            append_check_delay = 0.00001
            self.__teePrintOrNot('append_check_delay cannot be less than 0, setting it to 0.00001','error')
        self.append_check_delay = append_check_delay
        self.appendQueue = deque()
        self.dirty = False
        self.deSynced = False
        self.memoryOnly = False
        self.encoding = encoding
        self.writeLock = threading.Lock()
        self.shutdownEvent = threading.Event()
        self.appendThread = threading.Thread(target=self._appendWorker,daemon=True)
        # Load first, start the worker afterwards: the worker calls rewrite(),
        # which must not run concurrently with the initial load, and no thread
        # should be left running if the load raises (e.g. header mismatch).
        self.load()
        self.appendThread.start()
        atexit.register(self.stopAppendThread)

    def load(self):
        """Reload from the file and, if rewrite_on_load is set, rewrite the file from memory. Returns self."""
        self.reload()
        if self.rewrite_on_load:
            self.rewrite(force = True,reloadInternalFromFile = False)
        return self

    def reload(self):
        """Replace the in-memory rows with the content of the file. Returns self."""
        # Load or refresh data from the TSV file. memoryOnly is forced on so
        # that the rows inserted by readTSV are not queued for appending again.
        mo = self.memoryOnly
        self.memoryOnly = True
        try:
            if self.verbose:
                self.__teePrintOrNot(f"Loading {self._fileName}")
            super().clear()
            readTSV(self._fileName, teeLogger = self.teeLogger, header = self.header, createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader, verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None)
            if self.verbose:
                self.__teePrintOrNot(f"Loaded {len(self)} records from {self._fileName}")
            self.correctColumnNum = len(self.header.split('\t')) if (self.header and self.verifyHeader) else (len(self[next(iter(self))]) if self else -1)
            if self.verbose:
                self.__teePrintOrNot(f"correctColumnNum: {self.correctColumnNum}")
                self.__teePrintOrNot(f"TSVZed({self._fileName}) loaded")
        finally:
            # Restore the flag even if reading failed; otherwise every later
            # change would silently stay in memory only.
            self.memoryOnly = mo
        return self

    def __setitem__(self,key,value):
        """
        Store a row under key and queue it for appending to the file.

        key is converted with str() and stripped; an empty key is rejected with
        an error message. value may be a tab-separated string, an iterable of
        columns, or None (no columns). The key is prepended when it is not
        already the first column. In a multi-column table, a value with no
        columns besides the key deletes the key instead.

        Raises:
        - AssertionError: If the number of columns differs from correctColumnNum.
        """
        key = str(key).strip()
        if not key:
            self.__teePrintOrNot('Key cannot be empty','error')
            return
        if value is None:
            value = []
        elif isinstance(value, str):
            value = value.strip().split('\t')
        # sanitize the value: None becomes '', everything else is stringified
        # (so that falsy data such as 0 is preserved rather than blanked).
        value = ['' if segment is None else str(segment).strip() for segment in value]
        # the first field in value should be the key
        # add it if it is not there
        if not value or value[0] != key:
            value = [key]+value
        # verify the value has the correct number of columns
        if self.correctColumnNum > 1 and len(value) == 1:
            # this means we want to clear / delete the key; stop here so the
            # key is not re-inserted as a bare [key] row.
            self.__delitem__(key)
            return
        if self.correctColumnNum > 0:
            assert len(value) == self.correctColumnNum, f"Data format error! Expected {self.correctColumnNum} columns, but got {len(value) } columns"
        else:
            # Column count still unknown: the first row defines it (this mirrors
            # what processLine does when reading a file without a header).
            self.correctColumnNum = len(value)
        if self.verbose:
            self.__teePrintOrNot(f"Setting {key} to {value}")

        if key in self:
            if self[key] == value:
                if self.verbose:
                    self.__teePrintOrNot(f"Key {key} already exists with the same value")
                return
        self.dirty = True
        # update the dictionary,
        super().__setitem__(key,value)
        if self.verbose:
            self.__teePrintOrNot(f"Key {key} updated")
        if self.memoryOnly:
            return
        if self.verbose:
            self.__teePrintOrNot(f"Appending {key} to the appendQueue")
        self.appendQueue.append('\t'.join(value))

    def __delitem__(self,key):
        """Delete key (silently ignoring a missing key) and queue a deletion marker line."""
        key = str(key).strip()
        # delete the key from the dictionary and update the file
        if key not in self:
            if self.verbose:
                self.__teePrintOrNot(f"Key {key} not found")
            return
        super().__delitem__(key)
        if self.memoryOnly:
            return
        self.__appendEmptyLine(key)

    def __appendEmptyLine(self,key):
        """Queue the deletion marker for key: the key followed by empty columns. Returns self."""
        self.dirty = True
        # Callers remove the key before calling this, so the row itself cannot
        # be consulted for its width; only correctColumnNum is available.
        if self.correctColumnNum > 0:
            emptyLine = key+'\t'*(self.correctColumnNum-1)
        else:
            emptyLine = key
        if self.verbose:
            self.__teePrintOrNot(f"Appending {emptyLine} to the appendQueue")
        self.appendQueue.append(emptyLine)
        return self

    def clear(self):
        """Remove all rows and, unless memoryOnly, reset the file to just its header. Returns self."""
        # clear the dictionary and update the file
        super().clear()
        if self.verbose:
            self.__teePrintOrNot(f"Clearing {self._fileName}")
        if self.memoryOnly:
            return self
        # Drop pending lines too, otherwise the worker would re-append rows
        # that were just cleared.
        self.appendQueue.clear()
        self.clear_file()
        return self

    def clear_file(self):
        """Truncate the file, writing back only the header when there is one. Returns self."""
        try:
            file = self.get_file_obj('w')
            try:
                if self.header:
                    file.write(self.header+'\n')
            finally:
                # Always give the lock and the handle back, even if the write failed.
                self.release_file_obj(file)
            if self.verbose:
                if self.header:
                    self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
                else:
                    self.__teePrintOrNot(f"File {self._fileName} cleared empty")
                self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
            self.dirty = False
            self.deSynced = False
        except Exception as e:
            self.__teePrintOrNot(f"Failed to write at clear_file() to {self._fileName}: {e}",'error')
            import traceback
            self.__teePrintOrNot(traceback.format_exc(),'error')
            self.deSynced = True
        return self

    def __enter__(self):
        return self

    def __exit__(self,exc_type,exc_value,traceback):
        """Stop the append thread. Never suppresses an exception raised inside the with block."""
        self.stopAppendThread()
        # A truthy return value would swallow the exception.
        return False

    def __repr__(self):
        return f"""TSVZed(
file_name:{self._fileName}
teeLogger:{self.teeLogger}
header:{self.header}
correctColumnNum:{self.correctColumnNum}
createIfNotExist:{self.createIfNotExist}
verifyHeader:{self.verifyHeader}
rewrite_on_load:{self.rewrite_on_load}
rewrite_on_exit:{self.rewrite_on_exit}
rewrite_interval:{self.rewrite_interval}
append_check_delay:{self.append_check_delay}
appendQueueLength:{len(self.appendQueue)}
appendThreadAlive:{self.appendThread.is_alive()}
dirty:{self.dirty}
deSynced:{self.deSynced}
memoryOnly:{self.memoryOnly}
{dict(self)})"""

    def close(self):
        """Stop the append thread (flushing pending lines). Returns self."""
        self.stopAppendThread()
        return self

    def __str__(self):
        return f"TSVZed({self._fileName},{dict(self)})"

    def __del__(self):
        # The instance may be only partially built if __init__ raised early.
        if getattr(self, 'appendThread', None) is not None:
            self.stopAppendThread()

    def popitem(self, last=True):
        """Remove and return a (key, value) pair and queue a deletion marker for the key."""
        key, value = super().popitem(last)
        if not self.memoryOnly:
            self.__appendEmptyLine(key)
        return key, value

    __marker = object()

    def pop(self, key, default=__marker):
        '''od.pop(k[,d]) -> v, remove specified key and return the corresponding
        value. If key is not found, d is returned if given, otherwise KeyError
        is raised.

        '''
        if key not in self:
            if default is self.__marker:
                raise KeyError(key)
            return default
        value = super().pop(key)
        if not self.memoryOnly:
            self.__appendEmptyLine(key)
        return value

    def move_to_end(self, key, last=True):
        '''Move an existing element to the end (or beginning if last is false).
        Raise KeyError if the element does not exist.

        The new order only reaches the file on a full rewrite, so
        rewrite_on_exit is switched on. Returns self.
        '''
        super().move_to_end(key, last)
        self.dirty = True
        if not self.rewrite_on_exit:
            self.rewrite_on_exit = True
            self.__teePrintOrNot(f"Warning: move_to_end had been called. Need to resync for changes to apply to disk.")
            self.__teePrintOrNot(f"rewrite_on_exit set to True")
        if self.verbose:
            self.__teePrintOrNot(f"Warning: Trying to move Key {key} moved to {'end' if last else 'beginning'} Need to resync for changes to apply to disk")
        return self

    @classmethod
    def fromkeys(cls, iterable, value=None,fileName = None,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,verbose = False):
        '''Create a new ordered dictionary with keys from iterable and values set to value.
        '''
        # Keyword arguments keep every option in its own slot; __init__ has
        # monitor_external_changes between append_check_delay and verbose.
        self = cls(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,rewrite_on_load = rewrite_on_load,rewrite_on_exit = rewrite_on_exit,rewrite_interval = rewrite_interval,append_check_delay = append_check_delay,verbose = verbose)
        for key in iterable:
            self[key] = value
        return self

    def rewrite(self,force = False,reloadInternalFromFile = None):
        """
        Rewrite the whole file from the in-memory rows when needed.

        Without force, nothing happens unless the table is dirty, and (when not
        deSynced) rewrite_interval is non-zero and the file's mtime is at least
        that old. When reloadInternalFromFile is true (default:
        monitor_external_changes), pending lines are committed and the file is
        re-read before it is rewritten.

        Returns:
        - bool: True if a rewrite was attempted, False if skipped or on error.
        """
        if not self.dirty and not force:
            return False
        if not self.deSynced and not force:
            if self.rewrite_interval == 0 or time.time() - os.path.getmtime(self._fileName) < self.rewrite_interval:
                return False
        try:
            if self.verbose:
                self.__teePrintOrNot(f"Rewriting {self._fileName}")
            if reloadInternalFromFile is None:
                reloadInternalFromFile = self.monitor_external_changes
            if reloadInternalFromFile:
                # this will be needed if more than 1 process is accessing the file
                self.commitAppendToFile()
                self.reload()
            self.mapToFile()
            if self.verbose:
                self.__teePrintOrNot(f"{len(self)} records rewrote to {self._fileName}")
            if not self.appendThread.is_alive():
                self.commitAppendToFile()
            return True
        except Exception as e:
            self.__teePrintOrNot(f"Failed to write at sync() to {self._fileName}: {e}",'error')
            import traceback
            self.__teePrintOrNot(traceback.format_exc(),'error')
            self.deSynced = True
            return False

    def mapToFile(self):
        """Overwrite the file with the header (if any) followed by every in-memory row. Returns self."""
        try:
            # Snapshot the rows before the file is opened (and truncated).
            rows = list(self.values())
            file = self.get_file_obj('w')
            try:
                if self.header:
                    file.write(self.header+'\n')
                for row in rows:
                    file.write('\t'.join(row)+'\n')
            finally:
                # Always give the lock and the handle back, even if a write failed.
                self.release_file_obj(file)
            if self.verbose:
                self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
                self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
            self.dirty = False
            self.deSynced = False
        except Exception as e:
            self.__teePrintOrNot(f"Failed to write at dumpToFile() to {self._fileName}: {e}",'error')
            import traceback
            self.__teePrintOrNot(traceback.format_exc(),'error')
            self.deSynced = True
        return self

    def _appendWorker(self):
        """Background loop: rewrite when due, flush queued lines, sleep; flush once more on shutdown."""
        while not self.shutdownEvent.is_set():
            self.rewrite()
            self.commitAppendToFile()
            time.sleep(self.append_check_delay)
        if self.verbose:
            self.__teePrintOrNot(f"Append worker for {self._fileName} shut down")
        self.commitAppendToFile()

    def commitAppendToFile(self):
        """Append every queued line to the file. On failure the table is marked deSynced. Returns self."""
        if self.appendQueue:
            try:
                if self.verbose:
                    self.__teePrintOrNot(f"Commiting {len(self.appendQueue)} records to {self._fileName}")
                    self.__teePrintOrNot(f"Before size of {self._fileName}: {os.path.getsize(self._fileName)}")
                file = self.get_file_obj('a')
                try:
                    while self.appendQueue:
                        line = self.appendQueue.popleft()
                        file.write(line+'\n')
                finally:
                    # Always give the lock and the handle back, even if a write failed.
                    self.release_file_obj(file)
                if self.verbose:
                    self.__teePrintOrNot(f"Records commited to {self._fileName}")
                    self.__teePrintOrNot(f"After size of {self._fileName}: {os.path.getsize(self._fileName)}")
            except Exception as e:
                self.__teePrintOrNot(f"Failed to write at commitAppendToFile to {self._fileName}: {e}",'error')
                import traceback
                self.__teePrintOrNot(traceback.format_exc(),'error')
                self.deSynced = True
        return self

    def stopAppendThread(self):
        """Run a final rewrite check, signal the worker to stop and wait for it. Safe to call repeatedly."""
        if self.shutdownEvent.is_set():
            return
        self.rewrite(force=self.rewrite_on_exit) # Ensure any final sync operations are performed
        self.shutdownEvent.set() # Signal the append thread to shut down
        if self.appendThread.is_alive():
            self.appendThread.join() # Wait for the append thread to complete
        else:
            # No running worker (never started, or already dead): joining a
            # never-started thread raises, so flush whatever is queued here.
            self.commitAppendToFile()
        if self.verbose:
            self.__teePrintOrNot(f"Append thread for {self._fileName} stopped")

    def get_file_obj(self,modes = 'a'):
        """
        Acquire the thread lock, open the file in the given mode and lock it.

        The returned file must be handed to release_file_obj, which unlocks and
        closes it and releases the thread lock.
        """
        self.writeLock.acquire()
        file = None
        try:
            if not self.encoding:
                self.encoding = 'utf8'
            file = open(self._fileName, mode=modes, encoding=self.encoding)
            # Lock the file after opening
            if os.name == 'posix':
                fcntl.lockf(file, fcntl.LOCK_EX)
            elif os.name == 'nt':
                # For Windows, locking the entire file, avoiding locking an empty file
                lock_length = max(1, os.path.getsize(self._fileName))
                msvcrt.locking(file.fileno(), msvcrt.LK_LOCK, lock_length)
            if self.verbose:
                self.__teePrintOrNot(f"File {self._fileName} locked with mode {modes}")
        except BaseException:
            # Cleanup only: do not leak the handle or keep the thread lock held
            # (an interrupt while waiting for the file lock would otherwise
            # block every later write), then let the error propagate.
            if file is not None:
                file.close()
            self.writeLock.release()
            raise
        return file

    def release_file_obj(self,file):
        """Flush, unlock and close a file obtained from get_file_obj, then release the thread lock."""
        try:
            try:
                # Push buffered writes out while the file lock is still held;
                # otherwise they would only reach the file at close(), after
                # another process may already have taken the lock.
                file.flush()
                if os.name == 'posix':
                    fcntl.lockf(file, fcntl.LOCK_UN)
                elif os.name == 'nt':
                    # Unlocking the entire file; for Windows, ensure not unlocking an empty file
                    unlock_length = max(1, os.path.getsize(file.name))
                    msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
            finally:
                file.close() # Ensure file is closed even if unlocking failed
            if self.verbose:
                self.__teePrintOrNot(f"File {file.name} unlocked / released")
        finally:
            self.writeLock.release() # Ensure the thread lock is always released
|