TSVZ 3.29__py3-none-any.whl → 3.35__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in the registry.
TSVZ.py CHANGED
@@ -4,28 +4,32 @@
4
4
  # dependencies = [
5
5
  # ]
6
6
  # ///
7
- import os , sys
8
- from collections import OrderedDict , deque
9
- import time
10
7
  import atexit
11
- import threading
8
+ import functools
9
+ import io
10
+ import os
12
11
  import re
13
-
12
+ from tabnanny import verbose
13
+ import threading
14
+ import time
15
+ import sys
16
+ from collections import OrderedDict, deque
17
+ from collections.abc import MutableMapping
14
18
  RESOURCE_LIB_AVAILABLE = True
15
19
  try:
16
- import resource
17
- except:
18
- RESOURCE_LIB_AVAILABLE = False
20
+ import resource
21
+ except ImportError:
22
+ RESOURCE_LIB_AVAILABLE = False
19
23
 
20
24
  if os.name == 'nt':
21
- import msvcrt
25
+ import msvcrt
22
26
  elif os.name == 'posix':
23
- import fcntl
27
+ import fcntl
24
28
 
25
- version = '3.29'
29
+ version = '3.35'
26
30
  __version__ = version
27
31
  author = 'pan@zopyr.us'
28
- COMMIT_DATE = '2025-08-11'
32
+ COMMIT_DATE = '2025-11-13'
29
33
 
30
34
  DEFAULT_DELIMITER = '\t'
31
35
  DEFAULTS_INDICATOR_KEY = '#_defaults_#'
@@ -33,137 +37,216 @@ DEFAULTS_INDICATOR_KEY = '#_defaults_#'
33
37
  COMPRESSED_FILE_EXTENSIONS = ['gz','gzip','bz2','bzip2','xz','lzma']
34
38
 
35
39
  def get_delimiter(delimiter,file_name = ''):
36
- global DEFAULT_DELIMITER
37
- if not delimiter:
38
- return DEFAULT_DELIMITER
39
- elif delimiter == ...:
40
- if not file_name:
41
- rtn = '\t'
42
- elif file_name.endswith('.csv'):
43
- rtn = ','
44
- elif file_name.endswith('.nsv'):
45
- rtn = '\0'
46
- elif file_name.endswith('.psv'):
47
- rtn = '|'
48
- else:
49
- rtn = '\t'
50
- elif delimiter == 'comma':
51
- rtn = ','
52
- elif delimiter == 'tab':
53
- rtn = '\t'
54
- elif delimiter == 'pipe':
55
- rtn = '|'
56
- elif delimiter == 'null':
57
- rtn = '\0'
58
- else:
59
- rtn = delimiter.encode().decode('unicode_escape')
60
- DEFAULT_DELIMITER = rtn
61
- return rtn
40
+ global DEFAULT_DELIMITER
41
+ if not delimiter:
42
+ return DEFAULT_DELIMITER
43
+ elif delimiter == ...:
44
+ if not file_name:
45
+ rtn = '\t'
46
+ elif file_name.endswith('.csv'):
47
+ rtn = ','
48
+ elif file_name.endswith('.nsv'):
49
+ rtn = '\0'
50
+ elif file_name.endswith('.psv'):
51
+ rtn = '|'
52
+ else:
53
+ rtn = '\t'
54
+ elif delimiter == 'comma':
55
+ rtn = ','
56
+ elif delimiter == 'tab':
57
+ rtn = '\t'
58
+ elif delimiter == 'pipe':
59
+ rtn = '|'
60
+ elif delimiter == 'null':
61
+ rtn = '\0'
62
+ else:
63
+ rtn = delimiter.encode().decode('unicode_escape')
64
+ DEFAULT_DELIMITER = rtn
65
+ return rtn
66
+
67
+ def eprint(*args, **kwargs):
68
+ try:
69
+ if 'file' in kwargs:
70
+ print(*args, **kwargs)
71
+ else:
72
+ print(*args, file=sys.stderr, **kwargs)
73
+ except Exception as e:
74
+ print(f"Error: Cannot print to stderr: {e}")
75
+ print(*args, **kwargs)
62
76
 
63
77
  def openFileAsCompressed(fileName,mode = 'rb',encoding = 'utf8',teeLogger = None,compressLevel = 1):
64
- if 'b' not in mode:
65
- mode += 't'
66
- kwargs = {}
67
- if 'r' not in mode:
68
- if fileName.endswith('.xz'):
69
- kwargs['preset'] = compressLevel
70
- else:
71
- kwargs['compresslevel'] = compressLevel
72
- if 'b' not in mode:
73
- kwargs['encoding'] = encoding
74
- if fileName.endswith('.xz') or fileName.endswith('.lzma'):
75
- try:
76
- import lzma
77
- return lzma.open(fileName, mode, **kwargs)
78
- except:
79
- __teePrintOrNot(f"Failed to open {fileName} with lzma, trying bin",teeLogger=teeLogger)
80
- elif fileName.endswith('.gz') or fileName.endswith('.gzip'):
81
- try:
82
- import gzip
83
- return gzip.open(fileName, mode, **kwargs)
84
- except:
85
- __teePrintOrNot(f"Failed to open {fileName} with gzip, trying bin",teeLogger=teeLogger)
86
- elif fileName.endswith('.bz2') or fileName.endswith('.bzip2'):
87
- try:
88
- import bz2
89
- return bz2.open(fileName, mode, **kwargs)
90
- except:
91
- __teePrintOrNot(f"Failed to open {fileName} with bz2, trying bin",teeLogger=teeLogger)
92
- if 't' in mode:
93
- mode = mode.replace('t','')
94
- return open(fileName, mode, encoding=encoding)
95
- if 'b' not in mode:
96
- mode += 'b'
97
- return open(fileName, mode)
98
-
99
-
100
- def pretty_format_table(data, delimiter = DEFAULT_DELIMITER,header = None):
101
- version = 1.11
102
- _ = version
103
- if not data:
104
- return ''
105
- if isinstance(data, str):
106
- data = data.strip('\n').split('\n')
107
- data = [line.split(delimiter) for line in data]
108
- elif isinstance(data, dict):
109
- # flatten the 2D dict to a list of lists
110
- if isinstance(next(iter(data.values())), dict):
111
- tempData = [['key'] + list(next(iter(data.values())).keys())]
112
- tempData.extend( [[key] + list(value.values()) for key, value in data.items()])
113
- data = tempData
114
- else:
115
- # it is a dict of lists
116
- data = [[key] + list(value) for key, value in data.items()]
117
- elif not isinstance(data,list):
118
- data = list(data)
119
- # format the list into 2d list of list of strings
120
- if isinstance(data[0], dict):
121
- tempData = [data[0].keys()]
122
- tempData.extend([list(item.values()) for item in data])
123
- data = tempData
124
- data = [[str(item) for item in row] for row in data]
125
- num_cols = len(data[0])
126
- col_widths = [0] * num_cols
127
- # Calculate the maximum width of each column
128
- for c in range(num_cols):
129
- #col_widths[c] = max(len(row[c]) for row in data)
130
- # handle ansii escape sequences
131
- col_widths[c] = max(len(re.sub(r'\x1b\[[0-?]*[ -/]*[@-~]','',row[c])) for row in data)
132
- if header:
133
- header_widths = [len(re.sub(r'\x1b\[[0-?]*[ -/]*[@-~]', '', col)) for col in header]
134
- col_widths = [max(col_widths[i], header_widths[i]) for i in range(num_cols)]
135
- # Build the row format string
136
- row_format = ' | '.join('{{:<{}}}'.format(width) for width in col_widths)
137
- # Print the header
138
- if not header:
139
- header = data[0]
140
- outTable = []
141
- outTable.append(row_format.format(*header))
142
- outTable.append('-+-'.join('-' * width for width in col_widths))
143
- for row in data[1:]:
144
- # if the row is empty, print an divider
145
- if not any(row):
146
- outTable.append('-+-'.join('-' * width for width in col_widths))
147
- else:
148
- outTable.append(row_format.format(*row))
149
- else:
150
- # pad / truncate header to appropriate length
151
- if isinstance(header,str):
152
- header = header.split(delimiter)
153
- if len(header) < num_cols:
154
- header += ['']*(num_cols-len(header))
155
- elif len(header) > num_cols:
156
- header = header[:num_cols]
157
- outTable = []
158
- outTable.append(row_format.format(*header))
159
- outTable.append('-+-'.join('-' * width for width in col_widths))
160
- for row in data:
161
- # if the row is empty, print an divider
162
- if not any(row):
163
- outTable.append('-+-'.join('-' * width for width in col_widths))
164
- else:
165
- outTable.append(row_format.format(*row))
166
- return '\n'.join(outTable) + '\n'
78
+ if 'b' not in mode:
79
+ mode += 't'
80
+ kwargs = {}
81
+ if 'r' not in mode:
82
+ if fileName.endswith('.xz'):
83
+ kwargs['preset'] = compressLevel
84
+ else:
85
+ kwargs['compresslevel'] = compressLevel
86
+ if 'b' not in mode:
87
+ kwargs['encoding'] = encoding
88
+ if fileName.endswith('.xz') or fileName.endswith('.lzma'):
89
+ try:
90
+ import lzma
91
+ return lzma.open(fileName, mode, **kwargs)
92
+ except Exception:
93
+ __teePrintOrNot(f"Failed to open {fileName} with lzma, trying bin",teeLogger=teeLogger)
94
+ elif fileName.endswith('.gz') or fileName.endswith('.gzip'):
95
+ try:
96
+ import gzip
97
+ return gzip.open(fileName, mode, **kwargs)
98
+ except Exception:
99
+ __teePrintOrNot(f"Failed to open {fileName} with gzip, trying bin",teeLogger=teeLogger)
100
+ elif fileName.endswith('.bz2') or fileName.endswith('.bzip2'):
101
+ try:
102
+ import bz2
103
+ return bz2.open(fileName, mode, **kwargs)
104
+ except Exception:
105
+ __teePrintOrNot(f"Failed to open {fileName} with bz2, trying bin",teeLogger=teeLogger)
106
+ if 't' in mode:
107
+ mode = mode.replace('t','')
108
+ return open(fileName, mode, encoding=encoding)
109
+ if 'b' not in mode:
110
+ mode += 'b'
111
+ return open(fileName, mode)
112
+
113
+ def get_terminal_size():
114
+ '''
115
+ Get the terminal size
116
+
117
+ @params:
118
+ None
119
+
120
+ @returns:
121
+ (int,int): the number of columns and rows of the terminal
122
+ '''
123
+ try:
124
+ import os
125
+ _tsize = os.get_terminal_size()
126
+ except Exception:
127
+ try:
128
+ import fcntl
129
+ import struct
130
+ import termios
131
+ packed = fcntl.ioctl(0, termios.TIOCGWINSZ, struct.pack('HHHH', 0, 0, 0, 0))
132
+ _tsize = struct.unpack('HHHH', packed)[:2]
133
+ except Exception:
134
+ import shutil
135
+ _tsize = shutil.get_terminal_size(fallback=(240, 50))
136
+ return _tsize
137
+
138
+ def pretty_format_table(data, delimiter="\t", header=None, full=False):
139
+ version = 1.12
140
+ _ = version
141
+ def visible_len(s):
142
+ return len(re.sub(r"\x1b\[[0-?]*[ -/]*[@-~]", "", s))
143
+ def table_width(col_widths, sep_len):
144
+ # total width = sum of column widths + separators between columns
145
+ return sum(col_widths) + sep_len * (len(col_widths) - 1)
146
+ def truncate_to_width(s, width):
147
+ # If fits, leave as is. If too long and width >= 1, keep width-1 chars + "."
148
+ # If width == 0, nothing fits; return empty string.
149
+ if visible_len(s) <= width:
150
+ return s
151
+ if width <= 0:
152
+ return ""
153
+ # Build a truncated plain string based on visible chars (no ANSI awareness for slicing)
154
+ # For simplicity, slice the raw string. This may cut ANSI; best to avoid ANSI in data if truncation occurs.
155
+ return s[: max(width - 2, 0)] + ".."
156
+ if not data:
157
+ return ""
158
+ # Normalize input data structure
159
+ if isinstance(data, str):
160
+ data = data.strip("\n").split("\n")
161
+ data = [line.split(delimiter) for line in data]
162
+ elif isinstance(data, dict):
163
+ if isinstance(next(iter(data.values())), dict):
164
+ tempData = [["key"] + list(next(iter(data.values())).keys())]
165
+ tempData.extend([[key] + list(value.values()) for key, value in data.items()])
166
+ data = tempData
167
+ else:
168
+ data = [[key] + list(value) for key, value in data.items()]
169
+ elif not isinstance(data, list):
170
+ data = list(data)
171
+ if isinstance(data[0], dict):
172
+ tempData = [list(data[0].keys())]
173
+ tempData.extend([list(item.values()) for item in data])
174
+ data = tempData
175
+ data = [[str(item) for item in row] for row in data]
176
+ num_cols = len(data[0])
177
+ # Resolve header and rows
178
+ using_provided_header = header is not None
179
+ if not using_provided_header:
180
+ header = data[0]
181
+ rows = data[1:]
182
+ else:
183
+ if isinstance(header, str):
184
+ header = header.split(delimiter)
185
+ # Pad/trim header to match num_cols
186
+ if len(header) < num_cols:
187
+ header = header + [""] * (num_cols - len(header))
188
+ elif len(header) > num_cols:
189
+ header = header[:num_cols]
190
+ rows = data
191
+ # Compute initial column widths based on data and header
192
+ def compute_col_widths(hdr, rows_):
193
+ col_w = [0] * len(hdr)
194
+ for i in range(len(hdr)):
195
+ col_w[i] = max(0, visible_len(hdr[i]), *(visible_len(r[i]) for r in rows_ if i < len(r)))
196
+ return col_w
197
+ # Ensure all rows have the same number of columns
198
+ normalized_rows = []
199
+ for r in rows:
200
+ if len(r) < num_cols:
201
+ r = r + [""] * (num_cols - len(r))
202
+ elif len(r) > num_cols:
203
+ r = r[:num_cols]
204
+ normalized_rows.append(r)
205
+ rows = normalized_rows
206
+ col_widths = compute_col_widths(header, rows)
207
+ # If full=True, keep existing formatting
208
+ # Else try to fit within the terminal width by:
209
+ # 1) Switching to compressed separators if needed
210
+ # 2) Recursively compressing columns (truncating with ".")
211
+ sep = " | "
212
+ hsep = "-+-"
213
+ cols = get_terminal_size()[0]
214
+ def render(hdr, rows, col_w, sep_str, hsep_str):
215
+ row_fmt = sep_str.join("{{:<{}}}".format(w) for w in col_w)
216
+ out = []
217
+ out.append(row_fmt.format(*hdr))
218
+ out.append(hsep_str.join("-" * w for w in col_w))
219
+ for row in rows:
220
+ if not any(row):
221
+ out.append(hsep_str.join("-" * w for w in col_w))
222
+ else:
223
+ row = [truncate_to_width(row[i], col_w[i]) for i in range(len(row))]
224
+ out.append(row_fmt.format(*row))
225
+ return "\n".join(out) + "\n"
226
+ if full:
227
+ return render(header, rows, col_widths, sep, hsep)
228
+ # Try default separators first
229
+ if table_width(col_widths, len(sep)) <= cols:
230
+ return render(header, rows, col_widths, sep, hsep)
231
+ # Use compressed separators (no spaces)
232
+ sep = "|"
233
+ hsep = "+"
234
+ if table_width(col_widths, len(sep)) <= cols:
235
+ return render(header, rows, col_widths, sep, hsep)
236
+ # Begin column compression
237
+ # Track which columns have been compressed already to header width
238
+ header_widths = [visible_len(h) for h in header]
239
+ width_diff = [max(col_widths[i] - header_widths[i],0) for i in range(num_cols)]
240
+ total_overflow_width = table_width(col_widths, len(sep)) - cols
241
+ for i, diff in sorted(enumerate(width_diff), key=lambda x: -x[1]):
242
+ if total_overflow_width <= 0:
243
+ break
244
+ if diff <= 0:
245
+ continue
246
+ reduce_by = min(diff, total_overflow_width)
247
+ col_widths[i] -= reduce_by
248
+ total_overflow_width -= reduce_by
249
+ return render(header, rows, col_widths, sep, hsep)
167
250
 
168
251
  def format_bytes(size, use_1024_bytes=None, to_int=False, to_str=False,str_format='.2f'):
169
252
  """
@@ -231,14 +314,14 @@ def format_bytes(size, use_1024_bytes=None, to_int=False, to_str=False,str_forma
231
314
  else:
232
315
  try:
233
316
  return int(size)
234
- except Exception as e:
317
+ except Exception:
235
318
  return 0
236
319
  elif to_str or isinstance(size, int) or isinstance(size, float):
237
320
  if isinstance(size, str):
238
321
  try:
239
322
  size = size.rstrip('B').rstrip('b')
240
323
  size = float(size.lower().strip())
241
- except Exception as e:
324
+ except Exception:
242
325
  return size
243
326
  # size is in bytes
244
327
  if use_1024_bytes or use_1024_bytes is None:
@@ -268,918 +351,1079 @@ def format_bytes(size, use_1024_bytes=None, to_int=False, to_str=False,str_forma
268
351
  return 0
269
352
 
270
353
  def get_resource_usage(return_dict = False):
271
- try:
272
- if RESOURCE_LIB_AVAILABLE:
273
- rawResource = resource.getrusage(resource.RUSAGE_SELF)
274
- resourceDict = {}
275
- resourceDict['user mode time'] = f'{rawResource.ru_utime} seconds'
276
- resourceDict['system mode time'] = f'{rawResource.ru_stime} seconds'
277
- resourceDict['max resident set size'] = f'{format_bytes(rawResource.ru_maxrss * 1024)}B'
278
- resourceDict['shared memory size'] = f'{format_bytes(rawResource.ru_ixrss * 1024)}B'
279
- resourceDict['unshared memory size'] = f'{format_bytes(rawResource.ru_idrss * 1024)}B'
280
- resourceDict['unshared stack size'] = f'{format_bytes(rawResource.ru_isrss * 1024)}B'
281
- resourceDict['cached page hits'] = f'{rawResource.ru_minflt}'
282
- resourceDict['missed page hits'] = f'{rawResource.ru_majflt}'
283
- resourceDict['swapped out page count'] = f'{rawResource.ru_nswap}'
284
- resourceDict['block input operations'] = f'{rawResource.ru_inblock}'
285
- resourceDict['block output operations'] = f'{rawResource.ru_oublock}'
286
- resourceDict['IPC messages sent'] = f'{rawResource.ru_msgsnd}'
287
- resourceDict['IPC messages received'] = f'{rawResource.ru_msgrcv}'
288
- resourceDict['signals received'] = f'{rawResource.ru_nsignals}'
289
- resourceDict['voluntary context sw'] = f'{rawResource.ru_nvcsw}'
290
- resourceDict['involuntary context sw'] = f'{rawResource.ru_nivcsw}'
291
- if return_dict:
292
- return resourceDict
293
- return '\n'.join(['\t'.join(line) for line in resourceDict.items()])
294
- except Exception as e:
295
- print(f"Error: {e}")
296
- if return_dict:
297
- return {}
298
- return ''
354
+ try:
355
+ if RESOURCE_LIB_AVAILABLE:
356
+ rawResource = resource.getrusage(resource.RUSAGE_SELF)
357
+ resourceDict = {}
358
+ resourceDict['user mode time'] = f'{rawResource.ru_utime} seconds'
359
+ resourceDict['system mode time'] = f'{rawResource.ru_stime} seconds'
360
+ resourceDict['max resident set size'] = f'{format_bytes(rawResource.ru_maxrss * 1024)}B'
361
+ resourceDict['shared memory size'] = f'{format_bytes(rawResource.ru_ixrss * 1024)}B'
362
+ resourceDict['unshared memory size'] = f'{format_bytes(rawResource.ru_idrss * 1024)}B'
363
+ resourceDict['unshared stack size'] = f'{format_bytes(rawResource.ru_isrss * 1024)}B'
364
+ resourceDict['cached page hits'] = f'{rawResource.ru_minflt}'
365
+ resourceDict['missed page hits'] = f'{rawResource.ru_majflt}'
366
+ resourceDict['swapped out page count'] = f'{rawResource.ru_nswap}'
367
+ resourceDict['block input operations'] = f'{rawResource.ru_inblock}'
368
+ resourceDict['block output operations'] = f'{rawResource.ru_oublock}'
369
+ resourceDict['IPC messages sent'] = f'{rawResource.ru_msgsnd}'
370
+ resourceDict['IPC messages received'] = f'{rawResource.ru_msgrcv}'
371
+ resourceDict['signals received'] = f'{rawResource.ru_nsignals}'
372
+ resourceDict['voluntary context sw'] = f'{rawResource.ru_nvcsw}'
373
+ resourceDict['involuntary context sw'] = f'{rawResource.ru_nivcsw}'
374
+ if return_dict:
375
+ return resourceDict
376
+ return '\n'.join(['\t'.join(line) for line in resourceDict.items()])
377
+ except Exception as e:
378
+ print(f"Error: {e}")
379
+ if return_dict:
380
+ return {}
381
+ return ''
299
382
 
300
383
  def __teePrintOrNot(message,level = 'info',teeLogger = None):
301
- """
302
- Prints the given message or logs it using the provided teeLogger.
303
-
304
- Parameters:
305
- message (str): The message to be printed or logged.
306
- level (str, optional): The log level. Defaults to 'info'.
307
- teeLogger (object, optional): The logger object used for logging. Defaults to None.
308
-
309
- Returns:
310
- None
311
- """
312
- try:
313
- if teeLogger:
314
- try:
315
- teeLogger.teelog(message,level,callerStackDepth=3)
316
- except:
317
- teeLogger.teelog(message,level)
318
- else:
319
- print(message,flush=True)
320
- except Exception:
321
- print(message,flush=True)
322
-
323
- def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True,delimiter = DEFAULT_DELIMITER,defaults = ...):
324
- """
325
- Process a line of text and update the task dictionary.
326
-
327
- Parameters:
328
- line (str): The line of text to process.
329
- taskDic (dict): The dictionary to update with the processed line.
330
- correctColumnNum (int): The expected number of columns in the line.
331
- verbose (bool, optional): Whether to print verbose output. Defaults to False.
332
- teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
333
- strict (bool, optional): Whether to strictly enforce the correct number of columns. Defaults to True.
334
- defaults (list, optional): The default values to use for missing columns. Defaults to [].
335
-
336
- Returns:
337
- tuple: A tuple containing the updated correctColumnNum and the processed lineCache.
338
-
339
- """
340
- if defaults is ...:
341
- defaults = []
342
- line = line.strip(' ').strip('\x00').rstrip('\r\n')
343
- # we throw away the lines that start with '#'
344
- if not line :
345
- if verbose:
346
- __teePrintOrNot(f"Ignoring empty line: {line}",teeLogger=teeLogger)
347
- return correctColumnNum , []
348
- if line.startswith('#') and not line.startswith(DEFAULTS_INDICATOR_KEY):
349
- if verbose:
350
- __teePrintOrNot(f"Ignoring comment line: {line}",teeLogger=teeLogger)
351
- return correctColumnNum , []
352
- # we only interested in the lines that have the correct number of columns
353
- lineCache = [segment.rstrip() for segment in line.split(delimiter)]
354
- if not lineCache:
355
- return correctColumnNum , []
356
- if correctColumnNum == -1:
357
- if defaults and len(defaults) > 1:
358
- correctColumnNum = len(defaults)
359
- else:
360
- correctColumnNum = len(lineCache)
361
- if verbose:
362
- __teePrintOrNot(f"detected correctColumnNum: {len(lineCache)}",teeLogger=teeLogger)
363
- if not lineCache[0]:
364
- if verbose:
365
- __teePrintOrNot(f"Ignoring line with empty key: {line}",teeLogger=teeLogger)
366
- return correctColumnNum , []
367
- if len(lineCache) == 1 or not any(lineCache[1:]):
368
- if correctColumnNum == 1:
369
- taskDic[lineCache[0]] = lineCache
370
- elif lineCache[0] == DEFAULTS_INDICATOR_KEY:
371
- if verbose:
372
- __teePrintOrNot(f"Empty defaults line found: {line}",teeLogger=teeLogger)
373
- defaults.clear()
374
- else:
375
- if verbose:
376
- __teePrintOrNot(f"Key {lineCache[0]} found with empty value, deleting such key's representaion",teeLogger=teeLogger)
377
- if lineCache[0] in taskDic:
378
- del taskDic[lineCache[0]]
379
- return correctColumnNum , []
380
- elif len(lineCache) != correctColumnNum:
381
- if strict and not any(defaults):
382
- if verbose:
383
- __teePrintOrNot(f"Ignoring line with {len(lineCache)} columns: {line}",teeLogger=teeLogger)
384
- return correctColumnNum , []
385
- else:
386
- # fill / cut the line with empty entries til the correct number of columns
387
- if len(lineCache) < correctColumnNum:
388
- lineCache += ['']*(correctColumnNum-len(lineCache))
389
- elif len(lineCache) > correctColumnNum:
390
- lineCache = lineCache[:correctColumnNum]
391
- if verbose:
392
- __teePrintOrNot(f"Correcting {lineCache[0]}",teeLogger=teeLogger)
393
- # now replace empty values with defaults
394
- if defaults and len(defaults) > 1:
395
- for i in range(1,len(lineCache)):
396
- if not lineCache[i] and i < len(defaults) and defaults[i]:
397
- lineCache[i] = defaults[i]
398
- if verbose:
399
- __teePrintOrNot(f"Replacing empty value at {i} with default: {defaults[i]}",teeLogger=teeLogger)
400
- if lineCache[0] == DEFAULTS_INDICATOR_KEY:
401
- if verbose:
402
- __teePrintOrNot(f"Defaults line found: {line}",teeLogger=teeLogger)
403
- defaults[:] = lineCache
404
- return correctColumnNum , []
405
- taskDic[lineCache[0]] = lineCache
406
- if verbose:
407
- __teePrintOrNot(f"Key {lineCache[0]} added",teeLogger=teeLogger)
408
- return correctColumnNum, lineCache
409
-
410
- def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,encoding = 'utf8',delimiter = ...,defaults = ...):
411
- """
412
- Reads the last valid line from a file.
413
-
414
- Args:
415
- fileName (str): The name of the file to read.
416
- taskDic (dict): A dictionary to pass to processLine function.
417
- correctColumnNum (int): A column number to pass to processLine function.
418
- verbose (bool, optional): Whether to print verbose output. Defaults to False.
419
- teeLogger (optional): Logger to use for tee print. Defaults to None.
420
- encoding (str, optional): The encoding of the file. Defaults to None.
421
- strict (bool, optional): Whether to enforce strict processing. Defaults to False.
422
- delimiter (str, optional): The delimiter used in the file. Defaults to None.
423
- defaults (list, optional): The default values to use for missing columns. Defaults to [].
424
-
425
- Returns:
426
- list: The last valid line data processed by processLine, or an empty list if none found.
427
- """
428
- chunk_size = 1024 # Read in chunks of 1024 bytes
429
- last_valid_line = []
430
- if defaults is ...:
431
- defaults = []
432
- delimiter = get_delimiter(delimiter,file_name=fileName)
433
- if verbose:
434
- __teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
435
- with openFileAsCompressed(fileName, 'rb',encoding=encoding, teeLogger=teeLogger) as file:
436
- file.seek(0, os.SEEK_END)
437
- file_size = file.tell()
438
- buffer = b''
439
- position = file_size
440
-
441
- while position > 0:
442
- # Read chunks from the end of the file
443
- read_size = min(chunk_size, position)
444
- position -= read_size
445
- file.seek(position)
446
- chunk = file.read(read_size)
447
-
448
- # Prepend new chunk to buffer
449
- buffer = chunk + buffer
450
-
451
- # Split the buffer into lines
452
- lines = buffer.split(b'\n')
453
-
454
- # Process lines from the last to the first
455
- for i in range(len(lines) - 1, -1, -1):
456
- if lines[i].strip(): # Skip empty lines
457
- # Process the line
458
- correctColumnNum, lineCache = _processLine(
459
- line=lines[i].decode(encoding=encoding,errors='replace'),
460
- taskDic=taskDic,
461
- correctColumnNum=correctColumnNum,
462
- verbose=verbose,
463
- teeLogger=teeLogger,
464
- strict=strict,
465
- delimiter=delimiter,
466
- defaults=defaults,
467
- )
468
- # If the line is valid, return it
469
- if lineCache and any(lineCache):
470
- return lineCache
471
-
472
- # Keep the last (possibly incomplete) line in buffer for the next read
473
- buffer = lines[0]
474
-
475
- # Return empty list if no valid line found
476
- return last_valid_line
384
+ """
385
+ Prints the given message or logs it using the provided teeLogger.
386
+
387
+ Parameters:
388
+ message (str): The message to be printed or logged.
389
+ level (str, optional): The log level. Defaults to 'info'.
390
+ teeLogger (object, optional): The logger object used for logging. Defaults to None.
391
+
392
+ Returns:
393
+ None
394
+ """
395
+ try:
396
+ if teeLogger:
397
+ try:
398
+ teeLogger.teelog(message,level,callerStackDepth=3)
399
+ except Exception:
400
+ teeLogger.teelog(message,level)
401
+ else:
402
+ print(message,flush=True)
403
+ except Exception:
404
+ print(message,flush=True)
405
+
406
+ def _processLine(line,taskDic,correctColumnNum,strict = True,delimiter = DEFAULT_DELIMITER,defaults = ...,
407
+ storeOffset = False, offset = -1):
408
+ """
409
+ Process a line of text and update the task dictionary.
410
+
411
+ Parameters:
412
+ line (str): The line of text to process.
413
+ taskDic (dict): The dictionary to update with the processed line.
414
+ correctColumnNum (int): The expected number of columns in the line.
415
+ strict (bool, optional): Whether to strictly enforce the correct number of columns. Defaults to True.
416
+ defaults (list, optional): The default values to use for missing columns. Defaults to [].
417
+ storeOffset (bool, optional): Whether to store the offset of the line in the taskDic. Defaults to False.
418
+ offset (int, optional): The offset of the line in the file. Defaults to -1.
419
+
420
+ Returns:
421
+ tuple: A tuple containing the updated correctColumnNum and the processed lineCache or offset.
422
+
423
+ """
424
+ if defaults is ...:
425
+ defaults = []
426
+ line = line.strip('\x00').rstrip('\r\n')
427
+ if not line or (line.startswith('#') and not line.startswith(DEFAULTS_INDICATOR_KEY)):
428
+ # if verbose:
429
+ # __teePrintOrNot(f"Ignoring comment line: {line}",teeLogger=teeLogger)
430
+ return correctColumnNum , []
431
+ # we only interested in the lines that have the correct number of columns
432
+ lineCache = _unsanitize(line.split(delimiter),delimiter)
433
+ if not lineCache or not lineCache[0]:
434
+ return correctColumnNum , []
435
+ if correctColumnNum == -1:
436
+ if defaults and len(defaults) > 1:
437
+ correctColumnNum = len(defaults)
438
+ else:
439
+ correctColumnNum = len(lineCache)
440
+ # if verbose:
441
+ # __teePrintOrNot(f"detected correctColumnNum: {len(lineCache)}",teeLogger=teeLogger)
442
+ if len(lineCache) == 1 or not any(lineCache[1:]):
443
+ if correctColumnNum == 1:
444
+ taskDic[lineCache[0]] = lineCache if not storeOffset else offset
445
+ elif lineCache[0] == DEFAULTS_INDICATOR_KEY:
446
+ # if verbose:
447
+ # __teePrintOrNot(f"Empty defaults line found: {line}",teeLogger=teeLogger)
448
+ defaults.clear()
449
+ defaults[0] = DEFAULTS_INDICATOR_KEY
450
+ else:
451
+ # if verbose:
452
+ # __teePrintOrNot(f"Key {lineCache[0]} found with empty value, deleting such key's representaion",teeLogger=teeLogger)
453
+ if lineCache[0] in taskDic:
454
+ del taskDic[lineCache[0]]
455
+ return correctColumnNum , []
456
+ elif len(lineCache) != correctColumnNum:
457
+ if strict and not any(defaults[1:]):
458
+ # if verbose:
459
+ # __teePrintOrNot(f"Ignoring line with {len(lineCache)} columns: {line}",teeLogger=teeLogger)
460
+ return correctColumnNum , []
461
+ else:
462
+ # fill / cut the line with empty entries til the correct number of columns
463
+ if len(lineCache) < correctColumnNum:
464
+ lineCache += ['']*(correctColumnNum-len(lineCache))
465
+ elif len(lineCache) > correctColumnNum:
466
+ lineCache = lineCache[:correctColumnNum]
467
+ # if verbose:
468
+ # __teePrintOrNot(f"Correcting {lineCache[0]}",teeLogger=teeLogger)
469
+ # now replace empty values with defaults
470
+ if defaults and len(defaults) > 1:
471
+ for i in range(1,len(lineCache)):
472
+ if not lineCache[i] and i < len(defaults) and defaults[i]:
473
+ lineCache[i] = defaults[i]
474
+ if lineCache[0] == DEFAULTS_INDICATOR_KEY:
475
+ # if verbose:
476
+ # __teePrintOrNot(f"Defaults line found: {line}",teeLogger=teeLogger)
477
+ defaults[:] = lineCache
478
+ return correctColumnNum , []
479
+ taskDic[lineCache[0]] = lineCache if not storeOffset else offset
480
+ # if verbose:
481
+ # __teePrintOrNot(f"Key {lineCache[0]} added",teeLogger=teeLogger)
482
+ return correctColumnNum, lineCache
483
+
484
+ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,
485
+ encoding = 'utf8',delimiter = ...,defaults = ...,storeOffset = False ):
486
+ """
487
+ Reads the last valid line from a file.
488
+
489
+ Args:
490
+ fileName (str): The name of the file to read.
491
+ taskDic (dict): A dictionary to pass to processLine function.
492
+ correctColumnNum (int): A column number to pass to processLine function.
493
+ verbose (bool, optional): Whether to print verbose output. Defaults to False.
494
+ teeLogger (optional): Logger to use for tee print. Defaults to None.
495
+ encoding (str, optional): The encoding of the file. Defaults to None.
496
+ strict (bool, optional): Whether to enforce strict processing. Defaults to False.
497
+ delimiter (str, optional): The delimiter used in the file. Defaults to None.
498
+ defaults (list, optional): The default values to use for missing columns. Defaults to [].
499
+ storeOffset (bool, optional): Instead of storing the data in taskDic, store the offset of each line. Defaults to False.
500
+
501
+ Returns:
502
+ list: The last valid line as a list of strings, or an empty list if no valid line is found.
503
+ """
504
+ chunk_size = 1024 # Read in chunks of 1024 bytes
505
+ last_valid_line = []
506
+ if defaults is ...:
507
+ defaults = []
508
+ delimiter = get_delimiter(delimiter,file_name=fileName)
509
+ if verbose:
510
+ __teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
511
+ with openFileAsCompressed(fileName, 'rb',encoding=encoding, teeLogger=teeLogger) as file:
512
+ file.seek(0, os.SEEK_END)
513
+ file_size = file.tell()
514
+ buffer = b''
515
+ position = file_size
516
+ processedSize = 0
517
+
518
+ while position > 0:
519
+ # Read chunks from the end of the file
520
+ read_size = min(chunk_size, position)
521
+ position -= read_size
522
+ file.seek(position)
523
+ chunk = file.read(read_size)
524
+
525
+ # Prepend new chunk to buffer
526
+ buffer = chunk + buffer
527
+
528
+ # Split the buffer into lines
529
+ lines = buffer.split(b'\n')
530
+
531
+ # Process lines from the last to the first
532
+ for i in range(len(lines) - 1, -1, -1):
533
+ processedSize += len(lines[i]) + 1 # +1 for the newline character
534
+ if lines[i].strip(): # Skip empty lines
535
+ # Process the line
536
+ correctColumnNum, lineCache = _processLine(
537
+ line=lines[i].decode(encoding=encoding,errors='replace'),
538
+ taskDic=taskDic,
539
+ correctColumnNum=correctColumnNum,
540
+ strict=strict,
541
+ delimiter=delimiter,
542
+ defaults=defaults,
543
+ storeOffset=storeOffset,
544
+ offset=file_size - processedSize + 1
545
+ )
546
+ # If the line is valid, return it
547
+ if lineCache:
548
+ if storeOffset and any(lineCache):
549
+ return lineCache
550
+
551
+ # Keep the last (possibly incomplete) line in buffer for the next read
552
+ buffer = lines[0]
553
+
554
+ # Return empty list if no valid line found
555
+ if storeOffset:
556
+ return -1
557
+ return last_valid_line
558
+
559
+ @functools.lru_cache(maxsize=None)
560
+ def _get_sanitization_re(delimiter = DEFAULT_DELIMITER):
561
+ return re.compile(r"(</sep/>|</LF/>|<sep>|<LF>|\n|" + re.escape(delimiter) + r")")
562
+
563
+ _sanitize_replacements = {
564
+ "<sep>":"</sep/>",
565
+ "<LF>":"</LF/>",
566
+ "\n":"<LF>",
567
+ }
568
+ _inverse_sanitize_replacements = {v: k for k, v in _sanitize_replacements.items()}
569
+
570
+ def _sanitize(data,delimiter = DEFAULT_DELIMITER):
571
+ if not data:
572
+ return data
573
+ def repl(m):
574
+ tok = m.group(0)
575
+ if tok == delimiter:
576
+ return "<sep>"
577
+ if tok in ("</sep/>", "</LF/>"):
578
+ eprint(f"Warning: Found illegal token '{tok}' during sanitization. It will be replaced.")
579
+ return _sanitize_replacements.get(tok, tok)
580
+ pattern = _get_sanitization_re(delimiter)
581
+ if isinstance(data,str):
582
+ return pattern.sub(repl, data)
583
+ else:
584
+ return [pattern.sub(repl,str(segment)) if segment else '' for segment in data]
585
+
586
+ def _unsanitize(data,delimiter = DEFAULT_DELIMITER):
587
+ if not data:
588
+ return data
589
+ def repl(m):
590
+ tok = m.group(0)
591
+ if tok == "<sep>":
592
+ return delimiter
593
+ return _inverse_sanitize_replacements.get(tok, tok)
594
+ pattern = _get_sanitization_re(delimiter)
595
+ if isinstance(data,str):
596
+ return pattern.sub(repl, data.rstrip())
597
+ else:
598
+ return [pattern.sub(repl,str(segment).rstrip()) if segment else '' for segment in data]
477
599
 
478
600
  def _formatHeader(header,verbose = False,teeLogger = None,delimiter = DEFAULT_DELIMITER):
479
- """
480
- Format the header string.
481
-
482
- Parameters:
483
- - header (str or list): The header string or list to format.
484
- - verbose (bool, optional): Whether to print verbose output. Defaults to False.
485
- - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
486
-
487
- Returns:
488
- str: The formatted header string.
489
- """
490
- if not isinstance(header,str):
491
- try:
492
- header = delimiter.join(header)
493
- except:
494
- if verbose:
495
- __teePrintOrNot('Invalid header, setting header to empty.','error',teeLogger=teeLogger)
496
- header = ''
497
- header = delimiter.join([segment.rstrip() for segment in header.split(delimiter)])
498
- # if header:
499
- # if not header.endswith('\n'):
500
- # header += '\n'
501
- # else:
502
- # header = ''
503
- return header
601
+ """
602
+ Format the header string.
603
+
604
+ Parameters:
605
+ - header (str or list): The header string or list to format.
606
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
607
+ - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
608
+
609
+ Returns:
610
+ list: The formatted header list of string.
611
+ """
612
+ if isinstance(header,str):
613
+ header = header.split(delimiter)
614
+ else:
615
+ try:
616
+ header = [str(s) for s in header]
617
+ except Exception:
618
+ if verbose:
619
+ __teePrintOrNot('Invalid header, setting header to empty.','error',teeLogger=teeLogger)
620
+ header = []
621
+ return [s.rstrip() for s in header]
504
622
 
505
623
  def _lineContainHeader(header,line,verbose = False,teeLogger = None,strict = False,delimiter = DEFAULT_DELIMITER):
506
- """
507
- Verify if a line contains the header.
508
-
509
- Parameters:
510
- - header (str): The header string to verify.
511
- - line (str): The line to verify against the header.
512
- - verbose (bool, optional): Whether to print verbose output. Defaults to False.
513
- - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
514
- - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
515
-
516
- Returns:
517
- bool: True if the header matches the line, False otherwise.
518
- """
519
- header = [segment.rstrip() for segment in header.split(delimiter)]
520
- line = [segment.rstrip() for segment in line.split(delimiter)]
521
- if verbose:
522
- __teePrintOrNot(f"Header: \n{header}",teeLogger=teeLogger)
523
- __teePrintOrNot(f"First line: \n{line}",teeLogger=teeLogger)
524
- if len(header) != len(line) or any([header[i] not in line[i] for i in range(len(header))]):
525
- __teePrintOrNot(f"Header mismatch: \n{line} \n!= \n{header}",teeLogger=teeLogger)
526
- if strict:
527
- raise ValueError("Data format error! Header mismatch")
528
- return False
529
- return True
530
-
531
- def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,header = '',encoding = 'utf8',strict = True,delimiter = DEFAULT_DELIMITER):
532
- """
533
- Verify the existence of the tabular file.
534
-
535
- Parameters:
536
- - fileName (str): The path of the tabular file.
537
- - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to True.
538
- - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
539
- - header (str, optional): The header line to verify against. Defaults to ''.
540
- - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
541
- - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
542
-
543
- Returns:
544
- bool: True if the file exists, False otherwise.
545
- """
546
- remainingFileName, _ ,extenstionName = fileName.rpartition('.')
547
- if extenstionName in COMPRESSED_FILE_EXTENSIONS:
548
- remainingFileName, _ ,extenstionName = remainingFileName.rpartition('.')
549
- if delimiter and delimiter == '\t' and not extenstionName == 'tsv':
550
- __teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
551
- elif delimiter and delimiter == ',' and not extenstionName == 'csv':
552
- __teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
553
- elif delimiter and delimiter == '\0' and not extenstionName == 'nsv':
554
- __teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
555
- elif delimiter and delimiter == '|' and not extenstionName == 'psv':
556
- __teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
557
- if not os.path.isfile(fileName):
558
- if createIfNotExist:
559
- try:
560
- with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
561
- file.write(header.encode(encoding=encoding,errors='replace')+b'\n')
562
- __teePrintOrNot('Created '+fileName,teeLogger=teeLogger)
563
- return True
564
- except:
565
- __teePrintOrNot('Failed to create '+fileName,'error',teeLogger=teeLogger)
566
- if strict:
567
- raise FileNotFoundError("Failed to create file")
568
- return False
569
- elif strict:
570
- __teePrintOrNot('File not found','error',teeLogger=teeLogger)
571
- raise FileNotFoundError("File not found")
572
- else:
573
- return False
574
- return True
575
-
576
- def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t',defaults = ...):
577
- """
578
- Compatibility method, calls readTabularFile.
579
- Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
580
-
581
- Parameters:
582
- - fileName (str): The path to the Tabular file.
583
- - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
584
- - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
585
- - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
586
- - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
587
- - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
588
- - verbose (bool, optional): Whether to print verbose output. Defaults to False.
589
- - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
590
- - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
591
- - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
592
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t'.
593
- - defaults (list, optional): The default values to use for missing columns. Defaults to [].
594
-
595
- Returns:
596
- - OrderedDict: The dictionary containing the data from the Tabular file.
597
-
598
- Raises:
599
- - Exception: If the file is not found or there is a data format error.
600
-
601
- """
602
- return readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
603
-
604
- def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...,defaults = ...):
605
- """
606
- Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
607
-
608
- Parameters:
609
- - fileName (str): The path to the Tabular file.
610
- - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
611
- - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
612
- - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
613
- - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
614
- - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
615
- - verbose (bool, optional): Whether to print verbose output. Defaults to False.
616
- - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
617
- - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
618
- - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
619
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
620
- - defaults (list, optional): The default values to use for missing columns. Defaults to [].
621
-
622
- Returns:
623
- - OrderedDict: The dictionary containing the data from the Tabular file.
624
-
625
- Raises:
626
- - Exception: If the file is not found or there is a data format error.
627
-
628
- """
629
- if taskDic is None:
630
- taskDic = {}
631
- if defaults is ...:
632
- defaults = []
633
- delimiter = get_delimiter(delimiter,file_name=fileName)
634
- header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
635
- if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
636
- return taskDic
637
- with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
638
- correctColumnNum = -1
639
- if header.rstrip() and verifyHeader:
640
- line = file.readline().decode(encoding=encoding,errors='replace')
641
- if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
642
- correctColumnNum = len(header.split(delimiter))
643
- if verbose:
644
- __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
645
- if lastLineOnly:
646
- lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict, delimiter=delimiter, defaults=defaults)
647
- if lineCache:
648
- taskDic[lineCache[0]] = lineCache
649
- return lineCache
650
- for line in file:
651
- correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding,errors='replace'),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter,defaults = defaults)
652
- return taskDic
624
+ """
625
+ Verify if a line contains the header.
626
+
627
+ Parameters:
628
+ - header (str): The header string to verify.
629
+ - line (str): The line to verify against the header.
630
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
631
+ - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
632
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
633
+
634
+ Returns:
635
+ bool: True if the header matches the line, False otherwise.
636
+ """
637
+ line = _formatHeader(line,verbose=verbose,teeLogger=teeLogger,delimiter=delimiter)
638
+ if verbose:
639
+ __teePrintOrNot(f"Header: \n{header}",teeLogger=teeLogger)
640
+ __teePrintOrNot(f"First line: \n{line}",teeLogger=teeLogger)
641
+ if len(header) != len(line) or any([header[i] not in line[i] for i in range(len(header))]):
642
+ __teePrintOrNot(f"Header mismatch: \n{line} \n!= \n{header}",teeLogger=teeLogger)
643
+ if strict:
644
+ raise ValueError("Data format error! Header mismatch")
645
+ return False
646
+ return True
647
+
648
+ def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,header = [],encoding = 'utf8',strict = True,delimiter = DEFAULT_DELIMITER):
649
+ """
650
+ Verify the existence of the tabular file.
651
+
652
+ Parameters:
653
+ - fileName (str): The path of the tabular file.
654
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to True.
655
+ - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
656
+ - header (list, optional): The header line to verify against. Defaults to [].
657
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
658
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
659
+
660
+ Returns:
661
+ bool: True if the file exists, False otherwise.
662
+ """
663
+ remainingFileName, _ ,extenstionName = fileName.rpartition('.')
664
+ if extenstionName in COMPRESSED_FILE_EXTENSIONS:
665
+ remainingFileName, _ ,extenstionName = remainingFileName.rpartition('.')
666
+ if delimiter and delimiter == '\t' and not extenstionName == 'tsv':
667
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
668
+ elif delimiter and delimiter == ',' and not extenstionName == 'csv':
669
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
670
+ elif delimiter and delimiter == '\0' and not extenstionName == 'nsv':
671
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
672
+ elif delimiter and delimiter == '|' and not extenstionName == 'psv':
673
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
674
+ if not os.path.isfile(fileName):
675
+ if createIfNotExist:
676
+ try:
677
+ with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
678
+ header = delimiter.join(_sanitize(_formatHeader(header,
679
+ verbose=verbose,
680
+ teeLogger=teeLogger,
681
+ delimiter=delimiter,
682
+ ),delimiter=delimiter))
683
+ file.write(header.encode(encoding=encoding,errors='replace')+b'\n')
684
+ __teePrintOrNot('Created '+fileName,teeLogger=teeLogger)
685
+ return True
686
+ except Exception:
687
+ __teePrintOrNot('Failed to create '+fileName,'error',teeLogger=teeLogger)
688
+ if strict:
689
+ raise FileNotFoundError("Failed to create file")
690
+ return False
691
+ elif strict:
692
+ __teePrintOrNot('File not found','error',teeLogger=teeLogger)
693
+ raise FileNotFoundError("File not found")
694
+ else:
695
+ return False
696
+ return True
697
+
698
+ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,
699
+ verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t',defaults = ...,
700
+ correctColumnNum = -1):
701
+ """
702
+ Compatibility method, calls readTabularFile.
703
+ Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
704
+
705
+ Parameters:
706
+ - fileName (str): The path to the Tabular file.
707
+ - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
708
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
709
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
710
+ - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
711
+ - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
712
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
713
+ - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
714
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
715
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
716
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t'.
717
+ - defaults (list, optional): The default values to use for missing columns. Defaults to [].
718
+ - correctColumnNum (int, optional): The expected number of columns in the file. If -1, it will be determined from the first valid line. Defaults to -1.
719
+
720
+ Returns:
721
+ - OrderedDict: The dictionary containing the data from the Tabular file.
722
+
723
+ Raises:
724
+ - Exception: If the file is not found or there is a data format error.
725
+
726
+ """
727
+ return readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,
728
+ lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,
729
+ encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults,
730
+ correctColumnNum = correctColumnNum)
731
+
732
+ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,
733
+ verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...,defaults = ...,
734
+ correctColumnNum = -1,storeOffset = False):
735
+ """
736
+ Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
737
+
738
+ Parameters:
739
+ - fileName (str): The path to the Tabular file.
740
+ - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
741
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
742
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
743
+ - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
744
+ - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
745
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
746
+ - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
747
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
748
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
749
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
750
+ - defaults (list, optional): The default values to use for missing columns. Defaults to [].
751
+ - correctColumnNum (int, optional): The expected number of columns in the file. If -1, it will be determined from the first valid line. Defaults to -1.
752
+ - storeOffset (bool, optional): Instead of storing the data in taskDic, store the offset of each line. Defaults to False.
753
+
754
+ Returns:
755
+ - OrderedDict: The dictionary containing the data from the Tabular file.
756
+
757
+ Raises:
758
+ - Exception: If the file is not found or there is a data format error.
759
+
760
+ """
761
+ if taskDic is None:
762
+ taskDic = {}
763
+ if defaults is ...:
764
+ defaults = []
765
+ delimiter = get_delimiter(delimiter,file_name=fileName)
766
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
767
+ if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
768
+ return taskDic
769
+ with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
770
+ if any(header) and verifyHeader:
771
+ line = file.readline().decode(encoding=encoding,errors='replace')
772
+ if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict) and correctColumnNum == -1:
773
+ correctColumnNum = len(header)
774
+ if verbose:
775
+ __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
776
+ if lastLineOnly:
777
+ lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict, delimiter=delimiter, defaults=defaults,storeOffset=storeOffset)
778
+ # if lineCache:
779
+ # taskDic[lineCache[0]] = lineCache
780
+ return lineCache
781
+ for line in file:
782
+ correctColumnNum, _ = _processLine(line.decode(encoding=encoding,errors='replace'),taskDic,correctColumnNum,strict = strict,delimiter=delimiter,defaults = defaults,storeOffset=storeOffset,offset=file.tell()-len(line))
783
+ return taskDic
653
784
 
654
785
  def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = '\t'):
655
- """
656
- Compatibility method, calls appendTabularFile.
657
- Append a line of data to a Tabular file.
658
- Parameters:
659
- - fileName (str): The path of the Tabular file.
660
- - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
661
- - teeLogger (optional): A logger object for logging messages.
662
- - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
663
- - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
664
- - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
665
- - verbose (bool, optional): If True, additional information will be printed during the execution.
666
- - encoding (str, optional): The encoding of the file.
667
- - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
668
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
669
- Raises:
670
- - Exception: If the file does not exist and createIfNotExist is False.
671
- - Exception: If the existing header does not match the provided header.
672
- """
673
- return appendTabularFile(fileName,lineToAppend,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding, strict = strict, delimiter = delimiter)
786
+ """
787
+ Compatibility method, calls appendTabularFile.
788
+ Append a line of data to a Tabular file.
789
+ Parameters:
790
+ - fileName (str): The path of the Tabular file.
791
+ - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
792
+ - teeLogger (optional): A logger object for logging messages.
793
+ - header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
794
+ - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
795
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
796
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
797
+ - encoding (str, optional): The encoding of the file.
798
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
799
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
800
+ Raises:
801
+ - Exception: If the file does not exist and createIfNotExist is False.
802
+ - Exception: If the existing header does not match the provided header.
803
+ """
804
+ return appendTabularFile(fileName,lineToAppend,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding, strict = strict, delimiter = delimiter)
674
805
 
675
806
  def appendTabularFile(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = ...):
676
- """
677
- Append a line of data to a Tabular file.
678
- Parameters:
679
- - fileName (str): The path of the Tabular file.
680
- - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
681
- - teeLogger (optional): A logger object for logging messages.
682
- - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
683
- - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
684
- - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
685
- - verbose (bool, optional): If True, additional information will be printed during the execution.
686
- - encoding (str, optional): The encoding of the file.
687
- - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
688
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
689
- Raises:
690
- - Exception: If the file does not exist and createIfNotExist is False.
691
- - Exception: If the existing header does not match the provided header.
692
- """
693
- return appendLinesTabularFile(fileName,[lineToAppend],teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding, strict = strict, delimiter = delimiter)
807
+ """
808
+ Append a line of data to a Tabular file.
809
+ Parameters:
810
+ - fileName (str): The path of the Tabular file.
811
+ - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
812
+ - teeLogger (optional): A logger object for logging messages.
813
+ - header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
814
+ - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
815
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
816
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
817
+ - encoding (str, optional): The encoding of the file.
818
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
819
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
820
+ Raises:
821
+ - Exception: If the file does not exist and createIfNotExist is False.
822
+ - Exception: If the existing header does not match the provided header.
823
+ """
824
+ return appendLinesTabularFile(fileName,[lineToAppend],teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding, strict = strict, delimiter = delimiter)
694
825
 
695
826
  def appendLinesTabularFile(fileName,linesToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = ...):
696
- """
697
- Append lines of data to a Tabular file.
698
- Parameters:
699
- - fileName (str): The path of the Tabular file.
700
- - linesToAppend (list): The lines of data to append. If it is a list of string, then each string will be split by delimiter to form a list.
701
- - teeLogger (optional): A logger object for logging messages.
702
- - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
703
- - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
704
- - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
705
- - verbose (bool, optional): If True, additional information will be printed during the execution.
706
- - encoding (str, optional): The encoding of the file.
707
- - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
708
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
709
- Raises:
710
- - Exception: If the file does not exist and createIfNotExist is False.
711
- - Exception: If the existing header does not match the provided header.
712
- """
713
- delimiter = get_delimiter(delimiter,file_name=fileName)
714
- header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
715
- if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
716
- return
717
- formatedLines = []
718
- for line in linesToAppend:
719
- if isinstance(linesToAppend,dict):
720
- key = line
721
- line = linesToAppend[key]
722
- if isinstance(line,str):
723
- line = line.split(delimiter)
724
- elif line:
725
- for i in range(len(line)):
726
- if not isinstance(line[i],str):
727
- try:
728
- line[i] = str(line[i])
729
- except Exception as e:
730
- line[i] = str(e)
731
- if isinstance(linesToAppend,dict):
732
- if (not line or line[0] != key):
733
- line = [key]+line
734
- formatedLines.append(line)
735
- if not formatedLines:
736
- if verbose:
737
- __teePrintOrNot(f"No lines to append to {fileName}",teeLogger=teeLogger)
738
- return
739
- correctColumnNum = max([len(line) for line in formatedLines])
740
-
741
- if header.rstrip() and verifyHeader:
742
- with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
743
- line = file.readline().decode(encoding=encoding,errors='replace')
744
- if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
745
- correctColumnNum = len(header.split(delimiter))
746
- if verbose:
747
- __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
748
- # truncate / fill the lines to the correct number of columns
749
- for i in range(len(formatedLines)):
750
- if len(formatedLines[i]) < correctColumnNum:
751
- formatedLines[i] += ['']*(correctColumnNum-len(formatedLines[i]))
752
- elif len(formatedLines[i]) > correctColumnNum:
753
- formatedLines[i] = formatedLines[i][:correctColumnNum]
754
- with openFileAsCompressed(fileName, mode ='ab',encoding=encoding,teeLogger=teeLogger)as file:
755
- # check if the file ends in a newline
756
- # file.seek(-1, os.SEEK_END)
757
- # if file.read(1) != b'\n':
758
- # file.write(b'\n')
759
- file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding,errors='replace') for line in formatedLines]) + b'\n')
760
- if verbose:
761
- __teePrintOrNot(f"Appended {len(formatedLines)} lines to {fileName}",teeLogger=teeLogger)
827
+ """
828
+ Append lines of data to a Tabular file.
829
+ Parameters:
830
+ - fileName (str): The path of the Tabular file.
831
+ - linesToAppend (list): The lines of data to append. If it is a list of strings, then each string will be split by delimiter to form a list.
832
+ - teeLogger (optional): A logger object for logging messages.
833
+ - header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
834
+ - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
835
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
836
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
837
+ - encoding (str, optional): The encoding of the file.
838
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
839
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
840
+ Raises:
841
+ - Exception: If the file does not exist and createIfNotExist is False.
842
+ - Exception: If the existing header does not match the provided header.
843
+ """
844
+ delimiter = get_delimiter(delimiter,file_name=fileName)
845
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
846
+ if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
847
+ return
848
+ formatedLines = []
849
+ for line in linesToAppend:
850
+ if isinstance(linesToAppend,dict):
851
+ key = line
852
+ line = linesToAppend[key]
853
+ if isinstance(line,str):
854
+ line = line.split(delimiter)
855
+ elif line:
856
+ for i in range(len(line)):
857
+ if not isinstance(line[i],str):
858
+ try:
859
+ line[i] = str(line[i]).rstrip()
860
+ except Exception as e:
861
+ line[i] = str(e)
862
+ if isinstance(linesToAppend,dict):
863
+ if (not line or line[0] != key):
864
+ line = [key]+line
865
+ formatedLines.append(_sanitize(line,delimiter=delimiter))
866
+ if not formatedLines:
867
+ if verbose:
868
+ __teePrintOrNot(f"No lines to append to {fileName}",teeLogger=teeLogger)
869
+ return
870
+ correctColumnNum = max([len(line) for line in formatedLines])
871
+ if any(header) and verifyHeader:
872
+ with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
873
+ line = file.readline().decode(encoding=encoding,errors='replace')
874
+ if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
875
+ correctColumnNum = len(header)
876
+ if verbose:
877
+ __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
878
+ # truncate / fill the lines to the correct number of columns
879
+ for i in range(len(formatedLines)):
880
+ if len(formatedLines[i]) < correctColumnNum:
881
+ formatedLines[i] += ['']*(correctColumnNum-len(formatedLines[i]))
882
+ elif len(formatedLines[i]) > correctColumnNum:
883
+ formatedLines[i] = formatedLines[i][:correctColumnNum]
884
+ with openFileAsCompressed(fileName, mode ='ab',encoding=encoding,teeLogger=teeLogger)as file:
885
+ # check if the file ends in a newline
886
+ # file.seek(-1, os.SEEK_END)
887
+ # if file.read(1) != b'\n':
888
+ # file.write(b'\n')
889
+ file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding,errors='replace') for line in formatedLines]) + b'\n')
890
+ if verbose:
891
+ __teePrintOrNot(f"Appended {len(formatedLines)} lines to {fileName}",teeLogger=teeLogger)
762
892
 
763
893
  def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = '\t'):
764
- """
765
- Compatibility method, calls clearTabularFile.
766
- Clear the contents of a Tabular file. Will create if not exist.
767
- Parameters:
768
- - fileName (str): The path of the Tabular file.
769
- - teeLogger (optional): A logger object for logging messages.
770
- - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
771
- - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
772
- - verbose (bool, optional): If True, additional information will be printed during the execution.
773
- - encoding (str, optional): The encoding of the file.
774
- - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
775
- """
776
- return clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
894
+ """
895
+ Compatibility method, calls clearTabularFile.
896
+ Clear the contents of a Tabular file. Will create if not exist.
897
+ Parameters:
898
+ - fileName (str): The path of the Tabular file.
899
+ - teeLogger (optional): A logger object for logging messages.
900
+ - header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
901
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
902
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
903
+ - encoding (str, optional): The encoding of the file.
904
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
905
+ """
906
+ return clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
777
907
 
778
908
  def clearTabularFile(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = ...):
779
- """
780
- Clear the contents of a Tabular file. Will create if not exist.
781
- Parameters:
782
- - fileName (str): The path of the Tabular file.
783
- - teeLogger (optional): A logger object for logging messages.
784
- - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
785
- - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
786
- - verbose (bool, optional): If True, additional information will be printed during the execution.
787
- - encoding (str, optional): The encoding of the file.
788
- - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
789
- """
790
- delimiter = get_delimiter(delimiter,file_name=fileName)
791
- header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
792
- if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
793
- raise FileNotFoundError("Something catastrophic happened! File still not found after creation")
794
- else:
795
- with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
796
- if header.rstrip() and verifyHeader:
797
- line = file.readline().decode(encoding=encoding,errors='replace')
798
- if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
799
- __teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
800
- header = line
801
- with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
802
- if header:
803
- if not header.endswith('\n'):
804
- header += '\n'
805
- file.write(header.encode(encoding=encoding,errors='replace'))
806
- if verbose:
807
- __teePrintOrNot(f"Cleared {fileName}",teeLogger=teeLogger)
909
+ """
910
+ Clear the contents of a Tabular file. Will create if not exist.
911
+ Parameters:
912
+ - fileName (str): The path of the Tabular file.
913
+ - teeLogger (optional): A logger object for logging messages.
914
+ - header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
915
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
916
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
917
+ - encoding (str, optional): The encoding of the file.
918
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
919
+ """
920
+ delimiter = get_delimiter(delimiter,file_name=fileName)
921
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
922
+ if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
923
+ raise FileNotFoundError("Something catastrophic happened! File still not found after creation")
924
+ else:
925
+ with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
926
+ if any(header) and verifyHeader:
927
+ line = file.readline().decode(encoding=encoding,errors='replace')
928
+ if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
929
+ __teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
930
+ header = _formatHeader(line,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
931
+ with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
932
+ if header:
933
+ header = delimiter.join(_sanitize(header,delimiter=delimiter))
934
+ file.write(header.encode(encoding=encoding,errors='replace')+b'\n')
935
+ if verbose:
936
+ __teePrintOrNot(f"Cleared {fileName}",teeLogger=teeLogger)
808
937
 
809
938
  def getFileUpdateTimeNs(fileName):
810
- # return 0 if the file does not exist
811
- if not os.path.isfile(fileName):
812
- return 0
813
- try:
814
- return os.stat(fileName).st_mtime_ns
815
- except:
816
- __teePrintOrNot(f"Failed to get file update time for {fileName}",'error')
817
- return get_time_ns()
939
+ # return 0 if the file does not exist
940
+ if not os.path.isfile(fileName):
941
+ return 0
942
+ try:
943
+ return os.stat(fileName).st_mtime_ns
944
+ except Exception:
945
+ __teePrintOrNot(f"Failed to get file update time for {fileName}",'error')
946
+ return get_time_ns()
818
947
 
819
948
  def get_time_ns():
820
- try:
821
- return time.time_ns()
822
- except:
823
- # try to get the time in nanoseconds
824
- return int(time.time()*1e9)
949
+ try:
950
+ return time.time_ns()
951
+ except Exception:
952
+ # try to get the time in nanoseconds
953
+ return int(time.time()*1e9)
825
954
 
826
955
  def scrubTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = '\t',defaults = ...):
827
- """
828
- Compatibility method, calls scrubTabularFile.
829
- Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.
830
- Return the data as a dictionary.
831
-
832
- Parameters:
833
- - fileName (str): The path to the Tabular file.
834
- - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
835
- - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
836
- - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
837
- - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
838
- - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
839
- - verbose (bool, optional): Whether to print verbose output. Defaults to False.
840
- - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
841
- - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
842
- - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
843
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
844
- - defaults (list, optional): The default values to use for missing columns. Defaults to [].
845
-
846
- Returns:
847
- - OrderedDict: The dictionary containing the data from the Tabular file.
848
-
849
- Raises:
850
- - Exception: If the file is not found or there is a data format error.
851
-
852
- """
853
- return scrubTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
854
-
855
- def scrubTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = ...,defaults = ...):
856
- """
857
- Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.
858
- If using compressed files. This will recompress the file in whole and possibily increase the compression ratio reducing the file size.
859
- Return the data as a dictionary.
860
-
861
- Parameters:
862
- - fileName (str): The path to the Tabular file.
863
- - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
864
- - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
865
- - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
866
- - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
867
- - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
868
- - verbose (bool, optional): Whether to print verbose output. Defaults to False.
869
- - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
870
- - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
871
- - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
872
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
873
- - defaults (list, optional): The default values to use for missing columns. Defaults to [].
874
-
875
- Returns:
876
- - OrderedDict: The dictionary containing the data from the Tabular file.
877
-
878
- Raises:
879
- - Exception: If the file is not found or there is a data format error.
880
-
881
- """
882
- file = readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
883
- if file:
884
- clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
885
- appendLinesTabularFile(fileName,file,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
886
- return file
956
+ """
957
+ Compatibility method, calls scrubTabularFile.
958
+ Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.
959
+ Return the data as a dictionary.
960
+
961
+ Parameters:
962
+ - fileName (str): The path to the Tabular file.
963
+ - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
964
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
965
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
966
+ - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
967
+ - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
968
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
969
+ - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
970
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
971
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
972
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
973
+ - defaults (list, optional): The default values to use for missing columns. Defaults to [].
974
+
975
+ Returns:
976
+ - OrderedDict: The dictionary containing the data from the Tabular file.
977
+
978
+ Raises:
979
+ - Exception: If the file is not found or there is a data format error.
980
+
981
+ """
982
+ return scrubTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
983
+
984
+ def scrubTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,
985
+ verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = ...,defaults = ...,correctColumnNum = -1):
986
+ """
987
+ Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.
988
+ If using compressed files, this will recompress the whole file and possibly increase the compression ratio, reducing the file size.
989
+ Return the data as a dictionary.
990
+
991
+ Parameters:
992
+ - fileName (str): The path to the Tabular file.
993
+ - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
994
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
995
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
996
+ - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
997
+ - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
998
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
999
+ - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
1000
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
1001
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
1002
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
1003
+ - defaults (list, optional): The default values to use for missing columns. Defaults to [].
1004
+ - correctColumnNum (int, optional): The expected number of columns in the file. If -1, it will be determined from the first valid line. Defaults to -1.
1005
+
1006
+ Returns:
1007
+ - OrderedDict: The dictionary containing the data from the Tabular file.
1008
+
1009
+ Raises:
1010
+ - Exception: If the file is not found or there is a data format error.
1011
+
1012
+ """
1013
+ file = readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,
1014
+ lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,
1015
+ encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults,correctColumnNum = correctColumnNum)
1016
+ if file:
1017
+ clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
1018
+ appendLinesTabularFile(fileName,file,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
1019
+ return file
887
1020
 
888
1021
  def getListView(tsvzDic,header = [],delimiter = DEFAULT_DELIMITER):
889
- if header:
890
- if isinstance(header,str):
891
- header = header.split(delimiter)
892
- elif not isinstance(header,list):
893
- try:
894
- header = list(header)
895
- except:
896
- header = []
897
- if not tsvzDic:
898
- if not header:
899
- return []
900
- else:
901
- return [header]
902
- if not header:
903
- return list(tsvzDic.values())
904
- else:
905
- values = list(tsvzDic.values())
906
- if values[0] and values[0] == header:
907
- return values
908
- else:
909
- return [header] + values
1022
+ if header:
1023
+ if isinstance(header,str):
1024
+ header = header.split(delimiter)
1025
+ elif not isinstance(header,list):
1026
+ try:
1027
+ header = list(header)
1028
+ except Exception:
1029
+ header = []
1030
+ if not tsvzDic:
1031
+ if not header:
1032
+ return []
1033
+ else:
1034
+ return [header]
1035
+ if not header:
1036
+ return list(tsvzDic.values())
1037
+ else:
1038
+ values = list(tsvzDic.values())
1039
+ if values[0] and values[0] == header:
1040
+ return values
1041
+ else:
1042
+ return [header] + values
910
1043
 
911
1044
  # create a tsv class that functions like a ordered dictionary but will update the file when modified
912
1045
  class TSVZed(OrderedDict):
913
- def __teePrintOrNot(self,message,level = 'info'):
914
- try:
915
- if self.teeLogger:
916
- self.teeLogger.teelog(message,level)
917
- else:
918
- print(message,flush=True)
919
- except Exception:
920
- print(message,flush=True)
921
-
922
- def getResourseUsage(self,return_dict = False):
923
- return get_resource_usage(return_dict = return_dict)
924
-
925
- def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,verbose = False,encoding = 'utf8',delimiter = ...,defualts = None,strict = False):
926
- super().__init__()
927
- self.version = version
928
- self.strict = strict
929
- self.externalFileUpdateTime = getFileUpdateTimeNs(fileName)
930
- self.lastUpdateTime = self.externalFileUpdateTime
931
- self._fileName = fileName
932
- self.teeLogger = teeLogger
933
- self.delimiter = get_delimiter(delimiter,file_name=fileName)
934
- self.defaults = defualts if defualts else []
935
- self.header = _formatHeader(header,verbose = verbose,teeLogger = self.teeLogger,delimiter=self.delimiter)
936
- self.correctColumnNum = -1
937
- self.createIfNotExist = createIfNotExist
938
- self.verifyHeader = verifyHeader
939
- self.rewrite_on_load = rewrite_on_load
940
- self.rewrite_on_exit = rewrite_on_exit
941
- self.rewrite_interval = rewrite_interval
942
- self.monitor_external_changes = monitor_external_changes
943
- if not monitor_external_changes:
944
- self.__teePrintOrNot(f"Warning: External changes monitoring disabled for {self._fileName}. Will overwrite external changes.",'warning')
945
- self.verbose = verbose
946
- if append_check_delay < 0:
947
- append_check_delay = 0.00001
948
- self.__teePrintOrNot('append_check_delay cannot be less than 0, setting it to 0.00001','error')
949
- self.append_check_delay = append_check_delay
950
- self.appendQueue = deque()
951
- self.dirty = False
952
- self.deSynced = False
953
- self.memoryOnly = False
954
- self.encoding = encoding
955
- self.writeLock = threading.Lock()
956
- self.shutdownEvent = threading.Event()
957
- #self.appendEvent = threading.Event()
958
- self.appendThread = threading.Thread(target=self._appendWorker,daemon=True)
959
- self.appendThread.start()
960
- self.load()
961
- atexit.register(self.stopAppendThread)
962
-
963
- def setDefaults(self,defaults):
964
- if not defaults:
965
- defaults = []
966
- return
967
- if isinstance(defaults,str):
968
- defaults = defaults.split(self.delimiter)
969
- elif not isinstance(defaults,list):
970
- try:
971
- defaults = list(defaults)
972
- except:
973
- if self.verbose:
974
- self.__teePrintOrNot('Invalid defaults, setting defaults to empty.','error')
975
- defaults = []
976
- return
977
- if not any(defaults):
978
- defaults = []
979
- return
980
- if defaults[0] != DEFAULTS_INDICATOR_KEY:
981
- defaults = [DEFAULTS_INDICATOR_KEY]+defaults
982
- self.defaults = defaults
983
-
984
- def load(self):
985
- self.reload()
986
- if self.rewrite_on_load:
987
- self.rewrite(force = True,reloadInternalFromFile = False)
988
- return self
989
-
990
- def reload(self):
991
- # Load or refresh data from the TSV file
992
- mo = self.memoryOnly
993
- self.memoryOnly = True
994
- if self.verbose:
995
- self.__teePrintOrNot(f"Loading {self._fileName}")
996
- super().clear()
997
- readTabularFile(self._fileName, teeLogger = self.teeLogger, header = self.header, createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader, verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None, strict = self.strict, delimiter = self.delimiter, defaults=self.defaults)
998
- if self.verbose:
999
- self.__teePrintOrNot(f"Loaded {len(self)} records from {self._fileName}")
1000
- if self.header and self.verifyHeader:
1001
- self.correctColumnNum = len(self.header.split(self.delimiter))
1002
- elif self:
1003
- self.correctColumnNum = len(self[next(iter(self))])
1004
- else:
1005
- self.correctColumnNum = -1
1006
- if self.verbose:
1007
- self.__teePrintOrNot(f"correctColumnNum: {self.correctColumnNum}")
1008
- #super().update(loadedData)
1009
- if self.verbose:
1010
- self.__teePrintOrNot(f"TSVZed({self._fileName}) loaded")
1011
- self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1012
- self.lastUpdateTime = self.externalFileUpdateTime
1013
- self.memoryOnly = mo
1014
- return self
1015
-
1016
- def __setitem__(self,key,value):
1017
- key = str(key).rstrip()
1018
- if not key:
1019
- self.__teePrintOrNot('Key cannot be empty','error')
1020
- return
1021
- if isinstance(value,str):
1022
- value = value.split(self.delimiter)
1023
- # sanitize the value
1024
- value = [(str(segment).rstrip() if not isinstance(segment,str) else segment.rstrip()) if segment else '' for segment in value]
1025
- # escape the delimiter and newline characters
1026
- value = [segment.replace(self.delimiter,'<sep>').replace('\n','\\n') for segment in value]
1027
- # the first field in value should be the key
1028
- # add it if it is not there
1029
- if not value or value[0] != key:
1030
- value = [key]+value
1031
- # verify the value has the correct number of columns
1032
- if self.correctColumnNum != 1 and len(value) == 1:
1033
- # this means we want to clear / delete the key
1034
- self.__delitem__(key)
1035
- elif self.correctColumnNum > 0:
1036
- if len(value) != self.correctColumnNum:
1037
- if self.strict:
1038
- self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}. Refuse adding key...",'error')
1039
- return
1040
- elif self.verbose:
1041
- self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}, correcting...",'warning')
1042
- if len(value) < self.correctColumnNum:
1043
- value += ['']*(self.correctColumnNum-len(value))
1044
- elif len(value) > self.correctColumnNum:
1045
- value = value[:self.correctColumnNum]
1046
- else:
1047
- self.correctColumnNum = len(value)
1048
- if self.defaults and len(self.defaults) > 1:
1049
- for i in range(1,len(value)):
1050
- if not value[i] and i < len(self.defaults) and self.defaults[i]:
1051
- value[i] = self.defaults[i]
1052
- if self.verbose:
1053
- self.__teePrintOrNot(f" Replacing empty value at {i} with default: {self.defaults[i]}")
1054
- if key == DEFAULTS_INDICATOR_KEY:
1055
- self.defaults = value
1056
- if self.verbose:
1057
- self.__teePrintOrNot(f"Defaults set to {value}")
1058
- if not self.memoryOnly:
1059
- self.appendQueue.append(self.delimiter.join(value))
1060
- self.lastUpdateTime = get_time_ns()
1061
- if self.verbose:
1062
- self.__teePrintOrNot(f"Appending Defaults {key} to the appendQueue")
1063
- return
1064
- if self.verbose:
1065
- self.__teePrintOrNot(f"Setting {key} to {value}")
1066
- if key in self:
1067
- if self[key] == value:
1068
- if self.verbose:
1069
- self.__teePrintOrNot(f"Key {key} already exists with the same value")
1070
- return
1071
- self.dirty = True
1072
- # update the dictionary,
1073
- super().__setitem__(key,value)
1074
- if self.memoryOnly:
1075
- if self.verbose:
1076
- self.__teePrintOrNot(f"Key {key} updated in memory only")
1077
- return
1078
- elif key.startswith('#'):
1079
- if self.verbose:
1080
- self.__teePrintOrNot(f"Key {key} updated in memory only as it starts with #")
1081
- return
1082
- if self.verbose:
1083
- self.__teePrintOrNot(f"Appending {key} to the appendQueue")
1084
- self.appendQueue.append(self.delimiter.join(value))
1085
- self.lastUpdateTime = get_time_ns()
1086
- # if not self.appendThread.is_alive():
1087
- # self.commitAppendToFile()
1088
- # else:
1089
- # self.appendEvent.set()
1090
-
1091
-
1092
- def __delitem__(self,key):
1093
- key = str(key).rstrip()
1094
- if key == DEFAULTS_INDICATOR_KEY:
1095
- self.defaults = []
1096
- if self.verbose:
1097
- self.__teePrintOrNot(f"Defaults cleared")
1098
- if not self.memoryOnly:
1099
- self.__appendEmptyLine(key)
1100
- if self.verbose:
1101
- self.__teePrintOrNot(f"Appending empty default line {key}")
1102
- return
1103
- # delete the key from the dictionary and update the file
1104
- if key not in self:
1105
- if self.verbose:
1106
- self.__teePrintOrNot(f"Key {key} not found")
1107
- return
1108
- super().__delitem__(key)
1109
- if self.memoryOnly or key.startswith('#'):
1110
- if self.verbose:
1111
- self.__teePrintOrNot(f"Key {key} deleted in memory")
1112
- return
1113
- self.__appendEmptyLine(key)
1114
- if self.verbose:
1115
- self.__teePrintOrNot(f"Appending empty line {key}")
1116
- self.lastUpdateTime = get_time_ns()
1117
-
1118
- def __appendEmptyLine(self,key):
1119
- self.dirty = True
1120
- if self.correctColumnNum > 0:
1121
- emptyLine = key+self.delimiter*(self.correctColumnNum-1)
1122
- elif len(self[key]) > 1:
1123
- self.correctColumnNum = len(self[key])
1124
- emptyLine = key+self.delimiter*(self.correctColumnNum-1)
1125
- else:
1126
- emptyLine = key
1127
- if self.verbose:
1128
- self.__teePrintOrNot(f"Appending {emptyLine} to the appendQueue")
1129
- self.appendQueue.append(emptyLine)
1130
- return self
1131
-
1132
- def getListView(self):
1133
- return getListView(self,header=self.header,delimiter=self.delimiter)
1134
-
1135
- def clear(self):
1136
- # clear the dictionary and update the file
1137
- super().clear()
1138
- if self.verbose:
1139
- self.__teePrintOrNot(f"Clearing {self._fileName}")
1140
- if self.memoryOnly:
1141
- return self
1142
- self.clear_file()
1143
- self.lastUpdateTime = self.externalFileUpdateTime
1144
- return self
1145
-
1146
- def clear_file(self):
1147
- try:
1148
- if self.header:
1149
- file = self.get_file_obj('wb')
1150
- file.write(self.header.encode(self.encoding,errors='replace') + b'\n')
1151
- self.release_file_obj(file)
1152
- if self.verbose:
1153
- self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
1154
- self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1155
- else:
1156
- file = self.get_file_obj('wb')
1157
- self.release_file_obj(file)
1158
- if self.verbose:
1159
- self.__teePrintOrNot(f"File {self._fileName} cleared empty")
1160
- self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1161
- self.dirty = False
1162
- self.deSynced = False
1163
- except Exception as e:
1164
- self.release_file_obj(file)
1165
- self.__teePrintOrNot(f"Failed to write at clear_file() to {self._fileName}: {e}",'error')
1166
- import traceback
1167
- self.__teePrintOrNot(traceback.format_exc(),'error')
1168
- self.deSynced = True
1169
- return self
1170
-
1171
- def __enter__(self):
1172
- return self
1173
-
1174
- def close(self):
1175
- self.stopAppendThread()
1176
- return self
1177
-
1178
- def __exit__(self,exc_type,exc_value,traceback):
1179
- return self.close()
1180
-
1181
- def __repr__(self):
1182
- return f"""TSVZed(
1046
+ """
1047
+ A thread-safe, file-backed ordered dictionary for managing TSV (Tab-Separated Values) files.
1048
+ TSVZed extends OrderedDict to provide automatic synchronization between an in-memory
1049
+ dictionary and a TSV file on disk. It supports concurrent file access, automatic
1050
+ persistence, and configurable sync strategies.
1051
+ Parameters
1052
+ ----------
1053
+ fileName : str
1054
+ Path to the TSV file to be managed.
1055
+ teeLogger : object, optional
1056
+ Logger object with a teelog method for logging messages. If None, uses print.
1057
+ header : str, optional
1058
+ Column header line for the TSV file. Used for validation and file creation.
1059
+ createIfNotExist : bool, default=True
1060
+ If True, creates the file if it doesn't exist.
1061
+ verifyHeader : bool, default=True
1062
+ If True, verifies that the file header matches the provided header.
1063
+ rewrite_on_load : bool, default=True
1064
+ If True, rewrites the entire file when loading to ensure consistency.
1065
+ rewrite_on_exit : bool, default=False
1066
+ If True, rewrites the entire file when closing/exiting.
1067
+ rewrite_interval : float, default=0
1068
+ Minimum time interval (in seconds) between full file rewrites. 0 means no limit.
1069
+ append_check_delay : float, default=0.01
1070
+ Time delay (in seconds) between checks of the append queue by the worker thread.
1071
+ monitor_external_changes : bool, default=True
1072
+ If True, monitors and detects external file modifications.
1073
+ verbose : bool, default=False
1074
+ If True, prints detailed operation logs.
1075
+ encoding : str, default='utf8'
1076
+ Character encoding for reading/writing the file.
1077
+ delimiter : str, optional
1078
+ Field delimiter character. Auto-detected from filename if not specified.
1079
+ defaults : list or str, optional
1080
+ Default values for columns when values are missing.
1081
+ strict : bool, default=False
1082
+ If True, enforces strict validation of column counts and raises errors on mismatch.
1083
+ correctColumnNum : int, default=-1
1084
+ Expected number of columns. -1 means auto-detect from header or first record.
1085
+ Attributes
1086
+ ----------
1087
+ version : str
1088
+ Version of the TSVZed implementation.
1089
+ dirty : bool
1090
+ True if the in-memory data differs from the file on disk.
1091
+ deSynced : bool
1092
+ True if synchronization with the file has failed or external changes detected.
1093
+ memoryOnly : bool
1094
+ If True, changes are kept in memory only and not written to disk.
1095
+ appendQueue : deque
1096
+ Queue of lines waiting to be appended to the file.
1097
+ writeLock : threading.Lock
1098
+ Lock for ensuring thread-safe file operations.
1099
+ shutdownEvent : threading.Event
1100
+ Event signal for stopping the append worker thread.
1101
+ appendThread : threading.Thread
1102
+ Background thread that handles asynchronous file appending.
1103
+ Methods
1104
+ -------
1105
+ load()
1106
+ Load or reload data from the TSV file.
1107
+ reload()
1108
+ Refresh data from the TSV file, discarding in-memory changes.
1109
+ rewrite(force=False, reloadInternalFromFile=None)
1110
+ Rewrite the entire file with current in-memory data.
1111
+ mapToFile()
1112
+ Synchronize in-memory data to the file using in-place updates.
1113
+ hardMapToFile()
1114
+ Completely rewrite the file from scratch with current data.
1115
+ clear()
1116
+ Clear all data from memory and optionally the file.
1117
+ clear_file()
1118
+ Clear the file, keeping only the header.
1119
+ commitAppendToFile()
1120
+ Write all queued append operations to the file.
1121
+ stopAppendThread()
1122
+ Stop the background append worker thread and perform final sync.
1123
+ setDefaults(defaults)
1124
+ Set default values for columns.
1125
+ getListView()
1126
+ Get a list representation of the data with headers.
1127
+ getResourceUsage(return_dict=False)
1128
+ Get current resource usage statistics.
1129
+ checkExternalChanges()
1130
+ Check if the file has been modified externally.
1131
+ close()
1132
+ Close the TSVZed object, stopping background threads and syncing data.
1133
+ Notes
1134
+ -----
1135
+ - The class uses a background thread to handle asynchronous file operations.
1136
+ - File locking is implemented for both POSIX and Windows systems.
1137
+ - Keys starting with '#' are treated as comments and not persisted to file.
1138
+ - The special key '#DEFAULTS#' is used to store column default values.
1139
+ - Supports compressed file formats through automatic detection.
1140
+ - Thread-safe for concurrent access from multiple threads.
1141
+ Examples
1142
+ --------
1143
+ >>> with TSVZed('data.tsv', header='id\tname\tvalue') as tsv:
1144
+ ... tsv['key1'] = ['key1', 'John', '100']
1145
+ ... tsv['key2'] = ['key2', 'Jane', '200']
1146
+ ... print(tsv['key1'])
1147
+ ['key1', 'John', '100']
1148
+ >>> tsv = TSVZed('data.tsv', verbose=True, rewrite_on_exit=True)
1149
+ >>> tsv['key3'] = 'key3\tBob\t300'
1150
+ >>> tsv.close()
1151
+ """
1152
+ def __teePrintOrNot(self,message,level = 'info'):
1153
+ try:
1154
+ if self.teeLogger:
1155
+ self.teeLogger.teelog(message,level)
1156
+ else:
1157
+ print(message,flush=True)
1158
+ except Exception:
1159
+ print(message,flush=True)
1160
+
1161
+ def getResourceUsage(self,return_dict = False):
1162
+ return get_resource_usage(return_dict = return_dict)
1163
+
1164
+ def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,
1165
+ rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,
1166
+ verbose = False,encoding = 'utf8',delimiter = ...,defaults = None,strict = False,correctColumnNum = -1):
1167
+ super().__init__()
1168
+ self.version = version
1169
+ self.strict = strict
1170
+ self.externalFileUpdateTime = getFileUpdateTimeNs(fileName)
1171
+ self.lastUpdateTime = self.externalFileUpdateTime
1172
+ self._fileName = fileName
1173
+ self.teeLogger = teeLogger
1174
+ self.delimiter = get_delimiter(delimiter,file_name=fileName)
1175
+ self.setDefaults(defaults)
1176
+ self.header = _formatHeader(header,verbose = verbose,teeLogger = self.teeLogger,delimiter=self.delimiter)
1177
+ self.correctColumnNum = correctColumnNum
1178
+ self.createIfNotExist = createIfNotExist
1179
+ self.verifyHeader = verifyHeader
1180
+ self.rewrite_on_load = rewrite_on_load
1181
+ self.rewrite_on_exit = rewrite_on_exit
1182
+ self.rewrite_interval = rewrite_interval
1183
+ self.monitor_external_changes = monitor_external_changes
1184
+ if not monitor_external_changes:
1185
+ self.__teePrintOrNot(f"Warning: External changes monitoring disabled for {self._fileName}. Will overwrite external changes.",'warning')
1186
+ self.verbose = verbose
1187
+ if append_check_delay < 0:
1188
+ append_check_delay = 0.00001
1189
+ self.__teePrintOrNot('append_check_delay cannot be less than 0, setting it to 0.00001','error')
1190
+ self.append_check_delay = append_check_delay
1191
+ self.appendQueue = deque()
1192
+ self.dirty = False
1193
+ self.deSynced = False
1194
+ self.memoryOnly = False
1195
+ self.encoding = encoding
1196
+ self.writeLock = threading.Lock()
1197
+ self.shutdownEvent = threading.Event()
1198
+ #self.appendEvent = threading.Event()
1199
+ self.appendThread = threading.Thread(target=self._appendWorker,daemon=True)
1200
+ self.appendThread.start()
1201
+ self.load()
1202
+ atexit.register(self.stopAppendThread)
1203
+
1204
+ def setDefaults(self,defaults):
1205
+ if not defaults:
1206
+ defaults = []
1207
+ if isinstance(defaults,str):
1208
+ defaults = defaults.split(self.delimiter)
1209
+ elif not isinstance(defaults,list):
1210
+ try:
1211
+ defaults = list(defaults)
1212
+ except Exception:
1213
+ if self.verbose:
1214
+ self.__teePrintOrNot('Invalid defaults, setting defaults to empty.','error')
1215
+ defaults = []
1216
+ defaults = [str(s).rstrip() if s else '' for s in defaults]
1217
+ if not any(defaults):
1218
+ defaults = []
1219
+ if not defaults or defaults[0] != DEFAULTS_INDICATOR_KEY:
1220
+ defaults = [DEFAULTS_INDICATOR_KEY]+defaults
1221
+ self.defaults = defaults
1222
+
1223
+ def load(self):
1224
+ self.reload()
1225
+ if self.rewrite_on_load:
1226
+ self.rewrite(force = True,reloadInternalFromFile = False)
1227
+ return self
1228
+
1229
+ def reload(self):
1230
+ # Load or refresh data from the TSV file
1231
+ mo = self.memoryOnly
1232
+ self.memoryOnly = True
1233
+ if self.verbose:
1234
+ self.__teePrintOrNot(f"Loading {self._fileName}")
1235
+ super().clear()
1236
+ readTabularFile(self._fileName, teeLogger = self.teeLogger, header = self.header,
1237
+ createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader,
1238
+ verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None,
1239
+ strict = self.strict, delimiter = self.delimiter, defaults=self.defaults)
1240
+ if self.verbose:
1241
+ self.__teePrintOrNot(f"Loaded {len(self)} records from {self._fileName}")
1242
+ if self.header and any(self.header) and self.verifyHeader:
1243
+ self.correctColumnNum = len(self.header)
1244
+ elif self:
1245
+ self.correctColumnNum = len(self[next(iter(self))])
1246
+ else:
1247
+ self.correctColumnNum = -1
1248
+ if self.verbose:
1249
+ self.__teePrintOrNot(f"correctColumnNum: {self.correctColumnNum}")
1250
+ #super().update(loadedData)
1251
+ if self.verbose:
1252
+ self.__teePrintOrNot(f"TSVZed({self._fileName}) loaded")
1253
+ self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1254
+ self.lastUpdateTime = self.externalFileUpdateTime
1255
+ self.memoryOnly = mo
1256
+ return self
1257
+
1258
+ def __setitem__(self,key,value):
1259
+ key = str(key).rstrip()
1260
+ if not key:
1261
+ self.__teePrintOrNot('Key cannot be empty','error')
1262
+ return
1263
+ if isinstance(value,str):
1264
+ value = value.split(self.delimiter)
1265
+ # sanitize the value
1266
+ value = [str(s).rstrip() if s else '' for s in value]
1267
+ # the first field in value should be the key
1268
+ # add it if it is not there
1269
+ if not value or value[0] != key:
1270
+ value = [key]+value
1271
+ # verify the value has the correct number of columns
1272
+ if self.correctColumnNum != 1 and len(value) == 1:
1273
+ # this means we want to clear / delete the key
1274
+ del self[key]
1275
+ elif self.correctColumnNum > 0:
1276
+ if len(value) != self.correctColumnNum:
1277
+ if self.strict:
1278
+ self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}. Refuse adding key...",'error')
1279
+ return
1280
+ elif self.verbose:
1281
+ self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}, correcting...",'warning')
1282
+ if len(value) < self.correctColumnNum:
1283
+ value += ['']*(self.correctColumnNum-len(value))
1284
+ elif len(value) > self.correctColumnNum:
1285
+ value = value[:self.correctColumnNum]
1286
+ else:
1287
+ self.correctColumnNum = len(value)
1288
+ if self.defaults and len(self.defaults) > 1:
1289
+ for i in range(1,len(value)):
1290
+ if not value[i] and i < len(self.defaults) and self.defaults[i]:
1291
+ value[i] = self.defaults[i]
1292
+ if self.verbose:
1293
+ self.__teePrintOrNot(f" Replacing empty value at {i} with default: {self.defaults[i]}")
1294
+ if key == DEFAULTS_INDICATOR_KEY:
1295
+ self.defaults = value
1296
+ if self.verbose:
1297
+ self.__teePrintOrNot(f"Defaults set to {value}")
1298
+ if not self.memoryOnly:
1299
+ self.appendQueue.append(value)
1300
+ self.lastUpdateTime = get_time_ns()
1301
+ if self.verbose:
1302
+ self.__teePrintOrNot(f"Appending Defaults {key} to the appendQueue")
1303
+ return
1304
+ if self.verbose:
1305
+ self.__teePrintOrNot(f"Setting {key} to {value}")
1306
+ if key in self:
1307
+ if self[key] == value:
1308
+ if self.verbose:
1309
+ self.__teePrintOrNot(f"Key {key} already exists with the same value")
1310
+ return
1311
+ self.dirty = True
1312
+ # update the dictionary,
1313
+ super().__setitem__(key,value)
1314
+ if self.memoryOnly:
1315
+ if self.verbose:
1316
+ self.__teePrintOrNot(f"Key {key} updated in memory only")
1317
+ return
1318
+ elif key.startswith('#'):
1319
+ if self.verbose:
1320
+ self.__teePrintOrNot(f"Key {key} updated in memory only as it starts with #")
1321
+ return
1322
+ if self.verbose:
1323
+ self.__teePrintOrNot(f"Appending {key} to the appendQueue")
1324
+ self.appendQueue.append(value)
1325
+ self.lastUpdateTime = get_time_ns()
1326
+ # if not self.appendThread.is_alive():
1327
+ # self.commitAppendToFile()
1328
+ # else:
1329
+ # self.appendEvent.set()
1330
+
1331
+ def __getitem__(self, key):
1332
+ return super().__getitem__(str(key).rstrip())
1333
+
1334
+
1335
+ def __delitem__(self,key):
1336
+ key = str(key).rstrip()
1337
+ if key == DEFAULTS_INDICATOR_KEY:
1338
+ self.defaults = [DEFAULTS_INDICATOR_KEY]
1339
+ if self.verbose:
1340
+ self.__teePrintOrNot("Defaults cleared")
1341
+ if not self.memoryOnly:
1342
+ self.__appendEmptyLine(key)
1343
+ if self.verbose:
1344
+ self.__teePrintOrNot(f"Appending empty default line {key}")
1345
+ return
1346
+ # delete the key from the dictionary and update the file
1347
+ if key not in self:
1348
+ if self.verbose:
1349
+ self.__teePrintOrNot(f"Key {key} not found")
1350
+ return
1351
+ super().__delitem__(key)
1352
+ if self.memoryOnly or key.startswith('#'):
1353
+ if self.verbose:
1354
+ self.__teePrintOrNot(f"Key {key} deleted in memory")
1355
+ return
1356
+ self.__appendEmptyLine(key)
1357
+ if self.verbose:
1358
+ self.__teePrintOrNot(f"Appending empty line {key}")
1359
+ self.lastUpdateTime = get_time_ns()
1360
+
1361
+ def __appendEmptyLine(self,key):
1362
+ self.dirty = True
1363
+ if self.correctColumnNum > 0:
1364
+ emptyLine = [key]+[self.delimiter]*(self.correctColumnNum-1)
1365
+ elif len(self[key]) > 1:
1366
+ self.correctColumnNum = len(self[key])
1367
+ emptyLine = [key]+[self.delimiter]*(self.correctColumnNum-1)
1368
+ else:
1369
+ emptyLine = [key]
1370
+ if self.verbose:
1371
+ self.__teePrintOrNot(f"Appending {emptyLine} to the appendQueue")
1372
+ self.appendQueue.append(emptyLine)
1373
+ return self
1374
+
1375
+ def getListView(self):
1376
+ return getListView(self,header=self.header,delimiter=self.delimiter)
1377
+
1378
+ def clear(self):
1379
+ # clear the dictionary and update the file
1380
+ super().clear()
1381
+ if self.verbose:
1382
+ self.__teePrintOrNot(f"Clearing {self._fileName}")
1383
+ if self.memoryOnly:
1384
+ return self
1385
+ self.clear_file()
1386
+ self.lastUpdateTime = self.externalFileUpdateTime
1387
+ return self
1388
+
1389
+ def clear_file(self):
1390
+ try:
1391
+ if self.header:
1392
+ file = self.get_file_obj('wb')
1393
+ header = self.delimiter.join(_sanitize(self.header,delimiter=self.delimiter))
1394
+ file.write(header.encode(self.encoding,errors='replace') + b'\n')
1395
+ self.release_file_obj(file)
1396
+ if self.verbose:
1397
+ self.__teePrintOrNot(f"Header {header} written to {self._fileName}")
1398
+ self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1399
+ else:
1400
+ file = self.get_file_obj('wb')
1401
+ self.release_file_obj(file)
1402
+ if self.verbose:
1403
+ self.__teePrintOrNot(f"File {self._fileName} cleared empty")
1404
+ self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1405
+ self.dirty = False
1406
+ self.deSynced = False
1407
+ except Exception as e:
1408
+ self.release_file_obj(file)
1409
+ self.__teePrintOrNot(f"Failed to write at clear_file() to {self._fileName}: {e}",'error')
1410
+ import traceback
1411
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1412
+ self.deSynced = True
1413
+ return self
1414
+
1415
+ def __enter__(self):
1416
+ return self
1417
+
1418
+ def close(self):
1419
+ self.stopAppendThread()
1420
+ return self
1421
+
1422
+ def __exit__(self,exc_type,exc_value,traceback):
1423
+ return self.close()
1424
+
1425
+ def __repr__(self):
1426
+ return f"""TSVZed(
1183
1427
  file_name:{self._fileName}
1184
1428
  teeLogger:{self.teeLogger}
1185
1429
  header:{self.header}
@@ -1196,372 +1440,860 @@ dirty:{self.dirty}
1196
1440
  deSynced:{self.deSynced}
1197
1441
  memoryOnly:{self.memoryOnly}
1198
1442
  {dict(self)})"""
1199
-
1200
- def __str__(self):
1201
- return f"TSVZed({self._fileName},{dict(self)})"
1202
-
1203
- def __del__(self):
1204
- return self.close()
1205
-
1206
- def popitem(self, last=True):
1207
- key, value = super().popitem(last)
1208
- if not self.memoryOnly:
1209
- self.__appendEmptyLine(key)
1210
- self.lastUpdateTime = get_time_ns()
1211
- return key, value
1212
-
1213
- __marker = object()
1214
-
1215
- def pop(self, key, default=__marker):
1216
- '''od.pop(k[,d]) -> v, remove specified key and return the corresponding
1217
- value. If key is not found, d is returned if given, otherwise KeyError
1218
- is raised.
1219
-
1220
- '''
1221
- if key not in self:
1222
- if default is self.__marker:
1223
- raise KeyError(key)
1224
- return default
1225
- value = super().pop(key)
1226
- if not self.memoryOnly:
1227
- self.__appendEmptyLine(key)
1228
- self.lastUpdateTime = get_time_ns()
1229
- return value
1230
-
1231
- def move_to_end(self, key, last=True):
1232
- '''Move an existing element to the end (or beginning if last is false).
1233
- Raise KeyError if the element does not exist.
1234
- '''
1235
- super().move_to_end(key, last)
1236
- self.dirty = True
1237
- if not self.rewrite_on_exit:
1238
- self.rewrite_on_exit = True
1239
- self.__teePrintOrNot(f"Warning: move_to_end had been called. Need to resync for changes to apply to disk.")
1240
- self.__teePrintOrNot(f"rewrite_on_exit set to True")
1241
- if self.verbose:
1242
- self.__teePrintOrNot(f"Warning: Trying to move Key {key} moved to {'end' if last else 'beginning'} Need to resync for changes to apply to disk")
1243
- self.lastUpdateTime = get_time_ns()
1244
- return self
1245
-
1246
- @classmethod
1247
- def fromkeys(cls, iterable, value=None,fileName = None,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,verbose = False):
1248
- '''Create a new ordered dictionary with keys from iterable and values set to value.
1249
- '''
1250
- self = cls(fileName,teeLogger,header,createIfNotExist,verifyHeader,rewrite_on_load,rewrite_on_exit,rewrite_interval,append_check_delay,verbose)
1251
- for key in iterable:
1252
- self[key] = value
1253
- return self
1254
-
1255
-
1256
- def rewrite(self,force = False,reloadInternalFromFile = None):
1257
- if not self.deSynced and not force:
1258
- if not self.dirty:
1259
- return False
1260
- if self.rewrite_interval == 0 or time.time() - os.path.getmtime(self._fileName) < self.rewrite_interval:
1261
- return False
1262
- try:
1263
-
1264
- if reloadInternalFromFile is None:
1265
- reloadInternalFromFile = self.monitor_external_changes
1266
- if reloadInternalFromFile and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
1267
- # this will be needed if more than 1 process is accessing the file
1268
- self.commitAppendToFile()
1269
- self.reload()
1270
- if self.memoryOnly:
1271
- if self.verbose:
1272
- self.__teePrintOrNot(f"Memory only mode. Map to file skipped.")
1273
- return False
1274
- if self.dirty:
1275
- if self.verbose:
1276
- self.__teePrintOrNot(f"Rewriting {self._fileName}")
1277
- self.mapToFile()
1278
- if self.verbose:
1279
- self.__teePrintOrNot(f"{len(self)} records rewrote to {self._fileName}")
1280
- if not self.appendThread.is_alive():
1281
- self.commitAppendToFile()
1282
- # else:
1283
- # self.appendEvent.set()
1284
- return True
1285
- except Exception as e:
1286
- self.__teePrintOrNot(f"Failed to write at sync() to {self._fileName}: {e}",'error')
1287
- import traceback
1288
- self.__teePrintOrNot(traceback.format_exc(),'error')
1289
- self.deSynced = True
1290
- return False
1291
-
1292
- def hardMapToFile(self):
1293
- try:
1294
- if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
1295
- self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
1296
- file = self.get_file_obj('wb')
1297
- if self.header:
1298
- file.write(self.header.encode(self.encoding,errors='replace') + b'\n')
1299
- for key in self:
1300
- file.write(self.delimiter.join(self[key]).encode(encoding=self.encoding,errors='replace')+b'\n')
1301
- self.release_file_obj(file)
1302
- if self.verbose:
1303
- self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
1304
- self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1305
- self.dirty = False
1306
- self.deSynced = False
1307
- except Exception as e:
1308
- self.release_file_obj(file)
1309
- self.__teePrintOrNot(f"Failed to write at hardMapToFile() to {self._fileName}: {e}",'error')
1310
- import traceback
1311
- self.__teePrintOrNot(traceback.format_exc(),'error')
1312
- self.deSynced = True
1313
- return self
1314
-
1315
- def mapToFile(self):
1316
- mec = self.monitor_external_changes
1317
- self.monitor_external_changes = False
1318
- try:
1319
- if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
1320
- self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
1321
- if self._fileName.rpartition('.')[2] in COMPRESSED_FILE_EXTENSIONS:
1322
- # if the file is compressed, we need to use the hardMapToFile method
1323
- return self.hardMapToFile()
1324
- file = self.get_file_obj('r+b')
1325
- overWrite = False
1326
- if self.header:
1327
- line = file.readline().decode(self.encoding,errors='replace')
1328
- aftPos = file.tell()
1329
- if not _lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = self.strict):
1330
- file.seek(0)
1331
- file.write(f'{self.header}\n'.encode(encoding=self.encoding,errors='replace'))
1332
- # if the header is not the same length as the line, we need to overwrite the file
1333
- if aftPos != file.tell():
1334
- overWrite = True
1335
- if self.verbose:
1336
- self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
1337
- for value in self.values():
1338
- if value[0].startswith('#'):
1339
- continue
1340
- strToWrite = self.delimiter.join(value)
1341
- if overWrite:
1342
- if self.verbose:
1343
- self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
1344
- file.write(strToWrite.encode(encoding=self.encoding,errors='replace')+b'\n')
1345
- continue
1346
- pos = file.tell()
1347
- line = file.readline()
1348
- aftPos = file.tell()
1349
- if not line or pos == aftPos:
1350
- if self.verbose:
1351
- self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
1352
- file.write(strToWrite.encode(encoding=self.encoding,errors='replace'))
1353
- overWrite = True
1354
- continue
1355
- strToWrite = strToWrite.encode(encoding=self.encoding,errors='replace').ljust(len(line)-1)+b'\n'
1356
- if line != strToWrite:
1357
- if self.verbose:
1358
- self.__teePrintOrNot(f"Modifing {value} to {self._fileName}")
1359
- file.seek(pos)
1360
- # fill the string with space to write to the correct length
1361
- #file.write(strToWrite.rstrip('\n').ljust(len(line)-1)+'\n')
1362
- file.write(strToWrite)
1363
- if aftPos != file.tell():
1364
- overWrite = True
1365
- file.truncate()
1366
- self.release_file_obj(file)
1367
- if self.verbose:
1368
- self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
1369
- self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1370
- self.dirty = False
1371
- self.deSynced = False
1372
- except Exception as e:
1373
- self.release_file_obj(file)
1374
- self.__teePrintOrNot(f"Failed to write at mapToFile() to {self._fileName}: {e}",'error')
1375
- import traceback
1376
- self.__teePrintOrNot(traceback.format_exc(),'error')
1377
- self.deSynced = True
1378
- self.__teePrintOrNot("Trying failback hardMapToFile()")
1379
- self.hardMapToFile()
1380
- self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1381
- self.monitor_external_changes = mec
1382
- return self
1383
-
1384
- def checkExternalChanges(self):
1385
- if self.deSynced:
1386
- return self
1387
- if not self.monitor_external_changes:
1388
- return self
1389
- realExternalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1390
- if self.externalFileUpdateTime < realExternalFileUpdateTime:
1391
- self.deSynced = True
1392
- self.__teePrintOrNot(f"External changes detected in {self._fileName}")
1393
- elif self.externalFileUpdateTime > realExternalFileUpdateTime:
1394
- self.__teePrintOrNot(f"Time anomalies detected in {self._fileName}, resetting externalFileUpdateTime")
1395
- self.externalFileUpdateTime = realExternalFileUpdateTime
1396
- return self
1397
-
1398
- def _appendWorker(self):
1399
- while not self.shutdownEvent.is_set():
1400
- if not self.memoryOnly:
1401
- self.checkExternalChanges()
1402
- self.rewrite()
1403
- self.commitAppendToFile()
1404
- time.sleep(self.append_check_delay)
1405
- # self.appendEvent.wait()
1406
- # self.appendEvent.clear()
1407
- if self.verbose:
1408
- self.__teePrintOrNot(f"Append worker for {self._fileName} shut down")
1409
- self.commitAppendToFile()
1410
-
1411
- def commitAppendToFile(self):
1412
- if self.appendQueue:
1413
- if self.memoryOnly:
1414
- self.appendQueue.clear()
1415
- if self.verbose:
1416
- self.__teePrintOrNot(f"Memory only mode. Append queue cleared.")
1417
- return self
1418
- try:
1419
- if self.verbose:
1420
- self.__teePrintOrNot(f"Commiting {len(self.appendQueue)} records to {self._fileName}")
1421
- self.__teePrintOrNot(f"Before size of {self._fileName}: {os.path.getsize(self._fileName)}")
1422
- file = self.get_file_obj('ab')
1423
- while self.appendQueue:
1424
- line = self.appendQueue.popleft()
1425
- file.write(line.encode(encoding=self.encoding,errors='replace')+b'\n')
1426
- self.release_file_obj(file)
1427
- if self.verbose:
1428
- self.__teePrintOrNot(f"Records commited to {self._fileName}")
1429
- self.__teePrintOrNot(f"After size of {self._fileName}: {os.path.getsize(self._fileName)}")
1430
- except Exception as e:
1431
- self.release_file_obj(file)
1432
- self.__teePrintOrNot(f"Failed to write at commitAppendToFile to {self._fileName}: {e}",'error')
1433
- import traceback
1434
- self.__teePrintOrNot(traceback.format_exc(),'error')
1435
- self.deSynced = True
1436
- return self
1437
-
1438
- def stopAppendThread(self):
1439
- try:
1440
- if self.shutdownEvent.is_set():
1441
- # if self.verbose:
1442
- # self.__teePrintOrNot(f"Append thread for {self._fileName} already stopped")
1443
- return
1444
- self.rewrite(force=self.rewrite_on_exit) # Ensure any final sync operations are performed
1445
- # self.appendEvent.set()
1446
- self.shutdownEvent.set() # Signal the append thread to shut down
1447
- self.appendThread.join() # Wait for the append thread to complete
1448
- if self.verbose:
1449
- self.__teePrintOrNot(f"Append thread for {self._fileName} stopped")
1450
- except Exception as e:
1451
- self.__teePrintOrNot(f"Failed to stop append thread for {self._fileName}: {e}",'error')
1452
- import traceback
1453
- self.__teePrintOrNot(traceback.format_exc(),'error')
1454
-
1455
- def get_file_obj(self,modes = 'ab'):
1456
- self.writeLock.acquire()
1457
- try:
1458
- if not self.encoding:
1459
- self.encoding = 'utf8'
1460
- file = openFileAsCompressed(self._fileName, mode=modes, encoding=self.encoding,teeLogger=self.teeLogger)
1461
- # Lock the file after opening
1462
- if os.name == 'posix':
1463
- fcntl.lockf(file, fcntl.LOCK_EX)
1464
- elif os.name == 'nt':
1465
- # For Windows, locking the entire file, avoiding locking an empty file
1466
- #lock_length = max(1, os.path.getsize(self._fileName))
1467
- lock_length = 2147483647
1468
- msvcrt.locking(file.fileno(), msvcrt.LK_LOCK, lock_length)
1469
- if self.verbose:
1470
- self.__teePrintOrNot(f"File {self._fileName} locked with mode {modes}")
1471
- except Exception as e:
1472
- try:
1473
- self.writeLock.release() # Release the thread lock in case of an error
1474
- except Exception as e:
1475
- self.__teePrintOrNot(f"Failed to release writeLock for {self._fileName}: {e}",'error')
1476
- self.__teePrintOrNot(f"Failed to open file {self._fileName}: {e}",'error')
1477
- return file
1478
-
1479
- def release_file_obj(self,file):
1480
- # if write lock is already released, return
1481
- if not self.writeLock.locked():
1482
- return
1483
- try:
1484
- file.flush() # Ensure the file is flushed before unlocking
1485
- os.fsync(file.fileno()) # Ensure the file is synced to disk before unlocking
1486
- if not file.closed:
1487
- if os.name == 'posix':
1488
- fcntl.lockf(file, fcntl.LOCK_UN)
1489
- elif os.name == 'nt':
1490
- # Unlocking the entire file; for Windows, ensure not unlocking an empty file
1491
- #unlock_length = max(1, os.path.getsize(os.path.realpath(file.name)))
1492
- unlock_length = 2147483647
1493
- try:
1494
- msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
1495
- except:
1496
- pass
1497
- file.close() # Ensure file is closed after unlocking
1498
- if self.verbose:
1499
- self.__teePrintOrNot(f"File {file.name} unlocked / released")
1500
- except Exception as e:
1501
- try:
1502
- self.writeLock.release() # Ensure the thread lock is always released
1503
- except Exception as e:
1504
- self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
1505
- self.__teePrintOrNot(f"Failed to release file {file.name}: {e}",'error')
1506
- import traceback
1507
- self.__teePrintOrNot(traceback.format_exc(),'error')
1508
- # release the write lock if not already released
1509
- if self.writeLock.locked():
1510
- try:
1511
- self.writeLock.release() # Ensure the thread lock is always released
1512
- except Exception as e:
1513
- self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
1514
- self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1443
+
1444
    def __str__(self):
        # Human-readable dump: backing file name plus the full in-memory mapping.
        return f"TSVZed({self._fileName},{dict(self)})"
1446
+
1447
    def __del__(self):
        # Best-effort cleanup on garbage collection; close() flushes pending
        # appends and stops the background thread.
        return self.close()
1449
+
1450
    def popitem(self, last=True):
        """Remove and return a (key, value) pair (LIFO by default).

        When file-backed, the deletion is persisted append-only by writing a
        key-only "empty" line (tombstone) rather than rewriting the file.
        """
        key, value = super().popitem(last)
        if not self.memoryOnly:
            self.__appendEmptyLine(key)  # persist the deletion as a tombstone row
        self.lastUpdateTime = get_time_ns()
        return key, value
1456
+
1457
    # Sentinel distinguishing "no default supplied" from an explicit default=None.
    __marker = object()

    def pop(self, key, default=__marker):
        '''od.pop(k[,d]) -> v, remove specified key and return the corresponding
        value. If key is not found, d is returned if given, otherwise KeyError
        is raised.
        '''
        key = str(key).rstrip()  # keys are normalized the same way on insert
        if key not in self:
            if default is self.__marker:
                raise KeyError(key)
            return default
        value = super().pop(key)
        if not self.memoryOnly:
            self.__appendEmptyLine(key)  # persist the deletion as a tombstone row
        self.lastUpdateTime = get_time_ns()
        return value
1475
+
1476
    def move_to_end(self, key, last=True):
        '''Move an existing element to the end (or beginning if last is false).
        Raise KeyError if the element does not exist.
        '''
        key = str(key).rstrip()
        super().move_to_end(key, last)
        self.dirty = True
        if not self.rewrite_on_exit:
            # Reordering cannot be expressed as an append-only operation, so a
            # full rewrite of the backing file is forced at exit.
            self.rewrite_on_exit = True
            self.__teePrintOrNot("Warning: move_to_end had been called. Need to resync for changes to apply to disk.")
            self.__teePrintOrNot("rewrite_on_exit set to True")
        if self.verbose:
            self.__teePrintOrNot(f"Warning: Trying to move Key {key} moved to {'end' if last else 'beginning'} Need to resync for changes to apply to disk")
        self.lastUpdateTime = get_time_ns()
        return self
1491
+
1492
    def __sizeof__(self):
        # Rough, shallow estimate: shallow sizes of the bookkeeping attributes
        # plus the underlying OrderedDict storage. Does not recurse into values.
        sizeof = sys.getsizeof
        size = sizeof(super()) + sizeof(True) * 12  # for the booleans / integers
        size += sizeof(self.externalFileUpdateTime)
        size += sizeof(self.lastUpdateTime)
        size += sizeof(self._fileName)
        size += sizeof(self.teeLogger)
        size += sizeof(self.delimiter)
        size += sizeof(self.defaults)
        size += sizeof(self.header)
        size += sizeof(self.appendQueue)
        size += sizeof(self.encoding)
        size += sizeof(self.writeLock)
        size += sizeof(self.shutdownEvent)
        size += sizeof(self.appendThread)
        size += super().__sizeof__()
        return size
1509
+
1510
    @classmethod
    def fromkeys(cls, iterable, value=None, fileName=None, teeLogger=None, header='', createIfNotExist=True, verifyHeader=True, rewrite_on_load=True, rewrite_on_exit=False, rewrite_interval=0, append_check_delay=0.01, verbose=False):
        '''Create a new ordered dictionary with keys from iterable and values set to value.
        '''
        # Build a fully-configured instance first, then assign each key so the
        # writes go through __setitem__ (and therefore get persisted).
        self = cls(fileName, teeLogger, header, createIfNotExist, verifyHeader, rewrite_on_load, rewrite_on_exit, rewrite_interval, append_check_delay, verbose)
        for key in iterable:
            self[key] = value
        return self
1518
+
1519
+
1520
    def rewrite(self, force=False, reloadInternalFromFile=None):
        """Synchronize the in-memory map with the backing file.

        Returns True when a rewrite happened, False when skipped or failed.
        Skips when neither deSynced nor force, and the store is clean or the
        rewrite interval has not elapsed since the file's mtime.
        """
        if not self.deSynced and not force:
            if not self.dirty:
                return False
            if self.rewrite_interval == 0 or time.time() - os.path.getmtime(self._fileName) < self.rewrite_interval:
                return False
        try:
            if reloadInternalFromFile is None:
                reloadInternalFromFile = self.monitor_external_changes
            if reloadInternalFromFile and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
                # this will be needed if more than 1 process is accessing the file
                self.commitAppendToFile()
                self.reload()
            if self.memoryOnly:
                if self.verbose:
                    self.__teePrintOrNot("Memory only mode. Map to file skipped.")
                return False
            if self.dirty:
                if self.verbose:
                    self.__teePrintOrNot(f"Rewriting {self._fileName}")
                self.mapToFile()
                if self.verbose:
                    self.__teePrintOrNot(f"{len(self)} records rewrote to {self._fileName}")
            if not self.appendThread.is_alive():
                # No background worker running: flush the append queue inline.
                self.commitAppendToFile()
            # else:
            #     self.appendEvent.set()
            return True
        except Exception as e:
            self.__teePrintOrNot(f"Failed to write at sync() to {self._fileName}: {e}", 'error')
            import traceback
            self.__teePrintOrNot(traceback.format_exc(), 'error')
            self.deSynced = True
            return False
1555
+
1556
+ def hardMapToFile(self):
1557
+ try:
1558
+ if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
1559
+ self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
1560
+ file = self.get_file_obj('wb')
1561
+ buf = io.BufferedWriter(file, buffer_size=64*1024*1024) # 64MB buffer
1562
+ if self.header:
1563
+ header = self.delimiter.join(_sanitize(self.header,delimiter=self.delimiter))
1564
+ buf.write(header.encode(self.encoding,errors='replace') + b'\n')
1565
+ for key in self:
1566
+ segments = _sanitize(self[key],delimiter=self.delimiter)
1567
+ buf.write(self.delimiter.join(segments).encode(encoding=self.encoding,errors='replace')+b'\n')
1568
+ buf.flush()
1569
+ self.release_file_obj(file)
1570
+ if self.verbose:
1571
+ self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
1572
+ self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1573
+ self.dirty = False
1574
+ self.deSynced = False
1575
+ except Exception as e:
1576
+ self.release_file_obj(file)
1577
+ self.__teePrintOrNot(f"Failed to write at hardMapToFile() to {self._fileName}: {e}",'error')
1578
+ import traceback
1579
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1580
+ self.deSynced = True
1581
+ return self
1582
+
1583
    def mapToFile(self):
        """In-place update of the backing file, touching only changed lines.

        Walks the file line-by-line alongside the in-memory records; once any
        written line changes length (overWrite becomes True), the remainder of
        the file is simply rewritten. Falls back to hardMapToFile() on error.
        Compressed files cannot be patched in place and always take the
        hardMapToFile() path.
        """
        mec = self.monitor_external_changes
        self.monitor_external_changes = False  # suppress self-triggered change detection while writing
        try:
            if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
                self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}", 'warning')
            if self._fileName.rpartition('.')[2] in COMPRESSED_FILE_EXTENSIONS:
                # if the file is compressed, we need to use the hardMapToFile method
                return self.hardMapToFile()
            file = self.get_file_obj('r+b')
            overWrite = False
            if self.header:
                line = file.readline().decode(self.encoding, errors='replace')
                aftPos = file.tell()
                if not _lineContainHeader(self.header, line, verbose=self.verbose, teeLogger=self.teeLogger, strict=self.strict):
                    header = self.delimiter.join(_sanitize(self.header, delimiter=self.delimiter))
                    file.seek(0)
                    file.write(f'{header}\n'.encode(encoding=self.encoding, errors='replace'))
                    # if the header is not the same length as the line, we need to overwrite the file
                    if aftPos != file.tell():
                        overWrite = True
                    if self.verbose:
                        self.__teePrintOrNot(f"Header {header} written to {self._fileName}")
            for value in self.values():
                if value[0].startswith('#'):
                    # '#'-prefixed keys are memory-only and never persisted
                    continue
                segments = _sanitize(value, delimiter=self.delimiter)
                strToWrite = self.delimiter.join(segments)
                if overWrite:
                    if self.verbose:
                        self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
                    file.write(strToWrite.encode(encoding=self.encoding, errors='replace') + b'\n')
                    continue
                pos = file.tell()
                line = file.readline()
                aftPos = file.tell()
                if not line or pos == aftPos:
                    if self.verbose:
                        self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
                    file.write(strToWrite.encode(encoding=self.encoding, errors='replace'))
                    overWrite = True
                    continue
                # Pad to the existing line's length so the write stays in place.
                strToWrite = strToWrite.encode(encoding=self.encoding, errors='replace').ljust(len(line) - 1) + b'\n'
                if line != strToWrite:
                    if self.verbose:
                        self.__teePrintOrNot(f"Modifing {value} to {self._fileName}")
                    file.seek(pos)
                    # fill the string with space to write to the correct length
                    file.write(strToWrite)
                if aftPos != file.tell():
                    overWrite = True
            file.truncate()  # drop any stale trailing lines
            self.release_file_obj(file)
            if self.verbose:
                self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
                self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
            self.dirty = False
            self.deSynced = False
        except Exception as e:
            self.release_file_obj(file)
            self.__teePrintOrNot(f"Failed to write at mapToFile() to {self._fileName}: {e}", 'error')
            import traceback
            self.__teePrintOrNot(traceback.format_exc(), 'error')
            self.deSynced = True
            self.__teePrintOrNot("Trying failback hardMapToFile()")
            self.hardMapToFile()
        self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
        self.monitor_external_changes = mec
        return self
1652
+
1653
+ def checkExternalChanges(self):
1654
+ if self.deSynced:
1655
+ return self
1656
+ if not self.monitor_external_changes:
1657
+ return self
1658
+ realExternalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1659
+ if self.externalFileUpdateTime < realExternalFileUpdateTime:
1660
+ self.deSynced = True
1661
+ self.__teePrintOrNot(f"External changes detected in {self._fileName}")
1662
+ elif self.externalFileUpdateTime > realExternalFileUpdateTime:
1663
+ self.__teePrintOrNot(f"Time anomalies detected in {self._fileName}, resetting externalFileUpdateTime")
1664
+ self.externalFileUpdateTime = realExternalFileUpdateTime
1665
+ return self
1666
+
1667
    def _appendWorker(self):
        """Background thread loop: sync external changes, rewrite when needed,
        and flush the append queue every append_check_delay seconds until
        shutdownEvent is set. Performs one final flush after shutdown."""
        while not self.shutdownEvent.is_set():
            if not self.memoryOnly:
                self.checkExternalChanges()
                self.rewrite()
            self.commitAppendToFile()
            time.sleep(self.append_check_delay)
            # self.appendEvent.wait()
            # self.appendEvent.clear()
        if self.verbose:
            self.__teePrintOrNot(f"Append worker for {self._fileName} shut down")
        self.commitAppendToFile()  # drain anything queued after the last cycle
1679
+
1680
+ def commitAppendToFile(self):
1681
+ if self.appendQueue:
1682
+ if self.memoryOnly:
1683
+ self.appendQueue.clear()
1684
+ if self.verbose:
1685
+ self.__teePrintOrNot("Memory only mode. Append queue cleared.")
1686
+ return self
1687
+ try:
1688
+ if self.verbose:
1689
+ self.__teePrintOrNot(f"Commiting {len(self.appendQueue)} records to {self._fileName}")
1690
+ self.__teePrintOrNot(f"Before size of {self._fileName}: {os.path.getsize(self._fileName)}")
1691
+ file = self.get_file_obj('ab')
1692
+ buf = io.BufferedWriter(file, buffer_size=64*1024*1024) # 64MB buffer
1693
+ while self.appendQueue:
1694
+ line = _sanitize(self.appendQueue.popleft(),delimiter=self.delimiter)
1695
+ buf.write(self.delimiter.join(line).encode(encoding=self.encoding,errors='replace')+b'\n')
1696
+ buf.flush()
1697
+ self.release_file_obj(file)
1698
+ if self.verbose:
1699
+ self.__teePrintOrNot(f"Records commited to {self._fileName}")
1700
+ self.__teePrintOrNot(f"After size of {self._fileName}: {os.path.getsize(self._fileName)}")
1701
+ except Exception as e:
1702
+ self.release_file_obj(file)
1703
+ self.__teePrintOrNot(f"Failed to write at commitAppendToFile to {self._fileName}: {e}",'error')
1704
+ import traceback
1705
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1706
+ self.deSynced = True
1707
+ return self
1708
+
1709
    def stopAppendThread(self):
        """Stop the background append worker: perform a final rewrite (when
        rewrite_on_exit), signal shutdown, and join the thread. Idempotent —
        returns immediately if shutdown was already signalled."""
        try:
            if self.shutdownEvent.is_set():
                # if self.verbose:
                #     self.__teePrintOrNot(f"Append thread for {self._fileName} already stopped")
                return
            self.rewrite(force=self.rewrite_on_exit)  # Ensure any final sync operations are performed
            # self.appendEvent.set()
            self.shutdownEvent.set()  # Signal the append thread to shut down
            self.appendThread.join()  # Wait for the append thread to complete
            if self.verbose:
                self.__teePrintOrNot(f"Append thread for {self._fileName} stopped")
        except Exception as e:
            self.__teePrintOrNot(f"Failed to stop append thread for {self._fileName}: {e}", 'error')
            import traceback
            self.__teePrintOrNot(traceback.format_exc(), 'error')
1725
+
1726
+ def get_file_obj(self,modes = 'ab'):
1727
+ self.writeLock.acquire()
1728
+ try:
1729
+ if not self.encoding:
1730
+ self.encoding = 'utf8'
1731
+ file = openFileAsCompressed(self._fileName, mode=modes, encoding=self.encoding,teeLogger=self.teeLogger)
1732
+ # Lock the file after opening
1733
+ if os.name == 'posix':
1734
+ fcntl.lockf(file, fcntl.LOCK_EX)
1735
+ elif os.name == 'nt':
1736
+ # For Windows, locking the entire file, avoiding locking an empty file
1737
+ #lock_length = max(1, os.path.getsize(self._fileName))
1738
+ lock_length = 2147483647
1739
+ msvcrt.locking(file.fileno(), msvcrt.LK_LOCK, lock_length)
1740
+ if self.verbose:
1741
+ self.__teePrintOrNot(f"File {self._fileName} locked with mode {modes}")
1742
+ except Exception as e:
1743
+ try:
1744
+ self.writeLock.release() # Release the thread lock in case of an error
1745
+ except Exception as e:
1746
+ self.__teePrintOrNot(f"Failed to release writeLock for {self._fileName}: {e}",'error')
1747
+ self.__teePrintOrNot(f"Failed to open file {self._fileName}: {e}",'error')
1748
+ return file
1749
+
1750
    def release_file_obj(self, file):
        """Flush, fsync, OS-unlock, close *file*, and release the writeLock.

        Safe to call more than once: returns immediately when the writeLock is
        not held, and the final guard re-checks the lock before releasing.
        Also refreshes the cached external file-update timestamp so our own
        write is not mistaken for an external change.
        """
        # if write lock is already released, return
        if not self.writeLock.locked():
            return
        try:
            file.flush()  # Ensure the file is flushed before unlocking
            os.fsync(file.fileno())  # Ensure the file is synced to disk before unlocking
            if not file.closed:
                if os.name == 'posix':
                    fcntl.lockf(file, fcntl.LOCK_UN)
                elif os.name == 'nt':
                    # Unlocking the entire file; for Windows, ensure not unlocking an empty file
                    # unlock_length = max(1, os.path.getsize(os.path.realpath(file.name)))
                    unlock_length = 2147483647
                    try:
                        msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
                    except Exception:
                        pass
                file.close()  # Ensure file is closed after unlocking
            if self.verbose:
                self.__teePrintOrNot(f"File {file.name} unlocked / released")
        except Exception as e:
            try:
                self.writeLock.release()  # Ensure the thread lock is always released
            except Exception as e:
                self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}", 'error')
            self.__teePrintOrNot(f"Failed to release file {file.name}: {e}", 'error')
            import traceback
            self.__teePrintOrNot(traceback.format_exc(), 'error')
        # release the write lock if not already released
        if self.writeLock.locked():
            try:
                self.writeLock.release()  # Ensure the thread lock is always released
            except Exception as e:
                self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}", 'error')
        self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1786
+
1787
+ class TSVZedLite(MutableMapping):
1788
+ """
1789
+ A mutable mapping class that provides a dictionary-like interface to a Tabular (TSV by default) file.
1790
+ TSVZedLite stores key-value pairs where each row in the file represents an entry, with the first
1791
+ column serving as the key. The class maintains an in-memory index of file positions for efficient
1792
+ random access while keeping the actual data on disk.
1793
+ TSVZedLite is designed for light memory footprint and forgoes some features from TSVZed, Notably,
1794
+ - Does not support simultaneous multi-process access.
1795
+ - Does not support compressed file formats.
1796
+ - Does not support automatic file rewriting on load / exit / periodically.
1797
+ - Does not support append worker thread for background writes.
1798
+ - Does not support external file change monitoring.
1799
+ - Does not support in-place updates; updates are append-only.
1800
+ - Does not support logging via teeLogger.
1801
+ - Does not support move_to_end method.
1802
+ - Does not support in-memory only mode. ( please just use a dict )
1803
+ - Does not lock the file during operations.
1804
+ - Does not track last update times.
1805
+
1806
+ However, it may be preferred in scenarios when:
1807
+ - Memory usage needs to be minimized.
1808
+ - Working with extremely large datasets where loading everything into memory is impractical.
1809
+ - Simplicity and ease of use are prioritized over advanced features.
1810
+ - The dataset is primarily write-only with infrequent reads.
1811
+ - The application can tolerate the lack of concurrency control. (single process access only)
1812
+ - Underlying file system is fast and can do constant time random seek (e.g., SSD).
1813
+
1814
+ Note: It is possible to load a custom dict like object for indexes (like TSVZed or pre-built dict)
1815
+ to avoid reading the entire data file to load the indexes at startup.
1816
+ Index consistency is not enforced in this case.
1817
Will raise an error if a mismatch happens (it only checks that the key exists in the file) and strict mode is enabled.
1818
+ If using an external file-backed Index. This can function similar to a key-value store (like nosql).
1819
+
1820
+ Parameters
1821
+ ----------
1822
+ fileName : str
1823
+ Path to the Tabular file to read from or create.
1824
+ header : str, optional
1825
+ Header row for the file. Can be a delimited string or empty string (default: '').
1826
+ createIfNotExist : bool, optional
1827
+ If True, creates the file if it doesn't exist (default: True).
1828
+ verifyHeader : bool, optional
1829
+ If True, verifies that the file header matches the provided header (default: True).
1830
+ verbose : bool, optional
1831
+ If True, prints detailed operation information to stderr (default: False).
1832
+ encoding : str, optional
1833
+ Character encoding for the file (default: 'utf8').
1834
+ delimiter : str, optional
1835
+ Field delimiter character. If Ellipsis (...), automatically detects from filename (default: ...).
1836
+ defaults : str, list, or None, optional
1837
+ Default values for columns. Can be a delimited string, list, or None (default: None).
1838
+ strict : bool, optional
1839
+ If True, enforces strict column count validation and raises errors on mismatches (default: True).
1840
+ correctColumnNum : int, optional
1841
+ Expected number of columns. -1 means auto-detect (default: -1).
1842
+ indexes : dict, optional
1843
+ Pre-existing index dictionary mapping keys to file positions (default: ...).
1844
+ fileObj : file object, optional
1845
+ Pre-existing file object to use (default: ...).
1846
+ Attributes
1847
+ ----------
1848
+ version : str
1849
+ Version identifier for the TSVZedLite format.
1850
+ indexes : dict
1851
+ Dictionary mapping keys to their file positions (or in-memory data for keys starting with '#').
1852
+ fileObj : file object
1853
+ Binary file object for reading/writing the underlying file.
1854
+ defaults : list
1855
+ List of default values for columns, with DEFAULTS_INDICATOR_KEY as the first element.
1856
+ correctColumnNum : int
1857
+ The validated number of columns per row.
1858
+ Notes
1859
+ -----
1860
+ - Keys starting with '#' are stored in memory only and not written to file.
1861
+ - The special key DEFAULTS_INDICATOR_KEY is used to store and retrieve default column values.
1862
+ - Empty values in rows are automatically filled with defaults if available.
1863
+ - The class implements the MutableMapping interface, providing dict-like operations.
1864
+ - File operations are buffered and written immediately (append-only for updates).
1865
+ - Deleted entries are marked by writing a row with only the key (empty values).
1866
+ Examples
1867
+ --------
1868
+ >>> db = TSVZedLite('data.tsv', header='id\tname\tage')
1869
+ >>> db['user1'] = ['user1', 'Alice', '30']
1870
+ >>> print(db['user1'])
1871
+ ['user1', 'Alice', '30']
1872
+ >>> del db['user1']
1873
+ >>> 'user1' in db
1874
+ False
1875
+ See Also
1876
+ --------
1877
+ collections.abc.MutableMapping : The abstract base class that this class implements.
1878
+ """
1879
+
1880
+ #['__new__', '__repr__', '__hash__', '__lt__', '__le__', '__eq__', '__ne__', '__gt__', '__ge__', '__iter__', '__init__',
1881
+ # '__or__', '__ror__', '__ior__', '__len__', '__getitem__', '__setitem__', '__delitem__', '__contains__', '__sizeof__',
1882
+ # 'get', 'setdefault', 'pop', 'popitem', 'keys', 'items', 'values', 'update', 'fromkeys', 'clear', 'copy', '__reversed__',
1883
+ # '__class_getitem__', '__doc__']
1884
+ def __init__ (self,fileName,header = '',createIfNotExist = True,verifyHeader = True,
1885
+ verbose = False,encoding = 'utf8',
1886
+ delimiter = ...,defaults = None,strict = True,correctColumnNum = -1,
1887
+ indexes = ..., fileObj = ...
1888
+ ):
1889
+ self.version = version
1890
+ self.strict = strict
1891
+ self._fileName = fileName
1892
+ self.delimiter = get_delimiter(delimiter,file_name=fileName)
1893
+ self.setDefaults(defaults)
1894
+ self.header = _formatHeader(header,verbose = verbose,delimiter=self.delimiter)
1895
+ self.correctColumnNum = correctColumnNum
1896
+ self.createIfNotExist = createIfNotExist
1897
+ self.verifyHeader = verifyHeader
1898
+ self.verbose = verbose
1899
+ self.encoding = encoding
1900
+ if indexes is ...:
1901
+ self.indexes = dict()
1902
+ self.load()
1903
+ else:
1904
+ self.indexes = indexes
1905
+ if fileObj is ...:
1906
+ self.fileObj = open(self._fileName,'r+b')
1907
+ else:
1908
+ self.fileObj = fileObj
1909
+ atexit.register(self.close)
1910
+
1911
+ # Implement custom methods just for TSVZedLite
1912
    def getResourceUsage(self, return_dict=False):
        # Thin wrapper over the module-level helper (resource/psutil style stats).
        return get_resource_usage(return_dict=return_dict)
1914
+
1915
    def setDefaults(self, defaults):
        """Normalize *defaults* into a list headed by DEFAULTS_INDICATOR_KEY.

        Accepts a delimited string, a list, or any iterable; anything else —
        or an all-empty sequence — collapses to "no defaults" (just the
        indicator key).
        """
        if not defaults:
            defaults = []
        if isinstance(defaults, str):
            defaults = defaults.split(self.delimiter)
        elif not isinstance(defaults, list):
            try:
                defaults = list(defaults)
            except Exception:
                # NOTE(review): reads self.verbose — assumes it is assigned
                # before this method runs; confirm against __init__ ordering.
                if self.verbose:
                    eprint('Error: Invalid defaults, setting defaults to empty.')
                defaults = []
        defaults = [str(s).rstrip() if s else '' for s in defaults]
        if not any(defaults):
            defaults = []
        if not defaults or defaults[0] != DEFAULTS_INDICATOR_KEY:
            defaults = [DEFAULTS_INDICATOR_KEY] + defaults
        self.defaults = defaults
1933
+
1934
    def load(self):
        """Populate self.indexes from the file (storeOffset=True keeps byte
        offsets per key rather than the row values)."""
        if self.verbose:
            eprint(f"Loading {self._fileName}")
        readTabularFile(self._fileName, header=self.header, createIfNotExist=self.createIfNotExist,
                        verifyHeader=self.verifyHeader, verbose=self.verbose, taskDic=self.indexes,
                        encoding=self.encoding if self.encoding else None, strict=self.strict,
                        delimiter=self.delimiter, defaults=self.defaults, storeOffset=True)
        return self
1942
+
1943
    def positions(self):
        # View of the stored file offsets (or in-memory values for '#'-keys).
        return self.indexes.values()
1945
+
1946
    def reload(self):
        # Drop the current index and rebuild it from the file.
        self.indexes.clear()
        return self.load()
1949
+
1950
    def getListView(self):
        # Delegate to the module-level list-view helper for this mapping.
        return getListView(self, header=self.header, delimiter=self.delimiter)
1952
+
1953
    def clear_file(self):
        """Truncate the backing file to empty, then re-write the header row
        (when one is configured). Returns self for chaining."""
        if self.verbose:
            eprint(f"Clearing {self._fileName}")
        self.fileObj.seek(0)
        self.fileObj.truncate()
        if self.verbose:
            eprint(f"File {self._fileName} cleared empty")
        if self.header:
            location = self.__writeValues(self.header)
            if self.verbose:
                eprint(f"Header {self.header} written to {self._fileName}")
                eprint(f"At {location} size: {self.fileObj.tell()}")
        return self
1966
+
1967
+ def switchFile(self,newFileName,createIfNotExist = ...,verifyHeader = ...):
1968
+ if createIfNotExist is ...:
1969
+ createIfNotExist = self.createIfNotExist
1970
+ if verifyHeader is ...:
1971
+ verifyHeader = self.verifyHeader
1972
+ self.fileObj.close()
1973
+ self._fileName = newFileName
1974
+ self.reload()
1975
+ self.fileObj = open(self._fileName,'r+b')
1976
+ self.createIfNotExist = createIfNotExist
1977
+ self.verifyHeader = verifyHeader
1978
+ return self
1979
+
1980
+ # Private methods for reading and writing values for TSVZedLite
1981
+
1982
+ def __writeValues(self,data):
1983
+ self.fileObj.seek(0, os.SEEK_END)
1984
+ write_at = self.fileObj.tell()
1985
+ if self.verbose:
1986
+ eprint(f"Writing at position {write_at}")
1987
+ data = _sanitize(data,delimiter=self.delimiter)
1988
+ data = self.delimiter.join(data)
1989
+ bytes = self.fileObj.write((data.encode(encoding=self.encoding,errors='replace') + b'\n'))
1990
+ if self.verbose:
1991
+ eprint(f"Wrote {bytes} bytes")
1992
+ return write_at
1993
+
1994
    def __mapDeleteToFile(self, key):
        """Persist a deletion for *key* by appending a key-only tombstone row.

        NOTE(review): callers must invoke this while *key* is still present in
        self.indexes — the not-found guard below otherwise skips the write
        entirely. The tombstone's offset is stored back into the index.
        """
        if key == DEFAULTS_INDICATOR_KEY:
            self.defaults = [DEFAULTS_INDICATOR_KEY]
            if self.verbose:
                eprint("Defaults cleared")
        # delete the key from the dictionary and update the file
        elif key not in self.indexes:
            if self.verbose:
                eprint(f"Key {key} not found")
            return
        elif key.startswith('#'):
            # '#'-prefixed keys live only in memory; nothing to persist.
            if self.verbose:
                eprint(f"Key {key} deleted in memory")
            return
        if self.verbose:
            eprint(f"Appending empty line {key}")
        self.indexes[key] = self.__writeValues([key])
2011
+
2012
    def __readValuesAtPos(self, pos, key=...):
        """Seek to byte offset *pos*, read one line, and parse it into column
        segments (applying defaults and column-count validation).

        When *key* is given, acts as the index-consistency check: warns on a
        key mismatch, and raises KeyError in strict mode.
        """
        self.fileObj.seek(pos)
        line = self.fileObj.readline().decode(self.encoding, errors='replace')
        self.correctColumnNum, segments = _processLine(
            line=line,
            taskDic={},
            correctColumnNum=self.correctColumnNum,
            strict=self.strict,
            delimiter=self.delimiter,
            defaults=self.defaults,
            storeOffset=True,
        )
        if self.verbose:
            eprint(f"Read at position {pos}: {segments}")
        if key is not ... and segments[0] != key:
            eprint(f"Warning: Key mismatch at position {pos}: expected {key}, got {segments[0]}")
            if self.strict:
                eprint("Error: Key mismatch and strict mode enabled. Raising KeyError.")
                raise KeyError(key)
            else:
                eprint("Continuing despite key mismatch due to non-strict mode. Expect errors!")
        return segments
2034
+
2035
+ # Implement basic __getitem__, __setitem__, __delitem__, __iter__, and __len__. needed for MutableMapping
2036
+ def __getitem__(self,key):
2037
+ key = str(key).rstrip()
2038
+ if key not in self.indexes:
2039
+ if key == DEFAULTS_INDICATOR_KEY:
2040
+ return self.defaults
2041
+ raise KeyError(key)
2042
+ pos = self.indexes[key]
2043
+ return self.__readValuesAtPos(pos,key)
2044
+
2045
    def __setitem__(self, key, value):
        """Store *value* (a row) under *key*, appending it to the file.

        Values are normalized (split strings, rstrip fields, key prepended),
        column counts validated against correctColumnNum, and empty fields
        filled from defaults. A single-field value means "delete the key".
        Updates are append-only: the new row is written at EOF and the index
        offset updated; old rows remain in the file until a rewrite.
        """
        key = str(key).rstrip()
        if not key:
            eprint('Error: Key cannot be empty')
            return
        if isinstance(value, str):
            value = value.split(self.delimiter)
        # sanitize the value
        value = [str(s).rstrip() if s else '' for s in value]
        # the first field in value should be the key
        # add it if it is not there
        if not value or value[0] != key:
            value = [key] + value
        # verify the value has the correct number of columns
        if self.correctColumnNum != 1 and len(value) == 1:
            # this means we want to clear / delete the key
            del self[key]
        elif self.correctColumnNum > 0:
            if len(value) != self.correctColumnNum:
                if self.strict:
                    eprint(f"Error: Value {value} does not have the correct number of columns: {self.correctColumnNum}. Refuse adding key...")
                    return
                elif self.verbose:
                    eprint(f"Warning: Value {value} does not have the correct number of columns: {self.correctColumnNum}, correcting...")
                # Non-strict: pad with empties or truncate to fit.
                if len(value) < self.correctColumnNum:
                    value += [''] * (self.correctColumnNum - len(value))
                elif len(value) > self.correctColumnNum:
                    value = value[:self.correctColumnNum]
        else:
            # First real row fixes the expected column count.
            self.correctColumnNum = len(value)
        if self.defaults and len(self.defaults) > 1:
            for i in range(1, len(value)):
                if not value[i] and i < len(self.defaults) and self.defaults[i]:
                    value[i] = self.defaults[i]
                    if self.verbose:
                        eprint(f" Replacing empty value at {i} with default: {self.defaults[i]}")
        if key == DEFAULTS_INDICATOR_KEY:
            self.defaults = value
            if self.verbose:
                eprint(f"Defaults set to {value}")
        elif key.startswith('#'):
            # '#'-prefixed keys are memory-only: the value itself is stored in
            # the index and nothing is written to disk.
            if self.verbose:
                eprint(f"Key {key} updated in memory (data in index) as it starts with #")
            self.indexes[key] = value
            return
        if self.verbose:
            eprint(f"Writing {key}: {value}")
        self.indexes[key] = self.__writeValues(value)
2093
+
2094
+ def __delitem__(self,key):
2095
+ key = str(key).rstrip()
2096
+ self.indexes.pop(key,None)
2097
+ self.__mapDeleteToFile(key)
2098
+
2099
    def __iter__(self):
        # Iterate keys in index (insertion) order.
        return iter(self.indexes)
2101
+
2102
    def __len__(self):
        # Number of live keys (tombstoned keys are removed from the index).
        return len(self.indexes)
2104
+
2105
+ # Implement additional methods for dict like interface (order of function are somewhat from OrderedDict)
2106
    def __reversed__(self):
        # Reverse key iteration, delegated to the index dict.
        return reversed(self.indexes)
2108
+
2109
    def clear(self):
        # clear the dictionary and update the file
        self.indexes.clear()
        self.clear_file()  # truncate on disk and re-write the header
        return self
2114
+
2115
+ def popitem(self, last=True,return_pos = False):
2116
+ if last:
2117
+ key, pos = self.indexes.popitem()
2118
+ else:
2119
+ try:
2120
+ key = next(iter(self.indexes))
2121
+ pos = self.indexes.pop(key)
2122
+ except StopIteration:
2123
+ raise KeyError("popitem(): dictionary is empty")
2124
+ if return_pos:
2125
+ value = pos
2126
+ else:
2127
+ value = self.__readValuesAtPos(pos,key)
2128
+ self.__mapDeleteToFile(key)
2129
+ return key, value
2130
+
2131
+ __marker = object()
2132
+ def pop(self, key, default=__marker, return_pos = False):
2133
+ key = str(key).rstrip()
2134
+ try:
2135
+ pos = self.indexes.pop(key)
2136
+ except KeyError:
2137
+ if default is self.__marker:
2138
+ raise KeyError(key)
2139
+ elif default is ...:
2140
+ return self.defaults
2141
+ return default
2142
+ if return_pos:
2143
+ value = pos
2144
+ else:
2145
+ value = self.__readValuesAtPos(pos,key)
2146
+ self.__mapDeleteToFile(key)
2147
+ return value
2148
+
2149
+ def __sizeof__(self):
2150
+ sizeof = sys.getsizeof
2151
+ size = sizeof(super()) + sizeof(True) * 6 # for the booleans / integers
2152
+ size += sizeof(self._fileName)
2153
+ size += sizeof(self.header)
2154
+ size += sizeof(self.encoding)
2155
+ size += sizeof(self.delimiter)
2156
+ size += sizeof(self.defaults)
2157
+ size += sizeof(self.indexes)
2158
+ size += sizeof(self.fileObj)
2159
+ return size
2160
+
2161
+ def __repr__(self):
2162
+ return f"""TSVZed at {hex(id(self))}(
2163
+ file_name:{self._fileName}
2164
+ index_count:{len(self.indexes)}
2165
+ header:{self.header}
2166
+ correctColumnNum:{self.correctColumnNum}
2167
+ createIfNotExist:{self.createIfNotExist}
2168
+ verifyHeader:{self.verifyHeader}
2169
+ strict:{self.strict}
2170
+ delimiter:{self.delimiter}
2171
+ defaults:{self.defaults}
2172
+ verbose:{self.verbose}
2173
+ encoding:{self.encoding}
2174
+ file_descriptor:{self.fileObj.fileno()}
2175
+ )"""
2176
+
2177
+ def __str__(self):
2178
+ return f"TSVZedLite({self._fileName})"
2179
+
2180
+ def __reduce__(self):
2181
+ 'Return state information for pickling'
2182
+ # Return minimal state needed to reconstruct
2183
+ return (
2184
+ self.__class__,
2185
+ (self._fileName, self.header, self.createIfNotExist, self.verifyHeader,
2186
+ self.verbose, self.encoding, self.delimiter, self.defaults, self.strict,
2187
+ self.correctColumnNum),
2188
+ None,
2189
+ None,
2190
+ None
2191
+ )
2192
+ def copy(self):
2193
+ 'Return a shallow copy of the ordered dictionary.'
2194
+ new = self.__class__(
2195
+ self._fileName,
2196
+ self.header,
2197
+ self.createIfNotExist,
2198
+ self.verifyHeader,
2199
+ self.verbose,
2200
+ self.encoding,
2201
+ self.delimiter,
2202
+ self.defaults,
2203
+ self.strict,
2204
+ self.correctColumnNum,
2205
+ self.indexes,
2206
+ self.fileObj,
2207
+ )
2208
+ eprint("""
2209
+ Warning: Copying TSVZedLite will share the same file object and indexes.
2210
+ Changes in one will affect the other.
2211
+ There is likely very little reason to copy a TSVZedLite instance unless you are immadiately then calling switchFile() on it.
2212
+ """)
2213
+ return new
2214
+
2215
+ @classmethod
2216
+ def fromkeys(cls, iterable, value=None,fileName = None,header = '',createIfNotExist = True,verifyHeader = True,verbose = False,encoding = 'utf8',
2217
+ delimiter = ...,defaults = None,strict = True,correctColumnNum = -1):
2218
+ '''Create a new ordered dictionary with keys from iterable and values set to value.
2219
+ '''
2220
+ self = cls(fileName,header,createIfNotExist,verifyHeader,verbose,encoding,delimiter,defaults,strict,correctColumnNum)
2221
+ for key in iterable:
2222
+ self[key] = value
2223
+ return self
2224
+
2225
+ def __eq__(self, other):
2226
+ if isinstance(other, TSVZedLite):
2227
+ eprint("Warning: Comparing two TSVZedLite instances will only compare their indexes. Data content is not compared.")
2228
+ return self.indexes == other.indexes
2229
+ return super().__eq__(other)
2230
+
2231
+ def __ior__(self, other):
2232
+ self.update(other)
2233
+ return self
2234
+
2235
+ # Implement context manager methods
2236
+ def __enter__(self):
2237
+ return self
2238
+
2239
+ def close(self):
2240
+ self.fileObj.close()
2241
+ return self
2242
+
2243
+ def __exit__(self,exc_type,exc_value,traceback):
2244
+ return self.close()
2245
+
2246
+
1515
2247
 
1516
2248
 
1517
2249
  def __main__():
1518
- import argparse
1519
- parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
1520
- parser.add_argument('filename', type=str, help='The file to read')
1521
- parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear','scrub'], help='The operation to perform. Note: scrub will also remove all comments. Default: read', default='read')
1522
- parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
1523
- parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
1524
- parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
1525
- parser.add_argument('--defaults', type=str, help='Default values to fill in the missing columns. seperate using --delimiter. Ex. if -d = comma, --defaults="key,value1,value2..." Note: Please specify the key. But it will not be used as a key need to be unique in data.')
1526
- strictMode = parser.add_mutually_exclusive_group()
1527
- strictMode.add_argument('-s', '--strict', dest = 'strict',action='store_true', help='Strict mode. Do not parse values that seems malformed, check for column numbers / headers')
1528
- strictMode.add_argument('-f', '--force', dest = 'strict',action='store_false', help='Force the operation. Ignore checks for column numbers / headers')
1529
- parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
1530
- parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} @ {COMMIT_DATE} by {author}')
1531
- args = parser.parse_args()
1532
- args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
1533
- if args.header and args.header.endswith('\\'):
1534
- args.header += '\\'
1535
- try:
1536
- header = args.header.encode().decode('unicode_escape') if args.header else ''
1537
- except Exception:
1538
- print(f"Failed to decode header: {args.header}")
1539
- header = ''
1540
- defaults = []
1541
- if args.defaults:
1542
- try:
1543
- defaults = args.defaults.encode().decode('unicode_escape').split(args.delimiter)
1544
- except Exception:
1545
- print(f"Failed to decode defaults: {args.defaults}")
1546
- defaults = []
1547
-
1548
- if args.operation == 'read':
1549
- # check if the file exist
1550
- if not os.path.isfile(args.filename):
1551
- print(f"File not found: {args.filename}")
1552
- return
1553
- # read the file
1554
- data = readTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
1555
- print(pretty_format_table(data.values(),delimiter=args.delimiter))
1556
- elif args.operation == 'append':
1557
- appendTabularFile(args.filename, args.line,createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
1558
- elif args.operation == 'delete':
1559
- appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
1560
- elif args.operation == 'clear':
1561
- clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=args.strict, delimiter=args.delimiter)
1562
- elif args.operation == 'scrub':
1563
- scrubTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
1564
- else:
1565
- print("Invalid operation")
2250
+ import argparse
2251
+ parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
2252
+ parser.add_argument('filename', type=str, help='The file to read')
2253
+ parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear','scrub'], help='The operation to perform. Note: scrub will also remove all comments. Default: read', default='read')
2254
+ parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
2255
+ parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
2256
+ parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
2257
+ parser.add_argument('--defaults', type=str, help='Default values to fill in the missing columns. seperate using --delimiter. Ex. if -d = comma, --defaults="key,value1,value2..." Note: Please specify the key. But it will not be used as a key need to be unique in data.')
2258
+ strictMode = parser.add_mutually_exclusive_group()
2259
+ strictMode.add_argument('-s', '--strict', dest = 'strict',action='store_true', help='Strict mode. Do not parse values that seems malformed, check for column numbers / headers')
2260
+ strictMode.add_argument('-f', '--force', dest = 'strict',action='store_false', help='Force the operation. Ignore checks for column numbers / headers')
2261
+ parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
2262
+ parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} @ {COMMIT_DATE} by {author}')
2263
+ args = parser.parse_args()
2264
+ args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
2265
+ if args.header and args.header.endswith('\\'):
2266
+ args.header += '\\'
2267
+ try:
2268
+ header = args.header.encode().decode('unicode_escape') if args.header else ''
2269
+ except Exception:
2270
+ print(f"Failed to decode header: {args.header}")
2271
+ header = ''
2272
+ defaults = []
2273
+ if args.defaults:
2274
+ try:
2275
+ defaults = args.defaults.encode().decode('unicode_escape').split(args.delimiter)
2276
+ except Exception:
2277
+ print(f"Failed to decode defaults: {args.defaults}")
2278
+ defaults = []
2279
+
2280
+ if args.operation == 'read':
2281
+ # check if the file exist
2282
+ if not os.path.isfile(args.filename):
2283
+ print(f"File not found: {args.filename}")
2284
+ return
2285
+ # read the file
2286
+ data = readTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
2287
+ print(pretty_format_table(data.values(),delimiter=args.delimiter))
2288
+ elif args.operation == 'append':
2289
+ appendTabularFile(args.filename, args.line,createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
2290
+ elif args.operation == 'delete':
2291
+ appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
2292
+ elif args.operation == 'clear':
2293
+ clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=args.strict, delimiter=args.delimiter)
2294
+ elif args.operation == 'scrub':
2295
+ scrubTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
2296
+ else:
2297
+ print("Invalid operation")
1566
2298
  if __name__ == '__main__':
1567
- __main__()
2299
+ __main__()