TSVZ-3.30-py3-none-any.whl → TSVZ-3.36-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
TSVZ.py CHANGED
@@ -4,28 +4,32 @@
4
4
  # dependencies = [
5
5
  # ]
6
6
  # ///
7
- import os , sys
8
- from collections import OrderedDict , deque
9
- import time
10
7
  import atexit
11
- import threading
8
+ import functools
9
+ import io
10
+ import os
12
11
  import re
13
-
12
+ from tabnanny import verbose
13
+ import threading
14
+ import time
15
+ import sys
16
+ from collections import OrderedDict, deque
17
+ from collections.abc import MutableMapping
14
18
  RESOURCE_LIB_AVAILABLE = True
15
19
  try:
16
- import resource
17
- except:
18
- RESOURCE_LIB_AVAILABLE = False
20
+ import resource
21
+ except ImportError:
22
+ RESOURCE_LIB_AVAILABLE = False
19
23
 
20
24
  if os.name == 'nt':
21
- import msvcrt
25
+ import msvcrt
22
26
  elif os.name == 'posix':
23
- import fcntl
27
+ import fcntl
24
28
 
25
- version = '3.30'
29
+ version = '3.36'
26
30
  __version__ = version
27
31
  author = 'pan@zopyr.us'
28
- COMMIT_DATE = '2025-09-15'
32
+ COMMIT_DATE = '2026-02-02'
29
33
 
30
34
  DEFAULT_DELIMITER = '\t'
31
35
  DEFAULTS_INDICATOR_KEY = '#_defaults_#'
@@ -33,137 +37,216 @@ DEFAULTS_INDICATOR_KEY = '#_defaults_#'
33
37
  COMPRESSED_FILE_EXTENSIONS = ['gz','gzip','bz2','bzip2','xz','lzma']
34
38
 
35
39
  def get_delimiter(delimiter,file_name = ''):
36
- global DEFAULT_DELIMITER
37
- if not delimiter:
38
- return DEFAULT_DELIMITER
39
- elif delimiter == ...:
40
- if not file_name:
41
- rtn = '\t'
42
- elif file_name.endswith('.csv'):
43
- rtn = ','
44
- elif file_name.endswith('.nsv'):
45
- rtn = '\0'
46
- elif file_name.endswith('.psv'):
47
- rtn = '|'
48
- else:
49
- rtn = '\t'
50
- elif delimiter == 'comma':
51
- rtn = ','
52
- elif delimiter == 'tab':
53
- rtn = '\t'
54
- elif delimiter == 'pipe':
55
- rtn = '|'
56
- elif delimiter == 'null':
57
- rtn = '\0'
58
- else:
59
- rtn = delimiter.encode().decode('unicode_escape')
60
- DEFAULT_DELIMITER = rtn
61
- return rtn
40
+ global DEFAULT_DELIMITER
41
+ if not delimiter:
42
+ return DEFAULT_DELIMITER
43
+ elif delimiter == ...:
44
+ if not file_name:
45
+ rtn = '\t'
46
+ elif file_name.endswith('.csv'):
47
+ rtn = ','
48
+ elif file_name.endswith('.nsv'):
49
+ rtn = '\0'
50
+ elif file_name.endswith('.psv'):
51
+ rtn = '|'
52
+ else:
53
+ rtn = '\t'
54
+ elif delimiter == 'comma':
55
+ rtn = ','
56
+ elif delimiter == 'tab':
57
+ rtn = '\t'
58
+ elif delimiter == 'pipe':
59
+ rtn = '|'
60
+ elif delimiter == 'null':
61
+ rtn = '\0'
62
+ else:
63
+ rtn = delimiter.encode().decode('unicode_escape')
64
+ DEFAULT_DELIMITER = rtn
65
+ return rtn
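
Note: a minimal usage sketch for the delimiter inference shown above, assuming the module is importable as TSVZ; the file names are illustrative only.

    from TSVZ import get_delimiter

    get_delimiter(..., file_name='report.csv')   # '.csv' resolves to ','
    get_delimiter(..., file_name='report.psv')   # '.psv' resolves to '|'
    get_delimiter('pipe')                         # named delimiters are accepted, here '|'
    get_delimiter(None)                           # falls back to the module-wide DEFAULT_DELIMITER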
66
+
67
+ def eprint(*args, **kwargs):
68
+ try:
69
+ if 'file' in kwargs:
70
+ print(*args, **kwargs)
71
+ else:
72
+ print(*args, file=sys.stderr, **kwargs)
73
+ except Exception as e:
74
+ print(f"Error: Cannot print to stderr: {e}")
75
+ print(*args, **kwargs)
62
76
 
63
77
  def openFileAsCompressed(fileName,mode = 'rb',encoding = 'utf8',teeLogger = None,compressLevel = 1):
64
- if 'b' not in mode:
65
- mode += 't'
66
- kwargs = {}
67
- if 'r' not in mode:
68
- if fileName.endswith('.xz'):
69
- kwargs['preset'] = compressLevel
70
- else:
71
- kwargs['compresslevel'] = compressLevel
72
- if 'b' not in mode:
73
- kwargs['encoding'] = encoding
74
- if fileName.endswith('.xz') or fileName.endswith('.lzma'):
75
- try:
76
- import lzma
77
- return lzma.open(fileName, mode, **kwargs)
78
- except:
79
- __teePrintOrNot(f"Failed to open {fileName} with lzma, trying bin",teeLogger=teeLogger)
80
- elif fileName.endswith('.gz') or fileName.endswith('.gzip'):
81
- try:
82
- import gzip
83
- return gzip.open(fileName, mode, **kwargs)
84
- except:
85
- __teePrintOrNot(f"Failed to open {fileName} with gzip, trying bin",teeLogger=teeLogger)
86
- elif fileName.endswith('.bz2') or fileName.endswith('.bzip2'):
87
- try:
88
- import bz2
89
- return bz2.open(fileName, mode, **kwargs)
90
- except:
91
- __teePrintOrNot(f"Failed to open {fileName} with bz2, trying bin",teeLogger=teeLogger)
92
- if 't' in mode:
93
- mode = mode.replace('t','')
94
- return open(fileName, mode, encoding=encoding)
95
- if 'b' not in mode:
96
- mode += 'b'
97
- return open(fileName, mode)
98
-
99
-
100
- def pretty_format_table(data, delimiter = DEFAULT_DELIMITER,header = None):
101
- version = 1.11
102
- _ = version
103
- if not data:
104
- return ''
105
- if isinstance(data, str):
106
- data = data.strip('\n').split('\n')
107
- data = [line.split(delimiter) for line in data]
108
- elif isinstance(data, dict):
109
- # flatten the 2D dict to a list of lists
110
- if isinstance(next(iter(data.values())), dict):
111
- tempData = [['key'] + list(next(iter(data.values())).keys())]
112
- tempData.extend( [[key] + list(value.values()) for key, value in data.items()])
113
- data = tempData
114
- else:
115
- # it is a dict of lists
116
- data = [[key] + list(value) for key, value in data.items()]
117
- elif not isinstance(data,list):
118
- data = list(data)
119
- # format the list into 2d list of list of strings
120
- if isinstance(data[0], dict):
121
- tempData = [data[0].keys()]
122
- tempData.extend([list(item.values()) for item in data])
123
- data = tempData
124
- data = [[str(item) for item in row] for row in data]
125
- num_cols = len(data[0])
126
- col_widths = [0] * num_cols
127
- # Calculate the maximum width of each column
128
- for c in range(num_cols):
129
- #col_widths[c] = max(len(row[c]) for row in data)
130
- # handle ansii escape sequences
131
- col_widths[c] = max(len(re.sub(r'\x1b\[[0-?]*[ -/]*[@-~]','',row[c])) for row in data)
132
- if header:
133
- header_widths = [len(re.sub(r'\x1b\[[0-?]*[ -/]*[@-~]', '', col)) for col in header]
134
- col_widths = [max(col_widths[i], header_widths[i]) for i in range(num_cols)]
135
- # Build the row format string
136
- row_format = ' | '.join('{{:<{}}}'.format(width) for width in col_widths)
137
- # Print the header
138
- if not header:
139
- header = data[0]
140
- outTable = []
141
- outTable.append(row_format.format(*header))
142
- outTable.append('-+-'.join('-' * width for width in col_widths))
143
- for row in data[1:]:
144
- # if the row is empty, print an divider
145
- if not any(row):
146
- outTable.append('-+-'.join('-' * width for width in col_widths))
147
- else:
148
- outTable.append(row_format.format(*row))
149
- else:
150
- # pad / truncate header to appropriate length
151
- if isinstance(header,str):
152
- header = header.split(delimiter)
153
- if len(header) < num_cols:
154
- header += ['']*(num_cols-len(header))
155
- elif len(header) > num_cols:
156
- header = header[:num_cols]
157
- outTable = []
158
- outTable.append(row_format.format(*header))
159
- outTable.append('-+-'.join('-' * width for width in col_widths))
160
- for row in data:
161
- # if the row is empty, print an divider
162
- if not any(row):
163
- outTable.append('-+-'.join('-' * width for width in col_widths))
164
- else:
165
- outTable.append(row_format.format(*row))
166
- return '\n'.join(outTable) + '\n'
78
+ if 'b' not in mode:
79
+ mode += 't'
80
+ kwargs = {}
81
+ if 'r' not in mode:
82
+ if fileName.endswith('.xz'):
83
+ kwargs['preset'] = compressLevel
84
+ else:
85
+ kwargs['compresslevel'] = compressLevel
86
+ if 'b' not in mode:
87
+ kwargs['encoding'] = encoding
88
+ if fileName.endswith('.xz') or fileName.endswith('.lzma'):
89
+ try:
90
+ import lzma
91
+ return lzma.open(fileName, mode, **kwargs)
92
+ except Exception:
93
+ __teePrintOrNot(f"Failed to open {fileName} with lzma, trying bin",teeLogger=teeLogger)
94
+ elif fileName.endswith('.gz') or fileName.endswith('.gzip'):
95
+ try:
96
+ import gzip
97
+ return gzip.open(fileName, mode, **kwargs)
98
+ except Exception:
99
+ __teePrintOrNot(f"Failed to open {fileName} with gzip, trying bin",teeLogger=teeLogger)
100
+ elif fileName.endswith('.bz2') or fileName.endswith('.bzip2'):
101
+ try:
102
+ import bz2
103
+ return bz2.open(fileName, mode, **kwargs)
104
+ except Exception:
105
+ __teePrintOrNot(f"Failed to open {fileName} with bz2, trying bin",teeLogger=teeLogger)
106
+ if 't' in mode:
107
+ mode = mode.replace('t','')
108
+ return open(fileName, mode, encoding=encoding)
109
+ if 'b' not in mode:
110
+ mode += 'b'
111
+ return open(fileName, mode)
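
Note: a brief sketch of opening a compressed tabular file with the helper above, assuming the module imports as TSVZ; the file name is illustrative.

    from TSVZ import openFileAsCompressed

    # 'r' is promoted to text mode internally; gzip is picked from the '.gz' suffix,
    # and the call falls back to a plain open() if the compression module is unavailable.
    with openFileAsCompressed('tasks.tsv.gz', mode='r', encoding='utf8') as f:
        first_line = f.readline()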
112
+
113
+ def get_terminal_size():
114
+ '''
115
+ Get the terminal size
116
+
117
+ @params:
118
+ None
119
+
120
+ @returns:
121
+ (int,int): the number of columns and rows of the terminal
122
+ '''
123
+ try:
124
+ import os
125
+ _tsize = os.get_terminal_size()
126
+ except Exception:
127
+ try:
128
+ import fcntl
129
+ import struct
130
+ import termios
131
+ packed = fcntl.ioctl(0, termios.TIOCGWINSZ, struct.pack('HHHH', 0, 0, 0, 0))
132
+ _tsize = struct.unpack('HHHH', packed)[:2]
133
+ except Exception:
134
+ import shutil
135
+ _tsize = shutil.get_terminal_size(fallback=(240, 50))
136
+ return _tsize
137
+
138
+ def pretty_format_table(data, delimiter="\t", header=None, full=False):
139
+ version = 1.12
140
+ _ = version
141
+ def visible_len(s):
142
+ return len(re.sub(r"\x1b\[[0-?]*[ -/]*[@-~]", "", s))
143
+ def table_width(col_widths, sep_len):
144
+ # total width = sum of column widths + separators between columns
145
+ return sum(col_widths) + sep_len * (len(col_widths) - 1)
146
+ def truncate_to_width(s, width):
147
+ # If fits, leave as is. If too long and width >= 1, keep width-1 chars + "."
148
+ # If width == 0, nothing fits; return empty string.
149
+ if visible_len(s) <= width:
150
+ return s
151
+ if width <= 0:
152
+ return ""
153
+ # Build a truncated plain string based on visible chars (no ANSI awareness for slicing)
154
+ # For simplicity, slice the raw string. This may cut ANSI; best to avoid ANSI in data if truncation occurs.
155
+ return s[: max(width - 2, 0)] + ".."
156
+ if not data:
157
+ return ""
158
+ # Normalize input data structure
159
+ if isinstance(data, str):
160
+ data = data.strip("\n").split("\n")
161
+ data = [line.split(delimiter) for line in data]
162
+ elif isinstance(data, dict):
163
+ if isinstance(next(iter(data.values())), dict):
164
+ tempData = [["key"] + list(next(iter(data.values())).keys())]
165
+ tempData.extend([[key] + list(value.values()) for key, value in data.items()])
166
+ data = tempData
167
+ else:
168
+ data = [[key] + list(value) for key, value in data.items()]
169
+ elif not isinstance(data, list):
170
+ data = list(data)
171
+ if isinstance(data[0], dict):
172
+ tempData = [list(data[0].keys())]
173
+ tempData.extend([list(item.values()) for item in data])
174
+ data = tempData
175
+ data = [[str(item) for item in row] for row in data]
176
+ num_cols = len(data[0])
177
+ # Resolve header and rows
178
+ using_provided_header = header is not None
179
+ if not using_provided_header:
180
+ header = data[0]
181
+ rows = data[1:]
182
+ else:
183
+ if isinstance(header, str):
184
+ header = header.split(delimiter)
185
+ # Pad/trim header to match num_cols
186
+ if len(header) < num_cols:
187
+ header = header + [""] * (num_cols - len(header))
188
+ elif len(header) > num_cols:
189
+ header = header[:num_cols]
190
+ rows = data
191
+ # Compute initial column widths based on data and header
192
+ def compute_col_widths(hdr, rows_):
193
+ col_w = [0] * len(hdr)
194
+ for i in range(len(hdr)):
195
+ col_w[i] = max(0, visible_len(hdr[i]), *(visible_len(r[i]) for r in rows_ if i < len(r)))
196
+ return col_w
197
+ # Ensure all rows have the same number of columns
198
+ normalized_rows = []
199
+ for r in rows:
200
+ if len(r) < num_cols:
201
+ r = r + [""] * (num_cols - len(r))
202
+ elif len(r) > num_cols:
203
+ r = r[:num_cols]
204
+ normalized_rows.append(r)
205
+ rows = normalized_rows
206
+ col_widths = compute_col_widths(header, rows)
207
+ # If full=True, keep existing formatting
208
+ # Else try to fit within the terminal width by:
209
+ # 1) Switching to compressed separators if needed
210
+ # 2) Recursively compressing columns (truncating with ".")
211
+ sep = " | "
212
+ hsep = "-+-"
213
+ cols = get_terminal_size()[0]
214
+ def render(hdr, rows, col_w, sep_str, hsep_str):
215
+ row_fmt = sep_str.join("{{:<{}}}".format(w) for w in col_w)
216
+ out = []
217
+ out.append(row_fmt.format(*hdr))
218
+ out.append(hsep_str.join("-" * w for w in col_w))
219
+ for row in rows:
220
+ if not any(row):
221
+ out.append(hsep_str.join("-" * w for w in col_w))
222
+ else:
223
+ row = [truncate_to_width(row[i], col_w[i]) for i in range(len(row))]
224
+ out.append(row_fmt.format(*row))
225
+ return "\n".join(out) + "\n"
226
+ if full:
227
+ return render(header, rows, col_widths, sep, hsep)
228
+ # Try default separators first
229
+ if table_width(col_widths, len(sep)) <= cols:
230
+ return render(header, rows, col_widths, sep, hsep)
231
+ # Use compressed separators (no spaces)
232
+ sep = "|"
233
+ hsep = "+"
234
+ if table_width(col_widths, len(sep)) <= cols:
235
+ return render(header, rows, col_widths, sep, hsep)
236
+ # Begin column compression
237
+ # Track which columns have been compressed already to header width
238
+ header_widths = [visible_len(h) for h in header]
239
+ width_diff = [max(col_widths[i] - header_widths[i],0) for i in range(num_cols)]
240
+ total_overflow_width = table_width(col_widths, len(sep)) - cols
241
+ for i, diff in sorted(enumerate(width_diff), key=lambda x: -x[1]):
242
+ if total_overflow_width <= 0:
243
+ break
244
+ if diff <= 0:
245
+ continue
246
+ reduce_by = min(diff, total_overflow_width)
247
+ col_widths[i] -= reduce_by
248
+ total_overflow_width -= reduce_by
249
+ return render(header, rows, col_widths, sep, hsep)
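
Note: a small sketch of the reworked pretty_format_table, assuming the module imports as TSVZ; the rows are illustrative. By default the table is shrunk to fit the terminal width (over-wide cells are truncated with '..'); full=True keeps the original column widths.

    from TSVZ import pretty_format_table

    rows = [
        ['name', 'size'],
        ['alpha', '1.2 KB'],
        ['beta', '340 B'],
    ]
    print(pretty_format_table(rows))             # auto-fit to the terminal width
    print(pretty_format_table(rows, full=True))  # keep full widths regardless of terminal size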
167
250
 
168
251
  def format_bytes(size, use_1024_bytes=None, to_int=False, to_str=False,str_format='.2f'):
169
252
  """
@@ -231,14 +314,14 @@ def format_bytes(size, use_1024_bytes=None, to_int=False, to_str=False,str_forma
231
314
  else:
232
315
  try:
233
316
  return int(size)
234
- except Exception as e:
317
+ except Exception:
235
318
  return 0
236
319
  elif to_str or isinstance(size, int) or isinstance(size, float):
237
320
  if isinstance(size, str):
238
321
  try:
239
322
  size = size.rstrip('B').rstrip('b')
240
323
  size = float(size.lower().strip())
241
- except Exception as e:
324
+ except Exception:
242
325
  return size
243
326
  # size is in bytes
244
327
  if use_1024_bytes or use_1024_bytes is None:
@@ -268,932 +351,1080 @@ def format_bytes(size, use_1024_bytes=None, to_int=False, to_str=False,str_forma
268
351
  return 0
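
Note: a hedged usage sketch for format_bytes (most of its body is unchanged context and not shown in this hunk), assuming the module imports as TSVZ; exact output depends on the str_format parameter.

    from TSVZ import format_bytes

    format_bytes(1536)                         # human-readable string, 1024-based units by default
    format_bytes(1536, use_1024_bytes=False)   # decimal (1000-based) units instead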
269
352
 
270
353
  def get_resource_usage(return_dict = False):
271
- try:
272
- if RESOURCE_LIB_AVAILABLE:
273
- rawResource = resource.getrusage(resource.RUSAGE_SELF)
274
- resourceDict = {}
275
- resourceDict['user mode time'] = f'{rawResource.ru_utime} seconds'
276
- resourceDict['system mode time'] = f'{rawResource.ru_stime} seconds'
277
- resourceDict['max resident set size'] = f'{format_bytes(rawResource.ru_maxrss * 1024)}B'
278
- resourceDict['shared memory size'] = f'{format_bytes(rawResource.ru_ixrss * 1024)}B'
279
- resourceDict['unshared memory size'] = f'{format_bytes(rawResource.ru_idrss * 1024)}B'
280
- resourceDict['unshared stack size'] = f'{format_bytes(rawResource.ru_isrss * 1024)}B'
281
- resourceDict['cached page hits'] = f'{rawResource.ru_minflt}'
282
- resourceDict['missed page hits'] = f'{rawResource.ru_majflt}'
283
- resourceDict['swapped out page count'] = f'{rawResource.ru_nswap}'
284
- resourceDict['block input operations'] = f'{rawResource.ru_inblock}'
285
- resourceDict['block output operations'] = f'{rawResource.ru_oublock}'
286
- resourceDict['IPC messages sent'] = f'{rawResource.ru_msgsnd}'
287
- resourceDict['IPC messages received'] = f'{rawResource.ru_msgrcv}'
288
- resourceDict['signals received'] = f'{rawResource.ru_nsignals}'
289
- resourceDict['voluntary context sw'] = f'{rawResource.ru_nvcsw}'
290
- resourceDict['involuntary context sw'] = f'{rawResource.ru_nivcsw}'
291
- if return_dict:
292
- return resourceDict
293
- return '\n'.join(['\t'.join(line) for line in resourceDict.items()])
294
- except Exception as e:
295
- print(f"Error: {e}")
296
- if return_dict:
297
- return {}
298
- return ''
354
+ try:
355
+ if RESOURCE_LIB_AVAILABLE:
356
+ rawResource = resource.getrusage(resource.RUSAGE_SELF)
357
+ resourceDict = {}
358
+ resourceDict['user mode time'] = f'{rawResource.ru_utime} seconds'
359
+ resourceDict['system mode time'] = f'{rawResource.ru_stime} seconds'
360
+ resourceDict['max resident set size'] = f'{format_bytes(rawResource.ru_maxrss * 1024)}B'
361
+ resourceDict['shared memory size'] = f'{format_bytes(rawResource.ru_ixrss * 1024)}B'
362
+ resourceDict['unshared memory size'] = f'{format_bytes(rawResource.ru_idrss * 1024)}B'
363
+ resourceDict['unshared stack size'] = f'{format_bytes(rawResource.ru_isrss * 1024)}B'
364
+ resourceDict['cached page hits'] = f'{rawResource.ru_minflt}'
365
+ resourceDict['missed page hits'] = f'{rawResource.ru_majflt}'
366
+ resourceDict['swapped out page count'] = f'{rawResource.ru_nswap}'
367
+ resourceDict['block input operations'] = f'{rawResource.ru_inblock}'
368
+ resourceDict['block output operations'] = f'{rawResource.ru_oublock}'
369
+ resourceDict['IPC messages sent'] = f'{rawResource.ru_msgsnd}'
370
+ resourceDict['IPC messages received'] = f'{rawResource.ru_msgrcv}'
371
+ resourceDict['signals received'] = f'{rawResource.ru_nsignals}'
372
+ resourceDict['voluntary context sw'] = f'{rawResource.ru_nvcsw}'
373
+ resourceDict['involuntary context sw'] = f'{rawResource.ru_nivcsw}'
374
+ if return_dict:
375
+ return resourceDict
376
+ return '\n'.join(['\t'.join(line) for line in resourceDict.items()])
377
+ except Exception as e:
378
+ print(f"Error: {e}")
379
+ if return_dict:
380
+ return {}
381
+ return ''
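
Note: a quick sketch of get_resource_usage, assuming the module imports as TSVZ. On platforms without the resource module it degrades to an empty result.

    from TSVZ import get_resource_usage

    print(get_resource_usage())                   # tab-separated name/value lines
    usage = get_resource_usage(return_dict=True)  # the same data as a dict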
299
382
 
300
383
  def __teePrintOrNot(message,level = 'info',teeLogger = None):
301
- """
302
- Prints the given message or logs it using the provided teeLogger.
303
-
304
- Parameters:
305
- message (str): The message to be printed or logged.
306
- level (str, optional): The log level. Defaults to 'info'.
307
- teeLogger (object, optional): The logger object used for logging. Defaults to None.
308
-
309
- Returns:
310
- None
311
- """
312
- try:
313
- if teeLogger:
314
- try:
315
- teeLogger.teelog(message,level,callerStackDepth=3)
316
- except:
317
- teeLogger.teelog(message,level)
318
- else:
319
- print(message,flush=True)
320
- except Exception:
321
- print(message,flush=True)
322
-
323
- def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True,delimiter = DEFAULT_DELIMITER,defaults = ...):
324
- """
325
- Process a line of text and update the task dictionary.
326
-
327
- Parameters:
328
- line (str): The line of text to process.
329
- taskDic (dict): The dictionary to update with the processed line.
330
- correctColumnNum (int): The expected number of columns in the line.
331
- verbose (bool, optional): Whether to print verbose output. Defaults to False.
332
- teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
333
- strict (bool, optional): Whether to strictly enforce the correct number of columns. Defaults to True.
334
- defaults (list, optional): The default values to use for missing columns. Defaults to [].
335
-
336
- Returns:
337
- tuple: A tuple containing the updated correctColumnNum and the processed lineCache.
338
-
339
- """
340
- if defaults is ...:
341
- defaults = []
342
- line = line.strip(' ').strip('\x00').rstrip('\r\n')
343
- # we throw away the lines that start with '#'
344
- if not line :
345
- if verbose:
346
- __teePrintOrNot(f"Ignoring empty line: {line}",teeLogger=teeLogger)
347
- return correctColumnNum , []
348
- if line.startswith('#') and not line.startswith(DEFAULTS_INDICATOR_KEY):
349
- if verbose:
350
- __teePrintOrNot(f"Ignoring comment line: {line}",teeLogger=teeLogger)
351
- return correctColumnNum , []
352
- # we only interested in the lines that have the correct number of columns
353
- lineCache = [segment.rstrip() for segment in line.split(delimiter)]
354
- if not lineCache:
355
- return correctColumnNum , []
356
- if correctColumnNum == -1:
357
- if defaults and len(defaults) > 1:
358
- correctColumnNum = len(defaults)
359
- else:
360
- correctColumnNum = len(lineCache)
361
- if verbose:
362
- __teePrintOrNot(f"detected correctColumnNum: {len(lineCache)}",teeLogger=teeLogger)
363
- if not lineCache[0]:
364
- if verbose:
365
- __teePrintOrNot(f"Ignoring line with empty key: {line}",teeLogger=teeLogger)
366
- return correctColumnNum , []
367
- if len(lineCache) == 1 or not any(lineCache[1:]):
368
- if correctColumnNum == 1:
369
- taskDic[lineCache[0]] = lineCache
370
- elif lineCache[0] == DEFAULTS_INDICATOR_KEY:
371
- if verbose:
372
- __teePrintOrNot(f"Empty defaults line found: {line}",teeLogger=teeLogger)
373
- defaults.clear()
374
- else:
375
- if verbose:
376
- __teePrintOrNot(f"Key {lineCache[0]} found with empty value, deleting such key's representaion",teeLogger=teeLogger)
377
- if lineCache[0] in taskDic:
378
- del taskDic[lineCache[0]]
379
- return correctColumnNum , []
380
- elif len(lineCache) != correctColumnNum:
381
- if strict and not any(defaults):
382
- if verbose:
383
- __teePrintOrNot(f"Ignoring line with {len(lineCache)} columns: {line}",teeLogger=teeLogger)
384
- return correctColumnNum , []
385
- else:
386
- # fill / cut the line with empty entries til the correct number of columns
387
- if len(lineCache) < correctColumnNum:
388
- lineCache += ['']*(correctColumnNum-len(lineCache))
389
- elif len(lineCache) > correctColumnNum:
390
- lineCache = lineCache[:correctColumnNum]
391
- if verbose:
392
- __teePrintOrNot(f"Correcting {lineCache[0]}",teeLogger=teeLogger)
393
- # now replace empty values with defaults
394
- if defaults and len(defaults) > 1:
395
- for i in range(1,len(lineCache)):
396
- if not lineCache[i] and i < len(defaults) and defaults[i]:
397
- lineCache[i] = defaults[i]
398
- if verbose:
399
- __teePrintOrNot(f"Replacing empty value at {i} with default: {defaults[i]}",teeLogger=teeLogger)
400
- if lineCache[0] == DEFAULTS_INDICATOR_KEY:
401
- if verbose:
402
- __teePrintOrNot(f"Defaults line found: {line}",teeLogger=teeLogger)
403
- defaults[:] = lineCache
404
- return correctColumnNum , []
405
- taskDic[lineCache[0]] = lineCache
406
- if verbose:
407
- __teePrintOrNot(f"Key {lineCache[0]} added",teeLogger=teeLogger)
408
- return correctColumnNum, lineCache
409
-
410
- def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,encoding = 'utf8',delimiter = ...,defaults = ...):
411
- """
412
- Reads the last valid line from a file.
413
-
414
- Args:
415
- fileName (str): The name of the file to read.
416
- taskDic (dict): A dictionary to pass to processLine function.
417
- correctColumnNum (int): A column number to pass to processLine function.
418
- verbose (bool, optional): Whether to print verbose output. Defaults to False.
419
- teeLogger (optional): Logger to use for tee print. Defaults to None.
420
- encoding (str, optional): The encoding of the file. Defaults to None.
421
- strict (bool, optional): Whether to enforce strict processing. Defaults to False.
422
- delimiter (str, optional): The delimiter used in the file. Defaults to None.
423
- defaults (list, optional): The default values to use for missing columns. Defaults to [].
424
-
425
- Returns:
426
- list: The last valid line data processed by processLine, or an empty list if none found.
427
- """
428
- chunk_size = 1024 # Read in chunks of 1024 bytes
429
- last_valid_line = []
430
- if defaults is ...:
431
- defaults = []
432
- delimiter = get_delimiter(delimiter,file_name=fileName)
433
- if verbose:
434
- __teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
435
- with openFileAsCompressed(fileName, 'rb',encoding=encoding, teeLogger=teeLogger) as file:
436
- file.seek(0, os.SEEK_END)
437
- file_size = file.tell()
438
- buffer = b''
439
- position = file_size
440
-
441
- while position > 0:
442
- # Read chunks from the end of the file
443
- read_size = min(chunk_size, position)
444
- position -= read_size
445
- file.seek(position)
446
- chunk = file.read(read_size)
447
-
448
- # Prepend new chunk to buffer
449
- buffer = chunk + buffer
450
-
451
- # Split the buffer into lines
452
- lines = buffer.split(b'\n')
453
-
454
- # Process lines from the last to the first
455
- for i in range(len(lines) - 1, -1, -1):
456
- if lines[i].strip(): # Skip empty lines
457
- # Process the line
458
- correctColumnNum, lineCache = _processLine(
459
- line=lines[i].decode(encoding=encoding,errors='replace'),
460
- taskDic=taskDic,
461
- correctColumnNum=correctColumnNum,
462
- verbose=verbose,
463
- teeLogger=teeLogger,
464
- strict=strict,
465
- delimiter=delimiter,
466
- defaults=defaults,
467
- )
468
- # If the line is valid, return it
469
- if lineCache and any(lineCache):
470
- return lineCache
471
-
472
- # Keep the last (possibly incomplete) line in buffer for the next read
473
- buffer = lines[0]
474
-
475
- # Return empty list if no valid line found
476
- return last_valid_line
384
+ """
385
+ Prints the given message or logs it using the provided teeLogger.
386
+
387
+ Parameters:
388
+ message (str): The message to be printed or logged.
389
+ level (str, optional): The log level. Defaults to 'info'.
390
+ teeLogger (object, optional): The logger object used for logging. Defaults to None.
391
+
392
+ Returns:
393
+ None
394
+ """
395
+ try:
396
+ if teeLogger:
397
+ try:
398
+ teeLogger.teelog(message,level,callerStackDepth=3)
399
+ except Exception:
400
+ teeLogger.teelog(message,level)
401
+ else:
402
+ print(message,flush=True)
403
+ except Exception:
404
+ print(message,flush=True)
405
+
406
+ def _processLine(line,taskDic,correctColumnNum,strict = True,delimiter = DEFAULT_DELIMITER,defaults = ...,
407
+ storeOffset = False, offset = -1):
408
+ """
409
+ Process a line of text and update the task dictionary.
410
+
411
+ Parameters:
412
+ line (str): The line of text to process.
413
+ taskDic (dict): The dictionary to update with the processed line.
414
+ correctColumnNum (int): The expected number of columns in the line.
415
+ strict (bool, optional): Whether to strictly enforce the correct number of columns. Defaults to True.
416
+ defaults (list, optional): The default values to use for missing columns. Defaults to [].
417
+ storeOffset (bool, optional): Whether to store the offset of the line in the taskDic. Defaults to False.
418
+ offset (int, optional): The offset of the line in the file. Defaults to -1.
419
+
420
+ Returns:
421
+ tuple: A tuple containing the updated correctColumnNum and the processed lineCache or offset.
422
+
423
+ """
424
+ if defaults is ...:
425
+ defaults = []
426
+ line = line.strip('\x00').rstrip('\r\n')
427
+ if not line or (line.startswith('#') and not line.startswith(DEFAULTS_INDICATOR_KEY)):
428
+ # if verbose:
429
+ # __teePrintOrNot(f"Ignoring comment line: {line}",teeLogger=teeLogger)
430
+ return correctColumnNum , []
431
+ # we only interested in the lines that have the correct number of columns
432
+ lineCache = _unsanitize(line.split(delimiter),delimiter)
433
+ if not lineCache or not lineCache[0]:
434
+ return correctColumnNum , []
435
+ if correctColumnNum == -1:
436
+ if defaults and len(defaults) > 1:
437
+ correctColumnNum = len(defaults)
438
+ else:
439
+ correctColumnNum = len(lineCache)
440
+ # if verbose:
441
+ # __teePrintOrNot(f"detected correctColumnNum: {len(lineCache)}",teeLogger=teeLogger)
442
+ if len(lineCache) == 1 or not any(lineCache[1:]):
443
+ if correctColumnNum == 1:
444
+ taskDic[lineCache[0]] = lineCache if not storeOffset else offset
445
+ elif lineCache[0] == DEFAULTS_INDICATOR_KEY:
446
+ # if verbose:
447
+ # __teePrintOrNot(f"Empty defaults line found: {line}",teeLogger=teeLogger)
448
+ defaults.clear()
449
+ defaults[0] = DEFAULTS_INDICATOR_KEY
450
+ else:
451
+ # if verbose:
452
+ # __teePrintOrNot(f"Key {lineCache[0]} found with empty value, deleting such key's representaion",teeLogger=teeLogger)
453
+ if lineCache[0] in taskDic:
454
+ del taskDic[lineCache[0]]
455
+ return correctColumnNum , []
456
+ elif len(lineCache) != correctColumnNum:
457
+ if strict and not any(defaults[1:]):
458
+ # if verbose:
459
+ # __teePrintOrNot(f"Ignoring line with {len(lineCache)} columns: {line}",teeLogger=teeLogger)
460
+ return correctColumnNum , []
461
+ else:
462
+ # fill / cut the line with empty entries til the correct number of columns
463
+ if len(lineCache) < correctColumnNum:
464
+ lineCache += ['']*(correctColumnNum-len(lineCache))
465
+ elif len(lineCache) > correctColumnNum:
466
+ lineCache = lineCache[:correctColumnNum]
467
+ # if verbose:
468
+ # __teePrintOrNot(f"Correcting {lineCache[0]}",teeLogger=teeLogger)
469
+ # now replace empty values with defaults
470
+ if defaults and len(defaults) > 1:
471
+ for i in range(1,len(lineCache)):
472
+ if not lineCache[i] and i < len(defaults) and defaults[i]:
473
+ lineCache[i] = defaults[i]
474
+ if lineCache[0] == DEFAULTS_INDICATOR_KEY:
475
+ # if verbose:
476
+ # __teePrintOrNot(f"Defaults line found: {line}",teeLogger=teeLogger)
477
+ defaults[:] = lineCache
478
+ return correctColumnNum , []
479
+ taskDic[lineCache[0]] = lineCache if not storeOffset else offset
480
+ # if verbose:
481
+ # __teePrintOrNot(f"Key {lineCache[0]} added",teeLogger=teeLogger)
482
+ return correctColumnNum, lineCache
483
+
484
+ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,
485
+ encoding = 'utf8',delimiter = ...,defaults = ...,storeOffset = False ):
486
+ """
487
+ Reads the last valid line from a file.
488
+
489
+ Args:
490
+ fileName (str): The name of the file to read.
491
+ taskDic (dict): A dictionary to pass to processLine function.
492
+ correctColumnNum (int): A column number to pass to processLine function.
493
+ verbose (bool, optional): Whether to print verbose output. Defaults to False.
494
+ teeLogger (optional): Logger to use for tee print. Defaults to None.
495
+ encoding (str, optional): The encoding of the file. Defaults to None.
496
+ strict (bool, optional): Whether to enforce strict processing. Defaults to False.
497
+ delimiter (str, optional): The delimiter used in the file. Defaults to None.
498
+ defaults (list, optional): The default values to use for missing columns. Defaults to [].
499
+ storeOffset (bool, optional): Instead of storing the data in taskDic, store the offset of each line. Defaults to False.
500
+
501
+ Returns:
502
+ list: The last valid line as a list of strings, or an empty list if no valid line is found.
503
+ """
504
+ chunk_size = 1024 # Read in chunks of 1024 bytes
505
+ last_valid_line = []
506
+ if defaults is ...:
507
+ defaults = []
508
+ delimiter = get_delimiter(delimiter,file_name=fileName)
509
+ if verbose:
510
+ __teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
511
+ with openFileAsCompressed(fileName, 'rb',encoding=encoding, teeLogger=teeLogger) as file:
512
+ file.seek(0, os.SEEK_END)
513
+ file_size = file.tell()
514
+ buffer = b''
515
+ position = file_size
516
+ processedSize = 0
517
+
518
+ while position > 0:
519
+ # Read chunks from the end of the file
520
+ read_size = min(chunk_size, position)
521
+ position -= read_size
522
+ file.seek(position)
523
+ chunk = file.read(read_size)
524
+
525
+ # Prepend new chunk to buffer
526
+ buffer = chunk + buffer
527
+
528
+ # Split the buffer into lines
529
+ lines = buffer.split(b'\n')
530
+
531
+ # Process lines from the last to the first
532
+ for i in range(len(lines) - 1, -1, -1):
533
+ processedSize += len(lines[i]) + 1 # +1 for the newline character
534
+ if lines[i].strip(): # Skip empty lines
535
+ # Process the line
536
+ correctColumnNum, lineCache = _processLine(
537
+ line=lines[i].decode(encoding=encoding,errors='replace'),
538
+ taskDic=taskDic,
539
+ correctColumnNum=correctColumnNum,
540
+ strict=strict,
541
+ delimiter=delimiter,
542
+ defaults=defaults,
543
+ storeOffset=storeOffset,
544
+ offset=file_size - processedSize + 1
545
+ )
546
+ # If the line is valid, return it
547
+ if lineCache:
548
+ if storeOffset :
549
+ return file_size - processedSize + 1
550
+ return lineCache
551
+
552
+ # Keep the last (possibly incomplete) line in buffer for the next read
553
+ buffer = lines[0]
554
+
555
+ # Return empty list if no valid line found
556
+ if storeOffset:
557
+ return -1
558
+ return last_valid_line
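
Note: a minimal sketch of read_last_valid_line with the new storeOffset option, assuming the module imports as TSVZ and that 'tasks.tsv' already exists; names are illustrative.

    from collections import OrderedDict
    from TSVZ import read_last_valid_line

    tasks = OrderedDict()
    # Returns the last well-formed row as a list of strings...
    last_row = read_last_valid_line('tasks.tsv', tasks, correctColumnNum=-1)
    # ...or, with storeOffset=True, the byte offset of that row (-1 if none is found).
    last_offset = read_last_valid_line('tasks.tsv', tasks, correctColumnNum=-1, storeOffset=True)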
559
+
560
+ @functools.lru_cache(maxsize=None)
561
+ def _get_sanitization_re(delimiter = DEFAULT_DELIMITER):
562
+ return re.compile(r"(</sep/>|</LF/>|<sep>|<LF>|\n|" + re.escape(delimiter) + r")")
563
+
564
+ _sanitize_replacements = {
565
+ "<sep>":"</sep/>",
566
+ "<LF>":"</LF/>",
567
+ "\n":"<LF>",
568
+ }
569
+ _inverse_sanitize_replacements = {v: k for k, v in _sanitize_replacements.items()}
570
+
571
+ def _sanitize(data,delimiter = DEFAULT_DELIMITER):
572
+ if not data:
573
+ return data
574
+ def repl(m):
575
+ tok = m.group(0)
576
+ if tok == delimiter:
577
+ return "<sep>"
578
+ if tok in ("</sep/>", "</LF/>"):
579
+ eprint(f"Warning: Found illegal token '{tok}' during sanitization. It will be replaced.")
580
+ return _sanitize_replacements.get(tok, tok)
581
+ pattern = _get_sanitization_re(delimiter)
582
+ if isinstance(data,str):
583
+ return pattern.sub(repl, data)
584
+ else:
585
+ return [pattern.sub(repl,str(segment)) if segment else '' for segment in data]
586
+
587
+ def _unsanitize(data,delimiter = DEFAULT_DELIMITER):
588
+ if not data:
589
+ return data
590
+ def repl(m):
591
+ tok = m.group(0)
592
+ if tok == "<sep>":
593
+ return delimiter
594
+ return _inverse_sanitize_replacements.get(tok, tok)
595
+ pattern = _get_sanitization_re(delimiter)
596
+ if isinstance(data,str):
597
+ return pattern.sub(repl, data.rstrip())
598
+ else:
599
+ return [pattern.sub(repl,str(segment).rstrip()) if segment else '' for segment in data]
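
Note: a sketch of the new escaping round trip used when cell values contain the delimiter or newlines; _sanitize/_unsanitize are internal helpers, shown here only to illustrate the encoding.

    from TSVZ import _sanitize, _unsanitize

    cell = 'line one\nline two\twith a tab'
    safe = _sanitize(cell, delimiter='\t')      # '\n' becomes '<LF>', the delimiter becomes '<sep>'
    assert _unsanitize(safe, delimiter='\t') == cell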
477
600
 
478
601
  def _formatHeader(header,verbose = False,teeLogger = None,delimiter = DEFAULT_DELIMITER):
479
- """
480
- Format the header string.
481
-
482
- Parameters:
483
- - header (str or list): The header string or list to format.
484
- - verbose (bool, optional): Whether to print verbose output. Defaults to False.
485
- - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
486
-
487
- Returns:
488
- str: The formatted header string.
489
- """
490
- if not isinstance(header,str):
491
- try:
492
- header = delimiter.join(header)
493
- except:
494
- if verbose:
495
- __teePrintOrNot('Invalid header, setting header to empty.','error',teeLogger=teeLogger)
496
- header = ''
497
- header = delimiter.join([segment.rstrip() for segment in header.split(delimiter)])
498
- # if header:
499
- # if not header.endswith('\n'):
500
- # header += '\n'
501
- # else:
502
- # header = ''
503
- return header
602
+ """
603
+ Format the header string.
604
+
605
+ Parameters:
606
+ - header (str or list): The header string or list to format.
607
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
608
+ - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
609
+
610
+ Returns:
611
+ list: The formatted header list of string.
612
+ """
613
+ if isinstance(header,str):
614
+ header = header.split(delimiter)
615
+ else:
616
+ try:
617
+ header = [str(s) for s in header]
618
+ except Exception:
619
+ if verbose:
620
+ __teePrintOrNot('Invalid header, setting header to empty.','error',teeLogger=teeLogger)
621
+ header = []
622
+ return [s.rstrip() for s in header]
504
623
 
505
624
  def _lineContainHeader(header,line,verbose = False,teeLogger = None,strict = False,delimiter = DEFAULT_DELIMITER):
506
- """
507
- Verify if a line contains the header.
508
-
509
- Parameters:
510
- - header (str): The header string to verify.
511
- - line (str): The line to verify against the header.
512
- - verbose (bool, optional): Whether to print verbose output. Defaults to False.
513
- - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
514
- - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
515
-
516
- Returns:
517
- bool: True if the header matches the line, False otherwise.
518
- """
519
- header = [segment.rstrip() for segment in header.split(delimiter)]
520
- line = [segment.rstrip() for segment in line.split(delimiter)]
521
- if verbose:
522
- __teePrintOrNot(f"Header: \n{header}",teeLogger=teeLogger)
523
- __teePrintOrNot(f"First line: \n{line}",teeLogger=teeLogger)
524
- if len(header) != len(line) or any([header[i] not in line[i] for i in range(len(header))]):
525
- __teePrintOrNot(f"Header mismatch: \n{line} \n!= \n{header}",teeLogger=teeLogger)
526
- if strict:
527
- raise ValueError("Data format error! Header mismatch")
528
- return False
529
- return True
530
-
531
- def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,header = '',encoding = 'utf8',strict = True,delimiter = DEFAULT_DELIMITER):
532
- """
533
- Verify the existence of the tabular file.
534
-
535
- Parameters:
536
- - fileName (str): The path of the tabular file.
537
- - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to True.
538
- - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
539
- - header (str, optional): The header line to verify against. Defaults to ''.
540
- - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
541
- - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
542
-
543
- Returns:
544
- bool: True if the file exists, False otherwise.
545
- """
546
- remainingFileName, _ ,extenstionName = fileName.rpartition('.')
547
- if extenstionName in COMPRESSED_FILE_EXTENSIONS:
548
- remainingFileName, _ ,extenstionName = remainingFileName.rpartition('.')
549
- if delimiter and delimiter == '\t' and not extenstionName == 'tsv':
550
- __teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
551
- elif delimiter and delimiter == ',' and not extenstionName == 'csv':
552
- __teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
553
- elif delimiter and delimiter == '\0' and not extenstionName == 'nsv':
554
- __teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
555
- elif delimiter and delimiter == '|' and not extenstionName == 'psv':
556
- __teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
557
- if not os.path.isfile(fileName):
558
- if createIfNotExist:
559
- try:
560
- with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
561
- file.write(header.encode(encoding=encoding,errors='replace')+b'\n')
562
- __teePrintOrNot('Created '+fileName,teeLogger=teeLogger)
563
- return True
564
- except:
565
- __teePrintOrNot('Failed to create '+fileName,'error',teeLogger=teeLogger)
566
- if strict:
567
- raise FileNotFoundError("Failed to create file")
568
- return False
569
- elif strict:
570
- __teePrintOrNot('File not found','error',teeLogger=teeLogger)
571
- raise FileNotFoundError("File not found")
572
- else:
573
- return False
574
- return True
625
+ """
626
+ Verify if a line contains the header.
627
+
628
+ Parameters:
629
+ - header (str): The header string to verify.
630
+ - line (str): The line to verify against the header.
631
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
632
+ - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
633
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
634
+
635
+ Returns:
636
+ bool: True if the header matches the line, False otherwise.
637
+ """
638
+ line = _formatHeader(line,verbose=verbose,teeLogger=teeLogger,delimiter=delimiter)
639
+ if verbose:
640
+ __teePrintOrNot(f"Header: \n{header}",teeLogger=teeLogger)
641
+ __teePrintOrNot(f"First line: \n{line}",teeLogger=teeLogger)
642
+ if len(header) != len(line) or any([header[i] not in line[i] for i in range(len(header))]):
643
+ __teePrintOrNot(f"Header mismatch: \n{line} \n!= \n{header}",teeLogger=teeLogger)
644
+ if strict:
645
+ raise ValueError("Data format error! Header mismatch")
646
+ return False
647
+ return True
648
+
649
+ def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,header = [],encoding = 'utf8',strict = True,delimiter = DEFAULT_DELIMITER):
650
+ """
651
+ Verify the existence of the tabular file.
652
+
653
+ Parameters:
654
+ - fileName (str): The path of the tabular file.
655
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to True.
656
+ - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
657
+ - header (list, optional): The header line to verify against. Defaults to [].
658
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
659
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
660
+
661
+ Returns:
662
+ bool: True if the file exists, False otherwise.
663
+ """
664
+ remainingFileName, _ ,extenstionName = fileName.rpartition('.')
665
+ if extenstionName in COMPRESSED_FILE_EXTENSIONS:
666
+ remainingFileName, _ ,extenstionName = remainingFileName.rpartition('.')
667
+ if delimiter and delimiter == '\t' and not extenstionName == 'tsv':
668
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
669
+ elif delimiter and delimiter == ',' and not extenstionName == 'csv':
670
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
671
+ elif delimiter and delimiter == '\0' and not extenstionName == 'nsv':
672
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
673
+ elif delimiter and delimiter == '|' and not extenstionName == 'psv':
674
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
675
+ if not os.path.isfile(fileName):
676
+ if createIfNotExist:
677
+ try:
678
+ with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
679
+ header = delimiter.join(_sanitize(_formatHeader(header,
680
+ verbose=verbose,
681
+ teeLogger=teeLogger,
682
+ delimiter=delimiter,
683
+ ),delimiter=delimiter))
684
+ file.write(header.encode(encoding=encoding,errors='replace')+b'\n')
685
+ __teePrintOrNot('Created '+fileName,teeLogger=teeLogger)
686
+ return True
687
+ except Exception:
688
+ __teePrintOrNot('Failed to create '+fileName,'error',teeLogger=teeLogger)
689
+ if strict:
690
+ raise FileNotFoundError("Failed to create file")
691
+ return False
692
+ elif strict:
693
+ __teePrintOrNot('File not found','error',teeLogger=teeLogger)
694
+ raise FileNotFoundError("File not found")
695
+ else:
696
+ return False
697
+ return True
575
698
 
576
699
  def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,
577
- verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t',defaults = ...,
578
- correctColumnNum = -1):
579
- """
580
- Compatibility method, calls readTabularFile.
581
- Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
582
-
583
- Parameters:
584
- - fileName (str): The path to the Tabular file.
585
- - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
586
- - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
587
- - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
588
- - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
589
- - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
590
- - verbose (bool, optional): Whether to print verbose output. Defaults to False.
591
- - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
592
- - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
593
- - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
594
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t'.
595
- - defaults (list, optional): The default values to use for missing columns. Defaults to [].
596
- - correctColumnNum (int, optional): The expected number of columns in the file. If -1, it will be determined from the first valid line. Defaults to -1.
597
-
598
- Returns:
599
- - OrderedDict: The dictionary containing the data from the Tabular file.
600
-
601
- Raises:
602
- - Exception: If the file is not found or there is a data format error.
603
-
604
- """
605
- return readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,
606
- lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,
607
- encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults,
608
- correctColumnNum = correctColumnNum)
700
+ verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t',defaults = ...,
701
+ correctColumnNum = -1):
702
+ """
703
+ Compatibility method, calls readTabularFile.
704
+ Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
705
+
706
+ Parameters:
707
+ - fileName (str): The path to the Tabular file.
708
+ - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
709
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
710
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
711
+ - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
712
+ - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
713
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
714
+ - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
715
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
716
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
717
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t'.
718
+ - defaults (list, optional): The default values to use for missing columns. Defaults to [].
719
+ - correctColumnNum (int, optional): The expected number of columns in the file. If -1, it will be determined from the first valid line. Defaults to -1.
720
+
721
+ Returns:
722
+ - OrderedDict: The dictionary containing the data from the Tabular file.
723
+
724
+ Raises:
725
+ - Exception: If the file is not found or there is a data format error.
726
+
727
+ """
728
+ return readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,
729
+ lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,
730
+ encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults,
731
+ correctColumnNum = correctColumnNum)
609
732
 
610
733
  def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,
611
- verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...,defaults = ...,
612
- correctColumnNum = -1):
613
- """
614
- Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
615
-
616
- Parameters:
617
- - fileName (str): The path to the Tabular file.
618
- - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
619
- - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
620
- - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
621
- - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
622
- - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
623
- - verbose (bool, optional): Whether to print verbose output. Defaults to False.
624
- - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
625
- - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
626
- - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
627
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
628
- - defaults (list, optional): The default values to use for missing columns. Defaults to [].
629
- - correctColumnNum (int, optional): The expected number of columns in the file. If -1, it will be determined from the first valid line. Defaults to -1.
630
-
631
- Returns:
632
- - OrderedDict: The dictionary containing the data from the Tabular file.
633
-
634
- Raises:
635
- - Exception: If the file is not found or there is a data format error.
636
-
637
- """
638
- if taskDic is None:
639
- taskDic = {}
640
- if defaults is ...:
641
- defaults = []
642
- delimiter = get_delimiter(delimiter,file_name=fileName)
643
- header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
644
- if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
645
- return taskDic
646
- with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
647
- if header.rstrip() and verifyHeader:
648
- line = file.readline().decode(encoding=encoding,errors='replace')
649
- if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict) and correctColumnNum == -1:
650
- correctColumnNum = len(header.split(delimiter))
651
- if verbose:
652
- __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
653
- if lastLineOnly:
654
- lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict, delimiter=delimiter, defaults=defaults)
655
- if lineCache:
656
- taskDic[lineCache[0]] = lineCache
657
- return lineCache
658
- for line in file:
659
- correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding,errors='replace'),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter,defaults = defaults)
660
- return taskDic
734
+ verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...,defaults = ...,
735
+ correctColumnNum = -1,storeOffset = False):
736
+ """
737
+ Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
738
+
739
+ Parameters:
740
+ - fileName (str): The path to the Tabular file.
741
+ - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
742
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
743
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
744
+ - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
745
+ - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
746
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
747
+ - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
748
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
749
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
750
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
751
+ - defaults (list, optional): The default values to use for missing columns. Defaults to [].
752
+ - correctColumnNum (int, optional): The expected number of columns in the file. If -1, it will be determined from the first valid line. Defaults to -1.
753
+ - storeOffset (bool, optional): Instead of storing the data in taskDic, store the offset of each line. Defaults to False.
754
+
755
+ Returns:
756
+ - OrderedDict: The dictionary containing the data from the Tabular file.
757
+
758
+ Raises:
759
+ - Exception: If the file is not found or there is a data format error.
760
+
761
+ """
762
+ if taskDic is None:
763
+ taskDic = {}
764
+ if defaults is ...:
765
+ defaults = []
766
+ delimiter = get_delimiter(delimiter,file_name=fileName)
767
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
768
+ if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
769
+ return taskDic
770
+ with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
771
+ if any(header) and verifyHeader:
772
+ line = file.readline().decode(encoding=encoding,errors='replace')
773
+ if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict) and correctColumnNum == -1:
774
+ correctColumnNum = len(header)
775
+ if verbose:
776
+ __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
777
+ if lastLineOnly:
778
+ lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict, delimiter=delimiter, defaults=defaults,storeOffset=storeOffset)
779
+ # if lineCache:
780
+ # taskDic[lineCache[0]] = lineCache
781
+ return lineCache
782
+ for line in file:
783
+ correctColumnNum, _ = _processLine(line.decode(encoding=encoding,errors='replace'),taskDic,correctColumnNum,strict = strict,delimiter=delimiter,defaults = defaults,storeOffset=storeOffset,offset=file.tell()-len(line))
784
+ return taskDic
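
Note: a minimal sketch of readTabularFile with the new storeOffset parameter, assuming the module imports as TSVZ; file and column names are illustrative.

    from TSVZ import readTabularFile

    # Keyed rows: a mapping of first-column value -> full row.
    tasks = readTabularFile('tasks.tsv', header=['key', 'state', 'owner'], createIfNotExist=True)
    # With storeOffset=True the values are byte offsets of each row instead of the row data.
    offsets = readTabularFile('tasks.tsv', header=['key', 'state', 'owner'], storeOffset=True)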
661
785
 
662
786
  def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = '\t'):
663
- """
664
- Compatibility method, calls appendTabularFile.
665
- Append a line of data to a Tabular file.
666
- Parameters:
667
- - fileName (str): The path of the Tabular file.
668
- - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
669
- - teeLogger (optional): A logger object for logging messages.
670
- - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
671
- - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
672
- - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
673
- - verbose (bool, optional): If True, additional information will be printed during the execution.
674
- - encoding (str, optional): The encoding of the file.
675
- - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
676
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
677
- Raises:
678
- - Exception: If the file does not exist and createIfNotExist is False.
679
- - Exception: If the existing header does not match the provided header.
680
- """
681
- return appendTabularFile(fileName,lineToAppend,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding, strict = strict, delimiter = delimiter)
787
+ """
788
+ Compatibility method, calls appendTabularFile.
789
+ Append a line of data to a Tabular file.
790
+ Parameters:
791
+ - fileName (str): The path of the Tabular file.
792
+ - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
793
+ - teeLogger (optional): A logger object for logging messages.
794
+ - header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
795
+ - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
796
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
797
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
798
+ - encoding (str, optional): The encoding of the file.
799
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
800
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
801
+ Raises:
802
+ - Exception: If the file does not exist and createIfNotExist is False.
803
+ - Exception: If the existing header does not match the provided header.
804
+ """
805
+ return appendTabularFile(fileName,lineToAppend,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding, strict = strict, delimiter = delimiter)
682
806
 
683
807
  def appendTabularFile(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = ...):
684
- """
685
- Append a line of data to a Tabular file.
686
- Parameters:
687
- - fileName (str): The path of the Tabular file.
688
- - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
689
- - teeLogger (optional): A logger object for logging messages.
690
- - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
691
- - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
692
- - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
693
- - verbose (bool, optional): If True, additional information will be printed during the execution.
694
- - encoding (str, optional): The encoding of the file.
695
- - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
696
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
697
- Raises:
698
- - Exception: If the file does not exist and createIfNotExist is False.
699
- - Exception: If the existing header does not match the provided header.
700
- """
701
- return appendLinesTabularFile(fileName,[lineToAppend],teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding, strict = strict, delimiter = delimiter)
808
+ """
809
+ Append a line of data to a Tabular file.
810
+ Parameters:
811
+ - fileName (str): The path of the Tabular file.
812
+ - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
813
+ - teeLogger (optional): A logger object for logging messages.
814
+ - header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
815
+ - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
816
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
817
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
818
+ - encoding (str, optional): The encoding of the file.
819
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
820
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
821
+ Raises:
822
+ - Exception: If the file does not exist and createIfNotExist is False.
823
+ - Exception: If the existing header does not match the provided header.
824
+ """
825
+ return appendLinesTabularFile(fileName,[lineToAppend],teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding, strict = strict, delimiter = delimiter)
702
826
 
703
827
  def appendLinesTabularFile(fileName,linesToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = ...):
704
- """
705
- Append lines of data to a Tabular file.
706
- Parameters:
707
- - fileName (str): The path of the Tabular file.
708
- - linesToAppend (list): The lines of data to append. If it is a list of string, then each string will be split by delimiter to form a list.
709
- - teeLogger (optional): A logger object for logging messages.
710
- - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
711
- - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
712
- - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
713
- - verbose (bool, optional): If True, additional information will be printed during the execution.
714
- - encoding (str, optional): The encoding of the file.
715
- - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
716
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
717
- Raises:
718
- - Exception: If the file does not exist and createIfNotExist is False.
719
- - Exception: If the existing header does not match the provided header.
720
- """
721
- delimiter = get_delimiter(delimiter,file_name=fileName)
722
- header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
723
- if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
724
- return
725
- formatedLines = []
726
- for line in linesToAppend:
727
- if isinstance(linesToAppend,dict):
728
- key = line
729
- line = linesToAppend[key]
730
- if isinstance(line,str):
731
- line = line.split(delimiter)
732
- elif line:
733
- for i in range(len(line)):
734
- if not isinstance(line[i],str):
735
- try:
736
- line[i] = str(line[i])
737
- except Exception as e:
738
- line[i] = str(e)
739
- if isinstance(linesToAppend,dict):
740
- if (not line or line[0] != key):
741
- line = [key]+line
742
- formatedLines.append(line)
743
- if not formatedLines:
744
- if verbose:
745
- __teePrintOrNot(f"No lines to append to {fileName}",teeLogger=teeLogger)
746
- return
747
- correctColumnNum = max([len(line) for line in formatedLines])
748
-
749
- if header.rstrip() and verifyHeader:
750
- with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
751
- line = file.readline().decode(encoding=encoding,errors='replace')
752
- if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
753
- correctColumnNum = len(header.split(delimiter))
754
- if verbose:
755
- __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
756
- # truncate / fill the lines to the correct number of columns
757
- for i in range(len(formatedLines)):
758
- if len(formatedLines[i]) < correctColumnNum:
759
- formatedLines[i] += ['']*(correctColumnNum-len(formatedLines[i]))
760
- elif len(formatedLines[i]) > correctColumnNum:
761
- formatedLines[i] = formatedLines[i][:correctColumnNum]
762
- with openFileAsCompressed(fileName, mode ='ab',encoding=encoding,teeLogger=teeLogger)as file:
763
- # check if the file ends in a newline
764
- # file.seek(-1, os.SEEK_END)
765
- # if file.read(1) != b'\n':
766
- # file.write(b'\n')
767
- file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding,errors='replace') for line in formatedLines]) + b'\n')
768
- if verbose:
769
- __teePrintOrNot(f"Appended {len(formatedLines)} lines to {fileName}",teeLogger=teeLogger)
828
+ """
829
+ Append lines of data to a Tabular file.
830
+ Parameters:
831
+ - fileName (str): The path of the Tabular file.
832
+ - linesToAppend (list or dict): The lines of data to append. If it is a list of strings, each string will be split by the delimiter to form a list. If it is a dict, each value is appended with its key as the first column.
833
+ - teeLogger (optional): A logger object for logging messages.
834
+ - header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
835
+ - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
836
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
837
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
838
+ - encoding (str, optional): The encoding of the file.
839
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
840
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
841
+ Raises:
842
+ - Exception: If the file does not exist and createIfNotExist is False.
843
+ - Exception: If the existing header does not match the provided header.
844
+ """
845
+ delimiter = get_delimiter(delimiter,file_name=fileName)
846
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
847
+ if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
848
+ return
849
+ formatedLines = []
850
+ for line in linesToAppend:
851
+ if isinstance(linesToAppend,dict):
852
+ key = line
853
+ line = linesToAppend[key]
854
+ if isinstance(line,str):
855
+ line = line.split(delimiter)
856
+ elif line:
857
+ for i in range(len(line)):
858
+ if not isinstance(line[i],str):
859
+ try:
860
+ line[i] = str(line[i]).rstrip()
861
+ except Exception as e:
862
+ line[i] = str(e)
863
+ if isinstance(linesToAppend,dict):
864
+ if (not line or line[0] != key):
865
+ line = [key]+line
866
+ formatedLines.append(_sanitize(line,delimiter=delimiter))
867
+ if not formatedLines:
868
+ if verbose:
869
+ __teePrintOrNot(f"No lines to append to {fileName}",teeLogger=teeLogger)
870
+ return
871
+ correctColumnNum = max([len(line) for line in formatedLines])
872
+ if any(header) and verifyHeader:
873
+ with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
874
+ line = file.readline().decode(encoding=encoding,errors='replace')
875
+ if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
876
+ correctColumnNum = len(header)
877
+ if verbose:
878
+ __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
879
+ # truncate / fill the lines to the correct number of columns
880
+ for i in range(len(formatedLines)):
881
+ if len(formatedLines[i]) < correctColumnNum:
882
+ formatedLines[i] += ['']*(correctColumnNum-len(formatedLines[i]))
883
+ elif len(formatedLines[i]) > correctColumnNum:
884
+ formatedLines[i] = formatedLines[i][:correctColumnNum]
885
+ with openFileAsCompressed(fileName, mode ='ab',encoding=encoding,teeLogger=teeLogger)as file:
886
+ # check if the file ends in a newline
887
+ # file.seek(-1, os.SEEK_END)
888
+ # if file.read(1) != b'\n':
889
+ # file.write(b'\n')
890
+ file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding,errors='replace') for line in formatedLines]) + b'\n')
891
+ if verbose:
892
+ __teePrintOrNot(f"Appended {len(formatedLines)} lines to {fileName}",teeLogger=teeLogger)
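
For orientation, a hypothetical sketch of the append helpers defined above; the file name and row values are invented. As the code above shows, rows shorter or longer than the detected column count are padded or truncated.

from TSVZ import appendTabularFile, appendLinesTabularFile

# append one row; a plain string is split on the delimiter, a list is written as-is
appendTabularFile('tasks.tsv', ['task1', 'build', 'done'], createIfNotExist=True)

# append several rows at once; strings and lists may be mixed
appendLinesTabularFile('tasks.tsv', [['task2', 'test', 'open'], 'task3\tdeploy\tpending'])
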
770
893
 
771
894
  def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = '\t'):
772
- """
773
- Compatibility method, calls clearTabularFile.
774
- Clear the contents of a Tabular file. Will create if not exist.
775
- Parameters:
776
- - fileName (str): The path of the Tabular file.
777
- - teeLogger (optional): A logger object for logging messages.
778
- - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
779
- - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
780
- - verbose (bool, optional): If True, additional information will be printed during the execution.
781
- - encoding (str, optional): The encoding of the file.
782
- - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
783
- """
784
- return clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
895
+ """
896
+ Compatibility method, calls clearTabularFile.
897
+ Clear the contents of a Tabular file. Creates the file if it does not exist.
898
+ Parameters:
899
+ - fileName (str): The path of the Tabular file.
900
+ - teeLogger (optional): A logger object for logging messages.
901
+ - header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
902
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
903
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
904
+ - encoding (str, optional): The encoding of the file.
905
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
906
+ """
907
+ return clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
785
908
 
786
909
  def clearTabularFile(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = ...):
787
- """
788
- Clear the contents of a Tabular file. Will create if not exist.
789
- Parameters:
790
- - fileName (str): The path of the Tabular file.
791
- - teeLogger (optional): A logger object for logging messages.
792
- - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
793
- - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
794
- - verbose (bool, optional): If True, additional information will be printed during the execution.
795
- - encoding (str, optional): The encoding of the file.
796
- - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
797
- """
798
- delimiter = get_delimiter(delimiter,file_name=fileName)
799
- header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
800
- if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
801
- raise FileNotFoundError("Something catastrophic happened! File still not found after creation")
802
- else:
803
- with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
804
- if header.rstrip() and verifyHeader:
805
- line = file.readline().decode(encoding=encoding,errors='replace')
806
- if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
807
- __teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
808
- header = line
809
- with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
810
- if header:
811
- if not header.endswith('\n'):
812
- header += '\n'
813
- file.write(header.encode(encoding=encoding,errors='replace'))
814
- if verbose:
815
- __teePrintOrNot(f"Cleared {fileName}",teeLogger=teeLogger)
910
+ """
911
+ Clear the contents of a Tabular file. Creates the file if it does not exist.
912
+ Parameters:
913
+ - fileName (str): The path of the Tabular file.
914
+ - teeLogger (optional): A logger object for logging messages.
915
+ - header (str or list, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
916
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
917
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
918
+ - encoding (str, optional): The encoding of the file.
919
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
920
+ """
921
+ delimiter = get_delimiter(delimiter,file_name=fileName)
922
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
923
+ if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
924
+ raise FileNotFoundError("Something catastrophic happened! File still not found after creation")
925
+ else:
926
+ with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
927
+ if any(header) and verifyHeader:
928
+ line = file.readline().decode(encoding=encoding,errors='replace')
929
+ if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
930
+ __teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
931
+ header = _formatHeader(line,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
932
+ with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
933
+ if header:
934
+ header = delimiter.join(_sanitize(header,delimiter=delimiter))
935
+ file.write(header.encode(encoding=encoding,errors='replace')+b'\n')
936
+ if verbose:
937
+ __teePrintOrNot(f"Cleared {fileName}",teeLogger=teeLogger)
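
A short, hypothetical sketch of clearTabularFile: per the code above, the file is rewritten with only the (sanitized) header line; the file name and header are invented.

from TSVZ import clearTabularFile

clearTabularFile('tasks.tsv', header='id\tname\tstatus')  # leaves just the header line in the file
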
816
938
 
817
939
  def getFileUpdateTimeNs(fileName):
818
- # return 0 if the file does not exist
819
- if not os.path.isfile(fileName):
820
- return 0
821
- try:
822
- return os.stat(fileName).st_mtime_ns
823
- except:
824
- __teePrintOrNot(f"Failed to get file update time for {fileName}",'error')
825
- return get_time_ns()
940
+ # return 0 if the file does not exist
941
+ if not os.path.isfile(fileName):
942
+ return 0
943
+ try:
944
+ return os.stat(fileName).st_mtime_ns
945
+ except Exception:
946
+ __teePrintOrNot(f"Failed to get file update time for {fileName}",'error')
947
+ return get_time_ns()
826
948
 
827
949
  def get_time_ns():
828
- try:
829
- return time.time_ns()
830
- except:
831
- # try to get the time in nanoseconds
832
- return int(time.time()*1e9)
950
+ try:
951
+ return time.time_ns()
952
+ except Exception:
953
+ # try to get the time in nanoseconds
954
+ return int(time.time()*1e9)
833
955
 
834
956
  def scrubTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = '\t',defaults = ...):
835
- """
836
- Compatibility method, calls scrubTabularFile.
837
- Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.
838
- Return the data as a dictionary.
839
-
840
- Parameters:
841
- - fileName (str): The path to the Tabular file.
842
- - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
843
- - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
844
- - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
845
- - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
846
- - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
847
- - verbose (bool, optional): Whether to print verbose output. Defaults to False.
848
- - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
849
- - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
850
- - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
851
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
852
- - defaults (list, optional): The default values to use for missing columns. Defaults to [].
853
-
854
- Returns:
855
- - OrderedDict: The dictionary containing the data from the Tabular file.
856
-
857
- Raises:
858
- - Exception: If the file is not found or there is a data format error.
859
-
860
- """
861
- return scrubTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
957
+ """
958
+ Compatibility method, calls scrubTabularFile.
959
+ Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.
960
+ Return the data as a dictionary.
961
+
962
+ Parameters:
963
+ - fileName (str): The path to the Tabular file.
964
+ - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
965
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
966
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
967
+ - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
968
+ - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
969
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
970
+ - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
971
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
972
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
973
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
974
+ - defaults (list, optional): The default values to use for missing columns. Defaults to [].
975
+
976
+ Returns:
977
+ - OrderedDict: The dictionary containing the data from the Tabular file.
978
+
979
+ Raises:
980
+ - Exception: If the file is not found or there is a data format error.
981
+
982
+ """
983
+ return scrubTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
862
984
 
863
985
  def scrubTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,
864
- verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = ...,defaults = ...,correctColumnNum = -1):
865
- """
866
- Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.
867
- If using compressed files. This will recompress the file in whole and possibily increase the compression ratio reducing the file size.
868
- Return the data as a dictionary.
869
-
870
- Parameters:
871
- - fileName (str): The path to the Tabular file.
872
- - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
873
- - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
874
- - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
875
- - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
876
- - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
877
- - verbose (bool, optional): Whether to print verbose output. Defaults to False.
878
- - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
879
- - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
880
- - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
881
- - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
882
- - defaults (list, optional): The default values to use for missing columns. Defaults to [].
883
- - correctColumnNum (int, optional): The expected number of columns in the file. If -1, it will be determined from the first valid line. Defaults to -1.
884
-
885
- Returns:
886
- - OrderedDict: The dictionary containing the data from the Tabular file.
887
-
888
- Raises:
889
- - Exception: If the file is not found or there is a data format error.
890
-
891
- """
892
- file = readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,
893
- lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,
894
- encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults,correctColumnNum = correctColumnNum)
895
- if file:
896
- clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
897
- appendLinesTabularFile(fileName,file,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
898
- return file
986
+ verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = ...,defaults = ...,correctColumnNum = -1):
987
+ """
988
+ Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.
989
+ When using compressed files, this will recompress the file as a whole and may increase the compression ratio, reducing the file size.
990
+ Return the data as a dictionary.
991
+
992
+ Parameters:
993
+ - fileName (str): The path to the Tabular file.
994
+ - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
995
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
996
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
997
+ - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
998
+ - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
999
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
1000
+ - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
1001
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
1002
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
1003
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
1004
+ - defaults (list, optional): The default values to use for missing columns. Defaults to [].
1005
+ - correctColumnNum (int, optional): The expected number of columns in the file. If -1, it will be determined from the first valid line. Defaults to -1.
1006
+
1007
+ Returns:
1008
+ - OrderedDict: The dictionary containing the data from the Tabular file.
1009
+
1010
+ Raises:
1011
+ - Exception: If the file is not found or there is a data format error.
1012
+
1013
+ """
1014
+ file = readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,
1015
+ lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,
1016
+ encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults,correctColumnNum = correctColumnNum)
1017
+ if file:
1018
+ clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
1019
+ appendLinesTabularFile(fileName,file,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
1020
+ return file
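
A minimal sketch of the scrub flow shown above (read, clear, re-append): rows sharing a key collapse to the last value read, and column counts are normalized on rewrite; the file name is hypothetical.

from TSVZ import scrubTabularFile

data = scrubTabularFile('tasks.tsv', header='id\tname\tstatus', verbose=True)
print(f"{len(data)} rows kept after scrubbing")
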
899
1021
 
900
1022
  def getListView(tsvzDic,header = [],delimiter = DEFAULT_DELIMITER):
901
- if header:
902
- if isinstance(header,str):
903
- header = header.split(delimiter)
904
- elif not isinstance(header,list):
905
- try:
906
- header = list(header)
907
- except:
908
- header = []
909
- if not tsvzDic:
910
- if not header:
911
- return []
912
- else:
913
- return [header]
914
- if not header:
915
- return list(tsvzDic.values())
916
- else:
917
- values = list(tsvzDic.values())
918
- if values[0] and values[0] == header:
919
- return values
920
- else:
921
- return [header] + values
1023
+ if header:
1024
+ if isinstance(header,str):
1025
+ header = header.split(delimiter)
1026
+ elif not isinstance(header,list):
1027
+ try:
1028
+ header = list(header)
1029
+ except Exception:
1030
+ header = []
1031
+ if not tsvzDic:
1032
+ if not header:
1033
+ return []
1034
+ else:
1035
+ return [header]
1036
+ if not header:
1037
+ return list(tsvzDic.values())
1038
+ else:
1039
+ values = list(tsvzDic.values())
1040
+ if values[0] and values[0] == header:
1041
+ return values
1042
+ else:
1043
+ return [header] + values
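
A small illustrative sketch of getListView with made-up data: a string header is split on the delimiter and prepended unless the first stored row already equals it.

from TSVZ import getListView

rows = {'a': ['a', '1'], 'b': ['b', '2']}
print(getListView(rows, header='key\tvalue'))
# -> [['key', 'value'], ['a', '1'], ['b', '2']]
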
922
1044
 
923
1045
  # create a tsv class that functions like an ordered dictionary but will update the file when modified
924
1046
  class TSVZed(OrderedDict):
925
- def __teePrintOrNot(self,message,level = 'info'):
926
- try:
927
- if self.teeLogger:
928
- self.teeLogger.teelog(message,level)
929
- else:
930
- print(message,flush=True)
931
- except Exception:
932
- print(message,flush=True)
933
-
934
- def getResourseUsage(self,return_dict = False):
935
- return get_resource_usage(return_dict = return_dict)
936
-
937
- def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,
938
- rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,
939
- verbose = False,encoding = 'utf8',delimiter = ...,defualts = None,strict = False,correctColumnNum = -1):
940
- super().__init__()
941
- self.version = version
942
- self.strict = strict
943
- self.externalFileUpdateTime = getFileUpdateTimeNs(fileName)
944
- self.lastUpdateTime = self.externalFileUpdateTime
945
- self._fileName = fileName
946
- self.teeLogger = teeLogger
947
- self.delimiter = get_delimiter(delimiter,file_name=fileName)
948
- self.defaults = defualts if defualts else []
949
- self.header = _formatHeader(header,verbose = verbose,teeLogger = self.teeLogger,delimiter=self.delimiter)
950
- self.correctColumnNum = correctColumnNum
951
- self.createIfNotExist = createIfNotExist
952
- self.verifyHeader = verifyHeader
953
- self.rewrite_on_load = rewrite_on_load
954
- self.rewrite_on_exit = rewrite_on_exit
955
- self.rewrite_interval = rewrite_interval
956
- self.monitor_external_changes = monitor_external_changes
957
- if not monitor_external_changes:
958
- self.__teePrintOrNot(f"Warning: External changes monitoring disabled for {self._fileName}. Will overwrite external changes.",'warning')
959
- self.verbose = verbose
960
- if append_check_delay < 0:
961
- append_check_delay = 0.00001
962
- self.__teePrintOrNot('append_check_delay cannot be less than 0, setting it to 0.00001','error')
963
- self.append_check_delay = append_check_delay
964
- self.appendQueue = deque()
965
- self.dirty = False
966
- self.deSynced = False
967
- self.memoryOnly = False
968
- self.encoding = encoding
969
- self.writeLock = threading.Lock()
970
- self.shutdownEvent = threading.Event()
971
- #self.appendEvent = threading.Event()
972
- self.appendThread = threading.Thread(target=self._appendWorker,daemon=True)
973
- self.appendThread.start()
974
- self.load()
975
- atexit.register(self.stopAppendThread)
976
-
977
- def setDefaults(self,defaults):
978
- if not defaults:
979
- defaults = []
980
- return
981
- if isinstance(defaults,str):
982
- defaults = defaults.split(self.delimiter)
983
- elif not isinstance(defaults,list):
984
- try:
985
- defaults = list(defaults)
986
- except:
987
- if self.verbose:
988
- self.__teePrintOrNot('Invalid defaults, setting defaults to empty.','error')
989
- defaults = []
990
- return
991
- if not any(defaults):
992
- defaults = []
993
- return
994
- if defaults[0] != DEFAULTS_INDICATOR_KEY:
995
- defaults = [DEFAULTS_INDICATOR_KEY]+defaults
996
- self.defaults = defaults
997
-
998
- def load(self):
999
- self.reload()
1000
- if self.rewrite_on_load:
1001
- self.rewrite(force = True,reloadInternalFromFile = False)
1002
- return self
1003
-
1004
- def reload(self):
1005
- # Load or refresh data from the TSV file
1006
- mo = self.memoryOnly
1007
- self.memoryOnly = True
1008
- if self.verbose:
1009
- self.__teePrintOrNot(f"Loading {self._fileName}")
1010
- super().clear()
1011
- readTabularFile(self._fileName, teeLogger = self.teeLogger, header = self.header, createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader, verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None, strict = self.strict, delimiter = self.delimiter, defaults=self.defaults)
1012
- if self.verbose:
1013
- self.__teePrintOrNot(f"Loaded {len(self)} records from {self._fileName}")
1014
- if self.header and self.verifyHeader:
1015
- self.correctColumnNum = len(self.header.split(self.delimiter))
1016
- elif self:
1017
- self.correctColumnNum = len(self[next(iter(self))])
1018
- else:
1019
- self.correctColumnNum = -1
1020
- if self.verbose:
1021
- self.__teePrintOrNot(f"correctColumnNum: {self.correctColumnNum}")
1022
- #super().update(loadedData)
1023
- if self.verbose:
1024
- self.__teePrintOrNot(f"TSVZed({self._fileName}) loaded")
1025
- self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1026
- self.lastUpdateTime = self.externalFileUpdateTime
1027
- self.memoryOnly = mo
1028
- return self
1029
-
1030
- def __setitem__(self,key,value):
1031
- key = str(key).rstrip()
1032
- if not key:
1033
- self.__teePrintOrNot('Key cannot be empty','error')
1034
- return
1035
- if isinstance(value,str):
1036
- value = value.split(self.delimiter)
1037
- # sanitize the value
1038
- value = [(str(segment).rstrip() if not isinstance(segment,str) else segment.rstrip()) if segment else '' for segment in value]
1039
- # escape the delimiter and newline characters
1040
- value = [segment.replace(self.delimiter,'<sep>').replace('\n','\\n') for segment in value]
1041
- # the first field in value should be the key
1042
- # add it if it is not there
1043
- if not value or value[0] != key:
1044
- value = [key]+value
1045
- # verify the value has the correct number of columns
1046
- if self.correctColumnNum != 1 and len(value) == 1:
1047
- # this means we want to clear / delete the key
1048
- self.__delitem__(key)
1049
- elif self.correctColumnNum > 0:
1050
- if len(value) != self.correctColumnNum:
1051
- if self.strict:
1052
- self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}. Refuse adding key...",'error')
1053
- return
1054
- elif self.verbose:
1055
- self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}, correcting...",'warning')
1056
- if len(value) < self.correctColumnNum:
1057
- value += ['']*(self.correctColumnNum-len(value))
1058
- elif len(value) > self.correctColumnNum:
1059
- value = value[:self.correctColumnNum]
1060
- else:
1061
- self.correctColumnNum = len(value)
1062
- if self.defaults and len(self.defaults) > 1:
1063
- for i in range(1,len(value)):
1064
- if not value[i] and i < len(self.defaults) and self.defaults[i]:
1065
- value[i] = self.defaults[i]
1066
- if self.verbose:
1067
- self.__teePrintOrNot(f" Replacing empty value at {i} with default: {self.defaults[i]}")
1068
- if key == DEFAULTS_INDICATOR_KEY:
1069
- self.defaults = value
1070
- if self.verbose:
1071
- self.__teePrintOrNot(f"Defaults set to {value}")
1072
- if not self.memoryOnly:
1073
- self.appendQueue.append(self.delimiter.join(value))
1074
- self.lastUpdateTime = get_time_ns()
1075
- if self.verbose:
1076
- self.__teePrintOrNot(f"Appending Defaults {key} to the appendQueue")
1077
- return
1078
- if self.verbose:
1079
- self.__teePrintOrNot(f"Setting {key} to {value}")
1080
- if key in self:
1081
- if self[key] == value:
1082
- if self.verbose:
1083
- self.__teePrintOrNot(f"Key {key} already exists with the same value")
1084
- return
1085
- self.dirty = True
1086
- # update the dictionary,
1087
- super().__setitem__(key,value)
1088
- if self.memoryOnly:
1089
- if self.verbose:
1090
- self.__teePrintOrNot(f"Key {key} updated in memory only")
1091
- return
1092
- elif key.startswith('#'):
1093
- if self.verbose:
1094
- self.__teePrintOrNot(f"Key {key} updated in memory only as it starts with #")
1095
- return
1096
- if self.verbose:
1097
- self.__teePrintOrNot(f"Appending {key} to the appendQueue")
1098
- self.appendQueue.append(self.delimiter.join(value))
1099
- self.lastUpdateTime = get_time_ns()
1100
- # if not self.appendThread.is_alive():
1101
- # self.commitAppendToFile()
1102
- # else:
1103
- # self.appendEvent.set()
1104
-
1105
-
1106
- def __delitem__(self,key):
1107
- key = str(key).rstrip()
1108
- if key == DEFAULTS_INDICATOR_KEY:
1109
- self.defaults = []
1110
- if self.verbose:
1111
- self.__teePrintOrNot(f"Defaults cleared")
1112
- if not self.memoryOnly:
1113
- self.__appendEmptyLine(key)
1114
- if self.verbose:
1115
- self.__teePrintOrNot(f"Appending empty default line {key}")
1116
- return
1117
- # delete the key from the dictionary and update the file
1118
- if key not in self:
1119
- if self.verbose:
1120
- self.__teePrintOrNot(f"Key {key} not found")
1121
- return
1122
- super().__delitem__(key)
1123
- if self.memoryOnly or key.startswith('#'):
1124
- if self.verbose:
1125
- self.__teePrintOrNot(f"Key {key} deleted in memory")
1126
- return
1127
- self.__appendEmptyLine(key)
1128
- if self.verbose:
1129
- self.__teePrintOrNot(f"Appending empty line {key}")
1130
- self.lastUpdateTime = get_time_ns()
1131
-
1132
- def __appendEmptyLine(self,key):
1133
- self.dirty = True
1134
- if self.correctColumnNum > 0:
1135
- emptyLine = key+self.delimiter*(self.correctColumnNum-1)
1136
- elif len(self[key]) > 1:
1137
- self.correctColumnNum = len(self[key])
1138
- emptyLine = key+self.delimiter*(self.correctColumnNum-1)
1139
- else:
1140
- emptyLine = key
1141
- if self.verbose:
1142
- self.__teePrintOrNot(f"Appending {emptyLine} to the appendQueue")
1143
- self.appendQueue.append(emptyLine)
1144
- return self
1145
-
1146
- def getListView(self):
1147
- return getListView(self,header=self.header,delimiter=self.delimiter)
1148
-
1149
- def clear(self):
1150
- # clear the dictionary and update the file
1151
- super().clear()
1152
- if self.verbose:
1153
- self.__teePrintOrNot(f"Clearing {self._fileName}")
1154
- if self.memoryOnly:
1155
- return self
1156
- self.clear_file()
1157
- self.lastUpdateTime = self.externalFileUpdateTime
1158
- return self
1159
-
1160
- def clear_file(self):
1161
- try:
1162
- if self.header:
1163
- file = self.get_file_obj('wb')
1164
- file.write(self.header.encode(self.encoding,errors='replace') + b'\n')
1165
- self.release_file_obj(file)
1166
- if self.verbose:
1167
- self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
1168
- self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1169
- else:
1170
- file = self.get_file_obj('wb')
1171
- self.release_file_obj(file)
1172
- if self.verbose:
1173
- self.__teePrintOrNot(f"File {self._fileName} cleared empty")
1174
- self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1175
- self.dirty = False
1176
- self.deSynced = False
1177
- except Exception as e:
1178
- self.release_file_obj(file)
1179
- self.__teePrintOrNot(f"Failed to write at clear_file() to {self._fileName}: {e}",'error')
1180
- import traceback
1181
- self.__teePrintOrNot(traceback.format_exc(),'error')
1182
- self.deSynced = True
1183
- return self
1184
-
1185
- def __enter__(self):
1186
- return self
1187
-
1188
- def close(self):
1189
- self.stopAppendThread()
1190
- return self
1191
-
1192
- def __exit__(self,exc_type,exc_value,traceback):
1193
- return self.close()
1194
-
1195
- def __repr__(self):
1196
- return f"""TSVZed(
1047
+ """
1048
+ A thread-safe, file-backed ordered dictionary for managing TSV (Tab-Separated Values) files.
1049
+ TSVZed extends OrderedDict to provide automatic synchronization between an in-memory
1050
+ dictionary and a TSV file on disk. It supports concurrent file access, automatic
1051
+ persistence, and configurable sync strategies.
1052
+ Parameters
1053
+ ----------
1054
+ fileName : str
1055
+ Path to the TSV file to be managed.
1056
+ teeLogger : object, optional
1057
+ Logger object with a teelog method for logging messages. If None, uses print.
1058
+ header : str, optional
1059
+ Column header line for the TSV file. Used for validation and file creation.
1060
+ createIfNotExist : bool, default=True
1061
+ If True, creates the file if it doesn't exist.
1062
+ verifyHeader : bool, default=True
1063
+ If True, verifies that the file header matches the provided header.
1064
+ rewrite_on_load : bool, default=True
1065
+ If True, rewrites the entire file when loading to ensure consistency.
1066
+ rewrite_on_exit : bool, default=False
1067
+ If True, rewrites the entire file when closing/exiting.
1068
+ rewrite_interval : float, default=0
1069
+ Minimum time interval (in seconds) between full file rewrites. 0 means no limit.
1070
+ append_check_delay : float, default=0.01
1071
+ Time delay (in seconds) between checks of the append queue by the worker thread.
1072
+ monitor_external_changes : bool, default=True
1073
+ If True, monitors and detects external file modifications.
1074
+ verbose : bool, default=False
1075
+ If True, prints detailed operation logs.
1076
+ encoding : str, default='utf8'
1077
+ Character encoding for reading/writing the file.
1078
+ delimiter : str, optional
1079
+ Field delimiter character. Auto-detected from filename if not specified.
1080
+ defaults : list or str, optional
1081
+ Default values for columns when values are missing.
1082
+ strict : bool, default=False
1083
+ If True, enforces strict validation of column counts and raises errors on mismatch.
1084
+ correctColumnNum : int, default=-1
1085
+ Expected number of columns. -1 means auto-detect from header or first record.
1086
+ Attributes
1087
+ ----------
1088
+ version : str
1089
+ Version of the TSVZed implementation.
1090
+ dirty : bool
1091
+ True if the in-memory data differs from the file on disk.
1092
+ deSynced : bool
1093
+ True if synchronization with the file has failed or external changes detected.
1094
+ memoryOnly : bool
1095
+ If True, changes are kept in memory only and not written to disk.
1096
+ appendQueue : deque
1097
+ Queue of lines waiting to be appended to the file.
1098
+ writeLock : threading.Lock
1099
+ Lock for ensuring thread-safe file operations.
1100
+ shutdownEvent : threading.Event
1101
+ Event signal for stopping the append worker thread.
1102
+ appendThread : threading.Thread
1103
+ Background thread that handles asynchronous file appending.
1104
+ Methods
1105
+ -------
1106
+ load()
1107
+ Load or reload data from the TSV file.
1108
+ reload()
1109
+ Refresh data from the TSV file, discarding in-memory changes.
1110
+ rewrite(force=False, reloadInternalFromFile=None)
1111
+ Rewrite the entire file with current in-memory data.
1112
+ mapToFile()
1113
+ Synchronize in-memory data to the file using in-place updates.
1114
+ hardMapToFile()
1115
+ Completely rewrite the file from scratch with current data.
1116
+ clear()
1117
+ Clear all data from memory and optionally the file.
1118
+ clear_file()
1119
+ Clear the file, keeping only the header.
1120
+ commitAppendToFile()
1121
+ Write all queued append operations to the file.
1122
+ stopAppendThread()
1123
+ Stop the background append worker thread and perform final sync.
1124
+ setDefaults(defaults)
1125
+ Set default values for columns.
1126
+ getListView()
1127
+ Get a list representation of the data with headers.
1128
+ getResourceUsage(return_dict=False)
1129
+ Get current resource usage statistics.
1130
+ checkExternalChanges()
1131
+ Check if the file has been modified externally.
1132
+ close()
1133
+ Close the TSVZed object, stopping background threads and syncing data.
1134
+ Notes
1135
+ -----
1136
+ - The class uses a background thread to handle asynchronous file operations.
1137
+ - File locking is implemented for both POSIX and Windows systems.
1138
+ - Keys starting with '#' are treated as comments and not persisted to file.
1139
+ - The special key '#DEFAULTS#' is used to store column default values.
1140
+ - Supports compressed file formats through automatic detection.
1141
+ - Thread-safe for concurrent access from multiple threads.
1142
+ Examples
1143
+ --------
1144
+ >>> with TSVZed('data.tsv', header='id\tname\tvalue') as tsv:
1145
+ ... tsv['key1'] = ['key1', 'John', '100']
1146
+ ... tsv['key2'] = ['key2', 'Jane', '200']
1147
+ ... print(tsv['key1'])
1148
+ ['key1', 'John', '100']
1149
+ >>> tsv = TSVZed('data.tsv', verbose=True, rewrite_on_exit=True)
1150
+ >>> tsv['key3'] = 'key3\tBob\t300'
1151
+ >>> tsv.close()
1152
+ """
1153
+ def __teePrintOrNot(self,message,level = 'info'):
1154
+ try:
1155
+ if self.teeLogger:
1156
+ self.teeLogger.teelog(message,level)
1157
+ else:
1158
+ print(message,flush=True)
1159
+ except Exception:
1160
+ print(message,flush=True)
1161
+
1162
+ def getResourceUsage(self,return_dict = False):
1163
+ return get_resource_usage(return_dict = return_dict)
1164
+
1165
+ def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,
1166
+ rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,
1167
+ verbose = False,encoding = 'utf8',delimiter = ...,defaults = None,strict = False,correctColumnNum = -1):
1168
+ super().__init__()
1169
+ self.version = version
1170
+ self.strict = strict
1171
+ self.externalFileUpdateTime = getFileUpdateTimeNs(fileName)
1172
+ self.lastUpdateTime = self.externalFileUpdateTime
1173
+ self._fileName = fileName
1174
+ self.teeLogger = teeLogger
1175
+ self.delimiter = get_delimiter(delimiter,file_name=fileName)
1176
+ self.setDefaults(defaults)
1177
+ self.header = _formatHeader(header,verbose = verbose,teeLogger = self.teeLogger,delimiter=self.delimiter)
1178
+ self.correctColumnNum = correctColumnNum
1179
+ self.createIfNotExist = createIfNotExist
1180
+ self.verifyHeader = verifyHeader
1181
+ self.rewrite_on_load = rewrite_on_load
1182
+ self.rewrite_on_exit = rewrite_on_exit
1183
+ self.rewrite_interval = rewrite_interval
1184
+ self.monitor_external_changes = monitor_external_changes
1185
+ if not monitor_external_changes:
1186
+ self.__teePrintOrNot(f"Warning: External changes monitoring disabled for {self._fileName}. Will overwrite external changes.",'warning')
1187
+ self.verbose = verbose
1188
+ if append_check_delay < 0:
1189
+ append_check_delay = 0.00001
1190
+ self.__teePrintOrNot('append_check_delay cannot be less than 0, setting it to 0.00001','error')
1191
+ self.append_check_delay = append_check_delay
1192
+ self.appendQueue = deque()
1193
+ self.dirty = False
1194
+ self.deSynced = False
1195
+ self.memoryOnly = False
1196
+ self.encoding = encoding
1197
+ self.writeLock = threading.Lock()
1198
+ self.shutdownEvent = threading.Event()
1199
+ #self.appendEvent = threading.Event()
1200
+ self.appendThread = threading.Thread(target=self._appendWorker,daemon=True)
1201
+ self.appendThread.start()
1202
+ self.load()
1203
+ atexit.register(self.stopAppendThread)
1204
+
1205
+ def setDefaults(self,defaults):
1206
+ if not defaults:
1207
+ defaults = []
1208
+ if isinstance(defaults,str):
1209
+ defaults = defaults.split(self.delimiter)
1210
+ elif not isinstance(defaults,list):
1211
+ try:
1212
+ defaults = list(defaults)
1213
+ except Exception:
1214
+ if self.verbose:
1215
+ self.__teePrintOrNot('Invalid defaults, setting defaults to empty.','error')
1216
+ defaults = []
1217
+ defaults = [str(s).rstrip() if s else '' for s in defaults]
1218
+ if not any(defaults):
1219
+ defaults = []
1220
+ if not defaults or defaults[0] != DEFAULTS_INDICATOR_KEY:
1221
+ defaults = [DEFAULTS_INDICATOR_KEY]+defaults
1222
+ self.defaults = defaults
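
A hypothetical sketch of the defaults mechanism: setDefaults stores a row under the DEFAULTS_INDICATOR_KEY sentinel, and __setitem__ below fills empty columns from it; the file name and values are invented.

from TSVZ import TSVZed, DEFAULTS_INDICATOR_KEY

tsv = TSVZed('data.tsv', header='id\towner\tstatus', createIfNotExist=True)
tsv.setDefaults([DEFAULTS_INDICATOR_KEY, 'nobody', 'new'])
tsv['task9'] = ['task9', '', '']   # empty owner/status columns are filled from the defaults
# tsv['task9'] -> ['task9', 'nobody', 'new']
tsv.close()
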
1223
+
1224
+ def load(self):
1225
+ self.reload()
1226
+ if self.rewrite_on_load:
1227
+ self.rewrite(force = True,reloadInternalFromFile = False)
1228
+ return self
1229
+
1230
+ def reload(self):
1231
+ # Load or refresh data from the TSV file
1232
+ mo = self.memoryOnly
1233
+ self.memoryOnly = True
1234
+ if self.verbose:
1235
+ self.__teePrintOrNot(f"Loading {self._fileName}")
1236
+ super().clear()
1237
+ readTabularFile(self._fileName, teeLogger = self.teeLogger, header = self.header,
1238
+ createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader,
1239
+ verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None,
1240
+ strict = self.strict, delimiter = self.delimiter, defaults=self.defaults)
1241
+ if self.verbose:
1242
+ self.__teePrintOrNot(f"Loaded {len(self)} records from {self._fileName}")
1243
+ if self.header and any(self.header) and self.verifyHeader:
1244
+ self.correctColumnNum = len(self.header)
1245
+ elif self:
1246
+ self.correctColumnNum = len(self[next(iter(self))])
1247
+ else:
1248
+ self.correctColumnNum = -1
1249
+ if self.verbose:
1250
+ self.__teePrintOrNot(f"correctColumnNum: {self.correctColumnNum}")
1251
+ #super().update(loadedData)
1252
+ if self.verbose:
1253
+ self.__teePrintOrNot(f"TSVZed({self._fileName}) loaded")
1254
+ self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1255
+ self.lastUpdateTime = self.externalFileUpdateTime
1256
+ self.memoryOnly = mo
1257
+ return self
1258
+
1259
+ def __setitem__(self,key,value):
1260
+ key = str(key).rstrip()
1261
+ if not key:
1262
+ self.__teePrintOrNot('Key cannot be empty','error')
1263
+ return
1264
+ if isinstance(value,str):
1265
+ value = value.split(self.delimiter)
1266
+ # sanitize the value
1267
+ value = [str(s).rstrip() if s else '' for s in value]
1268
+ # the first field in value should be the key
1269
+ # add it if it is not there
1270
+ if not value or value[0] != key:
1271
+ value = [key]+value
1272
+ # verify the value has the correct number of columns
1273
+ if self.correctColumnNum != 1 and len(value) == 1:
1274
+ # this means we want to clear / delete the key
1275
+ del self[key]
1276
+ elif self.correctColumnNum > 0:
1277
+ if len(value) != self.correctColumnNum:
1278
+ if self.strict:
1279
+ self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}. Refuse adding key...",'error')
1280
+ return
1281
+ elif self.verbose:
1282
+ self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}, correcting...",'warning')
1283
+ if len(value) < self.correctColumnNum:
1284
+ value += ['']*(self.correctColumnNum-len(value))
1285
+ elif len(value) > self.correctColumnNum:
1286
+ value = value[:self.correctColumnNum]
1287
+ else:
1288
+ self.correctColumnNum = len(value)
1289
+ if self.defaults and len(self.defaults) > 1:
1290
+ for i in range(1,len(value)):
1291
+ if not value[i] and i < len(self.defaults) and self.defaults[i]:
1292
+ value[i] = self.defaults[i]
1293
+ if self.verbose:
1294
+ self.__teePrintOrNot(f" Replacing empty value at {i} with default: {self.defaults[i]}")
1295
+ if key == DEFAULTS_INDICATOR_KEY:
1296
+ self.defaults = value
1297
+ if self.verbose:
1298
+ self.__teePrintOrNot(f"Defaults set to {value}")
1299
+ if not self.memoryOnly:
1300
+ self.appendQueue.append(value)
1301
+ self.lastUpdateTime = get_time_ns()
1302
+ if self.verbose:
1303
+ self.__teePrintOrNot(f"Appending Defaults {key} to the appendQueue")
1304
+ return
1305
+ if self.verbose:
1306
+ self.__teePrintOrNot(f"Setting {key} to {value}")
1307
+ if key in self:
1308
+ if self[key] == value:
1309
+ if self.verbose:
1310
+ self.__teePrintOrNot(f"Key {key} already exists with the same value")
1311
+ return
1312
+ self.dirty = True
1313
+ # update the dictionary,
1314
+ super().__setitem__(key,value)
1315
+ if self.memoryOnly:
1316
+ if self.verbose:
1317
+ self.__teePrintOrNot(f"Key {key} updated in memory only")
1318
+ return
1319
+ elif key.startswith('#'):
1320
+ if self.verbose:
1321
+ self.__teePrintOrNot(f"Key {key} updated in memory only as it starts with #")
1322
+ return
1323
+ if self.verbose:
1324
+ self.__teePrintOrNot(f"Appending {key} to the appendQueue")
1325
+ self.appendQueue.append(value)
1326
+ self.lastUpdateTime = get_time_ns()
1327
+ # if not self.appendThread.is_alive():
1328
+ # self.commitAppendToFile()
1329
+ # else:
1330
+ # self.appendEvent.set()
1331
+
1332
+ def __getitem__(self, key):
1333
+ return super().__getitem__(str(key).rstrip())
1334
+
1335
+
1336
+ def __delitem__(self,key):
1337
+ key = str(key).rstrip()
1338
+ if key == DEFAULTS_INDICATOR_KEY:
1339
+ self.defaults = [DEFAULTS_INDICATOR_KEY]
1340
+ if self.verbose:
1341
+ self.__teePrintOrNot("Defaults cleared")
1342
+ if not self.memoryOnly:
1343
+ self.__appendEmptyLine(key)
1344
+ if self.verbose:
1345
+ self.__teePrintOrNot(f"Appending empty default line {key}")
1346
+ return
1347
+ # delete the key from the dictionary and update the file
1348
+ if key not in self:
1349
+ if self.verbose:
1350
+ self.__teePrintOrNot(f"Key {key} not found")
1351
+ return
1352
+ super().__delitem__(key)
1353
+ if self.memoryOnly or key.startswith('#'):
1354
+ if self.verbose:
1355
+ self.__teePrintOrNot(f"Key {key} deleted in memory")
1356
+ return
1357
+ self.__appendEmptyLine(key)
1358
+ if self.verbose:
1359
+ self.__teePrintOrNot(f"Appending empty line {key}")
1360
+ self.lastUpdateTime = get_time_ns()
1361
+
1362
+ def __appendEmptyLine(self,key):
1363
+ self.dirty = True
1364
+ if self.correctColumnNum > 0:
1365
+ emptyLine = [key]+[self.delimiter]*(self.correctColumnNum-1)
1366
+ elif len(self[key]) > 1:
1367
+ self.correctColumnNum = len(self[key])
1368
+ emptyLine = [key]+[self.delimiter]*(self.correctColumnNum-1)
1369
+ else:
1370
+ emptyLine = [key]
1371
+ if self.verbose:
1372
+ self.__teePrintOrNot(f"Appending {emptyLine} to the appendQueue")
1373
+ self.appendQueue.append(emptyLine)
1374
+ return self
1375
+
1376
+ def getListView(self):
1377
+ return getListView(self,header=self.header,delimiter=self.delimiter)
1378
+
1379
+ def clear(self):
1380
+ # clear the dictionary and update the file
1381
+ super().clear()
1382
+ if self.verbose:
1383
+ self.__teePrintOrNot(f"Clearing {self._fileName}")
1384
+ if self.memoryOnly:
1385
+ return self
1386
+ self.clear_file()
1387
+ self.lastUpdateTime = self.externalFileUpdateTime
1388
+ return self
1389
+
1390
+ def clear_file(self):
1391
+ try:
1392
+ if self.header:
1393
+ file = self.get_file_obj('wb')
1394
+ header = self.delimiter.join(_sanitize(self.header,delimiter=self.delimiter))
1395
+ file.write(header.encode(self.encoding,errors='replace') + b'\n')
1396
+ self.release_file_obj(file)
1397
+ if self.verbose:
1398
+ self.__teePrintOrNot(f"Header {header} written to {self._fileName}")
1399
+ self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1400
+ else:
1401
+ file = self.get_file_obj('wb')
1402
+ self.release_file_obj(file)
1403
+ if self.verbose:
1404
+ self.__teePrintOrNot(f"File {self._fileName} cleared empty")
1405
+ self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1406
+ self.dirty = False
1407
+ self.deSynced = False
1408
+ except Exception as e:
1409
+ self.release_file_obj(file)
1410
+ self.__teePrintOrNot(f"Failed to write at clear_file() to {self._fileName}: {e}",'error')
1411
+ import traceback
1412
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1413
+ self.deSynced = True
1414
+ return self
1415
+
1416
+ def __enter__(self):
1417
+ return self
1418
+
1419
+ def close(self):
1420
+ self.stopAppendThread()
1421
+ return self
1422
+
1423
+ def __exit__(self,exc_type,exc_value,traceback):
1424
+ return self.close()
1425
+
1426
+ def __repr__(self):
1427
+ return f"""TSVZed(
1197
1428
  file_name:{self._fileName}
1198
1429
  teeLogger:{self.teeLogger}
1199
1430
  header:{self.header}
@@ -1210,372 +1441,860 @@ dirty:{self.dirty}
1210
1441
  deSynced:{self.deSynced}
1211
1442
  memoryOnly:{self.memoryOnly}
1212
1443
  {dict(self)})"""
1213
-
1214
- def __str__(self):
1215
- return f"TSVZed({self._fileName},{dict(self)})"
1216
-
1217
- def __del__(self):
1218
- return self.close()
1219
-
1220
- def popitem(self, last=True):
1221
- key, value = super().popitem(last)
1222
- if not self.memoryOnly:
1223
- self.__appendEmptyLine(key)
1224
- self.lastUpdateTime = get_time_ns()
1225
- return key, value
1226
-
1227
- __marker = object()
1228
-
1229
- def pop(self, key, default=__marker):
1230
- '''od.pop(k[,d]) -> v, remove specified key and return the corresponding
1231
- value. If key is not found, d is returned if given, otherwise KeyError
1232
- is raised.
1233
-
1234
- '''
1235
- if key not in self:
1236
- if default is self.__marker:
1237
- raise KeyError(key)
1238
- return default
1239
- value = super().pop(key)
1240
- if not self.memoryOnly:
1241
- self.__appendEmptyLine(key)
1242
- self.lastUpdateTime = get_time_ns()
1243
- return value
1244
-
1245
- def move_to_end(self, key, last=True):
1246
- '''Move an existing element to the end (or beginning if last is false).
1247
- Raise KeyError if the element does not exist.
1248
- '''
1249
- super().move_to_end(key, last)
1250
- self.dirty = True
1251
- if not self.rewrite_on_exit:
1252
- self.rewrite_on_exit = True
1253
- self.__teePrintOrNot(f"Warning: move_to_end had been called. Need to resync for changes to apply to disk.")
1254
- self.__teePrintOrNot(f"rewrite_on_exit set to True")
1255
- if self.verbose:
1256
- self.__teePrintOrNot(f"Warning: Trying to move Key {key} moved to {'end' if last else 'beginning'} Need to resync for changes to apply to disk")
1257
- self.lastUpdateTime = get_time_ns()
1258
- return self
1259
-
1260
- @classmethod
1261
- def fromkeys(cls, iterable, value=None,fileName = None,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,verbose = False):
1262
- '''Create a new ordered dictionary with keys from iterable and values set to value.
1263
- '''
1264
- self = cls(fileName,teeLogger,header,createIfNotExist,verifyHeader,rewrite_on_load,rewrite_on_exit,rewrite_interval,append_check_delay,verbose)
1265
- for key in iterable:
1266
- self[key] = value
1267
- return self
1268
-
1269
-
1270
- def rewrite(self,force = False,reloadInternalFromFile = None):
1271
- if not self.deSynced and not force:
1272
- if not self.dirty:
1273
- return False
1274
- if self.rewrite_interval == 0 or time.time() - os.path.getmtime(self._fileName) < self.rewrite_interval:
1275
- return False
1276
- try:
1277
-
1278
- if reloadInternalFromFile is None:
1279
- reloadInternalFromFile = self.monitor_external_changes
1280
- if reloadInternalFromFile and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
1281
- # this will be needed if more than 1 process is accessing the file
1282
- self.commitAppendToFile()
1283
- self.reload()
1284
- if self.memoryOnly:
1285
- if self.verbose:
1286
- self.__teePrintOrNot(f"Memory only mode. Map to file skipped.")
1287
- return False
1288
- if self.dirty:
1289
- if self.verbose:
1290
- self.__teePrintOrNot(f"Rewriting {self._fileName}")
1291
- self.mapToFile()
1292
- if self.verbose:
1293
- self.__teePrintOrNot(f"{len(self)} records rewrote to {self._fileName}")
1294
- if not self.appendThread.is_alive():
1295
- self.commitAppendToFile()
1296
- # else:
1297
- # self.appendEvent.set()
1298
- return True
1299
- except Exception as e:
1300
- self.__teePrintOrNot(f"Failed to write at sync() to {self._fileName}: {e}",'error')
1301
- import traceback
1302
- self.__teePrintOrNot(traceback.format_exc(),'error')
1303
- self.deSynced = True
1304
- return False
1305
-
1306
- def hardMapToFile(self):
1307
- try:
1308
- if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
1309
- self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
1310
- file = self.get_file_obj('wb')
1311
- if self.header:
1312
- file.write(self.header.encode(self.encoding,errors='replace') + b'\n')
1313
- for key in self:
1314
- file.write(self.delimiter.join(self[key]).encode(encoding=self.encoding,errors='replace')+b'\n')
1315
- self.release_file_obj(file)
1316
- if self.verbose:
1317
- self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
1318
- self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1319
- self.dirty = False
1320
- self.deSynced = False
1321
- except Exception as e:
1322
- self.release_file_obj(file)
1323
- self.__teePrintOrNot(f"Failed to write at hardMapToFile() to {self._fileName}: {e}",'error')
1324
- import traceback
1325
- self.__teePrintOrNot(traceback.format_exc(),'error')
1326
- self.deSynced = True
1327
- return self
1328
-
1329
- def mapToFile(self):
1330
- mec = self.monitor_external_changes
1331
- self.monitor_external_changes = False
1332
- try:
1333
- if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
1334
- self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
1335
- if self._fileName.rpartition('.')[2] in COMPRESSED_FILE_EXTENSIONS:
1336
- # if the file is compressed, we need to use the hardMapToFile method
1337
- return self.hardMapToFile()
1338
- file = self.get_file_obj('r+b')
1339
- overWrite = False
1340
- if self.header:
1341
- line = file.readline().decode(self.encoding,errors='replace')
1342
- aftPos = file.tell()
1343
- if not _lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = self.strict):
1344
- file.seek(0)
1345
- file.write(f'{self.header}\n'.encode(encoding=self.encoding,errors='replace'))
1346
- # if the header is not the same length as the line, we need to overwrite the file
1347
- if aftPos != file.tell():
1348
- overWrite = True
1349
- if self.verbose:
1350
- self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
1351
- for value in self.values():
1352
- if value[0].startswith('#'):
1353
- continue
1354
- strToWrite = self.delimiter.join(value)
1355
- if overWrite:
1356
- if self.verbose:
1357
- self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
1358
- file.write(strToWrite.encode(encoding=self.encoding,errors='replace')+b'\n')
1359
- continue
1360
- pos = file.tell()
1361
- line = file.readline()
1362
- aftPos = file.tell()
1363
- if not line or pos == aftPos:
1364
- if self.verbose:
1365
- self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
1366
- file.write(strToWrite.encode(encoding=self.encoding,errors='replace'))
1367
- overWrite = True
1368
- continue
1369
- strToWrite = strToWrite.encode(encoding=self.encoding,errors='replace').ljust(len(line)-1)+b'\n'
1370
- if line != strToWrite:
1371
- if self.verbose:
1372
- self.__teePrintOrNot(f"Modifing {value} to {self._fileName}")
1373
- file.seek(pos)
1374
- # fill the string with space to write to the correct length
1375
- #file.write(strToWrite.rstrip('\n').ljust(len(line)-1)+'\n')
1376
- file.write(strToWrite)
1377
- if aftPos != file.tell():
1378
- overWrite = True
1379
- file.truncate()
1380
- self.release_file_obj(file)
1381
- if self.verbose:
1382
- self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
1383
- self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1384
- self.dirty = False
1385
- self.deSynced = False
1386
- except Exception as e:
1387
- self.release_file_obj(file)
1388
- self.__teePrintOrNot(f"Failed to write at mapToFile() to {self._fileName}: {e}",'error')
1389
- import traceback
1390
- self.__teePrintOrNot(traceback.format_exc(),'error')
1391
- self.deSynced = True
1392
- self.__teePrintOrNot("Trying failback hardMapToFile()")
1393
- self.hardMapToFile()
1394
- self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1395
- self.monitor_external_changes = mec
1396
- return self
1397
-
1398
- def checkExternalChanges(self):
1399
- if self.deSynced:
1400
- return self
1401
- if not self.monitor_external_changes:
1402
- return self
1403
- realExternalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1404
- if self.externalFileUpdateTime < realExternalFileUpdateTime:
1405
- self.deSynced = True
1406
- self.__teePrintOrNot(f"External changes detected in {self._fileName}")
1407
- elif self.externalFileUpdateTime > realExternalFileUpdateTime:
1408
- self.__teePrintOrNot(f"Time anomalies detected in {self._fileName}, resetting externalFileUpdateTime")
1409
- self.externalFileUpdateTime = realExternalFileUpdateTime
1410
- return self
1411
-
1412
- def _appendWorker(self):
1413
- while not self.shutdownEvent.is_set():
1414
- if not self.memoryOnly:
1415
- self.checkExternalChanges()
1416
- self.rewrite()
1417
- self.commitAppendToFile()
1418
- time.sleep(self.append_check_delay)
1419
- # self.appendEvent.wait()
1420
- # self.appendEvent.clear()
1421
- if self.verbose:
1422
- self.__teePrintOrNot(f"Append worker for {self._fileName} shut down")
1423
- self.commitAppendToFile()
1424
-
1425
- def commitAppendToFile(self):
1426
- if self.appendQueue:
1427
- if self.memoryOnly:
1428
- self.appendQueue.clear()
1429
- if self.verbose:
1430
- self.__teePrintOrNot(f"Memory only mode. Append queue cleared.")
1431
- return self
1432
- try:
1433
- if self.verbose:
1434
- self.__teePrintOrNot(f"Commiting {len(self.appendQueue)} records to {self._fileName}")
1435
- self.__teePrintOrNot(f"Before size of {self._fileName}: {os.path.getsize(self._fileName)}")
1436
- file = self.get_file_obj('ab')
1437
- while self.appendQueue:
1438
- line = self.appendQueue.popleft()
1439
- file.write(line.encode(encoding=self.encoding,errors='replace')+b'\n')
1440
- self.release_file_obj(file)
1441
- if self.verbose:
1442
- self.__teePrintOrNot(f"Records commited to {self._fileName}")
1443
- self.__teePrintOrNot(f"After size of {self._fileName}: {os.path.getsize(self._fileName)}")
1444
- except Exception as e:
1445
- self.release_file_obj(file)
1446
- self.__teePrintOrNot(f"Failed to write at commitAppendToFile to {self._fileName}: {e}",'error')
1447
- import traceback
1448
- self.__teePrintOrNot(traceback.format_exc(),'error')
1449
- self.deSynced = True
1450
- return self
1451
-
1452
- def stopAppendThread(self):
1453
- try:
1454
- if self.shutdownEvent.is_set():
1455
- # if self.verbose:
1456
- # self.__teePrintOrNot(f"Append thread for {self._fileName} already stopped")
1457
- return
1458
- self.rewrite(force=self.rewrite_on_exit) # Ensure any final sync operations are performed
1459
- # self.appendEvent.set()
1460
- self.shutdownEvent.set() # Signal the append thread to shut down
1461
- self.appendThread.join() # Wait for the append thread to complete
1462
- if self.verbose:
1463
- self.__teePrintOrNot(f"Append thread for {self._fileName} stopped")
1464
- except Exception as e:
1465
- self.__teePrintOrNot(f"Failed to stop append thread for {self._fileName}: {e}",'error')
1466
- import traceback
1467
- self.__teePrintOrNot(traceback.format_exc(),'error')
1468
-
1469
- def get_file_obj(self,modes = 'ab'):
1470
- self.writeLock.acquire()
1471
- try:
1472
- if not self.encoding:
1473
- self.encoding = 'utf8'
1474
- file = openFileAsCompressed(self._fileName, mode=modes, encoding=self.encoding,teeLogger=self.teeLogger)
1475
- # Lock the file after opening
1476
- if os.name == 'posix':
1477
- fcntl.lockf(file, fcntl.LOCK_EX)
1478
- elif os.name == 'nt':
1479
- # For Windows, locking the entire file, avoiding locking an empty file
1480
- #lock_length = max(1, os.path.getsize(self._fileName))
1481
- lock_length = 2147483647
1482
- msvcrt.locking(file.fileno(), msvcrt.LK_LOCK, lock_length)
1483
- if self.verbose:
1484
- self.__teePrintOrNot(f"File {self._fileName} locked with mode {modes}")
1485
- except Exception as e:
1486
- try:
1487
- self.writeLock.release() # Release the thread lock in case of an error
1488
- except Exception as e:
1489
- self.__teePrintOrNot(f"Failed to release writeLock for {self._fileName}: {e}",'error')
1490
- self.__teePrintOrNot(f"Failed to open file {self._fileName}: {e}",'error')
1491
- return file
1492
-
1493
- def release_file_obj(self,file):
1494
- # if write lock is already released, return
1495
- if not self.writeLock.locked():
1496
- return
1497
- try:
1498
- file.flush() # Ensure the file is flushed before unlocking
1499
- os.fsync(file.fileno()) # Ensure the file is synced to disk before unlocking
1500
- if not file.closed:
1501
- if os.name == 'posix':
1502
- fcntl.lockf(file, fcntl.LOCK_UN)
1503
- elif os.name == 'nt':
1504
- # Unlocking the entire file; for Windows, ensure not unlocking an empty file
1505
- #unlock_length = max(1, os.path.getsize(os.path.realpath(file.name)))
1506
- unlock_length = 2147483647
1507
- try:
1508
- msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
1509
- except:
1510
- pass
1511
- file.close() # Ensure file is closed after unlocking
1512
- if self.verbose:
1513
- self.__teePrintOrNot(f"File {file.name} unlocked / released")
1514
- except Exception as e:
1515
- try:
1516
- self.writeLock.release() # Ensure the thread lock is always released
1517
- except Exception as e:
1518
- self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
1519
- self.__teePrintOrNot(f"Failed to release file {file.name}: {e}",'error')
1520
- import traceback
1521
- self.__teePrintOrNot(traceback.format_exc(),'error')
1522
- # release the write lock if not already released
1523
- if self.writeLock.locked():
1524
- try:
1525
- self.writeLock.release() # Ensure the thread lock is always released
1526
- except Exception as e:
1527
- self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
1528
- self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1444
+
1445
+ def __str__(self):
1446
+ return f"TSVZed({self._fileName},{dict(self)})"
1447
+
1448
+ def __del__(self):
1449
+ return self.close()
1450
+
1451
+ def popitem(self, last=True):
1452
+ key, value = super().popitem(last)
1453
+ if not self.memoryOnly:
1454
+ self.__appendEmptyLine(key)
1455
+ self.lastUpdateTime = get_time_ns()
1456
+ return key, value
1457
+
1458
+ __marker = object()
1459
+
1460
+ def pop(self, key, default=__marker):
1461
+ '''od.pop(k[,d]) -> v, remove specified key and return the corresponding
1462
+ value. If key is not found, d is returned if given, otherwise KeyError
1463
+ is raised.
1464
+
1465
+ '''
1466
+ key = str(key).rstrip()
1467
+ if key not in self:
1468
+ if default is self.__marker:
1469
+ raise KeyError(key)
1470
+ return default
1471
+ value = super().pop(key)
1472
+ if not self.memoryOnly:
1473
+ self.__appendEmptyLine(key)
1474
+ self.lastUpdateTime = get_time_ns()
1475
+ return value
1476
+
1477
+ def move_to_end(self, key, last=True):
1478
+ '''Move an existing element to the end (or beginning if last is false).
1479
+ Raise KeyError if the element does not exist.
1480
+ '''
1481
+ key = str(key).rstrip()
1482
+ super().move_to_end(key, last)
1483
+ self.dirty = True
1484
+ if not self.rewrite_on_exit:
1485
+ self.rewrite_on_exit = True
1486
+ self.__teePrintOrNot("Warning: move_to_end has been called. Need to resync for changes to apply to disk.")
1487
+ self.__teePrintOrNot("rewrite_on_exit set to True")
1488
+ if self.verbose:
1489
+ self.__teePrintOrNot(f"Warning: Key {key} moved to {'end' if last else 'beginning'}. Need to resync for changes to apply to disk")
1490
+ self.lastUpdateTime = get_time_ns()
1491
+ return self
1492
+
1493
+ def __sizeof__(self):
1494
+ sizeof = sys.getsizeof
1495
+ size = sizeof(super()) + sizeof(True) * 12 # for the booleans / integers
1496
+ size += sizeof(self.externalFileUpdateTime)
1497
+ size += sizeof(self.lastUpdateTime)
1498
+ size += sizeof(self._fileName)
1499
+ size += sizeof(self.teeLogger)
1500
+ size += sizeof(self.delimiter)
1501
+ size += sizeof(self.defaults)
1502
+ size += sizeof(self.header)
1503
+ size += sizeof(self.appendQueue)
1504
+ size += sizeof(self.encoding)
1505
+ size += sizeof(self.writeLock)
1506
+ size += sizeof(self.shutdownEvent)
1507
+ size += sizeof(self.appendThread)
1508
+ size += super().__sizeof__()
1509
+ return size
1510
+
1511
+ @classmethod
1512
+ def fromkeys(cls, iterable, value=None,fileName = None,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,verbose = False):
1513
+ '''Create a new ordered dictionary with keys from iterable and values set to value.
1514
+ '''
1515
+ self = cls(fileName,teeLogger,header,createIfNotExist,verifyHeader,rewrite_on_load,rewrite_on_exit,rewrite_interval,append_check_delay,verbose)
1516
+ for key in iterable:
1517
+ self[key] = value
1518
+ return self
1519
+
1520
+
1521
+ def rewrite(self,force = False,reloadInternalFromFile = None):
1522
+ if not self.deSynced and not force:
1523
+ if not self.dirty:
1524
+ return False
1525
+ if self.rewrite_interval == 0 or time.time() - os.path.getmtime(self._fileName) < self.rewrite_interval:
1526
+ return False
1527
+ try:
1528
+
1529
+ if reloadInternalFromFile is None:
1530
+ reloadInternalFromFile = self.monitor_external_changes
1531
+ if reloadInternalFromFile and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
1532
+ # this will be needed if more than 1 process is accessing the file
1533
+ self.commitAppendToFile()
1534
+ self.reload()
1535
+ if self.memoryOnly:
1536
+ if self.verbose:
1537
+ self.__teePrintOrNot("Memory only mode. Map to file skipped.")
1538
+ return False
1539
+ if self.dirty:
1540
+ if self.verbose:
1541
+ self.__teePrintOrNot(f"Rewriting {self._fileName}")
1542
+ self.mapToFile()
1543
+ if self.verbose:
1544
+ self.__teePrintOrNot(f"{len(self)} records rewritten to {self._fileName}")
1545
+ if not self.appendThread.is_alive():
1546
+ self.commitAppendToFile()
1547
+ # else:
1548
+ # self.appendEvent.set()
1549
+ return True
1550
+ except Exception as e:
1551
+ self.__teePrintOrNot(f"Failed to write at sync() to {self._fileName}: {e}",'error')
1552
+ import traceback
1553
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1554
+ self.deSynced = True
1555
+ return False
1556
+
1557
+ def hardMapToFile(self):
1558
+ try:
1559
+ if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
1560
+ self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
1561
+ file = self.get_file_obj('wb')
1562
+ buf = io.BufferedWriter(file, buffer_size=64*1024*1024) # 64MB buffer
1563
+ if self.header:
1564
+ header = self.delimiter.join(_sanitize(self.header,delimiter=self.delimiter))
1565
+ buf.write(header.encode(self.encoding,errors='replace') + b'\n')
1566
+ for key in self:
1567
+ segments = _sanitize(self[key],delimiter=self.delimiter)
1568
+ buf.write(self.delimiter.join(segments).encode(encoding=self.encoding,errors='replace')+b'\n')
1569
+ buf.flush()
1570
+ self.release_file_obj(file)
1571
+ if self.verbose:
1572
+ self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
1573
+ self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1574
+ self.dirty = False
1575
+ self.deSynced = False
1576
+ except Exception as e:
1577
+ self.release_file_obj(file)
1578
+ self.__teePrintOrNot(f"Failed to write at hardMapToFile() to {self._fileName}: {e}",'error')
1579
+ import traceback
1580
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1581
+ self.deSynced = True
1582
+ return self
1583
+
1584
+ def mapToFile(self):
1585
+ mec = self.monitor_external_changes
1586
+ self.monitor_external_changes = False
1587
+ try:
1588
+ if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
1589
+ self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
1590
+ if self._fileName.rpartition('.')[2] in COMPRESSED_FILE_EXTENSIONS:
1591
+ # if the file is compressed, we need to use the hardMapToFile method
1592
+ return self.hardMapToFile()
1593
+ file = self.get_file_obj('r+b')
1594
+ overWrite = False
1595
+ if self.header:
1596
+ line = file.readline().decode(self.encoding,errors='replace')
1597
+ aftPos = file.tell()
1598
+ if not _lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = self.strict):
1599
+ header = self.delimiter.join(_sanitize(self.header,delimiter=self.delimiter))
1600
+ file.seek(0)
1601
+ file.write(f'{header}\n'.encode(encoding=self.encoding,errors='replace'))
1602
+ # if the header is not the same length as the line, we need to overwrite the file
1603
+ if aftPos != file.tell():
1604
+ overWrite = True
1605
+ if self.verbose:
1606
+ self.__teePrintOrNot(f"Header {header} written to {self._fileName}")
1607
+ for value in self.values():
1608
+ if value[0].startswith('#'):
1609
+ continue
1610
+ segments = _sanitize(value,delimiter=self.delimiter)
1611
+ strToWrite = self.delimiter.join(segments)
1612
+ if overWrite:
1613
+ if self.verbose:
1614
+ self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
1615
+ file.write(strToWrite.encode(encoding=self.encoding,errors='replace')+b'\n')
1616
+ continue
1617
+ pos = file.tell()
1618
+ line = file.readline()
1619
+ aftPos = file.tell()
1620
+ if not line or pos == aftPos:
1621
+ if self.verbose:
1622
+ self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
1623
+ file.write(strToWrite.encode(encoding=self.encoding,errors='replace'))
1624
+ overWrite = True
1625
+ continue
1626
+ strToWrite = strToWrite.encode(encoding=self.encoding,errors='replace').ljust(len(line)-1)+b'\n'
1627
+ if line != strToWrite:
1628
+ if self.verbose:
1629
+ self.__teePrintOrNot(f"Modifying {value} in {self._fileName}")
1630
+ file.seek(pos)
1631
+ # fill the string with space to write to the correct length
1632
+ file.write(strToWrite)
1633
+ if aftPos != file.tell():
1634
+ overWrite = True
1635
+ file.truncate()
1636
+ self.release_file_obj(file)
1637
+ if self.verbose:
1638
+ self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
1639
+ self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1640
+ self.dirty = False
1641
+ self.deSynced = False
1642
+ except Exception as e:
1643
+ self.release_file_obj(file)
1644
+ self.__teePrintOrNot(f"Failed to write at mapToFile() to {self._fileName}: {e}",'error')
1645
+ import traceback
1646
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1647
+ self.deSynced = True
1648
+ self.__teePrintOrNot("Trying fallback hardMapToFile()")
1649
+ self.hardMapToFile()
1650
+ self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1651
+ self.monitor_external_changes = mec
1652
+ return self
1653
+
1654
+ def checkExternalChanges(self):
1655
+ if self.deSynced:
1656
+ return self
1657
+ if not self.monitor_external_changes:
1658
+ return self
1659
+ realExternalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1660
+ if self.externalFileUpdateTime < realExternalFileUpdateTime:
1661
+ self.deSynced = True
1662
+ self.__teePrintOrNot(f"External changes detected in {self._fileName}")
1663
+ elif self.externalFileUpdateTime > realExternalFileUpdateTime:
1664
+ self.__teePrintOrNot(f"Time anomalies detected in {self._fileName}, resetting externalFileUpdateTime")
1665
+ self.externalFileUpdateTime = realExternalFileUpdateTime
1666
+ return self
1667
+
1668
+ def _appendWorker(self):
1669
+ while not self.shutdownEvent.is_set():
1670
+ if not self.memoryOnly:
1671
+ self.checkExternalChanges()
1672
+ self.rewrite()
1673
+ self.commitAppendToFile()
1674
+ time.sleep(self.append_check_delay)
1675
+ # self.appendEvent.wait()
1676
+ # self.appendEvent.clear()
1677
+ if self.verbose:
1678
+ self.__teePrintOrNot(f"Append worker for {self._fileName} shut down")
1679
+ self.commitAppendToFile()
1680
+
1681
+ def commitAppendToFile(self):
1682
+ if self.appendQueue:
1683
+ if self.memoryOnly:
1684
+ self.appendQueue.clear()
1685
+ if self.verbose:
1686
+ self.__teePrintOrNot("Memory only mode. Append queue cleared.")
1687
+ return self
1688
+ try:
1689
+ if self.verbose:
1690
+ self.__teePrintOrNot(f"Committing {len(self.appendQueue)} records to {self._fileName}")
1691
+ self.__teePrintOrNot(f"Before size of {self._fileName}: {os.path.getsize(self._fileName)}")
1692
+ file = self.get_file_obj('ab')
1693
+ buf = io.BufferedWriter(file, buffer_size=64*1024*1024) # 64MB buffer
1694
+ while self.appendQueue:
1695
+ line = _sanitize(self.appendQueue.popleft(),delimiter=self.delimiter)
1696
+ buf.write(self.delimiter.join(line).encode(encoding=self.encoding,errors='replace')+b'\n')
1697
+ buf.flush()
1698
+ self.release_file_obj(file)
1699
+ if self.verbose:
1700
+ self.__teePrintOrNot(f"Records committed to {self._fileName}")
1701
+ self.__teePrintOrNot(f"After size of {self._fileName}: {os.path.getsize(self._fileName)}")
1702
+ except Exception as e:
1703
+ self.release_file_obj(file)
1704
+ self.__teePrintOrNot(f"Failed to write at commitAppendToFile to {self._fileName}: {e}",'error')
1705
+ import traceback
1706
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1707
+ self.deSynced = True
1708
+ return self
1709
+
1710
+ def stopAppendThread(self):
1711
+ try:
1712
+ if self.shutdownEvent.is_set():
1713
+ # if self.verbose:
1714
+ # self.__teePrintOrNot(f"Append thread for {self._fileName} already stopped")
1715
+ return
1716
+ self.rewrite(force=self.rewrite_on_exit) # Ensure any final sync operations are performed
1717
+ # self.appendEvent.set()
1718
+ self.shutdownEvent.set() # Signal the append thread to shut down
1719
+ self.appendThread.join() # Wait for the append thread to complete
1720
+ if self.verbose:
1721
+ self.__teePrintOrNot(f"Append thread for {self._fileName} stopped")
1722
+ except Exception as e:
1723
+ self.__teePrintOrNot(f"Failed to stop append thread for {self._fileName}: {e}",'error')
1724
+ import traceback
1725
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1726
+
1727
+ def get_file_obj(self,modes = 'ab'):
1728
+ self.writeLock.acquire()
1729
+ try:
1730
+ if not self.encoding:
1731
+ self.encoding = 'utf8'
1732
+ file = openFileAsCompressed(self._fileName, mode=modes, encoding=self.encoding,teeLogger=self.teeLogger)
1733
+ # Lock the file after opening
1734
+ if os.name == 'posix':
1735
+ fcntl.lockf(file, fcntl.LOCK_EX)
1736
+ elif os.name == 'nt':
1737
+ # For Windows, locking the entire file, avoiding locking an empty file
1738
+ #lock_length = max(1, os.path.getsize(self._fileName))
1739
+ lock_length = 2147483647
1740
+ msvcrt.locking(file.fileno(), msvcrt.LK_LOCK, lock_length)
1741
+ if self.verbose:
1742
+ self.__teePrintOrNot(f"File {self._fileName} locked with mode {modes}")
1743
+ except Exception as e:
1744
+ try:
1745
+ self.writeLock.release() # Release the thread lock in case of an error
1746
+ except Exception as e:
1747
+ self.__teePrintOrNot(f"Failed to release writeLock for {self._fileName}: {e}",'error')
1748
+ self.__teePrintOrNot(f"Failed to open file {self._fileName}: {e}",'error')
1749
+ return file
1750
+
1751
+ def release_file_obj(self,file):
1752
+ # if write lock is already released, return
1753
+ if not self.writeLock.locked():
1754
+ return
1755
+ try:
1756
+ file.flush() # Ensure the file is flushed before unlocking
1757
+ os.fsync(file.fileno()) # Ensure the file is synced to disk before unlocking
1758
+ if not file.closed:
1759
+ if os.name == 'posix':
1760
+ fcntl.lockf(file, fcntl.LOCK_UN)
1761
+ elif os.name == 'nt':
1762
+ # Unlocking the entire file; for Windows, ensure not unlocking an empty file
1763
+ #unlock_length = max(1, os.path.getsize(os.path.realpath(file.name)))
1764
+ unlock_length = 2147483647
1765
+ try:
1766
+ msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
1767
+ except Exception:
1768
+ pass
1769
+ file.close() # Ensure file is closed after unlocking
1770
+ if self.verbose:
1771
+ self.__teePrintOrNot(f"File {file.name} unlocked / released")
1772
+ except Exception as e:
1773
+ try:
1774
+ self.writeLock.release() # Ensure the thread lock is always released
1775
+ except Exception as e:
1776
+ self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
1777
+ self.__teePrintOrNot(f"Failed to release file {file.name}: {e}",'error')
1778
+ import traceback
1779
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1780
+ # release the write lock if not already released
1781
+ if self.writeLock.locked():
1782
+ try:
1783
+ self.writeLock.release() # Ensure the thread lock is always released
1784
+ except Exception as e:
1785
+ self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
1786
+ self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1787
+
1788
+ class TSVZedLite(MutableMapping):
1789
+ """
1790
+ A mutable mapping class that provides a dictionary-like interface to a Tabular (TSV by default) file.
1791
+ TSVZedLite stores key-value pairs where each row in the file represents an entry, with the first
1792
+ column serving as the key. The class maintains an in-memory index of file positions for efficient
1793
+ random access while keeping the actual data on disk.
1794
+ TSVZedLite is designed for a light memory footprint and forgoes some features of TSVZed. Notably:
1795
+ - Does not support simultaneous multi-process access.
1796
+ - Does not support compressed file formats.
1797
+ - Does not support automatic file rewriting on load / exit / periodically.
1798
+ - Does not support append worker thread for background writes.
1799
+ - Does not support external file change monitoring.
1800
+ - Does not support in-place updates; updates are append-only.
1801
+ - Does not support logging via teeLogger.
1802
+ - Does not support move_to_end method.
1803
+ - Does not support in-memory-only mode (please just use a dict).
1804
+ - Does not lock the file during operations.
1805
+ - Does not track last update times.
1806
+
1807
+ However, it may be preferred in scenarios when:
1808
+ - Memory usage needs to be minimized.
1809
+ - Working with extremely large datasets where loading everything into memory is impractical.
1810
+ - Simplicity and ease of use are prioritized over advanced features.
1811
+ - The dataset is primarily write-only with infrequent reads.
1812
+ - The application can tolerate the lack of concurrency control (single-process access only).
1813
+ - The underlying file system is fast and supports constant-time random seeks (e.g., SSD).
1814
+
1815
+ Note: It is possible to load a custom dict-like object for indexes (such as a TSVZed or a pre-built dict)
1816
+ to avoid reading the entire data file when building the indexes at startup.
1817
+ Index consistency is not enforced in this case.
1818
+ An error is raised if a mismatch happens (only whether the key exists in the file is checked) and strict mode is enabled.
1819
+ When using an external file-backed index, this can function similarly to a key-value store (like NoSQL).
1820
+
1821
+ Parameters
1822
+ ----------
1823
+ fileName : str
1824
+ Path to the Tabular file to read from or create.
1825
+ header : str, optional
1826
+ Header row for the file. Can be a delimited string or empty string (default: '').
1827
+ createIfNotExist : bool, optional
1828
+ If True, creates the file if it doesn't exist (default: True).
1829
+ verifyHeader : bool, optional
1830
+ If True, verifies that the file header matches the provided header (default: True).
1831
+ verbose : bool, optional
1832
+ If True, prints detailed operation information to stderr (default: False).
1833
+ encoding : str, optional
1834
+ Character encoding for the file (default: 'utf8').
1835
+ delimiter : str, optional
1836
+ Field delimiter character. If Ellipsis (...), automatically detects from filename (default: ...).
1837
+ defaults : str, list, or None, optional
1838
+ Default values for columns. Can be a delimited string, list, or None (default: None).
1839
+ strict : bool, optional
1840
+ If True, enforces strict column count validation and raises errors on mismatches (default: True).
1841
+ correctColumnNum : int, optional
1842
+ Expected number of columns. -1 means auto-detect (default: -1).
1843
+ indexes : dict, optional
1844
+ Pre-existing index dictionary mapping keys to file positions (default: ...).
1845
+ fileObj : file object, optional
1846
+ Pre-existing file object to use (default: ...).
1847
+ Attributes
1848
+ ----------
1849
+ version : str
1850
+ Version identifier for the TSVZedLite format.
1851
+ indexes : dict
1852
+ Dictionary mapping keys to their file positions (or in-memory data for keys starting with '#').
1853
+ fileObj : file object
1854
+ Binary file object for reading/writing the underlying file.
1855
+ defaults : list
1856
+ List of default values for columns, with DEFAULTS_INDICATOR_KEY as the first element.
1857
+ correctColumnNum : int
1858
+ The validated number of columns per row.
1859
+ Notes
1860
+ -----
1861
+ - Keys starting with '#' are stored in memory only and not written to file.
1862
+ - The special key DEFAULTS_INDICATOR_KEY is used to store and retrieve default column values.
1863
+ - Empty values in rows are automatically filled with defaults if available.
1864
+ - The class implements the MutableMapping interface, providing dict-like operations.
1865
+ - File operations are buffered and written immediately (append-only for updates).
1866
+ - Deleted entries are marked by writing a row with only the key (empty values).
1867
+ Examples
1868
+ --------
1869
+ >>> db = TSVZedLite('data.tsv', header='id\tname\tage')
1870
+ >>> db['user1'] = ['user1', 'Alice', '30']
1871
+ >>> print(db['user1'])
1872
+ ['user1', 'Alice', '30']
1873
+ >>> del db['user1']
1874
+ >>> 'user1' in db
1875
+ False
1876
+ See Also
1877
+ --------
1878
+ collections.abc.MutableMapping : The abstract base class that this class implements.
1879
+ """
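+ # Illustrative sketch (not part of this module; `build_index` is a hypothetical helper) of the
+ # external-index mode described in the Note above. load() only runs when `indexes` is left at
+ # its default, so passing a pre-built index and a caller-managed file object skips the startup scan:
+ #
+ #     prebuilt = build_index('data.tsv')        # hypothetical: {key: byte offset of that key's row}
+ #     fh = open('data.tsv', 'r+b')              # caller-managed binary file object
+ #     db = TSVZedLite('data.tsv', header='id\tname\tage', indexes=prebuilt, fileObj=fh)
+ #     db['user1']                               # seeks to the stored offset and parses that row
+ #
+ # Index consistency is the caller's responsibility in this mode, as stated above.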
1880
+
1881
+ #['__new__', '__repr__', '__hash__', '__lt__', '__le__', '__eq__', '__ne__', '__gt__', '__ge__', '__iter__', '__init__',
1882
+ # '__or__', '__ror__', '__ior__', '__len__', '__getitem__', '__setitem__', '__delitem__', '__contains__', '__sizeof__',
1883
+ # 'get', 'setdefault', 'pop', 'popitem', 'keys', 'items', 'values', 'update', 'fromkeys', 'clear', 'copy', '__reversed__',
1884
+ # '__class_getitem__', '__doc__']
1885
+ def __init__ (self,fileName,header = '',createIfNotExist = True,verifyHeader = True,
1886
+ verbose = False,encoding = 'utf8',
1887
+ delimiter = ...,defaults = None,strict = True,correctColumnNum = -1,
1888
+ indexes = ..., fileObj = ...
1889
+ ):
1890
+ self.version = version
1891
+ self.strict = strict
1892
+ self._fileName = fileName
1893
+ self.delimiter = get_delimiter(delimiter,file_name=fileName)
1894
+ self.setDefaults(defaults)
1895
+ self.header = _formatHeader(header,verbose = verbose,delimiter=self.delimiter)
1896
+ self.correctColumnNum = correctColumnNum
1897
+ self.createIfNotExist = createIfNotExist
1898
+ self.verifyHeader = verifyHeader
1899
+ self.verbose = verbose
1900
+ self.encoding = encoding
1901
+ if indexes is ...:
1902
+ self.indexes = dict()
1903
+ self.load()
1904
+ else:
1905
+ self.indexes = indexes
1906
+ if fileObj is ...:
1907
+ self.fileObj = open(self._fileName,'r+b')
1908
+ else:
1909
+ self.fileObj = fileObj
1910
+ atexit.register(self.close)
1911
+
1912
+ # Implement custom methods just for TSVZedLite
1913
+ def getResourceUsage(self,return_dict = False):
1914
+ return get_resource_usage(return_dict = return_dict)
1915
+
1916
+ def setDefaults(self,defaults):
1917
+ if not defaults:
1918
+ defaults = []
1919
+ if isinstance(defaults,str):
1920
+ defaults = defaults.split(self.delimiter)
1921
+ elif not isinstance(defaults,list):
1922
+ try:
1923
+ defaults = list(defaults)
1924
+ except Exception:
1925
+ if self.verbose:
1926
+ eprint('Error: Invalid defaults, setting defaults to empty.')
1927
+ defaults = []
1928
+ defaults = [str(s).rstrip() if s else '' for s in defaults]
1929
+ if not any(defaults):
1930
+ defaults = []
1931
+ if not defaults or defaults[0] != DEFAULTS_INDICATOR_KEY:
1932
+ defaults = [DEFAULTS_INDICATOR_KEY]+defaults
1933
+ self.defaults = defaults
1934
+
1935
+ def load(self):
1936
+ if self.verbose:
1937
+ eprint(f"Loading {self._fileName}")
1938
+ readTabularFile(self._fileName, header = self.header, createIfNotExist = self.createIfNotExist,
1939
+ verifyHeader = self.verifyHeader, verbose = self.verbose, taskDic = self.indexes,
1940
+ encoding = self.encoding if self.encoding else None, strict = self.strict,
1941
+ delimiter = self.delimiter, defaults=self.defaults,storeOffset=True)
1942
+ return self
1943
+
1944
+ def positions(self):
1945
+ return self.indexes.values()
1946
+
1947
+ def reload(self):
1948
+ self.indexes.clear()
1949
+ return self.load()
1950
+
1951
+ def getListView(self):
1952
+ return getListView(self,header=self.header,delimiter=self.delimiter)
1953
+
1954
+ def clear_file(self):
1955
+ if self.verbose:
1956
+ eprint(f"Clearing {self._fileName}")
1957
+ self.fileObj.seek(0)
1958
+ self.fileObj.truncate()
1959
+ if self.verbose:
1960
+ eprint(f"File {self._fileName} cleared empty")
1961
+ if self.header:
1962
+ location = self.__writeValues(self.header)
1963
+ if self.verbose:
1964
+ eprint(f"Header {self.header} written to {self._fileName}")
1965
+ eprint(f"At {location} size: {self.fileObj.tell()}")
1966
+ return self
1967
+
1968
+ def switchFile(self,newFileName,createIfNotExist = ...,verifyHeader = ...):
1969
+ if createIfNotExist is ...:
1970
+ createIfNotExist = self.createIfNotExist
1971
+ if verifyHeader is ...:
1972
+ verifyHeader = self.verifyHeader
1973
+ self.fileObj.close()
1974
+ self._fileName = newFileName
1975
+ self.reload()
1976
+ self.fileObj = open(self._fileName,'r+b')
1977
+ self.createIfNotExist = createIfNotExist
1978
+ self.verifyHeader = verifyHeader
1979
+ return self
1980
+
1981
+ # Private methods for reading and writing values for TSVZedLite
1982
+
1983
+ def __writeValues(self,data):
1984
+ self.fileObj.seek(0, os.SEEK_END)
1985
+ write_at = self.fileObj.tell()
1986
+ if self.verbose:
1987
+ eprint(f"Writing at position {write_at}")
1988
+ data = _sanitize(data,delimiter=self.delimiter)
1989
+ data = self.delimiter.join(data)
1990
+ written = self.fileObj.write(data.encode(encoding=self.encoding,errors='replace') + b'\n')
1991
+ if self.verbose:
1992
+ eprint(f"Wrote {written} bytes")
1993
+ return write_at
1994
+
1995
+ def __mapDeleteToFile(self,key):
1996
+ if key == DEFAULTS_INDICATOR_KEY:
1997
+ self.defaults = [DEFAULTS_INDICATOR_KEY]
1998
+ if self.verbose:
1999
+ eprint("Defaults cleared")
2000
+ # delete the key from the dictionary and update the file
2001
+ elif key not in self.indexes:
2002
+ if self.verbose:
2003
+ eprint(f"Key {key} not found")
2004
+ return
2005
+ elif key.startswith('#'):
2006
+ if self.verbose:
2007
+ eprint(f"Key {key} deleted in memory")
2008
+ return
2009
+ if self.verbose:
2010
+ eprint(f"Appending empty line {key}")
2011
+ self.indexes[key] = self.__writeValues([key])
2012
+
2013
+ def __readValuesAtPos(self,pos,key = ...):
2014
+ self.fileObj.seek(pos)
2015
+ line = self.fileObj.readline().decode(self.encoding,errors='replace')
2016
+ self.correctColumnNum, segments = _processLine(
2017
+ line=line,
2018
+ taskDic={},
2019
+ correctColumnNum=self.correctColumnNum,
2020
+ strict=self.strict,
2021
+ delimiter=self.delimiter,
2022
+ defaults=self.defaults,
2023
+ storeOffset=True,
2024
+ )
2025
+ if self.verbose:
2026
+ eprint(f"Read at position {pos}: {segments}")
2027
+ if key is not ... and segments[0] != key:
2028
+ eprint(f"Warning: Key mismatch at position {pos}: expected {key}, got {segments[0]}")
2029
+ if self.strict:
2030
+ eprint("Error: Key mismatch and strict mode enabled. Raising KeyError.")
2031
+ raise KeyError(key)
2032
+ else:
2033
+ eprint("Continuing despite key mismatch due to non-strict mode. Expect errors!")
2034
+ return segments
2035
+
2036
+ # Implement the basic __getitem__, __setitem__, __delitem__, __iter__, and __len__ needed for MutableMapping
2037
+ def __getitem__(self,key):
2038
+ key = str(key).rstrip()
2039
+ if key not in self.indexes:
2040
+ if key == DEFAULTS_INDICATOR_KEY:
2041
+ return self.defaults
2042
+ raise KeyError(key)
2043
+ pos = self.indexes[key]
2044
+ return self.__readValuesAtPos(pos,key)
2045
+
2046
+ def __setitem__(self,key,value):
2047
+ key = str(key).rstrip()
2048
+ if not key:
2049
+ eprint('Error: Key cannot be empty')
2050
+ return
2051
+ if isinstance(value,str):
2052
+ value = value.split(self.delimiter)
2053
+ # sanitize the value
2054
+ value = [str(s).rstrip() if s else '' for s in value]
2055
+ # the first field in value should be the key
2056
+ # add it if it is not there
2057
+ if not value or value[0] != key:
2058
+ value = [key]+value
2059
+ # verify the value has the correct number of columns
2060
+ if self.correctColumnNum != 1 and len(value) == 1:
2061
+ # this means we want to clear / delete the key
2062
+ del self[key]
2063
+ elif self.correctColumnNum > 0:
2064
+ if len(value) != self.correctColumnNum:
2065
+ if self.strict:
2066
+ eprint(f"Error: Value {value} does not have the correct number of columns: {self.correctColumnNum}. Refusing to add key...")
2067
+ return
2068
+ elif self.verbose:
2069
+ eprint(f"Warning: Value {value} does not have the correct number of columns: {self.correctColumnNum}, correcting...")
2070
+ if len(value) < self.correctColumnNum:
2071
+ value += ['']*(self.correctColumnNum-len(value))
2072
+ elif len(value) > self.correctColumnNum:
2073
+ value = value[:self.correctColumnNum]
2074
+ else:
2075
+ self.correctColumnNum = len(value)
2076
+ if self.defaults and len(self.defaults) > 1:
2077
+ for i in range(1,len(value)):
2078
+ if not value[i] and i < len(self.defaults) and self.defaults[i]:
2079
+ value[i] = self.defaults[i]
2080
+ if self.verbose:
2081
+ eprint(f" Replacing empty value at {i} with default: {self.defaults[i]}")
2082
+ if key == DEFAULTS_INDICATOR_KEY:
2083
+ self.defaults = value
2084
+ if self.verbose:
2085
+ eprint(f"Defaults set to {value}")
2086
+ elif key.startswith('#'):
2087
+ if self.verbose:
2088
+ eprint(f"Key {key} updated in memory (data in index) as it starts with #")
2089
+ self.indexes[key] = value
2090
+ return
2091
+ if self.verbose:
2092
+ eprint(f"Writing {key}: {value}")
2093
+ self.indexes[key] = self.__writeValues(value)
2094
+
2095
+ def __delitem__(self,key):
2096
+ key = str(key).rstrip()
2097
+ self.indexes.pop(key,None)
2098
+ self.__mapDeleteToFile(key)
2099
+
2100
+ def __iter__(self):
2101
+ return iter(self.indexes)
2102
+
2103
+ def __len__(self):
2104
+ return len(self.indexes)
2105
+
2106
+ # Implement additional methods for a dict-like interface (the order of functions roughly follows OrderedDict)
2107
+ def __reversed__(self):
2108
+ return reversed(self.indexes)
2109
+
2110
+ def clear(self):
2111
+ # clear the dictionary and update the file
2112
+ self.indexes.clear()
2113
+ self.clear_file()
2114
+ return self
2115
+
2116
+ def popitem(self, last=True,return_pos = False):
2117
+ if last:
2118
+ key, pos = self.indexes.popitem()
2119
+ else:
2120
+ try:
2121
+ key = next(iter(self.indexes))
2122
+ pos = self.indexes.pop(key)
2123
+ except StopIteration:
2124
+ raise KeyError("popitem(): dictionary is empty")
2125
+ if return_pos:
2126
+ value = pos
2127
+ else:
2128
+ value = self.__readValuesAtPos(pos,key)
2129
+ self.__mapDeleteToFile(key)
2130
+ return key, value
2131
+
2132
+ __marker = object()
2133
+ def pop(self, key, default=__marker, return_pos = False):
2134
+ key = str(key).rstrip()
2135
+ try:
2136
+ pos = self.indexes.pop(key)
2137
+ except KeyError:
2138
+ if default is self.__marker:
2139
+ raise KeyError(key)
2140
+ elif default is ...:
2141
+ return self.defaults
2142
+ return default
2143
+ if return_pos:
2144
+ value = pos
2145
+ else:
2146
+ value = self.__readValuesAtPos(pos,key)
2147
+ self.__mapDeleteToFile(key)
2148
+ return value
2149
+
2150
+ def __sizeof__(self):
2151
+ sizeof = sys.getsizeof
2152
+ size = sizeof(super()) + sizeof(True) * 6 # for the booleans / integers
2153
+ size += sizeof(self._fileName)
2154
+ size += sizeof(self.header)
2155
+ size += sizeof(self.encoding)
2156
+ size += sizeof(self.delimiter)
2157
+ size += sizeof(self.defaults)
2158
+ size += sizeof(self.indexes)
2159
+ size += sizeof(self.fileObj)
2160
+ return size
2161
+
2162
+ def __repr__(self):
2163
+ return f"""TSVZedLite at {hex(id(self))}(
2164
+ file_name:{self._fileName}
2165
+ index_count:{len(self.indexes)}
2166
+ header:{self.header}
2167
+ correctColumnNum:{self.correctColumnNum}
2168
+ createIfNotExist:{self.createIfNotExist}
2169
+ verifyHeader:{self.verifyHeader}
2170
+ strict:{self.strict}
2171
+ delimiter:{self.delimiter}
2172
+ defaults:{self.defaults}
2173
+ verbose:{self.verbose}
2174
+ encoding:{self.encoding}
2175
+ file_descriptor:{self.fileObj.fileno()}
2176
+ )"""
2177
+
2178
+ def __str__(self):
2179
+ return f"TSVZedLite({self._fileName})"
2180
+
2181
+ def __reduce__(self):
2182
+ 'Return state information for pickling'
2183
+ # Return minimal state needed to reconstruct
2184
+ return (
2185
+ self.__class__,
2186
+ (self._fileName, self.header, self.createIfNotExist, self.verifyHeader,
2187
+ self.verbose, self.encoding, self.delimiter, self.defaults, self.strict,
2188
+ self.correctColumnNum),
2189
+ None,
2190
+ None,
2191
+ None
2192
+ )
2193
+ def copy(self):
2194
+ 'Return a shallow copy of the ordered dictionary.'
2195
+ new = self.__class__(
2196
+ self._fileName,
2197
+ self.header,
2198
+ self.createIfNotExist,
2199
+ self.verifyHeader,
2200
+ self.verbose,
2201
+ self.encoding,
2202
+ self.delimiter,
2203
+ self.defaults,
2204
+ self.strict,
2205
+ self.correctColumnNum,
2206
+ self.indexes,
2207
+ self.fileObj,
2208
+ )
2209
+ eprint("""
2210
+ Warning: Copying TSVZedLite will share the same file object and indexes.
2211
+ Changes in one will affect the other.
2212
+ There is likely very little reason to copy a TSVZedLite instance unless you are immediately calling switchFile() on it.
2213
+ """)
2214
+ return new
2215
+
2216
+ @classmethod
2217
+ def fromkeys(cls, iterable, value=None,fileName = None,header = '',createIfNotExist = True,verifyHeader = True,verbose = False,encoding = 'utf8',
2218
+ delimiter = ...,defaults = None,strict = True,correctColumnNum = -1):
2219
+ '''Create a new ordered dictionary with keys from iterable and values set to value.
2220
+ '''
2221
+ self = cls(fileName,header,createIfNotExist,verifyHeader,verbose,encoding,delimiter,defaults,strict,correctColumnNum)
2222
+ for key in iterable:
2223
+ self[key] = value
2224
+ return self
2225
+
2226
+ def __eq__(self, other):
2227
+ if isinstance(other, TSVZedLite):
2228
+ eprint("Warning: Comparing two TSVZedLite instances will only compare their indexes. Data content is not compared.")
2229
+ return self.indexes == other.indexes
2230
+ return super().__eq__(other)
2231
+
2232
+ def __ior__(self, other):
2233
+ self.update(other)
2234
+ return self
2235
+
2236
+ # Implement context manager methods
2237
+ def __enter__(self):
2238
+ return self
2239
+
2240
+ def close(self):
2241
+ self.fileObj.close()
2242
+ return self
2243
+
2244
+ def __exit__(self,exc_type,exc_value,traceback):
2245
+ return self.close()
2246
+
2247
+
1529
2248
 
1530
2249
 
1531
2250
  def __main__():
1532
- import argparse
1533
- parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
1534
- parser.add_argument('filename', type=str, help='The file to read')
1535
- parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear','scrub'], help='The operation to perform. Note: scrub will also remove all comments. Default: read', default='read')
1536
- parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
1537
- parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
1538
- parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
1539
- parser.add_argument('--defaults', type=str, help='Default values to fill in the missing columns. seperate using --delimiter. Ex. if -d = comma, --defaults="key,value1,value2..." Note: Please specify the key. But it will not be used as a key need to be unique in data.')
1540
- strictMode = parser.add_mutually_exclusive_group()
1541
- strictMode.add_argument('-s', '--strict', dest = 'strict',action='store_true', help='Strict mode. Do not parse values that seems malformed, check for column numbers / headers')
1542
- strictMode.add_argument('-f', '--force', dest = 'strict',action='store_false', help='Force the operation. Ignore checks for column numbers / headers')
1543
- parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
1544
- parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} @ {COMMIT_DATE} by {author}')
1545
- args = parser.parse_args()
1546
- args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
1547
- if args.header and args.header.endswith('\\'):
1548
- args.header += '\\'
1549
- try:
1550
- header = args.header.encode().decode('unicode_escape') if args.header else ''
1551
- except Exception:
1552
- print(f"Failed to decode header: {args.header}")
1553
- header = ''
1554
- defaults = []
1555
- if args.defaults:
1556
- try:
1557
- defaults = args.defaults.encode().decode('unicode_escape').split(args.delimiter)
1558
- except Exception:
1559
- print(f"Failed to decode defaults: {args.defaults}")
1560
- defaults = []
1561
-
1562
- if args.operation == 'read':
1563
- # check if the file exist
1564
- if not os.path.isfile(args.filename):
1565
- print(f"File not found: {args.filename}")
1566
- return
1567
- # read the file
1568
- data = readTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
1569
- print(pretty_format_table(data.values(),delimiter=args.delimiter))
1570
- elif args.operation == 'append':
1571
- appendTabularFile(args.filename, args.line,createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
1572
- elif args.operation == 'delete':
1573
- appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
1574
- elif args.operation == 'clear':
1575
- clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=args.strict, delimiter=args.delimiter)
1576
- elif args.operation == 'scrub':
1577
- scrubTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
1578
- else:
1579
- print("Invalid operation")
2251
+ import argparse
2252
+ parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
2253
+ parser.add_argument('filename', type=str, help='The file to read')
2254
+ parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear','scrub'], help='The operation to perform. Note: scrub will also remove all comments. Default: read', default='read')
2255
+ parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
2256
+ parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
2257
+ parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
2258
+ parser.add_argument('--defaults', type=str, help='Default values to fill in the missing columns. seperate using --delimiter. Ex. if -d = comma, --defaults="key,value1,value2..." Note: Please specify the key. But it will not be used as a key need to be unique in data.')
2259
+ strictMode = parser.add_mutually_exclusive_group()
2260
+ strictMode.add_argument('-s', '--strict', dest = 'strict',action='store_true', help='Strict mode. Do not parse values that seems malformed, check for column numbers / headers')
2261
+ strictMode.add_argument('-f', '--force', dest = 'strict',action='store_false', help='Force the operation. Ignore checks for column numbers / headers')
2262
+ parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
2263
+ parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} @ {COMMIT_DATE} by {author}')
2264
+ args = parser.parse_args()
2265
+ args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
2266
+ if args.header and args.header.endswith('\\'):
2267
+ args.header += '\\'
2268
+ try:
2269
+ header = args.header.encode().decode('unicode_escape') if args.header else ''
2270
+ except Exception:
2271
+ print(f"Failed to decode header: {args.header}")
2272
+ header = ''
2273
+ defaults = []
2274
+ if args.defaults:
2275
+ try:
2276
+ defaults = args.defaults.encode().decode('unicode_escape').split(args.delimiter)
2277
+ except Exception:
2278
+ print(f"Failed to decode defaults: {args.defaults}")
2279
+ defaults = []
2280
+
2281
+ if args.operation == 'read':
2282
+ # check if the file exist
2283
+ if not os.path.isfile(args.filename):
2284
+ print(f"File not found: {args.filename}")
2285
+ return
2286
+ # read the file
2287
+ data = readTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
2288
+ print(pretty_format_table(data.values(),delimiter=args.delimiter))
2289
+ elif args.operation == 'append':
2290
+ appendTabularFile(args.filename, args.line,createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
2291
+ elif args.operation == 'delete':
2292
+ appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
2293
+ elif args.operation == 'clear':
2294
+ clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=args.strict, delimiter=args.delimiter)
2295
+ elif args.operation == 'scrub':
2296
+ scrubTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
2297
+ else:
2298
+ print("Invalid operation")
1580
2299
  if __name__ == '__main__':
1581
- __main__()
2300
+ __main__()
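+
+ # Illustrative invocations (a sketch, assuming the module is run directly as TSVZ.py;
+ # operations and flags are taken from the argparse definitions above):
+ #
+ #     python TSVZ.py data.tsv                          # default operation: read, pretty-prints the table
+ #     python TSVZ.py data.csv append user1 Alice 30 -d comma
+ #     python TSVZ.py data.tsv delete user1             # appends a key-only row marking user1 as deleted
+ #     python TSVZ.py data.tsv scrub                    # scrub; per the help text, also removes all comments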