TSVZ 2.70__py3-none-any.whl → 3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
TSVZ.py CHANGED
@@ -4,59 +4,91 @@ from collections import OrderedDict , deque
4
4
  import time
5
5
  import atexit
6
6
  import threading
7
+ import re
7
8
 
8
9
  if os.name == 'nt':
9
10
  import msvcrt
10
11
  elif os.name == 'posix':
11
12
  import fcntl
12
13
 
13
- version = '2.70'
14
+ version = '3.10'
14
15
  author = 'pan@zopyr.us'
15
16
 
17
+ DEFAULT_DELIMITER = '\t'
18
+ DEFAULTS_INDICATOR_KEY = '#_defaults_#'
16
19
 
17
- def pretty_format_table(data):
18
- version = 1.0
19
- if not data:
20
- return ''
21
- if type(data) == str:
22
- data = data.strip('\n').split('\n')
23
- data = [line.split('\t') for line in data]
24
- elif isinstance(data, dict):
25
- # flatten the 2D dict to a list of lists
26
- if isinstance(next(iter(data.values())), dict):
27
- tempData = [['key'] + list(next(iter(data.values())).keys())]
28
- tempData.extend( [[key] + list(value.values()) for key, value in data.items()])
29
- data = tempData
30
- else:
31
- # it is a dict of lists
32
- data = [[key] + list(value) for key, value in data.items()]
33
- elif type(data) != list:
34
- data = list(data)
35
- # format the list into 2d list of list of strings
36
- if isinstance(data[0], dict):
37
- tempData = [data[0].keys()]
38
- tempData.extend([list(item.values()) for item in data])
39
- data = tempData
40
- data = [[str(item) for item in row] for row in data]
41
- num_cols = len(data[0])
42
- col_widths = [0] * num_cols
43
- # Calculate the maximum width of each column
44
- for c in range(num_cols):
45
- col_widths[c] = max(len(row[c]) for row in data)
46
- # Build the row format string
47
- row_format = ' | '.join('{{:<{}}}'.format(width) for width in col_widths)
48
- # Print the header
49
- header = data[0]
50
- outTable = []
51
- outTable.append(row_format.format(*header))
52
- outTable.append('-+-'.join('-' * width for width in col_widths))
53
- for row in data[1:]:
54
- # if the row is empty, print an divider
55
- if not any(row):
56
- outTable.append('-+-'.join('-' * width for width in col_widths))
57
- else:
58
- outTable.append(row_format.format(*row))
59
- return '\n'.join(outTable) + '\n'
20
def get_delimiter(delimiter,file_name = ''):
    """
    Resolve a delimiter specification into the actual delimiter string.

    Parameters:
    - delimiter (str, Ellipsis or None): The delimiter specification.
        - A falsy value (None or '') returns the current module-level DEFAULT_DELIMITER.
        - Ellipsis (...) infers the delimiter from the extension of file_name:
          '.csv' -> comma, '.nsv' -> NUL, '.psv' -> pipe, anything else -> tab.
        - 'comma', 'tab', 'pipe' and 'null' are accepted as names.
        - Any other string is used as the delimiter after its backslash escape
          sequences (for example a literal backslash followed by 't') are decoded.
    - file_name (str, optional): The file name used for inference when delimiter is Ellipsis. Defaults to ''.

    Returns:
    - str: The resolved delimiter.

    Side effects:
    - Every successfully resolved, non-falsy specification is stored in the
      module-level DEFAULT_DELIMITER, so a later call with a falsy delimiter
      returns the most recently resolved one.
    """
    # Without this declaration the assignment at the bottom turns DEFAULT_DELIMITER
    # into a local name and the early return below raises UnboundLocalError.
    global DEFAULT_DELIMITER
    if not delimiter:
        return DEFAULT_DELIMITER
    if delimiter is ...:
        extension_delimiters = (('.csv', ','), ('.nsv', '\0'), ('.psv', '|'))
        rtn = '\t'
        # A missing file name (None or '') simply falls back to tab.
        if file_name:
            for extension, extension_delimiter in extension_delimiters:
                if file_name.endswith(extension):
                    rtn = extension_delimiter
                    break
    else:
        named_delimiters = {'comma': ',', 'tab': '\t', 'pipe': '|', 'null': '\0'}
        if delimiter in named_delimiters:
            rtn = named_delimiters[delimiter]
        else:
            # latin-1 + backslashreplace round-trips through unicode_escape without
            # mangling non-ASCII delimiters, unlike the default UTF-8 encode.
            rtn = delimiter.encode('latin-1', 'backslashreplace').decode('unicode_escape')
    DEFAULT_DELIMITER = rtn
    return rtn
46
+
47
def pretty_format_table(data, delimiter = DEFAULT_DELIMITER):
    """
    Render tabular data as an aligned, pipe-separated text table.

    The first row is treated as the header and is followed by a divider line.
    A row whose cells are all empty is rendered as another divider line.

    Parameters:
    - data (str, dict, list or iterable): The table to format.
        - str: lines separated by newlines, cells separated by delimiter.
        - dict of dicts: each key becomes the first cell of its row; a header row
          ('key' + the field names of the first value) is generated.
        - dict of iterables: each key becomes the first cell of its row.
        - list / iterable of dicts: the keys of the first item form the header.
        - list / iterable of rows: used as is.
    - delimiter (str, optional): The cell delimiter used when data is a string.

    Returns:
    - str: The formatted table ending with a newline, or '' if data is empty.
    """
    if not data:
        return ''
    if isinstance(data, str):
        data = [line.split(delimiter) for line in data.strip('\n').split('\n')]
    elif isinstance(data, dict):
        first_value = next(iter(data.values()))
        if isinstance(first_value, dict):
            # flatten the 2D dict to a list of lists, generating a header row
            rows = [['key'] + list(first_value.keys())]
            rows.extend([key] + list(value.values()) for key, value in data.items())
            data = rows
        else:
            # it is a dict of lists
            data = [[key] + list(value) for key, value in data.items()]
    elif not isinstance(data, list):
        data = list(data)
        # an exhausted iterator is truthy, so emptiness has to be re-checked here
        if not data:
            return ''
    # format the list into 2d list of list of strings
    if isinstance(data[0], dict):
        rows = [list(data[0].keys())]
        rows.extend(list(item.values()) for item in data)
        data = rows
    data = [[str(item) for item in row] for row in data]
    num_cols = len(data[0])
    # pad short rows so ragged input cannot raise IndexError; cells beyond the
    # header width are ignored when rendering
    data = [row + [''] * (num_cols - len(row)) for row in data]
    # ANSI escape sequences occupy no space on screen, so widths and padding
    # are computed on the visible length only
    ansi_escape = re.compile(r'\x1b\[[0-?]*[ -/]*[@-~]')

    def visible_len(cell):
        return len(ansi_escape.sub('', cell))

    col_widths = [max(visible_len(row[c]) for row in data) for c in range(num_cols)]
    divider = '-+-'.join('-' * width for width in col_widths)

    def render(row):
        # str.format pads on the raw length, which misaligns coloured cells,
        # so the padding is applied manually
        return ' | '.join(cell + ' ' * (width - visible_len(cell)) for cell, width in zip(row, col_widths))

    outTable = [render(data[0]), divider]
    for row in data[1:]:
        # if the row is empty, print a divider
        outTable.append(render(row) if any(row) else divider)
    return '\n'.join(outTable) + '\n'
60
92
 
61
93
  def __teePrintOrNot(message,level = 'info',teeLogger = None):
62
94
  """
@@ -78,7 +110,7 @@ def __teePrintOrNot(message,level = 'info',teeLogger = None):
78
110
  except Exception as e:
79
111
  print(message,flush=True)
80
112
 
81
- def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True):
113
+ def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True,delimiter = DEFAULT_DELIMITER,defaults = []):
82
114
  """
83
115
  Process a line of text and update the task dictionary.
84
116
 
@@ -89,47 +121,52 @@ def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,s
89
121
  verbose (bool, optional): Whether to print verbose output. Defaults to False.
90
122
  teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
91
123
  strict (bool, optional): Whether to strictly enforce the correct number of columns. Defaults to True.
124
+ defaults (list, optional): The default values to use for missing columns. Defaults to [].
92
125
 
93
126
  Returns:
94
127
  tuple: A tuple containing the updated correctColumnNum and the processed lineCache.
95
128
 
96
129
  """
97
- line = line.decode().strip(' ').strip('\x00')
130
+ line = line.strip(' ').strip('\x00').rstrip('\r\n')
98
131
  # we throw away the lines that start with '#'
99
132
  if not line :
100
133
  if verbose:
101
134
  __teePrintOrNot(f"Ignoring empty line: {line}",teeLogger=teeLogger)
102
135
  return correctColumnNum , []
103
- if line.startswith('#'):
136
+ if line.startswith('#') and not line.startswith(DEFAULTS_INDICATOR_KEY):
104
137
  if verbose:
105
138
  __teePrintOrNot(f"Ignoring comment line: {line}",teeLogger=teeLogger)
106
139
  return correctColumnNum , []
107
140
  # we only interested in the lines that have the correct number of columns
108
- lineCache = [segment.strip() for segment in line.split('\t')]
141
+ lineCache = [segment.rstrip() for segment in line.split(delimiter)]
109
142
  if not lineCache:
110
143
  return correctColumnNum , []
111
144
  if correctColumnNum == -1:
145
+ if defaults and len(defaults) > 1:
146
+ correctColumnNum = len(defaults)
147
+ else:
148
+ correctColumnNum = len(lineCache)
112
149
  if verbose:
113
150
  __teePrintOrNot(f"detected correctColumnNum: {len(lineCache)}",teeLogger=teeLogger)
114
- correctColumnNum = len(lineCache)
115
151
  if not lineCache[0]:
116
152
  if verbose:
117
153
  __teePrintOrNot(f"Ignoring line with empty key: {line}",teeLogger=teeLogger)
118
154
  return correctColumnNum , []
119
155
  if len(lineCache) == 1 or not any(lineCache[1:]):
120
- if correctColumnNum == 1: taskDic[lineCache[0]] = lineCache
156
+ if correctColumnNum == 1:
157
+ taskDic[lineCache[0]] = lineCache
158
+ elif lineCache[0] == DEFAULTS_INDICATOR_KEY:
159
+ if verbose:
160
+ __teePrintOrNot(f"Empty defaults line found: {line}",teeLogger=teeLogger)
161
+ defaults = []
121
162
  else:
122
163
  if verbose:
123
164
  __teePrintOrNot(f"Key {lineCache[0]} found with empty value, deleting such key's representaion",teeLogger=teeLogger)
124
165
  if lineCache[0] in taskDic:
125
166
  del taskDic[lineCache[0]]
126
167
  return correctColumnNum , []
127
- elif len(lineCache) == correctColumnNum:
128
- taskDic[lineCache[0]] = lineCache
129
- if verbose:
130
- __teePrintOrNot(f"Key {lineCache[0]} added",teeLogger=teeLogger)
131
- else:
132
- if strict:
168
+ elif len(lineCache) != correctColumnNum:
169
+ if strict and not any(defaults):
133
170
  if verbose:
134
171
  __teePrintOrNot(f"Ignoring line with {len(lineCache)} columns: {line}",teeLogger=teeLogger)
135
172
  return correctColumnNum , []
@@ -139,12 +176,26 @@ def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,s
139
176
  lineCache += ['']*(correctColumnNum-len(lineCache))
140
177
  elif len(lineCache) > correctColumnNum:
141
178
  lineCache = lineCache[:correctColumnNum]
142
- taskDic[lineCache[0]] = lineCache
143
179
  if verbose:
144
- __teePrintOrNot(f"Key {lineCache[0]} added after correction",teeLogger=teeLogger)
180
+ __teePrintOrNot(f"Correcting {lineCache[0]}",teeLogger=teeLogger)
181
+ # now replace empty values with defaults
182
+ if defaults and len(defaults) > 1:
183
+ for i in range(1,len(lineCache)):
184
+ if not lineCache[i] and i < len(defaults) and defaults[i]:
185
+ lineCache[i] = defaults[i]
186
+ if verbose:
187
+ __teePrintOrNot(f"Replacing empty value at {i} with default: {defaults[i]}",teeLogger=teeLogger)
188
+ if lineCache[0] == DEFAULTS_INDICATOR_KEY:
189
+ if verbose:
190
+ __teePrintOrNot(f"Defaults line found: {line}",teeLogger=teeLogger)
191
+ defaults = lineCache
192
+ return correctColumnNum , []
193
+ taskDic[lineCache[0]] = lineCache
194
+ if verbose:
195
+ __teePrintOrNot(f"Key {lineCache[0]} added",teeLogger=teeLogger)
145
196
  return correctColumnNum, lineCache
146
197
 
147
- def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False):
198
+ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,encoding = 'utf8',delimiter = ...,defaults = []):
148
199
  """
149
200
  Reads the last valid line from a file.
150
201
 
@@ -154,13 +205,17 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
154
205
  correctColumnNum (int): A column number to pass to processLine function.
155
206
  verbose (bool, optional): Whether to print verbose output. Defaults to False.
156
207
  teeLogger (optional): Logger to use for tee print. Defaults to None.
208
+ encoding (str, optional): The encoding used to decode the lines of the file. Defaults to 'utf8'.
157
209
  strict (bool, optional): Whether to enforce strict processing. Defaults to False.
210
+ delimiter (str, optional): The delimiter used in the file. Defaults to ... (Ellipsis), which infers the delimiter from the file extension.
211
+ defaults (list, optional): The default values to use for missing columns. Defaults to [].
158
212
 
159
213
  Returns:
160
214
  list: The last valid line data processed by processLine, or an empty list if none found.
161
215
  """
162
216
  chunk_size = 1024 # Read in chunks of 1024 bytes
163
217
  last_valid_line = []
218
+ delimiter = get_delimiter(delimiter,file_name=fileName)
164
219
  if verbose:
165
220
  __teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
166
221
  with open(fileName, 'rb') as file:
@@ -186,13 +241,15 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
186
241
  for i in range(len(lines) - 1, -1, -1):
187
242
  if lines[i].strip(): # Skip empty lines
188
243
  # Process the line
189
- correctColumnNum, lineCache = processLine(
190
- lines[i],
191
- taskDic,
192
- correctColumnNum,
244
+ correctColumnNum, lineCache = _processLine(
245
+ line=lines[i].decode(encoding=encoding),
246
+ taskDic=taskDic,
247
+ correctColumnNum=correctColumnNum,
193
248
  verbose=verbose,
194
249
  teeLogger=teeLogger,
195
- strict=strict
250
+ strict=strict,
251
+ delimiter=delimiter,
252
+ defaults=defaults,
196
253
  )
197
254
  # If the line is valid, return it
198
255
  if lineCache and any(lineCache):
@@ -204,7 +261,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
204
261
  # Return empty list if no valid line found
205
262
  return last_valid_line
206
263
 
207
- def formatHeader(header,verbose = False,teeLogger = None):
264
+ def _formatHeader(header,verbose = False,teeLogger = None,delimiter = DEFAULT_DELIMITER):
208
265
  """
209
266
  Format the header string.
210
267
 
@@ -218,12 +275,12 @@ def formatHeader(header,verbose = False,teeLogger = None):
218
275
  """
219
276
  if type(header) != str:
220
277
  try:
221
- header = '\t'.join(header)
278
+ header = delimiter.join(header)
222
279
  except:
223
280
  if verbose:
224
281
  __teePrintOrNot('Invalid header, setting header to empty.','error',teeLogger=teeLogger)
225
282
  header = ''
226
- header = header.strip()
283
+ header = delimiter.join([segment.rstrip() for segment in header.split(delimiter)])
227
284
  # if header:
228
285
  # if not header.endswith('\n'):
229
286
  # header += '\n'
@@ -231,7 +288,7 @@ def formatHeader(header,verbose = False,teeLogger = None):
231
288
  # header = ''
232
289
  return header
233
290
 
234
- def lineContainHeader(header,line,verbose = False,teeLogger = None,strict = False):
291
+ def _lineContainHeader(header,line,verbose = False,teeLogger = None,strict = False,delimiter = DEFAULT_DELIMITER):
235
292
  """
236
293
  Verify if a line contains the header.
237
294
 
@@ -245,26 +302,24 @@ def lineContainHeader(header,line,verbose = False,teeLogger = None,strict = Fals
245
302
  Returns:
246
303
  bool: True if the header matches the line, False otherwise.
247
304
  """
248
- escapedHeader = repr(header.strip())
249
- escapedLine = repr(line.strip())
305
+ header = [segment.rstrip() for segment in header.split(delimiter)]
306
+ line = [segment.rstrip() for segment in line.split(delimiter)]
250
307
  if verbose:
251
- __teePrintOrNot(f"Header: \n{escapedHeader}",teeLogger=teeLogger)
252
- __teePrintOrNot(f"First line: \n{escapedLine}",teeLogger=teeLogger)
253
- headerList = header.strip().lower().split('\t')
254
- lineList = line.strip().lower().split('\t')
255
- if len(headerList) != len(lineList) or any([headerList[i] not in lineList[i] for i in range(len(headerList))]):
256
- __teePrintOrNot(f"Header mismatch: \n{escapedLine} \n!= \n{escapedHeader}",teeLogger=teeLogger)
308
+ __teePrintOrNot(f"Header: \n{header}",teeLogger=teeLogger)
309
+ __teePrintOrNot(f"First line: \n{line}",teeLogger=teeLogger)
310
+ if len(header) != len(line) or any([header[i] not in line[i] for i in range(len(header))]):
311
+ __teePrintOrNot(f"Header mismatch: \n{line} \n!= \n{header}",teeLogger=teeLogger)
257
312
  if strict:
258
313
  raise Exception("Data format error! Header mismatch")
259
314
  return False
260
315
  return True
261
316
 
262
- def verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = None,header = '',encoding = 'utf8',strict = True):
317
+ def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,header = '',encoding = 'utf8',strict = True,delimiter = DEFAULT_DELIMITER):
263
318
  """
264
- Verify the existence of a TSV file.
319
+ Verify the existence of the tabular file.
265
320
 
266
321
  Parameters:
267
- - fileName (str): The path of the TSV file.
322
+ - fileName (str): The path of the tabular file.
268
323
  - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to True.
269
324
  - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
270
325
  - header (str, optional): The header line to verify against. Defaults to ''.
@@ -274,8 +329,14 @@ def verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = None,header
274
329
  Returns:
275
330
  bool: True if the file exists, False otherwise.
276
331
  """
277
- if not fileName.endswith('.tsv'):
332
+ if delimiter and delimiter == '\t' and not fileName.endswith('.tsv'):
278
333
  __teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
334
+ elif delimiter and delimiter == ',' and not fileName.endswith('.csv'):
335
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
336
+ elif delimiter and delimiter == '\0' and not fileName.endswith('.nsv'):
337
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
338
+ elif delimiter and delimiter == '|' and not fileName.endswith('.psv'):
339
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
279
340
  if not os.path.isfile(fileName):
280
341
  if createIfNotExist:
281
342
  with open(fileName, mode ='w',encoding=encoding)as file:
@@ -289,14 +350,15 @@ def verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = None,header
289
350
  return False
290
351
  return True
291
352
 
292
- def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True):
353
+ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t',defaults = []):
293
354
  """
294
- Read a TSV (Tab-Separated Values) file and return the data as a dictionary.
355
+ Compatibility method, calls readTabularFile.
356
+ Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
295
357
 
296
358
  Parameters:
297
- - fileName (str): The path to the TSV file.
359
+ - fileName (str): The path to the Tabular file.
298
360
  - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
299
- - header (str or list, optional): The header of the TSV file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
361
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
300
362
  - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
301
363
  - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
302
364
  - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
@@ -304,9 +366,38 @@ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, last
304
366
  - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
305
367
  - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
306
368
  - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
369
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t'.
370
+ - defaults (list, optional): The default values to use for missing columns. Defaults to [].
307
371
 
308
372
  Returns:
309
- - OrderedDict: The dictionary containing the data from the TSV file.
373
+ - OrderedDict: The dictionary containing the data from the Tabular file.
374
+
375
+ Raises:
376
+ - Exception: If the file is not found or there is a data format error.
377
+
378
+ """
379
+ return readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
380
+
381
+ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...,defaults = []):
382
+ """
383
+ Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
384
+
385
+ Parameters:
386
+ - fileName (str): The path to the Tabular file.
387
+ - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
388
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a delimiter-separated list of column names. If a list, it should contain the column names. Defaults to ''.
389
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
390
+ - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
391
+ - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
392
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
393
+ - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
394
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
395
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
396
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to ... (Ellipsis), which infers it from the file extension: ',' for .csv, '\0' for .nsv, '|' for .psv, and '\t' otherwise.
397
+ - defaults (list, optional): The default values to use for missing columns. Defaults to [].
398
+
399
+ Returns:
400
+ - OrderedDict: The dictionary containing the data from the Tabular file.
310
401
 
311
402
  Raises:
312
403
  - Exception: If the file is not found or there is a data format error.
@@ -314,33 +405,55 @@ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, last
314
405
  """
315
406
  if taskDic is None:
316
407
  taskDic = {}
317
- header = formatHeader(header,verbose = verbose,teeLogger = teeLogger)
318
- if not verifyTSVExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict):
408
+ delimiter = get_delimiter(delimiter,file_name=fileName)
409
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
410
+ if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
319
411
  return taskDic
320
412
  with open(fileName, mode ='rb')as file:
321
413
  correctColumnNum = -1
322
- if header.strip():
414
+ if header.rstrip():
323
415
  if verifyHeader:
324
- line = file.readline().decode().strip()
325
- if lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
326
- correctColumnNum = len(header.strip().split('\t'))
416
+ line = file.readline().decode(encoding=encoding)
417
+ if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
418
+ correctColumnNum = len(header.split(delimiter))
327
419
  if verbose:
328
420
  __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
329
421
  if lastLineOnly:
330
- lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict)
422
+ lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict, delimiter=delimiter, defaults=defaults)
331
423
  if lineCache:
332
424
  taskDic[lineCache[0]] = lineCache
333
425
  return lineCache
334
426
  for line in file:
335
- correctColumnNum, lineCache = processLine(line,taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict)
427
+ correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter,defaults = defaults)
336
428
  return taskDic
337
429
 
338
- def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True):
430
+ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = '\t'):
431
+ """
432
+ Compatibility method, calls appendTabularFile.
433
+ Append a line of data to a Tabular file.
434
+ Parameters:
435
+ - fileName (str): The path of the Tabular file.
436
+ - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
437
+ - teeLogger (optional): A logger object for logging messages.
438
+ - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
439
+ - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
440
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
441
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
442
+ - encoding (str, optional): The encoding of the file.
443
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
444
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t'.
445
+ Raises:
446
+ - Exception: If the file does not exist and createIfNotExist is False.
447
+ - Exception: If the existing header does not match the provided header.
448
+ """
449
+ return appendTabularFile(fileName,lineToAppend,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding, strict = strict, delimiter = delimiter)
450
+
451
+ def appendTabularFile(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = ...):
339
452
  """
340
- Append a line of data to a TSV file.
453
+ Append a line of data to a Tabular file.
341
454
  Parameters:
342
- - fileName (str): The path of the TSV file.
343
- - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by tabs ('\t') to form a list.
455
+ - fileName (str): The path of the Tabular file.
456
+ - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
344
457
  - teeLogger (optional): A logger object for logging messages.
345
458
  - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
346
459
  - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
@@ -348,15 +461,17 @@ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExis
348
461
  - verbose (bool, optional): If True, additional information will be printed during the execution.
349
462
  - encoding (str, optional): The encoding of the file.
350
463
  - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
464
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to ... (Ellipsis), which infers it from the file extension: ',' for .csv, '\0' for .nsv, '|' for .psv, and '\t' otherwise.
351
465
  Raises:
352
466
  - Exception: If the file does not exist and createIfNotExist is False.
353
467
  - Exception: If the existing header does not match the provided header.
354
468
  """
355
- header = formatHeader(header,verbose = verbose,teeLogger = teeLogger)
356
- if not verifyTSVExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict):
469
+ delimiter = get_delimiter(delimiter,file_name=fileName)
470
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
471
+ if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
357
472
  return
358
473
  if type(lineToAppend) == str:
359
- lineToAppend = lineToAppend.strip().split('\t')
474
+ lineToAppend = lineToAppend.split(delimiter)
360
475
  else:
361
476
  for i in range(len(lineToAppend)):
362
477
  if type(lineToAppend[i]) != str:
@@ -367,11 +482,11 @@ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExis
367
482
 
368
483
  with open(fileName, mode ='r+b')as file:
369
484
  correctColumnNum = len(lineToAppend)
370
- if header.strip():
485
+ if header.rstrip():
371
486
  if verifyHeader:
372
- line = file.readline().decode().strip()
373
- if lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
374
- correctColumnNum = len(header.strip().split('\t'))
487
+ line = file.readline().decode(encoding=encoding)
488
+ if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
489
+ correctColumnNum = len(header.split(delimiter))
375
490
  if verbose:
376
491
  __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
377
492
  # truncate / fill the lineToAppend to the correct number of columns
@@ -383,15 +498,16 @@ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExis
383
498
  file.seek(-1, os.SEEK_END)
384
499
  if file.read(1) != b'\n':
385
500
  file.write(b'\n')
386
- file.write('\t'.join(lineToAppend).encode() + b'\n')
501
+ file.write(get_delimiter(delimiter).join(lineToAppend).encode(encoding=encoding) + b'\n')
387
502
  if verbose:
388
503
  __teePrintOrNot(f"Appended {lineToAppend} to {fileName}",teeLogger=teeLogger)
389
504
 
390
- def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False):
505
+ def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = '\t'):
391
506
  """
392
- Clear the contents of a TSV file. Will create if not exist.
507
+ Compatibility method, calls clearTabularFile.
508
+ Clear the contents of a Tabular file. Will create if not exist.
393
509
  Parameters:
394
- - fileName (str): The path of the TSV file.
510
+ - fileName (str): The path of the Tabular file.
395
511
  - teeLogger (optional): A logger object for logging messages.
396
512
  - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
397
513
  - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
@@ -399,14 +515,29 @@ def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose
399
515
  - encoding (str, optional): The encoding of the file.
400
516
  - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
401
517
  """
402
- header = formatHeader(header,verbose = verbose,teeLogger = teeLogger)
403
- if not verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False):
518
+ return clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
519
+
520
+ def clearTabularFile(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = ...):
521
+ """
522
+ Clear the contents of a Tabular file. Will create if not exist.
523
+ Parameters:
524
+ - fileName (str): The path of the Tabular file.
525
+ - teeLogger (optional): A logger object for logging messages.
526
+ - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
527
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
528
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
529
+ - encoding (str, optional): The encoding of the file.
530
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
531
+ """
532
+ delimiter = get_delimiter(delimiter,file_name=fileName)
533
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
534
+ if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
404
535
  raise Exception("Something catastrophic happened! File still not found after creation")
405
536
  else:
406
537
  with open(fileName, mode ='r+',encoding=encoding)as file:
407
- if header.strip() and verifyHeader:
408
- line = file.readline().strip()
409
- if not lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
538
+ if header.rstrip() and verifyHeader:
539
+ line = file.readline()
540
+ if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
410
541
  __teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
411
542
  file.truncate()
412
543
  else:
@@ -442,14 +573,17 @@ class TSVZed(OrderedDict):
442
573
  except Exception as e:
443
574
  print(message,flush=True)
444
575
 
445
- def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,verbose = False,encoding = None):
576
+ def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,verbose = False,encoding = 'utf8',delimiter = ...,defualts = [],strict = False):
446
577
  super().__init__()
447
578
  self.version = version
579
+ self.strict = strict
448
580
  self.externalFileUpdateTime = getFileUpdateTimeNs(fileName)
449
581
  self.lastUpdateTime = self.externalFileUpdateTime
450
582
  self._fileName = fileName
451
583
  self.teeLogger = teeLogger
452
- self.header = formatHeader(header,verbose = verbose,teeLogger = self.teeLogger)
584
+ self.delimiter = get_delimiter(delimiter,file_name=fileName)
585
+ self.defaults = defualts
586
+ self.header = _formatHeader(header,verbose = verbose,teeLogger = self.teeLogger,delimiter=self.delimiter)
453
587
  self.correctColumnNum = -1
454
588
  self.createIfNotExist = createIfNotExist
455
589
  self.verifyHeader = verifyHeader
@@ -477,6 +611,27 @@ class TSVZed(OrderedDict):
477
611
  self.load()
478
612
  atexit.register(self.stopAppendThread)
479
613
 
614
def setDefaults(self, defaults):
    """
    Replace the per-column default values stored on this instance.

    Parameters:
    - defaults (str | iterable | None): Either a string that is split on self.delimiter,
      a list, or any other iterable that list() can consume. If the first element is not
      DEFAULTS_INDICATOR_KEY, that key is prepended so index 0 always holds the indicator.

    Falsy input, input that cannot be converted to a list, and input whose elements are
    all falsy clear the stored defaults (self.defaults becomes []).
    """
    if isinstance(defaults, str):
        defaults = defaults.split(self.delimiter)
    elif defaults and not isinstance(defaults, list):
        try:
            defaults = list(defaults)
        except Exception:
            if self.verbose:
                self.__teePrintOrNot('Invalid defaults, setting defaults to empty.','error')
            defaults = []
    # Every "nothing usable" case must reset the attribute itself; rebinding only the
    # local name would leave the previous defaults silently in effect.
    if not defaults or not any(defaults):
        self.defaults = []
        return
    if defaults[0] != DEFAULTS_INDICATOR_KEY:
        defaults = [DEFAULTS_INDICATOR_KEY]+defaults
    self.defaults = defaults
634
+
480
635
  def load(self):
481
636
  self.reload()
482
637
  if self.rewrite_on_load:
@@ -490,10 +645,10 @@ class TSVZed(OrderedDict):
490
645
  if self.verbose:
491
646
  self.__teePrintOrNot(f"Loading {self._fileName}")
492
647
  super().clear()
493
- readTSV(self._fileName, teeLogger = self.teeLogger, header = self.header, createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader, verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None)
648
+ readTabularFile(self._fileName, teeLogger = self.teeLogger, header = self.header, createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader, verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None, strict = self.strict, delimiter = self.delimiter, defaults=self.defaults)
494
649
  if self.verbose:
495
650
  self.__teePrintOrNot(f"Loaded {len(self)} records from {self._fileName}")
496
- self.correctColumnNum = len(self.header.split('\t')) if (self.header and self.verifyHeader) else (len(self[next(iter(self))]) if self else -1)
651
+ self.correctColumnNum = len(self.header.split(self.delimiter)) if (self.header and self.verifyHeader) else (len(self[next(iter(self))]) if self else -1)
497
652
  if self.verbose:
498
653
  self.__teePrintOrNot(f"correctColumnNum: {self.correctColumnNum}")
499
654
  #super().update(loadedData)
@@ -505,30 +660,55 @@ class TSVZed(OrderedDict):
505
660
  return self
506
661
 
507
662
  def __setitem__(self,key,value):
508
- key = str(key).strip()
663
+ key = str(key).rstrip()
509
664
  if not key:
510
665
  self.__teePrintOrNot('Key cannot be empty','error')
511
666
  return
512
667
  if type(value) == str:
513
- value = value.strip().split('\t')
668
+ value = value.split(self.delimiter)
514
669
  # sanitize the value
515
- value = [(str(segment).strip() if type(segment) != str else segment.strip()) if segment else '' for segment in value]
516
- #value = list(map(lambda segment: str(segment).strip(), value))
670
+ value = [(str(segment).rstrip() if type(segment) != str else segment.rstrip()) if segment else '' for segment in value]
671
+ # escape the delimiter and newline characters
672
+ value = [segment.replace(self.delimiter,'<sep>').replace('\n','\\n') for segment in value]
517
673
  # the first field in value should be the key
518
674
  # add it if it is not there
519
675
  if not value or value[0] != key:
520
676
  value = [key]+value
521
677
  # verify the value has the correct number of columns
522
678
  if self.correctColumnNum != 1 and len(value) == 1:
523
- # this means we want to clear / deelte the key
679
+ # this means we want to clear / delete the key
524
680
  self.__delitem__(key)
525
681
  elif self.correctColumnNum > 0:
526
- assert len(value) == self.correctColumnNum, f"Data format error! Expected {self.correctColumnNum} columns, but got {len(value) } columns"
682
+ if len(value) != self.correctColumnNum:
683
+ if self.strict:
684
+ self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}. Refuse adding key...",'error')
685
+ return
686
+ elif self.verbose:
687
+ self.__teePrintOrNot(f"Value {value} does not have the correct number of columns: {self.correctColumnNum}, correcting...",'warning')
688
+ if len(value) < self.correctColumnNum:
689
+ value += ['']*(self.correctColumnNum-len(value))
690
+ elif len(value) > self.correctColumnNum:
691
+ value = value[:self.correctColumnNum]
527
692
  else:
528
693
  self.correctColumnNum = len(value)
694
+ if self.defaults and len(self.defaults) > 1:
695
+ for i in range(1,len(value)):
696
+ if not value[i] and i < len(self.defaults) and self.defaults[i]:
697
+ value[i] = self.defaults[i]
698
+ if self.verbose:
699
+ self.__teePrintOrNot(f" Replacing empty value at {i} with default: {self.defaults[i]}")
700
+ if key == DEFAULTS_INDICATOR_KEY:
701
+ self.defaults = value
702
+ if self.verbose:
703
+ self.__teePrintOrNot(f"Defaults set to {value}")
704
+ if not self.memoryOnly:
705
+ self.appendQueue.append(self.delimiter.join(value))
706
+ self.lastUpdateTime = get_time_ns()
707
+ if self.verbose:
708
+ self.__teePrintOrNot(f"Appending Defaults {key} to the appendQueue")
709
+ return
529
710
  if self.verbose:
530
711
  self.__teePrintOrNot(f"Setting {key} to {value}")
531
-
532
712
  if key in self:
533
713
  if self[key] == value:
534
714
  if self.verbose:
@@ -537,13 +717,17 @@ class TSVZed(OrderedDict):
537
717
  self.dirty = True
538
718
  # update the dictionary,
539
719
  super().__setitem__(key,value)
540
- if self.verbose:
541
- self.__teePrintOrNot(f"Key {key} updated")
542
720
  if self.memoryOnly:
721
+ if self.verbose:
722
+ self.__teePrintOrNot(f"Key {key} updated in memory only")
723
+ return
724
+ elif key.startswith('#'):
725
+ if self.verbose:
726
+ self.__teePrintOrNot(f"Key {key} updated in memory only as it starts with #")
543
727
  return
544
728
  if self.verbose:
545
729
  self.__teePrintOrNot(f"Appending {key} to the appendQueue")
546
- self.appendQueue.append('\t'.join(value))
730
+ self.appendQueue.append(self.delimiter.join(value))
547
731
  self.lastUpdateTime = get_time_ns()
548
732
  # if not self.appendThread.is_alive():
549
733
  # self.commitAppendToFile()
@@ -552,25 +736,38 @@ class TSVZed(OrderedDict):
552
736
 
553
737
 
554
738
  def __delitem__(self,key):
555
- key = str(key).strip()
739
+ key = str(key).rstrip()
740
+ if key == DEFAULTS_INDICATOR_KEY:
741
+ self.defaults = []
742
+ if self.verbose:
743
+ self.__teePrintOrNot(f"Defaults cleared")
744
+ if not self.memoryOnly:
745
+ self.__appendEmptyLine(key)
746
+ if self.verbose:
747
+ self.__teePrintOrNot(f"Appending empty default line {key}")
748
+ return
556
749
  # delete the key from the dictionary and update the file
557
750
  if key not in self:
558
751
  if self.verbose:
559
752
  self.__teePrintOrNot(f"Key {key} not found")
560
753
  return
561
754
  super().__delitem__(key)
562
- if self.memoryOnly:
755
+ if self.memoryOnly or key.startswith('#'):
756
+ if self.verbose:
757
+ self.__teePrintOrNot(f"Key {key} deleted in memory")
563
758
  return
564
759
  self.__appendEmptyLine(key)
760
+ if self.verbose:
761
+ self.__teePrintOrNot(f"Appending empty line {key}")
565
762
  self.lastUpdateTime = get_time_ns()
566
763
 
567
764
  def __appendEmptyLine(self,key):
568
765
  self.dirty = True
569
766
  if self.correctColumnNum > 0:
570
- emptyLine = key+'\t'*(self.correctColumnNum-1)
767
+ emptyLine = key+self.delimiter*(self.correctColumnNum-1)
571
768
  elif len(self[key]) > 1:
572
769
  self.correctColumnNum = len(self[key])
573
- emptyLine = key+'\t'*(self.correctColumnNum-1)
770
+ emptyLine = key+self.delimiter*(self.correctColumnNum-1)
574
771
  else:
575
772
  emptyLine = key
576
773
  if self.verbose:
@@ -745,7 +942,7 @@ memoryOnly:{self.memoryOnly}
745
942
  if self.header:
746
943
  file.write(self.header+'\n')
747
944
  for key in self:
748
- file.write('\t'.join(self[key])+'\n')
945
+ file.write(self.delimiter.join(self[key])+'\n')
749
946
  self.release_file_obj(file)
750
947
  if self.verbose:
751
948
  self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
@@ -761,27 +958,32 @@ memoryOnly:{self.memoryOnly}
761
958
  return self
762
959
 
763
960
  def mapToFile(self):
961
+ mec = self.monitor_external_changes
962
+ self.monitor_external_changes = False
764
963
  try:
765
964
  if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
766
965
  self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
767
- file = self.get_file_obj('r+')
966
+ file = self.get_file_obj('r+b')
768
967
  overWrite = False
769
- line = file.readline()
770
- aftPos = file.tell()
771
- if self.header and not lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = False):
772
- file.seek(0)
773
- file.write(self.header+'\n')
774
- # if the header is not the same length as the line, we need to overwrite the file
775
- if aftPos != file.tell():
776
- overWrite = True
777
- if self.verbose:
778
- self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
968
+ if self.header:
969
+ line = file.readline().decode(self.encoding)
970
+ aftPos = file.tell()
971
+ if not _lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = self.strict):
972
+ file.seek(0)
973
+ file.write(f'{self.header}\n'.encode(encoding=self.encoding))
974
+ # if the header is not the same length as the line, we need to overwrite the file
975
+ if aftPos != file.tell():
976
+ overWrite = True
977
+ if self.verbose:
978
+ self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
779
979
  for value in self.values():
780
- strToWrite = '\t'.join(value)+'\n'
980
+ if value[0].startswith('#'):
981
+ continue
982
+ strToWrite = self.delimiter.join(value)
781
983
  if overWrite:
782
984
  if self.verbose:
783
985
  self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
784
- file.write(strToWrite)
986
+ file.write(strToWrite.encode(encoding=self.encoding)+b'\n')
785
987
  continue
786
988
  pos = file.tell()
787
989
  line = file.readline()
@@ -789,15 +991,17 @@ memoryOnly:{self.memoryOnly}
789
991
  if not line or pos == aftPos:
790
992
  if self.verbose:
791
993
  self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
792
- file.write(strToWrite)
994
+ file.write(strToWrite.encode(encoding=self.encoding))
793
995
  overWrite = True
794
996
  continue
997
+ strToWrite = strToWrite.encode(encoding=self.encoding).ljust(len(line)-1)+b'\n'
795
998
  if line != strToWrite:
796
999
  if self.verbose:
797
- self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
1000
+ self.__teePrintOrNot(f"Modifing {value} to {self._fileName}")
798
1001
  file.seek(pos)
799
1002
  # fill the string with space to write to the correct length
800
- file.write(strToWrite.rstrip('\n').ljust(len(line)-1)+'\n')
1003
+ #file.write(strToWrite.rstrip('\n').ljust(len(line)-1)+'\n')
1004
+ file.write(strToWrite)
801
1005
  if aftPos != file.tell():
802
1006
  overWrite = True
803
1007
  file.truncate()
@@ -813,6 +1017,8 @@ memoryOnly:{self.memoryOnly}
813
1017
  import traceback
814
1018
  self.__teePrintOrNot(traceback.format_exc(),'error')
815
1019
  self.deSynced = True
1020
+ self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1021
+ self.monitor_external_changes = mec
816
1022
  return self
817
1023
 
818
1024
  def checkExternalChanges(self):
@@ -831,9 +1037,10 @@ memoryOnly:{self.memoryOnly}
831
1037
 
832
1038
  def _appendWorker(self):
833
1039
  while not self.shutdownEvent.is_set():
834
- self.checkExternalChanges()
835
- self.rewrite()
836
- self.commitAppendToFile()
1040
+ if not self.memoryOnly:
1041
+ self.checkExternalChanges()
1042
+ self.rewrite()
1043
+ self.commitAppendToFile()
837
1044
  time.sleep(self.append_check_delay)
838
1045
  # self.appendEvent.wait()
839
1046
  # self.appendEvent.clear()
@@ -883,15 +1090,19 @@ memoryOnly:{self.memoryOnly}
883
1090
  def get_file_obj(self,modes = 'a'):
884
1091
  self.writeLock.acquire()
885
1092
  try:
886
- if not self.encoding:
887
- self.encoding = 'utf8'
888
- file = open(self._fileName, mode=modes, encoding=self.encoding)
1093
+ if 'b' not in modes:
1094
+ if not self.encoding:
1095
+ self.encoding = 'utf8'
1096
+ file = open(self._fileName, mode=modes, encoding=self.encoding)
1097
+ else:
1098
+ file = open(self._fileName, mode=modes)
889
1099
  # Lock the file after opening
890
1100
  if os.name == 'posix':
891
1101
  fcntl.lockf(file, fcntl.LOCK_EX)
892
1102
  elif os.name == 'nt':
893
1103
  # For Windows, locking the entire file, avoiding locking an empty file
894
- lock_length = max(1, os.path.getsize(self._fileName))
1104
+ #lock_length = max(1, os.path.getsize(self._fileName))
1105
+ lock_length = 2147483647
895
1106
  msvcrt.locking(file.fileno(), msvcrt.LK_LOCK, lock_length)
896
1107
  if self.verbose:
897
1108
  self.__teePrintOrNot(f"File {self._fileName} locked with mode {modes}")
@@ -910,13 +1121,18 @@ memoryOnly:{self.memoryOnly}
910
1121
  try:
911
1122
  file.flush() # Ensure the file is flushed before unlocking
912
1123
  os.fsync(file.fileno()) # Ensure the file is synced to disk before unlocking
913
- if os.name == 'posix':
914
- fcntl.lockf(file, fcntl.LOCK_UN)
915
- elif os.name == 'nt':
916
- # Unlocking the entire file; for Windows, ensure not unlocking an empty file
917
- unlock_length = max(1, os.path.getsize(file.name))
918
- msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
919
- file.close() # Ensure file is closed after unlocking
1124
+ if not file.closed:
1125
+ if os.name == 'posix':
1126
+ fcntl.lockf(file, fcntl.LOCK_UN)
1127
+ elif os.name == 'nt':
1128
+ # Unlocking the entire file; for Windows, ensure not unlocking an empty file
1129
+ #unlock_length = max(1, os.path.getsize(os.path.realpath(file.name)))
1130
+ unlock_length = 2147483647
1131
+ try:
1132
+ msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
1133
+ except:
1134
+ pass
1135
+ file.close() # Ensure file is closed after unlocking
920
1136
  if self.verbose:
921
1137
  self.__teePrintOrNot(f"File {file.name} unlocked / released")
922
1138
  except Exception as e:
@@ -925,26 +1141,47 @@ memoryOnly:{self.memoryOnly}
925
1141
  except Exception as e:
926
1142
  self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
927
1143
  self.__teePrintOrNot(f"Failed to release file {file.name}: {e}",'error')
928
- try:
929
- self.writeLock.release() # Ensure the thread lock is always released
930
- except Exception as e:
931
- self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
932
- self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1144
+ import traceback
1145
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1146
+ # release the write lock if not already released
1147
+ if self.writeLock.locked():
1148
+ try:
1149
+ self.writeLock.release() # Ensure the thread lock is always released
1150
+ except Exception as e:
1151
+ self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
1152
+ self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
933
1153
 
934
1154
 
935
1155
  def __main__():
936
1156
  import argparse
937
- parser = argparse.ArgumentParser(description='TSVZed: A TSV file manager')
938
- parser.add_argument('filename', type=str, help='The TSV file to read')
1157
+ parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
1158
+ parser.add_argument('filename', type=str, help='The file to read')
939
1159
  parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear'], help='The operation to perform. Default: read', default='read')
940
- parser.add_argument('line', type=str, nargs='*', help='The line to append to the TSV file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
941
- parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the TSV file. seperate using \\t')
942
- parser.add_argument('-f', '--force', action='store_true', help='Force the operation. Ignore checks for column numbers / headers')
1160
+ parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
1161
+ parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
1162
+ parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
1163
+ parser.add_argument('--defaults', type=str, help='Default values to fill in the missing columns. seperate using --delimiter. Ex. if -d = comma, --defaults="key,value1,value2..." Note: Please specify the key. But it will not be used as a key need to be unique in data.')
1164
+ strictMode = parser.add_mutually_exclusive_group()
1165
+ strictMode.add_argument('-s', '--strict', dest = 'strict',action='store_true', help='Strict mode. Do not parse values that seems malformed, check for column numbers / headers')
1166
+ strictMode.add_argument('-f', '--force', dest = 'strict',action='store_false', help='Force the operation. Ignore checks for column numbers / headers')
943
1167
  parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
944
1168
  parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} by {author}')
945
1169
  args = parser.parse_args()
946
-
947
- header = args.header.replace('\\t','\t') if args.header else ''
1170
+ args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
1171
+ if args.header and args.header.endswith('\\'):
1172
+ args.header += '\\'
1173
+ try:
1174
+ header = args.header.encode().decode('unicode_escape') if args.header else ''
1175
+ except Exception as e:
1176
+ print(f"Failed to decode header: {args.header}")
1177
+ header = ''
1178
+ defaults = []
1179
+ if args.defaults:
1180
+ try:
1181
+ defaults = args.defaults.encode().decode('unicode_escape').split(args.delimiter)
1182
+ except Exception as e:
1183
+ print(f"Failed to decode defaults: {args.defaults}")
1184
+ defaults = []
948
1185
 
949
1186
  if args.operation == 'read':
950
1187
  # check if the file exist
@@ -952,14 +1189,14 @@ def __main__():
952
1189
  print(f"File not found: {args.filename}")
953
1190
  return
954
1191
  # read the file
955
- data = readTSV(args.filename, verifyHeader = False, verbose=args.verbose,strict= not args.force)
956
- print(pretty_format_table(data.values()))
1192
+ data = readTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
1193
+ print(pretty_format_table(data.values(),delimiter=args.delimiter))
957
1194
  elif args.operation == 'append':
958
- appendTSV(args.filename, args.line,createIfNotExist = True, header=header, verbose=args.verbose, strict= not args.force)
1195
+ appendTabularFile(args.filename, args.line,createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
959
1196
  elif args.operation == 'delete':
960
- appendTSV(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= not args.force)
1197
+ appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
961
1198
  elif args.operation == 'clear':
962
- clearTSV(args.filename, header=header, verbose=args.verbose, verifyHeader=not args.force)
1199
+ clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=args.strict, delimiter=args.delimiter)
963
1200
  else:
964
1201
  print("Invalid operation")
965
1202
  return