TSVZ 2.70__py3-none-any.whl → 3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
TSVZ.py CHANGED
@@ -4,59 +4,90 @@ from collections import OrderedDict , deque
4
4
  import time
5
5
  import atexit
6
6
  import threading
7
+ import re
7
8
 
8
9
  if os.name == 'nt':
9
10
  import msvcrt
10
11
  elif os.name == 'posix':
11
12
  import fcntl
12
13
 
13
- version = '2.70'
14
+ version = '3.02'
14
15
  author = 'pan@zopyr.us'
15
16
 
17
+ DEFAULT_DELIMITER = '\t'
16
18
 
17
- def pretty_format_table(data):
18
- version = 1.0
19
- if not data:
20
- return ''
21
- if type(data) == str:
22
- data = data.strip('\n').split('\n')
23
- data = [line.split('\t') for line in data]
24
- elif isinstance(data, dict):
25
- # flatten the 2D dict to a list of lists
26
- if isinstance(next(iter(data.values())), dict):
27
- tempData = [['key'] + list(next(iter(data.values())).keys())]
28
- tempData.extend( [[key] + list(value.values()) for key, value in data.items()])
29
- data = tempData
30
- else:
31
- # it is a dict of lists
32
- data = [[key] + list(value) for key, value in data.items()]
33
- elif type(data) != list:
34
- data = list(data)
35
- # format the list into 2d list of list of strings
36
- if isinstance(data[0], dict):
37
- tempData = [data[0].keys()]
38
- tempData.extend([list(item.values()) for item in data])
39
- data = tempData
40
- data = [[str(item) for item in row] for row in data]
41
- num_cols = len(data[0])
42
- col_widths = [0] * num_cols
43
- # Calculate the maximum width of each column
44
- for c in range(num_cols):
45
- col_widths[c] = max(len(row[c]) for row in data)
46
- # Build the row format string
47
- row_format = ' | '.join('{{:<{}}}'.format(width) for width in col_widths)
48
- # Print the header
49
- header = data[0]
50
- outTable = []
51
- outTable.append(row_format.format(*header))
52
- outTable.append('-+-'.join('-' * width for width in col_widths))
53
- for row in data[1:]:
54
- # if the row is empty, print an divider
55
- if not any(row):
56
- outTable.append('-+-'.join('-' * width for width in col_widths))
57
- else:
58
- outTable.append(row_format.format(*row))
59
- return '\n'.join(outTable) + '\n'
19
def get_delimiter(delimiter,file_name = ''):
    """
    Resolve a delimiter specification into the actual separator string.

    Parameters:
    - delimiter: How the separator is chosen.
        * A falsy value (None or an empty string): return the current
          module-level DEFAULT_DELIMITER.
        * Ellipsis (...): infer the separator from the extension of file_name:
          a comma for .csv, a NUL character for .nsv, a pipe for .psv and a
          tab for anything else (including an empty / missing file_name).
        * One of the names 'comma', 'tab', 'pipe' or 'null'.
        * Any other string: used as the separator. Backslash escape sequences
          written out literally (a backslash followed by 't', 'x00', ...) are
          decoded first.
    - file_name (str, optional): File name used for extension based inference.
      Only consulted when delimiter is Ellipsis. Defaults to ''.

    Returns:
    - str: The resolved separator.

    Side effects:
    - Every resolution of a non-falsy delimiter is stored in the module-level
      DEFAULT_DELIMITER, so a later call with a falsy delimiter returns the
      most recently resolved separator.
    """
    # The assignment near the end of this function would otherwise make
    # DEFAULT_DELIMITER a local name, and the falsy branch below would raise
    # UnboundLocalError instead of returning the module default.
    global DEFAULT_DELIMITER
    if not delimiter:
        return DEFAULT_DELIMITER
    named_delimiters = {'comma': ',', 'tab': '\t', 'pipe': '|', 'null': '\0'}
    extension_delimiters = (('.csv', ','), ('.nsv', '\0'), ('.psv', '|'))
    if delimiter is Ellipsis:
        rtn = '\t'
        # Guard against an empty or None file_name before calling endswith.
        if file_name:
            for extension, separator in extension_delimiters:
                if file_name.endswith(extension):
                    rtn = separator
                    break
    elif isinstance(delimiter, str) and delimiter in named_delimiters:
        rtn = named_delimiters[delimiter]
    elif '\\' in delimiter:
        # Turn a literally typed escape sequence into the character it names.
        rtn = delimiter.encode().decode('unicode_escape')
    else:
        # Nothing to unescape. Skipping the utf-8 -> unicode_escape round trip
        # keeps non-ASCII separators intact (the round trip would mangle them).
        rtn = delimiter
    DEFAULT_DELIMITER = rtn
    return rtn
45
+
46
# Matches ANSI CSI escape sequences (colours, cursor movement, ...). Compiled
# once so the pattern is not looked up again for every cell.
_ANSI_ESCAPE_RE = re.compile(r'\x1b\[[0-?]*[ -/]*[@-~]')


def pretty_format_table(data, delimiter = DEFAULT_DELIMITER):
    """
    Render tabular data as an aligned text table.

    The first row is used as the header and is followed by a divider line.
    Columns are separated by ' | ' and every cell is left-aligned and padded
    to the width of the widest cell in its column. ANSI escape sequences are
    ignored when measuring and padding, so coloured cells stay aligned.

    Parameters:
    - data: The table to format. Accepted shapes:
        * str: lines separated by newlines, cells separated by delimiter.
        * dict of dicts: a 'key' column followed by the keys of the first
          inner dict as header, then one row per item.
        * dict of iterables: one row per item, the key being the first cell.
        * list / iterable of dicts: keys of the first dict as header, then
          the values of every dict as rows.
        * list / iterable of iterables: used as rows directly.
    - delimiter (str, optional): Cell separator used only when data is a str.

    Returns:
    - str: The formatted table terminated by a newline, or '' when there is
      nothing to format.

    Notes:
    - A data row whose cells are all empty is rendered as a divider line.
    - Rows shorter than the header are padded with empty cells; cells beyond
      the header width are not shown.
    """
    if not data:
        return ''
    if isinstance(data, str):
        rows = [line.split(delimiter) for line in data.strip('\n').split('\n')]
    elif isinstance(data, dict):
        first_value = next(iter(data.values()))
        if isinstance(first_value, dict):
            # flatten the 2D dict to a list of lists
            rows = [['key'] + list(first_value.keys())]
            rows.extend([key] + list(value.values()) for key, value in data.items())
        else:
            # it is a dict of lists
            rows = [[key] + list(value) for key, value in data.items()]
    else:
        rows = list(data)
        # An exhausted iterator is truthy but yields no rows.
        if not rows:
            return ''
    if isinstance(rows[0], dict):
        rows = [list(rows[0].keys())] + [list(item.values()) for item in rows]
    rows = [[str(item) for item in row] for row in rows]
    num_cols = len(rows[0])
    # Normalise every row to the header width so ragged input cannot raise.
    padded = [(row + [''] * num_cols)[:num_cols] for row in rows]
    # Visible length of each cell, i.e. with ANSI escape sequences removed.
    visible = [[len(_ANSI_ESCAPE_RE.sub('', cell)) for cell in row] for row in padded]
    col_widths = [max(lengths[c] for lengths in visible) for c in range(num_cols)]
    divider = '-+-'.join('-' * width for width in col_widths)

    def render(cells, lengths):
        # Pad by visible length: str.format would count the escape characters
        # and leave coloured cells too short.
        return ' | '.join(
            cell + ' ' * (width - length)
            for cell, length, width in zip(cells, lengths, col_widths)
        )

    outTable = [render(padded[0], visible[0]), divider]
    for row, cells, lengths in zip(rows[1:], padded[1:], visible[1:]):
        # if the row is empty, print a divider
        outTable.append(render(cells, lengths) if any(row) else divider)
    return '\n'.join(outTable) + '\n'
60
91
 
61
92
  def __teePrintOrNot(message,level = 'info',teeLogger = None):
62
93
  """
@@ -78,7 +109,7 @@ def __teePrintOrNot(message,level = 'info',teeLogger = None):
78
109
  except Exception as e:
79
110
  print(message,flush=True)
80
111
 
81
- def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True):
112
+ def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True,delimiter = DEFAULT_DELIMITER):
82
113
  """
83
114
  Process a line of text and update the task dictionary.
84
115
 
@@ -94,7 +125,7 @@ def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,s
94
125
  tuple: A tuple containing the updated correctColumnNum and the processed lineCache.
95
126
 
96
127
  """
97
- line = line.decode().strip(' ').strip('\x00')
128
+ line = line.strip(' ').strip('\x00').rstrip('\r\n')
98
129
  # we throw away the lines that start with '#'
99
130
  if not line :
100
131
  if verbose:
@@ -105,7 +136,7 @@ def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,s
105
136
  __teePrintOrNot(f"Ignoring comment line: {line}",teeLogger=teeLogger)
106
137
  return correctColumnNum , []
107
138
  # we only interested in the lines that have the correct number of columns
108
- lineCache = [segment.strip() for segment in line.split('\t')]
139
+ lineCache = [segment.strip() for segment in line.split(delimiter)]
109
140
  if not lineCache:
110
141
  return correctColumnNum , []
111
142
  if correctColumnNum == -1:
@@ -144,7 +175,7 @@ def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,s
144
175
  __teePrintOrNot(f"Key {lineCache[0]} added after correction",teeLogger=teeLogger)
145
176
  return correctColumnNum, lineCache
146
177
 
147
- def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False):
178
+ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,encoding = 'utf8',delimiter = ...):
148
179
  """
149
180
  Reads the last valid line from a file.
150
181
 
@@ -154,6 +185,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
154
185
  correctColumnNum (int): A column number to pass to processLine function.
155
186
  verbose (bool, optional): Whether to print verbose output. Defaults to False.
156
187
  teeLogger (optional): Logger to use for tee print. Defaults to None.
188
+ encoding (str, optional): The encoding used to decode the lines read from the file. Defaults to 'utf8'.
157
189
  strict (bool, optional): Whether to enforce strict processing. Defaults to False.
158
190
 
159
191
  Returns:
@@ -161,6 +193,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
161
193
  """
162
194
  chunk_size = 1024 # Read in chunks of 1024 bytes
163
195
  last_valid_line = []
196
+ delimiter = get_delimiter(delimiter,file_name=fileName)
164
197
  if verbose:
165
198
  __teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
166
199
  with open(fileName, 'rb') as file:
@@ -186,13 +219,14 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
186
219
  for i in range(len(lines) - 1, -1, -1):
187
220
  if lines[i].strip(): # Skip empty lines
188
221
  # Process the line
189
- correctColumnNum, lineCache = processLine(
190
- lines[i],
222
+ correctColumnNum, lineCache = _processLine(
223
+ lines[i].decode(encoding=encoding),
191
224
  taskDic,
192
225
  correctColumnNum,
193
226
  verbose=verbose,
194
227
  teeLogger=teeLogger,
195
- strict=strict
228
+ strict=strict,
229
+ delimiter=delimiter
196
230
  )
197
231
  # If the line is valid, return it
198
232
  if lineCache and any(lineCache):
@@ -204,7 +238,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
204
238
  # Return empty list if no valid line found
205
239
  return last_valid_line
206
240
 
207
- def formatHeader(header,verbose = False,teeLogger = None):
241
+ def _formatHeader(header,verbose = False,teeLogger = None,delimiter = DEFAULT_DELIMITER):
208
242
  """
209
243
  Format the header string.
210
244
 
@@ -218,12 +252,12 @@ def formatHeader(header,verbose = False,teeLogger = None):
218
252
  """
219
253
  if type(header) != str:
220
254
  try:
221
- header = '\t'.join(header)
255
+ header = delimiter.join(header)
222
256
  except:
223
257
  if verbose:
224
258
  __teePrintOrNot('Invalid header, setting header to empty.','error',teeLogger=teeLogger)
225
259
  header = ''
226
- header = header.strip()
260
+ header = delimiter.join([segment.rstrip() for segment in header.split(delimiter)])
227
261
  # if header:
228
262
  # if not header.endswith('\n'):
229
263
  # header += '\n'
@@ -231,7 +265,7 @@ def formatHeader(header,verbose = False,teeLogger = None):
231
265
  # header = ''
232
266
  return header
233
267
 
234
- def lineContainHeader(header,line,verbose = False,teeLogger = None,strict = False):
268
+ def _lineContainHeader(header,line,verbose = False,teeLogger = None,strict = False,delimiter = DEFAULT_DELIMITER):
235
269
  """
236
270
  Verify if a line contains the header.
237
271
 
@@ -245,26 +279,24 @@ def lineContainHeader(header,line,verbose = False,teeLogger = None,strict = Fals
245
279
  Returns:
246
280
  bool: True if the header matches the line, False otherwise.
247
281
  """
248
- escapedHeader = repr(header.strip())
249
- escapedLine = repr(line.strip())
282
+ header = [segment.rstrip() for segment in header.split(delimiter)]
283
+ line = [segment.rstrip() for segment in line.split(delimiter)]
250
284
  if verbose:
251
- __teePrintOrNot(f"Header: \n{escapedHeader}",teeLogger=teeLogger)
252
- __teePrintOrNot(f"First line: \n{escapedLine}",teeLogger=teeLogger)
253
- headerList = header.strip().lower().split('\t')
254
- lineList = line.strip().lower().split('\t')
255
- if len(headerList) != len(lineList) or any([headerList[i] not in lineList[i] for i in range(len(headerList))]):
256
- __teePrintOrNot(f"Header mismatch: \n{escapedLine} \n!= \n{escapedHeader}",teeLogger=teeLogger)
285
+ __teePrintOrNot(f"Header: \n{header}",teeLogger=teeLogger)
286
+ __teePrintOrNot(f"First line: \n{line}",teeLogger=teeLogger)
287
+ if len(header) != len(line) or any([header[i] not in line[i] for i in range(len(header))]):
288
+ __teePrintOrNot(f"Header mismatch: \n{line} \n!= \n{header}",teeLogger=teeLogger)
257
289
  if strict:
258
290
  raise Exception("Data format error! Header mismatch")
259
291
  return False
260
292
  return True
261
293
 
262
- def verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = None,header = '',encoding = 'utf8',strict = True):
294
+ def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,header = '',encoding = 'utf8',strict = True,delimiter = DEFAULT_DELIMITER):
263
295
  """
264
- Verify the existence of a TSV file.
296
+ Verify the existence of the tabular file.
265
297
 
266
298
  Parameters:
267
- - fileName (str): The path of the TSV file.
299
+ - fileName (str): The path of the tabular file.
268
300
  - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to True.
269
301
  - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
270
302
  - header (str, optional): The header line to verify against. Defaults to ''.
@@ -274,8 +306,14 @@ def verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = None,header
274
306
  Returns:
275
307
  bool: True if the file exists, False otherwise.
276
308
  """
277
- if not fileName.endswith('.tsv'):
309
+ if delimiter and delimiter == '\t' and not fileName.endswith('.tsv'):
278
310
  __teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
311
+ elif delimiter and delimiter == ',' and not fileName.endswith('.csv'):
312
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
313
+ elif delimiter and delimiter == '\0' and not fileName.endswith('.nsv'):
314
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
315
+ elif delimiter and delimiter == '|' and not fileName.endswith('.psv'):
316
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
279
317
  if not os.path.isfile(fileName):
280
318
  if createIfNotExist:
281
319
  with open(fileName, mode ='w',encoding=encoding)as file:
@@ -289,14 +327,41 @@ def verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = None,header
289
327
  return False
290
328
  return True
291
329
 
292
- def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True):
330
def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t'):
    """
    Compatibility entry point kept for existing callers; delegates to readTabularFile.

    Every argument is forwarded unchanged. The only difference from calling
    readTabularFile directly is that delimiter defaults to a tab here.

    Parameters:
    - fileName (str): The path to the Tabular file.
    - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
    - header (str or list, optional): The expected header, either as a single string of column names joined by the delimiter or as a list of column names. Defaults to ''.
    - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
    - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
    - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
    - verbose (bool, optional): Whether to print verbose output. Defaults to False.
    - taskDic (dict, optional): The mapping that receives the parsed rows. Defaults to None, in which case readTabularFile creates a new dict.
    - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
    - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
    - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to a tab.

    Returns:
    - Whatever readTabularFile returns: the populated mapping, or the last valid line as a list when lastLineOnly is set.

    Raises:
    - Exception: If the file is not found or there is a data format error.
    """
    forwarded = {
        'teeLogger': teeLogger,
        'header': header,
        'createIfNotExist': createIfNotExist,
        'lastLineOnly': lastLineOnly,
        'verifyHeader': verifyHeader,
        'verbose': verbose,
        'taskDic': taskDic,
        'encoding': encoding,
        'strict': strict,
        'delimiter': delimiter,
    }
    return readTabularFile(fileName, **forwarded)
356
+
357
+ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...):
358
+ """
359
+ Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
295
360
 
296
361
  Parameters:
297
- - fileName (str): The path to the TSV file.
362
+ - fileName (str): The path to the Tabular file.
298
363
  - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
299
- - header (str or list, optional): The header of the TSV file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
364
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a list of column names separated by the file's delimiter. If a list, it should contain the column names. Defaults to ''.
300
365
  - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
301
366
  - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
302
367
  - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
@@ -304,9 +369,10 @@ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, last
304
369
  - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
305
370
  - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
306
371
  - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
372
+ - delimiter (str, optional): The delimiter used in the Tabular file. When omitted, it is inferred from the file extension: ',' for .csv, '\0' for .nsv, '|' for .psv, and '\t' otherwise.
307
373
 
308
374
  Returns:
309
- - OrderedDict: The dictionary containing the data from the TSV file.
375
+ - OrderedDict: The dictionary containing the data from the Tabular file.
310
376
 
311
377
  Raises:
312
378
  - Exception: If the file is not found or there is a data format error.
@@ -314,33 +380,35 @@ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, last
314
380
  """
315
381
  if taskDic is None:
316
382
  taskDic = {}
317
- header = formatHeader(header,verbose = verbose,teeLogger = teeLogger)
318
- if not verifyTSVExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict):
383
+ delimiter = get_delimiter(delimiter,file_name=fileName)
384
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
385
+ if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
319
386
  return taskDic
320
387
  with open(fileName, mode ='rb')as file:
321
388
  correctColumnNum = -1
322
- if header.strip():
389
+ if header.rstrip():
323
390
  if verifyHeader:
324
- line = file.readline().decode().strip()
325
- if lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
326
- correctColumnNum = len(header.strip().split('\t'))
391
+ line = file.readline().decode(encoding=encoding)
392
+ if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
393
+ correctColumnNum = len(header.split(delimiter))
327
394
  if verbose:
328
395
  __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
329
396
  if lastLineOnly:
330
- lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict)
397
+ lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict, delimiter=delimiter)
331
398
  if lineCache:
332
399
  taskDic[lineCache[0]] = lineCache
333
400
  return lineCache
334
401
  for line in file:
335
- correctColumnNum, lineCache = processLine(line,taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict)
402
+ correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter)
336
403
  return taskDic
337
404
 
338
- def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True):
405
+ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = '\t'):
339
406
  """
340
- Append a line of data to a TSV file.
407
+ Compatibility method, calls appendTabularFile.
408
+ Append a line of data to a Tabular file.
341
409
  Parameters:
342
- - fileName (str): The path of the TSV file.
343
- - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by tabs ('\t') to form a list.
410
+ - fileName (str): The path of the Tabular file.
411
+ - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
344
412
  - teeLogger (optional): A logger object for logging messages.
345
413
  - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
346
414
  - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
@@ -348,15 +416,37 @@ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExis
348
416
  - verbose (bool, optional): If True, additional information will be printed during the execution.
349
417
  - encoding (str, optional): The encoding of the file.
350
418
  - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
419
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t'.
351
420
  Raises:
352
421
  - Exception: If the file does not exist and createIfNotExist is False.
353
422
  - Exception: If the existing header does not match the provided header.
354
423
  """
355
- header = formatHeader(header,verbose = verbose,teeLogger = teeLogger)
356
- if not verifyTSVExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict):
424
+ return appendTabularFile(fileName,lineToAppend,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding, strict = strict, delimiter = delimiter)
425
+
426
+ def appendTabularFile(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = ...):
427
+ """
428
+ Append a line of data to a Tabular file.
429
+ Parameters:
430
+ - fileName (str): The path of the Tabular file.
431
+ - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
432
+ - teeLogger (optional): A logger object for logging messages.
433
+ - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
434
+ - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
435
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
436
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
437
+ - encoding (str, optional): The encoding of the file.
438
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
439
+ - delimiter (str, optional): The delimiter used in the Tabular file. When omitted, it is inferred from the file extension: ',' for .csv, '\0' for .nsv, '|' for .psv, and '\t' otherwise.
440
+ Raises:
441
+ - Exception: If the file does not exist and createIfNotExist is False.
442
+ - Exception: If the existing header does not match the provided header.
443
+ """
444
+ delimiter = get_delimiter(delimiter,file_name=fileName)
445
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
446
+ if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
357
447
  return
358
448
  if type(lineToAppend) == str:
359
- lineToAppend = lineToAppend.strip().split('\t')
449
+ lineToAppend = lineToAppend.strip().split(delimiter)
360
450
  else:
361
451
  for i in range(len(lineToAppend)):
362
452
  if type(lineToAppend[i]) != str:
@@ -367,11 +457,11 @@ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExis
367
457
 
368
458
  with open(fileName, mode ='r+b')as file:
369
459
  correctColumnNum = len(lineToAppend)
370
- if header.strip():
460
+ if header.rstrip():
371
461
  if verifyHeader:
372
- line = file.readline().decode().strip()
373
- if lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
374
- correctColumnNum = len(header.strip().split('\t'))
462
+ line = file.readline().decode(encoding=encoding)
463
+ if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
464
+ correctColumnNum = len(header.split(delimiter))
375
465
  if verbose:
376
466
  __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
377
467
  # truncate / fill the lineToAppend to the correct number of columns
@@ -383,15 +473,16 @@ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExis
383
473
  file.seek(-1, os.SEEK_END)
384
474
  if file.read(1) != b'\n':
385
475
  file.write(b'\n')
386
- file.write('\t'.join(lineToAppend).encode() + b'\n')
476
+ file.write(get_delimiter(delimiter).join(lineToAppend).encode(encoding=encoding) + b'\n')
387
477
  if verbose:
388
478
  __teePrintOrNot(f"Appended {lineToAppend} to {fileName}",teeLogger=teeLogger)
389
479
 
390
- def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False):
480
+ def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = '\t'):
391
481
  """
392
- Clear the contents of a TSV file. Will create if not exist.
482
+ Compatibility method, calls clearTabularFile.
483
+ Clear the contents of a Tabular file. Will create if not exist.
393
484
  Parameters:
394
- - fileName (str): The path of the TSV file.
485
+ - fileName (str): The path of the Tabular file.
395
486
  - teeLogger (optional): A logger object for logging messages.
396
487
  - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
397
488
  - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
@@ -399,14 +490,29 @@ def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose
399
490
  - encoding (str, optional): The encoding of the file.
400
491
  - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
401
492
  """
402
- header = formatHeader(header,verbose = verbose,teeLogger = teeLogger)
403
- if not verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False):
493
+ return clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
494
+
495
+ def clearTabularFile(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = ...):
496
+ """
497
+ Clear the contents of a Tabular file. Will create if not exist.
498
+ Parameters:
499
+ - fileName (str): The path of the Tabular file.
500
+ - teeLogger (optional): A logger object for logging messages.
501
+ - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
502
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
503
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
504
+ - encoding (str, optional): The encoding of the file.
505
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
506
+ """
507
+ delimiter = get_delimiter(delimiter,file_name=fileName)
508
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
509
+ if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
404
510
  raise Exception("Something catastrophic happened! File still not found after creation")
405
511
  else:
406
512
  with open(fileName, mode ='r+',encoding=encoding)as file:
407
- if header.strip() and verifyHeader:
408
- line = file.readline().strip()
409
- if not lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
513
+ if header.rstrip() and verifyHeader:
514
+ line = file.readline()
515
+ if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
410
516
  __teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
411
517
  file.truncate()
412
518
  else:
@@ -442,14 +548,15 @@ class TSVZed(OrderedDict):
442
548
  except Exception as e:
443
549
  print(message,flush=True)
444
550
 
445
- def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,verbose = False,encoding = None):
551
+ def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,verbose = False,encoding = 'utf8',delimiter = ...):
446
552
  super().__init__()
447
553
  self.version = version
448
554
  self.externalFileUpdateTime = getFileUpdateTimeNs(fileName)
449
555
  self.lastUpdateTime = self.externalFileUpdateTime
450
556
  self._fileName = fileName
451
557
  self.teeLogger = teeLogger
452
- self.header = formatHeader(header,verbose = verbose,teeLogger = self.teeLogger)
558
+ self.delimiter = get_delimiter(delimiter,file_name=fileName)
559
+ self.header = _formatHeader(header,verbose = verbose,teeLogger = self.teeLogger,delimiter=self.delimiter)
453
560
  self.correctColumnNum = -1
454
561
  self.createIfNotExist = createIfNotExist
455
562
  self.verifyHeader = verifyHeader
@@ -490,10 +597,10 @@ class TSVZed(OrderedDict):
490
597
  if self.verbose:
491
598
  self.__teePrintOrNot(f"Loading {self._fileName}")
492
599
  super().clear()
493
- readTSV(self._fileName, teeLogger = self.teeLogger, header = self.header, createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader, verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None)
600
+ readTabularFile(self._fileName, teeLogger = self.teeLogger, header = self.header, createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader, verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None, strict = False, delimiter = self.delimiter)
494
601
  if self.verbose:
495
602
  self.__teePrintOrNot(f"Loaded {len(self)} records from {self._fileName}")
496
- self.correctColumnNum = len(self.header.split('\t')) if (self.header and self.verifyHeader) else (len(self[next(iter(self))]) if self else -1)
603
+ self.correctColumnNum = len(self.header.split(self.delimiter)) if (self.header and self.verifyHeader) else (len(self[next(iter(self))]) if self else -1)
497
604
  if self.verbose:
498
605
  self.__teePrintOrNot(f"correctColumnNum: {self.correctColumnNum}")
499
606
  #super().update(loadedData)
@@ -510,7 +617,7 @@ class TSVZed(OrderedDict):
510
617
  self.__teePrintOrNot('Key cannot be empty','error')
511
618
  return
512
619
  if type(value) == str:
513
- value = value.strip().split('\t')
620
+ value = value.strip().split(self.delimiter)
514
621
  # sanitize the value
515
622
  value = [(str(segment).strip() if type(segment) != str else segment.strip()) if segment else '' for segment in value]
516
623
  #value = list(map(lambda segment: str(segment).strip(), value))
@@ -543,7 +650,7 @@ class TSVZed(OrderedDict):
543
650
  return
544
651
  if self.verbose:
545
652
  self.__teePrintOrNot(f"Appending {key} to the appendQueue")
546
- self.appendQueue.append('\t'.join(value))
653
+ self.appendQueue.append(self.delimiter.join(value))
547
654
  self.lastUpdateTime = get_time_ns()
548
655
  # if not self.appendThread.is_alive():
549
656
  # self.commitAppendToFile()
@@ -567,10 +674,10 @@ class TSVZed(OrderedDict):
567
674
  def __appendEmptyLine(self,key):
568
675
  self.dirty = True
569
676
  if self.correctColumnNum > 0:
570
- emptyLine = key+'\t'*(self.correctColumnNum-1)
677
+ emptyLine = key+self.delimiter*(self.correctColumnNum-1)
571
678
  elif len(self[key]) > 1:
572
679
  self.correctColumnNum = len(self[key])
573
- emptyLine = key+'\t'*(self.correctColumnNum-1)
680
+ emptyLine = key+self.delimiter*(self.correctColumnNum-1)
574
681
  else:
575
682
  emptyLine = key
576
683
  if self.verbose:
@@ -745,7 +852,7 @@ memoryOnly:{self.memoryOnly}
745
852
  if self.header:
746
853
  file.write(self.header+'\n')
747
854
  for key in self:
748
- file.write('\t'.join(self[key])+'\n')
855
+ file.write(self.delimiter.join(self[key])+'\n')
749
856
  self.release_file_obj(file)
750
857
  if self.verbose:
751
858
  self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
@@ -764,32 +871,32 @@ memoryOnly:{self.memoryOnly}
764
871
  try:
765
872
  if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
766
873
  self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
767
- file = self.get_file_obj('r+')
874
+ file = self.get_file_obj('r+b')
768
875
  overWrite = False
769
- line = file.readline()
876
+ line = file.readline().decode(self.encoding)
770
877
  aftPos = file.tell()
771
- if self.header and not lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = False):
878
+ if self.header and not _lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = False):
772
879
  file.seek(0)
773
- file.write(self.header+'\n')
880
+ file.write(f'{self.header}\n'.encode(encoding=self.encoding))
774
881
  # if the header is not the same length as the line, we need to overwrite the file
775
882
  if aftPos != file.tell():
776
883
  overWrite = True
777
884
  if self.verbose:
778
885
  self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
779
886
  for value in self.values():
780
- strToWrite = '\t'.join(value)+'\n'
887
+ strToWrite = self.delimiter.join(value)+'\n'
781
888
  if overWrite:
782
889
  if self.verbose:
783
890
  self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
784
- file.write(strToWrite)
891
+ file.write(strToWrite.encode(encoding=self.encoding))
785
892
  continue
786
893
  pos = file.tell()
787
- line = file.readline()
894
+ line = file.readline().decode(encoding=self.encoding)
788
895
  aftPos = file.tell()
789
896
  if not line or pos == aftPos:
790
897
  if self.verbose:
791
898
  self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
792
- file.write(strToWrite)
899
+ file.write(strToWrite.encode(encoding=self.encoding))
793
900
  overWrite = True
794
901
  continue
795
902
  if line != strToWrite:
@@ -797,7 +904,8 @@ memoryOnly:{self.memoryOnly}
797
904
  self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
798
905
  file.seek(pos)
799
906
  # fill the string with space to write to the correct length
800
- file.write(strToWrite.rstrip('\n').ljust(len(line)-1)+'\n')
907
+ #file.write(strToWrite.rstrip('\n').ljust(len(line)-1)+'\n')
908
+ file.write(strToWrite.encode(encoding=self.encoding).rstrip(b'\n').ljust(len(line)-1)+b'\n')
801
909
  if aftPos != file.tell():
802
910
  overWrite = True
803
911
  file.truncate()
@@ -831,9 +939,10 @@ memoryOnly:{self.memoryOnly}
831
939
 
832
940
  def _appendWorker(self):
833
941
  while not self.shutdownEvent.is_set():
834
- self.checkExternalChanges()
835
- self.rewrite()
836
- self.commitAppendToFile()
942
+ if not self.memoryOnly:
943
+ self.checkExternalChanges()
944
+ self.rewrite()
945
+ self.commitAppendToFile()
837
946
  time.sleep(self.append_check_delay)
838
947
  # self.appendEvent.wait()
839
948
  # self.appendEvent.clear()
@@ -883,15 +992,19 @@ memoryOnly:{self.memoryOnly}
883
992
  def get_file_obj(self,modes = 'a'):
884
993
  self.writeLock.acquire()
885
994
  try:
886
- if not self.encoding:
887
- self.encoding = 'utf8'
888
- file = open(self._fileName, mode=modes, encoding=self.encoding)
995
+ if 'b' not in modes:
996
+ if not self.encoding:
997
+ self.encoding = 'utf8'
998
+ file = open(self._fileName, mode=modes, encoding=self.encoding)
999
+ else:
1000
+ file = open(self._fileName, mode=modes)
889
1001
  # Lock the file after opening
890
1002
  if os.name == 'posix':
891
1003
  fcntl.lockf(file, fcntl.LOCK_EX)
892
1004
  elif os.name == 'nt':
893
1005
  # For Windows, locking the entire file, avoiding locking an empty file
894
- lock_length = max(1, os.path.getsize(self._fileName))
1006
+ #lock_length = max(1, os.path.getsize(self._fileName))
1007
+ lock_length = 2147483647
895
1008
  msvcrt.locking(file.fileno(), msvcrt.LK_LOCK, lock_length)
896
1009
  if self.verbose:
897
1010
  self.__teePrintOrNot(f"File {self._fileName} locked with mode {modes}")
@@ -910,13 +1023,18 @@ memoryOnly:{self.memoryOnly}
910
1023
  try:
911
1024
  file.flush() # Ensure the file is flushed before unlocking
912
1025
  os.fsync(file.fileno()) # Ensure the file is synced to disk before unlocking
913
- if os.name == 'posix':
914
- fcntl.lockf(file, fcntl.LOCK_UN)
915
- elif os.name == 'nt':
916
- # Unlocking the entire file; for Windows, ensure not unlocking an empty file
917
- unlock_length = max(1, os.path.getsize(file.name))
918
- msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
919
- file.close() # Ensure file is closed after unlocking
1026
+ if not file.closed:
1027
+ if os.name == 'posix':
1028
+ fcntl.lockf(file, fcntl.LOCK_UN)
1029
+ elif os.name == 'nt':
1030
+ # Unlocking the entire file; for Windows, ensure not unlocking an empty file
1031
+ #unlock_length = max(1, os.path.getsize(os.path.realpath(file.name)))
1032
+ unlock_length = 2147483647
1033
+ try:
1034
+ msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
1035
+ except:
1036
+ pass
1037
+ file.close() # Ensure file is closed after unlocking
920
1038
  if self.verbose:
921
1039
  self.__teePrintOrNot(f"File {file.name} unlocked / released")
922
1040
  except Exception as e:
@@ -925,26 +1043,37 @@ memoryOnly:{self.memoryOnly}
925
1043
  except Exception as e:
926
1044
  self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
927
1045
  self.__teePrintOrNot(f"Failed to release file {file.name}: {e}",'error')
928
- try:
929
- self.writeLock.release() # Ensure the thread lock is always released
930
- except Exception as e:
931
- self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
932
- self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1046
+ import traceback
1047
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1048
+ # release the write lock if not already released
1049
+ if self.writeLock.locked():
1050
+ try:
1051
+ self.writeLock.release() # Ensure the thread lock is always released
1052
+ except Exception as e:
1053
+ self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
1054
+ self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
933
1055
 
934
1056
 
935
1057
  def __main__():
936
1058
  import argparse
937
- parser = argparse.ArgumentParser(description='TSVZed: A TSV file manager')
938
- parser.add_argument('filename', type=str, help='The TSV file to read')
1059
+ parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
1060
+ parser.add_argument('filename', type=str, help='The file to read')
939
1061
  parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear'], help='The operation to perform. Default: read', default='read')
940
- parser.add_argument('line', type=str, nargs='*', help='The line to append to the TSV file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
941
- parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the TSV file. seperate using \\t')
1062
+ parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
1063
+ parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
1064
+ parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
942
1065
  parser.add_argument('-f', '--force', action='store_true', help='Force the operation. Ignore checks for column numbers / headers')
943
1066
  parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
944
1067
  parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} by {author}')
945
1068
  args = parser.parse_args()
946
-
947
- header = args.header.replace('\\t','\t') if args.header else ''
1069
+ args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
1070
+ if args.header and args.header.endswith('\\'):
1071
+ args.header += '\\'
1072
+ try:
1073
+ header = args.header.encode().decode('unicode_escape') if args.header else ''
1074
+ except Exception as e:
1075
+ print(f"Failed to decode header: {args.header}")
1076
+ header = ''
948
1077
 
949
1078
  if args.operation == 'read':
950
1079
  # check if the file exist
@@ -952,14 +1081,14 @@ def __main__():
952
1081
  print(f"File not found: {args.filename}")
953
1082
  return
954
1083
  # read the file
955
- data = readTSV(args.filename, verifyHeader = False, verbose=args.verbose,strict= not args.force)
956
- print(pretty_format_table(data.values()))
1084
+ data = readTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= not args.force, delimiter=args.delimiter)
1085
+ print(pretty_format_table(data.values(),delimiter=args.delimiter))
957
1086
  elif args.operation == 'append':
958
- appendTSV(args.filename, args.line,createIfNotExist = True, header=header, verbose=args.verbose, strict= not args.force)
1087
+ appendTabularFile(args.filename, args.line,createIfNotExist = True, header=header, verbose=args.verbose, strict= not args.force, delimiter=args.delimiter)
959
1088
  elif args.operation == 'delete':
960
- appendTSV(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= not args.force)
1089
+ appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= not args.force, delimiter=args.delimiter)
961
1090
  elif args.operation == 'clear':
962
- clearTSV(args.filename, header=header, verbose=args.verbose, verifyHeader=not args.force)
1091
+ clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=not args.force, delimiter=args.delimiter)
963
1092
  else:
964
1093
  print("Invalid operation")
965
1094
  return