TSVZ 2.67__py3-none-any.whl → 3.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
TSVZ.py CHANGED
@@ -4,39 +4,90 @@ from collections import OrderedDict , deque
4
4
  import time
5
5
  import atexit
6
6
  import threading
7
+ import re
7
8
 
8
9
  if os.name == 'nt':
9
10
  import msvcrt
10
11
  elif os.name == 'posix':
11
12
  import fcntl
12
13
 
13
- version = '2.67'
14
+ version = '3.02'
14
15
  author = 'pan@zopyr.us'
15
16
 
17
+ DEFAULT_DELIMITER = '\t'
16
18
 
17
- def pretty_format_table(data):
18
- if not data:
19
- return
20
- if type(data) == str:
21
- data = data.strip('\n').split('\n')
22
- elif type(data) != list:
23
- data = list(data)
24
- num_cols = len(data[0])
25
- col_widths = [0] * num_cols
26
- # Calculate the maximum width of each column
27
- for c in range(num_cols):
28
- col_items = [str(row[c]) for row in data]
29
- col_widths[c] = max(len(item) for item in col_items)
30
- # Build the row format string
31
- row_format = ' | '.join('{{:<{}}}'.format(width) for width in col_widths)
32
- # Print the header
33
- header = data[0]
34
- outTable = []
35
- outTable.append(row_format.format(*header))
36
- outTable.append('-+-'.join('-' * width for width in col_widths))
37
- for row in data[1:]:
38
- outTable.append(row_format.format(*row))
39
- return '\n'.join(outTable) + '\n'
19
def get_delimiter(delimiter,file_name = ''):
    """
    Resolve a delimiter specification into the actual delimiter string.

    Parameters:
    - delimiter: The delimiter specification.
        * Falsy (None, ''): the module level DEFAULT_DELIMITER is returned.
        * Ellipsis (...): the delimiter is inferred from the extension of file_name
          ('.csv' -> comma, '.nsv' -> NUL, '.psv' -> pipe, anything else -> tab).
        * 'comma' / 'tab' / 'pipe' / 'null': the corresponding single character.
        * Any other string: returned with backslash escape sequences interpreted,
          so a two character backslash + 't' becomes a real tab.
    - file_name (str, optional): The file name used for extension based inference. Defaults to ''.

    Returns:
    - str: The resolved delimiter.
    """
    if not delimiter:
        # NOTE: the previous implementation assigned DEFAULT_DELIMITER at the end of this
        # function without a 'global' statement. That made the name local to the function,
        # so this branch raised UnboundLocalError. The assignment never reached the module
        # constant, so it is dropped instead of introducing hidden global state.
        return DEFAULT_DELIMITER
    if delimiter is ...:
        # Tolerate a missing / None file name: fall back to tab.
        name = file_name or ''
        for suffix, inferred in (('.csv', ','), ('.nsv', '\0'), ('.psv', '|')):
            if name.endswith(suffix):
                return inferred
        return '\t'
    named_delimiters = {'comma': ',', 'tab': '\t', 'pipe': '|', 'null': '\0'}
    if isinstance(delimiter, str) and delimiter in named_delimiters:
        return named_delimiters[delimiter]
    # Interpret backslash escapes. Encoding through latin-1 with 'backslashreplace'
    # (instead of the default utf-8) keeps non-ASCII delimiters intact: characters below
    # U+0100 round-trip byte for byte and everything else becomes a \uXXXX escape that
    # 'unicode_escape' decodes back to the original character.
    return delimiter.encode('latin-1', 'backslashreplace').decode('unicode_escape')
45
+
46
def pretty_format_table(data, delimiter = DEFAULT_DELIMITER):
    """
    Render tabular data as an aligned text table.

    The first row is treated as the header and is followed by a divider line.
    Any later row whose cells are all empty is rendered as a divider line too.

    Parameters:
    - data: The table to render. Accepted shapes:
        * str: lines separated by newlines, cells separated by delimiter.
        * dict of dicts: rendered with a leading 'key' column; the header is taken from the keys of the first value.
        * dict of iterables: each row is the key followed by the items of its value.
        * list (or any other iterable) of rows; if the first row is a dict, its keys become the header.
    - delimiter (str, optional): The cell separator used when data is a string. Defaults to DEFAULT_DELIMITER.

    Returns:
    - str: The formatted table terminated by a newline, or '' if there is nothing to format.
    """
    if not data:
        return ''
    if isinstance(data, str):
        data = [line.split(delimiter) for line in data.strip('\n').split('\n')]
    elif isinstance(data, dict):
        first_value = next(iter(data.values()))
        if isinstance(first_value, dict):
            # flatten the 2D dict to a list of lists
            rows = [['key'] + list(first_value.keys())]
            rows.extend([key] + list(value.values()) for key, value in data.items())
            data = rows
        else:
            # it is a dict of lists
            data = [[key] + list(value) for key, value in data.items()]
    elif not isinstance(data, list):
        data = list(data)
        # generators / iterators are always truthy, so emptiness can only be checked here
        if not data:
            return ''
    # a list of dicts: the keys of the first item become the header row
    if isinstance(data[0], dict):
        rows = [list(data[0].keys())]
        rows.extend(list(item.values()) for item in data)
        data = rows
    table = [[str(item) for item in row] for row in data]
    num_cols = len(table[0])
    # The header decides the column count: pad short rows with empty cells (they used to
    # raise IndexError) and ignore cells beyond the header, as before.
    table = [(row + [''] * num_cols)[:num_cols] for row in table]
    # ANSI escape sequences take no space on screen, so they must be excluded both when
    # measuring the column widths and when padding the cells.
    ansi_escape = re.compile(r'\x1b\[[0-?]*[ -/]*[@-~]')

    def visible_len(cell):
        return len(ansi_escape.sub('', cell))

    col_widths = [max(visible_len(row[c]) for row in table) for c in range(num_cols)]
    divider = '-+-'.join('-' * width for width in col_widths)

    def render(row):
        return ' | '.join(cell + ' ' * (width - visible_len(cell)) for cell, width in zip(row, col_widths))

    outTable = [render(table[0]), divider]
    for row in table[1:]:
        # if the row is empty, print a divider
        outTable.append(render(row) if any(row) else divider)
    return '\n'.join(outTable) + '\n'
40
91
 
41
92
  def __teePrintOrNot(message,level = 'info',teeLogger = None):
42
93
  """
@@ -58,7 +109,7 @@ def __teePrintOrNot(message,level = 'info',teeLogger = None):
58
109
  except Exception as e:
59
110
  print(message,flush=True)
60
111
 
61
- def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True):
112
+ def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True,delimiter = DEFAULT_DELIMITER):
62
113
  """
63
114
  Process a line of text and update the task dictionary.
64
115
 
@@ -74,7 +125,7 @@ def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,s
74
125
  tuple: A tuple containing the updated correctColumnNum and the processed lineCache.
75
126
 
76
127
  """
77
- line = line.decode().strip(' ').strip('\x00')
128
+ line = line.strip(' ').strip('\x00').rstrip('\r\n')
78
129
  # we throw away the lines that start with '#'
79
130
  if not line :
80
131
  if verbose:
@@ -85,7 +136,7 @@ def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,s
85
136
  __teePrintOrNot(f"Ignoring comment line: {line}",teeLogger=teeLogger)
86
137
  return correctColumnNum , []
87
138
  # we only interested in the lines that have the correct number of columns
88
- lineCache = [segment.strip() for segment in line.split('\t')]
139
+ lineCache = [segment.strip() for segment in line.split(delimiter)]
89
140
  if not lineCache:
90
141
  return correctColumnNum , []
91
142
  if correctColumnNum == -1:
@@ -124,7 +175,7 @@ def processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,s
124
175
  __teePrintOrNot(f"Key {lineCache[0]} added after correction",teeLogger=teeLogger)
125
176
  return correctColumnNum, lineCache
126
177
 
127
- def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False):
178
+ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,encoding = 'utf8',delimiter = ...):
128
179
  """
129
180
  Reads the last valid line from a file.
130
181
 
@@ -134,6 +185,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
134
185
  correctColumnNum (int): A column number to pass to processLine function.
135
186
  verbose (bool, optional): Whether to print verbose output. Defaults to False.
136
187
  teeLogger (optional): Logger to use for tee print. Defaults to None.
188
+ encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
137
189
  strict (bool, optional): Whether to enforce strict processing. Defaults to False.
138
190
 
139
191
  Returns:
@@ -141,6 +193,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
141
193
  """
142
194
  chunk_size = 1024 # Read in chunks of 1024 bytes
143
195
  last_valid_line = []
196
+ delimiter = get_delimiter(delimiter,file_name=fileName)
144
197
  if verbose:
145
198
  __teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
146
199
  with open(fileName, 'rb') as file:
@@ -166,13 +219,14 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
166
219
  for i in range(len(lines) - 1, -1, -1):
167
220
  if lines[i].strip(): # Skip empty lines
168
221
  # Process the line
169
- correctColumnNum, lineCache = processLine(
170
- lines[i],
222
+ correctColumnNum, lineCache = _processLine(
223
+ lines[i].decode(encoding=encoding),
171
224
  taskDic,
172
225
  correctColumnNum,
173
226
  verbose=verbose,
174
227
  teeLogger=teeLogger,
175
- strict=strict
228
+ strict=strict,
229
+ delimiter=delimiter
176
230
  )
177
231
  # If the line is valid, return it
178
232
  if lineCache and any(lineCache):
@@ -184,7 +238,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
184
238
  # Return empty list if no valid line found
185
239
  return last_valid_line
186
240
 
187
- def formatHeader(header,verbose = False,teeLogger = None):
241
+ def _formatHeader(header,verbose = False,teeLogger = None,delimiter = DEFAULT_DELIMITER):
188
242
  """
189
243
  Format the header string.
190
244
 
@@ -198,12 +252,12 @@ def formatHeader(header,verbose = False,teeLogger = None):
198
252
  """
199
253
  if type(header) != str:
200
254
  try:
201
- header = '\t'.join(header)
255
+ header = delimiter.join(header)
202
256
  except:
203
257
  if verbose:
204
258
  __teePrintOrNot('Invalid header, setting header to empty.','error',teeLogger=teeLogger)
205
259
  header = ''
206
- header = header.strip()
260
+ header = delimiter.join([segment.rstrip() for segment in header.split(delimiter)])
207
261
  # if header:
208
262
  # if not header.endswith('\n'):
209
263
  # header += '\n'
@@ -211,7 +265,7 @@ def formatHeader(header,verbose = False,teeLogger = None):
211
265
  # header = ''
212
266
  return header
213
267
 
214
- def lineContainHeader(header,line,verbose = False,teeLogger = None,strict = False):
268
+ def _lineContainHeader(header,line,verbose = False,teeLogger = None,strict = False,delimiter = DEFAULT_DELIMITER):
215
269
  """
216
270
  Verify if a line contains the header.
217
271
 
@@ -225,22 +279,24 @@ def lineContainHeader(header,line,verbose = False,teeLogger = None,strict = Fals
225
279
  Returns:
226
280
  bool: True if the header matches the line, False otherwise.
227
281
  """
282
+ header = [segment.rstrip() for segment in header.split(delimiter)]
283
+ line = [segment.rstrip() for segment in line.split(delimiter)]
228
284
  if verbose:
229
- __teePrintOrNot(f"Header: {header.strip()}",teeLogger=teeLogger)
230
- __teePrintOrNot(f"First line: {line}",teeLogger=teeLogger)
231
- if not line.lower().replace(' ','').startswith(header.strip().lower().replace(' ','')):
232
- __teePrintOrNot(f"Header mismatch: \n{line} \n!= \n{header.strip()}",teeLogger=teeLogger)
285
+ __teePrintOrNot(f"Header: \n{header}",teeLogger=teeLogger)
286
+ __teePrintOrNot(f"First line: \n{line}",teeLogger=teeLogger)
287
+ if len(header) != len(line) or any([header[i] not in line[i] for i in range(len(header))]):
288
+ __teePrintOrNot(f"Header mismatch: \n{line} \n!= \n{header}",teeLogger=teeLogger)
233
289
  if strict:
234
290
  raise Exception("Data format error! Header mismatch")
235
291
  return False
236
292
  return True
237
293
 
238
- def verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = None,header = '',encoding = 'utf8',strict = True):
294
+ def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,header = '',encoding = 'utf8',strict = True,delimiter = DEFAULT_DELIMITER):
239
295
  """
240
- Verify the existence of a TSV file.
296
+ Verify the existence of the tabular file.
241
297
 
242
298
  Parameters:
243
- - fileName (str): The path of the TSV file.
299
+ - fileName (str): The path of the tabular file.
244
300
  - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to True.
245
301
  - teeLogger (object, optional): The tee logger object for printing output. Defaults to None.
246
302
  - header (str, optional): The header line to verify against. Defaults to ''.
@@ -250,8 +306,14 @@ def verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = None,header
250
306
  Returns:
251
307
  bool: True if the file exists, False otherwise.
252
308
  """
253
- if not fileName.endswith('.tsv'):
309
+ if delimiter and delimiter == '\t' and not fileName.endswith('.tsv'):
254
310
  __teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
311
+ elif delimiter and delimiter == ',' and not fileName.endswith('.csv'):
312
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
313
+ elif delimiter and delimiter == '\0' and not fileName.endswith('.nsv'):
314
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
315
+ elif delimiter and delimiter == '|' and not fileName.endswith('.psv'):
316
+ __teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
255
317
  if not os.path.isfile(fileName):
256
318
  if createIfNotExist:
257
319
  with open(fileName, mode ='w',encoding=encoding)as file:
@@ -265,14 +327,41 @@ def verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = None,header
265
327
  return False
266
328
  return True
267
329
 
268
- def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True):
330
def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t'):
    """
    Compatibility wrapper kept for the pre-3.x API; forwards every argument unchanged to readTabularFile.
    The only difference from readTabularFile is that the delimiter defaults to a tab
    instead of being inferred from the file name.

    Parameters:
    - fileName (str): The path to the tabular file.
    - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
    - header (str or list, optional): The expected header, either a delimiter-separated string or a list of column names. Defaults to ''.
    - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
    - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
    - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
    - verbose (bool, optional): Whether to print verbose output. Defaults to False.
    - taskDic (dict, optional): The dictionary to store the data in. Defaults to None (readTabularFile then creates a new one).
    - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
    - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
    - delimiter (str, optional): The delimiter used in the file. Defaults to a tab.

    Returns:
    - Whatever readTabularFile returns: the populated dictionary, or the last valid line when lastLineOnly is set.

    Raises:
    - Exception: Propagated from readTabularFile.
    """
    forwarded = {
        'teeLogger': teeLogger,
        'header': header,
        'createIfNotExist': createIfNotExist,
        'lastLineOnly': lastLineOnly,
        'verifyHeader': verifyHeader,
        'verbose': verbose,
        'taskDic': taskDic,
        'encoding': encoding,
        'strict': strict,
        'delimiter': delimiter,
    }
    return readTabularFile(fileName, **forwarded)
356
+
357
+ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...):
358
+ """
359
+ Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
271
360
 
272
361
  Parameters:
273
- - fileName (str): The path to the TSV file.
362
+ - fileName (str): The path to the Tabular file.
274
363
  - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
275
- - header (str or list, optional): The header of the TSV file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
364
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a delimiter-separated list of column names. If a list, it should contain the column names. Defaults to ''.
276
365
  - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
277
366
  - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
278
367
  - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
@@ -280,9 +369,10 @@ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, last
280
369
  - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
281
370
  - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
282
371
  - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to True.
372
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
283
373
 
284
374
  Returns:
285
- - OrderedDict: The dictionary containing the data from the TSV file.
375
+ - OrderedDict: The dictionary containing the data from the Tabular file.
286
376
 
287
377
  Raises:
288
378
  - Exception: If the file is not found or there is a data format error.
@@ -290,33 +380,55 @@ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, last
290
380
  """
291
381
  if taskDic is None:
292
382
  taskDic = {}
293
- header = formatHeader(header,verbose = verbose,teeLogger = teeLogger)
294
- if not verifyTSVExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict):
383
+ delimiter = get_delimiter(delimiter,file_name=fileName)
384
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
385
+ if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
295
386
  return taskDic
296
387
  with open(fileName, mode ='rb')as file:
297
388
  correctColumnNum = -1
298
- if header.strip():
389
+ if header.rstrip():
299
390
  if verifyHeader:
300
- line = file.readline().decode().strip()
301
- if lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
302
- correctColumnNum = len(header.strip().split('\t'))
391
+ line = file.readline().decode(encoding=encoding)
392
+ if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
393
+ correctColumnNum = len(header.split(delimiter))
303
394
  if verbose:
304
395
  __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
305
396
  if lastLineOnly:
306
- lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict)
397
+ lineCache = read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=verbose, teeLogger=teeLogger, strict=strict, delimiter=delimiter)
307
398
  if lineCache:
308
399
  taskDic[lineCache[0]] = lineCache
309
400
  return lineCache
310
401
  for line in file:
311
- correctColumnNum, lineCache = processLine(line,taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict)
402
+ correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter)
312
403
  return taskDic
313
404
 
314
- def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True):
405
def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = '\t'):
    """
    Compatibility wrapper kept for the pre-3.x API; forwards every argument unchanged to appendTabularFile.
    The only difference from appendTabularFile is that the delimiter defaults to a tab
    instead of being inferred from the file name.

    Parameters:
    - fileName (str): The path of the tabular file.
    - lineToAppend (str or list): The line of data to append, either a delimiter-separated string or a list of cells.
    - teeLogger (optional): A logger object for logging messages.
    - header (str, optional): The header line to verify the existing file against.
    - createIfNotExist (bool, optional): Whether the file should be created if it does not exist. Defaults to False.
    - verifyHeader (bool, optional): Whether the existing header is compared with the provided header. Defaults to True.
    - verbose (bool, optional): Whether additional information is printed during the execution. Defaults to False.
    - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
    - strict (bool, optional): Whether a data format error raises an exception. Defaults to True.
    - delimiter (str, optional): The delimiter used in the file. Defaults to a tab.

    Returns:
    - Whatever appendTabularFile returns.

    Raises:
    - Exception: Propagated from appendTabularFile.
    """
    forwarded = {
        'teeLogger': teeLogger,
        'header': header,
        'createIfNotExist': createIfNotExist,
        'verifyHeader': verifyHeader,
        'verbose': verbose,
        'encoding': encoding,
        'strict': strict,
        'delimiter': delimiter,
    }
    return appendTabularFile(fileName, lineToAppend, **forwarded)
425
+
426
+ def appendTabularFile(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = ...):
315
427
  """
316
- Append a line of data to a TSV file.
428
+ Append a line of data to a Tabular file.
317
429
  Parameters:
318
- - fileName (str): The path of the TSV file.
319
- - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by tabs ('\t') to form a list.
430
+ - fileName (str): The path of the Tabular file.
431
+ - lineToAppend (str or list): The line of data to append. If it is a string, it will be split by delimiter to form a list.
320
432
  - teeLogger (optional): A logger object for logging messages.
321
433
  - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
322
434
  - createIfNotExist (bool, optional): If True, the file will be created if it does not exist. If False and the file does not exist, an exception will be raised.
@@ -324,23 +436,32 @@ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExis
324
436
  - verbose (bool, optional): If True, additional information will be printed during the execution.
325
437
  - encoding (str, optional): The encoding of the file.
326
438
  - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
439
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
327
440
  Raises:
328
441
  - Exception: If the file does not exist and createIfNotExist is False.
329
442
  - Exception: If the existing header does not match the provided header.
330
443
  """
331
- header = formatHeader(header,verbose = verbose,teeLogger = teeLogger)
332
- if not verifyTSVExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict):
444
+ delimiter = get_delimiter(delimiter,file_name=fileName)
445
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
446
+ if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
333
447
  return
334
448
  if type(lineToAppend) == str:
335
- lineToAppend = lineToAppend.strip().split('\t')
449
+ lineToAppend = lineToAppend.strip().split(delimiter)
450
+ else:
451
+ for i in range(len(lineToAppend)):
452
+ if type(lineToAppend[i]) != str:
453
+ try:
454
+ lineToAppend[i] = str(lineToAppend[i])
455
+ except Exception as e:
456
+ lineToAppend[i] = str(e)
336
457
 
337
458
  with open(fileName, mode ='r+b')as file:
338
459
  correctColumnNum = len(lineToAppend)
339
- if header.strip():
460
+ if header.rstrip():
340
461
  if verifyHeader:
341
- line = file.readline().decode().strip()
342
- if lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
343
- correctColumnNum = len(header.strip().split('\t'))
462
+ line = file.readline().decode(encoding=encoding)
463
+ if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
464
+ correctColumnNum = len(header.split(delimiter))
344
465
  if verbose:
345
466
  __teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
346
467
  # truncate / fill the lineToAppend to the correct number of columns
@@ -352,15 +473,16 @@ def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExis
352
473
  file.seek(-1, os.SEEK_END)
353
474
  if file.read(1) != b'\n':
354
475
  file.write(b'\n')
355
- file.write('\t'.join(lineToAppend).encode() + b'\n')
476
+ file.write(get_delimiter(delimiter).join(lineToAppend).encode(encoding=encoding) + b'\n')
356
477
  if verbose:
357
478
  __teePrintOrNot(f"Appended {lineToAppend} to {fileName}",teeLogger=teeLogger)
358
479
 
359
- def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False):
480
+ def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = '\t'):
360
481
  """
361
- Clear the contents of a TSV file. Will create if not exist.
482
+ Compatibility method, calls clearTabularFile.
483
+ Clear the contents of a Tabular file. Will create if not exist.
362
484
  Parameters:
363
- - fileName (str): The path of the TSV file.
485
+ - fileName (str): The path of the Tabular file.
364
486
  - teeLogger (optional): A logger object for logging messages.
365
487
  - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
366
488
  - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
@@ -368,14 +490,29 @@ def clearTSV(fileName,teeLogger = None,header = '',verifyHeader = False,verbose
368
490
  - encoding (str, optional): The encoding of the file.
369
491
  - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
370
492
  """
371
- header = formatHeader(header,verbose = verbose,teeLogger = teeLogger)
372
- if not verifyTSVExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False):
493
+ return clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
494
+
495
+ def clearTabularFile(fileName,teeLogger = None,header = '',verifyHeader = False,verbose = False,encoding = 'utf8',strict = False,delimiter = ...):
496
+ """
497
+ Clear the contents of a Tabular file. Will create if not exist.
498
+ Parameters:
499
+ - fileName (str): The path of the Tabular file.
500
+ - teeLogger (optional): A logger object for logging messages.
501
+ - header (str, optional): The header line to verify against. If provided, the function will check if the existing header matches the provided header.
502
+ - verifyHeader (bool, optional): If True, the function will verify if the existing header matches the provided header. If False, the header will not be verified.
503
+ - verbose (bool, optional): If True, additional information will be printed during the execution.
504
+ - encoding (str, optional): The encoding of the file.
505
+ - strict (bool, optional): If True, the function will raise an exception if there is a data format error. If False, the function will ignore the error and continue.
506
+ """
507
+ delimiter = get_delimiter(delimiter,file_name=fileName)
508
+ header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger,delimiter=delimiter)
509
+ if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
373
510
  raise Exception("Something catastrophic happened! File still not found after creation")
374
511
  else:
375
512
  with open(fileName, mode ='r+',encoding=encoding)as file:
376
- if header.strip() and verifyHeader:
377
- line = file.readline().strip()
378
- if not lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
513
+ if header.rstrip() and verifyHeader:
514
+ line = file.readline()
515
+ if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
379
516
  __teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
380
517
  file.truncate()
381
518
  else:
@@ -411,14 +548,15 @@ class TSVZed(OrderedDict):
411
548
  except Exception as e:
412
549
  print(message,flush=True)
413
550
 
414
- def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,verbose = False,encoding = None):
551
+ def __init__ (self,fileName,teeLogger = None,header = '',createIfNotExist = True,verifyHeader = True,rewrite_on_load = True,rewrite_on_exit = False,rewrite_interval = 0, append_check_delay = 0.01,monitor_external_changes = True,verbose = False,encoding = 'utf8',delimiter = ...):
415
552
  super().__init__()
416
553
  self.version = version
417
554
  self.externalFileUpdateTime = getFileUpdateTimeNs(fileName)
418
555
  self.lastUpdateTime = self.externalFileUpdateTime
419
556
  self._fileName = fileName
420
557
  self.teeLogger = teeLogger
421
- self.header = formatHeader(header,verbose = verbose,teeLogger = self.teeLogger)
558
+ self.delimiter = get_delimiter(delimiter,file_name=fileName)
559
+ self.header = _formatHeader(header,verbose = verbose,teeLogger = self.teeLogger,delimiter=self.delimiter)
422
560
  self.correctColumnNum = -1
423
561
  self.createIfNotExist = createIfNotExist
424
562
  self.verifyHeader = verifyHeader
@@ -459,10 +597,10 @@ class TSVZed(OrderedDict):
459
597
  if self.verbose:
460
598
  self.__teePrintOrNot(f"Loading {self._fileName}")
461
599
  super().clear()
462
- readTSV(self._fileName, teeLogger = self.teeLogger, header = self.header, createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader, verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None)
600
+ readTabularFile(self._fileName, teeLogger = self.teeLogger, header = self.header, createIfNotExist = self.createIfNotExist, verifyHeader = self.verifyHeader, verbose = self.verbose, taskDic = self,encoding = self.encoding if self.encoding else None, strict = False, delimiter = self.delimiter)
463
601
  if self.verbose:
464
602
  self.__teePrintOrNot(f"Loaded {len(self)} records from {self._fileName}")
465
- self.correctColumnNum = len(self.header.split('\t')) if (self.header and self.verifyHeader) else (len(self[next(iter(self))]) if self else -1)
603
+ self.correctColumnNum = len(self.header.split(self.delimiter)) if (self.header and self.verifyHeader) else (len(self[next(iter(self))]) if self else -1)
466
604
  if self.verbose:
467
605
  self.__teePrintOrNot(f"correctColumnNum: {self.correctColumnNum}")
468
606
  #super().update(loadedData)
@@ -479,7 +617,7 @@ class TSVZed(OrderedDict):
479
617
  self.__teePrintOrNot('Key cannot be empty','error')
480
618
  return
481
619
  if type(value) == str:
482
- value = value.strip().split('\t')
620
+ value = value.strip().split(self.delimiter)
483
621
  # sanitize the value
484
622
  value = [(str(segment).strip() if type(segment) != str else segment.strip()) if segment else '' for segment in value]
485
623
  #value = list(map(lambda segment: str(segment).strip(), value))
@@ -512,7 +650,7 @@ class TSVZed(OrderedDict):
512
650
  return
513
651
  if self.verbose:
514
652
  self.__teePrintOrNot(f"Appending {key} to the appendQueue")
515
- self.appendQueue.append('\t'.join(value))
653
+ self.appendQueue.append(self.delimiter.join(value))
516
654
  self.lastUpdateTime = get_time_ns()
517
655
  # if not self.appendThread.is_alive():
518
656
  # self.commitAppendToFile()
@@ -536,10 +674,10 @@ class TSVZed(OrderedDict):
536
674
  def __appendEmptyLine(self,key):
537
675
  self.dirty = True
538
676
  if self.correctColumnNum > 0:
539
- emptyLine = key+'\t'*(self.correctColumnNum-1)
677
+ emptyLine = key+self.delimiter*(self.correctColumnNum-1)
540
678
  elif len(self[key]) > 1:
541
679
  self.correctColumnNum = len(self[key])
542
- emptyLine = key+'\t'*(self.correctColumnNum-1)
680
+ emptyLine = key+self.delimiter*(self.correctColumnNum-1)
543
681
  else:
544
682
  emptyLine = key
545
683
  if self.verbose:
@@ -714,7 +852,7 @@ memoryOnly:{self.memoryOnly}
714
852
  if self.header:
715
853
  file.write(self.header+'\n')
716
854
  for key in self:
717
- file.write('\t'.join(self[key])+'\n')
855
+ file.write(self.delimiter.join(self[key])+'\n')
718
856
  self.release_file_obj(file)
719
857
  if self.verbose:
720
858
  self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
@@ -733,32 +871,32 @@ memoryOnly:{self.memoryOnly}
733
871
  try:
734
872
  if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
735
873
  self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
736
- file = self.get_file_obj('r+')
874
+ file = self.get_file_obj('r+b')
737
875
  overWrite = False
738
- line = file.readline()
876
+ line = file.readline().decode(self.encoding)
739
877
  aftPos = file.tell()
740
- if self.header and not lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = False):
878
+ if self.header and not _lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = False):
741
879
  file.seek(0)
742
- file.write(self.header+'\n')
880
+ file.write(f'{self.header}\n'.encode(encoding=self.encoding))
743
881
  # if the header is not the same length as the line, we need to overwrite the file
744
882
  if aftPos != file.tell():
745
883
  overWrite = True
746
884
  if self.verbose:
747
885
  self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
748
886
  for value in self.values():
749
- strToWrite = '\t'.join(value)+'\n'
887
+ strToWrite = self.delimiter.join(value)+'\n'
750
888
  if overWrite:
751
889
  if self.verbose:
752
890
  self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
753
- file.write(strToWrite)
891
+ file.write(strToWrite.encode(encoding=self.encoding))
754
892
  continue
755
893
  pos = file.tell()
756
- line = file.readline()
894
+ line = file.readline().decode(encoding=self.encoding)
757
895
  aftPos = file.tell()
758
896
  if not line or pos == aftPos:
759
897
  if self.verbose:
760
898
  self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
761
- file.write(strToWrite)
899
+ file.write(strToWrite.encode(encoding=self.encoding))
762
900
  overWrite = True
763
901
  continue
764
902
  if line != strToWrite:
@@ -766,7 +904,8 @@ memoryOnly:{self.memoryOnly}
766
904
  self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
767
905
  file.seek(pos)
768
906
  # fill the string with space to write to the correct length
769
- file.write(strToWrite.rstrip('\n').ljust(len(line)-1)+'\n')
907
+ #file.write(strToWrite.rstrip('\n').ljust(len(line)-1)+'\n')
908
+ file.write(strToWrite.encode(encoding=self.encoding).rstrip(b'\n').ljust(len(line)-1)+b'\n')
770
909
  if aftPos != file.tell():
771
910
  overWrite = True
772
911
  file.truncate()
@@ -800,9 +939,10 @@ memoryOnly:{self.memoryOnly}
800
939
 
801
940
  def _appendWorker(self):
802
941
  while not self.shutdownEvent.is_set():
803
- self.checkExternalChanges()
804
- self.rewrite()
805
- self.commitAppendToFile()
942
+ if not self.memoryOnly:
943
+ self.checkExternalChanges()
944
+ self.rewrite()
945
+ self.commitAppendToFile()
806
946
  time.sleep(self.append_check_delay)
807
947
  # self.appendEvent.wait()
808
948
  # self.appendEvent.clear()
@@ -852,15 +992,19 @@ memoryOnly:{self.memoryOnly}
852
992
  def get_file_obj(self,modes = 'a'):
853
993
  self.writeLock.acquire()
854
994
  try:
855
- if not self.encoding:
856
- self.encoding = 'utf8'
857
- file = open(self._fileName, mode=modes, encoding=self.encoding)
995
+ if 'b' not in modes:
996
+ if not self.encoding:
997
+ self.encoding = 'utf8'
998
+ file = open(self._fileName, mode=modes, encoding=self.encoding)
999
+ else:
1000
+ file = open(self._fileName, mode=modes)
858
1001
  # Lock the file after opening
859
1002
  if os.name == 'posix':
860
1003
  fcntl.lockf(file, fcntl.LOCK_EX)
861
1004
  elif os.name == 'nt':
862
1005
  # For Windows, locking the entire file, avoiding locking an empty file
863
- lock_length = max(1, os.path.getsize(self._fileName))
1006
+ #lock_length = max(1, os.path.getsize(self._fileName))
1007
+ lock_length = 2147483647
864
1008
  msvcrt.locking(file.fileno(), msvcrt.LK_LOCK, lock_length)
865
1009
  if self.verbose:
866
1010
  self.__teePrintOrNot(f"File {self._fileName} locked with mode {modes}")
@@ -879,13 +1023,18 @@ memoryOnly:{self.memoryOnly}
879
1023
  try:
880
1024
  file.flush() # Ensure the file is flushed before unlocking
881
1025
  os.fsync(file.fileno()) # Ensure the file is synced to disk before unlocking
882
- if os.name == 'posix':
883
- fcntl.lockf(file, fcntl.LOCK_UN)
884
- elif os.name == 'nt':
885
- # Unlocking the entire file; for Windows, ensure not unlocking an empty file
886
- unlock_length = max(1, os.path.getsize(file.name))
887
- msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
888
- file.close() # Ensure file is closed after unlocking
1026
+ if not file.closed:
1027
+ if os.name == 'posix':
1028
+ fcntl.lockf(file, fcntl.LOCK_UN)
1029
+ elif os.name == 'nt':
1030
+ # Unlocking the entire file; for Windows, ensure not unlocking an empty file
1031
+ #unlock_length = max(1, os.path.getsize(os.path.realpath(file.name)))
1032
+ unlock_length = 2147483647
1033
+ try:
1034
+ msvcrt.locking(file.fileno(), msvcrt.LK_UNLCK, unlock_length)
1035
+ except:
1036
+ pass
1037
+ file.close() # Ensure file is closed after unlocking
889
1038
  if self.verbose:
890
1039
  self.__teePrintOrNot(f"File {file.name} unlocked / released")
891
1040
  except Exception as e:
@@ -894,26 +1043,37 @@ memoryOnly:{self.memoryOnly}
894
1043
  except Exception as e:
895
1044
  self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
896
1045
  self.__teePrintOrNot(f"Failed to release file {file.name}: {e}",'error')
897
- try:
898
- self.writeLock.release() # Ensure the thread lock is always released
899
- except Exception as e:
900
- self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
901
- self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1046
+ import traceback
1047
+ self.__teePrintOrNot(traceback.format_exc(),'error')
1048
+ # release the write lock if not already released
1049
+ if self.writeLock.locked():
1050
+ try:
1051
+ self.writeLock.release() # Ensure the thread lock is always released
1052
+ except Exception as e:
1053
+ self.__teePrintOrNot(f"Failed to release writeLock for {file.name}: {e}",'error')
1054
+ self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
902
1055
 
903
1056
 
904
1057
  def __main__():
905
1058
  import argparse
906
- parser = argparse.ArgumentParser(description='TSVZed: A TSV file manager')
907
- parser.add_argument('filename', type=str, help='The TSV file to read')
1059
+ parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
1060
+ parser.add_argument('filename', type=str, help='The file to read')
908
1061
  parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear'], help='The operation to perform. Default: read', default='read')
909
- parser.add_argument('line', type=str, nargs='*', help='The line to append to the TSV file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
910
- parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the TSV file. seperate using \\t')
1062
+ parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
1063
+ parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
1064
+ parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
911
1065
  parser.add_argument('-f', '--force', action='store_true', help='Force the operation. Ignore checks for column numbers / headers')
912
1066
  parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
913
1067
  parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} by {author}')
914
1068
  args = parser.parse_args()
915
-
916
- header = args.header.replace('\\t','\t') if args.header else ''
1069
+ args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
1070
+ if args.header and args.header.endswith('\\'):
1071
+ args.header += '\\'
1072
+ try:
1073
+ header = args.header.encode().decode('unicode_escape') if args.header else ''
1074
+ except Exception as e:
1075
+ print(f"Failed to decode header: {args.header}")
1076
+ header = ''
917
1077
 
918
1078
  if args.operation == 'read':
919
1079
  # check if the file exist
@@ -921,14 +1081,14 @@ def __main__():
921
1081
  print(f"File not found: {args.filename}")
922
1082
  return
923
1083
  # read the file
924
- data = readTSV(args.filename, verifyHeader = False, verbose=args.verbose,strict= not args.force)
925
- print(pretty_format_table(data.values()))
1084
+ data = readTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= not args.force, delimiter=args.delimiter)
1085
+ print(pretty_format_table(data.values(),delimiter=args.delimiter))
926
1086
  elif args.operation == 'append':
927
- appendTSV(args.filename, args.line,createIfNotExist = True, header=header, verbose=args.verbose, strict= not args.force)
1087
+ appendTabularFile(args.filename, args.line,createIfNotExist = True, header=header, verbose=args.verbose, strict= not args.force, delimiter=args.delimiter)
928
1088
  elif args.operation == 'delete':
929
- appendTSV(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= not args.force)
1089
+ appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= not args.force, delimiter=args.delimiter)
930
1090
  elif args.operation == 'clear':
931
- clearTSV(args.filename, header=header, verbose=args.verbose, verifyHeader=not args.force)
1091
+ clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=not args.force, delimiter=args.delimiter)
932
1092
  else:
933
1093
  print("Invalid operation")
934
1094
  return