TSVZ 3.24__tar.gz → 3.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: TSVZ
3
- Version: 3.24
3
+ Version: 3.26
4
4
  Summary: An simple in memory wrapper around a TSV file to function as a database
5
5
  Home-page: https://github.com/yufei-pan/TSVZ
6
6
  Author: Yufei Pan
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: TSVZ
3
- Version: 3.24
3
+ Version: 3.26
4
4
  Summary: An simple in memory wrapper around a TSV file to function as a database
5
5
  Home-page: https://github.com/yufei-pan/TSVZ
6
6
  Author: Yufei Pan
@@ -22,13 +22,16 @@ if os.name == 'nt':
22
22
  elif os.name == 'posix':
23
23
  import fcntl
24
24
 
25
- version = '3.24'
25
+ version = '3.26'
26
26
  __version__ = version
27
27
  author = 'pan@zopyr.us'
28
+ COMMIT_DATE = '2025-05-19'
28
29
 
29
30
  DEFAULT_DELIMITER = '\t'
30
31
  DEFAULTS_INDICATOR_KEY = '#_defaults_#'
31
32
 
33
+ COMPRESSED_FILE_EXTENSIONS = ['gz','gzip','bz2','bzip2','xz','lzma']
34
+
32
35
  def get_delimiter(delimiter,file_name = ''):
33
36
  global DEFAULT_DELIMITER
34
37
  if not delimiter:
@@ -57,6 +60,43 @@ def get_delimiter(delimiter,file_name = ''):
57
60
  DEFAULT_DELIMITER = rtn
58
61
  return rtn
59
62
 
63
+ def openFileAsCompressed(fileName,mode = 'rb',encoding = 'utf8',teeLogger = None,compressLevel = 1):
64
+ if 'b' not in mode:
65
+ mode += 't'
66
+ kwargs = {}
67
+ if 'r' not in mode:
68
+ if fileName.endswith('.xz'):
69
+ kwargs['preset'] = compressLevel
70
+ else:
71
+ kwargs['compresslevel'] = compressLevel
72
+ if 'b' not in mode:
73
+ kwargs['encoding'] = encoding
74
+ if fileName.endswith('.xz') or fileName.endswith('.lzma'):
75
+ try:
76
+ import lzma
77
+ return lzma.open(fileName, mode, **kwargs)
78
+ except:
79
+ __teePrintOrNot(f"Failed to open {fileName} with lzma, trying bin",teeLogger=teeLogger)
80
+ elif fileName.endswith('.gz') or fileName.endswith('.gzip'):
81
+ try:
82
+ import gzip
83
+ return gzip.open(fileName, mode, **kwargs)
84
+ except:
85
+ __teePrintOrNot(f"Failed to open {fileName} with gzip, trying bin",teeLogger=teeLogger)
86
+ elif fileName.endswith('.bz2') or fileName.endswith('.bzip2'):
87
+ try:
88
+ import bz2
89
+ return bz2.open(fileName, mode, **kwargs)
90
+ except:
91
+ __teePrintOrNot(f"Failed to open {fileName} with bz2, trying bin",teeLogger=teeLogger)
92
+ if 't' in mode:
93
+ mode = mode.replace('t','')
94
+ return open(fileName, mode, encoding=encoding)
95
+ if 'b' not in mode:
96
+ mode += 'b'
97
+ return open(fileName, mode)
98
+
99
+
60
100
  def pretty_format_table(data, delimiter = DEFAULT_DELIMITER,header = None):
61
101
  version = 1.11
62
102
  _ = version
@@ -280,7 +320,7 @@ def __teePrintOrNot(message,level = 'info',teeLogger = None):
280
320
  except Exception:
281
321
  print(message,flush=True)
282
322
 
283
- def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True,delimiter = DEFAULT_DELIMITER,defaults = None):
323
+ def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True,delimiter = DEFAULT_DELIMITER,defaults = ...):
284
324
  """
285
325
  Process a line of text and update the task dictionary.
286
326
 
@@ -297,7 +337,7 @@ def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,
297
337
  tuple: A tuple containing the updated correctColumnNum and the processed lineCache.
298
338
 
299
339
  """
300
- if not defaults:
340
+ if defaults is ...:
301
341
  defaults = []
302
342
  line = line.strip(' ').strip('\x00').rstrip('\r\n')
303
343
  # we throw away the lines that start with '#'
@@ -367,7 +407,7 @@ def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,
367
407
  __teePrintOrNot(f"Key {lineCache[0]} added",teeLogger=teeLogger)
368
408
  return correctColumnNum, lineCache
369
409
 
370
- def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,encoding = 'utf8',delimiter = ...,defaults = []):
410
+ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,encoding = 'utf8',delimiter = ...,defaults = ...):
371
411
  """
372
412
  Reads the last valid line from a file.
373
413
 
@@ -387,10 +427,12 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
387
427
  """
388
428
  chunk_size = 1024 # Read in chunks of 1024 bytes
389
429
  last_valid_line = []
430
+ if defaults is ...:
431
+ defaults = []
390
432
  delimiter = get_delimiter(delimiter,file_name=fileName)
391
433
  if verbose:
392
434
  __teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
393
- with open(fileName, 'rb') as file:
435
+ with openFileAsCompressed(fileName, 'rb',encoding=encoding, teeLogger=teeLogger) as file:
394
436
  file.seek(0, os.SEEK_END)
395
437
  file_size = file.tell()
396
438
  buffer = b''
@@ -414,7 +456,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
414
456
  if lines[i].strip(): # Skip empty lines
415
457
  # Process the line
416
458
  correctColumnNum, lineCache = _processLine(
417
- line=lines[i].decode(encoding=encoding),
459
+ line=lines[i].decode(encoding=encoding,errors='replace'),
418
460
  taskDic=taskDic,
419
461
  correctColumnNum=correctColumnNum,
420
462
  verbose=verbose,
@@ -501,19 +543,22 @@ def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,heade
501
543
  Returns:
502
544
  bool: True if the file exists, False otherwise.
503
545
  """
504
- if delimiter and delimiter == '\t' and not fileName.endswith('.tsv'):
546
+ remainingFileName, _ ,extenstionName = fileName.rpartition('.')
547
+ if extenstionName in COMPRESSED_FILE_EXTENSIONS:
548
+ remainingFileName, _ ,extenstionName = remainingFileName.rpartition('.')
549
+ if delimiter and delimiter == '\t' and not extenstionName == 'tsv':
505
550
  __teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
506
- elif delimiter and delimiter == ',' and not fileName.endswith('.csv'):
551
+ elif delimiter and delimiter == ',' and not extenstionName == 'csv':
507
552
  __teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
508
- elif delimiter and delimiter == '\0' and not fileName.endswith('.nsv'):
553
+ elif delimiter and delimiter == '\0' and not extenstionName == 'nsv':
509
554
  __teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
510
- elif delimiter and delimiter == '|' and not fileName.endswith('.psv'):
555
+ elif delimiter and delimiter == '|' and not extenstionName == 'psv':
511
556
  __teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
512
557
  if not os.path.isfile(fileName):
513
558
  if createIfNotExist:
514
559
  try:
515
- with open(fileName, mode ='w',encoding=encoding)as file:
516
- file.write(header+'\n')
560
+ with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
561
+ file.write(header.encode(encoding=encoding,errors='replace')+b'\n')
517
562
  __teePrintOrNot('Created '+fileName,teeLogger=teeLogger)
518
563
  return True
519
564
  except:
@@ -528,7 +573,7 @@ def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,heade
528
573
  return False
529
574
  return True
530
575
 
531
- def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t',defaults = []):
576
+ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t',defaults = ...):
532
577
  """
533
578
  Compatibility method, calls readTabularFile.
534
579
  Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
@@ -556,7 +601,7 @@ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, last
556
601
  """
557
602
  return readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
558
603
 
559
- def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...,defaults = []):
604
+ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...,defaults = ...):
560
605
  """
561
606
  Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
562
607
 
@@ -583,14 +628,16 @@ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = Fal
583
628
  """
584
629
  if taskDic is None:
585
630
  taskDic = {}
631
+ if defaults is ...:
632
+ defaults = []
586
633
  delimiter = get_delimiter(delimiter,file_name=fileName)
587
634
  header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
588
635
  if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
589
636
  return taskDic
590
- with open(fileName, mode ='rb')as file:
637
+ with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
591
638
  correctColumnNum = -1
592
639
  if header.rstrip() and verifyHeader:
593
- line = file.readline().decode(encoding=encoding)
640
+ line = file.readline().decode(encoding=encoding,errors='replace')
594
641
  if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
595
642
  correctColumnNum = len(header.split(delimiter))
596
643
  if verbose:
@@ -601,7 +648,7 @@ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = Fal
601
648
  taskDic[lineCache[0]] = lineCache
602
649
  return lineCache
603
650
  for line in file:
604
- correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter,defaults = defaults)
651
+ correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding,errors='replace'),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter,defaults = defaults)
605
652
  return taskDic
606
653
 
607
654
  def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = '\t'):
@@ -689,10 +736,10 @@ def appendLinesTabularFile(fileName,linesToAppend,teeLogger = None,header = '',c
689
736
  if verbose:
690
737
  __teePrintOrNot(f"No lines to append to {fileName}",teeLogger=teeLogger)
691
738
  return
692
- with open(fileName, mode ='r+b')as file:
739
+ with openFileAsCompressed(fileName, mode ='ab',encoding=encoding,teeLogger=teeLogger)as file:
693
740
  correctColumnNum = max([len(line) for line in formatedLines])
694
741
  if header.rstrip() and verifyHeader:
695
- line = file.readline().decode(encoding=encoding)
742
+ line = file.readline().decode(encoding=encoding,errors='replace')
696
743
  if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
697
744
  correctColumnNum = len(header.split(delimiter))
698
745
  if verbose:
@@ -704,10 +751,10 @@ def appendLinesTabularFile(fileName,linesToAppend,teeLogger = None,header = '',c
704
751
  elif len(formatedLines[i]) > correctColumnNum:
705
752
  formatedLines[i] = formatedLines[i][:correctColumnNum]
706
753
  # check if the file ends in a newline
707
- file.seek(-1, os.SEEK_END)
708
- if file.read(1) != b'\n':
709
- file.write(b'\n')
710
- file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding) for line in formatedLines]) + b'\n')
754
+ # file.seek(-1, os.SEEK_END)
755
+ # if file.read(1) != b'\n':
756
+ # file.write(b'\n')
757
+ file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding,errors='replace') for line in formatedLines]) + b'\n')
711
758
  if verbose:
712
759
  __teePrintOrNot(f"Appended {len(formatedLines)} lines to {fileName}",teeLogger=teeLogger)
713
760
 
@@ -743,14 +790,17 @@ def clearTabularFile(fileName,teeLogger = None,header = '',verifyHeader = False,
743
790
  if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
744
791
  raise FileNotFoundError("Something catastrophic happened! File still not found after creation")
745
792
  else:
746
- with open(fileName, mode ='r+',encoding=encoding)as file:
793
+ with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
747
794
  if header.rstrip() and verifyHeader:
748
- line = file.readline()
795
+ line = file.readline().decode(encoding=encoding,errors='replace')
749
796
  if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
750
797
  __teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
751
- file.truncate()
752
- else:
753
- file.write(header+'\n')
798
+ header = line
799
+ with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
800
+ if header:
801
+ if not header.endswith('\n'):
802
+ header += '\n'
803
+ file.write(header.encode(encoding=encoding,errors='replace'))
754
804
  if verbose:
755
805
  __teePrintOrNot(f"Cleared {fileName}",teeLogger=teeLogger)
756
806
 
@@ -770,7 +820,69 @@ def get_time_ns():
770
820
  except:
771
821
  # try to get the time in nanoseconds
772
822
  return int(time.time()*1e9)
823
+
824
+ def scrubTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = '\t',defaults = ...):
825
+ """
826
+ Compatibility method, calls scrubTabularFile.
827
+ Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.
828
+ Return the data as a dictionary.
829
+
830
+ Parameters:
831
+ - fileName (str): The path to the Tabular file.
832
+ - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
833
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
834
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
835
+ - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
836
+ - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
837
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
838
+ - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
839
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
840
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
841
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
842
+ - defaults (list, optional): The default values to use for missing columns. Defaults to [].
843
+
844
+ Returns:
845
+ - OrderedDict: The dictionary containing the data from the Tabular file.
846
+
847
+ Raises:
848
+ - Exception: If the file is not found or there is a data format error.
849
+
850
+ """
851
+ return scrubTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
852
+
853
+ def scrubTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = ...,defaults = ...):
854
+ """
855
+ Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.
856
+ If using compressed files. This will recompress the file in whole and possibily increase the compression ratio reducing the file size.
857
+ Return the data as a dictionary.
773
858
 
859
+ Parameters:
860
+ - fileName (str): The path to the Tabular file.
861
+ - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
862
+ - header (str or list, optional): The header of the Tabular file. If a string, it should be a tab-separated list of column names. If a list, it should contain the column names. Defaults to ''.
863
+ - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
864
+ - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
865
+ - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
866
+ - verbose (bool, optional): Whether to print verbose output. Defaults to False.
867
+ - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to an empty OrderedDict.
868
+ - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
869
+ - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
870
+ - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t' for TSV, ',' for CSV, '\0' for NSV.
871
+ - defaults (list, optional): The default values to use for missing columns. Defaults to [].
872
+
873
+ Returns:
874
+ - OrderedDict: The dictionary containing the data from the Tabular file.
875
+
876
+ Raises:
877
+ - Exception: If the file is not found or there is a data format error.
878
+
879
+ """
880
+ file = readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
881
+ if file:
882
+ clearTabularFile(fileName,teeLogger = teeLogger,header = header,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
883
+ appendLinesTabularFile(fileName,file,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,verifyHeader = verifyHeader,verbose = verbose,encoding = encoding,strict = strict,delimiter = delimiter)
884
+ return file
885
+
774
886
  # create a tsv class that functions like a ordered dictionary but will update the file when modified
775
887
  class TSVZed(OrderedDict):
776
888
  def __teePrintOrNot(self,message,level = 'info'):
@@ -1006,14 +1118,14 @@ class TSVZed(OrderedDict):
1006
1118
  def clear_file(self):
1007
1119
  try:
1008
1120
  if self.header:
1009
- file = self.get_file_obj('w')
1010
- file.write(self.header+'\n')
1121
+ file = self.get_file_obj('wb')
1122
+ file.write(self.header.encode(self.encoding,errors='replace') + b'\n')
1011
1123
  self.release_file_obj(file)
1012
1124
  if self.verbose:
1013
1125
  self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
1014
1126
  self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
1015
1127
  else:
1016
- file = self.get_file_obj('w')
1128
+ file = self.get_file_obj('wb')
1017
1129
  self.release_file_obj(file)
1018
1130
  if self.verbose:
1019
1131
  self.__teePrintOrNot(f"File {self._fileName} cleared empty")
@@ -1149,15 +1261,15 @@ memoryOnly:{self.memoryOnly}
1149
1261
  self.deSynced = True
1150
1262
  return False
1151
1263
 
1152
- def oldMapToFile(self):
1264
+ def hardMapToFile(self):
1153
1265
  try:
1154
1266
  if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
1155
1267
  self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
1156
- file = self.get_file_obj('w')
1268
+ file = self.get_file_obj('wb')
1157
1269
  if self.header:
1158
- file.write(self.header+'\n')
1270
+ file.write(self.header.encode(self.encoding,errors='replace') + b'\n')
1159
1271
  for key in self:
1160
- file.write(self.delimiter.join(self[key])+'\n')
1272
+ file.write(self.delimiter.join(self[key]).encode(encoding=self.encoding,errors='replace')+b'\n')
1161
1273
  self.release_file_obj(file)
1162
1274
  if self.verbose:
1163
1275
  self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
@@ -1166,7 +1278,7 @@ memoryOnly:{self.memoryOnly}
1166
1278
  self.deSynced = False
1167
1279
  except Exception as e:
1168
1280
  self.release_file_obj(file)
1169
- self.__teePrintOrNot(f"Failed to write at oldMapToFile() to {self._fileName}: {e}",'error')
1281
+ self.__teePrintOrNot(f"Failed to write at hardMapToFile() to {self._fileName}: {e}",'error')
1170
1282
  import traceback
1171
1283
  self.__teePrintOrNot(traceback.format_exc(),'error')
1172
1284
  self.deSynced = True
@@ -1178,14 +1290,17 @@ memoryOnly:{self.memoryOnly}
1178
1290
  try:
1179
1291
  if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
1180
1292
  self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
1293
+ if self._fileName.rpartition('.')[2] in COMPRESSED_FILE_EXTENSIONS:
1294
+ # if the file is compressed, we need to use the hardMapToFile method
1295
+ return self.hardMapToFile()
1181
1296
  file = self.get_file_obj('r+b')
1182
1297
  overWrite = False
1183
1298
  if self.header:
1184
- line = file.readline().decode(self.encoding)
1299
+ line = file.readline().decode(self.encoding,errors='replace')
1185
1300
  aftPos = file.tell()
1186
1301
  if not _lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = self.strict):
1187
1302
  file.seek(0)
1188
- file.write(f'{self.header}\n'.encode(encoding=self.encoding))
1303
+ file.write(f'{self.header}\n'.encode(encoding=self.encoding,errors='replace'))
1189
1304
  # if the header is not the same length as the line, we need to overwrite the file
1190
1305
  if aftPos != file.tell():
1191
1306
  overWrite = True
@@ -1198,7 +1313,7 @@ memoryOnly:{self.memoryOnly}
1198
1313
  if overWrite:
1199
1314
  if self.verbose:
1200
1315
  self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
1201
- file.write(strToWrite.encode(encoding=self.encoding)+b'\n')
1316
+ file.write(strToWrite.encode(encoding=self.encoding,errors='replace')+b'\n')
1202
1317
  continue
1203
1318
  pos = file.tell()
1204
1319
  line = file.readline()
@@ -1206,10 +1321,10 @@ memoryOnly:{self.memoryOnly}
1206
1321
  if not line or pos == aftPos:
1207
1322
  if self.verbose:
1208
1323
  self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
1209
- file.write(strToWrite.encode(encoding=self.encoding))
1324
+ file.write(strToWrite.encode(encoding=self.encoding,errors='replace'))
1210
1325
  overWrite = True
1211
1326
  continue
1212
- strToWrite = strToWrite.encode(encoding=self.encoding).ljust(len(line)-1)+b'\n'
1327
+ strToWrite = strToWrite.encode(encoding=self.encoding,errors='replace').ljust(len(line)-1)+b'\n'
1213
1328
  if line != strToWrite:
1214
1329
  if self.verbose:
1215
1330
  self.__teePrintOrNot(f"Modifing {value} to {self._fileName}")
@@ -1232,6 +1347,8 @@ memoryOnly:{self.memoryOnly}
1232
1347
  import traceback
1233
1348
  self.__teePrintOrNot(traceback.format_exc(),'error')
1234
1349
  self.deSynced = True
1350
+ self.__teePrintOrNot("Trying failback hardMapToFile()")
1351
+ self.hardMapToFile()
1235
1352
  self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
1236
1353
  self.monitor_external_changes = mec
1237
1354
  return self
@@ -1274,10 +1391,10 @@ memoryOnly:{self.memoryOnly}
1274
1391
  if self.verbose:
1275
1392
  self.__teePrintOrNot(f"Commiting {len(self.appendQueue)} records to {self._fileName}")
1276
1393
  self.__teePrintOrNot(f"Before size of {self._fileName}: {os.path.getsize(self._fileName)}")
1277
- file = self.get_file_obj('a')
1394
+ file = self.get_file_obj('ab')
1278
1395
  while self.appendQueue:
1279
1396
  line = self.appendQueue.popleft()
1280
- file.write(line+'\n')
1397
+ file.write(line.encode(encoding=self.encoding,errors='replace')+b'\n')
1281
1398
  self.release_file_obj(file)
1282
1399
  if self.verbose:
1283
1400
  self.__teePrintOrNot(f"Records commited to {self._fileName}")
@@ -1302,15 +1419,12 @@ memoryOnly:{self.memoryOnly}
1302
1419
  if self.verbose:
1303
1420
  self.__teePrintOrNot(f"Append thread for {self._fileName} stopped")
1304
1421
 
1305
- def get_file_obj(self,modes = 'a'):
1422
+ def get_file_obj(self,modes = 'ab'):
1306
1423
  self.writeLock.acquire()
1307
1424
  try:
1308
- if 'b' not in modes:
1309
- if not self.encoding:
1310
- self.encoding = 'utf8'
1311
- file = open(self._fileName, mode=modes, encoding=self.encoding)
1312
- else:
1313
- file = open(self._fileName, mode=modes)
1425
+ if not self.encoding:
1426
+ self.encoding = 'utf8'
1427
+ file = openFileAsCompressed(self._fileName, mode=modes, encoding=self.encoding,teeLogger=self.teeLogger)
1314
1428
  # Lock the file after opening
1315
1429
  if os.name == 'posix':
1316
1430
  fcntl.lockf(file, fcntl.LOCK_EX)
@@ -1371,7 +1485,7 @@ def __main__():
1371
1485
  import argparse
1372
1486
  parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
1373
1487
  parser.add_argument('filename', type=str, help='The file to read')
1374
- parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear'], help='The operation to perform. Default: read', default='read')
1488
+ parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear','scrub'], help='The operation to perform. Note: scrub will also remove all comments. Default: read', default='read')
1375
1489
  parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
1376
1490
  parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
1377
1491
  parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
@@ -1380,7 +1494,7 @@ def __main__():
1380
1494
  strictMode.add_argument('-s', '--strict', dest = 'strict',action='store_true', help='Strict mode. Do not parse values that seems malformed, check for column numbers / headers')
1381
1495
  strictMode.add_argument('-f', '--force', dest = 'strict',action='store_false', help='Force the operation. Ignore checks for column numbers / headers')
1382
1496
  parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
1383
- parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} by {author}')
1497
+ parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} @ {COMMIT_DATE} by {author}')
1384
1498
  args = parser.parse_args()
1385
1499
  args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
1386
1500
  if args.header and args.header.endswith('\\'):
@@ -1412,6 +1526,8 @@ def __main__():
1412
1526
  appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
1413
1527
  elif args.operation == 'clear':
1414
1528
  clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=args.strict, delimiter=args.delimiter)
1529
+ elif args.operation == 'scrub':
1530
+ scrubTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
1415
1531
  else:
1416
1532
  print("Invalid operation")
1417
1533
  if __name__ == '__main__':
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes