TSVZ 3.25__py3-none-any.whl → 3.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- TSVZ.py +159 -47
- {tsvz-3.25.dist-info → tsvz-3.26.dist-info}/METADATA +1 -1
- tsvz-3.26.dist-info/RECORD +6 -0
- {tsvz-3.25.dist-info → tsvz-3.26.dist-info}/WHEEL +1 -1
- tsvz-3.25.dist-info/RECORD +0 -6
- {tsvz-3.25.dist-info → tsvz-3.26.dist-info}/entry_points.txt +0 -0
- {tsvz-3.25.dist-info → tsvz-3.26.dist-info}/top_level.txt +0 -0
TSVZ.py
CHANGED
|
@@ -22,13 +22,16 @@ if os.name == 'nt':
|
|
|
22
22
|
elif os.name == 'posix':
|
|
23
23
|
import fcntl
|
|
24
24
|
|
|
25
|
-
version = '3.
|
|
25
|
+
version = '3.26'
|
|
26
26
|
__version__ = version
|
|
27
27
|
author = 'pan@zopyr.us'
|
|
28
|
+
COMMIT_DATE = '2025-05-19'
|
|
28
29
|
|
|
29
30
|
DEFAULT_DELIMITER = '\t'
|
|
30
31
|
DEFAULTS_INDICATOR_KEY = '#_defaults_#'
|
|
31
32
|
|
|
33
|
+
COMPRESSED_FILE_EXTENSIONS = ['gz','gzip','bz2','bzip2','xz','lzma']
|
|
34
|
+
|
|
32
35
|
def get_delimiter(delimiter,file_name = ''):
|
|
33
36
|
global DEFAULT_DELIMITER
|
|
34
37
|
if not delimiter:
|
|
@@ -57,6 +60,43 @@ def get_delimiter(delimiter,file_name = ''):
|
|
|
57
60
|
DEFAULT_DELIMITER = rtn
|
|
58
61
|
return rtn
|
|
59
62
|
|
|
63
|
+
def openFileAsCompressed(fileName,mode = 'rb',encoding = 'utf8',teeLogger = None,compressLevel = 1):
    """
    Open a file, transparently (de)compressing it when its name ends with a known compressed suffix.

    The codec is picked from the file name only:
    .xz / .lzma -> lzma, .gz / .gzip -> gzip, .bz2 / .bzip2 -> bz2.
    Any other name, or any failure while opening through the codec, falls back to the builtin open().

    Parameters:
    - fileName (str): The path of the file to open.
    - mode (str, optional): The open mode. A mode without 'b' is treated as text mode. Defaults to 'rb'.
    - encoding (str, optional): The text encoding, only used in text mode. Defaults to 'utf8'.
    - teeLogger (Logger, optional): The logger passed to __teePrintOrNot when a codec fails. Defaults to None.
    - compressLevel (int, optional): The compression level / preset used when the mode is not a read mode. Defaults to 1.

    Returns:
    - The file object returned by lzma.open / gzip.open / bz2.open, or by the builtin open() on fallback.

    Raises:
    - Whatever the builtin open() raises if the fallback open fails as well.
    """
    # The codec modules are imported lazily so a Python build lacking one of them can still use the rest.
    import importlib
    # (file name suffixes, stdlib module name, name of that module's compression level keyword)
    # lzma.open only understands `preset`; handing it `compresslevel` raises TypeError,
    # which used to silently downgrade every .lzma write to an uncompressed file.
    codecs = (
        (('.xz','.lzma'),'lzma','preset'),
        (('.gz','.gzip'),'gzip','compresslevel'),
        (('.bz2','.bzip2'),'bz2','compresslevel'),
    )
    if 'b' not in mode and 't' not in mode:
        # the codec open() functions default to binary, so text mode has to be spelled out
        mode += 't'
    for suffixes, moduleName, levelKeyword in codecs:
        if not fileName.endswith(suffixes):
            continue
        kwargs = {}
        if 'r' not in mode:
            # a compression level only makes sense when writing / appending
            kwargs[levelKeyword] = compressLevel
        if 'b' not in mode:
            kwargs['encoding'] = encoding
        try:
            return importlib.import_module(moduleName).open(fileName, mode, **kwargs)
        except Exception:
            # best effort: report the problem and fall through to a plain open() below
            __teePrintOrNot(f"Failed to open {fileName} with {moduleName}, trying bin",teeLogger=teeLogger)
        break
    if 't' in mode:
        # builtin open() is text mode by default; strip the explicit marker added above
        return open(fileName, mode.replace('t',''), encoding=encoding)
    return open(fileName, mode)
|
|
98
|
+
|
|
99
|
+
|
|
60
100
|
def pretty_format_table(data, delimiter = DEFAULT_DELIMITER,header = None):
|
|
61
101
|
version = 1.11
|
|
62
102
|
_ = version
|
|
@@ -392,7 +432,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
|
|
|
392
432
|
delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
393
433
|
if verbose:
|
|
394
434
|
__teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
|
|
395
|
-
with
|
|
435
|
+
with openFileAsCompressed(fileName, 'rb',encoding=encoding, teeLogger=teeLogger) as file:
|
|
396
436
|
file.seek(0, os.SEEK_END)
|
|
397
437
|
file_size = file.tell()
|
|
398
438
|
buffer = b''
|
|
@@ -416,7 +456,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
|
|
|
416
456
|
if lines[i].strip(): # Skip empty lines
|
|
417
457
|
# Process the line
|
|
418
458
|
correctColumnNum, lineCache = _processLine(
|
|
419
|
-
line=lines[i].decode(encoding=encoding),
|
|
459
|
+
line=lines[i].decode(encoding=encoding,errors='replace'),
|
|
420
460
|
taskDic=taskDic,
|
|
421
461
|
correctColumnNum=correctColumnNum,
|
|
422
462
|
verbose=verbose,
|
|
@@ -503,19 +543,22 @@ def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,heade
|
|
|
503
543
|
Returns:
|
|
504
544
|
bool: True if the file exists, False otherwise.
|
|
505
545
|
"""
|
|
506
|
-
|
|
546
|
+
remainingFileName, _ ,extenstionName = fileName.rpartition('.')
|
|
547
|
+
if extenstionName in COMPRESSED_FILE_EXTENSIONS:
|
|
548
|
+
remainingFileName, _ ,extenstionName = remainingFileName.rpartition('.')
|
|
549
|
+
if delimiter and delimiter == '\t' and not extenstionName == 'tsv':
|
|
507
550
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
|
|
508
|
-
elif delimiter and delimiter == ',' and not
|
|
551
|
+
elif delimiter and delimiter == ',' and not extenstionName == 'csv':
|
|
509
552
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
|
|
510
|
-
elif delimiter and delimiter == '\0' and not
|
|
553
|
+
elif delimiter and delimiter == '\0' and not extenstionName == 'nsv':
|
|
511
554
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
|
|
512
|
-
elif delimiter and delimiter == '|' and not
|
|
555
|
+
elif delimiter and delimiter == '|' and not extenstionName == 'psv':
|
|
513
556
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
|
|
514
557
|
if not os.path.isfile(fileName):
|
|
515
558
|
if createIfNotExist:
|
|
516
559
|
try:
|
|
517
|
-
with
|
|
518
|
-
file.write(header+'\n')
|
|
560
|
+
with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
561
|
+
file.write(header.encode(encoding=encoding,errors='replace')+b'\n')
|
|
519
562
|
__teePrintOrNot('Created '+fileName,teeLogger=teeLogger)
|
|
520
563
|
return True
|
|
521
564
|
except:
|
|
@@ -591,10 +634,10 @@ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = Fal
|
|
|
591
634
|
header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
|
|
592
635
|
if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
|
|
593
636
|
return taskDic
|
|
594
|
-
with
|
|
637
|
+
with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
595
638
|
correctColumnNum = -1
|
|
596
639
|
if header.rstrip() and verifyHeader:
|
|
597
|
-
line = file.readline().decode(encoding=encoding)
|
|
640
|
+
line = file.readline().decode(encoding=encoding,errors='replace')
|
|
598
641
|
if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
599
642
|
correctColumnNum = len(header.split(delimiter))
|
|
600
643
|
if verbose:
|
|
@@ -605,7 +648,7 @@ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = Fal
|
|
|
605
648
|
taskDic[lineCache[0]] = lineCache
|
|
606
649
|
return lineCache
|
|
607
650
|
for line in file:
|
|
608
|
-
correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter,defaults = defaults)
|
|
651
|
+
correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding,errors='replace'),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter,defaults = defaults)
|
|
609
652
|
return taskDic
|
|
610
653
|
|
|
611
654
|
def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = '\t'):
|
|
@@ -693,10 +736,10 @@ def appendLinesTabularFile(fileName,linesToAppend,teeLogger = None,header = '',c
|
|
|
693
736
|
if verbose:
|
|
694
737
|
__teePrintOrNot(f"No lines to append to {fileName}",teeLogger=teeLogger)
|
|
695
738
|
return
|
|
696
|
-
with
|
|
739
|
+
with openFileAsCompressed(fileName, mode ='ab',encoding=encoding,teeLogger=teeLogger)as file:
|
|
697
740
|
correctColumnNum = max([len(line) for line in formatedLines])
|
|
698
741
|
if header.rstrip() and verifyHeader:
|
|
699
|
-
line = file.readline().decode(encoding=encoding)
|
|
742
|
+
line = file.readline().decode(encoding=encoding,errors='replace')
|
|
700
743
|
if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
701
744
|
correctColumnNum = len(header.split(delimiter))
|
|
702
745
|
if verbose:
|
|
@@ -708,10 +751,10 @@ def appendLinesTabularFile(fileName,linesToAppend,teeLogger = None,header = '',c
|
|
|
708
751
|
elif len(formatedLines[i]) > correctColumnNum:
|
|
709
752
|
formatedLines[i] = formatedLines[i][:correctColumnNum]
|
|
710
753
|
# check if the file ends in a newline
|
|
711
|
-
file.seek(-1, os.SEEK_END)
|
|
712
|
-
if file.read(1) != b'\n':
|
|
713
|
-
|
|
714
|
-
file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding) for line in formatedLines]) + b'\n')
|
|
754
|
+
# file.seek(-1, os.SEEK_END)
|
|
755
|
+
# if file.read(1) != b'\n':
|
|
756
|
+
# file.write(b'\n')
|
|
757
|
+
file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding,errors='replace') for line in formatedLines]) + b'\n')
|
|
715
758
|
if verbose:
|
|
716
759
|
__teePrintOrNot(f"Appended {len(formatedLines)} lines to {fileName}",teeLogger=teeLogger)
|
|
717
760
|
|
|
@@ -747,14 +790,17 @@ def clearTabularFile(fileName,teeLogger = None,header = '',verifyHeader = False,
|
|
|
747
790
|
if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
|
|
748
791
|
raise FileNotFoundError("Something catastrophic happened! File still not found after creation")
|
|
749
792
|
else:
|
|
750
|
-
with
|
|
793
|
+
with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
751
794
|
if header.rstrip() and verifyHeader:
|
|
752
|
-
line = file.readline()
|
|
795
|
+
line = file.readline().decode(encoding=encoding,errors='replace')
|
|
753
796
|
if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
754
797
|
__teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
798
|
+
header = line
|
|
799
|
+
with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
800
|
+
if header:
|
|
801
|
+
if not header.endswith('\n'):
|
|
802
|
+
header += '\n'
|
|
803
|
+
file.write(header.encode(encoding=encoding,errors='replace'))
|
|
758
804
|
if verbose:
|
|
759
805
|
__teePrintOrNot(f"Cleared {fileName}",teeLogger=teeLogger)
|
|
760
806
|
|
|
@@ -774,7 +820,69 @@ def get_time_ns():
|
|
|
774
820
|
except:
|
|
775
821
|
# try to get the time in nanoseconds
|
|
776
822
|
return int(time.time()*1e9)
|
|
823
|
+
|
|
824
|
+
def scrubTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = '\t',defaults = ...):
    """
    Compatibility alias for scrubTabularFile with a tab delimiter by default.

    Every argument is forwarded unchanged to scrubTabularFile and its result is returned as is.
    See scrubTabularFile for what scrubbing does.

    Parameters:
    - fileName (str): The path to the Tabular file.
    - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
    - header (str or list, optional): The header of the Tabular file. Defaults to ''.
    - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
    - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
    - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
    - verbose (bool, optional): Whether to print verbose output. Defaults to False.
    - taskDic (optional): The dictionary to store the data in. Defaults to None.
    - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
    - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
    - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\t'.
    - defaults (optional): The default values to use for missing columns. Defaults to ... (Ellipsis), forwarded as is.

    Returns:
    - The value returned by scrubTabularFile.
    """
    # Collect the pass-through options once so the delegation below stays a single short call.
    forwarded = {
        'teeLogger': teeLogger,
        'header': header,
        'createIfNotExist': createIfNotExist,
        'lastLineOnly': lastLineOnly,
        'verifyHeader': verifyHeader,
        'verbose': verbose,
        'taskDic': taskDic,
        'encoding': encoding,
        'strict': strict,
        'delimiter': delimiter,
        'defaults': defaults,
    }
    return scrubTabularFile(fileName, **forwarded)
|
|
852
|
+
|
|
853
|
+
def scrubTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = ...,defaults = ...):
    """
    Scrub a Tabular (CSV / TSV / NSV) file by reading it and writing the contents back into the file.

    The work is done in three steps: readTabularFile loads the data, clearTabularFile resets the file,
    and appendLinesTabularFile writes the loaded data back. When the read result is empty (falsy)
    the file is left untouched.
    For compressed files this recompresses the file as a whole, which can possibly improve the
    compression ratio and reduce the file size.

    Parameters:
    - fileName (str): The path to the Tabular file.
    - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
    - header (str or list, optional): The header of the Tabular file. Defaults to ''.
    - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
    - lastLineOnly (bool, optional): Forwarded to readTabularFile only. Defaults to False.
    - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
    - verbose (bool, optional): Whether to print verbose output. Defaults to False.
    - taskDic (optional): The dictionary to store the data in, forwarded to readTabularFile only. Defaults to None.
    - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
    - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
    - delimiter (optional): The delimiter used in the Tabular file. Defaults to ... (Ellipsis), forwarded as is.
    - defaults (optional): The default values to use for missing columns, forwarded to readTabularFile only. Defaults to ... (Ellipsis).

    Returns:
    - The value returned by readTabularFile.

    Raises:
    - Whatever readTabularFile, clearTabularFile or appendLinesTabularFile raise.
    """
    # Options that all three helper calls receive unchanged.
    common = {
        'teeLogger': teeLogger,
        'header': header,
        'verifyHeader': verifyHeader,
        'verbose': verbose,
        'encoding': encoding,
        'strict': strict,
        'delimiter': delimiter,
    }
    # NOTE(review): lastLineOnly is handed to the read step; if that makes the read return only the
    # last record, the rewrite below would keep just that record - confirm before combining the two.
    content = readTabularFile(fileName,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,taskDic = taskDic,defaults = defaults,**common)
    if not content:
        # nothing was read, so do not touch the file
        return content
    clearTabularFile(fileName,**common)
    appendLinesTabularFile(fileName,content,createIfNotExist = createIfNotExist,**common)
    return content
|
|
885
|
+
|
|
778
886
|
# create a tsv class that functions like an ordered dictionary but will update the file when modified
|
|
779
887
|
class TSVZed(OrderedDict):
|
|
780
888
|
def __teePrintOrNot(self,message,level = 'info'):
|
|
@@ -1010,14 +1118,14 @@ class TSVZed(OrderedDict):
|
|
|
1010
1118
|
def clear_file(self):
|
|
1011
1119
|
try:
|
|
1012
1120
|
if self.header:
|
|
1013
|
-
file = self.get_file_obj('
|
|
1014
|
-
file.write(self.header+'\n')
|
|
1121
|
+
file = self.get_file_obj('wb')
|
|
1122
|
+
file.write(self.header.encode(self.encoding,errors='replace') + b'\n')
|
|
1015
1123
|
self.release_file_obj(file)
|
|
1016
1124
|
if self.verbose:
|
|
1017
1125
|
self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
|
|
1018
1126
|
self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
|
|
1019
1127
|
else:
|
|
1020
|
-
file = self.get_file_obj('
|
|
1128
|
+
file = self.get_file_obj('wb')
|
|
1021
1129
|
self.release_file_obj(file)
|
|
1022
1130
|
if self.verbose:
|
|
1023
1131
|
self.__teePrintOrNot(f"File {self._fileName} cleared empty")
|
|
@@ -1153,15 +1261,15 @@ memoryOnly:{self.memoryOnly}
|
|
|
1153
1261
|
self.deSynced = True
|
|
1154
1262
|
return False
|
|
1155
1263
|
|
|
1156
|
-
def
|
|
1264
|
+
def hardMapToFile(self):
|
|
1157
1265
|
try:
|
|
1158
1266
|
if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
|
|
1159
1267
|
self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
|
|
1160
|
-
file = self.get_file_obj('
|
|
1268
|
+
file = self.get_file_obj('wb')
|
|
1161
1269
|
if self.header:
|
|
1162
|
-
file.write(self.header+'\n')
|
|
1270
|
+
file.write(self.header.encode(self.encoding,errors='replace') + b'\n')
|
|
1163
1271
|
for key in self:
|
|
1164
|
-
file.write(self.delimiter.join(self[key])+'\n')
|
|
1272
|
+
file.write(self.delimiter.join(self[key]).encode(encoding=self.encoding,errors='replace')+b'\n')
|
|
1165
1273
|
self.release_file_obj(file)
|
|
1166
1274
|
if self.verbose:
|
|
1167
1275
|
self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
|
|
@@ -1170,7 +1278,7 @@ memoryOnly:{self.memoryOnly}
|
|
|
1170
1278
|
self.deSynced = False
|
|
1171
1279
|
except Exception as e:
|
|
1172
1280
|
self.release_file_obj(file)
|
|
1173
|
-
self.__teePrintOrNot(f"Failed to write at
|
|
1281
|
+
self.__teePrintOrNot(f"Failed to write at hardMapToFile() to {self._fileName}: {e}",'error')
|
|
1174
1282
|
import traceback
|
|
1175
1283
|
self.__teePrintOrNot(traceback.format_exc(),'error')
|
|
1176
1284
|
self.deSynced = True
|
|
@@ -1182,14 +1290,17 @@ memoryOnly:{self.memoryOnly}
|
|
|
1182
1290
|
try:
|
|
1183
1291
|
if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
|
|
1184
1292
|
self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
|
|
1293
|
+
if self._fileName.rpartition('.')[2] in COMPRESSED_FILE_EXTENSIONS:
|
|
1294
|
+
# if the file is compressed, we need to use the hardMapToFile method
|
|
1295
|
+
return self.hardMapToFile()
|
|
1185
1296
|
file = self.get_file_obj('r+b')
|
|
1186
1297
|
overWrite = False
|
|
1187
1298
|
if self.header:
|
|
1188
|
-
line = file.readline().decode(self.encoding)
|
|
1299
|
+
line = file.readline().decode(self.encoding,errors='replace')
|
|
1189
1300
|
aftPos = file.tell()
|
|
1190
1301
|
if not _lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = self.strict):
|
|
1191
1302
|
file.seek(0)
|
|
1192
|
-
file.write(f'{self.header}\n'.encode(encoding=self.encoding))
|
|
1303
|
+
file.write(f'{self.header}\n'.encode(encoding=self.encoding,errors='replace'))
|
|
1193
1304
|
# if the header is not the same length as the line, we need to overwrite the file
|
|
1194
1305
|
if aftPos != file.tell():
|
|
1195
1306
|
overWrite = True
|
|
@@ -1202,7 +1313,7 @@ memoryOnly:{self.memoryOnly}
|
|
|
1202
1313
|
if overWrite:
|
|
1203
1314
|
if self.verbose:
|
|
1204
1315
|
self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
|
|
1205
|
-
file.write(strToWrite.encode(encoding=self.encoding)+b'\n')
|
|
1316
|
+
file.write(strToWrite.encode(encoding=self.encoding,errors='replace')+b'\n')
|
|
1206
1317
|
continue
|
|
1207
1318
|
pos = file.tell()
|
|
1208
1319
|
line = file.readline()
|
|
@@ -1210,10 +1321,10 @@ memoryOnly:{self.memoryOnly}
|
|
|
1210
1321
|
if not line or pos == aftPos:
|
|
1211
1322
|
if self.verbose:
|
|
1212
1323
|
self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
|
|
1213
|
-
file.write(strToWrite.encode(encoding=self.encoding))
|
|
1324
|
+
file.write(strToWrite.encode(encoding=self.encoding,errors='replace'))
|
|
1214
1325
|
overWrite = True
|
|
1215
1326
|
continue
|
|
1216
|
-
strToWrite = strToWrite.encode(encoding=self.encoding).ljust(len(line)-1)+b'\n'
|
|
1327
|
+
strToWrite = strToWrite.encode(encoding=self.encoding,errors='replace').ljust(len(line)-1)+b'\n'
|
|
1217
1328
|
if line != strToWrite:
|
|
1218
1329
|
if self.verbose:
|
|
1219
1330
|
self.__teePrintOrNot(f"Modifing {value} to {self._fileName}")
|
|
@@ -1236,6 +1347,8 @@ memoryOnly:{self.memoryOnly}
|
|
|
1236
1347
|
import traceback
|
|
1237
1348
|
self.__teePrintOrNot(traceback.format_exc(),'error')
|
|
1238
1349
|
self.deSynced = True
|
|
1350
|
+
self.__teePrintOrNot("Trying failback hardMapToFile()")
|
|
1351
|
+
self.hardMapToFile()
|
|
1239
1352
|
self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
|
|
1240
1353
|
self.monitor_external_changes = mec
|
|
1241
1354
|
return self
|
|
@@ -1278,10 +1391,10 @@ memoryOnly:{self.memoryOnly}
|
|
|
1278
1391
|
if self.verbose:
|
|
1279
1392
|
self.__teePrintOrNot(f"Commiting {len(self.appendQueue)} records to {self._fileName}")
|
|
1280
1393
|
self.__teePrintOrNot(f"Before size of {self._fileName}: {os.path.getsize(self._fileName)}")
|
|
1281
|
-
file = self.get_file_obj('
|
|
1394
|
+
file = self.get_file_obj('ab')
|
|
1282
1395
|
while self.appendQueue:
|
|
1283
1396
|
line = self.appendQueue.popleft()
|
|
1284
|
-
file.write(line+'\n')
|
|
1397
|
+
file.write(line.encode(encoding=self.encoding,errors='replace')+b'\n')
|
|
1285
1398
|
self.release_file_obj(file)
|
|
1286
1399
|
if self.verbose:
|
|
1287
1400
|
self.__teePrintOrNot(f"Records commited to {self._fileName}")
|
|
@@ -1306,15 +1419,12 @@ memoryOnly:{self.memoryOnly}
|
|
|
1306
1419
|
if self.verbose:
|
|
1307
1420
|
self.__teePrintOrNot(f"Append thread for {self._fileName} stopped")
|
|
1308
1421
|
|
|
1309
|
-
def get_file_obj(self,modes = '
|
|
1422
|
+
def get_file_obj(self,modes = 'ab'):
|
|
1310
1423
|
self.writeLock.acquire()
|
|
1311
1424
|
try:
|
|
1312
|
-
if
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
file = open(self._fileName, mode=modes, encoding=self.encoding)
|
|
1316
|
-
else:
|
|
1317
|
-
file = open(self._fileName, mode=modes)
|
|
1425
|
+
if not self.encoding:
|
|
1426
|
+
self.encoding = 'utf8'
|
|
1427
|
+
file = openFileAsCompressed(self._fileName, mode=modes, encoding=self.encoding,teeLogger=self.teeLogger)
|
|
1318
1428
|
# Lock the file after opening
|
|
1319
1429
|
if os.name == 'posix':
|
|
1320
1430
|
fcntl.lockf(file, fcntl.LOCK_EX)
|
|
@@ -1375,7 +1485,7 @@ def __main__():
|
|
|
1375
1485
|
import argparse
|
|
1376
1486
|
parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
|
|
1377
1487
|
parser.add_argument('filename', type=str, help='The file to read')
|
|
1378
|
-
parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear'], help='The operation to perform. Default: read', default='read')
|
|
1488
|
+
parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear','scrub'], help='The operation to perform. Note: scrub will also remove all comments. Default: read', default='read')
|
|
1379
1489
|
parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
|
|
1380
1490
|
parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
|
|
1381
1491
|
parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
|
|
@@ -1384,7 +1494,7 @@ def __main__():
|
|
|
1384
1494
|
strictMode.add_argument('-s', '--strict', dest = 'strict',action='store_true', help='Strict mode. Do not parse values that seems malformed, check for column numbers / headers')
|
|
1385
1495
|
strictMode.add_argument('-f', '--force', dest = 'strict',action='store_false', help='Force the operation. Ignore checks for column numbers / headers')
|
|
1386
1496
|
parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
|
|
1387
|
-
parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} by {author}')
|
|
1497
|
+
parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} @ {COMMIT_DATE} by {author}')
|
|
1388
1498
|
args = parser.parse_args()
|
|
1389
1499
|
args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
|
|
1390
1500
|
if args.header and args.header.endswith('\\'):
|
|
@@ -1416,6 +1526,8 @@ def __main__():
|
|
|
1416
1526
|
appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
|
|
1417
1527
|
elif args.operation == 'clear':
|
|
1418
1528
|
clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=args.strict, delimiter=args.delimiter)
|
|
1529
|
+
elif args.operation == 'scrub':
|
|
1530
|
+
scrubTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
|
|
1419
1531
|
else:
|
|
1420
1532
|
print("Invalid operation")
|
|
1421
1533
|
if __name__ == '__main__':
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
TSVZ.py,sha256=WanY7DemCKyfMB4qOiAFkYj_95AaeqQ4R6x02UTg89Q,77385
|
|
2
|
+
tsvz-3.26.dist-info/METADATA,sha256=hfHZtBL5SxPxkPvar3SWXLrA9Vps5HqFPNxhnqSAh2k,1826
|
|
3
|
+
tsvz-3.26.dist-info/WHEEL,sha256=Nw36Djuh_5VDukK0H78QzOX-_FQEo6V37m3nkm96gtU,91
|
|
4
|
+
tsvz-3.26.dist-info/entry_points.txt,sha256=WeXidyV5yKCRLaVsnAY35xGa08QgytOfvr1CK9aescI,60
|
|
5
|
+
tsvz-3.26.dist-info/top_level.txt,sha256=OPx4LvOpaYykaos7oL_jGaObSWXxLzhHiWLuz-K147g,5
|
|
6
|
+
tsvz-3.26.dist-info/RECORD,,
|
tsvz-3.25.dist-info/RECORD
DELETED
|
@@ -1,6 +0,0 @@
|
|
|
1
|
-
TSVZ.py,sha256=LGbNbhS3BS9AH1AD9UQCyMk-f-iAgfBk7CXUdRr5Vy4,69461
|
|
2
|
-
tsvz-3.25.dist-info/METADATA,sha256=8ArDrlBsAE26X80qLBeZ9gVJp8HFlFzd2o4EzhMTPUI,1826
|
|
3
|
-
tsvz-3.25.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
|
|
4
|
-
tsvz-3.25.dist-info/entry_points.txt,sha256=WeXidyV5yKCRLaVsnAY35xGa08QgytOfvr1CK9aescI,60
|
|
5
|
-
tsvz-3.25.dist-info/top_level.txt,sha256=OPx4LvOpaYykaos7oL_jGaObSWXxLzhHiWLuz-K147g,5
|
|
6
|
-
tsvz-3.25.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|