TSVZ 3.25__tar.gz → 3.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tsvz-3.25 → tsvz-3.27}/PKG-INFO +1 -1
- {tsvz-3.25 → tsvz-3.27}/TSVZ.egg-info/PKG-INFO +1 -1
- {tsvz-3.25 → tsvz-3.27}/TSVZ.py +173 -59
- {tsvz-3.25 → tsvz-3.27}/README.md +0 -0
- {tsvz-3.25 → tsvz-3.27}/TSVZ.egg-info/SOURCES.txt +0 -0
- {tsvz-3.25 → tsvz-3.27}/TSVZ.egg-info/dependency_links.txt +0 -0
- {tsvz-3.25 → tsvz-3.27}/TSVZ.egg-info/entry_points.txt +0 -0
- {tsvz-3.25 → tsvz-3.27}/TSVZ.egg-info/top_level.txt +0 -0
- {tsvz-3.25 → tsvz-3.27}/setup.cfg +0 -0
- {tsvz-3.25 → tsvz-3.27}/setup.py +0 -0
{tsvz-3.25 → tsvz-3.27}/PKG-INFO
RENAMED
{tsvz-3.25 → tsvz-3.27}/TSVZ.py
RENAMED
|
@@ -22,13 +22,16 @@ if os.name == 'nt':
|
|
|
22
22
|
elif os.name == 'posix':
|
|
23
23
|
import fcntl
|
|
24
24
|
|
|
25
|
-
version = '3.
|
|
25
|
+
version = '3.27'
|
|
26
26
|
__version__ = version
|
|
27
27
|
author = 'pan@zopyr.us'
|
|
28
|
+
COMMIT_DATE = '2025-06-25'
|
|
28
29
|
|
|
29
30
|
DEFAULT_DELIMITER = '\t'
|
|
30
31
|
DEFAULTS_INDICATOR_KEY = '#_defaults_#'
|
|
31
32
|
|
|
33
|
+
COMPRESSED_FILE_EXTENSIONS = ['gz','gzip','bz2','bzip2','xz','lzma']
|
|
34
|
+
|
|
32
35
|
def get_delimiter(delimiter,file_name = ''):
|
|
33
36
|
global DEFAULT_DELIMITER
|
|
34
37
|
if not delimiter:
|
|
@@ -57,6 +60,43 @@ def get_delimiter(delimiter,file_name = ''):
|
|
|
57
60
|
DEFAULT_DELIMITER = rtn
|
|
58
61
|
return rtn
|
|
59
62
|
|
|
63
|
+
def openFileAsCompressed(fileName,mode = 'rb',encoding = 'utf8',teeLogger = None,compressLevel = 1):
    """
    Open a file, picking a (de)compressing opener from the file name extension.

    '.xz' / '.lzma' are opened with lzma.open, '.gz' / '.gzip' with gzip.open and
    '.bz2' / '.bzip2' with bz2.open. Any other extension, or a failure of the
    compressed opener, falls back to the builtin open().

    Parameters:
    - fileName (str): Path of the file to open. The extension selects the opener.
    - mode (str, optional): File mode. A mode without 'b' is treated as text mode. Defaults to 'rb'.
    - encoding (str, optional): Text encoding, only used in text mode. Defaults to 'utf8'.
    - teeLogger (optional): Forwarded to __teePrintOrNot when a compressed opener fails. Defaults to None.
    - compressLevel (int, optional): Compression level used when the mode is not a read mode.
      Passed as 'preset' to lzma.open and as 'compresslevel' to gzip.open / bz2.open. Defaults to 1.

    Returns:
    - The file object returned by lzma.open, gzip.open, bz2.open or open().
    """
    textMode = 'b' not in mode
    if textMode and 't' not in mode:
        mode += 't'
    isLzma = fileName.endswith(('.xz','.lzma'))
    kwargs = {}
    if 'r' not in mode:
        # lzma.open() only understands 'preset'; handing it 'compresslevel' raises
        # TypeError, which previously sent every '.lzma' write to the plain fallback.
        if isLzma:
            kwargs['preset'] = compressLevel
        else:
            kwargs['compresslevel'] = compressLevel
    if textMode:
        kwargs['encoding'] = encoding
    # The compression modules are imported lazily so a Python build lacking one of
    # them can still open every other kind of file.
    if isLzma:
        try:
            import lzma
            return lzma.open(fileName, mode, **kwargs)
        except Exception:
            __teePrintOrNot(f"Failed to open {fileName} with lzma, trying bin",teeLogger=teeLogger)
    elif fileName.endswith(('.gz','.gzip')):
        try:
            import gzip
            return gzip.open(fileName, mode, **kwargs)
        except Exception:
            __teePrintOrNot(f"Failed to open {fileName} with gzip, trying bin",teeLogger=teeLogger)
    elif fileName.endswith(('.bz2','.bzip2')):
        try:
            import bz2
            return bz2.open(fileName, mode, **kwargs)
        except Exception:
            __teePrintOrNot(f"Failed to open {fileName} with bz2, trying bin",teeLogger=teeLogger)
    # Plain file fallback: builtin open() treats a mode without 'b' as text already.
    if textMode:
        return open(fileName, mode.replace('t',''), encoding=encoding)
    return open(fileName, mode)
|
|
98
|
+
|
|
99
|
+
|
|
60
100
|
def pretty_format_table(data, delimiter = DEFAULT_DELIMITER,header = None):
|
|
61
101
|
version = 1.11
|
|
62
102
|
_ = version
|
|
@@ -392,7 +432,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
|
|
|
392
432
|
delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
393
433
|
if verbose:
|
|
394
434
|
__teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
|
|
395
|
-
with
|
|
435
|
+
with openFileAsCompressed(fileName, 'rb',encoding=encoding, teeLogger=teeLogger) as file:
|
|
396
436
|
file.seek(0, os.SEEK_END)
|
|
397
437
|
file_size = file.tell()
|
|
398
438
|
buffer = b''
|
|
@@ -416,7 +456,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
|
|
|
416
456
|
if lines[i].strip(): # Skip empty lines
|
|
417
457
|
# Process the line
|
|
418
458
|
correctColumnNum, lineCache = _processLine(
|
|
419
|
-
line=lines[i].decode(encoding=encoding),
|
|
459
|
+
line=lines[i].decode(encoding=encoding,errors='replace'),
|
|
420
460
|
taskDic=taskDic,
|
|
421
461
|
correctColumnNum=correctColumnNum,
|
|
422
462
|
verbose=verbose,
|
|
@@ -503,19 +543,22 @@ def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,heade
|
|
|
503
543
|
Returns:
|
|
504
544
|
bool: True if the file exists, False otherwise.
|
|
505
545
|
"""
|
|
506
|
-
|
|
546
|
+
remainingFileName, _ ,extenstionName = fileName.rpartition('.')
|
|
547
|
+
if extenstionName in COMPRESSED_FILE_EXTENSIONS:
|
|
548
|
+
remainingFileName, _ ,extenstionName = remainingFileName.rpartition('.')
|
|
549
|
+
if delimiter and delimiter == '\t' and not extenstionName == 'tsv':
|
|
507
550
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
|
|
508
|
-
elif delimiter and delimiter == ',' and not
|
|
551
|
+
elif delimiter and delimiter == ',' and not extenstionName == 'csv':
|
|
509
552
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
|
|
510
|
-
elif delimiter and delimiter == '\0' and not
|
|
553
|
+
elif delimiter and delimiter == '\0' and not extenstionName == 'nsv':
|
|
511
554
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
|
|
512
|
-
elif delimiter and delimiter == '|' and not
|
|
555
|
+
elif delimiter and delimiter == '|' and not extenstionName == 'psv':
|
|
513
556
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
|
|
514
557
|
if not os.path.isfile(fileName):
|
|
515
558
|
if createIfNotExist:
|
|
516
559
|
try:
|
|
517
|
-
with
|
|
518
|
-
file.write(header+'\n')
|
|
560
|
+
with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
561
|
+
file.write(header.encode(encoding=encoding,errors='replace')+b'\n')
|
|
519
562
|
__teePrintOrNot('Created '+fileName,teeLogger=teeLogger)
|
|
520
563
|
return True
|
|
521
564
|
except:
|
|
@@ -591,10 +634,10 @@ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = Fal
|
|
|
591
634
|
header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
|
|
592
635
|
if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
|
|
593
636
|
return taskDic
|
|
594
|
-
with
|
|
637
|
+
with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
595
638
|
correctColumnNum = -1
|
|
596
639
|
if header.rstrip() and verifyHeader:
|
|
597
|
-
line = file.readline().decode(encoding=encoding)
|
|
640
|
+
line = file.readline().decode(encoding=encoding,errors='replace')
|
|
598
641
|
if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
599
642
|
correctColumnNum = len(header.split(delimiter))
|
|
600
643
|
if verbose:
|
|
@@ -605,7 +648,7 @@ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = Fal
|
|
|
605
648
|
taskDic[lineCache[0]] = lineCache
|
|
606
649
|
return lineCache
|
|
607
650
|
for line in file:
|
|
608
|
-
correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter,defaults = defaults)
|
|
651
|
+
correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding,errors='replace'),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter,defaults = defaults)
|
|
609
652
|
return taskDic
|
|
610
653
|
|
|
611
654
|
def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = '\t'):
|
|
@@ -693,25 +736,27 @@ def appendLinesTabularFile(fileName,linesToAppend,teeLogger = None,header = '',c
|
|
|
693
736
|
if verbose:
|
|
694
737
|
__teePrintOrNot(f"No lines to append to {fileName}",teeLogger=teeLogger)
|
|
695
738
|
return
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
739
|
+
correctColumnNum = max([len(line) for line in formatedLines])
|
|
740
|
+
|
|
741
|
+
if header.rstrip() and verifyHeader:
|
|
742
|
+
with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
743
|
+
line = file.readline().decode(encoding=encoding,errors='replace')
|
|
744
|
+
if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
745
|
+
correctColumnNum = len(header.split(delimiter))
|
|
746
|
+
if verbose:
|
|
747
|
+
__teePrintOrNot(f"correctColumnNum: {correctColumnNum}",teeLogger=teeLogger)
|
|
748
|
+
# truncate / fill the lines to the correct number of columns
|
|
749
|
+
for i in range(len(formatedLines)):
|
|
750
|
+
if len(formatedLines[i]) < correctColumnNum:
|
|
751
|
+
formatedLines[i] += ['']*(correctColumnNum-len(formatedLines[i]))
|
|
752
|
+
elif len(formatedLines[i]) > correctColumnNum:
|
|
753
|
+
formatedLines[i] = formatedLines[i][:correctColumnNum]
|
|
754
|
+
with openFileAsCompressed(fileName, mode ='ab',encoding=encoding,teeLogger=teeLogger)as file:
|
|
710
755
|
# check if the file ends in a newline
|
|
711
|
-
file.seek(-1, os.SEEK_END)
|
|
712
|
-
if file.read(1) != b'\n':
|
|
713
|
-
|
|
714
|
-
file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding) for line in formatedLines]) + b'\n')
|
|
756
|
+
# file.seek(-1, os.SEEK_END)
|
|
757
|
+
# if file.read(1) != b'\n':
|
|
758
|
+
# file.write(b'\n')
|
|
759
|
+
file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding,errors='replace') for line in formatedLines]) + b'\n')
|
|
715
760
|
if verbose:
|
|
716
761
|
__teePrintOrNot(f"Appended {len(formatedLines)} lines to {fileName}",teeLogger=teeLogger)
|
|
717
762
|
|
|
@@ -747,14 +792,17 @@ def clearTabularFile(fileName,teeLogger = None,header = '',verifyHeader = False,
|
|
|
747
792
|
if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
|
|
748
793
|
raise FileNotFoundError("Something catastrophic happened! File still not found after creation")
|
|
749
794
|
else:
|
|
750
|
-
with
|
|
795
|
+
with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
751
796
|
if header.rstrip() and verifyHeader:
|
|
752
|
-
line = file.readline()
|
|
797
|
+
line = file.readline().decode(encoding=encoding,errors='replace')
|
|
753
798
|
if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
754
799
|
__teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
800
|
+
header = line
|
|
801
|
+
with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
802
|
+
if header:
|
|
803
|
+
if not header.endswith('\n'):
|
|
804
|
+
header += '\n'
|
|
805
|
+
file.write(header.encode(encoding=encoding,errors='replace'))
|
|
758
806
|
if verbose:
|
|
759
807
|
__teePrintOrNot(f"Cleared {fileName}",teeLogger=teeLogger)
|
|
760
808
|
|
|
@@ -774,7 +822,69 @@ def get_time_ns():
|
|
|
774
822
|
except:
|
|
775
823
|
# try to get the time in nanoseconds
|
|
776
824
|
return int(time.time()*1e9)
|
|
825
|
+
|
|
826
|
+
def scrubTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = '\t',defaults = ...):
    """
    Compatibility alias for scrubTabularFile.

    Every argument is forwarded unchanged to scrubTabularFile and its result is
    returned as-is. The only difference visible here is that delimiter defaults
    to '\\t' in this wrapper.

    Parameters:
    - fileName (str): The path to the Tabular file.
    - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
    - header (str or list, optional): The header of the Tabular file. Defaults to ''.
    - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
    - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
    - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
    - verbose (bool, optional): Whether to print verbose output. Defaults to False.
    - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to None.
    - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
    - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
    - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to '\\t'.
    - defaults (optional): The default values to use for missing columns. Defaults to ... (Ellipsis), forwarded as-is.

    Returns:
    - Whatever scrubTabularFile returns for the same arguments.
    """
    forwarded = {
        'teeLogger': teeLogger,
        'header': header,
        'createIfNotExist': createIfNotExist,
        'lastLineOnly': lastLineOnly,
        'verifyHeader': verifyHeader,
        'verbose': verbose,
        'taskDic': taskDic,
        'encoding': encoding,
        'strict': strict,
        'delimiter': delimiter,
        'defaults': defaults,
    }
    return scrubTabularFile(fileName, **forwarded)
|
|
854
|
+
|
|
855
|
+
def scrubTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = ...,defaults = ...):
    """
    Rewrite a Tabular (CSV / TSV / NSV) file from its own parsed contents.

    The file is read with readTabularFile. When that result is truthy, the file
    is cleared with clearTabularFile and the result is written back with
    appendLinesTabularFile. For a compressed file this recompresses the file as
    a whole, which may improve the compression ratio and reduce the file size.

    NOTE(review): the clear and the append are two separate steps, so a failure
    between them presumably leaves the file without its data - confirm.
    NOTE(review): with lastLineOnly=True only what readTabularFile returns is
    written back; verify this does not drop the other rows.

    Parameters:
    - fileName (str): The path to the Tabular file.
    - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
    - header (str or list, optional): The header of the Tabular file. Defaults to ''.
    - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
    - lastLineOnly (bool, optional): Whether to read only the last valid line of the file. Defaults to False.
    - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
    - verbose (bool, optional): Whether to print verbose output. Defaults to False.
    - taskDic (OrderedDict, optional): The dictionary to store the data. Defaults to None.
    - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
    - strict (bool, optional): Whether to raise an exception if there is a data format error. Defaults to False.
    - delimiter (str, optional): The delimiter used in the Tabular file. Defaults to ... (Ellipsis), forwarded as-is.
    - defaults (optional): The default values to use for missing columns. Defaults to ... (Ellipsis), forwarded as-is.

    Returns:
    - The value returned by readTabularFile, whether or not the file was rewritten.

    Raises:
    - Nothing is caught here; exceptions raised by the helpers propagate.
    """
    contents = readTabularFile(
        fileName,
        teeLogger = teeLogger,
        header = header,
        createIfNotExist = createIfNotExist,
        lastLineOnly = lastLineOnly,
        verifyHeader = verifyHeader,
        verbose = verbose,
        taskDic = taskDic,
        encoding = encoding,
        strict = strict,
        delimiter = delimiter,
        defaults = defaults,
    )
    # Nothing was read: leave the file on disk untouched.
    if not contents:
        return contents
    # Options shared by the clear and the append step.
    common = {
        'teeLogger': teeLogger,
        'header': header,
        'verifyHeader': verifyHeader,
        'verbose': verbose,
        'encoding': encoding,
        'strict': strict,
        'delimiter': delimiter,
    }
    clearTabularFile(fileName, **common)
    appendLinesTabularFile(fileName, contents, createIfNotExist = createIfNotExist, **common)
    return contents
|
|
887
|
+
|
|
778
888
|
# create a tsv class that functions like a ordered dictionary but will update the file when modified
|
|
779
889
|
class TSVZed(OrderedDict):
|
|
780
890
|
def __teePrintOrNot(self,message,level = 'info'):
|
|
@@ -1010,14 +1120,14 @@ class TSVZed(OrderedDict):
|
|
|
1010
1120
|
def clear_file(self):
|
|
1011
1121
|
try:
|
|
1012
1122
|
if self.header:
|
|
1013
|
-
file = self.get_file_obj('
|
|
1014
|
-
file.write(self.header+'\n')
|
|
1123
|
+
file = self.get_file_obj('wb')
|
|
1124
|
+
file.write(self.header.encode(self.encoding,errors='replace') + b'\n')
|
|
1015
1125
|
self.release_file_obj(file)
|
|
1016
1126
|
if self.verbose:
|
|
1017
1127
|
self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
|
|
1018
1128
|
self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
|
|
1019
1129
|
else:
|
|
1020
|
-
file = self.get_file_obj('
|
|
1130
|
+
file = self.get_file_obj('wb')
|
|
1021
1131
|
self.release_file_obj(file)
|
|
1022
1132
|
if self.verbose:
|
|
1023
1133
|
self.__teePrintOrNot(f"File {self._fileName} cleared empty")
|
|
@@ -1153,15 +1263,15 @@ memoryOnly:{self.memoryOnly}
|
|
|
1153
1263
|
self.deSynced = True
|
|
1154
1264
|
return False
|
|
1155
1265
|
|
|
1156
|
-
def
|
|
1266
|
+
def hardMapToFile(self):
|
|
1157
1267
|
try:
|
|
1158
1268
|
if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
|
|
1159
1269
|
self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
|
|
1160
|
-
file = self.get_file_obj('
|
|
1270
|
+
file = self.get_file_obj('wb')
|
|
1161
1271
|
if self.header:
|
|
1162
|
-
file.write(self.header+'\n')
|
|
1272
|
+
file.write(self.header.encode(self.encoding,errors='replace') + b'\n')
|
|
1163
1273
|
for key in self:
|
|
1164
|
-
file.write(self.delimiter.join(self[key])+'\n')
|
|
1274
|
+
file.write(self.delimiter.join(self[key]).encode(encoding=self.encoding,errors='replace')+b'\n')
|
|
1165
1275
|
self.release_file_obj(file)
|
|
1166
1276
|
if self.verbose:
|
|
1167
1277
|
self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
|
|
@@ -1170,7 +1280,7 @@ memoryOnly:{self.memoryOnly}
|
|
|
1170
1280
|
self.deSynced = False
|
|
1171
1281
|
except Exception as e:
|
|
1172
1282
|
self.release_file_obj(file)
|
|
1173
|
-
self.__teePrintOrNot(f"Failed to write at
|
|
1283
|
+
self.__teePrintOrNot(f"Failed to write at hardMapToFile() to {self._fileName}: {e}",'error')
|
|
1174
1284
|
import traceback
|
|
1175
1285
|
self.__teePrintOrNot(traceback.format_exc(),'error')
|
|
1176
1286
|
self.deSynced = True
|
|
@@ -1182,14 +1292,17 @@ memoryOnly:{self.memoryOnly}
|
|
|
1182
1292
|
try:
|
|
1183
1293
|
if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
|
|
1184
1294
|
self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
|
|
1295
|
+
if self._fileName.rpartition('.')[2] in COMPRESSED_FILE_EXTENSIONS:
|
|
1296
|
+
# if the file is compressed, we need to use the hardMapToFile method
|
|
1297
|
+
return self.hardMapToFile()
|
|
1185
1298
|
file = self.get_file_obj('r+b')
|
|
1186
1299
|
overWrite = False
|
|
1187
1300
|
if self.header:
|
|
1188
|
-
line = file.readline().decode(self.encoding)
|
|
1301
|
+
line = file.readline().decode(self.encoding,errors='replace')
|
|
1189
1302
|
aftPos = file.tell()
|
|
1190
1303
|
if not _lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = self.strict):
|
|
1191
1304
|
file.seek(0)
|
|
1192
|
-
file.write(f'{self.header}\n'.encode(encoding=self.encoding))
|
|
1305
|
+
file.write(f'{self.header}\n'.encode(encoding=self.encoding,errors='replace'))
|
|
1193
1306
|
# if the header is not the same length as the line, we need to overwrite the file
|
|
1194
1307
|
if aftPos != file.tell():
|
|
1195
1308
|
overWrite = True
|
|
@@ -1202,7 +1315,7 @@ memoryOnly:{self.memoryOnly}
|
|
|
1202
1315
|
if overWrite:
|
|
1203
1316
|
if self.verbose:
|
|
1204
1317
|
self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
|
|
1205
|
-
file.write(strToWrite.encode(encoding=self.encoding)+b'\n')
|
|
1318
|
+
file.write(strToWrite.encode(encoding=self.encoding,errors='replace')+b'\n')
|
|
1206
1319
|
continue
|
|
1207
1320
|
pos = file.tell()
|
|
1208
1321
|
line = file.readline()
|
|
@@ -1210,10 +1323,10 @@ memoryOnly:{self.memoryOnly}
|
|
|
1210
1323
|
if not line or pos == aftPos:
|
|
1211
1324
|
if self.verbose:
|
|
1212
1325
|
self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
|
|
1213
|
-
file.write(strToWrite.encode(encoding=self.encoding))
|
|
1326
|
+
file.write(strToWrite.encode(encoding=self.encoding,errors='replace'))
|
|
1214
1327
|
overWrite = True
|
|
1215
1328
|
continue
|
|
1216
|
-
strToWrite = strToWrite.encode(encoding=self.encoding).ljust(len(line)-1)+b'\n'
|
|
1329
|
+
strToWrite = strToWrite.encode(encoding=self.encoding,errors='replace').ljust(len(line)-1)+b'\n'
|
|
1217
1330
|
if line != strToWrite:
|
|
1218
1331
|
if self.verbose:
|
|
1219
1332
|
self.__teePrintOrNot(f"Modifing {value} to {self._fileName}")
|
|
@@ -1236,6 +1349,8 @@ memoryOnly:{self.memoryOnly}
|
|
|
1236
1349
|
import traceback
|
|
1237
1350
|
self.__teePrintOrNot(traceback.format_exc(),'error')
|
|
1238
1351
|
self.deSynced = True
|
|
1352
|
+
self.__teePrintOrNot("Trying failback hardMapToFile()")
|
|
1353
|
+
self.hardMapToFile()
|
|
1239
1354
|
self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
|
|
1240
1355
|
self.monitor_external_changes = mec
|
|
1241
1356
|
return self
|
|
@@ -1278,10 +1393,10 @@ memoryOnly:{self.memoryOnly}
|
|
|
1278
1393
|
if self.verbose:
|
|
1279
1394
|
self.__teePrintOrNot(f"Commiting {len(self.appendQueue)} records to {self._fileName}")
|
|
1280
1395
|
self.__teePrintOrNot(f"Before size of {self._fileName}: {os.path.getsize(self._fileName)}")
|
|
1281
|
-
file = self.get_file_obj('
|
|
1396
|
+
file = self.get_file_obj('ab')
|
|
1282
1397
|
while self.appendQueue:
|
|
1283
1398
|
line = self.appendQueue.popleft()
|
|
1284
|
-
file.write(line+'\n')
|
|
1399
|
+
file.write(line.encode(encoding=self.encoding,errors='replace')+b'\n')
|
|
1285
1400
|
self.release_file_obj(file)
|
|
1286
1401
|
if self.verbose:
|
|
1287
1402
|
self.__teePrintOrNot(f"Records commited to {self._fileName}")
|
|
@@ -1306,15 +1421,12 @@ memoryOnly:{self.memoryOnly}
|
|
|
1306
1421
|
if self.verbose:
|
|
1307
1422
|
self.__teePrintOrNot(f"Append thread for {self._fileName} stopped")
|
|
1308
1423
|
|
|
1309
|
-
def get_file_obj(self,modes = '
|
|
1424
|
+
def get_file_obj(self,modes = 'ab'):
|
|
1310
1425
|
self.writeLock.acquire()
|
|
1311
1426
|
try:
|
|
1312
|
-
if
|
|
1313
|
-
|
|
1314
|
-
|
|
1315
|
-
file = open(self._fileName, mode=modes, encoding=self.encoding)
|
|
1316
|
-
else:
|
|
1317
|
-
file = open(self._fileName, mode=modes)
|
|
1427
|
+
if not self.encoding:
|
|
1428
|
+
self.encoding = 'utf8'
|
|
1429
|
+
file = openFileAsCompressed(self._fileName, mode=modes, encoding=self.encoding,teeLogger=self.teeLogger)
|
|
1318
1430
|
# Lock the file after opening
|
|
1319
1431
|
if os.name == 'posix':
|
|
1320
1432
|
fcntl.lockf(file, fcntl.LOCK_EX)
|
|
@@ -1375,7 +1487,7 @@ def __main__():
|
|
|
1375
1487
|
import argparse
|
|
1376
1488
|
parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
|
|
1377
1489
|
parser.add_argument('filename', type=str, help='The file to read')
|
|
1378
|
-
parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear'], help='The operation to perform. Default: read', default='read')
|
|
1490
|
+
parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear','scrub'], help='The operation to perform. Note: scrub will also remove all comments. Default: read', default='read')
|
|
1379
1491
|
parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
|
|
1380
1492
|
parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
|
|
1381
1493
|
parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
|
|
@@ -1384,7 +1496,7 @@ def __main__():
|
|
|
1384
1496
|
strictMode.add_argument('-s', '--strict', dest = 'strict',action='store_true', help='Strict mode. Do not parse values that seems malformed, check for column numbers / headers')
|
|
1385
1497
|
strictMode.add_argument('-f', '--force', dest = 'strict',action='store_false', help='Force the operation. Ignore checks for column numbers / headers')
|
|
1386
1498
|
parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
|
|
1387
|
-
parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} by {author}')
|
|
1499
|
+
parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} @ {COMMIT_DATE} by {author}')
|
|
1388
1500
|
args = parser.parse_args()
|
|
1389
1501
|
args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
|
|
1390
1502
|
if args.header and args.header.endswith('\\'):
|
|
@@ -1416,6 +1528,8 @@ def __main__():
|
|
|
1416
1528
|
appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
|
|
1417
1529
|
elif args.operation == 'clear':
|
|
1418
1530
|
clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=args.strict, delimiter=args.delimiter)
|
|
1531
|
+
elif args.operation == 'scrub':
|
|
1532
|
+
scrubTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
|
|
1419
1533
|
else:
|
|
1420
1534
|
print("Invalid operation")
|
|
1421
1535
|
if __name__ == '__main__':
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{tsvz-3.25 → tsvz-3.27}/setup.py
RENAMED
|
File without changes
|