TSVZ 3.24__tar.gz → 3.26__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {tsvz-3.24 → tsvz-3.26}/PKG-INFO +1 -1
- {tsvz-3.24 → tsvz-3.26}/TSVZ.egg-info/PKG-INFO +1 -1
- {tsvz-3.24 → tsvz-3.26}/TSVZ.py +168 -52
- {tsvz-3.24 → tsvz-3.26}/README.md +0 -0
- {tsvz-3.24 → tsvz-3.26}/TSVZ.egg-info/SOURCES.txt +0 -0
- {tsvz-3.24 → tsvz-3.26}/TSVZ.egg-info/dependency_links.txt +0 -0
- {tsvz-3.24 → tsvz-3.26}/TSVZ.egg-info/entry_points.txt +0 -0
- {tsvz-3.24 → tsvz-3.26}/TSVZ.egg-info/top_level.txt +0 -0
- {tsvz-3.24 → tsvz-3.26}/setup.cfg +0 -0
- {tsvz-3.24 → tsvz-3.26}/setup.py +0 -0
{tsvz-3.24 → tsvz-3.26}/PKG-INFO
RENAMED
{tsvz-3.24 → tsvz-3.26}/TSVZ.py
RENAMED
|
@@ -22,13 +22,16 @@ if os.name == 'nt':
|
|
|
22
22
|
elif os.name == 'posix':
|
|
23
23
|
import fcntl
|
|
24
24
|
|
|
25
|
-
version = '3.24'
|
|
25
|
+
version = '3.26'
|
|
26
26
|
__version__ = version
|
|
27
27
|
author = 'pan@zopyr.us'
|
|
28
|
+
COMMIT_DATE = '2025-05-19'
|
|
28
29
|
|
|
29
30
|
DEFAULT_DELIMITER = '\t'
|
|
30
31
|
DEFAULTS_INDICATOR_KEY = '#_defaults_#'
|
|
31
32
|
|
|
33
|
+
COMPRESSED_FILE_EXTENSIONS = ['gz','gzip','bz2','bzip2','xz','lzma']
|
|
34
|
+
|
|
32
35
|
def get_delimiter(delimiter,file_name = ''):
|
|
33
36
|
global DEFAULT_DELIMITER
|
|
34
37
|
if not delimiter:
|
|
@@ -57,6 +60,43 @@ def get_delimiter(delimiter,file_name = ''):
|
|
|
57
60
|
DEFAULT_DELIMITER = rtn
|
|
58
61
|
return rtn
|
|
59
62
|
|
|
63
|
+
def openFileAsCompressed(fileName,mode = 'rb',encoding = 'utf8',teeLogger = None,compressLevel = 1):
    """
    Open a file, transparently using lzma / gzip / bz2 when the file name ends
    with a known compressed extension.

    Parameters:
    - fileName (str): Path of the file to open. The codec is picked from the
      suffix: '.xz' / '.lzma' -> lzma, '.gz' / '.gzip' -> gzip,
      '.bz2' / '.bzip2' -> bz2. Any other suffix is opened as a plain file.
    - mode (str, optional): Open mode. A mode without 'b' is treated as text
      mode. Defaults to 'rb'.
    - encoding (str, optional): Text encoding, only used in text mode.
      Defaults to 'utf8'.
    - teeLogger (Logger, optional): Passed to __teePrintOrNot when a
      compressed open fails. Defaults to None.
    - compressLevel (int, optional): Compression level used when the mode is
      not a read mode. Passed as 'preset' to lzma and as 'compresslevel' to
      gzip / bz2. Defaults to 1.

    Returns:
    - file object: The object returned by the codec's open(), or by the
      builtin open() for plain files and for the fallback path.

    Notes:
    - If the codec module cannot be imported or its open() raises, a message
      is printed and the file is opened as a plain file instead (best effort,
      same as before).
    """
    isText = 'b' not in mode
    # Only add the explicit text flag once; callers may already pass 'rt'/'wt'.
    if isText and 't' not in mode:
        mode += 't'

    # Map the suffix to (module name, name of that module's level keyword).
    # lzma.open() takes 'preset' for BOTH .xz and .lzma; it has no
    # 'compresslevel' argument, so passing one would raise TypeError.
    if fileName.endswith(('.xz','.lzma')):
        codecName, levelKeyword = 'lzma', 'preset'
    elif fileName.endswith(('.gz','.gzip')):
        codecName, levelKeyword = 'gzip', 'compresslevel'
    elif fileName.endswith(('.bz2','.bzip2')):
        codecName, levelKeyword = 'bz2', 'compresslevel'
    else:
        codecName, levelKeyword = None, None

    if codecName:
        kwargs = {}
        # A compression level only makes sense when we are producing data.
        if 'r' not in mode:
            kwargs[levelKeyword] = compressLevel
        if isText:
            kwargs['encoding'] = encoding
        try:
            # Imported lazily so a Python build without the codec still works
            # for plain files and falls through to the plain open below.
            import importlib
            return importlib.import_module(codecName).open(fileName, mode, **kwargs)
        except Exception:
            # Exception (not a bare except) so Ctrl-C / SystemExit propagate.
            __teePrintOrNot(f"Failed to open {fileName} with {codecName}, trying bin",teeLogger=teeLogger)

    if isText:
        # builtin open() is text mode by default; drop the explicit flag.
        return open(fileName, mode.replace('t',''), encoding=encoding)
    return open(fileName, mode)
|
|
98
|
+
|
|
99
|
+
|
|
60
100
|
def pretty_format_table(data, delimiter = DEFAULT_DELIMITER,header = None):
|
|
61
101
|
version = 1.11
|
|
62
102
|
_ = version
|
|
@@ -280,7 +320,7 @@ def __teePrintOrNot(message,level = 'info',teeLogger = None):
|
|
|
280
320
|
except Exception:
|
|
281
321
|
print(message,flush=True)
|
|
282
322
|
|
|
283
|
-
def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True,delimiter = DEFAULT_DELIMITER,defaults =
|
|
323
|
+
def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,strict = True,delimiter = DEFAULT_DELIMITER,defaults = ...):
|
|
284
324
|
"""
|
|
285
325
|
Process a line of text and update the task dictionary.
|
|
286
326
|
|
|
@@ -297,7 +337,7 @@ def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,
|
|
|
297
337
|
tuple: A tuple containing the updated correctColumnNum and the processed lineCache.
|
|
298
338
|
|
|
299
339
|
"""
|
|
300
|
-
if
|
|
340
|
+
if defaults is ...:
|
|
301
341
|
defaults = []
|
|
302
342
|
line = line.strip(' ').strip('\x00').rstrip('\r\n')
|
|
303
343
|
# we throw away the lines that start with '#'
|
|
@@ -367,7 +407,7 @@ def _processLine(line,taskDic,correctColumnNum,verbose = False,teeLogger = None,
|
|
|
367
407
|
__teePrintOrNot(f"Key {lineCache[0]} added",teeLogger=teeLogger)
|
|
368
408
|
return correctColumnNum, lineCache
|
|
369
409
|
|
|
370
|
-
def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,encoding = 'utf8',delimiter = ...,defaults =
|
|
410
|
+
def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, teeLogger=None, strict=False,encoding = 'utf8',delimiter = ...,defaults = ...):
|
|
371
411
|
"""
|
|
372
412
|
Reads the last valid line from a file.
|
|
373
413
|
|
|
@@ -387,10 +427,12 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
|
|
|
387
427
|
"""
|
|
388
428
|
chunk_size = 1024 # Read in chunks of 1024 bytes
|
|
389
429
|
last_valid_line = []
|
|
430
|
+
if defaults is ...:
|
|
431
|
+
defaults = []
|
|
390
432
|
delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
391
433
|
if verbose:
|
|
392
434
|
__teePrintOrNot(f"Reading last line only from {fileName}",teeLogger=teeLogger)
|
|
393
|
-
with
|
|
435
|
+
with openFileAsCompressed(fileName, 'rb',encoding=encoding, teeLogger=teeLogger) as file:
|
|
394
436
|
file.seek(0, os.SEEK_END)
|
|
395
437
|
file_size = file.tell()
|
|
396
438
|
buffer = b''
|
|
@@ -414,7 +456,7 @@ def read_last_valid_line(fileName, taskDic, correctColumnNum, verbose=False, tee
|
|
|
414
456
|
if lines[i].strip(): # Skip empty lines
|
|
415
457
|
# Process the line
|
|
416
458
|
correctColumnNum, lineCache = _processLine(
|
|
417
|
-
line=lines[i].decode(encoding=encoding),
|
|
459
|
+
line=lines[i].decode(encoding=encoding,errors='replace'),
|
|
418
460
|
taskDic=taskDic,
|
|
419
461
|
correctColumnNum=correctColumnNum,
|
|
420
462
|
verbose=verbose,
|
|
@@ -501,19 +543,22 @@ def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,heade
|
|
|
501
543
|
Returns:
|
|
502
544
|
bool: True if the file exists, False otherwise.
|
|
503
545
|
"""
|
|
504
|
-
|
|
546
|
+
remainingFileName, _ ,extenstionName = fileName.rpartition('.')
|
|
547
|
+
if extenstionName in COMPRESSED_FILE_EXTENSIONS:
|
|
548
|
+
remainingFileName, _ ,extenstionName = remainingFileName.rpartition('.')
|
|
549
|
+
if delimiter and delimiter == '\t' and not extenstionName == 'tsv':
|
|
505
550
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .tsv','warning',teeLogger=teeLogger)
|
|
506
|
-
elif delimiter and delimiter == ',' and not
|
|
551
|
+
elif delimiter and delimiter == ',' and not extenstionName == 'csv':
|
|
507
552
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .csv','warning',teeLogger=teeLogger)
|
|
508
|
-
elif delimiter and delimiter == '\0' and not
|
|
553
|
+
elif delimiter and delimiter == '\0' and not extenstionName == 'nsv':
|
|
509
554
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .nsv','warning',teeLogger=teeLogger)
|
|
510
|
-
elif delimiter and delimiter == '|' and not
|
|
555
|
+
elif delimiter and delimiter == '|' and not extenstionName == 'psv':
|
|
511
556
|
__teePrintOrNot(f'Warning: Filename {fileName} does not end with .psv','warning',teeLogger=teeLogger)
|
|
512
557
|
if not os.path.isfile(fileName):
|
|
513
558
|
if createIfNotExist:
|
|
514
559
|
try:
|
|
515
|
-
with
|
|
516
|
-
file.write(header+'\n')
|
|
560
|
+
with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
561
|
+
file.write(header.encode(encoding=encoding,errors='replace')+b'\n')
|
|
517
562
|
__teePrintOrNot('Created '+fileName,teeLogger=teeLogger)
|
|
518
563
|
return True
|
|
519
564
|
except:
|
|
@@ -528,7 +573,7 @@ def _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = None,heade
|
|
|
528
573
|
return False
|
|
529
574
|
return True
|
|
530
575
|
|
|
531
|
-
def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t',defaults =
|
|
576
|
+
def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = '\t',defaults = ...):
|
|
532
577
|
"""
|
|
533
578
|
Compatibility method, calls readTabularFile.
|
|
534
579
|
Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
|
|
@@ -556,7 +601,7 @@ def readTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, last
|
|
|
556
601
|
"""
|
|
557
602
|
return readTabularFile(fileName,teeLogger = teeLogger,header = header,createIfNotExist = createIfNotExist,lastLineOnly = lastLineOnly,verifyHeader = verifyHeader,verbose = verbose,taskDic = taskDic,encoding = encoding,strict = strict,delimiter = delimiter,defaults=defaults)
|
|
558
603
|
|
|
559
|
-
def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...,defaults =
|
|
604
|
+
def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = True,delimiter = ...,defaults = ...):
|
|
560
605
|
"""
|
|
561
606
|
Read a Tabular (CSV / TSV / NSV) file and return the data as a dictionary.
|
|
562
607
|
|
|
@@ -583,14 +628,16 @@ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = Fal
|
|
|
583
628
|
"""
|
|
584
629
|
if taskDic is None:
|
|
585
630
|
taskDic = {}
|
|
631
|
+
if defaults is ...:
|
|
632
|
+
defaults = []
|
|
586
633
|
delimiter = get_delimiter(delimiter,file_name=fileName)
|
|
587
634
|
header = _formatHeader(header,verbose = verbose,teeLogger = teeLogger, delimiter = delimiter)
|
|
588
635
|
if not _verifyFileExistence(fileName,createIfNotExist = createIfNotExist,teeLogger = teeLogger,header = header,encoding = encoding,strict = strict,delimiter=delimiter):
|
|
589
636
|
return taskDic
|
|
590
|
-
with
|
|
637
|
+
with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
591
638
|
correctColumnNum = -1
|
|
592
639
|
if header.rstrip() and verifyHeader:
|
|
593
|
-
line = file.readline().decode(encoding=encoding)
|
|
640
|
+
line = file.readline().decode(encoding=encoding,errors='replace')
|
|
594
641
|
if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
595
642
|
correctColumnNum = len(header.split(delimiter))
|
|
596
643
|
if verbose:
|
|
@@ -601,7 +648,7 @@ def readTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = Fal
|
|
|
601
648
|
taskDic[lineCache[0]] = lineCache
|
|
602
649
|
return lineCache
|
|
603
650
|
for line in file:
|
|
604
|
-
correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter,defaults = defaults)
|
|
651
|
+
correctColumnNum, lineCache = _processLine(line.decode(encoding=encoding,errors='replace'),taskDic,correctColumnNum,verbose = verbose,teeLogger = teeLogger,strict = strict,delimiter=delimiter,defaults = defaults)
|
|
605
652
|
return taskDic
|
|
606
653
|
|
|
607
654
|
def appendTSV(fileName,lineToAppend,teeLogger = None,header = '',createIfNotExist = False,verifyHeader = True,verbose = False,encoding = 'utf8', strict = True, delimiter = '\t'):
|
|
@@ -689,10 +736,10 @@ def appendLinesTabularFile(fileName,linesToAppend,teeLogger = None,header = '',c
|
|
|
689
736
|
if verbose:
|
|
690
737
|
__teePrintOrNot(f"No lines to append to {fileName}",teeLogger=teeLogger)
|
|
691
738
|
return
|
|
692
|
-
with
|
|
739
|
+
with openFileAsCompressed(fileName, mode ='ab',encoding=encoding,teeLogger=teeLogger)as file:
|
|
693
740
|
correctColumnNum = max([len(line) for line in formatedLines])
|
|
694
741
|
if header.rstrip() and verifyHeader:
|
|
695
|
-
line = file.readline().decode(encoding=encoding)
|
|
742
|
+
line = file.readline().decode(encoding=encoding,errors='replace')
|
|
696
743
|
if _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
697
744
|
correctColumnNum = len(header.split(delimiter))
|
|
698
745
|
if verbose:
|
|
@@ -704,10 +751,10 @@ def appendLinesTabularFile(fileName,linesToAppend,teeLogger = None,header = '',c
|
|
|
704
751
|
elif len(formatedLines[i]) > correctColumnNum:
|
|
705
752
|
formatedLines[i] = formatedLines[i][:correctColumnNum]
|
|
706
753
|
# check if the file ends in a newline
|
|
707
|
-
file.seek(-1, os.SEEK_END)
|
|
708
|
-
if file.read(1) != b'\n':
|
|
709
|
-
|
|
710
|
-
file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding) for line in formatedLines]) + b'\n')
|
|
754
|
+
# file.seek(-1, os.SEEK_END)
|
|
755
|
+
# if file.read(1) != b'\n':
|
|
756
|
+
# file.write(b'\n')
|
|
757
|
+
file.write(b'\n'.join([delimiter.join(line).encode(encoding=encoding,errors='replace') for line in formatedLines]) + b'\n')
|
|
711
758
|
if verbose:
|
|
712
759
|
__teePrintOrNot(f"Appended {len(formatedLines)} lines to {fileName}",teeLogger=teeLogger)
|
|
713
760
|
|
|
@@ -743,14 +790,17 @@ def clearTabularFile(fileName,teeLogger = None,header = '',verifyHeader = False,
|
|
|
743
790
|
if not _verifyFileExistence(fileName,createIfNotExist = True,teeLogger = teeLogger,header = header,encoding = encoding,strict = False,delimiter=delimiter):
|
|
744
791
|
raise FileNotFoundError("Something catastrophic happened! File still not found after creation")
|
|
745
792
|
else:
|
|
746
|
-
with
|
|
793
|
+
with openFileAsCompressed(fileName, mode ='rb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
747
794
|
if header.rstrip() and verifyHeader:
|
|
748
|
-
line = file.readline()
|
|
795
|
+
line = file.readline().decode(encoding=encoding,errors='replace')
|
|
749
796
|
if not _lineContainHeader(header,line,verbose = verbose,teeLogger = teeLogger,strict = strict):
|
|
750
797
|
__teePrintOrNot(f'Warning: Header mismatch in {fileName}. Keeping original header in file...','warning',teeLogger)
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
798
|
+
header = line
|
|
799
|
+
with openFileAsCompressed(fileName, mode ='wb',encoding=encoding,teeLogger=teeLogger)as file:
|
|
800
|
+
if header:
|
|
801
|
+
if not header.endswith('\n'):
|
|
802
|
+
header += '\n'
|
|
803
|
+
file.write(header.encode(encoding=encoding,errors='replace'))
|
|
754
804
|
if verbose:
|
|
755
805
|
__teePrintOrNot(f"Cleared {fileName}",teeLogger=teeLogger)
|
|
756
806
|
|
|
@@ -770,7 +820,69 @@ def get_time_ns():
|
|
|
770
820
|
except:
|
|
771
821
|
# try to get the time in nanoseconds
|
|
772
822
|
return int(time.time()*1e9)
|
|
823
|
+
|
|
824
|
+
def scrubTSV(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = '\t',defaults = ...):
    """
    Compatibility wrapper around scrubTabularFile.

    Every argument is forwarded unchanged; the only difference from
    scrubTabularFile is that the delimiter defaults to a tab character
    instead of being inferred.

    Parameters:
    - fileName (str): The path to the Tabular file.
    - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
    - header (str or list, optional): The header of the Tabular file. Defaults to ''.
    - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
    - lastLineOnly (bool, optional): Forwarded to the read step. Defaults to False.
    - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
    - verbose (bool, optional): Whether to print verbose output. Defaults to False.
    - taskDic (dict, optional): The dictionary to store the data in. Defaults to None.
    - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
    - strict (bool, optional): Forwarded strictness flag. Defaults to False.
    - delimiter (str, optional): The column delimiter. Defaults to a tab.
    - defaults (list, optional): Default values for missing columns. Left as
      the `...` sentinel when not given.

    Returns:
    - Whatever scrubTabularFile returns for the same arguments.
    """
    return scrubTabularFile(
        fileName,
        teeLogger = teeLogger,
        header = header,
        createIfNotExist = createIfNotExist,
        lastLineOnly = lastLineOnly,
        verifyHeader = verifyHeader,
        verbose = verbose,
        taskDic = taskDic,
        encoding = encoding,
        strict = strict,
        delimiter = delimiter,
        defaults = defaults,
    )
|
|
852
|
+
|
|
853
|
+
def scrubTabularFile(fileName,teeLogger = None,header = '',createIfNotExist = False, lastLineOnly = False,verifyHeader = True,verbose = False,taskDic = None,encoding = 'utf8',strict = False,delimiter = ...,defaults = ...):
    """
    Scrub a Tabular (CSV / TSV / NSV) file: read it, then write what was read
    back into the same file.

    The file is read with readTabularFile. If that yields anything, the file
    is cleared with clearTabularFile and the rows are written back with
    appendLinesTabularFile. For compressed files this rewrites the file in
    one pass, which may improve the compression ratio.

    Parameters:
    - fileName (str): The path to the Tabular file.
    - teeLogger (Logger, optional): The logger object to log messages. Defaults to None.
    - header (str or list, optional): The header of the Tabular file. Defaults to ''.
    - createIfNotExist (bool, optional): Whether to create the file if it doesn't exist. Defaults to False.
    - lastLineOnly (bool, optional): Forwarded to readTabularFile. Defaults to False.
    - verifyHeader (bool, optional): Whether to verify the header of the file. Defaults to True.
    - verbose (bool, optional): Whether to print verbose output. Defaults to False.
    - taskDic (dict, optional): The dictionary to store the data in. Defaults to None.
    - encoding (str, optional): The encoding of the file. Defaults to 'utf8'.
    - strict (bool, optional): Forwarded strictness flag. Defaults to False.
    - delimiter (str, optional): The column delimiter. Left as the `...`
      sentinel when not given.
    - defaults (list, optional): Default values for missing columns. Left as
      the `...` sentinel when not given.

    Returns:
    - The value returned by readTabularFile (returned as-is, also when it is
      empty and nothing was rewritten).
    """
    # NOTE(review): with lastLineOnly=True readTabularFile appears to return
    # only the last row, so the rewrite below would keep just that row --
    # confirm before combining scrub with lastLineOnly.
    tableData = readTabularFile(
        fileName,
        teeLogger = teeLogger,
        header = header,
        createIfNotExist = createIfNotExist,
        lastLineOnly = lastLineOnly,
        verifyHeader = verifyHeader,
        verbose = verbose,
        taskDic = taskDic,
        encoding = encoding,
        strict = strict,
        delimiter = delimiter,
        defaults = defaults,
    )
    # Nothing was read: leave the file on disk untouched.
    if not tableData:
        return tableData
    clearTabularFile(
        fileName,
        teeLogger = teeLogger,
        header = header,
        verifyHeader = verifyHeader,
        verbose = verbose,
        encoding = encoding,
        strict = strict,
        delimiter = delimiter,
    )
    appendLinesTabularFile(
        fileName,
        tableData,
        teeLogger = teeLogger,
        header = header,
        createIfNotExist = createIfNotExist,
        verifyHeader = verifyHeader,
        verbose = verbose,
        encoding = encoding,
        strict = strict,
        delimiter = delimiter,
    )
    return tableData
|
|
885
|
+
|
|
774
886
|
# create a tsv class that functions like a ordered dictionary but will update the file when modified
|
|
775
887
|
class TSVZed(OrderedDict):
|
|
776
888
|
def __teePrintOrNot(self,message,level = 'info'):
|
|
@@ -1006,14 +1118,14 @@ class TSVZed(OrderedDict):
|
|
|
1006
1118
|
def clear_file(self):
|
|
1007
1119
|
try:
|
|
1008
1120
|
if self.header:
|
|
1009
|
-
file = self.get_file_obj('
|
|
1010
|
-
file.write(self.header+'\n')
|
|
1121
|
+
file = self.get_file_obj('wb')
|
|
1122
|
+
file.write(self.header.encode(self.encoding,errors='replace') + b'\n')
|
|
1011
1123
|
self.release_file_obj(file)
|
|
1012
1124
|
if self.verbose:
|
|
1013
1125
|
self.__teePrintOrNot(f"Header {self.header} written to {self._fileName}")
|
|
1014
1126
|
self.__teePrintOrNot(f"File {self._fileName} size: {os.path.getsize(self._fileName)}")
|
|
1015
1127
|
else:
|
|
1016
|
-
file = self.get_file_obj('
|
|
1128
|
+
file = self.get_file_obj('wb')
|
|
1017
1129
|
self.release_file_obj(file)
|
|
1018
1130
|
if self.verbose:
|
|
1019
1131
|
self.__teePrintOrNot(f"File {self._fileName} cleared empty")
|
|
@@ -1149,15 +1261,15 @@ memoryOnly:{self.memoryOnly}
|
|
|
1149
1261
|
self.deSynced = True
|
|
1150
1262
|
return False
|
|
1151
1263
|
|
|
1152
|
-
def
|
|
1264
|
+
def hardMapToFile(self):
|
|
1153
1265
|
try:
|
|
1154
1266
|
if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
|
|
1155
1267
|
self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
|
|
1156
|
-
file = self.get_file_obj('
|
|
1268
|
+
file = self.get_file_obj('wb')
|
|
1157
1269
|
if self.header:
|
|
1158
|
-
file.write(self.header+'\n')
|
|
1270
|
+
file.write(self.header.encode(self.encoding,errors='replace') + b'\n')
|
|
1159
1271
|
for key in self:
|
|
1160
|
-
file.write(self.delimiter.join(self[key])+'\n')
|
|
1272
|
+
file.write(self.delimiter.join(self[key]).encode(encoding=self.encoding,errors='replace')+b'\n')
|
|
1161
1273
|
self.release_file_obj(file)
|
|
1162
1274
|
if self.verbose:
|
|
1163
1275
|
self.__teePrintOrNot(f"{len(self)} records written to {self._fileName}")
|
|
@@ -1166,7 +1278,7 @@ memoryOnly:{self.memoryOnly}
|
|
|
1166
1278
|
self.deSynced = False
|
|
1167
1279
|
except Exception as e:
|
|
1168
1280
|
self.release_file_obj(file)
|
|
1169
|
-
self.__teePrintOrNot(f"Failed to write at
|
|
1281
|
+
self.__teePrintOrNot(f"Failed to write at hardMapToFile() to {self._fileName}: {e}",'error')
|
|
1170
1282
|
import traceback
|
|
1171
1283
|
self.__teePrintOrNot(traceback.format_exc(),'error')
|
|
1172
1284
|
self.deSynced = True
|
|
@@ -1178,14 +1290,17 @@ memoryOnly:{self.memoryOnly}
|
|
|
1178
1290
|
try:
|
|
1179
1291
|
if (not self.monitor_external_changes) and self.externalFileUpdateTime < getFileUpdateTimeNs(self._fileName):
|
|
1180
1292
|
self.__teePrintOrNot(f"Warning: Overwriting external changes in {self._fileName}",'warning')
|
|
1293
|
+
if self._fileName.rpartition('.')[2] in COMPRESSED_FILE_EXTENSIONS:
|
|
1294
|
+
# if the file is compressed, we need to use the hardMapToFile method
|
|
1295
|
+
return self.hardMapToFile()
|
|
1181
1296
|
file = self.get_file_obj('r+b')
|
|
1182
1297
|
overWrite = False
|
|
1183
1298
|
if self.header:
|
|
1184
|
-
line = file.readline().decode(self.encoding)
|
|
1299
|
+
line = file.readline().decode(self.encoding,errors='replace')
|
|
1185
1300
|
aftPos = file.tell()
|
|
1186
1301
|
if not _lineContainHeader(self.header,line,verbose = self.verbose,teeLogger = self.teeLogger,strict = self.strict):
|
|
1187
1302
|
file.seek(0)
|
|
1188
|
-
file.write(f'{self.header}\n'.encode(encoding=self.encoding))
|
|
1303
|
+
file.write(f'{self.header}\n'.encode(encoding=self.encoding,errors='replace'))
|
|
1189
1304
|
# if the header is not the same length as the line, we need to overwrite the file
|
|
1190
1305
|
if aftPos != file.tell():
|
|
1191
1306
|
overWrite = True
|
|
@@ -1198,7 +1313,7 @@ memoryOnly:{self.memoryOnly}
|
|
|
1198
1313
|
if overWrite:
|
|
1199
1314
|
if self.verbose:
|
|
1200
1315
|
self.__teePrintOrNot(f"Overwriting {value} to {self._fileName}")
|
|
1201
|
-
file.write(strToWrite.encode(encoding=self.encoding)+b'\n')
|
|
1316
|
+
file.write(strToWrite.encode(encoding=self.encoding,errors='replace')+b'\n')
|
|
1202
1317
|
continue
|
|
1203
1318
|
pos = file.tell()
|
|
1204
1319
|
line = file.readline()
|
|
@@ -1206,10 +1321,10 @@ memoryOnly:{self.memoryOnly}
|
|
|
1206
1321
|
if not line or pos == aftPos:
|
|
1207
1322
|
if self.verbose:
|
|
1208
1323
|
self.__teePrintOrNot(f"End of file reached. Appending {value} to {self._fileName}")
|
|
1209
|
-
file.write(strToWrite.encode(encoding=self.encoding))
|
|
1324
|
+
file.write(strToWrite.encode(encoding=self.encoding,errors='replace'))
|
|
1210
1325
|
overWrite = True
|
|
1211
1326
|
continue
|
|
1212
|
-
strToWrite = strToWrite.encode(encoding=self.encoding).ljust(len(line)-1)+b'\n'
|
|
1327
|
+
strToWrite = strToWrite.encode(encoding=self.encoding,errors='replace').ljust(len(line)-1)+b'\n'
|
|
1213
1328
|
if line != strToWrite:
|
|
1214
1329
|
if self.verbose:
|
|
1215
1330
|
self.__teePrintOrNot(f"Modifing {value} to {self._fileName}")
|
|
@@ -1232,6 +1347,8 @@ memoryOnly:{self.memoryOnly}
|
|
|
1232
1347
|
import traceback
|
|
1233
1348
|
self.__teePrintOrNot(traceback.format_exc(),'error')
|
|
1234
1349
|
self.deSynced = True
|
|
1350
|
+
self.__teePrintOrNot("Trying failback hardMapToFile()")
|
|
1351
|
+
self.hardMapToFile()
|
|
1235
1352
|
self.externalFileUpdateTime = getFileUpdateTimeNs(self._fileName)
|
|
1236
1353
|
self.monitor_external_changes = mec
|
|
1237
1354
|
return self
|
|
@@ -1274,10 +1391,10 @@ memoryOnly:{self.memoryOnly}
|
|
|
1274
1391
|
if self.verbose:
|
|
1275
1392
|
self.__teePrintOrNot(f"Commiting {len(self.appendQueue)} records to {self._fileName}")
|
|
1276
1393
|
self.__teePrintOrNot(f"Before size of {self._fileName}: {os.path.getsize(self._fileName)}")
|
|
1277
|
-
file = self.get_file_obj('
|
|
1394
|
+
file = self.get_file_obj('ab')
|
|
1278
1395
|
while self.appendQueue:
|
|
1279
1396
|
line = self.appendQueue.popleft()
|
|
1280
|
-
file.write(line+'\n')
|
|
1397
|
+
file.write(line.encode(encoding=self.encoding,errors='replace')+b'\n')
|
|
1281
1398
|
self.release_file_obj(file)
|
|
1282
1399
|
if self.verbose:
|
|
1283
1400
|
self.__teePrintOrNot(f"Records commited to {self._fileName}")
|
|
@@ -1302,15 +1419,12 @@ memoryOnly:{self.memoryOnly}
|
|
|
1302
1419
|
if self.verbose:
|
|
1303
1420
|
self.__teePrintOrNot(f"Append thread for {self._fileName} stopped")
|
|
1304
1421
|
|
|
1305
|
-
def get_file_obj(self,modes = '
|
|
1422
|
+
def get_file_obj(self,modes = 'ab'):
|
|
1306
1423
|
self.writeLock.acquire()
|
|
1307
1424
|
try:
|
|
1308
|
-
if
|
|
1309
|
-
|
|
1310
|
-
|
|
1311
|
-
file = open(self._fileName, mode=modes, encoding=self.encoding)
|
|
1312
|
-
else:
|
|
1313
|
-
file = open(self._fileName, mode=modes)
|
|
1425
|
+
if not self.encoding:
|
|
1426
|
+
self.encoding = 'utf8'
|
|
1427
|
+
file = openFileAsCompressed(self._fileName, mode=modes, encoding=self.encoding,teeLogger=self.teeLogger)
|
|
1314
1428
|
# Lock the file after opening
|
|
1315
1429
|
if os.name == 'posix':
|
|
1316
1430
|
fcntl.lockf(file, fcntl.LOCK_EX)
|
|
@@ -1371,7 +1485,7 @@ def __main__():
|
|
|
1371
1485
|
import argparse
|
|
1372
1486
|
parser = argparse.ArgumentParser(description='TSVZed: A TSV / CSV / NSV file manager')
|
|
1373
1487
|
parser.add_argument('filename', type=str, help='The file to read')
|
|
1374
|
-
parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear'], help='The operation to perform. Default: read', default='read')
|
|
1488
|
+
parser.add_argument('operation', type=str,nargs='?', choices=['read','append','delete','clear','scrub'], help='The operation to perform. Note: scrub will also remove all comments. Default: read', default='read')
|
|
1375
1489
|
parser.add_argument('line', type=str, nargs='*', help='The line to append to the Tabular file. it follows as : {key} {value1} {value2} ... if a key without value be inserted, the value will get deleted.')
|
|
1376
1490
|
parser.add_argument('-d', '--delimiter', type=str, help='The delimiter of the Tabular file. Default: Infer from last part of filename, or tab if cannot determine. Note: accept unicode escaped char, raw char, or string "comma,tab,null" will refer to their characters. ', default=...)
|
|
1377
1491
|
parser.add_argument('-c', '--header', type=str, help='Perform checks with this header of the Tabular file. seperate using --delimiter.')
|
|
@@ -1380,7 +1494,7 @@ def __main__():
|
|
|
1380
1494
|
strictMode.add_argument('-s', '--strict', dest = 'strict',action='store_true', help='Strict mode. Do not parse values that seems malformed, check for column numbers / headers')
|
|
1381
1495
|
strictMode.add_argument('-f', '--force', dest = 'strict',action='store_false', help='Force the operation. Ignore checks for column numbers / headers')
|
|
1382
1496
|
parser.add_argument('-v', '--verbose', action='store_true', help='Print verbose output')
|
|
1383
|
-
parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} by {author}')
|
|
1497
|
+
parser.add_argument('-V', '--version', action='version', version=f'%(prog)s {version} @ {COMMIT_DATE} by {author}')
|
|
1384
1498
|
args = parser.parse_args()
|
|
1385
1499
|
args.delimiter = get_delimiter(delimiter=args.delimiter,file_name=args.filename)
|
|
1386
1500
|
if args.header and args.header.endswith('\\'):
|
|
@@ -1412,6 +1526,8 @@ def __main__():
|
|
|
1412
1526
|
appendTabularFile(args.filename, args.line[:1],createIfNotExist = True, header=header, verbose=args.verbose, strict= args.strict, delimiter=args.delimiter)
|
|
1413
1527
|
elif args.operation == 'clear':
|
|
1414
1528
|
clearTabularFile(args.filename, header=header, verbose=args.verbose, verifyHeader=args.strict, delimiter=args.delimiter)
|
|
1529
|
+
elif args.operation == 'scrub':
|
|
1530
|
+
scrubTabularFile(args.filename, verifyHeader = False, verbose=args.verbose,strict= args.strict, delimiter=args.delimiter, defaults=defaults)
|
|
1415
1531
|
else:
|
|
1416
1532
|
print("Invalid operation")
|
|
1417
1533
|
if __name__ == '__main__':
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{tsvz-3.24 → tsvz-3.26}/setup.py
RENAMED
|
File without changes
|