damage 0.3.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ MIT License
2
+
3
+ Copyright 2021 University of British Columbia Library
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
6
+
7
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
8
+
9
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
damage-0.3.14/PKG-INFO ADDED
@@ -0,0 +1,41 @@
1
+ Metadata-Version: 2.3
2
+ Name: damage
3
+ Version: 0.3.14
4
+ Summary: File manifest generator and python package for statistical data files and documentation
5
+ License: MIT
6
+ Keywords: metadata,SAS,SPSS,Stata,rectangular files,manifest generator
7
+ Author: Paul Lesack
8
+ Author-email: paul.lesack@ubc.ca
9
+ Requires-Python: >=3.12, <4
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Environment :: Console
12
+ Classifier: Environment :: MacOS X
13
+ Classifier: Environment :: Win32 (MS Windows)
14
+ Classifier: Environment :: X11 Applications
15
+ Classifier: Intended Audience :: Education
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.12
19
+ Classifier: Programming Language :: Python :: 3.13
20
+ Classifier: Topic :: Education
21
+ Classifier: Topic :: Utilities
22
+ Requires-Dist: chardet (>=5.2.0,<6.0.0)
23
+ Requires-Dist: freesimplegui (>=5.2.0,<6.0.0)
24
+ Requires-Dist: numpy (>=2.2.3,<3.0.0)
25
+ Requires-Dist: pandas (>=2.2.3,<3.0.0)
26
+ Requires-Dist: pyreadstat (>=1.2.8,<2.0.0)
27
+ Project-URL: Homepage, https://ubc-library-rc.github.io/damage
28
+ Project-URL: Issue Tracker, https://github.com/ubc-library-rc/damage/issues
29
+ Project-URL: Repository, https://github.com/ubc-library-rc/damage
30
+ Description-Content-Type: text/markdown
31
+
32
+ # File manifest tools: Damage
33
+
34
+ Damage is a simple command-line utility which outputs a file manifest in a variety of formats, with a special focus on statistical package files from SPSS, SAS and Stata. It's also the name of the Python package which you can use in your own code and which powers the _damage_ utility.
35
+
36
+ Source code and documentation files are available at <https://github.com/ubc-library-rc/damage>. Documentation is in the intuitively named _docs_ subdirectory.
37
+
38
+ Binary versions of the *damage* utility for Windows and MacOS computers can be found on the project's Github release page: <https://github.com/ubc-library-rc/damage/releases>.
39
+
40
+ A less utilitarian documentation viewing experience is available at <https://ubc-library-rc.github.io/damage/>.
41
+
@@ -0,0 +1,9 @@
1
+ # File manifest tools: Damage
2
+
3
+ Damage is a simple command-line utility which outputs a file manifest in a variety of formats, with a special focus on statistical package files from SPSS, SAS and Stata. It's also the name of the Python package which you can use in your own code and which powers the _damage_ utility.
4
+
5
+ Source code and documentation files are available at <https://github.com/ubc-library-rc/damage>. Documentation is in the intuitively named _docs_ subdirectory.
6
+
7
+ Binary versions of the *damage* utility for Windows and MacOS computers can be found on the project's Github release page: <https://github.com/ubc-library-rc/damage/releases>.
8
+
9
+ A less utilitarian documentation viewing experience is available at <https://ubc-library-rc.github.io/damage/>.
@@ -0,0 +1,52 @@
1
+ [project]
2
+ name = "damage"
3
+ version = "0.3.14"
4
+ description = "File manifest generator and python package for statistical data files and documentation"
5
+ authors = [
6
+ {name = "Paul Lesack",email = "paul.lesack@ubc.ca"}
7
+ ]
8
+ license = {text = "MIT"}
9
+ readme = "README.md"
10
+ requires-python = ">=3.12, <4"
11
+ dependencies = [
12
+ "freesimplegui (>=5.2.0,<6.0.0)",
13
+ "chardet (>=5.2.0,<6.0.0)",
14
+ "numpy (>=2.2.3,<3.0.0)",
15
+ "pandas (>=2.2.3,<3.0.0)",
16
+ "pyreadstat (>=1.2.8,<2.0.0)"
17
+ ]
18
+ keywords =["metadata","SAS", "SPSS", "Stata", "rectangular files", "manifest generator"]
19
+
20
+ [project.urls]
21
+ homepage = "https://ubc-library-rc.github.io/damage"
22
+ repository = "https://github.com/ubc-library-rc/damage"
23
+ "Issue Tracker" = "https://github.com/ubc-library-rc/damage/issues"
24
+
25
+ [tool.poetry]
26
+ packages = [{"include" = "damage", "from"="src"}]
27
+ classifiers = ["Development Status :: 4 - Beta",
28
+ "Environment :: Console",
29
+ "Environment :: MacOS X",
30
+ "Environment :: Win32 (MS Windows)",
31
+ "Environment :: X11 Applications",
32
+ "Intended Audience :: Education",
33
+ "License :: OSI Approved :: MIT License",
34
+ "Topic :: Education",
35
+ "Topic :: Utilities"]
36
+
37
+ [tool.poetry.group.dev.dependencies]
38
+ pylint = ">=3.3.4"
39
+ mkdocs = ">=1.6.1"
40
+ pydoc-markdown = ">=4.0.0"
41
+
42
+ [project.scripts]
43
+ damage = "damage.console.damage_cmd:main"
44
+ damage-gui = "damage.gui.damage_gui:main"
45
+
46
+ #windows only?
47
+ [project.gui-scripts]
48
+ damage-gui = "damage.gui.damage_gui:main"
49
+
50
+ [build-system]
51
+ requires = ["poetry-core>=2.0.0,<3.0.0"]
52
+ build-backend = "poetry.core.masonry.api"
@@ -0,0 +1,444 @@
1
+ '''
2
+ Manifest generator for data files.
3
+
4
+ Produces a text file with user specified checksums for all files
5
+ from the top of a specified tree and checks line length
6
+ and ASCII character status for text files.
7
+
8
+ For statistics program files:
9
+ SAS .sas7bdat
10
+ SPSS .sav
11
+ Stata .dta
12
+
13
+ Checker() will report number of cases and variables as
14
+ rows and columns respectively.
15
+
16
+ '''
17
+
18
+ import copy
19
+ import csv
20
+ import hashlib
21
+ import io
22
+ import json
23
+ import logging
24
+ import mimetypes
25
+ import pathlib
26
+ import string
27
+
28
+ import chardet
29
+ import pyreadstat
30
+
31
#Module-level logger: the root logger, so any configuration done by an
#embedding application applies here too.
LOGGER = logging.getLogger()

#Single source of truth for the package version.
VERSION = (0, 3, 14)
__version__ = '.'.join([str(x) for x in VERSION])
35
+
36
+ #PDB note check private variables with self._Checker__private_var
37
+ #Note *single* underscore before Checker
38
+ class Checker():
39
+ '''
40
+ A collection of various tools attached to a file
41
+ '''
42
+
43
    def __init__(self, fname: str) -> None:
        '''
        Initializes Checker instance

        fname : str
            Path to file

        The whole file is read into an in-memory binary buffer (and, for
        text files, into a decoded text buffer as well), so memory use is
        proportional to file size.
        '''
        #Commercial stats files extensions
        #I am aware that extension checking is not perfect
        self.statfiles = ['.dta', '.sav', '.sas7bdat']
        #Extensions always treated as text, whatever mimetype says
        #brute force is best force
        self.textfiles= ['.dat', '.txt', '.md', '.csv',
                         '.tsv', '.asc', '.html', '.xml',
                         '.xsd', '.htm', '.log', '.nfo',
                         '.text', '.xsl', '.py', '.r',
                         '.toml', '.yaml', '.yml']
        self.fname = pathlib.Path(fname)
        #__istext must be set before __encoding(), which reads it
        self.__istext = self.__istextfile()
        self.__text_obj = None
        with open(self.fname, 'rb') as fil:
            self.__fobj_bin = io.BytesIO(fil.read())
        self.encoding = self.__encoding()
        if self.__istext:
            #Re-read as text using the encoding chardet detected
            with open(self.fname, encoding=self.encoding.get('encoding')) as f:
                self.__text_obj = io.StringIO(f.read())
69
+
70
+
71
+ @property
72
+ def hidden(self)->bool:
73
+ '''
74
+ Returns True if file is hidden (ie, startswith '.')
75
+ or is in in a hidden directory (ie, any directory on the path
76
+ starts with '.')
77
+ '''
78
+ if any([x.startswith('.') for x in self.fname.parts]):
79
+ return True
80
+ return False
81
+
82
+ def __istextfile(self):
83
+ '''
84
+ Check to see if file is a text file based on mimetype.
85
+ Works with extensions only which is not ideal
86
+ '''
87
+ try:
88
+ if ('text' in mimetypes.guess_file_type(self.fname)
89
+ or self.fname.suffix.lower() in self.textfiles):
90
+ return True
91
+ except AttributeError: #soft deprecation fix
92
+ if ('text' in mimetypes.guess_type(self.fname)
93
+ or self.fname.suffix.lower() in self.textfiles):
94
+ return True
95
+
96
+ return False
97
+
98
+ def __encoding(self) -> dict: #DONE
99
+ '''
100
+ Returns most likely encoding of self.fname, dict with keys
101
+ encoding, confidence, language (the output of chardet.detect)
102
+ and sets Checker.__is_text
103
+ '''
104
+ enc = chardet.detect(self.__fobj_bin.read())
105
+ self.__fobj_bin.seek(0) #leave it as you found it
106
+ if self.__istext:
107
+ return enc
108
+
109
+ return {'encoding': None,
110
+ 'confidence': 0.0,
111
+ 'language' : ''}
112
+
113
    def __del__(self) -> None:
        '''
        Destructor closes the in-memory binary buffer.

        NOTE(review): assumes __init__ completed; if the open() in the
        constructor raised, __fobj_bin will not exist and this will emit
        an AttributeError warning at collection time.
        '''
        self.__fobj_bin.close()
118
+
119
+ def produce_digest(self, prot: str = 'md5', blocksize: int = 2*16) -> str: #DONE
120
+ '''
121
+ Returns hex digest for object
122
+
123
+ fname : str
124
+ Path to a file object
125
+
126
+ prot : str
127
+ Hash type. Supported hashes: 'sha1', 'sha224', 'sha256',
128
+ 'sha384', 'sha512', 'blake2b', 'blake2s', 'md5'.
129
+ Default: 'md5'
130
+
131
+ blocksize : int
132
+ Read block size in bytes
133
+ '''
134
+ ok_hash = {'sha1' : hashlib.sha1(),
135
+ 'sha224' : hashlib.sha224(),
136
+ 'sha256' : hashlib.sha256(),
137
+ 'sha384' : hashlib.sha384(),
138
+ 'sha512' : hashlib.sha512(),
139
+ 'blake2b' : hashlib.blake2b(),
140
+ 'blake2s' : hashlib.blake2s(),
141
+ 'md5': hashlib.md5()}
142
+
143
+ self.__fobj_bin.seek(0)
144
+ try:
145
+ _hash = ok_hash[prot]
146
+ except (UnboundLocalError, KeyError):
147
+ message = ('Unsupported hash type. Valid values are '
148
+ f'{list(ok_hash)}.')
149
+ LOGGER.exception('Unsupported hash type. Valid values are %s', message)
150
+ raise
151
+
152
+ fblock = self.__fobj_bin.read(blocksize)
153
+ while fblock:
154
+ _hash.update(fblock)
155
+ fblock = self.__fobj_bin.read(blocksize)
156
+ return _hash.hexdigest()
157
+
158
+ def flat_tester(self, **kwargs) -> dict: #DONE
159
+ '''
160
+ Checks file for line length and number of records.
161
+
162
+ Returns a dictionary:
163
+
164
+ `{'min_cols': int, 'max_cols' : int, 'numrec':int, 'constant' : bool}`
165
+ '''
166
+ if not kwargs.get('flatfile'):
167
+ return {'min_cols': 'N/A', 'max_cols': 'N/A', 'numrec' : 'N/A',
168
+ 'constant': 'N/A', 'encoding' : 'N/A'}
169
+
170
+ if self.fname.suffix.lower() in self.statfiles:
171
+ return self._flat_tester_commercial(**kwargs)
172
+
173
+ if self.__istext:
174
+ return self._flat_tester_txt()
175
+ #this should not happen but you never know
176
+ return {'min_cols': 'N/A', 'max_cols': 'N/A', 'numrec' : 'N/A',
177
+ 'constant': 'N/A', 'encoding' : 'N/A'}
178
+
179
+ def _flat_tester_commercial(self, **kwargs) -> dict: #DONE
180
+ '''
181
+ Checks SPSS sav, SAS sas7bdat and Stata .dta files for rectangularity
182
+
183
+ Returns a dictionary:
184
+
185
+ `{'min_cols': int, 'max_cols': int, 'numrec' : int,
186
+ 'constant': True, 'encoding': str}`
187
+
188
+ These files are by definition rectanglar, at least as checked here
189
+ by pyreadstat/pandas, so constant will always == True.
190
+ '''
191
+ if not kwargs.get('flatfile'):
192
+ return {'min_cols': 'N/A', 'max_cols': 'N/A', 'numrec' : 'N/A',
193
+ 'constant': 'N/A', 'encoding': 'N/A'}
194
+ options = {'.sav' : pyreadstat.read_sav,
195
+ '.dta' : pyreadstat.read_dta,
196
+ '.sas7bdat' : pyreadstat.read_sas7bdat}
197
+ meta = options[self.fname.suffix.lower()](self.fname)[1]
198
+ #self._encoding = meta.file_encoding
199
+ self.encoding['encoding'] = meta.file_encoding
200
+ return {'min_cols':meta.number_columns,
201
+ 'max_cols':meta.number_columns,
202
+ 'numrec': meta.number_rows,
203
+ 'constant':True,
204
+ 'encoding': self.encoding['encoding']}
205
+
206
    def _flat_tester_txt(self) -> dict:
        '''
        Checks a text file for line length and number of records.

        Returns a dictionary:

        `{'min_cols': int, 'max_cols' : int, 'numrec':int, 'constant' : bool}`
        plus the detected 'encoding'.

        Line lengths include the trailing newline character.

        NOTE(review): the first line — read via readline() to seed the
        min/max values — is never added to linecount, so numrec is one
        less than the total number of lines. Possibly a deliberate
        header-row exclusion (matching the stats-file readers, which
        also exclude variable names from the row count) — confirm.
        '''
        linecount = 0
        self.__text_obj.seek(0)
        if not self.__istext:
            raise TypeError('Not a text file')
        #first line seeds the min/max line lengths
        maxline = len(self.__text_obj.readline())
        minline = maxline
        orig = maxline # baseline to which new values are compared
        for row in self.__text_obj.readlines():
            linecount += 1
            maxline = max(maxline, len(row))
            minline = min(minline, len(row))
        #rectangular iff every line length matches the first
        constant = bool(maxline == orig == minline)
        self.__text_obj.seek(0)
        return {'min_cols': minline, 'max_cols': maxline, 'numrec' : linecount,
                'constant': constant, 'encoding': self.encoding['encoding']}
229
+
230
+ def non_ascii_tester(self, **kwargs) -> list: #DONE
231
+ '''
232
+ Returns a list of dicts of positions of non-ASCII characters in a text file.
233
+
234
+ `[{'row': int, 'col':int, 'char':str}...]`
235
+
236
+ fname : str
237
+ Path/filename
238
+
239
+ Keyword arguments:
240
+
241
+ #flatfile : bool
242
+ asctest : bool
243
+ — Perform character check (assuming it is text)
244
+ '''
245
+ if (kwargs.get('asctest', False)
246
+ or not self.__istext
247
+ or not kwargs.get('flatfile')):
248
+ return []
249
+ outlist = []
250
+ self.__text_obj.seek(0)
251
+ for rown, row in enumerate(self.__text_obj):
252
+ for coln, char in enumerate(row):
253
+ if char not in string.printable and char != '\x00':
254
+ non_asc = {'row':rown+1, 'col': coln+1, 'char':char}
255
+ outlist.append(non_asc)
256
+ self.__text_obj.seek(0)
257
+ return outlist
258
+
259
+ def null_count(self, **kwargs) -> dict: #DONE
260
+ '''
261
+ Returns an integer count of null characters in the file
262
+ ('\x00') or None if skipped
263
+
264
+ Keyword arguments:
265
+
266
+ flatfile : bool
267
+ — Test is useless if not a text file. If False, returns 'N/A'
268
+ '''
269
+ if (not kwargs.get('flatfile')
270
+ or not self.__istext
271
+ or not kwargs.get('null_chars')):
272
+ return None
273
+ self.__text_obj.seek(0)
274
+ count = self.__text_obj.read().count('\x00')
275
+ if not count:
276
+ return None
277
+ return count
278
+
279
+ def dos(self, **kwargs) -> bool: #DONE
280
+ '''
281
+ Checks for presence of carriage returns in file
282
+
283
+ Returns True if a carriage return ie, ord(13) is present
284
+
285
+ Keyword arguments:
286
+
287
+ flatfile : bool
288
+ — Perform rectangularity check. If False, returns dictionary
289
+ with all values as 'N/A'
290
+ '''
291
+ if not kwargs.get('flatfile') or not self.__istext:
292
+ return None
293
+ self.__fobj_bin.seek(0)
294
+ for text in self.__fobj_bin:
295
+ if b'\r\n' in text:
296
+ return True
297
+ return False
298
+
299
+ def _mime_type(self, fname:pathlib.Path)->tuple:
300
+ '''
301
+ Returns mimetype or 'application/octet-stream'
302
+ '''
303
+ try:
304
+ out = mimetypes.guess_file_type(fname, strict=False)[0]
305
+ except AttributeError:
306
+ #soft deprecation
307
+ out = mimetypes.guess_type(fname)[0]
308
+ if not out:
309
+ out = 'application/octet-stream'
310
+ return out
311
+
312
+ def _report(self, **kwargs) -> dict: #DONE
313
+ '''
314
+ Returns a dictionary of outputs based on keywords below.
315
+ Performs each test and returns the appropriate values. A convenience
316
+ function so that you don't have to run the tests individually.
317
+
318
+ Sample output:
319
+
320
+ ```
321
+ {'filename':'/tmp/test.csv',
322
+ 'flat': True,
323
+ 'min_cols': 100, 'max_cols': 100, 'numrec' : 101, 'constant': True,
324
+ 'nonascii':False,
325
+ 'dos':False}
326
+ ```
327
+ Accepted keywords and defaults:
328
+ digest : str
329
+ — Hash algorithm. Default 'md5'
330
+
331
+ flat : bool
332
+ — Flat file checking.
333
+
334
+ nonascii : bool
335
+ — Check for non-ASCII characters.
336
+
337
+ flatfile : bool
338
+ — Perform rectangularity check. If False, returns dictionary
339
+ with all values as 'N/A'
340
+
341
+ null_chars : bool
342
+ - check for null characters
343
+ '''
344
+ out = {'filename': self.fname}
345
+ digest = kwargs.get('digest', 'md5')
346
+ #dos = kwargs.get('dos')
347
+
348
+ out.update({'digestType' : digest})
349
+ out.update({'digest' : self.produce_digest(digest)})
350
+ #out.update({'flat': self.flat_tester(**kwargs)})
351
+ out.update(self.flat_tester(**kwargs))
352
+ #out.update({'flat':'FFFFFFFFFFFF'})
353
+ out.update({'nonascii': self.non_ascii_tester(**kwargs)})
354
+ out.update({'encoding': self.encoding['encoding']})
355
+ out.update({'null_chars': self.null_count(**kwargs)})
356
+ out.update({'mimetype': self._mime_type(self.fname)})
357
+ #if dos:
358
+ # out.update({'dos' : self.dos(**kwargs)})
359
+ #else:
360
+ # out.update({'dos': None})
361
+ out.update({'dos': self.dos(**kwargs)})
362
+ return out
363
+
364
+ def _manifest_txt(self, **kwargs)->str:
365
+ '''
366
+ Returns manifest as plain text
367
+ '''
368
+ return '\n'.join([f'{k}: {v}' for k,v in kwargs['report'].items()
369
+ if v not in ['', None]])
370
+
371
+ def _manifest_json(self, **kwargs)->str:
372
+ '''
373
+ Returns manifest as JSON
374
+ '''
375
+ out = kwargs['report'].copy()
376
+ out['filename'] = str(kwargs['report']['filename'])
377
+ return json.dumps(out)
378
+
379
+ def _manifest_csv(self, **kwargs)->str:
380
+ '''
381
+ Returns manifest as [whatever]-separated value
382
+ '''
383
+ outstr = io.StringIO(newline='')
384
+ writer = csv.DictWriter(outstr, fieldnames=kwargs['report'].keys(),
385
+ delimiter=kwargs.get('sep', ','),
386
+ quoting=csv.QUOTE_MINIMAL)
387
+ if kwargs.get('headers'):
388
+ writer.writeheader()
389
+ writer.writerow(kwargs['report'])
390
+ outstr.seek(0)
391
+ return outstr.read()
392
+
393
+ def manifest(self, **kwargs) -> str: #really as str #DONE
394
+ '''
395
+ Returns desired output type as string
396
+
397
+ out : str
398
+ — Acceptable values are 'txt', 'json', 'csv'
399
+ 'txt' Plain text
400
+ 'json' JSON
401
+ 'csv' Comma-separated value
402
+
403
+ Accepted keywords and defaults:
404
+
405
+ digest : str
406
+ — Hash algorithm. Default 'md5'
407
+
408
+ flat : bool
409
+ — Flat file checking. Default True
410
+
411
+ nonascii : bool
412
+ — Check for non-ASCII characters. Default True
413
+
414
+ dos : bool
415
+ — check for Windows CR/LF combo. Default True
416
+
417
+ flatfile : bool
418
+ — Perform rectangularity check. If False, returns dictionary
419
+ with all values as 'N/A'
420
+
421
+ headers : bool
422
+ — Include csv header (only has any effect with out='csv')
423
+ Default is False
424
+
425
+ sep: str
426
+ — Separator if you want a different plain text separator like a
427
+ tab (\t) or pipe (|). Only functional with csv output, obviously.
428
+
429
+ '''
430
+ report = self._report(**kwargs)
431
+ report_type={'txt': self._manifest_txt,
432
+ 'json': self._manifest_json,
433
+ 'csv': self._manifest_csv,
434
+ 'tsv': self._manifest_csv,
435
+ 'psv': self._manifest_csv}
436
+
437
+ try:
438
+ return report_type[kwargs['out']](report=report, **kwargs)
439
+ except KeyError:
440
+ LOGGER.error('Unsupported manifest type %s; defaulting to text', kwargs['out'])
441
+ return report_type[kwargs['out']](report=report, out='txt', **kwargs)
442
+
443
if __name__ == '__main__':
    #Library module: nothing to do when executed directly; the CLI
    #lives in the console entry points.
    pass
@@ -0,0 +1,126 @@
1
+ '''
2
+ Manifest generator for data files.
3
+
4
+ Produces a text file with user specified checksums for all files
5
+ from the top of a specified tree and checks line length
6
+ and ASCII character status for text files.
7
+
8
+ For statistics program files:
9
+ SAS .sas7bdat
10
+ SPSS .sav
11
+ Stata .dta
12
+
13
+ Checker() will report number of cases and variables as
14
+ rows and columns respectively.
15
+
16
+ '''
17
+
18
+ import argparse
19
+ import glob #God I hate Windows
20
+ import json
21
+ import os
22
+ import pathlib
23
+ import sys
24
+
25
+ import damage
26
+
27
def parse() -> argparse.ArgumentParser:
    '''
    Separates argparser into function. Returns argparse.ArgumentParser

    (The original annotation was ``-> argparse.ArgumentParser()`` —
    a *call* that built and discarded a parser when the annotation was
    evaluated; the class itself is the correct annotation.)
    '''
    desc = ('Produces a text, csv or JSON output with checksums for files, '
            'testing for Windows CRLF combinations, '
            'as well as checking text files for regularity and non/ASCII characters')
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('files', help='Files to check. Wildcards acceptable (eg, *)',
                        nargs='+', default=' ')
    #note 'prog' is built into argparse
    parser.add_argument('-v', '--version', action='version',
                        version='%(prog)s '+damage.__version__,
                        help='Show version number and exit')
    parser.add_argument('-o', '--output', dest='out',
                        help='Output format. One of txt, csv, json, tsv',
                        default='txt',
                        choices=['txt', 'csv', 'tsv', 'json'],
                        type=str.lower)
    parser.add_argument('-n', '--no-flat', action='store_false', dest='flatfile',
                        help="Don't check text files for rectangularity")
    parser.add_argument('-r', '--recursive', action='store_true', dest='recur',
                        help='Recursive *directory* processing of file tree. Assumes that the '
                             'arguments point to a directory (eg, tmp/), and a slash will '
                             'be appended if one does not exist')
    parser.add_argument('-t', '--hash-type', dest='digest', default='md5',
                        help="Checksum hash type. Supported hashes: 'sha1', "
                             "'sha224', 'sha256', 'sha384', 'sha512', 'blake2b', "
                             "'blake2s', 'md5'. Default: 'md5'",
                        choices=['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512',
                                 'blake2b', 'blake2s'],
                        type=str.lower)
    parser.add_argument('-a', '--no-ascii', action='store_true', dest='asctest',
                        help="Don't check text files for non-ASCII characters")
    parser.add_argument('-f', '--to-file',
                        help='Output to -f [file] instead of stdout')
    return parser
+ return parser
63
+
64
+ def recurse_files(inlist) -> map:
65
+ '''
66
+ Returns a map object with pathlib.Paths of files
67
+ '''
68
+ outlist = []
69
+ for flist in inlist:
70
+ rec = os.walk(flist)
71
+ outlist += [pathlib.Path(x[0], y) for x in rec for y in x[2]]
72
+ return outlist #includes hidden files
73
+
74
+ def main(): #pylint: disable=too-many-branches
75
+ '''
76
+ Main function to output manifests to stdout.
77
+ '''
78
+ separator_types = {'csv': ',', 'tsv': '\t'}
79
+ #Purely for formatting output
80
+ line_spacer = {'txt':'\n\n', 'csv':'', 'tsv': ''}
81
+ parser = parse()
82
+ args = parser.parse_args()
83
+ if not args.recur:
84
+ #Windows does not do wildcard expansion at the shell level
85
+ if sys.platform.startswith('win'): #Maybe they will have win64 sometime:
86
+ files = map(pathlib.Path, [y for x in args.files for y in glob.glob(x)])
87
+ else:
88
+ files = map(pathlib.Path, list(args.files))
89
+ else:
90
+ files = recurse_files(args.files)
91
+
92
+
93
+ output = []
94
+ try: ###
95
+ for num, fil in enumerate(files):
96
+ if not fil.is_file() or not fil.exists():
97
+ continue
98
+ testme = damage.Checker(fil)
99
+ if args.out in separator_types and num == 0:
100
+ output.append(testme.manifest(headers=True,
101
+ sep=separator_types.get(args.out),
102
+ **vars(args)))
103
+ else:
104
+ output.append(testme.manifest(sep=separator_types.get(args.out),
105
+ **vars(args)))
106
+ if not args.out == 'json':
107
+ #print(line_spacer[args.out].join(output).strip())
108
+ out_info =line_spacer[args.out].join(output).strip()
109
+ else:
110
+ outjson = ('{"files" :' +
111
+ '[' + ','.join(output) + ']'
112
+ + '}')
113
+ out_info = json.dumps(json.loads(outjson)) #validate
114
+ except Exception as err: #pylint: disable=broad-exception-caught
115
+ print(f'Abnormal program termination {err}')
116
+ sys.exit()
117
+
118
+ if args.to_file:
119
+ with open(pathlib.Path(args.to_file), mode='w',
120
+ encoding='utf-8') as outf:
121
+ outf.write(out_info)
122
+ else:
123
+ print(out_info)
124
+
125
+ if __name__ == '__main__':
126
+ main()