damage 0.3.14__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- damage-0.3.14/LICENCE.md +9 -0
- damage-0.3.14/PKG-INFO +41 -0
- damage-0.3.14/README.md +9 -0
- damage-0.3.14/pyproject.toml +52 -0
- damage-0.3.14/src/damage/__init__.py +444 -0
- damage-0.3.14/src/damage/console/damage_cmd.py +126 -0
- damage-0.3.14/src/damage/gui/assets/DamageAppIcon.icns +0 -0
- damage-0.3.14/src/damage/gui/assets/DamageAppIcon.ico +0 -0
- damage-0.3.14/src/damage/gui/assets/DamageAppIcon.jpg +0 -0
- damage-0.3.14/src/damage/gui/assets/DamageAppIcon.png +0 -0
- damage-0.3.14/src/damage/gui/assets/LICENCE.txt +10 -0
- damage-0.3.14/src/damage/gui/damage_gui.py +678 -0
damage-0.3.14/LICENCE.md
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright 2021 University of British Columbia Library
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
|
|
6
|
+
|
|
7
|
+
The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
8
|
+
|
|
9
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
damage-0.3.14/PKG-INFO
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
Metadata-Version: 2.3
|
|
2
|
+
Name: damage
|
|
3
|
+
Version: 0.3.14
|
|
4
|
+
Summary: File manifest generator and python package for statistical data files and documentation
|
|
5
|
+
License: MIT
|
|
6
|
+
Keywords: metadata,SAS,SPSS,Stata,rectangular files,manifest generator
|
|
7
|
+
Author: Paul Lesack
|
|
8
|
+
Author-email: paul.lesack@ubc.ca
|
|
9
|
+
Requires-Python: >=3.12, <4
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Environment :: Console
|
|
12
|
+
Classifier: Environment :: MacOS X
|
|
13
|
+
Classifier: Environment :: Win32 (MS Windows)
|
|
14
|
+
Classifier: Environment :: X11 Applications
|
|
15
|
+
Classifier: Intended Audience :: Education
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
20
|
+
Classifier: Topic :: Education
|
|
21
|
+
Classifier: Topic :: Utilities
|
|
22
|
+
Requires-Dist: chardet (>=5.2.0,<6.0.0)
|
|
23
|
+
Requires-Dist: freesimplegui (>=5.2.0,<6.0.0)
|
|
24
|
+
Requires-Dist: numpy (>=2.2.3,<3.0.0)
|
|
25
|
+
Requires-Dist: pandas (>=2.2.3,<3.0.0)
|
|
26
|
+
Requires-Dist: pyreadstat (>=1.2.8,<2.0.0)
|
|
27
|
+
Project-URL: Homepage, https://ubc-library-rc.github.io/damage
|
|
28
|
+
Project-URL: Issue Tracker, https://github.com/ubc-library-rc/damage/issues
|
|
29
|
+
Project-URL: Repository, https://github.com/ubc-library-rc/damage
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
|
|
32
|
+
# File manifest tools: Damage
|
|
33
|
+
|
|
34
|
+
Damage is a simple command-line utility which outputs a file manifest in a variety of formats, with a special focus on statistical package files from SPSS, SAS and Stata. It's also the name of the Python package which you can use in your own code and which powers the _damage_ utility.
|
|
35
|
+
|
|
36
|
+
Source code and documentation files are available at <https://github.com/ubc-library-rc/damage>. Documentation is in the intuitively named _docs_ subdirectory.
|
|
37
|
+
|
|
38
|
+
Binary versions of the *damage* utility for Windows and MacOS computers can be found on the project's Github release page: <https://github.com/ubc-library-rc/damage/releases>.
|
|
39
|
+
|
|
40
|
+
A less utilitarian documentation viewing experience is available at <https://ubc-library-rc.github.io/damage/>.
|
|
41
|
+
|
damage-0.3.14/README.md
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
# File manifest tools: Damage
|
|
2
|
+
|
|
3
|
+
Damage is a simple command-line utility which outputs a file manifest in a variety of formats, with a special focus on statistical package files from SPSS, SAS and Stata. It's also the name of the Python package which you can use in your own code and which powers the _damage_ utility.
|
|
4
|
+
|
|
5
|
+
Source code and documentation files are available at <https://github.com/ubc-library-rc/damage>. Documentation is in the intuitively named _docs_ subdirectory.
|
|
6
|
+
|
|
7
|
+
Binary versions of the *damage* utility for Windows and MacOS computers can be found on the project's Github release page: <https://github.com/ubc-library-rc/damage/releases>.
|
|
8
|
+
|
|
9
|
+
A less utilitarian documentation viewing experience is available at <https://ubc-library-rc.github.io/damage/>.
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "damage"
|
|
3
|
+
version = "0.3.14"
|
|
4
|
+
description = "File manifest generator and python package for statistical data files and documentation"
|
|
5
|
+
authors = [
|
|
6
|
+
{name = "Paul Lesack",email = "paul.lesack@ubc.ca"}
|
|
7
|
+
]
|
|
8
|
+
license = {text = "MIT"}
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12, <4"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"freesimplegui (>=5.2.0,<6.0.0)",
|
|
13
|
+
"chardet (>=5.2.0,<6.0.0)",
|
|
14
|
+
"numpy (>=2.2.3,<3.0.0)",
|
|
15
|
+
"pandas (>=2.2.3,<3.0.0)",
|
|
16
|
+
"pyreadstat (>=1.2.8,<2.0.0)"
|
|
17
|
+
]
|
|
18
|
+
keywords =["metadata","SAS", "SPSS", "Stata", "rectangular files", "manifest generator"]
|
|
19
|
+
|
|
20
|
+
[project.urls]
|
|
21
|
+
homepage = "https://ubc-library-rc.github.io/damage"
|
|
22
|
+
repository = "https://github.com/ubc-library-rc/damage"
|
|
23
|
+
"Issue Tracker" = "https://github.com/ubc-library-rc/damage/issues"
|
|
24
|
+
|
|
25
|
+
[tool.poetry]
|
|
26
|
+
packages = [{"include" = "damage", "from"="src"}]
|
|
27
|
+
classifiers = ["Development Status :: 4 - Beta",
|
|
28
|
+
"Environment :: Console",
|
|
29
|
+
"Environment :: MacOS X",
|
|
30
|
+
"Environment :: Win32 (MS Windows)",
|
|
31
|
+
"Environment :: X11 Applications",
|
|
32
|
+
"Intended Audience :: Education",
|
|
33
|
+
"License :: OSI Approved :: MIT License",
|
|
34
|
+
"Topic :: Education",
|
|
35
|
+
"Topic :: Utilities"]
|
|
36
|
+
|
|
37
|
+
[tool.poetry.group.dev.dependencies]
|
|
38
|
+
pylint = ">=3.3.4"
|
|
39
|
+
mkdocs = ">=1.6.1"
|
|
40
|
+
pydoc-markdown = ">=4.0.0"
|
|
41
|
+
|
|
42
|
+
[project.scripts]
|
|
43
|
+
damage = "damage.console.damage_cmd:main"
|
|
44
|
+
damage-gui = "damage.gui.damage_gui:main"
|
|
45
|
+
|
|
46
|
+
#windows only?
|
|
47
|
+
[project.gui-scripts]
|
|
48
|
+
damage-gui = "damage.gui.damage_gui:main"
|
|
49
|
+
|
|
50
|
+
[build-system]
|
|
51
|
+
requires = ["poetry-core>=2.0.0,<3.0.0"]
|
|
52
|
+
build-backend = "poetry.core.masonry.api"
|
|
@@ -0,0 +1,444 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Manifest generator for data files.
|
|
3
|
+
|
|
4
|
+
Produces a text file with user specified checksums for all files
|
|
5
|
+
from the top of a specified tree and checks line length
|
|
6
|
+
and ASCII character status for text files.
|
|
7
|
+
|
|
8
|
+
For statistics program files:
|
|
9
|
+
SAS .sas7bdat
|
|
10
|
+
SPSS .sav
|
|
11
|
+
Stata .dta
|
|
12
|
+
|
|
13
|
+
Checker() will report number of cases and variables as
|
|
14
|
+
rows and columns respectively.
|
|
15
|
+
|
|
16
|
+
'''
|
|
17
|
+
|
|
18
|
+
import copy
|
|
19
|
+
import csv
|
|
20
|
+
import hashlib
|
|
21
|
+
import io
|
|
22
|
+
import json
|
|
23
|
+
import logging
|
|
24
|
+
import mimetypes
|
|
25
|
+
import pathlib
|
|
26
|
+
import string
|
|
27
|
+
|
|
28
|
+
import chardet
|
|
29
|
+
import pyreadstat
|
|
30
|
+
|
|
31
|
+
LOGGER = logging.getLogger()
|
|
32
|
+
|
|
33
|
+
VERSION = (0, 3, 14)
|
|
34
|
+
__version__ = '.'.join([str(x) for x in VERSION])
|
|
35
|
+
|
|
36
|
+
#PDB note check private variables with self._Checker__private_var
|
|
37
|
+
#Note *single* underscore before Checker
|
|
38
|
+
class Checker():
|
|
39
|
+
'''
|
|
40
|
+
A collection of various tools attached to a file
|
|
41
|
+
'''
|
|
42
|
+
|
|
43
|
+
def __init__(self, fname: str) -> None: #DONE
    '''
    Initializes Checker instance.

    Reads the whole file into an in-memory binary buffer and, when the
    file looks like text, also into a decoded StringIO copy, so every
    later check can rescan the content without touching the disk again.

    fname : str
        Path to file
    '''
    #Commercial stats files extensions
    #I am aware that extension checking is not perfect
    self.statfiles = ['.dta', '.sav', '.sas7bdat']
    #Extensions always treated as plain text; brute force is best force
    self.textfiles= ['.dat', '.txt', '.md', '.csv',
                     '.tsv', '.asc', '.html', '.xml',
                     '.xsd', '.htm', '.log', '.nfo',
                     '.text', '.xsl', '.py', '.r',
                     '.toml', '.yaml', '.yml']
    self.fname = pathlib.Path(fname)
    #self._ext = fname.suffix
    #text/binary decision must precede __encoding(), which consults it
    self.__istext = self.__istextfile()
    self.__text_obj = None
    with open(self.fname, 'rb') as fil:
        self.__fobj_bin = io.BytesIO(fil.read())
    #dict with keys encoding/confidence/language (chardet.detect shape)
    self.encoding = self.__encoding()
    if self.__istext:
        #decode with the detected encoding; None falls back to default
        with open(self.fname, encoding=self.encoding.get('encoding')) as f:
            self.__text_obj = io.StringIO(f.read())
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
@property
|
|
72
|
+
def hidden(self)->bool:
|
|
73
|
+
'''
|
|
74
|
+
Returns True if file is hidden (ie, startswith '.')
|
|
75
|
+
or is in in a hidden directory (ie, any directory on the path
|
|
76
|
+
starts with '.')
|
|
77
|
+
'''
|
|
78
|
+
if any([x.startswith('.') for x in self.fname.parts]):
|
|
79
|
+
return True
|
|
80
|
+
return False
|
|
81
|
+
|
|
82
|
+
def __istextfile(self):
|
|
83
|
+
'''
|
|
84
|
+
Check to see if file is a text file based on mimetype.
|
|
85
|
+
Works with extensions only which is not ideal
|
|
86
|
+
'''
|
|
87
|
+
try:
|
|
88
|
+
if ('text' in mimetypes.guess_file_type(self.fname)
|
|
89
|
+
or self.fname.suffix.lower() in self.textfiles):
|
|
90
|
+
return True
|
|
91
|
+
except AttributeError: #soft deprecation fix
|
|
92
|
+
if ('text' in mimetypes.guess_type(self.fname)
|
|
93
|
+
or self.fname.suffix.lower() in self.textfiles):
|
|
94
|
+
return True
|
|
95
|
+
|
|
96
|
+
return False
|
|
97
|
+
|
|
98
|
+
def __encoding(self) -> dict: #DONE
    '''
    Returns the most likely encoding of self.fname: a dict with keys
    encoding, confidence, language (the output of chardet.detect).

    Non-text files get a placeholder dict with encoding=None so the
    rest of the class can index the dict unconditionally.
    '''
    #chardet works on the raw bytes of the whole in-memory copy
    enc = chardet.detect(self.__fobj_bin.read())
    self.__fobj_bin.seek(0) #leave it as you found it
    if self.__istext:
        return enc
    #binary file: report no encoding rather than chardet's guess
    return {'encoding': None,
            'confidence': 0.0,
            'language' : ''}
|
|
112
|
+
|
|
113
|
+
def __del__(self) -> None:#DONE
|
|
114
|
+
'''
|
|
115
|
+
Destructor closes file
|
|
116
|
+
'''
|
|
117
|
+
self.__fobj_bin.close()
|
|
118
|
+
|
|
119
|
+
def produce_digest(self, prot: str = 'md5', blocksize: int = 2*16) -> str: #DONE
|
|
120
|
+
'''
|
|
121
|
+
Returns hex digest for object
|
|
122
|
+
|
|
123
|
+
fname : str
|
|
124
|
+
Path to a file object
|
|
125
|
+
|
|
126
|
+
prot : str
|
|
127
|
+
Hash type. Supported hashes: 'sha1', 'sha224', 'sha256',
|
|
128
|
+
'sha384', 'sha512', 'blake2b', 'blake2s', 'md5'.
|
|
129
|
+
Default: 'md5'
|
|
130
|
+
|
|
131
|
+
blocksize : int
|
|
132
|
+
Read block size in bytes
|
|
133
|
+
'''
|
|
134
|
+
ok_hash = {'sha1' : hashlib.sha1(),
|
|
135
|
+
'sha224' : hashlib.sha224(),
|
|
136
|
+
'sha256' : hashlib.sha256(),
|
|
137
|
+
'sha384' : hashlib.sha384(),
|
|
138
|
+
'sha512' : hashlib.sha512(),
|
|
139
|
+
'blake2b' : hashlib.blake2b(),
|
|
140
|
+
'blake2s' : hashlib.blake2s(),
|
|
141
|
+
'md5': hashlib.md5()}
|
|
142
|
+
|
|
143
|
+
self.__fobj_bin.seek(0)
|
|
144
|
+
try:
|
|
145
|
+
_hash = ok_hash[prot]
|
|
146
|
+
except (UnboundLocalError, KeyError):
|
|
147
|
+
message = ('Unsupported hash type. Valid values are '
|
|
148
|
+
f'{list(ok_hash)}.')
|
|
149
|
+
LOGGER.exception('Unsupported hash type. Valid values are %s', message)
|
|
150
|
+
raise
|
|
151
|
+
|
|
152
|
+
fblock = self.__fobj_bin.read(blocksize)
|
|
153
|
+
while fblock:
|
|
154
|
+
_hash.update(fblock)
|
|
155
|
+
fblock = self.__fobj_bin.read(blocksize)
|
|
156
|
+
return _hash.hexdigest()
|
|
157
|
+
|
|
158
|
+
def flat_tester(self, **kwargs) -> dict: #DONE
|
|
159
|
+
'''
|
|
160
|
+
Checks file for line length and number of records.
|
|
161
|
+
|
|
162
|
+
Returns a dictionary:
|
|
163
|
+
|
|
164
|
+
`{'min_cols': int, 'max_cols' : int, 'numrec':int, 'constant' : bool}`
|
|
165
|
+
'''
|
|
166
|
+
if not kwargs.get('flatfile'):
|
|
167
|
+
return {'min_cols': 'N/A', 'max_cols': 'N/A', 'numrec' : 'N/A',
|
|
168
|
+
'constant': 'N/A', 'encoding' : 'N/A'}
|
|
169
|
+
|
|
170
|
+
if self.fname.suffix.lower() in self.statfiles:
|
|
171
|
+
return self._flat_tester_commercial(**kwargs)
|
|
172
|
+
|
|
173
|
+
if self.__istext:
|
|
174
|
+
return self._flat_tester_txt()
|
|
175
|
+
#this should not happen but you never know
|
|
176
|
+
return {'min_cols': 'N/A', 'max_cols': 'N/A', 'numrec' : 'N/A',
|
|
177
|
+
'constant': 'N/A', 'encoding' : 'N/A'}
|
|
178
|
+
|
|
179
|
+
def _flat_tester_commercial(self, **kwargs) -> dict: #DONE
    '''
    Checks SPSS .sav, SAS .sas7bdat and Stata .dta files for rectangularity

    Returns a dictionary:

    `{'min_cols': int, 'max_cols': int, 'numrec' : int,
      'constant': True, 'encoding': str}`

    These files are by definition rectanglar, at least as checked here
    by pyreadstat/pandas, so constant will always == True.

    Side effect: overwrites self.encoding['encoding'] with the encoding
    recorded in the stat file's own metadata.
    '''
    if not kwargs.get('flatfile'):
        return {'min_cols': 'N/A', 'max_cols': 'N/A', 'numrec' : 'N/A',
                'constant': 'N/A', 'encoding': 'N/A'}
    #map extension to the matching pyreadstat reader
    options = {'.sav' : pyreadstat.read_sav,
               '.dta' : pyreadstat.read_dta,
               '.sas7bdat' : pyreadstat.read_sas7bdat}
    #pyreadstat returns (dataframe, metadata); only metadata is needed
    meta = options[self.fname.suffix.lower()](self.fname)[1]
    #the file's own metadata beats chardet's guess
    self.encoding['encoding'] = meta.file_encoding
    return {'min_cols':meta.number_columns,
            'max_cols':meta.number_columns,
            'numrec': meta.number_rows,
            'constant':True,
            'encoding': self.encoding['encoding']}
|
|
205
|
+
|
|
206
|
+
def _flat_tester_txt(self) -> dict: #DONE
    '''
    Checks a text file for line length and number of records.

    Returns a dictionary:

    `{'min_cols': int, 'max_cols' : int, 'numrec':int, 'constant' : bool,
      'encoding': str}`

    Line lengths include the trailing newline character.
    '''
    linecount = 0
    self.__text_obj.seek(0)
    if not self.__istext:
        raise TypeError('Not a text file')
    #first line read separately to seed the min/max accumulators
    maxline = len(self.__text_obj.readline())
    minline = maxline
    orig = maxline # baseline to which new values are compared
    #NOTE(review): linecount only counts lines *after* the first
    #readline(), so numrec excludes the first line of the file —
    #confirm this off-by-one is intended before relying on numrec
    for row in self.__text_obj.readlines():
        linecount += 1
        maxline = max(maxline, len(row))
        minline = min(minline, len(row))
    #constant == True means every line has the same length
    constant = bool(maxline == orig == minline)
    self.__text_obj.seek(0)
    return {'min_cols': minline, 'max_cols': maxline, 'numrec' : linecount,
            'constant': constant, 'encoding': self.encoding['encoding']}
|
|
229
|
+
|
|
230
|
+
def non_ascii_tester(self, **kwargs) -> list: #DONE
    '''
    Returns a list of dicts of positions of non-ASCII characters in a text file.

    `[{'row': int, 'col':int, 'char':str}...]`

    Row and column numbers are 1-based. NUL characters are not reported
    here because null_count() reports them separately.

    Keyword arguments:

    asctest : bool
        — When True the check is *skipped* (this mirrors the CLI's
        --no-ascii flag, whose store_true action sets asctest).
        Default False

    flatfile : bool
        — The check only runs when flatfile is truthy
    '''
    if (kwargs.get('asctest', False)
        or not self.__istext
        or not kwargs.get('flatfile')):
        return []
    outlist = []
    self.__text_obj.seek(0)
    for rown, row in enumerate(self.__text_obj):
        for coln, char in enumerate(row):
            #exclude NUL explicitly: it is counted by null_count() instead
            if char not in string.printable and char != '\x00':
                non_asc = {'row':rown+1, 'col': coln+1, 'char':char}
                outlist.append(non_asc)
    self.__text_obj.seek(0)
    return outlist
|
|
258
|
+
|
|
259
|
+
def null_count(self, **kwargs) -> dict: #DONE
|
|
260
|
+
'''
|
|
261
|
+
Returns an integer count of null characters in the file
|
|
262
|
+
('\x00') or None if skipped
|
|
263
|
+
|
|
264
|
+
Keyword arguments:
|
|
265
|
+
|
|
266
|
+
flatfile : bool
|
|
267
|
+
— Test is useless if not a text file. If False, returns 'N/A'
|
|
268
|
+
'''
|
|
269
|
+
if (not kwargs.get('flatfile')
|
|
270
|
+
or not self.__istext
|
|
271
|
+
or not kwargs.get('null_chars')):
|
|
272
|
+
return None
|
|
273
|
+
self.__text_obj.seek(0)
|
|
274
|
+
count = self.__text_obj.read().count('\x00')
|
|
275
|
+
if not count:
|
|
276
|
+
return None
|
|
277
|
+
return count
|
|
278
|
+
|
|
279
|
+
def dos(self, **kwargs) -> bool: #DONE
|
|
280
|
+
'''
|
|
281
|
+
Checks for presence of carriage returns in file
|
|
282
|
+
|
|
283
|
+
Returns True if a carriage return ie, ord(13) is present
|
|
284
|
+
|
|
285
|
+
Keyword arguments:
|
|
286
|
+
|
|
287
|
+
flatfile : bool
|
|
288
|
+
— Perform rectangularity check. If False, returns dictionary
|
|
289
|
+
with all values as 'N/A'
|
|
290
|
+
'''
|
|
291
|
+
if not kwargs.get('flatfile') or not self.__istext:
|
|
292
|
+
return None
|
|
293
|
+
self.__fobj_bin.seek(0)
|
|
294
|
+
for text in self.__fobj_bin:
|
|
295
|
+
if b'\r\n' in text:
|
|
296
|
+
return True
|
|
297
|
+
return False
|
|
298
|
+
|
|
299
|
+
def _mime_type(self, fname:pathlib.Path)->tuple:
|
|
300
|
+
'''
|
|
301
|
+
Returns mimetype or 'application/octet-stream'
|
|
302
|
+
'''
|
|
303
|
+
try:
|
|
304
|
+
out = mimetypes.guess_file_type(fname, strict=False)[0]
|
|
305
|
+
except AttributeError:
|
|
306
|
+
#soft deprecation
|
|
307
|
+
out = mimetypes.guess_type(fname)[0]
|
|
308
|
+
if not out:
|
|
309
|
+
out = 'application/octet-stream'
|
|
310
|
+
return out
|
|
311
|
+
|
|
312
|
+
def _report(self, **kwargs) -> dict: #DONE
|
|
313
|
+
'''
|
|
314
|
+
Returns a dictionary of outputs based on keywords below.
|
|
315
|
+
Performs each test and returns the appropriate values. A convenience
|
|
316
|
+
function so that you don't have to run the tests individually.
|
|
317
|
+
|
|
318
|
+
Sample output:
|
|
319
|
+
|
|
320
|
+
```
|
|
321
|
+
{'filename':'/tmp/test.csv',
|
|
322
|
+
'flat': True,
|
|
323
|
+
'min_cols': 100, 'max_cols': 100, 'numrec' : 101, 'constant': True,
|
|
324
|
+
'nonascii':False,
|
|
325
|
+
'dos':False}
|
|
326
|
+
```
|
|
327
|
+
Accepted keywords and defaults:
|
|
328
|
+
digest : str
|
|
329
|
+
— Hash algorithm. Default 'md5'
|
|
330
|
+
|
|
331
|
+
flat : bool
|
|
332
|
+
— Flat file checking.
|
|
333
|
+
|
|
334
|
+
nonascii : bool
|
|
335
|
+
— Check for non-ASCII characters.
|
|
336
|
+
|
|
337
|
+
flatfile : bool
|
|
338
|
+
— Perform rectangularity check. If False, returns dictionary
|
|
339
|
+
with all values as 'N/A'
|
|
340
|
+
|
|
341
|
+
null_chars : bool
|
|
342
|
+
- check for null characters
|
|
343
|
+
'''
|
|
344
|
+
out = {'filename': self.fname}
|
|
345
|
+
digest = kwargs.get('digest', 'md5')
|
|
346
|
+
#dos = kwargs.get('dos')
|
|
347
|
+
|
|
348
|
+
out.update({'digestType' : digest})
|
|
349
|
+
out.update({'digest' : self.produce_digest(digest)})
|
|
350
|
+
#out.update({'flat': self.flat_tester(**kwargs)})
|
|
351
|
+
out.update(self.flat_tester(**kwargs))
|
|
352
|
+
#out.update({'flat':'FFFFFFFFFFFF'})
|
|
353
|
+
out.update({'nonascii': self.non_ascii_tester(**kwargs)})
|
|
354
|
+
out.update({'encoding': self.encoding['encoding']})
|
|
355
|
+
out.update({'null_chars': self.null_count(**kwargs)})
|
|
356
|
+
out.update({'mimetype': self._mime_type(self.fname)})
|
|
357
|
+
#if dos:
|
|
358
|
+
# out.update({'dos' : self.dos(**kwargs)})
|
|
359
|
+
#else:
|
|
360
|
+
# out.update({'dos': None})
|
|
361
|
+
out.update({'dos': self.dos(**kwargs)})
|
|
362
|
+
return out
|
|
363
|
+
|
|
364
|
+
def _manifest_txt(self, **kwargs)->str:
|
|
365
|
+
'''
|
|
366
|
+
Returns manifest as plain text
|
|
367
|
+
'''
|
|
368
|
+
return '\n'.join([f'{k}: {v}' for k,v in kwargs['report'].items()
|
|
369
|
+
if v not in ['', None]])
|
|
370
|
+
|
|
371
|
+
def _manifest_json(self, **kwargs)->str:
|
|
372
|
+
'''
|
|
373
|
+
Returns manifest as JSON
|
|
374
|
+
'''
|
|
375
|
+
out = kwargs['report'].copy()
|
|
376
|
+
out['filename'] = str(kwargs['report']['filename'])
|
|
377
|
+
return json.dumps(out)
|
|
378
|
+
|
|
379
|
+
def _manifest_csv(self, **kwargs)->str:
|
|
380
|
+
'''
|
|
381
|
+
Returns manifest as [whatever]-separated value
|
|
382
|
+
'''
|
|
383
|
+
outstr = io.StringIO(newline='')
|
|
384
|
+
writer = csv.DictWriter(outstr, fieldnames=kwargs['report'].keys(),
|
|
385
|
+
delimiter=kwargs.get('sep', ','),
|
|
386
|
+
quoting=csv.QUOTE_MINIMAL)
|
|
387
|
+
if kwargs.get('headers'):
|
|
388
|
+
writer.writeheader()
|
|
389
|
+
writer.writerow(kwargs['report'])
|
|
390
|
+
outstr.seek(0)
|
|
391
|
+
return outstr.read()
|
|
392
|
+
|
|
393
|
+
def manifest(self, **kwargs) -> str: #really as str #DONE
|
|
394
|
+
'''
|
|
395
|
+
Returns desired output type as string
|
|
396
|
+
|
|
397
|
+
out : str
|
|
398
|
+
— Acceptable values are 'txt', 'json', 'csv'
|
|
399
|
+
'txt' Plain text
|
|
400
|
+
'json' JSON
|
|
401
|
+
'csv' Comma-separated value
|
|
402
|
+
|
|
403
|
+
Accepted keywords and defaults:
|
|
404
|
+
|
|
405
|
+
digest : str
|
|
406
|
+
— Hash algorithm. Default 'md5'
|
|
407
|
+
|
|
408
|
+
flat : bool
|
|
409
|
+
— Flat file checking. Default True
|
|
410
|
+
|
|
411
|
+
nonascii : bool
|
|
412
|
+
— Check for non-ASCII characters. Default True
|
|
413
|
+
|
|
414
|
+
dos : bool
|
|
415
|
+
— check for Windows CR/LF combo. Default True
|
|
416
|
+
|
|
417
|
+
flatfile : bool
|
|
418
|
+
— Perform rectangularity check. If False, returns dictionary
|
|
419
|
+
with all values as 'N/A'
|
|
420
|
+
|
|
421
|
+
headers : bool
|
|
422
|
+
— Include csv header (only has any effect with out='csv')
|
|
423
|
+
Default is False
|
|
424
|
+
|
|
425
|
+
sep: str
|
|
426
|
+
— Separator if you want a different plain text separator like a
|
|
427
|
+
tab (\t) or pipe (|). Only functional with csv output, obviously.
|
|
428
|
+
|
|
429
|
+
'''
|
|
430
|
+
report = self._report(**kwargs)
|
|
431
|
+
report_type={'txt': self._manifest_txt,
|
|
432
|
+
'json': self._manifest_json,
|
|
433
|
+
'csv': self._manifest_csv,
|
|
434
|
+
'tsv': self._manifest_csv,
|
|
435
|
+
'psv': self._manifest_csv}
|
|
436
|
+
|
|
437
|
+
try:
|
|
438
|
+
return report_type[kwargs['out']](report=report, **kwargs)
|
|
439
|
+
except KeyError:
|
|
440
|
+
LOGGER.error('Unsupported manifest type %s; defaulting to text', kwargs['out'])
|
|
441
|
+
return report_type[kwargs['out']](report=report, out='txt', **kwargs)
|
|
442
|
+
|
|
443
|
+
if __name__ == '__main__':
|
|
444
|
+
pass
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
'''
|
|
2
|
+
Manifest generator for data files.
|
|
3
|
+
|
|
4
|
+
Produces a text file with user specified checksums for all files
|
|
5
|
+
from the top of a specified tree and checks line length
|
|
6
|
+
and ASCII character status for text files.
|
|
7
|
+
|
|
8
|
+
For statistics program files:
|
|
9
|
+
SAS .sas7bdat
|
|
10
|
+
SPSS .sav
|
|
11
|
+
Stata .dta
|
|
12
|
+
|
|
13
|
+
Checker() will report number of cases and variables as
|
|
14
|
+
rows and columns respectively.
|
|
15
|
+
|
|
16
|
+
'''
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import glob #God I hate Windows
|
|
20
|
+
import json
|
|
21
|
+
import os
|
|
22
|
+
import pathlib
|
|
23
|
+
import sys
|
|
24
|
+
|
|
25
|
+
import damage
|
|
26
|
+
|
|
27
|
+
def parse() -> argparse.ArgumentParser: #DONE
    '''
    Separates argparser construction into a function.
    Returns the configured argparse.ArgumentParser.

    (The previous return annotation called ArgumentParser(), which
    instantiated a throwaway parser when the def was evaluated; the
    annotation should name the type, not call it.)
    '''
    desc = ('Produces a text, csv or JSON output with checksums for files, '
            'testing for Windows CRLF combinations, '
            'as well as checking text files for regularity and non/ASCII characters')
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('files', help='Files to check. Wildcards acceptable (eg, *)',
                        nargs='+', default=' ')
    #note 'prog' is built into argparse
    parser.add_argument('-v', '--version', action='version', version='%(prog)s '+damage.__version__,
                        help='Show version number and exit')
    parser.add_argument('-o', '--output', dest='out',
                        help='Output format. One of txt, csv, json, tsv',
                        default='txt',
                        choices = ['txt', 'csv', 'tsv', 'json'],
                        type=str.lower)
    parser.add_argument('-n', '--no-flat', action='store_false', dest='flatfile',
                        help="Don't check text files for rectangularity")
    parser.add_argument('-r', '--recursive', action='store_true', dest='recur',
                        help='Recursive *directory* processing of file tree. Assumes that the '
                        'arguments point to a directory (eg, tmp/), and a slash will '
                        'be appended if one does not exist')
    parser.add_argument('-t', '--hash-type', dest='digest', default='md5',
                        help="Checksum hash type. Supported hashes: 'sha1', "
                        "'sha224', 'sha256', 'sha384', 'sha512', 'blake2b', "
                        "'blake2s', 'md5'. Default: 'md5'",
                        choices = ['md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512',
                                   'blake2b', 'blake2s'],
                        type=str.lower)
    #store_true: passing --no-ascii sets asctest, which *skips* the check
    parser.add_argument('-a', '--no-ascii', action='store_true', dest='asctest',
                        help="Don't check text files for non-ASCII characters")
    parser.add_argument('-f', '--to-file',
                        help='Output to -f [file] instead of stdout')
    return parser
|
|
63
|
+
|
|
64
|
+
def recurse_files(inlist) -> list:
    '''
    Walks each directory in inlist and returns a list of pathlib.Path
    objects for every file found, hidden files included.

    (The previous annotation and docstring claimed a map object was
    returned; the function has always built and returned a list.)

    inlist : iterable of str or pathlib.Path
        Directories to walk recursively
    '''
    outlist = []
    for flist in inlist:
        rec = os.walk(flist)
        #x[0] is the directory path, x[2] the filenames within it
        outlist += [pathlib.Path(x[0], y) for x in rec for y in x[2]]
    return outlist #includes hidden files
|
|
73
|
+
|
|
74
|
+
def main(): #pylint: disable=too-many-branches
    '''
    Main function to output manifests to stdout (or to a file when
    --to-file is given). Entry point for the damage console script.
    '''
    #per-format field delimiters for csv-style output
    separator_types = {'csv': ',', 'tsv': '\t'}
    #Purely for formatting output
    line_spacer = {'txt':'\n\n', 'csv':'', 'tsv': ''}
    parser = parse()
    args = parser.parse_args()
    if not args.recur:
        #Windows does not do wildcard expansion at the shell level
        if sys.platform.startswith('win'): #Maybe they will have win64 sometime:
            files = map(pathlib.Path, [y for x in args.files for y in glob.glob(x)])
        else:
            files = map(pathlib.Path, list(args.files))
    else:
        files = recurse_files(args.files)


    output = []
    try: #any failure below aborts the whole run with a short message
        for num, fil in enumerate(files):
            #skip directories and vanished paths
            #(is_file() is already False for nonexistent paths, so the
            #exists() test looks redundant — TODO confirm and simplify)
            if not fil.is_file() or not fil.exists():
                continue
            testme = damage.Checker(fil)
            #csv/tsv: emit the header row with the first manifest only.
            #NOTE(review): if file 0 is skipped above, num 0 is consumed
            #and no header row is ever written — confirm intended
            if args.out in separator_types and num == 0:
                output.append(testme.manifest(headers=True,
                                              sep=separator_types.get(args.out),
                                              **vars(args)))
            else:
                output.append(testme.manifest(sep=separator_types.get(args.out),
                                              **vars(args)))
        if not args.out == 'json':
            out_info =line_spacer[args.out].join(output).strip()
        else:
            #wrap the per-file JSON objects into one {"files": [...]} doc
            outjson = ('{"files" :' +
                       '[' + ','.join(output) + ']'
                       + '}')
            out_info = json.dumps(json.loads(outjson)) #validate
    except Exception as err: #pylint: disable=broad-exception-caught
        print(f'Abnormal program termination {err}')
        sys.exit()

    if args.to_file:
        with open(pathlib.Path(args.to_file), mode='w',
                  encoding='utf-8') as outf:
            outf.write(out_info)
    else:
        print(out_info)
|
|
124
|
+
|
|
125
|
+
if __name__ == '__main__':
|
|
126
|
+
main()
|
|
Binary file
|
|
Binary file
|