pdfh 0.1.8 → 0.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +37 -12
- data/.rubocop.yml +16 -2
- data/.rubocop_todo.yml +133 -3
- data/.ruby-version +1 -1
- data/CHANGELOG.md +3 -0
- data/Gemfile +14 -3
- data/Gemfile.lock +52 -36
- data/README.md +9 -0
- data/Rakefile +6 -6
- data/bin/console +6 -6
- data/exe/pdfh +18 -8
- data/lib/ext/string.rb +1 -1
- data/lib/pdfh.rb +17 -17
- data/lib/pdfh/document.rb +24 -24
- data/lib/pdfh/pdf_handler.rb +1 -1
- data/lib/pdfh/settings.rb +7 -7
- data/lib/pdfh/utils.rb +2 -2
- data/lib/pdfh/version.rb +1 -1
- data/pdfh.gemspec +22 -31
- metadata +3 -117
- data/.travis.yml +0 -7
- data/docs/legacy.md +0 -453
data/.travis.yml
DELETED
data/docs/legacy.md
DELETED
@@ -1,453 +0,0 @@
|
|
1
|
-
# Legacy
|
2
|
-
|
3
|
-
## Python
|
4
|
-
|
5
|
-
This project was born as a bash script. It was initially ported to a Python script,
|
6
|
-
and ended up as a Ruby gem. Below is the old Python code, provided just for fun.
|
7
|
-
|
8
|
-
```python
|
9
|
-
#!/usr/bin/env python3
|
10
|
-
"""Organize PDF protected password files, using rules defined in yaml format."""
|
11
|
-
from __future__ import print_function
|
12
|
-
import os
|
13
|
-
import re
|
14
|
-
import base64
|
15
|
-
import pprint
|
16
|
-
import argparse
|
17
|
-
import tempfile
|
18
|
-
import subprocess
|
19
|
-
import yaml
|
20
|
-
from shutil import copyfile
|
21
|
-
from colorama import Fore
|
22
|
-
|
23
|
-
# Module-wide switches; main() turns them on from the command-line options.
IS_VERBOSE = False
IS_DRY = False
# TODO: calendar.month_name[11] current locale
# Spanish month names mapped to their calendar number (1-12).
MONTHS = {
    'enero': 1,
    'febrero': 2,
    'marzo': 3,
    'abril': 4,
    'mayo': 5,
    'junio': 6,
    'julio': 7,
    'agosto': 8,
    'septiembre': 9,
    'octubre': 10,
    'noviembre': 11,
    'diciembre': 12,
}
|
40
|
-
|
41
|
-
class InlineClass(object):
    """Wrapper that exposes the keys of a dictionary as attributes."""

    def __init__(self, dict):
        # The mapping itself becomes the instance namespace (it is not
        # copied), so later changes to it are visible through the wrapper.
        # The parameter keeps its original name for keyword callers, even
        # though it shadows the builtin inside this method.
        self.__dict__ = dict

    def has_key(self, key):
        """Return True when *key* is one of the wrapped dictionary's keys."""
        return key in self.__dict__
|
47
|
-
|
48
|
-
def get_month_num(num):
    """Return the month name for month number *num* in the es_MX locale.

    Args:
        num: Month number used to index ``calendar.month_name`` (1-12;
            index 0 is the empty string).

    Returns:
        The localized month name.

    Raises:
        locale.Error: If the es_MX locale is not available on this system.
    """
    # Not wired into the rest of the script yet.
    import calendar
    import locale

    # Only the time category affects month names. Remember the current
    # setting so the rest of the process is not left running under es_MX.
    previous = locale.setlocale(locale.LC_TIME)
    try:
        locale.setlocale(locale.LC_TIME, 'es_MX')
        return calendar.month_name[num]
    finally:
        locale.setlocale(locale.LC_TIME, previous)
|
54
|
-
|
55
|
-
class Document(object):
    """Handles the PDF detected by the rules, and makes transformations.

    On construction the PDF is decrypted with ``qpdf`` into a temporary
    file, its text is extracted with ``pdftotext``, and the statement
    month/year are pulled out with the account's ``re_date`` regex.
    """

    def __init__(self, file, account, **kwargs):
        """Read *file* and extract its period using the rules in *account*.

        Args:
            file: Path of the (possibly password protected) PDF.
            account: Object exposing ``name``, ``pwd`` (base64 text or
                empty), ``re_date`` and ``has_key()``; optionally ``types``,
                ``name_template``, ``store_path`` and ``base_path``.
            **kwargs: ``verbose`` (bool) enables debug output; it defaults
                to False when omitted.

        Raises:
            IOError: If *file* does not exist.
            OSError: If ``qpdf`` or ``pdftotext`` cannot be launched.
            ValueError: If ``re_date`` matches but has fewer than two groups.
            SystemExit: If ``re_date`` does not match the extracted text.
        """
        self._file = file
        self._act = account
        self._extra = ''
        self._has_xml = False
        self._verbose = kwargs.get('verbose', False)
        verbose = self._verbose
        if verbose:
            print(Fore.CYAN + account.name, '==================' + Fore.RESET)

        self._pwd = base64.b64decode(self._act.pwd) if self._act.pwd else ''
        if isinstance(self._pwd, bytes):
            self._pwd = self._pwd.decode()

        if not os.path.exists(self._file):
            raise IOError("I can't find the PDF")

        # Check if an additional XML file exists next to the PDF
        self._xml_file = os.path.splitext(self._file)[0] + '.xml'
        if os.path.exists(self._xml_file):
            self._has_xml = True

        self._text = self._extract_text()
        if verbose:
            print(Fore.CYAN + self._text + Fore.RESET)

        match = re.search(self._act.re_date, self._text, re.MULTILINE)
        if not match:
            print(Fore.RED, 'Err, date was not extracted with regex provided: ' + Fore.LIGHTRED_EX +
                  self._act.re_date + Fore.RESET)
            raise SystemExit(1)
        groups = match.groups()
        if verbose:
            print(Fore.CYAN, '==== Regex Groups:', groups, Fore.RESET)
        try:
            self._month = match.group('m')
            self._year = match.group('y')
        except IndexError:
            # No named groups 'm'/'y': fall back to positional groups.
            if len(groups) < 2:
                raise ValueError('re_date must capture the month and the year.')
            self._month, self._year = groups[0], groups[1]

        if len(groups) > 2:
            # An optional third group that did not participate is None.
            self._extra = groups[2] or ''

        self._month = self._month.lower()
        if verbose:
            print(Fore.CYAN, '==== Assigned:', (self._month, self._year, self._extra),
                  '==( Month, Year, Extra )================' + Fore.RESET)

        # Defaults first, so both attributes always exist even when the
        # account defines types but none of them matches the text.
        self.type = None
        self.offset = 0
        if self._act.has_key('types'):
            for t in self._act.types:
                name = t['name']
                if re.search(name, self._text, re.IGNORECASE):
                    self.type = name
                    self.offset = t.get('month_offset', 0)

        if verbose:
            print(Fore.CYAN, 'Offset settings, Type:', self.type, '/ Month:', self.offset, Fore.RESET)
            print(Fore.CYAN, 'END INIT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + Fore.RESET)

    def _extract_text(self):
        """Decrypt the PDF into a private temporary file and return its text."""
        # A private directory avoids the race inherent to tempfile.mktemp().
        tmp_dir = tempfile.mkdtemp()
        self._tmp = os.path.join(tmp_dir, 'decrypted.pdf')
        if self._verbose:
            print(Fore.CYAN + ' --> ' + self._tmp + ' temporal file assigned.' + Fore.RESET)
        try:
            # Argument lists (no shell) keep quotes or spaces in the
            # password and paths from breaking the command line.
            subprocess.call(['qpdf', '--password=' + self._pwd, '--decrypt',
                             '--stream-data=uncompress', self._file, self._tmp])
            p = subprocess.Popen(['pdftotext', '-enc', 'UTF-8', self._tmp, '-'],
                                 stdout=subprocess.PIPE)
            text, _err = p.communicate()
        finally:
            # Do not leave the decrypted copy behind.
            if os.path.exists(self._tmp):
                os.remove(self._tmp)
            os.rmdir(tmp_dir)
        if isinstance(text, bytes):
            text = text.decode(encoding="utf-8", errors="replace")
        return text

    def __repr__(self):
        type_str = self.type if self.type else 'N/A'
        format_string = 'Name     : {}\nType     : {}\nPeriod   : {}\nFile Path: {}\n' + \
                        'File Name: {}\nNew Name : {}\nStorePath: {}\nFullPath : {}'
        return format_string.format(
            self.name, type_str, self.period, self._file,
            self.filename_only, self.new_name, self.store_path, self.full_path)

    def write_pdf(self):
        """Write the decrypted PDF to ``full_path`` and back up the original.

        The source PDF (and its sibling XML, when present) is renamed with
        a trailing ``_`` once the new file exists.

        Raises:
            IOError: If the destination directory does not exist or the
                decrypted file was not created.
        """
        full_path = self.full_path
        dir_path = os.path.dirname(full_path)
        if not os.path.exists(dir_path):
            raise IOError("I can't find the store_path")

        subprocess.call(['qpdf', '--password=' + self._pwd, '--decrypt',
                         self._file, full_path])

        if not os.path.exists(full_path):
            raise IOError("The file was not created.")

        os.rename(self._file, self._file + '_')
        # Copy XML File if exists
        if self._has_xml:
            xml_new_path = os.path.splitext(full_path)[0] + '.xml'
            copyfile(self._xml_file, xml_new_path)
            os.rename(self._xml_file, self._xml_file + '_')
            if self._verbose:
                print(Fore.CYAN, 'XML Written: ', xml_new_path, Fore.RESET)

    def _month_number(self):
        """Return the extracted month as an int, before any offset.

        Accepts a number, a full Spanish month name or its first three
        letters. Raises KeyError for an unknown name.
        """
        try:
            return int(self._month)
        except ValueError:
            pass
        if len(self._month) == 3:
            for month_name, number in MONTHS.items():
                if month_name[0:3] == self._month:
                    return number
            raise KeyError(self._month)
        return MONTHS[self._month]

    def _offset_month_index(self):
        """Return the zero-based month index after applying the offset.

        The result may fall outside 0-11; callers use ``// 12`` for the
        year carry and ``% 12`` for the month, so any offset wraps
        correctly across year boundaries.
        """
        return self._month_number() - 1 + self.offset

    @property
    def name(self):
        return self._act.name

    @property
    def filename_only(self):
        """File name of the source PDF without directory or extension."""
        base_name = os.path.basename(self._file)
        return os.path.splitext(base_name)[0]

    @property
    def text(self):
        return self._text

    @property
    def month(self):
        """Two-digit month of the period, with the type's offset applied."""
        if self.offset:
            number = self._offset_month_index() % 12 + 1
        else:
            number = self._month_number()
        return str(number).zfill(2)

    @property
    def year(self):
        """Four-digit year of the period, adjusted when the offset wraps."""
        # Two-digit years are assumed to belong to the 2000s.
        year = int('20' + self._year if len(self._year) == 2 else self._year)
        if self.offset:
            # Computed here rather than cached by `month`, so the result
            # does not depend on which property is read first.
            year += self._offset_month_index() // 12
        return str(year)

    @property
    def period(self):
        return "{}-{}".format(self.year, self.month)

    @property
    def new_name(self):
        """Destination file name built from the account's name template."""
        if self._act.has_key('name_template'):
            template = self._act.name_template
        else:
            template = '{original}'

        type_name = self.type if self.type else 'NA'
        new = template \
            .replace('{original}', self.filename_only) \
            .replace('{period}', self.period) \
            .replace('{type}', type_name) \
            .replace('{extra}', self._extra)
        return new + '.pdf'

    @property
    def store_path(self):
        return self._act.store_path.replace('{YEAR}', self.year)

    @property
    def full_path(self):
        """Absolute destination path: base_path / store_path / new_name."""
        relative = self.store_path
        # Drop one leading slash so os.path.join() keeps the base path.
        if relative.startswith('/'):
            relative = relative[1:]
        base = os.path.abspath(os.path.expanduser(self._act.base_path))
        return os.path.join(base, relative, self.new_name)
|
234
|
-
|
235
|
-
class Settings(object):
    """Open the rules YAML file and expose its top-level keys as attributes."""

    def __init__(self):
        """Locate and load the configuration file.

        The file is named after this script with a ``.yml`` extension and
        is looked up first next to the script, then in the home directory.

        Raises:
            SystemExit: If no configuration file is found.
        """
        # Swap only the extension; a plain str.replace('py', 'yml') would
        # also rewrite any other "py" inside the file name.
        name = os.path.splitext(os.path.basename(__file__))[0] + '.yml'
        search_dirs = [os.path.dirname(__file__), os.path.expanduser('~')]
        # A real list (not a one-shot map iterator) so the candidates can
        # still be listed in the error message after the search.
        paths = [os.path.join(directory, name) for directory in search_dirs]

        conf_path = next((path for path in paths if os.path.isfile(path)), None)
        if conf_path is None:
            print('{}Error, no configuraton file was found: {}{}{}'
                  .format(Fore.RED, Fore.MAGENTA, ', '.join(paths), Fore.RESET))
            raise SystemExit(1)

        if IS_VERBOSE:
            print("Loaded configuration file: {}{}{}"
                  .format(Fore.GREEN, conf_path, Fore.RESET))
        with open(conf_path, 'r') as fsettings:
            # NOTE(review): the original called yaml.load() without a
            # Loader, which current PyYAML rejects and which can build
            # arbitrary objects. safe_load() is used on the assumption
            # that the rules file holds only plain mappings/lists/scalars.
            self.__dict__ = yaml.safe_load(fsettings) or {}

    def print(self):
        """Pretty-print every loaded setting."""
        pp = pprint.PrettyPrinter(indent=2)
        pp.pprint(self.__dict__)

    def getAccount(self, file_name):
        """Return the first account whose ``re_file`` regex matches *file_name*.

        The matching account dictionary is given a ``base_path`` entry and
        wrapped in an InlineClass. Returns None when no account matches.
        """
        for act in self.accounts:
            if re.search(act['re_file'], file_name) is not None:
                act['base_path'] = self.base_path
                return InlineClass(act)
        return None

    def getScrapeDirectories(self):
        """Yield the absolute path of every configured scrape directory.

        In verbose mode the directories are listed before the first path
        is yielded.
        """
        directories = [(directory, os.path.abspath(os.path.expanduser(directory)))
                       for directory in self.scrape_dirs]

        if IS_VERBOSE:
            # Fall back to 0 so an empty scrape_dirs list does not raise.
            max_length = max([len(directory) for directory, _ in directories] or [0])
            print('Processing directories:')
            for directory, path in directories:
                print_ident(directory, path, color=Fore.BLUE, field_width=max_length)
            print()

        for _, path in directories:
            yield path
|
287
|
-
|
288
|
-
def get_files(directory=None):
    """Yield the full path of every ``.pdf`` file found in a directory.

    Args:
        directory: Directory to scan (not recursive). When None, the
            directory containing this script is scanned.

    Yields:
        Paths joined onto the scanned directory, in sorted name order so
        runs are reproducible (os.listdir() order is arbitrary).
    """
    path = os.path.dirname(os.path.abspath(__file__)) if directory is None else directory
    for pdffile in sorted(os.listdir(path)):
        if pdffile.endswith(".pdf"):
            yield os.path.join(path, pdffile)
|
294
|
-
|
295
|
-
def print_ident(field, value, **kwargs):
    """Print a right-aligned field label followed by a colored value.

    Args:
        field: The label; it is right-aligned within ``field_width``
            columns (callers in this script pass strings and ints).
        value: The value to print after the label.
        color: Keyword only. Color prefix written before the value;
            defaults to ``Fore.GREEN``.
        field_width (int): Keyword only. Width used to align the label;
            defaults to 7.

    Returns:
        None: No value is returned.
    """
    color = kwargs.get('color', Fore.GREEN)
    field_width = kwargs.get('field_width', 7)
    string_format = ' {:>' + str(field_width) + '}: {}{}{}'
    print(string_format.format(field, color, value, Fore.RESET))
|
311
|
-
|
312
|
-
def print_separator(title, color=Fore.LIGHTYELLOW_EX):
    """Print a dashed separator line containing *title*.

    The line starts with 40 dashes and the title, and is padded with
    dashes up to the terminal width when there is room left.

    Args:
        title: Text shown inside the separator.
        color: Color prefix written before the line; the color is reset
            at the end of the line.
    """
    # `from shutil import copyfile` at file level does not bind `shutil`.
    import shutil

    # Unlike parsing `stty size`, this also works when output is not a
    # terminal (it falls back to a default size instead of failing).
    cols = shutil.get_terminal_size().columns

    # Measure only the visible text: the leading newline and the color
    # escape sequence do not take up columns on screen.
    line = '-' * 40 + ' ' + title + ' '
    remaining_cols = cols - len(line)
    if remaining_cols > 0:
        line += '-' * remaining_cols
    print('\n' + color + line + Fore.RESET)
|
321
|
-
|
322
|
-
|
323
|
-
def main():
    """Parse the command line and process every PDF in the scrape directories.

    For each directory reported by the settings, every PDF is matched to
    an account, its period is shown and, unless running dry, the decrypted
    copy is written. Files that raise ValueError (including those with no
    matching account) are listed at the end of each directory.
    """
    global IS_DRY, IS_VERBOSE

    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--dry",
                        action="store_true",
                        help="Dry run, does not write new pdf")
    parser.add_argument("-v", "--verbose",
                        action="store_true",
                        help="Show more output, useful for debug")
    args = parser.parse_args()

    if args.dry:
        IS_DRY = True
        print(Fore.CYAN + "Running in dry mode..." + Fore.RESET)
    if args.verbose:
        IS_VERBOSE = True
        print(Fore.CYAN + "Running in verbose mode..." + Fore.RESET)

    settings = Settings()

    for work_directory in settings.getScrapeDirectories():
        print_separator(work_directory)
        ignored_files = []
        for pdffile in get_files(work_directory):
            # Bound before the try so both handlers can always use it.
            base = os.path.basename(pdffile)
            try:
                act = settings.getAccount(pdffile)
                if not act:
                    raise ValueError('no account was matched.')
                print('Working on' + Fore.LIGHTGREEN_EX, base, Fore.RESET)
                print_ident(' Cuenta', act.name, color=Fore.LIGHTBLUE_EX)
                doc = Document(pdffile, act, verbose=IS_VERBOSE)
                print_ident('Periodo', doc.period)
                if IS_VERBOSE:
                    print(Fore.CYAN, doc, Fore.RESET)
                if not IS_DRY:
                    doc.write_pdf()
                    print_ident('NewFile', doc.full_path)
            except ValueError:
                ignored_files.append(base)
            except IOError as e:
                # Report the actual failure. `doc` may not exist yet (or
                # may belong to a previous file) when Document() raised.
                print('Error, {} could not be processed: {}'.format(base, e))

        if ignored_files:
            print('\nNo account was matched for these PDF files:')
            for num, path in enumerate(ignored_files, start=1):
                print_ident(num, path, color=Fore.RED, field_width=3)


if __name__ == '__main__':
    main()
|
378
|
-
|
379
|
-
```
|
380
|
-
|
381
|
-
## Bash
|
382
|
-
|
383
|
-
```bash
|
384
|
-
#!/usr/bin/env bash
# Decrypt the password-protected statement PDFs in the current directory,
# rename each one to "<year>-<month number> <Account>.pdf" and copy the
# result to the yearly folder.
#
# NOTE(review): app_installed, convert_month and the color variables
# (RED, GRE, BLU, RST) are presumably provided by .common - confirm.
. .common

YEAR=$(date +%Y)
PASS=abcdef
GREP_PERIOD='al [0-9]{1,2} de ([A-Za-z]*) de.? [0-9]+'
#Path to move, Dropbox. Use "{YEAR}" to replace with actual year
MVTO=../"Impuestos/FISCAL-{YEAR}/Edo Cuenta"

app_installed qpdf

# Collect the candidates with the same glob the loop uses, so the
# "nothing to do" check and the loop can never disagree.
shopt -s nullglob
pdfs=([!2]*.pdf)
shopt -u nullglob
if [ "${#pdfs[@]}" -eq 0 ]; then
  echo -e "${RED}Error, no pdf files found.${RST}"
  exit 1
fi

for pdf in "${pdfs[@]}"; do
  if [ ! -r "$pdf" ]; then
    echo -e "${RED}Error, can't access $pdf${RST}"
    exit 1
  fi
  echo -e "Working on ${GRE}$pdf${RST}..."

  # Decrypt PDF and uncompress to work with it
  temp=$(mktemp)
  qpdf --password="$PASS" --decrypt --stream-data=uncompress "$pdf" "$temp"

  # Extract Data from PDF
  account=$(strings "$temp" | grep -ioE 'platinum|perfiles' | head -1)
  account=${account,,}
  account=${account^}
  echo -e " account: ${BLU}$account${RST}"
  period=$(pdftotext "$temp" - | grep -iEo "$GREP_PERIOD" | tail -1 )
  # The decrypted copy is no longer needed; do not leave it behind.
  rm -f "$temp"
  month=$(echo "$period" | awk '{print $4}')
  year=$(echo "$period" | awk '{print $6}')
  period=${month,,}

  if [ -z "$period" ]; then
    echo -e "${RED}Error, period not found.${RST}"
    exit 1
  fi

  number=$(convert_month "$period")
  if [ "$account" == "Perfiles" ]; then
    # 10# forces base 10 so "08" and "09" are not parsed as octal.
    number=$(( 10#$number - 1 ))
    if [ "$number" -eq 0 ]; then
      # January minus one month is December of the previous year.
      number=12
      year=$(( year - 1 ))
    fi
    printf -v number '%02d' "$number"
  fi
  echo -e " period: ${BLU}$year-$period${RST}"

  #Prepare new PDF
  newfile="$year-${number} ${account}.pdf"
  qpdf --password="$PASS" --decrypt "$pdf" "$newfile"
  if [ -f "$newfile" ]; then
    mv "$pdf" "${newfile/.pdf/}_$pdf"
    echo -e " new file: ${BLU}$newfile${RST}"

    #Copy it
    # Expand the template into a per-file variable; overwriting MVTO
    # would pin every later file to the first file's year.
    dest="${MVTO//\{YEAR\}/$year}"
    if [ -d "$dest" ]; then
      cp -v "$newfile" "$dest"
    fi
  else
    echo -e "${RED}Error, $newfile was not created.${RST}"
  fi
done
|
453
|
-
```
|