pdfh 0.1.8 → 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.travis.yml DELETED
@@ -1,7 +0,0 @@
1
- ---
2
- sudo: false
3
- language: ruby
4
- cache: bundler
5
- rvm:
6
- - 2.7.0
7
- before_install: gem install bundler -v 1.17.2
data/docs/legacy.md DELETED
@@ -1,453 +0,0 @@
1
- # Legacy
2
-
3
- ## Python
4
-
5
- This project was born as a Bash script. It was first ported to a Python script,
6
- and ended up as a Ruby gem. Below is the old Python code, provided just for fun.
7
-
8
- ```python
9
- #!/usr/bin/env python3
10
- """Organize PDF protected password files, using rules defined in yaml format."""
11
- from __future__ import print_function
12
- import os
13
- import re
14
- import base64
15
- import pprint
16
- import argparse
17
- import tempfile
18
- import subprocess
19
- import yaml
20
- from shutil import copyfile
21
- from colorama import Fore
22
-
23
- IS_VERBOSE = False
24
- IS_DRY = False
25
- # TODO: calendar.month_name[11] current locale
26
- MONTHS = dict(
27
- enero = 1,
28
- febrero = 2,
29
- marzo = 3,
30
- abril = 4,
31
- mayo = 5,
32
- junio = 6,
33
- julio = 7,
34
- agosto = 8,
35
- septiembre = 9,
36
- octubre = 10,
37
- noviembre = 11,
38
- diciembre = 12
39
- )
40
-
41
- class InlineClass(object):
42
- """Wrapper to have an object like dictionary"""
43
- def __init__(self, dict):
44
- self.__dict__ = dict
45
- def has_key(self, key):
46
- return key in self.__dict__.keys()
47
-
48
- def get_month_num(num):
49
- # Not implemented yet
50
- import locale
51
- locale.setlocale(locale.LC_ALL, 'es_MX')
52
- import calendar
53
- calendar.month_name[num]
54
-
55
- class Document(object):
56
- """Handles the PDF detected by the rules, and makes tranformations"""
57
- def __init__(self, file, account, **kwargs):
58
- self._file = file
59
- self._act = account
60
- self._extra = ''
61
- self._has_xml = False
62
- self._verbose = kwargs['verbose']
63
- verbose = self._verbose
64
- if verbose:
65
- print(Fore.CYAN + account.name, '==================' + Fore.RESET)
66
-
67
- self._pwd = base64.b64decode(self._act.pwd) if self._act.pwd else ''
68
- if type(self._pwd) is bytes:
69
- self._pwd = self._pwd.decode()
70
-
71
- if not os.path.exists(self._file):
72
- raise IOError("I can't find the PDF")
73
-
74
- # Check if additional XML file exists
75
- self._xml_file = os.path.splitext(self._file)[0]+'.xml'
76
- if os.path.exists(self._xml_file):
77
- self._has_xml = True
78
-
79
- self._tmp = tempfile.mktemp(suffix=".pdf")
80
- if verbose:
81
- print(Fore.CYAN + ' --> ' + self._tmp + ' temporal file assigned.' + Fore.RESET)
82
-
83
- cmd1 = "qpdf --password='{}' --decrypt --stream-data=uncompress '{}' '{}'" \
84
- .format(self._pwd, self._file, self._tmp)
85
- subprocess.call(cmd1, shell=True)
86
-
87
- cmd2 = "pdftotext -enc UTF-8 '{}' -".format(self._tmp)
88
-
89
- p = subprocess.Popen(cmd2, stdout=subprocess.PIPE, shell=True)
90
- self._text, _err = p.communicate()
91
- if type(self._text) is bytes:
92
- self._text = self._text.decode(encoding="utf-8", errors="replace")
93
- if verbose:
94
- print(Fore.CYAN + self._text + Fore.RESET)
95
-
96
- match = re.search(self._act.re_date, self._text, re.MULTILINE)
97
- if not match:
98
- print(Fore.RED, 'Err, date was not extracted with regex provided: ' + Fore.LIGHTRED_EX +
99
- self._act.re_date + Fore.RESET)
100
- exit(1)
101
- if verbose:
102
- print(Fore.CYAN, '==== Regex Groups:', match.groups(), Fore.RESET)
103
- try:
104
- self._month = match.group('m')
105
- self._year = match.group('y')
106
- except IndexError:
107
- self._month, self._year = match.groups()
108
-
109
- if len(match.groups()) > 2:
110
- self._extra = match.group(3)
111
-
112
- self._month = self._month.lower()
113
- if verbose:
114
- print(Fore.CYAN, '==== Assigned:', (self._month, self._year, self._extra),
115
- '==( Month, Year, Extra )================' + Fore.RESET)
116
-
117
- if self._act.has_key('types'):
118
- for t in self._act.types:
119
- name = t['name']
120
- if re.search(name, self._text, re.IGNORECASE):
121
- self.type = name
122
- self.offset = t.get('month_offset', 0)
123
- else:
124
- self.type = None
125
- self.offset = 0
126
-
127
- if verbose:
128
- print(Fore.CYAN, 'Offset settings, Type:', self.type, '/ Month:', self.offset, Fore.RESET)
129
- #Used if the month offset results in change in year.
130
- self._year_offset = 0
131
- if verbose:
132
- print(Fore.CYAN, 'END INIT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~' + Fore.RESET)
133
-
134
- def __repr__(self):
135
- type_str = self.type if self.type else 'N/A'
136
- format_string = 'Name : {}\nType : {}\nPeriod : {}\nFile Path: {}\n'+\
137
- 'File Name: {}\nNew Name : {}\nStorePath: {}\nFullPath : {}'
138
- return format_string.format(
139
- self.name, type_str, self.period, self._file,
140
- self.filename_only, self.new_name, self.store_path, self.full_path)
141
-
142
- def write_pdf(self):
143
- dir_path = os.path.dirname(self.full_path)
144
- if not os.path.exists(dir_path):
145
- raise IOError("I can't find the store_path")
146
-
147
- cmd = "qpdf --password='{}' --decrypt '{}' '{}'" \
148
- .format(self._pwd, self._file, self.full_path)
149
- subprocess.call(cmd, shell=True)
150
-
151
- if os.path.exists(self.full_path):
152
- bkp = self._file + '_'
153
- os.rename(self._file, bkp)
154
- # Copy XML File if exists
155
- if self._has_xml:
156
- xml_new_path = os.path.splitext(self.full_path)[0]+'.xml'
157
- copyfile(self._xml_file, xml_new_path)
158
- xml_bkp = self._xml_file + '_'
159
- os.rename(self._xml_file, xml_bkp)
160
- if self._verbose:
161
- print(Fore.CYAN, 'XML Written: ', xml_new_path, Fore.RESET)
162
- else:
163
- raise IOError("The file was not created.")
164
-
165
- @property
166
- def name(self): return self._act.name
167
- @property
168
- def filename_only(self):
169
- dir, file = os.path.split(self._file)
170
- filename, ext = os.path.splitext(file)
171
- return filename
172
- @property
173
- def text(self): return self._text
174
- @property
175
- def month(self):
176
- try:
177
- month_num = int(self._month)
178
- except:
179
- if len(self._month) == 3:
180
- for month in MONTHS:
181
- if month[0:3] == self._month:
182
- month_num = MONTHS[month]
183
- else:
184
- month_num = MONTHS[self._month]
185
-
186
-
187
- if self.offset:
188
- tmp = month_num + self.offset
189
- if tmp == 0:
190
- tmp = 12
191
- self._year_offset = -1
192
- elif tmp == 13:
193
- tmp = 1
194
- self._year_offset = 1
195
- else:
196
- tmp = month_num
197
- return str(tmp).zfill(2)
198
- @property
199
- def year(self):
200
- if len(self._year) == 2:
201
- tmp = '20' + self._year
202
- else:
203
- tmp = self._year
204
- year = int(tmp) + self._year_offset
205
-
206
- return str(year)
207
- @property
208
- def period(self): return "{}-{}".format(self.year, self.month)
209
- @property
210
- def new_name(self):
211
- if self._act.has_key('name_template'):
212
- template = self._act.name_template
213
- else:
214
- template = '{original}'
215
-
216
- type = self.type if self.type else 'NA'
217
- new = template \
218
- .replace('{original}', self.filename_only) \
219
- .replace('{period}', self.period) \
220
- .replace('{type}', type) \
221
- .replace('{extra}', self._extra)
222
- return new + '.pdf'
223
- @property
224
- def store_path(self):
225
- tmp = self._act.store_path.replace('{YEAR}', self.year)
226
- return tmp
227
- @property
228
- def full_path(self):
229
- tmp = self.store_path
230
- tmp = tmp if tmp[0] != '/' else tmp[1:]
231
- base = os.path.expanduser(self._act.base_path)
232
- base = os.path.abspath(base)
233
- return os.path.join(base, tmp, self.new_name)
234
-
235
- class Settings(object):
236
- """Open the rules YAML file"""
237
- def __init__(self):
238
- name = os.path.basename(__file__).replace('py', 'yml')
239
- dir_oder = []
240
- dir_oder.append(os.path.dirname(__file__))
241
- dir_oder.append(os.path.expanduser('~'))
242
-
243
- paths = map(lambda x: os.path.join(x, name), dir_oder)
244
-
245
- for path in paths:
246
- if os.path.isfile(path):
247
- conf_path = path
248
- break
249
-
250
- if 'conf_path' not in locals():
251
- print('{}Error, no configuraton file was found: {}{}{}'
252
- .format(Fore.RED, Fore.MAGENTA, ', '.join(paths), Fore.RESET))
253
- exit(1)
254
-
255
- fsettings = open(conf_path, 'r')
256
- if IS_VERBOSE:
257
- print("Loaded configuration file: {}{}{}"
258
- .format(Fore.GREEN, conf_path, Fore.RESET))
259
- self.__dict__ = yaml.load(fsettings)
260
-
261
- def print(self):
262
- pp = pprint.PrettyPrinter(indent=2)
263
- pp.pprint(self.__dict__)
264
-
265
- def getAccount(self, file_name):
266
- for act in self.accounts:
267
- srch = re.search(act['re_file'], file_name)
268
- if srch != None:
269
- act['base_path'] = self.base_path
270
- return InlineClass(act)
271
-
272
- def getScrapeDirectories(self):
273
- max_length = len(max(self.scrape_dirs, key=len))
274
-
275
- if IS_VERBOSE:
276
- print('Processing directories:')
277
- for directory in self.scrape_dirs:
278
- path = os.path.expanduser(directory)
279
- path = os.path.abspath(path)
280
- print_ident(directory, path, color=Fore.BLUE, field_width=max_length)
281
- print()
282
-
283
- for directory in self.scrape_dirs:
284
- path = os.path.expanduser(directory)
285
- path = os.path.abspath(path)
286
- yield path
287
-
288
- def get_files(directory=None):
289
- """Analyze current directory for PDF files"""
290
- path = os.path.dirname(os.path.abspath(__file__)) if directory == None else directory
291
- for pdffile in os.listdir(path):
292
- if pdffile.endswith(".pdf"):
293
- yield os.path.join(path, pdffile)
294
-
295
- def print_ident(field, value, **kwargs):
296
- """Print value with the color specified and correct identation.
297
-
298
- Args:
299
- field (int): The value name
300
- value (str): The value to print
301
- color (AnsiFore): The color to use
302
- field_width (int): The indentation length of fields
303
-
304
- Returns:
305
- None: No value is returned.
306
- """
307
- color = kwargs['color'] if 'color' in kwargs else Fore.GREEN
308
- field_width = kwargs['field_width'] if 'field_width' in kwargs else 7
309
- string_format = ' {:>'+str(field_width)+'}: {}{}{}'
310
- print(string_format.format(field, color, value, Fore.RESET))
311
-
312
- def print_separator(title, color=Fore.LIGHTYELLOW_EX):
313
- _rows, cols = os.popen('stty size', 'r').read().split()
314
- sep = '\n' + color
315
- sep += '-' * 40 + ' ' + title + ' '
316
- remaining_cols = int(cols) - len(sep)
317
- if remaining_cols > 0:
318
- sep += '-' * remaining_cols
319
- sep += Fore.RESET
320
- print(sep)
321
-
322
-
323
- def main():
324
- parser = argparse.ArgumentParser()
325
- parser.add_argument("-d", "--dry",
326
- action="store_true",
327
- help="Dry run, does not write new pdf")
328
- parser.add_argument("-v", "--verbose",
329
- action="store_true",
330
- help="Show more output, useful for debug")
331
- args = parser.parse_args()
332
-
333
- if args.dry:
334
- global IS_DRY
335
- IS_DRY = True
336
- print(Fore.CYAN + "Running in dry mode..." + Fore.RESET)
337
- if args.verbose:
338
- global IS_VERBOSE
339
- IS_VERBOSE = True
340
- print(Fore.CYAN + "Running in verbose mode..." + Fore.RESET)
341
-
342
- settings = Settings()
343
- #settings.getScrapeDirectories()
344
- #sys.exit(1)
345
-
346
- for work_directory in settings.getScrapeDirectories():
347
- print_separator(work_directory)
348
- ignored_files = []
349
- for pdffile in get_files(work_directory):
350
- try:
351
- base = os.path.basename(pdffile)
352
- act = settings.getAccount(pdffile)
353
- if not act:
354
- raise ValueError('no account was matched.')
355
- print('Working on' + Fore.LIGHTGREEN_EX, base, Fore.RESET)
356
- print_ident(' Cuenta', act.name, color=Fore.LIGHTBLUE_EX)
357
- doc = Document(pdffile, act, verbose=IS_VERBOSE)
358
- #print(edocta) # Debug ----
359
- print_ident('Periodo', doc.period)
360
- if IS_VERBOSE:
361
- print(Fore.CYAN, doc, Fore.RESET)
362
- if not IS_DRY:
363
- doc.write_pdf()
364
- print_ident('NewFile', doc.full_path)
365
- except ValueError as e:
366
- #print(e)
367
- ignored_files.append(base)
368
- #print(Fore.LIGHTRED_EX + ' Error!', e, Fore.RESET)
369
- except IOError as e:
370
- print('Error, the filepath {} does not exists.'.format(doc.full_path))
371
-
372
- print('\nNo account was matched for these PDF files:')
373
- for num, path in enumerate(ignored_files, start=1):
374
- print_ident(num, path, color=Fore.RED, field_width=3)
375
-
376
-
377
- if __name__ == '__main__': main()
378
-
379
- ```
380
-
381
- ## Bash
382
-
383
- ```bash
384
- #!/bin/env bash
385
- . .common
386
-
387
- YEAR=$(date +%Y)
388
- PASS=abcdef
389
- GREP_PERIOD='al [0-9]{1,2} de ([A-Zz-z]*) de.? [0-9]+'
390
- #Path to move, Dropbox. Use "{YEAR}" to replace with actual year
391
- MVTO=../"Impuestos/FISCAL-{YEAR}/Edo Cuenta"
392
-
393
- app_installed qpdf
394
-
395
- count=$(find . -type f -name '[!2]*.pdf' | wc -l)
396
- if [ "$count" == '0' ]; then
397
- echo -e "${RED}Error, no pdf files found.${RST}"
398
- exit 1
399
- fi
400
-
401
- for pdf in [!2]*.pdf; do
402
- [ ! -r "$pdf" ] && echo -e "${RED}Error, can't access $pdf${RST}" && exit 1
403
- echo -e "Working on ${GRE}$pdf${RST}..."
404
-
405
- # Decrypt PDF and uncompress to work with it
406
- temp=$(mktemp)
407
- #trap 'rm $temp' 0 SIGINT SIGQUIT SIGTERM
408
- qpdf --password="$PASS" --decrypt --stream-data=uncompress "$pdf" "$temp"
409
-
410
- # Extract Data from PDF
411
- account=$(strings "$temp" | grep -ioE 'platinum|perfiles' | head -1)
412
- account=${account,,}
413
- account=${account^}
414
- echo -e " account: ${BLU}$account${RST}"
415
- #period=$(strings "$temp" | grep -iEo 'al [0-9]{1,2} de ([A-Zz-z]*) de [0-9]+' | tail -1)
416
- #month=$(echo "$period" | tr ' ' '\n'| tail -3 | head -1)
417
- #year=$(echo "$period" | tr ' ' '\n' | tail -1)
418
- period=$(pdftotext "$temp" - | grep -iEo "$GREP_PERIOD" | tail -1 )
419
- month=$(echo "$period" | awk '{print $4}')
420
- year=$(echo "$period" | awk '{print $6}')
421
- period=${month,,}
422
-
423
- if [ -z "$period" ]; then
424
- echo -e "${RED}Error, period not found.${RST}"
425
- exit 1
426
- fi
427
-
428
- number=$(convert_month $period)
429
- if [ "$account" == "Perfiles" ]; then
430
- #number=$(( number - 1 ))
431
- number=$(echo "$number - 1" | bc)
432
- if [ "${#number}" -eq 1 ]; then
433
- number="0$number"
434
- fi
435
- fi
436
- echo -e " period: ${BLU}$year-$period${RST}"
437
-
438
- #Prepare new PDF
439
- newfile="$year-${number} ${account}.pdf"
440
- #pdftk "$pdf" input_pw "$PASS" output "$newfile"
441
- qpdf --password="$PASS" --decrypt "$pdf" "$newfile"
442
- if [ -f "$newfile" ]; then
443
- mv "$pdf" "${newfile/.pdf/}_$pdf"
444
- echo -e " new file: ${BLU}$newfile${RST}"
445
- fi
446
-
447
- #Copy it
448
- MVTO="${MVTO//'{YEAR}'/$year}"
449
- if [ -d "$MVTO" ]; then
450
- cp -v "$newfile" "$MVTO"
451
- fi
452
- done
453
- ```