slaw 0.3.1 → 0.3.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -1
- data/lib/slaw/extract/extractor.rb +31 -7
- data/lib/slaw/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 30ad1ae5e3d2aadfcf89800d91f16f0c0b38620f
|
4
|
+
data.tar.gz: c93431db28804db785afda0dae9684bad5a62dbd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a0881b4ba4225943bd4b491f7385ba7b70125301c0aed149f70144991c1fec5ee3bdee49d56123848abecd94fd6c9b73f06c15ca66b00e08e50d46dada2f6b3
|
7
|
+
data.tar.gz: fa04f689a4a136067fe4b22961d6d6b5597ded5807cddb2b9fa7e620a25982ef31ed830e50df8a30e62300b3e51c79df5478020d74a391262981a41d50ae1311
|
data/README.md
CHANGED
@@ -28,11 +28,16 @@ Or install it with:
|
|
28
28
|
|
29
29
|
$ gem install slaw
|
30
30
|
|
31
|
-
To run PDF extraction you will also need [xpdf](http://www.foolabs.com/xpdf/)
|
31
|
+
To run PDF extraction you will also need [xpdf](http://www.foolabs.com/xpdf/) and
|
32
32
|
If you're on a Mac, you can use:
|
33
33
|
|
34
34
|
brew install xpdf
|
35
35
|
|
36
|
+
You may also need Ghostscript to remove password protection from PDF files. This is
|
37
|
+
installed by default on most systems (including Mac). On Ubuntu you can use:
|
38
|
+
|
39
|
+
sudo apt-get install ghostscript
|
40
|
+
|
36
41
|
## Overview
|
37
42
|
|
38
43
|
Slaw generates Acts in the [Akoma Ntoso](http://www.akomantoso.org) 2.0 XML
|
data/lib/slaw/extract/extractor.rb
CHANGED
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'open3'
|
2
|
+
require 'tempfile'
|
2
3
|
|
3
4
|
module Slaw
|
4
5
|
module Extract
|
@@ -46,14 +47,23 @@ module Slaw
|
|
46
47
|
#
|
47
48
|
# @return [String] extracted text
|
48
49
|
def extract_from_pdf(filename)
|
49
|
-
|
50
|
-
logger.info("Executing: #{cmd}")
|
51
|
-
stdout, status = Open3.capture2(*cmd)
|
50
|
+
retried = false
|
52
51
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
52
|
+
while true
|
53
|
+
cmd = pdf_to_text_cmd(filename)
|
54
|
+
logger.info("Executing: #{cmd}")
|
55
|
+
stdout, status = Open3.capture2(*cmd)
|
56
|
+
|
57
|
+
case status.exitstatus
|
58
|
+
when 0
|
59
|
+
return cleanup(stdout)
|
60
|
+
when 3
|
61
|
+
return nil if retried
|
62
|
+
retried = true
|
63
|
+
self.remove_pdf_password(filename)
|
64
|
+
else
|
65
|
+
return nil
|
66
|
+
end
|
57
67
|
end
|
58
68
|
end
|
59
69
|
|
@@ -79,6 +89,20 @@ module Slaw
|
|
79
89
|
text
|
80
90
|
end
|
81
91
|
|
92
|
+
def remove_pdf_password(filename)
|
93
|
+
file = Tempfile.new('steno')
|
94
|
+
begin
|
95
|
+
logger.info("Trying to remove password from #{filename}")
|
96
|
+
cmd = "gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile=#{file.path} -c .setpdfwrite -f #{filename}".split(" ")
|
97
|
+
logger.info("Executing: #{cmd}")
|
98
|
+
Open3.capture2(*cmd)
|
99
|
+
FileUtils.move(file.path, filename)
|
100
|
+
ensure
|
101
|
+
file.close
|
102
|
+
file.unlink
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
82
106
|
# Get location of the pdftotext executable for all instances.
|
83
107
|
def self.pdftotext_path
|
84
108
|
@@pdftotext_path
|
data/lib/slaw/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slaw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kempe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-12-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|