slaw 0.3.1 → 0.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -1
- data/lib/slaw/extract/extractor.rb +31 -7
- data/lib/slaw/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 30ad1ae5e3d2aadfcf89800d91f16f0c0b38620f
|
4
|
+
data.tar.gz: c93431db28804db785afda0dae9684bad5a62dbd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 2a0881b4ba4225943bd4b491f7385ba7b70125301c0aed149f70144991c1fec5ee3bdee49d56123848abecd94fd6c9b73f06c15ca66b00e08e50d46dada2f6b3
|
7
|
+
data.tar.gz: fa04f689a4a136067fe4b22961d6d6b5597ded5807cddb2b9fa7e620a25982ef31ed830e50df8a30e62300b3e51c79df5478020d74a391262981a41d50ae1311
|
data/README.md
CHANGED
@@ -28,11 +28,16 @@ Or install it with:
|
|
28
28
|
|
29
29
|
$ gem install slaw
|
30
30
|
|
31
|
-
To run PDF extraction you will also need [xpdf](http://www.foolabs.com/xpdf/)
|
31
|
+
To run PDF extraction you will also need [xpdf](http://www.foolabs.com/xpdf/) and
|
32
32
|
If you're on a Mac, you can use:
|
33
33
|
|
34
34
|
brew install xpdf
|
35
35
|
|
36
|
+
You may also need Ghostscript to remove password protection from PDF files. This is
|
37
|
+
installed by default on most systems (including Mac). On Ubuntu you can use:
|
38
|
+
|
39
|
+
sudo apt-get install ghostscript
|
40
|
+
|
36
41
|
## Overview
|
37
42
|
|
38
43
|
Slaw generates Acts in the [Akoma Ntoso](http://www.akomantoso.org) 2.0 XML
|
@@ -1,4 +1,5 @@
|
|
1
1
|
require 'open3'
|
2
|
+
require 'tempfile'
|
2
3
|
|
3
4
|
module Slaw
|
4
5
|
module Extract
|
@@ -46,14 +47,23 @@ module Slaw
|
|
46
47
|
#
|
47
48
|
# @return [String] extracted text
|
48
49
|
def extract_from_pdf(filename)
|
49
|
-
|
50
|
-
logger.info("Executing: #{cmd}")
|
51
|
-
stdout, status = Open3.capture2(*cmd)
|
50
|
+
retried = false
|
52
51
|
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
52
|
+
while true
|
53
|
+
cmd = pdf_to_text_cmd(filename)
|
54
|
+
logger.info("Executing: #{cmd}")
|
55
|
+
stdout, status = Open3.capture2(*cmd)
|
56
|
+
|
57
|
+
case status.exitstatus
|
58
|
+
when 0
|
59
|
+
return cleanup(stdout)
|
60
|
+
when 3
|
61
|
+
return nil if retried
|
62
|
+
retried = true
|
63
|
+
self.remove_pdf_password(filename)
|
64
|
+
else
|
65
|
+
return nil
|
66
|
+
end
|
57
67
|
end
|
58
68
|
end
|
59
69
|
|
@@ -79,6 +89,20 @@ module Slaw
|
|
79
89
|
text
|
80
90
|
end
|
81
91
|
|
92
|
+
def remove_pdf_password(filename)
|
93
|
+
file = Tempfile.new('steno')
|
94
|
+
begin
|
95
|
+
logger.info("Trying to remove password from #{filename}")
|
96
|
+
cmd = "gs -q -dNOPAUSE -dBATCH -sDEVICE=pdfwrite -sOutputFile=#{file.path} -c .setpdfwrite -f #{filename}".split(" ")
|
97
|
+
logger.info("Executing: #{cmd}")
|
98
|
+
Open3.capture2(*cmd)
|
99
|
+
FileUtils.move(file.path, filename)
|
100
|
+
ensure
|
101
|
+
file.close
|
102
|
+
file.unlink
|
103
|
+
end
|
104
|
+
end
|
105
|
+
|
82
106
|
# Get location of the pdftotext executable for all instances.
|
83
107
|
def self.pdftotext_path
|
84
108
|
@@pdftotext_path
|
data/lib/slaw/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slaw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.3.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kempe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2014-
|
11
|
+
date: 2014-12-07 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|