slaw 0.15.2 → 0.16.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +6 -2
- data/bin/slaw +8 -0
- data/lib/slaw/extract/extractor.rb +10 -1
- data/lib/slaw/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 27b008d695654755f54463b4aad7e409fdafc848
|
4
|
+
data.tar.gz: f5efa73340972f8a7b3544b6c1f9b7a407bca342
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 14ed037256e022623d0d593bb1276e4dc669c2d3ce8dc0823e20c1e276b50e1509b62f30809db9d6c6bd8a3de56643f2d534f310f28161ef1a3fe15280466be6
|
7
|
+
data.tar.gz: 13b3c0757cb8c49ce28c8c0751831cb79f839d2e8a43459b3a0003339a3051d1edfc480fd936650a8acd3b08cf4a18763f13707df617e3d865b28a19f00bebb8
|
data/README.md
CHANGED
@@ -28,10 +28,10 @@ Or install it with:
|
|
28
28
|
|
29
29
|
$ gem install slaw
|
30
30
|
|
31
|
-
To run PDF extraction you will also need [
|
31
|
+
To run PDF extraction you will also need [poppler's pdftotext](https://poppler.freedesktop.org/).
|
32
32
|
If you're on a Mac, you can use:
|
33
33
|
|
34
|
-
$ brew install
|
34
|
+
$ brew install poppler
|
35
35
|
|
36
36
|
You may also need Ghostscript to remove password protection from PDF files. This is
|
37
37
|
installed by default on most systems (including Mac). On Ubuntu you can use:
|
@@ -218,6 +218,10 @@ Akoma Ntoso `component` elements at the end of the XML document, with a name of
|
|
218
218
|
|
219
219
|
## Changelog
|
220
220
|
|
221
|
+
### 0.16.0
|
222
|
+
|
223
|
+
* Support --crop for PDFs. Requires [poppler](https://poppler.freedesktop.org/) pdftotext, not xpdf.
|
224
|
+
|
221
225
|
### 0.15.2
|
222
226
|
|
223
227
|
* Update nokogiri to ~> 1.8.1
|
data/bin/slaw
CHANGED
@@ -22,6 +22,7 @@ class SlawCLI < Thor
|
|
22
22
|
option :id_prefix, type: :string, desc: "Prefix to be used when generating ID elements when parsing a fragment."
|
23
23
|
option :section_number_position, enum: ['before-title', 'after-title', 'guess'], desc: "Where do section titles come in relation to the section number? Default: before-title"
|
24
24
|
option :reformat, type: :boolean, desc: "Reformat common formatting issues to make grammar matching better. Default: true for PDF files, false otherwise"
|
25
|
+
option :crop, type: :string, desc: "Crop box for PDF files, as 'left,top,width,height'."
|
25
26
|
def parse(name)
|
26
27
|
logging
|
27
28
|
|
@@ -32,6 +33,13 @@ class SlawCLI < Thor
|
|
32
33
|
Slaw::Extract::Extractor.pdftotext_path = options[:pdftotext] if options[:pdftotext]
|
33
34
|
extractor = Slaw::Extract::Extractor.new
|
34
35
|
|
36
|
+
if options[:crop]
|
37
|
+
extractor.cropbox = options[:crop].split(',').map(&:to_i)
|
38
|
+
if extractor.cropbox.length != 4
|
39
|
+
raise Thor::Error.new("--crop requires four comma-separated integers")
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
35
43
|
case options[:input]
|
36
44
|
when 'pdf'
|
37
45
|
text = extractor.extract_from_pdf(name)
|
@@ -17,6 +17,8 @@ module Slaw
|
|
17
17
|
|
18
18
|
@@pdftotext_path = "pdftotext"
|
19
19
|
|
20
|
+
attr_accessor :cropbox
|
21
|
+
|
20
22
|
# Extract text from a file.
|
21
23
|
#
|
22
24
|
# @param filename [String] filename to extract from
|
@@ -73,7 +75,14 @@ module Slaw
|
|
73
75
|
#
|
74
76
|
# @return [Array<String>] command and params to execute
|
75
77
|
def pdf_to_text_cmd(filename)
|
76
|
-
[Extractor.pdftotext_path, "-enc", "UTF-8",
|
78
|
+
cmd = [Extractor.pdftotext_path, "-enc", "UTF-8", "-nopgbrk"]
|
79
|
+
|
80
|
+
if @cropbox
|
81
|
+
# left, top, width, height
|
82
|
+
cmd += "-x -y -W -H".split.zip(@cropbox.map(&:to_s)).flatten
|
83
|
+
end
|
84
|
+
|
85
|
+
cmd + [filename, "-"]
|
77
86
|
end
|
78
87
|
|
79
88
|
def extract_from_text(filename)
|
data/lib/slaw/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slaw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.2
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kempe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|