slaw 0.15.2 → 0.16.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +6 -2
- data/bin/slaw +8 -0
- data/lib/slaw/extract/extractor.rb +10 -1
- data/lib/slaw/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 27b008d695654755f54463b4aad7e409fdafc848
|
4
|
+
data.tar.gz: f5efa73340972f8a7b3544b6c1f9b7a407bca342
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 14ed037256e022623d0d593bb1276e4dc669c2d3ce8dc0823e20c1e276b50e1509b62f30809db9d6c6bd8a3de56643f2d534f310f28161ef1a3fe15280466be6
|
7
|
+
data.tar.gz: 13b3c0757cb8c49ce28c8c0751831cb79f839d2e8a43459b3a0003339a3051d1edfc480fd936650a8acd3b08cf4a18763f13707df617e3d865b28a19f00bebb8
|
data/README.md
CHANGED
@@ -28,10 +28,10 @@ Or install it with:
|
|
28
28
|
|
29
29
|
$ gem install slaw
|
30
30
|
|
31
|
-
To run PDF extraction you will also need [
|
31
|
+
To run PDF extraction you will also need [poppler's pdftotext](https://poppler.freedesktop.org/).
|
32
32
|
If you're on a Mac, you can use:
|
33
33
|
|
34
|
-
$ brew install
|
34
|
+
$ brew install poppler
|
35
35
|
|
36
36
|
You may also need Ghostscript to remove password protection from PDF files. This is
|
37
37
|
installed by default on most systems (including Mac). On Ubuntu you can use:
|
@@ -218,6 +218,10 @@ Akoma Ntoso `component` elements at the end of the XML document, with a name of
|
|
218
218
|
|
219
219
|
## Changelog
|
220
220
|
|
221
|
+
### 0.16.0
|
222
|
+
|
223
|
+
* Support --crop for PDFs. Requires [poppler](https://poppler.freedesktop.org/) pdftotext, not xpdf.
|
224
|
+
|
221
225
|
### 0.15.2
|
222
226
|
|
223
227
|
* Update nokogiri to ~> 1.8.1
|
data/bin/slaw
CHANGED
@@ -22,6 +22,7 @@ class SlawCLI < Thor
|
|
22
22
|
option :id_prefix, type: :string, desc: "Prefix to be used when generating ID elements when parsing a fragment."
|
23
23
|
option :section_number_position, enum: ['before-title', 'after-title', 'guess'], desc: "Where do section titles come in relation to the section number? Default: before-title"
|
24
24
|
option :reformat, type: :boolean, desc: "Reformat common formatting issues to make grammar matching better. Default: true for PDF files, false otherwise"
|
25
|
+
option :crop, type: :string, desc: "Crop box for PDF files, as 'left,top,width,height'."
|
25
26
|
def parse(name)
|
26
27
|
logging
|
27
28
|
|
@@ -32,6 +33,13 @@ class SlawCLI < Thor
|
|
32
33
|
Slaw::Extract::Extractor.pdftotext_path = options[:pdftotext] if options[:pdftotext]
|
33
34
|
extractor = Slaw::Extract::Extractor.new
|
34
35
|
|
36
|
+
if options[:crop]
|
37
|
+
extractor.cropbox = options[:crop].split(',').map(&:to_i)
|
38
|
+
if extractor.cropbox.length != 4
|
39
|
+
raise Thor::Error.new("--crop requires four comma-separated integers")
|
40
|
+
end
|
41
|
+
end
|
42
|
+
|
35
43
|
case options[:input]
|
36
44
|
when 'pdf'
|
37
45
|
text = extractor.extract_from_pdf(name)
|
data/lib/slaw/extract/extractor.rb
CHANGED
@@ -17,6 +17,8 @@ module Slaw
|
|
17
17
|
|
18
18
|
@@pdftotext_path = "pdftotext"
|
19
19
|
|
20
|
+
attr_accessor :cropbox
|
21
|
+
|
20
22
|
# Extract text from a file.
|
21
23
|
#
|
22
24
|
# @param filename [String] filename to extract from
|
@@ -73,7 +75,14 @@ module Slaw
|
|
73
75
|
#
|
74
76
|
# @return [Array<String>] command and params to execute
|
75
77
|
def pdf_to_text_cmd(filename)
|
76
|
-
[Extractor.pdftotext_path, "-enc", "UTF-8",
|
78
|
+
cmd = [Extractor.pdftotext_path, "-enc", "UTF-8", "-nopgbrk"]
|
79
|
+
|
80
|
+
if @cropbox
|
81
|
+
# left, top, width, height
|
82
|
+
cmd += "-x -y -W -H".split.zip(@cropbox.map(&:to_s)).flatten
|
83
|
+
end
|
84
|
+
|
85
|
+
cmd + [filename, "-"]
|
77
86
|
end
|
78
87
|
|
79
88
|
def extract_from_text(filename)
|
data/lib/slaw/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slaw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.15.2
|
4
|
+
version: 0.16.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kempe
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2018-01-02 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: bundler
|