slaw 0.15.2 → 0.16.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: ee90224965ffec0a5ecc89db6255f71a0c130985
4
- data.tar.gz: db7f6af7264958130f57970af720b447bfea5955
3
+ metadata.gz: 27b008d695654755f54463b4aad7e409fdafc848
4
+ data.tar.gz: f5efa73340972f8a7b3544b6c1f9b7a407bca342
5
5
  SHA512:
6
- metadata.gz: a7c52d0b2bc3ced5734e5ebd2c1bacc06cffd9c8d48ec9e7c5411c59f51622ce3eea70b2913b2c39de5f49eecebd15403447c28cda4b4c3ecccac0110ef8b436
7
- data.tar.gz: de5f75e9033bac55ba54877e1730d2a12d973fef04c4f76c2ead30e78a9efdebd7908b065c0f307772318cf39e14d58afb978dc3a1c41a602311b46533b60546
6
+ metadata.gz: 14ed037256e022623d0d593bb1276e4dc669c2d3ce8dc0823e20c1e276b50e1509b62f30809db9d6c6bd8a3de56643f2d534f310f28161ef1a3fe15280466be6
7
+ data.tar.gz: 13b3c0757cb8c49ce28c8c0751831cb79f839d2e8a43459b3a0003339a3051d1edfc480fd936650a8acd3b08cf4a18763f13707df617e3d865b28a19f00bebb8
data/README.md CHANGED
@@ -28,10 +28,10 @@ Or install it with:
28
28
 
29
29
  $ gem install slaw
30
30
 
31
- To run PDF extraction you will also need [xpdf](http://www.foolabs.com/xpdf/) and
31
+ To run PDF extraction you will also need [poppler's pdftotext](https://poppler.freedesktop.org/).
32
32
  If you're on a Mac, you can use:
33
33
 
34
- $ brew install xpdf
34
+ $ brew install poppler
35
35
 
36
36
  You may also need Ghostscript to remove password protection from PDF files. This is
37
37
  installed by default on most systems (including Mac). On Ubuntu you can use:
@@ -218,6 +218,10 @@ Akoma Ntoso `component` elements at the end of the XML document, with a name of
218
218
 
219
219
  ## Changelog
220
220
 
221
+ ### 0.16.0
222
+
223
+ * Support --crop for PDFs. Requires [poppler](https://poppler.freedesktop.org/) pdftotext, not xpdf.
224
+
221
225
  ### 0.15.2
222
226
 
223
227
  * Update nokogiri to ~> 1.8.1
data/bin/slaw CHANGED
@@ -22,6 +22,7 @@ class SlawCLI < Thor
22
22
  option :id_prefix, type: :string, desc: "Prefix to be used when generating ID elements when parsing a fragment."
23
23
  option :section_number_position, enum: ['before-title', 'after-title', 'guess'], desc: "Where do section titles come in relation to the section number? Default: before-title"
24
24
  option :reformat, type: :boolean, desc: "Reformat common formatting issues to make grammar matching better. Default: true for PDF files, false otherwise"
25
+ option :crop, type: :string, desc: "Crop box for PDF files, as 'left,top,width,height'."
25
26
  def parse(name)
26
27
  logging
27
28
 
@@ -32,6 +33,13 @@ class SlawCLI < Thor
32
33
  Slaw::Extract::Extractor.pdftotext_path = options[:pdftotext] if options[:pdftotext]
33
34
  extractor = Slaw::Extract::Extractor.new
34
35
 
36
+ if options[:crop]
37
+ extractor.cropbox = options[:crop].split(',').map(&:to_i)
38
+ if extractor.cropbox.length != 4
39
+ raise Thor::Error.new("--crop requires four comma-separated integers")
40
+ end
41
+ end
42
+
35
43
  case options[:input]
36
44
  when 'pdf'
37
45
  text = extractor.extract_from_pdf(name)
@@ -17,6 +17,8 @@ module Slaw
17
17
 
18
18
  @@pdftotext_path = "pdftotext"
19
19
 
20
+ attr_accessor :cropbox
21
+
20
22
  # Extract text from a file.
21
23
  #
22
24
  # @param filename [String] filename to extract from
@@ -73,7 +75,14 @@ module Slaw
73
75
  #
74
76
  # @return [Array<String>] command and params to execute
75
77
  def pdf_to_text_cmd(filename)
76
- [Extractor.pdftotext_path, "-enc", "UTF-8", filename, "-"]
78
+ cmd = [Extractor.pdftotext_path, "-enc", "UTF-8", "-nopgbrk"]
79
+
80
+ if @cropbox
81
+ # left, top, width, height
82
+ cmd += "-x -y -W -H".split.zip(@cropbox.map(&:to_s)).flatten
83
+ end
84
+
85
+ cmd + [filename, "-"]
77
86
  end
78
87
 
79
88
  def extract_from_text(filename)
data/lib/slaw/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Slaw
2
- VERSION = "0.15.2"
2
+ VERSION = "0.16.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slaw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.15.2
4
+ version: 0.16.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kempe
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2017-12-22 00:00:00.000000000 Z
11
+ date: 2018-01-02 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler