slaw 0.5.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5d293949a3ac2383cd254f0efd2268ef24286f1d
4
- data.tar.gz: 2439db865559b2f9b494b7755004d3f7a7d9b4ba
3
+ metadata.gz: 26e7c12c3e421410a6be3b18c19f589915e920fb
4
+ data.tar.gz: e8c1e77994e30c194b42bcca11bbc3b4eace6f76
5
5
  SHA512:
6
- metadata.gz: 2a7af42dc109723ea0908e8bb3ba9cbf4269f923dbd8a68e6cabd87f53e98dec59884cb7d4daf0df8de98b3d209302a9ba214e45d04fadffd372fabf91c9db26
7
- data.tar.gz: 41bac579c874b9ef29aaf5a87e243fd9ba268efe46c60e94cdc728e0e4d0f01b37b60b40fb0fa80a82889a43ac44174c6d8385386fb72a98c37dd0c5868c3b97
6
+ metadata.gz: 3e229100f80879b9646135b9c9f6fab2c543a20cd89ed6b021a74164fb92874b876451571bc9552d3150a2953a00be8d927b63c3e7ad780dec800751bff086c3
7
+ data.tar.gz: c8d2d73f08b67a2535816d9b5ae8f8abfb7cd2627f3aef1688a986bd4e6ba1e71a01cb1c53c0a5b66d9f836191abef6c89e8f43767a12d8b949d329e635c1043
data/bin/slaw CHANGED
@@ -6,11 +6,15 @@ require 'slaw'
6
6
  class SlawCLI < Thor
7
7
  # TODO: support different grammars and locales
8
8
 
9
+ class_option :verbose, type: :boolean, desc: "Display log output on stderr"
10
+
9
11
  desc "parse FILE", "parse FILE into Akoma Ntoso XML"
10
12
  option :input, enum: ['text', 'pdf'], desc: "Type of input if it can't be determined automatically"
11
13
  option :pdftotext, desc: "Location of the pdftotext binary if not in PATH"
12
14
  option :definitions, type: :boolean, desc: "Find and link definitions (this can be slow). Default: false"
13
15
  def parse(name)
16
+ logging
17
+
14
18
  Slaw::Extract::Extractor.pdftotext_path = options[:pdftotext] if options[:pdftotext]
15
19
  extractor = Slaw::Extract::Extractor.new
16
20
 
@@ -31,6 +35,14 @@ class SlawCLI < Thor
31
35
 
32
36
  puts act.to_xml(indent: 2)
33
37
  end
38
+
39
+ no_commands do
40
+ def logging
41
+ logger = Log4r::Logger.new('Slaw')
42
+ logger.outputters = Log4r::Outputter.stderr
43
+ logger.outputters[0].level = options[:verbose] ? Log4r::DEBUG : Log4r::ERROR
44
+ end
45
+ end
34
46
  end
35
47
 
36
48
  SlawCLI.start(ARGV)
@@ -30,8 +30,6 @@ module Slaw
30
30
  #
31
31
  # @return [String] extracted text
32
32
  def extract_from_file(filename)
33
- ext = filename[-4..-1].downcase
34
-
35
33
  mimetype = get_mimetype(filename)
36
34
 
37
35
  case mimetype && mimetype.type
@@ -40,11 +38,11 @@ module Slaw
40
38
  when 'text/plain', nil
41
39
  extract_from_text(filename)
42
40
  else
43
- if mimetype.text?
44
- extract_from_text(filename)
45
- else
46
- raise ArgumentError.new("Unsupported file type #{ext} (#{mimetype || unknown})")
41
+ text = extract_via_tika(filename)
42
+ if text.empty? or text.nil?
43
+ raise ArgumentError.new("Unsupported file type #{mimetype || 'unknown'}")
47
44
  end
45
+ text
48
46
  end
49
47
  end
50
48
 
@@ -87,6 +85,20 @@ module Slaw
87
85
  cleanup(File.read(filename))
88
86
  end
89
87
 
88
+ # Extract text from +filename+ by sending it to Apache Tika
89
+ # http://tika.apache.org/
90
+ def extract_via_tika(filename)
91
+ # the Yomu gem falls over when trying to write large amounts of data
92
+ # to the JVM's stdin, so we manually call java ourselves, relying on yomu
93
+ # to supply the jar
94
+ require 'slaw/extract/yomu_patch'
95
+ logger.info("Using Tika to get text from #{filename}. You need a JVM installed for this.")
96
+
97
+ text = Yomu.text_from_file(filename)
98
+ logger.info("Tika returned #{text.length} bytes")
99
+ text
100
+ end
101
+
90
102
  # Run general once-off cleanup of extracted text.
91
103
  def cleanup(text)
92
104
  text = @cleanser.cleanup(text)
@@ -0,0 +1,9 @@
1
+ require 'yomu'
2
+
3
+ class Yomu
4
+ def self.text_from_file(filename)
5
+ IO.popen("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} -t '#{filename}'", 'r') do |io|
6
+ io.read
7
+ end
8
+ end
9
+ end
data/lib/slaw/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Slaw
2
- VERSION = "0.5.0"
2
+ VERSION = "0.5.1"
3
3
  end
data/slaw.gemspec CHANGED
@@ -28,4 +28,5 @@ Gem::Specification.new do |spec|
28
28
  spec.add_runtime_dependency "log4r", "~> 1.1.10"
29
29
  spec.add_runtime_dependency "thor", "~> 0.19.1"
30
30
  spec.add_runtime_dependency "mimemagic", "~> 0.2.1"
31
+ spec.add_runtime_dependency 'yomu', '~> 0.2.2'
31
32
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slaw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kempe
@@ -136,6 +136,20 @@ dependencies:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
138
  version: 0.2.1
139
+ - !ruby/object:Gem::Dependency
140
+ name: yomu
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: 0.2.2
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: 0.2.2
139
153
  description: Slaw is a lightweight library for rendering and generating Akoma Ntoso
140
154
  acts from plain text and PDF documents.
141
155
  email:
@@ -157,6 +171,7 @@ files:
157
171
  - lib/slaw/bylaw.rb
158
172
  - lib/slaw/collection.rb
159
173
  - lib/slaw/extract/extractor.rb
174
+ - lib/slaw/extract/yomu_patch.rb
160
175
  - lib/slaw/generator.rb
161
176
  - lib/slaw/lifecycle_event.rb
162
177
  - lib/slaw/logging.rb