slaw 0.5.0 → 0.5.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 5d293949a3ac2383cd254f0efd2268ef24286f1d
4
- data.tar.gz: 2439db865559b2f9b494b7755004d3f7a7d9b4ba
3
+ metadata.gz: 26e7c12c3e421410a6be3b18c19f589915e920fb
4
+ data.tar.gz: e8c1e77994e30c194b42bcca11bbc3b4eace6f76
5
5
  SHA512:
6
- metadata.gz: 2a7af42dc109723ea0908e8bb3ba9cbf4269f923dbd8a68e6cabd87f53e98dec59884cb7d4daf0df8de98b3d209302a9ba214e45d04fadffd372fabf91c9db26
7
- data.tar.gz: 41bac579c874b9ef29aaf5a87e243fd9ba268efe46c60e94cdc728e0e4d0f01b37b60b40fb0fa80a82889a43ac44174c6d8385386fb72a98c37dd0c5868c3b97
6
+ metadata.gz: 3e229100f80879b9646135b9c9f6fab2c543a20cd89ed6b021a74164fb92874b876451571bc9552d3150a2953a00be8d927b63c3e7ad780dec800751bff086c3
7
+ data.tar.gz: c8d2d73f08b67a2535816d9b5ae8f8abfb7cd2627f3aef1688a986bd4e6ba1e71a01cb1c53c0a5b66d9f836191abef6c89e8f43767a12d8b949d329e635c1043
data/bin/slaw CHANGED
@@ -6,11 +6,15 @@ require 'slaw'
6
6
  class SlawCLI < Thor
7
7
  # TODO: support different grammars and locales
8
8
 
9
+ class_option :verbose, type: :boolean, desc: "Display log output on stderr"
10
+
9
11
  desc "parse FILE", "parse FILE into Akoma Ntoso XML"
10
12
  option :input, enum: ['text', 'pdf'], desc: "Type of input if it can't be determined automatically"
11
13
  option :pdftotext, desc: "Location of the pdftotext binary if not in PATH"
12
14
  option :definitions, type: :boolean, desc: "Find and link definitions (this can be slow). Default: false"
13
15
  def parse(name)
16
+ logging
17
+
14
18
  Slaw::Extract::Extractor.pdftotext_path = options[:pdftotext] if options[:pdftotext]
15
19
  extractor = Slaw::Extract::Extractor.new
16
20
 
@@ -31,6 +35,14 @@ class SlawCLI < Thor
31
35
 
32
36
  puts act.to_xml(indent: 2)
33
37
  end
38
+
39
+ no_commands do
40
+ def logging
41
+ logger = Log4r::Logger.new('Slaw')
42
+ logger.outputters = Log4r::Outputter.stderr
43
+ logger.outputters[0].level = options[:verbose] ? Log4r::DEBUG : Log4r::ERROR
44
+ end
45
+ end
34
46
  end
35
47
 
36
48
  SlawCLI.start(ARGV)
@@ -30,8 +30,6 @@ module Slaw
30
30
  #
31
31
  # @return [String] extracted text
32
32
  def extract_from_file(filename)
33
- ext = filename[-4..-1].downcase
34
-
35
33
  mimetype = get_mimetype(filename)
36
34
 
37
35
  case mimetype && mimetype.type
@@ -40,11 +38,11 @@ module Slaw
40
38
  when 'text/plain', nil
41
39
  extract_from_text(filename)
42
40
  else
43
- if mimetype.text?
44
- extract_from_text(filename)
45
- else
46
- raise ArgumentError.new("Unsupported file type #{ext} (#{mimetype || unknown})")
41
+ text = extract_via_tika(filename)
42
+ if text.empty? or text.nil?
43
+ raise ArgumentError.new("Unsupported file type #{mimetype || 'unknown'}")
47
44
  end
45
+ text
48
46
  end
49
47
  end
50
48
 
@@ -87,6 +85,20 @@ module Slaw
87
85
  cleanup(File.read(filename))
88
86
  end
89
87
 
88
+ # Extract text from +filename+ by sending it to Apache Tika
89
+ # http://tika.apache.org/
90
+ def extract_via_tika(filename)
91
+ # the Yomu gem falls over when trying to write large amounts of data
92
+ # to the JVM stdin, so we manually call java ourselves, relying on yomu
93
+ # to supply the jar
94
+ require 'slaw/extract/yomu_patch'
95
+ logger.info("Using Tika to get text from #{filename}. You need a JVM installed for this.")
96
+
97
+ text = Yomu.text_from_file(filename)
98
+ logger.info("Tika returned #{text.length} bytes")
99
+ text
100
+ end
101
+
90
102
  # Run general once-off cleanup of extracted text.
91
103
  def cleanup(text)
92
104
  text = @cleanser.cleanup(text)
@@ -0,0 +1,9 @@
1
+ require 'yomu'
2
+
3
+ class Yomu
4
+ def self.text_from_file(filename)
5
+ IO.popen("#{java} -Djava.awt.headless=true -jar #{Yomu::JARPATH} -t '#{filename}'", 'r') do |io|
6
+ io.read
7
+ end
8
+ end
9
+ end
data/lib/slaw/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Slaw
2
- VERSION = "0.5.0"
2
+ VERSION = "0.5.1"
3
3
  end
data/slaw.gemspec CHANGED
@@ -28,4 +28,5 @@ Gem::Specification.new do |spec|
28
28
  spec.add_runtime_dependency "log4r", "~> 1.1.10"
29
29
  spec.add_runtime_dependency "thor", "~> 0.19.1"
30
30
  spec.add_runtime_dependency "mimemagic", "~> 0.2.1"
31
+ spec.add_runtime_dependency 'yomu', '~> 0.2.2'
31
32
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: slaw
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.0
4
+ version: 0.5.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Greg Kempe
@@ -136,6 +136,20 @@ dependencies:
136
136
  - - "~>"
137
137
  - !ruby/object:Gem::Version
138
138
  version: 0.2.1
139
+ - !ruby/object:Gem::Dependency
140
+ name: yomu
141
+ requirement: !ruby/object:Gem::Requirement
142
+ requirements:
143
+ - - "~>"
144
+ - !ruby/object:Gem::Version
145
+ version: 0.2.2
146
+ type: :runtime
147
+ prerelease: false
148
+ version_requirements: !ruby/object:Gem::Requirement
149
+ requirements:
150
+ - - "~>"
151
+ - !ruby/object:Gem::Version
152
+ version: 0.2.2
139
153
  description: Slaw is a lightweight library for rendering and generating Akoma Ntoso
140
154
  acts from plain text and PDF documents.
141
155
  email:
@@ -157,6 +171,7 @@ files:
157
171
  - lib/slaw/bylaw.rb
158
172
  - lib/slaw/collection.rb
159
173
  - lib/slaw/extract/extractor.rb
174
+ - lib/slaw/extract/yomu_patch.rb
160
175
  - lib/slaw/generator.rb
161
176
  - lib/slaw/lifecycle_event.rb
162
177
  - lib/slaw/logging.rb