slaw 0.5.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/slaw +12 -0
- data/lib/slaw/extract/extractor.rb +18 -6
- data/lib/slaw/extract/yomu_patch.rb +9 -0
- data/lib/slaw/version.rb +1 -1
- data/slaw.gemspec +1 -0
- metadata +16 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26e7c12c3e421410a6be3b18c19f589915e920fb
|
4
|
+
data.tar.gz: e8c1e77994e30c194b42bcca11bbc3b4eace6f76
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3e229100f80879b9646135b9c9f6fab2c543a20cd89ed6b021a74164fb92874b876451571bc9552d3150a2953a00be8d927b63c3e7ad780dec800751bff086c3
|
7
|
+
data.tar.gz: c8d2d73f08b67a2535816d9b5ae8f8abfb7cd2627f3aef1688a986bd4e6ba1e71a01cb1c53c0a5b66d9f836191abef6c89e8f43767a12d8b949d329e635c1043
|
data/bin/slaw
CHANGED
@@ -6,11 +6,15 @@ require 'slaw'
|
|
6
6
|
class SlawCLI < Thor
|
7
7
|
# TODO: support different grammars and locales
|
8
8
|
|
9
|
+
class_option :verbose, type: :boolean, desc: "Display log output on stderr"
|
10
|
+
|
9
11
|
desc "parse FILE", "parse FILE into Akoma Ntoso XML"
|
10
12
|
option :input, enum: ['text', 'pdf'], desc: "Type of input if it can't be determined automatically"
|
11
13
|
option :pdftotext, desc: "Location of the pdftotext binary if not in PATH"
|
12
14
|
option :definitions, type: :boolean, desc: "Find and link definitions (this can be slow). Default: false"
|
13
15
|
def parse(name)
|
16
|
+
logging
|
17
|
+
|
14
18
|
Slaw::Extract::Extractor.pdftotext_path = options[:pdftotext] if options[:pdftotext]
|
15
19
|
extractor = Slaw::Extract::Extractor.new
|
16
20
|
|
@@ -31,6 +35,14 @@ class SlawCLI < Thor
|
|
31
35
|
|
32
36
|
puts act.to_xml(indent: 2)
|
33
37
|
end
|
38
|
+
|
39
|
+
no_commands do
|
40
|
+
def logging
|
41
|
+
logger = Log4r::Logger.new('Slaw')
|
42
|
+
logger.outputters = Log4r::Outputter.stderr
|
43
|
+
logger.outputters[0].level = options[:verbose] ? Log4r::DEBUG : Log4r::ERROR
|
44
|
+
end
|
45
|
+
end
|
34
46
|
end
|
35
47
|
|
36
48
|
SlawCLI.start(ARGV)
|
@@ -30,8 +30,6 @@ module Slaw
|
|
30
30
|
#
|
31
31
|
# @return [String] extracted text
|
32
32
|
def extract_from_file(filename)
|
33
|
-
ext = filename[-4..-1].downcase
|
34
|
-
|
35
33
|
mimetype = get_mimetype(filename)
|
36
34
|
|
37
35
|
case mimetype && mimetype.type
|
@@ -40,11 +38,11 @@ module Slaw
|
|
40
38
|
when 'text/plain', nil
|
41
39
|
extract_from_text(filename)
|
42
40
|
else
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
raise ArgumentError.new("Unsupported file type #{ext} (#{mimetype || unknown})")
|
41
|
+
text = extract_via_tika(filename)
|
42
|
+
if text.empty? or text.nil?
|
43
|
+
raise ArgumentError.new("Unsupported file type #{mimetype || 'unknown'}")
|
47
44
|
end
|
45
|
+
text
|
48
46
|
end
|
49
47
|
end
|
50
48
|
|
@@ -87,6 +85,20 @@ module Slaw
|
|
87
85
|
cleanup(File.read(filename))
|
88
86
|
end
|
89
87
|
|
88
|
+
# Extract text from +filename+ by sending it to apache tika
|
89
|
+
# http://tika.apache.org/
|
90
|
+
def extract_via_tika(filename)
|
91
|
+
# the Yomu gem falls over when trying to write large amounts of data
|
92
|
+
# the JVM stdin, so we manually call java ourselves, relying on yomu
|
93
|
+
# to supply the gem
|
94
|
+
require 'slaw/extract/yomu_patch'
|
95
|
+
logger.info("Using Tika to get text from #{filename}. You need a JVM installed for this.")
|
96
|
+
|
97
|
+
text = Yomu.text_from_file(filename)
|
98
|
+
logger.info("Tika returned #{text.length} bytes")
|
99
|
+
text
|
100
|
+
end
|
101
|
+
|
90
102
|
# Run general once-off cleanup of extracted text.
|
91
103
|
def cleanup(text)
|
92
104
|
text = @cleanser.cleanup(text)
|
data/lib/slaw/version.rb
CHANGED
data/slaw.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slaw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kempe
|
@@ -136,6 +136,20 @@ dependencies:
|
|
136
136
|
- - "~>"
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: 0.2.1
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: yomu
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: 0.2.2
|
146
|
+
type: :runtime
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: 0.2.2
|
139
153
|
description: Slaw is a lightweight library for rendering and generating Akoma Ntoso
|
140
154
|
acts from plain text and PDF documents.
|
141
155
|
email:
|
@@ -157,6 +171,7 @@ files:
|
|
157
171
|
- lib/slaw/bylaw.rb
|
158
172
|
- lib/slaw/collection.rb
|
159
173
|
- lib/slaw/extract/extractor.rb
|
174
|
+
- lib/slaw/extract/yomu_patch.rb
|
160
175
|
- lib/slaw/generator.rb
|
161
176
|
- lib/slaw/lifecycle_event.rb
|
162
177
|
- lib/slaw/logging.rb
|