slaw 0.5.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/bin/slaw +12 -0
- data/lib/slaw/extract/extractor.rb +18 -6
- data/lib/slaw/extract/yomu_patch.rb +9 -0
- data/lib/slaw/version.rb +1 -1
- data/slaw.gemspec +1 -0
- metadata +16 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26e7c12c3e421410a6be3b18c19f589915e920fb
|
4
|
+
data.tar.gz: e8c1e77994e30c194b42bcca11bbc3b4eace6f76
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3e229100f80879b9646135b9c9f6fab2c543a20cd89ed6b021a74164fb92874b876451571bc9552d3150a2953a00be8d927b63c3e7ad780dec800751bff086c3
|
7
|
+
data.tar.gz: c8d2d73f08b67a2535816d9b5ae8f8abfb7cd2627f3aef1688a986bd4e6ba1e71a01cb1c53c0a5b66d9f836191abef6c89e8f43767a12d8b949d329e635c1043
|
data/bin/slaw
CHANGED
@@ -6,11 +6,15 @@ require 'slaw'
|
|
6
6
|
class SlawCLI < Thor
|
7
7
|
# TODO: support different grammars and locales
|
8
8
|
|
9
|
+
class_option :verbose, type: :boolean, desc: "Display log output on stderr"
|
10
|
+
|
9
11
|
desc "parse FILE", "parse FILE into Akoma Ntoso XML"
|
10
12
|
option :input, enum: ['text', 'pdf'], desc: "Type of input if it can't be determined automatically"
|
11
13
|
option :pdftotext, desc: "Location of the pdftotext binary if not in PATH"
|
12
14
|
option :definitions, type: :boolean, desc: "Find and link definitions (this can be slow). Default: false"
|
13
15
|
def parse(name)
|
16
|
+
logging
|
17
|
+
|
14
18
|
Slaw::Extract::Extractor.pdftotext_path = options[:pdftotext] if options[:pdftotext]
|
15
19
|
extractor = Slaw::Extract::Extractor.new
|
16
20
|
|
@@ -31,6 +35,14 @@ class SlawCLI < Thor
|
|
31
35
|
|
32
36
|
puts act.to_xml(indent: 2)
|
33
37
|
end
|
38
|
+
|
39
|
+
no_commands do
|
40
|
+
def logging
|
41
|
+
logger = Log4r::Logger.new('Slaw')
|
42
|
+
logger.outputters = Log4r::Outputter.stderr
|
43
|
+
logger.outputters[0].level = options[:verbose] ? Log4r::DEBUG : Log4r::ERROR
|
44
|
+
end
|
45
|
+
end
|
34
46
|
end
|
35
47
|
|
36
48
|
SlawCLI.start(ARGV)
|
data/lib/slaw/extract/extractor.rb
CHANGED
@@ -30,8 +30,6 @@ module Slaw
|
|
30
30
|
#
|
31
31
|
# @return [String] extracted text
|
32
32
|
def extract_from_file(filename)
|
33
|
-
ext = filename[-4..-1].downcase
|
34
|
-
|
35
33
|
mimetype = get_mimetype(filename)
|
36
34
|
|
37
35
|
case mimetype && mimetype.type
|
@@ -40,11 +38,11 @@ module Slaw
|
|
40
38
|
when 'text/plain', nil
|
41
39
|
extract_from_text(filename)
|
42
40
|
else
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
raise ArgumentError.new("Unsupported file type #{ext} (#{mimetype || unknown})")
|
41
|
+
text = extract_via_tika(filename)
|
42
|
+
if text.empty? or text.nil?
|
43
|
+
raise ArgumentError.new("Unsupported file type #{mimetype || 'unknown'}")
|
47
44
|
end
|
45
|
+
text
|
48
46
|
end
|
49
47
|
end
|
50
48
|
|
@@ -87,6 +85,20 @@ module Slaw
|
|
87
85
|
cleanup(File.read(filename))
|
88
86
|
end
|
89
87
|
|
88
|
+
# Extract text from +filename+ by sending it to apache tika
|
89
|
+
# http://tika.apache.org/
|
90
|
+
def extract_via_tika(filename)
|
91
|
+
# the Yomu gem falls over when trying to write large amounts of data
|
92
|
+
# to the JVM stdin, so we manually call java ourselves, relying on yomu
|
93
|
+
# to supply the gem
|
94
|
+
require 'slaw/extract/yomu_patch'
|
95
|
+
logger.info("Using Tika to get text from #{filename}. You need a JVM installed for this.")
|
96
|
+
|
97
|
+
text = Yomu.text_from_file(filename)
|
98
|
+
logger.info("Tika returned #{text.length} bytes")
|
99
|
+
text
|
100
|
+
end
|
101
|
+
|
90
102
|
# Run general once-off cleanup of extracted text.
|
91
103
|
def cleanup(text)
|
92
104
|
text = @cleanser.cleanup(text)
|
data/lib/slaw/version.rb
CHANGED
data/slaw.gemspec
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: slaw
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.0
|
4
|
+
version: 0.5.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Greg Kempe
|
@@ -136,6 +136,20 @@ dependencies:
|
|
136
136
|
- - "~>"
|
137
137
|
- !ruby/object:Gem::Version
|
138
138
|
version: 0.2.1
|
139
|
+
- !ruby/object:Gem::Dependency
|
140
|
+
name: yomu
|
141
|
+
requirement: !ruby/object:Gem::Requirement
|
142
|
+
requirements:
|
143
|
+
- - "~>"
|
144
|
+
- !ruby/object:Gem::Version
|
145
|
+
version: 0.2.2
|
146
|
+
type: :runtime
|
147
|
+
prerelease: false
|
148
|
+
version_requirements: !ruby/object:Gem::Requirement
|
149
|
+
requirements:
|
150
|
+
- - "~>"
|
151
|
+
- !ruby/object:Gem::Version
|
152
|
+
version: 0.2.2
|
139
153
|
description: Slaw is a lightweight library for rendering and generating Akoma Ntoso
|
140
154
|
acts from plain text and PDF documents.
|
141
155
|
email:
|
@@ -157,6 +171,7 @@ files:
|
|
157
171
|
- lib/slaw/bylaw.rb
|
158
172
|
- lib/slaw/collection.rb
|
159
173
|
- lib/slaw/extract/extractor.rb
|
174
|
+
- lib/slaw/extract/yomu_patch.rb
|
160
175
|
- lib/slaw/generator.rb
|
161
176
|
- lib/slaw/lifecycle_event.rb
|
162
177
|
- lib/slaw/logging.rb
|