tika_wrapper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +35 -0
- data/.rubocop.yml +15 -0
- data/.rubocop_hound.yml +1063 -0
- data/.rubocop_todo.yml +0 -0
- data/.travis.yml +4 -0
- data/Gemfile +4 -0
- data/LICENSE +22 -0
- data/README.md +15 -0
- data/Rakefile +7 -0
- data/bin/console +14 -0
- data/bin/setup +7 -0
- data/coveralls.yml +1 -0
- data/exe/tika_wrapper +30 -0
- data/lib/tika_wrapper.rb +18 -0
- data/lib/tika_wrapper/instance.rb +234 -0
- data/lib/tika_wrapper/version.rb +3 -0
- data/spec/lib/tika_wrapper_spec.rb +15 -0
- data/spec/spec_helper.rb +9 -0
- data/tika_wrapper.gemspec +28 -0
- metadata +136 -0
data/.rubocop_todo.yml
ADDED
File without changes
|
data/.travis.yml
ADDED
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
The MIT License (MIT)
|
2
|
+
|
3
|
+
Copyright (c) 2015 Chris Beer
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
22
|
+
|
data/README.md
ADDED
data/Rakefile
ADDED
data/bin/console
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'bundler/setup'
|
4
|
+
require 'tika_wrapper'
|
5
|
+
|
6
|
+
# You can add fixtures and/or initialization code here to make experimenting
|
7
|
+
# with your gem easier. You can also use a different console, if you like.
|
8
|
+
|
9
|
+
# (If you use this, don't forget to add pry to your Gemfile!)
|
10
|
+
# require "pry"
|
11
|
+
# Pry.start
|
12
|
+
|
13
|
+
require 'irb'
|
14
|
+
IRB.start
|
data/bin/setup
ADDED
data/coveralls.yml
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
service_name: travis-ci
|
data/exe/tika_wrapper
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
#!/usr/bin/env ruby
|
2
|
+
|
3
|
+
require 'tika_wrapper'
|
4
|
+
require 'optparse'
|
5
|
+
|
6
|
+
# Command-line options collected for TikaWrapper.wrap.
options = {}

cli = OptionParser.new do |parser|
  parser.banner = "Usage: tika_wrapper [options]"

  parser.on("-v", "--[no-]verbose", "Run verbosely") { |flag| options[:verbose] = flag }

  parser.on("--version VERSION", "Specify a tika version to download (default: #{TikaWrapper.default_tika_version})") { |requested| options[:version] = requested }

  parser.on("-pPORT", "--port PORT", "Specify the port tika should run at (default: 9998)") { |number| options[:port] = number }
end
cli.parse!

# default to verbose
options[:verbose] = true if options[:verbose].nil?

# Keep the process in the foreground for as long as the service reports a
# truthy status; leaving the block lets the wrapper stop tika.
TikaWrapper.wrap(options) do |tika|
  sleep 1 while tika.status
end
|
data/lib/tika_wrapper.rb
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
require 'tika_wrapper/version'
|
2
|
+
require 'tika_wrapper/instance'
|
3
|
+
|
4
|
+
module TikaWrapper
  class << self
    ##
    # Tika version used when the caller does not request one
    def default_tika_version
      "1.8"
    end

    ##
    # Memoized TikaWrapper::Instance. The options of the first call are the
    # ones used to build it; later calls return that same instance.
    def default_instance(options = {})
      @default_instance ||= TikaWrapper::Instance.new(options)
    end

    ##
    # Ensures a tika service is running before executing the block
    def wrap(options = {}, &block)
      instance = default_instance(options)
      instance.wrap(&block)
    end
  end
end
|
data/lib/tika_wrapper/instance.rb
ADDED
@@ -0,0 +1,234 @@
|
|
1
|
+
require 'digest'
|
2
|
+
require 'fileutils'
|
3
|
+
require 'json'
|
4
|
+
require 'open-uri'
|
5
|
+
require 'ruby-progressbar'
|
6
|
+
require 'securerandom'
|
7
|
+
require 'stringio'
|
8
|
+
require 'tmpdir'
|
9
|
+
|
10
|
+
module TikaWrapper
  ##
  # Downloads a tika-server jar, verifies it against an MD5 checksum and
  # runs it as a child java process that can be started and stopped.
  class Instance
    attr_reader :options, :pid

    ##
    # @param [Hash] options
    # @option options [String] :url
    # @option options [String] :version
    # @option options [String] :port
    # @option options [String] :version_file
    # @option options [String] :instance_dir
    # @option options [String] :download_path
    # @option options [String] :md5sum
    # @option options [String] :tika_xml
    # @option options [Boolean] :verbose
    # @option options [Boolean] :managed
    # @option options [Boolean] :ignore_md5sum
    # @option options [Hash] :tika_options
    # @option options [Hash] :env
    def initialize(options = {})
      @options = options
    end

    ##
    # Start tika, yield this instance to the block and always call #stop
    # afterwards, even when the block raises.
    def wrap(&_block)
      start
      yield self
    ensure
      stop
    end

    ##
    # Start tika and wait for it to become available
    def start
      download
      return unless managed?

      exec(p: port)

      # Keep polling until the server answers. A single check-and-sleep
      # returns before the JVM has finished booting.
      sleep 1 until status
    end

    ##
    # Stop tika and wait for it to finish exiting
    def stop
      # Only signal a process this instance launched: without a recorded pid,
      # `pid.to_i` is 0 and Process.kill would signal the whole process group.
      if managed? && pid && started?
        Process.kill("KILL", pid.to_i)

        # Wait for tika to stop
        sleep 1 while status
      end

      @pid = nil
    end

    ##
    # Check the status of a managed tika service
    #
    # @return [Boolean] true when the instance is unmanaged or the
    #   "version" endpoint under #url can be opened; false on any StandardError
    def status
      return true unless managed?

      begin
        # URI#open (mixed in by open-uri) is used instead of Kernel#open,
        # which no longer opens URLs on Ruby 3.0+.
        URI.parse(url + "version").open { true }
      rescue StandardError
        false
      end
    end

    ##
    # Is tika running?
    def started?
      !!status
    end

    ##
    # Get the port this tika instance is running at
    #
    # @return [String] the :port option, or "9998" when it is not given
    def port
      options.fetch(:port, "9998").to_s
    end

    ##
    # Clean up any files tika_wrapper may have downloaded
    def clean!
      stop
      FileUtils.remove_entry(download_path) if File.exist? download_path
      FileUtils.remove_entry(md5sum_path) if File.exist? md5sum_path
    end

    ##
    # Get a (likely) URL to the tika instance
    def url
      "http://127.0.0.1:#{port}/"
    end

    protected

    ##
    # Fetch the tika jar unless a copy with a matching checksum already
    # exists at the download path.
    #
    # @return [String] path of the jar
    def download
      unless File.exist?(download_path) && validate?(download_path)
        fetch_with_progressbar download_url, download_path
        validate! download_path
      end

      download_path
    end

    ##
    # Does the MD5 digest of +file+ equal the expected checksum?
    def validate?(file)
      Digest::MD5.file(file).hexdigest == expected_md5sum
    end

    ##
    # Raise unless +file+ matches the expected checksum. The check is skipped
    # entirely when the :ignore_md5sum option is set.
    #
    # @raise [RuntimeError] "MD5 mismatch"
    def validate!(file)
      return if options[:ignore_md5sum] || validate?(file)

      raise "MD5 mismatch"
    end

    ##
    # Run the tika server
    #
    # @param [Hash] options extra command-line switches, merged over the
    #   :tika_options option; each pair becomes "-key value"
    def exec(options = {})
      switches = tika_options.merge(options).flat_map { |k, v| ["-#{k}", v.to_s] }
      # No shell is involved when popen receives an argument array, so a
      # ">&2" element would reach java as a literal argument; the child's
      # stderr is merged into its stdout through the :err option instead.
      args = ["java", "-jar", tika_binary] + switches
      # Keep a reference to the pipe so it stays open for the child.
      @io = IO.popen(env, args + [err: [:child, :out]])
      @pid = @io.pid
    end

    private

    # URL the jar is downloaded from (:url option or an Apache mirror).
    def download_url
      @download_url ||= options.fetch(:url, default_download_url)
    end

    # Ask the Apache mirror service for a preferred mirror and build the
    # jar URL from its JSON reply.
    def default_download_url
      @default_url ||= begin
        mirror_url = "http://www.apache.org/dyn/closer.cgi/tika/tika-server-#{version}.jar?asjson=true"
        json = URI.parse(mirror_url).open(&:read)
        doc = JSON.parse(json)
        doc['preferred'] + doc['path_info']
      end
    end

    # URL of the published MD5 checksum for the requested version.
    def md5url
      "http://archive.apache.org/dist/tika/tika-server-#{version}.jar.md5"
    end

    def version
      @version ||= options.fetch(:version, default_tika_version)
    end

    def tika_options
      options.fetch(:tika_options, {})
    end

    # Environment hash handed to IO.popen.
    def env
      options.fetch(:env, {})
    end

    def default_tika_version
      TikaWrapper.default_tika_version
    end

    def download_path
      @download_path ||= options.fetch(:download_path, default_download_path)
    end

    def default_download_path
      File.join(Dir.tmpdir, File.basename(download_url))
    end

    def tika_dir
      @tika_dir ||= options.fetch(:instance_dir, File.join(Dir.tmpdir, File.basename(download_url, ".jar")))
    end

    def verbose?
      !!options.fetch(:verbose, false)
    end

    def managed?
      !!options.fetch(:managed, true)
    end

    def version_file
      options.fetch(:version_file, File.join(tika_dir, "VERSION"))
    end

    # Expected checksum: the :md5sum option, otherwise the first
    # whitespace-separated token of the downloaded .md5 file. The block form
    # of fetch keeps the download from happening when :md5sum is supplied.
    def expected_md5sum
      @md5sum ||= options.fetch(:md5sum) { File.read(md5file).split(" ").first }
    end

    def tika_binary
      download_path
    end

    def md5sum_path
      File.join(Dir.tmpdir, File.basename(md5url))
    end

    def tmp_save_dir
      @tmp_save_dir ||= Dir.mktmpdir
    end

    # Stream +url+ into the file at +output+ while updating a progress bar.
    def fetch_with_progressbar(url, output)
      pbar = ProgressBar.create(title: File.basename(url), total: nil, format: "%t: |%B| %p%% (%e )")
      URI.parse(url).open(
        content_length_proc: ->(total) { pbar.total = total if total && total > 0 },
        progress_proc: ->(received) { pbar.progress = received }
      ) do |io|
        IO.copy_stream(io, output)
      end
    end

    # Download the .md5 file once and return its local path.
    def md5file
      fetch_with_progressbar md5url, md5sum_path unless File.exist? md5sum_path

      md5sum_path
    end
  end
end
|
data/spec/lib/tika_wrapper_spec.rb
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
describe TikaWrapper do
  describe ".wrap" do
    it "should launch tika" do
      TikaWrapper.wrap do |tika|
        # Opening and closing a TCP connection within 15 seconds must not raise.
        connect = lambda do
          Timeout::timeout(15) { TCPSocket.new('127.0.0.1', tika.port).close }
        end

        expect(&connect).not_to raise_exception
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
data/tika_wrapper.gemspec
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'tika_wrapper/version'
|
5
|
+
|
6
|
+
Gem::Specification.new do |gem|
  # Identity
  gem.name = "tika_wrapper"
  gem.version = TikaWrapper::VERSION
  gem.authors = ["Chris Beer"]
  gem.email = ["chris@cbeer.info"]
  gem.summary = "Tika service wrapper"
  gem.homepage = "https://github.com/cbeer/tika_wrapper"
  gem.license = "MIT"

  # Packaged files come from git; executables and tests are derived from them.
  gem.files = `git ls-files -z`.split("\x0")
  gem.bindir = 'exe'
  gem.executables = gem.files.grep(%r{^exe/}) { |path| File.basename(path) }
  gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
  gem.require_paths = ["lib"]

  # Runtime dependency
  gem.add_dependency "ruby-progressbar"

  # Development dependencies
  gem.add_development_dependency "bundler", "~> 1.7"
  gem.add_development_dependency "rake", "~> 10.0"

  gem.add_development_dependency "rspec"
  gem.add_development_dependency "coveralls"
end
|
metadata
ADDED
@@ -0,0 +1,136 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tika_wrapper
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Chris Beer
|
8
|
+
autorequire:
|
9
|
+
bindir: exe
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-06-03 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: ruby-progressbar
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: bundler
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '1.7'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '1.7'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rake
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '10.0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '10.0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - ">="
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - ">="
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: coveralls
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - ">="
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '0'
|
83
|
+
description:
|
84
|
+
email:
|
85
|
+
- chris@cbeer.info
|
86
|
+
executables:
|
87
|
+
- tika_wrapper
|
88
|
+
extensions: []
|
89
|
+
extra_rdoc_files: []
|
90
|
+
files:
|
91
|
+
- ".gitignore"
|
92
|
+
- ".rubocop.yml"
|
93
|
+
- ".rubocop_hound.yml"
|
94
|
+
- ".rubocop_todo.yml"
|
95
|
+
- ".travis.yml"
|
96
|
+
- Gemfile
|
97
|
+
- LICENSE
|
98
|
+
- README.md
|
99
|
+
- Rakefile
|
100
|
+
- bin/console
|
101
|
+
- bin/setup
|
102
|
+
- coveralls.yml
|
103
|
+
- exe/tika_wrapper
|
104
|
+
- lib/tika_wrapper.rb
|
105
|
+
- lib/tika_wrapper/instance.rb
|
106
|
+
- lib/tika_wrapper/version.rb
|
107
|
+
- spec/lib/tika_wrapper_spec.rb
|
108
|
+
- spec/spec_helper.rb
|
109
|
+
- tika_wrapper.gemspec
|
110
|
+
homepage: https://github.com/cbeer/tika_wrapper
|
111
|
+
licenses:
|
112
|
+
- MIT
|
113
|
+
metadata: {}
|
114
|
+
post_install_message:
|
115
|
+
rdoc_options: []
|
116
|
+
require_paths:
|
117
|
+
- lib
|
118
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
119
|
+
requirements:
|
120
|
+
- - ">="
|
121
|
+
- !ruby/object:Gem::Version
|
122
|
+
version: '0'
|
123
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
|
+
requirements:
|
125
|
+
- - ">="
|
126
|
+
- !ruby/object:Gem::Version
|
127
|
+
version: '0'
|
128
|
+
requirements: []
|
129
|
+
rubyforge_project:
|
130
|
+
rubygems_version: 2.4.5
|
131
|
+
signing_key:
|
132
|
+
specification_version: 4
|
133
|
+
summary: Tika service wrapper
|
134
|
+
test_files:
|
135
|
+
- spec/lib/tika_wrapper_spec.rb
|
136
|
+
- spec/spec_helper.rb
|