ruby_tika_app 1.8.0 → 1.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/HISTORY +3 -0
- data/LICENSE +1 -1
- data/README.md +1 -1
- data/Rakefile +2 -0
- data/ext/{tika-app-1.19.1.jar → tika-app-1.23.jar} +0 -0
- data/lib/ruby_tika_app.rb +8 -12
- data/ruby_tika_app.gemspec +4 -4
- data/spec/ruby_tika_app_spec.rb +13 -2
- data/spec/spec_helper.rb +4 -2
- data/spec/support/test_server.rb +2 -0
- metadata +9 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f0427b0e689b3e45dfb50a6e97819438c306606f7b889e60da80752c21d72b22
|
4
|
+
data.tar.gz: c8cd7c7eee7e1159f550873351cbfb89ce35f342d4a81b21831c76baecb3e64a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c2fd6d9c927085051b5bda116d337869289a1a8616a0ce445c0914c4c223ca54f38726144dbad522fe7eb2e43d6eb3c7c520a6324b014c37eac41ecb91b4973
|
7
|
+
data.tar.gz: bcf71992f5372c8594a452fedce88f0d22f435104f7b63477ec49beaa62e9b0ec50bb73ee6939942cb84a963add608f64a80bbc294b17cc4f024232d3b24d901
|
data/Gemfile
CHANGED
data/HISTORY
CHANGED
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -6,7 +6,7 @@ This is a simple frontend to the Java Tika parser command line jar / app.
|
|
6
6
|
|
7
7
|
It is the same as running:
|
8
8
|
|
9
|
-
java -server -Djava.awt.headless=true -jar tika-app-1.19.1.jar FileToParse.pdf
|
9
|
+
java -server -Djava.awt.headless=true -Dfile.encoding=UTF-8 -jar tika-app-1.23.jar FileToParse.pdf
|
10
10
|
|
11
11
|
with options like --xml, --text, etc.
|
12
12
|
|
data/Rakefile
CHANGED
Binary file
|
data/lib/ruby_tika_app.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Based on the rake remote task code
|
2
4
|
|
3
5
|
require 'rubygems'
|
@@ -5,6 +7,8 @@ require 'stringio'
|
|
5
7
|
require 'open4'
|
6
8
|
|
7
9
|
class RubyTikaApp
|
10
|
+
TIKA_APP_VERSION = '1.23'
|
11
|
+
|
8
12
|
class Error < RuntimeError; end
|
9
13
|
|
10
14
|
class CommandFailedError < Error
|
@@ -22,9 +26,9 @@ class RubyTikaApp
|
|
22
26
|
end
|
23
27
|
|
24
28
|
java_cmd = 'java'
|
25
|
-
java_args = '-server -Djava.awt.headless=true'
|
29
|
+
java_args = '-server -Djava.awt.headless=true -Dfile.encoding=UTF-8'
|
26
30
|
ext_dir = File.join(File.dirname(__FILE__))
|
27
|
-
tika_path = "#{ext_dir}/../ext/tika-app-1.19.1.jar"
|
31
|
+
tika_path = "#{ext_dir}/../ext/tika-app-#{TIKA_APP_VERSION}.jar"
|
28
32
|
tika_config_path = "#{ext_dir}/../ext/tika-config.xml"
|
29
33
|
|
30
34
|
@tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}' --config='#{tika_config_path}'"
|
@@ -38,7 +42,7 @@ class RubyTikaApp
|
|
38
42
|
run_tika('--html')
|
39
43
|
end
|
40
44
|
|
41
|
-
def to_json
|
45
|
+
def to_json(*_args)
|
42
46
|
run_tika('--json')
|
43
47
|
end
|
44
48
|
|
@@ -64,7 +68,7 @@ class RubyTikaApp
|
|
64
68
|
stdout_result = stdout.read.strip
|
65
69
|
stderr_result = stderr.read.strip
|
66
70
|
|
67
|
-
|
71
|
+
if stdout_result.empty? && !stderr_result.empty?
|
68
72
|
raise(CommandFailedError.new(stderr_result),
|
69
73
|
"execution failed with status #{stderr_result}: #{final_cmd}")
|
70
74
|
end
|
@@ -75,12 +79,4 @@ class RubyTikaApp
|
|
75
79
|
stdout.close
|
76
80
|
stderr.close
|
77
81
|
end
|
78
|
-
|
79
|
-
def strip_stderr(err)
|
80
|
-
err
|
81
|
-
.gsub(/^(info|warn) - .*$/i, '')
|
82
|
-
.strip
|
83
|
-
.gsub(/Picked up JAVA_TOOL_OPTIONS: .+ -Dfile.encoding=UTF-8/i, '')
|
84
|
-
.strip
|
85
|
-
end
|
86
82
|
end
|
data/ruby_tika_app.gemspec
CHANGED
@@ -1,8 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
$LOAD_PATH.push File.expand_path('lib', __dir__)
|
2
4
|
|
3
5
|
Gem::Specification.new do |s|
|
4
6
|
s.name = 'ruby_tika_app'
|
5
|
-
s.version = '1.8.0'
|
7
|
+
s.version = '1.9.0'
|
6
8
|
s.platform = Gem::Platform::RUBY
|
7
9
|
s.authors = ['Chris Parker']
|
8
10
|
s.email = %w[mrcsparker@gmail.com]
|
@@ -10,8 +12,6 @@ Gem::Specification.new do |s|
|
|
10
12
|
s.summary = 'Wrapper around the tika-app jar'
|
11
13
|
s.description = 'Wrapper around the tika-app jar'
|
12
14
|
|
13
|
-
s.rubyforge_project = 'ruby_tika_app'
|
14
|
-
|
15
15
|
s.files = `git ls-files`.split("\n") +
|
16
16
|
%w[LICENSE README.md HISTORY]
|
17
17
|
s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
|
@@ -25,7 +25,7 @@ Gem::Specification.new do |s|
|
|
25
25
|
s.add_development_dependency('pry')
|
26
26
|
s.add_development_dependency('rack')
|
27
27
|
s.add_development_dependency('rake')
|
28
|
-
s.add_development_dependency('rspec', '~> 3.
|
28
|
+
s.add_development_dependency('rspec', '~> 3.9.0')
|
29
29
|
s.add_development_dependency('simplecov')
|
30
30
|
s.add_development_dependency('thin')
|
31
31
|
end
|
data/spec/ruby_tika_app_spec.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'spec_helper'
|
2
4
|
|
3
5
|
describe RubyTikaApp do
|
@@ -19,6 +21,15 @@ describe RubyTikaApp do
|
|
19
21
|
end
|
20
22
|
end
|
21
23
|
|
24
|
+
describe 'CommandFailedError' do
|
25
|
+
it 'is raised correctly' do
|
26
|
+
expect do
|
27
|
+
rta = RubyTikaApp.new('/file_not_found.pdf')
|
28
|
+
rta.to_text
|
29
|
+
end.to raise_error(RubyTikaApp::CommandFailedError)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
22
33
|
describe '#to_xml' do
|
23
34
|
it 'header' do
|
24
35
|
rta = RubyTikaApp.new(@test_file)
|
@@ -31,7 +42,7 @@ describe RubyTikaApp do
|
|
31
42
|
|
32
43
|
xml_size = xml.size / 2
|
33
44
|
|
34
|
-
expect(xml[xml_size..(xml_size + 100)]).to eq("
|
45
|
+
expect(xml[xml_size..(xml_size + 100)]).to eq("dology is also used in Frontier Sampling (FS).\nSince this is the only difference between MHRW and USD")
|
35
46
|
end
|
36
47
|
end
|
37
48
|
|
@@ -43,7 +54,7 @@ describe RubyTikaApp do
|
|
43
54
|
|
44
55
|
it 'middle' do
|
45
56
|
rta = RubyTikaApp.new(@test_file)
|
46
|
-
expect(rta.to_html[1000...1100]).to eq("
|
57
|
+
expect(rta.to_html[1000...1100]).to eq("nfo:modified\" content=\"2011-03-29T13:00:16Z\"/>\n<meta name=\"meta:save-date\" content=\"2011-03-29T13:00")
|
47
58
|
end
|
48
59
|
end
|
49
60
|
|
data/spec/spec_helper.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'simplecov'
|
2
4
|
SimpleCov.start
|
3
5
|
|
@@ -8,11 +10,11 @@ require 'ruby_tika_app'
|
|
8
10
|
require 'rspec'
|
9
11
|
|
10
12
|
# Include all files under spec/support
|
11
|
-
Dir['./spec/support/**/*.rb'].each { |f| require f }
|
13
|
+
Dir['./spec/support/**/*.rb'].sort.each { |f| require f }
|
12
14
|
|
13
15
|
# Start a local rack server to serve up test pages.
|
14
16
|
@server_thread = Thread.new do
|
15
|
-
Rack::Handler::Thin.run
|
17
|
+
Rack::Handler::Thin.run(MyApp::Test::Server.new, Port: 9299, Host: '127.0.0.1')
|
16
18
|
end
|
17
19
|
|
18
20
|
sleep(1) # wait a sec for the server to be booted
|
data/spec/support/test_server.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby_tika_app
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.8.0
|
4
|
+
version: 1.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Parker
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-02-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: open4
|
@@ -100,14 +100,14 @@ dependencies:
|
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: 3.
|
103
|
+
version: 3.9.0
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: 3.
|
110
|
+
version: 3.9.0
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: simplecov
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -150,7 +150,7 @@ files:
|
|
150
150
|
- LICENSE
|
151
151
|
- README.md
|
152
152
|
- Rakefile
|
153
|
-
- ext/tika-app-1.19.1.jar
|
153
|
+
- ext/tika-app-1.23.jar
|
154
154
|
- ext/tika-config.xml
|
155
155
|
- lib/ruby_tika_app.rb
|
156
156
|
- ruby_tika_app.gemspec
|
@@ -178,15 +178,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
178
178
|
- !ruby/object:Gem::Version
|
179
179
|
version: '0'
|
180
180
|
requirements: []
|
181
|
-
|
182
|
-
rubygems_version: 2.7.7
|
181
|
+
rubygems_version: 3.1.2
|
183
182
|
signing_key:
|
184
183
|
specification_version: 4
|
185
184
|
summary: Wrapper around the tika-app jar
|
186
185
|
test_files:
|
187
|
-
- spec/
|
188
|
-
- spec/docs/graph sampling simplex - 11.pdf
|
186
|
+
- spec/spec_helper.rb
|
189
187
|
- spec/docs/cnn.com
|
190
188
|
- spec/docs/news.ycombinator.com
|
189
|
+
- spec/docs/graph sampling simplex - 11.pdf
|
191
190
|
- spec/support/test_server.rb
|
192
|
-
- spec/
|
191
|
+
- spec/ruby_tika_app_spec.rb
|