ruby_tika_app 1.8.0 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +2 -0
- data/HISTORY +3 -0
- data/LICENSE +1 -1
- data/README.md +1 -1
- data/Rakefile +2 -0
- data/ext/{tika-app-1.19.1.jar → tika-app-1.23.jar} +0 -0
- data/lib/ruby_tika_app.rb +8 -12
- data/ruby_tika_app.gemspec +4 -4
- data/spec/ruby_tika_app_spec.rb +13 -2
- data/spec/spec_helper.rb +4 -2
- data/spec/support/test_server.rb +2 -0
- metadata +9 -10
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: f0427b0e689b3e45dfb50a6e97819438c306606f7b889e60da80752c21d72b22
|
4
|
+
data.tar.gz: c8cd7c7eee7e1159f550873351cbfb89ce35f342d4a81b21831c76baecb3e64a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1c2fd6d9c927085051b5bda116d337869289a1a8616a0ce445c0914c4c223ca54f38726144dbad522fe7eb2e43d6eb3c7c520a6324b014c37eac41ecb91b4973
|
7
|
+
data.tar.gz: bcf71992f5372c8594a452fedce88f0d22f435104f7b63477ec49beaa62e9b0ec50bb73ee6939942cb84a963add608f64a80bbc294b17cc4f024232d3b24d901
|
data/Gemfile
CHANGED
data/HISTORY
CHANGED
data/LICENSE
CHANGED
data/README.md
CHANGED
@@ -6,7 +6,7 @@ This is a simple frontend to the Java Tika parser command line jar / app.
|
|
6
6
|
|
7
7
|
It is the same as running:
|
8
8
|
|
9
|
-
java -server -Djava.awt.headless=true -jar tika-app-
|
9
|
+
java -server -Djava.awt.headless=true -Dfile.encoding=UTF-8 -jar tika-app-1.23.jar FileToParse.pdf
|
10
10
|
|
11
11
|
with options like --xml, --text, etc.
|
12
12
|
|
data/Rakefile
CHANGED
Binary file
|
data/lib/ruby_tika_app.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Based on the rake remote task code
|
2
4
|
|
3
5
|
require 'rubygems'
|
@@ -5,6 +7,8 @@ require 'stringio'
|
|
5
7
|
require 'open4'
|
6
8
|
|
7
9
|
class RubyTikaApp
|
10
|
+
TIKA_APP_VERSION = '1.23'
|
11
|
+
|
8
12
|
class Error < RuntimeError; end
|
9
13
|
|
10
14
|
class CommandFailedError < Error
|
@@ -22,9 +26,9 @@ class RubyTikaApp
|
|
22
26
|
end
|
23
27
|
|
24
28
|
java_cmd = 'java'
|
25
|
-
java_args = '-server -Djava.awt.headless=true'
|
29
|
+
java_args = '-server -Djava.awt.headless=true -Dfile.encoding=UTF-8'
|
26
30
|
ext_dir = File.join(File.dirname(__FILE__))
|
27
|
-
tika_path = "#{ext_dir}/../ext/tika-app-1.19.1.jar"
|
31
|
+
tika_path = "#{ext_dir}/../ext/tika-app-#{TIKA_APP_VERSION}.jar"
|
28
32
|
tika_config_path = "#{ext_dir}/../ext/tika-config.xml"
|
29
33
|
|
30
34
|
@tika_cmd = "#{java_cmd} #{java_args} -jar '#{tika_path}' --config='#{tika_config_path}'"
|
@@ -38,7 +42,7 @@ class RubyTikaApp
|
|
38
42
|
run_tika('--html')
|
39
43
|
end
|
40
44
|
|
41
|
-
def to_json
|
45
|
+
def to_json(*_args)
|
42
46
|
run_tika('--json')
|
43
47
|
end
|
44
48
|
|
@@ -64,7 +68,7 @@ class RubyTikaApp
|
|
64
68
|
stdout_result = stdout.read.strip
|
65
69
|
stderr_result = stderr.read.strip
|
66
70
|
|
67
|
-
|
71
|
+
if stdout_result.empty? && !stderr_result.empty?
|
68
72
|
raise(CommandFailedError.new(stderr_result),
|
69
73
|
"execution failed with status #{stderr_result}: #{final_cmd}")
|
70
74
|
end
|
@@ -75,12 +79,4 @@ class RubyTikaApp
|
|
75
79
|
stdout.close
|
76
80
|
stderr.close
|
77
81
|
end
|
78
|
-
|
79
|
-
def strip_stderr(err)
|
80
|
-
err
|
81
|
-
.gsub(/^(info|warn) - .*$/i, '')
|
82
|
-
.strip
|
83
|
-
.gsub(/Picked up JAVA_TOOL_OPTIONS: .+ -Dfile.encoding=UTF-8/i, '')
|
84
|
-
.strip
|
85
|
-
end
|
86
82
|
end
|
data/ruby_tika_app.gemspec
CHANGED
@@ -1,8 +1,10 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
$LOAD_PATH.push File.expand_path('lib', __dir__)
|
2
4
|
|
3
5
|
Gem::Specification.new do |s|
|
4
6
|
s.name = 'ruby_tika_app'
|
5
|
-
s.version = '1.8.0'
|
7
|
+
s.version = '1.9.0'
|
6
8
|
s.platform = Gem::Platform::RUBY
|
7
9
|
s.authors = ['Chris Parker']
|
8
10
|
s.email = %w[mrcsparker@gmail.com]
|
@@ -10,8 +12,6 @@ Gem::Specification.new do |s|
|
|
10
12
|
s.summary = 'Wrapper around the tika-app jar'
|
11
13
|
s.description = 'Wrapper around the tika-app jar'
|
12
14
|
|
13
|
-
s.rubyforge_project = 'ruby_tika_app'
|
14
|
-
|
15
15
|
s.files = `git ls-files`.split("\n") +
|
16
16
|
%w[LICENSE README.md HISTORY]
|
17
17
|
s.executables = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
|
@@ -25,7 +25,7 @@ Gem::Specification.new do |s|
|
|
25
25
|
s.add_development_dependency('pry')
|
26
26
|
s.add_development_dependency('rack')
|
27
27
|
s.add_development_dependency('rake')
|
28
|
-
s.add_development_dependency('rspec', '~> 3.
|
28
|
+
s.add_development_dependency('rspec', '~> 3.9.0')
|
29
29
|
s.add_development_dependency('simplecov')
|
30
30
|
s.add_development_dependency('thin')
|
31
31
|
end
|
data/spec/ruby_tika_app_spec.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'spec_helper'
|
2
4
|
|
3
5
|
describe RubyTikaApp do
|
@@ -19,6 +21,15 @@ describe RubyTikaApp do
|
|
19
21
|
end
|
20
22
|
end
|
21
23
|
|
24
|
+
describe 'CommandFailedError' do
|
25
|
+
it 'is raised correctly' do
|
26
|
+
expect do
|
27
|
+
rta = RubyTikaApp.new('/file_not_found.pdf')
|
28
|
+
rta.to_text
|
29
|
+
end.to raise_error(RubyTikaApp::CommandFailedError)
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
22
33
|
describe '#to_xml' do
|
23
34
|
it 'header' do
|
24
35
|
rta = RubyTikaApp.new(@test_file)
|
@@ -31,7 +42,7 @@ describe RubyTikaApp do
|
|
31
42
|
|
32
43
|
xml_size = xml.size / 2
|
33
44
|
|
34
|
-
expect(xml[xml_size..(xml_size + 100)]).to eq("
|
45
|
+
expect(xml[xml_size..(xml_size + 100)]).to eq("dology is also used in Frontier Sampling (FS).\nSince this is the only difference between MHRW and USD")
|
35
46
|
end
|
36
47
|
end
|
37
48
|
|
@@ -43,7 +54,7 @@ describe RubyTikaApp do
|
|
43
54
|
|
44
55
|
it 'middle' do
|
45
56
|
rta = RubyTikaApp.new(@test_file)
|
46
|
-
expect(rta.to_html[1000...1100]).to eq("
|
57
|
+
expect(rta.to_html[1000...1100]).to eq("nfo:modified\" content=\"2011-03-29T13:00:16Z\"/>\n<meta name=\"meta:save-date\" content=\"2011-03-29T13:00")
|
47
58
|
end
|
48
59
|
end
|
49
60
|
|
data/spec/spec_helper.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
require 'simplecov'
|
2
4
|
SimpleCov.start
|
3
5
|
|
@@ -8,11 +10,11 @@ require 'ruby_tika_app'
|
|
8
10
|
require 'rspec'
|
9
11
|
|
10
12
|
# Include all files under spec/support
|
11
|
-
Dir['./spec/support/**/*.rb'].each { |f| require f }
|
13
|
+
Dir['./spec/support/**/*.rb'].sort.each { |f| require f }
|
12
14
|
|
13
15
|
# Start a local rack server to serve up test pages.
|
14
16
|
@server_thread = Thread.new do
|
15
|
-
Rack::Handler::Thin.run
|
17
|
+
Rack::Handler::Thin.run(MyApp::Test::Server.new, Port: 9299, Host: '127.0.0.1')
|
16
18
|
end
|
17
19
|
|
18
20
|
sleep(1) # wait a sec for the server to be booted
|
data/spec/support/test_server.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: ruby_tika_app
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.8.0
|
4
|
+
version: 1.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Chris Parker
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2020-02-04 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: open4
|
@@ -100,14 +100,14 @@ dependencies:
|
|
100
100
|
requirements:
|
101
101
|
- - "~>"
|
102
102
|
- !ruby/object:Gem::Version
|
103
|
-
version: 3.
|
103
|
+
version: 3.9.0
|
104
104
|
type: :development
|
105
105
|
prerelease: false
|
106
106
|
version_requirements: !ruby/object:Gem::Requirement
|
107
107
|
requirements:
|
108
108
|
- - "~>"
|
109
109
|
- !ruby/object:Gem::Version
|
110
|
-
version: 3.
|
110
|
+
version: 3.9.0
|
111
111
|
- !ruby/object:Gem::Dependency
|
112
112
|
name: simplecov
|
113
113
|
requirement: !ruby/object:Gem::Requirement
|
@@ -150,7 +150,7 @@ files:
|
|
150
150
|
- LICENSE
|
151
151
|
- README.md
|
152
152
|
- Rakefile
|
153
|
-
- ext/tika-app-1.19.1.jar
|
153
|
+
- ext/tika-app-1.23.jar
|
154
154
|
- ext/tika-config.xml
|
155
155
|
- lib/ruby_tika_app.rb
|
156
156
|
- ruby_tika_app.gemspec
|
@@ -178,15 +178,14 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
178
178
|
- !ruby/object:Gem::Version
|
179
179
|
version: '0'
|
180
180
|
requirements: []
|
181
|
-
|
182
|
-
rubygems_version: 2.7.7
|
181
|
+
rubygems_version: 3.1.2
|
183
182
|
signing_key:
|
184
183
|
specification_version: 4
|
185
184
|
summary: Wrapper around the tika-app jar
|
186
185
|
test_files:
|
187
|
-
- spec/
|
188
|
-
- spec/docs/graph sampling simplex - 11.pdf
|
186
|
+
- spec/spec_helper.rb
|
189
187
|
- spec/docs/cnn.com
|
190
188
|
- spec/docs/news.ycombinator.com
|
189
|
+
- spec/docs/graph sampling simplex - 11.pdf
|
191
190
|
- spec/support/test_server.rb
|
192
|
-
- spec/
|
191
|
+
- spec/ruby_tika_app_spec.rb
|