tika-masala 1.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +36 -0
- data/README.md +37 -0
- data/dependencies/tika-app-1.5.jar +0 -0
- data/lib/tika-masala.rb +1 -0
- data/lib/tika-masala/exceptions/tika_error.rb +11 -0
- data/lib/tika-masala/parser.rb +53 -0
- data/spec/fixtures/test.docx +0 -0
- data/spec/fixtures/test.pdf +0 -0
- data/spec/lib/parser_spec.rb +126 -0
- data/spec/spec_helper.rb +7 -0
- data/spec/support/spec_paths.rb +13 -0
- data/tika-masala.gemspec +15 -0
- metadata +75 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 0fb65303c39a65b4aa4c400c6799b29191129312
|
4
|
+
data.tar.gz: 243d05f8cf30c689d816eac20cbd2e623781cb20
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 63b7b6cff214a93f4dc4b8469e037d2ca9c2a106fe428aed195c7673e11974ba396fd9d121e68916b10a5c8211f0391f76a60c41ffff316ea6a5ba50fcb46758
|
7
|
+
data.tar.gz: a7ae5316c4f9aa14408af801dd9365728346afbff5387349ac4a14c8ca3867a3e86522884cc9b62fcee45dcbac5629bd8a225d887cb57025e2228ba5143a9d1c
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
tika-masala (1.5.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
coderay (1.1.0)
|
10
|
+
diff-lcs (1.2.5)
|
11
|
+
method_source (0.8.2)
|
12
|
+
pry (0.9.12.3)
|
13
|
+
coderay (~> 1.0)
|
14
|
+
method_source (~> 0.8)
|
15
|
+
slop (~> 3.4)
|
16
|
+
rspec (3.0.0)
|
17
|
+
rspec-core (~> 3.0.0)
|
18
|
+
rspec-expectations (~> 3.0.0)
|
19
|
+
rspec-mocks (~> 3.0.0)
|
20
|
+
rspec-core (3.0.4)
|
21
|
+
rspec-support (~> 3.0.0)
|
22
|
+
rspec-expectations (3.0.4)
|
23
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
24
|
+
rspec-support (~> 3.0.0)
|
25
|
+
rspec-mocks (3.0.4)
|
26
|
+
rspec-support (~> 3.0.0)
|
27
|
+
rspec-support (3.0.4)
|
28
|
+
slop (3.4.7)
|
29
|
+
|
30
|
+
PLATFORMS
|
31
|
+
ruby
|
32
|
+
|
33
|
+
DEPENDENCIES
|
34
|
+
pry
|
35
|
+
rspec (~> 3.0)
|
36
|
+
tika-masala!
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# TikaMasala
|
2
|
+
|
3
|
+
Simple Wrapper around Tika to parse documents.
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
parser = TikaMasala::Parser.new('/path/to/tika/jar/file')
|
9
|
+
parser.parse('/path/to/pdf')
|
10
|
+
```
|
11
|
+
|
12
|
+
Everything uses `java_exec` behind the scenes to pass arguments to Tika:
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
parser.java_exec('--detect', '/path/to/file')
|
16
|
+
```
|
17
|
+
|
18
|
+
## What if tika can't parse a document?
|
19
|
+
|
20
|
+
The exception raised contains both the stdout and stderr output, which means
|
21
|
+
that you can still retrieve part of the content you were trying to extract.
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
parser = TikaMasala::Parser.new('/path/to/tika/jar/file')
|
25
|
+
|
26
|
+
begin
|
27
|
+
parser.parse('/path/to/pdf')
|
28
|
+
rescue TikaMasala::TikaError => e
|
29
|
+
# Let's say it produced an error
|
30
|
+
e.stdout # contains the parsed text until it reached an error
|
31
|
+
e.stderr # contains the exception raised by Java
|
32
|
+
end
|
33
|
+
```
|
34
|
+
|
35
|
+
## Versions
|
36
|
+
|
37
|
+
The gem version matches the version of Tika it bundles.
|
Binary file
|
data/lib/tika-masala.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require_relative 'tika-masala/parser'
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'open3'
require 'shellwords'
require_relative 'exceptions/tika_error'
|
3
|
+
|
4
|
+
module TikaMasala
  # Raised by Parser#initialize when no file exists at the Tika jar path.
  class FileNotFound < StandardError; end

  # Runs the Tika command-line application (`java -jar <jar> ...`) in a
  # subprocess and returns whatever it prints on stdout.
  class Parser
    # Path of the Tika jar this parser invokes.
    attr_reader :jar_path

    # @param jar_path [String, nil] path to a tika-app jar. When nil, the
    #   tika-app-1.5.jar located in the `dependencies` directory, resolved
    #   relative to this source file, is used.
    # @raise [FileNotFound] if nothing exists at the resolved path.
    def initialize(jar_path = nil)
      if jar_path.nil?
        jar_path = File.expand_path(File.join('..', '..', '..', 'dependencies', 'tika-app-1.5.jar'), __FILE__)
      end

      # File.exist? rather than File.exists?: the latter was deprecated and
      # has been removed from Ruby 3.2.
      raise FileNotFound, "Jar #{jar_path} does not exist" unless File.exist?(jar_path)

      @jar_path = jar_path
    end

    # Runs Tika with `--text` on +file+ and returns its stdout.
    def parse(file)
      java_exec('--text', file)
    end

    # Runs Tika with `--metadata` on +file+ and returns its stdout.
    def metadata(file)
      java_exec('--metadata', file)
    end

    # Runs Tika with `--detect` on +file+ and returns its stdout.
    def detect_type(file)
      java_exec('--detect', file)
    end

    # Runs `java -jar <jar_path> <args...>` and returns what it wrote to stdout.
    #
    # Every argument (and the jar path) is shell-escaped before being
    # interpolated into the command line.
    #
    # @param args [Array<#to_s>] command-line arguments handed to Tika.
    # @return [String] the process' stdout when it exits with status 0.
    # @raise [TikaError] when the exit status is not 0; it is built with the
    #   captured stdout, stderr and exit status.
    def java_exec(*args)
      # Shellwords is called explicitly (and required at the top of the file)
      # instead of relying on String#shellescape / Array#shelljoin having been
      # loaded by some other library.
      command = "java -jar #{Shellwords.escape(@jar_path)} #{Shellwords.join(args)}"
      stdin, stdout, stderr, wait_thr = Open3.popen3(command)

      # Drain stdout *before* waiting for the process: once the child has
      # filled the pipe buffer it blocks until somebody reads, so waiting
      # first hangs forever on any document with a large amount of text.
      output = stdout.read
      exitstatus = wait_thr.value

      if exitstatus != 0
        # NOTE(review): stderr is only read once the process has exited, so a
        # child that writes more than a pipe buffer to stderr can still block.
        raise TikaError.new(
          stdout: output,
          stderr: stderr.read,
          exitstatus: exitstatus
        )
      end

      output
    ensure
      # The streams are nil when popen3 itself raised (e.g. `java` is not
      # installed); skip them so the original error is not masked.
      [stdin, stdout, stderr].compact.each(&:close)
    end
  end
end
|
Binary file
|
Binary file
|
@@ -0,0 +1,126 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for TikaMasala::Parser. Most examples run the bundled Tika jar
# against the fixture documents, so they need a working `java` executable.
RSpec.describe TikaMasala::Parser do
  subject { described_class.new(tika_jar) }

  let(:tika_jar) { jar_path('tika-app-1.5.jar') }
  let(:pdf_file) { fixtures_path('test.pdf') }
  let(:word_file) { fixtures_path('test.docx') }

  # A bare `respond_to(:parse)` only builds a matcher; it has to be applied
  # to the subject to assert anything.
  it { is_expected.to respond_to(:parse) }

  describe 'initialize' do
    subject { described_class }

    let(:fake_path) { '/no/such/file/exist' }

    it 'raises an exception if the file does not exist' do
      expect {
        subject.new(fake_path)
      }.to raise_error TikaMasala::FileNotFound, "Jar #{fake_path} does not exist"
    end

    context 'when no parameters are passed' do
      it 'sets the jar to the one in our dependencies folder' do
        expect(subject.new.jar_path).to eq(jar_path('tika-app-1.5.jar'))
      end
    end
  end

  describe '#parse' do
    it 'parses pdf files properly' do
      expect(subject.parse(pdf_file)).to match(/hello world/i)
    end

    it 'parses word files properly' do
      expect(subject.parse(word_file)).to match(/hello world/i)
    end

    context 'when the file does not exist' do
      it 'raises an exception' do
        expect { subject.parse(fixtures_path('file-dont-exist.ppt')) }.to raise_error(TikaMasala::TikaError)
      end
    end

    context 'when the file url contains reserved shell words' do
      # `?`, `=` and `&` are all special to the shell and must come out
      # backslash-escaped in the command line.
      let(:file_path) { 'http://example.com/?token=123&param=a' }
      let(:file_path_escaped) { 'http://example.com/\\?token\\=123\\&param\\=a' }
      let(:popen_result_double) do
        [
          double('stdin', close: true),
          double('stdout', close: true, read: ''),
          double('stderr', close: true),
          double('wait_thr', value: 0)
        ]
      end

      it 'escapes the string properly' do
        expect(Open3).to receive(:popen3).with("java -jar #{tika_jar} --text #{file_path_escaped}") do
          popen_result_double
        end

        subject.parse(file_path)
      end
    end
  end

  describe '#metadata' do
    it 'checks the metadata from the file' do
      expect(subject.metadata(word_file)).to match(/Content-Type.*word/i)
    end
  end

  describe '#detect_type' do
    it 'finds the file type' do
      expect(subject.detect_type(pdf_file)).to match(/application.pdf/i)
    end
  end

  describe '#java_exec' do
    it 'takes option directly to tika' do
      expect(subject.java_exec('--text', word_file)).to match(/hello world/i)
    end

    context 'when exit code is not zero' do
      # Stand-in for a failing Tika run: prints to stdout and stderr, exits 1.
      let(:command) { "echo '#{normal_message}'; echo '#{error_message}' 1>&2; exit 1;" }
      let(:error_message) { 'Some nasty error occurred' }
      let(:normal_message) { 'Hello world' }

      before do
        # Spawn the real failing process first, then make every later
        # Open3.popen3 call hand back its streams.
        popen_result = Open3.popen3(command)

        allow(Open3).to receive(:popen3) do |*args, &block|
          # Test the block handed to the stub itself; `block_given?` here
          # would not refer to it.
          if block
            block.call(*popen_result)
          else
            popen_result
          end
        end
      end

      it 'raises an exception containing stderr' do
        expect { subject.java_exec }.to raise_error(TikaMasala::TikaError, /#{error_message}/)
      end

      describe 'the exception raised' do
        # The TikaError raised by java_exec, or nil if nothing was raised.
        let(:raised_error) do
          begin
            subject.java_exec
            nil
          rescue TikaMasala::TikaError => e
            e
          end
        end

        it 'does not contain the normal message' do
          expect(raised_error.message).to_not include(normal_message)
        end

        it 'provides another attribute which contains the output of stdout' do
          expect(raised_error.stdout).to include(normal_message)
        end
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Helpers that turn project-relative path segments into absolute paths.
module SpecPaths
  # Absolute path of the given segments under the project root, i.e. the
  # directory two levels above the one containing this file.
  def root_path(*file)
    relative = File.join(*(['..', '..', '..'] + file))
    File.expand_path(relative, __FILE__)
  end

  # Absolute path of the given segments under `<root>/dependencies`.
  def jar_path(*file)
    segments = ['dependencies'] + file
    root_path(*segments)
  end

  # Absolute path of the given segments under `<root>/spec/fixtures`.
  def fixtures_path(*file)
    segments = %w[spec fixtures] + file
    root_path(*segments)
  end
end
|
data/tika-masala.gemspec
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Gem specification for tika-masala. The packaged file lists are taken from
# git, so building the gem has to happen inside the git checkout.
Gem::Specification.new do |spec|
  spec.name        = 'tika-masala'
  spec.version     = '1.5.0'
  spec.date        = '2014-03-31'
  spec.summary     = 'Tika wrapper for MRI'
  spec.description = 'Provides ruby wrapper around tika command line tools'
  spec.authors     = ['Vincent Bonmalais']
  spec.email       = 'vincent.bonmalais@econsultancy.com'
  spec.homepage    = 'https://github.com/econsultancy/tika-masala'
  spec.license     = 'MIT'

  # Every file tracked by git ships with the gem; the spec directory doubles
  # as the test file list.
  spec.files      = `git ls-files`.split("\n")
  spec.test_files = `git ls-files -- spec`.split("\n")

  spec.add_development_dependency 'rspec', '~>3.0'
end
|
metadata
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tika-masala
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.5.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Vincent Bonmalais
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-03-31 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '3.0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '3.0'
|
27
|
+
description: Provides ruby wrapper around tika command line tools
|
28
|
+
email: vincent.bonmalais@econsultancy.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- Gemfile
|
34
|
+
- Gemfile.lock
|
35
|
+
- README.md
|
36
|
+
- dependencies/tika-app-1.5.jar
|
37
|
+
- lib/tika-masala.rb
|
38
|
+
- lib/tika-masala/exceptions/tika_error.rb
|
39
|
+
- lib/tika-masala/parser.rb
|
40
|
+
- spec/fixtures/test.docx
|
41
|
+
- spec/fixtures/test.pdf
|
42
|
+
- spec/lib/parser_spec.rb
|
43
|
+
- spec/spec_helper.rb
|
44
|
+
- spec/support/spec_paths.rb
|
45
|
+
- tika-masala.gemspec
|
46
|
+
homepage: https://github.com/econsultancy/tika-masala
|
47
|
+
licenses:
|
48
|
+
- MIT
|
49
|
+
metadata: {}
|
50
|
+
post_install_message:
|
51
|
+
rdoc_options: []
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - '>='
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: '0'
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - '>='
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
requirements: []
|
65
|
+
rubyforge_project:
|
66
|
+
rubygems_version: 2.2.1
|
67
|
+
signing_key:
|
68
|
+
specification_version: 4
|
69
|
+
summary: Tika wrapper for MRI
|
70
|
+
test_files:
|
71
|
+
- spec/fixtures/test.docx
|
72
|
+
- spec/fixtures/test.pdf
|
73
|
+
- spec/lib/parser_spec.rb
|
74
|
+
- spec/spec_helper.rb
|
75
|
+
- spec/support/spec_paths.rb
|