tika-masala 1.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/Gemfile +7 -0
- data/Gemfile.lock +36 -0
- data/README.md +37 -0
- data/dependencies/tika-app-1.5.jar +0 -0
- data/lib/tika-masala.rb +1 -0
- data/lib/tika-masala/exceptions/tika_error.rb +11 -0
- data/lib/tika-masala/parser.rb +53 -0
- data/spec/fixtures/test.docx +0 -0
- data/spec/fixtures/test.pdf +0 -0
- data/spec/lib/parser_spec.rb +126 -0
- data/spec/spec_helper.rb +7 -0
- data/spec/support/spec_paths.rb +13 -0
- data/tika-masala.gemspec +15 -0
- metadata +75 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: 0fb65303c39a65b4aa4c400c6799b29191129312
|
4
|
+
data.tar.gz: 243d05f8cf30c689d816eac20cbd2e623781cb20
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 63b7b6cff214a93f4dc4b8469e037d2ca9c2a106fe428aed195c7673e11974ba396fd9d121e68916b10a5c8211f0391f76a60c41ffff316ea6a5ba50fcb46758
|
7
|
+
data.tar.gz: a7ae5316c4f9aa14408af801dd9365728346afbff5387349ac4a14c8ca3867a3e86522884cc9b62fcee45dcbac5629bd8a225d887cb57025e2228ba5143a9d1c
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
tika-masala (1.5.0)
|
5
|
+
|
6
|
+
GEM
|
7
|
+
remote: https://rubygems.org/
|
8
|
+
specs:
|
9
|
+
coderay (1.1.0)
|
10
|
+
diff-lcs (1.2.5)
|
11
|
+
method_source (0.8.2)
|
12
|
+
pry (0.9.12.3)
|
13
|
+
coderay (~> 1.0)
|
14
|
+
method_source (~> 0.8)
|
15
|
+
slop (~> 3.4)
|
16
|
+
rspec (3.0.0)
|
17
|
+
rspec-core (~> 3.0.0)
|
18
|
+
rspec-expectations (~> 3.0.0)
|
19
|
+
rspec-mocks (~> 3.0.0)
|
20
|
+
rspec-core (3.0.4)
|
21
|
+
rspec-support (~> 3.0.0)
|
22
|
+
rspec-expectations (3.0.4)
|
23
|
+
diff-lcs (>= 1.2.0, < 2.0)
|
24
|
+
rspec-support (~> 3.0.0)
|
25
|
+
rspec-mocks (3.0.4)
|
26
|
+
rspec-support (~> 3.0.0)
|
27
|
+
rspec-support (3.0.4)
|
28
|
+
slop (3.4.7)
|
29
|
+
|
30
|
+
PLATFORMS
|
31
|
+
ruby
|
32
|
+
|
33
|
+
DEPENDENCIES
|
34
|
+
pry
|
35
|
+
rspec (~> 3.0)
|
36
|
+
tika-masala!
|
data/README.md
ADDED
@@ -0,0 +1,37 @@
|
|
1
|
+
# TikaMasala
|
2
|
+
|
3
|
+
Simple Wrapper around Tika to parse documents.
|
4
|
+
|
5
|
+
## Usage
|
6
|
+
|
7
|
+
```ruby
|
8
|
+
parser = TikaMasala::Parser.new('/path/to/tika/jar/file')
|
9
|
+
parser.parse('/path/to/pdf')
|
10
|
+
```
|
11
|
+
|
12
|
+
Everything uses `java_exec` behind the scenes to pass arguments:
|
13
|
+
|
14
|
+
```ruby
|
15
|
+
parser.java_exec('--detect', '/path/to/file')
|
16
|
+
```
|
17
|
+
|
18
|
+
## What if tika can't parse a document?
|
19
|
+
|
20
|
+
The exception thrown contains both the stdout and stderr output, which means
|
21
|
+
that you can still retrieve part of the content you were trying to extract.
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
parser = TikaMasala::Parser.new('/path/to/tika/jar/file')
|
25
|
+
|
26
|
+
begin
|
27
|
+
parser.parse('/path/to/pdf')
|
28
|
+
rescue TikaMasala::TikaError => e
|
29
|
+
# Let's say it produced an error
|
30
|
+
e.stdout # contains the parsed text until it reached an error
|
31
|
+
e.stderr # contains the exception raised by Java
|
32
|
+
end
|
33
|
+
```
|
34
|
+
|
35
|
+
## Versions
|
36
|
+
|
37
|
+
The version matches the version of Tika distributed.
|
Binary file
|
data/lib/tika-masala.rb
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require_relative 'tika-masala/parser'
|
@@ -0,0 +1,53 @@
|
|
1
|
+
require 'open3'
require 'shellwords'
require_relative 'exceptions/tika_error'
|
3
|
+
|
4
|
+
module TikaMasala
  # Raised by Parser#initialize when no file exists at the jar path.
  class FileNotFound < StandardError; end

  # Runs the Tika command-line application (`java -jar <jar> ...`) and hands
  # back whatever it printed on stdout.
  class Parser
    # @return [String] path of the Tika jar every command is run against
    attr_reader :jar_path

    # @param jar_path [String, nil] path to a Tika application jar; when nil,
    #   `dependencies/tika-app-1.5.jar` resolved relative to this file is used
    # @raise [FileNotFound] if no file exists at the resolved path
    def initialize(jar_path = nil)
      if jar_path.nil?
        jar_path = File.expand_path(File.join('..', '..', '..', 'dependencies', 'tika-app-1.5.jar'), __FILE__)
      end

      # File.exists? was deprecated for years and removed in Ruby 3.2.
      raise FileNotFound, "Jar #{jar_path} does not exist" unless File.exist?(jar_path)

      @jar_path = jar_path
    end

    # Runs Tika with `--text` on +file+.
    #
    # @param file [String] document location, passed to Tika as-is
    # @return [String] Tika's stdout
    # @raise [TikaError] if Tika exits with a non-zero status
    def parse(file)
      java_exec('--text', file)
    end

    # Runs Tika with `--metadata` on +file+.
    #
    # @param file [String] document location, passed to Tika as-is
    # @return [String] Tika's stdout
    # @raise [TikaError] if Tika exits with a non-zero status
    def metadata(file)
      java_exec('--metadata', file)
    end

    # Runs Tika with `--detect` on +file+.
    #
    # @param file [String] document location, passed to Tika as-is
    # @return [String] Tika's stdout
    # @raise [TikaError] if Tika exits with a non-zero status
    def detect_type(file)
      java_exec('--detect', file)
    end

    # Runs `java -jar <jar_path> <args...>` and returns its stdout.
    #
    # Every argument (and the jar path) is shell-escaped, so values containing
    # spaces or shell metacharacters reach Tika unchanged.
    #
    # @param args [Array<#to_s>] command-line arguments forwarded to Tika
    # @return [String] everything the process wrote to stdout
    # @raise [TikaError] if the process exits with a non-zero status; the error
    #   is built with the captured stdout, stderr and exit status
    def java_exec(*args)
      command = Shellwords.join(['java', '-jar', @jar_path, *args])
      stdin, stdout, stderr, wait_thr = Open3.popen3(command)

      begin
        # Nothing is ever written to the child; closing stdin gives it EOF
        # instead of letting it wait on input that never arrives.
        stdin.close

        # Both pipes must be drained *before* waiting for the exit status:
        # a child that fills either pipe buffer blocks on write, and waiting
        # for it first would then hang forever. stderr is read in a thread so
        # it cannot stall while stdout is being read.
        stderr_reader = Thread.new { stderr.read }
        output = stdout.read
        errors = stderr_reader.value
        exitstatus = wait_thr.value

        # Compared against 0 (rather than calling #success?) so that a plain
        # integer status is accepted as well as a Process::Status.
        if exitstatus != 0
          raise TikaError.new(stdout: output, stderr: errors, exitstatus: exitstatus)
        end

        output
      ensure
        # Only reached once popen3 succeeded, so the streams are never nil.
        stdout.close
        stderr.close
      end
    end
  end
end
|
Binary file
|
Binary file
|
@@ -0,0 +1,126 @@
|
|
1
|
+
require 'spec_helper'
|
2
|
+
|
3
|
+
# Specs for TikaMasala::Parser. Most examples run the real Tika jar against the
# fixture documents; the rest stub Open3.popen3 to control the child process.
RSpec.describe TikaMasala::Parser do
  subject { described_class.new(tika_jar) }

  let(:tika_jar) { jar_path('tika-app-1.5.jar') }
  let(:pdf_file) { fixtures_path('test.pdf') }
  let(:word_file) { fixtures_path('test.docx') }

  # A bare `respond_to(:parse)` only builds a matcher; it must be handed to an
  # expectation to assert anything.
  it { is_expected.to respond_to(:parse) }

  describe 'initialize' do
    subject { described_class }

    let(:fake_path) { '/no/such/file/exist' }

    it 'raises an exception if the file does not exist' do
      expect {
        subject.new(fake_path)
      }.to raise_error TikaMasala::FileNotFound, "Jar #{fake_path} does not exist"
    end

    context 'when no parameters are passed' do
      it 'sets the jar to the one in our dependencies folder' do
        expect(subject.new.jar_path).to eq(jar_path('tika-app-1.5.jar'))
      end
    end
  end

  describe '#parse' do
    it 'parses pdf files properly' do
      expect(subject.parse(pdf_file)).to match(/hello world/i)
    end

    it 'parses word files properly' do
      expect(subject.parse(word_file)).to match(/hello world/i)
    end

    context 'when the file does not exist' do
      it 'raises an exception' do
        expect { subject.parse(fixtures_path('file-dont-exist.ppt')) }.to raise_error(TikaMasala::TikaError)
      end
    end

    context 'when the file url contains reserved shell words' do
      # `?`, `=` and `&` all have to be backslash-escaped for the shell.
      let(:file_path) { 'http://example.com/?token=123&param=a' }
      let(:file_path_escaped) { 'http://example.com/\\?token\\=123\\&param\\=a' }
      let(:popen_result_double) do
        [
          double('stdin', close: true),
          double('stdout', close: true, read: ''),
          # `read` is stubbed so the double keeps working whether or not the
          # parser drains stderr on the success path.
          double('stderr', close: true, read: ''),
          double('wait_thr', value: 0)
        ]
      end

      it 'escapes the string properly' do
        expect(Open3).to receive(:popen3)
          .with("java -jar #{tika_jar} --text #{file_path_escaped}")
          .and_return(popen_result_double)

        subject.parse(file_path)
      end
    end
  end

  describe '#metadata' do
    it 'checks the metadata from the file' do
      expect(subject.metadata(word_file)).to match(/Content-Type.*word/i)
    end
  end

  describe '#detect_type' do
    it 'finds the file type' do
      expect(subject.detect_type(pdf_file)).to match(/application.pdf/i)
    end
  end

  describe '#java_exec' do
    it 'takes option directly to tika' do
      expect(subject.java_exec('--text', word_file)).to match(/hello world/i)
    end

    context 'when exit code is not zero' do
      let(:command) { "echo '#{normal_message}'; echo '#{error_message}' 1>&2; exit 1;" }
      let(:error_message) { 'Some nasty error occured' }
      let(:normal_message) { 'Hello world' }

      before do
        # Spawn the failing shell command for real, then make every later
        # popen3 call hand back that process instead of starting java.
        popen_result = Open3.popen3(command)

        allow(Open3).to receive(:popen3) do |*_args, &block|
          # Test the captured block directly: `block_given?` in here would not
          # look at the block passed to the stubbed call.
          block ? block.call(*popen_result) : popen_result
        end
      end

      it 'raises an exception containing stderr' do
        expect { subject.java_exec }.to raise_error(TikaMasala::TikaError, /#{error_message}/)
      end

      describe 'the expection raised' do
        # The TikaError raised by java_exec, captured so its attributes can be
        # inspected.
        let(:raised_error) do
          begin
            subject.java_exec
          rescue TikaMasala::TikaError => e
            e
          end
        end

        it 'does not contain the normal message' do
          expect(raised_error.message).to_not include(normal_message)
        end

        it 'provides another attribute which contains the ouput of stdout' do
          expect(raised_error.stdout).to include(normal_message)
        end
      end
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# Helpers that build absolute paths for the specs. Everything is resolved from
# the directory two levels above the one containing this file.
module SpecPaths
  # @param file [Array<String>] path segments below the base directory
  # @return [String] absolute path of the segments under the base directory
  def root_path(*file)
    relative = File.join('..', '..', '..', *file)
    File.expand_path(relative, __FILE__)
  end

  # @param file [Array<String>] path segments below `dependencies/`
  # @return [String] absolute path inside the `dependencies` directory
  def jar_path(*file)
    root_path('dependencies', *file)
  end

  # @param file [Array<String>] path segments below `spec/fixtures/`
  # @return [String] absolute path inside the `spec/fixtures` directory
  def fixtures_path(*file)
    root_path('spec', 'fixtures', *file)
  end
end
|
data/tika-masala.gemspec
ADDED
@@ -0,0 +1,15 @@
|
|
1
|
+
# Gem specification for tika-masala.
Gem::Specification.new do |spec|
  # Identity
  spec.name        = 'tika-masala'
  spec.version     = '1.5.0'
  spec.date        = '2014-03-31'
  spec.license     = 'MIT'

  # Human-readable description
  spec.summary     = "Tika wrapper for MRI"
  spec.description = "Provides ruby wrapper around tika command line tools"
  spec.homepage    = 'https://github.com/econsultancy/tika-masala'

  # Maintainer
  spec.authors     = ["Vincent Bonmalais"]
  spec.email       = 'vincent.bonmalais@econsultancy.com'

  # Packaged files: whatever git tracks, with the spec directory also listed
  # as the test files.
  spec.files       = `git ls-files`.split("\n")
  spec.test_files  = `git ls-files -- spec`.split("\n")

  # Only needed to run the test suite.
  spec.add_development_dependency('rspec', '~>3.0')
end
|
metadata
ADDED
@@ -0,0 +1,75 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: tika-masala
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.5.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Vincent Bonmalais
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-03-31 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rspec
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '3.0'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '3.0'
|
27
|
+
description: Provides ruby wrapper around tika command line tools
|
28
|
+
email: vincent.bonmalais@econsultancy.com
|
29
|
+
executables: []
|
30
|
+
extensions: []
|
31
|
+
extra_rdoc_files: []
|
32
|
+
files:
|
33
|
+
- Gemfile
|
34
|
+
- Gemfile.lock
|
35
|
+
- README.md
|
36
|
+
- dependencies/tika-app-1.5.jar
|
37
|
+
- lib/tika-masala.rb
|
38
|
+
- lib/tika-masala/exceptions/tika_error.rb
|
39
|
+
- lib/tika-masala/parser.rb
|
40
|
+
- spec/fixtures/test.docx
|
41
|
+
- spec/fixtures/test.pdf
|
42
|
+
- spec/lib/parser_spec.rb
|
43
|
+
- spec/spec_helper.rb
|
44
|
+
- spec/support/spec_paths.rb
|
45
|
+
- tika-masala.gemspec
|
46
|
+
homepage: https://github.com/econsultancy/tika-masala
|
47
|
+
licenses:
|
48
|
+
- MIT
|
49
|
+
metadata: {}
|
50
|
+
post_install_message:
|
51
|
+
rdoc_options: []
|
52
|
+
require_paths:
|
53
|
+
- lib
|
54
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
55
|
+
requirements:
|
56
|
+
- - '>='
|
57
|
+
- !ruby/object:Gem::Version
|
58
|
+
version: '0'
|
59
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
60
|
+
requirements:
|
61
|
+
- - '>='
|
62
|
+
- !ruby/object:Gem::Version
|
63
|
+
version: '0'
|
64
|
+
requirements: []
|
65
|
+
rubyforge_project:
|
66
|
+
rubygems_version: 2.2.1
|
67
|
+
signing_key:
|
68
|
+
specification_version: 4
|
69
|
+
summary: Tika wrapper for MRI
|
70
|
+
test_files:
|
71
|
+
- spec/fixtures/test.docx
|
72
|
+
- spec/fixtures/test.pdf
|
73
|
+
- spec/lib/parser_spec.rb
|
74
|
+
- spec/spec_helper.rb
|
75
|
+
- spec/support/spec_paths.rb
|