tika-masala 1.5.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0fb65303c39a65b4aa4c400c6799b29191129312
4
+ data.tar.gz: 243d05f8cf30c689d816eac20cbd2e623781cb20
5
+ SHA512:
6
+ metadata.gz: 63b7b6cff214a93f4dc4b8469e037d2ca9c2a106fe428aed195c7673e11974ba396fd9d121e68916b10a5c8211f0391f76a60c41ffff316ea6a5ba50fcb46758
7
+ data.tar.gz: a7ae5316c4f9aa14408af801dd9365728346afbff5387349ac4a14c8ca3867a3e86522884cc9b62fcee45dcbac5629bd8a225d887cb57025e2228ba5143a9d1c
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
# Gems are resolved against the public RubyGems index.
source 'https://rubygems.org'

# Pull in the dependencies declared by the gemspec in this directory.
gemspec

# Interactive debugger, kept in the :test group only.
group(:test) { gem 'pry' }
data/Gemfile.lock ADDED
@@ -0,0 +1,36 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ tika-masala (1.5.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ coderay (1.1.0)
10
+ diff-lcs (1.2.5)
11
+ method_source (0.8.2)
12
+ pry (0.9.12.3)
13
+ coderay (~> 1.0)
14
+ method_source (~> 0.8)
15
+ slop (~> 3.4)
16
+ rspec (3.0.0)
17
+ rspec-core (~> 3.0.0)
18
+ rspec-expectations (~> 3.0.0)
19
+ rspec-mocks (~> 3.0.0)
20
+ rspec-core (3.0.4)
21
+ rspec-support (~> 3.0.0)
22
+ rspec-expectations (3.0.4)
23
+ diff-lcs (>= 1.2.0, < 2.0)
24
+ rspec-support (~> 3.0.0)
25
+ rspec-mocks (3.0.4)
26
+ rspec-support (~> 3.0.0)
27
+ rspec-support (3.0.4)
28
+ slop (3.4.7)
29
+
30
+ PLATFORMS
31
+ ruby
32
+
33
+ DEPENDENCIES
34
+ pry
35
+ rspec (~> 3.0)
36
+ tika-masala!
data/README.md ADDED
@@ -0,0 +1,37 @@
1
+ # TikaMasala
2
+
3
+ Simple Wrapper around Tika to parse documents.
4
+
5
+ ## Usage
6
+
7
+ ```ruby
8
+ parser = TikaMasala::Parser.new('/path/to/tika/jar/file')
9
+ parser.parse('/path/to/pdf')
10
+ ```
11
+
12
+ Everything uses `java_exec` behind the scenes to pass arguments:
13
+
14
+ ```ruby
15
+ parser.java_exec('--detect', '/path/to/file')
16
+ ```
17
+
18
+ ## What if tika can't parse a document?
19
+
20
+ The exception thrown contains both the stdout and stderr output, which means
21
+ that you can still retrieve part of the content you were trying to extract.
22
+
23
+ ```ruby
24
+ parser = TikaMasala::Parser.new('/path/to/tika/jar/file')
25
+
26
+ begin
27
+ parser.parse('/path/to/pdf')
28
+ rescue TikaMasala::TikaError => e
29
+ # Let's say it produced an error
30
+ e.stdout # contains the parsed text until it reached an error
31
+ e.stderr # contains the exception raised by Java
32
+ end
33
+ ```
34
+
35
+ ## Versions
36
+
37
+ The gem version matches the version of Tika distributed with it.
Binary file
@@ -0,0 +1 @@
1
+ require_relative 'tika-masala/parser'
@@ -0,0 +1,11 @@
1
module TikaMasala
  # Error carrying the details of a command that finished with a non-zero
  # exit status.
  #
  # The exception message is built from the exit status and the stderr text;
  # the captured stdout is kept separately so that callers can still recover
  # whatever was produced before the failure.
  class TikaError < StandardError
    # @return [String] text the command wrote to stdout ('' when not given)
    attr_reader :stdout

    # @return [String] text the command wrote to stderr ('' when not given)
    attr_reader :stderr

    # @return [Object, nil] exit status as supplied by the caller
    attr_reader :exitstatus

    # @param options [Hash] details of the failed command
    # @option options [String] :stdout captured standard output
    # @option options [String] :stderr captured standard error
    # @option options [Object] :exitstatus exit status of the command
    def initialize(options = {})
      @stdout     = options.fetch(:stdout, '')
      @stderr     = options.fetch(:stderr, '')
      @exitstatus = options[:exitstatus]

      super("Non-zero exit code (#{@exitstatus}): #{@stderr}")
    end
  end
end
@@ -0,0 +1,53 @@
1
require 'open3'
require 'shellwords'
require_relative 'exceptions/tika_error'
3
+
4
module TikaMasala
  # Raised by Parser#initialize when the jar path does not exist on disk.
  class FileNotFound < StandardError; end

  # Runs the Tika command line jar (`java -jar <jar> <options...>`) and
  # returns what it prints on stdout.
  class Parser
    # @return [String] path of the jar handed to `java -jar`
    attr_reader :jar_path

    # @param jar_path [String, nil] path to the Tika jar; when nil, falls back
    #   to dependencies/tika-app-1.5.jar resolved three levels above this file
    # @raise [FileNotFound] if the jar does not exist
    def initialize(jar_path = nil)
      if jar_path.nil?
        jar_path = File.expand_path(File.join('..', '..', '..', 'dependencies', 'tika-app-1.5.jar'), __FILE__)
      end

      # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
      raise FileNotFound, "Jar #{jar_path} does not exist" unless File.exist?(jar_path)

      @jar_path = jar_path
    end

    # Runs the jar with `--text`.
    #
    # @param file [String] file path or URL handed to the jar
    # @return [String] stdout of the command
    def parse(file)
      java_exec('--text', file)
    end

    # Runs the jar with `--metadata`.
    #
    # @param file [String] file path or URL handed to the jar
    # @return [String] stdout of the command
    def metadata(file)
      java_exec('--metadata', file)
    end

    # Runs the jar with `--detect`.
    #
    # @param file [String] file path or URL handed to the jar
    # @return [String] stdout of the command
    def detect_type(file)
      java_exec('--detect', file)
    end

    # Runs `java -jar <jar> <args...>`; the jar path and every argument are
    # shell-escaped before being placed on the command line.
    #
    # @param args [Array<String>] options and operands passed to the jar
    # @return [String] everything the command wrote to stdout
    # @raise [TikaError] when the exit status is not zero; the error carries
    #   the captured stdout and stderr
    def java_exec(*args)
      stdin, stdout, stderr, wait_thr = Open3.popen3("java -jar #{@jar_path.shellescape} #{args.shelljoin}")

      # Drain stdout BEFORE waiting for the process: a child that writes more
      # than the pipe buffer can hold blocks until someone reads, so waiting
      # first would hang forever on large outputs.
      # NOTE: stderr is still only read once the process has exited, so a
      # child that fills the stderr pipe before closing stdout can still block.
      output = stdout.read
      exitstatus = wait_thr.value

      if exitstatus != 0
        raise TikaError.new(stdout: output, stderr: stderr.read, exitstatus: exitstatus)
      end

      output
    ensure
      # The handles are nil when popen3 itself raised (e.g. java is missing);
      # guarding keeps that original error from being masked by nil.close.
      [stdin, stdout, stderr].each { |io| io.close if io }
    end
  end
end
Binary file
Binary file
@@ -0,0 +1,126 @@
1
+ require 'spec_helper'
2
+
3
# Specs for TikaMasala::Parser. Most examples run the jar resolved by
# jar_path against the files resolved by fixtures_path; the remaining ones
# stub Open3.popen3.
RSpec.describe TikaMasala::Parser do
  subject { described_class.new(tika_jar) }

  let(:tika_jar) { jar_path('tika-app-1.5.jar') }
  let(:pdf_file) { fixtures_path('test.pdf') }
  let(:word_file) { fixtures_path('test.docx') }

  # A bare `respond_to(:parse)` only builds a matcher and asserts nothing;
  # it has to be handed to an expectation.
  it { is_expected.to respond_to(:parse) }

  describe 'initialize' do
    subject { described_class }

    let(:fake_path) { '/no/such/file/exist' }

    it 'raises an exception if the file does not exist' do
      expect {
        subject.new(fake_path)
      }.to raise_error TikaMasala::FileNotFound, "Jar #{fake_path} does not exist"
    end

    context 'when no parameters are passed' do
      it 'sets the jar to the one in our dependencies folder' do
        expect(subject.new.jar_path).to eq(jar_path('tika-app-1.5.jar'))
      end
    end
  end

  describe '#parse' do
    it 'parses pdf files properly' do
      expect(subject.parse(pdf_file)).to match(/hello world/i)
    end

    it 'parses word files properly' do
      expect(subject.parse(word_file)).to match(/hello world/i)
    end

    context 'when the file does not exist' do
      it 'raises an exception' do
        expect { subject.parse(fixtures_path('file-dont-exist.ppt')) }.to raise_error(TikaMasala::TikaError)
      end
    end

    context 'when the file url contains reserved shell words' do
      let(:file_path) { 'http://example.com/?token=123&param=a' }
      let(:file_path_escaped) { 'http://example.com/\\?token\\=123\\&param\\=a' }
      # Stand-ins for the four values popen3 returns, so no process is spawned.
      let(:popen_result_double) do
        [
          double('stdin', close: true),
          double('stdout', close: true, read: ''),
          double('stderr', close: true),
          double('wait_thr', value: 0)
        ]
      end

      it 'escapes the string properly' do
        expect(Open3).to receive(:popen3).with("java -jar #{tika_jar} --text #{file_path_escaped}") do
          popen_result_double
        end

        subject.parse(file_path)
      end
    end
  end

  describe '#metadata' do
    it 'checks the metadata from the file' do
      expect(subject.metadata(word_file)).to match(/Content-Type.*word/i)
    end
  end

  describe '#detect_type' do
    it 'finds the file type' do
      expect(subject.detect_type(pdf_file)).to match(/application.pdf/i)
    end
  end

  describe '#java_exec' do
    it 'takes option directly to tika' do
      expect(subject.java_exec('--text', word_file)).to match(/hello world/i)
    end

    context 'when exit code is not zero' do
      # Shell snippet that writes to both streams and then fails.
      let(:command) { "echo '#{normal_message}'; echo '#{error_message}' 1>&2; exit 1;" }
      let(:error_message) { 'Some nasty error occured' }
      let(:normal_message) { 'Hello world' }

      before do
        popen_result = Open3.popen3(command)

        # Check the captured `block` itself: `block_given?` inside this block
        # does not refer to the block passed to the stubbed popen3 call.
        allow(Open3).to receive(:popen3) do |*_args, &block|
          block ? block.call(*popen_result) : popen_result
        end
      end

      it 'raises an exception containing stderr' do
        expect { subject.java_exec }.to raise_error(TikaMasala::TikaError, /#{error_message}/)
      end

      describe 'the expection raised' do
        # The TikaError raised by java_exec, or nil when nothing was raised.
        let(:returned_exception) do
          begin
            subject.java_exec
            nil
          rescue TikaMasala::TikaError => e
            e
          end
        end

        it 'does not contain the normal message' do
          expect(returned_exception.message).to_not include(normal_message)
        end

        it 'provides another attribute which contains the ouput of stdout' do
          expect(returned_exception.stdout).to include(normal_message)
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ require_relative '../lib/tika-masala'
2
+
3
# Load every Ruby file under the support/ directory next to this file.
# Sorted so the load order does not depend on the order the filesystem
# returns the entries in.
Dir[File.join(File.dirname(__FILE__), 'support', '**', '*.rb')].sort.each do |support_file|
  require support_file
end

# Make the SpecPaths helpers callable from inside every example group.
RSpec.configure do |config|
  config.include SpecPaths
end
@@ -0,0 +1,13 @@
1
# Helpers that build absolute paths for the specs. Every path is anchored at
# the location reached by climbing three `..` segments from this file's own
# path.
module SpecPaths
  # @param file [Array<String>] path segments below the anchor location
  # @return [String] absolute path to the given segments
  def root_path(*file)
    climb = Array.new(3, '..')
    relative = File.join(*climb, *file)
    File.expand_path(relative, __FILE__)
  end

  # @param file [Array<String>] path segments below spec/fixtures
  # @return [String] absolute path inside spec/fixtures
  def fixtures_path(*file)
    root_path(*%w[spec fixtures], *file)
  end

  # @param file [Array<String>] path segments below dependencies
  # @return [String] absolute path inside dependencies
  def jar_path(*file)
    root_path('dependencies', *file)
  end
end
@@ -0,0 +1,15 @@
1
# Packaging manifest for the tika-masala gem.
Gem::Specification.new do |spec|
  # Identity
  spec.name    = 'tika-masala'
  spec.version = '1.5.0'
  spec.date    = '2014-03-31'

  # Description and ownership
  spec.summary     = "Tika wrapper for MRI"
  spec.description = "Provides ruby wrapper around tika command line tools"
  spec.authors     = ["Vincent Bonmalais"]
  spec.email       = 'vincent.bonmalais@econsultancy.com'

  # Packaged files are whatever git tracks; the test files are the tracked
  # entries matching the `spec` pathspec.
  spec.files      = `git ls-files`.split("\n")
  spec.test_files = `git ls-files -- spec`.split("\n")

  spec.homepage = 'https://github.com/econsultancy/tika-masala'
  spec.license  = 'MIT'

  # Needed only to run the specs.
  spec.add_development_dependency('rspec', '~>3.0')
end
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tika-masala
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.5.0
5
+ platform: ruby
6
+ authors:
7
+ - Vincent Bonmalais
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-03-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '3.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '3.0'
27
+ description: Provides ruby wrapper around tika command line tools
28
+ email: vincent.bonmalais@econsultancy.com
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - Gemfile
34
+ - Gemfile.lock
35
+ - README.md
36
+ - dependencies/tika-app-1.5.jar
37
+ - lib/tika-masala.rb
38
+ - lib/tika-masala/exceptions/tika_error.rb
39
+ - lib/tika-masala/parser.rb
40
+ - spec/fixtures/test.docx
41
+ - spec/fixtures/test.pdf
42
+ - spec/lib/parser_spec.rb
43
+ - spec/spec_helper.rb
44
+ - spec/support/spec_paths.rb
45
+ - tika-masala.gemspec
46
+ homepage: https://github.com/econsultancy/tika-masala
47
+ licenses:
48
+ - MIT
49
+ metadata: {}
50
+ post_install_message:
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - '>='
57
+ - !ruby/object:Gem::Version
58
+ version: '0'
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 2.2.1
67
+ signing_key:
68
+ specification_version: 4
69
+ summary: Tika wrapper for MRI
70
+ test_files:
71
+ - spec/fixtures/test.docx
72
+ - spec/fixtures/test.pdf
73
+ - spec/lib/parser_spec.rb
74
+ - spec/spec_helper.rb
75
+ - spec/support/spec_paths.rb