tika-masala 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 0fb65303c39a65b4aa4c400c6799b29191129312
4
+ data.tar.gz: 243d05f8cf30c689d816eac20cbd2e623781cb20
5
+ SHA512:
6
+ metadata.gz: 63b7b6cff214a93f4dc4b8469e037d2ca9c2a106fe428aed195c7673e11974ba396fd9d121e68916b10a5c8211f0391f76a60c41ffff316ea6a5ba50fcb46758
7
+ data.tar.gz: a7ae5316c4f9aa14408af801dd9365728346afbff5387349ac4a14c8ca3867a3e86522884cc9b62fcee45dcbac5629bd8a225d887cb57025e2228ba5143a9d1c
data/Gemfile ADDED
@@ -0,0 +1,7 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
4
+
5
+ group :test do
6
+ gem 'pry'
7
+ end
data/Gemfile.lock ADDED
@@ -0,0 +1,36 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ tika-masala (1.5.0)
5
+
6
+ GEM
7
+ remote: https://rubygems.org/
8
+ specs:
9
+ coderay (1.1.0)
10
+ diff-lcs (1.2.5)
11
+ method_source (0.8.2)
12
+ pry (0.9.12.3)
13
+ coderay (~> 1.0)
14
+ method_source (~> 0.8)
15
+ slop (~> 3.4)
16
+ rspec (3.0.0)
17
+ rspec-core (~> 3.0.0)
18
+ rspec-expectations (~> 3.0.0)
19
+ rspec-mocks (~> 3.0.0)
20
+ rspec-core (3.0.4)
21
+ rspec-support (~> 3.0.0)
22
+ rspec-expectations (3.0.4)
23
+ diff-lcs (>= 1.2.0, < 2.0)
24
+ rspec-support (~> 3.0.0)
25
+ rspec-mocks (3.0.4)
26
+ rspec-support (~> 3.0.0)
27
+ rspec-support (3.0.4)
28
+ slop (3.4.7)
29
+
30
+ PLATFORMS
31
+ ruby
32
+
33
+ DEPENDENCIES
34
+ pry
35
+ rspec (~> 3.0)
36
+ tika-masala!
data/README.md ADDED
@@ -0,0 +1,37 @@
1
+ # TikaMasala
2
+
3
+ Simple Wrapper around Tika to parse documents.
4
+
5
+ ## Usage
6
+
7
+ ```ruby
8
+ parser = TikaMasala::Parser.new('/path/to/tika/jar/file')
9
+ parser.parse('/path/to/pdf')
10
+ ```
11
+
12
+ Everything uses `java_exec` behind the scenes to pass arguments:
13
+
14
+ ```ruby
15
+ parser.java_exec('--detect', '/path/to/file')
16
+ ```
17
+
18
+ ## What if tika can't parse a document?
19
+
20
+ The exception raised contains both the stdout and stderr output, which means
20
+ that you can still retrieve part of the content you were trying to extract.
22
+
23
+ ```ruby
24
+ parser = TikaMasala::Parser.new('/path/to/tika/jar/file')
25
+
26
+ begin
27
+ parser.parse('/path/to/pdf')
28
+ rescue TikaMasala::TikaError => e
29
+ # Let's say it produced an error
30
+ e.stdout # contains the parsed text until it reached an error
31
+ e.stderr # contains the exception raised by Java
32
+ end
33
+ ```
34
+
35
+ ## Versions
36
+
37
+ The gem version matches the version of Tika that is distributed with it.
Binary file
@@ -0,0 +1 @@
1
+ require_relative 'tika-masala/parser'
@@ -0,0 +1,11 @@
1
module TikaMasala
  # Error describing a Tika run that ended with a non-zero exit status.
  #
  # The exception message combines the exit status and the stderr text. The
  # raw stdout, stderr and exit status are also exposed through readers so a
  # caller can still recover whatever content was produced before the failure.
  class TikaError < StandardError
    # @return [String] what the command wrote to stdout ('' when not given)
    attr_reader :stdout
    # @return [String] what the command wrote to stderr ('' when not given)
    attr_reader :stderr
    # @return [Object, nil] the exit status handed to the constructor
    attr_reader :exitstatus

    # @param options [Hash] details of the failed run
    # @option options [String] :stdout output captured from stdout
    # @option options [String] :stderr output captured from stderr
    # @option options [Object] :exitstatus exit status of the command
    def initialize(options = {})
      @stdout = options.fetch(:stdout, '')
      @stderr = options.fetch(:stderr, '')
      @exitstatus = options[:exitstatus]

      super("Non-zero exit code (#{@exitstatus}): #{@stderr}")
    end
  end
end
@@ -0,0 +1,53 @@
1
require 'open3'
require 'shellwords'
require_relative 'exceptions/tika_error'
3
+
4
module TikaMasala
  # Raised when the Tika jar handed to Parser does not exist on disk.
  class FileNotFound < StandardError; end

  # Runs the Tika command line application (`java -jar <jar> ...`) and
  # returns what it prints on standard output.
  class Parser
    # @return [String] path of the Tika jar used for every command
    attr_reader :jar_path

    # @param jar_path [String, nil] path to a Tika jar; when nil, the
    #   `dependencies/tika-app-1.5.jar` file resolved relative to this file
    #   is used
    # @raise [FileNotFound] if no file exists at the resolved path
    def initialize(jar_path = nil)
      if jar_path.nil?
        jar_path = File.expand_path(File.join('..', '..', '..', 'dependencies', 'tika-app-1.5.jar'), __FILE__)
      end

      # File.exists? was removed in Ruby 3.2; File.exist? works on every version.
      raise FileNotFound, "Jar #{jar_path} does not exist" unless File.exist?(jar_path)

      @jar_path = jar_path
    end

    # Runs Tika with `--text` on +file+.
    #
    # @param file [String] path or URL passed to Tika
    # @return [String] Tika's standard output
    # @raise [TikaError] if the command exits with a non-zero status
    def parse(file)
      java_exec('--text', file)
    end

    # Runs Tika with `--metadata` on +file+.
    #
    # @param file [String] path or URL passed to Tika
    # @return [String] Tika's standard output
    # @raise [TikaError] if the command exits with a non-zero status
    def metadata(file)
      java_exec('--metadata', file)
    end

    # Runs Tika with `--detect` on +file+.
    #
    # @param file [String] path or URL passed to Tika
    # @return [String] Tika's standard output
    # @raise [TikaError] if the command exits with a non-zero status
    def detect_type(file)
      java_exec('--detect', file)
    end

    # Runs `java -jar <jar_path> <args...>` with every argument shell-escaped.
    #
    # @param args [Array<String>] arguments forwarded to the Tika jar
    # @return [String] everything the command wrote to stdout
    # @raise [TikaError] if the command exits with a non-zero status; the
    #   error carries the captured stdout, stderr and exit code
    def java_exec(*args)
      stdin, stdout, stderr, wait_thr =
        Open3.popen3("java -jar #{@jar_path.shellescape} #{args.shelljoin}")

      # Nothing is ever written to the child, so give it EOF right away.
      stdin.close

      # Both pipes must be drained *before* waiting for the process: a child
      # that fills a pipe blocks until someone reads it and would never exit.
      # stderr is read on a separate thread so neither pipe can stall the other.
      err_reader = Thread.new { stderr.read }
      output = stdout.read
      errors = err_reader.value
      status = wait_thr.value

      return output if success?(status)

      raise TikaError.new(stdout: output, stderr: errors, exitstatus: exit_code(status))
    ensure
      # The streams are nil when popen3 itself raised; skip them so the
      # original error is not masked by a NoMethodError.
      [stdout, stderr].compact.each(&:close)
    end

    private

    # True when +status+ (a Process::Status, or a plain Integer) means success.
    def success?(status)
      status.respond_to?(:success?) ? status.success? : status == 0
    end

    # Numeric exit code for +status+, falling back to its textual form when
    # the process did not exit normally (e.g. it was killed by a signal).
    def exit_code(status)
      return status unless status.respond_to?(:exitstatus)

      status.exitstatus || status.to_s
    end
  end
end
Binary file
Binary file
@@ -0,0 +1,126 @@
1
+ require 'spec_helper'
2
+
3
# Behaviour of TikaMasala::Parser, exercised against the jar and the fixture
# documents resolved through the SpecPaths helpers.
RSpec.describe TikaMasala::Parser do
  subject { described_class.new(tika_jar) }

  let(:tika_jar) { jar_path('tika-app-1.5.jar') }
  let(:pdf_file) { fixtures_path('test.pdf') }
  let(:word_file) { fixtures_path('test.docx') }

  # The matcher has to go through `is_expected`; a bare `respond_to(...)`
  # only builds a matcher and asserts nothing.
  it { is_expected.to respond_to(:parse) }

  describe 'initialize' do
    subject { described_class }

    let(:fake_path) { '/no/such/file/exist' }

    it 'raises an exception if the file does not exist' do
      expect {
        subject.new(fake_path)
      }.to raise_error TikaMasala::FileNotFound, "Jar #{fake_path} does not exist"
    end

    context 'when no parameters are passed' do
      it 'sets the jar to the one in our dependencies folder' do
        expect(subject.new.jar_path).to eq(jar_path('tika-app-1.5.jar'))
      end
    end
  end

  describe '#parse' do
    it 'parses pdf files properly' do
      expect(subject.parse(pdf_file)).to match(/hello world/i)
    end

    it 'parses word files properly' do
      expect(subject.parse(word_file)).to match(/hello world/i)
    end

    context 'when the file does not exist' do
      it 'raises an exception' do
        expect { subject.parse(fixtures_path('file-dont-exist.ppt')) }.to raise_error(TikaMasala::TikaError)
      end
    end

    context 'when the file url contains reserved shell words' do
      let(:file_path) { 'http://example.com/?token=123&param=a' }
      let(:file_path_escaped) { 'http://example.com/\\?token\\=123\\&param\\=a' }
      # Stand-ins for the four values returned by Open3.popen3. stderr gets a
      # `read` stub as well so the doubles work whether or not the parser
      # drains stderr on a successful run.
      let(:popen_result_double) do
        [
          double('stdin', close: true),
          double('stdout', close: true, read: ''),
          double('stderr', close: true, read: ''),
          double('wait_thr', value: 0)
        ]
      end

      it 'escapes the string properly' do
        expect(Open3).to receive(:popen3).with("java -jar #{tika_jar} --text #{file_path_escaped}") do
          popen_result_double
        end

        subject.parse(file_path)
      end
    end
  end

  describe '#metadata' do
    it 'checks the metadata from the file' do
      expect(subject.metadata(word_file)).to match(/Content-Type.*word/i)
    end
  end

  describe '#detect_type' do
    it 'finds the file type' do
      expect(subject.detect_type(pdf_file)).to match(/application.pdf/i)
    end
  end

  describe '#java_exec' do
    it 'takes option directly to tika' do
      expect(subject.java_exec('--text', word_file)).to match(/hello world/i)
    end

    context 'when exit code is not zero' do
      let(:command) { "echo '#{normal_message}'; echo '#{error_message}' 1>&2; exit 1;" }
      let(:error_message) { 'Some nasty error occured' }
      let(:normal_message) { 'Hello world' }

      before do
        # Start the failing shell command for real, then make every later
        # Open3.popen3 call hand back that already-running process.
        popen_result = Open3.popen3(command)

        allow(Open3).to receive(:popen3) do |*_args, &block|
          # `block_given?` would refer to the enclosing scope rather than to
          # the stubbed call, so the captured block is checked directly.
          block ? block.call(*popen_result) : popen_result
        end
      end

      it 'raises an exception containing stderr' do
        expect { subject.java_exec }.to raise_error(TikaMasala::TikaError, /#{error_message}/)
      end

      describe 'the exception raised' do
        # The TikaError raised by java_exec, or nil when nothing was raised.
        let(:raised_error) do
          begin
            subject.java_exec
            nil
          rescue TikaMasala::TikaError => e
            e
          end
        end

        it 'does not contain the normal message' do
          expect(raised_error.message).to_not include(normal_message)
        end

        it 'provides another attribute which contains the output of stdout' do
          expect(raised_error.stdout).to include(normal_message)
        end
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ require_relative '../lib/tika-masala'
2
+
3
+ Dir[File.dirname(__FILE__) + "/support/**/*.rb"].each {|f| require f}
4
+
5
+ RSpec.configure do |config|
6
+ config.include SpecPaths
7
+ end
@@ -0,0 +1,13 @@
1
# Path helpers for the specs. Every path is resolved against the location of
# this file, so the result does not depend on the current working directory.
module SpecPaths
  # Expands +segments+ below the directory reached by going up three levels
  # from this file's own path.
  def root_path(*segments)
    relative = File.join('..', '..', '..', *segments)
    File.expand_path(relative, __FILE__)
  end

  # Path below the `spec/fixtures` directory of the root.
  def fixtures_path(*segments)
    root_path('spec', 'fixtures', *segments)
  end

  # Path below the `dependencies` directory of the root.
  def jar_path(*segments)
    root_path('dependencies', *segments)
  end
end
@@ -0,0 +1,15 @@
1
+ Gem::Specification.new do |s|
2
+ s.name = 'tika-masala'
3
+ s.version = '1.5.0'
4
+ s.date = '2014-03-31'
5
+ s.summary = "Tika wrapper for MRI"
6
+ s.description = "Provides ruby wrapper around tika command line tools"
7
+ s.authors = ["Vincent Bonmalais"]
8
+ s.email = 'vincent.bonmalais@econsultancy.com'
9
+ s.files = `git ls-files`.split("\n")
10
+ s.test_files = `git ls-files -- spec`.split("\n")
11
+ s.homepage = 'https://github.com/econsultancy/tika-masala'
12
+ s.license = 'MIT'
13
+
14
+ s.add_development_dependency('rspec', '~>3.0')
15
+ end
metadata ADDED
@@ -0,0 +1,75 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: tika-masala
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.5.0
5
+ platform: ruby
6
+ authors:
7
+ - Vincent Bonmalais
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-03-31 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rspec
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '3.0'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '3.0'
27
+ description: Provides ruby wrapper around tika command line tools
28
+ email: vincent.bonmalais@econsultancy.com
29
+ executables: []
30
+ extensions: []
31
+ extra_rdoc_files: []
32
+ files:
33
+ - Gemfile
34
+ - Gemfile.lock
35
+ - README.md
36
+ - dependencies/tika-app-1.5.jar
37
+ - lib/tika-masala.rb
38
+ - lib/tika-masala/exceptions/tika_error.rb
39
+ - lib/tika-masala/parser.rb
40
+ - spec/fixtures/test.docx
41
+ - spec/fixtures/test.pdf
42
+ - spec/lib/parser_spec.rb
43
+ - spec/spec_helper.rb
44
+ - spec/support/spec_paths.rb
45
+ - tika-masala.gemspec
46
+ homepage: https://github.com/econsultancy/tika-masala
47
+ licenses:
48
+ - MIT
49
+ metadata: {}
50
+ post_install_message:
51
+ rdoc_options: []
52
+ require_paths:
53
+ - lib
54
+ required_ruby_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - '>='
57
+ - !ruby/object:Gem::Version
58
+ version: '0'
59
+ required_rubygems_version: !ruby/object:Gem::Requirement
60
+ requirements:
61
+ - - '>='
62
+ - !ruby/object:Gem::Version
63
+ version: '0'
64
+ requirements: []
65
+ rubyforge_project:
66
+ rubygems_version: 2.2.1
67
+ signing_key:
68
+ specification_version: 4
69
+ summary: Tika wrapper for MRI
70
+ test_files:
71
+ - spec/fixtures/test.docx
72
+ - spec/fixtures/test.pdf
73
+ - spec/lib/parser_spec.rb
74
+ - spec/spec_helper.rb
75
+ - spec/support/spec_paths.rb