rika 1.11.1-java → 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +5 -4
- data/.rubocop.yml +49 -0
- data/Gemfile +12 -0
- data/README.md +213 -76
- data/RELEASE_NOTES.md +26 -0
- data/Rakefile +4 -7
- data/bin/rika +13 -0
- data/lib/rika/cli/args_parser.rb +131 -0
- data/lib/rika/cli/rika_command.rb +129 -0
- data/lib/rika/formatters.rb +39 -0
- data/lib/rika/parse_result.rb +34 -0
- data/lib/rika/parser.rb +64 -70
- data/lib/rika/tika_loader.rb +65 -0
- data/lib/rika/version.rb +3 -1
- data/lib/rika.rb +98 -27
- data/rika.gemspec +30 -17
- data/rika_helper.rb +14 -22
- data/spec/fixtures/image_jpg_without_extension +0 -0
- data/spec/fixtures/tiny.txt +1 -0
- data/spec/rika/cli/args_parser_spec.rb +117 -0
- data/spec/rika/cli/rika_command_spec.rb +120 -0
- data/spec/rika/formatters_spec.rb +23 -0
- data/spec/rika/parse_result_spec.rb +42 -0
- data/spec/rika/parser_spec.rb +304 -0
- data/spec/rika/rika_spec.rb +10 -0
- data/spec/rika/tika_loader_spec.rb +57 -0
- data/spec/spec_helper.rb +10 -3
- metadata +40 -49
- data/.travis.yml +0 -7
- data/java-lib/tika-app-1.24.1.jar +0 -0
- data/spec/fixtures/text_file_without_extension +0 -23
- data/spec/rika_spec.rb +0 -245
- /data/spec/fixtures/{text_file.txt → document.txt} +0 -0
data/rika_helper.rb
CHANGED
@@ -1,3 +1,5 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
# Defines some shortcuts for ad-hoc work with Rika.
|
2
4
|
#
|
3
5
|
# Can be used with the `irb`/`jirb` or `pry` (https://github.com/pry/pry) interactive shells:
|
@@ -11,28 +13,18 @@
|
|
11
13
|
|
12
14
|
require 'rika'
|
13
15
|
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
def m(resource)
|
19
|
-
Rika.parse_metadata(resource)
|
16
|
+
# Add shortcut to Rika.parse.
|
17
|
+
def pa(resource)
|
18
|
+
Rika.parse(resource)
|
20
19
|
end
|
21
20
|
|
22
|
-
|
23
|
-
|
21
|
+
# Add abbreviated aliases for the ParseResult class methods.
|
22
|
+
class ParseResult
|
23
|
+
alias c content
|
24
|
+
alias m metadata
|
25
|
+
alias l language
|
26
|
+
alias i input_type
|
27
|
+
alias d data_source
|
28
|
+
alias t content_type
|
29
|
+
alias j metadata_java
|
24
30
|
end
|
25
|
-
|
26
|
-
def cmh(resource)
|
27
|
-
Rika.parse_content_and_metadata_as_hash(resource)
|
28
|
-
end
|
29
|
-
|
30
|
-
def mj(resource); m(resource).to_json ; end
|
31
|
-
def mJ(resource); JSON.pretty_generate(m(resource)) ; end
|
32
|
-
def my(resource); m(resource).to_yaml ; end
|
33
|
-
def my(resource); require 'awesome_print'; m(resource).ai ;end
|
34
|
-
|
35
|
-
def cmj(resource); c(resource).to_json; end
|
36
|
-
def cmJ(resource); JSON.pretty_generate(c(resource)); end
|
37
|
-
def cmy(resource); c(resource).to_yaml ; end
|
38
|
-
def cma(resource); require 'awesome_print'; c,m = cm(resource); { content: c, metadata: m }; end
|
Binary file
|
@@ -0,0 +1 @@
|
|
1
|
+
hello
|
@@ -0,0 +1,117 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
require 'rika/cli/args_parser'
|
6
|
+
|
7
|
+
describe ArgsParser do
|
8
|
+
let(:versions_regex) { /Versions:.*Rika: (\d+\.\d+\.\d+(-\w+)?).*Tika: (\d+\.\d+\.\d+(-\w+)?)/ }
|
9
|
+
|
10
|
+
specify 'returns a hash of options, a target array, and help text' do
|
11
|
+
options, targets, help_text = described_class.call([])
|
12
|
+
expect(options).to be_a(Hash)
|
13
|
+
expect(targets).to be_an(Array)
|
14
|
+
expect(help_text).to be_a(String)
|
15
|
+
end
|
16
|
+
|
17
|
+
context 'when parsing options' do
|
18
|
+
RSpec.shared_examples 'sets_options_correctly' do |args, option_key, expected_value|
|
19
|
+
specify "correctly sets #{option_key} to #{expected_value} when args are #{args}" do
|
20
|
+
options, _, _ = described_class.call(args)
|
21
|
+
expect(options[option_key]).to eq(expected_value)
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
# Test default option values:
|
26
|
+
include_examples('sets_options_correctly', [], :as_array, false)
|
27
|
+
include_examples('sets_options_correctly', [], :text, true)
|
28
|
+
include_examples('sets_options_correctly', [], :metadata, true)
|
29
|
+
include_examples('sets_options_correctly', [], :format, 'at')
|
30
|
+
include_examples('sets_options_correctly', [], :key_sort, true)
|
31
|
+
include_examples('sets_options_correctly', [], :source, true)
|
32
|
+
|
33
|
+
# Test -a as_array option:
|
34
|
+
include_examples('sets_options_correctly', %w[-a], :as_array, true)
|
35
|
+
include_examples('sets_options_correctly', %w[--as_array], :as_array, true)
|
36
|
+
include_examples('sets_options_correctly', %w[-a -a-], :as_array, false)
|
37
|
+
include_examples('sets_options_correctly', %w[--no-as_array], :as_array, false)
|
38
|
+
|
39
|
+
# Test -f format option:
|
40
|
+
include_examples('sets_options_correctly', %w[-fyy], :format, 'yy')
|
41
|
+
include_examples('sets_options_correctly', %w[--format yy], :format, 'yy')
|
42
|
+
include_examples('sets_options_correctly', %w[-f yy], :format, 'yy')
|
43
|
+
include_examples('sets_options_correctly', %w[-f y], :format, 'yy')
|
44
|
+
include_examples('sets_options_correctly', %w[-f yj], :format, 'yj')
|
45
|
+
include_examples('sets_options_correctly', %w[-f yjJ], :format, 'yj') # Test extra characters after valid format
|
46
|
+
|
47
|
+
# Test -m metadata option:
|
48
|
+
include_examples('sets_options_correctly', %w[-m- -m], :metadata, true)
|
49
|
+
include_examples('sets_options_correctly', %w[-m- -m+], :metadata, true)
|
50
|
+
include_examples('sets_options_correctly', %w[--metadata false --metadata], :metadata, true)
|
51
|
+
include_examples('sets_options_correctly', %w[-m -m-], :metadata, false)
|
52
|
+
include_examples('sets_options_correctly', %w[-m yes], :metadata, true)
|
53
|
+
include_examples('sets_options_correctly', %w[-m no], :metadata, false)
|
54
|
+
include_examples('sets_options_correctly', %w[-m true], :metadata, true)
|
55
|
+
include_examples('sets_options_correctly', %w[-m false], :metadata, false)
|
56
|
+
include_examples('sets_options_correctly', %w[--metadata false], :metadata, false)
|
57
|
+
include_examples('sets_options_correctly', %w[--no-metadata], :metadata, false)
|
58
|
+
|
59
|
+
# Test -t text option:
|
60
|
+
include_examples('sets_options_correctly', %w[-t], :text, true)
|
61
|
+
include_examples('sets_options_correctly', %w[-t -t-], :text, false)
|
62
|
+
include_examples('sets_options_correctly', %w[-t yes], :text, true)
|
63
|
+
include_examples('sets_options_correctly', %w[-t no], :text, false)
|
64
|
+
include_examples('sets_options_correctly', %w[-t true], :text, true)
|
65
|
+
include_examples('sets_options_correctly', %w[-t false], :text, false)
|
66
|
+
include_examples('sets_options_correctly', %w[--text false], :text, false)
|
67
|
+
include_examples('sets_options_correctly', %w[--text false --text], :text, true)
|
68
|
+
|
69
|
+
# Test -k key sort option:
|
70
|
+
include_examples('sets_options_correctly', %w[-k-], :key_sort, false)
|
71
|
+
|
72
|
+
# Test -s source option:
|
73
|
+
include_examples('sets_options_correctly', %w[-s-], :source, false)
|
74
|
+
end
|
75
|
+
|
76
|
+
describe '#versions_string' do
|
77
|
+
specify 'returns a Rika version and a Tika version' do
|
78
|
+
expect(described_class.new.send(:versions_string)).to match(versions_regex)
|
79
|
+
end
|
80
|
+
end
|
81
|
+
|
82
|
+
context 'when processing environment variables' do
|
83
|
+
it 'adds arguments from the environment to the args list' do
|
84
|
+
args_parser = described_class.new
|
85
|
+
allow(args_parser).to receive(:environment_options).and_return('-t-')
|
86
|
+
options, _, _ = args_parser.call([])
|
87
|
+
expect(options[:text]).to be(false)
|
88
|
+
end
|
89
|
+
|
90
|
+
it 'overrides environment variable options with command line options' do
|
91
|
+
env_format_arg = '-fyy'
|
92
|
+
cmd_line_format = 'JJ'
|
93
|
+
cmd_line_args = ["-f#{cmd_line_format}"]
|
94
|
+
args_parser = described_class.new
|
95
|
+
allow(args_parser).to receive(:environment_options).and_return(env_format_arg)
|
96
|
+
options, _, _ = args_parser.call(cmd_line_args)
|
97
|
+
expect(options[:format]).to eq(cmd_line_format)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
describe 'DEFAULT_OPTIONS hash' do
|
102
|
+
specify 'has the correct default values' do
|
103
|
+
expect(described_class::DEFAULT_OPTIONS).to eq(
|
104
|
+
as_array: false,
|
105
|
+
text: true,
|
106
|
+
metadata: true,
|
107
|
+
format: 'at',
|
108
|
+
key_sort: true,
|
109
|
+
source: true
|
110
|
+
)
|
111
|
+
end
|
112
|
+
|
113
|
+
specify 'is frozen' do
|
114
|
+
expect(described_class::DEFAULT_OPTIONS).to be_frozen
|
115
|
+
end
|
116
|
+
end
|
117
|
+
end
|
@@ -0,0 +1,120 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'rika/cli/rika_command'
|
5
|
+
|
6
|
+
RF = Rika::Formatters
|
7
|
+
|
8
|
+
describe RikaCommand do
|
9
|
+
let(:versions_regex) { /Versions:.*Rika: (\d+\.\d+\.\d+(-\w+)?).*Tika: (\d+\.\d+\.\d+(-\w+)?)/ }
|
10
|
+
|
11
|
+
before do
|
12
|
+
@original_stdout = $stdout
|
13
|
+
@original_stderr = $stderr
|
14
|
+
$stdout = StringIO.new
|
15
|
+
$stderr = StringIO.new
|
16
|
+
end
|
17
|
+
|
18
|
+
after do
|
19
|
+
$stdout = @original_stdout
|
20
|
+
$stderr = @original_stderr
|
21
|
+
end
|
22
|
+
|
23
|
+
describe '#call' do
|
24
|
+
specify 'call should run the command without error' do
|
25
|
+
expect { described_class.new([fixture_path('tiny.txt')]).call }.not_to raise_error
|
26
|
+
end
|
27
|
+
|
28
|
+
specify 'prints version and exits when -v or --version is specified' do
|
29
|
+
expect { described_class.new(%w[-v]).call }.to output(versions_regex).to_stdout.and raise_error(SystemExit)
|
30
|
+
end
|
31
|
+
|
32
|
+
specify 'prints help and exits when -h or --help is specified' do
|
33
|
+
regex = /Usage: rika \[options\] <file or url> /m
|
34
|
+
expect { described_class.new(%w[-h]).call }.to output(regex).to_stdout.and raise_error(SystemExit)
|
35
|
+
end
|
36
|
+
|
37
|
+
specify 'when run in array mode, outputs the string representation of an array of parse results' do
|
38
|
+
original_stdout = $stdout
|
39
|
+
$stdout = StringIO.new
|
40
|
+
begin
|
41
|
+
tiny_filespec = fixture_path('tiny.txt')
|
42
|
+
args = ['-a', '-fJ', tiny_filespec, tiny_filespec]
|
43
|
+
described_class.new(args).call
|
44
|
+
output = $stdout.string
|
45
|
+
object = JSON.parse(output)
|
46
|
+
expect(object).to be_an(Array)
|
47
|
+
expect(object.size).to eq(2)
|
48
|
+
expect(object.map(&:class)).to eq([Hash, Hash])
|
49
|
+
ensure
|
50
|
+
$stdout = original_stdout
|
51
|
+
end
|
52
|
+
end
|
53
|
+
end
|
54
|
+
|
55
|
+
describe '#single_document_output' do
|
56
|
+
RSpec.shared_examples 'verify_result_is_hash' do |format_chars, parser|
|
57
|
+
specify "correctly uses result hash for JSON and YAML when options are #{format_chars}" do
|
58
|
+
original_stdout = $stdout
|
59
|
+
$stdout = StringIO.new
|
60
|
+
begin
|
61
|
+
rika_command = described_class.new(["-f#{format_chars}", fixture_path('tiny.txt')])
|
62
|
+
rika_command.call
|
63
|
+
output = $stdout.string
|
64
|
+
warn output
|
65
|
+
result_hash = parser.call(output)
|
66
|
+
expect(result_hash).to be_a(Hash)
|
67
|
+
expect(result_hash['metadata']).to be_a(Hash)
|
68
|
+
expect(result_hash['text']).to be_a(String)
|
69
|
+
ensure
|
70
|
+
$stdout = original_stdout
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
74
|
+
|
75
|
+
include_examples('verify_result_is_hash', 'JJ', ->(s) { JSON.parse(s) })
|
76
|
+
include_examples('verify_result_is_hash', 'jj', ->(s) { JSON.parse(s) })
|
77
|
+
include_examples('verify_result_is_hash', 'yy', ->(s) { YAML.safe_load(s) })
|
78
|
+
end
|
79
|
+
|
80
|
+
describe '#set_output_formats' do
|
81
|
+
RSpec.shared_examples 'verify_correct_output_formats_selected' \
|
82
|
+
do |format_chars, expected_m_formatter, expected_t_formatter|
|
83
|
+
specify "correctly sets output formats when options are #{format_chars}" do
|
84
|
+
rika_command = described_class.new(["-f#{format_chars}"])
|
85
|
+
rika_command.send(:prepare)
|
86
|
+
expect(rika_command.send(:metadata_formatter)).to eq(expected_m_formatter)
|
87
|
+
expect(rika_command.send(:text_formatter)).to eq(expected_t_formatter)
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
include_examples('verify_correct_output_formats_selected', 'aj', RF::AWESOME_PRINT_FORMATTER, RF::JSON_FORMATTER)
|
92
|
+
include_examples('verify_correct_output_formats_selected', 'Jy', RF::PRETTY_JSON_FORMATTER, RF::YAML_FORMATTER)
|
93
|
+
include_examples('verify_correct_output_formats_selected', 'ti', RF::TO_S_FORMATTER, RF::INSPECT_FORMATTER)
|
94
|
+
|
95
|
+
RSpec.shared_examples 'verify_bad_output_format_exits' do |format_chars|
|
96
|
+
specify "exits when a bad output format is specified with #{format_chars}" do
|
97
|
+
expect { described_class.new(["-f#{format_chars}"]).call }.to raise_error(SystemExit)
|
98
|
+
end
|
99
|
+
end
|
100
|
+
|
101
|
+
include_examples 'verify_bad_output_format_exits', 'ax'
|
102
|
+
include_examples 'verify_bad_output_format_exits', 'xa'
|
103
|
+
include_examples 'verify_bad_output_format_exits', 'x'
|
104
|
+
end
|
105
|
+
|
106
|
+
describe '#warn_if_no_targets_specified' do
|
107
|
+
it 'prints a warning if no targets are specified' do
|
108
|
+
rika_command = described_class.new([])
|
109
|
+
allow(rika_command).to receive_messages(
|
110
|
+
targets: [],
|
111
|
+
help_text: 'sample help text'
|
112
|
+
)
|
113
|
+
expect { rika_command.send(:report_and_exit_if_no_targets_specified) }.to raise_error(SystemExit)
|
114
|
+
expect(rika_command).to have_received(:help_text).once
|
115
|
+
output = $stderr.string
|
116
|
+
expect(output).to match(/No targets specified/)
|
117
|
+
expect(output).to include('sample help text')
|
118
|
+
end
|
119
|
+
end
|
120
|
+
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'rika/formatters'
|
5
|
+
|
6
|
+
describe Rika::Formatters do
|
7
|
+
describe '.get' do
|
8
|
+
let(:rf) { described_class }
|
9
|
+
|
10
|
+
it 'returns the correct formatter for each option character' do
|
11
|
+
expect(rf.get('a')).to eq(rf::AWESOME_PRINT_FORMATTER)
|
12
|
+
expect(rf.get('i')).to eq(rf::INSPECT_FORMATTER)
|
13
|
+
expect(rf.get('j')).to eq(rf::JSON_FORMATTER)
|
14
|
+
expect(rf.get('J')).to eq(rf::PRETTY_JSON_FORMATTER)
|
15
|
+
expect(rf.get('t')).to eq(rf::TO_S_FORMATTER)
|
16
|
+
expect(rf.get('y')).to eq(rf::YAML_FORMATTER)
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'raises an error if the option character is invalid' do
|
20
|
+
expect { rf.get('x') }.to raise_error(KeyError)
|
21
|
+
end
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'rika/parse_result'
|
5
|
+
|
6
|
+
describe Rika::ParseResult do
|
7
|
+
context 'when initialized' do
|
8
|
+
specify 'contains the necessary fields' do
|
9
|
+
expect(described_class.new).to respond_to(
|
10
|
+
:content,
|
11
|
+
:text, # alias for content
|
12
|
+
:metadata,
|
13
|
+
:metadata_java,
|
14
|
+
:content_type,
|
15
|
+
:language,
|
16
|
+
:input_type,
|
17
|
+
:data_source,
|
18
|
+
:max_content_length
|
19
|
+
)
|
20
|
+
end
|
21
|
+
end
|
22
|
+
|
23
|
+
describe '#file?' do
|
24
|
+
specify 'returns true if input_type is :file' do
|
25
|
+
expect(described_class.new(input_type: :file).file?).to be true
|
26
|
+
end
|
27
|
+
|
28
|
+
specify 'returns false if input_type is not :file' do
|
29
|
+
expect(described_class.new.file?).to be false
|
30
|
+
end
|
31
|
+
end
|
32
|
+
|
33
|
+
describe '#http?' do
|
34
|
+
specify 'returns true if input_type is :http' do
|
35
|
+
expect(described_class.new(input_type: :http).http?).to be true
|
36
|
+
end
|
37
|
+
|
38
|
+
specify 'returns false if input_type is not :http' do
|
39
|
+
expect(described_class.new.http?).to be false
|
40
|
+
end
|
41
|
+
end
|
42
|
+
end
|
@@ -0,0 +1,304 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'rika/parser'
|
5
|
+
require 'rika/parse_result'
|
6
|
+
require 'webrick'
|
7
|
+
|
8
|
+
describe Rika::Parser do
|
9
|
+
port = 50515
|
10
|
+
|
11
|
+
let(:text_parse_result) { Rika.parse(fixture_path('document.txt')) }
|
12
|
+
let(:docx_parse_result) { Rika.parse(fixture_path('document.docx')) }
|
13
|
+
let(:doc_parse_result) { Rika.parse(fixture_path('document.doc')) }
|
14
|
+
let(:pdf_parse_result) { Rika.parse(fixture_path('document.pdf')) }
|
15
|
+
let(:image_parse_result) { Rika.parse(fixture_path('image.jpg')) }
|
16
|
+
let(:unknown_parse_result) { Rika.parse(fixture_path('unknown.bin')) }
|
17
|
+
let(:fixtures_dir) { File.expand_path(File.join(File.dirname(__FILE__), '../fixtures')) }
|
18
|
+
let(:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }
|
19
|
+
let(:url) { "http://#{Socket.gethostname}:#{port}" }
|
20
|
+
let(:sample_pdf_filespec) { fixture_path('document.pdf') }
|
21
|
+
let(:first_line) { ->(string) { string.split("\n").first.strip } }
|
22
|
+
|
23
|
+
# returns a lambda that, when passed an action, will wrap it in an HTTP server
|
24
|
+
let(:server_runner) do
|
25
|
+
->(action) do
|
26
|
+
server = nil
|
27
|
+
server_thread = Thread.new do
|
28
|
+
server = WEBrick::HTTPServer.new(
|
29
|
+
Port: port,
|
30
|
+
DocumentRoot: fixtures_dir,
|
31
|
+
AccessLog: [],
|
32
|
+
Logger: WEBrick::Log.new('/dev/null')
|
33
|
+
)
|
34
|
+
server.start
|
35
|
+
end
|
36
|
+
|
37
|
+
# Wait for server to become ready on its new thread
|
38
|
+
sleep 0.01 while server.nil?
|
39
|
+
begin
|
40
|
+
action.call
|
41
|
+
ensure
|
42
|
+
server.shutdown
|
43
|
+
server_thread.exit
|
44
|
+
end
|
45
|
+
end
|
46
|
+
end
|
47
|
+
|
48
|
+
context 'when initialized with a content string and metadata' do
|
49
|
+
let(:content) { 'Magnifique' }
|
50
|
+
let(:metadata) { { 'author' => 'John Doe' } }
|
51
|
+
let(:result) { Rika::ParseResult.new(content: content, metadata: metadata) }
|
52
|
+
|
53
|
+
specify '#content_and_metadata_hash returns a hash with content and metadata' do
|
54
|
+
expect(result.content_and_metadata_hash).to eq({ content: content, metadata: metadata })
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
describe '#parse' do
|
59
|
+
let(:parser) { described_class.new('spec/fixtures/document.pdf') }
|
60
|
+
let(:parse_result) { parser.parse }
|
61
|
+
let(:metadata) { parse_result.metadata }
|
62
|
+
|
63
|
+
specify 'returns an instance of ParseResult' do
|
64
|
+
expect(parse_result).to be_a(Rika::ParseResult)
|
65
|
+
end
|
66
|
+
|
67
|
+
specify 'returns a ParseResult with the expected access methods' do
|
68
|
+
expect(parse_result).to respond_to(
|
69
|
+
:content,
|
70
|
+
:metadata,
|
71
|
+
:metadata_java,
|
72
|
+
:content_type,
|
73
|
+
:language,
|
74
|
+
:input_type,
|
75
|
+
:data_source,
|
76
|
+
:max_content_length
|
77
|
+
)
|
78
|
+
end
|
79
|
+
|
80
|
+
specify 'returns a ParseResult with the expected content' do
|
81
|
+
expect(parse_result.content).to include('Stopping by Woods on a Snowy Evening')
|
82
|
+
end
|
83
|
+
|
84
|
+
specify 'returns a ParseResult with the expected metadata' do
|
85
|
+
expect(parse_result.metadata).to include(
|
86
|
+
'dc:creator' => 'Robert Frost',
|
87
|
+
'dc:format' => 'application/pdf; version=1.3',
|
88
|
+
'dc:title' => 'Stopping by Woods on a Snowy Evening',
|
89
|
+
'rika:data-source' => 'spec/fixtures/document.pdf',
|
90
|
+
'rika:language' => 'en'
|
91
|
+
)
|
92
|
+
end
|
93
|
+
|
94
|
+
specify 'returns a ParseResult with the expected metadata_java' do
|
95
|
+
expect(parse_result.metadata_java).to be_a(Java::OrgApacheTikaMetadata::Metadata)
|
96
|
+
end
|
97
|
+
|
98
|
+
specify 'returns a ParseResult with the expected content_type' do
|
99
|
+
expect(parse_result.content_type).to eq('application/pdf')
|
100
|
+
end
|
101
|
+
|
102
|
+
specify 'returns a ParseResult with the expected language' do
|
103
|
+
expect(parse_result.language).to eq('en')
|
104
|
+
end
|
105
|
+
|
106
|
+
specify 'returns a ParseResult with the expected input_type' do
|
107
|
+
expect(parse_result.input_type).to eq(:file)
|
108
|
+
end
|
109
|
+
|
110
|
+
specify 'returns a ParseResult with the expected data_source' do
|
111
|
+
expect(parse_result.data_source).to eq('spec/fixtures/document.pdf')
|
112
|
+
end
|
113
|
+
|
114
|
+
describe 'metadata key sorting' do
|
115
|
+
RSpec.shared_examples('metadata key sorting') do |caption, key_sort|
|
116
|
+
specify "Metadata keys are #{caption} case insensitively when key_sort is #{key_sort}" do
|
117
|
+
parser = described_class.new('spec/fixtures/document.pdf', key_sort: key_sort)
|
118
|
+
keys = parser.parse.metadata.keys
|
119
|
+
expect(keys == keys.sort_by(&:downcase)).to eq(key_sort)
|
120
|
+
expect(keys).not_to eq(keys.map(&:downcase)) # Above test only valid if both upper and lower case occur.
|
121
|
+
end
|
122
|
+
end
|
123
|
+
|
124
|
+
include_examples 'metadata key sorting', 'sorted', true
|
125
|
+
include_examples 'metadata key sorting', 'not sorted', false
|
126
|
+
end
|
127
|
+
|
128
|
+
specify 'returns a ParseResult with the expected max_content_length' do
|
129
|
+
expect(parse_result.max_content_length).to eq(-1)
|
130
|
+
end
|
131
|
+
end
|
132
|
+
|
133
|
+
it 'raises an error if the file does not exist' do
|
134
|
+
expect { Rika.parse(fixture_path('nonexistent_file.txt')) }.to raise_error(IOError)
|
135
|
+
end
|
136
|
+
|
137
|
+
it 'raises an error if the URL does not exist' do
|
138
|
+
unavailable_server = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
|
139
|
+
unavailable_file_on_web = File.join(unavailable_server, 'x.pdf')
|
140
|
+
expect { Rika.parse(unavailable_file_on_web) }.to raise_error(Java::JavaNet::UnknownHostException)
|
141
|
+
end
|
142
|
+
|
143
|
+
it 'detects a file type without a file extension' do
|
144
|
+
parse_result = Rika.parse(fixture_path('image_jpg_without_extension'))
|
145
|
+
expect(parse_result.metadata['Content-Type']).to eq('image/jpeg')
|
146
|
+
end
|
147
|
+
|
148
|
+
describe '#content' do
|
149
|
+
it 'returns the content in a text file' do
|
150
|
+
expect(first_line.(text_parse_result.content)).to eq(quote_first_line)
|
151
|
+
end
|
152
|
+
|
153
|
+
it 'returns the content in a docx file' do
|
154
|
+
expect(first_line.(docx_parse_result.content)).to eq(quote_first_line)
|
155
|
+
end
|
156
|
+
|
157
|
+
it 'returns the content in a pdf file' do
|
158
|
+
# For some reason, the generated PDF file has a newline at the beginning
|
159
|
+
# and trailing spaces on the lines, so we use the second line, and
|
160
|
+
# use `include` to do the text match.
|
161
|
+
expect(pdf_parse_result.content.lines[1]).to include(quote_first_line)
|
162
|
+
end
|
163
|
+
|
164
|
+
it 'returns no content for an image' do
|
165
|
+
expect(image_parse_result.content).to be_empty
|
166
|
+
end
|
167
|
+
|
168
|
+
it 'only returns max content length from a text file' do
|
169
|
+
expect(Rika.parse(fixture_path('document.txt'), max_content_length: 8).content).to eq('Stopping')
|
170
|
+
end
|
171
|
+
|
172
|
+
it 'only returns max content length from a PDF' do
|
173
|
+
expect(Rika.parse(fixture_path('document.pdf'), max_content_length: 9).content).to eq("\nStopping")
|
174
|
+
end
|
175
|
+
|
176
|
+
it 'only returns max content length for file over http' do
|
177
|
+
server_runner.call(-> do
|
178
|
+
content = Rika.parse(File.join(url, 'document.txt'), max_content_length: 8).content
|
179
|
+
expect(content).to eq('Stopping')
|
180
|
+
end)
|
181
|
+
end
|
182
|
+
|
183
|
+
it 'returns the content from a file over http' do
|
184
|
+
content = server_runner.call(-> do
|
185
|
+
Rika.parse(File.join(url, 'document.txt')).content
|
186
|
+
end)
|
187
|
+
expect(first_line.(content)).to eq(quote_first_line)
|
188
|
+
end
|
189
|
+
|
190
|
+
it 'return empty string for unknown file' do
|
191
|
+
expect(unknown_parse_result.content).to be_empty
|
192
|
+
end
|
193
|
+
end
|
194
|
+
|
195
|
+
# We just test a few of the metadata fields for some common file formats
|
196
|
+
# to make sure the integration with Apache Tika works. Apache Tika already
|
197
|
+
# has tests for all file formats it supports, so we won't retest them here.
|
198
|
+
describe '#metadata' do
|
199
|
+
it 'returns nil if metadata field does not exist' do
|
200
|
+
expect(text_parse_result.metadata['nonsense']).to be_nil
|
201
|
+
end
|
202
|
+
|
203
|
+
it 'returns metadata from a docx file' do
|
204
|
+
expect(docx_parse_result.metadata['meta:page-count']).to eq('1')
|
205
|
+
end
|
206
|
+
|
207
|
+
it 'returns metadata from a pdf file' do
|
208
|
+
expect(pdf_parse_result.metadata['pdf:docinfo:creator']).to eq('Robert Frost')
|
209
|
+
end
|
210
|
+
|
211
|
+
it 'returns metadata from a file over http' do
|
212
|
+
server_runner.call(-> do
|
213
|
+
parser = Rika.parse(File.join(url, 'document.pdf'))
|
214
|
+
expect(parser.metadata['pdf:docinfo:creator']).to eq('Robert Frost')
|
215
|
+
end)
|
216
|
+
end
|
217
|
+
|
218
|
+
it 'returns metadata from an image' do
|
219
|
+
expect(image_parse_result.metadata['Image Height']).to eq('72 pixels')
|
220
|
+
expect(image_parse_result.metadata['Image Width']).to eq('72 pixels')
|
221
|
+
end
|
222
|
+
end
|
223
|
+
|
224
|
+
describe '#content_type' do
|
225
|
+
it 'returns application/pdf for a pdf file' do
|
226
|
+
expect(pdf_parse_result.content_type).to eq('application/pdf')
|
227
|
+
end
|
228
|
+
|
229
|
+
it 'returns text/plain for a txt file' do
|
230
|
+
expect(text_parse_result.content_type).to eq('text/plain; charset=UTF-8')
|
231
|
+
end
|
232
|
+
|
233
|
+
it 'returns application/pdf for a pdf over http' do
|
234
|
+
server_runner.call(-> do
|
235
|
+
parse_result = Rika.parse(File.join(url, 'document.pdf'))
|
236
|
+
expect(parse_result.content_type).to eq('application/pdf')
|
237
|
+
end)
|
238
|
+
end
|
239
|
+
|
240
|
+
it 'returns application/octet-stream for unknown file' do
|
241
|
+
expect(unknown_parse_result.content_type).to eq('application/octet-stream')
|
242
|
+
end
|
243
|
+
|
244
|
+
it 'returns msword for a doc file' do
|
245
|
+
# There seem to be two permissible content types for a doc file.
|
246
|
+
expect(%w{application/msword application/x-tika-msoffice}.include?(doc_parse_result.content_type)).to be true
|
247
|
+
end
|
248
|
+
|
249
|
+
it 'returns wordprocessingml for a docx file' do
|
250
|
+
expect(docx_parse_result.content_type).to eq(
|
251
|
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
252
|
+
)
|
253
|
+
end
|
254
|
+
end
|
255
|
+
|
256
|
+
describe '#language' do
|
257
|
+
it 'returns the language of the content' do
|
258
|
+
%w(en de fr ru es).each do |lang|
|
259
|
+
parse_result = Rika.parse(fixture_path("#{lang}.txt"))
|
260
|
+
expect(parse_result.language).to eq(lang)
|
261
|
+
end
|
262
|
+
end
|
263
|
+
end
|
264
|
+
|
265
|
+
it 'returns valid content using Rika.parse_content' do
|
266
|
+
content = Rika.parse_content(sample_pdf_filespec)
|
267
|
+
expect(content).to be_a(String)
|
268
|
+
expect(content).not_to be_empty
|
269
|
+
end
|
270
|
+
|
271
|
+
it 'returns valid metadata using Rika.parse_metadata' do
|
272
|
+
metadata = Rika.parse_metadata(sample_pdf_filespec)
|
273
|
+
expect(metadata).to be_a(Hash)
|
274
|
+
expect(metadata).not_to be_empty
|
275
|
+
end
|
276
|
+
|
277
|
+
it 'returns valid content and metadata using Rika.parse_content_and_metadata' do
|
278
|
+
content, metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
279
|
+
expect(content).to be_a(String)
|
280
|
+
expect(content).not_to be_empty
|
281
|
+
expect(metadata).to be_a(Hash)
|
282
|
+
expect(metadata).not_to be_empty
|
283
|
+
end
|
284
|
+
|
285
|
+
specify 'both means of getting both content and metadata return the same values' do
|
286
|
+
content1, metadata1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
287
|
+
|
288
|
+
h = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)
|
289
|
+
content2 = h[:content]
|
290
|
+
metadata2 = h[:metadata]
|
291
|
+
|
292
|
+
expect(content1).to eq(content2)
|
293
|
+
expect(metadata1).to eq(metadata2)
|
294
|
+
end
|
295
|
+
|
296
|
+
specify 'getting content and metadata individually and together return the same values' do
|
297
|
+
content1, metadata1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
298
|
+
content2 = Rika.parse_content(sample_pdf_filespec)
|
299
|
+
metadata2 = Rika.parse_metadata(sample_pdf_filespec)
|
300
|
+
|
301
|
+
expect(content1).to eq(content2)
|
302
|
+
expect(metadata1).to eq(metadata2)
|
303
|
+
end
|
304
|
+
end
|