rika 1.11.1-java → 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/rika_helper.rb CHANGED
@@ -1,3 +1,5 @@
1
+ # frozen_string_literal: true
2
+
1
3
  # Defines some shortcuts for ad-hoc work with Rika.
2
4
  #
3
5
  # Can be used with the `irb`/`jirb` or `pry` (https://github.com/pry/pry) interactive shells:
@@ -11,28 +13,18 @@
11
13
 
12
14
  require 'rika'
13
15
 
14
- def c(resource)
15
- Rika.parse_content(resource)
16
- end
17
-
18
- def m(resource)
19
- Rika.parse_metadata(resource)
16
+ # Add shortcut to Rika.parse.
17
+ def pa(resource)
18
+ Rika.parse(resource)
20
19
  end
21
20
 
22
- def cm(resource)
23
- Rika.parse_content_and_metadata(resource)
21
+ # Add abbreviated aliases for the ParseResult class methods.
22
+ class ParseResult
23
+ alias c content
24
+ alias m metadata
25
+ alias l language
26
+ alias i input_type
27
+ alias d data_source
28
+ alias t content_type
29
+ alias j metadata_java
24
30
  end
25
-
26
- def cmh(resource)
27
- Rika.parse_content_and_metadata_as_hash(resource)
28
- end
29
-
30
- def mj(resource); m(resource).to_json ; end
31
- def mJ(resource); JSON.pretty_generate(m(resource)) ; end
32
- def my(resource); m(resource).to_yaml ; end
33
- def my(resource); require 'awesome_print'; m(resource).ai ;end
34
-
35
- def cmj(resource); c(resource).to_json; end
36
- def cmJ(resource); JSON.pretty_generate(c(resource)); end
37
- def cmy(resource); c(resource).to_yaml ; end
38
- def cma(resource); require 'awesome_print'; c,m = cm(resource); { content: c, metadata: m }; end
@@ -0,0 +1 @@
1
+ hello
@@ -0,0 +1,117 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+
5
+ require 'rika/cli/args_parser'
6
+
7
+ describe ArgsParser do
8
+ let(:versions_regex) { /Versions:.*Rika: (\d+\.\d+\.\d+(-\w+)?).*Tika: (\d+\.\d+\.\d+(-\w+)?)/ }
9
+
10
+ specify 'returns a hash of options, a target array, and help text' do
11
+ options, targets, help_text = described_class.call([])
12
+ expect(options).to be_a(Hash)
13
+ expect(targets).to be_an(Array)
14
+ expect(help_text).to be_a(String)
15
+ end
16
+
17
+ context 'when parsing options' do
18
+ RSpec.shared_examples 'sets_options_correctly' do |args, option_key, expected_value|
19
+ specify "correctly sets #{option_key} to #{expected_value} when args are #{args}" do
20
+ options, _, _ = described_class.call(args)
21
+ expect(options[option_key]).to eq(expected_value)
22
+ end
23
+ end
24
+
25
+ # Test default option values:
26
+ include_examples('sets_options_correctly', [], :as_array, false)
27
+ include_examples('sets_options_correctly', [], :text, true)
28
+ include_examples('sets_options_correctly', [], :metadata, true)
29
+ include_examples('sets_options_correctly', [], :format, 'at')
30
+ include_examples('sets_options_correctly', [], :key_sort, true)
31
+ include_examples('sets_options_correctly', [], :source, true)
32
+
33
+ # Test -a as_array option:
34
+ include_examples('sets_options_correctly', %w[-a], :as_array, true)
35
+ include_examples('sets_options_correctly', %w[--as_array], :as_array, true)
36
+ include_examples('sets_options_correctly', %w[-a -a-], :as_array, false)
37
+ include_examples('sets_options_correctly', %w[--no-as_array], :as_array, false)
38
+
39
+ # Test -f format option:
40
+ include_examples('sets_options_correctly', %w[-fyy], :format, 'yy')
41
+ include_examples('sets_options_correctly', %w[--format yy], :format, 'yy')
42
+ include_examples('sets_options_correctly', %w[-f yy], :format, 'yy')
43
+ include_examples('sets_options_correctly', %w[-f y], :format, 'yy')
44
+ include_examples('sets_options_correctly', %w[-f yj], :format, 'yj')
45
+ include_examples('sets_options_correctly', %w[-f yjJ], :format, 'yj') # Test extra characters after valid format
46
+
47
+ # Test -m metadata option:
48
+ include_examples('sets_options_correctly', %w[-m- -m], :metadata, true)
49
+ include_examples('sets_options_correctly', %w[-m- -m+], :metadata, true)
50
+ include_examples('sets_options_correctly', %w[--metadata false --metadata], :metadata, true)
51
+ include_examples('sets_options_correctly', %w[-m -m-], :metadata, false)
52
+ include_examples('sets_options_correctly', %w[-m yes], :metadata, true)
53
+ include_examples('sets_options_correctly', %w[-m no], :metadata, false)
54
+ include_examples('sets_options_correctly', %w[-m true], :metadata, true)
55
+ include_examples('sets_options_correctly', %w[-m false], :metadata, false)
56
+ include_examples('sets_options_correctly', %w[--metadata false], :metadata, false)
57
+ include_examples('sets_options_correctly', %w[--no-metadata], :metadata, false)
58
+
59
+ # Test -t text option:
60
+ include_examples('sets_options_correctly', %w[-t], :text, true)
61
+ include_examples('sets_options_correctly', %w[-t -t-], :text, false)
62
+ include_examples('sets_options_correctly', %w[-t yes], :text, true)
63
+ include_examples('sets_options_correctly', %w[-t no], :text, false)
64
+ include_examples('sets_options_correctly', %w[-t true], :text, true)
65
+ include_examples('sets_options_correctly', %w[-t false], :text, false)
66
+ include_examples('sets_options_correctly', %w[--text false], :text, false)
67
+ include_examples('sets_options_correctly', %w[--text false --text], :text, true)
68
+
69
+ # Test -k key sort option:
70
+ include_examples('sets_options_correctly', %w[-k-], :key_sort, false)
71
+
72
+ # Test -s source option:
73
+ include_examples('sets_options_correctly', %w[-s-], :source, false)
74
+ end
75
+
76
+ describe '#versions_string' do
77
+ specify 'returns a Rika version and a Tika version' do
78
+ expect(described_class.new.send(:versions_string)).to match(versions_regex)
79
+ end
80
+ end
81
+
82
+ context 'when processing environment variables' do
83
+ it 'adds arguments from the environment to the args list' do
84
+ args_parser = described_class.new
85
+ allow(args_parser).to receive(:environment_options).and_return('-t-')
86
+ options, _, _ = args_parser.call([])
87
+ expect(options[:text]).to be(false)
88
+ end
89
+
90
+ it 'overrides environment variable options with command line options' do
91
+ env_format_arg = '-fyy'
92
+ cmd_line_format = 'JJ'
93
+ cmd_line_args = ["-f#{cmd_line_format}"]
94
+ args_parser = described_class.new
95
+ allow(args_parser).to receive(:environment_options).and_return(env_format_arg)
96
+ options, _, _ = args_parser.call(cmd_line_args)
97
+ expect(options[:format]).to eq(cmd_line_format)
98
+ end
99
+ end
100
+
101
+ describe 'DEFAULT_OPTIONS hash' do
102
+ specify 'has the correct default values' do
103
+ expect(described_class::DEFAULT_OPTIONS).to eq(
104
+ as_array: false,
105
+ text: true,
106
+ metadata: true,
107
+ format: 'at',
108
+ key_sort: true,
109
+ source: true
110
+ )
111
+ end
112
+
113
+ specify 'is frozen' do
114
+ expect(described_class::DEFAULT_OPTIONS).to be_frozen
115
+ end
116
+ end
117
+ end
@@ -0,0 +1,120 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'rika/cli/rika_command'
5
+
6
+ RF = Rika::Formatters
7
+
8
+ describe RikaCommand do
9
+ let(:versions_regex) { /Versions:.*Rika: (\d+\.\d+\.\d+(-\w+)?).*Tika: (\d+\.\d+\.\d+(-\w+)?)/ }
10
+
11
+ before do
12
+ @original_stdout = $stdout
13
+ @original_stderr = $stderr
14
+ $stdout = StringIO.new
15
+ $stderr = StringIO.new
16
+ end
17
+
18
+ after do
19
+ $stdout = @original_stdout
20
+ $stderr = @original_stderr
21
+ end
22
+
23
+ describe '#call' do
24
+ specify 'call should run the command without error' do
25
+ expect { described_class.new([fixture_path('tiny.txt')]).call }.not_to raise_error
26
+ end
27
+
28
+ specify 'prints version and exits when -v or --version is specified' do
29
+ expect { described_class.new(%w[-v]).call }.to output(versions_regex).to_stdout.and raise_error(SystemExit)
30
+ end
31
+
32
+ specify 'prints help and exits when -h or --help is specified' do
33
+ regex = /Usage: rika \[options\] <file or url> /m
34
+ expect { described_class.new(%w[-h]).call }.to output(regex).to_stdout.and raise_error(SystemExit)
35
+ end
36
+
37
+ specify 'when run in array mode, outputs the string representation of an array of parse results' do
38
+ original_stdout = $stdout
39
+ $stdout = StringIO.new
40
+ begin
41
+ tiny_filespec = fixture_path('tiny.txt')
42
+ args = ['-a', '-fJ', tiny_filespec, tiny_filespec]
43
+ described_class.new(args).call
44
+ output = $stdout.string
45
+ object = JSON.parse(output)
46
+ expect(object).to be_an(Array)
47
+ expect(object.size).to eq(2)
48
+ expect(object.map(&:class)).to eq([Hash, Hash])
49
+ ensure
50
+ $stdout = original_stdout
51
+ end
52
+ end
53
+ end
54
+
55
+ describe '#single_document_output' do
56
+ RSpec.shared_examples 'verify_result_is_hash' do |format_chars, parser|
57
+ specify "correctly uses result hash for JSON and YAML when options are #{format_chars}" do
58
+ original_stdout = $stdout
59
+ $stdout = StringIO.new
60
+ begin
61
+ rika_command = described_class.new(["-f#{format_chars}", fixture_path('tiny.txt')])
62
+ rika_command.call
63
+ output = $stdout.string
64
+ warn output
65
+ result_hash = parser.call(output)
66
+ expect(result_hash).to be_a(Hash)
67
+ expect(result_hash['metadata']).to be_a(Hash)
68
+ expect(result_hash['text']).to be_a(String)
69
+ ensure
70
+ $stdout = original_stdout
71
+ end
72
+ end
73
+ end
74
+
75
+ include_examples('verify_result_is_hash', 'JJ', ->(s) { JSON.parse(s) })
76
+ include_examples('verify_result_is_hash', 'jj', ->(s) { JSON.parse(s) })
77
+ include_examples('verify_result_is_hash', 'yy', ->(s) { YAML.safe_load(s) })
78
+ end
79
+
80
+ describe '#set_output_formats' do
81
+ RSpec.shared_examples 'verify_correct_output_formats_selected' \
82
+ do |format_chars, expected_m_formatter, expected_t_formatter|
83
+ specify "correctly sets output formats when options are #{format_chars}" do
84
+ rika_command = described_class.new(["-f#{format_chars}"])
85
+ rika_command.send(:prepare)
86
+ expect(rika_command.send(:metadata_formatter)).to eq(expected_m_formatter)
87
+ expect(rika_command.send(:text_formatter)).to eq(expected_t_formatter)
88
+ end
89
+ end
90
+
91
+ include_examples('verify_correct_output_formats_selected', 'aj', RF::AWESOME_PRINT_FORMATTER, RF::JSON_FORMATTER)
92
+ include_examples('verify_correct_output_formats_selected', 'Jy', RF::PRETTY_JSON_FORMATTER, RF::YAML_FORMATTER)
93
+ include_examples('verify_correct_output_formats_selected', 'ti', RF::TO_S_FORMATTER, RF::INSPECT_FORMATTER)
94
+
95
+ RSpec.shared_examples 'verify_bad_output_format_exits' do |format_chars|
96
+ specify "exits when a bad output format is specified with #{format_chars}" do
97
+ expect { described_class.new(["-f#{format_chars}"]).call }.to raise_error(SystemExit)
98
+ end
99
+ end
100
+
101
+ include_examples 'verify_bad_output_format_exits', 'ax'
102
+ include_examples 'verify_bad_output_format_exits', 'xa'
103
+ include_examples 'verify_bad_output_format_exits', 'x'
104
+ end
105
+
106
+ describe '#warn_if_no_targets_specified' do
107
+ it 'prints a warning if no targets are specified' do
108
+ rika_command = described_class.new([])
109
+ allow(rika_command).to receive_messages(
110
+ targets: [],
111
+ help_text: 'sample help text'
112
+ )
113
+ expect { rika_command.send(:report_and_exit_if_no_targets_specified) }.to raise_error(SystemExit)
114
+ expect(rika_command).to have_received(:help_text).once
115
+ output = $stderr.string
116
+ expect(output).to match(/No targets specified/)
117
+ expect(output).to include('sample help text')
118
+ end
119
+ end
120
+ end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'rika/formatters'
5
+
6
+ describe Rika::Formatters do
7
+ describe '.get' do
8
+ let(:rf) { described_class }
9
+
10
+ it 'returns the correct formatter for each option character' do
11
+ expect(rf.get('a')).to eq(rf::AWESOME_PRINT_FORMATTER)
12
+ expect(rf.get('i')).to eq(rf::INSPECT_FORMATTER)
13
+ expect(rf.get('j')).to eq(rf::JSON_FORMATTER)
14
+ expect(rf.get('J')).to eq(rf::PRETTY_JSON_FORMATTER)
15
+ expect(rf.get('t')).to eq(rf::TO_S_FORMATTER)
16
+ expect(rf.get('y')).to eq(rf::YAML_FORMATTER)
17
+ end
18
+
19
+ it 'raises an error if the option character is invalid' do
20
+ expect { rf.get('x') }.to raise_error(KeyError)
21
+ end
22
+ end
23
+ end
@@ -0,0 +1,42 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'rika/parse_result'
5
+
6
+ describe Rika::ParseResult do
7
+ context 'when initialized' do
8
+ specify 'contains the necessary fields' do
9
+ expect(described_class.new).to respond_to(
10
+ :content,
11
+ :text, # alias for content
12
+ :metadata,
13
+ :metadata_java,
14
+ :content_type,
15
+ :language,
16
+ :input_type,
17
+ :data_source,
18
+ :max_content_length
19
+ )
20
+ end
21
+ end
22
+
23
+ describe '#file?' do
24
+ specify 'returns true if input_type is :file' do
25
+ expect(described_class.new(input_type: :file).file?).to be true
26
+ end
27
+
28
+ specify 'returns false if input_type is not :file' do
29
+ expect(described_class.new.file?).to be false
30
+ end
31
+ end
32
+
33
+ describe '#http?' do
34
+ specify 'returns true if input_type is :http' do
35
+ expect(described_class.new(input_type: :http).http?).to be true
36
+ end
37
+
38
+ specify 'returns false if input_type is not :http' do
39
+ expect(described_class.new.http?).to be false
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,304 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'rika/parser'
5
+ require 'rika/parse_result'
6
+ require 'webrick'
7
+
8
+ describe Rika::Parser do
9
+ port = 50515
10
+
11
+ let(:text_parse_result) { Rika.parse(fixture_path('document.txt')) }
12
+ let(:docx_parse_result) { Rika.parse(fixture_path('document.docx')) }
13
+ let(:doc_parse_result) { Rika.parse(fixture_path('document.doc')) }
14
+ let(:pdf_parse_result) { Rika.parse(fixture_path('document.pdf')) }
15
+ let(:image_parse_result) { Rika.parse(fixture_path('image.jpg')) }
16
+ let(:unknown_parse_result) { Rika.parse(fixture_path('unknown.bin')) }
17
+ let(:fixtures_dir) { File.expand_path(File.join(File.dirname(__FILE__), '../fixtures')) }
18
+ let(:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }
19
+ let(:url) { "http://#{Socket.gethostname}:#{port}" }
20
+ let(:sample_pdf_filespec) { fixture_path('document.pdf') }
21
+ let(:first_line) { ->(string) { string.split("\n").first.strip } }
22
+
23
+ # returns a lambda that, when passed an action, will wrap it in an HTTP server
24
+ let(:server_runner) do
25
+ ->(action) do
26
+ server = nil
27
+ server_thread = Thread.new do
28
+ server = WEBrick::HTTPServer.new(
29
+ Port: port,
30
+ DocumentRoot: fixtures_dir,
31
+ AccessLog: [],
32
+ Logger: WEBrick::Log.new('/dev/null')
33
+ )
34
+ server.start
35
+ end
36
+
37
+ # Wait for server to become ready on its new thread
38
+ sleep 0.01 while server.nil?
39
+ begin
40
+ action.call
41
+ ensure
42
+ server.shutdown
43
+ server_thread.exit
44
+ end
45
+ end
46
+ end
47
+
48
+ context 'when initialized with a content string and metadata' do
49
+ let(:content) { 'Magnifique' }
50
+ let(:metadata) { { 'author' => 'John Doe' } }
51
+ let(:result) { Rika::ParseResult.new(content: content, metadata: metadata) }
52
+
53
+ specify '#content_and_metadata_hash returns a hash with content and metadata' do
54
+ expect(result.content_and_metadata_hash).to eq({ content: content, metadata: metadata })
55
+ end
56
+ end
57
+
58
+ describe '#parse' do
59
+ let(:parser) { described_class.new('spec/fixtures/document.pdf') }
60
+ let(:parse_result) { parser.parse }
61
+ let(:metadata) { parse_result.metadata }
62
+
63
+ specify 'returns an instance of ParseResult' do
64
+ expect(parse_result).to be_a(Rika::ParseResult)
65
+ end
66
+
67
+ specify 'returns a ParseResult with the expected access methods' do
68
+ expect(parse_result).to respond_to(
69
+ :content,
70
+ :metadata,
71
+ :metadata_java,
72
+ :content_type,
73
+ :language,
74
+ :input_type,
75
+ :data_source,
76
+ :max_content_length
77
+ )
78
+ end
79
+
80
+ specify 'returns a ParseResult with the expected content' do
81
+ expect(parse_result.content).to include('Stopping by Woods on a Snowy Evening')
82
+ end
83
+
84
+ specify 'returns a ParseResult with the expected metadata' do
85
+ expect(parse_result.metadata).to include(
86
+ 'dc:creator' => 'Robert Frost',
87
+ 'dc:format' => 'application/pdf; version=1.3',
88
+ 'dc:title' => 'Stopping by Woods on a Snowy Evening',
89
+ 'rika:data-source' => 'spec/fixtures/document.pdf',
90
+ 'rika:language' => 'en'
91
+ )
92
+ end
93
+
94
+ specify 'returns a ParseResult with the expected metadata_java' do
95
+ expect(parse_result.metadata_java).to be_a(Java::OrgApacheTikaMetadata::Metadata)
96
+ end
97
+
98
+ specify 'returns a ParseResult with the expected content_type' do
99
+ expect(parse_result.content_type).to eq('application/pdf')
100
+ end
101
+
102
+ specify 'returns a ParseResult with the expected language' do
103
+ expect(parse_result.language).to eq('en')
104
+ end
105
+
106
+ specify 'returns a ParseResult with the expected input_type' do
107
+ expect(parse_result.input_type).to eq(:file)
108
+ end
109
+
110
+ specify 'returns a ParseResult with the expected data_source' do
111
+ expect(parse_result.data_source).to eq('spec/fixtures/document.pdf')
112
+ end
113
+
114
+ describe 'metadata key sorting' do
115
+ RSpec.shared_examples('metadata key sorting') do |caption, key_sort|
116
+ specify "Metadata keys are #{caption} case insensitively when key_sort is #{key_sort}" do
117
+ parser = described_class.new('spec/fixtures/document.pdf', key_sort: key_sort)
118
+ keys = parser.parse.metadata.keys
119
+ expect(keys == keys.sort_by(&:downcase)).to eq(key_sort)
120
+ expect(keys).not_to eq(keys.map(&:downcase)) # Above test only valid if both upper and lower case occur.
121
+ end
122
+ end
123
+
124
+ include_examples 'metadata key sorting', 'sorted', true
125
+ include_examples 'metadata key sorting', 'not sorted', false
126
+ end
127
+
128
+ specify 'returns a ParseResult with the expected max_content_length' do
129
+ expect(parse_result.max_content_length).to eq(-1)
130
+ end
131
+ end
132
+
133
+ it 'raises an error if the file does not exist' do
134
+ expect { Rika.parse(fixture_path('nonexistent_file.txt')) }.to raise_error(IOError)
135
+ end
136
+
137
+ it 'raises an error if the URL does not exist' do
138
+ unavailable_server = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
139
+ unavailable_file_on_web = File.join(unavailable_server, 'x.pdf')
140
+ expect { Rika.parse(unavailable_file_on_web) }.to raise_error(Java::JavaNet::UnknownHostException)
141
+ end
142
+
143
+ it 'detects a file type without a file extension' do
144
+ parse_result = Rika.parse(fixture_path('image_jpg_without_extension'))
145
+ expect(parse_result.metadata['Content-Type']).to eq('image/jpeg')
146
+ end
147
+
148
+ describe '#content' do
149
+ it 'returns the content in a text file' do
150
+ expect(first_line.(text_parse_result.content)).to eq(quote_first_line)
151
+ end
152
+
153
+ it 'returns the content in a docx file' do
154
+ expect(first_line.(docx_parse_result.content)).to eq(quote_first_line)
155
+ end
156
+
157
+ it 'returns the content in a pdf file' do
158
+ # For some reason, the generated PDF file has a newline at the beginning
159
+ # and trailing spaces on the lines, so we use the second line, and
160
+ # use `include` to do the text match.
161
+ expect(pdf_parse_result.content.lines[1]).to include(quote_first_line)
162
+ end
163
+
164
+ it 'returns no content for an image' do
165
+ expect(image_parse_result.content).to be_empty
166
+ end
167
+
168
+ it 'only returns max content length from a text file' do
169
+ expect(Rika.parse(fixture_path('document.txt'), max_content_length: 8).content).to eq('Stopping')
170
+ end
171
+
172
+ it 'only returns max content length from a PDF' do
173
+ expect(Rika.parse(fixture_path('document.pdf'), max_content_length: 9).content).to eq("\nStopping")
174
+ end
175
+
176
+ it 'only returns max content length for file over http' do
177
+ server_runner.call(-> do
178
+ content = Rika.parse(File.join(url, 'document.txt'), max_content_length: 8).content
179
+ expect(content).to eq('Stopping')
180
+ end)
181
+ end
182
+
183
+ it 'returns the content from a file over http' do
184
+ content = server_runner.call(-> do
185
+ Rika.parse(File.join(url, 'document.txt')).content
186
+ end)
187
+ expect(first_line.(content)).to eq(quote_first_line)
188
+ end
189
+
190
+ it 'return empty string for unknown file' do
191
+ expect(unknown_parse_result.content).to be_empty
192
+ end
193
+ end
194
+
195
+ # We just test a few of the metadata fields for some common file formats
196
+ # to make sure the integration with Apache Tika works. Apache Tika already
197
+ has tests for all file formats it supports, so we won't retest them.
198
+ describe '#metadata' do
199
+ it 'returns nil if metadata field does not exist' do
200
+ expect(text_parse_result.metadata['nonsense']).to be_nil
201
+ end
202
+
203
+ it 'returns metadata from a docx file' do
204
+ expect(docx_parse_result.metadata['meta:page-count']).to eq('1')
205
+ end
206
+
207
+ it 'returns metadata from a pdf file' do
208
+ expect(pdf_parse_result.metadata['pdf:docinfo:creator']).to eq('Robert Frost')
209
+ end
210
+
211
+ it 'returns metadata from a file over http' do
212
+ server_runner.call(-> do
213
+ parser = Rika.parse(File.join(url, 'document.pdf'))
214
+ expect(parser.metadata['pdf:docinfo:creator']).to eq('Robert Frost')
215
+ end)
216
+ end
217
+
218
+ it 'returns metadata from an image' do
219
+ expect(image_parse_result.metadata['Image Height']).to eq('72 pixels')
220
+ expect(image_parse_result.metadata['Image Width']).to eq('72 pixels')
221
+ end
222
+ end
223
+
224
+ describe '#content_type' do
225
+ it 'returns application/pdf for a pdf file' do
226
+ expect(pdf_parse_result.content_type).to eq('application/pdf')
227
+ end
228
+
229
+ it 'returns text/plain for a txt file' do
230
+ expect(text_parse_result.content_type).to eq('text/plain; charset=UTF-8')
231
+ end
232
+
233
+ it 'returns application/pdf for a pdf over http' do
234
+ server_runner.call(-> do
235
+ parse_result = Rika.parse(File.join(url, 'document.pdf'))
236
+ expect(parse_result.content_type).to eq('application/pdf')
237
+ end)
238
+ end
239
+
240
+ it 'returns application/octet-stream for unknown file' do
241
+ expect(unknown_parse_result.content_type).to eq('application/octet-stream')
242
+ end
243
+
244
+ it 'returns msword for a doc file' do
245
+ # There seem to be two permissible content types for a doc file.
246
+ expect(%w{application/msword application/x-tika-msoffice}.include?(doc_parse_result.content_type)).to be true
247
+ end
248
+
249
+ it 'returns wordprocessingml for a docx file' do
250
+ expect(docx_parse_result.content_type).to eq(
251
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
252
+ )
253
+ end
254
+ end
255
+
256
+ describe '#language' do
257
+ it 'returns the language of the content' do
258
+ %w(en de fr ru es).each do |lang|
259
+ parse_result = Rika.parse(fixture_path("#{lang}.txt"))
260
+ expect(parse_result.language).to eq(lang)
261
+ end
262
+ end
263
+ end
264
+
265
+ it 'returns valid content using Rika.parse_content' do
266
+ content = Rika.parse_content(sample_pdf_filespec)
267
+ expect(content).to be_a(String)
268
+ expect(content).not_to be_empty
269
+ end
270
+
271
+ it 'returns valid metadata using Rika.parse_metadata' do
272
+ metadata = Rika.parse_metadata(sample_pdf_filespec)
273
+ expect(metadata).to be_a(Hash)
274
+ expect(metadata).not_to be_empty
275
+ end
276
+
277
+ it 'returns valid content and metadata using Rika.parse_content_and_metadata' do
278
+ content, metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
279
+ expect(content).to be_a(String)
280
+ expect(content).not_to be_empty
281
+ expect(metadata).to be_a(Hash)
282
+ expect(metadata).not_to be_empty
283
+ end
284
+
285
+ specify 'both means of getting both content and metadata return the same values' do
286
+ content1, metadata1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
287
+
288
+ h = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)
289
+ content2 = h[:content]
290
+ metadata2 = h[:metadata]
291
+
292
+ expect(content1).to eq(content2)
293
+ expect(metadata1).to eq(metadata2)
294
+ end
295
+
296
+ specify 'getting content and metadata individually and together return the same values' do
297
+ content1, metadata1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
298
+ content2 = Rika.parse_content(sample_pdf_filespec)
299
+ metadata2 = Rika.parse_metadata(sample_pdf_filespec)
300
+
301
+ expect(content1).to eq(content2)
302
+ expect(metadata1).to eq(metadata2)
303
+ end
304
+ end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'rika'
5
+
6
+ describe Rika do
7
+ it 'has a version number' do
8
+ expect(Rika::VERSION).not_to be_nil
9
+ end
10
+ end