rika 1.11.1-java → 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.gitignore +5 -4
- data/.rubocop.yml +49 -0
- data/Gemfile +12 -0
- data/README.md +213 -76
- data/RELEASE_NOTES.md +26 -0
- data/Rakefile +4 -7
- data/bin/rika +13 -0
- data/lib/rika/cli/args_parser.rb +131 -0
- data/lib/rika/cli/rika_command.rb +129 -0
- data/lib/rika/formatters.rb +39 -0
- data/lib/rika/parse_result.rb +34 -0
- data/lib/rika/parser.rb +64 -70
- data/lib/rika/tika_loader.rb +65 -0
- data/lib/rika/version.rb +3 -1
- data/lib/rika.rb +98 -27
- data/rika.gemspec +30 -17
- data/rika_helper.rb +14 -22
- data/spec/fixtures/image_jpg_without_extension +0 -0
- data/spec/fixtures/tiny.txt +1 -0
- data/spec/rika/cli/args_parser_spec.rb +117 -0
- data/spec/rika/cli/rika_command_spec.rb +120 -0
- data/spec/rika/formatters_spec.rb +23 -0
- data/spec/rika/parse_result_spec.rb +42 -0
- data/spec/rika/parser_spec.rb +304 -0
- data/spec/rika/rika_spec.rb +10 -0
- data/spec/rika/tika_loader_spec.rb +57 -0
- data/spec/spec_helper.rb +10 -3
- metadata +40 -49
- data/.travis.yml +0 -7
- data/java-lib/tika-app-1.24.1.jar +0 -0
- data/spec/fixtures/text_file_without_extension +0 -23
- data/spec/rika_spec.rb +0 -245
- /data/spec/fixtures/{text_file.txt → document.txt} +0 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'rika/tika_loader'
|
5
|
+
|
6
|
+
describe Rika::TikaLoader do
|
7
|
+
describe '.require_tika' do
|
8
|
+
it 'returns the correct Tika jar file path' do
|
9
|
+
expect(described_class.require_tika).to match(/tika-app-.*\.jar/)
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'calls print_message_and_exit if the Tika jar file cannot be loaded' do
|
13
|
+
allow(ENV).to receive(:[]).with('TIKA_JAR_FILESPEC').and_return('nonexistent_file')
|
14
|
+
expect { described_class.require_tika }.to raise_error(Rika::TikaLoadError) \
|
15
|
+
.with_message(/Unable to load Tika jar file from 'nonexistent_file'./)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
describe '.specified_tika_filespec' do
|
20
|
+
it 'returns the correct Tika jar file path' do
|
21
|
+
expect(described_class.send(:specified_tika_filespec)).to match(/tika-app-.*\.jar/)
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'raises a TikaLoadError if the Tika jar filespec is not specified at all in TIKA_JAR_FILESPEC' do
|
25
|
+
allow(ENV).to receive(:[]).with('TIKA_JAR_FILESPEC').and_return(nil)
|
26
|
+
expect { described_class.send(:specified_tika_filespec) }.to raise_error(Rika::TikaLoadError) \
|
27
|
+
.with_message(/Environment variable TIKA_JAR_FILESPEC is not set./)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe '.print_message_and_exit' do
|
32
|
+
it 'prints the correct message and exits with an exit code of 1' do
|
33
|
+
stderr_orig = $stderr
|
34
|
+
$stderr = StringIO.new
|
35
|
+
|
36
|
+
begin
|
37
|
+
expect { described_class.send(:print_message_and_exit, 'message') }.to raise_error(SystemExit) do |error|
|
38
|
+
expect(error.status).to eq(1)
|
39
|
+
end
|
40
|
+
expect($stderr.string).to match(/message/)
|
41
|
+
ensure
|
42
|
+
$stderr = stderr_orig
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
describe '.formatted_error_message' do
|
48
|
+
it 'returns the correct message' do
|
49
|
+
message = 'This is a test message.'
|
50
|
+
expect(described_class.send(:formatted_error_message, message)).to match(/#{message}/)
|
51
|
+
end
|
52
|
+
|
53
|
+
it 'returns the correct banner' do
|
54
|
+
expect(described_class.send(:formatted_error_message, 'message').lines.grep(/!{79}/).size).to be >= 2
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,15 +1,22 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
3
|
+
require 'simplecov'
|
4
|
+
SimpleCov.start { add_filter '/spec/' }
|
5
|
+
|
6
|
+
require 'rika'
|
7
|
+
|
8
|
+
def fixture_path(*paths)
|
4
9
|
File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
|
5
10
|
end
|
6
11
|
|
7
12
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
8
13
|
RSpec.configure do |config|
|
9
|
-
|
10
14
|
# Enable the line below if you want ", focus: true" after a test declaration to
|
11
15
|
# denote the only tests that will be run:
|
12
16
|
# config.filter_run :focus
|
13
17
|
|
14
18
|
config.order = 'random'
|
19
|
+
config.example_status_persistence_file_path = 'spec/rspec-failed-tests-control-file.txt'
|
15
20
|
end
|
21
|
+
|
22
|
+
Rika.init
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Richard Nyström
|
@@ -9,56 +9,48 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2023-09-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
20
|
-
name:
|
21
|
-
type: :development
|
19
|
+
version: '0'
|
20
|
+
name: awesome_print
|
22
21
|
prerelease: false
|
22
|
+
type: :runtime
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
|
-
- - "
|
25
|
+
- - ">="
|
26
26
|
- !ruby/object:Gem::Version
|
27
|
-
version: '
|
28
|
-
|
29
|
-
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '13.0'
|
34
|
-
name: rake
|
35
|
-
type: :development
|
36
|
-
prerelease: false
|
37
|
-
version_requirements: !ruby/object:Gem::Requirement
|
38
|
-
requirements:
|
39
|
-
- - "~>"
|
40
|
-
- !ruby/object:Gem::Version
|
41
|
-
version: '13.0'
|
42
|
-
description: " A JRuby wrapper for Apache Tika to extract text and metadata from files\
|
43
|
-
\ of various formats. "
|
27
|
+
version: '0'
|
28
|
+
description: A JRuby wrapper for Apache Tika to extract text and metadata from files
|
29
|
+
of various formats.
|
44
30
|
email:
|
45
31
|
- ricny046@gmail.com
|
46
32
|
- keithrbennett@gmail.com
|
47
|
-
executables:
|
33
|
+
executables:
|
34
|
+
- rika
|
48
35
|
extensions: []
|
49
36
|
extra_rdoc_files: []
|
50
37
|
files:
|
51
38
|
- ".gitignore"
|
52
39
|
- ".rspec"
|
53
|
-
- ".
|
40
|
+
- ".rubocop.yml"
|
54
41
|
- Gemfile
|
55
42
|
- LICENSE.txt
|
56
43
|
- README.md
|
57
44
|
- RELEASE_NOTES.md
|
58
45
|
- Rakefile
|
59
|
-
-
|
46
|
+
- bin/rika
|
60
47
|
- lib/rika.rb
|
48
|
+
- lib/rika/cli/args_parser.rb
|
49
|
+
- lib/rika/cli/rika_command.rb
|
50
|
+
- lib/rika/formatters.rb
|
51
|
+
- lib/rika/parse_result.rb
|
61
52
|
- lib/rika/parser.rb
|
53
|
+
- lib/rika/tika_loader.rb
|
62
54
|
- lib/rika/version.rb
|
63
55
|
- pom.xml
|
64
56
|
- rika.gemspec
|
@@ -67,22 +59,36 @@ files:
|
|
67
59
|
- spec/fixtures/document.doc
|
68
60
|
- spec/fixtures/document.docx
|
69
61
|
- spec/fixtures/document.pdf
|
62
|
+
- spec/fixtures/document.txt
|
70
63
|
- spec/fixtures/en.txt
|
71
64
|
- spec/fixtures/es.txt
|
72
65
|
- spec/fixtures/fr.txt
|
73
66
|
- spec/fixtures/image.jpg
|
67
|
+
- spec/fixtures/image_jpg_without_extension
|
74
68
|
- spec/fixtures/lang_cant_be_determined.txt
|
75
69
|
- spec/fixtures/ru.txt
|
76
|
-
- spec/fixtures/
|
77
|
-
- spec/fixtures/text_file_without_extension
|
70
|
+
- spec/fixtures/tiny.txt
|
78
71
|
- spec/fixtures/unknown.bin
|
79
|
-
- spec/
|
72
|
+
- spec/rika/cli/args_parser_spec.rb
|
73
|
+
- spec/rika/cli/rika_command_spec.rb
|
74
|
+
- spec/rika/formatters_spec.rb
|
75
|
+
- spec/rika/parse_result_spec.rb
|
76
|
+
- spec/rika/parser_spec.rb
|
77
|
+
- spec/rika/rika_spec.rb
|
78
|
+
- spec/rika/tika_loader_spec.rb
|
80
79
|
- spec/spec_helper.rb
|
81
80
|
homepage: https://github.com/keithrbennett/rika
|
82
81
|
licenses:
|
83
82
|
- Apache-2.0
|
84
|
-
metadata:
|
85
|
-
|
83
|
+
metadata:
|
84
|
+
rubygems_mfa_required: 'true'
|
85
|
+
post_install_message: |2+
|
86
|
+
|
87
|
+
Using the rika gem requires that you:
|
88
|
+
1) download the Apache Tika tika-app jar file from https://tika.apache.org/download.html
|
89
|
+
2) place it somewhere accessible to the running application
|
90
|
+
3) specify its location in the TIKA_JAR_FILESPEC environment variable
|
91
|
+
|
86
92
|
rdoc_options: []
|
87
93
|
require_paths:
|
88
94
|
- lib
|
@@ -97,24 +103,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
103
|
- !ruby/object:Gem::Version
|
98
104
|
version: '0'
|
99
105
|
requirements: []
|
100
|
-
rubygems_version: 3.
|
106
|
+
rubygems_version: 3.3.26
|
101
107
|
signing_key:
|
102
108
|
specification_version: 4
|
103
109
|
summary: A JRuby wrapper for Apache Tika to extract text and metadata from files of
|
104
110
|
various formats.
|
105
|
-
test_files:
|
106
|
-
- spec/fixtures/de.txt
|
107
|
-
- spec/fixtures/document.doc
|
108
|
-
- spec/fixtures/document.docx
|
109
|
-
- spec/fixtures/document.pdf
|
110
|
-
- spec/fixtures/en.txt
|
111
|
-
- spec/fixtures/es.txt
|
112
|
-
- spec/fixtures/fr.txt
|
113
|
-
- spec/fixtures/image.jpg
|
114
|
-
- spec/fixtures/lang_cant_be_determined.txt
|
115
|
-
- spec/fixtures/ru.txt
|
116
|
-
- spec/fixtures/text_file.txt
|
117
|
-
- spec/fixtures/text_file_without_extension
|
118
|
-
- spec/fixtures/unknown.bin
|
119
|
-
- spec/rika_spec.rb
|
120
|
-
- spec/spec_helper.rb
|
111
|
+
test_files: []
|
data/.travis.yml
DELETED
Binary file
|
@@ -1,23 +0,0 @@
|
|
1
|
-
Stopping by Woods on a Snowy Evening
|
2
|
-
|
3
|
-
By Robert Frost
|
4
|
-
|
5
|
-
Whose woods these are I think I know.
|
6
|
-
His house is in the village though;
|
7
|
-
He will not see me stopping here
|
8
|
-
To watch his woods fill up with snow.
|
9
|
-
|
10
|
-
My little horse must think it queer
|
11
|
-
To stop without a farmhouse near
|
12
|
-
Between the woods and frozen lake
|
13
|
-
The darkest evening of the year.
|
14
|
-
|
15
|
-
He gives his harness bells a shake
|
16
|
-
To ask if there is some mistake.
|
17
|
-
The only other sound’s the sweep
|
18
|
-
Of easy wind and downy flake.
|
19
|
-
|
20
|
-
The woods are lovely, dark and deep,
|
21
|
-
But I have promises to keep,
|
22
|
-
And miles to go before I sleep,
|
23
|
-
And miles to go before I sleep.
|
data/spec/rika_spec.rb
DELETED
@@ -1,245 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require 'spec_helper'
|
4
|
-
require 'webrick'
|
5
|
-
|
6
|
-
include WEBrick
|
7
|
-
|
8
|
-
describe Rika::Parser do
|
9
|
-
|
10
|
-
let (:txt_parser) { Rika::Parser.new(file_path('text_file.txt')) }
|
11
|
-
let (:docx_parser) { Rika::Parser.new(file_path('document.docx')) }
|
12
|
-
let (:doc_parser) { Rika::Parser.new(file_path('document.doc')) }
|
13
|
-
let (:pdf_parser) { Rika::Parser.new(file_path('document.pdf')) }
|
14
|
-
let (:image_parser) { Rika::Parser.new(file_path('image.jpg')) }
|
15
|
-
let (:unknown_parser) { Rika::Parser.new(file_path('unknown.bin')) }
|
16
|
-
let (:dir) { File.expand_path(File.join(File.dirname(__FILE__), 'fixtures')) }
|
17
|
-
let (:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }
|
18
|
-
|
19
|
-
port = 50515
|
20
|
-
let (:url) { "http://#{Socket.gethostname}:#{port}" }
|
21
|
-
|
22
|
-
let (:sample_pdf_filespec) { file_path('document.pdf') }
|
23
|
-
|
24
|
-
let(:first_line) { ->(string) { string.split("\n").first.strip } }
|
25
|
-
|
26
|
-
let(:server_runner) do
|
27
|
-
# returns a lambda that, when passed an action, will wrap it in an HTTP server
|
28
|
-
->(action) do
|
29
|
-
server = nil
|
30
|
-
server_thread = Thread.new do
|
31
|
-
server = HTTPServer.new(
|
32
|
-
Port: port,
|
33
|
-
DocumentRoot: dir,
|
34
|
-
AccessLog: [],
|
35
|
-
Logger: WEBrick::Log::new('/dev/null')
|
36
|
-
)
|
37
|
-
server.start
|
38
|
-
end
|
39
|
-
|
40
|
-
# Wait for server to become ready on its new thread
|
41
|
-
sleep 0.01 while server.nil?
|
42
|
-
begin
|
43
|
-
action.call
|
44
|
-
ensure
|
45
|
-
server.shutdown
|
46
|
-
server_thread.exit
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
|
52
|
-
it 'should raise error if file does not exist' do
|
53
|
-
expect(-> { Rika::Parser.new(file_path('nonexistent_file.txt')) }).to raise_error(IOError)
|
54
|
-
end
|
55
|
-
|
56
|
-
it 'should raise error if URL does not exist' do
|
57
|
-
unavailable_server = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
|
58
|
-
unavailable_file_on_web = File.join(unavailable_server, 'x.pdf')
|
59
|
-
expect(-> { Rika::Parser.new(unavailable_file_on_web) }).to raise_error(SocketError)
|
60
|
-
end
|
61
|
-
|
62
|
-
it 'should detect file type without a file extension' do
|
63
|
-
parser = Rika::Parser.new(file_path('text_file_without_extension'))
|
64
|
-
expect(parser.metadata['Content-Type']).to eq('text/plain; charset=UTF-8')
|
65
|
-
end
|
66
|
-
|
67
|
-
describe '#content' do
|
68
|
-
it 'should return the content in a text file' do
|
69
|
-
expect(first_line.(txt_parser.content)).to eq(quote_first_line)
|
70
|
-
end
|
71
|
-
|
72
|
-
it 'should return the content in a docx file' do
|
73
|
-
expect(first_line.(docx_parser.content)).to eq(quote_first_line)
|
74
|
-
end
|
75
|
-
|
76
|
-
it 'should return the content in a pdf file' do
|
77
|
-
expect(first_line.(pdf_parser.content)).to eq(quote_first_line)
|
78
|
-
end
|
79
|
-
|
80
|
-
it 'should return no content for an image' do
|
81
|
-
expect(image_parser.metadata.keys).to_not be_empty
|
82
|
-
end
|
83
|
-
|
84
|
-
it 'should only return max content length' do
|
85
|
-
expect(Rika::Parser.new(file_path('text_file.txt'), 9).content).to eq('Stopping')
|
86
|
-
end
|
87
|
-
|
88
|
-
it 'should only return max content length for file over http', focus: true do
|
89
|
-
server_runner.call( -> do
|
90
|
-
expect(Rika::Parser.new(File.join(url, 'document.pdf'), 9).content).to eq('Stopping')
|
91
|
-
end)
|
92
|
-
end
|
93
|
-
|
94
|
-
it 'should return the content from a file over http' do
|
95
|
-
server_runner.call( -> do
|
96
|
-
content = Rika::Parser.new(File.join(url, 'document.pdf')).content
|
97
|
-
expect(first_line.(content)).to eq(quote_first_line)
|
98
|
-
end)
|
99
|
-
end
|
100
|
-
|
101
|
-
it 'should return empty string for unknown file' do
|
102
|
-
expect(unknown_parser.content).to be_empty
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
106
|
-
# We just test a few of the metadata fields for some common file formats
|
107
|
-
# to make sure the integration with Apache Tika works. Apache Tika already
|
108
|
-
# have tests for all file formats it supports so we won't retest that
|
109
|
-
describe '#metadata' do
|
110
|
-
it 'should return nil if metadata field does not exist' do
|
111
|
-
expect(txt_parser.metadata['nonsense']).to be_nil
|
112
|
-
end
|
113
|
-
|
114
|
-
it 'should return metadata from a docx file' do
|
115
|
-
expect(docx_parser.metadata['Page-Count']).to eq('1')
|
116
|
-
end
|
117
|
-
|
118
|
-
it 'should return metadata from a pdf file' do
|
119
|
-
expect(pdf_parser.metadata['Author']).to eq('Robert Frost')
|
120
|
-
end
|
121
|
-
|
122
|
-
it 'should return metadata from a file over http', focus: true do
|
123
|
-
server_runner.call( -> do
|
124
|
-
parser = Rika::Parser.new(File.join(url, 'document.pdf'))
|
125
|
-
expect(parser.metadata['Author']).to eq('Robert Frost')
|
126
|
-
end)
|
127
|
-
end
|
128
|
-
|
129
|
-
it 'should return metadata from an image' do
|
130
|
-
expect(image_parser.metadata['Image Height']).to eq('72 pixels')
|
131
|
-
expect(image_parser.metadata['Image Width']).to eq('72 pixels')
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
describe '#available_metadata' do
|
136
|
-
it 'should return available metadata fields' do
|
137
|
-
expect(txt_parser.available_metadata).to_not be_empty
|
138
|
-
end
|
139
|
-
|
140
|
-
it 'should be an array' do
|
141
|
-
expect(txt_parser.available_metadata).to be_an(Array)
|
142
|
-
end
|
143
|
-
end
|
144
|
-
|
145
|
-
describe '#metadata_exists?' do
|
146
|
-
it 'should return false if metadata does not exist' do
|
147
|
-
expect(txt_parser.metadata_exists?('title')).to be false
|
148
|
-
end
|
149
|
-
|
150
|
-
it 'should return true if metadata exist' do
|
151
|
-
expect(docx_parser.metadata_exists?('title')).to be true
|
152
|
-
end
|
153
|
-
end
|
154
|
-
|
155
|
-
describe '#media_type' do
|
156
|
-
it 'should return application/pdf for a pdf file' do
|
157
|
-
expect(pdf_parser.media_type).to eq('application/pdf')
|
158
|
-
end
|
159
|
-
|
160
|
-
it 'should return text/plain for a txt file' do
|
161
|
-
expect(txt_parser.media_type).to eq('text/plain')
|
162
|
-
end
|
163
|
-
|
164
|
-
it 'should return application/pdf for a pdf over http' do
|
165
|
-
server_runner.call( -> do
|
166
|
-
parser = Rika::Parser.new(File.join(url, 'document.pdf'))
|
167
|
-
expect(parser.media_type).to eq('application/pdf')
|
168
|
-
end)
|
169
|
-
end
|
170
|
-
|
171
|
-
it 'should return application/octet-stream for unknown file' do
|
172
|
-
expect(unknown_parser.media_type).to eq('application/octet-stream')
|
173
|
-
end
|
174
|
-
|
175
|
-
it 'should return msword for a doc file' do
|
176
|
-
expect(doc_parser.media_type).to eq('application/msword')
|
177
|
-
end
|
178
|
-
|
179
|
-
it 'should return wordprocessingml for a docx file' do
|
180
|
-
expect(docx_parser.media_type).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
|
181
|
-
end
|
182
|
-
end
|
183
|
-
|
184
|
-
describe '#language' do
|
185
|
-
it 'should return the language of the content' do
|
186
|
-
%w(en de fr ru es).each do |lang|
|
187
|
-
txt = Rika::Parser.new(file_path("#{lang}.txt"))
|
188
|
-
expect(txt.language).to eq(lang)
|
189
|
-
end
|
190
|
-
end
|
191
|
-
end
|
192
|
-
|
193
|
-
# See note in rika.rb #language_is_reasonably_certain? regarding this method's future.
|
194
|
-
describe '#language_is_reasonably_certain?' do
|
195
|
-
it "should return false if lang can't be determined" do
|
196
|
-
lang = Rika::Parser.new(file_path("lang_cant_be_determined.txt"))
|
197
|
-
lang.language_is_reasonably_certain? == false
|
198
|
-
end
|
199
|
-
|
200
|
-
it "should return true if language can be determined" do
|
201
|
-
lang = Rika::Parser.new(file_path("en.txt"))
|
202
|
-
lang.language_is_reasonably_certain? == true
|
203
|
-
end
|
204
|
-
end
|
205
|
-
|
206
|
-
it 'should return valid content using Rika.parse_content' do
|
207
|
-
content = Rika.parse_content(sample_pdf_filespec)
|
208
|
-
expect(content).to be_a(String)
|
209
|
-
expect(content).to_not be_empty
|
210
|
-
end
|
211
|
-
|
212
|
-
it 'should return valid metadata using Rika.parse_metadata' do
|
213
|
-
metadata = Rika.parse_metadata(sample_pdf_filespec)
|
214
|
-
expect(metadata).to be_a(Hash)
|
215
|
-
expect(metadata).to_not be_empty
|
216
|
-
end
|
217
|
-
|
218
|
-
it 'should return valid content and metadata using Rika.parse_content_and_metadata' do
|
219
|
-
content, metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
220
|
-
expect(content).to be_a(String)
|
221
|
-
expect(content).to_not be_empty
|
222
|
-
expect(metadata).to be_a(Hash)
|
223
|
-
expect(metadata).to_not be_empty
|
224
|
-
end
|
225
|
-
|
226
|
-
specify 'both means of getting both content and metadata should return the same values' do
|
227
|
-
content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
228
|
-
|
229
|
-
h = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)
|
230
|
-
content_2 = h[:content]
|
231
|
-
metadata_2 = h[:metadata]
|
232
|
-
|
233
|
-
expect(content_1).to eq(content_2)
|
234
|
-
expect(metadata_1).to eq(metadata_2)
|
235
|
-
end
|
236
|
-
|
237
|
-
specify 'getting content and metadata individually and together should return the same values' do
|
238
|
-
content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec, -1)
|
239
|
-
content_2 = Rika.parse_content(sample_pdf_filespec)
|
240
|
-
metadata_2 = Rika.parse_metadata(sample_pdf_filespec, -1)
|
241
|
-
|
242
|
-
expect(content_1).to eq(content_2)
|
243
|
-
expect(metadata_1).to eq(metadata_2)
|
244
|
-
end
|
245
|
-
end
|
File without changes
|