rika 1.11.1-java → 2.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +5 -4
- data/.rubocop.yml +49 -0
- data/Gemfile +12 -0
- data/README.md +213 -76
- data/RELEASE_NOTES.md +26 -0
- data/Rakefile +4 -7
- data/bin/rika +13 -0
- data/lib/rika/cli/args_parser.rb +131 -0
- data/lib/rika/cli/rika_command.rb +129 -0
- data/lib/rika/formatters.rb +39 -0
- data/lib/rika/parse_result.rb +34 -0
- data/lib/rika/parser.rb +64 -70
- data/lib/rika/tika_loader.rb +65 -0
- data/lib/rika/version.rb +3 -1
- data/lib/rika.rb +98 -27
- data/rika.gemspec +30 -17
- data/rika_helper.rb +14 -22
- data/spec/fixtures/image_jpg_without_extension +0 -0
- data/spec/fixtures/tiny.txt +1 -0
- data/spec/rika/cli/args_parser_spec.rb +117 -0
- data/spec/rika/cli/rika_command_spec.rb +120 -0
- data/spec/rika/formatters_spec.rb +23 -0
- data/spec/rika/parse_result_spec.rb +42 -0
- data/spec/rika/parser_spec.rb +304 -0
- data/spec/rika/rika_spec.rb +10 -0
- data/spec/rika/tika_loader_spec.rb +57 -0
- data/spec/spec_helper.rb +10 -3
- metadata +40 -49
- data/.travis.yml +0 -7
- data/java-lib/tika-app-1.24.1.jar +0 -0
- data/spec/fixtures/text_file_without_extension +0 -23
- data/spec/rika_spec.rb +0 -245
- /data/spec/fixtures/{text_file.txt → document.txt} +0 -0
@@ -0,0 +1,57 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
require 'rika/tika_loader'
|
5
|
+
|
6
|
+
describe Rika::TikaLoader do
|
7
|
+
describe '.require_tika' do
|
8
|
+
it 'returns the correct Tika jar file path' do
|
9
|
+
expect(described_class.require_tika).to match(/tika-app-.*\.jar/)
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'calls print_message_and_exit if the Tika jar file cannot be loaded' do
|
13
|
+
allow(ENV).to receive(:[]).with('TIKA_JAR_FILESPEC').and_return('nonexistent_file')
|
14
|
+
expect { described_class.require_tika }.to raise_error(Rika::TikaLoadError) \
|
15
|
+
.with_message(/Unable to load Tika jar file from 'nonexistent_file'./)
|
16
|
+
end
|
17
|
+
end
|
18
|
+
|
19
|
+
describe '.specified_tika_filespec' do
|
20
|
+
it 'returns the correct Tika jar file path' do
|
21
|
+
expect(described_class.send(:specified_tika_filespec)).to match(/tika-app-.*\.jar/)
|
22
|
+
end
|
23
|
+
|
24
|
+
it 'raises a TikaLoadError if the Tika jar filespec is not specified at all in TIKA_JAR_FILESPEC' do
|
25
|
+
allow(ENV).to receive(:[]).with('TIKA_JAR_FILESPEC').and_return(nil)
|
26
|
+
expect { described_class.send(:specified_tika_filespec) }.to raise_error(Rika::TikaLoadError) \
|
27
|
+
.with_message(/Environment variable TIKA_JAR_FILESPEC is not set./)
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
describe '.print_message_and_exit' do
|
32
|
+
it 'prints the correct message and exits with an exit code of 1' do
|
33
|
+
stderr_orig = $stderr
|
34
|
+
$stderr = StringIO.new
|
35
|
+
|
36
|
+
begin
|
37
|
+
expect { described_class.send(:print_message_and_exit, 'message') }.to raise_error(SystemExit) do |error|
|
38
|
+
expect(error.status).to eq(1)
|
39
|
+
end
|
40
|
+
expect($stderr.string).to match(/message/)
|
41
|
+
ensure
|
42
|
+
$stderr = stderr_orig
|
43
|
+
end
|
44
|
+
end
|
45
|
+
end
|
46
|
+
|
47
|
+
describe '.formatted_error_message' do
|
48
|
+
it 'returns the correct message' do
|
49
|
+
message = 'This is a test message.'
|
50
|
+
expect(described_class.send(:formatted_error_message, message)).to match(/#{message}/)
|
51
|
+
end
|
52
|
+
|
53
|
+
it 'returns the correct banner' do
|
54
|
+
expect(described_class.send(:formatted_error_message, 'message').lines.grep(/!{79}/).size).to be >= 2
|
55
|
+
end
|
56
|
+
end
|
57
|
+
end
|
data/spec/spec_helper.rb
CHANGED
@@ -1,15 +1,22 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
2
|
|
3
|
-
|
3
|
+
require 'simplecov'
|
4
|
+
SimpleCov.start { add_filter '/spec/' }
|
5
|
+
|
6
|
+
require 'rika'
|
7
|
+
|
8
|
+
def fixture_path(*paths)
|
4
9
|
File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
|
5
10
|
end
|
6
11
|
|
7
12
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
8
13
|
RSpec.configure do |config|
|
9
|
-
|
10
14
|
# Enable the line below if you want ", focus: true" after a test declaration to
|
11
15
|
# denote the only tests that will be run:
|
12
16
|
# config.filter_run :focus
|
13
17
|
|
14
18
|
config.order = 'random'
|
19
|
+
config.example_status_persistence_file_path = 'spec/rspec-failed-tests-control-file.txt'
|
15
20
|
end
|
21
|
+
|
22
|
+
Rika.init
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version:
|
4
|
+
version: 2.0.0
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Richard Nyström
|
@@ -9,56 +9,48 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2023-09-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- - "
|
17
|
+
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: '
|
20
|
-
name:
|
21
|
-
type: :development
|
19
|
+
version: '0'
|
20
|
+
name: awesome_print
|
22
21
|
prerelease: false
|
22
|
+
type: :runtime
|
23
23
|
version_requirements: !ruby/object:Gem::Requirement
|
24
24
|
requirements:
|
25
|
-
- - "
|
25
|
+
- - ">="
|
26
26
|
- !ruby/object:Gem::Version
|
27
|
-
version: '
|
28
|
-
|
29
|
-
|
30
|
-
requirements:
|
31
|
-
- - "~>"
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '13.0'
|
34
|
-
name: rake
|
35
|
-
type: :development
|
36
|
-
prerelease: false
|
37
|
-
version_requirements: !ruby/object:Gem::Requirement
|
38
|
-
requirements:
|
39
|
-
- - "~>"
|
40
|
-
- !ruby/object:Gem::Version
|
41
|
-
version: '13.0'
|
42
|
-
description: " A JRuby wrapper for Apache Tika to extract text and metadata from files\
|
43
|
-
\ of various formats. "
|
27
|
+
version: '0'
|
28
|
+
description: A JRuby wrapper for Apache Tika to extract text and metadata from files
|
29
|
+
of various formats.
|
44
30
|
email:
|
45
31
|
- ricny046@gmail.com
|
46
32
|
- keithrbennett@gmail.com
|
47
|
-
executables:
|
33
|
+
executables:
|
34
|
+
- rika
|
48
35
|
extensions: []
|
49
36
|
extra_rdoc_files: []
|
50
37
|
files:
|
51
38
|
- ".gitignore"
|
52
39
|
- ".rspec"
|
53
|
-
- ".
|
40
|
+
- ".rubocop.yml"
|
54
41
|
- Gemfile
|
55
42
|
- LICENSE.txt
|
56
43
|
- README.md
|
57
44
|
- RELEASE_NOTES.md
|
58
45
|
- Rakefile
|
59
|
-
-
|
46
|
+
- bin/rika
|
60
47
|
- lib/rika.rb
|
48
|
+
- lib/rika/cli/args_parser.rb
|
49
|
+
- lib/rika/cli/rika_command.rb
|
50
|
+
- lib/rika/formatters.rb
|
51
|
+
- lib/rika/parse_result.rb
|
61
52
|
- lib/rika/parser.rb
|
53
|
+
- lib/rika/tika_loader.rb
|
62
54
|
- lib/rika/version.rb
|
63
55
|
- pom.xml
|
64
56
|
- rika.gemspec
|
@@ -67,22 +59,36 @@ files:
|
|
67
59
|
- spec/fixtures/document.doc
|
68
60
|
- spec/fixtures/document.docx
|
69
61
|
- spec/fixtures/document.pdf
|
62
|
+
- spec/fixtures/document.txt
|
70
63
|
- spec/fixtures/en.txt
|
71
64
|
- spec/fixtures/es.txt
|
72
65
|
- spec/fixtures/fr.txt
|
73
66
|
- spec/fixtures/image.jpg
|
67
|
+
- spec/fixtures/image_jpg_without_extension
|
74
68
|
- spec/fixtures/lang_cant_be_determined.txt
|
75
69
|
- spec/fixtures/ru.txt
|
76
|
-
- spec/fixtures/
|
77
|
-
- spec/fixtures/text_file_without_extension
|
70
|
+
- spec/fixtures/tiny.txt
|
78
71
|
- spec/fixtures/unknown.bin
|
79
|
-
- spec/
|
72
|
+
- spec/rika/cli/args_parser_spec.rb
|
73
|
+
- spec/rika/cli/rika_command_spec.rb
|
74
|
+
- spec/rika/formatters_spec.rb
|
75
|
+
- spec/rika/parse_result_spec.rb
|
76
|
+
- spec/rika/parser_spec.rb
|
77
|
+
- spec/rika/rika_spec.rb
|
78
|
+
- spec/rika/tika_loader_spec.rb
|
80
79
|
- spec/spec_helper.rb
|
81
80
|
homepage: https://github.com/keithrbennett/rika
|
82
81
|
licenses:
|
83
82
|
- Apache-2.0
|
84
|
-
metadata:
|
85
|
-
|
83
|
+
metadata:
|
84
|
+
rubygems_mfa_required: 'true'
|
85
|
+
post_install_message: |2+
|
86
|
+
|
87
|
+
Using the rika gem requires that you:
|
88
|
+
1) download the Apache Tika tika-app jar file from https://tika.apache.org/download.html
|
89
|
+
2) place it somewhere accessible to the running application
|
90
|
+
3) specify its location in the TIKA_JAR_FILESPEC environment variable
|
91
|
+
|
86
92
|
rdoc_options: []
|
87
93
|
require_paths:
|
88
94
|
- lib
|
@@ -97,24 +103,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
97
103
|
- !ruby/object:Gem::Version
|
98
104
|
version: '0'
|
99
105
|
requirements: []
|
100
|
-
rubygems_version: 3.
|
106
|
+
rubygems_version: 3.3.26
|
101
107
|
signing_key:
|
102
108
|
specification_version: 4
|
103
109
|
summary: A JRuby wrapper for Apache Tika to extract text and metadata from files of
|
104
110
|
various formats.
|
105
|
-
test_files:
|
106
|
-
- spec/fixtures/de.txt
|
107
|
-
- spec/fixtures/document.doc
|
108
|
-
- spec/fixtures/document.docx
|
109
|
-
- spec/fixtures/document.pdf
|
110
|
-
- spec/fixtures/en.txt
|
111
|
-
- spec/fixtures/es.txt
|
112
|
-
- spec/fixtures/fr.txt
|
113
|
-
- spec/fixtures/image.jpg
|
114
|
-
- spec/fixtures/lang_cant_be_determined.txt
|
115
|
-
- spec/fixtures/ru.txt
|
116
|
-
- spec/fixtures/text_file.txt
|
117
|
-
- spec/fixtures/text_file_without_extension
|
118
|
-
- spec/fixtures/unknown.bin
|
119
|
-
- spec/rika_spec.rb
|
120
|
-
- spec/spec_helper.rb
|
111
|
+
test_files: []
|
data/.travis.yml
DELETED
Binary file
|
@@ -1,23 +0,0 @@
|
|
1
|
-
Stopping by Woods on a Snowy Evening
|
2
|
-
|
3
|
-
By Robert Frost
|
4
|
-
|
5
|
-
Whose woods these are I think I know.
|
6
|
-
His house is in the village though;
|
7
|
-
He will not see me stopping here
|
8
|
-
To watch his woods fill up with snow.
|
9
|
-
|
10
|
-
My little horse must think it queer
|
11
|
-
To stop without a farmhouse near
|
12
|
-
Between the woods and frozen lake
|
13
|
-
The darkest evening of the year.
|
14
|
-
|
15
|
-
He gives his harness bells a shake
|
16
|
-
To ask if there is some mistake.
|
17
|
-
The only other sound’s the sweep
|
18
|
-
Of easy wind and downy flake.
|
19
|
-
|
20
|
-
The woods are lovely, dark and deep,
|
21
|
-
But I have promises to keep,
|
22
|
-
And miles to go before I sleep,
|
23
|
-
And miles to go before I sleep.
|
data/spec/rika_spec.rb
DELETED
@@ -1,245 +0,0 @@
|
|
1
|
-
# encoding: utf-8
|
2
|
-
|
3
|
-
require 'spec_helper'
|
4
|
-
require 'webrick'
|
5
|
-
|
6
|
-
include WEBrick
|
7
|
-
|
8
|
-
describe Rika::Parser do
|
9
|
-
|
10
|
-
let (:txt_parser) { Rika::Parser.new(file_path('text_file.txt')) }
|
11
|
-
let (:docx_parser) { Rika::Parser.new(file_path('document.docx')) }
|
12
|
-
let (:doc_parser) { Rika::Parser.new(file_path('document.doc')) }
|
13
|
-
let (:pdf_parser) { Rika::Parser.new(file_path('document.pdf')) }
|
14
|
-
let (:image_parser) { Rika::Parser.new(file_path('image.jpg')) }
|
15
|
-
let (:unknown_parser) { Rika::Parser.new(file_path('unknown.bin')) }
|
16
|
-
let (:dir) { File.expand_path(File.join(File.dirname(__FILE__), 'fixtures')) }
|
17
|
-
let (:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }
|
18
|
-
|
19
|
-
port = 50515
|
20
|
-
let (:url) { "http://#{Socket.gethostname}:#{port}" }
|
21
|
-
|
22
|
-
let (:sample_pdf_filespec) { file_path('document.pdf') }
|
23
|
-
|
24
|
-
let(:first_line) { ->(string) { string.split("\n").first.strip } }
|
25
|
-
|
26
|
-
let(:server_runner) do
|
27
|
-
# returns a lambda that, when passed an action, will wrap it in an HTTP server
|
28
|
-
->(action) do
|
29
|
-
server = nil
|
30
|
-
server_thread = Thread.new do
|
31
|
-
server = HTTPServer.new(
|
32
|
-
Port: port,
|
33
|
-
DocumentRoot: dir,
|
34
|
-
AccessLog: [],
|
35
|
-
Logger: WEBrick::Log::new('/dev/null')
|
36
|
-
)
|
37
|
-
server.start
|
38
|
-
end
|
39
|
-
|
40
|
-
# Wait for server to become ready on its new thread
|
41
|
-
sleep 0.01 while server.nil?
|
42
|
-
begin
|
43
|
-
action.call
|
44
|
-
ensure
|
45
|
-
server.shutdown
|
46
|
-
server_thread.exit
|
47
|
-
end
|
48
|
-
end
|
49
|
-
end
|
50
|
-
|
51
|
-
|
52
|
-
it 'should raise error if file does not exist' do
|
53
|
-
expect(-> { Rika::Parser.new(file_path('nonexistent_file.txt')) }).to raise_error(IOError)
|
54
|
-
end
|
55
|
-
|
56
|
-
it 'should raise error if URL does not exist' do
|
57
|
-
unavailable_server = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
|
58
|
-
unavailable_file_on_web = File.join(unavailable_server, 'x.pdf')
|
59
|
-
expect(-> { Rika::Parser.new(unavailable_file_on_web) }).to raise_error(SocketError)
|
60
|
-
end
|
61
|
-
|
62
|
-
it 'should detect file type without a file extension' do
|
63
|
-
parser = Rika::Parser.new(file_path('text_file_without_extension'))
|
64
|
-
expect(parser.metadata['Content-Type']).to eq('text/plain; charset=UTF-8')
|
65
|
-
end
|
66
|
-
|
67
|
-
describe '#content' do
|
68
|
-
it 'should return the content in a text file' do
|
69
|
-
expect(first_line.(txt_parser.content)).to eq(quote_first_line)
|
70
|
-
end
|
71
|
-
|
72
|
-
it 'should return the content in a docx file' do
|
73
|
-
expect(first_line.(docx_parser.content)).to eq(quote_first_line)
|
74
|
-
end
|
75
|
-
|
76
|
-
it 'should return the content in a pdf file' do
|
77
|
-
expect(first_line.(pdf_parser.content)).to eq(quote_first_line)
|
78
|
-
end
|
79
|
-
|
80
|
-
it 'should return no content for an image' do
|
81
|
-
expect(image_parser.metadata.keys).to_not be_empty
|
82
|
-
end
|
83
|
-
|
84
|
-
it 'should only return max content length' do
|
85
|
-
expect(Rika::Parser.new(file_path('text_file.txt'), 9).content).to eq('Stopping')
|
86
|
-
end
|
87
|
-
|
88
|
-
it 'should only return max content length for file over http', focus: true do
|
89
|
-
server_runner.call( -> do
|
90
|
-
expect(Rika::Parser.new(File.join(url, 'document.pdf'), 9).content).to eq('Stopping')
|
91
|
-
end)
|
92
|
-
end
|
93
|
-
|
94
|
-
it 'should return the content from a file over http' do
|
95
|
-
server_runner.call( -> do
|
96
|
-
content = Rika::Parser.new(File.join(url, 'document.pdf')).content
|
97
|
-
expect(first_line.(content)).to eq(quote_first_line)
|
98
|
-
end)
|
99
|
-
end
|
100
|
-
|
101
|
-
it 'should return empty string for unknown file' do
|
102
|
-
expect(unknown_parser.content).to be_empty
|
103
|
-
end
|
104
|
-
end
|
105
|
-
|
106
|
-
# We just test a few of the metadata fields for some common file formats
|
107
|
-
# to make sure the integration with Apache Tika works. Apache Tika already
|
108
|
-
# have tests for all file formats it supports so we won't retest that
|
109
|
-
describe '#metadata' do
|
110
|
-
it 'should return nil if metadata field does not exist' do
|
111
|
-
expect(txt_parser.metadata['nonsense']).to be_nil
|
112
|
-
end
|
113
|
-
|
114
|
-
it 'should return metadata from a docx file' do
|
115
|
-
expect(docx_parser.metadata['Page-Count']).to eq('1')
|
116
|
-
end
|
117
|
-
|
118
|
-
it 'should return metadata from a pdf file' do
|
119
|
-
expect(pdf_parser.metadata['Author']).to eq('Robert Frost')
|
120
|
-
end
|
121
|
-
|
122
|
-
it 'should return metadata from a file over http', focus: true do
|
123
|
-
server_runner.call( -> do
|
124
|
-
parser = Rika::Parser.new(File.join(url, 'document.pdf'))
|
125
|
-
expect(parser.metadata['Author']).to eq('Robert Frost')
|
126
|
-
end)
|
127
|
-
end
|
128
|
-
|
129
|
-
it 'should return metadata from an image' do
|
130
|
-
expect(image_parser.metadata['Image Height']).to eq('72 pixels')
|
131
|
-
expect(image_parser.metadata['Image Width']).to eq('72 pixels')
|
132
|
-
end
|
133
|
-
end
|
134
|
-
|
135
|
-
describe '#available_metadata' do
|
136
|
-
it 'should return available metadata fields' do
|
137
|
-
expect(txt_parser.available_metadata).to_not be_empty
|
138
|
-
end
|
139
|
-
|
140
|
-
it 'should be an array' do
|
141
|
-
expect(txt_parser.available_metadata).to be_an(Array)
|
142
|
-
end
|
143
|
-
end
|
144
|
-
|
145
|
-
describe '#metadata_exists?' do
|
146
|
-
it 'should return false if metadata does not exist' do
|
147
|
-
expect(txt_parser.metadata_exists?('title')).to be false
|
148
|
-
end
|
149
|
-
|
150
|
-
it 'should return true if metadata exist' do
|
151
|
-
expect(docx_parser.metadata_exists?('title')).to be true
|
152
|
-
end
|
153
|
-
end
|
154
|
-
|
155
|
-
describe '#media_type' do
|
156
|
-
it 'should return application/pdf for a pdf file' do
|
157
|
-
expect(pdf_parser.media_type).to eq('application/pdf')
|
158
|
-
end
|
159
|
-
|
160
|
-
it 'should return text/plain for a txt file' do
|
161
|
-
expect(txt_parser.media_type).to eq('text/plain')
|
162
|
-
end
|
163
|
-
|
164
|
-
it 'should return application/pdf for a pdf over http' do
|
165
|
-
server_runner.call( -> do
|
166
|
-
parser = Rika::Parser.new(File.join(url, 'document.pdf'))
|
167
|
-
expect(parser.media_type).to eq('application/pdf')
|
168
|
-
end)
|
169
|
-
end
|
170
|
-
|
171
|
-
it 'should return application/octet-stream for unknown file' do
|
172
|
-
expect(unknown_parser.media_type).to eq('application/octet-stream')
|
173
|
-
end
|
174
|
-
|
175
|
-
it 'should return msword for a doc file' do
|
176
|
-
expect(doc_parser.media_type).to eq('application/msword')
|
177
|
-
end
|
178
|
-
|
179
|
-
it 'should return wordprocessingml for a docx file' do
|
180
|
-
expect(docx_parser.media_type).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
|
181
|
-
end
|
182
|
-
end
|
183
|
-
|
184
|
-
describe '#language' do
|
185
|
-
it 'should return the language of the content' do
|
186
|
-
%w(en de fr ru es).each do |lang|
|
187
|
-
txt = Rika::Parser.new(file_path("#{lang}.txt"))
|
188
|
-
expect(txt.language).to eq(lang)
|
189
|
-
end
|
190
|
-
end
|
191
|
-
end
|
192
|
-
|
193
|
-
# See note in rika.rb #language_is_reasonably_certain? regarding this method's future.
|
194
|
-
describe '#language_is_reasonably_certain?' do
|
195
|
-
it "should return false if lang can't be determined" do
|
196
|
-
lang = Rika::Parser.new(file_path("lang_cant_be_determined.txt"))
|
197
|
-
lang.language_is_reasonably_certain? == false
|
198
|
-
end
|
199
|
-
|
200
|
-
it "should return true if language can be determined" do
|
201
|
-
lang = Rika::Parser.new(file_path("en.txt"))
|
202
|
-
lang.language_is_reasonably_certain? == true
|
203
|
-
end
|
204
|
-
end
|
205
|
-
|
206
|
-
it 'should return valid content using Rika.parse_content' do
|
207
|
-
content = Rika.parse_content(sample_pdf_filespec)
|
208
|
-
expect(content).to be_a(String)
|
209
|
-
expect(content).to_not be_empty
|
210
|
-
end
|
211
|
-
|
212
|
-
it 'should return valid metadata using Rika.parse_metadata' do
|
213
|
-
metadata = Rika.parse_metadata(sample_pdf_filespec)
|
214
|
-
expect(metadata).to be_a(Hash)
|
215
|
-
expect(metadata).to_not be_empty
|
216
|
-
end
|
217
|
-
|
218
|
-
it 'should return valid content and metadata using Rika.parse_content_and_metadata' do
|
219
|
-
content, metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
220
|
-
expect(content).to be_a(String)
|
221
|
-
expect(content).to_not be_empty
|
222
|
-
expect(metadata).to be_a(Hash)
|
223
|
-
expect(metadata).to_not be_empty
|
224
|
-
end
|
225
|
-
|
226
|
-
specify 'both means of getting both content and metadata should return the same values' do
|
227
|
-
content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
228
|
-
|
229
|
-
h = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)
|
230
|
-
content_2 = h[:content]
|
231
|
-
metadata_2 = h[:metadata]
|
232
|
-
|
233
|
-
expect(content_1).to eq(content_2)
|
234
|
-
expect(metadata_1).to eq(metadata_2)
|
235
|
-
end
|
236
|
-
|
237
|
-
specify 'getting content and metadata individually and together should return the same values' do
|
238
|
-
content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec, -1)
|
239
|
-
content_2 = Rika.parse_content(sample_pdf_filespec)
|
240
|
-
metadata_2 = Rika.parse_metadata(sample_pdf_filespec, -1)
|
241
|
-
|
242
|
-
expect(content_1).to eq(content_2)
|
243
|
-
expect(metadata_1).to eq(metadata_2)
|
244
|
-
end
|
245
|
-
end
|
File without changes
|