rika 1.6.0-java → 2.0.0-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +6 -4
- data/.rubocop.yml +49 -0
- data/Gemfile +12 -0
- data/README.md +226 -47
- data/RELEASE_NOTES.md +43 -0
- data/Rakefile +4 -7
- data/bin/rika +13 -0
- data/lib/rika/cli/args_parser.rb +131 -0
- data/lib/rika/cli/rika_command.rb +129 -0
- data/lib/rika/formatters.rb +39 -0
- data/lib/rika/parse_result.rb +34 -0
- data/lib/rika/parser.rb +84 -0
- data/lib/rika/tika_loader.rb +65 -0
- data/lib/rika/version.rb +3 -1
- data/lib/rika.rb +96 -104
- data/pom.xml +2 -2
- data/rika.gemspec +30 -15
- data/rika_helper.rb +30 -0
- data/spec/fixtures/de.txt +21 -1
- data/spec/fixtures/document.doc +0 -0
- data/spec/fixtures/document.docx +0 -0
- data/spec/fixtures/document.pdf +0 -0
- data/spec/fixtures/document.txt +23 -0
- data/spec/fixtures/en.txt +23 -1
- data/spec/fixtures/es.txt +21 -1
- data/spec/fixtures/fr.txt +23 -1
- data/spec/fixtures/image_jpg_without_extension +0 -0
- data/spec/fixtures/ru.txt +21 -1
- data/spec/fixtures/tiny.txt +1 -0
- data/spec/rika/cli/args_parser_spec.rb +117 -0
- data/spec/rika/cli/rika_command_spec.rb +120 -0
- data/spec/rika/formatters_spec.rb +23 -0
- data/spec/rika/parse_result_spec.rb +42 -0
- data/spec/rika/parser_spec.rb +304 -0
- data/spec/rika/rika_spec.rb +10 -0
- data/spec/rika/tika_loader_spec.rb +57 -0
- data/spec/spec_helper.rb +13 -5
- metadata +54 -98
- data/.travis.yml +0 -7
- data/spec/fixtures/over_100k_file.txt +0 -1241
- data/spec/fixtures/text_file.txt +0 -1
- data/spec/fixtures/text_file_without_extension +0 -1
- data/spec/rika_spec.rb +0 -202
- data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
- data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
- data/target/dependency/asm-debug-all-4.1.jar +0 -0
- data/target/dependency/aspectjrt-1.8.0.jar +0 -0
- data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
- data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
- data/target/dependency/boilerpipe-1.1.0.jar +0 -0
- data/target/dependency/commons-codec-1.9.jar +0 -0
- data/target/dependency/commons-compress-1.8.1.jar +0 -0
- data/target/dependency/commons-httpclient-3.1.jar +0 -0
- data/target/dependency/commons-logging-1.1.1.jar +0 -0
- data/target/dependency/fontbox-1.8.6.jar +0 -0
- data/target/dependency/isoparser-1.0.2.jar +0 -0
- data/target/dependency/java-libpst-0.8.1.jar +0 -0
- data/target/dependency/jcip-annotations-1.0.jar +0 -0
- data/target/dependency/jdom-1.0.jar +0 -0
- data/target/dependency/jempbox-1.8.6.jar +0 -0
- data/target/dependency/jhighlight-1.0.jar +0 -0
- data/target/dependency/jmatio-1.0.jar +0 -0
- data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
- data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
- data/target/dependency/netcdf-4.2.20.jar +0 -0
- data/target/dependency/pdfbox-1.8.6.jar +0 -0
- data/target/dependency/poi-3.11-beta2.jar +0 -0
- data/target/dependency/poi-ooxml-3.11-beta2.jar +0 -0
- data/target/dependency/poi-ooxml-schemas-3.11-beta2.jar +0 -0
- data/target/dependency/poi-scratchpad-3.11-beta2.jar +0 -0
- data/target/dependency/rome-1.0.jar +0 -0
- data/target/dependency/slf4j-api-1.6.1.jar +0 -0
- data/target/dependency/tagsoup-1.2.1.jar +0 -0
- data/target/dependency/tika-core-1.6.jar +0 -0
- data/target/dependency/tika-parsers-1.6.jar +0 -0
- data/target/dependency/unidataCommon-4.2.20.jar +0 -0
- data/target/dependency/vorbis-java-core-0.6.jar +0 -0
- data/target/dependency/vorbis-java-tika-0.6.jar +0 -0
- data/target/dependency/xercesImpl-2.8.1.jar +0 -0
- data/target/dependency/xml-apis-1.3.03.jar +0 -0
- data/target/dependency/xmlbeans-2.6.0.jar +0 -0
- data/target/dependency/xmpcore-5.1.2.jar +0 -0
- data/target/dependency/xz-1.5.jar +0 -0
|
@@ -0,0 +1,304 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
require 'rika/parser'
|
|
5
|
+
require 'rika/parse_result'
|
|
6
|
+
require 'webrick'
|
|
7
|
+
|
|
8
|
+
describe Rika::Parser do
|
|
9
|
+
port = 50515
|
|
10
|
+
|
|
11
|
+
let(:text_parse_result) { Rika.parse(fixture_path('document.txt')) }
|
|
12
|
+
let(:docx_parse_result) { Rika.parse(fixture_path('document.docx')) }
|
|
13
|
+
let(:doc_parse_result) { Rika.parse(fixture_path('document.doc')) }
|
|
14
|
+
let(:pdf_parse_result) { Rika.parse(fixture_path('document.pdf')) }
|
|
15
|
+
let(:image_parse_result) { Rika.parse(fixture_path('image.jpg')) }
|
|
16
|
+
let(:unknown_parse_result) { Rika.parse(fixture_path('unknown.bin')) }
|
|
17
|
+
let(:fixtures_dir) { File.expand_path(File.join(File.dirname(__FILE__), '../fixtures')) }
|
|
18
|
+
let(:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }
|
|
19
|
+
let(:url) { "http://#{Socket.gethostname}:#{port}" }
|
|
20
|
+
let(:sample_pdf_filespec) { fixture_path('document.pdf') }
|
|
21
|
+
let(:first_line) { ->(string) { string.split("\n").first.strip } }
|
|
22
|
+
|
|
23
|
+
# Returns a lambda that, when passed an action, runs that action while a WEBrick HTTP server serving the fixtures directory is up
|
|
24
|
+
let(:server_runner) do
|
|
25
|
+
->(action) do
|
|
26
|
+
server = nil
|
|
27
|
+
server_thread = Thread.new do
|
|
28
|
+
server = WEBrick::HTTPServer.new(
|
|
29
|
+
Port: port,
|
|
30
|
+
DocumentRoot: fixtures_dir,
|
|
31
|
+
AccessLog: [],
|
|
32
|
+
Logger: WEBrick::Log.new('/dev/null')
|
|
33
|
+
)
|
|
34
|
+
server.start
|
|
35
|
+
end
|
|
36
|
+
|
|
37
|
+
# Wait for server to become ready on its new thread
|
|
38
|
+
sleep 0.01 while server.nil?
|
|
39
|
+
begin
|
|
40
|
+
action.call
|
|
41
|
+
ensure
|
|
42
|
+
server.shutdown
|
|
43
|
+
server_thread.exit
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
end
|
|
47
|
+
|
|
48
|
+
context 'when initialized with a content string and metadata' do
|
|
49
|
+
let(:content) { 'Magnifique' }
|
|
50
|
+
let(:metadata) { { 'author' => 'John Doe' } }
|
|
51
|
+
let(:result) { Rika::ParseResult.new(content: content, metadata: metadata) }
|
|
52
|
+
|
|
53
|
+
specify '#content_and_metadata_hash returns a hash with content and metadata' do
|
|
54
|
+
expect(result.content_and_metadata_hash).to eq({ content: content, metadata: metadata })
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
|
|
58
|
+
describe '#parse' do
|
|
59
|
+
let(:parser) { described_class.new('spec/fixtures/document.pdf') }
|
|
60
|
+
let(:parse_result) { parser.parse }
|
|
61
|
+
let(:metadata) { parse_result.metadata }
|
|
62
|
+
|
|
63
|
+
specify 'returns an instance of ParseResult' do
|
|
64
|
+
expect(parse_result).to be_a(Rika::ParseResult)
|
|
65
|
+
end
|
|
66
|
+
|
|
67
|
+
specify 'returns a ParseResult with the expected access methods' do
|
|
68
|
+
expect(parse_result).to respond_to(
|
|
69
|
+
:content,
|
|
70
|
+
:metadata,
|
|
71
|
+
:metadata_java,
|
|
72
|
+
:content_type,
|
|
73
|
+
:language,
|
|
74
|
+
:input_type,
|
|
75
|
+
:data_source,
|
|
76
|
+
:max_content_length
|
|
77
|
+
)
|
|
78
|
+
end
|
|
79
|
+
|
|
80
|
+
specify 'returns a ParseResult with the expected content' do
|
|
81
|
+
expect(parse_result.content).to include('Stopping by Woods on a Snowy Evening')
|
|
82
|
+
end
|
|
83
|
+
|
|
84
|
+
specify 'returns a ParseResult with the expected metadata' do
|
|
85
|
+
expect(parse_result.metadata).to include(
|
|
86
|
+
'dc:creator' => 'Robert Frost',
|
|
87
|
+
'dc:format' => 'application/pdf; version=1.3',
|
|
88
|
+
'dc:title' => 'Stopping by Woods on a Snowy Evening',
|
|
89
|
+
'rika:data-source' => 'spec/fixtures/document.pdf',
|
|
90
|
+
'rika:language' => 'en'
|
|
91
|
+
)
|
|
92
|
+
end
|
|
93
|
+
|
|
94
|
+
specify 'returns a ParseResult with the expected metadata_java' do
|
|
95
|
+
expect(parse_result.metadata_java).to be_a(Java::OrgApacheTikaMetadata::Metadata)
|
|
96
|
+
end
|
|
97
|
+
|
|
98
|
+
specify 'returns a ParseResult with the expected content_type' do
|
|
99
|
+
expect(parse_result.content_type).to eq('application/pdf')
|
|
100
|
+
end
|
|
101
|
+
|
|
102
|
+
specify 'returns a ParseResult with the expected language' do
|
|
103
|
+
expect(parse_result.language).to eq('en')
|
|
104
|
+
end
|
|
105
|
+
|
|
106
|
+
specify 'returns a ParseResult with the expected input_type' do
|
|
107
|
+
expect(parse_result.input_type).to eq(:file)
|
|
108
|
+
end
|
|
109
|
+
|
|
110
|
+
specify 'returns a ParseResult with the expected data_source' do
|
|
111
|
+
expect(parse_result.data_source).to eq('spec/fixtures/document.pdf')
|
|
112
|
+
end
|
|
113
|
+
|
|
114
|
+
describe 'metadata key sorting' do
|
|
115
|
+
RSpec.shared_examples('metadata key sorting') do |caption, key_sort|
|
|
116
|
+
specify "Metadata keys are #{caption} case insensitively when key_sort is #{key_sort}" do
|
|
117
|
+
parser = described_class.new('spec/fixtures/document.pdf', key_sort: key_sort)
|
|
118
|
+
keys = parser.parse.metadata.keys
|
|
119
|
+
expect(keys == keys.sort_by(&:downcase)).to eq(key_sort)
|
|
120
|
+
expect(keys).not_to eq(keys.map(&:downcase)) # Above test only valid if both upper and lower case occur.
|
|
121
|
+
end
|
|
122
|
+
end
|
|
123
|
+
|
|
124
|
+
include_examples 'metadata key sorting', 'sorted', true
|
|
125
|
+
include_examples 'metadata key sorting', 'not sorted', false
|
|
126
|
+
end
|
|
127
|
+
|
|
128
|
+
specify 'returns a ParseResult with the expected max_content_length' do
|
|
129
|
+
expect(parse_result.max_content_length).to eq(-1)
|
|
130
|
+
end
|
|
131
|
+
end
|
|
132
|
+
|
|
133
|
+
it 'raises an error if the file does not exist' do
|
|
134
|
+
expect { Rika.parse(fixture_path('nonexistent_file.txt')) }.to raise_error(IOError)
|
|
135
|
+
end
|
|
136
|
+
|
|
137
|
+
it 'raises an error if the URL does not exist' do
|
|
138
|
+
unavailable_server = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
|
|
139
|
+
unavailable_file_on_web = File.join(unavailable_server, 'x.pdf')
|
|
140
|
+
expect { Rika.parse(unavailable_file_on_web) }.to raise_error(Java::JavaNet::UnknownHostException)
|
|
141
|
+
end
|
|
142
|
+
|
|
143
|
+
it 'detects a file type without a file extension' do
|
|
144
|
+
parse_result = Rika.parse(fixture_path('image_jpg_without_extension'))
|
|
145
|
+
expect(parse_result.metadata['Content-Type']).to eq('image/jpeg')
|
|
146
|
+
end
|
|
147
|
+
|
|
148
|
+
describe '#content' do
|
|
149
|
+
it 'returns the content in a text file' do
|
|
150
|
+
expect(first_line.(text_parse_result.content)).to eq(quote_first_line)
|
|
151
|
+
end
|
|
152
|
+
|
|
153
|
+
it 'returns the content in a docx file' do
|
|
154
|
+
expect(first_line.(docx_parse_result.content)).to eq(quote_first_line)
|
|
155
|
+
end
|
|
156
|
+
|
|
157
|
+
it 'returns the content in a pdf file' do
|
|
158
|
+
# For some reason, the generated PDF file has a newline at the beginning
|
|
159
|
+
# and trailing spaces on the lines, so we use the second line, and
|
|
160
|
+
# use `include` to do the text match.
|
|
161
|
+
expect(pdf_parse_result.content.lines[1]).to include(quote_first_line)
|
|
162
|
+
end
|
|
163
|
+
|
|
164
|
+
it 'returns no content for an image' do
|
|
165
|
+
expect(image_parse_result.content).to be_empty
|
|
166
|
+
end
|
|
167
|
+
|
|
168
|
+
it 'only returns max content length from a text file' do
|
|
169
|
+
expect(Rika.parse(fixture_path('document.txt'), max_content_length: 8).content).to eq('Stopping')
|
|
170
|
+
end
|
|
171
|
+
|
|
172
|
+
it 'only returns max content length from a PDF' do
|
|
173
|
+
expect(Rika.parse(fixture_path('document.pdf'), max_content_length: 9).content).to eq("\nStopping")
|
|
174
|
+
end
|
|
175
|
+
|
|
176
|
+
it 'only returns max content length for file over http' do
|
|
177
|
+
server_runner.call(-> do
|
|
178
|
+
content = Rika.parse(File.join(url, 'document.txt'), max_content_length: 8).content
|
|
179
|
+
expect(content).to eq('Stopping')
|
|
180
|
+
end)
|
|
181
|
+
end
|
|
182
|
+
|
|
183
|
+
it 'returns the content from a file over http' do
|
|
184
|
+
content = server_runner.call(-> do
|
|
185
|
+
Rika.parse(File.join(url, 'document.txt')).content
|
|
186
|
+
end)
|
|
187
|
+
expect(first_line.(content)).to eq(quote_first_line)
|
|
188
|
+
end
|
|
189
|
+
|
|
190
|
+
it 'return empty string for unknown file' do
|
|
191
|
+
expect(unknown_parse_result.content).to be_empty
|
|
192
|
+
end
|
|
193
|
+
end
|
|
194
|
+
|
|
195
|
+
# We just test a few of the metadata fields for some common file formats
|
|
196
|
+
# to make sure the integration with Apache Tika works. Apache Tika already
|
|
197
|
+
# has tests for all file formats it supports, so we won't retest that.
|
|
198
|
+
describe '#metadata' do
|
|
199
|
+
it 'returns nil if metadata field does not exist' do
|
|
200
|
+
expect(text_parse_result.metadata['nonsense']).to be_nil
|
|
201
|
+
end
|
|
202
|
+
|
|
203
|
+
it 'returns metadata from a docx file' do
|
|
204
|
+
expect(docx_parse_result.metadata['meta:page-count']).to eq('1')
|
|
205
|
+
end
|
|
206
|
+
|
|
207
|
+
it 'returns metadata from a pdf file' do
|
|
208
|
+
expect(pdf_parse_result.metadata['pdf:docinfo:creator']).to eq('Robert Frost')
|
|
209
|
+
end
|
|
210
|
+
|
|
211
|
+
it 'returns metadata from a file over http' do
|
|
212
|
+
server_runner.call(-> do
|
|
213
|
+
parser = Rika.parse(File.join(url, 'document.pdf'))
|
|
214
|
+
expect(parser.metadata['pdf:docinfo:creator']).to eq('Robert Frost')
|
|
215
|
+
end)
|
|
216
|
+
end
|
|
217
|
+
|
|
218
|
+
it 'returns metadata from an image' do
|
|
219
|
+
expect(image_parse_result.metadata['Image Height']).to eq('72 pixels')
|
|
220
|
+
expect(image_parse_result.metadata['Image Width']).to eq('72 pixels')
|
|
221
|
+
end
|
|
222
|
+
end
|
|
223
|
+
|
|
224
|
+
describe '#content_type' do
|
|
225
|
+
it 'returns application/pdf for a pdf file' do
|
|
226
|
+
expect(pdf_parse_result.content_type).to eq('application/pdf')
|
|
227
|
+
end
|
|
228
|
+
|
|
229
|
+
it 'returns text/plain for a txt file' do
|
|
230
|
+
expect(text_parse_result.content_type).to eq('text/plain; charset=UTF-8')
|
|
231
|
+
end
|
|
232
|
+
|
|
233
|
+
it 'returns application/pdf for a pdf over http' do
|
|
234
|
+
server_runner.call(-> do
|
|
235
|
+
parse_result = Rika.parse(File.join(url, 'document.pdf'))
|
|
236
|
+
expect(parse_result.content_type).to eq('application/pdf')
|
|
237
|
+
end)
|
|
238
|
+
end
|
|
239
|
+
|
|
240
|
+
it 'returns application/octet-stream for unknown file' do
|
|
241
|
+
expect(unknown_parse_result.content_type).to eq('application/octet-stream')
|
|
242
|
+
end
|
|
243
|
+
|
|
244
|
+
it 'returns msword for a doc file' do
|
|
245
|
+
# There seem to be two permissible content types for a doc file.
|
|
246
|
+
expect(%w{application/msword application/x-tika-msoffice}.include?(doc_parse_result.content_type)).to be true
|
|
247
|
+
end
|
|
248
|
+
|
|
249
|
+
it 'returns wordprocessingml for a docx file' do
|
|
250
|
+
expect(docx_parse_result.content_type).to eq(
|
|
251
|
+
'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
|
252
|
+
)
|
|
253
|
+
end
|
|
254
|
+
end
|
|
255
|
+
|
|
256
|
+
describe '#language' do
|
|
257
|
+
it 'returns the language of the content' do
|
|
258
|
+
%w(en de fr ru es).each do |lang|
|
|
259
|
+
parse_result = Rika.parse(fixture_path("#{lang}.txt"))
|
|
260
|
+
expect(parse_result.language).to eq(lang)
|
|
261
|
+
end
|
|
262
|
+
end
|
|
263
|
+
end
|
|
264
|
+
|
|
265
|
+
it 'returns valid content using Rika.parse_content' do
|
|
266
|
+
content = Rika.parse_content(sample_pdf_filespec)
|
|
267
|
+
expect(content).to be_a(String)
|
|
268
|
+
expect(content).not_to be_empty
|
|
269
|
+
end
|
|
270
|
+
|
|
271
|
+
it 'returns valid metadata using Rika.parse_metadata' do
|
|
272
|
+
metadata = Rika.parse_metadata(sample_pdf_filespec)
|
|
273
|
+
expect(metadata).to be_a(Hash)
|
|
274
|
+
expect(metadata).not_to be_empty
|
|
275
|
+
end
|
|
276
|
+
|
|
277
|
+
it 'returns valid content and metadata using Rika.parse_content_and_metadata' do
|
|
278
|
+
content, metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
|
279
|
+
expect(content).to be_a(String)
|
|
280
|
+
expect(content).not_to be_empty
|
|
281
|
+
expect(metadata).to be_a(Hash)
|
|
282
|
+
expect(metadata).not_to be_empty
|
|
283
|
+
end
|
|
284
|
+
|
|
285
|
+
specify 'both means of getting both content and metadata return the same values' do
|
|
286
|
+
content1, metadata1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
|
287
|
+
|
|
288
|
+
h = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)
|
|
289
|
+
content2 = h[:content]
|
|
290
|
+
metadata2 = h[:metadata]
|
|
291
|
+
|
|
292
|
+
expect(content1).to eq(content2)
|
|
293
|
+
expect(metadata1).to eq(metadata2)
|
|
294
|
+
end
|
|
295
|
+
|
|
296
|
+
specify 'getting content and metadata individually and together return the same values' do
|
|
297
|
+
content1, metadata1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
|
|
298
|
+
content2 = Rika.parse_content(sample_pdf_filespec)
|
|
299
|
+
metadata2 = Rika.parse_metadata(sample_pdf_filespec)
|
|
300
|
+
|
|
301
|
+
expect(content1).to eq(content2)
|
|
302
|
+
expect(metadata1).to eq(metadata2)
|
|
303
|
+
end
|
|
304
|
+
end
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'spec_helper'
|
|
4
|
+
require 'rika/tika_loader'
|
|
5
|
+
|
|
6
|
+
describe Rika::TikaLoader do
|
|
7
|
+
describe '.require_tika' do
|
|
8
|
+
it 'returns the correct Tika jar file path' do
|
|
9
|
+
expect(described_class.require_tika).to match(/tika-app-.*\.jar/)
|
|
10
|
+
end
|
|
11
|
+
|
|
12
|
+
it 'calls print_message_and_exit if the Tika jar file cannot be loaded' do
|
|
13
|
+
allow(ENV).to receive(:[]).with('TIKA_JAR_FILESPEC').and_return('nonexistent_file')
|
|
14
|
+
expect { described_class.require_tika }.to raise_error(Rika::TikaLoadError) \
|
|
15
|
+
.with_message(/Unable to load Tika jar file from 'nonexistent_file'./)
|
|
16
|
+
end
|
|
17
|
+
end
|
|
18
|
+
|
|
19
|
+
describe '.specified_tika_filespec' do
|
|
20
|
+
it 'returns the correct Tika jar file path' do
|
|
21
|
+
expect(described_class.send(:specified_tika_filespec)).to match(/tika-app-.*\.jar/)
|
|
22
|
+
end
|
|
23
|
+
|
|
24
|
+
it 'raises a TikaLoadError if the Tika jar filespec is not specified at all in TIKA_JAR_FILESPEC' do
|
|
25
|
+
allow(ENV).to receive(:[]).with('TIKA_JAR_FILESPEC').and_return(nil)
|
|
26
|
+
expect { described_class.send(:specified_tika_filespec) }.to raise_error(Rika::TikaLoadError) \
|
|
27
|
+
.with_message(/Environment variable TIKA_JAR_FILESPEC is not set./)
|
|
28
|
+
end
|
|
29
|
+
end
|
|
30
|
+
|
|
31
|
+
describe '.print_message_and_exit' do
|
|
32
|
+
it 'prints the correct message and exits with an exit code of 1' do
|
|
33
|
+
stderr_orig = $stderr
|
|
34
|
+
$stderr = StringIO.new
|
|
35
|
+
|
|
36
|
+
begin
|
|
37
|
+
expect { described_class.send(:print_message_and_exit, 'message') }.to raise_error(SystemExit) do |error|
|
|
38
|
+
expect(error.status).to eq(1)
|
|
39
|
+
end
|
|
40
|
+
expect($stderr.string).to match(/message/)
|
|
41
|
+
ensure
|
|
42
|
+
$stderr = stderr_orig
|
|
43
|
+
end
|
|
44
|
+
end
|
|
45
|
+
end
|
|
46
|
+
|
|
47
|
+
describe '.formatted_error_message' do
|
|
48
|
+
it 'returns the correct message' do
|
|
49
|
+
message = 'This is a test message.'
|
|
50
|
+
expect(described_class.send(:formatted_error_message, message)).to match(/#{message}/)
|
|
51
|
+
end
|
|
52
|
+
|
|
53
|
+
it 'returns the correct banner' do
|
|
54
|
+
expect(described_class.send(:formatted_error_message, 'message').lines.grep(/!{79}/).size).to be >= 2
|
|
55
|
+
end
|
|
56
|
+
end
|
|
57
|
+
end
|
data/spec/spec_helper.rb
CHANGED
|
@@ -1,14 +1,22 @@
|
|
|
1
|
-
|
|
1
|
+
# frozen_string_literal: true
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
require 'simplecov'
|
|
4
|
+
SimpleCov.start { add_filter '/spec/' }
|
|
5
|
+
|
|
6
|
+
require 'rika'
|
|
7
|
+
|
|
8
|
+
def fixture_path(*paths)
|
|
4
9
|
File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
|
|
5
10
|
end
|
|
6
11
|
|
|
7
12
|
# See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
|
|
8
13
|
RSpec.configure do |config|
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
config.filter_run :focus
|
|
14
|
+
# Enable the line below if you want ", focus: true" after a test declaration to
|
|
15
|
+
# denote the only tests that will be run:
|
|
16
|
+
# config.filter_run :focus
|
|
12
17
|
|
|
13
18
|
config.order = 'random'
|
|
19
|
+
config.example_status_persistence_file_path = 'spec/rspec-failed-tests-control-file.txt'
|
|
14
20
|
end
|
|
21
|
+
|
|
22
|
+
Rika.init
|
metadata
CHANGED
|
@@ -1,155 +1,111 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: rika
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version:
|
|
4
|
+
version: 2.0.0
|
|
5
5
|
platform: java
|
|
6
6
|
authors:
|
|
7
7
|
- Richard Nyström
|
|
8
|
+
- Keith Bennett
|
|
8
9
|
autorequire:
|
|
9
10
|
bindir: bin
|
|
10
11
|
cert_chain: []
|
|
11
|
-
date:
|
|
12
|
+
date: 2023-09-08 00:00:00.000000000 Z
|
|
12
13
|
dependencies:
|
|
13
14
|
- !ruby/object:Gem::Dependency
|
|
14
15
|
requirement: !ruby/object:Gem::Requirement
|
|
15
16
|
requirements:
|
|
16
|
-
- -
|
|
17
|
+
- - ">="
|
|
17
18
|
- !ruby/object:Gem::Version
|
|
18
|
-
version:
|
|
19
|
-
name:
|
|
19
|
+
version: '0'
|
|
20
|
+
name: awesome_print
|
|
20
21
|
prerelease: false
|
|
21
|
-
type: :
|
|
22
|
+
type: :runtime
|
|
22
23
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
24
|
requirements:
|
|
24
|
-
- -
|
|
25
|
+
- - ">="
|
|
25
26
|
- !ruby/object:Gem::Version
|
|
26
|
-
version:
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
requirements:
|
|
30
|
-
- - '='
|
|
31
|
-
- !ruby/object:Gem::Version
|
|
32
|
-
version: 10.3.1
|
|
33
|
-
name: rake
|
|
34
|
-
prerelease: false
|
|
35
|
-
type: :development
|
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
-
requirements:
|
|
38
|
-
- - '='
|
|
39
|
-
- !ruby/object:Gem::Version
|
|
40
|
-
version: 10.3.1
|
|
41
|
-
description: ' A JRuby wrapper for Apache Tika to extract text and metadata from various
|
|
42
|
-
file formats. '
|
|
27
|
+
version: '0'
|
|
28
|
+
description: A JRuby wrapper for Apache Tika to extract text and metadata from files
|
|
29
|
+
of various formats.
|
|
43
30
|
email:
|
|
44
31
|
- ricny046@gmail.com
|
|
45
|
-
|
|
32
|
+
- keithrbennett@gmail.com
|
|
33
|
+
executables:
|
|
34
|
+
- rika
|
|
46
35
|
extensions: []
|
|
47
36
|
extra_rdoc_files: []
|
|
48
37
|
files:
|
|
49
|
-
- .gitignore
|
|
50
|
-
- .rspec
|
|
51
|
-
- .
|
|
38
|
+
- ".gitignore"
|
|
39
|
+
- ".rspec"
|
|
40
|
+
- ".rubocop.yml"
|
|
52
41
|
- Gemfile
|
|
53
42
|
- LICENSE.txt
|
|
54
43
|
- README.md
|
|
44
|
+
- RELEASE_NOTES.md
|
|
55
45
|
- Rakefile
|
|
46
|
+
- bin/rika
|
|
56
47
|
- lib/rika.rb
|
|
48
|
+
- lib/rika/cli/args_parser.rb
|
|
49
|
+
- lib/rika/cli/rika_command.rb
|
|
50
|
+
- lib/rika/formatters.rb
|
|
51
|
+
- lib/rika/parse_result.rb
|
|
52
|
+
- lib/rika/parser.rb
|
|
53
|
+
- lib/rika/tika_loader.rb
|
|
57
54
|
- lib/rika/version.rb
|
|
58
55
|
- pom.xml
|
|
59
56
|
- rika.gemspec
|
|
57
|
+
- rika_helper.rb
|
|
60
58
|
- spec/fixtures/de.txt
|
|
61
59
|
- spec/fixtures/document.doc
|
|
62
60
|
- spec/fixtures/document.docx
|
|
63
61
|
- spec/fixtures/document.pdf
|
|
62
|
+
- spec/fixtures/document.txt
|
|
64
63
|
- spec/fixtures/en.txt
|
|
65
64
|
- spec/fixtures/es.txt
|
|
66
65
|
- spec/fixtures/fr.txt
|
|
67
66
|
- spec/fixtures/image.jpg
|
|
67
|
+
- spec/fixtures/image_jpg_without_extension
|
|
68
68
|
- spec/fixtures/lang_cant_be_determined.txt
|
|
69
|
-
- spec/fixtures/over_100k_file.txt
|
|
70
69
|
- spec/fixtures/ru.txt
|
|
71
|
-
- spec/fixtures/
|
|
72
|
-
- spec/fixtures/text_file_without_extension
|
|
70
|
+
- spec/fixtures/tiny.txt
|
|
73
71
|
- spec/fixtures/unknown.bin
|
|
74
|
-
- spec/
|
|
72
|
+
- spec/rika/cli/args_parser_spec.rb
|
|
73
|
+
- spec/rika/cli/rika_command_spec.rb
|
|
74
|
+
- spec/rika/formatters_spec.rb
|
|
75
|
+
- spec/rika/parse_result_spec.rb
|
|
76
|
+
- spec/rika/parser_spec.rb
|
|
77
|
+
- spec/rika/rika_spec.rb
|
|
78
|
+
- spec/rika/tika_loader_spec.rb
|
|
75
79
|
- spec/spec_helper.rb
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
- target/dependency/isoparser-1.0.2.jar
|
|
89
|
-
- target/dependency/java-libpst-0.8.1.jar
|
|
90
|
-
- target/dependency/jcip-annotations-1.0.jar
|
|
91
|
-
- target/dependency/jdom-1.0.jar
|
|
92
|
-
- target/dependency/jempbox-1.8.6.jar
|
|
93
|
-
- target/dependency/jhighlight-1.0.jar
|
|
94
|
-
- target/dependency/jmatio-1.0.jar
|
|
95
|
-
- target/dependency/juniversalchardet-1.0.3.jar
|
|
96
|
-
- target/dependency/metadata-extractor-2.6.2.jar
|
|
97
|
-
- target/dependency/netcdf-4.2.20.jar
|
|
98
|
-
- target/dependency/pdfbox-1.8.6.jar
|
|
99
|
-
- target/dependency/poi-3.11-beta2.jar
|
|
100
|
-
- target/dependency/poi-ooxml-3.11-beta2.jar
|
|
101
|
-
- target/dependency/poi-ooxml-schemas-3.11-beta2.jar
|
|
102
|
-
- target/dependency/poi-scratchpad-3.11-beta2.jar
|
|
103
|
-
- target/dependency/rome-1.0.jar
|
|
104
|
-
- target/dependency/slf4j-api-1.6.1.jar
|
|
105
|
-
- target/dependency/tagsoup-1.2.1.jar
|
|
106
|
-
- target/dependency/tika-core-1.6.jar
|
|
107
|
-
- target/dependency/tika-parsers-1.6.jar
|
|
108
|
-
- target/dependency/unidataCommon-4.2.20.jar
|
|
109
|
-
- target/dependency/vorbis-java-core-0.6.jar
|
|
110
|
-
- target/dependency/vorbis-java-tika-0.6.jar
|
|
111
|
-
- target/dependency/xercesImpl-2.8.1.jar
|
|
112
|
-
- target/dependency/xml-apis-1.3.03.jar
|
|
113
|
-
- target/dependency/xmlbeans-2.6.0.jar
|
|
114
|
-
- target/dependency/xmpcore-5.1.2.jar
|
|
115
|
-
- target/dependency/xz-1.5.jar
|
|
116
|
-
homepage: https://github.com/ricn/rika
|
|
117
|
-
licenses: []
|
|
118
|
-
metadata: {}
|
|
119
|
-
post_install_message:
|
|
80
|
+
homepage: https://github.com/keithrbennett/rika
|
|
81
|
+
licenses:
|
|
82
|
+
- Apache-2.0
|
|
83
|
+
metadata:
|
|
84
|
+
rubygems_mfa_required: 'true'
|
|
85
|
+
post_install_message: |2+
|
|
86
|
+
|
|
87
|
+
Using the rika gem requires that you:
|
|
88
|
+
1) download the Apache Tika tika-app jar file from https://tika.apache.org/download.html
|
|
89
|
+
2) place it somewhere accessible to the running application
|
|
90
|
+
3) specify its location in the TIKA_JAR_FILESPEC environment variable
|
|
91
|
+
|
|
120
92
|
rdoc_options: []
|
|
121
93
|
require_paths:
|
|
122
94
|
- lib
|
|
123
95
|
required_ruby_version: !ruby/object:Gem::Requirement
|
|
124
96
|
requirements:
|
|
125
|
-
- -
|
|
97
|
+
- - ">="
|
|
126
98
|
- !ruby/object:Gem::Version
|
|
127
99
|
version: '0'
|
|
128
100
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
129
101
|
requirements:
|
|
130
|
-
- -
|
|
102
|
+
- - ">="
|
|
131
103
|
- !ruby/object:Gem::Version
|
|
132
104
|
version: '0'
|
|
133
105
|
requirements: []
|
|
134
|
-
|
|
135
|
-
rubygems_version: 2.1.9
|
|
106
|
+
rubygems_version: 3.3.26
|
|
136
107
|
signing_key:
|
|
137
108
|
specification_version: 4
|
|
138
|
-
summary: A JRuby wrapper for Apache Tika to extract text and metadata from
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
- spec/fixtures/document.doc
|
|
142
|
-
- spec/fixtures/document.docx
|
|
143
|
-
- spec/fixtures/document.pdf
|
|
144
|
-
- spec/fixtures/en.txt
|
|
145
|
-
- spec/fixtures/es.txt
|
|
146
|
-
- spec/fixtures/fr.txt
|
|
147
|
-
- spec/fixtures/image.jpg
|
|
148
|
-
- spec/fixtures/lang_cant_be_determined.txt
|
|
149
|
-
- spec/fixtures/over_100k_file.txt
|
|
150
|
-
- spec/fixtures/ru.txt
|
|
151
|
-
- spec/fixtures/text_file.txt
|
|
152
|
-
- spec/fixtures/text_file_without_extension
|
|
153
|
-
- spec/fixtures/unknown.bin
|
|
154
|
-
- spec/rika_spec.rb
|
|
155
|
-
- spec/spec_helper.rb
|
|
109
|
+
summary: A JRuby wrapper for Apache Tika to extract text and metadata from files of
|
|
110
|
+
various formats.
|
|
111
|
+
test_files: []
|