rika 1.11.1-java → 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'rika/tika_loader'
5
+
6
+ describe Rika::TikaLoader do
7
+ describe '.require_tika' do
8
+ it 'returns the correct Tika jar file path' do
9
+ expect(described_class.require_tika).to match(/tika-app-.*\.jar/)
10
+ end
11
+
12
+ it 'calls print_message_and_exit if the Tika jar file cannot be loaded' do
13
+ allow(ENV).to receive(:[]).with('TIKA_JAR_FILESPEC').and_return('nonexistent_file')
14
+ expect { described_class.require_tika }.to raise_error(Rika::TikaLoadError) \
15
+ .with_message(/Unable to load Tika jar file from 'nonexistent_file'./)
16
+ end
17
+ end
18
+
19
+ describe '.specified_tika_filespec' do
20
+ it 'returns the correct Tika jar file path' do
21
+ expect(described_class.send(:specified_tika_filespec)).to match(/tika-app-.*\.jar/)
22
+ end
23
+
24
+ it 'raises a TikaLoadError if the Tika jar filespec is not specified at all in TIKA_JAR_FILESPEC' do
25
+ allow(ENV).to receive(:[]).with('TIKA_JAR_FILESPEC').and_return(nil)
26
+ expect { described_class.send(:specified_tika_filespec) }.to raise_error(Rika::TikaLoadError) \
27
+ .with_message(/Environment variable TIKA_JAR_FILESPEC is not set./)
28
+ end
29
+ end
30
+
31
+ describe '.print_message_and_exit' do
32
+ it 'prints the correct message and exits with an exit code of 1' do
33
+ stderr_orig = $stderr
34
+ $stderr = StringIO.new
35
+
36
+ begin
37
+ expect { described_class.send(:print_message_and_exit, 'message') }.to raise_error(SystemExit) do |error|
38
+ expect(error.status).to eq(1)
39
+ end
40
+ expect($stderr.string).to match(/message/)
41
+ ensure
42
+ $stderr = stderr_orig
43
+ end
44
+ end
45
+ end
46
+
47
+ describe '.formatted_error_message' do
48
+ it 'returns the correct message' do
49
+ message = 'This is a test message.'
50
+ expect(described_class.send(:formatted_error_message, message)).to match(/#{message}/)
51
+ end
52
+
53
+ it 'returns the correct banner' do
54
+ expect(described_class.send(:formatted_error_message, 'message').lines.grep(/!{79}/).size).to be >= 2
55
+ end
56
+ end
57
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,15 +1,22 @@
1
- require "rika"
1
+ # frozen_string_literal: true
2
2
 
3
- def file_path( *paths )
3
+ require 'simplecov'
4
+ SimpleCov.start { add_filter '/spec/' }
5
+
6
+ require 'rika'
7
+
8
+ def fixture_path(*paths)
4
9
  File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
5
10
  end
6
11
 
7
12
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
8
13
  RSpec.configure do |config|
9
-
10
14
  # Enable the line below if you want ", focus: true" after a test declaration to
11
15
  # denote the only tests that will be run:
12
16
  # config.filter_run :focus
13
17
 
14
18
  config.order = 'random'
19
+ config.example_status_persistence_file_path = 'spec/rspec-failed-tests-control-file.txt'
15
20
  end
21
+
22
+ Rika.init
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rika
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.1
4
+ version: 2.0.0
5
5
  platform: java
6
6
  authors:
7
7
  - Richard Nyström
@@ -9,56 +9,48 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2020-07-24 00:00:00.000000000 Z
12
+ date: 2023-09-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '3.9'
20
- name: rspec
21
- type: :development
19
+ version: '0'
20
+ name: awesome_print
22
21
  prerelease: false
22
+ type: :runtime
23
23
  version_requirements: !ruby/object:Gem::Requirement
24
24
  requirements:
25
- - - "~>"
25
+ - - ">="
26
26
  - !ruby/object:Gem::Version
27
- version: '3.9'
28
- - !ruby/object:Gem::Dependency
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - "~>"
32
- - !ruby/object:Gem::Version
33
- version: '13.0'
34
- name: rake
35
- type: :development
36
- prerelease: false
37
- version_requirements: !ruby/object:Gem::Requirement
38
- requirements:
39
- - - "~>"
40
- - !ruby/object:Gem::Version
41
- version: '13.0'
42
- description: " A JRuby wrapper for Apache Tika to extract text and metadata from files\
43
- \ of various formats. "
27
+ version: '0'
28
+ description: A JRuby wrapper for Apache Tika to extract text and metadata from files
29
+ of various formats.
44
30
  email:
45
31
  - ricny046@gmail.com
46
32
  - keithrbennett@gmail.com
47
- executables: []
33
+ executables:
34
+ - rika
48
35
  extensions: []
49
36
  extra_rdoc_files: []
50
37
  files:
51
38
  - ".gitignore"
52
39
  - ".rspec"
53
- - ".travis.yml"
40
+ - ".rubocop.yml"
54
41
  - Gemfile
55
42
  - LICENSE.txt
56
43
  - README.md
57
44
  - RELEASE_NOTES.md
58
45
  - Rakefile
59
- - java-lib/tika-app-1.24.1.jar
46
+ - bin/rika
60
47
  - lib/rika.rb
48
+ - lib/rika/cli/args_parser.rb
49
+ - lib/rika/cli/rika_command.rb
50
+ - lib/rika/formatters.rb
51
+ - lib/rika/parse_result.rb
61
52
  - lib/rika/parser.rb
53
+ - lib/rika/tika_loader.rb
62
54
  - lib/rika/version.rb
63
55
  - pom.xml
64
56
  - rika.gemspec
@@ -67,22 +59,36 @@ files:
67
59
  - spec/fixtures/document.doc
68
60
  - spec/fixtures/document.docx
69
61
  - spec/fixtures/document.pdf
62
+ - spec/fixtures/document.txt
70
63
  - spec/fixtures/en.txt
71
64
  - spec/fixtures/es.txt
72
65
  - spec/fixtures/fr.txt
73
66
  - spec/fixtures/image.jpg
67
+ - spec/fixtures/image_jpg_without_extension
74
68
  - spec/fixtures/lang_cant_be_determined.txt
75
69
  - spec/fixtures/ru.txt
76
- - spec/fixtures/text_file.txt
77
- - spec/fixtures/text_file_without_extension
70
+ - spec/fixtures/tiny.txt
78
71
  - spec/fixtures/unknown.bin
79
- - spec/rika_spec.rb
72
+ - spec/rika/cli/args_parser_spec.rb
73
+ - spec/rika/cli/rika_command_spec.rb
74
+ - spec/rika/formatters_spec.rb
75
+ - spec/rika/parse_result_spec.rb
76
+ - spec/rika/parser_spec.rb
77
+ - spec/rika/rika_spec.rb
78
+ - spec/rika/tika_loader_spec.rb
80
79
  - spec/spec_helper.rb
81
80
  homepage: https://github.com/keithrbennett/rika
82
81
  licenses:
83
82
  - Apache-2.0
84
- metadata: {}
85
- post_install_message:
83
+ metadata:
84
+ rubygems_mfa_required: 'true'
85
+ post_install_message: |2+
86
+
87
+ Using the rika gem requires that you:
88
+ 1) download the Apache Tika tika-app jar file from https://tika.apache.org/download.html
89
+ 2) place it somewhere accessible to the running application
90
+ 3) specify its location in the TIKA_JAR_FILESPEC environment variable
91
+
86
92
  rdoc_options: []
87
93
  require_paths:
88
94
  - lib
@@ -97,24 +103,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
103
  - !ruby/object:Gem::Version
98
104
  version: '0'
99
105
  requirements: []
100
- rubygems_version: 3.0.6
106
+ rubygems_version: 3.3.26
101
107
  signing_key:
102
108
  specification_version: 4
103
109
  summary: A JRuby wrapper for Apache Tika to extract text and metadata from files of
104
110
  various formats.
105
- test_files:
106
- - spec/fixtures/de.txt
107
- - spec/fixtures/document.doc
108
- - spec/fixtures/document.docx
109
- - spec/fixtures/document.pdf
110
- - spec/fixtures/en.txt
111
- - spec/fixtures/es.txt
112
- - spec/fixtures/fr.txt
113
- - spec/fixtures/image.jpg
114
- - spec/fixtures/lang_cant_be_determined.txt
115
- - spec/fixtures/ru.txt
116
- - spec/fixtures/text_file.txt
117
- - spec/fixtures/text_file_without_extension
118
- - spec/fixtures/unknown.bin
119
- - spec/rika_spec.rb
120
- - spec/spec_helper.rb
111
+ test_files: []
data/.travis.yml DELETED
@@ -1,7 +0,0 @@
1
- language: ruby
2
- rvm:
3
- - jruby-9.2.12.0
4
- notifications:
5
- recipients:
6
- - ricny046@gmail.com
7
- - keithrbennett@gmail.com
Binary file
@@ -1,23 +0,0 @@
1
- Stopping by Woods on a Snowy Evening
2
-
3
- By Robert Frost
4
-
5
- Whose woods these are I think I know.
6
- His house is in the village though;
7
- He will not see me stopping here
8
- To watch his woods fill up with snow.
9
-
10
- My little horse must think it queer
11
- To stop without a farmhouse near
12
- Between the woods and frozen lake
13
- The darkest evening of the year.
14
-
15
- He gives his harness bells a shake
16
- To ask if there is some mistake.
17
- The only other sound’s the sweep
18
- Of easy wind and downy flake.
19
-
20
- The woods are lovely, dark and deep,
21
- But I have promises to keep,
22
- And miles to go before I sleep,
23
- And miles to go before I sleep.
data/spec/rika_spec.rb DELETED
@@ -1,245 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require 'spec_helper'
4
- require 'webrick'
5
-
6
- include WEBrick
7
-
8
- describe Rika::Parser do
9
-
10
- let (:txt_parser) { Rika::Parser.new(file_path('text_file.txt')) }
11
- let (:docx_parser) { Rika::Parser.new(file_path('document.docx')) }
12
- let (:doc_parser) { Rika::Parser.new(file_path('document.doc')) }
13
- let (:pdf_parser) { Rika::Parser.new(file_path('document.pdf')) }
14
- let (:image_parser) { Rika::Parser.new(file_path('image.jpg')) }
15
- let (:unknown_parser) { Rika::Parser.new(file_path('unknown.bin')) }
16
- let (:dir) { File.expand_path(File.join(File.dirname(__FILE__), 'fixtures')) }
17
- let (:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }
18
-
19
- port = 50515
20
- let (:url) { "http://#{Socket.gethostname}:#{port}" }
21
-
22
- let (:sample_pdf_filespec) { file_path('document.pdf') }
23
-
24
- let(:first_line) { ->(string) { string.split("\n").first.strip } }
25
-
26
- let(:server_runner) do
27
- # returns a lambda that, when passed an action, will wrap it in an HTTP server
28
- ->(action) do
29
- server = nil
30
- server_thread = Thread.new do
31
- server = HTTPServer.new(
32
- Port: port,
33
- DocumentRoot: dir,
34
- AccessLog: [],
35
- Logger: WEBrick::Log::new('/dev/null')
36
- )
37
- server.start
38
- end
39
-
40
- # Wait for server to become ready on its new thread
41
- sleep 0.01 while server.nil?
42
- begin
43
- action.call
44
- ensure
45
- server.shutdown
46
- server_thread.exit
47
- end
48
- end
49
- end
50
-
51
-
52
- it 'should raise error if file does not exist' do
53
- expect(-> { Rika::Parser.new(file_path('nonexistent_file.txt')) }).to raise_error(IOError)
54
- end
55
-
56
- it 'should raise error if URL does not exist' do
57
- unavailable_server = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
58
- unavailable_file_on_web = File.join(unavailable_server, 'x.pdf')
59
- expect(-> { Rika::Parser.new(unavailable_file_on_web) }).to raise_error(SocketError)
60
- end
61
-
62
- it 'should detect file type without a file extension' do
63
- parser = Rika::Parser.new(file_path('text_file_without_extension'))
64
- expect(parser.metadata['Content-Type']).to eq('text/plain; charset=UTF-8')
65
- end
66
-
67
- describe '#content' do
68
- it 'should return the content in a text file' do
69
- expect(first_line.(txt_parser.content)).to eq(quote_first_line)
70
- end
71
-
72
- it 'should return the content in a docx file' do
73
- expect(first_line.(docx_parser.content)).to eq(quote_first_line)
74
- end
75
-
76
- it 'should return the content in a pdf file' do
77
- expect(first_line.(pdf_parser.content)).to eq(quote_first_line)
78
- end
79
-
80
- it 'should return no content for an image' do
81
- expect(image_parser.metadata.keys).to_not be_empty
82
- end
83
-
84
- it 'should only return max content length' do
85
- expect(Rika::Parser.new(file_path('text_file.txt'), 9).content).to eq('Stopping')
86
- end
87
-
88
- it 'should only return max content length for file over http', focus: true do
89
- server_runner.call( -> do
90
- expect(Rika::Parser.new(File.join(url, 'document.pdf'), 9).content).to eq('Stopping')
91
- end)
92
- end
93
-
94
- it 'should return the content from a file over http' do
95
- server_runner.call( -> do
96
- content = Rika::Parser.new(File.join(url, 'document.pdf')).content
97
- expect(first_line.(content)).to eq(quote_first_line)
98
- end)
99
- end
100
-
101
- it 'should return empty string for unknown file' do
102
- expect(unknown_parser.content).to be_empty
103
- end
104
- end
105
-
106
- # We just test a few of the metadata fields for some common file formats
107
- # to make sure the integration with Apache Tika works. Apache Tika already
108
- # have tests for all file formats it supports so we won't retest that
109
- describe '#metadata' do
110
- it 'should return nil if metadata field does not exist' do
111
- expect(txt_parser.metadata['nonsense']).to be_nil
112
- end
113
-
114
- it 'should return metadata from a docx file' do
115
- expect(docx_parser.metadata['Page-Count']).to eq('1')
116
- end
117
-
118
- it 'should return metadata from a pdf file' do
119
- expect(pdf_parser.metadata['Author']).to eq('Robert Frost')
120
- end
121
-
122
- it 'should return metadata from a file over http', focus: true do
123
- server_runner.call( -> do
124
- parser = Rika::Parser.new(File.join(url, 'document.pdf'))
125
- expect(parser.metadata['Author']).to eq('Robert Frost')
126
- end)
127
- end
128
-
129
- it 'should return metadata from an image' do
130
- expect(image_parser.metadata['Image Height']).to eq('72 pixels')
131
- expect(image_parser.metadata['Image Width']).to eq('72 pixels')
132
- end
133
- end
134
-
135
- describe '#available_metadata' do
136
- it 'should return available metadata fields' do
137
- expect(txt_parser.available_metadata).to_not be_empty
138
- end
139
-
140
- it 'should be an array' do
141
- expect(txt_parser.available_metadata).to be_an(Array)
142
- end
143
- end
144
-
145
- describe '#metadata_exists?' do
146
- it 'should return false if metadata does not exist' do
147
- expect(txt_parser.metadata_exists?('title')).to be false
148
- end
149
-
150
- it 'should return true if metadata exist' do
151
- expect(docx_parser.metadata_exists?('title')).to be true
152
- end
153
- end
154
-
155
- describe '#media_type' do
156
- it 'should return application/pdf for a pdf file' do
157
- expect(pdf_parser.media_type).to eq('application/pdf')
158
- end
159
-
160
- it 'should return text/plain for a txt file' do
161
- expect(txt_parser.media_type).to eq('text/plain')
162
- end
163
-
164
- it 'should return application/pdf for a pdf over http' do
165
- server_runner.call( -> do
166
- parser = Rika::Parser.new(File.join(url, 'document.pdf'))
167
- expect(parser.media_type).to eq('application/pdf')
168
- end)
169
- end
170
-
171
- it 'should return application/octet-stream for unknown file' do
172
- expect(unknown_parser.media_type).to eq('application/octet-stream')
173
- end
174
-
175
- it 'should return msword for a doc file' do
176
- expect(doc_parser.media_type).to eq('application/msword')
177
- end
178
-
179
- it 'should return wordprocessingml for a docx file' do
180
- expect(docx_parser.media_type).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
181
- end
182
- end
183
-
184
- describe '#language' do
185
- it 'should return the language of the content' do
186
- %w(en de fr ru es).each do |lang|
187
- txt = Rika::Parser.new(file_path("#{lang}.txt"))
188
- expect(txt.language).to eq(lang)
189
- end
190
- end
191
- end
192
-
193
- # See note in rika.rb #language_is_reasonably_certain? regarding this method's future.
194
- describe '#language_is_reasonably_certain?' do
195
- it "should return false if lang can't be determined" do
196
- lang = Rika::Parser.new(file_path("lang_cant_be_determined.txt"))
197
- lang.language_is_reasonably_certain? == false
198
- end
199
-
200
- it "should return true if language can be determined" do
201
- lang = Rika::Parser.new(file_path("en.txt"))
202
- lang.language_is_reasonably_certain? == true
203
- end
204
- end
205
-
206
- it 'should return valid content using Rika.parse_content' do
207
- content = Rika.parse_content(sample_pdf_filespec)
208
- expect(content).to be_a(String)
209
- expect(content).to_not be_empty
210
- end
211
-
212
- it 'should return valid metadata using Rika.parse_metadata' do
213
- metadata = Rika.parse_metadata(sample_pdf_filespec)
214
- expect(metadata).to be_a(Hash)
215
- expect(metadata).to_not be_empty
216
- end
217
-
218
- it 'should return valid content and metadata using Rika.parse_content_and_metadata' do
219
- content, metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
220
- expect(content).to be_a(String)
221
- expect(content).to_not be_empty
222
- expect(metadata).to be_a(Hash)
223
- expect(metadata).to_not be_empty
224
- end
225
-
226
- specify 'both means of getting both content and metadata should return the same values' do
227
- content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
228
-
229
- h = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)
230
- content_2 = h[:content]
231
- metadata_2 = h[:metadata]
232
-
233
- expect(content_1).to eq(content_2)
234
- expect(metadata_1).to eq(metadata_2)
235
- end
236
-
237
- specify 'getting content and metadata individually and together should return the same values' do
238
- content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec, -1)
239
- content_2 = Rika.parse_content(sample_pdf_filespec)
240
- metadata_2 = Rika.parse_metadata(sample_pdf_filespec, -1)
241
-
242
- expect(content_1).to eq(content_2)
243
- expect(metadata_1).to eq(metadata_2)
244
- end
245
- end
File without changes