rika 1.11.1-java → 2.0.0-java

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'rika/tika_loader'
5
+
6
+ describe Rika::TikaLoader do
7
+ describe '.require_tika' do
8
+ it 'returns the correct Tika jar file path' do
9
+ expect(described_class.require_tika).to match(/tika-app-.*\.jar/)
10
+ end
11
+
12
+ it 'calls print_message_and_exit if the Tika jar file cannot be loaded' do
13
+ allow(ENV).to receive(:[]).with('TIKA_JAR_FILESPEC').and_return('nonexistent_file')
14
+ expect { described_class.require_tika }.to raise_error(Rika::TikaLoadError) \
15
+ .with_message(/Unable to load Tika jar file from 'nonexistent_file'./)
16
+ end
17
+ end
18
+
19
+ describe '.specified_tika_filespec' do
20
+ it 'returns the correct Tika jar file path' do
21
+ expect(described_class.send(:specified_tika_filespec)).to match(/tika-app-.*\.jar/)
22
+ end
23
+
24
+ it 'raises a TikaLoadError if the Tika jar filespec is not specified at all in TIKA_JAR_FILESPEC' do
25
+ allow(ENV).to receive(:[]).with('TIKA_JAR_FILESPEC').and_return(nil)
26
+ expect { described_class.send(:specified_tika_filespec) }.to raise_error(Rika::TikaLoadError) \
27
+ .with_message(/Environment variable TIKA_JAR_FILESPEC is not set./)
28
+ end
29
+ end
30
+
31
+ describe '.print_message_and_exit' do
32
+ it 'prints the correct message and exits with an exit code of 1' do
33
+ stderr_orig = $stderr
34
+ $stderr = StringIO.new
35
+
36
+ begin
37
+ expect { described_class.send(:print_message_and_exit, 'message') }.to raise_error(SystemExit) do |error|
38
+ expect(error.status).to eq(1)
39
+ end
40
+ expect($stderr.string).to match(/message/)
41
+ ensure
42
+ $stderr = stderr_orig
43
+ end
44
+ end
45
+ end
46
+
47
+ describe '.formatted_error_message' do
48
+ it 'returns the correct message' do
49
+ message = 'This is a test message.'
50
+ expect(described_class.send(:formatted_error_message, message)).to match(/#{message}/)
51
+ end
52
+
53
+ it 'returns the correct banner' do
54
+ expect(described_class.send(:formatted_error_message, 'message').lines.grep(/!{79}/).size).to be >= 2
55
+ end
56
+ end
57
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,15 +1,22 @@
1
- require "rika"
1
+ # frozen_string_literal: true
2
2
 
3
- def file_path( *paths )
3
+ require 'simplecov'
4
+ SimpleCov.start { add_filter '/spec/' }
5
+
6
+ require 'rika'
7
+
8
+ def fixture_path(*paths)
4
9
  File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
5
10
  end
6
11
 
7
12
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
8
13
  RSpec.configure do |config|
9
-
10
14
  # Enable the line below if you want ", focus: true" after a test declaration to
11
15
  # denote the only tests that will be run:
12
16
  # config.filter_run :focus
13
17
 
14
18
  config.order = 'random'
19
+ config.example_status_persistence_file_path = 'spec/rspec-failed-tests-control-file.txt'
15
20
  end
21
+
22
+ Rika.init
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rika
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.11.1
4
+ version: 2.0.0
5
5
  platform: java
6
6
  authors:
7
7
  - Richard Nyström
@@ -9,56 +9,48 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2020-07-24 00:00:00.000000000 Z
12
+ date: 2023-09-08 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
- - - "~>"
17
+ - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '3.9'
20
- name: rspec
21
- type: :development
19
+ version: '0'
20
+ name: awesome_print
22
21
  prerelease: false
22
+ type: :runtime
23
23
  version_requirements: !ruby/object:Gem::Requirement
24
24
  requirements:
25
- - - "~>"
25
+ - - ">="
26
26
  - !ruby/object:Gem::Version
27
- version: '3.9'
28
- - !ruby/object:Gem::Dependency
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - "~>"
32
- - !ruby/object:Gem::Version
33
- version: '13.0'
34
- name: rake
35
- type: :development
36
- prerelease: false
37
- version_requirements: !ruby/object:Gem::Requirement
38
- requirements:
39
- - - "~>"
40
- - !ruby/object:Gem::Version
41
- version: '13.0'
42
- description: " A JRuby wrapper for Apache Tika to extract text and metadata from files\
43
- \ of various formats. "
27
+ version: '0'
28
+ description: A JRuby wrapper for Apache Tika to extract text and metadata from files
29
+ of various formats.
44
30
  email:
45
31
  - ricny046@gmail.com
46
32
  - keithrbennett@gmail.com
47
- executables: []
33
+ executables:
34
+ - rika
48
35
  extensions: []
49
36
  extra_rdoc_files: []
50
37
  files:
51
38
  - ".gitignore"
52
39
  - ".rspec"
53
- - ".travis.yml"
40
+ - ".rubocop.yml"
54
41
  - Gemfile
55
42
  - LICENSE.txt
56
43
  - README.md
57
44
  - RELEASE_NOTES.md
58
45
  - Rakefile
59
- - java-lib/tika-app-1.24.1.jar
46
+ - bin/rika
60
47
  - lib/rika.rb
48
+ - lib/rika/cli/args_parser.rb
49
+ - lib/rika/cli/rika_command.rb
50
+ - lib/rika/formatters.rb
51
+ - lib/rika/parse_result.rb
61
52
  - lib/rika/parser.rb
53
+ - lib/rika/tika_loader.rb
62
54
  - lib/rika/version.rb
63
55
  - pom.xml
64
56
  - rika.gemspec
@@ -67,22 +59,36 @@ files:
67
59
  - spec/fixtures/document.doc
68
60
  - spec/fixtures/document.docx
69
61
  - spec/fixtures/document.pdf
62
+ - spec/fixtures/document.txt
70
63
  - spec/fixtures/en.txt
71
64
  - spec/fixtures/es.txt
72
65
  - spec/fixtures/fr.txt
73
66
  - spec/fixtures/image.jpg
67
+ - spec/fixtures/image_jpg_without_extension
74
68
  - spec/fixtures/lang_cant_be_determined.txt
75
69
  - spec/fixtures/ru.txt
76
- - spec/fixtures/text_file.txt
77
- - spec/fixtures/text_file_without_extension
70
+ - spec/fixtures/tiny.txt
78
71
  - spec/fixtures/unknown.bin
79
- - spec/rika_spec.rb
72
+ - spec/rika/cli/args_parser_spec.rb
73
+ - spec/rika/cli/rika_command_spec.rb
74
+ - spec/rika/formatters_spec.rb
75
+ - spec/rika/parse_result_spec.rb
76
+ - spec/rika/parser_spec.rb
77
+ - spec/rika/rika_spec.rb
78
+ - spec/rika/tika_loader_spec.rb
80
79
  - spec/spec_helper.rb
81
80
  homepage: https://github.com/keithrbennett/rika
82
81
  licenses:
83
82
  - Apache-2.0
84
- metadata: {}
85
- post_install_message:
83
+ metadata:
84
+ rubygems_mfa_required: 'true'
85
+ post_install_message: |2+
86
+
87
+ Using the rika gem requires that you:
88
+ 1) download the Apache Tika tika-app jar file from https://tika.apache.org/download.html
89
+ 2) place it somewhere accessible to the running application
90
+ 3) specify its location in the TIKA_JAR_FILESPEC environment variable
91
+
86
92
  rdoc_options: []
87
93
  require_paths:
88
94
  - lib
@@ -97,24 +103,9 @@ required_rubygems_version: !ruby/object:Gem::Requirement
97
103
  - !ruby/object:Gem::Version
98
104
  version: '0'
99
105
  requirements: []
100
- rubygems_version: 3.0.6
106
+ rubygems_version: 3.3.26
101
107
  signing_key:
102
108
  specification_version: 4
103
109
  summary: A JRuby wrapper for Apache Tika to extract text and metadata from files of
104
110
  various formats.
105
- test_files:
106
- - spec/fixtures/de.txt
107
- - spec/fixtures/document.doc
108
- - spec/fixtures/document.docx
109
- - spec/fixtures/document.pdf
110
- - spec/fixtures/en.txt
111
- - spec/fixtures/es.txt
112
- - spec/fixtures/fr.txt
113
- - spec/fixtures/image.jpg
114
- - spec/fixtures/lang_cant_be_determined.txt
115
- - spec/fixtures/ru.txt
116
- - spec/fixtures/text_file.txt
117
- - spec/fixtures/text_file_without_extension
118
- - spec/fixtures/unknown.bin
119
- - spec/rika_spec.rb
120
- - spec/spec_helper.rb
111
+ test_files: []
data/.travis.yml DELETED
@@ -1,7 +0,0 @@
1
- language: ruby
2
- rvm:
3
- - jruby-9.2.12.0
4
- notifications:
5
- recipients:
6
- - ricny046@gmail.com
7
- - keithrbennett@gmail.com
Binary file
@@ -1,23 +0,0 @@
1
- Stopping by Woods on a Snowy Evening
2
-
3
- By Robert Frost
4
-
5
- Whose woods these are I think I know.
6
- His house is in the village though;
7
- He will not see me stopping here
8
- To watch his woods fill up with snow.
9
-
10
- My little horse must think it queer
11
- To stop without a farmhouse near
12
- Between the woods and frozen lake
13
- The darkest evening of the year.
14
-
15
- He gives his harness bells a shake
16
- To ask if there is some mistake.
17
- The only other sound’s the sweep
18
- Of easy wind and downy flake.
19
-
20
- The woods are lovely, dark and deep,
21
- But I have promises to keep,
22
- And miles to go before I sleep,
23
- And miles to go before I sleep.
data/spec/rika_spec.rb DELETED
@@ -1,245 +0,0 @@
1
- # encoding: utf-8
2
-
3
- require 'spec_helper'
4
- require 'webrick'
5
-
6
- include WEBrick
7
-
8
- describe Rika::Parser do
9
-
10
- let (:txt_parser) { Rika::Parser.new(file_path('text_file.txt')) }
11
- let (:docx_parser) { Rika::Parser.new(file_path('document.docx')) }
12
- let (:doc_parser) { Rika::Parser.new(file_path('document.doc')) }
13
- let (:pdf_parser) { Rika::Parser.new(file_path('document.pdf')) }
14
- let (:image_parser) { Rika::Parser.new(file_path('image.jpg')) }
15
- let (:unknown_parser) { Rika::Parser.new(file_path('unknown.bin')) }
16
- let (:dir) { File.expand_path(File.join(File.dirname(__FILE__), 'fixtures')) }
17
- let (:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }
18
-
19
- port = 50515
20
- let (:url) { "http://#{Socket.gethostname}:#{port}" }
21
-
22
- let (:sample_pdf_filespec) { file_path('document.pdf') }
23
-
24
- let(:first_line) { ->(string) { string.split("\n").first.strip } }
25
-
26
- let(:server_runner) do
27
- # returns a lambda that, when passed an action, will wrap it in an HTTP server
28
- ->(action) do
29
- server = nil
30
- server_thread = Thread.new do
31
- server = HTTPServer.new(
32
- Port: port,
33
- DocumentRoot: dir,
34
- AccessLog: [],
35
- Logger: WEBrick::Log::new('/dev/null')
36
- )
37
- server.start
38
- end
39
-
40
- # Wait for server to become ready on its new thread
41
- sleep 0.01 while server.nil?
42
- begin
43
- action.call
44
- ensure
45
- server.shutdown
46
- server_thread.exit
47
- end
48
- end
49
- end
50
-
51
-
52
- it 'should raise error if file does not exist' do
53
- expect(-> { Rika::Parser.new(file_path('nonexistent_file.txt')) }).to raise_error(IOError)
54
- end
55
-
56
- it 'should raise error if URL does not exist' do
57
- unavailable_server = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
58
- unavailable_file_on_web = File.join(unavailable_server, 'x.pdf')
59
- expect(-> { Rika::Parser.new(unavailable_file_on_web) }).to raise_error(SocketError)
60
- end
61
-
62
- it 'should detect file type without a file extension' do
63
- parser = Rika::Parser.new(file_path('text_file_without_extension'))
64
- expect(parser.metadata['Content-Type']).to eq('text/plain; charset=UTF-8')
65
- end
66
-
67
- describe '#content' do
68
- it 'should return the content in a text file' do
69
- expect(first_line.(txt_parser.content)).to eq(quote_first_line)
70
- end
71
-
72
- it 'should return the content in a docx file' do
73
- expect(first_line.(docx_parser.content)).to eq(quote_first_line)
74
- end
75
-
76
- it 'should return the content in a pdf file' do
77
- expect(first_line.(pdf_parser.content)).to eq(quote_first_line)
78
- end
79
-
80
- it 'should return no content for an image' do
81
- expect(image_parser.metadata.keys).to_not be_empty
82
- end
83
-
84
- it 'should only return max content length' do
85
- expect(Rika::Parser.new(file_path('text_file.txt'), 9).content).to eq('Stopping')
86
- end
87
-
88
- it 'should only return max content length for file over http', focus: true do
89
- server_runner.call( -> do
90
- expect(Rika::Parser.new(File.join(url, 'document.pdf'), 9).content).to eq('Stopping')
91
- end)
92
- end
93
-
94
- it 'should return the content from a file over http' do
95
- server_runner.call( -> do
96
- content = Rika::Parser.new(File.join(url, 'document.pdf')).content
97
- expect(first_line.(content)).to eq(quote_first_line)
98
- end)
99
- end
100
-
101
- it 'should return empty string for unknown file' do
102
- expect(unknown_parser.content).to be_empty
103
- end
104
- end
105
-
106
- # We just test a few of the metadata fields for some common file formats
107
- # to make sure the integration with Apache Tika works. Apache Tika already
108
- # have tests for all file formats it supports so we won't retest that
109
- describe '#metadata' do
110
- it 'should return nil if metadata field does not exist' do
111
- expect(txt_parser.metadata['nonsense']).to be_nil
112
- end
113
-
114
- it 'should return metadata from a docx file' do
115
- expect(docx_parser.metadata['Page-Count']).to eq('1')
116
- end
117
-
118
- it 'should return metadata from a pdf file' do
119
- expect(pdf_parser.metadata['Author']).to eq('Robert Frost')
120
- end
121
-
122
- it 'should return metadata from a file over http', focus: true do
123
- server_runner.call( -> do
124
- parser = Rika::Parser.new(File.join(url, 'document.pdf'))
125
- expect(parser.metadata['Author']).to eq('Robert Frost')
126
- end)
127
- end
128
-
129
- it 'should return metadata from an image' do
130
- expect(image_parser.metadata['Image Height']).to eq('72 pixels')
131
- expect(image_parser.metadata['Image Width']).to eq('72 pixels')
132
- end
133
- end
134
-
135
- describe '#available_metadata' do
136
- it 'should return available metadata fields' do
137
- expect(txt_parser.available_metadata).to_not be_empty
138
- end
139
-
140
- it 'should be an array' do
141
- expect(txt_parser.available_metadata).to be_an(Array)
142
- end
143
- end
144
-
145
- describe '#metadata_exists?' do
146
- it 'should return false if metadata does not exist' do
147
- expect(txt_parser.metadata_exists?('title')).to be false
148
- end
149
-
150
- it 'should return true if metadata exist' do
151
- expect(docx_parser.metadata_exists?('title')).to be true
152
- end
153
- end
154
-
155
- describe '#media_type' do
156
- it 'should return application/pdf for a pdf file' do
157
- expect(pdf_parser.media_type).to eq('application/pdf')
158
- end
159
-
160
- it 'should return text/plain for a txt file' do
161
- expect(txt_parser.media_type).to eq('text/plain')
162
- end
163
-
164
- it 'should return application/pdf for a pdf over http' do
165
- server_runner.call( -> do
166
- parser = Rika::Parser.new(File.join(url, 'document.pdf'))
167
- expect(parser.media_type).to eq('application/pdf')
168
- end)
169
- end
170
-
171
- it 'should return application/octet-stream for unknown file' do
172
- expect(unknown_parser.media_type).to eq('application/octet-stream')
173
- end
174
-
175
- it 'should return msword for a doc file' do
176
- expect(doc_parser.media_type).to eq('application/msword')
177
- end
178
-
179
- it 'should return wordprocessingml for a docx file' do
180
- expect(docx_parser.media_type).to eq('application/vnd.openxmlformats-officedocument.wordprocessingml.document')
181
- end
182
- end
183
-
184
- describe '#language' do
185
- it 'should return the language of the content' do
186
- %w(en de fr ru es).each do |lang|
187
- txt = Rika::Parser.new(file_path("#{lang}.txt"))
188
- expect(txt.language).to eq(lang)
189
- end
190
- end
191
- end
192
-
193
- # See note in rika.rb #language_is_reasonably_certain? regarding this method's future.
194
- describe '#language_is_reasonably_certain?' do
195
- it "should return false if lang can't be determined" do
196
- lang = Rika::Parser.new(file_path("lang_cant_be_determined.txt"))
197
- lang.language_is_reasonably_certain? == false
198
- end
199
-
200
- it "should return true if language can be determined" do
201
- lang = Rika::Parser.new(file_path("en.txt"))
202
- lang.language_is_reasonably_certain? == true
203
- end
204
- end
205
-
206
- it 'should return valid content using Rika.parse_content' do
207
- content = Rika.parse_content(sample_pdf_filespec)
208
- expect(content).to be_a(String)
209
- expect(content).to_not be_empty
210
- end
211
-
212
- it 'should return valid metadata using Rika.parse_metadata' do
213
- metadata = Rika.parse_metadata(sample_pdf_filespec)
214
- expect(metadata).to be_a(Hash)
215
- expect(metadata).to_not be_empty
216
- end
217
-
218
- it 'should return valid content and metadata using Rika.parse_content_and_metadata' do
219
- content, metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
220
- expect(content).to be_a(String)
221
- expect(content).to_not be_empty
222
- expect(metadata).to be_a(Hash)
223
- expect(metadata).to_not be_empty
224
- end
225
-
226
- specify 'both means of getting both content and metadata should return the same values' do
227
- content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
228
-
229
- h = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)
230
- content_2 = h[:content]
231
- metadata_2 = h[:metadata]
232
-
233
- expect(content_1).to eq(content_2)
234
- expect(metadata_1).to eq(metadata_2)
235
- end
236
-
237
- specify 'getting content and metadata individually and together should return the same values' do
238
- content_1, metadata_1 = Rika.parse_content_and_metadata(sample_pdf_filespec, -1)
239
- content_2 = Rika.parse_content(sample_pdf_filespec)
240
- metadata_2 = Rika.parse_metadata(sample_pdf_filespec, -1)
241
-
242
- expect(content_1).to eq(content_2)
243
- expect(metadata_1).to eq(metadata_2)
244
- end
245
- end
File without changes