rika 1.6.0-java → 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +6 -4
  3. data/.rubocop.yml +49 -0
  4. data/Gemfile +12 -0
  5. data/README.md +226 -47
  6. data/RELEASE_NOTES.md +43 -0
  7. data/Rakefile +4 -7
  8. data/bin/rika +13 -0
  9. data/lib/rika/cli/args_parser.rb +131 -0
  10. data/lib/rika/cli/rika_command.rb +129 -0
  11. data/lib/rika/formatters.rb +39 -0
  12. data/lib/rika/parse_result.rb +34 -0
  13. data/lib/rika/parser.rb +84 -0
  14. data/lib/rika/tika_loader.rb +65 -0
  15. data/lib/rika/version.rb +3 -1
  16. data/lib/rika.rb +96 -104
  17. data/pom.xml +2 -2
  18. data/rika.gemspec +30 -15
  19. data/rika_helper.rb +30 -0
  20. data/spec/fixtures/de.txt +21 -1
  21. data/spec/fixtures/document.doc +0 -0
  22. data/spec/fixtures/document.docx +0 -0
  23. data/spec/fixtures/document.pdf +0 -0
  24. data/spec/fixtures/document.txt +23 -0
  25. data/spec/fixtures/en.txt +23 -1
  26. data/spec/fixtures/es.txt +21 -1
  27. data/spec/fixtures/fr.txt +23 -1
  28. data/spec/fixtures/image_jpg_without_extension +0 -0
  29. data/spec/fixtures/ru.txt +21 -1
  30. data/spec/fixtures/tiny.txt +1 -0
  31. data/spec/rika/cli/args_parser_spec.rb +117 -0
  32. data/spec/rika/cli/rika_command_spec.rb +120 -0
  33. data/spec/rika/formatters_spec.rb +23 -0
  34. data/spec/rika/parse_result_spec.rb +42 -0
  35. data/spec/rika/parser_spec.rb +304 -0
  36. data/spec/rika/rika_spec.rb +10 -0
  37. data/spec/rika/tika_loader_spec.rb +57 -0
  38. data/spec/spec_helper.rb +13 -5
  39. metadata +54 -98
  40. data/.travis.yml +0 -7
  41. data/spec/fixtures/over_100k_file.txt +0 -1241
  42. data/spec/fixtures/text_file.txt +0 -1
  43. data/spec/fixtures/text_file_without_extension +0 -1
  44. data/spec/rika_spec.rb +0 -202
  45. data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
  46. data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
  47. data/target/dependency/asm-debug-all-4.1.jar +0 -0
  48. data/target/dependency/aspectjrt-1.8.0.jar +0 -0
  49. data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
  50. data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
  51. data/target/dependency/boilerpipe-1.1.0.jar +0 -0
  52. data/target/dependency/commons-codec-1.9.jar +0 -0
  53. data/target/dependency/commons-compress-1.8.1.jar +0 -0
  54. data/target/dependency/commons-httpclient-3.1.jar +0 -0
  55. data/target/dependency/commons-logging-1.1.1.jar +0 -0
  56. data/target/dependency/fontbox-1.8.6.jar +0 -0
  57. data/target/dependency/isoparser-1.0.2.jar +0 -0
  58. data/target/dependency/java-libpst-0.8.1.jar +0 -0
  59. data/target/dependency/jcip-annotations-1.0.jar +0 -0
  60. data/target/dependency/jdom-1.0.jar +0 -0
  61. data/target/dependency/jempbox-1.8.6.jar +0 -0
  62. data/target/dependency/jhighlight-1.0.jar +0 -0
  63. data/target/dependency/jmatio-1.0.jar +0 -0
  64. data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
  65. data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
  66. data/target/dependency/netcdf-4.2.20.jar +0 -0
  67. data/target/dependency/pdfbox-1.8.6.jar +0 -0
  68. data/target/dependency/poi-3.11-beta2.jar +0 -0
  69. data/target/dependency/poi-ooxml-3.11-beta2.jar +0 -0
  70. data/target/dependency/poi-ooxml-schemas-3.11-beta2.jar +0 -0
  71. data/target/dependency/poi-scratchpad-3.11-beta2.jar +0 -0
  72. data/target/dependency/rome-1.0.jar +0 -0
  73. data/target/dependency/slf4j-api-1.6.1.jar +0 -0
  74. data/target/dependency/tagsoup-1.2.1.jar +0 -0
  75. data/target/dependency/tika-core-1.6.jar +0 -0
  76. data/target/dependency/tika-parsers-1.6.jar +0 -0
  77. data/target/dependency/unidataCommon-4.2.20.jar +0 -0
  78. data/target/dependency/vorbis-java-core-0.6.jar +0 -0
  79. data/target/dependency/vorbis-java-tika-0.6.jar +0 -0
  80. data/target/dependency/xercesImpl-2.8.1.jar +0 -0
  81. data/target/dependency/xml-apis-1.3.03.jar +0 -0
  82. data/target/dependency/xmlbeans-2.6.0.jar +0 -0
  83. data/target/dependency/xmpcore-5.1.2.jar +0 -0
  84. data/target/dependency/xz-1.5.jar +0 -0
@@ -0,0 +1,304 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'rika/parser'
5
+ require 'rika/parse_result'
6
+ require 'webrick'
7
+
8
+ describe Rika::Parser do
9
+ port = 50515
10
+
11
+ let(:text_parse_result) { Rika.parse(fixture_path('document.txt')) }
12
+ let(:docx_parse_result) { Rika.parse(fixture_path('document.docx')) }
13
+ let(:doc_parse_result) { Rika.parse(fixture_path('document.doc')) }
14
+ let(:pdf_parse_result) { Rika.parse(fixture_path('document.pdf')) }
15
+ let(:image_parse_result) { Rika.parse(fixture_path('image.jpg')) }
16
+ let(:unknown_parse_result) { Rika.parse(fixture_path('unknown.bin')) }
17
+ let(:fixtures_dir) { File.expand_path(File.join(File.dirname(__FILE__), '../fixtures')) }
18
+ let(:quote_first_line) { 'Stopping by Woods on a Snowy Evening' }
19
+ let(:url) { "http://#{Socket.gethostname}:#{port}" }
20
+ let(:sample_pdf_filespec) { fixture_path('document.pdf') }
21
+ let(:first_line) { ->(string) { string.split("\n").first.strip } }
22
+
23
+ # returns a lambda that, when passed an action, will wrap it in an HTTP server
24
+ let(:server_runner) do
25
+ ->(action) do
26
+ server = nil
27
+ server_thread = Thread.new do
28
+ server = WEBrick::HTTPServer.new(
29
+ Port: port,
30
+ DocumentRoot: fixtures_dir,
31
+ AccessLog: [],
32
+ Logger: WEBrick::Log.new('/dev/null')
33
+ )
34
+ server.start
35
+ end
36
+
37
+ # Wait for server to become ready on its new thread
38
+ sleep 0.01 while server.nil?
39
+ begin
40
+ action.call
41
+ ensure
42
+ server.shutdown
43
+ server_thread.exit
44
+ end
45
+ end
46
+ end
47
+
48
+ context 'when initialized with a content string and metadata' do
49
+ let(:content) { 'Magnifique' }
50
+ let(:metadata) { { 'author' => 'John Doe' } }
51
+ let(:result) { Rika::ParseResult.new(content: content, metadata: metadata) }
52
+
53
+ specify '#content_and_metadata_hash returns a hash with content and metadata' do
54
+ expect(result.content_and_metadata_hash).to eq({ content: content, metadata: metadata })
55
+ end
56
+ end
57
+
58
+ describe '#parse' do
59
+ let(:parser) { described_class.new('spec/fixtures/document.pdf') }
60
+ let(:parse_result) { parser.parse }
61
+ let(:metadata) { parse_result.metadata }
62
+
63
+ specify 'returns an instance of ParseResult' do
64
+ expect(parse_result).to be_a(Rika::ParseResult)
65
+ end
66
+
67
+ specify 'returns a ParseResult with the expected access methods' do
68
+ expect(parse_result).to respond_to(
69
+ :content,
70
+ :metadata,
71
+ :metadata_java,
72
+ :content_type,
73
+ :language,
74
+ :input_type,
75
+ :data_source,
76
+ :max_content_length
77
+ )
78
+ end
79
+
80
+ specify 'returns a ParseResult with the expected content' do
81
+ expect(parse_result.content).to include('Stopping by Woods on a Snowy Evening')
82
+ end
83
+
84
+ specify 'returns a ParseResult with the expected metadata' do
85
+ expect(parse_result.metadata).to include(
86
+ 'dc:creator' => 'Robert Frost',
87
+ 'dc:format' => 'application/pdf; version=1.3',
88
+ 'dc:title' => 'Stopping by Woods on a Snowy Evening',
89
+ 'rika:data-source' => 'spec/fixtures/document.pdf',
90
+ 'rika:language' => 'en'
91
+ )
92
+ end
93
+
94
+ specify 'returns a ParseResult with the expected metadata_java' do
95
+ expect(parse_result.metadata_java).to be_a(Java::OrgApacheTikaMetadata::Metadata)
96
+ end
97
+
98
+ specify 'returns a ParseResult with the expected content_type' do
99
+ expect(parse_result.content_type).to eq('application/pdf')
100
+ end
101
+
102
+ specify 'returns a ParseResult with the expected language' do
103
+ expect(parse_result.language).to eq('en')
104
+ end
105
+
106
+ specify 'returns a ParseResult with the expected input_type' do
107
+ expect(parse_result.input_type).to eq(:file)
108
+ end
109
+
110
+ specify 'returns a ParseResult with the expected data_source' do
111
+ expect(parse_result.data_source).to eq('spec/fixtures/document.pdf')
112
+ end
113
+
114
+ describe 'metadata key sorting' do
115
+ RSpec.shared_examples('metadata key sorting') do |caption, key_sort|
116
+ specify "Metadata keys are #{caption} case insensitively when key_sort is #{key_sort}" do
117
+ parser = described_class.new('spec/fixtures/document.pdf', key_sort: key_sort)
118
+ keys = parser.parse.metadata.keys
119
+ expect(keys == keys.sort_by(&:downcase)).to eq(key_sort)
120
+ expect(keys).not_to eq(keys.map(&:downcase)) # Above test only valid if both upper and lower case occur.
121
+ end
122
+ end
123
+
124
+ include_examples 'metadata key sorting', 'sorted', true
125
+ include_examples 'metadata key sorting', 'not sorted', false
126
+ end
127
+
128
+ specify 'returns a ParseResult with the expected max_content_length' do
129
+ expect(parse_result.max_content_length).to eq(-1)
130
+ end
131
+ end
132
+
133
+ it 'raises an error if the file does not exist' do
134
+ expect { Rika.parse(fixture_path('nonexistent_file.txt')) }.to raise_error(IOError)
135
+ end
136
+
137
+ it 'raises an error if the URL does not exist' do
138
+ unavailable_server = 'http://k6075sd0dfkr8nvfw0zvwfwckucf2aba.com'
139
+ unavailable_file_on_web = File.join(unavailable_server, 'x.pdf')
140
+ expect { Rika.parse(unavailable_file_on_web) }.to raise_error(Java::JavaNet::UnknownHostException)
141
+ end
142
+
143
+ it 'detects a file type without a file extension' do
144
+ parse_result = Rika.parse(fixture_path('image_jpg_without_extension'))
145
+ expect(parse_result.metadata['Content-Type']).to eq('image/jpeg')
146
+ end
147
+
148
+ describe '#content' do
149
+ it 'returns the content in a text file' do
150
+ expect(first_line.(text_parse_result.content)).to eq(quote_first_line)
151
+ end
152
+
153
+ it 'returns the content in a docx file' do
154
+ expect(first_line.(docx_parse_result.content)).to eq(quote_first_line)
155
+ end
156
+
157
+ it 'returns the content in a pdf file' do
158
+ # For some reason, the generated PDF file has a newline at the beginning
159
+ # and trailing spaces on the lines, so we use the second line, and
160
+ # use `include` to do the text match.
161
+ expect(pdf_parse_result.content.lines[1]).to include(quote_first_line)
162
+ end
163
+
164
+ it 'returns no content for an image' do
165
+ expect(image_parse_result.content).to be_empty
166
+ end
167
+
168
+ it 'only returns max content length from a text file' do
169
+ expect(Rika.parse(fixture_path('document.txt'), max_content_length: 8).content).to eq('Stopping')
170
+ end
171
+
172
+ it 'only returns max content length from a PDF' do
173
+ expect(Rika.parse(fixture_path('document.pdf'), max_content_length: 9).content).to eq("\nStopping")
174
+ end
175
+
176
+ it 'only returns max content length for file over http' do
177
+ server_runner.call(-> do
178
+ content = Rika.parse(File.join(url, 'document.txt'), max_content_length: 8).content
179
+ expect(content).to eq('Stopping')
180
+ end)
181
+ end
182
+
183
+ it 'returns the content from a file over http' do
184
+ content = server_runner.call(-> do
185
+ Rika.parse(File.join(url, 'document.txt')).content
186
+ end)
187
+ expect(first_line.(content)).to eq(quote_first_line)
188
+ end
189
+
190
+ it 'return empty string for unknown file' do
191
+ expect(unknown_parse_result.content).to be_empty
192
+ end
193
+ end
194
+
195
+ # We just test a few of the metadata fields for some common file formats
196
+ # to make sure the integration with Apache Tika works. Apache Tika already
197
+ # has tests for all the file formats it supports, so we don't retest them here.
198
+ describe '#metadata' do
199
+ it 'returns nil if metadata field does not exist' do
200
+ expect(text_parse_result.metadata['nonsense']).to be_nil
201
+ end
202
+
203
+ it 'returns metadata from a docx file' do
204
+ expect(docx_parse_result.metadata['meta:page-count']).to eq('1')
205
+ end
206
+
207
+ it 'returns metadata from a pdf file' do
208
+ expect(pdf_parse_result.metadata['pdf:docinfo:creator']).to eq('Robert Frost')
209
+ end
210
+
211
+ it 'returns metadata from a file over http' do
212
+ server_runner.call(-> do
213
+ parser = Rika.parse(File.join(url, 'document.pdf'))
214
+ expect(parser.metadata['pdf:docinfo:creator']).to eq('Robert Frost')
215
+ end)
216
+ end
217
+
218
+ it 'returns metadata from an image' do
219
+ expect(image_parse_result.metadata['Image Height']).to eq('72 pixels')
220
+ expect(image_parse_result.metadata['Image Width']).to eq('72 pixels')
221
+ end
222
+ end
223
+
224
+ describe '#content_type' do
225
+ it 'returns application/pdf for a pdf file' do
226
+ expect(pdf_parse_result.content_type).to eq('application/pdf')
227
+ end
228
+
229
+ it 'returns text/plain for a txt file' do
230
+ expect(text_parse_result.content_type).to eq('text/plain; charset=UTF-8')
231
+ end
232
+
233
+ it 'returns application/pdf for a pdf over http' do
234
+ server_runner.call(-> do
235
+ parse_result = Rika.parse(File.join(url, 'document.pdf'))
236
+ expect(parse_result.content_type).to eq('application/pdf')
237
+ end)
238
+ end
239
+
240
+ it 'returns application/octet-stream for unknown file' do
241
+ expect(unknown_parse_result.content_type).to eq('application/octet-stream')
242
+ end
243
+
244
+ it 'returns msword for a doc file' do
245
+ # There seem to be two permissible content types for a doc file.
246
+ expect(%w{application/msword application/x-tika-msoffice}.include?(doc_parse_result.content_type)).to be true
247
+ end
248
+
249
+ it 'returns wordprocessingml for a docx file' do
250
+ expect(docx_parse_result.content_type).to eq(
251
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
252
+ )
253
+ end
254
+ end
255
+
256
+ describe '#language' do
257
+ it 'returns the language of the content' do
258
+ %w(en de fr ru es).each do |lang|
259
+ parse_result = Rika.parse(fixture_path("#{lang}.txt"))
260
+ expect(parse_result.language).to eq(lang)
261
+ end
262
+ end
263
+ end
264
+
265
+ it 'returns valid content using Rika.parse_content' do
266
+ content = Rika.parse_content(sample_pdf_filespec)
267
+ expect(content).to be_a(String)
268
+ expect(content).not_to be_empty
269
+ end
270
+
271
+ it 'returns valid metadata using Rika.parse_metadata' do
272
+ metadata = Rika.parse_metadata(sample_pdf_filespec)
273
+ expect(metadata).to be_a(Hash)
274
+ expect(metadata).not_to be_empty
275
+ end
276
+
277
+ it 'returns valid content and metadata using Rika.parse_content_and_metadata' do
278
+ content, metadata = Rika.parse_content_and_metadata(sample_pdf_filespec)
279
+ expect(content).to be_a(String)
280
+ expect(content).not_to be_empty
281
+ expect(metadata).to be_a(Hash)
282
+ expect(metadata).not_to be_empty
283
+ end
284
+
285
+ specify 'both means of getting both content and metadata return the same values' do
286
+ content1, metadata1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
287
+
288
+ h = Rika.parse_content_and_metadata_as_hash(sample_pdf_filespec)
289
+ content2 = h[:content]
290
+ metadata2 = h[:metadata]
291
+
292
+ expect(content1).to eq(content2)
293
+ expect(metadata1).to eq(metadata2)
294
+ end
295
+
296
+ specify 'getting content and metadata individually and together return the same values' do
297
+ content1, metadata1 = Rika.parse_content_and_metadata(sample_pdf_filespec)
298
+ content2 = Rika.parse_content(sample_pdf_filespec)
299
+ metadata2 = Rika.parse_metadata(sample_pdf_filespec)
300
+
301
+ expect(content1).to eq(content2)
302
+ expect(metadata1).to eq(metadata2)
303
+ end
304
+ end
@@ -0,0 +1,10 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'rika'
5
+
6
+ describe Rika do
7
+ it 'has a version number' do
8
+ expect(Rika::VERSION).not_to be_nil
9
+ end
10
+ end
@@ -0,0 +1,57 @@
1
+ # frozen_string_literal: true
2
+
3
+ require 'spec_helper'
4
+ require 'rika/tika_loader'
5
+
6
+ describe Rika::TikaLoader do
7
+ describe '.require_tika' do
8
+ it 'returns the correct Tika jar file path' do
9
+ expect(described_class.require_tika).to match(/tika-app-.*\.jar/)
10
+ end
11
+
12
+ it 'calls print_message_and_exit if the Tika jar file cannot be loaded' do
13
+ allow(ENV).to receive(:[]).with('TIKA_JAR_FILESPEC').and_return('nonexistent_file')
14
+ expect { described_class.require_tika }.to raise_error(Rika::TikaLoadError) \
15
+ .with_message(/Unable to load Tika jar file from 'nonexistent_file'./)
16
+ end
17
+ end
18
+
19
+ describe '.specified_tika_filespec' do
20
+ it 'returns the correct Tika jar file path' do
21
+ expect(described_class.send(:specified_tika_filespec)).to match(/tika-app-.*\.jar/)
22
+ end
23
+
24
+ it 'raises a TikaLoadError if the Tika jar filespec is not specified at all in TIKA_JAR_FILESPEC' do
25
+ allow(ENV).to receive(:[]).with('TIKA_JAR_FILESPEC').and_return(nil)
26
+ expect { described_class.send(:specified_tika_filespec) }.to raise_error(Rika::TikaLoadError) \
27
+ .with_message(/Environment variable TIKA_JAR_FILESPEC is not set./)
28
+ end
29
+ end
30
+
31
+ describe '.print_message_and_exit' do
32
+ it 'prints the correct message and exits with an exit code of 1' do
33
+ stderr_orig = $stderr
34
+ $stderr = StringIO.new
35
+
36
+ begin
37
+ expect { described_class.send(:print_message_and_exit, 'message') }.to raise_error(SystemExit) do |error|
38
+ expect(error.status).to eq(1)
39
+ end
40
+ expect($stderr.string).to match(/message/)
41
+ ensure
42
+ $stderr = stderr_orig
43
+ end
44
+ end
45
+ end
46
+
47
+ describe '.formatted_error_message' do
48
+ it 'returns the correct message' do
49
+ message = 'This is a test message.'
50
+ expect(described_class.send(:formatted_error_message, message)).to match(/#{message}/)
51
+ end
52
+
53
+ it 'returns the correct banner' do
54
+ expect(described_class.send(:formatted_error_message, 'message').lines.grep(/!{79}/).size).to be >= 2
55
+ end
56
+ end
57
+ end
data/spec/spec_helper.rb CHANGED
@@ -1,14 +1,22 @@
1
- require "rika"
1
+ # frozen_string_literal: true
2
2
 
3
- def file_path( *paths )
3
+ require 'simplecov'
4
+ SimpleCov.start { add_filter '/spec/' }
5
+
6
+ require 'rika'
7
+
8
+ def fixture_path(*paths)
4
9
  File.expand_path(File.join(File.dirname(__FILE__), 'fixtures', *paths))
5
10
  end
6
11
 
7
12
  # See http://rubydoc.info/gems/rspec-core/RSpec/Core/Configuration
8
13
  RSpec.configure do |config|
9
- config.treat_symbols_as_metadata_keys_with_true_values = true
10
- config.run_all_when_everything_filtered = true
11
- config.filter_run :focus
14
+ # Enable the line below if you want ", focus: true" after a test declaration to
15
+ # denote the only tests that will be run:
16
+ # config.filter_run :focus
12
17
 
13
18
  config.order = 'random'
19
+ config.example_status_persistence_file_path = 'spec/rspec-failed-tests-control-file.txt'
14
20
  end
21
+
22
+ Rika.init
metadata CHANGED
@@ -1,155 +1,111 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rika
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.6.0
4
+ version: 2.0.0
5
5
  platform: java
6
6
  authors:
7
7
  - Richard Nyström
8
+ - Keith Bennett
8
9
  autorequire:
9
10
  bindir: bin
10
11
  cert_chain: []
11
- date: 2014-10-06 00:00:00.000000000 Z
12
+ date: 2023-09-08 00:00:00.000000000 Z
12
13
  dependencies:
13
14
  - !ruby/object:Gem::Dependency
14
15
  requirement: !ruby/object:Gem::Requirement
15
16
  requirements:
16
- - - '='
17
+ - - ">="
17
18
  - !ruby/object:Gem::Version
18
- version: 2.14.1
19
- name: rspec
19
+ version: '0'
20
+ name: awesome_print
20
21
  prerelease: false
21
- type: :development
22
+ type: :runtime
22
23
  version_requirements: !ruby/object:Gem::Requirement
23
24
  requirements:
24
- - - '='
25
+ - - ">="
25
26
  - !ruby/object:Gem::Version
26
- version: 2.14.1
27
- - !ruby/object:Gem::Dependency
28
- requirement: !ruby/object:Gem::Requirement
29
- requirements:
30
- - - '='
31
- - !ruby/object:Gem::Version
32
- version: 10.3.1
33
- name: rake
34
- prerelease: false
35
- type: :development
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - '='
39
- - !ruby/object:Gem::Version
40
- version: 10.3.1
41
- description: ' A JRuby wrapper for Apache Tika to extract text and metadata from various
42
- file formats. '
27
+ version: '0'
28
+ description: A JRuby wrapper for Apache Tika to extract text and metadata from files
29
+ of various formats.
43
30
  email:
44
31
  - ricny046@gmail.com
45
- executables: []
32
+ - keithrbennett@gmail.com
33
+ executables:
34
+ - rika
46
35
  extensions: []
47
36
  extra_rdoc_files: []
48
37
  files:
49
- - .gitignore
50
- - .rspec
51
- - .travis.yml
38
+ - ".gitignore"
39
+ - ".rspec"
40
+ - ".rubocop.yml"
52
41
  - Gemfile
53
42
  - LICENSE.txt
54
43
  - README.md
44
+ - RELEASE_NOTES.md
55
45
  - Rakefile
46
+ - bin/rika
56
47
  - lib/rika.rb
48
+ - lib/rika/cli/args_parser.rb
49
+ - lib/rika/cli/rika_command.rb
50
+ - lib/rika/formatters.rb
51
+ - lib/rika/parse_result.rb
52
+ - lib/rika/parser.rb
53
+ - lib/rika/tika_loader.rb
57
54
  - lib/rika/version.rb
58
55
  - pom.xml
59
56
  - rika.gemspec
57
+ - rika_helper.rb
60
58
  - spec/fixtures/de.txt
61
59
  - spec/fixtures/document.doc
62
60
  - spec/fixtures/document.docx
63
61
  - spec/fixtures/document.pdf
62
+ - spec/fixtures/document.txt
64
63
  - spec/fixtures/en.txt
65
64
  - spec/fixtures/es.txt
66
65
  - spec/fixtures/fr.txt
67
66
  - spec/fixtures/image.jpg
67
+ - spec/fixtures/image_jpg_without_extension
68
68
  - spec/fixtures/lang_cant_be_determined.txt
69
- - spec/fixtures/over_100k_file.txt
70
69
  - spec/fixtures/ru.txt
71
- - spec/fixtures/text_file.txt
72
- - spec/fixtures/text_file_without_extension
70
+ - spec/fixtures/tiny.txt
73
71
  - spec/fixtures/unknown.bin
74
- - spec/rika_spec.rb
72
+ - spec/rika/cli/args_parser_spec.rb
73
+ - spec/rika/cli/rika_command_spec.rb
74
+ - spec/rika/formatters_spec.rb
75
+ - spec/rika/parse_result_spec.rb
76
+ - spec/rika/parser_spec.rb
77
+ - spec/rika/rika_spec.rb
78
+ - spec/rika/tika_loader_spec.rb
75
79
  - spec/spec_helper.rb
76
- - target/dependency/apache-mime4j-core-0.7.2.jar
77
- - target/dependency/apache-mime4j-dom-0.7.2.jar
78
- - target/dependency/asm-debug-all-4.1.jar
79
- - target/dependency/aspectjrt-1.8.0.jar
80
- - target/dependency/bcmail-jdk15-1.45.jar
81
- - target/dependency/bcprov-jdk15-1.45.jar
82
- - target/dependency/boilerpipe-1.1.0.jar
83
- - target/dependency/commons-codec-1.9.jar
84
- - target/dependency/commons-compress-1.8.1.jar
85
- - target/dependency/commons-httpclient-3.1.jar
86
- - target/dependency/commons-logging-1.1.1.jar
87
- - target/dependency/fontbox-1.8.6.jar
88
- - target/dependency/isoparser-1.0.2.jar
89
- - target/dependency/java-libpst-0.8.1.jar
90
- - target/dependency/jcip-annotations-1.0.jar
91
- - target/dependency/jdom-1.0.jar
92
- - target/dependency/jempbox-1.8.6.jar
93
- - target/dependency/jhighlight-1.0.jar
94
- - target/dependency/jmatio-1.0.jar
95
- - target/dependency/juniversalchardet-1.0.3.jar
96
- - target/dependency/metadata-extractor-2.6.2.jar
97
- - target/dependency/netcdf-4.2.20.jar
98
- - target/dependency/pdfbox-1.8.6.jar
99
- - target/dependency/poi-3.11-beta2.jar
100
- - target/dependency/poi-ooxml-3.11-beta2.jar
101
- - target/dependency/poi-ooxml-schemas-3.11-beta2.jar
102
- - target/dependency/poi-scratchpad-3.11-beta2.jar
103
- - target/dependency/rome-1.0.jar
104
- - target/dependency/slf4j-api-1.6.1.jar
105
- - target/dependency/tagsoup-1.2.1.jar
106
- - target/dependency/tika-core-1.6.jar
107
- - target/dependency/tika-parsers-1.6.jar
108
- - target/dependency/unidataCommon-4.2.20.jar
109
- - target/dependency/vorbis-java-core-0.6.jar
110
- - target/dependency/vorbis-java-tika-0.6.jar
111
- - target/dependency/xercesImpl-2.8.1.jar
112
- - target/dependency/xml-apis-1.3.03.jar
113
- - target/dependency/xmlbeans-2.6.0.jar
114
- - target/dependency/xmpcore-5.1.2.jar
115
- - target/dependency/xz-1.5.jar
116
- homepage: https://github.com/ricn/rika
117
- licenses: []
118
- metadata: {}
119
- post_install_message:
80
+ homepage: https://github.com/keithrbennett/rika
81
+ licenses:
82
+ - Apache-2.0
83
+ metadata:
84
+ rubygems_mfa_required: 'true'
85
+ post_install_message: |2+
86
+
87
+ Using the rika gem requires that you:
88
+ 1) download the Apache Tika tika-app jar file from https://tika.apache.org/download.html
89
+ 2) place it somewhere accessible to the running application
90
+ 3) specify its location in the TIKA_JAR_FILESPEC environment variable
91
+
120
92
  rdoc_options: []
121
93
  require_paths:
122
94
  - lib
123
95
  required_ruby_version: !ruby/object:Gem::Requirement
124
96
  requirements:
125
- - - '>='
97
+ - - ">="
126
98
  - !ruby/object:Gem::Version
127
99
  version: '0'
128
100
  required_rubygems_version: !ruby/object:Gem::Requirement
129
101
  requirements:
130
- - - '>='
102
+ - - ">="
131
103
  - !ruby/object:Gem::Version
132
104
  version: '0'
133
105
  requirements: []
134
- rubyforge_project:
135
- rubygems_version: 2.1.9
106
+ rubygems_version: 3.3.26
136
107
  signing_key:
137
108
  specification_version: 4
138
- summary: A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
139
- test_files:
140
- - spec/fixtures/de.txt
141
- - spec/fixtures/document.doc
142
- - spec/fixtures/document.docx
143
- - spec/fixtures/document.pdf
144
- - spec/fixtures/en.txt
145
- - spec/fixtures/es.txt
146
- - spec/fixtures/fr.txt
147
- - spec/fixtures/image.jpg
148
- - spec/fixtures/lang_cant_be_determined.txt
149
- - spec/fixtures/over_100k_file.txt
150
- - spec/fixtures/ru.txt
151
- - spec/fixtures/text_file.txt
152
- - spec/fixtures/text_file_without_extension
153
- - spec/fixtures/unknown.bin
154
- - spec/rika_spec.rb
155
- - spec/spec_helper.rb
109
+ summary: A JRuby wrapper for Apache Tika to extract text and metadata from files of
110
+ various formats.
111
+ test_files: []
data/.travis.yml DELETED
@@ -1,7 +0,0 @@
1
- language: ruby
2
- rvm:
3
- - jruby-19mode
4
- - jruby-head
5
- notifications:
6
- recipients:
7
- - ricny046@gmail.com