rika 2.0.1-java → 2.0.3-java

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 9885d200caf6116b8356a6eedf8c953aba4429e05d7a20d30a6c0a2281a9e57e
4
- data.tar.gz: 9c71e6ae01ea608c45cb302627bea7ee3612058fbd237787d4aebc5ff07476b4
3
+ metadata.gz: d1ff6f1340908e2f573ac758ded769bf4a94db0835e06a436addf6dd9408b7ba
4
+ data.tar.gz: 6e99972c0cf72ab44b85e11847ed6685101c8498f2cd8c1a9a51237b2cc7ede2
5
5
  SHA512:
6
- metadata.gz: 407fe5fe996b47c080a1a9282ae5b7c3bc48685f18ee243af81ecb6e567c5eef3dfec86b554ed3fd59d3735f8ba5af7712f9c5388f59a9a22905b0b0c8e01c73
7
- data.tar.gz: 20e0f6c590f5d533866616e2cba8229d1d003959214e452702f19f5fc462b6afa3e4f2e1c8ce5feb30468f9207c0231b9778185333c91af4173c35970684bc2c
6
+ metadata.gz: eaf3905403a4a0f66ace5a9658f1aa0ba77f153d67e588f094fbbe426b7cd4c30640c3f21652b8312493e4ad827719d6ff1d83a613b6ae0ff9a73164b179fd4e
7
+ data.tar.gz: dda02d0f0377f54db2fe59535c4b59b073c5fdcae3f1f8c6e66506427e34c248b923c2b01683a6ea811ca4a1ed45305b96739611a119d7dd244895e28fe7dac4
data/Gemfile CHANGED
@@ -6,11 +6,11 @@ source 'https://rubygems.org'
6
6
  gemspec
7
7
 
8
8
  group :development do
9
- gem 'pry'
10
- gem 'rake', '~> 13.0'
11
- gem 'rspec', '~> 3.9'
12
- gem 'rubocop'
13
- gem 'rubocop-rspec'
14
- gem 'simplecov', require: false
15
- gem 'webrick', '~> 1.6'
9
+ gem 'pry', '~> 0.14', '>= 0.14.2'
10
+ gem 'rake', '~> 13.2', '>= 13.2.1'
11
+ gem 'rspec', '~> 3.13', '>= 3.13.0'
12
+ gem 'rubocop', '~> 1.65', '>= 1.65.1'
13
+ gem 'rubocop-rspec', '~> 3.0', '>= 3.0.3'
14
+ gem 'simplecov', '~>0.22', '>= 0.22.0', require: false
15
+ gem 'webrick', '~> 1.8', '>= 1.8.1'
16
16
  end
data/README.md CHANGED
@@ -82,10 +82,14 @@ specify one or more filespecs or URL's as arguments:
82
82
  ```bash
83
83
  rika x.pdf https://github.com/keithrbennett/rika
84
84
  ```
85
+
86
+ > [!NOTE]
87
+ > If running `rika` produces an error indicating that the JRuby interpreter cannot be found, try preceding it with `jruby`, e.g. `jruby rika x.pdf`.
88
+
85
89
  Here is the help text:
86
90
 
87
91
  ```
88
- Rika v2.0.1 (Tika v2.9.0) - https://github.com/keithrbennett/rika
92
+ Rika v2.0.2 (Tika v2.9.0) - https://github.com/keithrbennett/rika
89
93
 
90
94
  Usage: rika [options] <file or url> [...file or url...]
91
95
  Output formats are: [a]wesome_print, [t]o_s, [i]nspect, [j]son, [J] for pretty json, and [y]aml.
@@ -98,7 +102,7 @@ Values for the text, metadata, and as_array boolean options may be specified as
98
102
  -m, --[no-]metadata [FLAG] Output metadata (default: true)
99
103
  -t, --[no-]text [FLAG] Output text (default: true)
100
104
  -k, --[no-]key-sort [FLAG] Sort metadata keys case insensitively (default: true)
101
- -s, --[no-]source [FLAG] Output document source file or URL
105
+ -s, --[no-]source [FLAG] Output document source file or URL (default: false)
102
106
  -a, --[no-]as-array [FLAG] Output all parsed results as an array (default: false)
103
107
  -v, --version Output version
104
108
  -h, --help Output help
@@ -245,6 +249,14 @@ rexe -in -oJ -mb 'downcase \
245
249
  This gem has been tested with JRuby managed by rvm. It should work with other Ruby version managers and
246
250
  without any version manager at all, but those configurations have not been tested.
247
251
 
252
+ ## Using the Tika Java Jar File Directly
253
+
254
+ Rika provides only the most common Tika use cases. You may want to dig deeper than Rika does into the massive amount
255
+ of functionality provided by the Tika library. You can do so by bypassing Rika altogether and using the Tika jar file
256
+ directly in your own JRuby code. In addition, Tika provides its own command line application that can be called as,
257
+ for example, `java -jar $TIKA_JAR_FILESPEC --help`. This Tika command line application has finer grained control
258
+ over some Tika options, but is missing some conveniences provided by the Rika command line application.
259
+
248
260
  ## Other Tika Resources
249
261
 
250
262
  * The Apache Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
@@ -265,3 +277,8 @@ without any version manager at all, but those configurations have not been teste
265
277
  3. Commit your changes (`git commit -am 'Add some feature'`)
266
278
  4. Push to the branch (`git push origin my-new-feature`)
267
279
  5. Create new Pull Request
280
+
281
+ ## Acknowledgments
282
+
283
+ Many thanks to the brilliant and dedicated developers who have worked to build Apache Tika since its inception many years ago.
284
+
data/RELEASE_NOTES.md CHANGED
@@ -1,5 +1,15 @@
1
1
  ## Release Notes
2
2
 
3
+ #### v2.0.3
4
+
5
+ * Fix parsing of empty files so they do not halt parsing of all files.
6
+ * Update gem dependencies.
7
+
8
+ #### v2.0.2
9
+
10
+ * Now prints source name on line with header and footer lines.
11
+ * Improve help text.
12
+
3
13
  #### v2.0.1
4
14
 
5
15
  * Fix license specification in gemspec, update copyright name and year.
@@ -72,7 +72,7 @@ class ArgsParser
72
72
  options[:key_sort] = (v.nil? ? true : v)
73
73
  end
74
74
 
75
- opts.on('-s', '--[no-]source [FLAG]', TrueClass, 'Output document source file or URL') do |v|
75
+ opts.on('-s', '--[no-]source [FLAG]', TrueClass, 'Output document source file or URL (default: false)') do |v|
76
76
  options[:source] = (v.nil? ? true : v)
77
77
  end
78
78
 
@@ -32,6 +32,13 @@ class RikaCommand
32
32
  puts result_array_output
33
33
  else
34
34
  targets.each do |target|
35
+ # If we don't do this, Tika will raise an org.apache.tika.exception.ZeroByteFileException
36
+ # TODO: Do same for URL?
37
+ if File.file?(target) && File.zero?(target)
38
+ $stderr.puts("\n\nFile empty!: #{target}\n\n")
39
+ next
40
+ end
41
+
35
42
  result = Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
36
43
  puts single_document_output(target, result)
37
44
  end
@@ -71,6 +78,20 @@ class RikaCommand
71
78
  h
72
79
  end
73
80
 
81
+ # Outputs the source file or URL in the form of:
82
+ # -------------------------------------------------------------------------------
83
+ # Source: path/to/file.ext
84
+ # -------------------------------------------------------------------------------
85
+ # @param [String] source document source identifier
86
+ # @return multiline string as displayed above
87
+ private def source_output_string(source)
88
+ <<~STRING
89
+ -------------------------------------------------------------------------------
90
+ Source: #{source}
91
+ -------------------------------------------------------------------------------
92
+ STRING
93
+ end
94
+
74
95
  # Builds the string representation of the result of parsing a single document
75
96
  # @param [String] target the target document
76
97
  # @param [ParseResult] result the parse result
@@ -80,7 +101,7 @@ class RikaCommand
80
101
  metadata_formatter.(result_hash(result))
81
102
  else
82
103
  sio = StringIO.new
83
- sio << "Source: #{target}\n" if options[:source]
104
+ sio << source_output_string(target) if options[:source]
84
105
  sio << metadata_formatter.(result.metadata) << "\n" if options[:metadata]
85
106
  sio << text_formatter.(result.content) << "\n" if options[:text]
86
107
  sio.string
data/lib/rika/version.rb CHANGED
@@ -1,5 +1,5 @@
1
1
  # frozen_string_literal: true
2
2
 
3
3
  module Rika
4
- VERSION = '2.0.1'
4
+ VERSION = '2.0.3'
5
5
  end
data/rika.gemspec CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
17
17
  gem.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
18
18
  gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
19
19
  gem.require_paths = ['lib']
20
- gem.add_dependency 'awesome_print'
20
+ gem.add_dependency 'awesome_print', '~> 1.9', '>= 1.9.2'
21
21
  gem.platform = 'java'
22
22
  gem.license = 'MIT'
23
23
  gem.metadata['rubygems_mfa_required'] = 'true'
File without changes
@@ -0,0 +1 @@
1
+ something
@@ -117,4 +117,39 @@ describe RikaCommand do
117
117
  expect(output).to include('sample help text')
118
118
  end
119
119
  end
120
+
121
+ describe '#source_output_string' do
122
+ let(:rika_command) { described_class.new([]) }
123
+ let(:sample_filespec) { 'path/to/file.ext' }
124
+ let(:sample_output_string) { rika_command.send(:source_output_string, sample_filespec) }
125
+ let(:sample_output_lines) { sample_output_string.lines.map(&:chomp) }
126
+ let(:header_trailer_line) { '-' * 79 }
127
+
128
+ specify 'it has a header and trailer line' do
129
+ expect(sample_output_lines[0]).to eq(header_trailer_line)
130
+ expect(sample_output_lines[2]).to eq(header_trailer_line)
131
+ end
132
+
133
+ specify 'information line is well formed' do
134
+ line = sample_output_lines[1]
135
+ expect(line).to match("Source: #{sample_filespec}")
136
+ end
137
+ end
138
+
139
+ describe 'empty file behavior' do
140
+ let(:empty_file_path) { fixture_path('empty.txt') }
141
+ let(:something_file_path) { fixture_path('something.txt') } # contains "something"
142
+
143
+ specify 'parsing an empty file outputs a message to stderr' do
144
+ expect {
145
+ described_class.new([empty_file_path]).call
146
+ }.to output("\n\nFile empty!: #{empty_file_path}\n\n").to_stderr
147
+ end
148
+
149
+ specify 'parsing an empty file does not interrupt parsing of subsequent files' do
150
+ expect {
151
+ described_class.new([empty_file_path, something_file_path]).call
152
+ }.to output(/something/).to_stdout
153
+ end
154
+ end
120
155
  end
@@ -161,10 +161,6 @@ describe Rika::Parser do
161
161
  expect(pdf_parse_result.content.lines[1]).to include(quote_first_line)
162
162
  end
163
163
 
164
- it 'returns no content for an image' do
165
- expect(image_parse_result.content).to be_empty
166
- end
167
-
168
164
  it 'only returns max content length from a text file' do
169
165
  expect(Rika.parse(fixture_path('document.txt'), max_content_length: 8).content).to eq('Stopping')
170
166
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rika
3
3
  version: !ruby/object:Gem::Version
4
- version: 2.0.1
4
+ version: 2.0.3
5
5
  platform: java
6
6
  authors:
7
7
  - Richard Nyström
@@ -9,22 +9,28 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2023-09-15 00:00:00.000000000 Z
12
+ date: 2024-08-02 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  requirement: !ruby/object:Gem::Requirement
16
16
  requirements:
17
+ - - "~>"
18
+ - !ruby/object:Gem::Version
19
+ version: '1.9'
17
20
  - - ">="
18
21
  - !ruby/object:Gem::Version
19
- version: '0'
22
+ version: 1.9.2
20
23
  name: awesome_print
21
- prerelease: false
22
24
  type: :runtime
25
+ prerelease: false
23
26
  version_requirements: !ruby/object:Gem::Requirement
24
27
  requirements:
28
+ - - "~>"
29
+ - !ruby/object:Gem::Version
30
+ version: '1.9'
25
31
  - - ">="
26
32
  - !ruby/object:Gem::Version
27
- version: '0'
33
+ version: 1.9.2
28
34
  description: A JRuby wrapper for Apache Tika to extract text and metadata from files
29
35
  of various formats.
30
36
  email:
@@ -60,6 +66,7 @@ files:
60
66
  - spec/fixtures/document.docx
61
67
  - spec/fixtures/document.pdf
62
68
  - spec/fixtures/document.txt
69
+ - spec/fixtures/empty.txt
63
70
  - spec/fixtures/en.txt
64
71
  - spec/fixtures/es.txt
65
72
  - spec/fixtures/fr.txt
@@ -67,6 +74,7 @@ files:
67
74
  - spec/fixtures/image_jpg_without_extension
68
75
  - spec/fixtures/lang_cant_be_determined.txt
69
76
  - spec/fixtures/ru.txt
77
+ - spec/fixtures/something.txt
70
78
  - spec/fixtures/tiny.txt
71
79
  - spec/fixtures/unknown.bin
72
80
  - spec/rika/cli/args_parser_spec.rb