rika 2.0.1-java → 2.0.3-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +7 -7
- data/README.md +19 -2
- data/RELEASE_NOTES.md +10 -0
- data/lib/rika/cli/args_parser.rb +1 -1
- data/lib/rika/cli/rika_command.rb +22 -1
- data/lib/rika/version.rb +1 -1
- data/rika.gemspec +1 -1
- data/spec/fixtures/empty.txt +0 -0
- data/spec/fixtures/something.txt +1 -0
- data/spec/rika/cli/rika_command_spec.rb +35 -0
- data/spec/rika/parser_spec.rb +0 -4
- metadata +13 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d1ff6f1340908e2f573ac758ded769bf4a94db0835e06a436addf6dd9408b7ba
|
4
|
+
data.tar.gz: 6e99972c0cf72ab44b85e11847ed6685101c8498f2cd8c1a9a51237b2cc7ede2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eaf3905403a4a0f66ace5a9658f1aa0ba77f153d67e588f094fbbe426b7cd4c30640c3f21652b8312493e4ad827719d6ff1d83a613b6ae0ff9a73164b179fd4e
|
7
|
+
data.tar.gz: dda02d0f0377f54db2fe59535c4b59b073c5fdcae3f1f8c6e66506427e34c248b923c2b01683a6ea811ca4a1ed45305b96739611a119d7dd244895e28fe7dac4
|
data/Gemfile
CHANGED
@@ -6,11 +6,11 @@ source 'https://rubygems.org'
|
|
6
6
|
gemspec
|
7
7
|
|
8
8
|
group :development do
|
9
|
-
gem 'pry'
|
10
|
-
gem 'rake', '~> 13.
|
11
|
-
gem 'rspec', '~> 3.
|
12
|
-
gem 'rubocop'
|
13
|
-
gem 'rubocop-rspec'
|
14
|
-
gem 'simplecov', require: false
|
15
|
-
gem 'webrick', '~> 1.
|
9
|
+
gem 'pry', '~> 0.14', '>= 0.14.2'
|
10
|
+
gem 'rake', '~> 13.2', '>= 13.2.1'
|
11
|
+
gem 'rspec', '~> 3.13', '>= 3.13.0'
|
12
|
+
gem 'rubocop', '~> 1.65', '>= 1.65.1'
|
13
|
+
gem 'rubocop-rspec', '~> 3.0', '>= 3.0.3'
|
14
|
+
gem 'simplecov', '~>0.22', '>= 0.22.0', require: false
|
15
|
+
gem 'webrick', '~> 1.8', '>= 1.8.1'
|
16
16
|
end
|
data/README.md
CHANGED
@@ -82,10 +82,14 @@ specify one or more filespecs or URL's as arguments:
|
|
82
82
|
```bash
|
83
83
|
rika x.pdf https://github.com/keithrbennett/rika
|
84
84
|
```
|
85
|
+
|
86
|
+
> [!NOTE]
|
87
|
+
> If running `rika` produces an error indicating that the JRuby interpreter cannot be found, try preceding it with `jruby`, e.g. `jruby rika x.pdf`.
|
88
|
+
|
85
89
|
Here is the help text:
|
86
90
|
|
87
91
|
```
|
88
|
-
Rika v2.0.
|
92
|
+
Rika v2.0.2 (Tika v2.9.0) - https://github.com/keithrbennett/rika
|
89
93
|
|
90
94
|
Usage: rika [options] <file or url> [...file or url...]
|
91
95
|
Output formats are: [a]wesome_print, [t]o_s, [i]nspect, [j]son), [J] for pretty json, and [y]aml.
|
@@ -98,7 +102,7 @@ Values for the text, metadata, and as_array boolean options may be specified as
|
|
98
102
|
-m, --[no-]metadata [FLAG] Output metadata (default: true)
|
99
103
|
-t, --[no-]text [FLAG] Output text (default: true)
|
100
104
|
-k, --[no-]key-sort [FLAG] Sort metadata keys case insensitively (default: true)
|
101
|
-
-s, --[no-]source [FLAG] Output document source file or URL
|
105
|
+
-s, --[no-]source [FLAG] Output document source file or URL (default: false)
|
102
106
|
-a, --[no-]as-array [FLAG] Output all parsed results as an array (default: false)
|
103
107
|
-v, --version Output version
|
104
108
|
-h, --help Output help
|
@@ -245,6 +249,14 @@ rexe -in -oJ -mb 'downcase \
|
|
245
249
|
This gem has been tested with JRuby managed by rvm. It should work with other Ruby version managers and
|
246
250
|
without any version manager at all, but those configurations have not been tested.
|
247
251
|
|
252
|
+
## Using the Tika Java Jar File Directly
|
253
|
+
|
254
|
+
Rika provides only the most common Tika use cases. You may want to dig deeper than Rika does into the massive amount
|
255
|
+
of functionality provided by the Tika library. You can do so by bypassing Rika altogether and using the Tika jar file
|
256
|
+
directly in your own JRuby code. In addition, Tika provides its own command line application that can be called as,
|
257
|
+
for example, `java -jar $TIKA_JAR_FILESPEC --help`. This Tika command line application has finer grained control
|
258
|
+
over some Tika options, but is missing some conveniences provided by the Rika command line application.
|
259
|
+
|
248
260
|
## Other Tika Resources
|
249
261
|
|
250
262
|
* The Apache Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
|
@@ -265,3 +277,8 @@ without any version manager at all, but those configurations have not been teste
|
|
265
277
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
266
278
|
4. Push to the branch (`git push origin my-new-feature`)
|
267
279
|
5. Create new Pull Request
|
280
|
+
|
281
|
+
## Acknowledgments
|
282
|
+
|
283
|
+
Many thanks to the brilliant and dedicated developers who have worked to build Apache Tika since its inception many years ago.
|
284
|
+
|
data/RELEASE_NOTES.md
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
## Release Notes
|
2
2
|
|
3
|
+
### v2.0.3
|
4
|
+
|
5
|
+
* Fix parsing of empty files so they do not halt parsing of all files.
|
6
|
+
* Update gem dependencies.
|
7
|
+
|
8
|
+
#### v2.0.2
|
9
|
+
|
10
|
+
* Now prints source name on line with header and footer lines.
|
11
|
+
* Improve help text.
|
12
|
+
|
3
13
|
#### v2.0.1
|
4
14
|
|
5
15
|
* Fix license specification in gemspec, update copyright name and year.
|
data/lib/rika/cli/args_parser.rb
CHANGED
@@ -72,7 +72,7 @@ class ArgsParser
|
|
72
72
|
options[:key_sort] = (v.nil? ? true : v)
|
73
73
|
end
|
74
74
|
|
75
|
-
opts.on('-s', '--[no-]source [FLAG]', TrueClass, 'Output document source file or URL') do |v|
|
75
|
+
opts.on('-s', '--[no-]source [FLAG]', TrueClass, 'Output document source file or URL (default: false)') do |v|
|
76
76
|
options[:source] = (v.nil? ? true : v)
|
77
77
|
end
|
78
78
|
|
@@ -32,6 +32,13 @@ class RikaCommand
|
|
32
32
|
puts result_array_output
|
33
33
|
else
|
34
34
|
targets.each do |target|
|
35
|
+
# If we don't do this, Tika will raise an org.apache.tika.exception.ZeroByteFileException
|
36
|
+
# TODO: Do same for URL?
|
37
|
+
if File.file?(target) && File.zero?(target)
|
38
|
+
$stderr.puts("\n\nFile empty!: #{target}\n\n")
|
39
|
+
next
|
40
|
+
end
|
41
|
+
|
35
42
|
result = Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
|
36
43
|
puts single_document_output(target, result)
|
37
44
|
end
|
@@ -71,6 +78,20 @@ class RikaCommand
|
|
71
78
|
h
|
72
79
|
end
|
73
80
|
|
81
|
+
# Outputs the source file or URL in the form of:
|
82
|
+
# -------------------------------------------------------------------------------
|
83
|
+
# Source: path/to/file.ext
|
84
|
+
# -------------------------------------------------------------------------------
|
85
|
+
# @param [String] source document source identifier
|
86
|
+
# @return multiline string as displayed above
|
87
|
+
private def source_output_string(source)
|
88
|
+
<<~STRING
|
89
|
+
-------------------------------------------------------------------------------
|
90
|
+
Source: #{source}
|
91
|
+
-------------------------------------------------------------------------------
|
92
|
+
STRING
|
93
|
+
end
|
94
|
+
|
74
95
|
# Builds the string representation of the result of parsing a single document
|
75
96
|
# @param [String] target the target document
|
76
97
|
# @param [ParseResult] result the parse result
|
@@ -80,7 +101,7 @@ class RikaCommand
|
|
80
101
|
metadata_formatter.(result_hash(result))
|
81
102
|
else
|
82
103
|
sio = StringIO.new
|
83
|
-
sio <<
|
104
|
+
sio << source_output_string(target) if options[:source]
|
84
105
|
sio << metadata_formatter.(result.metadata) << "\n" if options[:metadata]
|
85
106
|
sio << text_formatter.(result.content) << "\n" if options[:text]
|
86
107
|
sio.string
|
data/lib/rika/version.rb
CHANGED
data/rika.gemspec
CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
|
18
18
|
gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
|
19
19
|
gem.require_paths = ['lib']
|
20
|
-
gem.add_dependency 'awesome_print'
|
20
|
+
gem.add_dependency 'awesome_print', '~> 1.9', '>= 1.9.2'
|
21
21
|
gem.platform = 'java'
|
22
22
|
gem.license = 'MIT'
|
23
23
|
gem.metadata['rubygems_mfa_required'] = 'true'
|
File without changes
|
@@ -0,0 +1 @@
|
|
1
|
+
something
|
@@ -117,4 +117,39 @@ describe RikaCommand do
|
|
117
117
|
expect(output).to include('sample help text')
|
118
118
|
end
|
119
119
|
end
|
120
|
+
|
121
|
+
describe '#source_output_string' do
|
122
|
+
let(:rika_command) { described_class.new([]) }
|
123
|
+
let(:sample_filespec) { 'path/to/file.ext' }
|
124
|
+
let(:sample_output_string) { rika_command.send(:source_output_string, sample_filespec) }
|
125
|
+
let(:sample_output_lines) { sample_output_string.lines.map(&:chomp) }
|
126
|
+
let(:header_trailer_line) { '-' * 79 }
|
127
|
+
|
128
|
+
specify 'it has a header and trailer line' do
|
129
|
+
expect(sample_output_lines[0]).to eq(header_trailer_line)
|
130
|
+
expect(sample_output_lines[2]).to eq(header_trailer_line)
|
131
|
+
end
|
132
|
+
|
133
|
+
specify 'information line is well formed' do
|
134
|
+
line = sample_output_lines[1]
|
135
|
+
expect(line).to match("Source: #{sample_filespec}")
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
describe 'empty file behavior' do
|
140
|
+
let(:empty_file_path) { fixture_path('empty.txt') }
|
141
|
+
let(:something_file_path) { fixture_path('something.txt') } # contains "something"
|
142
|
+
|
143
|
+
specify 'parsing an empty file outputs a message to stderr' do
|
144
|
+
expect {
|
145
|
+
described_class.new([empty_file_path]).call
|
146
|
+
}.to output("\n\nFile empty!: #{empty_file_path}\n\n").to_stderr
|
147
|
+
end
|
148
|
+
|
149
|
+
specify 'parsing an empty file does not interrupt parsing of subsequent files' do
|
150
|
+
expect {
|
151
|
+
described_class.new([empty_file_path, something_file_path]).call
|
152
|
+
}.to output(/something/).to_stdout
|
153
|
+
end
|
154
|
+
end
|
120
155
|
end
|
data/spec/rika/parser_spec.rb
CHANGED
@@ -161,10 +161,6 @@ describe Rika::Parser do
|
|
161
161
|
expect(pdf_parse_result.content.lines[1]).to include(quote_first_line)
|
162
162
|
end
|
163
163
|
|
164
|
-
it 'returns no content for an image' do
|
165
|
-
expect(image_parse_result.content).to be_empty
|
166
|
-
end
|
167
|
-
|
168
164
|
it 'only returns max content length from a text file' do
|
169
165
|
expect(Rika.parse(fixture_path('document.txt'), max_content_length: 8).content).to eq('Stopping')
|
170
166
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.3
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Richard Nyström
|
@@ -9,22 +9,28 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2024-08-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.9'
|
17
20
|
- - ">="
|
18
21
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
22
|
+
version: 1.9.2
|
20
23
|
name: awesome_print
|
21
|
-
prerelease: false
|
22
24
|
type: :runtime
|
25
|
+
prerelease: false
|
23
26
|
version_requirements: !ruby/object:Gem::Requirement
|
24
27
|
requirements:
|
28
|
+
- - "~>"
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.9'
|
25
31
|
- - ">="
|
26
32
|
- !ruby/object:Gem::Version
|
27
|
-
version:
|
33
|
+
version: 1.9.2
|
28
34
|
description: A JRuby wrapper for Apache Tika to extract text and metadata from files
|
29
35
|
of various formats.
|
30
36
|
email:
|
@@ -60,6 +66,7 @@ files:
|
|
60
66
|
- spec/fixtures/document.docx
|
61
67
|
- spec/fixtures/document.pdf
|
62
68
|
- spec/fixtures/document.txt
|
69
|
+
- spec/fixtures/empty.txt
|
63
70
|
- spec/fixtures/en.txt
|
64
71
|
- spec/fixtures/es.txt
|
65
72
|
- spec/fixtures/fr.txt
|
@@ -67,6 +74,7 @@ files:
|
|
67
74
|
- spec/fixtures/image_jpg_without_extension
|
68
75
|
- spec/fixtures/lang_cant_be_determined.txt
|
69
76
|
- spec/fixtures/ru.txt
|
77
|
+
- spec/fixtures/something.txt
|
70
78
|
- spec/fixtures/tiny.txt
|
71
79
|
- spec/fixtures/unknown.bin
|
72
80
|
- spec/rika/cli/args_parser_spec.rb
|