rika 2.0.1-java → 2.0.3-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +7 -7
- data/README.md +19 -2
- data/RELEASE_NOTES.md +10 -0
- data/lib/rika/cli/args_parser.rb +1 -1
- data/lib/rika/cli/rika_command.rb +22 -1
- data/lib/rika/version.rb +1 -1
- data/rika.gemspec +1 -1
- data/spec/fixtures/empty.txt +0 -0
- data/spec/fixtures/something.txt +1 -0
- data/spec/rika/cli/rika_command_spec.rb +35 -0
- data/spec/rika/parser_spec.rb +0 -4
- metadata +13 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d1ff6f1340908e2f573ac758ded769bf4a94db0835e06a436addf6dd9408b7ba
|
4
|
+
data.tar.gz: 6e99972c0cf72ab44b85e11847ed6685101c8498f2cd8c1a9a51237b2cc7ede2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eaf3905403a4a0f66ace5a9658f1aa0ba77f153d67e588f094fbbe426b7cd4c30640c3f21652b8312493e4ad827719d6ff1d83a613b6ae0ff9a73164b179fd4e
|
7
|
+
data.tar.gz: dda02d0f0377f54db2fe59535c4b59b073c5fdcae3f1f8c6e66506427e34c248b923c2b01683a6ea811ca4a1ed45305b96739611a119d7dd244895e28fe7dac4
|
data/Gemfile
CHANGED
@@ -6,11 +6,11 @@ source 'https://rubygems.org'
|
|
6
6
|
gemspec
|
7
7
|
|
8
8
|
group :development do
|
9
|
-
gem 'pry'
|
10
|
-
gem 'rake', '~> 13.
|
11
|
-
gem 'rspec', '~> 3.
|
12
|
-
gem 'rubocop'
|
13
|
-
gem 'rubocop-rspec'
|
14
|
-
gem 'simplecov', require: false
|
15
|
-
gem 'webrick', '~> 1.
|
9
|
+
gem 'pry', '~> 0.14', '>= 0.14.2'
|
10
|
+
gem 'rake', '~> 13.2', '>= 13.2.1'
|
11
|
+
gem 'rspec', '~> 3.13', '>= 3.13.0'
|
12
|
+
gem 'rubocop', '~> 1.65', '>= 1.65.1'
|
13
|
+
gem 'rubocop-rspec', '~> 3.0', '>= 3.0.3'
|
14
|
+
gem 'simplecov', '~>0.22', '>= 0.22.0', require: false
|
15
|
+
gem 'webrick', '~> 1.8', '>= 1.8.1'
|
16
16
|
end
|
data/README.md
CHANGED
@@ -82,10 +82,14 @@ specify one or more filespecs or URL's as arguments:
|
|
82
82
|
```bash
|
83
83
|
rika x.pdf https://github.com/keithrbennett/rika
|
84
84
|
```
|
85
|
+
|
86
|
+
> [!NOTE]
|
87
|
+
> If running `rika` produces an error indicating that the JRuby interpreter cannot be found, try preceding it with `jruby`, e.g. `jruby rika x.pdf`.
|
88
|
+
|
85
89
|
Here is the help text:
|
86
90
|
|
87
91
|
```
|
88
|
-
Rika v2.0.
|
92
|
+
Rika v2.0.2 (Tika v2.9.0) - https://github.com/keithrbennett/rika
|
89
93
|
|
90
94
|
Usage: rika [options] <file or url> [...file or url...]
|
91
95
|
Output formats are: [a]wesome_print, [t]o_s, [i]nspect, [j]son), [J] for pretty json, and [y]aml.
|
@@ -98,7 +102,7 @@ Values for the text, metadata, and as_array boolean options may be specified as
|
|
98
102
|
-m, --[no-]metadata [FLAG] Output metadata (default: true)
|
99
103
|
-t, --[no-]text [FLAG] Output text (default: true)
|
100
104
|
-k, --[no-]key-sort [FLAG] Sort metadata keys case insensitively (default: true)
|
101
|
-
-s, --[no-]source [FLAG] Output document source file or URL
|
105
|
+
-s, --[no-]source [FLAG] Output document source file or URL (default: false)
|
102
106
|
-a, --[no-]as-array [FLAG] Output all parsed results as an array (default: false)
|
103
107
|
-v, --version Output version
|
104
108
|
-h, --help Output help
|
@@ -245,6 +249,14 @@ rexe -in -oJ -mb 'downcase \
|
|
245
249
|
This gem has been tested with JRuby managed by rvm. It should work with other Ruby version managers and
|
246
250
|
without any version manager at all, but those configurations have not been tested.
|
247
251
|
|
252
|
+
## Using the Tika Java Jar File Directly
|
253
|
+
|
254
|
+
Rika provides only the most common Tika use cases. You may want to dig deeper than Rika does into the massive amount
|
255
|
+
of functionality provided by the Tika library. You can do so by bypassing Rika altogether and using the Tika jar file
|
256
|
+
directly in your own JRuby code. In addition, Tika provides its own command line application that can be called as,
|
257
|
+
for example, `java -jar $TIKA_JAR_FILESPEC --help`. This Tika command line application has finer grained control
|
258
|
+
over some Tika options, but is missing some conveniences provided by the Rika command line application.
|
259
|
+
|
248
260
|
## Other Tika Resources
|
249
261
|
|
250
262
|
* The Apache Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
|
@@ -265,3 +277,8 @@ without any version manager at all, but those configurations have not been teste
|
|
265
277
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
266
278
|
4. Push to the branch (`git push origin my-new-feature`)
|
267
279
|
5. Create new Pull Request
|
280
|
+
|
281
|
+
## Acknowledgments
|
282
|
+
|
283
|
+
Many thanks to the brilliant and dedicated developers who have worked to build Apache Tika since its inception many years ago.
|
284
|
+
|
data/RELEASE_NOTES.md
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
## Release Notes
|
2
2
|
|
3
|
+
#### v2.0.3
|
4
|
+
|
5
|
+
* Fix parsing of empty files so they do not halt parsing of all files.
|
6
|
+
* Update gem dependencies.
|
7
|
+
|
8
|
+
#### v2.0.2
|
9
|
+
|
10
|
+
* Now prints source name on line with header and footer lines.
|
11
|
+
* Improve help text.
|
12
|
+
|
3
13
|
#### v2.0.1
|
4
14
|
|
5
15
|
* Fix license specification in gemspec, update copyright name and year.
|
data/lib/rika/cli/args_parser.rb
CHANGED
@@ -72,7 +72,7 @@ class ArgsParser
|
|
72
72
|
options[:key_sort] = (v.nil? ? true : v)
|
73
73
|
end
|
74
74
|
|
75
|
-
opts.on('-s', '--[no-]source [FLAG]', TrueClass, 'Output document source file or URL') do |v|
|
75
|
+
opts.on('-s', '--[no-]source [FLAG]', TrueClass, 'Output document source file or URL (default: false)') do |v|
|
76
76
|
options[:source] = (v.nil? ? true : v)
|
77
77
|
end
|
78
78
|
|
@@ -32,6 +32,13 @@ class RikaCommand
|
|
32
32
|
puts result_array_output
|
33
33
|
else
|
34
34
|
targets.each do |target|
|
35
|
+
# If we don't do this, Tika will raise an org.apache.tika.exception.ZeroByteFileException
|
36
|
+
# TODO: Do same for URL?
|
37
|
+
if File.file?(target) && File.zero?(target)
|
38
|
+
$stderr.puts("\n\nFile empty!: #{target}\n\n")
|
39
|
+
next
|
40
|
+
end
|
41
|
+
|
35
42
|
result = Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
|
36
43
|
puts single_document_output(target, result)
|
37
44
|
end
|
@@ -71,6 +78,20 @@ class RikaCommand
|
|
71
78
|
h
|
72
79
|
end
|
73
80
|
|
81
|
+
# Outputs the source file or URL in the form of:
|
82
|
+
# -------------------------------------------------------------------------------
|
83
|
+
# Source: path/to/file.ext
|
84
|
+
# -------------------------------------------------------------------------------
|
85
|
+
# @param [String] source document source identifier
|
86
|
+
# @return multiline string as displayed above
|
87
|
+
private def source_output_string(source)
|
88
|
+
<<~STRING
|
89
|
+
-------------------------------------------------------------------------------
|
90
|
+
Source: #{source}
|
91
|
+
-------------------------------------------------------------------------------
|
92
|
+
STRING
|
93
|
+
end
|
94
|
+
|
74
95
|
# Builds the string representation of the result of parsing a single document
|
75
96
|
# @param [String] target the target document
|
76
97
|
# @param [ParseResult] result the parse result
|
@@ -80,7 +101,7 @@ class RikaCommand
|
|
80
101
|
metadata_formatter.(result_hash(result))
|
81
102
|
else
|
82
103
|
sio = StringIO.new
|
83
|
-
sio <<
|
104
|
+
sio << source_output_string(target) if options[:source]
|
84
105
|
sio << metadata_formatter.(result.metadata) << "\n" if options[:metadata]
|
85
106
|
sio << text_formatter.(result.content) << "\n" if options[:text]
|
86
107
|
sio.string
|
data/lib/rika/version.rb
CHANGED
data/rika.gemspec
CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
|
18
18
|
gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
|
19
19
|
gem.require_paths = ['lib']
|
20
|
-
gem.add_dependency 'awesome_print'
|
20
|
+
gem.add_dependency 'awesome_print', '~> 1.9', '>= 1.9.2'
|
21
21
|
gem.platform = 'java'
|
22
22
|
gem.license = 'MIT'
|
23
23
|
gem.metadata['rubygems_mfa_required'] = 'true'
|
File without changes
|
@@ -0,0 +1 @@
|
|
1
|
+
something
|
@@ -117,4 +117,39 @@ describe RikaCommand do
|
|
117
117
|
expect(output).to include('sample help text')
|
118
118
|
end
|
119
119
|
end
|
120
|
+
|
121
|
+
describe '#source_output_string' do
|
122
|
+
let(:rika_command) { described_class.new([]) }
|
123
|
+
let(:sample_filespec) { 'path/to/file.ext' }
|
124
|
+
let(:sample_output_string) { rika_command.send(:source_output_string, sample_filespec) }
|
125
|
+
let(:sample_output_lines) { sample_output_string.lines.map(&:chomp) }
|
126
|
+
let(:header_trailer_line) { '-' * 79 }
|
127
|
+
|
128
|
+
specify 'it has a header and trailer line' do
|
129
|
+
expect(sample_output_lines[0]).to eq(header_trailer_line)
|
130
|
+
expect(sample_output_lines[2]).to eq(header_trailer_line)
|
131
|
+
end
|
132
|
+
|
133
|
+
specify 'information line is well formed' do
|
134
|
+
line = sample_output_lines[1]
|
135
|
+
expect(line).to match("Source: #{sample_filespec}")
|
136
|
+
end
|
137
|
+
end
|
138
|
+
|
139
|
+
describe 'empty file behavior' do
|
140
|
+
let(:empty_file_path) { fixture_path('empty.txt') }
|
141
|
+
let(:something_file_path) { fixture_path('something.txt') } # contains "something"
|
142
|
+
|
143
|
+
specify 'parsing an empty file outputs a message to stderr' do
|
144
|
+
expect {
|
145
|
+
described_class.new([empty_file_path]).call
|
146
|
+
}.to output("\n\nFile empty!: #{empty_file_path}\n\n").to_stderr
|
147
|
+
end
|
148
|
+
|
149
|
+
specify 'parsing an empty file does not interrupt parsing of subsequent files' do
|
150
|
+
expect {
|
151
|
+
described_class.new([empty_file_path, something_file_path]).call
|
152
|
+
}.to output(/something/).to_stdout
|
153
|
+
end
|
154
|
+
end
|
120
155
|
end
|
data/spec/rika/parser_spec.rb
CHANGED
@@ -161,10 +161,6 @@ describe Rika::Parser do
|
|
161
161
|
expect(pdf_parse_result.content.lines[1]).to include(quote_first_line)
|
162
162
|
end
|
163
163
|
|
164
|
-
it 'returns no content for an image' do
|
165
|
-
expect(image_parse_result.content).to be_empty
|
166
|
-
end
|
167
|
-
|
168
164
|
it 'only returns max content length from a text file' do
|
169
165
|
expect(Rika.parse(fixture_path('document.txt'), max_content_length: 8).content).to eq('Stopping')
|
170
166
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.3
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Richard Nyström
|
@@ -9,22 +9,28 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2024-08-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.9'
|
17
20
|
- - ">="
|
18
21
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
22
|
+
version: 1.9.2
|
20
23
|
name: awesome_print
|
21
|
-
prerelease: false
|
22
24
|
type: :runtime
|
25
|
+
prerelease: false
|
23
26
|
version_requirements: !ruby/object:Gem::Requirement
|
24
27
|
requirements:
|
28
|
+
- - "~>"
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.9'
|
25
31
|
- - ">="
|
26
32
|
- !ruby/object:Gem::Version
|
27
|
-
version:
|
33
|
+
version: 1.9.2
|
28
34
|
description: A JRuby wrapper for Apache Tika to extract text and metadata from files
|
29
35
|
of various formats.
|
30
36
|
email:
|
@@ -60,6 +66,7 @@ files:
|
|
60
66
|
- spec/fixtures/document.docx
|
61
67
|
- spec/fixtures/document.pdf
|
62
68
|
- spec/fixtures/document.txt
|
69
|
+
- spec/fixtures/empty.txt
|
63
70
|
- spec/fixtures/en.txt
|
64
71
|
- spec/fixtures/es.txt
|
65
72
|
- spec/fixtures/fr.txt
|
@@ -67,6 +74,7 @@ files:
|
|
67
74
|
- spec/fixtures/image_jpg_without_extension
|
68
75
|
- spec/fixtures/lang_cant_be_determined.txt
|
69
76
|
- spec/fixtures/ru.txt
|
77
|
+
- spec/fixtures/something.txt
|
70
78
|
- spec/fixtures/tiny.txt
|
71
79
|
- spec/fixtures/unknown.bin
|
72
80
|
- spec/rika/cli/args_parser_spec.rb
|