rika 2.0.2-java → 2.0.3-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +7 -7
- data/README.md +17 -0
- data/RELEASE_NOTES.md +5 -0
- data/lib/rika/cli/rika_command.rb +7 -0
- data/lib/rika/version.rb +1 -1
- data/rika.gemspec +1 -1
- data/spec/fixtures/empty.txt +0 -0
- data/spec/fixtures/something.txt +1 -0
- data/spec/rika/cli/rika_command_spec.rb +17 -0
- metadata +13 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: d1ff6f1340908e2f573ac758ded769bf4a94db0835e06a436addf6dd9408b7ba
|
4
|
+
data.tar.gz: 6e99972c0cf72ab44b85e11847ed6685101c8498f2cd8c1a9a51237b2cc7ede2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: eaf3905403a4a0f66ace5a9658f1aa0ba77f153d67e588f094fbbe426b7cd4c30640c3f21652b8312493e4ad827719d6ff1d83a613b6ae0ff9a73164b179fd4e
|
7
|
+
data.tar.gz: dda02d0f0377f54db2fe59535c4b59b073c5fdcae3f1f8c6e66506427e34c248b923c2b01683a6ea811ca4a1ed45305b96739611a119d7dd244895e28fe7dac4
|
data/Gemfile
CHANGED
@@ -6,11 +6,11 @@ source 'https://rubygems.org'
|
|
6
6
|
gemspec
|
7
7
|
|
8
8
|
group :development do
|
9
|
-
gem 'pry'
|
10
|
-
gem 'rake', '~> 13.
|
11
|
-
gem 'rspec', '~> 3.
|
12
|
-
gem 'rubocop'
|
13
|
-
gem 'rubocop-rspec'
|
14
|
-
gem 'simplecov', require: false
|
15
|
-
gem 'webrick', '~> 1.
|
9
|
+
gem 'pry', '~> 0.14', '>= 0.14.2'
|
10
|
+
gem 'rake', '~> 13.2', '>= 13.2.1'
|
11
|
+
gem 'rspec', '~> 3.13', '>= 3.13.0'
|
12
|
+
gem 'rubocop', '~> 1.65', '>= 1.65.1'
|
13
|
+
gem 'rubocop-rspec', '~> 3.0', '>= 3.0.3'
|
14
|
+
gem 'simplecov', '~>0.22', '>= 0.22.0', require: false
|
15
|
+
gem 'webrick', '~> 1.8', '>= 1.8.1'
|
16
16
|
end
|
data/README.md
CHANGED
@@ -82,6 +82,10 @@ specify one or more filespecs or URL's as arguments:
|
|
82
82
|
```bash
|
83
83
|
rika x.pdf https://github.com/keithrbennett/rika
|
84
84
|
```
|
85
|
+
|
86
|
+
> [!NOTE]
|
87
|
+
> If running `rika` produces an error indicating that the JRuby interpreter cannot be found, try preceding it with `jruby`, e.g. `jruby rika x.pdf`.
|
88
|
+
|
85
89
|
Here is the help text:
|
86
90
|
|
87
91
|
```
|
@@ -245,6 +249,14 @@ rexe -in -oJ -mb 'downcase \
|
|
245
249
|
This gem has been tested with JRuby managed by rvm. It should work with other Ruby version managers and
|
246
250
|
without any version manager at all, but those configurations have not been tested.
|
247
251
|
|
252
|
+
## Using the Tika Java Jar File Directly
|
253
|
+
|
254
|
+
Rika provides only the most common Tika use cases. You may want to dig deeper than Rika does into the massive amount
|
255
|
+
of functionality provided by the Tika library. You can do so by bypassing Rika altogether and using the Tika jar file
|
256
|
+
directly in your own JRuby code. In addition, Tika provides its own command line application that can be called as,
|
257
|
+
for example, `java -jar $TIKA_JAR_FILESPEC --help`. This Tika command line application has finer-grained control
|
258
|
+
over some Tika options, but is missing some conveniences provided by the Rika command line application.
|
259
|
+
|
248
260
|
## Other Tika Resources
|
249
261
|
|
250
262
|
* The Apache Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
|
@@ -265,3 +277,8 @@ without any version manager at all, but those configurations have not been teste
|
|
265
277
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
266
278
|
4. Push to the branch (`git push origin my-new-feature`)
|
267
279
|
5. Create new Pull Request
|
280
|
+
|
281
|
+
## Acknowledgments
|
282
|
+
|
283
|
+
Many thanks to the brilliant and dedicated developers who have worked to build Apache Tika since its inception many years ago.
|
284
|
+
|
data/RELEASE_NOTES.md
CHANGED
@@ -32,6 +32,13 @@ class RikaCommand
|
|
32
32
|
puts result_array_output
|
33
33
|
else
|
34
34
|
targets.each do |target|
|
35
|
+
# If we don't do this, Tika will raise an org.apache.tika.exception.ZeroByteFileException
|
36
|
+
# TODO: Do same for URL?
|
37
|
+
if File.file?(target) && File.zero?(target)
|
38
|
+
$stderr.puts("\n\nFile empty!: #{target}\n\n")
|
39
|
+
next
|
40
|
+
end
|
41
|
+
|
35
42
|
result = Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
|
36
43
|
puts single_document_output(target, result)
|
37
44
|
end
|
data/lib/rika/version.rb
CHANGED
data/rika.gemspec
CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
|
18
18
|
gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
|
19
19
|
gem.require_paths = ['lib']
|
20
|
-
gem.add_dependency 'awesome_print'
|
20
|
+
gem.add_dependency 'awesome_print', '~> 1.9', '>= 1.9.2'
|
21
21
|
gem.platform = 'java'
|
22
22
|
gem.license = 'MIT'
|
23
23
|
gem.metadata['rubygems_mfa_required'] = 'true'
|
File without changes
|
@@ -0,0 +1 @@
|
|
1
|
+
something
|
@@ -135,4 +135,21 @@ describe RikaCommand do
|
|
135
135
|
expect(line).to match("Source: #{sample_filespec}")
|
136
136
|
end
|
137
137
|
end
|
138
|
+
|
139
|
+
describe 'empty file behavior' do
|
140
|
+
let(:empty_file_path) { fixture_path('empty.txt') }
|
141
|
+
let(:something_file_path) { fixture_path('something.txt') } # contains "something"
|
142
|
+
|
143
|
+
specify 'parsing an empty file outputs a message to stderr' do
|
144
|
+
expect {
|
145
|
+
described_class.new([empty_file_path]).call
|
146
|
+
}.to output("\n\nFile empty!: #{empty_file_path}\n\n").to_stderr
|
147
|
+
end
|
148
|
+
|
149
|
+
specify 'parsing an empty file does not interrupt parsing of subsequent files' do
|
150
|
+
expect {
|
151
|
+
described_class.new([empty_file_path, something_file_path]).call
|
152
|
+
}.to output(/something/).to_stdout
|
153
|
+
end
|
154
|
+
end
|
138
155
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.3
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Richard Nyström
|
@@ -9,22 +9,28 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2024-08-02 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.9'
|
17
20
|
- - ">="
|
18
21
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
22
|
+
version: 1.9.2
|
20
23
|
name: awesome_print
|
21
|
-
prerelease: false
|
22
24
|
type: :runtime
|
25
|
+
prerelease: false
|
23
26
|
version_requirements: !ruby/object:Gem::Requirement
|
24
27
|
requirements:
|
28
|
+
- - "~>"
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.9'
|
25
31
|
- - ">="
|
26
32
|
- !ruby/object:Gem::Version
|
27
|
-
version:
|
33
|
+
version: 1.9.2
|
28
34
|
description: A JRuby wrapper for Apache Tika to extract text and metadata from files
|
29
35
|
of various formats.
|
30
36
|
email:
|
@@ -60,6 +66,7 @@ files:
|
|
60
66
|
- spec/fixtures/document.docx
|
61
67
|
- spec/fixtures/document.pdf
|
62
68
|
- spec/fixtures/document.txt
|
69
|
+
- spec/fixtures/empty.txt
|
63
70
|
- spec/fixtures/en.txt
|
64
71
|
- spec/fixtures/es.txt
|
65
72
|
- spec/fixtures/fr.txt
|
@@ -67,6 +74,7 @@ files:
|
|
67
74
|
- spec/fixtures/image_jpg_without_extension
|
68
75
|
- spec/fixtures/lang_cant_be_determined.txt
|
69
76
|
- spec/fixtures/ru.txt
|
77
|
+
- spec/fixtures/something.txt
|
70
78
|
- spec/fixtures/tiny.txt
|
71
79
|
- spec/fixtures/unknown.bin
|
72
80
|
- spec/rika/cli/args_parser_spec.rb
|