rika 2.0.2-java → 2.0.4-java
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile +7 -7
- data/README.md +17 -0
- data/RELEASE_NOTES.md +10 -0
- data/lib/rika/cli/args_parser.rb +2 -1
- data/lib/rika/cli/rika_command.rb +8 -0
- data/lib/rika/version.rb +1 -1
- data/lib/rika.rb +2 -1
- data/rika.gemspec +1 -1
- data/spec/fixtures/empty.txt +0 -0
- data/spec/fixtures/something.txt +1 -0
- data/spec/rika/cli/rika_command_spec.rb +17 -0
- data/words.txt +0 -0
- metadata +14 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1a4cc1c56edb22131c3409bbaee3617febac8d40ba3aed28600cde997d93d465
|
4
|
+
data.tar.gz: 8b38c319ca598ab107762222bc0b097bcf5001aaf53fc1c33e912cff83ff7997
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1cfea7c20c8c2b294ff6a5d29877cc4670034ac6c58ac299bd5b3ac716d55bc4f1df39843d7c3a33776186ca73d7652747e8853fc914f114e2ecef16e8b26278
|
7
|
+
data.tar.gz: c6c5aeef86b5e9b2cba4b31f395602f94b8c1094975f5543c388175cb619c34c87258ade2043cc0faa47cff7f5bbb87f85e298179faaa4465e96e47647b1bfe0
|
data/Gemfile
CHANGED
@@ -6,11 +6,11 @@ source 'https://rubygems.org'
|
|
6
6
|
gemspec
|
7
7
|
|
8
8
|
group :development do
|
9
|
-
gem 'pry'
|
10
|
-
gem 'rake', '~> 13.
|
11
|
-
gem 'rspec', '~> 3.
|
12
|
-
gem 'rubocop'
|
13
|
-
gem 'rubocop-rspec'
|
14
|
-
gem 'simplecov', require: false
|
15
|
-
gem 'webrick', '~> 1.
|
9
|
+
gem 'pry', '~> 0.14', '>= 0.14.2'
|
10
|
+
gem 'rake', '~> 13.2', '>= 13.2.1'
|
11
|
+
gem 'rspec', '~> 3.13', '>= 3.13.0'
|
12
|
+
gem 'rubocop', '~> 1.65', '>= 1.65.1'
|
13
|
+
gem 'rubocop-rspec', '~> 3.0', '>= 3.0.3'
|
14
|
+
gem 'simplecov', '~>0.22', '>= 0.22.0', require: false
|
15
|
+
gem 'webrick', '~> 1.8', '>= 1.8.1'
|
16
16
|
end
|
data/README.md
CHANGED
@@ -82,6 +82,10 @@ specify one or more filespecs or URL's as arguments:
|
|
82
82
|
```bash
|
83
83
|
rika x.pdf https://github.com/keithrbennett/rika
|
84
84
|
```
|
85
|
+
|
86
|
+
> [!NOTE]
|
87
|
+
> If running `rika` produces an error indicating that the JRuby interpreter cannot be found, try preceding it with `jruby`, e.g. `jruby rika x.pdf`.
|
88
|
+
|
85
89
|
Here is the help text:
|
86
90
|
|
87
91
|
```
|
@@ -245,6 +249,14 @@ rexe -in -oJ -mb 'downcase \
|
|
245
249
|
This gem has been tested with JRuby managed by rvm. It should work with other Ruby version managers and
|
246
250
|
without any version manager at all, but those configurations have not been tested.
|
247
251
|
|
252
|
+
## Using the Tika Java Jar File Directly
|
253
|
+
|
254
|
+
Rika provides only the most common Tika use cases. You may want to dig deeper than Rika does into the massive amount
|
255
|
+
of functionality provided by the Tika library. You can do so by bypassing Rika altogether and using the Tika jar file
|
256
|
+
directly in your own JRuby code. In addition, Tika provides its own command line application that can be called as,
|
257
|
+
for example, `java -jar $TIKA_JAR_FILESPEC --help`. This Tika command line application has finer grained control
|
258
|
+
over some Tika options, but is missing some conveniences provided by the Rika command line application.
|
259
|
+
|
248
260
|
## Other Tika Resources
|
249
261
|
|
250
262
|
* The Apache Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
|
@@ -265,3 +277,8 @@ without any version manager at all, but those configurations have not been teste
|
|
265
277
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
266
278
|
4. Push to the branch (`git push origin my-new-feature`)
|
267
279
|
5. Create new Pull Request
|
280
|
+
|
281
|
+
## Acknowledgments
|
282
|
+
|
283
|
+
Many thanks to the brilliant and dedicated developers who have worked to build Apache Tika since its inception many years ago.
|
284
|
+
|
data/RELEASE_NOTES.md
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
## Release Notes
|
2
2
|
|
3
|
+
### v2.0.4
|
4
|
+
|
5
|
+
* Fix uninitialized constant StringIO error (issue #16).
|
6
|
+
|
7
|
+
|
8
|
+
### v2.0.3
|
9
|
+
|
10
|
+
* Fix parsing of empty files so they do not halt parsing of all files.
|
11
|
+
* Update gem dependencies.
|
12
|
+
|
3
13
|
#### v2.0.2
|
4
14
|
|
5
15
|
* Now prints source name on line with header and footer lines.
|
data/lib/rika/cli/args_parser.rb
CHANGED
@@ -126,6 +126,7 @@ class ArgsParser
|
|
126
126
|
|
127
127
|
# @return [String] string containing versions of Rika and Tika, with labels
|
128
128
|
private def versions_string
|
129
|
-
|
129
|
+
java_version = Java::java.lang.System.getProperty("java.version")
|
130
|
+
"Versions: Rika: #{Rika::VERSION}, Tika: #{Rika.tika_version}, Java: #{java_version}"
|
130
131
|
end
|
131
132
|
end
|
@@ -5,6 +5,7 @@ require 'optparse'
|
|
5
5
|
require 'rika'
|
6
6
|
require 'rika/formatters'
|
7
7
|
require 'rika/cli/args_parser'
|
8
|
+
require 'stringio'
|
8
9
|
|
9
10
|
# This command line application enables the parsing of documents on the command line.
|
10
11
|
# Syntax is:
|
@@ -32,6 +33,13 @@ class RikaCommand
|
|
32
33
|
puts result_array_output
|
33
34
|
else
|
34
35
|
targets.each do |target|
|
36
|
+
# If we don't do this, Tika will raise an org.apache.tika.exception.ZeroByteFileException
|
37
|
+
# TODO: Do same for URL?
|
38
|
+
if File.file?(target) && File.zero?(target)
|
39
|
+
$stderr.puts("\n\nFile empty!: #{target}\n\n")
|
40
|
+
next
|
41
|
+
end
|
42
|
+
|
35
43
|
result = Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
|
36
44
|
puts single_document_output(target, result)
|
37
45
|
end
|
data/lib/rika/version.rb
CHANGED
data/lib/rika.rb
CHANGED
@@ -40,8 +40,9 @@ module Rika
|
|
40
40
|
# @param [Integer] max_content_length maximum content length to return, defaults to all
|
41
41
|
# @param [Detector] detector Tika detector, defaults to DefaultDetector
|
42
42
|
# @return [ParseResult]
|
43
|
-
def self.parse(data_source, key_sort: true, max_content_length: -1, detector:
|
43
|
+
def self.parse(data_source, key_sort: true, max_content_length: -1, detector: nil)
|
44
44
|
init
|
45
|
+
detector ||= DefaultDetector.new
|
45
46
|
parser = Parser.new(data_source, key_sort: key_sort, max_content_length: max_content_length, detector: detector)
|
46
47
|
parser.parse
|
47
48
|
end
|
data/rika.gemspec
CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
|
18
18
|
gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
|
19
19
|
gem.require_paths = ['lib']
|
20
|
-
gem.add_dependency 'awesome_print'
|
20
|
+
gem.add_dependency 'awesome_print', '~> 1.9', '>= 1.9.2'
|
21
21
|
gem.platform = 'java'
|
22
22
|
gem.license = 'MIT'
|
23
23
|
gem.metadata['rubygems_mfa_required'] = 'true'
|
File without changes
|
@@ -0,0 +1 @@
|
|
1
|
+
something
|
@@ -135,4 +135,21 @@ describe RikaCommand do
|
|
135
135
|
expect(line).to match("Source: #{sample_filespec}")
|
136
136
|
end
|
137
137
|
end
|
138
|
+
|
139
|
+
describe 'empty file behavior' do
|
140
|
+
let(:empty_file_path) { fixture_path('empty.txt') }
|
141
|
+
let(:something_file_path) { fixture_path('something.txt') } # contains "something"
|
142
|
+
|
143
|
+
specify 'parsing an empty file outputs a message to stderr' do
|
144
|
+
expect {
|
145
|
+
described_class.new([empty_file_path]).call
|
146
|
+
}.to output("\n\nFile empty!: #{empty_file_path}\n\n").to_stderr
|
147
|
+
end
|
148
|
+
|
149
|
+
specify 'parsing an empty file does not interrupt parsing of subsequent files' do
|
150
|
+
expect {
|
151
|
+
described_class.new([empty_file_path, something_file_path]).call
|
152
|
+
}.to output(/something/).to_stdout
|
153
|
+
end
|
154
|
+
end
|
138
155
|
end
|
data/words.txt
ADDED
File without changes
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.4
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Richard Nyström
|
@@ -9,22 +9,28 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2025-02-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.9'
|
17
20
|
- - ">="
|
18
21
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
22
|
+
version: 1.9.2
|
20
23
|
name: awesome_print
|
21
24
|
prerelease: false
|
22
25
|
type: :runtime
|
23
26
|
version_requirements: !ruby/object:Gem::Requirement
|
24
27
|
requirements:
|
28
|
+
- - "~>"
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.9'
|
25
31
|
- - ">="
|
26
32
|
- !ruby/object:Gem::Version
|
27
|
-
version:
|
33
|
+
version: 1.9.2
|
28
34
|
description: A JRuby wrapper for Apache Tika to extract text and metadata from files
|
29
35
|
of various formats.
|
30
36
|
email:
|
@@ -60,6 +66,7 @@ files:
|
|
60
66
|
- spec/fixtures/document.docx
|
61
67
|
- spec/fixtures/document.pdf
|
62
68
|
- spec/fixtures/document.txt
|
69
|
+
- spec/fixtures/empty.txt
|
63
70
|
- spec/fixtures/en.txt
|
64
71
|
- spec/fixtures/es.txt
|
65
72
|
- spec/fixtures/fr.txt
|
@@ -67,6 +74,7 @@ files:
|
|
67
74
|
- spec/fixtures/image_jpg_without_extension
|
68
75
|
- spec/fixtures/lang_cant_be_determined.txt
|
69
76
|
- spec/fixtures/ru.txt
|
77
|
+
- spec/fixtures/something.txt
|
70
78
|
- spec/fixtures/tiny.txt
|
71
79
|
- spec/fixtures/unknown.bin
|
72
80
|
- spec/rika/cli/args_parser_spec.rb
|
@@ -77,6 +85,7 @@ files:
|
|
77
85
|
- spec/rika/rika_spec.rb
|
78
86
|
- spec/rika/tika_loader_spec.rb
|
79
87
|
- spec/spec_helper.rb
|
88
|
+
- words.txt
|
80
89
|
homepage: https://github.com/keithrbennett/rika
|
81
90
|
licenses:
|
82
91
|
- MIT
|
@@ -103,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
103
112
|
- !ruby/object:Gem::Version
|
104
113
|
version: '0'
|
105
114
|
requirements: []
|
106
|
-
rubygems_version: 3.
|
115
|
+
rubygems_version: 3.1.6
|
107
116
|
signing_key:
|
108
117
|
specification_version: 4
|
109
118
|
summary: A JRuby wrapper for Apache Tika to extract text and metadata from files of
|