rika 2.0.2-java → 2.0.4-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile +7 -7
- data/README.md +17 -0
- data/RELEASE_NOTES.md +10 -0
- data/lib/rika/cli/args_parser.rb +2 -1
- data/lib/rika/cli/rika_command.rb +8 -0
- data/lib/rika/version.rb +1 -1
- data/lib/rika.rb +2 -1
- data/rika.gemspec +1 -1
- data/spec/fixtures/empty.txt +0 -0
- data/spec/fixtures/something.txt +1 -0
- data/spec/rika/cli/rika_command_spec.rb +17 -0
- data/words.txt +0 -0
- metadata +14 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 1a4cc1c56edb22131c3409bbaee3617febac8d40ba3aed28600cde997d93d465
|
4
|
+
data.tar.gz: 8b38c319ca598ab107762222bc0b097bcf5001aaf53fc1c33e912cff83ff7997
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 1cfea7c20c8c2b294ff6a5d29877cc4670034ac6c58ac299bd5b3ac716d55bc4f1df39843d7c3a33776186ca73d7652747e8853fc914f114e2ecef16e8b26278
|
7
|
+
data.tar.gz: c6c5aeef86b5e9b2cba4b31f395602f94b8c1094975f5543c388175cb619c34c87258ade2043cc0faa47cff7f5bbb87f85e298179faaa4465e96e47647b1bfe0
|
data/Gemfile
CHANGED
@@ -6,11 +6,11 @@ source 'https://rubygems.org'
|
|
6
6
|
gemspec
|
7
7
|
|
8
8
|
group :development do
|
9
|
-
gem 'pry'
|
10
|
-
gem 'rake', '~> 13.
|
11
|
-
gem 'rspec', '~> 3.
|
12
|
-
gem 'rubocop'
|
13
|
-
gem 'rubocop-rspec'
|
14
|
-
gem 'simplecov', require: false
|
15
|
-
gem 'webrick', '~> 1.
|
9
|
+
gem 'pry', '~> 0.14', '>= 0.14.2'
|
10
|
+
gem 'rake', '~> 13.2', '>= 13.2.1'
|
11
|
+
gem 'rspec', '~> 3.13', '>= 3.13.0'
|
12
|
+
gem 'rubocop', '~> 1.65', '>= 1.65.1'
|
13
|
+
gem 'rubocop-rspec', '~> 3.0', '>= 3.0.3'
|
14
|
+
gem 'simplecov', '~>0.22', '>= 0.22.0', require: false
|
15
|
+
gem 'webrick', '~> 1.8', '>= 1.8.1'
|
16
16
|
end
|
data/README.md
CHANGED
@@ -82,6 +82,10 @@ specify one or more filespecs or URL's as arguments:
|
|
82
82
|
```bash
|
83
83
|
rika x.pdf https://github.com/keithrbennett/rika
|
84
84
|
```
|
85
|
+
|
86
|
+
> [!NOTE]
|
87
|
+
> If running `rika` produces an error indicating that the JRuby interpreter cannot be found, try preceding it with `jruby`, e.g. `jruby rika x.pdf`.
|
88
|
+
|
85
89
|
Here is the help text:
|
86
90
|
|
87
91
|
```
|
@@ -245,6 +249,14 @@ rexe -in -oJ -mb 'downcase \
|
|
245
249
|
This gem has been tested with JRuby managed by rvm. It should work with other Ruby version managers and
|
246
250
|
without any version manager at all, but those configurations have not been tested.
|
247
251
|
|
252
|
+
## Using the Tika Java Jar File Directly
|
253
|
+
|
254
|
+
Rika provides only the most common Tika use cases. You may want to dig deeper than Rika does into the massive amount
|
255
|
+
of functionality provided by the Tika library. You can do so by bypassing Rika altogether and using the Tika jar file
|
256
|
+
directly in your own JRuby code. In addition, Tika provides its own command line application that can be called as,
|
257
|
+
for example, `java -jar $TIKA_JAR_FILESPEC --help`. This Tika command line application has finer grained control
|
258
|
+
over some Tika options, but is missing some conveniences provided by the Rika command line application.
|
259
|
+
|
248
260
|
## Other Tika Resources
|
249
261
|
|
250
262
|
* The Apache Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
|
@@ -265,3 +277,8 @@ without any version manager at all, but those configurations have not been teste
|
|
265
277
|
3. Commit your changes (`git commit -am 'Add some feature'`)
|
266
278
|
4. Push to the branch (`git push origin my-new-feature`)
|
267
279
|
5. Create new Pull Request
|
280
|
+
|
281
|
+
## Acknowledgments
|
282
|
+
|
283
|
+
Many thanks to the brilliant and dedicated developers who have worked to build Apache Tika since its inception many years ago.
|
284
|
+
|
data/RELEASE_NOTES.md
CHANGED
@@ -1,5 +1,15 @@
|
|
1
1
|
## Release Notes
|
2
2
|
|
3
|
+
### v2.0.4
|
4
|
+
|
5
|
+
* Fix uninitialized constant StringIO error (issue #16).
|
6
|
+
|
7
|
+
|
8
|
+
### v2.0.3
|
9
|
+
|
10
|
+
* Fix parsing of empty files so they do not halt parsing of all files.
|
11
|
+
* Update gem dependencies.
|
12
|
+
|
3
13
|
#### v2.0.2
|
4
14
|
|
5
15
|
* Now prints source name on line with header and footer lines.
|
data/lib/rika/cli/args_parser.rb
CHANGED
@@ -126,6 +126,7 @@ class ArgsParser
|
|
126
126
|
|
127
127
|
# @return [String] string containing versions of Rika and Tika, with labels
|
128
128
|
private def versions_string
|
129
|
-
|
129
|
+
java_version = Java::java.lang.System.getProperty("java.version")
|
130
|
+
"Versions: Rika: #{Rika::VERSION}, Tika: #{Rika.tika_version}, Java: #{java_version}"
|
130
131
|
end
|
131
132
|
end
|
@@ -5,6 +5,7 @@ require 'optparse'
|
|
5
5
|
require 'rika'
|
6
6
|
require 'rika/formatters'
|
7
7
|
require 'rika/cli/args_parser'
|
8
|
+
require 'stringio'
|
8
9
|
|
9
10
|
# This command line application enables the parsing of documents on the command line.
|
10
11
|
# Syntax is:
|
@@ -32,6 +33,13 @@ class RikaCommand
|
|
32
33
|
puts result_array_output
|
33
34
|
else
|
34
35
|
targets.each do |target|
|
36
|
+
# If we don't do this, Tika will raise an org.apache.tika.exception.ZeroByteFileException
|
37
|
+
# TODO: Do same for URL?
|
38
|
+
if File.file?(target) && File.zero?(target)
|
39
|
+
$stderr.puts("\n\nFile empty!: #{target}\n\n")
|
40
|
+
next
|
41
|
+
end
|
42
|
+
|
35
43
|
result = Rika.parse(target, max_content_length: max_content_length, key_sort: options[:key_sort])
|
36
44
|
puts single_document_output(target, result)
|
37
45
|
end
|
data/lib/rika/version.rb
CHANGED
data/lib/rika.rb
CHANGED
@@ -40,8 +40,9 @@ module Rika
|
|
40
40
|
# @param [Integer] max_content_length maximum content length to return, defaults to all
|
41
41
|
# @param [Detector] detector Tika detector, defaults to DefaultDetector
|
42
42
|
# @return [ParseResult]
|
43
|
-
def self.parse(data_source, key_sort: true, max_content_length: -1, detector:
|
43
|
+
def self.parse(data_source, key_sort: true, max_content_length: -1, detector: nil)
|
44
44
|
init
|
45
|
+
detector ||= DefaultDetector.new
|
45
46
|
parser = Parser.new(data_source, key_sort: key_sort, max_content_length: max_content_length, detector: detector)
|
46
47
|
parser.parse
|
47
48
|
end
|
data/rika.gemspec
CHANGED
@@ -17,7 +17,7 @@ Gem::Specification.new do |gem|
|
|
17
17
|
gem.files = `git ls-files`.split($INPUT_RECORD_SEPARATOR)
|
18
18
|
gem.executables = gem.files.grep(%r{^bin/}).map { |f| File.basename(f) }
|
19
19
|
gem.require_paths = ['lib']
|
20
|
-
gem.add_dependency 'awesome_print'
|
20
|
+
gem.add_dependency 'awesome_print', '~> 1.9', '>= 1.9.2'
|
21
21
|
gem.platform = 'java'
|
22
22
|
gem.license = 'MIT'
|
23
23
|
gem.metadata['rubygems_mfa_required'] = 'true'
|
File without changes
|
@@ -0,0 +1 @@
|
|
1
|
+
something
|
@@ -135,4 +135,21 @@ describe RikaCommand do
|
|
135
135
|
expect(line).to match("Source: #{sample_filespec}")
|
136
136
|
end
|
137
137
|
end
|
138
|
+
|
139
|
+
describe 'empty file behavior' do
|
140
|
+
let(:empty_file_path) { fixture_path('empty.txt') }
|
141
|
+
let(:something_file_path) { fixture_path('something.txt') } # contains "something"
|
142
|
+
|
143
|
+
specify 'parsing an empty file outputs a message to stderr' do
|
144
|
+
expect {
|
145
|
+
described_class.new([empty_file_path]).call
|
146
|
+
}.to output("\n\nFile empty!: #{empty_file_path}\n\n").to_stderr
|
147
|
+
end
|
148
|
+
|
149
|
+
specify 'parsing an empty file does not interrupt parsing of subsequent files' do
|
150
|
+
expect {
|
151
|
+
described_class.new([empty_file_path, something_file_path]).call
|
152
|
+
}.to output(/something/).to_stdout
|
153
|
+
end
|
154
|
+
end
|
138
155
|
end
|
data/words.txt
ADDED
File without changes
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: rika
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 2.0.
|
4
|
+
version: 2.0.4
|
5
5
|
platform: java
|
6
6
|
authors:
|
7
7
|
- Richard Nyström
|
@@ -9,22 +9,28 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date:
|
12
|
+
date: 2025-02-01 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.9'
|
17
20
|
- - ">="
|
18
21
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
22
|
+
version: 1.9.2
|
20
23
|
name: awesome_print
|
21
24
|
prerelease: false
|
22
25
|
type: :runtime
|
23
26
|
version_requirements: !ruby/object:Gem::Requirement
|
24
27
|
requirements:
|
28
|
+
- - "~>"
|
29
|
+
- !ruby/object:Gem::Version
|
30
|
+
version: '1.9'
|
25
31
|
- - ">="
|
26
32
|
- !ruby/object:Gem::Version
|
27
|
-
version:
|
33
|
+
version: 1.9.2
|
28
34
|
description: A JRuby wrapper for Apache Tika to extract text and metadata from files
|
29
35
|
of various formats.
|
30
36
|
email:
|
@@ -60,6 +66,7 @@ files:
|
|
60
66
|
- spec/fixtures/document.docx
|
61
67
|
- spec/fixtures/document.pdf
|
62
68
|
- spec/fixtures/document.txt
|
69
|
+
- spec/fixtures/empty.txt
|
63
70
|
- spec/fixtures/en.txt
|
64
71
|
- spec/fixtures/es.txt
|
65
72
|
- spec/fixtures/fr.txt
|
@@ -67,6 +74,7 @@ files:
|
|
67
74
|
- spec/fixtures/image_jpg_without_extension
|
68
75
|
- spec/fixtures/lang_cant_be_determined.txt
|
69
76
|
- spec/fixtures/ru.txt
|
77
|
+
- spec/fixtures/something.txt
|
70
78
|
- spec/fixtures/tiny.txt
|
71
79
|
- spec/fixtures/unknown.bin
|
72
80
|
- spec/rika/cli/args_parser_spec.rb
|
@@ -77,6 +85,7 @@ files:
|
|
77
85
|
- spec/rika/rika_spec.rb
|
78
86
|
- spec/rika/tika_loader_spec.rb
|
79
87
|
- spec/spec_helper.rb
|
88
|
+
- words.txt
|
80
89
|
homepage: https://github.com/keithrbennett/rika
|
81
90
|
licenses:
|
82
91
|
- MIT
|
@@ -103,7 +112,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
103
112
|
- !ruby/object:Gem::Version
|
104
113
|
version: '0'
|
105
114
|
requirements: []
|
106
|
-
rubygems_version: 3.
|
115
|
+
rubygems_version: 3.1.6
|
107
116
|
signing_key:
|
108
117
|
specification_version: 4
|
109
118
|
summary: A JRuby wrapper for Apache Tika to extract text and metadata from files of
|