rika 1.11.1-java → 2.0.0-java
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.gitignore +5 -4
- data/.rubocop.yml +49 -0
- data/Gemfile +12 -0
- data/README.md +213 -76
- data/RELEASE_NOTES.md +26 -0
- data/Rakefile +4 -7
- data/bin/rika +13 -0
- data/lib/rika/cli/args_parser.rb +131 -0
- data/lib/rika/cli/rika_command.rb +129 -0
- data/lib/rika/formatters.rb +39 -0
- data/lib/rika/parse_result.rb +34 -0
- data/lib/rika/parser.rb +64 -70
- data/lib/rika/tika_loader.rb +65 -0
- data/lib/rika/version.rb +3 -1
- data/lib/rika.rb +98 -27
- data/rika.gemspec +30 -17
- data/rika_helper.rb +14 -22
- data/spec/fixtures/image_jpg_without_extension +0 -0
- data/spec/fixtures/tiny.txt +1 -0
- data/spec/rika/cli/args_parser_spec.rb +117 -0
- data/spec/rika/cli/rika_command_spec.rb +120 -0
- data/spec/rika/formatters_spec.rb +23 -0
- data/spec/rika/parse_result_spec.rb +42 -0
- data/spec/rika/parser_spec.rb +304 -0
- data/spec/rika/rika_spec.rb +10 -0
- data/spec/rika/tika_loader_spec.rb +57 -0
- data/spec/spec_helper.rb +10 -3
- metadata +40 -49
- data/.travis.yml +0 -7
- data/java-lib/tika-app-1.24.1.jar +0 -0
- data/spec/fixtures/text_file_without_extension +0 -23
- data/spec/rika_spec.rb +0 -245
- /data/spec/fixtures/{text_file.txt → document.txt} +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 70dc9cbc6d2da17d3bcff5cdeb3a905fdbcf2e2b9d24c131558566cbfc19eada
|
4
|
+
data.tar.gz: fb81f98476322d2291488b2bd2cb9c89a1544b4ee1f85ce22ab773c30ac4765c
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 780650b18df09662b8b67e7321641c2cf36d78335c5cf1f59f5d81a23cf262dc478958ad7aab306f80deb235e9c9910332086b3ea5231888ea96c26abd2d2505
|
7
|
+
data.tar.gz: afa841ebfe9ebb1bee5ccb0550fedde43edab6e1d6b86eed671537604d64b318c0abc7e41ffee054443f6bfd4a7b3c9423926e137fba4fdd95da66775e35d695
|
data/.gitignore
CHANGED
@@ -1,7 +1,10 @@
|
|
1
1
|
*.gem
|
2
2
|
*.rbc
|
3
|
+
.DS_Store
|
3
4
|
.bundle
|
4
5
|
.config
|
6
|
+
coverage/
|
7
|
+
.idea/
|
5
8
|
.yardoc
|
6
9
|
Gemfile.lock
|
7
10
|
InstalledFiles
|
@@ -10,13 +13,11 @@ coverage
|
|
10
13
|
doc/
|
11
14
|
lib/bundler/man
|
12
15
|
pkg
|
16
|
+
projectFilesBackup/
|
13
17
|
rdoc
|
14
18
|
spec/reports
|
19
|
+
spec/rspec-failed-tests-control-file.txt
|
15
20
|
target/
|
16
21
|
test/tmp
|
17
22
|
test/version_tmp
|
18
23
|
tmp
|
19
|
-
|
20
|
-
.DS_Store
|
21
|
-
projectFilesBackup/
|
22
|
-
.idea/
|
data/.rubocop.yml
ADDED
@@ -0,0 +1,49 @@
|
|
1
|
+
require: rubocop-rspec
|
2
|
+
AllCops:
|
3
|
+
NewCops: enable
|
4
|
+
Include:
|
5
|
+
- '**/*.rb'
|
6
|
+
- '*.gemspec'
|
7
|
+
- '**/Rakefile'
|
8
|
+
- '**/Gemfile'
|
9
|
+
- 'bin/rika'
|
10
|
+
Gemspec/RequiredRubyVersion:
|
11
|
+
Enabled: false
|
12
|
+
Layout/HashAlignment:
|
13
|
+
Enabled: false
|
14
|
+
Metrics/AbcSize:
|
15
|
+
Enabled: false
|
16
|
+
Metrics/BlockLength:
|
17
|
+
Enabled: false
|
18
|
+
Metrics/MethodLength:
|
19
|
+
Enabled: false
|
20
|
+
RSpec/ExampleLength:
|
21
|
+
Enabled: false
|
22
|
+
RSpec/ExpectOutput:
|
23
|
+
Enabled: false
|
24
|
+
RSpec/InstanceVariable:
|
25
|
+
Enabled: false
|
26
|
+
RSpec/MultipleExpectations:
|
27
|
+
Enabled: false
|
28
|
+
RSpec/MultipleMemoizedHelpers:
|
29
|
+
Enabled: false
|
30
|
+
Style/AccessModifierDeclarations:
|
31
|
+
Enabled: false
|
32
|
+
Style/FetchEnvVar:
|
33
|
+
Enabled: false
|
34
|
+
Style/GuardClause:
|
35
|
+
Enabled: false
|
36
|
+
Style/IfUnlessModifier:
|
37
|
+
Enabled: false
|
38
|
+
Style/Lambda:
|
39
|
+
Enabled: false
|
40
|
+
Style/LambdaCall:
|
41
|
+
Enabled: false
|
42
|
+
Style/NumericLiterals:
|
43
|
+
Enabled: false
|
44
|
+
Style/PercentLiteralDelimiters:
|
45
|
+
Enabled: false
|
46
|
+
Style/StderrPuts:
|
47
|
+
Enabled: false
|
48
|
+
Style/TrailingUnderscoreVariable:
|
49
|
+
Enabled: false
|
data/Gemfile
CHANGED
@@ -1,4 +1,16 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
1
3
|
source 'https://rubygems.org'
|
2
4
|
|
3
5
|
# Specify your gem's dependencies in rika.gemspec
|
4
6
|
gemspec
|
7
|
+
|
8
|
+
group :development do
|
9
|
+
gem 'pry'
|
10
|
+
gem 'rake', '~> 13.0'
|
11
|
+
gem 'rspec', '~> 3.9'
|
12
|
+
gem 'rubocop'
|
13
|
+
gem 'rubocop-rspec'
|
14
|
+
gem 'simplecov', require: false
|
15
|
+
gem 'webrick', '~> 1.6'
|
16
|
+
end
|
data/README.md
CHANGED
@@ -1,125 +1,262 @@
|
|
1
|
-
|
2
1
|
# Rika
|
3
2
|
|
4
|
-
Rika is a [JRuby](https://www.jruby.org) wrapper for
|
5
|
-
|
6
|
-
|
3
|
+
[Rika](https://github.com/keithrbennett/rika) is a [JRuby](https://www.jruby.org) wrapper for
|
4
|
+
the [Apache Tika](http://tika.apache.org/) Java library, which extracts text and metadata from files and resources
|
5
|
+
of [many different formats](https://tika.apache.org/2.9.0/formats.html).
|
7
6
|
|
8
|
-
Rika
|
7
|
+
Rika can be used as a library in your Ruby code, or on the command line.
|
9
8
|
|
10
|
-
[
|
11
|
-
|
9
|
+
For class and method level documentation, please use [YARD](https://rubydoc.info/gems/yard).
|
10
|
+
You can `gem install yard`, then run `yard doc` from the project root,
|
11
|
+
and then open the `doc/index.html` file in a browser.
|
12
12
|
|
13
|
-
## Usage
|
14
13
|
|
15
|
-
|
16
|
-
|
17
|
-
```ruby
|
18
|
-
require 'rika'
|
19
|
-
|
20
|
-
content = Rika.parse_content('x.pdf') # string containing all content text
|
21
|
-
metadata = Rika.parse_metadata('x.pdf') # hash containing the document metadata
|
22
|
-
content, metadata = Rika.parse_content_and_metadata('x.pdf') # both of the above
|
23
|
-
```
|
14
|
+
### Requirements
|
24
15
|
|
25
|
-
|
16
|
+
* This gem only works with [JRuby](https://www.jruby.org).
|
17
|
+
* The [Apache Tika](http://tika.apache.org/) jar file must be installed on your system.
|
18
|
+
See the [Installation](#installation) section below for more information.
|
26
19
|
|
27
|
-
|
28
|
-
|
29
|
-
|
20
|
+
Rika currently supports some basic and commonly used functions of Tika.
|
21
|
+
Since it runs on JRuby, the Tika library's Java methods can be called directly from Ruby code
|
22
|
+
for more advanced needs.
|
23
|
+
See the [Other Tika Resources](#other-tika-resources) section of this document for alternatives to
|
24
|
+
Rika that may suit more demanding needs.
|
30
25
|
|
31
|
-
|
26
|
+
Rika can be used either as a gem in your own Ruby project, or on the command line using the provided executable.
|
32
27
|
|
33
|
-
|
34
|
-
require 'rika'
|
28
|
+
## Usage in Your Ruby Code
|
35
29
|
|
36
|
-
|
30
|
+
> [!IMPORTANT]
|
31
|
+
> **It is necessary to call `Rika.init` before using Rika.** This is because the loading of the Tika library
|
32
|
+
has been put in an init method, rather than at load time, so that 'jar file not found or specified' errors
|
33
|
+
do not prevent your application from loading. If you forget to call `Rika.init`, you may see seemingly unrelated
|
34
|
+
error messages.
|
37
35
|
|
38
|
-
|
39
|
-
|
36
|
+
As a convenience, the `Rika.init` method is called automatically when you call the Rika module methods. However,
|
37
|
+
if you access other Rika classes and methods, `init` may not have been called yet, so you should call it yourself.
|
40
38
|
|
41
|
-
|
42
|
-
parser.metadata
|
39
|
+
----
|
43
40
|
|
44
|
-
|
45
|
-
|
41
|
+
The Rika `parse` method returns a `Rika::ParseResult` object that contains the parsed text and
|
42
|
+
various pieces of metadata. The `ParseResult` class's main methods are:
|
46
43
|
|
47
|
-
|
48
|
-
|
49
|
-
|
44
|
+
* `content` - the parsed text
|
45
|
+
* `metadata` - a hash of metadata key/value pairs
|
46
|
+
* `content_type` - the content type of the parsed data, e.g. "text/plain; charset=UTF-8"
|
47
|
+
* `language` - the language of the parsed data, e.g. "en"
|
48
|
+
* `data_source` - the data source, either a filespec or a URL
|
50
49
|
|
51
|
-
|
52
|
-
parser = Rika::Parser.new('http://example.com/x.pdf', 200)
|
53
|
-
parser.content
|
50
|
+
For example:
|
54
51
|
|
55
|
-
|
56
|
-
|
57
|
-
parser.language
|
58
|
-
=> "de"
|
52
|
+
```ruby
|
53
|
+
require 'rika'
|
59
54
|
|
60
|
-
|
61
|
-
|
62
|
-
|
55
|
+
parse_result = Rika.parse('x.pdf') # returns a Rika::ParseResult object
|
56
|
+
parse_result.content # string containing all content text
|
57
|
+
parse_result.text # 'text' is an alias for 'content'
|
58
|
+
parse_result.metadata # hash containing the document metadata
|
59
|
+
parse_result.content_type # e.g. "application/pdf"
|
60
|
+
parse_result.language # e.g. "en"
|
61
|
+
parse_result.data_source # e.g. "x.pdf"
|
63
62
|
```
|
64
63
|
|
65
|
-
|
66
|
-
|
67
|
-
Since Ruby supports the `-r` option to require a library, and the `-e` option to evaluate a string of code, you can easily do simple parsing on the command line, such as:
|
64
|
+
A URL can be used instead of a filespec wherever a data source is specified:
|
68
65
|
|
69
|
-
```
|
70
|
-
|
66
|
+
```ruby
|
67
|
+
parse_result = Rika.parse('https://github.com/keithrbennett/rika')
|
71
68
|
```
|
72
69
|
|
73
|
-
|
70
|
+
The Rika module also has the following methods:
|
74
71
|
|
75
|
-
```
|
76
|
-
|
72
|
+
```ruby
|
73
|
+
Rika.language("magnifique") # => "fr"
|
74
|
+
Rika.tika_version # => "2.9.0"
|
77
75
|
```
|
78
76
|
|
79
|
-
|
77
|
+
## Command Line Executable Usage
|
78
|
+
|
79
|
+
Rika can also be used on the command line using the `rika` executable. For example, the simplest form is to simply
|
80
|
+
specify one or more filespecs or URLs as arguments:
|
80
81
|
|
82
|
+
```bash
|
83
|
+
rika x.pdf https://github.com/keithrbennett/rika
|
81
84
|
```
|
82
|
-
|
83
|
-
```
|
84
|
-
|
85
|
-
Using the [rexe](https://github.com/keithrbennett/rexe) gem, that can be made much more concise:
|
85
|
+
Here is the help text:
|
86
86
|
|
87
87
|
```
|
88
|
-
|
88
|
+
Rika v2.0.0 (Tika v2.9.0) - https://github.com/keithrbennett/rika
|
89
|
+
|
90
|
+
Usage: rika [options] <file or url> [...file or url...]
|
91
|
+
Output formats are: [a]wesome_print, [t]o_s, [i]nspect, [j]son), [J] for pretty json, and [y]aml.
|
92
|
+
If a format contains two letters, the first will be used for metadata, the second for text.
|
93
|
+
Values for the text, metadata, and as_array boolean options may be specified as follows:
|
94
|
+
Enable: +, true, yes, [empty]
|
95
|
+
Disable: -, false, no, [long form option with no- prefix, e.g. --no-metadata]
|
96
|
+
|
97
|
+
-f, --format FORMAT Output format (default: at)
|
98
|
+
-m, --[no-]metadata [FLAG] Output metadata (default: true)
|
99
|
+
-t, --[no-]text [FLAG] Output text (default: true)
|
100
|
+
-k, --[no-]key-sort [FLAG] Sort metadata keys case insensitively (default: true)
|
101
|
+
-s, --[no-]source [FLAG] Document source file or URL
|
102
|
+
-a, --[no-]as-array [FLAG] Output all parsed results as an array (default: false)
|
103
|
+
-v, --version Output version
|
104
|
+
-h, --help Output help
|
105
|
+
```
|
106
|
+
|
107
|
+
### Outputting Only Metadata or Only Parsed Text
|
108
|
+
|
109
|
+
The default setting is to output both metadata and text. To disable either, use the `-m` or `-t` options
|
110
|
+
with a disabling flag, e.g. `-m-`, `-m false`, `-m no`, or `--no-metadata` to disable metadata.
|
111
|
+
|
112
|
+
### Outputting the Document Source Identifier (Filespec or URL)
|
113
|
+
|
114
|
+
There are many times when it is useful to know the source of the document. For example, if you are processing
|
115
|
+
a large number of documents, you may want to know which document a particular piece of output came from.
|
116
|
+
|
117
|
+
The document source identifier is output by default. To disable it, use the `-s` option with a disabling flag, e.g. `-s-`,
|
118
|
+
`-s false`, `-s no`, or `--no-source`.
|
119
|
+
|
120
|
+
### Output Formats
|
121
|
+
|
122
|
+
The `-f` option can be used to specify the output format. The default is `at`, which means that the metadata will be
|
123
|
+
output in awesome_print format, and the text will be output using `to_s`
|
124
|
+
(i.e. without any changes to the parsed string).
|
125
|
+
|
126
|
+
If a single argument to `-f` is specified, it will be used for both metadata and text. If two arguments are specified,
|
127
|
+
the first will be used for metadata and the second for the parsed text.
|
128
|
+
|
129
|
+
### Sorting of Metadata Keys
|
130
|
+
|
131
|
+
By default, metadata keys will be sorted case insensitively. To disable this, use the `-k` option
|
132
|
+
with a disabling flag, i.e. `-k-`, `-k false`, `-k no`, or `--no-key-sort`.
|
133
|
+
|
134
|
+
The case insensitivity is implemented by using `String#downcase`.
|
135
|
+
This may not sort correctly on some non-English systems.
|
136
|
+
|
137
|
+
### Specifying Command Line Options in the RIKA_OPTIONS Environment Variable
|
138
|
+
|
139
|
+
If you find yourself using the same options over and over again, you can put them in the `RIKA_OPTIONS` environment
|
140
|
+
variable. For example, if the default behavior of sorting keys does not work for your language, you can disable it
|
141
|
+
for all invocations of the `rika` command by specifying `-k-` in the RIKA_OPTIONS environment variable.
|
142
|
+
|
143
|
+
### Machine Readable Data Support
|
144
|
+
|
145
|
+
If both metadata and text are output, and the same output format is used for both, and that format is JSON
|
146
|
+
(plain or "pretty") or YAML, then the output per document will be a single JSON or YAML hash representation
|
147
|
+
containing both the metadata and the text (whose keys are "metadata" and "text"). This enables piping
|
148
|
+
the results of multiple documents to a file or to another program that can use it as a data source.
|
149
|
+
In addition, when processing multiple files, this streaming approach will be more efficient
|
150
|
+
than calling Rika separately for each file, since each invocation of the rika command requires starting up
|
151
|
+
a Java Virtual Machine.
|
152
|
+
|
153
|
+
If the `-a` (`--as-array`) option is specified, then the output will be an array of such hashes, one for each file.
|
154
|
+
This enables the output to be used as a data source for programs that can process an array of hashes, e.g. for analysis.
|
155
|
+
|
156
|
+
For example, here is how to use Rika and [rexe](https://github.com/keithrbennett/rexe) to get a tally
|
157
|
+
of content types for a set of documents, sorted by content type:
|
158
|
+
|
159
|
+
```bash
|
160
|
+
$ rika -t- -s- -fy -a spec/fixtures/* | \
|
161
|
+
rexe -iy -oa -mb "map { |r| r['metadata']['Content-Type'] }.tally.sort.to_h"
|
162
|
+
{
|
163
|
+
"application/msword" => 1,
|
164
|
+
"application/octet-stream" => 1,
|
165
|
+
"application/pdf" => 1,
|
166
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document" => 1,
|
167
|
+
"image/jpeg" => 2,
|
168
|
+
"text/plain; charset=ISO-8859-1" => 1,
|
169
|
+
"text/plain; charset=UTF-8" => 6,
|
170
|
+
"text/x-matlab; charset=ISO-8859-1" => 1
|
171
|
+
}
|
172
|
+
```
|
173
|
+
Here is a breakdown of the above command:
|
174
|
+
|
175
|
+
* `rika`
|
176
|
+
* `-t-` suppresses the output of text
|
177
|
+
* `-s-` suppresses the output of the source identifier
|
178
|
+
* `-fy` outputs the data in YAML format.
|
179
|
+
* `-a` option causes the output to be an array of hashes, one for each file
|
180
|
+
* `rexe`
|
181
|
+
* `-iy` indicates that the input is YAML
|
182
|
+
* `-oa` indicates that the output should be done using awesome_print/amazing_print
|
183
|
+
* `-mb` indicates that all input should be ingested as a single string ("b" for "big string", as opposed to streamed)
|
184
|
+
|
185
|
+
* Ruby code passed to `rexe`
|
186
|
+
* `map` is called on the array to extract the content type from each parsed document hash
|
187
|
+
* `tally` is called on the resulting array to get the count of each content type
|
188
|
+
* `sort` is called on the hash to sort it by key (content type) and return an array of 2-element arrays
|
189
|
+
* `to_h` is called on the array of 2-element arrays to convert it back to a hash
|
190
|
+
|
191
|
+
Here is another example that prints out the 5 most common words in all the parsed text, and their counts,
|
192
|
+
as "pretty" JSON:
|
193
|
+
|
194
|
+
```bash
|
195
|
+
$ rika -m- spec/fixtures/* | \
|
196
|
+
rexe -in -oJ -mb 'downcase \
|
197
|
+
.split \
|
198
|
+
.tally \
|
199
|
+
.sort_by { |word, count| [-count, word] }
|
200
|
+
.first(5) \
|
201
|
+
.to_h'
|
202
|
+
|
203
|
+
{
|
204
|
+
"the": 35,
|
205
|
+
"to": 30,
|
206
|
+
"woods": 25,
|
207
|
+
"i": 25,
|
208
|
+
"and": 25
|
209
|
+
}
|
89
210
|
```
|
90
|
-
|
91
|
-
...and changing the `-oj` option gives you access to other output formats such as "Pretty JSON", YAML, and AwesomePrint (a very human readable format).
|
92
|
-
|
93
211
|
|
94
212
|
## Installation
|
95
213
|
|
96
|
-
|
214
|
+
* Install [JRuby](https://www.jruby.org) if you don't already have it. Ruby version managers such as
|
215
|
+
[rvm](https://rvm.io/) and [rbenv](https://github.com/rbenv) can simplify this process.
|
216
|
+
* Download the [Apache Tika](http://tika.apache.org/) jar file from
|
217
|
+
http://tika.apache.org/download.html (look for the "tika-app" jar file).
|
218
|
+
Put it in a place that makes sense for your system, such as `/usr/local/lib`.
|
219
|
+
* Configure the `TIKA_JAR_FILESPEC` environment variable to point to the Tika jar file.
|
220
|
+
For example, if you are using tika-app-2.9.0.jar, and put the jar file in `/opt/jars`,
|
221
|
+
then the setting of the environment variable should look like this:
|
97
222
|
|
98
|
-
|
223
|
+
```bash
|
224
|
+
export TIKA_JAR_FILESPEC=/opt/jars/tika-app-2.9.0.jar
|
225
|
+
```
|
99
226
|
|
100
|
-
|
227
|
+
You can put this in your `.bashrc` or `.zshrc` file to make it persistent.
|
101
228
|
|
102
|
-
|
229
|
+
* Install the gem:
|
103
230
|
|
104
|
-
|
231
|
+
```bash
|
232
|
+
gem install rika
|
233
|
+
```
|
105
234
|
|
106
|
-
|
235
|
+
or, if you're using [bundler](https://bundler.io/), add this to your Gemfile:
|
107
236
|
|
108
|
-
|
109
|
-
|
110
|
-
|
237
|
+
```ruby
|
238
|
+
gem 'rika'
|
239
|
+
```
|
111
240
|
|
112
|
-
|
113
|
-
|
241
|
+
and then run `bundle install`.
|
242
|
+
* Verify that it works by running (as an example) `rika -m https://www.github.com`.
|
243
|
+
You should see key/value pairs representing the metadata of the GitHub home page.
|
114
244
|
|
115
|
-
|
245
|
+
This gem has been tested with JRuby managed by rvm. It should work with other Ruby version managers and
|
246
|
+
without any version manager at all, but those configurations have not been tested.
|
116
247
|
|
117
|
-
|
248
|
+
## Other Tika Resources
|
118
249
|
|
250
|
+
* The Apache Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
|
119
251
|
|
120
|
-
|
252
|
+
* Tika also provides another jar file containing a RESTful server that you can run on the command line.
|
253
|
+
You can download this server jar from http://tika.apache.org/download.html (look for the "tika-server-standard" jar
|
254
|
+
file).
|
255
|
+
See the "Running the Tika Server as a Jar file" section of https://cwiki.apache.org/confluence/display/TIKA/TikaServer
|
256
|
+
for more information.
|
121
257
|
|
122
|
-
|
258
|
+
* @chrismattmann and others have provided a ["tika_python" Python library and CLI](https://github.com/chrismattmann/tika-python)
|
259
|
+
that interfaces with the Tika server.
|
123
260
|
|
124
261
|
## Contributing
|
125
262
|
|
data/RELEASE_NOTES.md
CHANGED
@@ -1,5 +1,31 @@
|
|
1
1
|
## Release Notes
|
2
2
|
|
3
|
+
#### v2.0.0
|
4
|
+
|
5
|
+
* Add features:
|
6
|
+
* command line interface
|
7
|
+
* support for JSON, Pretty JSON, YAML, AwesomePrint, to_s, and inspect output formats
|
8
|
+
* optional array mode (previously only nonarray streaming mode).
|
9
|
+
* more persistent options can be specified in an environment variable, `RIKA_OPTIONS`.
|
10
|
+
* metadata keys can optionally be sorted alphabetically (not all languages though).
|
11
|
+
* properties added by Rika to the metadata: data-source, language
|
12
|
+
* Filespec or URL data source identifier can optionally be output with metadata and text.
|
13
|
+
* Add support for Tika 2.8.0, breaks compatibility with Tika 1.x.
|
14
|
+
* Remove tika-app-1.24.1.jar from code base and gem (but it is still in git history).
|
15
|
+
* Tika jar file is now downloaded by the user and found via environment variable `TIKA_JAR_FILESPEC`.
|
16
|
+
* New class ParseResult created to simplify result access and Parser class.
|
17
|
+
* Add `Rika.tika_version`.
|
18
|
+
* Add `webrick` dependency, needed for current versions of Ruby.
|
19
|
+
* Remove deprecated methods `Parser#available_metadata` and `Parser#metadata_exists?`.
|
20
|
+
* Move `Parser#language` to `Rika.language`.
|
21
|
+
* Remove `Parser#language_is_reasonably_certain?`, no longer supported by Tika.
|
22
|
+
* Remove obsolete `LanguageIdentifier` import. Otherwise updated language detection.
|
23
|
+
* Various refactorings and improvements.
|
24
|
+
* Add SimpleCov test coverage and Rubocop linting tools to project.
|
25
|
+
* Set up RSpec configuration to enable --only-failures and --next-failure options.
|
26
|
+
|
27
|
+
|
28
|
+
|
3
29
|
#### v1.11.1
|
4
30
|
|
5
31
|
* Add Apache-2.0 license to gemspec.
|
data/Rakefile
CHANGED
@@ -1,11 +1,8 @@
|
|
1
|
-
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
require 'bundler/gem_tasks'
|
2
4
|
require 'rspec/core/rake_task'
|
3
5
|
|
4
6
|
RSpec::Core::RakeTask.new(:spec)
|
5
7
|
|
6
|
-
task :
|
7
|
-
|
8
|
-
desc 'Download jars'
|
9
|
-
task :download_jars do
|
10
|
-
system "mvn dependency:copy-dependencies"
|
11
|
-
end
|
8
|
+
task default: :spec
|
data/lib/rika/cli/args_parser.rb
ADDED
@@ -0,0 +1,131 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
# Processes the array of arguments (ARGV by default) and returns the options, targets, and help string.
|
4
|
+
class ArgsParser
|
5
|
+
attr_reader :args, :options, :option_parser
|
6
|
+
private :args, :options, :option_parser
|
7
|
+
|
8
|
+
DEFAULT_OPTIONS =
|
9
|
+
{
|
10
|
+
as_array: false,
|
11
|
+
format: 'at', # AwesomePrint for metadata, to_s for text content
|
12
|
+
metadata: true,
|
13
|
+
text: true,
|
14
|
+
source: true,
|
15
|
+
key_sort: true
|
16
|
+
}.freeze
|
17
|
+
|
18
|
+
# Parses the command line arguments.
|
19
|
+
# Shorthand for ArgsParser.new.call. This call is recommended to protect the caller in case
|
20
|
+
# this functionality is repackaged as a Module or otherwise modified.
|
21
|
+
# @param [Array] args the command line arguments (overridable for testing, etc.)
|
22
|
+
# @return [Array<Hash,String>] [options, targets, help_string],
|
23
|
+
# or exits if help or version requested or no targets specified.
|
24
|
+
def self.call(args = ARGV)
|
25
|
+
new.call(args)
|
26
|
+
end
|
27
|
+
|
28
|
+
# Parses the command line arguments.
|
29
|
+
# @param [Array] args the command line arguments (overridable for testing, etc.)
|
30
|
+
# @return [Array<Hash,String>] [options, targets, help_string],
|
31
|
+
# or exits if help or version requested or no targets specified.
|
32
|
+
def call(args = ARGV)
|
33
|
+
@args = args
|
34
|
+
@options = DEFAULT_OPTIONS.dup
|
35
|
+
prepend_environment_args
|
36
|
+
@option_parser = create_option_parser
|
37
|
+
option_parser.parse!(args)
|
38
|
+
postprocess_format_options
|
39
|
+
targets = create_target_array
|
40
|
+
[options, targets, option_parser.help]
|
41
|
+
end
|
42
|
+
|
43
|
+
# @return [OptionParser]
|
44
|
+
private def create_option_parser
|
45
|
+
OptionParser.new do |opts|
|
46
|
+
opts.banner = <<~BANNER
|
47
|
+
Rika v#{Rika::VERSION} (Tika v#{Rika.tika_version}) - #{Rika::PROJECT_URL}
|
48
|
+
|
49
|
+
Usage: rika [options] <file or url> [...file or url...]
|
50
|
+
Output formats are: [a]wesome_print, [t]o_s, [i]nspect, [j]son), [J] for pretty json, and [y]aml.
|
51
|
+
If a format contains two letters, the first will be used for metadata, the second for text.
|
52
|
+
Values for the text, metadata, and as_array boolean options may be specified as follows:
|
53
|
+
Enable: +, true, yes, [empty]
|
54
|
+
Disable: -, false, no, [long form option with no- prefix, e.g. --no-metadata]
|
55
|
+
|
56
|
+
BANNER
|
57
|
+
|
58
|
+
format_message = 'Output format (default: at)'
|
59
|
+
opts.on('-f', '--format FORMAT', format_message) do |format|
|
60
|
+
options[:format] = format
|
61
|
+
end
|
62
|
+
|
63
|
+
opts.on('-m', '--[no-]metadata [FLAG]', TrueClass, 'Output metadata (default: true)') do |v|
|
64
|
+
options[:metadata] = (v.nil? ? true : v)
|
65
|
+
end
|
66
|
+
|
67
|
+
opts.on('-t', '--[no-]text [FLAG]', TrueClass, 'Output text (default: true)') do |v|
|
68
|
+
options[:text] = (v.nil? ? true : v)
|
69
|
+
end
|
70
|
+
|
71
|
+
opts.on('-k', '--[no-]key-sort [FLAG]', TrueClass, 'Sort metadata keys case insensitively (default: true)') do |v|
|
72
|
+
options[:key_sort] = (v.nil? ? true : v)
|
73
|
+
end
|
74
|
+
|
75
|
+
opts.on('-s', '--[no-]source [FLAG]', TrueClass, 'Document source file or URL') do |v|
|
76
|
+
options[:source] = (v.nil? ? true : v)
|
77
|
+
end
|
78
|
+
|
79
|
+
opts.on('-a', '--[no-]as-array [FLAG]', TrueClass,
|
80
|
+
'Output all parsed results as an array (default: false)') do |v|
|
81
|
+
options[:as_array] = (v.nil? ? true : v)
|
82
|
+
end
|
83
|
+
|
84
|
+
opts.on('-v', '--version', 'Output version') do
|
85
|
+
puts versions_string
|
86
|
+
exit
|
87
|
+
end
|
88
|
+
|
89
|
+
opts.on('-h', '--help', 'Output help') do
|
90
|
+
puts opts
|
91
|
+
exit
|
92
|
+
end
|
93
|
+
end
|
94
|
+
end
|
95
|
+
|
96
|
+
# @return [Array] the targets specified on the command line, possibly expanded by the shell,
|
97
|
+
# and with any directories removed.
|
98
|
+
private def create_target_array
|
99
|
+
targets = args.dup.reject { |arg| File.directory?(arg) }.freeze # reject dirs to handle **/* globbing
|
100
|
+
targets.map(&:freeze)
|
101
|
+
end
|
102
|
+
|
103
|
+
# Fills in the second format option character if absent, and removes any excess characters
|
104
|
+
# @return [String] format options 2-character value, e.g. 'at'
|
105
|
+
private def postprocess_format_options
|
106
|
+
# If only one format letter is specified, use it for both metadata and text.
|
107
|
+
options[:format] *= 2 if options[:format].length == 1
|
108
|
+
|
109
|
+
# Ignore and remove extra characters after the first two format characters.
|
110
|
+
options[:format] = options[:format][0..1]
|
111
|
+
end
|
112
|
+
|
113
|
+
# If the user wants to specify options in an environment variable ("RIKA_OPTIONS"),
|
114
|
+
# then this method will insert those options at the beginning of the `args` array,
|
115
|
+
# where they can be overridden by command line arguments.
|
116
|
+
private def prepend_environment_args
|
117
|
+
env_opt_string = environment_options
|
118
|
+
args_to_prepend = Shellwords.shellsplit(env_opt_string)
|
119
|
+
args.unshift(args_to_prepend).flatten!
|
120
|
+
end
|
121
|
+
|
122
|
+
# @return [String] the value of the RIKA_OPTIONS environment variable if present, else ''.
|
123
|
+
private def environment_options
|
124
|
+
ENV['RIKA_OPTIONS'] || ''
|
125
|
+
end
|
126
|
+
|
127
|
+
# @return [String] string containing versions of Rika and Tika, with labels
|
128
|
+
private def versions_string
|
129
|
+
"Versions: Rika: #{Rika::VERSION}, Tika: #{Rika.tika_version}"
|
130
|
+
end
|
131
|
+
end
|