rika 1.6.0-java → 2.0.0-java

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (84) hide show
  1. checksums.yaml +5 -5
  2. data/.gitignore +6 -4
  3. data/.rubocop.yml +49 -0
  4. data/Gemfile +12 -0
  5. data/README.md +226 -47
  6. data/RELEASE_NOTES.md +43 -0
  7. data/Rakefile +4 -7
  8. data/bin/rika +13 -0
  9. data/lib/rika/cli/args_parser.rb +131 -0
  10. data/lib/rika/cli/rika_command.rb +129 -0
  11. data/lib/rika/formatters.rb +39 -0
  12. data/lib/rika/parse_result.rb +34 -0
  13. data/lib/rika/parser.rb +84 -0
  14. data/lib/rika/tika_loader.rb +65 -0
  15. data/lib/rika/version.rb +3 -1
  16. data/lib/rika.rb +96 -104
  17. data/pom.xml +2 -2
  18. data/rika.gemspec +30 -15
  19. data/rika_helper.rb +30 -0
  20. data/spec/fixtures/de.txt +21 -1
  21. data/spec/fixtures/document.doc +0 -0
  22. data/spec/fixtures/document.docx +0 -0
  23. data/spec/fixtures/document.pdf +0 -0
  24. data/spec/fixtures/document.txt +23 -0
  25. data/spec/fixtures/en.txt +23 -1
  26. data/spec/fixtures/es.txt +21 -1
  27. data/spec/fixtures/fr.txt +23 -1
  28. data/spec/fixtures/image_jpg_without_extension +0 -0
  29. data/spec/fixtures/ru.txt +21 -1
  30. data/spec/fixtures/tiny.txt +1 -0
  31. data/spec/rika/cli/args_parser_spec.rb +117 -0
  32. data/spec/rika/cli/rika_command_spec.rb +120 -0
  33. data/spec/rika/formatters_spec.rb +23 -0
  34. data/spec/rika/parse_result_spec.rb +42 -0
  35. data/spec/rika/parser_spec.rb +304 -0
  36. data/spec/rika/rika_spec.rb +10 -0
  37. data/spec/rika/tika_loader_spec.rb +57 -0
  38. data/spec/spec_helper.rb +13 -5
  39. metadata +54 -98
  40. data/.travis.yml +0 -7
  41. data/spec/fixtures/over_100k_file.txt +0 -1241
  42. data/spec/fixtures/text_file.txt +0 -1
  43. data/spec/fixtures/text_file_without_extension +0 -1
  44. data/spec/rika_spec.rb +0 -202
  45. data/target/dependency/apache-mime4j-core-0.7.2.jar +0 -0
  46. data/target/dependency/apache-mime4j-dom-0.7.2.jar +0 -0
  47. data/target/dependency/asm-debug-all-4.1.jar +0 -0
  48. data/target/dependency/aspectjrt-1.8.0.jar +0 -0
  49. data/target/dependency/bcmail-jdk15-1.45.jar +0 -0
  50. data/target/dependency/bcprov-jdk15-1.45.jar +0 -0
  51. data/target/dependency/boilerpipe-1.1.0.jar +0 -0
  52. data/target/dependency/commons-codec-1.9.jar +0 -0
  53. data/target/dependency/commons-compress-1.8.1.jar +0 -0
  54. data/target/dependency/commons-httpclient-3.1.jar +0 -0
  55. data/target/dependency/commons-logging-1.1.1.jar +0 -0
  56. data/target/dependency/fontbox-1.8.6.jar +0 -0
  57. data/target/dependency/isoparser-1.0.2.jar +0 -0
  58. data/target/dependency/java-libpst-0.8.1.jar +0 -0
  59. data/target/dependency/jcip-annotations-1.0.jar +0 -0
  60. data/target/dependency/jdom-1.0.jar +0 -0
  61. data/target/dependency/jempbox-1.8.6.jar +0 -0
  62. data/target/dependency/jhighlight-1.0.jar +0 -0
  63. data/target/dependency/jmatio-1.0.jar +0 -0
  64. data/target/dependency/juniversalchardet-1.0.3.jar +0 -0
  65. data/target/dependency/metadata-extractor-2.6.2.jar +0 -0
  66. data/target/dependency/netcdf-4.2.20.jar +0 -0
  67. data/target/dependency/pdfbox-1.8.6.jar +0 -0
  68. data/target/dependency/poi-3.11-beta2.jar +0 -0
  69. data/target/dependency/poi-ooxml-3.11-beta2.jar +0 -0
  70. data/target/dependency/poi-ooxml-schemas-3.11-beta2.jar +0 -0
  71. data/target/dependency/poi-scratchpad-3.11-beta2.jar +0 -0
  72. data/target/dependency/rome-1.0.jar +0 -0
  73. data/target/dependency/slf4j-api-1.6.1.jar +0 -0
  74. data/target/dependency/tagsoup-1.2.1.jar +0 -0
  75. data/target/dependency/tika-core-1.6.jar +0 -0
  76. data/target/dependency/tika-parsers-1.6.jar +0 -0
  77. data/target/dependency/unidataCommon-4.2.20.jar +0 -0
  78. data/target/dependency/vorbis-java-core-0.6.jar +0 -0
  79. data/target/dependency/vorbis-java-tika-0.6.jar +0 -0
  80. data/target/dependency/xercesImpl-2.8.1.jar +0 -0
  81. data/target/dependency/xml-apis-1.3.03.jar +0 -0
  82. data/target/dependency/xmlbeans-2.6.0.jar +0 -0
  83. data/target/dependency/xmpcore-5.1.2.jar +0 -0
  84. data/target/dependency/xz-1.5.jar +0 -0
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
- SHA1:
3
- metadata.gz: 077d41fc4797ad2074f4acdd6df554cb7221ef01
4
- data.tar.gz: 7f1a591fe8bad0e68ca8173e1d8d7690b16341db
2
+ SHA256:
3
+ metadata.gz: 70dc9cbc6d2da17d3bcff5cdeb3a905fdbcf2e2b9d24c131558566cbfc19eada
4
+ data.tar.gz: fb81f98476322d2291488b2bd2cb9c89a1544b4ee1f85ce22ab773c30ac4765c
5
5
  SHA512:
6
- metadata.gz: 07346daa2e40986329f5b19e85d12e02b9a6cb802249d9a303aa86022e6b7515f309c3b411338bfe1d82e5bb27e2c4e5de0a6cdb18221e80b8c414c5bb5deaa2
7
- data.tar.gz: c26125c52c0c0411aa8c5dff2f7f0f81e28d0d63f4b241ca6ef47d5ea841c50576c5c40ce47a072c31ec7f16a6312510ab194c6f47499847b323f4bd80c29c68
6
+ metadata.gz: 780650b18df09662b8b67e7321641c2cf36d78335c5cf1f59f5d81a23cf262dc478958ad7aab306f80deb235e9c9910332086b3ea5231888ea96c26abd2d2505
7
+ data.tar.gz: afa841ebfe9ebb1bee5ccb0550fedde43edab6e1d6b86eed671537604d64b318c0abc7e41ffee054443f6bfd4a7b3c9423926e137fba4fdd95da66775e35d695
data/.gitignore CHANGED
@@ -1,7 +1,10 @@
1
1
  *.gem
2
2
  *.rbc
3
+ .DS_Store
3
4
  .bundle
4
5
  .config
6
+ coverage/
7
+ .idea/
5
8
  .yardoc
6
9
  Gemfile.lock
7
10
  InstalledFiles
@@ -10,12 +13,11 @@ coverage
10
13
  doc/
11
14
  lib/bundler/man
12
15
  pkg
16
+ projectFilesBackup/
13
17
  rdoc
14
18
  spec/reports
19
+ spec/rspec-failed-tests-control-file.txt
20
+ target/
15
21
  test/tmp
16
22
  test/version_tmp
17
23
  tmp
18
-
19
- .DS_Store
20
- projectFilesBackup/
21
- .idea/
data/.rubocop.yml ADDED
@@ -0,0 +1,49 @@
1
+ require: rubocop-rspec
2
+ AllCops:
3
+ NewCops: enable
4
+ Include:
5
+ - '**/*.rb'
6
+ - '*.gemspec'
7
+ - '**/Rakefile'
8
+ - '**/Gemfile'
9
+ - 'bin/rika'
10
+ Gemspec/RequiredRubyVersion:
11
+ Enabled: false
12
+ Layout/HashAlignment:
13
+ Enabled: false
14
+ Metrics/AbcSize:
15
+ Enabled: false
16
+ Metrics/BlockLength:
17
+ Enabled: false
18
+ Metrics/MethodLength:
19
+ Enabled: false
20
+ RSpec/ExampleLength:
21
+ Enabled: false
22
+ RSpec/ExpectOutput:
23
+ Enabled: false
24
+ RSpec/InstanceVariable:
25
+ Enabled: false
26
+ RSpec/MultipleExpectations:
27
+ Enabled: false
28
+ RSpec/MultipleMemoizedHelpers:
29
+ Enabled: false
30
+ Style/AccessModifierDeclarations:
31
+ Enabled: false
32
+ Style/FetchEnvVar:
33
+ Enabled: false
34
+ Style/GuardClause:
35
+ Enabled: false
36
+ Style/IfUnlessModifier:
37
+ Enabled: false
38
+ Style/Lambda:
39
+ Enabled: false
40
+ Style/LambdaCall:
41
+ Enabled: false
42
+ Style/NumericLiterals:
43
+ Enabled: false
44
+ Style/PercentLiteralDelimiters:
45
+ Enabled: false
46
+ Style/StderrPuts:
47
+ Enabled: false
48
+ Style/TrailingUnderscoreVariable:
49
+ Enabled: false
data/Gemfile CHANGED
@@ -1,4 +1,16 @@
1
+ # frozen_string_literal: true
2
+
1
3
  source 'https://rubygems.org'
2
4
 
3
5
  # Specify your gem's dependencies in rika.gemspec
4
6
  gemspec
7
+
8
+ group :development do
9
+ gem 'pry'
10
+ gem 'rake', '~> 13.0'
11
+ gem 'rspec', '~> 3.9'
12
+ gem 'rubocop'
13
+ gem 'rubocop-rspec'
14
+ gem 'simplecov', require: false
15
+ gem 'webrick', '~> 1.6'
16
+ end
data/README.md CHANGED
@@ -1,83 +1,262 @@
1
1
  # Rika
2
2
 
3
- A JRuby wrapper for Apache Tika to extract text and metadata from various file formats.
3
+ [Rika](https://github.com/keithrbennett/rika) is a [JRuby](https://www.jruby.org) wrapper for
4
+ the [Apache Tika](http://tika.apache.org/) Java library, which extracts text and metadata from files and resources
5
+ of [many different formats](https://tika.apache.org/1.24.1/formats.html).
4
6
 
5
- More information about Apache Tika can be found here: http://tika.apache.org/
7
+ Rika can be used as a library in your Ruby code, or on the command line.
6
8
 
7
- [![Code Climate](https://codeclimate.com/github/ricn/rika.png)](https://codeclimate.com/github/ricn/rika)
8
- [![Build Status](https://travis-ci.org/ricn/rika.png?branch=master)](https://travis-ci.org/ricn/rika)
9
+ For class and method level documentation, please use [YARD](https://rubydoc.info/gems/yard).
10
+ You can `gem install yard`, then run `yard doc` from the project root,
11
+ and then open the `doc/index.html` file in a browser.
9
12
 
10
- ## Installation
11
13
 
12
- Add this line to your application's Gemfile:
14
+ ### Requirements
15
+
16
+ * This gem only works with [JRuby](https://www.jruby.org).
17
+ * The [Apache Tika](http://tika.apache.org/) jar file must be installed on your system.
18
+ See the [Installation](#installation) section below for more information.
19
+
20
+ Rika currently supports some basic and commonly used functions of Tika.
21
+ Since it runs on JRuby, the Tika library's Java methods can be called directly from Ruby code
22
+ for more advanced needs.
23
+ See the [Other Tika Resources](#other-tika-resources) section of this document for alternatives to
24
+ Rika that may suit more demanding needs.
13
25
 
14
- gem 'rika'
26
+ Rika can be used either as a gem in your own Ruby project, or on the command line using the provided executable.
15
27
 
16
- Remember that this gem only works on JRuby.
28
+ ## Usage in Your Ruby Code
17
29
 
18
- And then execute:
30
+ > [!IMPORTANT]
31
+ > **It is necessary to call `Rika.init` before using Rika.** This is because the loading of the Tika library
32
+ has been put in an init method, rather than at load time, so that 'jar file not found or specified' errors
33
+ do not prevent your application from loading. If you forget to call `Rika.init`, you may see seemingly unrelated
34
+ error messages.
19
35
 
20
- $ bundle
36
+ As a convenience, the `Rika.init` method is called automatically when you call the Rika module methods. However,
37
+ if you access other Rika classes and methods, `init` may not have been called yet, so you should call it yourself.
21
38
 
22
- Or install it yourself as:
39
+ ----
23
40
 
24
- $ gem install rika
41
+ The Rika `parse` method returns a `Rika::ParseResult` object that contains the parsed text and
42
+ various pieces of metadata. The `ParseResult` class' main methods are:
25
43
 
26
- ## Usage
44
+ * `content` - the parsed text
45
+ * `metadata` - a hash of metadata key/value pairs
46
+ * `content_type` - the content type of the parsed data, e.g. "text/plain; charset=UTF-8"
47
+ * `language` - the language of the parsed data, e.g. "en"
48
+ * `data_source` - the data source, either a filespec or a URL
27
49
 
28
- For a quick start with the simplest use cases, the following functions
29
- are provided to get what you need in a single function call, for your convenience:
50
+ For example:
30
51
 
31
52
  ```ruby
32
53
  require 'rika'
33
54
 
34
- content = Rika.parse_content('document.pdf') # string containing all content text
35
- metadata = Rika.parse_metadata('document.pdf') # hash containing the document metadata
36
- content, metadata = Rika.parse_content_and_metadata('document.pdf') # both of the above
55
+ parse_result = Rika.parse('x.pdf') # returns a Rika::ParseResult object
56
+ parse_result.content # string containing all content text
57
+ parse_result.text # 'text' is an alias for 'content'
58
+ parse_result.metadata # hash containing the document metadata
59
+ parse_result.content_type # e.g. "application/pdf"
60
+ parse_result.language # e.g. "en"
61
+ parse_result.data_source # e.g. "x.pdf"
37
62
  ```
38
63
 
39
- For other use cases and finer control, you can work directly with the Rika::Parser object:
64
+ A URL can be used instead of a filespec wherever a data source is specified:
40
65
 
41
66
  ```ruby
42
- require 'rika'
67
+ parse_result = Rika.parse('https://github.com/keithrbennett/rika')
68
+ ```
69
+
70
+ The Rika module also has the following methods:
71
+
72
+ ```ruby
73
+ Rika.language("magnifique") # => "fr"
74
+ Rika.tika_version # => "2.9.0"
75
+ ```
76
+
77
+ ## Command Line Executable Usage
78
+
79
+ Rika can also be used on the command line using the `rika` executable. For example, the simplest form is to simply
80
+ specify one or more filespecs or URLs as arguments:
81
+
82
+ ```bash
83
+ rika x.pdf https://github.com/keithrbennett/rika
84
+ ```
85
+ Here is the help text:
86
+
87
+ ```
88
+ Rika v2.0.0 (Tika v2.9.0) - https://github.com/keithrbennett/rika
89
+
90
+ Usage: rika [options] <file or url> [...file or url...]
91
+ Output formats are: [a]wesome_print, [t]o_s, [i]nspect, [j]son), [J] for pretty json, and [y]aml.
92
+ If a format contains two letters, the first will be used for metadata, the second for text.
93
+ Values for the text, metadata, and as_array boolean options may be specified as follows:
94
+ Enable: +, true, yes, [empty]
95
+ Disable: -, false, no, [long form option with no- prefix, e.g. --no-metadata]
96
+
97
+ -f, --format FORMAT Output format (default: at)
98
+ -m, --[no-]metadata [FLAG] Output metadata (default: true)
99
+ -t, --[no-]text [FLAG] Output text (default: true)
100
+ -k, --[no-]key-sort [FLAG] Sort metadata keys case insensitively (default: true)
101
+ -s, --[no-]source [FLAG] Document source file or URL
102
+ -a, --[no-]as-array [FLAG] Output all parsed results as an array (default: false)
103
+ -v, --version Output version
104
+ -h, --help Output help
105
+ ```
106
+
107
+ ### Outputting Only Metadata or Only Parsed Text
108
+
109
+ The default setting is to output both metadata and text. To disable either, use the `-m` or `-t` options
110
+ with a disabling flag, e.g. `-m-`, `-m false`, `-m no`, or `--no-metadata` to disable metadata.
111
+
112
+ ### Outputting the Document Source Identifier (Filespec or URL)
113
+
114
+ There are many times when it is useful to know the source of the document. For example, if you are processing
115
+ a large number of documents, you may want to know which document a particular piece of output came from.
116
+
117
+ The document source identifier is output by default. To disable it, use the `-s` option with a disabling flag, e.g. `-s-`,
118
+ `-s false`, `-s no`, or `--no-source`.
119
+
120
+ ### Output Formats
43
121
 
44
- parser = Rika::Parser.new('document.pdf')
122
+ The `-f` option can be used to specify the output format. The default is `at`, which means that the metadata will be
123
+ output in awesome_print format, and the text will be output using `to_s`
124
+ (i.e. without any changes to the parsed string).
45
125
 
46
- # Return the content of the document:
47
- parser.content
126
+ If a single argument to `-f` is specified, it will be used for both metadata and text. If two arguments are specified,
127
+ the first will be used for metadata and the second for the parsed text.
48
128
 
49
- # Return the media type for the document:
50
- parser.media_type
51
- => "application/pdf"
129
+ ### Sorting of Metadata Keys
52
130
 
53
- # Return the metadata field title if it exists:
54
- parser.metadata["title"] if parser.metadata_exists?("title")
131
+ By default, metadata keys will be sorted case insensitively. To disable this, use the `-k` option
132
+ with a disabling flag, i.e. `-k-`, `-k false`, `-k no`, or `--no-key-sort`.
55
133
 
56
- # Return all the available metadata keys that can be read from the document
57
- parser.available_metadata
134
+ The case insensitivity is implemented by using `String#downcase`.
135
+ This may not sort correctly on some non-English systems.
58
136
 
59
- # Return only the first 10000 chars of the content:
60
- parser = Rika::Parser.new('document.pdf', 10000)
61
- parser.content # 10000 first chars returned
137
+ ### Specifying Command Line Options in the RIKA_OPTIONS Environment Variable
62
138
 
63
- # Return content from URL
64
- parser = Rika::Parser.new('http://riakhandbook.com/sample.pdf', 200)
65
- parser.content
139
+ If you find yourself using the same options over and over again, you can put them in the `RIKA_OPTIONS` environment
140
+ variable. For example, if the default behavior of sorting keys does not work for your language, you can disable it
141
+ for all invocations of the `rika` command by specifying `-k-` in the RIKA_OPTIONS environment variable.
66
142
 
67
- # Return the language for the content
68
- parser = parser = Rika::Parser.new('german document.pdf')
69
- parser.language
70
- => "de"
143
+ ### Machine Readable Data Support
71
144
 
72
- # Check whether the langugage identification is certain enough to be trusted
73
- parser.language_is_reasonably_certain?
74
-
145
+ If both metadata and text are output, and the same output format is used for both, and that format is JSON
146
+ (plain or "pretty") or YAML, then the output per document will be a single JSON or YAML hash representation
147
+ containing both the metadata and the text (whose keys are "metadata" and "text"). This enables piping
148
+ the results of multiple documents to a file or to another program that can use it as a data source.
149
+ In addition, when processing multiple files, this streaming approach will be more efficient
150
+ than calling Rika separately for each file, since each invocation of the rika command requires starting up
151
+ a Java Virtual Machine.
152
+
153
+ If the `-a` (`--as-array`) option is specified, then the output will be an array of such hashes, one for each file.
154
+ This enables the output to be used as a data source for programs that can process an array of hashes, e.g. for analysis.
155
+
156
+ For example, here is how to use Rika and [rexe](https://github.com/keithrbennett/rexe) to get a tally
157
+ of content types for a set of documents, sorted by content type:
158
+
159
+ ```bash
160
+ $ rika -t- -s- -fy -a spec/fixtures/* | \
161
+ rexe -iy -oa -mb "map { |r| r['metadata']['Content-Type'] }.tally.sort.to_h"
162
+ {
163
+ "application/msword" => 1,
164
+ "application/octet-stream" => 1,
165
+ "application/pdf" => 1,
166
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document" => 1,
167
+ "image/jpeg" => 2,
168
+ "text/plain; charset=ISO-8859-1" => 1,
169
+ "text/plain; charset=UTF-8" => 6,
170
+ "text/x-matlab; charset=ISO-8859-1" => 1
171
+ }
75
172
  ```
173
+ Here is a breakdown of the above command:
174
+
175
+ * `rika`
176
+ * `-t-` suppresses the output of text
177
+ * `-s-` suppresses the output of the source identifier
178
+ * `-fy` outputs the data in YAML format.
179
+ * `-a` option causes the output to be an array of hashes, one for each file
180
+ * `rexe`
181
+ * `-iy` indicates that the input is YAML
182
+ * `-oa` indicates that the output should be done using awesome_print/amazing_print
183
+ * `-mb` indicates that all input should be ingested as a single string ("b" for "big string", as opposed to streamed)
184
+
185
+ * Ruby code passed to `rexe`
186
+ * `map` is called on the array to extract the content type from each parsed document hash
187
+ * `tally` is called on the resulting array to get the count of each content type
188
+ * `sort` is called on the hash to sort it by key (content type) and return an array of 2-element arrays
189
+ * `to_h` is called on the array of 2-element arrays to convert it back to a hash
190
+
191
+ Here is another example that prints out the 5 most common words in all the parsed text, and their counts,
192
+ as "pretty" JSON:
193
+
194
+ ```bash
195
+ $ rika -m- spec/fixtures/* | \
196
+ rexe -in -oJ -mb 'downcase \
197
+ .split \
198
+ .tally \
199
+ .sort_by { |word, count| [-count, word] }
200
+ .first(5) \
201
+ .to_h'
202
+
203
+ {
204
+ "the": 35,
205
+ "to": 30,
206
+ "woods": 25,
207
+ "i": 25,
208
+ "and": 25
209
+ }
210
+ ```
211
+
212
+ ## Installation
213
+
214
+ * Install [JRuby](https://www.jruby.org) if you don't already have it. Ruby version managers such as
215
+ [rvm](https://rvm.io/) and [rbenv](https://github.com/rbenv) can simplify this process.
216
+ * Download the [Apache Tika](http://tika.apache.org/) jar file from
217
+ http://tika.apache.org/download.html (look for the "tika-app" jar file).
218
+ Put it in a place that makes sense for your system, such as `/usr/local/lib`.
219
+ * Configure the `TIKA_JAR_FILESPEC` environment variable to point to the Tika jar file.
220
+ For example, if you are using tika-app-2.9.0.jar, and put the jar file in `/opt/jars`,
221
+ then the setting of the environment variable should look like this:
222
+
223
+ ```bash
224
+ export TIKA_JAR_FILESPEC=/opt/jars/tika-app-2.9.0.jar
225
+ ```
226
+
227
+ You can put this in your `.bashrc` or `.zshrc` file to make it persistent.
228
+
229
+ * Install the gem:
230
+
231
+ ```bash
232
+ gem install rika
233
+ ```
234
+
235
+ or, if you're using [bundler](https://bundler.io/), add this to your Gemfile:
236
+
237
+ ```ruby
238
+ gem 'rika'
239
+ ```
240
+
241
+ and then run `bundle install`.
242
+ * Verify that it works by running (as an example) `rika -m https://www.github.com`.
243
+ You should see key/value pairs representing the metadata of the GitHub home page.
244
+
245
+ This gem has been tested with JRuby managed by rvm. It should work with other Ruby version managers and
246
+ without any version manager at all, but those configurations have not been tested.
247
+
248
+ ## Other Tika Resources
249
+
250
+ * The Apache Tika wiki is at https://cwiki.apache.org/confluence/display/tika.
251
+
252
+ * Tika also provides another jar file containing a RESTful server that you can run on the command line.
253
+ You can download this server jar from http://tika.apache.org/download.html (look for the "tika-server-standard" jar
254
+ file).
255
+ See the "Running the Tika Server as a Jar file" section of https://cwiki.apache.org/confluence/display/TIKA/TikaServer
256
+ for more information.
76
257
 
77
- ## Credits
78
- The following people have contributed ideas, documentation, or code to Rika:
79
- * Keith Bennett
80
- * Richard Nyström
258
+ * @chrismattmann and others have provided a ["tika-python" Python library and CLI](https://github.com/chrismattmann/tika-python)
259
+ that interfaces with the Tika server.
81
260
 
82
261
  ## Contributing
83
262
 
data/RELEASE_NOTES.md ADDED
@@ -0,0 +1,43 @@
1
+ ## Release Notes
2
+
3
+ #### v2.0.0
4
+
5
+ * Add features:
6
+ * command line interface
7
+ * support for JSON, Pretty JSON, YAML, AwesomePrint, to_s, and inspect output formats
8
+ * optional array mode (previously only nonarray streaming mode).
9
+ * more persistent options can be specified in an environment variable, `RIKA_OPTIONS`.
10
+ * metadata keys can optionally be sorted alphabetically (not all languages though).
11
+ * properties added by Rika to the metadata: data-source, language
12
+ * Filespec or URL data source identifier can optionally be output with metadata and text.
13
+ * Add support for Tika 2.8.0, breaks compatibility with Tika 1.x.
14
+ * Remove tika-app-1.24.1.jar from code base and gem (but it is still in git history).
15
+ * Tika jar file is now downloaded by the user and found via environment variable `TIKA_JAR_FILESPEC`.
16
+ * New class ParseResult created to simplify result access and Parser class.
17
+ * Add `Rika.tika_version`.
18
+ * Add `webrick` dependency, needed for current versions of Ruby.
19
+ * Remove deprecated methods `Parser#available_metadata` and `Parser#metadata_exists?`.
20
+ * Move `Parser#language` to `Rika.language`.
21
+ * Remove `Parser#language_is_reasonably_certain?`, no longer supported by Tika.
22
+ * Remove obsolete `LanguageIdentifier` import. Otherwise updated language detection.
23
+ * Various refactorings and improvements.
24
+ * Add SimpleCov test coverage and Rubocop linting tools to project.
25
+ * Set up RSpec configuration to enable --only-failures and --next-failure options.
26
+
27
+
28
+
29
+ #### v1.11.1
30
+
31
+ * Add Apache-2.0 license to gemspec.
32
+
33
+
34
+ #### v1.11.0
35
+
36
+ * Replace 2015 Tika jar files w/2020 tika-app-1.24.1.jar.
37
+ * Handover of maintainer status from @ricn to @keithrbennett.
38
+ * Add rika_helper.rb to provide abbreviated method names for interactive use w/pry, etc.
39
+ * Extract parser class to its own file.
40
+ * Various cleanup and refactoring.
41
+ * Improve README.md documentation.
42
+ * Tested successfully on Java 14.
43
+ * Move Tika jar file from /target/dependency to /java-lib.
data/Rakefile CHANGED
@@ -1,11 +1,8 @@
1
- require "bundler/gem_tasks"
1
+ # frozen_string_literal: true
2
+
3
+ require 'bundler/gem_tasks'
2
4
  require 'rspec/core/rake_task'
3
5
 
4
6
  RSpec::Core::RakeTask.new(:spec)
5
7
 
6
- task :default => :spec
7
-
8
- desc 'Download jars'
9
- task :download_jars do
10
- system "mvn dependency:copy-dependencies"
11
- end
8
+ task default: :spec
data/bin/rika ADDED
@@ -0,0 +1,13 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require 'rika/cli/rika_command'
5
+
6
+ begin
7
+ Rika.init
8
+ rescue Rika::TikaLoadError => e
9
+ $stderr.puts e.message
10
+ exit 1
11
+ end
12
+
13
+ RikaCommand.new.call
@@ -0,0 +1,131 @@
1
+ # frozen_string_literal: true
2
+
3
+ # Processes the array of arguments (ARGV by default) and returns the options, targets, and help string.
4
+ class ArgsParser
5
+ attr_reader :args, :options, :option_parser
6
+ private :args, :options, :option_parser
7
+
8
+ DEFAULT_OPTIONS =
9
+ {
10
+ as_array: false,
11
+ format: 'at', # AwesomePrint for metadata, to_s for text content
12
+ metadata: true,
13
+ text: true,
14
+ source: true,
15
+ key_sort: true
16
+ }.freeze
17
+
18
+ # Parses the command line arguments.
19
+ # Shorthand for ArgsParser.new.call. This call is recommended to pro tect the caller in case
20
+ # this functionality is repackaged as a Module or otherwise modified.
21
+ # @param [Array] args the command line arguments (overridable for testing, etc.)
22
+ # @return [Array<Hash,String>] [options, targets, help_string],
23
+ # or exits if help or version requested or no targets specified.
24
+ def self.call(args = ARGV)
25
+ new.call(args)
26
+ end
27
+
28
+ # Parses the command line arguments.
29
+ # @param [Array] args the command line arguments (overridable for testing, etc.)
30
+ # @return [Array<Hash,String>] [options, targets, help_string],
31
+ # or exits if help or version requested or no targets specified.
32
+ def call(args = ARGV)
33
+ @args = args
34
+ @options = DEFAULT_OPTIONS.dup
35
+ prepend_environment_args
36
+ @option_parser = create_option_parser
37
+ option_parser.parse!(args)
38
+ postprocess_format_options
39
+ targets = create_target_array
40
+ [options, targets, option_parser.help]
41
+ end
42
+
43
+ # @return [OptionParser]
44
+ private def create_option_parser
45
+ OptionParser.new do |opts|
46
+ opts.banner = <<~BANNER
47
+ Rika v#{Rika::VERSION} (Tika v#{Rika.tika_version}) - #{Rika::PROJECT_URL}
48
+
49
+ Usage: rika [options] <file or url> [...file or url...]
50
+ Output formats are: [a]wesome_print, [t]o_s, [i]nspect, [j]son), [J] for pretty json, and [y]aml.
51
+ If a format contains two letters, the first will be used for metadata, the second for text.
52
+ Values for the text, metadata, and as_array boolean options may be specified as follows:
53
+ Enable: +, true, yes, [empty]
54
+ Disable: -, false, no, [long form option with no- prefix, e.g. --no-metadata]
55
+
56
+ BANNER
57
+
58
+ format_message = 'Output format (default: at)'
59
+ opts.on('-f', '--format FORMAT', format_message) do |format|
60
+ options[:format] = format
61
+ end
62
+
63
+ opts.on('-m', '--[no-]metadata [FLAG]', TrueClass, 'Output metadata (default: true)') do |v|
64
+ options[:metadata] = (v.nil? ? true : v)
65
+ end
66
+
67
+ opts.on('-t', '--[no-]text [FLAG]', TrueClass, 'Output text (default: true)') do |v|
68
+ options[:text] = (v.nil? ? true : v)
69
+ end
70
+
71
+ opts.on('-k', '--[no-]key-sort [FLAG]', TrueClass, 'Sort metadata keys case insensitively (default: true)') do |v|
72
+ options[:key_sort] = (v.nil? ? true : v)
73
+ end
74
+
75
+ opts.on('-s', '--[no-]source [FLAG]', TrueClass, 'Document source file or URL') do |v|
76
+ options[:source] = (v.nil? ? true : v)
77
+ end
78
+
79
+ opts.on('-a', '--[no-]as-array [FLAG]', TrueClass,
80
+ 'Output all parsed results as an array (default: false)') do |v|
81
+ options[:as_array] = (v.nil? ? true : v)
82
+ end
83
+
84
+ opts.on('-v', '--version', 'Output version') do
85
+ puts versions_string
86
+ exit
87
+ end
88
+
89
+ opts.on('-h', '--help', 'Output help') do
90
+ puts opts
91
+ exit
92
+ end
93
+ end
94
+ end
95
+
96
+ # @return [Array] the targets specified on the command line, possibly expanded by the shell,
97
+ # and with any directories removed.
98
+ private def create_target_array
99
+ targets = args.dup.reject { |arg| File.directory?(arg) }.freeze # reject dirs to handle **/* globbing
100
+ targets.map(&:freeze)
101
+ end
102
+
103
+ # Fills in the second format option character if absent, and removes any excess characters
104
+ # @return [String] format options 2-character value, e.g. 'at'
105
+ private def postprocess_format_options
106
+ # If only one format letter is specified, use it for both metadata and text.
107
+ options[:format] *= 2 if options[:format].length == 1
108
+
109
+ # Ignore and remove extra characters after the first two format characters.
110
+ options[:format] = options[:format][0..1]
111
+ end
112
+
113
+ # If the user wants to specify options in an environment variable ("RIKA_OPTIONS"),
114
+ # then this method will insert those options at the beginning of the `args` array,
115
+ # where they can be overridden by command line arguments.
116
+ private def prepend_environment_args
117
+ env_opt_string = environment_options
118
+ args_to_prepend = Shellwords.shellsplit(env_opt_string)
119
+ args.unshift(args_to_prepend).flatten!
120
+ end
121
+
122
+ # @return [String] the value of the RIKA_OPTIONS environment variable if present, else ''.
123
+ private def environment_options
124
+ ENV['RIKA_OPTIONS'] || ''
125
+ end
126
+
127
+ # @return [String] string containing versions of Rika and Tika, with labels
128
+ private def versions_string
129
+ "Versions: Rika: #{Rika::VERSION}, Tika: #{Rika.tika_version}"
130
+ end
131
+ end